chacha20_arm.c (7055B)
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * The NEON kernel below copies nonce bytes into native uint32_t values
 * and reinterprets byte vectors as 32-bit lanes. That only yields
 * ChaCha20's little-endian word layout on a little-endian target, so
 * big-endian AArch64 builds take the stub path and report the kernel as
 * unavailable.
 */
#if defined(__aarch64__) && !defined(__AARCH64EB__) && \
    !defined(__ARM_BIG_ENDIAN)

#include <arm_neon.h>

/*
 * ChaCha20 NEON kernel using intra-block parallelism. The 16-word
 * state matrix
 *
 *     s00 s01 s02 s03
 *     s04 s05 s06 s07
 *     s08 s09 s10 s11
 *     s12 s13 s14 s15
 *
 * is held in four 128-bit NEON registers v0..v3, one per row. A
 * column quarter-round on (s00, s04, s08, s12), (s01, s05, s09, s13),
 * etc., becomes one set of element-wise vector operations on
 * (v0, v1, v2, v3): four quarter-rounds in parallel. Diagonal rounds
 * are reached by rotating v1, v2, v3 left by 1, 2, 3 lanes
 * respectively with VEXT before the second quarter-round, then
 * rotating back.
 */

/*
 * 32-bit left rotations applied to every lane. Rotate-by-16 is a swap
 * of the two 16-bit halves of each word (REV32 on 16-bit elements);
 * the others are written as shift-left OR shift-right. (The compiler
 * may fold rotate-by-8 into a TBL with a constant shuffle on some
 * targets.)
 */
#define ROTL32_16(x) \
    vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)))
#define ROTL32_12(x) \
    vorrq_u32(vshlq_n_u32((x), 12), vshrq_n_u32((x), 20))
#define ROTL32_8(x) \
    vorrq_u32(vshlq_n_u32((x), 8), vshrq_n_u32((x), 24))
#define ROTL32_7(x) \
    vorrq_u32(vshlq_n_u32((x), 7), vshrq_n_u32((x), 25))

/*
 * Four ChaCha quarter-rounds at once, one per lane. All four arguments
 * must be plain uint32x4_t lvalues: each is assigned to and evaluated
 * more than once.
 */
#define QUARTER(v0, v1, v2, v3)                              \
    do {                                                     \
        v0 = vaddq_u32(v0, v1);                              \
        v3 = veorq_u32(v3, v0); v3 = ROTL32_16(v3);          \
        v2 = vaddq_u32(v2, v3);                              \
        v1 = veorq_u32(v1, v2); v1 = ROTL32_12(v1);          \
        v0 = vaddq_u32(v0, v1);                              \
        v3 = veorq_u32(v3, v0); v3 = ROTL32_8(v3);           \
        v2 = vaddq_u32(v2, v3);                              \
        v1 = veorq_u32(v1, v2); v1 = ROTL32_7(v1);           \
    } while (0)

/*
 * 20-round ChaCha20 core: 10 iterations of (column + diagonal).
 *
 * s0..s3 are the four input rows of the state. On return *v0..*v3 hold
 * the permuted rows with the input rows added back in, i.e. one
 * keystream block as four vectors of 32-bit words.
 */
static inline void chacha20_core(uint32x4_t *v0, uint32x4_t *v1,
                                 uint32x4_t *v2, uint32x4_t *v3,
                                 uint32x4_t s0, uint32x4_t s1,
                                 uint32x4_t s2, uint32x4_t s3) {
    uint32x4_t a = s0, b = s1, c = s2, d = s3;
    for (int i = 0; i < 10; i++) {
        /* column round */
        QUARTER(a, b, c, d);
        /* shift rows: row 1 left 1, row 2 left 2, row 3 left 3, so
         * that the diagonals line up as columns. */
        b = vextq_u32(b, b, 1);
        c = vextq_u32(c, c, 2);
        d = vextq_u32(d, d, 3);
        /* diagonal round */
        QUARTER(a, b, c, d);
        /* shift back (the complementary rotation in each row). */
        b = vextq_u32(b, b, 3);
        c = vextq_u32(c, c, 2);
        d = vextq_u32(d, d, 1);
    }
    /* feed-forward: add the original state to the permuted state. */
    *v0 = vaddq_u32(a, s0);
    *v1 = vaddq_u32(b, s1);
    *v2 = vaddq_u32(c, s2);
    *v3 = vaddq_u32(d, s3);
}

/* "expand 32-byte k" as four little-endian 32-bit words. */
static const uint32_t chacha_constants[4] = {
    0x61707865u, 0x3320646eu, 0x79622d32u, 0x6b206574u
};

/*
 * Set up the rows of the state that do not change between blocks:
 * *s0 gets the constants, *s1 and *s2 the two 16-byte halves of the
 * key. The three nonce words are returned through n0..n2; row 3
 * (counter + nonce) varies per block and is built by chacha20_row3().
 */
static inline void chacha20_setup(const uint8_t key[32],
                                  const uint8_t nonce[12],
                                  uint32x4_t *s0, uint32x4_t *s1,
                                  uint32x4_t *s2,
                                  uint32_t *n0, uint32_t *n1,
                                  uint32_t *n2) {
    *s0 = vld1q_u32(chacha_constants);
    *s1 = vreinterpretq_u32_u8(vld1q_u8(key));
    *s2 = vreinterpretq_u32_u8(vld1q_u8(key + 16));
    /* memcpy avoids any alignment requirement on 'nonce'. */
    memcpy(n0, nonce + 0, 4);
    memcpy(n1, nonce + 4, 4);
    memcpy(n2, nonce + 8, 4);
}

/* Build row 3 of the state: block counter followed by the nonce words. */
static inline uint32x4_t chacha20_row3(uint32_t counter, uint32_t n0,
                                       uint32_t n1, uint32_t n2) {
    const uint32_t words[4] = { counter, n0, n1, n2 };
    return vld1q_u32(words);
}

/*
 * Zero 'len' bytes at 'p' through a volatile pointer so the stores are
 * not removed as dead writes to a buffer that is about to go out of
 * scope.
 */
static void chacha20_wipe(void *p, size_t len) {
    volatile uint8_t *bytes = (volatile uint8_t *)p;
    for (size_t i = 0; i < len; i++) {
        bytes[i] = 0;
    }
}

/*
 * Generate one 64-byte ChaCha20 keystream block at 'out' for the given
 * 32-byte key, 32-bit block counter and 12-byte nonce.
 */
void chacha20_block_arm(const uint8_t key[32], uint32_t counter,
                        const uint8_t nonce[12], uint8_t out[64]) {
    uint32x4_t s0, s1, s2;
    uint32_t n0, n1, n2;
    chacha20_setup(key, nonce, &s0, &s1, &s2, &n0, &n1, &n2);

    uint32x4_t v0, v1, v2, v3;
    chacha20_core(&v0, &v1, &v2, &v3,
                  s0, s1, s2, chacha20_row3(counter, n0, n1, n2));

    vst1q_u8(out + 0,  vreinterpretq_u8_u32(v0));
    vst1q_u8(out + 16, vreinterpretq_u8_u32(v1));
    vst1q_u8(out + 32, vreinterpretq_u8_u32(v2));
    vst1q_u8(out + 48, vreinterpretq_u8_u32(v3));
}

/*
 * Encrypt/decrypt 'inlen' bytes at 'in' into 'out' using ChaCha20
 * with the given key, starting counter, and nonce. Stream cipher,
 * so the same routine decrypts.
 *
 * The counter is a uint32_t incremented once per 64-byte block, so it
 * wraps modulo 2^32. 'out' may be the same pointer as 'in': every
 * input byte of a block is read before the matching output byte is
 * written. Partially overlapping buffers are not handled.
 */
void chacha20_cipher_arm(const uint8_t key[32], uint32_t counter,
                         const uint8_t nonce[12],
                         const uint8_t *in, uint8_t *out,
                         size_t inlen) {
    uint32x4_t s0, s1, s2;
    uint32_t n0, n1, n2;
    chacha20_setup(key, nonce, &s0, &s1, &s2, &n0, &n1, &n2);

    size_t pos = 0;
    /* 'pos <= inlen' always holds, so 'inlen - pos' cannot wrap; the
     * equivalent 'pos + 64 <= inlen' could wrap for inlen close to
     * SIZE_MAX and keep the loop running past the buffers. */
    while (inlen - pos >= 64) {
        uint32x4_t v0, v1, v2, v3;
        chacha20_core(&v0, &v1, &v2, &v3,
                      s0, s1, s2, chacha20_row3(counter, n0, n1, n2));

        /* Load the whole input block before storing anything so that
         * in-place operation (out == in) is safe. */
        uint8x16_t i0 = vld1q_u8(in + pos + 0);
        uint8x16_t i1 = vld1q_u8(in + pos + 16);
        uint8x16_t i2 = vld1q_u8(in + pos + 32);
        uint8x16_t i3 = vld1q_u8(in + pos + 48);

        vst1q_u8(out + pos + 0,
                 veorq_u8(i0, vreinterpretq_u8_u32(v0)));
        vst1q_u8(out + pos + 16,
                 veorq_u8(i1, vreinterpretq_u8_u32(v1)));
        vst1q_u8(out + pos + 32,
                 veorq_u8(i2, vreinterpretq_u8_u32(v2)));
        vst1q_u8(out + pos + 48,
                 veorq_u8(i3, vreinterpretq_u8_u32(v3)));

        pos += 64;
        counter++;
    }

    /* trailing partial block (< 64 bytes) */
    if (pos < inlen) {
        uint32x4_t v0, v1, v2, v3;
        chacha20_core(&v0, &v1, &v2, &v3,
                      s0, s1, s2, chacha20_row3(counter, n0, n1, n2));

        /* Spill the keystream to memory so it can be consumed one
         * byte at a time. */
        uint8_t block[64];
        vst1q_u8(block + 0,  vreinterpretq_u8_u32(v0));
        vst1q_u8(block + 16, vreinterpretq_u8_u32(v1));
        vst1q_u8(block + 32, vreinterpretq_u8_u32(v2));
        vst1q_u8(block + 48, vreinterpretq_u8_u32(v3));

        size_t remaining = inlen - pos;
        for (size_t i = 0; i < remaining; i++) {
            out[pos + i] = in[pos + i] ^ block[i];
        }

        /* Do not leave keystream bytes behind on the stack. */
        chacha20_wipe(block, sizeof block);
    }
}

/* Returns 1: the NEON implementation above is compiled in. */
int chacha20_arm_available(void) {
    return 1;
}

#else

/* stubs for builds without the NEON kernel; never reached because
 * dispatch is gated on 'chacha20_arm_available' returning 0. They
 * deliberately do nothing and leave 'out' untouched. */

void chacha20_block_arm(const uint8_t *key, uint32_t counter,
                        const uint8_t *nonce, uint8_t *out) {
    (void)key; (void)counter; (void)nonce; (void)out;
}

void chacha20_cipher_arm(const uint8_t *key, uint32_t counter,
                         const uint8_t *nonce,
                         const uint8_t *in, uint8_t *out,
                         size_t inlen) {
    (void)key; (void)counter; (void)nonce;
    (void)in; (void)out; (void)inlen;
}

/* Returns 0: the NEON implementation is not compiled in. */
int chacha20_arm_available(void) {
    return 0;
}

#endif