chacha

The ChaCha20 stream cipher (docs.ppad.tech/chacha).
git clone git://git.ppad.tech/chacha.git
Log | Files | Refs | README | LICENSE

chacha20_arm.c (7055B)


      1 #include <stddef.h>
      2 #include <stdint.h>
      3 #include <string.h>
      4 
      5 #if defined(__aarch64__)
      6 
      7 #include <arm_neon.h>
      8 
      9 /*
     10  * ChaCha20 NEON kernel using intra-block parallelism.  The 16-word
     11  * state matrix
     12  *
     13  *     s00 s01 s02 s03
     14  *     s04 s05 s06 s07
     15  *     s08 s09 s10 s11
     16  *     s12 s13 s14 s15
     17  *
     18  * is held in four 128-bit NEON registers v0..v3, one per row.  A
     19  * column quarter-round on (s00, s04, s08, s12), (s01, s05, s09, s13),
     20  * etc., becomes one set of element-wise vector operations on
 * (v0, v1, v2, v3) — four quarter-rounds in parallel.  Diagonal
 * rounds are reached by left-rotating v1, v2, v3 by 1, 2, 3 lanes
 * respectively with EXT (the vextq_u32 intrinsic) before the second
 * quarter-round, then rotating back by 3, 2, 1 lanes.
     25  */
     26 
     27 /* 32-bit left rotations.  Rotate-by-16 reduces to REV32.u16; the
     28  * others compile to a shift-shift-or pair (the compiler folds rotate-
     29  * by-8 to a TBL with a constant shuffle on some targets).         */
     30 #define ROTL32_16(x) \
     31     vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)))
     32 #define ROTL32_12(x) \
     33     vorrq_u32(vshlq_n_u32((x), 12), vshrq_n_u32((x), 20))
     34 #define ROTL32_8(x) \
     35     vorrq_u32(vshlq_n_u32((x),  8), vshrq_n_u32((x), 24))
     36 #define ROTL32_7(x) \
     37     vorrq_u32(vshlq_n_u32((x),  7), vshrq_n_u32((x), 25))
     38 
     39 #define QUARTER(v0, v1, v2, v3)                          \
     40     do {                                                  \
     41         v0 = vaddq_u32(v0, v1);                           \
     42         v3 = veorq_u32(v3, v0); v3 = ROTL32_16(v3);       \
     43         v2 = vaddq_u32(v2, v3);                           \
     44         v1 = veorq_u32(v1, v2); v1 = ROTL32_12(v1);       \
     45         v0 = vaddq_u32(v0, v1);                           \
     46         v3 = veorq_u32(v3, v0); v3 = ROTL32_8(v3);        \
     47         v2 = vaddq_u32(v2, v3);                           \
     48         v1 = veorq_u32(v1, v2); v1 = ROTL32_7(v1);        \
     49     } while (0)
     50 
     51 /* 20-round ChaCha20 core: 10 iterations of (column + diagonal). */
     52 static inline void chacha20_core(uint32x4_t *v0, uint32x4_t *v1,
     53                                   uint32x4_t *v2, uint32x4_t *v3,
     54                                   uint32x4_t s0, uint32x4_t s1,
     55                                   uint32x4_t s2, uint32x4_t s3) {
     56     uint32x4_t a = s0, b = s1, c = s2, d = s3;
     57     for (int i = 0; i < 10; i++) {
     58         QUARTER(a, b, c, d);
     59         /* shift rows: row 1 left 1, row 2 left 2, row 3 left 3.    */
     60         b = vextq_u32(b, b, 1);
     61         c = vextq_u32(c, c, 2);
     62         d = vextq_u32(d, d, 3);
     63         QUARTER(a, b, c, d);
     64         /* shift back.                                              */
     65         b = vextq_u32(b, b, 3);
     66         c = vextq_u32(c, c, 2);
     67         d = vextq_u32(d, d, 1);
     68     }
     69     *v0 = vaddq_u32(a, s0);
     70     *v1 = vaddq_u32(b, s1);
     71     *v2 = vaddq_u32(c, s2);
     72     *v3 = vaddq_u32(d, s3);
     73 }
     74 
     75 static const uint32_t chacha_constants[4] = {
     76     0x61707865u, 0x3320646eu, 0x79622d32u, 0x6b206574u
     77 };
     78 
     79 /* Set up the constant rows of the state from key + nonce.  s3
     80  * (counter + nonce) varies per block and is built inside the loop. */
     81 static inline void chacha20_setup(const uint8_t key[32],
     82                                    const uint8_t nonce[12],
     83                                    uint32x4_t *s0, uint32x4_t *s1,
     84                                    uint32x4_t *s2,
     85                                    uint32_t *n0, uint32_t *n1,
     86                                    uint32_t *n2) {
     87     *s0 = vld1q_u32(chacha_constants);
     88     *s1 = vreinterpretq_u32_u8(vld1q_u8(key));
     89     *s2 = vreinterpretq_u32_u8(vld1q_u8(key + 16));
     90     memcpy(n0, nonce + 0, 4);
     91     memcpy(n1, nonce + 4, 4);
     92     memcpy(n2, nonce + 8, 4);
     93 }
     94 
     95 /*
     96  * Generate one 64-byte ChaCha20 keystream block at 'out'.
     97  */
     98 void chacha20_block_arm(const uint8_t key[32], uint32_t counter,
     99                         const uint8_t nonce[12], uint8_t out[64]) {
    100     uint32x4_t s0, s1, s2;
    101     uint32_t n0, n1, n2;
    102     chacha20_setup(key, nonce, &s0, &s1, &s2, &n0, &n1, &n2);
    103 
    104     uint32_t s3_in[4] = { counter, n0, n1, n2 };
    105     uint32x4_t s3 = vld1q_u32(s3_in);
    106     uint32x4_t v0, v1, v2, v3;
    107     chacha20_core(&v0, &v1, &v2, &v3, s0, s1, s2, s3);
    108 
    109     vst1q_u8(out + 0,  vreinterpretq_u8_u32(v0));
    110     vst1q_u8(out + 16, vreinterpretq_u8_u32(v1));
    111     vst1q_u8(out + 32, vreinterpretq_u8_u32(v2));
    112     vst1q_u8(out + 48, vreinterpretq_u8_u32(v3));
    113 }
    114 
    115 /*
    116  * Encrypt/decrypt 'inlen' bytes at 'in' into 'out' using ChaCha20
    117  * with the given key, starting counter, and nonce.  Stream cipher,
    118  * so the same routine decrypts.
    119  */
    120 void chacha20_cipher_arm(const uint8_t key[32], uint32_t counter,
    121                          const uint8_t nonce[12],
    122                          const uint8_t *in, uint8_t *out,
    123                          size_t inlen) {
    124     uint32x4_t s0, s1, s2;
    125     uint32_t n0, n1, n2;
    126     chacha20_setup(key, nonce, &s0, &s1, &s2, &n0, &n1, &n2);
    127 
    128     size_t pos = 0;
    129     while (pos + 64 <= inlen) {
    130         uint32_t s3_in[4] = { counter, n0, n1, n2 };
    131         uint32x4_t s3 = vld1q_u32(s3_in);
    132         uint32x4_t v0, v1, v2, v3;
    133         chacha20_core(&v0, &v1, &v2, &v3, s0, s1, s2, s3);
    134 
    135         uint8x16_t i0 = vld1q_u8(in + pos +  0);
    136         uint8x16_t i1 = vld1q_u8(in + pos + 16);
    137         uint8x16_t i2 = vld1q_u8(in + pos + 32);
    138         uint8x16_t i3 = vld1q_u8(in + pos + 48);
    139 
    140         vst1q_u8(out + pos +  0,
    141                  veorq_u8(i0, vreinterpretq_u8_u32(v0)));
    142         vst1q_u8(out + pos + 16,
    143                  veorq_u8(i1, vreinterpretq_u8_u32(v1)));
    144         vst1q_u8(out + pos + 32,
    145                  veorq_u8(i2, vreinterpretq_u8_u32(v2)));
    146         vst1q_u8(out + pos + 48,
    147                  veorq_u8(i3, vreinterpretq_u8_u32(v3)));
    148 
    149         pos += 64;
    150         counter++;
    151     }
    152 
    153     /* trailing partial block (< 64 bytes) */
    154     if (pos < inlen) {
    155         uint32_t s3_in[4] = { counter, n0, n1, n2 };
    156         uint32x4_t s3 = vld1q_u32(s3_in);
    157         uint32x4_t v0, v1, v2, v3;
    158         chacha20_core(&v0, &v1, &v2, &v3, s0, s1, s2, s3);
    159 
    160         uint8_t block[64];
    161         vst1q_u8(block +  0, vreinterpretq_u8_u32(v0));
    162         vst1q_u8(block + 16, vreinterpretq_u8_u32(v1));
    163         vst1q_u8(block + 32, vreinterpretq_u8_u32(v2));
    164         vst1q_u8(block + 48, vreinterpretq_u8_u32(v3));
    165 
    166         size_t remaining = inlen - pos;
    167         for (size_t i = 0; i < remaining; i++) {
    168             out[pos + i] = in[pos + i] ^ block[i];
    169         }
    170     }
    171 }
    172 
    173 int chacha20_arm_available(void) {
    174     return 1;
    175 }
    176 
    177 #else
    178 
    179 /* stubs for non-aarch64 builds; never reached because dispatch is
    180  * gated on 'chacha20_arm_available' returning 0                  */
    181 
    182 void chacha20_block_arm(const uint8_t *key, uint32_t counter,
    183                         const uint8_t *nonce, uint8_t *out) {
    184     (void)key; (void)counter; (void)nonce; (void)out;
    185 }
    186 
    187 void chacha20_cipher_arm(const uint8_t *key, uint32_t counter,
    188                          const uint8_t *nonce,
    189                          const uint8_t *in, uint8_t *out,
    190                          size_t inlen) {
    191     (void)key; (void)counter; (void)nonce;
    192     (void)in; (void)out; (void)inlen;
    193 }
    194 
    195 int chacha20_arm_available(void) {
    196     return 0;
    197 }
    198 
    199 #endif