sha256

Pure Haskell SHA-256, HMAC-SHA256 (docs.ppad.tech/sha256).
git clone git://git.ppad.tech/sha256.git

sha256_arm.c (5923B)


#include <stdint.h>
#include <string.h>

#if defined(__aarch64__) && defined(__ARM_FEATURE_SHA2)

#include <arm_neon.h>

static const uint32_t K[64] = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
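
/*
 * Reference note: per FIPS 180-4, K[t] is the first 32 bits of the
 * fractional part of the cube root of the (t+1)-th prime. A quick
 * spot check, sketched here rather than used by the code (assuming
 * <math.h>):
 *
 *   double r = cbrt(2.0);
 *   uint32_t k0 = (uint32_t)ldexp(r - floor(r), 32);
 *
 * yields k0 == 0x428a2f98 == K[0].
 */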

/*
 * Process one 64-byte block using ARM SHA256 crypto instructions.
 *
 * state: pointer to 8 uint32_t words (a,b,c,d,e,f,g,h)
 * block: pointer to 64 bytes of message data
 *
 * The state is updated in place.
 */
void sha256_block_arm(uint32_t *state, const uint8_t *block) {
    /* Load current hash state */
    uint32x4_t abcd = vld1q_u32(&state[0]);
    uint32x4_t efgh = vld1q_u32(&state[4]);

    /* Save original for final addition */
    uint32x4_t abcd_orig = abcd;
    uint32x4_t efgh_orig = efgh;

    /* Load message and convert from big-endian (vrev32q_u8 reverses
       the bytes within each 32-bit lane) */
    uint32x4_t m0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(&block[0])));
    uint32x4_t m1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(&block[16])));
    uint32x4_t m2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(&block[32])));
    uint32x4_t m3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(&block[48])));

    uint32x4_t tmp, tmp2;

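    /*
     * Each quad of rounds below follows the same pattern:
     *
     *   - vaddq_u32 adds the round constants K[t..t+3] to the
     *     schedule words W[t..t+3];
     *   - vsha256hq_u32 produces the new abcd half and
     *     vsha256h2q_u32 the new efgh half after four rounds of
     *     compression (tmp2 preserves the pre-round abcd that
     *     vsha256h2q_u32 needs);
     *   - vsha256su0q_u32/vsha256su1q_u32 extend the message
     *     schedule, deriving each new W word from W[t-16], W[t-15],
     *     W[t-7] and W[t-2] per FIPS 180-4.
     */
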
    /* Rounds 0-3 */
    tmp = vaddq_u32(m0, vld1q_u32(&K[0]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);
    m0 = vsha256su1q_u32(vsha256su0q_u32(m0, m1), m2, m3);

    /* Rounds 4-7 */
    tmp = vaddq_u32(m1, vld1q_u32(&K[4]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);
    m1 = vsha256su1q_u32(vsha256su0q_u32(m1, m2), m3, m0);

    /* Rounds 8-11 */
    tmp = vaddq_u32(m2, vld1q_u32(&K[8]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);
    m2 = vsha256su1q_u32(vsha256su0q_u32(m2, m3), m0, m1);

    /* Rounds 12-15 */
    tmp = vaddq_u32(m3, vld1q_u32(&K[12]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);
    m3 = vsha256su1q_u32(vsha256su0q_u32(m3, m0), m1, m2);

    /* Rounds 16-19 */
    tmp = vaddq_u32(m0, vld1q_u32(&K[16]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);
    m0 = vsha256su1q_u32(vsha256su0q_u32(m0, m1), m2, m3);

    /* Rounds 20-23 */
    tmp = vaddq_u32(m1, vld1q_u32(&K[20]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);
    m1 = vsha256su1q_u32(vsha256su0q_u32(m1, m2), m3, m0);

    /* Rounds 24-27 */
    tmp = vaddq_u32(m2, vld1q_u32(&K[24]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);
    m2 = vsha256su1q_u32(vsha256su0q_u32(m2, m3), m0, m1);

    /* Rounds 28-31 */
    tmp = vaddq_u32(m3, vld1q_u32(&K[28]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);
    m3 = vsha256su1q_u32(vsha256su0q_u32(m3, m0), m1, m2);

    /* Rounds 32-35 */
    tmp = vaddq_u32(m0, vld1q_u32(&K[32]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);
    m0 = vsha256su1q_u32(vsha256su0q_u32(m0, m1), m2, m3);

    /* Rounds 36-39 */
    tmp = vaddq_u32(m1, vld1q_u32(&K[36]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);
    m1 = vsha256su1q_u32(vsha256su0q_u32(m1, m2), m3, m0);

    /* Rounds 40-43 */
    tmp = vaddq_u32(m2, vld1q_u32(&K[40]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);
    m2 = vsha256su1q_u32(vsha256su0q_u32(m2, m3), m0, m1);

    /* Rounds 44-47 */
    tmp = vaddq_u32(m3, vld1q_u32(&K[44]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);
    m3 = vsha256su1q_u32(vsha256su0q_u32(m3, m0), m1, m2);

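    /*
     * The message schedule is complete here: m0..m3 now hold
     * W[48..63], so the remaining quads skip the schedule update
     * and only add in the round constants.
     */
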
    /* Rounds 48-51 */
    tmp = vaddq_u32(m0, vld1q_u32(&K[48]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);

    /* Rounds 52-55 */
    tmp = vaddq_u32(m1, vld1q_u32(&K[52]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);

    /* Rounds 56-59 */
    tmp = vaddq_u32(m2, vld1q_u32(&K[56]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);

    /* Rounds 60-63 */
    tmp = vaddq_u32(m3, vld1q_u32(&K[60]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);

    /* Add original state back */
    abcd = vaddq_u32(abcd, abcd_orig);
    efgh = vaddq_u32(efgh, efgh_orig);

    /* Store result */
    vst1q_u32(&state[0], abcd);
    vst1q_u32(&state[4], efgh);
}
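
/*
 * Illustrative sketch, not part of this file's API: a minimal
 * one-shot driver showing how a caller would use sha256_block_arm.
 * It starts from the FIPS 180-4 initial hash values, compresses
 * whole 64-byte blocks, then pads the tail (0x80, zeros, 64-bit
 * big-endian bit length) into one or two final blocks. The function
 * name is hypothetical.
 */
__attribute__((unused))
static void sha256_oneshot_sketch(const uint8_t *msg, size_t len,
                                  uint8_t digest[32]) {
    uint32_t state[8] = {
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
    };
    uint64_t bits = (uint64_t)len * 8;
    uint8_t last[128] = {0};
    size_t total, i;

    /* Compress all full blocks */
    while (len >= 64) {
        sha256_block_arm(state, msg);
        msg += 64;
        len -= 64;
    }

    /* Pad: 0x80 after the tail, zeros, then the message length in
       bits as a big-endian 64-bit word at the very end */
    memcpy(last, msg, len);
    last[len] = 0x80;
    total = (len + 1 + 8 <= 64) ? 64 : 128;
    for (i = 0; i < 8; i++)
        last[total - 1 - i] = (uint8_t)(bits >> (8 * i));
    sha256_block_arm(state, last);
    if (total == 128)
        sha256_block_arm(state, last + 64);

    /* Serialize the final state big-endian, word by word */
    for (i = 0; i < 8; i++) {
        digest[4 * i + 0] = (uint8_t)(state[i] >> 24);
        digest[4 * i + 1] = (uint8_t)(state[i] >> 16);
        digest[4 * i + 2] = (uint8_t)(state[i] >> 8);
        digest[4 * i + 3] = (uint8_t)(state[i]);
    }
}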

/* Return 1 if ARM SHA2 is available (known at compile time in this
   branch), 0 otherwise */
int sha256_arm_available(void) {
    return 1;
}
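
/*
 * The check above is resolved at compile time: the surrounding #if
 * guarantees the SHA2 instructions were compiled in. For a runtime
 * probe instead (a sketch, not used by this file), Linux/aarch64
 * exposes the capability via hwcaps:
 *
 *   getauxval(AT_HWCAP) & HWCAP_SHA2
 *
 * with getauxval from <sys/auxv.h> and HWCAP_SHA2 from <asm/hwcap.h>.
 */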

#else

/* Stub implementations when ARM SHA2 is not available */
void sha256_block_arm(uint32_t *state, const uint8_t *block) {
    (void)state;
    (void)block;
    /* Should never be called - use pure Haskell fallback */
}

int sha256_arm_available(void) {
    return 0;
}

#endif