sha512

Pure Haskell SHA-512, HMAC-SHA512 (docs.ppad.tech/sha512).
git clone git://git.ppad.tech/sha512.git

sha512_arm.c (17555B)


#include <stdint.h>
#include <string.h>

#if defined(__aarch64__) && defined(__ARM_FEATURE_SHA512)

#include <arm_neon.h>

static const uint64_t K[80] = {
    0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
    0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
    0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
    0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
    0xd807aa98a3030242ULL, 0x12835b0145706fbeULL,
    0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
    0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL,
    0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
    0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
    0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
    0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL,
    0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
    0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL,
    0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
    0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
    0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
    0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL,
    0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
    0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL,
    0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
    0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
    0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
    0xd192e819d6ef5218ULL, 0xd69906245565a910ULL,
    0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
    0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL,
    0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
    0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
    0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
    0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL,
    0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
    0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL,
    0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
    0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
    0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
    0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL,
    0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
    0x28db77f523047d84ULL, 0x32caab7b40c72493ULL,
    0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
    0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
    0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL
};

/*
 * Process one 128-byte block using ARM SHA512 crypto instructions.
 *
 * state: pointer to 8 uint64_t words (a,b,c,d,e,f,g,h)
 * block: pointer to 128 bytes of message data
 *
 * The state is updated in place.
 */
void sha512_block_arm(uint64_t *state, const uint8_t *block) {
    /* Load current hash state */
    uint64x2_t ab = vld1q_u64(&state[0]);
    uint64x2_t cd = vld1q_u64(&state[2]);
    uint64x2_t ef = vld1q_u64(&state[4]);
    uint64x2_t gh = vld1q_u64(&state[6]);

    /* Save original for final addition */
    uint64x2_t ab_orig = ab;
    uint64x2_t cd_orig = cd;
    uint64x2_t ef_orig = ef;
    uint64x2_t gh_orig = gh;

    /* Load message and convert from big-endian */
    uint64x2_t m0 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(&block[0])));
    uint64x2_t m1 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(&block[16])));
    uint64x2_t m2 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(&block[32])));
    uint64x2_t m3 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(&block[48])));
    uint64x2_t m4 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(&block[64])));
    uint64x2_t m5 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(&block[80])));
    uint64x2_t m6 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(&block[96])));
    uint64x2_t m7 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(&block[112])));

    uint64x2_t tmp;

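    /*
     * Fully unrolled compression loop.  Each block below performs two
     * rounds: the message words are added to the round constants, combined
     * with the running state via SHA512H/SHA512H2, and (for the first 64
     * rounds) the message schedule is extended two words at a time with
     * SHA512SU0/SHA512SU1.  The roles of the ab/cd/ef/gh register pairs
     * rotate from one block to the next.
     */
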
    /* Rounds 0-1 */
    tmp = vaddq_u64(m0, vld1q_u64(&K[0]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(gh, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1));
    gh = vsha512h2q_u64(tmp, cd, ab);
    cd = vaddq_u64(cd, tmp);
    m0 = vsha512su1q_u64(vsha512su0q_u64(m0, m1), m7, vextq_u64(m4, m5, 1));

    /* Rounds 2-3 */
    tmp = vaddq_u64(m1, vld1q_u64(&K[2]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(ef, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1));
    ef = vsha512h2q_u64(tmp, ab, gh);
    ab = vaddq_u64(ab, tmp);
    m1 = vsha512su1q_u64(vsha512su0q_u64(m1, m2), m0, vextq_u64(m5, m6, 1));

    /* Rounds 4-5 */
    tmp = vaddq_u64(m2, vld1q_u64(&K[4]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(cd, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1));
    cd = vsha512h2q_u64(tmp, gh, ef);
    gh = vaddq_u64(gh, tmp);
    m2 = vsha512su1q_u64(vsha512su0q_u64(m2, m3), m1, vextq_u64(m6, m7, 1));

    /* Rounds 6-7 */
    tmp = vaddq_u64(m3, vld1q_u64(&K[6]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(ab, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1));
    ab = vsha512h2q_u64(tmp, ef, cd);
    ef = vaddq_u64(ef, tmp);
    m3 = vsha512su1q_u64(vsha512su0q_u64(m3, m4), m2, vextq_u64(m7, m0, 1));

    /* Rounds 8-9 */
    tmp = vaddq_u64(m4, vld1q_u64(&K[8]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(gh, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1));
    gh = vsha512h2q_u64(tmp, cd, ab);
    cd = vaddq_u64(cd, tmp);
    m4 = vsha512su1q_u64(vsha512su0q_u64(m4, m5), m3, vextq_u64(m0, m1, 1));

    /* Rounds 10-11 */
    tmp = vaddq_u64(m5, vld1q_u64(&K[10]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(ef, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1));
    ef = vsha512h2q_u64(tmp, ab, gh);
    ab = vaddq_u64(ab, tmp);
    m5 = vsha512su1q_u64(vsha512su0q_u64(m5, m6), m4, vextq_u64(m1, m2, 1));

    /* Rounds 12-13 */
    tmp = vaddq_u64(m6, vld1q_u64(&K[12]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(cd, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1));
    cd = vsha512h2q_u64(tmp, gh, ef);
    gh = vaddq_u64(gh, tmp);
    m6 = vsha512su1q_u64(vsha512su0q_u64(m6, m7), m5, vextq_u64(m2, m3, 1));

    /* Rounds 14-15 */
    tmp = vaddq_u64(m7, vld1q_u64(&K[14]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(ab, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1));
    ab = vsha512h2q_u64(tmp, ef, cd);
    ef = vaddq_u64(ef, tmp);
    m7 = vsha512su1q_u64(vsha512su0q_u64(m7, m0), m6, vextq_u64(m3, m4, 1));

    /* Rounds 16-17 */
    tmp = vaddq_u64(m0, vld1q_u64(&K[16]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(gh, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1));
    gh = vsha512h2q_u64(tmp, cd, ab);
    cd = vaddq_u64(cd, tmp);
    m0 = vsha512su1q_u64(vsha512su0q_u64(m0, m1), m7, vextq_u64(m4, m5, 1));

    /* Rounds 18-19 */
    tmp = vaddq_u64(m1, vld1q_u64(&K[18]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(ef, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1));
    ef = vsha512h2q_u64(tmp, ab, gh);
    ab = vaddq_u64(ab, tmp);
    m1 = vsha512su1q_u64(vsha512su0q_u64(m1, m2), m0, vextq_u64(m5, m6, 1));

    /* Rounds 20-21 */
    tmp = vaddq_u64(m2, vld1q_u64(&K[20]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(cd, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1));
    cd = vsha512h2q_u64(tmp, gh, ef);
    gh = vaddq_u64(gh, tmp);
    m2 = vsha512su1q_u64(vsha512su0q_u64(m2, m3), m1, vextq_u64(m6, m7, 1));

    /* Rounds 22-23 */
    tmp = vaddq_u64(m3, vld1q_u64(&K[22]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(ab, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1));
    ab = vsha512h2q_u64(tmp, ef, cd);
    ef = vaddq_u64(ef, tmp);
    m3 = vsha512su1q_u64(vsha512su0q_u64(m3, m4), m2, vextq_u64(m7, m0, 1));

    /* Rounds 24-25 */
    tmp = vaddq_u64(m4, vld1q_u64(&K[24]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(gh, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1));
    gh = vsha512h2q_u64(tmp, cd, ab);
    cd = vaddq_u64(cd, tmp);
    m4 = vsha512su1q_u64(vsha512su0q_u64(m4, m5), m3, vextq_u64(m0, m1, 1));

    /* Rounds 26-27 */
    tmp = vaddq_u64(m5, vld1q_u64(&K[26]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(ef, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1));
    ef = vsha512h2q_u64(tmp, ab, gh);
    ab = vaddq_u64(ab, tmp);
    m5 = vsha512su1q_u64(vsha512su0q_u64(m5, m6), m4, vextq_u64(m1, m2, 1));

    /* Rounds 28-29 */
    tmp = vaddq_u64(m6, vld1q_u64(&K[28]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(cd, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1));
    cd = vsha512h2q_u64(tmp, gh, ef);
    gh = vaddq_u64(gh, tmp);
    m6 = vsha512su1q_u64(vsha512su0q_u64(m6, m7), m5, vextq_u64(m2, m3, 1));

    /* Rounds 30-31 */
    tmp = vaddq_u64(m7, vld1q_u64(&K[30]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(ab, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1));
    ab = vsha512h2q_u64(tmp, ef, cd);
    ef = vaddq_u64(ef, tmp);
    m7 = vsha512su1q_u64(vsha512su0q_u64(m7, m0), m6, vextq_u64(m3, m4, 1));

    /* Rounds 32-33 */
    tmp = vaddq_u64(m0, vld1q_u64(&K[32]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(gh, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1));
    gh = vsha512h2q_u64(tmp, cd, ab);
    cd = vaddq_u64(cd, tmp);
    m0 = vsha512su1q_u64(vsha512su0q_u64(m0, m1), m7, vextq_u64(m4, m5, 1));

    /* Rounds 34-35 */
    tmp = vaddq_u64(m1, vld1q_u64(&K[34]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(ef, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1));
    ef = vsha512h2q_u64(tmp, ab, gh);
    ab = vaddq_u64(ab, tmp);
    m1 = vsha512su1q_u64(vsha512su0q_u64(m1, m2), m0, vextq_u64(m5, m6, 1));

    /* Rounds 36-37 */
    tmp = vaddq_u64(m2, vld1q_u64(&K[36]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(cd, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1));
    cd = vsha512h2q_u64(tmp, gh, ef);
    gh = vaddq_u64(gh, tmp);
    m2 = vsha512su1q_u64(vsha512su0q_u64(m2, m3), m1, vextq_u64(m6, m7, 1));

    /* Rounds 38-39 */
    tmp = vaddq_u64(m3, vld1q_u64(&K[38]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(ab, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1));
    ab = vsha512h2q_u64(tmp, ef, cd);
    ef = vaddq_u64(ef, tmp);
    m3 = vsha512su1q_u64(vsha512su0q_u64(m3, m4), m2, vextq_u64(m7, m0, 1));

    /* Rounds 40-41 */
    tmp = vaddq_u64(m4, vld1q_u64(&K[40]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(gh, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1));
    gh = vsha512h2q_u64(tmp, cd, ab);
    cd = vaddq_u64(cd, tmp);
    m4 = vsha512su1q_u64(vsha512su0q_u64(m4, m5), m3, vextq_u64(m0, m1, 1));

    /* Rounds 42-43 */
    tmp = vaddq_u64(m5, vld1q_u64(&K[42]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(ef, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1));
    ef = vsha512h2q_u64(tmp, ab, gh);
    ab = vaddq_u64(ab, tmp);
    m5 = vsha512su1q_u64(vsha512su0q_u64(m5, m6), m4, vextq_u64(m1, m2, 1));

    /* Rounds 44-45 */
    tmp = vaddq_u64(m6, vld1q_u64(&K[44]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(cd, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1));
    cd = vsha512h2q_u64(tmp, gh, ef);
    gh = vaddq_u64(gh, tmp);
    m6 = vsha512su1q_u64(vsha512su0q_u64(m6, m7), m5, vextq_u64(m2, m3, 1));

    /* Rounds 46-47 */
    tmp = vaddq_u64(m7, vld1q_u64(&K[46]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(ab, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1));
    ab = vsha512h2q_u64(tmp, ef, cd);
    ef = vaddq_u64(ef, tmp);
    m7 = vsha512su1q_u64(vsha512su0q_u64(m7, m0), m6, vextq_u64(m3, m4, 1));

    /* Rounds 48-49 */
    tmp = vaddq_u64(m0, vld1q_u64(&K[48]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(gh, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1));
    gh = vsha512h2q_u64(tmp, cd, ab);
    cd = vaddq_u64(cd, tmp);
    m0 = vsha512su1q_u64(vsha512su0q_u64(m0, m1), m7, vextq_u64(m4, m5, 1));

    /* Rounds 50-51 */
    tmp = vaddq_u64(m1, vld1q_u64(&K[50]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(ef, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1));
    ef = vsha512h2q_u64(tmp, ab, gh);
    ab = vaddq_u64(ab, tmp);
    m1 = vsha512su1q_u64(vsha512su0q_u64(m1, m2), m0, vextq_u64(m5, m6, 1));

    /* Rounds 52-53 */
    tmp = vaddq_u64(m2, vld1q_u64(&K[52]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(cd, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1));
    cd = vsha512h2q_u64(tmp, gh, ef);
    gh = vaddq_u64(gh, tmp);
    m2 = vsha512su1q_u64(vsha512su0q_u64(m2, m3), m1, vextq_u64(m6, m7, 1));

    /* Rounds 54-55 */
    tmp = vaddq_u64(m3, vld1q_u64(&K[54]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(ab, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1));
    ab = vsha512h2q_u64(tmp, ef, cd);
    ef = vaddq_u64(ef, tmp);
    m3 = vsha512su1q_u64(vsha512su0q_u64(m3, m4), m2, vextq_u64(m7, m0, 1));

    /* Rounds 56-57 */
    tmp = vaddq_u64(m4, vld1q_u64(&K[56]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(gh, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1));
    gh = vsha512h2q_u64(tmp, cd, ab);
    cd = vaddq_u64(cd, tmp);
    m4 = vsha512su1q_u64(vsha512su0q_u64(m4, m5), m3, vextq_u64(m0, m1, 1));

    /* Rounds 58-59 */
    tmp = vaddq_u64(m5, vld1q_u64(&K[58]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(ef, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1));
    ef = vsha512h2q_u64(tmp, ab, gh);
    ab = vaddq_u64(ab, tmp);
    m5 = vsha512su1q_u64(vsha512su0q_u64(m5, m6), m4, vextq_u64(m1, m2, 1));

    /* Rounds 60-61 */
    tmp = vaddq_u64(m6, vld1q_u64(&K[60]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(cd, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1));
    cd = vsha512h2q_u64(tmp, gh, ef);
    gh = vaddq_u64(gh, tmp);
    m6 = vsha512su1q_u64(vsha512su0q_u64(m6, m7), m5, vextq_u64(m2, m3, 1));

    /* Rounds 62-63 */
    tmp = vaddq_u64(m7, vld1q_u64(&K[62]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(ab, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1));
    ab = vsha512h2q_u64(tmp, ef, cd);
    ef = vaddq_u64(ef, tmp);
    m7 = vsha512su1q_u64(vsha512su0q_u64(m7, m0), m6, vextq_u64(m3, m4, 1));

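    /*
     * The message schedule words w[64..79] are already held in m0..m7 at
     * this point, so the final 16 rounds need no further SHA512SU0/SHA512SU1
     * expansion.
     */
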
    /* Rounds 64-65 */
    tmp = vaddq_u64(m0, vld1q_u64(&K[64]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(gh, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1));
    gh = vsha512h2q_u64(tmp, cd, ab);
    cd = vaddq_u64(cd, tmp);

    /* Rounds 66-67 */
    tmp = vaddq_u64(m1, vld1q_u64(&K[66]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(ef, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1));
    ef = vsha512h2q_u64(tmp, ab, gh);
    ab = vaddq_u64(ab, tmp);

    /* Rounds 68-69 */
    tmp = vaddq_u64(m2, vld1q_u64(&K[68]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(cd, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1));
    cd = vsha512h2q_u64(tmp, gh, ef);
    gh = vaddq_u64(gh, tmp);

    /* Rounds 70-71 */
    tmp = vaddq_u64(m3, vld1q_u64(&K[70]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(ab, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1));
    ab = vsha512h2q_u64(tmp, ef, cd);
    ef = vaddq_u64(ef, tmp);

    /* Rounds 72-73 */
    tmp = vaddq_u64(m4, vld1q_u64(&K[72]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(gh, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1));
    gh = vsha512h2q_u64(tmp, cd, ab);
    cd = vaddq_u64(cd, tmp);

    /* Rounds 74-75 */
    tmp = vaddq_u64(m5, vld1q_u64(&K[74]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(ef, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1));
    ef = vsha512h2q_u64(tmp, ab, gh);
    ab = vaddq_u64(ab, tmp);

    /* Rounds 76-77 */
    tmp = vaddq_u64(m6, vld1q_u64(&K[76]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(cd, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1));
    cd = vsha512h2q_u64(tmp, gh, ef);
    gh = vaddq_u64(gh, tmp);

    /* Rounds 78-79 */
    tmp = vaddq_u64(m7, vld1q_u64(&K[78]));
    tmp = vextq_u64(tmp, tmp, 1);
    tmp = vaddq_u64(ab, tmp);
    tmp = vsha512hq_u64(tmp, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1));
    ab = vsha512h2q_u64(tmp, ef, cd);
    ef = vaddq_u64(ef, tmp);

    /* Add original state back */
    ab = vaddq_u64(ab, ab_orig);
    cd = vaddq_u64(cd, cd_orig);
    ef = vaddq_u64(ef, ef_orig);
    gh = vaddq_u64(gh, gh_orig);

    /* Store result */
    vst1q_u64(&state[0], ab);
    vst1q_u64(&state[2], cd);
    vst1q_u64(&state[4], ef);
    vst1q_u64(&state[6], gh);
}

/* Return 1 if ARM SHA512 is available, 0 otherwise */
int sha512_arm_available(void) {
    return 1;
}

#else

/* Stub implementations when ARM SHA512 is not available */
void sha512_block_arm(uint64_t *state, const uint8_t *block) {
    (void)state;
    (void)block;
    /* Should never be called - use pure Haskell fallback */
}

int sha512_arm_available(void) {
    return 0;
}

#endif
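
As a rough illustration of how the interface above might be exercised, here is a minimal, hypothetical test harness (not part of the repository) that runs the padded empty message through sha512_block_arm and prints the resulting digest. It assumes the file is compiled on AArch64 with the SHA-512 extension enabled (for example, -march=armv8.2-a+sha3 on GCC/Clang) so that __ARM_FEATURE_SHA512 is defined; otherwise the stub path is taken and sha512_arm_available reports 0.

/* sha512_arm_demo.c -- hypothetical harness, not part of the repository */
#include <stdint.h>
#include <stdio.h>

void sha512_block_arm(uint64_t *state, const uint8_t *block);
int  sha512_arm_available(void);

int main(void) {
    if (!sha512_arm_available()) {
        puts("ARM SHA512 instructions not available; use the pure Haskell path");
        return 1;
    }

    /* SHA-512 initial hash values (FIPS 180-4) */
    uint64_t state[8] = {
        0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
        0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
        0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
        0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
    };

    /* The empty message padded to a single 128-byte block:
       0x80 terminator, zero fill, and a zero 128-bit length field. */
    uint8_t block[128] = {0};
    block[0] = 0x80;

    sha512_block_arm(state, block);

    /* Expect the SHA-512 digest of the empty string (cf83e135...927da3e). */
    for (int i = 0; i < 8; i++)
        printf("%016llx", (unsigned long long)state[i]);
    putchar('\n');
    return 0;
}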