sha256_arm.c (5923B)
#include <stdint.h>
#include <string.h>

#if defined(__aarch64__) && defined(__ARM_FEATURE_SHA2)

#include <arm_neon.h>

static const uint32_t K[64] = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};

/*
 * Process one 64-byte block using ARM SHA256 crypto instructions.
 *
 * state: pointer to 8 uint32_t words (a,b,c,d,e,f,g,h)
 * block: pointer to 64 bytes of message data
 *
 * The state is updated in place.
 */
void sha256_block_arm(uint32_t *state, const uint8_t *block) {
    /* Load current hash state */
    uint32x4_t abcd = vld1q_u32(&state[0]);
    uint32x4_t efgh = vld1q_u32(&state[4]);

    /* Save original for final addition */
    uint32x4_t abcd_orig = abcd;
    uint32x4_t efgh_orig = efgh;

    /* Load message and convert from big-endian */
    uint32x4_t m0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(&block[0])));
    uint32x4_t m1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(&block[16])));
    uint32x4_t m2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(&block[32])));
    uint32x4_t m3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(&block[48])));

    uint32x4_t tmp, tmp2;

    /* Rounds 0-3 */
    tmp = vaddq_u32(m0, vld1q_u32(&K[0]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);
    m0 = vsha256su1q_u32(vsha256su0q_u32(m0, m1), m2, m3);

    /* Rounds 4-7 */
    tmp = vaddq_u32(m1, vld1q_u32(&K[4]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);
    m1 = vsha256su1q_u32(vsha256su0q_u32(m1, m2), m3, m0);

    /* Rounds 8-11 */
    tmp = vaddq_u32(m2, vld1q_u32(&K[8]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);
    m2 = vsha256su1q_u32(vsha256su0q_u32(m2, m3), m0, m1);

    /* Rounds 12-15 */
    tmp = vaddq_u32(m3, vld1q_u32(&K[12]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);
    m3 = vsha256su1q_u32(vsha256su0q_u32(m3, m0), m1, m2);

    /* Rounds 16-19 */
    tmp = vaddq_u32(m0, vld1q_u32(&K[16]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);
    m0 = vsha256su1q_u32(vsha256su0q_u32(m0, m1), m2, m3);

    /* Rounds 20-23 */
    tmp = vaddq_u32(m1, vld1q_u32(&K[20]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);
    m1 = vsha256su1q_u32(vsha256su0q_u32(m1, m2), m3, m0);

    /* Rounds 24-27 */
    tmp = vaddq_u32(m2, vld1q_u32(&K[24]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);
    m2 = vsha256su1q_u32(vsha256su0q_u32(m2, m3), m0, m1);

    /* Rounds 28-31 */
    tmp = vaddq_u32(m3, vld1q_u32(&K[28]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);
    m3 = vsha256su1q_u32(vsha256su0q_u32(m3, m0), m1, m2);

    /* Rounds 32-35 */
    tmp = vaddq_u32(m0, vld1q_u32(&K[32]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);
    m0 = vsha256su1q_u32(vsha256su0q_u32(m0, m1), m2, m3);

    /* Rounds 36-39 */
    tmp = vaddq_u32(m1, vld1q_u32(&K[36]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);
    m1 = vsha256su1q_u32(vsha256su0q_u32(m1, m2), m3, m0);

    /* Rounds 40-43 */
    tmp = vaddq_u32(m2, vld1q_u32(&K[40]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);
    m2 = vsha256su1q_u32(vsha256su0q_u32(m2, m3), m0, m1);

    /* Rounds 44-47 */
    tmp = vaddq_u32(m3, vld1q_u32(&K[44]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);
    m3 = vsha256su1q_u32(vsha256su0q_u32(m3, m0), m1, m2);

    /* Rounds 48-51 */
    tmp = vaddq_u32(m0, vld1q_u32(&K[48]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);

    /* Rounds 52-55 */
    tmp = vaddq_u32(m1, vld1q_u32(&K[52]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);

    /* Rounds 56-59 */
    tmp = vaddq_u32(m2, vld1q_u32(&K[56]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);

    /* Rounds 60-63 */
    tmp = vaddq_u32(m3, vld1q_u32(&K[60]));
    tmp2 = abcd;
    abcd = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, tmp2, tmp);

    /* Add original state back */
    abcd = vaddq_u32(abcd, abcd_orig);
    efgh = vaddq_u32(efgh, efgh_orig);

    /* Store result */
    vst1q_u32(&state[0], abcd);
    vst1q_u32(&state[4], efgh);
}

/* Return 1 if ARM SHA2 is available, 0 otherwise */
int sha256_arm_available(void) {
    return 1;
}

#else

/* Stub implementations when ARM SHA2 is not available */
void sha256_block_arm(uint32_t *state, const uint8_t *block) {
    (void)state;
    (void)block;
    /* Should never be called - use pure Haskell fallback */
}

int sha256_arm_available(void) {
    return 0;
}

#endif
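
Usage note (illustrative, not part of the file above): the sketch below shows how a caller might drive sha256_block_arm on a single, already padded 64-byte block. The initial hash words are the standard SHA-256 values from FIPS 180-4; the helper name example_hash_one_block, the padding, and the portable fallback path are assumptions for illustration and do not appear in this file.

    /* Hypothetical caller sketch (not part of sha256_arm.c) */
    #include <stdint.h>
    #include <string.h>

    /* Declarations matching sha256_arm.c */
    void sha256_block_arm(uint32_t *state, const uint8_t *block);
    int sha256_arm_available(void);

    /* Hash one already padded 64-byte block into out[0..7]. */
    static void example_hash_one_block(const uint8_t block[64], uint32_t out[8]) {
        /* Standard SHA-256 initial hash values (FIPS 180-4) */
        uint32_t state[8] = {
            0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
            0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
        };
        if (sha256_arm_available()) {
            sha256_block_arm(state, block);   /* hardware-accelerated path */
        } else {
            /* fall back to a portable implementation (not shown here) */
        }
        memcpy(out, state, sizeof(state));    /* words are host-endian here;
                                                 serialize big-endian for the
                                                 final digest bytes */
    }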