modinv32_impl.h (35378B)
1 /*********************************************************************** 2 * Copyright (c) 2020 Peter Dettman * 3 * Distributed under the MIT software license, see the accompanying * 4 * file COPYING or https://www.opensource.org/licenses/mit-license.php.* 5 **********************************************************************/ 6 7 #ifndef SECP256K1_MODINV32_IMPL_H 8 #define SECP256K1_MODINV32_IMPL_H 9 10 #include "modinv32.h" 11 12 #include "util.h" 13 14 #include <stdlib.h> 15 16 /* This file implements modular inversion based on the paper "Fast constant-time gcd computation and 17 * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang. 18 * 19 * For an explanation of the algorithm, see doc/safegcd_implementation.md. This file contains an 20 * implementation for N=30, using 30-bit signed limbs represented as int32_t. 21 */ 22 23 #ifdef VERIFY 24 static const haskellsecp256k1_v0_1_0_modinv32_signed30 SECP256K1_SIGNED30_ONE = {{1}}; 25 26 /* Compute a*factor and put it in r. All but the top limb in r will be in range [0,2^30). */ 27 static void haskellsecp256k1_v0_1_0_modinv32_mul_30(haskellsecp256k1_v0_1_0_modinv32_signed30 *r, const haskellsecp256k1_v0_1_0_modinv32_signed30 *a, int alen, int32_t factor) { 28 const int32_t M30 = (int32_t)(UINT32_MAX >> 2); 29 int64_t c = 0; 30 int i; 31 for (i = 0; i < 8; ++i) { 32 if (i < alen) c += (int64_t)a->v[i] * factor; 33 r->v[i] = (int32_t)c & M30; c >>= 30; 34 } 35 if (8 < alen) c += (int64_t)a->v[8] * factor; 36 VERIFY_CHECK(c == (int32_t)c); 37 r->v[8] = (int32_t)c; 38 } 39 40 /* Return -1 for a<b*factor, 0 for a==b*factor, 1 for a>b*factor. A consists of alen limbs; b has 9. */ 41 static int haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(const haskellsecp256k1_v0_1_0_modinv32_signed30 *a, int alen, const haskellsecp256k1_v0_1_0_modinv32_signed30 *b, int32_t factor) { 42 int i; 43 haskellsecp256k1_v0_1_0_modinv32_signed30 am, bm; 44 haskellsecp256k1_v0_1_0_modinv32_mul_30(&am, a, alen, 1); /* Normalize all but the top limb of a. */ 45 haskellsecp256k1_v0_1_0_modinv32_mul_30(&bm, b, 9, factor); 46 for (i = 0; i < 8; ++i) { 47 /* Verify that all but the top limb of a and b are normalized. */ 48 VERIFY_CHECK(am.v[i] >> 30 == 0); 49 VERIFY_CHECK(bm.v[i] >> 30 == 0); 50 } 51 for (i = 8; i >= 0; --i) { 52 if (am.v[i] < bm.v[i]) return -1; 53 if (am.v[i] > bm.v[i]) return 1; 54 } 55 return 0; 56 } 57 #endif 58 59 /* Take as input a signed30 number in range (-2*modulus,modulus), and add a multiple of the modulus 60 * to it to bring it to range [0,modulus). If sign < 0, the input will also be negated in the 61 * process. The input must have limbs in range (-2^30,2^30). The output will have limbs in range 62 * [0,2^30). */ 63 static void haskellsecp256k1_v0_1_0_modinv32_normalize_30(haskellsecp256k1_v0_1_0_modinv32_signed30 *r, int32_t sign, const haskellsecp256k1_v0_1_0_modinv32_modinfo *modinfo) { 64 const int32_t M30 = (int32_t)(UINT32_MAX >> 2); 65 int32_t r0 = r->v[0], r1 = r->v[1], r2 = r->v[2], r3 = r->v[3], r4 = r->v[4], 66 r5 = r->v[5], r6 = r->v[6], r7 = r->v[7], r8 = r->v[8]; 67 volatile int32_t cond_add, cond_negate; 68 69 #ifdef VERIFY 70 /* Verify that all limbs are in range (-2^30,2^30). */ 71 int i; 72 for (i = 0; i < 9; ++i) { 73 VERIFY_CHECK(r->v[i] >= -M30); 74 VERIFY_CHECK(r->v[i] <= M30); 75 } 76 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(r, 9, &modinfo->modulus, -2) > 0); /* r > -2*modulus */ 77 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(r, 9, &modinfo->modulus, 1) < 0); /* r < modulus */ 78 #endif 79 80 /* In a first step, add the modulus if the input is negative, and then negate if requested. 81 * This brings r from range (-2*modulus,modulus) to range (-modulus,modulus). As all input 82 * limbs are in range (-2^30,2^30), this cannot overflow an int32_t. Note that the right 83 * shifts below are signed sign-extending shifts (see assumptions.h for tests that that is 84 * indeed the behavior of the right shift operator). */ 85 cond_add = r8 >> 31; 86 r0 += modinfo->modulus.v[0] & cond_add; 87 r1 += modinfo->modulus.v[1] & cond_add; 88 r2 += modinfo->modulus.v[2] & cond_add; 89 r3 += modinfo->modulus.v[3] & cond_add; 90 r4 += modinfo->modulus.v[4] & cond_add; 91 r5 += modinfo->modulus.v[5] & cond_add; 92 r6 += modinfo->modulus.v[6] & cond_add; 93 r7 += modinfo->modulus.v[7] & cond_add; 94 r8 += modinfo->modulus.v[8] & cond_add; 95 cond_negate = sign >> 31; 96 r0 = (r0 ^ cond_negate) - cond_negate; 97 r1 = (r1 ^ cond_negate) - cond_negate; 98 r2 = (r2 ^ cond_negate) - cond_negate; 99 r3 = (r3 ^ cond_negate) - cond_negate; 100 r4 = (r4 ^ cond_negate) - cond_negate; 101 r5 = (r5 ^ cond_negate) - cond_negate; 102 r6 = (r6 ^ cond_negate) - cond_negate; 103 r7 = (r7 ^ cond_negate) - cond_negate; 104 r8 = (r8 ^ cond_negate) - cond_negate; 105 /* Propagate the top bits, to bring limbs back to range (-2^30,2^30). */ 106 r1 += r0 >> 30; r0 &= M30; 107 r2 += r1 >> 30; r1 &= M30; 108 r3 += r2 >> 30; r2 &= M30; 109 r4 += r3 >> 30; r3 &= M30; 110 r5 += r4 >> 30; r4 &= M30; 111 r6 += r5 >> 30; r5 &= M30; 112 r7 += r6 >> 30; r6 &= M30; 113 r8 += r7 >> 30; r7 &= M30; 114 115 /* In a second step add the modulus again if the result is still negative, bringing r to range 116 * [0,modulus). */ 117 cond_add = r8 >> 31; 118 r0 += modinfo->modulus.v[0] & cond_add; 119 r1 += modinfo->modulus.v[1] & cond_add; 120 r2 += modinfo->modulus.v[2] & cond_add; 121 r3 += modinfo->modulus.v[3] & cond_add; 122 r4 += modinfo->modulus.v[4] & cond_add; 123 r5 += modinfo->modulus.v[5] & cond_add; 124 r6 += modinfo->modulus.v[6] & cond_add; 125 r7 += modinfo->modulus.v[7] & cond_add; 126 r8 += modinfo->modulus.v[8] & cond_add; 127 /* And propagate again. */ 128 r1 += r0 >> 30; r0 &= M30; 129 r2 += r1 >> 30; r1 &= M30; 130 r3 += r2 >> 30; r2 &= M30; 131 r4 += r3 >> 30; r3 &= M30; 132 r5 += r4 >> 30; r4 &= M30; 133 r6 += r5 >> 30; r5 &= M30; 134 r7 += r6 >> 30; r6 &= M30; 135 r8 += r7 >> 30; r7 &= M30; 136 137 r->v[0] = r0; 138 r->v[1] = r1; 139 r->v[2] = r2; 140 r->v[3] = r3; 141 r->v[4] = r4; 142 r->v[5] = r5; 143 r->v[6] = r6; 144 r->v[7] = r7; 145 r->v[8] = r8; 146 147 VERIFY_CHECK(r0 >> 30 == 0); 148 VERIFY_CHECK(r1 >> 30 == 0); 149 VERIFY_CHECK(r2 >> 30 == 0); 150 VERIFY_CHECK(r3 >> 30 == 0); 151 VERIFY_CHECK(r4 >> 30 == 0); 152 VERIFY_CHECK(r5 >> 30 == 0); 153 VERIFY_CHECK(r6 >> 30 == 0); 154 VERIFY_CHECK(r7 >> 30 == 0); 155 VERIFY_CHECK(r8 >> 30 == 0); 156 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(r, 9, &modinfo->modulus, 0) >= 0); /* r >= 0 */ 157 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(r, 9, &modinfo->modulus, 1) < 0); /* r < modulus */ 158 } 159 160 /* Data type for transition matrices (see section 3 of explanation). 161 * 162 * t = [ u v ] 163 * [ q r ] 164 */ 165 typedef struct { 166 int32_t u, v, q, r; 167 } haskellsecp256k1_v0_1_0_modinv32_trans2x2; 168 169 /* Compute the transition matrix and zeta for 30 divsteps. 170 * 171 * Input: zeta: initial zeta 172 * f0: bottom limb of initial f 173 * g0: bottom limb of initial g 174 * Output: t: transition matrix 175 * Return: final zeta 176 * 177 * Implements the divsteps_n_matrix function from the explanation. 178 */ 179 static int32_t haskellsecp256k1_v0_1_0_modinv32_divsteps_30(int32_t zeta, uint32_t f0, uint32_t g0, haskellsecp256k1_v0_1_0_modinv32_trans2x2 *t) { 180 /* u,v,q,r are the elements of the transformation matrix being built up, 181 * starting with the identity matrix. Semantically they are signed integers 182 * in range [-2^30,2^30], but here represented as unsigned mod 2^32. This 183 * permits left shifting (which is UB for negative numbers). The range 184 * being inside [-2^31,2^31) means that casting to signed works correctly. 185 */ 186 uint32_t u = 1, v = 0, q = 0, r = 1; 187 volatile uint32_t c1, c2; 188 uint32_t mask1, mask2, f = f0, g = g0, x, y, z; 189 int i; 190 191 for (i = 0; i < 30; ++i) { 192 VERIFY_CHECK((f & 1) == 1); /* f must always be odd */ 193 VERIFY_CHECK((u * f0 + v * g0) == f << i); 194 VERIFY_CHECK((q * f0 + r * g0) == g << i); 195 /* Compute conditional masks for (zeta < 0) and for (g & 1). */ 196 c1 = zeta >> 31; 197 mask1 = c1; 198 c2 = g & 1; 199 mask2 = -c2; 200 /* Compute x,y,z, conditionally negated versions of f,u,v. */ 201 x = (f ^ mask1) - mask1; 202 y = (u ^ mask1) - mask1; 203 z = (v ^ mask1) - mask1; 204 /* Conditionally add x,y,z to g,q,r. */ 205 g += x & mask2; 206 q += y & mask2; 207 r += z & mask2; 208 /* In what follows, mask1 is a condition mask for (zeta < 0) and (g & 1). */ 209 mask1 &= mask2; 210 /* Conditionally change zeta into -zeta-2 or zeta-1. */ 211 zeta = (zeta ^ mask1) - 1; 212 /* Conditionally add g,q,r to f,u,v. */ 213 f += g & mask1; 214 u += q & mask1; 215 v += r & mask1; 216 /* Shifts */ 217 g >>= 1; 218 u <<= 1; 219 v <<= 1; 220 /* Bounds on zeta that follow from the bounds on iteration count (max 20*30 divsteps). */ 221 VERIFY_CHECK(zeta >= -601 && zeta <= 601); 222 } 223 /* Return data in t and return value. */ 224 t->u = (int32_t)u; 225 t->v = (int32_t)v; 226 t->q = (int32_t)q; 227 t->r = (int32_t)r; 228 /* The determinant of t must be a power of two. This guarantees that multiplication with t 229 * does not change the gcd of f and g, apart from adding a power-of-2 factor to it (which 230 * will be divided out again). As each divstep's individual matrix has determinant 2, the 231 * aggregate of 30 of them will have determinant 2^30. */ 232 VERIFY_CHECK((int64_t)t->u * t->r - (int64_t)t->v * t->q == ((int64_t)1) << 30); 233 return zeta; 234 } 235 236 /* haskellsecp256k1_v0_1_0_modinv32_inv256[i] = -(2*i+1)^-1 (mod 256) */ 237 static const uint8_t haskellsecp256k1_v0_1_0_modinv32_inv256[128] = { 238 0xFF, 0x55, 0x33, 0x49, 0xC7, 0x5D, 0x3B, 0x11, 0x0F, 0xE5, 0xC3, 0x59, 239 0xD7, 0xED, 0xCB, 0x21, 0x1F, 0x75, 0x53, 0x69, 0xE7, 0x7D, 0x5B, 0x31, 240 0x2F, 0x05, 0xE3, 0x79, 0xF7, 0x0D, 0xEB, 0x41, 0x3F, 0x95, 0x73, 0x89, 241 0x07, 0x9D, 0x7B, 0x51, 0x4F, 0x25, 0x03, 0x99, 0x17, 0x2D, 0x0B, 0x61, 242 0x5F, 0xB5, 0x93, 0xA9, 0x27, 0xBD, 0x9B, 0x71, 0x6F, 0x45, 0x23, 0xB9, 243 0x37, 0x4D, 0x2B, 0x81, 0x7F, 0xD5, 0xB3, 0xC9, 0x47, 0xDD, 0xBB, 0x91, 244 0x8F, 0x65, 0x43, 0xD9, 0x57, 0x6D, 0x4B, 0xA1, 0x9F, 0xF5, 0xD3, 0xE9, 245 0x67, 0xFD, 0xDB, 0xB1, 0xAF, 0x85, 0x63, 0xF9, 0x77, 0x8D, 0x6B, 0xC1, 246 0xBF, 0x15, 0xF3, 0x09, 0x87, 0x1D, 0xFB, 0xD1, 0xCF, 0xA5, 0x83, 0x19, 247 0x97, 0xAD, 0x8B, 0xE1, 0xDF, 0x35, 0x13, 0x29, 0xA7, 0x3D, 0x1B, 0xF1, 248 0xEF, 0xC5, 0xA3, 0x39, 0xB7, 0xCD, 0xAB, 0x01 249 }; 250 251 /* Compute the transition matrix and eta for 30 divsteps (variable time). 252 * 253 * Input: eta: initial eta 254 * f0: bottom limb of initial f 255 * g0: bottom limb of initial g 256 * Output: t: transition matrix 257 * Return: final eta 258 * 259 * Implements the divsteps_n_matrix_var function from the explanation. 260 */ 261 static int32_t haskellsecp256k1_v0_1_0_modinv32_divsteps_30_var(int32_t eta, uint32_t f0, uint32_t g0, haskellsecp256k1_v0_1_0_modinv32_trans2x2 *t) { 262 /* Transformation matrix; see comments in haskellsecp256k1_v0_1_0_modinv32_divsteps_30. */ 263 uint32_t u = 1, v = 0, q = 0, r = 1; 264 uint32_t f = f0, g = g0, m; 265 uint16_t w; 266 int i = 30, limit, zeros; 267 268 for (;;) { 269 /* Use a sentinel bit to count zeros only up to i. */ 270 zeros = haskellsecp256k1_v0_1_0_ctz32_var(g | (UINT32_MAX << i)); 271 /* Perform zeros divsteps at once; they all just divide g by two. */ 272 g >>= zeros; 273 u <<= zeros; 274 v <<= zeros; 275 eta -= zeros; 276 i -= zeros; 277 /* We're done once we've done 30 divsteps. */ 278 if (i == 0) break; 279 VERIFY_CHECK((f & 1) == 1); 280 VERIFY_CHECK((g & 1) == 1); 281 VERIFY_CHECK((u * f0 + v * g0) == f << (30 - i)); 282 VERIFY_CHECK((q * f0 + r * g0) == g << (30 - i)); 283 /* Bounds on eta that follow from the bounds on iteration count (max 25*30 divsteps). */ 284 VERIFY_CHECK(eta >= -751 && eta <= 751); 285 /* If eta is negative, negate it and replace f,g with g,-f. */ 286 if (eta < 0) { 287 uint32_t tmp; 288 eta = -eta; 289 tmp = f; f = g; g = -tmp; 290 tmp = u; u = q; q = -tmp; 291 tmp = v; v = r; r = -tmp; 292 } 293 /* eta is now >= 0. In what follows we're going to cancel out the bottom bits of g. No more 294 * than i can be cancelled out (as we'd be done before that point), and no more than eta+1 295 * can be done as its sign will flip once that happens. */ 296 limit = ((int)eta + 1) > i ? i : ((int)eta + 1); 297 /* m is a mask for the bottom min(limit, 8) bits (our table only supports 8 bits). */ 298 VERIFY_CHECK(limit > 0 && limit <= 30); 299 m = (UINT32_MAX >> (32 - limit)) & 255U; 300 /* Find what multiple of f must be added to g to cancel its bottom min(limit, 8) bits. */ 301 w = (g * haskellsecp256k1_v0_1_0_modinv32_inv256[(f >> 1) & 127]) & m; 302 /* Do so. */ 303 g += f * w; 304 q += u * w; 305 r += v * w; 306 VERIFY_CHECK((g & m) == 0); 307 } 308 /* Return data in t and return value. */ 309 t->u = (int32_t)u; 310 t->v = (int32_t)v; 311 t->q = (int32_t)q; 312 t->r = (int32_t)r; 313 /* The determinant of t must be a power of two. This guarantees that multiplication with t 314 * does not change the gcd of f and g, apart from adding a power-of-2 factor to it (which 315 * will be divided out again). As each divstep's individual matrix has determinant 2, the 316 * aggregate of 30 of them will have determinant 2^30. */ 317 VERIFY_CHECK((int64_t)t->u * t->r - (int64_t)t->v * t->q == ((int64_t)1) << 30); 318 return eta; 319 } 320 321 /* Compute the transition matrix and eta for 30 posdivsteps (variable time, eta=-delta), and keeps track 322 * of the Jacobi symbol along the way. f0 and g0 must be f and g mod 2^32 rather than 2^30, because 323 * Jacobi tracking requires knowing (f mod 8) rather than just (f mod 2). 324 * 325 * Input: eta: initial eta 326 * f0: bottom limb of initial f 327 * g0: bottom limb of initial g 328 * Output: t: transition matrix 329 * Input/Output: (*jacp & 1) is bitflipped if and only if the Jacobi symbol of (f | g) changes sign 330 * by applying the returned transformation matrix to it. The other bits of *jacp may 331 * change, but are meaningless. 332 * Return: final eta 333 */ 334 static int32_t haskellsecp256k1_v0_1_0_modinv32_posdivsteps_30_var(int32_t eta, uint32_t f0, uint32_t g0, haskellsecp256k1_v0_1_0_modinv32_trans2x2 *t, int *jacp) { 335 /* Transformation matrix. */ 336 uint32_t u = 1, v = 0, q = 0, r = 1; 337 uint32_t f = f0, g = g0, m; 338 uint16_t w; 339 int i = 30, limit, zeros; 340 int jac = *jacp; 341 342 for (;;) { 343 /* Use a sentinel bit to count zeros only up to i. */ 344 zeros = haskellsecp256k1_v0_1_0_ctz32_var(g | (UINT32_MAX << i)); 345 /* Perform zeros divsteps at once; they all just divide g by two. */ 346 g >>= zeros; 347 u <<= zeros; 348 v <<= zeros; 349 eta -= zeros; 350 i -= zeros; 351 /* Update the bottom bit of jac: when dividing g by an odd power of 2, 352 * if (f mod 8) is 3 or 5, the Jacobi symbol changes sign. */ 353 jac ^= (zeros & ((f >> 1) ^ (f >> 2))); 354 /* We're done once we've done 30 posdivsteps. */ 355 if (i == 0) break; 356 VERIFY_CHECK((f & 1) == 1); 357 VERIFY_CHECK((g & 1) == 1); 358 VERIFY_CHECK((u * f0 + v * g0) == f << (30 - i)); 359 VERIFY_CHECK((q * f0 + r * g0) == g << (30 - i)); 360 /* If eta is negative, negate it and replace f,g with g,f. */ 361 if (eta < 0) { 362 uint32_t tmp; 363 eta = -eta; 364 /* Update bottom bit of jac: when swapping f and g, the Jacobi symbol changes sign 365 * if both f and g are 3 mod 4. */ 366 jac ^= ((f & g) >> 1); 367 tmp = f; f = g; g = tmp; 368 tmp = u; u = q; q = tmp; 369 tmp = v; v = r; r = tmp; 370 } 371 /* eta is now >= 0. In what follows we're going to cancel out the bottom bits of g. No more 372 * than i can be cancelled out (as we'd be done before that point), and no more than eta+1 373 * can be done as its sign will flip once that happens. */ 374 limit = ((int)eta + 1) > i ? i : ((int)eta + 1); 375 /* m is a mask for the bottom min(limit, 8) bits (our table only supports 8 bits). */ 376 VERIFY_CHECK(limit > 0 && limit <= 30); 377 m = (UINT32_MAX >> (32 - limit)) & 255U; 378 /* Find what multiple of f must be added to g to cancel its bottom min(limit, 8) bits. */ 379 w = (g * haskellsecp256k1_v0_1_0_modinv32_inv256[(f >> 1) & 127]) & m; 380 /* Do so. */ 381 g += f * w; 382 q += u * w; 383 r += v * w; 384 VERIFY_CHECK((g & m) == 0); 385 } 386 /* Return data in t and return value. */ 387 t->u = (int32_t)u; 388 t->v = (int32_t)v; 389 t->q = (int32_t)q; 390 t->r = (int32_t)r; 391 /* The determinant of t must be a power of two. This guarantees that multiplication with t 392 * does not change the gcd of f and g, apart from adding a power-of-2 factor to it (which 393 * will be divided out again). As each divstep's individual matrix has determinant 2 or -2, 394 * the aggregate of 30 of them will have determinant 2^30 or -2^30. */ 395 VERIFY_CHECK((int64_t)t->u * t->r - (int64_t)t->v * t->q == ((int64_t)1) << 30 || 396 (int64_t)t->u * t->r - (int64_t)t->v * t->q == -(((int64_t)1) << 30)); 397 *jacp = jac; 398 return eta; 399 } 400 401 /* Compute (t/2^30) * [d, e] mod modulus, where t is a transition matrix for 30 divsteps. 402 * 403 * On input and output, d and e are in range (-2*modulus,modulus). All output limbs will be in range 404 * (-2^30,2^30). 405 * 406 * This implements the update_de function from the explanation. 407 */ 408 static void haskellsecp256k1_v0_1_0_modinv32_update_de_30(haskellsecp256k1_v0_1_0_modinv32_signed30 *d, haskellsecp256k1_v0_1_0_modinv32_signed30 *e, const haskellsecp256k1_v0_1_0_modinv32_trans2x2 *t, const haskellsecp256k1_v0_1_0_modinv32_modinfo* modinfo) { 409 const int32_t M30 = (int32_t)(UINT32_MAX >> 2); 410 const int32_t u = t->u, v = t->v, q = t->q, r = t->r; 411 int32_t di, ei, md, me, sd, se; 412 int64_t cd, ce; 413 int i; 414 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(d, 9, &modinfo->modulus, -2) > 0); /* d > -2*modulus */ 415 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(d, 9, &modinfo->modulus, 1) < 0); /* d < modulus */ 416 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(e, 9, &modinfo->modulus, -2) > 0); /* e > -2*modulus */ 417 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(e, 9, &modinfo->modulus, 1) < 0); /* e < modulus */ 418 VERIFY_CHECK(labs(u) <= (M30 + 1 - labs(v))); /* |u|+|v| <= 2^30 */ 419 VERIFY_CHECK(labs(q) <= (M30 + 1 - labs(r))); /* |q|+|r| <= 2^30 */ 420 421 /* [md,me] start as zero; plus [u,q] if d is negative; plus [v,r] if e is negative. */ 422 sd = d->v[8] >> 31; 423 se = e->v[8] >> 31; 424 md = (u & sd) + (v & se); 425 me = (q & sd) + (r & se); 426 /* Begin computing t*[d,e]. */ 427 di = d->v[0]; 428 ei = e->v[0]; 429 cd = (int64_t)u * di + (int64_t)v * ei; 430 ce = (int64_t)q * di + (int64_t)r * ei; 431 /* Correct md,me so that t*[d,e]+modulus*[md,me] has 30 zero bottom bits. */ 432 md -= (modinfo->modulus_inv30 * (uint32_t)cd + md) & M30; 433 me -= (modinfo->modulus_inv30 * (uint32_t)ce + me) & M30; 434 /* Update the beginning of computation for t*[d,e]+modulus*[md,me] now md,me are known. */ 435 cd += (int64_t)modinfo->modulus.v[0] * md; 436 ce += (int64_t)modinfo->modulus.v[0] * me; 437 /* Verify that the low 30 bits of the computation are indeed zero, and then throw them away. */ 438 VERIFY_CHECK(((int32_t)cd & M30) == 0); cd >>= 30; 439 VERIFY_CHECK(((int32_t)ce & M30) == 0); ce >>= 30; 440 /* Now iteratively compute limb i=1..8 of t*[d,e]+modulus*[md,me], and store them in output 441 * limb i-1 (shifting down by 30 bits). */ 442 for (i = 1; i < 9; ++i) { 443 di = d->v[i]; 444 ei = e->v[i]; 445 cd += (int64_t)u * di + (int64_t)v * ei; 446 ce += (int64_t)q * di + (int64_t)r * ei; 447 cd += (int64_t)modinfo->modulus.v[i] * md; 448 ce += (int64_t)modinfo->modulus.v[i] * me; 449 d->v[i - 1] = (int32_t)cd & M30; cd >>= 30; 450 e->v[i - 1] = (int32_t)ce & M30; ce >>= 30; 451 } 452 /* What remains is limb 9 of t*[d,e]+modulus*[md,me]; store it as output limb 8. */ 453 d->v[8] = (int32_t)cd; 454 e->v[8] = (int32_t)ce; 455 456 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(d, 9, &modinfo->modulus, -2) > 0); /* d > -2*modulus */ 457 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(d, 9, &modinfo->modulus, 1) < 0); /* d < modulus */ 458 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(e, 9, &modinfo->modulus, -2) > 0); /* e > -2*modulus */ 459 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(e, 9, &modinfo->modulus, 1) < 0); /* e < modulus */ 460 } 461 462 /* Compute (t/2^30) * [f, g], where t is a transition matrix for 30 divsteps. 463 * 464 * This implements the update_fg function from the explanation. 465 */ 466 static void haskellsecp256k1_v0_1_0_modinv32_update_fg_30(haskellsecp256k1_v0_1_0_modinv32_signed30 *f, haskellsecp256k1_v0_1_0_modinv32_signed30 *g, const haskellsecp256k1_v0_1_0_modinv32_trans2x2 *t) { 467 const int32_t M30 = (int32_t)(UINT32_MAX >> 2); 468 const int32_t u = t->u, v = t->v, q = t->q, r = t->r; 469 int32_t fi, gi; 470 int64_t cf, cg; 471 int i; 472 /* Start computing t*[f,g]. */ 473 fi = f->v[0]; 474 gi = g->v[0]; 475 cf = (int64_t)u * fi + (int64_t)v * gi; 476 cg = (int64_t)q * fi + (int64_t)r * gi; 477 /* Verify that the bottom 30 bits of the result are zero, and then throw them away. */ 478 VERIFY_CHECK(((int32_t)cf & M30) == 0); cf >>= 30; 479 VERIFY_CHECK(((int32_t)cg & M30) == 0); cg >>= 30; 480 /* Now iteratively compute limb i=1..8 of t*[f,g], and store them in output limb i-1 (shifting 481 * down by 30 bits). */ 482 for (i = 1; i < 9; ++i) { 483 fi = f->v[i]; 484 gi = g->v[i]; 485 cf += (int64_t)u * fi + (int64_t)v * gi; 486 cg += (int64_t)q * fi + (int64_t)r * gi; 487 f->v[i - 1] = (int32_t)cf & M30; cf >>= 30; 488 g->v[i - 1] = (int32_t)cg & M30; cg >>= 30; 489 } 490 /* What remains is limb 9 of t*[f,g]; store it as output limb 8. */ 491 f->v[8] = (int32_t)cf; 492 g->v[8] = (int32_t)cg; 493 } 494 495 /* Compute (t/2^30) * [f, g], where t is a transition matrix for 30 divsteps. 496 * 497 * Version that operates on a variable number of limbs in f and g. 498 * 499 * This implements the update_fg function from the explanation in modinv64_impl.h. 500 */ 501 static void haskellsecp256k1_v0_1_0_modinv32_update_fg_30_var(int len, haskellsecp256k1_v0_1_0_modinv32_signed30 *f, haskellsecp256k1_v0_1_0_modinv32_signed30 *g, const haskellsecp256k1_v0_1_0_modinv32_trans2x2 *t) { 502 const int32_t M30 = (int32_t)(UINT32_MAX >> 2); 503 const int32_t u = t->u, v = t->v, q = t->q, r = t->r; 504 int32_t fi, gi; 505 int64_t cf, cg; 506 int i; 507 VERIFY_CHECK(len > 0); 508 /* Start computing t*[f,g]. */ 509 fi = f->v[0]; 510 gi = g->v[0]; 511 cf = (int64_t)u * fi + (int64_t)v * gi; 512 cg = (int64_t)q * fi + (int64_t)r * gi; 513 /* Verify that the bottom 62 bits of the result are zero, and then throw them away. */ 514 VERIFY_CHECK(((int32_t)cf & M30) == 0); cf >>= 30; 515 VERIFY_CHECK(((int32_t)cg & M30) == 0); cg >>= 30; 516 /* Now iteratively compute limb i=1..len of t*[f,g], and store them in output limb i-1 (shifting 517 * down by 30 bits). */ 518 for (i = 1; i < len; ++i) { 519 fi = f->v[i]; 520 gi = g->v[i]; 521 cf += (int64_t)u * fi + (int64_t)v * gi; 522 cg += (int64_t)q * fi + (int64_t)r * gi; 523 f->v[i - 1] = (int32_t)cf & M30; cf >>= 30; 524 g->v[i - 1] = (int32_t)cg & M30; cg >>= 30; 525 } 526 /* What remains is limb (len) of t*[f,g]; store it as output limb (len-1). */ 527 f->v[len - 1] = (int32_t)cf; 528 g->v[len - 1] = (int32_t)cg; 529 } 530 531 /* Compute the inverse of x modulo modinfo->modulus, and replace x with it (constant time in x). */ 532 static void haskellsecp256k1_v0_1_0_modinv32(haskellsecp256k1_v0_1_0_modinv32_signed30 *x, const haskellsecp256k1_v0_1_0_modinv32_modinfo *modinfo) { 533 /* Start with d=0, e=1, f=modulus, g=x, zeta=-1. */ 534 haskellsecp256k1_v0_1_0_modinv32_signed30 d = {{0}}; 535 haskellsecp256k1_v0_1_0_modinv32_signed30 e = {{1}}; 536 haskellsecp256k1_v0_1_0_modinv32_signed30 f = modinfo->modulus; 537 haskellsecp256k1_v0_1_0_modinv32_signed30 g = *x; 538 int i; 539 int32_t zeta = -1; /* zeta = -(delta+1/2); delta is initially 1/2. */ 540 541 /* Do 20 iterations of 30 divsteps each = 600 divsteps. 590 suffices for 256-bit inputs. */ 542 for (i = 0; i < 20; ++i) { 543 /* Compute transition matrix and new zeta after 30 divsteps. */ 544 haskellsecp256k1_v0_1_0_modinv32_trans2x2 t; 545 zeta = haskellsecp256k1_v0_1_0_modinv32_divsteps_30(zeta, f.v[0], g.v[0], &t); 546 /* Update d,e using that transition matrix. */ 547 haskellsecp256k1_v0_1_0_modinv32_update_de_30(&d, &e, &t, modinfo); 548 /* Update f,g using that transition matrix. */ 549 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&f, 9, &modinfo->modulus, -1) > 0); /* f > -modulus */ 550 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&f, 9, &modinfo->modulus, 1) <= 0); /* f <= modulus */ 551 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&g, 9, &modinfo->modulus, -1) > 0); /* g > -modulus */ 552 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&g, 9, &modinfo->modulus, 1) < 0); /* g < modulus */ 553 554 haskellsecp256k1_v0_1_0_modinv32_update_fg_30(&f, &g, &t); 555 556 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&f, 9, &modinfo->modulus, -1) > 0); /* f > -modulus */ 557 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&f, 9, &modinfo->modulus, 1) <= 0); /* f <= modulus */ 558 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&g, 9, &modinfo->modulus, -1) > 0); /* g > -modulus */ 559 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&g, 9, &modinfo->modulus, 1) < 0); /* g < modulus */ 560 } 561 562 /* At this point sufficient iterations have been performed that g must have reached 0 563 * and (if g was not originally 0) f must now equal +/- GCD of the initial f, g 564 * values i.e. +/- 1, and d now contains +/- the modular inverse. */ 565 566 /* g == 0 */ 567 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&g, 9, &SECP256K1_SIGNED30_ONE, 0) == 0); 568 /* |f| == 1, or (x == 0 and d == 0 and |f|=modulus) */ 569 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&f, 9, &SECP256K1_SIGNED30_ONE, -1) == 0 || 570 haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&f, 9, &SECP256K1_SIGNED30_ONE, 1) == 0 || 571 (haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(x, 9, &SECP256K1_SIGNED30_ONE, 0) == 0 && 572 haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&d, 9, &SECP256K1_SIGNED30_ONE, 0) == 0 && 573 (haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&f, 9, &modinfo->modulus, 1) == 0 || 574 haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&f, 9, &modinfo->modulus, -1) == 0))); 575 576 /* Optionally negate d, normalize to [0,modulus), and return it. */ 577 haskellsecp256k1_v0_1_0_modinv32_normalize_30(&d, f.v[8], modinfo); 578 *x = d; 579 } 580 581 /* Compute the inverse of x modulo modinfo->modulus, and replace x with it (variable time). */ 582 static void haskellsecp256k1_v0_1_0_modinv32_var(haskellsecp256k1_v0_1_0_modinv32_signed30 *x, const haskellsecp256k1_v0_1_0_modinv32_modinfo *modinfo) { 583 /* Start with d=0, e=1, f=modulus, g=x, eta=-1. */ 584 haskellsecp256k1_v0_1_0_modinv32_signed30 d = {{0, 0, 0, 0, 0, 0, 0, 0, 0}}; 585 haskellsecp256k1_v0_1_0_modinv32_signed30 e = {{1, 0, 0, 0, 0, 0, 0, 0, 0}}; 586 haskellsecp256k1_v0_1_0_modinv32_signed30 f = modinfo->modulus; 587 haskellsecp256k1_v0_1_0_modinv32_signed30 g = *x; 588 #ifdef VERIFY 589 int i = 0; 590 #endif 591 int j, len = 9; 592 int32_t eta = -1; /* eta = -delta; delta is initially 1 (faster for the variable-time code) */ 593 int32_t cond, fn, gn; 594 595 /* Do iterations of 30 divsteps each until g=0. */ 596 while (1) { 597 /* Compute transition matrix and new eta after 30 divsteps. */ 598 haskellsecp256k1_v0_1_0_modinv32_trans2x2 t; 599 eta = haskellsecp256k1_v0_1_0_modinv32_divsteps_30_var(eta, f.v[0], g.v[0], &t); 600 /* Update d,e using that transition matrix. */ 601 haskellsecp256k1_v0_1_0_modinv32_update_de_30(&d, &e, &t, modinfo); 602 /* Update f,g using that transition matrix. */ 603 604 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, -1) > 0); /* f > -modulus */ 605 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, 1) <= 0); /* f <= modulus */ 606 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&g, len, &modinfo->modulus, -1) > 0); /* g > -modulus */ 607 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&g, len, &modinfo->modulus, 1) < 0); /* g < modulus */ 608 609 haskellsecp256k1_v0_1_0_modinv32_update_fg_30_var(len, &f, &g, &t); 610 /* If the bottom limb of g is 0, there is a chance g=0. */ 611 if (g.v[0] == 0) { 612 cond = 0; 613 /* Check if all other limbs are also 0. */ 614 for (j = 1; j < len; ++j) { 615 cond |= g.v[j]; 616 } 617 /* If so, we're done. */ 618 if (cond == 0) break; 619 } 620 621 /* Determine if len>1 and limb (len-1) of both f and g is 0 or -1. */ 622 fn = f.v[len - 1]; 623 gn = g.v[len - 1]; 624 cond = ((int32_t)len - 2) >> 31; 625 cond |= fn ^ (fn >> 31); 626 cond |= gn ^ (gn >> 31); 627 /* If so, reduce length, propagating the sign of f and g's top limb into the one below. */ 628 if (cond == 0) { 629 f.v[len - 2] |= (uint32_t)fn << 30; 630 g.v[len - 2] |= (uint32_t)gn << 30; 631 --len; 632 } 633 634 VERIFY_CHECK(++i < 25); /* We should never need more than 25*30 = 750 divsteps */ 635 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, -1) > 0); /* f > -modulus */ 636 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, 1) <= 0); /* f <= modulus */ 637 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&g, len, &modinfo->modulus, -1) > 0); /* g > -modulus */ 638 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&g, len, &modinfo->modulus, 1) < 0); /* g < modulus */ 639 } 640 641 /* At this point g is 0 and (if g was not originally 0) f must now equal +/- GCD of 642 * the initial f, g values i.e. +/- 1, and d now contains +/- the modular inverse. */ 643 644 /* g == 0 */ 645 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&g, len, &SECP256K1_SIGNED30_ONE, 0) == 0); 646 /* |f| == 1, or (x == 0 and d == 0 and |f|=modulus) */ 647 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&f, len, &SECP256K1_SIGNED30_ONE, -1) == 0 || 648 haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&f, len, &SECP256K1_SIGNED30_ONE, 1) == 0 || 649 (haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(x, 9, &SECP256K1_SIGNED30_ONE, 0) == 0 && 650 haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&d, 9, &SECP256K1_SIGNED30_ONE, 0) == 0 && 651 (haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, 1) == 0 || 652 haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, -1) == 0))); 653 654 /* Optionally negate d, normalize to [0,modulus), and return it. */ 655 haskellsecp256k1_v0_1_0_modinv32_normalize_30(&d, f.v[len - 1], modinfo); 656 *x = d; 657 } 658 659 /* Do up to 50 iterations of 30 posdivsteps (up to 1500 steps; more is extremely rare) each until f=1. 660 * In VERIFY mode use a lower number of iterations (750, close to the median 756), so failure actually occurs. */ 661 #ifdef VERIFY 662 #define JACOBI32_ITERATIONS 25 663 #else 664 #define JACOBI32_ITERATIONS 50 665 #endif 666 667 /* Compute the Jacobi symbol of x modulo modinfo->modulus (variable time). gcd(x,modulus) must be 1. */ 668 static int haskellsecp256k1_v0_1_0_jacobi32_maybe_var(const haskellsecp256k1_v0_1_0_modinv32_signed30 *x, const haskellsecp256k1_v0_1_0_modinv32_modinfo *modinfo) { 669 /* Start with f=modulus, g=x, eta=-1. */ 670 haskellsecp256k1_v0_1_0_modinv32_signed30 f = modinfo->modulus; 671 haskellsecp256k1_v0_1_0_modinv32_signed30 g = *x; 672 int j, len = 9; 673 int32_t eta = -1; /* eta = -delta; delta is initially 1 */ 674 int32_t cond, fn, gn; 675 int jac = 0; 676 int count; 677 678 /* The input limbs must all be non-negative. */ 679 VERIFY_CHECK(g.v[0] >= 0 && g.v[1] >= 0 && g.v[2] >= 0 && g.v[3] >= 0 && g.v[4] >= 0 && g.v[5] >= 0 && g.v[6] >= 0 && g.v[7] >= 0 && g.v[8] >= 0); 680 681 /* If x > 0, then if the loop below converges, it converges to f=g=gcd(x,modulus). Since we 682 * require that gcd(x,modulus)=1 and modulus>=3, x cannot be 0. Thus, we must reach f=1 (or 683 * time out). */ 684 VERIFY_CHECK((g.v[0] | g.v[1] | g.v[2] | g.v[3] | g.v[4] | g.v[5] | g.v[6] | g.v[7] | g.v[8]) != 0); 685 686 for (count = 0; count < JACOBI32_ITERATIONS; ++count) { 687 /* Compute transition matrix and new eta after 30 posdivsteps. */ 688 haskellsecp256k1_v0_1_0_modinv32_trans2x2 t; 689 eta = haskellsecp256k1_v0_1_0_modinv32_posdivsteps_30_var(eta, f.v[0] | ((uint32_t)f.v[1] << 30), g.v[0] | ((uint32_t)g.v[1] << 30), &t, &jac); 690 /* Update f,g using that transition matrix. */ 691 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, 0) > 0); /* f > 0 */ 692 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, 1) <= 0); /* f <= modulus */ 693 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&g, len, &modinfo->modulus, 0) > 0); /* g > 0 */ 694 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&g, len, &modinfo->modulus, 1) < 0); /* g < modulus */ 695 696 haskellsecp256k1_v0_1_0_modinv32_update_fg_30_var(len, &f, &g, &t); 697 /* If the bottom limb of f is 1, there is a chance that f=1. */ 698 if (f.v[0] == 1) { 699 cond = 0; 700 /* Check if the other limbs are also 0. */ 701 for (j = 1; j < len; ++j) { 702 cond |= f.v[j]; 703 } 704 /* If so, we're done. If f=1, the Jacobi symbol (g | f)=1. */ 705 if (cond == 0) return 1 - 2*(jac & 1); 706 } 707 708 /* Determine if len>1 and limb (len-1) of both f and g is 0. */ 709 fn = f.v[len - 1]; 710 gn = g.v[len - 1]; 711 cond = ((int32_t)len - 2) >> 31; 712 cond |= fn; 713 cond |= gn; 714 /* If so, reduce length. */ 715 if (cond == 0) --len; 716 717 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, 0) > 0); /* f > 0 */ 718 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, 1) <= 0); /* f <= modulus */ 719 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&g, len, &modinfo->modulus, 0) > 0); /* g > 0 */ 720 VERIFY_CHECK(haskellsecp256k1_v0_1_0_modinv32_mul_cmp_30(&g, len, &modinfo->modulus, 1) < 0); /* g < modulus */ 721 } 722 723 /* The loop failed to converge to f=g after 1500 iterations. Return 0, indicating unknown result. */ 724 return 0; 725 } 726 727 #endif /* SECP256K1_MODINV32_IMPL_H */