field_10x26_arm.s (28449B)
@ vim: set tabstop=8 softtabstop=8 shiftwidth=8 noexpandtab syntax=armasm:
/***********************************************************************
 * Copyright (c) 2014 Wladimir J. van der Laan                         *
 * Distributed under the MIT software license, see the accompanying    *
 * file COPYING or https://www.opensource.org/licenses/mit-license.php.*
 ***********************************************************************/
/*
ARM implementation of field_10x26 inner loops.

Notes:

- To avoid unnecessary loads and make use of the available registers, the
  'passes' (labelled A..J below) are always interleaved in pairs: the odd
  pass accumulates c' and d', which are added to c and d respectively in
  the following even pass.

- c, d, c' and d' are 64-bit accumulators held in register pairs, low word
  first (c = r3:r4, d = r5:r6, c' = r11:r12, d' = r9:r10).

- After each pass the low 26 bits of d (called u) are folded into c as
  c += u * R0, and after c >>= 26, c += u * R1. The low 26 bits of c (called
  t) are parked on the stack until the final write-back.
  NOTE(review): R0 + (R1 << 26) = 0x1000003D10, presumably 2^260 reduced
  modulo the secp256k1 field prime; this file does not say so - confirm
  against the C implementation.

- NOTE(review): the input limb bounds are not stated in this file. The code
  relies on sums of up to ten 32x32 products fitting in 64 bits; confirm the
  precondition with the C caller.

*/

	.syntax unified
	@ eabi attributes - see readelf -A
	.eabi_attribute 24, 1		@ Tag_ABI_align_needed = 8-byte
	.eabi_attribute 25, 1		@ Tag_ABI_align_preserved = 8-byte, except leaf SP
	.text

	@ Field constants
	.set field_R0, 0x3d10		@ multiplier for the low fold (c += u * R0)
	.set field_R1, 0x400		@ multiplier for the next fold (c += u * R1)
	.set field_not_M, 0xfc000000	@ ~M = ~0x3ffffff; 'bic x, y, field_not_M' keeps the low 26 bits

	.align	2
	.global haskellsecp256k1_v0_1_0_fe_mul_inner
	.type	haskellsecp256k1_v0_1_0_fe_mul_inner, %function
	.hidden haskellsecp256k1_v0_1_0_fe_mul_inner
	@ haskellsecp256k1_v0_1_0_fe_mul_inner
	@
	@ Multiplies the ten 32-bit limbs a[0..9] by the ten limbs b[0..9] and
	@ stores ten result limbs r[0..9]. r[0], r[1] and r[3..8] are stored
	@ masked to 26 bits, r[9] to 22 bits, and r[2] receives the remaining
	@ low word unmasked.
	@
	@ Arguments:
	@  r0  r	 Restrict: can overlap with a, not with b
	@  r1  a
	@  r2  b
	@ Stack (total 4+10*4 = 44)
	@  sp + #0        saved 'r' pointer
	@  sp + #4 + 4*X  t0,t1,t2,t3,t4,t5,t6,t7,u8,t9
	@ Registers:
	@  r4-r11 and r14 are saved on entry and restored on exit (r14 is popped
	@  into pc). r0-r3 and r12 are overwritten. The function makes no calls.
	@  NOTE(review): nine pushed words (36 bytes) plus the 48-byte frame leave
	@  sp at 4 mod 8 if it was 8-byte aligned on entry; this matches the
	@  "except leaf SP" attribute above, and only word-sized accesses are used.
haskellsecp256k1_v0_1_0_fe_mul_inner:
	stmfd	sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r14}
	sub	sp, sp, #48		@ frame=44 + alignment
	str	r0, [sp, #0]		@ save result address, we need it only at the end

	/******************************************
	 * Main computation code.
	 ******************************************

	Allocation:
	    r0,r14,r7,r8   scratch
	    r1       a (pointer)
	    r2       b (pointer)
	    r3:r4    c
	    r5:r6    d
	    r11:r12  c'
	    r9:r10   d'

	Note: do not write to r[] here, it may overlap with a[]
	*/

	/* A - interleaved with B */
	@ d  collects the products with i + j = 9,
	@ d' collects the products with i + j = 10 (added to d in B).
	ldr	r7, [r1, #0*4]		@ a[0]
	ldr	r8, [r2, #9*4]		@ b[9]
	ldr	r0, [r1, #1*4]		@ a[1]
	umull	r5, r6, r7, r8		@ d = a[0] * b[9]
	ldr	r14, [r2, #8*4]		@ b[8]
	umull	r9, r10, r0, r8		@ d' = a[1] * b[9]
	ldr	r7, [r1, #2*4]		@ a[2]
	umlal	r5, r6, r0, r14		@ d += a[1] * b[8]
	ldr	r8, [r2, #7*4]		@ b[7]
	umlal	r9, r10, r7, r14	@ d' += a[2] * b[8]
	ldr	r0, [r1, #3*4]		@ a[3]
	umlal	r5, r6, r7, r8		@ d += a[2] * b[7]
	ldr	r14, [r2, #6*4]		@ b[6]
	umlal	r9, r10, r0, r8		@ d' += a[3] * b[7]
	ldr	r7, [r1, #4*4]		@ a[4]
	umlal	r5, r6, r0, r14		@ d += a[3] * b[6]
	ldr	r8, [r2, #5*4]		@ b[5]
	umlal	r9, r10, r7, r14	@ d' += a[4] * b[6]
	ldr	r0, [r1, #5*4]		@ a[5]
	umlal	r5, r6, r7, r8		@ d += a[4] * b[5]
	ldr	r14, [r2, #4*4]		@ b[4]
	umlal	r9, r10, r0, r8		@ d' += a[5] * b[5]
	ldr	r7, [r1, #6*4]		@ a[6]
	umlal	r5, r6, r0, r14		@ d += a[5] * b[4]
	ldr	r8, [r2, #3*4]		@ b[3]
	umlal	r9, r10, r7, r14	@ d' += a[6] * b[4]
	ldr	r0, [r1, #7*4]		@ a[7]
	umlal	r5, r6, r7, r8		@ d += a[6] * b[3]
	ldr	r14, [r2, #2*4]		@ b[2]
	umlal	r9, r10, r0, r8		@ d' += a[7] * b[3]
	ldr	r7, [r1, #8*4]		@ a[8]
	umlal	r5, r6, r0, r14		@ d += a[7] * b[2]
	ldr	r8, [r2, #1*4]		@ b[1]
	umlal	r9, r10, r7, r14	@ d' += a[8] * b[2]
	ldr	r0, [r1, #9*4]		@ a[9]
	umlal	r5, r6, r7, r8		@ d += a[8] * b[1]
	ldr	r14, [r2, #0*4]		@ b[0]
	umlal	r9, r10, r0, r8		@ d' += a[9] * b[1]
	ldr	r7, [r1, #0*4]		@ a[0]
	umlal	r5, r6, r0, r14		@ d += a[9] * b[0]
	@ r7,r14 used in B

	bic	r0, r5, field_not_M	@ t9 = d & M
	str	r0, [sp, #4 + 4*9]
	@ The next three instructions are a 64-bit logical shift right by 26 of
	@ the pair lo:hi; the same idiom is used for every 'c >>= 26' / 'd >>= 26'.
	mov	r5, r5, lsr #26		@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26

	/* B */
	umull	r3, r4, r7, r14		@ c = a[0] * b[0]
	adds	r5, r5, r9		@ d += d'
	adc	r6, r6, r10

	bic	r0, r5, field_not_M	@ u0 = d & M
	mov	r5, r5, lsr #26		@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0		@ c += u0 * R0
	umlal	r3, r4, r0, r14

	bic	r14, r3, field_not_M	@ t0 = c & M
	str	r14, [sp, #4 + 0*4]
	mov	r3, r3, lsr #26		@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1		@ c += u0 * R1
	umlal	r3, r4, r0, r14

	/* C - interleaved with D */
	@ c: i + j = 1, c': i + j = 2, d: i + j = 11, d': i + j = 12
	ldr	r7, [r1, #0*4]		@ a[0]
	ldr	r8, [r2, #2*4]		@ b[2]
	ldr	r14, [r2, #1*4]		@ b[1]
	umull	r11, r12, r7, r8	@ c' = a[0] * b[2]
	ldr	r0, [r1, #1*4]		@ a[1]
	umlal	r3, r4, r7, r14		@ c += a[0] * b[1]
	ldr	r8, [r2, #0*4]		@ b[0]
	umlal	r11, r12, r0, r14	@ c' += a[1] * b[1]
	ldr	r7, [r1, #2*4]		@ a[2]
	umlal	r3, r4, r0, r8		@ c += a[1] * b[0]
	ldr	r14, [r2, #9*4]		@ b[9]
	umlal	r11, r12, r7, r8	@ c' += a[2] * b[0]
	ldr	r0, [r1, #3*4]		@ a[3]
	umlal	r5, r6, r7, r14		@ d += a[2] * b[9]
	ldr	r8, [r2, #8*4]		@ b[8]
	umull	r9, r10, r0, r14	@ d' = a[3] * b[9]
	ldr	r7, [r1, #4*4]		@ a[4]
	umlal	r5, r6, r0, r8		@ d += a[3] * b[8]
	ldr	r14, [r2, #7*4]		@ b[7]
	umlal	r9, r10, r7, r8		@ d' += a[4] * b[8]
	ldr	r0, [r1, #5*4]		@ a[5]
	umlal	r5, r6, r7, r14		@ d += a[4] * b[7]
	ldr	r8, [r2, #6*4]		@ b[6]
	umlal	r9, r10, r0, r14	@ d' += a[5] * b[7]
	ldr	r7, [r1, #6*4]		@ a[6]
	umlal	r5, r6, r0, r8		@ d += a[5] * b[6]
	ldr	r14, [r2, #5*4]		@ b[5]
	umlal	r9, r10, r7, r8		@ d' += a[6] * b[6]
	ldr	r0, [r1, #7*4]		@ a[7]
	umlal	r5, r6, r7, r14		@ d += a[6] * b[5]
	ldr	r8, [r2, #4*4]		@ b[4]
	umlal	r9, r10, r0, r14	@ d' += a[7] * b[5]
	ldr	r7, [r1, #8*4]		@ a[8]
	umlal	r5, r6, r0, r8		@ d += a[7] * b[4]
	ldr	r14, [r2, #3*4]		@ b[3]
	umlal	r9, r10, r7, r8		@ d' += a[8] * b[4]
	ldr	r0, [r1, #9*4]		@ a[9]
	umlal	r5, r6, r7, r14		@ d += a[8] * b[3]
	ldr	r8, [r2, #2*4]		@ b[2]
	umlal	r9, r10, r0, r14	@ d' += a[9] * b[3]
	umlal	r5, r6, r0, r8		@ d += a[9] * b[2]

	bic	r0, r5, field_not_M	@ u1 = d & M
	mov	r5, r5, lsr #26		@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0		@ c += u1 * R0
	umlal	r3, r4, r0, r14

	bic	r14, r3, field_not_M	@ t1 = c & M
	str	r14, [sp, #4 + 1*4]
	mov	r3, r3, lsr #26		@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1		@ c += u1 * R1
	umlal	r3, r4, r0, r14

	/* D */
	adds	r3, r3, r11		@ c += c'
	adc	r4, r4, r12
	adds	r5, r5, r9		@ d += d'
	adc	r6, r6, r10

	bic	r0, r5, field_not_M	@ u2 = d & M
	mov	r5, r5, lsr #26		@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0		@ c += u2 * R0
	umlal	r3, r4, r0, r14

	bic	r14, r3, field_not_M	@ t2 = c & M
	str	r14, [sp, #4 + 2*4]
	mov	r3, r3, lsr #26		@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1		@ c += u2 * R1
	umlal	r3, r4, r0, r14

	/* E - interleaved with F */
	@ c: i + j = 3, c': i + j = 4, d: i + j = 13, d': i + j = 14
	@ This pass uses only r7/r8 as operand registers, reloading one per product.
	ldr	r7, [r1, #0*4]		@ a[0]
	ldr	r8, [r2, #4*4]		@ b[4]
	umull	r11, r12, r7, r8	@ c' = a[0] * b[4]
	ldr	r8, [r2, #3*4]		@ b[3]
	umlal	r3, r4, r7, r8		@ c += a[0] * b[3]
	ldr	r7, [r1, #1*4]		@ a[1]
	umlal	r11, r12, r7, r8	@ c' += a[1] * b[3]
	ldr	r8, [r2, #2*4]		@ b[2]
	umlal	r3, r4, r7, r8		@ c += a[1] * b[2]
	ldr	r7, [r1, #2*4]		@ a[2]
	umlal	r11, r12, r7, r8	@ c' += a[2] * b[2]
	ldr	r8, [r2, #1*4]		@ b[1]
	umlal	r3, r4, r7, r8		@ c += a[2] * b[1]
	ldr	r7, [r1, #3*4]		@ a[3]
	umlal	r11, r12, r7, r8	@ c' += a[3] * b[1]
	ldr	r8, [r2, #0*4]		@ b[0]
	umlal	r3, r4, r7, r8		@ c += a[3] * b[0]
	ldr	r7, [r1, #4*4]		@ a[4]
	umlal	r11, r12, r7, r8	@ c' += a[4] * b[0]
	ldr	r8, [r2, #9*4]		@ b[9]
	umlal	r5, r6, r7, r8		@ d += a[4] * b[9]
	ldr	r7, [r1, #5*4]		@ a[5]
	umull	r9, r10, r7, r8		@ d' = a[5] * b[9]
	ldr	r8, [r2, #8*4]		@ b[8]
	umlal	r5, r6, r7, r8		@ d += a[5] * b[8]
	ldr	r7, [r1, #6*4]		@ a[6]
	umlal	r9, r10, r7, r8		@ d' += a[6] * b[8]
	ldr	r8, [r2, #7*4]		@ b[7]
	umlal	r5, r6, r7, r8		@ d += a[6] * b[7]
	ldr	r7, [r1, #7*4]		@ a[7]
	umlal	r9, r10, r7, r8		@ d' += a[7] * b[7]
	ldr	r8, [r2, #6*4]		@ b[6]
	umlal	r5, r6, r7, r8		@ d += a[7] * b[6]
	ldr	r7, [r1, #8*4]		@ a[8]
	umlal	r9, r10, r7, r8		@ d' += a[8] * b[6]
	ldr	r8, [r2, #5*4]		@ b[5]
	umlal	r5, r6, r7, r8		@ d += a[8] * b[5]
	ldr	r7, [r1, #9*4]		@ a[9]
	umlal	r9, r10, r7, r8		@ d' += a[9] * b[5]
	ldr	r8, [r2, #4*4]		@ b[4]
	umlal	r5, r6, r7, r8		@ d += a[9] * b[4]

	bic	r0, r5, field_not_M	@ u3 = d & M
	mov	r5, r5, lsr #26		@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0		@ c += u3 * R0
	umlal	r3, r4, r0, r14

	bic	r14, r3, field_not_M	@ t3 = c & M
	str	r14, [sp, #4 + 3*4]
	mov	r3, r3, lsr #26		@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1		@ c += u3 * R1
	umlal	r3, r4, r0, r14

	/* F */
	adds	r3, r3, r11		@ c += c'
	adc	r4, r4, r12
	adds	r5, r5, r9		@ d += d'
	adc	r6, r6, r10

	bic	r0, r5, field_not_M	@ u4 = d & M
	mov	r5, r5, lsr #26		@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0		@ c += u4 * R0
	umlal	r3, r4, r0, r14

	bic	r14, r3, field_not_M	@ t4 = c & M
	str	r14, [sp, #4 + 4*4]
	mov	r3, r3, lsr #26		@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1		@ c += u4 * R1
	umlal	r3, r4, r0, r14

	/* G - interleaved with H */
	@ c: i + j = 5, c': i + j = 6, d: i + j = 15, d': i + j = 16
	ldr	r7, [r1, #0*4]		@ a[0]
	ldr	r8, [r2, #6*4]		@ b[6]
	ldr	r14, [r2, #5*4]		@ b[5]
	umull	r11, r12, r7, r8	@ c' = a[0] * b[6]
	ldr	r0, [r1, #1*4]		@ a[1]
	umlal	r3, r4, r7, r14		@ c += a[0] * b[5]
	ldr	r8, [r2, #4*4]		@ b[4]
	umlal	r11, r12, r0, r14	@ c' += a[1] * b[5]
	ldr	r7, [r1, #2*4]		@ a[2]
	umlal	r3, r4, r0, r8		@ c += a[1] * b[4]
	ldr	r14, [r2, #3*4]		@ b[3]
	umlal	r11, r12, r7, r8	@ c' += a[2] * b[4]
	ldr	r0, [r1, #3*4]		@ a[3]
	umlal	r3, r4, r7, r14		@ c += a[2] * b[3]
	ldr	r8, [r2, #2*4]		@ b[2]
	umlal	r11, r12, r0, r14	@ c' += a[3] * b[3]
	ldr	r7, [r1, #4*4]		@ a[4]
	umlal	r3, r4, r0, r8		@ c += a[3] * b[2]
	ldr	r14, [r2, #1*4]		@ b[1]
	umlal	r11, r12, r7, r8	@ c' += a[4] * b[2]
	ldr	r0, [r1, #5*4]		@ a[5]
	umlal	r3, r4, r7, r14		@ c += a[4] * b[1]
	ldr	r8, [r2, #0*4]		@ b[0]
	umlal	r11, r12, r0, r14	@ c' += a[5] * b[1]
	ldr	r7, [r1, #6*4]		@ a[6]
	umlal	r3, r4, r0, r8		@ c += a[5] * b[0]
	ldr	r14, [r2, #9*4]		@ b[9]
	umlal	r11, r12, r7, r8	@ c' += a[6] * b[0]
	ldr	r0, [r1, #7*4]		@ a[7]
	umlal	r5, r6, r7, r14		@ d += a[6] * b[9]
	ldr	r8, [r2, #8*4]		@ b[8]
	umull	r9, r10, r0, r14	@ d' = a[7] * b[9]
	ldr	r7, [r1, #8*4]		@ a[8]
	umlal	r5, r6, r0, r8		@ d += a[7] * b[8]
	ldr	r14, [r2, #7*4]		@ b[7]
	umlal	r9, r10, r7, r8		@ d' += a[8] * b[8]
	ldr	r0, [r1, #9*4]		@ a[9]
	umlal	r5, r6, r7, r14		@ d += a[8] * b[7]
	ldr	r8, [r2, #6*4]		@ b[6]
	umlal	r9, r10, r0, r14	@ d' += a[9] * b[7]
	umlal	r5, r6, r0, r8		@ d += a[9] * b[6]

	bic	r0, r5, field_not_M	@ u5 = d & M
	mov	r5, r5, lsr #26		@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0		@ c += u5 * R0
	umlal	r3, r4, r0, r14

	bic	r14, r3, field_not_M	@ t5 = c & M
	str	r14, [sp, #4 + 5*4]
	mov	r3, r3, lsr #26		@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1		@ c += u5 * R1
	umlal	r3, r4, r0, r14

	/* H */
	adds	r3, r3, r11		@ c += c'
	adc	r4, r4, r12
	adds	r5, r5, r9		@ d += d'
	adc	r6, r6, r10

	bic	r0, r5, field_not_M	@ u6 = d & M
	mov	r5, r5, lsr #26		@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0		@ c += u6 * R0
	umlal	r3, r4, r0, r14

	bic	r14, r3, field_not_M	@ t6 = c & M
	str	r14, [sp, #4 + 6*4]
	mov	r3, r3, lsr #26		@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1		@ c += u6 * R1
	umlal	r3, r4, r0, r14

	/* I - interleaved with J */
	@ c: i + j = 7, c': i + j = 8, d: i + j = 17, d': i + j = 18
	ldr	r8, [r2, #8*4]		@ b[8]
	ldr	r7, [r1, #0*4]		@ a[0]
	ldr	r14, [r2, #7*4]		@ b[7]
	umull	r11, r12, r7, r8	@ c' = a[0] * b[8]
	ldr	r0, [r1, #1*4]		@ a[1]
	umlal	r3, r4, r7, r14		@ c += a[0] * b[7]
	ldr	r8, [r2, #6*4]		@ b[6]
	umlal	r11, r12, r0, r14	@ c' += a[1] * b[7]
	ldr	r7, [r1, #2*4]		@ a[2]
	umlal	r3, r4, r0, r8		@ c += a[1] * b[6]
	ldr	r14, [r2, #5*4]		@ b[5]
	umlal	r11, r12, r7, r8	@ c' += a[2] * b[6]
	ldr	r0, [r1, #3*4]		@ a[3]
	umlal	r3, r4, r7, r14		@ c += a[2] * b[5]
	ldr	r8, [r2, #4*4]		@ b[4]
	umlal	r11, r12, r0, r14	@ c' += a[3] * b[5]
	ldr	r7, [r1, #4*4]		@ a[4]
	umlal	r3, r4, r0, r8		@ c += a[3] * b[4]
	ldr	r14, [r2, #3*4]		@ b[3]
	umlal	r11, r12, r7, r8	@ c' += a[4] * b[4]
	ldr	r0, [r1, #5*4]		@ a[5]
	umlal	r3, r4, r7, r14		@ c += a[4] * b[3]
	ldr	r8, [r2, #2*4]		@ b[2]
	umlal	r11, r12, r0, r14	@ c' += a[5] * b[3]
	ldr	r7, [r1, #6*4]		@ a[6]
	umlal	r3, r4, r0, r8		@ c += a[5] * b[2]
	ldr	r14, [r2, #1*4]		@ b[1]
	umlal	r11, r12, r7, r8	@ c' += a[6] * b[2]
	ldr	r0, [r1, #7*4]		@ a[7]
	umlal	r3, r4, r7, r14		@ c += a[6] * b[1]
	ldr	r8, [r2, #0*4]		@ b[0]
	umlal	r11, r12, r0, r14	@ c' += a[7] * b[1]
	ldr	r7, [r1, #8*4]		@ a[8]
	umlal	r3, r4, r0, r8		@ c += a[7] * b[0]
	ldr	r14, [r2, #9*4]		@ b[9]
	umlal	r11, r12, r7, r8	@ c' += a[8] * b[0]
	ldr	r0, [r1, #9*4]		@ a[9]
	umlal	r5, r6, r7, r14		@ d += a[8] * b[9]
	ldr	r8, [r2, #8*4]		@ b[8]
	umull	r9, r10, r0, r14	@ d' = a[9] * b[9]
	umlal	r5, r6, r0, r8		@ d += a[9] * b[8]

	bic	r0, r5, field_not_M	@ u7 = d & M
	mov	r5, r5, lsr #26		@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0		@ c += u7 * R0
	umlal	r3, r4, r0, r14

	bic	r14, r3, field_not_M	@ t7 = c & M
	str	r14, [sp, #4 + 7*4]
	mov	r3, r3, lsr #26		@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1		@ c += u7 * R1
	umlal	r3, r4, r0, r14

	/* J */
	adds	r3, r3, r11		@ c += c'
	adc	r4, r4, r12
	adds	r5, r5, r9		@ d += d'
	adc	r6, r6, r10

	bic	r0, r5, field_not_M	@ u8 = d & M
	str	r0, [sp, #4 + 8*4]	@ u8 is kept: its R1 fold happens after r[8] is extracted
	mov	r5, r5, lsr #26		@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0		@ c += u8 * R0
	umlal	r3, r4, r0, r14

	/******************************************
	 * compute and write back result
	 ******************************************
	Allocation:
	    r0    r
	    r3:r4 c
	    r5:r6 d
	    r7    t0
	    r8    t1
	    r9    t2
	    r11   u8
	    r12   t9
	    r1,r2,r10,r14 scratch

	Note: do not read from a[] after here, it may overlap with r[]
	*/
	ldr	r0, [sp, #0]
	add	r1, sp, #4 + 3*4	@ r[3..7] = t3..7, r11=u8, r12=t9
	ldmia	r1, {r2,r7,r8,r9,r10,r11,r12}	@ seven consecutive slots: t3,t4,t5,t6,t7,u8,t9
	add	r1, r0, #3*4
	stmia	r1, {r2,r7,r8,r9,r10}

	bic	r2, r3, field_not_M	@ r[8] = c & M
	str	r2, [r0, #8*4]
	mov	r3, r3, lsr #26		@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1		@ c += u8 * R1
	umlal	r3, r4, r11, r14
	movw	r14, field_R0		@ c += d * R0
	umlal	r3, r4, r5, r14		@ only the low word of d (r5) is used from here on
	adds	r3, r3, r12		@ c += t9
	adc	r4, r4, #0

	add	r1, sp, #4 + 0*4	@ r7,r8,r9 = t0,t1,t2
	ldmia	r1, {r7,r8,r9}

	@ The top limb keeps 22 bits instead of 26, so c is shifted by 22 and the
	@ fold-back into r[0]/r[1] below uses R0 >> 4 and R1 >> 4.
	ubfx	r2, r3, #0, #22		@ r[9] = c & (M >> 4)
	str	r2, [r0, #9*4]
	mov	r3, r3, lsr #22		@ c >>= 22
	orr	r3, r3, r4, asl #10
	mov	r4, r4, lsr #22
	movw	r14, field_R1 << 4	@ c += d * (R1 << 4)
	umlal	r3, r4, r5, r14

	movw	r14, field_R0 >> 4	@ d = c * (R0 >> 4) + t0 (64x64 multiply+add)
	umull	r5, r6, r3, r14		@ d = c.lo * (R0 >> 4)
	adds	r5, r5, r7		@ d.lo += t0
	mla	r6, r14, r4, r6		@ d.hi += c.hi * (R0 >> 4)
	adc	r6, r6, 0		@ d.hi += carry

	bic	r2, r5, field_not_M	@ r[0] = d & M
	str	r2, [r0, #0*4]

	mov	r5, r5, lsr #26		@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26

	movw	r14, field_R1 >> 4	@ d += c * (R1 >> 4) + t1 (64x64 multiply+add)
	umull	r1, r2, r3, r14		@ tmp = c.lo * (R1 >> 4)
	adds	r5, r5, r8		@ d.lo += t1
	adc	r6, r6, #0		@ d.hi += carry
	adds	r5, r5, r1		@ d.lo += tmp.lo
	mla	r2, r14, r4, r2		@ tmp.hi += c.hi * (R1 >> 4)
	adc	r6, r6, r2		@ d.hi += carry + tmp.hi

	bic	r2, r5, field_not_M	@ r[1] = d & M
	str	r2, [r0, #1*4]
	mov	r5, r5, lsr #26		@ d >>= 26 (ignore hi)
	orr	r5, r5, r6, asl #6

	add	r5, r5, r9		@ d += t2
	str	r5, [r0, #2*4]		@ r[2] = d (stored without masking)

	add	sp, sp, #48
	ldmfd	sp!, {r4, r5, r6, r7, r8, r9, r10, r11, pc}	@ restore and return (saved r14 -> pc)
	.size	haskellsecp256k1_v0_1_0_fe_mul_inner, .-haskellsecp256k1_v0_1_0_fe_mul_inner

	.align	2
	.global haskellsecp256k1_v0_1_0_fe_sqr_inner
	.type	haskellsecp256k1_v0_1_0_fe_sqr_inner, %function
	.hidden haskellsecp256k1_v0_1_0_fe_sqr_inner
	@ haskellsecp256k1_v0_1_0_fe_sqr_inner
	@
	@ Squares the ten 32-bit limbs a[0..9] and stores ten result limbs
	@ r[0..9], using the same column/fold structure and the same final
	@ write-back as haskellsecp256k1_v0_1_0_fe_mul_inner. Each cross product
	@ a[i]*a[j] (i != j) is computed once with one operand doubled by a
	@ 32-bit left shift.
	@ NOTE(review): the doubling is done in a 32-bit register, so a[i] must be
	@ below 2^31 for it not to lose a bit - confirm the limb bound with the caller.
	@
	@ Arguments:
	@  r0  r	 Can overlap with a
	@  r1  a
	@ Stack (total 4+10*4 = 44)
	@  sp + #0        saved 'r' pointer
	@  sp + #4 + 4*X  t0,t1,t2,t3,t4,t5,t6,t7,u8,t9
	@ Registers:
	@  r4-r11 and r14 are saved on entry and restored on exit (r14 is popped
	@  into pc). r0-r3 and r12 are overwritten. The function makes no calls.
haskellsecp256k1_v0_1_0_fe_sqr_inner:
	stmfd	sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r14}
	sub	sp, sp, #48		@ frame=44 + alignment
	str	r0, [sp, #0]		@ save result address, we need it only at the end
	/******************************************
	 * Main computation code.
	 ******************************************

	Allocation:
	    r0,r14,r2,r7,r8   scratch
	    r1       a (pointer)
	    r3:r4    c
	    r5:r6    d
	    r11:r12  c'
	    r9:r10   d'

	Note: do not write to r[] here, it may overlap with a[]
	*/
	/* A interleaved with B */
	@ c: a[0]^2, d: i + j = 9, d': i + j = 10
	ldr	r0, [r1, #1*4]		@ a[1]*2
	ldr	r7, [r1, #0*4]		@ a[0]
	mov	r0, r0, asl #1
	ldr	r14, [r1, #9*4]		@ a[9]
	umull	r3, r4, r7, r7		@ c = a[0] * a[0]
	ldr	r8, [r1, #8*4]		@ a[8]
	mov	r7, r7, asl #1
	umull	r5, r6, r7, r14		@ d = a[0]*2 * a[9]
	ldr	r7, [r1, #2*4]		@ a[2]*2
	umull	r9, r10, r0, r14	@ d' = a[1]*2 * a[9]
	ldr	r14, [r1, #7*4]		@ a[7]
	umlal	r5, r6, r0, r8		@ d += a[1]*2 * a[8]
	mov	r7, r7, asl #1
	ldr	r0, [r1, #3*4]		@ a[3]*2
	umlal	r9, r10, r7, r8		@ d' += a[2]*2 * a[8]
	ldr	r8, [r1, #6*4]		@ a[6]
	umlal	r5, r6, r7, r14		@ d += a[2]*2 * a[7]
	mov	r0, r0, asl #1
	ldr	r7, [r1, #4*4]		@ a[4]*2
	umlal	r9, r10, r0, r14	@ d' += a[3]*2 * a[7]
	ldr	r14, [r1, #5*4]		@ a[5]
	mov	r7, r7, asl #1
	umlal	r5, r6, r0, r8		@ d += a[3]*2 * a[6]
	umlal	r9, r10, r7, r8		@ d' += a[4]*2 * a[6]
	umlal	r5, r6, r7, r14		@ d += a[4]*2 * a[5]
	umlal	r9, r10, r14, r14	@ d' += a[5] * a[5]

	bic	r0, r5, field_not_M	@ t9 = d & M
	str	r0, [sp, #4 + 9*4]
	mov	r5, r5, lsr #26		@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26

	/* B */
	adds	r5, r5, r9		@ d += d'
	adc	r6, r6, r10

	bic	r0, r5, field_not_M	@ u0 = d & M
	mov	r5, r5, lsr #26		@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0		@ c += u0 * R0
	umlal	r3, r4, r0, r14
	bic	r14, r3, field_not_M	@ t0 = c & M
	str	r14, [sp, #4 + 0*4]
	mov	r3, r3, lsr #26		@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1		@ c += u0 * R1
	umlal	r3, r4, r0, r14

	/* C interleaved with D */
	@ c: i + j = 1, c': i + j = 2, d: i + j = 11, d': i + j = 12
	ldr	r0, [r1, #0*4]		@ a[0]*2
	ldr	r14, [r1, #1*4]		@ a[1]
	mov	r0, r0, asl #1
	ldr	r8, [r1, #2*4]		@ a[2]
	umlal	r3, r4, r0, r14		@ c += a[0]*2 * a[1]
	mov	r7, r8, asl #1		@ a[2]*2
	umull	r11, r12, r14, r14	@ c' = a[1] * a[1]
	ldr	r14, [r1, #9*4]		@ a[9]
	umlal	r11, r12, r0, r8	@ c' += a[0]*2 * a[2]
	ldr	r0, [r1, #3*4]		@ a[3]*2
	ldr	r8, [r1, #8*4]		@ a[8]
	umlal	r5, r6, r7, r14		@ d += a[2]*2 * a[9]
	mov	r0, r0, asl #1
	ldr	r7, [r1, #4*4]		@ a[4]*2
	umull	r9, r10, r0, r14	@ d' = a[3]*2 * a[9]
	ldr	r14, [r1, #7*4]		@ a[7]
	umlal	r5, r6, r0, r8		@ d += a[3]*2 * a[8]
	mov	r7, r7, asl #1
	ldr	r0, [r1, #5*4]		@ a[5]*2
	umlal	r9, r10, r7, r8		@ d' += a[4]*2 * a[8]
	ldr	r8, [r1, #6*4]		@ a[6]
	mov	r0, r0, asl #1
	umlal	r5, r6, r7, r14		@ d += a[4]*2 * a[7]
	umlal	r9, r10, r0, r14	@ d' += a[5]*2 * a[7]
	umlal	r5, r6, r0, r8		@ d += a[5]*2 * a[6]
	umlal	r9, r10, r8, r8		@ d' += a[6] * a[6]

	bic	r0, r5, field_not_M	@ u1 = d & M
	mov	r5, r5, lsr #26		@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0		@ c += u1 * R0
	umlal	r3, r4, r0, r14
	bic	r14, r3, field_not_M	@ t1 = c & M
	str	r14, [sp, #4 + 1*4]
	mov	r3, r3, lsr #26		@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1		@ c += u1 * R1
	umlal	r3, r4, r0, r14

	/* D */
	adds	r3, r3, r11		@ c += c'
	adc	r4, r4, r12
	adds	r5, r5, r9		@ d += d'
	adc	r6, r6, r10

	bic	r0, r5, field_not_M	@ u2 = d & M
	mov	r5, r5, lsr #26		@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0		@ c += u2 * R0
	umlal	r3, r4, r0, r14
	bic	r14, r3, field_not_M	@ t2 = c & M
	str	r14, [sp, #4 + 2*4]
	mov	r3, r3, lsr #26		@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1		@ c += u2 * R1
	umlal	r3, r4, r0, r14

	/* E interleaved with F */
	@ c: i + j = 3, c': i + j = 4, d: i + j = 13, d': i + j = 14
	ldr	r7, [r1, #0*4]		@ a[0]*2
	ldr	r0, [r1, #1*4]		@ a[1]*2
	ldr	r14, [r1, #2*4]		@ a[2]
	mov	r7, r7, asl #1
	ldr	r8, [r1, #3*4]		@ a[3]
	ldr	r2, [r1, #4*4]		@ a[4]
	umlal	r3, r4, r7, r8		@ c += a[0]*2 * a[3]
	mov	r0, r0, asl #1
	umull	r11, r12, r7, r2	@ c' = a[0]*2 * a[4]
	mov	r2, r2, asl #1		@ a[4]*2
	umlal	r11, r12, r0, r8	@ c' += a[1]*2 * a[3]
	ldr	r8, [r1, #9*4]		@ a[9]
	umlal	r3, r4, r0, r14		@ c += a[1]*2 * a[2]
	ldr	r0, [r1, #5*4]		@ a[5]*2
	umlal	r11, r12, r14, r14	@ c' += a[2] * a[2]
	ldr	r14, [r1, #8*4]		@ a[8]
	mov	r0, r0, asl #1
	umlal	r5, r6, r2, r8		@ d += a[4]*2 * a[9]
	ldr	r7, [r1, #6*4]		@ a[6]*2
	umull	r9, r10, r0, r8		@ d' = a[5]*2 * a[9]
	mov	r7, r7, asl #1
	ldr	r8, [r1, #7*4]		@ a[7]
	umlal	r5, r6, r0, r14		@ d += a[5]*2 * a[8]
	umlal	r9, r10, r7, r14	@ d' += a[6]*2 * a[8]
	umlal	r5, r6, r7, r8		@ d += a[6]*2 * a[7]
	umlal	r9, r10, r8, r8		@ d' += a[7] * a[7]

	bic	r0, r5, field_not_M	@ u3 = d & M
	mov	r5, r5, lsr #26		@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0		@ c += u3 * R0
	umlal	r3, r4, r0, r14
	bic	r14, r3, field_not_M	@ t3 = c & M
	str	r14, [sp, #4 + 3*4]
	mov	r3, r3, lsr #26		@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1		@ c += u3 * R1
	umlal	r3, r4, r0, r14

	/* F */
	adds	r3, r3, r11		@ c += c'
	adc	r4, r4, r12
	adds	r5, r5, r9		@ d += d'
	adc	r6, r6, r10

	bic	r0, r5, field_not_M	@ u4 = d & M
	mov	r5, r5, lsr #26		@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0		@ c += u4 * R0
	umlal	r3, r4, r0, r14
	bic	r14, r3, field_not_M	@ t4 = c & M
	str	r14, [sp, #4 + 4*4]
	mov	r3, r3, lsr #26		@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1		@ c += u4 * R1
	umlal	r3, r4, r0, r14

	/* G interleaved with H */
	@ c: i + j = 5, c': i + j = 6, d: i + j = 15, d': i + j = 16
	ldr	r7, [r1, #0*4]		@ a[0]*2
	ldr	r0, [r1, #1*4]		@ a[1]*2
	mov	r7, r7, asl #1
	ldr	r8, [r1, #5*4]		@ a[5]
	ldr	r2, [r1, #6*4]		@ a[6]
	umlal	r3, r4, r7, r8		@ c += a[0]*2 * a[5]
	ldr	r14, [r1, #4*4]		@ a[4]
	mov	r0, r0, asl #1
	umull	r11, r12, r7, r2	@ c' = a[0]*2 * a[6]
	ldr	r7, [r1, #2*4]		@ a[2]*2
	umlal	r11, r12, r0, r8	@ c' += a[1]*2 * a[5]
	mov	r7, r7, asl #1
	ldr	r8, [r1, #3*4]		@ a[3]
	umlal	r3, r4, r0, r14		@ c += a[1]*2 * a[4]
	mov	r0, r2, asl #1		@ a[6]*2
	umlal	r11, r12, r7, r14	@ c' += a[2]*2 * a[4]
	ldr	r14, [r1, #9*4]		@ a[9]
	umlal	r3, r4, r7, r8		@ c += a[2]*2 * a[3]
	ldr	r7, [r1, #7*4]		@ a[7]*2
	umlal	r11, r12, r8, r8	@ c' += a[3] * a[3]
	mov	r7, r7, asl #1
	ldr	r8, [r1, #8*4]		@ a[8]
	umlal	r5, r6, r0, r14		@ d += a[6]*2 * a[9]
	umull	r9, r10, r7, r14	@ d' = a[7]*2 * a[9]
	umlal	r5, r6, r7, r8		@ d += a[7]*2 * a[8]
	umlal	r9, r10, r8, r8		@ d' += a[8] * a[8]

	bic	r0, r5, field_not_M	@ u5 = d & M
	mov	r5, r5, lsr #26		@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0		@ c += u5 * R0
	umlal	r3, r4, r0, r14
	bic	r14, r3, field_not_M	@ t5 = c & M
	str	r14, [sp, #4 + 5*4]
	mov	r3, r3, lsr #26		@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1		@ c += u5 * R1
	umlal	r3, r4, r0, r14

	/* H */
	adds	r3, r3, r11		@ c += c'
	adc	r4, r4, r12
	adds	r5, r5, r9		@ d += d'
	adc	r6, r6, r10

	bic	r0, r5, field_not_M	@ u6 = d & M
	mov	r5, r5, lsr #26		@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0		@ c += u6 * R0
	umlal	r3, r4, r0, r14
	bic	r14, r3, field_not_M	@ t6 = c & M
	str	r14, [sp, #4 + 6*4]
	mov	r3, r3, lsr #26		@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1		@ c += u6 * R1
	umlal	r3, r4, r0, r14

	/* I interleaved with J */
	@ c: i + j = 7, c': i + j = 8, d: i + j = 17; the i + j = 18 term
	@ (a[9]^2) is added straight into d in J instead of going through d'.
	ldr	r7, [r1, #0*4]		@ a[0]*2
	ldr	r0, [r1, #1*4]		@ a[1]*2
	mov	r7, r7, asl #1
	ldr	r8, [r1, #7*4]		@ a[7]
	ldr	r2, [r1, #8*4]		@ a[8]
	umlal	r3, r4, r7, r8		@ c += a[0]*2 * a[7]
	ldr	r14, [r1, #6*4]		@ a[6]
	mov	r0, r0, asl #1
	umull	r11, r12, r7, r2	@ c' = a[0]*2 * a[8]
	ldr	r7, [r1, #2*4]		@ a[2]*2
	umlal	r11, r12, r0, r8	@ c' += a[1]*2 * a[7]
	ldr	r8, [r1, #5*4]		@ a[5]
	umlal	r3, r4, r0, r14		@ c += a[1]*2 * a[6]
	ldr	r0, [r1, #3*4]		@ a[3]*2
	mov	r7, r7, asl #1
	umlal	r11, r12, r7, r14	@ c' += a[2]*2 * a[6]
	ldr	r14, [r1, #4*4]		@ a[4]
	mov	r0, r0, asl #1
	umlal	r3, r4, r7, r8		@ c += a[2]*2 * a[5]
	mov	r2, r2, asl #1		@ a[8]*2
	umlal	r11, r12, r0, r8	@ c' += a[3]*2 * a[5]
	umlal	r3, r4, r0, r14		@ c += a[3]*2 * a[4]
	umlal	r11, r12, r14, r14	@ c' += a[4] * a[4]
	ldr	r8, [r1, #9*4]		@ a[9]
	umlal	r5, r6, r2, r8		@ d += a[8]*2 * a[9]
	@ r8 will be used in J

	bic	r0, r5, field_not_M	@ u7 = d & M
	mov	r5, r5, lsr #26		@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0		@ c += u7 * R0
	umlal	r3, r4, r0, r14
	bic	r14, r3, field_not_M	@ t7 = c & M
	str	r14, [sp, #4 + 7*4]
	mov	r3, r3, lsr #26		@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1		@ c += u7 * R1
	umlal	r3, r4, r0, r14

	/* J */
	adds	r3, r3, r11		@ c += c'
	adc	r4, r4, r12
	umlal	r5, r6, r8, r8		@ d += a[9] * a[9]

	bic	r0, r5, field_not_M	@ u8 = d & M
	str	r0, [sp, #4 + 8*4]	@ u8 is kept: its R1 fold happens after r[8] is extracted
	mov	r5, r5, lsr #26		@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0		@ c += u8 * R0
	umlal	r3, r4, r0, r14

	/******************************************
	 * compute and write back result
	 ******************************************
	Allocation:
	    r0    r
	    r3:r4 c
	    r5:r6 d
	    r7    t0
	    r8    t1
	    r9    t2
	    r11   u8
	    r12   t9
	    r1,r2,r10,r14 scratch

	Note: do not read from a[] after here, it may overlap with r[]
	*/
	ldr	r0, [sp, #0]
	add	r1, sp, #4 + 3*4	@ r[3..7] = t3..7, r11=u8, r12=t9
	ldmia	r1, {r2,r7,r8,r9,r10,r11,r12}	@ seven consecutive slots: t3,t4,t5,t6,t7,u8,t9
	add	r1, r0, #3*4
	stmia	r1, {r2,r7,r8,r9,r10}

	bic	r2, r3, field_not_M	@ r[8] = c & M
	str	r2, [r0, #8*4]
	mov	r3, r3, lsr #26		@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1		@ c += u8 * R1
	umlal	r3, r4, r11, r14
	movw	r14, field_R0		@ c += d * R0
	umlal	r3, r4, r5, r14		@ only the low word of d (r5) is used from here on
	adds	r3, r3, r12		@ c += t9
	adc	r4, r4, #0

	add	r1, sp, #4 + 0*4	@ r7,r8,r9 = t0,t1,t2
	ldmia	r1, {r7,r8,r9}

	@ The top limb keeps 22 bits instead of 26, so c is shifted by 22 and the
	@ fold-back into r[0]/r[1] below uses R0 >> 4 and R1 >> 4.
	ubfx	r2, r3, #0, #22		@ r[9] = c & (M >> 4)
	str	r2, [r0, #9*4]
	mov	r3, r3, lsr #22		@ c >>= 22
	orr	r3, r3, r4, asl #10
	mov	r4, r4, lsr #22
	movw	r14, field_R1 << 4	@ c += d * (R1 << 4)
	umlal	r3, r4, r5, r14

	movw	r14, field_R0 >> 4	@ d = c * (R0 >> 4) + t0 (64x64 multiply+add)
	umull	r5, r6, r3, r14		@ d = c.lo * (R0 >> 4)
	adds	r5, r5, r7		@ d.lo += t0
	mla	r6, r14, r4, r6		@ d.hi += c.hi * (R0 >> 4)
	adc	r6, r6, 0		@ d.hi += carry

	bic	r2, r5, field_not_M	@ r[0] = d & M
	str	r2, [r0, #0*4]

	mov	r5, r5, lsr #26		@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26

	movw	r14, field_R1 >> 4	@ d += c * (R1 >> 4) + t1 (64x64 multiply+add)
	umull	r1, r2, r3, r14		@ tmp = c.lo * (R1 >> 4)
	adds	r5, r5, r8		@ d.lo += t1
	adc	r6, r6, #0		@ d.hi += carry
	adds	r5, r5, r1		@ d.lo += tmp.lo
	mla	r2, r14, r4, r2		@ tmp.hi += c.hi * (R1 >> 4)
	adc	r6, r6, r2		@ d.hi += carry + tmp.hi

	bic	r2, r5, field_not_M	@ r[1] = d & M
	str	r2, [r0, #1*4]
	mov	r5, r5, lsr #26		@ d >>= 26 (ignore hi)
	orr	r5, r5, r6, asl #6

	add	r5, r5, r9		@ d += t2
	str	r5, [r0, #2*4]		@ r[2] = d (stored without masking)

	add	sp, sp, #48
	ldmfd	sp!, {r4, r5, r6, r7, r8, r9, r10, r11, pc}	@ restore and return (saved r14 -> pc)
	.size	haskellsecp256k1_v0_1_0_fe_sqr_inner, .-haskellsecp256k1_v0_1_0_fe_sqr_inner

	.section .note.GNU-stack,"",%progbits