@ vim: set tabstop=8 softtabstop=8 shiftwidth=8 noexpandtab syntax=armasm:
/**********************************************************************
 * Copyright (c) 2014 Wladimir J. van der Laan                        *
 * Distributed under the MIT software license, see the accompanying   *
 * file COPYING or http://www.opensource.org/licenses/mit-license.php.*
 **********************************************************************/
/*
ARM implementation of field_10x26 inner loops.

Note:

- To avoid unnecessary loads and make use of available registers, two
  'passes' have every time been interleaved, with the odd passes accumulating c' and d'
  which will be added to c and d respectively in the even passes

- Syntax: GNU as, ARM unified syntax ('@' starts a line comment).
- Both functions below are leaf functions: they save r4-r11 and lr on entry
  and return by popping the saved lr into pc. r12 is used as scratch and is
  not saved.

*/

	.syntax unified
	.arch armv7-a
	@ eabi attributes - see readelf -A
	.eabi_attribute 8, 1  @ Tag_ARM_ISA_use = yes
	.eabi_attribute 9, 0  @ Tag_Thumb_ISA_use = no
	.eabi_attribute 10, 0 @ Tag_FP_arch = none
	.eabi_attribute 24, 1 @ Tag_ABI_align_needed = 8-byte
	.eabi_attribute 25, 1 @ Tag_ABI_align_preserved = 8-byte, except leaf SP
	.eabi_attribute 30, 2 @ Tag_ABI_optimization_goals = Aggressive Speed
	.eabi_attribute 34, 1 @ Tag_CPU_unaligned_access = v6
	.text

	@ Field constants
	.set field_R0, 0x3d10
	.set field_R1, 0x400
	.set field_not_M, 0xfc000000	@ ~M = ~0x3ffffff

	.align	2
	.global vet_secp256k1_fe_mul_inner
	.type	vet_secp256k1_fe_mul_inner, %function
	@ vet_secp256k1_fe_mul_inner: multiply two field elements held as ten
	@ 32-bit limbs each (26 significant bits per limb, per the mask M above)
	@ and store the ten-limb result, folding the high half back in with the
	@ constants field_R0 / field_R1.
	@ NOTE(review): required magnitude/range of the input limbs is not
	@ visible in this file - confirm against the C caller.
	@
	@ Arguments:
	@  r0  r	Restrict: can overlap with a, not with b
	@  r1  a
	@  r2  b
	@ Registers: r4-r11 and lr are saved and restored; r0-r3, r12 and the
	@ flags are clobbered.
	@ Stack (total 4+10*4 = 44)
	@  sp + #0        saved 'r' pointer
	@  sp + #4 + 4*X  t0,t1,t2,t3,t4,t5,t6,t7,u8,t9
vet_secp256k1_fe_mul_inner:
	stmfd	sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r14}
	sub	sp, sp, #48			@ frame=44 + alignment
	str	r0, [sp, #0]			@ save result address, we need it only at the end

	/******************************************
	 * Main computation code.
	 ******************************************

	Allocation:
	    r0,r14,r7,r8   scratch
	    r1       a (pointer)
	    r2       b (pointer)
	    r3:r4    c
	    r5:r6    d
	    r11:r12  c'
	    r9:r10   d'

	Note: do not write to r[] here, it may overlap with a[]
	*/

	/* A - interleaved with B */
	ldr	r7, [r1, #0*4]			@ a[0]
	ldr	r8, [r2, #9*4]			@ b[9]
	ldr	r0, [r1, #1*4]			@ a[1]
	umull	r5, r6, r7, r8			@ d = a[0] * b[9]
	ldr	r14, [r2, #8*4]			@ b[8]
	umull	r9, r10, r0, r8			@ d' = a[1] * b[9]
	ldr	r7, [r1, #2*4]			@ a[2]
	umlal	r5, r6, r0, r14			@ d += a[1] * b[8]
	ldr	r8, [r2, #7*4]			@ b[7]
	umlal	r9, r10, r7, r14		@ d' += a[2] * b[8]
	ldr	r0, [r1, #3*4]			@ a[3]
	umlal	r5, r6, r7, r8			@ d += a[2] * b[7]
	ldr	r14, [r2, #6*4]			@ b[6]
	umlal	r9, r10, r0, r8			@ d' += a[3] * b[7]
	ldr	r7, [r1, #4*4]			@ a[4]
	umlal	r5, r6, r0, r14			@ d += a[3] * b[6]
	ldr	r8, [r2, #5*4]			@ b[5]
	umlal	r9, r10, r7, r14		@ d' += a[4] * b[6]
	ldr	r0, [r1, #5*4]			@ a[5]
	umlal	r5, r6, r7, r8			@ d += a[4] * b[5]
	ldr	r14, [r2, #4*4]			@ b[4]
	umlal	r9, r10, r0, r8			@ d' += a[5] * b[5]
	ldr	r7, [r1, #6*4]			@ a[6]
	umlal	r5, r6, r0, r14			@ d += a[5] * b[4]
	ldr	r8, [r2, #3*4]			@ b[3]
	umlal	r9, r10, r7, r14		@ d' += a[6] * b[4]
	ldr	r0, [r1, #7*4]			@ a[7]
	umlal	r5, r6, r7, r8			@ d += a[6] * b[3]
	ldr	r14, [r2, #2*4]			@ b[2]
	umlal	r9, r10, r0, r8			@ d' += a[7] * b[3]
	ldr	r7, [r1, #8*4]			@ a[8]
	umlal	r5, r6, r0, r14			@ d += a[7] * b[2]
	ldr	r8, [r2, #1*4]			@ b[1]
	umlal	r9, r10, r7, r14		@ d' += a[8] * b[2]
	ldr	r0, [r1, #9*4]			@ a[9]
	umlal	r5, r6, r7, r8			@ d += a[8] * b[1]
	ldr	r14, [r2, #0*4]			@ b[0]
	umlal	r9, r10, r0, r8			@ d' += a[9] * b[1]
	ldr	r7, [r1, #0*4]			@ a[0]
	umlal	r5, r6, r0, r14			@ d += a[9] * b[0]
	@ r7,r14 used in B

	bic	r0, r5, field_not_M		@ t9 = d & M
	str	r0, [sp, #4 + 4*9]
	mov	r5, r5, lsr #26			@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26

	/* B */
	umull	r3, r4, r7, r14			@ c = a[0] * b[0]
	adds	r5, r5, r9			@ d += d'
	adc	r6, r6, r10

	bic	r0, r5, field_not_M		@ u0 = d & M
	mov	r5, r5, lsr #26			@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0			@ c += u0 * R0
	umlal	r3, r4, r0, r14

	bic	r14, r3, field_not_M		@ t0 = c & M
	str	r14, [sp, #4 + 0*4]
	mov	r3, r3, lsr #26			@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1			@ c += u0 * R1
	umlal	r3, r4, r0, r14

	/* C - interleaved with D */
	ldr	r7, [r1, #0*4]			@ a[0]
	ldr	r8, [r2, #2*4]			@ b[2]
	ldr	r14, [r2, #1*4]			@ b[1]
	umull	r11, r12, r7, r8		@ c' = a[0] * b[2]
	ldr	r0, [r1, #1*4]			@ a[1]
	umlal	r3, r4, r7, r14			@ c += a[0] * b[1]
	ldr	r8, [r2, #0*4]			@ b[0]
	umlal	r11, r12, r0, r14		@ c' += a[1] * b[1]
	ldr	r7, [r1, #2*4]			@ a[2]
	umlal	r3, r4, r0, r8			@ c += a[1] * b[0]
	ldr	r14, [r2, #9*4]			@ b[9]
	umlal	r11, r12, r7, r8		@ c' += a[2] * b[0]
	ldr	r0, [r1, #3*4]			@ a[3]
	umlal	r5, r6, r7, r14			@ d += a[2] * b[9]
	ldr	r8, [r2, #8*4]			@ b[8]
	umull	r9, r10, r0, r14		@ d' = a[3] * b[9]
	ldr	r7, [r1, #4*4]			@ a[4]
	umlal	r5, r6, r0, r8			@ d += a[3] * b[8]
	ldr	r14, [r2, #7*4]			@ b[7]
	umlal	r9, r10, r7, r8			@ d' += a[4] * b[8]
	ldr	r0, [r1, #5*4]			@ a[5]
	umlal	r5, r6, r7, r14			@ d += a[4] * b[7]
	ldr	r8, [r2, #6*4]			@ b[6]
	umlal	r9, r10, r0, r14		@ d' += a[5] * b[7]
	ldr	r7, [r1, #6*4]			@ a[6]
	umlal	r5, r6, r0, r8			@ d += a[5] * b[6]
	ldr	r14, [r2, #5*4]			@ b[5]
	umlal	r9, r10, r7, r8			@ d' += a[6] * b[6]
	ldr	r0, [r1, #7*4]			@ a[7]
	umlal	r5, r6, r7, r14			@ d += a[6] * b[5]
	ldr	r8, [r2, #4*4]			@ b[4]
	umlal	r9, r10, r0, r14		@ d' += a[7] * b[5]
	ldr	r7, [r1, #8*4]			@ a[8]
	umlal	r5, r6, r0, r8			@ d += a[7] * b[4]
	ldr	r14, [r2, #3*4]			@ b[3]
	umlal	r9, r10, r7, r8			@ d' += a[8] * b[4]
	ldr	r0, [r1, #9*4]			@ a[9]
	umlal	r5, r6, r7, r14			@ d += a[8] * b[3]
	ldr	r8, [r2, #2*4]			@ b[2]
	umlal	r9, r10, r0, r14		@ d' += a[9] * b[3]
	umlal	r5, r6, r0, r8			@ d += a[9] * b[2]

	bic	r0, r5, field_not_M		@ u1 = d & M
	mov	r5, r5, lsr #26			@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0			@ c += u1 * R0
	umlal	r3, r4, r0, r14

	bic	r14, r3, field_not_M		@ t1 = c & M
	str	r14, [sp, #4 + 1*4]
	mov	r3, r3, lsr #26			@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1			@ c += u1 * R1
	umlal	r3, r4, r0, r14

	/* D */
	adds	r3, r3, r11			@ c += c'
	adc	r4, r4, r12
	adds	r5, r5, r9			@ d += d'
	adc	r6, r6, r10

	bic	r0, r5, field_not_M		@ u2 = d & M
	mov	r5, r5, lsr #26			@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0			@ c += u2 * R0
	umlal	r3, r4, r0, r14

	bic	r14, r3, field_not_M		@ t2 = c & M
	str	r14, [sp, #4 + 2*4]
	mov	r3, r3, lsr #26			@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1			@ c += u2 * R1
	umlal	r3, r4, r0, r14

	/* E - interleaved with F */
	ldr	r7, [r1, #0*4]			@ a[0]
	ldr	r8, [r2, #4*4]			@ b[4]
	umull	r11, r12, r7, r8		@ c' = a[0] * b[4]
	ldr	r8, [r2, #3*4]			@ b[3]
	umlal	r3, r4, r7, r8			@ c += a[0] * b[3]
	ldr	r7, [r1, #1*4]			@ a[1]
	umlal	r11, r12, r7, r8		@ c' += a[1] * b[3]
	ldr	r8, [r2, #2*4]			@ b[2]
	umlal	r3, r4, r7, r8			@ c += a[1] * b[2]
	ldr	r7, [r1, #2*4]			@ a[2]
	umlal	r11, r12, r7, r8		@ c' += a[2] * b[2]
	ldr	r8, [r2, #1*4]			@ b[1]
	umlal	r3, r4, r7, r8			@ c += a[2] * b[1]
	ldr	r7, [r1, #3*4]			@ a[3]
	umlal	r11, r12, r7, r8		@ c' += a[3] * b[1]
	ldr	r8, [r2, #0*4]			@ b[0]
	umlal	r3, r4, r7, r8			@ c += a[3] * b[0]
	ldr	r7, [r1, #4*4]			@ a[4]
	umlal	r11, r12, r7, r8		@ c' += a[4] * b[0]
	ldr	r8, [r2, #9*4]			@ b[9]
	umlal	r5, r6, r7, r8			@ d += a[4] * b[9]
	ldr	r7, [r1, #5*4]			@ a[5]
	umull	r9, r10, r7, r8			@ d' = a[5] * b[9]
	ldr	r8, [r2, #8*4]			@ b[8]
	umlal	r5, r6, r7, r8			@ d += a[5] * b[8]
	ldr	r7, [r1, #6*4]			@ a[6]
	umlal	r9, r10, r7, r8			@ d' += a[6] * b[8]
	ldr	r8, [r2, #7*4]			@ b[7]
	umlal	r5, r6, r7, r8			@ d += a[6] * b[7]
	ldr	r7, [r1, #7*4]			@ a[7]
	umlal	r9, r10, r7, r8			@ d' += a[7] * b[7]
	ldr	r8, [r2, #6*4]			@ b[6]
	umlal	r5, r6, r7, r8			@ d += a[7] * b[6]
	ldr	r7, [r1, #8*4]			@ a[8]
	umlal	r9, r10, r7, r8			@ d' += a[8] * b[6]
	ldr	r8, [r2, #5*4]			@ b[5]
	umlal	r5, r6, r7, r8			@ d += a[8] * b[5]
	ldr	r7, [r1, #9*4]			@ a[9]
	umlal	r9, r10, r7, r8			@ d' += a[9] * b[5]
	ldr	r8, [r2, #4*4]			@ b[4]
	umlal	r5, r6, r7, r8			@ d += a[9] * b[4]

	bic	r0, r5, field_not_M		@ u3 = d & M
	mov	r5, r5, lsr #26			@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0			@ c += u3 * R0
	umlal	r3, r4, r0, r14

	bic	r14, r3, field_not_M		@ t3 = c & M
	str	r14, [sp, #4 + 3*4]
	mov	r3, r3, lsr #26			@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1			@ c += u3 * R1
	umlal	r3, r4, r0, r14

	/* F */
	adds	r3, r3, r11			@ c += c'
	adc	r4, r4, r12
	adds	r5, r5, r9			@ d += d'
	adc	r6, r6, r10

	bic	r0, r5, field_not_M		@ u4 = d & M
	mov	r5, r5, lsr #26			@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0			@ c += u4 * R0
	umlal	r3, r4, r0, r14

	bic	r14, r3, field_not_M		@ t4 = c & M
	str	r14, [sp, #4 + 4*4]
	mov	r3, r3, lsr #26			@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1			@ c += u4 * R1
	umlal	r3, r4, r0, r14

	/* G - interleaved with H */
	ldr	r7, [r1, #0*4]			@ a[0]
	ldr	r8, [r2, #6*4]			@ b[6]
	ldr	r14, [r2, #5*4]			@ b[5]
	umull	r11, r12, r7, r8		@ c' = a[0] * b[6]
	ldr	r0, [r1, #1*4]			@ a[1]
	umlal	r3, r4, r7, r14			@ c += a[0] * b[5]
	ldr	r8, [r2, #4*4]			@ b[4]
	umlal	r11, r12, r0, r14		@ c' += a[1] * b[5]
	ldr	r7, [r1, #2*4]			@ a[2]
	umlal	r3, r4, r0, r8			@ c += a[1] * b[4]
	ldr	r14, [r2, #3*4]			@ b[3]
	umlal	r11, r12, r7, r8		@ c' += a[2] * b[4]
	ldr	r0, [r1, #3*4]			@ a[3]
	umlal	r3, r4, r7, r14			@ c += a[2] * b[3]
	ldr	r8, [r2, #2*4]			@ b[2]
	umlal	r11, r12, r0, r14		@ c' += a[3] * b[3]
	ldr	r7, [r1, #4*4]			@ a[4]
	umlal	r3, r4, r0, r8			@ c += a[3] * b[2]
	ldr	r14, [r2, #1*4]			@ b[1]
	umlal	r11, r12, r7, r8		@ c' += a[4] * b[2]
	ldr	r0, [r1, #5*4]			@ a[5]
	umlal	r3, r4, r7, r14			@ c += a[4] * b[1]
	ldr	r8, [r2, #0*4]			@ b[0]
	umlal	r11, r12, r0, r14		@ c' += a[5] * b[1]
	ldr	r7, [r1, #6*4]			@ a[6]
	umlal	r3, r4, r0, r8			@ c += a[5] * b[0]
	ldr	r14, [r2, #9*4]			@ b[9]
	umlal	r11, r12, r7, r8		@ c' += a[6] * b[0]
	ldr	r0, [r1, #7*4]			@ a[7]
	umlal	r5, r6, r7, r14			@ d += a[6] * b[9]
	ldr	r8, [r2, #8*4]			@ b[8]
	umull	r9, r10, r0, r14		@ d' = a[7] * b[9]
	ldr	r7, [r1, #8*4]			@ a[8]
	umlal	r5, r6, r0, r8			@ d += a[7] * b[8]
	ldr	r14, [r2, #7*4]			@ b[7]
	umlal	r9, r10, r7, r8			@ d' += a[8] * b[8]
	ldr	r0, [r1, #9*4]			@ a[9]
	umlal	r5, r6, r7, r14			@ d += a[8] * b[7]
	ldr	r8, [r2, #6*4]			@ b[6]
	umlal	r9, r10, r0, r14		@ d' += a[9] * b[7]
	umlal	r5, r6, r0, r8			@ d += a[9] * b[6]

	bic	r0, r5, field_not_M		@ u5 = d & M
	mov	r5, r5, lsr #26			@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0			@ c += u5 * R0
	umlal	r3, r4, r0, r14

	bic	r14, r3, field_not_M		@ t5 = c & M
	str	r14, [sp, #4 + 5*4]
	mov	r3, r3, lsr #26			@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1			@ c += u5 * R1
	umlal	r3, r4, r0, r14

	/* H */
	adds	r3, r3, r11			@ c += c'
	adc	r4, r4, r12
	adds	r5, r5, r9			@ d += d'
	adc	r6, r6, r10

	bic	r0, r5, field_not_M		@ u6 = d & M
	mov	r5, r5, lsr #26			@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0			@ c += u6 * R0
	umlal	r3, r4, r0, r14

	bic	r14, r3, field_not_M		@ t6 = c & M
	str	r14, [sp, #4 + 6*4]
	mov	r3, r3, lsr #26			@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1			@ c += u6 * R1
	umlal	r3, r4, r0, r14

	/* I - interleaved with J */
	ldr	r8, [r2, #8*4]			@ b[8]
	ldr	r7, [r1, #0*4]			@ a[0]
	ldr	r14, [r2, #7*4]			@ b[7]
	umull	r11, r12, r7, r8		@ c' = a[0] * b[8]
	ldr	r0, [r1, #1*4]			@ a[1]
	umlal	r3, r4, r7, r14			@ c += a[0] * b[7]
	ldr	r8, [r2, #6*4]			@ b[6]
	umlal	r11, r12, r0, r14		@ c' += a[1] * b[7]
	ldr	r7, [r1, #2*4]			@ a[2]
	umlal	r3, r4, r0, r8			@ c += a[1] * b[6]
	ldr	r14, [r2, #5*4]			@ b[5]
	umlal	r11, r12, r7, r8		@ c' += a[2] * b[6]
	ldr	r0, [r1, #3*4]			@ a[3]
	umlal	r3, r4, r7, r14			@ c += a[2] * b[5]
	ldr	r8, [r2, #4*4]			@ b[4]
	umlal	r11, r12, r0, r14		@ c' += a[3] * b[5]
	ldr	r7, [r1, #4*4]			@ a[4]
	umlal	r3, r4, r0, r8			@ c += a[3] * b[4]
	ldr	r14, [r2, #3*4]			@ b[3]
	umlal	r11, r12, r7, r8		@ c' += a[4] * b[4]
	ldr	r0, [r1, #5*4]			@ a[5]
	umlal	r3, r4, r7, r14			@ c += a[4] * b[3]
	ldr	r8, [r2, #2*4]			@ b[2]
	umlal	r11, r12, r0, r14		@ c' += a[5] * b[3]
	ldr	r7, [r1, #6*4]			@ a[6]
	umlal	r3, r4, r0, r8			@ c += a[5] * b[2]
	ldr	r14, [r2, #1*4]			@ b[1]
	umlal	r11, r12, r7, r8		@ c' += a[6] * b[2]
	ldr	r0, [r1, #7*4]			@ a[7]
	umlal	r3, r4, r7, r14			@ c += a[6] * b[1]
	ldr	r8, [r2, #0*4]			@ b[0]
	umlal	r11, r12, r0, r14		@ c' += a[7] * b[1]
	ldr	r7, [r1, #8*4]			@ a[8]
	umlal	r3, r4, r0, r8			@ c += a[7] * b[0]
	ldr	r14, [r2, #9*4]			@ b[9]
	umlal	r11, r12, r7, r8		@ c' += a[8] * b[0]
	ldr	r0, [r1, #9*4]			@ a[9]
	umlal	r5, r6, r7, r14			@ d += a[8] * b[9]
	ldr	r8, [r2, #8*4]			@ b[8]
	umull	r9, r10, r0, r14		@ d' = a[9] * b[9]
	umlal	r5, r6, r0, r8			@ d += a[9] * b[8]

	bic	r0, r5, field_not_M		@ u7 = d & M
	mov	r5, r5, lsr #26			@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0			@ c += u7 * R0
	umlal	r3, r4, r0, r14

	bic	r14, r3, field_not_M		@ t7 = c & M
	str	r14, [sp, #4 + 7*4]
	mov	r3, r3, lsr #26			@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1			@ c += u7 * R1
	umlal	r3, r4, r0, r14

	/* J */
	adds	r3, r3, r11			@ c += c'
	adc	r4, r4, r12
	adds	r5, r5, r9			@ d += d'
	adc	r6, r6, r10

	bic	r0, r5, field_not_M		@ u8 = d & M
	str	r0, [sp, #4 + 8*4]
	mov	r5, r5, lsr #26			@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0			@ c += u8 * R0
	umlal	r3, r4, r0, r14

	/******************************************
	 * compute and write back result
	 ******************************************
	Allocation:
	    r0    r
	    r3:r4 c
	    r5:r6 d
	    r7    t0
	    r8    t1
	    r9    t2
	    r11   u8
	    r12   t9
	    r1,r2,r10,r14 scratch

	Note: do not read from a[] after here, it may overlap with r[]
	*/
	ldr	r0, [sp, #0]
	add	r1, sp, #4 + 3*4		@ r[3..7] = t3..7, r11=u8, r12=t9
	ldmia	r1, {r2,r7,r8,r9,r10,r11,r12}
	add	r1, r0, #3*4
	stmia	r1, {r2,r7,r8,r9,r10}

	bic	r2, r3, field_not_M		@ r[8] = c & M
	str	r2, [r0, #8*4]
	mov	r3, r3, lsr #26			@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1			@ c += u8 * R1
	umlal	r3, r4, r11, r14
	movw	r14, field_R0			@ c += d * R0
	umlal	r3, r4, r5, r14
	adds	r3, r3, r12			@ c += t9
	adc	r4, r4, #0

	add	r1, sp, #4 + 0*4		@ r7,r8,r9 = t0,t1,t2
	ldmia	r1, {r7,r8,r9}

	ubfx	r2, r3, #0, #22			@ r[9] = c & (M >> 4)
	str	r2, [r0, #9*4]
	mov	r3, r3, lsr #22			@ c >>= 22
	orr	r3, r3, r4, asl #10
	mov	r4, r4, lsr #22
	movw	r14, field_R1 << 4		@ c += d * (R1 << 4)
	umlal	r3, r4, r5, r14

	movw	r14, field_R0 >> 4		@ d = c * (R0 >> 4) + t0 (64x64 multiply+add)
	umull	r5, r6, r3, r14			@ d = c.lo * (R0 >> 4)
	adds	r5, r5, r7			@ d.lo += t0
	mla	r6, r14, r4, r6			@ d.hi += c.hi * (R0 >> 4); mla leaves the carry from adds intact
	adc	r6, r6, #0			@ d.hi += carry

	bic	r2, r5, field_not_M		@ r[0] = d & M
	str	r2, [r0, #0*4]

	mov	r5, r5, lsr #26			@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26

	movw	r14, field_R1 >> 4		@ d += c * (R1 >> 4) + t1 (64x64 multiply+add)
	umull	r1, r2, r3, r14			@ tmp = c.lo * (R1 >> 4)
	adds	r5, r5, r8			@ d.lo += t1
	adc	r6, r6, #0			@ d.hi += carry
	adds	r5, r5, r1			@ d.lo += tmp.lo
	mla	r2, r14, r4, r2			@ tmp.hi += c.hi * (R1 >> 4)
	adc	r6, r6, r2			@ d.hi += carry + tmp.hi

	bic	r2, r5, field_not_M		@ r[1] = d & M
	str	r2, [r0, #1*4]
	mov	r5, r5, lsr #26			@ d >>= 26 (ignore hi)
	orr	r5, r5, r6, asl #6

	add	r5, r5, r9			@ d += t2
	str	r5, [r0, #2*4]			@ r[2] = d

	add	sp, sp, #48
	ldmfd	sp!, {r4, r5, r6, r7, r8, r9, r10, r11, pc}
	.size	vet_secp256k1_fe_mul_inner, .-vet_secp256k1_fe_mul_inner

	.align	2
	.global vet_secp256k1_fe_sqr_inner
	.type	vet_secp256k1_fe_sqr_inner, %function
	@ vet_secp256k1_fe_sqr_inner: square a field element held as ten 32-bit
	@ limbs (26 significant bits per limb, per the mask M above) and store
	@ the ten-limb result. Same reduction steps as the multiply above, but
	@ symmetric cross terms are computed once using a doubled limb.
	@ NOTE(review): required magnitude/range of the input limbs is not
	@ visible in this file - confirm against the C caller.
	@
	@ Arguments:
	@  r0  r	 Can overlap with a
	@  r1  a
	@ Registers: r4-r11 and lr are saved and restored; r0-r3, r12 and the
	@ flags are clobbered.
	@ Stack (total 4+10*4 = 44)
	@  sp + #0        saved 'r' pointer
	@  sp + #4 + 4*X  t0,t1,t2,t3,t4,t5,t6,t7,u8,t9
vet_secp256k1_fe_sqr_inner:
	stmfd	sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r14}
	sub	sp, sp, #48			@ frame=44 + alignment
	str	r0, [sp, #0]			@ save result address, we need it only at the end
	/******************************************
	 * Main computation code.
	 ******************************************

	Allocation:
	    r0,r14,r2,r7,r8   scratch
	    r1       a (pointer)
	    r3:r4    c
	    r5:r6    d
	    r11:r12  c'
	    r9:r10   d'

	Note: do not write to r[] here, it may overlap with a[]
	*/
	/* A interleaved with B */
	ldr	r0, [r1, #1*4]			@ a[1]*2
	ldr	r7, [r1, #0*4]			@ a[0]
	mov	r0, r0, asl #1
	ldr	r14, [r1, #9*4]			@ a[9]
	umull	r3, r4, r7, r7			@ c = a[0] * a[0]
	ldr	r8, [r1, #8*4]			@ a[8]
	mov	r7, r7, asl #1
	umull	r5, r6, r7, r14			@ d = a[0]*2 * a[9]
	ldr	r7, [r1, #2*4]			@ a[2]*2
	umull	r9, r10, r0, r14		@ d' = a[1]*2 * a[9]
	ldr	r14, [r1, #7*4]			@ a[7]
	umlal	r5, r6, r0, r8			@ d += a[1]*2 * a[8]
	mov	r7, r7, asl #1
	ldr	r0, [r1, #3*4]			@ a[3]*2
	umlal	r9, r10, r7, r8			@ d' += a[2]*2 * a[8]
	ldr	r8, [r1, #6*4]			@ a[6]
	umlal	r5, r6, r7, r14			@ d += a[2]*2 * a[7]
	mov	r0, r0, asl #1
	ldr	r7, [r1, #4*4]			@ a[4]*2
	umlal	r9, r10, r0, r14		@ d' += a[3]*2 * a[7]
	ldr	r14, [r1, #5*4]			@ a[5]
	mov	r7, r7, asl #1
	umlal	r5, r6, r0, r8			@ d += a[3]*2 * a[6]
	umlal	r9, r10, r7, r8			@ d' += a[4]*2 * a[6]
	umlal	r5, r6, r7, r14			@ d += a[4]*2 * a[5]
	umlal	r9, r10, r14, r14		@ d' += a[5] * a[5]

	bic	r0, r5, field_not_M		@ t9 = d & M
	str	r0, [sp, #4 + 9*4]
	mov	r5, r5, lsr #26			@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26

	/* B */
	adds	r5, r5, r9			@ d += d'
	adc	r6, r6, r10

	bic	r0, r5, field_not_M		@ u0 = d & M
	mov	r5, r5, lsr #26			@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0			@ c += u0 * R0
	umlal	r3, r4, r0, r14
	bic	r14, r3, field_not_M		@ t0 = c & M
	str	r14, [sp, #4 + 0*4]
	mov	r3, r3, lsr #26			@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1			@ c += u0 * R1
	umlal	r3, r4, r0, r14

	/* C interleaved with D */
	ldr	r0, [r1, #0*4]			@ a[0]*2
	ldr	r14, [r1, #1*4]			@ a[1]
	mov	r0, r0, asl #1
	ldr	r8, [r1, #2*4]			@ a[2]
	umlal	r3, r4, r0, r14			@ c += a[0]*2 * a[1]
	mov	r7, r8, asl #1			@ a[2]*2
	umull	r11, r12, r14, r14		@ c' = a[1] * a[1]
	ldr	r14, [r1, #9*4]			@ a[9]
	umlal	r11, r12, r0, r8		@ c' += a[0]*2 * a[2]
	ldr	r0, [r1, #3*4]			@ a[3]*2
	ldr	r8, [r1, #8*4]			@ a[8]
	umlal	r5, r6, r7, r14			@ d += a[2]*2 * a[9]
	mov	r0, r0, asl #1
	ldr	r7, [r1, #4*4]			@ a[4]*2
	umull	r9, r10, r0, r14		@ d' = a[3]*2 * a[9]
	ldr	r14, [r1, #7*4]			@ a[7]
	umlal	r5, r6, r0, r8			@ d += a[3]*2 * a[8]
	mov	r7, r7, asl #1
	ldr	r0, [r1, #5*4]			@ a[5]*2
	umlal	r9, r10, r7, r8			@ d' += a[4]*2 * a[8]
	ldr	r8, [r1, #6*4]			@ a[6]
	mov	r0, r0, asl #1
	umlal	r5, r6, r7, r14			@ d += a[4]*2 * a[7]
	umlal	r9, r10, r0, r14		@ d' += a[5]*2 * a[7]
	umlal	r5, r6, r0, r8			@ d += a[5]*2 * a[6]
	umlal	r9, r10, r8, r8			@ d' += a[6] * a[6]

	bic	r0, r5, field_not_M		@ u1 = d & M
	mov	r5, r5, lsr #26			@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0			@ c += u1 * R0
	umlal	r3, r4, r0, r14
	bic	r14, r3, field_not_M		@ t1 = c & M
	str	r14, [sp, #4 + 1*4]
	mov	r3, r3, lsr #26			@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1			@ c += u1 * R1
	umlal	r3, r4, r0, r14

	/* D */
	adds	r3, r3, r11			@ c += c'
	adc	r4, r4, r12
	adds	r5, r5, r9			@ d += d'
	adc	r6, r6, r10

	bic	r0, r5, field_not_M		@ u2 = d & M
	mov	r5, r5, lsr #26			@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0			@ c += u2 * R0
	umlal	r3, r4, r0, r14
	bic	r14, r3, field_not_M		@ t2 = c & M
	str	r14, [sp, #4 + 2*4]
	mov	r3, r3, lsr #26			@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1			@ c += u2 * R1
	umlal	r3, r4, r0, r14

	/* E interleaved with F */
	ldr	r7, [r1, #0*4]			@ a[0]*2
	ldr	r0, [r1, #1*4]			@ a[1]*2
	ldr	r14, [r1, #2*4]			@ a[2]
	mov	r7, r7, asl #1
	ldr	r8, [r1, #3*4]			@ a[3]
	ldr	r2, [r1, #4*4]			@ a[4] (doubled further down, after c' uses it undoubled)
	umlal	r3, r4, r7, r8			@ c += a[0]*2 * a[3]
	mov	r0, r0, asl #1
	umull	r11, r12, r7, r2		@ c' = a[0]*2 * a[4]
	mov	r2, r2, asl #1			@ a[4]*2
	umlal	r11, r12, r0, r8		@ c' += a[1]*2 * a[3]
	ldr	r8, [r1, #9*4]			@ a[9]
	umlal	r3, r4, r0, r14			@ c += a[1]*2 * a[2]
	ldr	r0, [r1, #5*4]			@ a[5]*2
	umlal	r11, r12, r14, r14		@ c' += a[2] * a[2]
	ldr	r14, [r1, #8*4]			@ a[8]
	mov	r0, r0, asl #1
	umlal	r5, r6, r2, r8			@ d += a[4]*2 * a[9]
	ldr	r7, [r1, #6*4]			@ a[6]*2
	umull	r9, r10, r0, r8			@ d' = a[5]*2 * a[9]
	mov	r7, r7, asl #1
	ldr	r8, [r1, #7*4]			@ a[7]
	umlal	r5, r6, r0, r14			@ d += a[5]*2 * a[8]
	umlal	r9, r10, r7, r14		@ d' += a[6]*2 * a[8]
	umlal	r5, r6, r7, r8			@ d += a[6]*2 * a[7]
	umlal	r9, r10, r8, r8			@ d' += a[7] * a[7]

	bic	r0, r5, field_not_M		@ u3 = d & M
	mov	r5, r5, lsr #26			@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0			@ c += u3 * R0
	umlal	r3, r4, r0, r14
	bic	r14, r3, field_not_M		@ t3 = c & M
	str	r14, [sp, #4 + 3*4]
	mov	r3, r3, lsr #26			@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1			@ c += u3 * R1
	umlal	r3, r4, r0, r14

	/* F */
	adds	r3, r3, r11			@ c += c'
	adc	r4, r4, r12
	adds	r5, r5, r9			@ d += d'
	adc	r6, r6, r10

	bic	r0, r5, field_not_M		@ u4 = d & M
	mov	r5, r5, lsr #26			@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0			@ c += u4 * R0
	umlal	r3, r4, r0, r14
	bic	r14, r3, field_not_M		@ t4 = c & M
	str	r14, [sp, #4 + 4*4]
	mov	r3, r3, lsr #26			@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1			@ c += u4 * R1
	umlal	r3, r4, r0, r14

	/* G interleaved with H */
	ldr	r7, [r1, #0*4]			@ a[0]*2
	ldr	r0, [r1, #1*4]			@ a[1]*2
	mov	r7, r7, asl #1
	ldr	r8, [r1, #5*4]			@ a[5]
	ldr	r2, [r1, #6*4]			@ a[6]
	umlal	r3, r4, r7, r8			@ c += a[0]*2 * a[5]
	ldr	r14, [r1, #4*4]			@ a[4]
	mov	r0, r0, asl #1
	umull	r11, r12, r7, r2		@ c' = a[0]*2 * a[6]
	ldr	r7, [r1, #2*4]			@ a[2]*2
	umlal	r11, r12, r0, r8		@ c' += a[1]*2 * a[5]
	mov	r7, r7, asl #1
	ldr	r8, [r1, #3*4]			@ a[3]
	umlal	r3, r4, r0, r14			@ c += a[1]*2 * a[4]
	mov	r0, r2, asl #1			@ a[6]*2
	umlal	r11, r12, r7, r14		@ c' += a[2]*2 * a[4]
	ldr	r14, [r1, #9*4]			@ a[9]
	umlal	r3, r4, r7, r8			@ c += a[2]*2 * a[3]
	ldr	r7, [r1, #7*4]			@ a[7]*2
	umlal	r11, r12, r8, r8		@ c' += a[3] * a[3]
	mov	r7, r7, asl #1
	ldr	r8, [r1, #8*4]			@ a[8]
	umlal	r5, r6, r0, r14			@ d += a[6]*2 * a[9]
	umull	r9, r10, r7, r14		@ d' = a[7]*2 * a[9]
	umlal	r5, r6, r7, r8			@ d += a[7]*2 * a[8]
	umlal	r9, r10, r8, r8			@ d' += a[8] * a[8]

	bic	r0, r5, field_not_M		@ u5 = d & M
	mov	r5, r5, lsr #26			@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0			@ c += u5 * R0
	umlal	r3, r4, r0, r14
	bic	r14, r3, field_not_M		@ t5 = c & M
	str	r14, [sp, #4 + 5*4]
	mov	r3, r3, lsr #26			@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1			@ c += u5 * R1
	umlal	r3, r4, r0, r14

	/* H */
	adds	r3, r3, r11			@ c += c'
	adc	r4, r4, r12
	adds	r5, r5, r9			@ d += d'
	adc	r6, r6, r10

	bic	r0, r5, field_not_M		@ u6 = d & M
	mov	r5, r5, lsr #26			@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0			@ c += u6 * R0
	umlal	r3, r4, r0, r14
	bic	r14, r3, field_not_M		@ t6 = c & M
	str	r14, [sp, #4 + 6*4]
	mov	r3, r3, lsr #26			@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1			@ c += u6 * R1
	umlal	r3, r4, r0, r14

	/* I interleaved with J */
	ldr	r7, [r1, #0*4]			@ a[0]*2
	ldr	r0, [r1, #1*4]			@ a[1]*2
	mov	r7, r7, asl #1
	ldr	r8, [r1, #7*4]			@ a[7]
	ldr	r2, [r1, #8*4]			@ a[8]
	umlal	r3, r4, r7, r8			@ c += a[0]*2 * a[7]
	ldr	r14, [r1, #6*4]			@ a[6]
	mov	r0, r0, asl #1
	umull	r11, r12, r7, r2		@ c' = a[0]*2 * a[8]
	ldr	r7, [r1, #2*4]			@ a[2]*2
	umlal	r11, r12, r0, r8		@ c' += a[1]*2 * a[7]
	ldr	r8, [r1, #5*4]			@ a[5]
	umlal	r3, r4, r0, r14			@ c += a[1]*2 * a[6]
	ldr	r0, [r1, #3*4]			@ a[3]*2
	mov	r7, r7, asl #1
	umlal	r11, r12, r7, r14		@ c' += a[2]*2 * a[6]
	ldr	r14, [r1, #4*4]			@ a[4]
	mov	r0, r0, asl #1
	umlal	r3, r4, r7, r8			@ c += a[2]*2 * a[5]
	mov	r2, r2, asl #1			@ a[8]*2
	umlal	r11, r12, r0, r8		@ c' += a[3]*2 * a[5]
	umlal	r3, r4, r0, r14			@ c += a[3]*2 * a[4]
	umlal	r11, r12, r14, r14		@ c' += a[4] * a[4]
	ldr	r8, [r1, #9*4]			@ a[9]
	umlal	r5, r6, r2, r8			@ d += a[8]*2 * a[9]
	@ r8 will be used in J

	bic	r0, r5, field_not_M		@ u7 = d & M
	mov	r5, r5, lsr #26			@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0			@ c += u7 * R0
	umlal	r3, r4, r0, r14
	bic	r14, r3, field_not_M		@ t7 = c & M
	str	r14, [sp, #4 + 7*4]
	mov	r3, r3, lsr #26			@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1			@ c += u7 * R1
	umlal	r3, r4, r0, r14

	/* J */
	adds	r3, r3, r11			@ c += c'
	adc	r4, r4, r12
	umlal	r5, r6, r8, r8			@ d += a[9] * a[9]

	bic	r0, r5, field_not_M		@ u8 = d & M
	str	r0, [sp, #4 + 8*4]
	mov	r5, r5, lsr #26			@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26
	movw	r14, field_R0			@ c += u8 * R0
	umlal	r3, r4, r0, r14

	/******************************************
	 * compute and write back result
	 ******************************************
	Allocation:
	    r0    r
	    r3:r4 c
	    r5:r6 d
	    r7    t0
	    r8    t1
	    r9    t2
	    r11   u8
	    r12   t9
	    r1,r2,r10,r14 scratch

	Note: do not read from a[] after here, it may overlap with r[]
	*/
	ldr	r0, [sp, #0]
	add	r1, sp, #4 + 3*4		@ r[3..7] = t3..7, r11=u8, r12=t9
	ldmia	r1, {r2,r7,r8,r9,r10,r11,r12}
	add	r1, r0, #3*4
	stmia	r1, {r2,r7,r8,r9,r10}

	bic	r2, r3, field_not_M		@ r[8] = c & M
	str	r2, [r0, #8*4]
	mov	r3, r3, lsr #26			@ c >>= 26
	orr	r3, r3, r4, asl #6
	mov	r4, r4, lsr #26
	mov	r14, field_R1			@ c += u8 * R1
	umlal	r3, r4, r11, r14
	movw	r14, field_R0			@ c += d * R0
	umlal	r3, r4, r5, r14
	adds	r3, r3, r12			@ c += t9
	adc	r4, r4, #0

	add	r1, sp, #4 + 0*4		@ r7,r8,r9 = t0,t1,t2
	ldmia	r1, {r7,r8,r9}

	ubfx	r2, r3, #0, #22			@ r[9] = c & (M >> 4)
	str	r2, [r0, #9*4]
	mov	r3, r3, lsr #22			@ c >>= 22
	orr	r3, r3, r4, asl #10
	mov	r4, r4, lsr #22
	movw	r14, field_R1 << 4		@ c += d * (R1 << 4)
	umlal	r3, r4, r5, r14

	movw	r14, field_R0 >> 4		@ d = c * (R0 >> 4) + t0 (64x64 multiply+add)
	umull	r5, r6, r3, r14			@ d = c.lo * (R0 >> 4)
	adds	r5, r5, r7			@ d.lo += t0
	mla	r6, r14, r4, r6			@ d.hi += c.hi * (R0 >> 4); mla leaves the carry from adds intact
	adc	r6, r6, #0			@ d.hi += carry

	bic	r2, r5, field_not_M		@ r[0] = d & M
	str	r2, [r0, #0*4]

	mov	r5, r5, lsr #26			@ d >>= 26
	orr	r5, r5, r6, asl #6
	mov	r6, r6, lsr #26

	movw	r14, field_R1 >> 4		@ d += c * (R1 >> 4) + t1 (64x64 multiply+add)
	umull	r1, r2, r3, r14			@ tmp = c.lo * (R1 >> 4)
	adds	r5, r5, r8			@ d.lo += t1
	adc	r6, r6, #0			@ d.hi += carry
	adds	r5, r5, r1			@ d.lo += tmp.lo
	mla	r2, r14, r4, r2			@ tmp.hi += c.hi * (R1 >> 4)
	adc	r6, r6, r2			@ d.hi += carry + tmp.hi

	bic	r2, r5, field_not_M		@ r[1] = d & M
	str	r2, [r0, #1*4]
	mov	r5, r5, lsr #26			@ d >>= 26 (ignore hi)
	orr	r5, r5, r6, asl #6

	add	r5, r5, r9			@ d += t2
	str	r5, [r0, #2*4]			@ r[2] = d

	add	sp, sp, #48
	ldmfd	sp!, {r4, r5, r6, r7, r8, r9, r10, r11, pc}
	.size	vet_secp256k1_fe_sqr_inner, .-vet_secp256k1_fe_sqr_inner