// +build arm64,!purego

#include "textflag.h"

// fp384Cmov: branch-free conditional move over six 64-bit words.
// For every word i in 0..5:  x[i] = (b != 0) ? y[i] : x[i]
//   R0 = x, R1 = y, R2 = b
// The flags set by the single CMP are reused by every CSEL below
// (LDP/STP do not modify them).
// NOTE(review): b is read with a 32-bit MOVW and no argument size is
// declared on the TEXT line - confirm against the Go prototype.
TEXT ·fp384Cmov(SB), NOSPLIT, $0
	MOVD x+0(FP), R0
	MOVD y+8(FP), R1
	MOVW b+16(FP), R2
	CMP  $0, R2

	// words 0 and 1
	LDP  0(R0), (R3, R5)
	LDP  0(R1), (R4, R6)
	CSEL NE, R4, R3, R7
	CSEL NE, R6, R5, R8
	STP  (R7, R8), 0(R0)

	// words 2 and 3
	LDP  16(R0), (R3, R5)
	LDP  16(R1), (R4, R6)
	CSEL NE, R4, R3, R7
	CSEL NE, R6, R5, R8
	STP  (R7, R8), 16(R0)

	// words 4 and 5
	LDP  32(R0), (R3, R5)
	LDP  32(R1), (R4, R6)
	CSEL NE, R4, R3, R7
	CSEL NE, R6, R5, R8
	STP  (R7, R8), 32(R0)
	RET

// fp384Neg: c = -a mod p
//   R0 = c, R1 = a
TEXT ·fp384Neg(SB), NOSPLIT, $0-16
	MOVD c+0(FP), R0
	MOVD a+8(FP), R1

	// p is loaded into R2-R7 and a into R8-R13;
	// the six-word difference p-a replaces a in R8-R13.
	LDP  ·p+0(SB), (R2, R3)
	LDP  0(R1), (R8, R9)
	SUBS R8, R2, R8
	SBCS R9, R3, R9
	LDP  ·p+16(SB), (R4, R5)
	LDP  16(R1), (R10, R11)
	SBCS R10, R4, R10
	SBCS R11, R5, R11
	LDP  ·p+32(SB), (R6, R7)
	LDP  32(R1), (R12, R13)
	SBCS R12, R6, R12
	SBC  R13, R7, R13

	// (p-a)-p goes to R2-R7; only the final borrow matters for the select.
	SUBS R2, R8, R2
	SBCS R3, R9, R3
	SBCS R4, R10, R4
	SBCS R5, R11, R5
	SBCS R6, R12, R6
	SBCS R7, R13, R7

	// Borrow (carry clear) means (p-a)-p < 0, which is the usual case:
	// keep p-a. Without a borrow (a = 0) keep (p-a)-p instead.
	// The selected words are written to c.
	CSEL CC, R8, R2, R2
	CSEL CC, R9, R3, R3
	STP  (R2, R3), 0(R0)
	CSEL CC, R10, R4, R4
	CSEL CC, R11, R5, R5
	STP  (R4, R5), 16(R0)
	CSEL CC, R12, R6, R6
	CSEL CC, R13, R7, R7
	STP  (R6, R7), 32(R0)

	RET

// fp384Add: c = a+b mod p
//   R0 = c, R1 = a, R2 = b
TEXT ·fp384Add(SB), NOSPLIT, $0-24
	MOVD c+0(FP), R0
	MOVD a+8(FP), R1
	MOVD b+16(FP), R2

	// a is loaded into R3-R8 and b into R9-R14.
	// The sum a+b ends up in R3-R8 with its carry-out word in R9.
	LDP  0(R1), (R3, R4)
	LDP  0(R2), (R9, R10)
	ADDS R9, R3
	ADCS R10, R4
	LDP  16(R1), (R5, R6)
	LDP  16(R2), (R11, R12)
	ADCS R11, R5
	ADCS R12, R6
	LDP  32(R1), (R7, R8)
	LDP  32(R2), (R13, R14)
	ADCS R13, R7
	ADCS R14, R8
	ADC  ZR, ZR, R9

	// p goes to R10-R15
	LDP ·p+0(SB), (R10, R11)
	LDP ·p+16(SB), (R12, R13)
	LDP ·p+32(SB), (R14, R15)

	// a+b-p goes to R10-R16 (R16 takes the seventh word)
	SUBS R10, R3, R10
	SBCS R11, R4, R11
	SBCS R12, R5, R12
	SBCS R13, R6, R13
	SBCS R14, R7, R14
	SBCS R15, R8, R15
	SBCS ZR, R9, R16

	// A final borrow (carry clear) means a+b-p is negative: keep a+b.
	// Otherwise keep a+b-p. The selected words are written to c.
	CSEL CC, R3, R10, R3
	CSEL CC, R4, R11, R4
	STP  (R3, R4), 0(R0)
	CSEL CC, R5, R12, R5
	CSEL CC, R6, R13, R6
	STP  (R5, R6), 16(R0)
	CSEL CC, R7, R14, R7
	CSEL CC, R8, R15, R8
	STP  (R7, R8), 32(R0)

	RET

// fp384Sub: c = a-b mod p
//   R0 = c, R1 = a, R2 = b
TEXT ·fp384Sub(SB), NOSPLIT, $0-24
	MOVD c+0(FP), R0
	MOVD a+8(FP), R1
	MOVD b+16(FP), R2

	// a is loaded into R3-R8 and b into R9-R14.
	// The difference a-b ends up in R3-R8; the trailing SBC turns the
	// final borrow into a mask in R9 (all ones on borrow, zero otherwise).
	LDP  0(R1), (R3, R4)
	LDP  0(R2), (R9, R10)
	SUBS R9, R3
	SBCS R10, R4
	LDP  16(R1), (R5, R6)
	LDP  16(R2), (R11, R12)
	SBCS R11, R5
	SBCS R12, R6
	LDP  32(R1), (R7, R8)
	LDP  32(R2), (R13, R14)
	SBCS R13, R7
	SBCS R14, R8
	SBC  ZR, ZR, R9

	// p goes to R10-R15 and is ANDed with the mask, so that p is added
	// back only when a-b was negative.
	LDP ·p+0(SB), (R10, R11)
	AND R9, R10
	LDP ·p+16(SB), (R12, R13)
	AND R9, R11
	AND R9, R12
	LDP ·p+32(SB), (R14, R15)
	AND R9, R13
	AND R9, R14
	AND R9, R15

	// (a-b) + (p & mask) goes to R3-R8 and is written to c
	ADDS R10, R3
	ADCS R11, R4
	STP  (R3, R4), 0(R0)
	ADCS R12, R5
	ADCS R13, R6
	STP  (R5, R6), 16(R0)
	ADCS R14, R7
	ADC  R15, R8
	STP  (R7, R8), 32(R0)

	RET

// mul192x192comba: 3x3-word product, accumulated column by column.
// Preconditions set up by the caller:
//   A0*B0 is already in C0 (low word) and C3 (high word)
//   A0*B1 is already in C1 (low word) and C2 (high word)
// C0 itself is never written by the macro.
// On exit C0-C5 hold (A0,A1,A2) * (B0,B1,B2).
// A0-A2 and B0-B2 are left unmodified; S0-S3 are scratch.
#define mul192x192comba(A0,A1,A2, B0,B1,B2, C0,C1,C2,C3,C4,C5, S0,S1,S2,S3) \
	MUL   A1, B0, S2 \
	UMULH A1, B0, S3 \
	\
	/* fold hi(A0*B0) into C1; from here on C3 is reused as an accumulator */ \
	ADDS C3, C1 \
	ADCS ZR, C2 \
	ADC  ZR, ZR, C3 \
	\
	MUL   A0, B2, S0 \
	UMULH A0, B2, S1 \
	\
	/* add A1*B0 */ \
	ADDS S2, C1 \
	ADCS S3, C2 \
	ADC  ZR, C3 \
	\
	MUL   A1, B1, S2 \
	UMULH A1, B1, S3 \
	\
	/* add A0*B2; C4 starts out as the carry */ \
	ADDS S0, C2 \
	ADCS S1, C3 \
	ADC  ZR, ZR, C4 \
	\
	MUL   A2, B0, S0 \
	UMULH A2, B0, S1 \
	\
	/* add A1*B1 */ \
	ADDS S2, C2 \
	ADCS S3, C3 \
	ADC  ZR, C4 \
	\
	MUL   A1, B2, S2 \
	UMULH A1, B2, S3 \
	\
	/* add A2*B0 */ \
	ADDS S0, C2 \
	ADCS S1, C3 \
	ADC  ZR, C4 \
	\
	MUL   A2, B1, S0 \
	UMULH A2, B1, S1 \
	\
	/* add A1*B2; C5 starts out as the carry */ \
	ADDS S2, C3 \
	ADCS S3, C4 \
	ADC  ZR, ZR, C5 \
	\
	MUL   A2, B2, S2 \
	UMULH A2, B2, S3 \
	\
	/* add A2*B1 */ \
	ADDS S0, C3 \
	ADCS S1, C4 \
	ADC  ZR, C5 \
	\
	/* add A2*B2 */ \
	ADDS S2, C4 \
	ADC  S3, C5


// mul384x384karatsuba: 6x6-word product built from three 3x3-word products.
// X and Y point to the two inputs (called a and b below).
// Both X and Y are overwritten; on exit the product is in Y,Z1-Z11,
// i.e. Y holds the lowest word (Z0).
// The macro spills twelve words to -16(RSP) .. -96(RSP), so at least 96 bytes
// must be available there.
// NOTE(review): those are negative offsets from RSP - confirm that this
// matches the frame layout the callers rely on.
#define mul384x384karatsuba(X,Y, Z1,Z2,Z3,Z4,Z5,Z6,Z7,Z8,Z9,Z10,Z11, T0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11,T12) \
	/* a goes to Z1-Z6, b goes to T12,Z7-Z11; the loads are interleaved */ \
	/* with the partial products that mul192x192comba expects as input  */ \
	LDP   0(X), (Z1, Z2) \
	LDP   0(Y), (T12, Z7) \
	MUL   Z1, Z7, T1 \
	UMULH Z1, T12, T3 \
	LDP   16(X), (Z3, Z4) \
	LDP   16(Y), (Z8, Z9) \
	MUL   Z1, T12, T0 \
	UMULH Z1, Z7, T2 \
	LDP   32(X), (Z5, Z6) \
	LDP   32(Y), (Z10, Z11) \
	\
	/* aL*bL goes to T0-T5 */ \
	mul192x192comba(Z1,Z2,Z3, T12,Z7,Z8, T0,T1,T2,T3,T4,T5, T6,T7,T8,T9) \
	\
	/* aH*bH goes to T6-T11; aL and bL are used as scratch and destroyed */ \
	MUL   Z4, Z10, T7 \
	MUL   Z4, Z9, T6 \
	UMULH Z4, Z9, T9 \
	UMULH Z4, Z10, T8 \
	mul192x192comba(Z4,Z5,Z6, Z9,Z10,Z11, T6,T7,T8,T9,T10,T11, Z1,Z2,T12,Z7) \
	\
	/* aL*bL + aH*bH goes to Z1-Z6 with the carry in T12; aH is destroyed */ \
	ADDS T0, T6, Z1 \
	ADCS T1, T7, Z2 \
	ADCS T2, T8, Z3 \
	ADCS T3, T9, Z4 \
	ADCS T4, T10, Z5 \
	ADCS T5, T11, Z6 \
	ADC  ZR, ZR, T12 \
	\
	/* add that sum into T3-T8 (carry collected in T12) and spill T0-T11 */ \
	STP  (T0, T1), -16(RSP) \
	ADDS Z1, T3 \
	STP  (T2, T3), -32(RSP) \
	ADCS Z2, T4 \
	ADCS Z3, T5 \
	STP  (T4, T5), -48(RSP) \
	ADCS Z4, T6 \
	ADCS Z5, T7 \
	STP  (T6, T7), -64(RSP) \
	ADCS Z6, T8 \
	ADC  ZR, T12 \
	STP  (T8, T9), -80(RSP) \
	STP  (T10, T11), -96(RSP) \
	\
	/* reload a into Z1-Z6 */ \
	LDP 0(X), (Z1, Z2) \
	LDP 16(X), (Z3, Z4) \
	LDP 32(X), (Z5, Z6) \
	\
	/* |aL-aH| goes to Z1-Z3. SBC leaves 0 or all-ones in X; after the */ \
	/* ADDS $1, X is 0 (and EQ holds) exactly when there was a borrow  */ \
	SUBS Z4, Z1 \
	SBCS Z5, Z2 \
	SBCS Z6, Z3 \
	SBC  ZR, ZR, X \
	NEGS Z1, Z4 \
	NGCS Z2, Z5 \
	NGC  Z3, Z6 \
	ADDS $1, X \
	\
	/* reload b into Z7-Z11,T0 (loads leave the flags untouched) */ \
	LDP 0(Y), (Z7, Z8) \
	LDP 16(Y), (Z9, Z10) \
	LDP 32(Y), (Z11, T0) \
	\
	/* on borrow take the negated difference */ \
	CSEL EQ, Z4, Z1, Z1 \
	CSEL EQ, Z5, Z2, Z2 \
	CSEL EQ, Z6, Z3, Z3 \
	\
	/* |bH-bL| goes to Z7-Z9, borrow indicator in Y (same encoding as X) */ \
	SUBS Z7, Z10 \
	SBCS Z8, Z11 \
	SBCS Z9, T0 \
	SBC  ZR, ZR, Y \
	NEGS Z10, Z7 \
	NGCS Z11, Z8 \
	NGC  T0, Z9 \
	ADDS $1, Y \
	CSEL EQ, Z7, Z10, Z7 \
	CSEL EQ, Z8, Z11, Z8 \
	CSEL EQ, Z9, T0, Z9 \
	\
	/* X becomes 1 when exactly one of the two differences borrowed */ \
	EOR Y, X \
	\
	/* |aL-aH|*|bH-bL| goes to Z10,Z11,T0-T3 */ \
	MUL   Z1, Z8, Z11 \
	MUL   Z1, Z7, Z10 \
	UMULH Z1, Z8, T0 \
	UMULH Z1, Z7, T1 \
	mul192x192comba(Z1,Z2,Z3, Z7,Z8,Z9, Z10,Z11,T0,T1,T2,T3, T4,T5,T6,T7) \
	\
	/* negated copy of the product in Y,Z1-Z5; T4 receives the final */ \
	/* borrow word. The negated copy is needed when exactly one of   */ \
	/* the operands was negative                                     */ \
	NEGS Z10, Y \
	NGCS Z11, Z1 \
	NGCS T0, Z2 \
	NGCS T1, Z3 \
	NGCS T2, Z4 \
	NGCS T3, Z5 \
	NGC  ZR, T4 \
	\
	/* select the negated copy only when X & T4 equals 1 */ \
	AND  T4, X \
	CMP  $1, X \
	CSEL EQ, Y, Z10, Z10 \
	CSEL EQ, Z1, Z11, Z11 \
	CSEL EQ, Z2, T0, T0 \
	CSEL EQ, Z3, T1, T1 \
	CSEL EQ, Z4, T2, T2 \
	CSEL EQ, Z5, T3, T3 \
	\
	/* reload the spilled words and add the middle product at word 3, */ \
	/* then add T12 and subtract X at word 9                           */ \
	LDP  -16(RSP), (Y, Z1) \
	LDP  -32(RSP), (Z2, Z3) \
	LDP  -48(RSP), (Z4, Z5) \
	ADDS Z10, Z3 \
	ADCS Z11, Z4 \
	LDP  -64(RSP), (Z6, Z7) \
	ADCS T0, Z5 \
	ADCS T1, Z6 \
	LDP  -80(RSP), (Z8, Z9) \
	ADCS T2, Z7 \
	ADCS T3, Z8 \
	LDP  -96(RSP), (Z10, Z11) \
	ADCS T12, Z9 \
	ADCS ZR, Z10 \
	ADC  ZR, Z11 \
	SUBS X, Z9 \
	SBCS ZR, Z10 \
	SBC  ZR, Z11

// fp384Mul: c = a*b*R^-1 mod p
//   R0 = c, R1 = a, R2 = b
// Steps: full product a*b, m = a*b*pp mod 2^384, full product m*p,
// sum a*b + m*p, then one conditional subtraction of p on the top half.
// ·p and ·pp are data symbols defined outside this file.
// Scratch beyond what the Karatsuba macro uses lives at -112(RSP) .. -192(RSP).
// NOTE(review): all scratch is addressed at negative offsets from RSP while
// the frame is declared as $200 - confirm this is the intended layout.
TEXT ·fp384Mul(SB), NOSPLIT, $200-24
	MOVD c+0(FP), R0
	MOVD a+8(FP), R1
	MOVD b+16(FP), R2

	// a*b goes to R2-R13 (R18 is skipped in the register lists)
	mul384x384karatsuba(R1, R2, R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,R13, R14,R15,R16,R17,R19,R20,R21,R22,R23,R24,R25,R26,R27)

	// Keep a*b on the stack for the final addition
	STP (R2, R3), -112(RSP)
	STP (R4, R5), -128(RSP)
	STP (R6, R7), -144(RSP)
	STP (R8, R9), -160(RSP)
	STP (R10, R11), -176(RSP)
	STP (R12, R13), -192(RSP)

	// m = a*b*pp mod 2^384 is accumulated in R19-R24 and parked in c
	// as soon as each pair of words is final.

	// Row for the word at ·pp+0
	MOVD  ·pp+0(SB), R14
	MUL   R14, R2, R19
	UMULH R14, R2, R20

	MUL   R14, R3, R16
	UMULH R14, R3, R21
	ADDS  R16, R20
	ADC   ZR, R21

	MUL   R14, R4, R16
	UMULH R14, R4, R22
	ADDS  R16, R21
	ADC   ZR, R22

	MUL   R14, R5, R16
	UMULH R14, R5, R23
	ADDS  R16, R22
	ADC   ZR, R23

	MUL   R14, R6, R16
	UMULH R14, R6, R24
	ADDS  R16, R23
	ADC   ZR, R24

	// top word: only the low half of the product is needed
	MADD R14, R24, R7, R24

	// Row for ·pp+8: that word is 1, so the product words are added as is
	ADDS R2, R20
	STP  (R19, R20), 0(R0)
	ADCS R3, R21
	ADCS R4, R22
	ADCS R5, R23
	ADC  R6, R24

	// Row for ·pp+16 (R14); R15 receives the word at ·pp+24
	LDP   ·pp+16(SB), (R14, R15)
	MUL   R14, R2, R8
	UMULH R14, R2, R9

	MUL   R14, R3, R16
	UMULH R14, R3, R10
	ADDS  R16, R9
	ADC   ZR, R10

	MUL   R14, R4, R16
	UMULH R14, R4, R11
	ADDS  R16, R10
	ADC   ZR, R11

	MUL R14, R5, R16
	ADD R16, R11

	ADDS R8, R21
	ADCS R9, R22
	ADCS R10, R23
	ADC  R11, R24

	// Row for ·pp+24 (R15)
	MUL   R15, R2, R8
	UMULH R15, R2, R9

	MUL   R15, R3, R16
	UMULH R15, R3, R10
	ADDS  R16, R9
	ADC   ZR, R10

	MADD R15, R10, R4, R10

	ADDS R8, R22
	STP  (R21, R22), 16(R0)
	ADCS R9, R23
	ADC  R10, R24

	// Rows for ·pp+32 (R14) and ·pp+40 (R15)
	LDP   ·pp+32(SB), (R14, R15)
	MUL   R14, R2, R8
	UMULH R14, R2, R9

	MADD R14, R9, R3, R9

	ADDS R8, R23
	ADC  R9, R24

	MADD R15, R24, R2, R24
	STP  (R23, R24), 32(R0)

	// m*p goes to R1-R12; m is read back from c (R0), p through R1.
	// The macro overwrites both R0 and R1.
	MOVD $·p(SB), R1
	mul384x384karatsuba(R0, R1, R2,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12, R13,R14,R15,R16,R17,R19,R20,R21,R22,R23,R24,R25,R26)

	// a*b + m*p goes to R1-R12 with the carry-out in R26
	LDP  -112(RSP), (R13, R14)
	ADDS R13, R1
	LDP  -128(RSP), (R15, R16)
	ADCS R14, R2
	ADCS R15, R3
	LDP  -144(RSP), (R17, R19)
	ADCS R16, R4
	ADCS R17, R5
	LDP  -160(RSP), (R20, R21)
	ADCS R19, R6
	ADCS R20, R7
	LDP  -176(RSP), (R22, R23)
	ADCS R21, R8
	ADCS R22, R9
	LDP  -192(RSP), (R24, R25)
	ADCS R23, R10
	ADCS R24, R11
	ADCS R25, R12
	ADC  ZR, ZR, R26

	// Reduce the top half (R7-R12,R26) mod p: top-p goes to R13-R19
	LDP  ·p+0(SB), (R13, R14)
	SUBS R13, R7, R13
	LDP  ·p+16(SB), (R15, R16)
	SBCS R14, R8, R14
	SBCS R15, R9, R15
	LDP  ·p+32(SB), (R17, R19)
	SBCS R16, R10, R16
	SBCS R17, R11, R17
	SBCS R19, R12, R19
	SBCS ZR, R26

	// R0 was overwritten by the macro, so c is reloaded.
	// On a final borrow keep the unreduced top half, otherwise top-p.
	MOVD c+0(FP), R0
	CSEL CC, R7, R13, R7
	CSEL CC, R8, R14, R8
	STP  (R7, R8), 0(R0)
	CSEL CC, R9, R15, R9
	CSEL CC, R10, R16, R10
	STP  (R9, R10), 16(R0)
	CSEL CC, R11, R17, R11
	CSEL CC, R12, R19, R12
	STP  (R11, R12), 32(R0)

	RET