// github.com/cloudflare/circl@v1.5.0/ecc/p384/arith_amd64.s
//
// Arithmetic in GF(p) for the NIST P-384 prime, with field elements stored as
// six 64-bit little-endian limbs. ·p holds the prime and ·pp the Montgomery
// constant -p^(-1) mod 2^384; fp384Mul computes a Montgomery product, so its
// result carries an extra factor of 2^(-384) mod p.

// +build amd64,!purego

#include "textflag.h"

#define storeBlock(a0,a1,a2,a3,a4,a5, r) \
	MOVQ a0,  0+r \
	MOVQ a1,  8+r \
	MOVQ a2, 16+r \
	MOVQ a3, 24+r \
	MOVQ a4, 32+r \
	MOVQ a5, 40+r

#define loadBlock(r, a0,a1,a2,a3,a4,a5) \
	MOVQ  0+r, a0 \
	MOVQ  8+r, a1 \
	MOVQ 16+r, a2 \
	MOVQ 24+r, a3 \
	MOVQ 32+r, a4 \
	MOVQ 40+r, a5

#define fp384Carry(a0,a1,a2,a3,a4,a5,a6, b0,b1,b2,b3,b4,b5,b6) \
	\ // b = a-p
	MOVQ a0, b0 \
	MOVQ a1, b1 \
	MOVQ a2, b2 \
	MOVQ a3, b3 \
	MOVQ a4, b4 \
	MOVQ a5, b5 \
	MOVQ a6, b6 \
	\
	SUBQ ·p+0(SB), b0 \
	SBBQ ·p+8(SB), b1 \
	SBBQ ·p+16(SB), b2 \
	SBBQ ·p+24(SB), b3 \
	SBBQ ·p+32(SB), b4 \
	SBBQ ·p+40(SB), b5 \
	SBBQ $0, b6 \
	\
	\ // if b is negative then return a
	\ // else return b
	CMOVQCC b0, a0 \
	CMOVQCC b1, a1 \
	CMOVQCC b2, a2 \
	CMOVQCC b3, a3 \
	CMOVQCC b4, a4 \
	CMOVQCC b5, a5

#define mul(a0,a1,a2,a3,a4,a5, rb, stack) \
	\ // a0
	MOVQ a0, AX \
	MULQ 0+rb \
	MOVQ AX, R8 \
	MOVQ DX, R9 \
	MOVQ a0, AX \
	MULQ 8+rb \
	ADDQ AX, R9 \
	ADCQ $0, DX \
	MOVQ DX, R10 \
	MOVQ a0, AX \
	MULQ 16+rb \
	ADDQ AX, R10 \
	ADCQ $0, DX \
	MOVQ DX, R11 \
	MOVQ a0, AX \
	MULQ 24+rb \
	ADDQ AX, R11 \
	ADCQ $0, DX \
	MOVQ DX, R12 \
	MOVQ a0, AX \
	MULQ 32+rb \
	ADDQ AX, R12 \
	ADCQ $0, DX \
	MOVQ DX, R13 \
	MOVQ a0, AX \
	MULQ 40+rb \
	ADDQ AX, R13 \
	ADCQ $0, DX \
	MOVQ DX, R14 \
	\
	storeBlock(R8,R9,R10,R11,R12,R13, 0+stack) \
	MOVQ R14, 48+stack \
	\
	\ // a1
	MOVQ a1, AX \
	MULQ 0+rb \
	MOVQ AX, R8 \
	MOVQ DX, R9 \
	MOVQ a1, AX \
	MULQ 8+rb \
	ADDQ AX, R9 \
	ADCQ $0, DX \
	MOVQ DX, R10 \
	MOVQ a1, AX \
	MULQ 16+rb \
	ADDQ AX, R10 \
	ADCQ $0, DX \
	MOVQ DX, R11 \
	MOVQ a1, AX \
	MULQ 24+rb \
	ADDQ AX, R11 \
	ADCQ $0, DX \
	MOVQ DX, R12 \
	MOVQ a1, AX \
	MULQ 32+rb \
	ADDQ AX, R12 \
	ADCQ $0, DX \
	MOVQ DX, R13 \
	MOVQ a1, AX \
	MULQ 40+rb \
	ADDQ AX, R13 \
	ADCQ $0, DX \
	MOVQ DX, R14 \
	\
	ADDQ 8+stack, R8 \
	ADCQ 16+stack, R9 \
	ADCQ 24+stack, R10 \
	ADCQ 32+stack, R11 \
	ADCQ 40+stack, R12 \
	ADCQ 48+stack, R13 \
	ADCQ $0, R14 \
	storeBlock(R8,R9,R10,R11,R12,R13, 8+stack) \
	MOVQ R14, 56+stack \
	\
	\ // a2
	MOVQ a2, AX \
	MULQ 0+rb \
	MOVQ AX, R8 \
	MOVQ DX, R9 \
	MOVQ a2, AX \
	MULQ 8+rb \
	ADDQ AX, R9 \
	ADCQ $0, DX \
	MOVQ DX, R10 \
	MOVQ a2, AX \
	MULQ 16+rb \
	ADDQ AX, R10 \
	ADCQ $0, DX \
	MOVQ DX, R11 \
	MOVQ a2, AX \
	MULQ 24+rb \
	ADDQ AX, R11 \
	ADCQ $0, DX \
	MOVQ DX, R12 \
	MOVQ a2, AX \
	MULQ 32+rb \
	ADDQ AX, R12 \
	ADCQ $0, DX \
	MOVQ DX, R13 \
	MOVQ a2, AX \
	MULQ 40+rb \
	ADDQ AX, R13 \
	ADCQ $0, DX \
	MOVQ DX, R14 \
	\
	ADDQ 16+stack, R8 \
	ADCQ 24+stack, R9 \
	ADCQ 32+stack, R10 \
	ADCQ 40+stack, R11 \
	ADCQ 48+stack, R12 \
	ADCQ 56+stack, R13 \
	ADCQ $0, R14 \
	storeBlock(R8,R9,R10,R11,R12,R13, 16+stack) \
	MOVQ R14, 64+stack \
	\
	\ // a3
	MOVQ a3, AX \
	MULQ 0+rb \
	MOVQ AX, R8 \
	MOVQ DX, R9 \
	MOVQ a3, AX \
	MULQ 8+rb \
	ADDQ AX, R9 \
	ADCQ $0, DX \
	MOVQ DX, R10 \
	MOVQ a3, AX \
	MULQ 16+rb \
	ADDQ AX, R10 \
	ADCQ $0, DX \
	MOVQ DX, R11 \
	MOVQ a3, AX \
	MULQ 24+rb \
	ADDQ AX, R11 \
	ADCQ $0, DX \
	MOVQ DX, R12 \
	MOVQ a3, AX \
	MULQ 32+rb \
	ADDQ AX, R12 \
	ADCQ $0, DX \
	MOVQ DX, R13 \
	MOVQ a3, AX \
	MULQ 40+rb \
	ADDQ AX, R13 \
	ADCQ $0, DX \
	MOVQ DX, R14 \
	\
	ADDQ 24+stack, R8 \
	ADCQ 32+stack, R9 \
	ADCQ 40+stack, R10 \
	ADCQ 48+stack, R11 \
	ADCQ 56+stack, R12 \
	ADCQ 64+stack, R13 \
	ADCQ $0, R14 \
	storeBlock(R8,R9,R10,R11,R12,R13, 24+stack) \
	MOVQ R14, 72+stack \
	\
	\ // a4
	MOVQ a4, AX \
	MULQ 0+rb \
	MOVQ AX, R8 \
	MOVQ DX, R9 \
	MOVQ a4, AX \
	MULQ 8+rb \
	ADDQ AX, R9 \
	ADCQ $0, DX \
	MOVQ DX, R10 \
	MOVQ a4, AX \
	MULQ 16+rb \
	ADDQ AX, R10 \
	ADCQ $0, DX \
	MOVQ DX, R11 \
	MOVQ a4, AX \
	MULQ 24+rb \
	ADDQ AX, R11 \
	ADCQ $0, DX \
	MOVQ DX, R12 \
	MOVQ a4, AX \
	MULQ 32+rb \
	ADDQ AX, R12 \
	ADCQ $0, DX \
	MOVQ DX, R13 \
	MOVQ a4, AX \
	MULQ 40+rb \
	ADDQ AX, R13 \
	ADCQ $0, DX \
	MOVQ DX, R14 \
	\
	ADDQ 32+stack, R8 \
	ADCQ 40+stack, R9 \
	ADCQ 48+stack, R10 \
	ADCQ 56+stack, R11 \
	ADCQ 64+stack, R12 \
	ADCQ 72+stack, R13 \
	ADCQ $0, R14 \
	storeBlock(R8,R9,R10,R11,R12,R13, 32+stack) \
	MOVQ R14, 80+stack \
	\
	\ // a5
	MOVQ a5, AX \
	MULQ 0+rb \
	MOVQ AX, R8 \
	MOVQ DX, R9 \
	MOVQ a5, AX \
	MULQ 8+rb \
	ADDQ AX, R9 \
	ADCQ $0, DX \
	MOVQ DX, R10 \
	MOVQ a5, AX \
	MULQ 16+rb \
	ADDQ AX, R10 \
	ADCQ $0, DX \
	MOVQ DX, R11 \
	MOVQ a5, AX \
	MULQ 24+rb \
	ADDQ AX, R11 \
	ADCQ $0, DX \
	MOVQ DX, R12 \
	MOVQ a5, AX \
	MULQ 32+rb \
	ADDQ AX, R12 \
	ADCQ $0, DX \
	MOVQ DX, R13 \
	MOVQ a5, AX \
	MULQ 40+rb \
	ADDQ AX, R13 \
	ADCQ $0, DX \
	MOVQ DX, R14 \
	\
	ADDQ 40+stack, R8 \
	ADCQ 48+stack, R9 \
	ADCQ 56+stack, R10 \
	ADCQ 64+stack, R11 \
	ADCQ 72+stack, R12 \
	ADCQ 80+stack, R13 \
	ADCQ $0, R14 \
	storeBlock(R8,R9,R10,R11,R12,R13, 40+stack) \
	MOVQ R14, 88+stack

#define fp384Reduce(stack) \
	\ // m = (T * P') mod R, store m in R8:R9:R10:R11:R12:R13
	MOVQ ·pp+0(SB), AX \
	MULQ 0+stack \
	MOVQ AX, R8; MOVQ R8, 96+stack \
	MOVQ DX, R9 \
	MOVQ ·pp+0(SB), AX \
	MULQ 8+stack \
	ADDQ AX, R9 \
	ADCQ $0, DX \
	MOVQ DX, R10 \
	MOVQ ·pp+0(SB), AX \
	MULQ 16+stack \
	ADDQ AX, R10 \
	ADCQ $0, DX \
	MOVQ DX, R11 \
	MOVQ ·pp+0(SB), AX \
	MULQ 24+stack \
	ADDQ AX, R11 \
	ADCQ $0, DX \
	MOVQ DX, R12 \
	MOVQ ·pp+0(SB), AX \
	MULQ 32+stack \
	ADDQ AX, R12 \
	ADCQ $0, DX \
	MOVQ DX, R13 \
	MOVQ ·pp+0(SB), AX \
	MULQ 40+stack \
	ADDQ AX, R13 \
	\
	ADDQ 0+stack, R9 \
	ADCQ 8+stack, R10 \
	ADCQ 16+stack, R11 \
	ADCQ 24+stack, R12 \
	ADCQ 32+stack, R13 \
	\
	MOVQ ·pp+16(SB), AX \
	MULQ 0+stack \
	MOVQ AX, R14 \
	MOVQ DX, R8 \
	MOVQ ·pp+16(SB), AX \
	MULQ 8+stack \
	ADDQ AX, R8 \
	ADCQ $0, DX \
	MOVQ DX, BX \
	MOVQ ·pp+16(SB), AX \
	MULQ 16+stack \
	ADDQ AX, BX \
	ADCQ $0, DX \
	MOVQ DX, CX \
	MOVQ ·pp+16(SB), AX \
	MULQ 24+stack \
	ADDQ AX, CX \
	\
	ADDQ R14, R10 \
	ADCQ R8, R11 \
	ADCQ BX, R12 \
	ADCQ CX, R13 \
	\
	MOVQ ·pp+24(SB), AX \
	MULQ 0+stack \
	MOVQ AX, R14 \
	MOVQ DX, R8 \
	MOVQ ·pp+24(SB), AX \
	MULQ 8+stack \
	ADDQ AX, R8 \
	ADCQ $0, DX \
	MOVQ DX, BX \
	MOVQ ·pp+24(SB), AX \
	MULQ 16+stack \
	ADDQ AX, BX \
	\
	ADDQ R14, R11 \
	ADCQ R8, R12 \
	ADCQ BX, R13 \
	\
	MOVQ ·pp+32(SB), AX \
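	\ // Only products below 2^384 matter here (m is reduced mod R), so pp[4]
	\ // contributes via T[0] and T[1] only, and pp[5] via T[0].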
	MULQ 0+stack \
	MOVQ AX, R14 \
	MOVQ DX, R8 \
	MOVQ ·pp+32(SB), AX \
	MULQ 8+stack \
	ADDQ AX, R8 \
	\
	ADDQ R14, R12 \
	ADCQ R8, R13 \
	\
	MOVQ ·pp+40(SB), AX \
	MULQ 0+stack \
	ADDQ AX, R13 \
	\
	MOVQ 96+stack, R8 \
	\
	storeBlock(R8,R9,R10,R11,R12,R13, 96+stack) \
	\
	\ // m * P
	mul(·p+0(SB),·p+8(SB),·p+16(SB),·p+24(SB),·p+32(SB),·p+40(SB), 96+stack, 144+stack) \
	\
	\ // Add the 768-bit intermediate to m*N
	MOVQ $0, R15 \
	loadBlock(144+stack, R8,R9,R10,R11,R12,R13) \
	loadBlock(192+stack, R14,SI,AX,BX,CX,DX) \
	\
	ADDQ 0+stack, R8 \
	ADCQ 8+stack, R9 \
	ADCQ 16+stack, R10 \
	ADCQ 24+stack, R11 \
	ADCQ 32+stack, R12 \
	ADCQ 40+stack, R13 \
	ADCQ 48+stack, R14 \
	ADCQ 56+stack, SI \
	ADCQ 64+stack, AX \
	ADCQ 72+stack, BX \
	ADCQ 80+stack, CX \
	ADCQ 88+stack, DX \
	ADCQ $0, R15 \
	\
	fp384Carry(R14,SI,AX,BX,CX,DX,R15, R8,R9,R10,R11,R12,R13,DI)

#define mulBMI2(a0,a1,a2,a3,a4,a5, rb, stack) \
	MOVQ a0, DX \
	MULXQ 0+rb, R8, R9; MOVQ R8, 0+stack; MOVQ $0, R8 \
	MULXQ 8+rb, AX, R10 \
	ADDQ AX, R9 \
	MULXQ 16+rb, AX, R11 \
	ADCQ AX, R10 \
	MULXQ 24+rb, AX, R12 \
	ADCQ AX, R11 \
	MULXQ 32+rb, AX, R13 \
	ADCQ AX, R12 \
	MULXQ 40+rb, AX, R14 \
	ADCQ AX, R13 \
	ADCQ $0, R14 \
	\
	MOVQ a1, DX \
	MULXQ 0+rb, AX, BX \
	ADDQ AX, R9; MOVQ R9, 8+stack; MOVL $0, R9 \
	ADCQ BX, R10 \
	MULXQ 16+rb, AX, BX \
	ADCQ AX, R11 \
	ADCQ BX, R12 \
	MULXQ 32+rb, AX, BX \
	ADCQ AX, R13 \
	ADCQ BX, R14 \
	ADCQ $0, R8 \
	MULXQ 8+rb, AX, BX \
	ADDQ AX, R10 \
	ADCQ BX, R11 \
	MULXQ 24+rb, AX, BX \
	ADCQ AX, R12 \
	ADCQ BX, R13 \
	MULXQ 40+rb, AX, BX \
	ADCQ AX, R14 \
	ADCQ BX, R8 \
	ADCQ $0, R9 \
	\
	MOVQ a2, DX \
	MULXQ 0+rb, AX, BX \
	ADDQ AX, R10; MOVQ R10, 16+stack; MOVL $0, R10 \
	ADCQ BX, R11 \
	MULXQ 16+rb, AX, BX \
	ADCQ AX, R12 \
	ADCQ BX, R13 \
	MULXQ 32+rb, AX, BX \
	ADCQ AX, R14 \
	ADCQ BX, R8 \
	ADCQ $0, R9 \
	MULXQ 8+rb, AX, BX \
	ADDQ AX, R11 \
	ADCQ BX, R12 \
	MULXQ 24+rb, AX, BX \
	ADCQ AX, R13 \
	ADCQ BX, R14 \
	MULXQ 40+rb, AX, BX \
	ADCQ AX, R8 \
	ADCQ BX, R9 \
	ADCQ $0, R10 \
	\
	MOVQ a3, DX \
	MULXQ 0+rb, AX, BX \
	ADDQ AX, R11; MOVQ R11, 24+stack; MOVL $0, R11 \
	ADCQ BX, R12 \
	MULXQ 16+rb, AX, BX \
	ADCQ AX, R13 \
	ADCQ BX, R14 \
	MULXQ 32+rb, AX, BX \
	ADCQ AX, R8 \
	ADCQ BX, R9 \
	ADCQ $0, R10 \
	MULXQ 8+rb, AX, BX \
	ADDQ AX, R12 \
	ADCQ BX, R13 \
	MULXQ 24+rb, AX, BX \
	ADCQ AX, R14 \
	ADCQ BX, R8 \
	MULXQ 40+rb, AX, BX \
	ADCQ AX, R9 \
	ADCQ BX, R10 \
	ADCQ $0, R11 \
	\
	MOVQ a4, DX \
	MULXQ 0+rb, AX, BX \
	ADDQ AX, R12; MOVQ R12, 32+stack; MOVL $0, R12 \
	ADCQ BX, R13 \
	MULXQ 16+rb, AX, BX \
	ADCQ AX, R14 \
	ADCQ BX, R8 \
	MULXQ 32+rb, AX, BX \
	ADCQ AX, R9 \
	ADCQ BX, R10 \
	ADCQ $0, R11 \
	MULXQ 8+rb, AX, BX \
	ADDQ AX, R13 \
	ADCQ BX, R14 \
	MULXQ 24+rb, AX, BX \
	ADCQ AX, R8 \
	ADCQ BX, R9 \
	MULXQ 40+rb, AX, BX \
	ADCQ AX, R10 \
	ADCQ BX, R11 \
	ADCQ $0, R12 \
	\
	MOVQ a5, DX \
	MULXQ 0+rb, AX, BX \
	ADDQ AX, R13; MOVQ R13, 40+stack \
	ADCQ BX, R14 \
	MULXQ 16+rb, AX, BX \
	ADCQ AX, R8 \
	ADCQ BX, R9 \
	MULXQ 32+rb, AX, BX \
	ADCQ AX, R10 \
	ADCQ BX, R11 \
	ADCQ $0, R12 \
	MULXQ 8+rb, AX, BX \
	ADDQ AX, R14 \
	ADCQ BX, R8 \
	MULXQ 24+rb, AX, BX \
	ADCQ AX, R9 \
	ADCQ BX, R10 \
	MULXQ 40+rb, AX, BX \
	ADCQ AX, R11 \
	ADCQ BX, R12

#define fp384ReduceBMI2(stack) \
	\ // m = (T * P') mod R, store m in R8:R9:R10:R11:R12:R13
	MOVQ ·pp+0(SB), DX \
	MULXQ 0+stack, R8, R9 \
	MULXQ 8+stack, AX, R10 \
	ADDQ AX, R9 \
	MULXQ 16+stack, AX, R11 \
	ADCQ AX, R10 \
	MULXQ 24+stack, AX, R12 \
	ADCQ AX, R11 \
	MULXQ 32+stack, AX, R13 \
	ADCQ AX, R12 \
	MULXQ 40+stack, AX, BX \
	ADCQ AX, R13 \
	\
	ADDQ 0+stack, R9 \
	ADCQ 8+stack, R10 \
	ADCQ 16+stack, R11 \
	ADCQ 24+stack, R12 \
	ADCQ 32+stack, R13 \
	\
	MOVQ ·pp+16(SB), DX \
	MULXQ 0+stack, AX, BX \
	ADDQ AX, R10 \
	ADCQ BX, R11 \
	MULXQ 16+stack, AX, BX \
	ADCQ AX, R12 \
	ADCQ BX, R13 \
	MULXQ 8+stack, AX, BX \
	ADDQ AX, R11 \
	ADCQ BX, R12 \
	MULXQ 24+stack, AX, BX \
	ADCQ AX, R13 \
	\
	MOVQ ·pp+24(SB), DX \
	MULXQ 0+stack, AX, BX \
	ADDQ AX, R11 \
	ADCQ BX, R12 \
	MULXQ 16+stack, AX, BX \
	ADCQ AX, R13 \
	MULXQ 8+stack, AX, BX \
	ADDQ AX, R12 \
	ADCQ BX, R13 \
	\
	MOVQ ·pp+32(SB), DX \
	MULXQ 0+stack, AX, BX \
	ADDQ AX, R12 \
	ADCQ BX, R13 \
	MULXQ 8+stack, AX, BX \
	ADDQ AX, R13 \
	\
	MOVQ ·pp+40(SB), DX \
	MULXQ 0+stack, AX, BX \
	ADDQ AX, R13 \
	\
	storeBlock(R8,R9,R10,R11,R12,R13, 96+stack) \
	\
	\ // m * P
	mulBMI2(·p+0(SB),·p+8(SB),·p+16(SB),·p+24(SB),·p+32(SB),·p+40(SB), 96+stack, 144+stack) \
	\
	\ // Add the 768-bit intermediate to m*N
	loadBlock(144+stack, AX,R13,BX,CX,DX,DI) \
	\
	ADDQ 0+stack, AX \
	ADCQ 8+stack, R13 \
	ADCQ 16+stack, BX \
	ADCQ 24+stack, CX \
	ADCQ 32+stack, DX \
	ADCQ 40+stack, DI \
	ADCQ 48+stack, R14 \
	ADCQ 56+stack, R8 \
	ADCQ 64+stack, R9 \
	ADCQ 72+stack, R10 \
	ADCQ 80+stack, R11 \
	ADCQ 88+stack, R12 \
	MOVQ $0, 0+stack \
	ADCQ $0, 0+stack \
	\
	fp384Carry(R14,R8,R9,R10,R11,R12, 0+stack, AX,R13,BX,CX,DX,DI,SI)

TEXT ·fp384Neg(SB), NOSPLIT, $0-16
	MOVQ ·p+0(SB), R8
	MOVQ ·p+8(SB), R9
	MOVQ ·p+16(SB), R10
	MOVQ ·p+24(SB), R11
	MOVQ ·p+32(SB), R12
	MOVQ ·p+40(SB), R13

	MOVQ a+8(FP), DI
	SUBQ 0(DI), R8
	SBBQ 8(DI), R9
	SBBQ 16(DI), R10
	SBBQ 24(DI), R11
	SBBQ 32(DI), R12
	SBBQ 40(DI), R13

	MOVQ $0, R15
	fp384Carry(R8,R9,R10,R11,R12,R13,R15, R14,AX,BX,CX,DX,DI,SI)

	MOVQ c+0(FP), DI
	storeBlock(R8,R9,R10,R11,R12,R13, 0(DI))
	RET

TEXT ·fp384Add(SB), NOSPLIT, $0-24
	MOVQ a+8(FP), DI
	MOVQ b+16(FP), SI

	loadBlock(0(DI), R8,R9,R10,R11,R12,R13)
	MOVQ $0, R15

	ADDQ 0(SI), R8
	ADCQ 8(SI), R9
	ADCQ 16(SI), R10
	ADCQ 24(SI), R11
	ADCQ 32(SI), R12
	ADCQ 40(SI), R13
	ADCQ $0, R15

	fp384Carry(R8,R9,R10,R11,R12,R13,R15, R14,AX,BX,CX,DX,DI,SI)

	MOVQ c+0(FP), DI
	storeBlock(R8,R9,R10,R11,R12,R13, 0(DI))
	RET

TEXT ·fp384Sub(SB), NOSPLIT, $0-24
	MOVQ ·p+0(SB), R8
	MOVQ ·p+8(SB), R9
	MOVQ ·p+16(SB), R10
	MOVQ ·p+24(SB), R11
	MOVQ ·p+32(SB), R12
	MOVQ ·p+40(SB), R13

	MOVQ b+16(FP), DI
	SUBQ 0(DI), R8
	SBBQ 8(DI), R9
	SBBQ 16(DI), R10
	SBBQ 24(DI), R11
	SBBQ 32(DI), R12
	SBBQ 40(DI), R13

	MOVQ $0, R15
	MOVQ a+8(FP), DI
	ADDQ 0(DI), R8
	ADCQ 8(DI), R9
	ADCQ 16(DI), R10
	ADCQ 24(DI), R11
	ADCQ 32(DI), R12
	ADCQ 40(DI), R13
	ADCQ $0, R15

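	// R8..R13 with carry R15 now hold a + (p - b); conditionally subtract p to
	// bring the result back below p.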
	fp384Carry(R8,R9,R10,R11,R12,R13,R15, R14,AX,BX,CX,DX,DI,SI)

	MOVQ c+0(FP), DI
	storeBlock(R8,R9,R10,R11,R12,R13, 0(DI))
	RET

TEXT ·fp384Mul(SB), NOSPLIT, $240-24
	MOVQ a+8(FP), DI
	MOVQ b+16(FP), SI

	// Jump to a slightly different implementation if MULX isn't supported.
	CMPB ·hasBMI2(SB), $0
	JE   nobmi2Mul

	// T = a * b
	mulBMI2(0(DI),8(DI),16(DI),24(DI),32(DI),40(DI), 0(SI), 0(SP))
	storeBlock(R14,R8,R9,R10,R11,R12, 48(SP))

	// Reduce T.
	fp384ReduceBMI2(0(SP))

	MOVQ c+0(FP), DI
	storeBlock(R14,R8,R9,R10,R11,R12, 0(DI))
	JMP end

nobmi2Mul:
	// T = a * b
	mul(0(DI),8(DI),16(DI),24(DI),32(DI),40(DI), 0(SI), 0(SP))

	// Reduce T.
	fp384Reduce(0(SP))

	MOVQ c+0(FP), DI
	storeBlock(R14,SI,AX,BX,CX,DX, 0(DI))

end:
	RET

TEXT ·fp384Cmov(SB), NOSPLIT, $0
	MOVQ x+0(FP), DI
	MOVQ y+8(FP), SI
	MOVQ b+16(FP), BX
	TESTQ BX, BX
	MOVQ 0(DI), AX; MOVQ 0(SI), DX; CMOVQNE DX, AX; MOVQ AX, 0(DI);
	MOVQ 8(DI), AX; MOVQ 8(SI), DX; CMOVQNE DX, AX; MOVQ AX, 8(DI);
	MOVQ 16(DI), AX; MOVQ 16(SI), DX; CMOVQNE DX, AX; MOVQ AX, 16(DI);
	MOVQ 24(DI), AX; MOVQ 24(SI), DX; CMOVQNE DX, AX; MOVQ AX, 24(DI);
	MOVQ 32(DI), AX; MOVQ 32(SI), DX; CMOVQNE DX, AX; MOVQ AX, 32(DI);
	MOVQ 40(DI), AX; MOVQ 40(SI), DX; CMOVQNE DX, AX; MOVQ AX, 40(DI);
	RET
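
// For orientation only: a sketch (not part of the package source, and the exact
// Go-side declarations may differ) of the stubs this file implements, assuming
// the fp384 element type is six little-endian uint64 limbs:
//
//	type fp384 [6]uint64
//
//	func fp384Cmov(x, y *fp384, b int) // x = y if b != 0 (constant time)
//	func fp384Neg(c, a *fp384)         // c = -a mod p
//	func fp384Add(c, a, b *fp384)      // c = a + b mod p
//	func fp384Sub(c, a, b *fp384)      // c = a - b mod p
//	func fp384Mul(c, a, b *fp384)      // c = a * b * 2^(-384) mod p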