// Source: github.com/consensys/gnark-crypto@v0.14.0/ecc/bn254/internal/fptower/e2_amd64.s

// Copyright 2020 ConsenSys Software Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "textflag.h"
#include "funcdata.h"

// modulus q, stored as four 64-bit limbs, least-significant limb at offset 0
DATA q<>+0(SB)/8, $0x3c208c16d87cfd47
DATA q<>+8(SB)/8, $0x97816a916871ca8d
DATA q<>+16(SB)/8, $0xb85045b68181585d
DATA q<>+24(SB)/8, $0x30644e72e131a029
GLOBL q<>(SB), (RODATA+NOPTR), $32

// qInv0 q'[0]
DATA qInv0<>(SB)/8, $0x87d20782e4866389
GLOBL qInv0<>(SB), (RODATA+NOPTR), $8

// The four limbs of q again, for the places that need them as immediates.
// They must stay equal to the q<> table above.
#define Q_LIMB0 0x3c208c16d87cfd47
#define Q_LIMB1 0x97816a916871ca8d
#define Q_LIMB2 0xb85045b68181585d
#define Q_LIMB3 0x30644e72e131a029

// REDUCE(a0..a3, t0..t3): (a0,a1,a2,a3) -= q, unless that subtraction borrows.
// The value is first copied to (t0,t1,t2,t3); when the final SBBQ leaves the
// carry flag set the copy is moved back, so the input is returned unchanged.
// Clobbers t0..t3 and the flags.
#define REDUCE(a0, a1, a2, a3, t0, t1, t2, t3) \
	MOVQ a0, t0; \
	SUBQ q<>(SB), a0; \
	MOVQ a1, t1; \
	SBBQ q<>+8(SB), a1; \
	MOVQ a2, t2; \
	SBBQ q<>+16(SB), a2; \
	MOVQ a3, t3; \
	SBBQ q<>+24(SB), a3; \
	CMOVQCS t0, a0; \
	CMOVQCS t1, a1; \
	CMOVQCS t2, a2; \
	CMOVQCS t3, a3

// LOAD_A0 / LOAD_A1 read the four limbs found at byte offsets 0..24 (A0) or
// 32..56 (A1) from ptr into (r0,r1,r2,r3). Flags are untouched.
#define LOAD_A0(ptr, r0, r1, r2, r3) \
	MOVQ 0(ptr), r0; \
	MOVQ 8(ptr), r1; \
	MOVQ 16(ptr), r2; \
	MOVQ 24(ptr), r3

#define LOAD_A1(ptr, r0, r1, r2, r3) \
	MOVQ 32(ptr), r0; \
	MOVQ 40(ptr), r1; \
	MOVQ 48(ptr), r2; \
	MOVQ 56(ptr), r3

// STORE_A0 / STORE_A1 write (r0,r1,r2,r3) to byte offsets 0..24 (A0) or
// 32..56 (A1) from ptr. Flags are untouched.
#define STORE_A0(r0, r1, r2, r3, ptr) \
	MOVQ r0, 0(ptr); \
	MOVQ r1, 8(ptr); \
	MOVQ r2, 16(ptr); \
	MOVQ r3, 24(ptr)

#define STORE_A1(r0, r1, r2, r3, ptr) \
	MOVQ r0, 32(ptr); \
	MOVQ r1, 40(ptr); \
	MOVQ r2, 48(ptr); \
	MOVQ r3, 56(ptr)

// ADD_A0 / ADD_A1: (r0,r1,r2,r3) += the four limbs at ptr (A0 or A1 half),
// as one carry chain. No reduction is performed here.
#define ADD_A0(ptr, r0, r1, r2, r3) \
	ADDQ 0(ptr), r0; \
	ADCQ 8(ptr), r1; \
	ADCQ 16(ptr), r2; \
	ADCQ 24(ptr), r3

#define ADD_A1(ptr, r0, r1, r2, r3) \
	ADDQ 32(ptr), r0; \
	ADCQ 40(ptr), r1; \
	ADCQ 48(ptr), r2; \
	ADCQ 56(ptr), r3

// SUB_A0 / SUB_A1: (r0,r1,r2,r3) -= the four limbs at ptr (A0 or A1 half),
// as one borrow chain. The carry flag holds the final borrow on exit.
#define SUB_A0(ptr, r0, r1, r2, r3) \
	SUBQ 0(ptr), r0; \
	SBBQ 8(ptr), r1; \
	SBBQ 16(ptr), r2; \
	SBBQ 24(ptr), r3

#define SUB_A1(ptr, r0, r1, r2, r3) \
	SUBQ 32(ptr), r0; \
	SBBQ 40(ptr), r1; \
	SBBQ 48(ptr), r2; \
	SBBQ 56(ptr), r3

// DOUBLE: (r0,r1,r2,r3) += (r0,r1,r2,r3), as one carry chain, no reduction.
#define DOUBLE(r0, r1, r2, r3) \
	ADDQ r0, r0; \
	ADCQ r1, r1; \
	ADCQ r2, r2; \
	ADCQ r3, r3

// ADD_Q_IF_BORROW(zero, a0..a3, t0..t3) must directly follow a SUBQ/SBBQ
// chain on (a0..a3); only flag-preserving MOVQs may sit in between.
// (t0..t3) is loaded with q, then replaced by `zero` when the carry flag is
// clear, and finally added to (a0..a3). The register passed as `zero` must
// hold 0. Net effect: q is added back exactly when the subtraction borrowed.
// Clobbers t0..t3 and the flags.
#define ADD_Q_IF_BORROW(zero, a0, a1, a2, a3, t0, t1, t2, t3) \
	MOVQ $Q_LIMB0, t0; \
	MOVQ $Q_LIMB1, t1; \
	MOVQ $Q_LIMB2, t2; \
	MOVQ $Q_LIMB3, t3; \
	CMOVQCC zero, t0; \
	CMOVQCC zero, t1; \
	CMOVQCC zero, t2; \
	CMOVQCC zero, t3; \
	ADDQ t0, a0; \
	ADCQ t1, a1; \
	ADCQ t2, a2; \
	ADCQ t3, a3

// MUL_FIRST_LIMB: (R10,R11,R12,R13,BP) = SI * (R14,R15,CX,BX).
// DX carries the multiplier for MULXQ; AX is scratch. The XORQ clears the
// carry and overflow flags before the ADOXQ chain starts.
#define MUL_FIRST_LIMB() \
	XORQ AX, AX; \
	MOVQ SI, DX; \
	MULXQ R14, R10, R11; \
	MULXQ R15, AX, R12; \
	ADOXQ AX, R11; \
	MULXQ CX, AX, R13; \
	ADOXQ AX, R12; \
	MULXQ BX, AX, BP; \
	ADOXQ AX, R13; \
	MOVQ $0, AX; \
	ADOXQ AX, BP

// MUL_NEXT_LIMB(yi): (R10,R11,R12,R13,BP) = (R10,R11,R12,R13) + yi * (R14,R15,CX,BX).
// Low product halves are accumulated with ADOXQ and high halves with ADCXQ,
// so two independent carry chains run side by side. BP receives the top word.
#define MUL_NEXT_LIMB(yi) \
	XORQ AX, AX; \
	MOVQ yi, DX; \
	MULXQ R14, AX, BP; \
	ADOXQ AX, R10; \
	ADCXQ BP, R11; \
	MULXQ R15, AX, BP; \
	ADOXQ AX, R11; \
	ADCXQ BP, R12; \
	MULXQ CX, AX, BP; \
	ADOXQ AX, R12; \
	ADCXQ BP, R13; \
	MULXQ BX, AX, BP; \
	ADOXQ AX, R13; \
	MOVQ $0, AX; \
	ADCXQ AX, BP; \
	ADOXQ AX, BP

// MONT_STEP: one word of Montgomery reduction on (R10,R11,R12,R13,BP).
// m = R10 * qInv0 (low 64 bits) is formed in DX, m*q is added to the
// accumulator, and the result is moved down one limb into (R10,R11,R12,R13).
// The sum into the lowest limb lands in AX and is discarded. BP (the top
// word on entry) is parked on the stack while BP is borrowed for the high
// half of m*q[0], then popped and folded into R13 at the end.
#define MONT_STEP() \
	PUSHQ BP; \
	MOVQ qInv0<>(SB), DX; \
	IMULQ R10, DX; \
	XORQ AX, AX; \
	MULXQ q<>+0(SB), AX, BP; \
	ADCXQ R10, AX; \
	MOVQ BP, R10; \
	POPQ BP; \
	ADCXQ R11, R10; \
	MULXQ q<>+8(SB), AX, R11; \
	ADOXQ AX, R10; \
	ADCXQ R12, R11; \
	MULXQ q<>+16(SB), AX, R12; \
	ADOXQ AX, R11; \
	ADCXQ R13, R12; \
	MULXQ q<>+24(SB), AX, R13; \
	ADOXQ AX, R12; \
	MOVQ $0, AX; \
	ADCXQ AX, R13; \
	ADOXQ BP, R13

// this code is generated and identical to fp.Mul(...)
//
// MUL: multiplies (R14,R15,CX,BX) with (SI,DI,R8,R9) into (R10,R11,R12,R13),
// interleaving one multiplication row with one MONT_STEP per limb of the
// second operand. Both operands are left intact. Clobbers AX, DX, BP and the
// flags, and temporarily pushes one word on the stack. The result still needs
// a REDUCE (every call site in this file applies one).
#define MUL() \
	MUL_FIRST_LIMB(); \
	MONT_STEP(); \
	MUL_NEXT_LIMB(DI); \
	MONT_STEP(); \
	MUL_NEXT_LIMB(R8); \
	MONT_STEP(); \
	MUL_NEXT_LIMB(R9); \
	MONT_STEP()

// addE2(res, x, y): res.A0 = x.A0 + y.A0, res.A1 = x.A1 + y.A1, each sum
// followed by a conditional subtraction of q.
// AX = x, DX = y, CX = res; (BX,SI,DI,R8) is the working element.
TEXT ·addE2(SB), NOSPLIT, $0-24
	MOVQ x+8(FP), AX
	LOAD_A0(AX, BX, SI, DI, R8)
	MOVQ y+16(FP), DX
	ADD_A0(DX, BX, SI, DI, R8)

	// reduce element(BX,SI,DI,R8) using temp registers (R9,R10,R11,R12)
	REDUCE(BX, SI, DI, R8, R9, R10, R11, R12)

	MOVQ res+0(FP), CX
	STORE_A0(BX, SI, DI, R8, CX)

	// second coordinate, same pattern
	LOAD_A1(AX, BX, SI, DI, R8)
	ADD_A1(DX, BX, SI, DI, R8)

	// reduce element(BX,SI,DI,R8) using temp registers (R13,R14,R15,R9)
	REDUCE(BX, SI, DI, R8, R13, R14, R15, R9)

	STORE_A1(BX, SI, DI, R8, CX)
	RET

// doubleE2(res, x): res.A0 = 2*x.A0, res.A1 = 2*x.A1, each followed by a
// conditional subtraction of q.
// DX = res, AX = x; (CX,BX,SI,DI) is the working element.
TEXT ·doubleE2(SB), NOSPLIT, $0-16
	MOVQ res+0(FP), DX
	MOVQ x+8(FP), AX
	LOAD_A0(AX, CX, BX, SI, DI)
	DOUBLE(CX, BX, SI, DI)

	// reduce element(CX,BX,SI,DI) using temp registers (R8,R9,R10,R11)
	REDUCE(CX, BX, SI, DI, R8, R9, R10, R11)

	STORE_A0(CX, BX, SI, DI, DX)

	// second coordinate, same pattern
	LOAD_A1(AX, CX, BX, SI, DI)
	DOUBLE(CX, BX, SI, DI)

	// reduce element(CX,BX,SI,DI) using temp registers (R12,R13,R14,R15)
	REDUCE(CX, BX, SI, DI, R12, R13, R14, R15)

	STORE_A1(CX, BX, SI, DI, DX)
	RET

// subE2(res, x, y): res.A0 = x.A0 - y.A0, res.A1 = x.A1 - y.A1; q is added
// back to a coordinate whenever its subtraction borrows.
// DI stays zero for the whole function; SI alternates between x and y.
TEXT ·subE2(SB), NOSPLIT, $0-24
	XORQ DI, DI
	MOVQ x+8(FP), SI
	LOAD_A0(SI, AX, DX, CX, BX)
	MOVQ y+16(FP), SI
	SUB_A0(SI, AX, DX, CX, BX)
	MOVQ x+8(FP), SI                                   // MOVQ keeps the borrow flag alive
	ADD_Q_IF_BORROW(DI, AX, DX, CX, BX, R8, R9, R10, R11)
	MOVQ res+0(FP), R12
	STORE_A0(AX, DX, CX, BX, R12)

	// second coordinate; SI still points at x here
	LOAD_A1(SI, AX, DX, CX, BX)
	MOVQ y+16(FP), SI
	SUB_A1(SI, AX, DX, CX, BX)
	ADD_Q_IF_BORROW(DI, AX, DX, CX, BX, R13, R14, R15, R8)
	MOVQ res+0(FP), SI
	STORE_A1(AX, DX, CX, BX, SI)
	RET

// negE2(res, x): each coordinate is handled on its own: if all four limbs
// are zero the result coordinate is zero, otherwise it is q - coordinate.
// DX = res; (BX,SI,DI,R8) holds the coordinate; AX is the OR of its limbs.
TEXT ·negE2(SB), NOSPLIT, $0-16
	MOVQ res+0(FP), DX
	MOVQ x+8(FP), AX
	LOAD_A0(AX, BX, SI, DI, R8)

	// AX = BX | SI | DI | R8
	MOVQ BX, AX
	ORQ SI, AX
	ORQ DI, AX
	ORQ R8, AX
	TESTQ AX, AX
	JNE a0_nonzero

	// x.A0 == 0: AX is zero, store it four times
	STORE_A0(AX, AX, AX, AX, DX)
	JMP second_coord

a0_nonzero:
	// res.A0 = q - x.A0, one limb at a time through CX
	// (the MOVQs in between do not disturb the borrow)
	MOVQ $Q_LIMB0, CX
	SUBQ BX, CX
	MOVQ CX, 0(DX)
	MOVQ $Q_LIMB1, CX
	SBBQ SI, CX
	MOVQ CX, 8(DX)
	MOVQ $Q_LIMB2, CX
	SBBQ DI, CX
	MOVQ CX, 16(DX)
	MOVQ $Q_LIMB3, CX
	SBBQ R8, CX
	MOVQ CX, 24(DX)

second_coord:
	MOVQ x+8(FP), AX
	LOAD_A1(AX, BX, SI, DI, R8)

	// AX = BX | SI | DI | R8
	MOVQ BX, AX
	ORQ SI, AX
	ORQ DI, AX
	ORQ R8, AX
	TESTQ AX, AX
	JNE a1_nonzero

	// x.A1 == 0
	STORE_A1(AX, AX, AX, AX, DX)
	RET

a1_nonzero:
	// res.A1 = q - x.A1
	MOVQ $Q_LIMB0, CX
	SUBQ BX, CX
	MOVQ CX, 32(DX)
	MOVQ $Q_LIMB1, CX
	SBBQ SI, CX
	MOVQ CX, 40(DX)
	MOVQ $Q_LIMB2, CX
	SBBQ DI, CX
	MOVQ CX, 48(DX)
	MOVQ $Q_LIMB3, CX
	SBBQ R8, CX
	MOVQ CX, 56(DX)
	RET

// mulNonResE2(res, x):
//   res.A0 = 9*x.A0 - x.A1
//   res.A1 = 9*x.A1 + x.A0
// 9*v is built as three reduced doublings (8*v) plus v. R10 holds x until
// both results are complete, and only then is reloaded with res.
TEXT ·mulNonResE2(SB), NOSPLIT, $0-16
	MOVQ x+8(FP), R10

	// (AX,DX,CX,BX) = 8 * x.A0
	LOAD_A0(R10, AX, DX, CX, BX)
	DOUBLE(AX, DX, CX, BX)

	// reduce element(AX,DX,CX,BX) using temp registers (R11,R12,R13,R14)
	REDUCE(AX, DX, CX, BX, R11, R12, R13, R14)

	DOUBLE(AX, DX, CX, BX)

	// reduce element(AX,DX,CX,BX) using temp registers (R15,R11,R12,R13)
	REDUCE(AX, DX, CX, BX, R15, R11, R12, R13)

	DOUBLE(AX, DX, CX, BX)

	// reduce element(AX,DX,CX,BX) using temp registers (R14,R15,R11,R12)
	REDUCE(AX, DX, CX, BX, R14, R15, R11, R12)

	// (AX,DX,CX,BX) = 9 * x.A0
	ADD_A0(R10, AX, DX, CX, BX)

	// reduce element(AX,DX,CX,BX) using temp registers (R13,R14,R15,R11)
	REDUCE(AX, DX, CX, BX, R13, R14, R15, R11)

	// (SI,DI,R8,R9) = x.A1, then (AX,DX,CX,BX) -= x.A1 with q added on borrow
	LOAD_A1(R10, SI, DI, R8, R9)
	XORQ R12, R12
	SUBQ SI, AX
	SBBQ DI, DX
	SBBQ R8, CX
	SBBQ R9, BX
	ADD_Q_IF_BORROW(R12, AX, DX, CX, BX, R13, R14, R15, R11)

	// (SI,DI,R8,R9) = 8 * x.A1
	DOUBLE(SI, DI, R8, R9)

	// reduce element(SI,DI,R8,R9) using temp registers (R13,R14,R15,R11)
	REDUCE(SI, DI, R8, R9, R13, R14, R15, R11)

	DOUBLE(SI, DI, R8, R9)

	// reduce element(SI,DI,R8,R9) using temp registers (R12,R13,R14,R15)
	REDUCE(SI, DI, R8, R9, R12, R13, R14, R15)

	DOUBLE(SI, DI, R8, R9)

	// reduce element(SI,DI,R8,R9) using temp registers (R11,R12,R13,R14)
	REDUCE(SI, DI, R8, R9, R11, R12, R13, R14)

	// (SI,DI,R8,R9) = 9 * x.A1
	ADD_A1(R10, SI, DI, R8, R9)

	// reduce element(SI,DI,R8,R9) using temp registers (R15,R11,R12,R13)
	REDUCE(SI, DI, R8, R9, R15, R11, R12, R13)

	// (SI,DI,R8,R9) += x.A0
	ADD_A0(R10, SI, DI, R8, R9)

	// reduce element(SI,DI,R8,R9) using temp registers (R14,R15,R11,R12)
	REDUCE(SI, DI, R8, R9, R14, R15, R11, R12)

	MOVQ res+0(FP), R10
	STORE_A0(AX, DX, CX, BX, R10)
	STORE_A1(SI, DI, R8, R9, R10)
	RET

// mulAdxE2(res, x, y): product of x and y using three MUL()s.
// When ·supportAdx is not 1 the work is handed to ·mulGenericE2 instead.
//
// Frame layout (64 bytes of locals):
//   s0..s3 : (x.A0 + x.A1) * (y.A0 + y.A1)
//   s4..s7 : x.A1 * y.A1
TEXT ·mulAdxE2(SB), $64-24
	NO_LOCAL_POINTERS

	// var a, b, c fp.Element
	// a.Add(&x.A0, &x.A1)
	// b.Add(&y.A0, &y.A1)
	// a.Mul(&a, &b)
	// b.Mul(&x.A0, &y.A0)
	// c.Mul(&x.A1, &y.A1)
	// z.A1.Sub(&a, &b).Sub(&z.A1, &c)
	// z.A0.Sub(&b, &c)

	CMPB ·supportAdx(SB), $1
	JNE mul_generic

	// c = x.A1 * y.A1
	MOVQ x+8(FP), AX
	MOVQ y+16(FP), DX
	LOAD_A1(AX, R14, R15, CX, BX)
	LOAD_A1(DX, SI, DI, R8, R9)

	// mul (R14,R15,CX,BX) with (SI,DI,R8,R9) into (R10,R11,R12,R13)
	MUL()

	// reduce element(R10,R11,R12,R13) using temp registers (SI,DI,R8,R9)
	REDUCE(R10, R11, R12, R13, SI, DI, R8, R9)

	MOVQ R10, s4-40(SP)
	MOVQ R11, s5-48(SP)
	MOVQ R12, s6-56(SP)
	MOVQ R13, s7-64(SP)

	// a = (x.A0 + x.A1) * (y.A0 + y.A1)
	// (R14,R15,CX,BX) still holds x.A1 because MUL leaves its operands
	// intact; neither sum is reduced before the multiplication.
	MOVQ x+8(FP), AX
	MOVQ y+16(FP), DX
	ADD_A0(AX, R14, R15, CX, BX)
	LOAD_A0(DX, SI, DI, R8, R9)
	ADD_A1(DX, SI, DI, R8, R9)

	// mul (R14,R15,CX,BX) with (SI,DI,R8,R9) into (R10,R11,R12,R13)
	MUL()

	// reduce element(R10,R11,R12,R13) using temp registers (SI,DI,R8,R9)
	REDUCE(R10, R11, R12, R13, SI, DI, R8, R9)

	MOVQ R10, s0-8(SP)
	MOVQ R11, s1-16(SP)
	MOVQ R12, s2-24(SP)
	MOVQ R13, s3-32(SP)

	// b = x.A0 * y.A0, kept in (R10,R11,R12,R13)
	MOVQ x+8(FP), AX
	MOVQ y+16(FP), DX
	LOAD_A0(AX, R14, R15, CX, BX)
	LOAD_A0(DX, SI, DI, R8, R9)

	// mul (R14,R15,CX,BX) with (SI,DI,R8,R9) into (R10,R11,R12,R13)
	MUL()

	// reduce element(R10,R11,R12,R13) using temp registers (SI,DI,R8,R9)
	REDUCE(R10, R11, R12, R13, SI, DI, R8, R9)

	// DX = 0 from here on; it is the zero source for ADD_Q_IF_BORROW
	XORQ DX, DX

	// (R14,R15,CX,BX) = a - b
	MOVQ s0-8(SP), R14
	MOVQ s1-16(SP), R15
	MOVQ s2-24(SP), CX
	MOVQ s3-32(SP), BX
	SUBQ R10, R14
	SBBQ R11, R15
	SBBQ R12, CX
	SBBQ R13, BX
	ADD_Q_IF_BORROW(DX, R14, R15, CX, BX, SI, DI, R8, R9)

	// (R14,R15,CX,BX) = a - b - c  ->  res.A1
	SUBQ s4-40(SP), R14
	SBBQ s5-48(SP), R15
	SBBQ s6-56(SP), CX
	SBBQ s7-64(SP), BX
	ADD_Q_IF_BORROW(DX, R14, R15, CX, BX, SI, DI, R8, R9)
	MOVQ res+0(FP), AX
	STORE_A1(R14, R15, CX, BX, AX)

	// (R10,R11,R12,R13) = b - c  ->  res.A0
	MOVQ s4-40(SP), SI
	MOVQ s5-48(SP), DI
	MOVQ s6-56(SP), R8
	MOVQ s7-64(SP), R9
	SUBQ SI, R10
	SBBQ DI, R11
	SBBQ R8, R12
	SBBQ R9, R13
	ADD_Q_IF_BORROW(DX, R10, R11, R12, R13, R14, R15, CX, BX)
	STORE_A0(R10, R11, R12, R13, AX)
	RET

mul_generic:
	// fallback: copy res, x, y into the outgoing argument slots and call
	// ·mulGenericE2
	MOVQ res+0(FP), AX
	MOVQ AX, (SP)
	MOVQ x+8(FP), AX
	MOVQ AX, 8(SP)
	MOVQ y+16(FP), AX
	MOVQ AX, 16(SP)
	CALL ·mulGenericE2(SB)
	RET

// squareAdxE2(res, x): square of x using two MUL()s.
// When ·supportAdx is not 1 the work is handed to ·squareGenericE2 instead.
TEXT ·squareAdxE2(SB), $16-16
	NO_LOCAL_POINTERS

	// z.A0 = (x.A0 + x.A1) * (x.A0 - x.A1)
	// z.A1 = 2 * x.A0 * x.A1

	CMPB ·supportAdx(SB), $1
	JNE square_generic

	// 2 * x.A0 * x.A1
	MOVQ x+8(FP), AX

	// x.A0[0] -> SI
	// x.A0[1] -> DI
	// x.A0[2] -> R8
	// x.A0[3] -> R9
	LOAD_A0(AX, SI, DI, R8, R9)

	// 2 * x.A1[0] -> R14
	// 2 * x.A1[1] -> R15
	// 2 * x.A1[2] -> CX
	// 2 * x.A1[3] -> BX
	// (the doubling is not reduced before the multiplication)
	LOAD_A1(AX, R14, R15, CX, BX)
	DOUBLE(R14, R15, CX, BX)

	// mul (R14,R15,CX,BX) with (SI,DI,R8,R9) into (R10,R11,R12,R13)
	MUL()

	// reduce element(R10,R11,R12,R13) using temp registers (R14,R15,CX,BX)
	REDUCE(R10, R11, R12, R13, R14, R15, CX, BX)

	MOVQ x+8(FP), AX

	// x.A1[0] -> R14
	// x.A1[1] -> R15
	// x.A1[2] -> CX
	// x.A1[3] -> BX
	// x.A1 is re-read before res.A1 is written; x.A0 is still in (SI,DI,R8,R9)
	LOAD_A1(AX, R14, R15, CX, BX)
	MOVQ res+0(FP), DX
	STORE_A1(R10, R11, R12, R13, DX)

	// keep a second copy of x.A1 in (R10,R11,R12,R13) for the subtraction
	MOVQ R14, R10
	MOVQ R15, R11
	MOVQ CX, R12
	MOVQ BX, R13

	// Add(&x.A0, &x.A1), not reduced
	ADDQ SI, R14
	ADCQ DI, R15
	ADCQ R8, CX
	ADCQ R9, BX
	XORQ BP, BP                                        // BP = 0 for ADD_Q_IF_BORROW

	// Sub(&x.A0, &x.A1)
	SUBQ R10, SI
	SBBQ R11, DI
	SBBQ R12, R8
	SBBQ R13, R9
	ADD_Q_IF_BORROW(BP, SI, DI, R8, R9, R10, R11, R12, R13)

	// mul (R14,R15,CX,BX) with (SI,DI,R8,R9) into (R10,R11,R12,R13)
	MUL()

	// reduce element(R10,R11,R12,R13) using temp registers (R14,R15,CX,BX)
	REDUCE(R10, R11, R12, R13, R14, R15, CX, BX)

	MOVQ res+0(FP), AX
	STORE_A0(R10, R11, R12, R13, AX)
	RET

square_generic:
	// fallback: copy res, x into the outgoing argument slots and call
	// ·squareGenericE2
	MOVQ res+0(FP), AX
	MOVQ AX, (SP)
	MOVQ x+8(FP), AX
	MOVQ AX, 8(SP)
	CALL ·squareGenericE2(SB)
	RET