// github.com/emmansun/gmsm@v0.29.1/internal/sm2ec/p256_macros_amd64.s
//
// Shared macros for the SM2 P-256 constant-time field/scalar arithmetic on
// amd64 (Go Plan 9 assembly). Values are 256-bit integers held in four
// 64-bit limbs, least-significant limb first. Two instruction variants are
// provided for the hot paths: a baseline MULQ/ADCQ form and an ADX form
// (MULXQ/ADCXQ/ADOXQ) for CPUs with BMI2+ADX.
//
// NOTE(review): inside `#define` bodies a lone `\` followed by `//` acts as a
// line continuation (the comment is stripped first), which is why comment
// lines are written as `\ // ...` or `\// ...`.

// Register aliases used throughout this file.
#define res_ptr DI
#define x_ptr SI
#define y_ptr CX

#define acc0 R8
#define acc1 R9
#define acc2 R10
#define acc3 R11
#define acc4 R12
#define acc5 R13
#define t0 R14

// p256p: the SM2 field prime p, little-endian limbs.
DATA p256p<>+0x00(SB)/8, $0xffffffffffffffff
DATA p256p<>+0x08(SB)/8, $0xffffffff00000000
DATA p256p<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256p<>+0x18(SB)/8, $0xfffffffeffffffff
// p256ordK0: Montgomery constant for the group order (multiplied into the
// low limb in each order-reduction step below).
DATA p256ordK0<>+0x00(SB)/8, $0x327f9e8872350975
// p256ord: the SM2 group order n, little-endian limbs.
DATA p256ord<>+0x00(SB)/8, $0x53bbf40939d54123
DATA p256ord<>+0x08(SB)/8, $0x7203df6b21c6052b
DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x18(SB)/8, $0xfffffffeffffffff
// p256one: the value 1 in the Montgomery domain (presumably 2^256 mod p —
// TODO confirm against the Go-side callers).
DATA p256one<>+0x00(SB)/8, $0x0000000000000001
DATA p256one<>+0x08(SB)/8, $0x00000000ffffffff
DATA p256one<>+0x10(SB)/8, $0x0000000000000000
DATA p256one<>+0x18(SB)/8, $0x0000000100000000
GLOBL p256p<>(SB), 8, $32
GLOBL p256ordK0<>(SB), 8, $8
GLOBL p256ord<>(SB), 8, $32
GLOBL p256one<>(SB), 8, $32

// p256SqrMontReduceInline performs the four Montgomery reduction steps mod p
// on the 512-bit square result held in [x_ptr, y_ptr, acc5, acc4 | acc3..acc0],
// leaving the (possibly >= p, at most one extra bit in t0) sum in
// [t0 | acc3, acc2, acc1, acc0]. Clobbers AX, DX. The carry out of the final
// ADCQ $0, acc3 below deliberately flows into the "[511:256]" addition chain.
#define p256SqrMontReduceInline \
	\ // First reduction step, [p3, p2, p1, p0] = [1, -0x100000000, 0, (1 - 0x100000000), -1]
	MOVQ acc0, AX \
	MOVQ acc0, DX \
	SHLQ $32, AX \
	SHRQ $32, DX \
	\// calculate the negative part: [1, -0x100000000, 0, -0x100000000] * acc0 + [0, acc3, acc2, acc1]
	SUBQ AX, acc1 \
	SBBQ DX, acc2 \
	SBBQ AX, acc3 \
	MOVQ acc0, AX \
	SBBQ DX, acc0 \
	\ // calculate the positive part: [0, 0, 0, AX] + [acc0, acc3, acc2, acc1],
	\ // due to (-1) * acc0 + acc0 == 0, so last lowest limb 0 is dropped directly, no carry.
	ADDQ AX, acc1 \
	ADCQ $0, acc2 \
	ADCQ $0, acc3 \
	ADCQ $0, acc0 \
	\ // Second reduction step
	MOVQ acc1, AX \
	MOVQ acc1, DX \
	SHLQ $32, AX \
	SHRQ $32, DX \
	\
	SUBQ AX, acc2 \
	SBBQ DX, acc3 \
	SBBQ AX, acc0 \
	MOVQ acc1, AX \
	SBBQ DX, acc1 \
	\
	ADDQ AX, acc2 \
	ADCQ $0, acc3 \
	ADCQ $0, acc0 \
	ADCQ $0, acc1 \
	\ // Third reduction step
	MOVQ acc2, AX \
	MOVQ acc2, DX \
	SHLQ $32, AX \
	SHRQ $32, DX \
	\
	SUBQ AX, acc3 \
	SBBQ DX, acc0 \
	SBBQ AX, acc1 \
	MOVQ acc2, AX \
	SBBQ DX, acc2 \
	\
	ADDQ AX, acc3 \
	ADCQ $0, acc0 \
	ADCQ $0, acc1 \
	ADCQ $0, acc2 \
	\ // Last reduction step
	XORQ t0, t0 \
	MOVQ acc3, AX \
	MOVQ acc3, DX \
	SHLQ $32, AX \
	SHRQ $32, DX \
	\
	SUBQ AX, acc0 \
	SBBQ DX, acc1 \
	SBBQ AX, acc2 \
	MOVQ acc3, AX \
	SBBQ DX, acc3 \
	\
	ADDQ AX, acc0 \
	ADCQ $0, acc1 \
	ADCQ $0, acc2 \
	ADCQ $0, acc3 \
	\ // Add bits [511:256] of the sqr result
	ADCQ acc4, acc0 \
	ADCQ acc5, acc1 \
	ADCQ y_ptr, acc2 \
	ADCQ x_ptr, acc3 \
	ADCQ $0, t0

/* ---------------------------------------*/
// p256PrimReduce conditionally subtracts p from the 257-bit value
// [a4 | a3, a2, a1, a0] and stores the canonical 256-bit result at (res).
// b0-b3 are scratch registers that receive a copy of the pre-subtraction
// value; the CMOVQCS chain restores it when the subtraction borrowed
// (i.e. the input was already < p). Branch-free / constant time.
#define p256PrimReduce(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \
	MOVQ a0, b0 \
	MOVQ a1, b1 \
	MOVQ a2, b2 \
	MOVQ a3, b3 \
	\ // Subtract p256 ($-1 encodes the all-ones limbs of p)
	SUBQ $-1, a0 \
	SBBQ p256p<>+0x08(SB), a1 \
	SBBQ $-1, a2 \
	SBBQ p256p<>+0x018(SB), a3 \
	SBBQ $0, a4 \
	\ // If the result of the subtraction is negative, restore the previous result
	CMOVQCS b0, a0 \ // CMOVQCS: Move if below (CF == 1)
	CMOVQCS b1, a1 \
	CMOVQCS b2, a2 \
	CMOVQCS b3, a3 \
	\
	MOVQ a0, (8*0)(res) \
	MOVQ a1, (8*1)(res) \
	MOVQ a2, (8*2)(res) \
	MOVQ a3, (8*3)(res)

/* ---------------------------------------*/
// p256OrdReduceInline: same shape as p256PrimReduce, but reduces modulo the
// group order n (p256ord) instead of the prime p, then stores to (res).
#define p256OrdReduceInline(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \
	\// Copy result [255:0]
	MOVQ a0, b0 \
	MOVQ a1, b1 \
	MOVQ a2, b2 \
	MOVQ a3, b3 \
	\// Subtract p256ord
	SUBQ p256ord<>+0x00(SB), a0 \
	SBBQ p256ord<>+0x08(SB), a1 \
	SBBQ p256ord<>+0x10(SB), a2 \
	SBBQ p256ord<>+0x18(SB), a3 \
	SBBQ $0, a4 \
	\ // If the result of the subtraction is negative, restore the previous result
	CMOVQCS b0, a0 \ // CMOVQCS: Move if below (CF == 1)
	CMOVQCS b1, a1 \
	CMOVQCS b2, a2 \
	CMOVQCS b3, a3 \
	\
	MOVQ a0, (8*0)(res) \
	MOVQ a1, (8*1)(res) \
	MOVQ a2, (8*2)(res) \
	MOVQ a3, (8*3)(res)

/* ---------------------------------------*/
// sm2P256SqrReductionInline: Montgomery-reduce [t3..t0 | acc3..acc0] mod p,
// add the high half, then fully reduce; canonical result lands in
// [acc7, acc6, acc5, acc4]. Uses mul0/mul1 as scratch (register aliases
// defined by the including file). Constant time.
#define sm2P256SqrReductionInline \
	\ // First reduction step
	MOVQ acc0, mul0 \
	MOVQ acc0, mul1 \
	SHLQ $32, mul0 \
	SHRQ $32, mul1 \
	\
	SUBQ mul0, acc1 \
	SBBQ mul1, acc2 \
	SBBQ mul0, acc3 \
	MOVQ acc0, mul0 \
	SBBQ mul1, acc0 \
	\
	ADDQ mul0, acc1 \
	ADCQ $0, acc2 \
	ADCQ $0, acc3 \
	ADCQ $0, acc0 \
	\ // Second reduction step
	MOVQ acc1, mul0 \
	MOVQ acc1, mul1 \
	SHLQ $32, mul0 \
	SHRQ $32, mul1 \
	\
	SUBQ mul0, acc2 \
	SBBQ mul1, acc3 \
	SBBQ mul0, acc0 \
	MOVQ acc1, mul0 \
	SBBQ mul1, acc1 \
	\
	ADDQ mul0, acc2 \
	ADCQ $0, acc3 \
	ADCQ $0, acc0 \
	ADCQ $0, acc1 \
	\ // Third reduction step
	MOVQ acc2, mul0 \
	MOVQ acc2, mul1 \
	SHLQ $32, mul0 \
	SHRQ $32, mul1 \
	\
	SUBQ mul0, acc3 \
	SBBQ mul1, acc0 \
	SBBQ mul0, acc1 \
	MOVQ acc2, mul0 \
	SBBQ mul1, acc2 \
	\
	ADDQ mul0, acc3 \
	ADCQ $0, acc0 \
	ADCQ $0, acc1 \
	ADCQ $0, acc2 \
	\ // Last reduction step
	MOVQ acc3, mul0 \
	MOVQ acc3, mul1 \
	SHLQ $32, mul0 \
	SHRQ $32, mul1 \
	\
	SUBQ mul0, acc0 \
	SBBQ mul1, acc1 \
	SBBQ mul0, acc2 \
	MOVQ acc3, mul0 \
	SBBQ mul1, acc3 \
	\
	ADDQ mul0, acc0 \
	ADCQ $0, acc1 \
	ADCQ $0, acc2 \
	ADCQ $0, acc3 \
	MOVQ $0, mul0 \ // MOVQ (not XORQ): must preserve CF for the ADCQ chain
	\ // Add bits [511:256] of the result
	ADCQ acc0, t0 \
	ADCQ acc1, t1 \
	ADCQ acc2, t2 \
	ADCQ acc3, t3 \
	ADCQ $0, mul0 \
	\ // Copy result
	MOVQ t0, acc4 \
	MOVQ t1, acc5 \
	MOVQ t2, acc6 \
	MOVQ t3, acc7 \
	\ // Subtract p256
	SUBQ $-1, acc4 \
	SBBQ p256p<>+0x08(SB), acc5 \
	SBBQ $-1, acc6 \
	SBBQ p256p<>+0x018(SB), acc7\
	SBBQ $0, mul0 \
	\ // If the result of the subtraction is negative, restore the previous result
	CMOVQCS t0, acc4 \ // CMOVQCS: Move if below (CF == 1)
	CMOVQCS t1, acc5 \
	CMOVQCS t2, acc6 \
	CMOVQCS t3, acc7

/* ---------------------------------------*/
// sm2P256MulReductionInline: the four Montgomery reduction steps only (same
// as the first part of sm2P256SqrReductionInline); the caller adds the high
// half and does the final conditional subtraction itself.
#define sm2P256MulReductionInline \
	\// First reduction step
	MOVQ acc0, mul0 \
	MOVQ acc0, mul1 \
	SHLQ $32, mul0 \
	SHRQ $32, mul1 \
	\
	SUBQ mul0, acc1 \
	SBBQ mul1, acc2 \
	SBBQ mul0, acc3 \
	MOVQ acc0, mul0 \
	SBBQ mul1, acc0 \
	\
	ADDQ mul0, acc1 \
	ADCQ $0, acc2 \
	ADCQ $0, acc3 \
	ADCQ $0, acc0 \
	\// Second reduction step
	MOVQ acc1, mul0 \
	MOVQ acc1, mul1 \
	SHLQ $32, mul0 \
	SHRQ $32, mul1 \
	\
	SUBQ mul0, acc2 \
	SBBQ mul1, acc3 \
	SBBQ mul0, acc0 \
	MOVQ acc1, mul0 \
	SBBQ mul1, acc1 \
	\
	ADDQ mul0, acc2 \
	ADCQ $0, acc3 \
	ADCQ $0, acc0 \
	ADCQ $0, acc1 \
	\// Third reduction step
	MOVQ acc2, mul0 \
	MOVQ acc2, mul1 \
	SHLQ $32, mul0 \
	SHRQ $32, mul1 \
	\
	SUBQ mul0, acc3 \
	SBBQ mul1, acc0 \
	SBBQ mul0, acc1 \
	MOVQ acc2, mul0 \
	SBBQ mul1, acc2 \
	\
	ADDQ mul0, acc3 \
	ADCQ $0, acc0 \
	ADCQ $0, acc1 \
	ADCQ $0, acc2 \
	\// Last reduction step
	MOVQ acc3, mul0 \
	MOVQ acc3, mul1 \
	SHLQ $32, mul0 \
	SHRQ $32, mul1 \
	\
	SUBQ mul0, acc0 \
	SBBQ mul1, acc1 \
	SBBQ mul0, acc2 \
	MOVQ acc3, mul0 \
	SBBQ mul1, acc3 \
	\
	ADDQ mul0, acc0 \
	ADCQ $0, acc1 \
	ADCQ $0, acc2 \
	ADCQ $0, acc3

/* ---------------------------------------*/
// p256SqrRound: one Montgomery squaring mod p (schoolbook cross products,
// double, add the diagonal squares, then reduce and store at res_ptr).
// In: x_ptr -> 4-limb input; out: result stored at res_ptr, x_ptr := res_ptr
// so repeated squarings chain. t1 is a caller-chosen scratch register.
#define p256SqrRound(t1) \
	\// y[1:] * y[0]
	MOVQ (8*0)(x_ptr), t0;\
	\
	MOVQ (8*1)(x_ptr), AX;\
	MULQ t0;\
	MOVQ AX, acc1;\
	MOVQ DX, acc2;\
	\
	MOVQ (8*2)(x_ptr), AX;\
	MULQ t0;\
	ADDQ AX, acc2;\
	ADCQ $0, DX;\
	MOVQ DX, acc3;\
	\
	MOVQ (8*3)(x_ptr), AX;\
	MULQ t0;\
	ADDQ AX, acc3;\
	ADCQ $0, DX;\
	MOVQ DX, acc4;\
	\// y[2:] * y[1]
	MOVQ (8*1)(x_ptr), t0;\
	\
	MOVQ (8*2)(x_ptr), AX;\
	MULQ t0;\
	ADDQ AX, acc3;\
	ADCQ $0, DX;\
	MOVQ DX, t1;\
	\
	MOVQ (8*3)(x_ptr), AX;\
	MULQ t0;\
	ADDQ t1, acc4;\
	ADCQ $0, DX;\
	ADDQ AX, acc4;\
	ADCQ $0, DX;\
	MOVQ DX, acc5;\
	\// y[3] * y[2]
	MOVQ (8*2)(x_ptr), t0;\
	\
	MOVQ (8*3)(x_ptr), AX;\
	MULQ t0;\
	ADDQ AX, acc5;\
	ADCQ $0, DX;\
	MOVQ DX, y_ptr;\
	XORQ t1, t1;\
	\// *2
	ADDQ acc1, acc1;\
	ADCQ acc2, acc2;\
	ADCQ acc3, acc3;\
	ADCQ acc4, acc4;\
	ADCQ acc5, acc5;\
	ADCQ y_ptr, y_ptr;\
	ADCQ $0, t1;\
	\// Missing products (the diagonal squares x[i]^2)
	MOVQ (8*0)(x_ptr), AX;\
	MULQ AX;\
	MOVQ AX, acc0;\
	MOVQ DX, t0;\
	\
	MOVQ (8*1)(x_ptr), AX;\
	MULQ AX;\
	ADDQ t0, acc1;\
	ADCQ AX, acc2;\
	ADCQ $0, DX;\
	MOVQ DX, t0;\
	\
	MOVQ (8*2)(x_ptr), AX;\
	MULQ AX;\
	ADDQ t0, acc3;\
	ADCQ AX, acc4;\
	ADCQ $0, DX;\
	MOVQ DX, t0;\
	\
	MOVQ (8*3)(x_ptr), AX;\
	MULQ AX;\
	ADDQ t0, acc5;\
	ADCQ AX, y_ptr;\
	ADCQ DX, t1;\
	MOVQ t1, x_ptr;\
	\// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
	p256SqrMontReduceInline;\
	p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr);\
	MOVQ res_ptr, x_ptr;

/* ---------------------------------------*/
// p256SqrRoundAdx: same contract as p256SqrRound, using MULXQ/ADCXQ/ADOXQ
// (BMI2+ADX) to run two independent carry chains (CF and OF) in parallel.
#define p256SqrRoundAdx(t1) \
	XORQ acc0, acc0;\ // also clears CF and OF for the ADCXQ/ADOXQ chains
	XORQ y_ptr, y_ptr;\
	\// x[1:] * x[0]
	MOVQ (8*0)(x_ptr), DX;\
	MULXQ (8*1)(x_ptr), acc1, acc2;\
	\
	MULXQ (8*2)(x_ptr), AX, acc3;\
	ADOXQ AX, acc2;\
	\
	MULXQ (8*3)(x_ptr), AX, acc4;\
	ADOXQ AX, acc3;\
	ADOXQ y_ptr, acc4;\
	\
	\// x[2:] * x[1]
	MOVQ (8*1)(x_ptr), DX;\
	MULXQ (8*2)(x_ptr), AX, t1;\
	ADOXQ AX, acc3;\
	\
	MULXQ (8*3)(x_ptr), AX, acc5;\
	ADCXQ t1, AX;\
	ADOXQ AX, acc4;\
	ADCXQ y_ptr, acc5;\
	\
	\// x[3] * x[2]
	MOVQ (8*2)(x_ptr), DX;\
	MULXQ (8*3)(x_ptr), AX, y_ptr;\
	ADOXQ AX, acc5;\
	ADOXQ acc0, y_ptr;\
	\
	XORQ t1, t1;\ // clears CF/OF again before the doubling chain
	\
	\// *2
	ADOXQ acc1, acc1;\
	ADOXQ acc2, acc2;\
	ADOXQ acc3, acc3;\
	ADOXQ acc4, acc4;\
	ADOXQ acc5, acc5;\
	ADOXQ y_ptr, y_ptr;\
	ADOXQ acc0, t1;\
	\
	\// Missing products (the diagonal squares x[i]^2)
	MOVQ (8*0)(x_ptr), DX;\
	MULXQ DX, acc0, t0;\
	ADCXQ t0, acc1;\
	\
	MOVQ (8*1)(x_ptr), DX;\
	MULXQ DX, AX, t0;\
	ADCXQ AX, acc2;\
	ADCXQ t0, acc3;\
	\
	MOVQ (8*2)(x_ptr), DX;\
	MULXQ DX, AX, t0;\
	ADCXQ AX, acc4;\
	ADCXQ t0, acc5;\
	\
	MOVQ (8*3)(x_ptr), DX;\
	MULXQ DX, AX, x_ptr;\
	ADCXQ AX, y_ptr;\
	ADCXQ t1, x_ptr;\
	\
	\// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
	p256SqrMontReduceInline;\
	p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr);\
	MOVQ res_ptr, x_ptr;

/* ---------------------------------------*/
// p256OrdSqrRound: one Montgomery squaring modulo the group order n.
// Multiplication phase is identical to p256SqrRound; the reduction uses the
// k0 = p256ordK0 Montgomery trick per limb. In/out contract matches
// p256SqrRound (x_ptr -> input, result at res_ptr, x_ptr := res_ptr).
#define p256OrdSqrRound(t1) \
	\// y[1:] * y[0]
	MOVQ (8*0)(x_ptr), t0;\
	\
	MOVQ (8*1)(x_ptr), AX;\
	MULQ t0;\
	MOVQ AX, acc1;\
	MOVQ DX, acc2;\
	\
	MOVQ (8*2)(x_ptr), AX;\
	MULQ t0;\
	ADDQ AX, acc2;\
	ADCQ $0, DX;\
	MOVQ DX, acc3;\
	\
	MOVQ (8*3)(x_ptr), AX;\
	MULQ t0;\
	ADDQ AX, acc3;\
	ADCQ $0, DX;\
	MOVQ DX, acc4;\
	\// y[2:] * y[1]
	MOVQ (8*1)(x_ptr), t0;\
	\
	MOVQ (8*2)(x_ptr), AX;\
	MULQ t0;\
	ADDQ AX, acc3;\
	ADCQ $0, DX;\
	MOVQ DX, t1;\
	\
	MOVQ (8*3)(x_ptr), AX;\
	MULQ t0;\
	ADDQ t1, acc4;\
	ADCQ $0, DX;\
	ADDQ AX, acc4;\
	ADCQ $0, DX;\
	MOVQ DX, acc5;\
	\// y[3] * y[2]
	MOVQ (8*2)(x_ptr), t0;\
	\
	MOVQ (8*3)(x_ptr), AX;\
	MULQ t0;\
	ADDQ AX, acc5;\
	ADCQ $0, DX;\
	MOVQ DX, y_ptr;\
	XORQ t1, t1;\
	\// *2
	ADDQ acc1, acc1;\
	ADCQ acc2, acc2;\
	ADCQ acc3, acc3;\
	ADCQ acc4, acc4;\
	ADCQ acc5, acc5;\
	ADCQ y_ptr, y_ptr;\
	ADCQ $0, t1;\
	\// Missing products (the diagonal squares y[i]^2)
	MOVQ (8*0)(x_ptr), AX;\
	MULQ AX;\
	MOVQ AX, acc0;\
	MOVQ DX, t0;\
	\
	MOVQ (8*1)(x_ptr), AX;\
	MULQ AX;\
	ADDQ t0, acc1;\
	ADCQ AX, acc2;\
	ADCQ $0, DX;\
	MOVQ DX, t0;\
	\
	MOVQ (8*2)(x_ptr), AX;\
	MULQ AX;\
	ADDQ t0, acc3;\
	ADCQ AX, acc4;\
	ADCQ $0, DX;\
	MOVQ DX, t0;\
	\
	MOVQ (8*3)(x_ptr), AX;\
	MULQ AX;\
	ADDQ t0, acc5;\
	ADCQ AX, y_ptr;\
	ADCQ DX, t1;\
	MOVQ t1, x_ptr;\
	\
	\// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
	\// First reduction step
	MOVQ acc0, AX;\
	MULQ p256ordK0<>(SB);\
	MOVQ AX, t0;\ // Y = t0 = (k0 * acc0) mod 2^64
	\
	MOVQ p256ord<>+0x00(SB), AX;\
	MULQ t0;\
	ADDQ AX, acc0;\ // (carry1, acc0) = acc0 + L(t0 * ord0)
	ADCQ $0, DX;\ // DX = carry1 + H(t0 * ord0)
	MOVQ DX, t1;\ // t1 = carry1 + H(t0 * ord0)
	MOVQ t0, acc0;\ // acc0 = t0
	\
	\// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
	MOVQ t0, AX;\
	MOVQ t0, DX;\
	SHLQ $32, AX;\
	SHRQ $32, DX;\
	\
	SUBQ t0, acc2;\
	SBBQ AX, acc3;\
	SBBQ DX, acc0;\
	\
	MOVQ p256ord<>+0x08(SB), AX;\
	MULQ t0;\
	ADDQ t1, acc1;\ // (carry2, acc1) = acc1 + t1
	ADCQ $0, DX;\ // DX = carry2 + H(t0*ord1)
	\
	ADDQ AX, acc1;\ // (carry3, acc1) = acc1 + t1 + L(t0*ord1)
	ADCQ DX, acc2;\
	ADCQ $0, acc3;\
	ADCQ $0, acc0;\
	\
	\// Second reduction step
	MOVQ acc1, AX;\
	MULQ p256ordK0<>(SB);\
	MOVQ AX, t0;\
	\
	MOVQ p256ord<>+0x00(SB), AX;\
	MULQ t0;\
	ADDQ AX, acc1;\
	ADCQ $0, DX;\
	MOVQ DX, t1;\
	MOVQ t0, acc1;\
	\
	MOVQ t0, AX;\
	MOVQ t0, DX;\
	SHLQ $32, AX;\
	SHRQ $32, DX;\
	\
	SUBQ t0, acc3;\
	SBBQ AX, acc0;\
	SBBQ DX, acc1;\
	\
	MOVQ p256ord<>+0x08(SB), AX;\
	MULQ t0;\
	ADDQ t1, acc2;\
	ADCQ $0, DX;\
	\
	ADDQ AX, acc2;\
	ADCQ DX, acc3;\
	ADCQ $0, acc0;\
	ADCQ $0, acc1;\
	\
	\// Third reduction step
	MOVQ acc2, AX;\
	MULQ p256ordK0<>(SB);\
	MOVQ AX, t0;\
	\
	MOVQ p256ord<>+0x00(SB), AX;\
	MULQ t0;\
	ADDQ AX, acc2;\
	ADCQ $0, DX;\
	MOVQ DX, t1;\
	MOVQ t0, acc2;\
	\
	MOVQ t0, AX;\
	MOVQ t0, DX;\
	SHLQ $32, AX;\
	SHRQ $32, DX;\
	\
	SUBQ t0, acc0;\
	SBBQ AX, acc1;\
	SBBQ DX, acc2;\
	\
	MOVQ p256ord<>+0x08(SB), AX;\
	MULQ t0;\
	ADDQ t1, acc3;\
	ADCQ $0, DX;\
	\
	ADDQ AX, acc3;\
	ADCQ DX, acc0;\
	ADCQ $0, acc1;\
	ADCQ $0, acc2;\
	\
	\// Last reduction step
	MOVQ acc3, AX;\
	MULQ p256ordK0<>(SB);\
	MOVQ AX, t0;\
	\
	MOVQ p256ord<>+0x00(SB), AX;\
	MULQ t0;\
	ADDQ AX, acc3;\
	ADCQ $0, DX;\
	MOVQ DX, t1;\
	MOVQ t0, acc3;\
	\
	MOVQ t0, AX;\
	MOVQ t0, DX;\
	SHLQ $32, AX;\
	SHRQ $32, DX;\
	\
	SUBQ t0, acc1;\
	SBBQ AX, acc2;\
	SBBQ DX, acc3;\
	\
	MOVQ p256ord<>+0x08(SB), AX;\
	MULQ t0;\
	ADDQ t1, acc0;\
	ADCQ $0, DX;\
	\
	ADDQ AX, acc0;\
	ADCQ DX, acc1;\
	ADCQ $0, acc2;\
	ADCQ $0, acc3;\
	XORQ t0, t0;\ // clears CF; the previous ADCQ $0, acc3 cannot carry out here
	\// Add bits [511:256] of the sqr result
	ADCQ acc4, acc0;\
	ADCQ acc5, acc1;\
	ADCQ y_ptr, acc2;\
	ADCQ x_ptr, acc3;\
	ADCQ $0, t0;\
	\
	p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr);\
	MOVQ res_ptr, x_ptr;

/* ---------------------------------------*/
// p256OrdSqrRoundAdx: same contract as p256OrdSqrRound, using the dual
// CF/OF carry chains of ADCXQ/ADOXQ and MULXQ throughout.
#define p256OrdSqrRoundAdx(t1) \
	XORQ acc0, acc0;\ // also clears CF and OF
	XORQ y_ptr, y_ptr;\
	\// y[1:] * y[0]
	MOVQ (8*0)(x_ptr), DX;\
	MULXQ (8*1)(x_ptr), acc1, acc2;\
	\
	MULXQ (8*2)(x_ptr), AX, acc3;\
	ADOXQ AX, acc2;\
	\
	MULXQ (8*3)(x_ptr), AX, acc4;\
	ADOXQ AX, acc3;\
	ADOXQ y_ptr, acc4;\
	\
	\// y[2:] * y[1]
	MOVQ (8*1)(x_ptr), DX;\
	MULXQ (8*2)(x_ptr), AX, t1;\
	ADOXQ AX, acc3;\
	\
	MULXQ (8*3)(x_ptr), AX, acc5;\
	ADCXQ t1, AX;\
	ADOXQ AX, acc4;\
	ADCXQ y_ptr, acc5;\
	\
	\// y[3] * y[2]
	MOVQ (8*2)(x_ptr), DX;\
	MULXQ (8*3)(x_ptr), AX, y_ptr;\
	ADOXQ AX, acc5;\
	ADOXQ acc0, y_ptr;\
	\
	XORQ t1, t1;\ // clears CF/OF before the doubling chain
	\// *2
	ADOXQ acc1, acc1;\
	ADOXQ acc2, acc2;\
	ADOXQ acc3, acc3;\
	ADOXQ acc4, acc4;\
	ADOXQ acc5, acc5;\
	ADOXQ y_ptr, y_ptr;\
	ADOXQ acc0, t1;\
	\
	\// Missing products (the diagonal squares y[i]^2)
	MOVQ (8*0)(x_ptr), DX;\
	MULXQ DX, acc0, t0;\
	ADCXQ t0, acc1;\
	\
	MOVQ (8*1)(x_ptr), DX;\
	MULXQ DX, AX, t0;\
	ADCXQ AX, acc2;\
	ADCXQ t0, acc3;\
	\
	MOVQ (8*2)(x_ptr), DX;\
	MULXQ DX, AX, t0;\
	ADCXQ AX, acc4;\
	ADCXQ t0, acc5;\
	\
	MOVQ (8*3)(x_ptr), DX;\
	MULXQ DX, AX, x_ptr;\
	ADCXQ AX, y_ptr;\
	ADCXQ t1, x_ptr;\
	\
	\// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
	\// First reduction step
	MOVQ acc0, DX;\
	MULXQ p256ordK0<>(SB), DX, AX;\ // DX = k0 * acc0 mod 2^64 (AX discarded)
	\
	MULXQ p256ord<>+0x00(SB), AX, t0;\
	ADOXQ AX, acc0;\// (carry1, acc0) = acc0 + t0 * ord0
	\
	MULXQ p256ord<>+0x08(SB), AX, t1;\
	ADCXQ t0, AX;\
	ADOXQ AX, acc1;\
	\
	MULXQ p256ord<>+0x10(SB), AX, t0;\
	ADCXQ t1, AX;\
	ADOXQ AX, acc2;\
	\
	MULXQ p256ord<>+0x18(SB), AX, acc0;\
	ADCXQ t0, AX;\
	ADOXQ AX, acc3;\
	MOVQ $0, t0;\
	ADCXQ t0, acc0;\
	ADOXQ t0, acc0;\
	\
	\// Second reduction step
	MOVQ acc1, DX;\
	MULXQ p256ordK0<>(SB), DX, AX;\
	\
	MULXQ p256ord<>+0x00(SB), AX, t0;\
	ADOXQ AX, acc1;\
	\
	MULXQ p256ord<>+0x08(SB), AX, t1;\
	ADCXQ t0, AX;\
	ADOXQ AX, acc2;\
	\
	MULXQ p256ord<>+0x10(SB), AX, t0;\
	ADCXQ t1, AX;\
	ADOXQ AX, acc3;\
	\
	MULXQ p256ord<>+0x18(SB), AX, acc1;\
	ADCXQ t0, AX;\
	ADOXQ AX, acc0;\
	MOVQ $0, t0;\
	ADCXQ t0, acc1;\
	ADOXQ t0, acc1;\
	\
	\// Third reduction step
	MOVQ acc2, DX;\
	MULXQ p256ordK0<>(SB), DX, AX;\
	\
	MULXQ p256ord<>+0x00(SB), AX, t0;\
	ADOXQ AX, acc2;\
	\
	MULXQ p256ord<>+0x08(SB), AX, t1;\
	ADCXQ t0, AX;\
	ADOXQ AX, acc3;\
	\
	MULXQ p256ord<>+0x10(SB), AX, t0;\
	ADCXQ t1, AX;\
	ADOXQ AX, acc0;\
	\
	MULXQ p256ord<>+0x18(SB), AX, acc2;\
	ADCXQ t0, AX;\
	ADOXQ AX, acc1;\
	MOVQ $0, t0;\
	ADCXQ t0, acc2;\
	ADOXQ t0, acc2;\
	\
	\// Last reduction step
	MOVQ acc3, DX;\
	MULXQ p256ordK0<>(SB), DX, AX;\
	\
	MULXQ p256ord<>+0x00(SB), AX, t0;\
	ADOXQ AX, acc3;\
	\
	MULXQ p256ord<>+0x08(SB), AX, t1;\
	ADCXQ t0, AX;\
	ADOXQ AX, acc0;\
	\
	MULXQ p256ord<>+0x10(SB), AX, t0;\
	ADCXQ t1, AX;\
	ADOXQ AX, acc1;\
	\
	MULXQ p256ord<>+0x18(SB), AX, acc3;\
	ADCXQ t0, AX;\
	ADOXQ AX, acc2;\
	MOVQ $0, t0;\
	ADCXQ t0, acc3;\
	ADOXQ t0, acc3;\
	\
	XORQ t1, t1;\ // clears CF/OF before the final accumulation
	\// Add bits [511:256] of the sqr result
	ADCXQ acc4, acc0;\
	ADCXQ acc5, acc1;\
	ADCXQ y_ptr, acc2;\
	ADCXQ x_ptr, acc3;\
	ADCXQ t1, t0;\
	\
	p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr);\
	MOVQ res_ptr, x_ptr;

// Below macros are used for point operation
/* ---------------------------------------*/
// [t3, t2, t1, t0] = 2[acc7, acc6, acc5, acc4]  (mod p, constant time;
// source registers acc4..acc7 are doubled in place as a side effect)
#define p256MulBy2Inline\
	XORQ mul0, mul0;\
	ADDQ acc4, acc4;\
	ADCQ acc5, acc5;\
	ADCQ acc6, acc6;\
	ADCQ acc7, acc7;\
	ADCQ $0, mul0;\ // mul0 = carry out of the doubling
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	SUBQ $-1, t0;\ // subtract p ($-1 encodes the all-ones limbs)
	SBBQ p256p<>+0x08(SB), t1;\
	SBBQ $-1, t2;\
	SBBQ p256p<>+0x018(SB), t3;\
	SBBQ $0, mul0;\
	CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
	CMOVQCS acc5, t1;\
	CMOVQCS acc6, t2;\
	CMOVQCS acc7, t3;

/* ---------------------------------------*/
// [acc7, acc6, acc5, acc4] = 2[acc7, acc6, acc5, acc4]  (mod p; result stays
// in acc4..acc7, with t0..t3 used as scratch for the pre-subtraction copy)
#define p256MulBy2Inline2\
	XORQ mul0, mul0;\
	ADDQ acc4, acc4;\
	ADCQ acc5, acc5;\
	ADCQ acc6, acc6;\
	ADCQ acc7, acc7;\
	ADCQ $0, mul0;\
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	SUBQ $-1, acc4;\
	SBBQ p256p<>+0x08(SB), acc5;\
	SBBQ $-1, acc6;\
	SBBQ p256p<>+0x018(SB), acc7;\
	SBBQ $0, mul0;\
	CMOVQCS t0, acc4;\ // CMOVQCS: Move if below (CF == 1)
	CMOVQCS t1, acc5;\
	CMOVQCS t2, acc6;\
	CMOVQCS t3, acc7;

/* ---------------------------------------*/
// [t3, t2, t1, t0] = 3[acc7, acc6, acc5, acc4]  (mod p: double-and-reduce,
// then add the saved original and reduce again; acc0..acc3 hold the copy)
#define p256TripleInline\
	XORQ mul0, mul0;\
	MOVQ acc4, acc0;\
	MOVQ acc5, acc1;\
	MOVQ acc6, acc2;\
	MOVQ acc7, acc3;\
	ADDQ acc4, acc4;\
	ADCQ acc5, acc5;\
	ADCQ acc6, acc6;\
	ADCQ acc7, acc7;\
	ADCQ $0, mul0;\
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	SUBQ $-1, acc4;\
	SBBQ p256p<>+0x08(SB), acc5;\
	SBBQ $-1, acc6;\
	SBBQ p256p<>+0x018(SB), acc7;\
	SBBQ $0, mul0;\
	CMOVQCS t0, acc4;\ // CMOVQCS: Move if below (CF == 1)
	CMOVQCS t1, acc5;\
	CMOVQCS t2, acc6;\
	CMOVQCS t3, acc7;\
	XORQ mul0, mul0;\
	ADDQ acc0, acc4;\
	ADCQ acc1, acc5;\
	ADCQ acc2, acc6;\
	ADCQ acc3, acc7;\
	ADCQ $0, mul0;\
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	SUBQ $-1, t0;\
	SBBQ p256p<>+0x08(SB), t1;\
	SBBQ $-1, t2;\
	SBBQ p256p<>+0x018(SB), t3;\
	SBBQ $0, mul0;\
	CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
	CMOVQCS acc5, t1;\
	CMOVQCS acc6, t2;\
	CMOVQCS acc7, t3;

/* ---------------------------------------*/
// [t3, t2, t1, t0] = [acc7, acc6, acc5, acc4] + [t3, t2, t1, t0]  (mod p)
#define p256AddInline \
	XORQ mul0, mul0;\
	ADDQ t0, acc4;\
	ADCQ t1, acc5;\
	ADCQ t2, acc6;\
	ADCQ t3, acc7;\
	ADCQ $0, mul0;\
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	SUBQ $-1, t0;\
	SBBQ p256p<>+0x08(SB), t1;\
	SBBQ $-1, t2;\
	SBBQ p256p<>+0x018(SB), t3;\
	SBBQ $0, mul0;\
	CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
	CMOVQCS acc5, t1;\
	CMOVQCS acc6, t2;\
	CMOVQCS acc7, t3;

/* ---------------------------------------*/
// [t3, t2, t1, t0] = [acc7, acc6, acc5, acc4] - [t3, t2, t1, t0]  (mod p:
// subtract, then add p back; keep the un-corrected value when no borrow
// occurred — mul0 holds the borrow, tested via ANDQ/CMOVQEQ)
#define p256SubInline \
	XORQ mul0, mul0;\
	SUBQ t0, acc4;\
	SBBQ t1, acc5;\
	SBBQ t2, acc6;\
	SBBQ t3, acc7;\
	SBBQ $0, mul0;\ // mul0 = 0 if no borrow, all-ones if borrow
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	ADDQ $-1, t0;\ // add p back (correction for the borrow case)
	ADCQ p256p<>+0x08(SB), t1;\
	ADCQ $-1, t2;\
	ADCQ p256p<>+0x018(SB), t3;\
	ANDQ $1, mul0;\ // sets ZF when there was no borrow
	CMOVQEQ acc4, t0;\ // CMOVQEQ: Move if equal (ZF == 1)
	CMOVQEQ acc5, t1;\
	CMOVQEQ acc6, t2;\
	CMOVQEQ acc7, t3;\

/* ---------------------------------------*/
// [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] - [t3, t2, t1, t0]
// (mod p; same scheme as p256SubInline, result kept in acc4..acc7 with
// acc0..acc3 as scratch)
#define p256SubInline2 \
	XORQ mul0, mul0;\
	SUBQ t0, acc4;\
	SBBQ t1, acc5;\
	SBBQ t2, acc6;\
	SBBQ t3, acc7;\
	SBBQ $0, mul0;\
	MOVQ acc4, acc0;\
	MOVQ acc5, acc1;\
	MOVQ acc6, acc2;\
	MOVQ acc7, acc3;\
	ADDQ $-1, acc4;\
	ADCQ p256p<>+0x08(SB), acc5;\
	ADCQ $-1, acc6;\
	ADCQ p256p<>+0x018(SB), acc7;\
	ANDQ $1, mul0;\
	CMOVQEQ acc0, acc4;\ // CMOVQEQ: Move if equal (ZF == 1)
	CMOVQEQ acc1, acc5;\
	CMOVQEQ acc2, acc6;\
	CMOVQEQ acc3, acc7;\

// p256SqrInternalInline: Montgomery square of [acc7, acc6, acc5, acc4];
// result (reduced) is left in [acc7, acc6, acc5, acc4] by the trailing
// sm2P256SqrReductionInline. Clobbers mul0, mul1, t0-t3, acc0-acc3.
#define p256SqrInternalInline \
	MOVQ acc4, mul0;\
	MULQ acc5;\
	MOVQ mul0, acc1;\
	MOVQ mul1, acc2;\
	\
	MOVQ acc4, mul0;\
	MULQ acc6;\
	ADDQ mul0, acc2;\
	ADCQ $0, mul1;\
	MOVQ mul1, acc3;\
	\
	MOVQ acc4, mul0;\
	MULQ acc7;\
	ADDQ mul0, acc3;\
	ADCQ $0, mul1;\
	MOVQ mul1, t0;\
	\
	MOVQ acc5, mul0;\
	MULQ acc6;\
	ADDQ mul0, acc3;\
	ADCQ $0, mul1;\
	MOVQ mul1, acc0;\
	\
	MOVQ acc5, mul0;\
	MULQ acc7;\
	ADDQ acc0, t0;\
	ADCQ $0, mul1;\
	ADDQ mul0, t0;\
	ADCQ $0, mul1;\
	MOVQ mul1, t1;\
	\
	MOVQ acc6, mul0;\
	MULQ acc7;\
	ADDQ mul0, t1;\
	ADCQ $0, mul1;\
	MOVQ mul1, t2;\
	XORQ t3, t3;\
	\// *2
	ADDQ acc1, acc1;\
	ADCQ acc2, acc2;\
	ADCQ acc3, acc3;\
	ADCQ t0, t0;\
	ADCQ t1, t1;\
	ADCQ t2, t2;\
	ADCQ $0, t3;\
	\// Missing products (the diagonal squares)
	MOVQ acc4, mul0;\
	MULQ mul0;\
	MOVQ mul0, acc0;\
	MOVQ mul1, acc4;\
	\
	MOVQ acc5, mul0;\
	MULQ mul0;\
	ADDQ acc4, acc1;\
	ADCQ mul0, acc2;\
	ADCQ $0, mul1;\
	MOVQ mul1, acc4;\
	\
	MOVQ acc6, mul0;\
	MULQ mul0;\
	ADDQ acc4, acc3;\
	ADCQ mul0, t0;\
	ADCQ $0, mul1;\
	MOVQ mul1, acc4;\
	\
	MOVQ acc7, mul0;\
	MULQ mul0;\
	ADDQ acc4, t1;\
	ADCQ mul0, t2;\
	ADCQ mul1, t3;\
	\// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
	sm2P256SqrReductionInline;

// p256SqrInternalInlineAdx: ADX (MULXQ/ADCXQ/ADOXQ) variant of
// p256SqrInternalInline; identical register contract.
#define p256SqrInternalInlineAdx \
	XORQ acc0, acc0;\ // also clears CF and OF
	XORQ t2, t2;\
	MOVQ acc4, mul1;\
	MULXQ acc5, acc1, acc2;\
	\
	MULXQ acc6, mul0, acc3;\
	ADOXQ mul0, acc2;\
	\
	MULXQ acc7, mul0, t0;\
	ADOXQ mul0, acc3;\
	ADOXQ t2, t0;\
	\
	MOVQ acc5, mul1;\
	MULXQ acc6, mul0, t3;\
	ADOXQ mul0, acc3;\
	\
	MULXQ acc7, mul0, t1;\
	ADCXQ t3, mul0;\
	ADOXQ mul0, t0;\
	ADCXQ t2, t1;\
	\
	MOVQ acc6, mul1;\
	MULXQ acc7, mul0, t2;\
	ADOXQ mul0, t1;\
	ADOXQ acc0, t2;\
	XORQ t3, t3;\ // clears CF/OF before the doubling chain
	\
	\// *2
	ADOXQ acc1, acc1;\
	ADOXQ acc2, acc2;\
	ADOXQ acc3, acc3;\
	ADOXQ t0, t0;\
	ADOXQ t1, t1;\
	ADOXQ t2, t2;\
	ADOXQ acc0, t3;\
	\
	\// Missing products (the diagonal squares)
	MOVQ acc4, mul1;\
	MULXQ mul1, acc0, acc4;\
	ADDQ acc4, acc1;\
	\
	MOVQ acc5, mul1;\
	MULXQ mul1, mul0, acc4;\
	ADCXQ mul0, acc2;\
	ADCXQ acc4, acc3;\
	\
	MOVQ acc6, mul1;\
	MULXQ mul1, mul0, acc4;\
	ADCXQ mul0, t0;\
	ADCXQ acc4, t1;\
	\
	MOVQ acc7, mul1;\
	MULXQ mul1, mul0, acc4;\
	ADCXQ mul0, t2;\
	ADCXQ acc4, t3;\
	\// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
	sm2P256SqrReductionInline;

// p256IsZeroInline returns 1 in AX if [acc4..acc7] represents zero and zero
// otherwise. It writes to [acc4..acc7], t0 and t1. "Zero" covers both the
// literal 0 and the non-canonical representation p (checked via XOR with p).
#define p256IsZeroInline \
	\// AX contains a flag that is set if the input is zero.
	XORQ AX, AX;\
	MOVQ $1, t1;\
	\// Check whether [acc4..acc7] are all zero.
	MOVQ acc4, t0;\
	ORQ acc5, t0;\
	ORQ acc6, t0;\
	ORQ acc7, t0;\
	\// Set the zero flag if so. (CMOV of a constant to a register doesn't
	\// appear to be supported in Go. Thus t1 = 1.)
	CMOVQEQ t1, AX;\ // CMOVQEQ: Move if equal (ZF == 1)
	\// XOR [acc4..acc7] with P and compare with zero again.
	XORQ $-1, acc4;\
	XORQ p256p<>+0x08(SB), acc5;\
	XORQ $-1, acc6;\
	XORQ p256p<>+0x018(SB), acc7;\
	ORQ acc5, acc4;\
	ORQ acc6, acc4;\
	ORQ acc7, acc4;\
	\// Set the zero flag if so.
	\// CMOVQEQ: Move if equal (ZF == 1)
	CMOVQEQ t1, AX;

// p256PointDoubleInit copies the 96-byte point at BX (three 32-byte field
// elements) into the local x, y, z slots via X0-X5. x/y/z are stack-slot
// macros defined by the including file.
#define p256PointDoubleInit() \
	MOVOU (16*0)(BX), X0;\
	MOVOU (16*1)(BX), X1;\
	MOVOU (16*2)(BX), X2;\
	MOVOU (16*3)(BX), X3;\
	MOVOU (16*4)(BX), X4;\
	MOVOU (16*5)(BX), X5;\
	\
	MOVOU X0, x(16*0);\
	MOVOU X1, x(16*1);\
	MOVOU X2, y(16*0);\
	MOVOU X3, y(16*1);\
	MOVOU X4, z(16*0);\
	MOVOU X5, z(16*1);