// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !math_big_pure_go

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
//
// All routines use the stack-based calling convention: arguments and
// results are addressed through the FP pseudo-register. A Word is
// 8 bytes and a []Word argument occupies 24 bytes (pointer, len, cap),
// which is where the name+offset(FP) offsets below come from.

// func mulWW(x, y Word) (z1, z0 Word)
//
// mulWW returns the full 128-bit product x*y:
// z1 is the high word and z0 the low word.
TEXT ·mulWW(SB),NOSPLIT,$0
	MOVQ x+0(FP), AX
	MULQ y+8(FP)		// unsigned multiply: DX:AX = AX * y
	MOVQ DX, z1+16(FP)	// high word
	MOVQ AX, z0+24(FP)	// low word
	RET


// func divWW(x1, x0, y Word) (q, r Word)
//
// divWW divides the 128-bit value x1:x0 by y and returns the
// quotient q and the remainder r.
//
// NOTE: DIVQ raises a hardware divide error when y == 0 or when the
// quotient does not fit in 64 bits (x1 >= y); no check is done here,
// so callers are presumably required to guarantee x1 < y.
TEXT ·divWW(SB),NOSPLIT,$0
	MOVQ x1+0(FP), DX	// high word of the dividend
	MOVQ x0+8(FP), AX	// low word of the dividend
	DIVQ y+16(FP)		// AX = DX:AX / y, DX = DX:AX % y
	MOVQ AX, q+24(FP)
	MOVQ DX, r+32(FP)
	RET

// The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
// This is faster than using rotate instructions.

// func addVV(z, x, y []Word) (c Word)
//
// addVV computes z[i] = x[i] + y[i] + carry for i = 0 .. len(z)-1 and
// returns the final carry (0 or 1) in c.
// Only len(z) is read; x and y are assumed to hold at least len(z) words.
//
// Between iterations the carry flag is parked in CX as 0 / -1
// (SBBQ CX, CX) and re-created with ADDQ CX, CX, because the loop
// counter arithmetic in between clobbers CF.
TEXT ·addVV(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z+0(FP), R10

	MOVQ $0, CX		// c = 0
	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V1			// if n < 0 goto V1

U1:	// n >= 0
	// regular loop body unrolled 4x
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	ADCQ 0(R9)(SI*8), R11
	ADCQ 8(R9)(SI*8), R12
	ADCQ 16(R9)(SI*8), R13
	ADCQ 24(R9)(SI*8), R14
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	SBBQ CX, CX		// save CF

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U1			// if n >= 0 goto U1

V1:	ADDQ $4, DI		// n += 4
	JLE E1			// if n <= 0 goto E1

L1:	// n > 0
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	ADCQ 0(R9)(SI*8), R11
	MOVQ R11, 0(R10)(SI*8)
	SBBQ CX, CX		// save CF

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L1			// if n > 0 goto L1

E1:	NEGQ CX			// 0 / -1  ->  0 / 1
	MOVQ CX, c+72(FP)	// return c
	RET


// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SBBQ instead of ADCQ and label names)
//
// subVV computes z[i] = x[i] - y[i] - borrow for i = 0 .. len(z)-1 and
// returns the final borrow (0 or 1) in c.
TEXT ·subVV(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z+0(FP), R10

	MOVQ $0, CX		// c = 0
	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V2			// if n < 0 goto V2

U2:	// n >= 0
	// regular loop body unrolled 4x
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	SBBQ 0(R9)(SI*8), R11
	SBBQ 8(R9)(SI*8), R12
	SBBQ 16(R9)(SI*8), R13
	SBBQ 24(R9)(SI*8), R14
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	SBBQ CX, CX		// save CF

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U2			// if n >= 0 goto U2

V2:	ADDQ $4, DI		// n += 4
	JLE E2			// if n <= 0 goto E2

L2:	// n > 0
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	SBBQ 0(R9)(SI*8), R11
	MOVQ R11, 0(R10)(SI*8)
	SBBQ CX, CX		// save CF

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L2			// if n > 0 goto L2

E2:	NEGQ CX			// 0 / -1  ->  0 / 1
	MOVQ CX, c+72(FP)	// return c
	RET


// func addVW(z, x []Word, y Word) (c Word)
//
// addVW computes z = x + y, where y is a single word added at index 0
// and the carry is rippled through the remaining words; the final
// carry (0 or 1) is returned in c.
// Vectors longer than 32 words are handed to ·addVWlarge (defined
// elsewhere) by a tail jump that reuses this function's argument frame.
//
// Unlike addVV, the carry is kept in CX as a plain 0 / 1 value
// (SBBQ followed by NEGQ) and is added with ADDQ, so y and the running
// carry share the same register.
TEXT ·addVW(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	CMPQ DI, $32
	JG large		// if len(z) > 32 use addVWlarge
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), CX	// c = y
	MOVQ z+0(FP), R10

	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V3			// if n < 0 goto V3

U3:	// n >= 0
	// regular loop body unrolled 4x
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	ADDQ CX, R11
	ADCQ $0, R12
	ADCQ $0, R13
	ADCQ $0, R14
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U3			// if n >= 0 goto U3

V3:	ADDQ $4, DI		// n += 4
	JLE E3			// if n <= 0 goto E3

L3:	// n > 0
	ADDQ 0(R8)(SI*8), CX	// c += x[i]
	MOVQ CX, 0(R10)(SI*8)	// z[i] = low word of the sum
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L3			// if n > 0 goto L3

E3:	MOVQ CX, c+56(FP)	// return c
	RET
large:
	JMP ·addVWlarge(SB)


// func subVW(z, x []Word, y Word) (c Word)
// (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
//
// subVW computes z = x - y, where y is a single word subtracted at
// index 0 and the borrow is rippled through the remaining words; the
// final borrow (0 or 1) is returned in c.
// Vectors longer than 32 words are handed to ·subVWlarge (defined
// elsewhere) by a tail jump that reuses this function's argument frame.
TEXT ·subVW(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	CMPQ DI, $32
	JG large		// if len(z) > 32 use subVWlarge
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), CX	// c = y
	MOVQ z+0(FP), R10

	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V4			// if n < 0 goto V4

U4:	// n >= 0
	// regular loop body unrolled 4x
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	SUBQ CX, R11
	SBBQ $0, R12
	SBBQ $0, R13
	SBBQ $0, R14
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U4			// if n >= 0 goto U4

V4:	ADDQ $4, DI		// n += 4
	JLE E4			// if n <= 0 goto E4

L4:	// n > 0
	MOVQ 0(R8)(SI*8), R11
	SUBQ CX, R11		// x[i] - c
	MOVQ R11, 0(R10)(SI*8)
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L4			// if n > 0 goto L4

E4:	MOVQ CX, c+56(FP)	// return c
	RET
large:
	JMP ·subVWlarge(SB)


// func shlVU(z, x []Word, s uint) (c Word)
//
// shlVU computes z = x << s, walking from the highest index down to 0,
// and returns in c the bits shifted out of the top word x[n-1].
// An empty z returns c = 0 without touching memory.
//
// In the comments below ŝ stands for the complementary shift count
// (64 - s): the three-operand SHLQ is a double-precision shift that
// shifts DX left by CX and fills the vacated low bits from the top
// of AX.
TEXT ·shlVU(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), BX	// i = len(z)
	SUBQ $1, BX		// i--
	JL X8b			// i < 0	(n <= 0)

	// n > 0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ s+48(FP), CX
	MOVQ (R8)(BX*8), AX	// w1 = x[n-1]
	MOVQ $0, DX
	SHLQ CX, AX, DX		// w1>>ŝ
	MOVQ DX, c+56(FP)

	CMPQ BX, $0
	JLE X8a			// i <= 0

	// i > 0
L8:	MOVQ AX, DX		// w = w1
	MOVQ -8(R8)(BX*8), AX	// w1 = x[i-1]
	SHLQ CX, AX, DX		// w<<s | w1>>ŝ
	MOVQ DX, (R10)(BX*8)	// z[i] = w<<s | w1>>ŝ
	SUBQ $1, BX		// i--
	JG L8			// i > 0

	// i <= 0
X8a:	SHLQ CX, AX		// w1<<s
	MOVQ AX, (R10)		// z[0] = w1<<s
	RET

X8b:	MOVQ $0, c+56(FP)
	RET


// func shrVU(z, x []Word, s uint) (c Word)
//
// shrVU computes z = x >> s, walking from index 0 up to n-1, and
// returns in c the bits shifted out of the bottom word x[0]
// (positioned at the top of the word).
// An empty z returns c = 0 without touching memory.
//
// In the comments below ŝ stands for the complementary shift count
// (64 - s): the three-operand SHRQ is a double-precision shift that
// shifts DX right by CX and fills the vacated high bits from the
// bottom of AX.
TEXT ·shrVU(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), R11
	SUBQ $1, R11		// n--
	JL X9b			// n < 0	(n <= 0)

	// n > 0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ s+48(FP), CX
	MOVQ (R8), AX		// w1 = x[0]
	MOVQ $0, DX
	SHRQ CX, AX, DX		// w1<<ŝ
	MOVQ DX, c+56(FP)

	MOVQ $0, BX		// i = 0
	JMP E9

	// i < n-1
L9:	MOVQ AX, DX		// w = w1
	MOVQ 8(R8)(BX*8), AX	// w1 = x[i+1]
	SHRQ CX, AX, DX		// w>>s | w1<<ŝ
	MOVQ DX, (R10)(BX*8)	// z[i] = w>>s | w1<<ŝ
	ADDQ $1, BX		// i++

E9:	CMPQ BX, R11
	JL L9			// i < n-1

	// i >= n-1
X9a:	SHRQ CX, AX		// w1>>s
	MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
	RET

X9b:	MOVQ $0, c+56(FP)
	RET


// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// mulAddVWW computes z = x*y + r: for each i the 128-bit value
// x[i]*y + c is formed (c starts as r), its low word is stored in z[i]
// and its high word becomes the next c. The last c is returned.
TEXT ·mulAddVWW(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ r+56(FP), CX	// c = r
	MOVQ z_len+8(FP), R11
	MOVQ $0, BX		// i = 0

	CMPQ R11, $4
	JL E5			// fewer than 4 words: single-word loop only

U5:	// i+4 <= n
	// regular loop body unrolled 4x
	MOVQ (0*8)(R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ CX, AX		// add incoming carry to the low word
	ADCQ $0, DX		// propagate into the high word
	MOVQ AX, (0*8)(R10)(BX*8)
	MOVQ DX, CX		// high word is the next carry
	MOVQ (1*8)(R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (1*8)(R10)(BX*8)
	MOVQ DX, CX
	MOVQ (2*8)(R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (2*8)(R10)(BX*8)
	MOVQ DX, CX
	MOVQ (3*8)(R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (3*8)(R10)(BX*8)
	MOVQ DX, CX
	ADDQ $4, BX		// i += 4

	LEAQ 4(BX), DX		// DX is free here; the carry already lives in CX
	CMPQ DX, R11
	JLE U5			// if i+4 <= n goto U5
	JMP E5

L5:	MOVQ (R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (R10)(BX*8)
	MOVQ DX, CX
	ADDQ $1, BX		// i++

E5:	CMPQ BX, R11		// i < n
	JL L5

	MOVQ CX, c+64(FP)
	RET


// func addMulVVW(z, x []Word, y Word) (c Word)
//
// addMulVVW computes z += x*y: for each i the value
// z[i] + x[i]*y + c is formed (c starts at 0), its low word is written
// back to z[i] and its high word becomes the next c. The last c is
// returned.
//
// Two implementations are selected at run time by the byte variable
// ·support_adx (defined elsewhere): a MULQ/ADCQ version processing two
// words per iteration, and a MULX/ADCX/ADOX version processing eight.
TEXT ·addMulVVW(SB),NOSPLIT,$0
	CMPB ·support_adx(SB), $1
	JEQ adx
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z_len+8(FP), R11
	MOVQ $0, BX		// i = 0
	MOVQ $0, CX		// c = 0
	MOVQ R11, R12
	ANDQ $-2, R12		// R12 = n rounded down to a multiple of 2
	CMPQ R11, $2
	JAE A6
	JMP E6

A6:	// i+2 <= n: loop body unrolled 2x
	MOVQ (R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ (R10)(BX*8), AX	// + z[i]
	ADCQ $0, DX
	ADDQ CX, AX		// + c
	ADCQ $0, DX
	MOVQ DX, CX		// high word is the next carry
	MOVQ AX, (R10)(BX*8)

	MOVQ (8)(R8)(BX*8), AX
	MULQ R9
	ADDQ (8)(R10)(BX*8), AX
	ADCQ $0, DX
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ DX, CX
	MOVQ AX, (8)(R10)(BX*8)

	ADDQ $2, BX		// i += 2
	CMPQ BX, R12
	JL A6
	JMP E6

L6:	MOVQ (R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	ADDQ AX, (R10)(BX*8)	// z[i] += low word, directly in memory
	ADCQ $0, DX
	MOVQ DX, CX
	ADDQ $1, BX		// i++

E6:	CMPQ BX, R11		// i < n
	JL L6

	MOVQ CX, c+56(FP)
	RET

adx:
	MOVQ z_len+8(FP), R11
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), DX	// MULXQ takes its implicit multiplicand from DX
	MOVQ $0, BX		// i = 0
	MOVQ $0, CX		// carry
	CMPQ R11, $8
	JAE adx_loop_header
	CMPQ BX, R11
	JL adx_short
	MOVQ CX, c+56(FP)	// n <= 0: return 0
	RET

adx_loop_header:
	MOVQ R11, R13
	ANDQ $-8, R13		// R13 = n rounded down to a multiple of 8
adx_loop:
	// Two independent carry chains run through the eight steps below:
	// ADCXQ (CF) adds the previous product's high word to the current
	// low word, ADOXQ (OF) adds the existing z word. MULXQ and MOVQ
	// leave both flags untouched. The product registers alternate
	// between SI:DI and AX:CX (low:high).
	XORQ R9, R9		// unset flags; R9 = 0 is also used as the zero addend below
	MULXQ (R8), SI, DI
	ADCXQ CX, SI
	ADOXQ (R10), SI
	MOVQ SI, (R10)

	MULXQ 8(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 8(R10), AX
	MOVQ AX, 8(R10)

	MULXQ 16(R8), SI, DI
	ADCXQ CX, SI
	ADOXQ 16(R10), SI
	MOVQ SI, 16(R10)

	MULXQ 24(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 24(R10), AX
	MOVQ AX, 24(R10)

	MULXQ 32(R8), SI, DI
	ADCXQ CX, SI
	ADOXQ 32(R10), SI
	MOVQ SI, 32(R10)

	MULXQ 40(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 40(R10), AX
	MOVQ AX, 40(R10)

	MULXQ 48(R8), SI, DI
	ADCXQ CX, SI
	ADOXQ 48(R10), SI
	MOVQ SI, 48(R10)

	MULXQ 56(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 56(R10), AX
	MOVQ AX, 56(R10)

	ADCXQ R9, CX		// fold the pending CF into the outgoing carry word
	ADOXQ R9, CX		// fold the pending OF into the outgoing carry word

	ADDQ $64, R8		// advance x by 8 words
	ADDQ $64, R10		// advance z by 8 words
	ADDQ $8, BX		// i += 8

	CMPQ BX, R13
	JL adx_loop
	MOVQ z+0(FP), R10	// reload base pointers: the tail loop indexes by BX
	MOVQ x+24(FP), R8
	CMPQ BX, R11
	JL adx_short
	MOVQ CX, c+56(FP)
	RET

adx_short:
	MULXQ (R8)(BX*8), SI, DI	// DI:SI = x[i] * y
	ADDQ CX, SI
	ADCQ $0, DI
	ADDQ SI, (R10)(BX*8)	// z[i] += low word
	ADCQ $0, DI
	MOVQ DI, CX
	ADDQ $1, BX		// i++

	CMPQ BX, R11
	JL adx_short

	MOVQ CX, c+56(FP)
	RET



// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
//
// divWVW divides the multi-word value xn:x by the single word y.
// It walks from index len(z)-1 down to 0; at each step the running
// remainder r (initially xn) together with x[i] forms the 128-bit
// dividend r:x[i], the quotient is stored in z[i] and the remainder is
// carried to the next step. The final remainder is returned.
//
// NOTE: DIVQ raises a hardware divide error when y == 0 or when a
// quotient does not fit in 64 bits; no check is done here, so callers
// are presumably required to guarantee xn < y.
TEXT ·divWVW(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10
	MOVQ xn+24(FP), DX	// r = xn
	MOVQ x+32(FP), R8
	MOVQ y+56(FP), R9
	MOVQ z_len+8(FP), BX	// i = len(z)
	JMP E7

L7:	MOVQ (R8)(BX*8), AX
	DIVQ R9			// AX = r:x[i] / y, DX = r:x[i] % y
	MOVQ AX, (R10)(BX*8)	// z[i] = quotient

E7:	SUBQ $1, BX		// i--
	JGE L7			// i >= 0

	MOVQ DX, r+64(FP)
	RET