// Origin: github.com/megatontech/mynoteforgo@v0.0.0-20200507084910-5d0c6ea6e890/源码/math/big/arith_amd64.s

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !math_big_pure_go

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.

// func mulWW(x, y Word) (z1, z0 Word)
//
// Full-width multiply of two words. MULQ multiplies AX by its operand and
// leaves the double-word product in DX:AX; the high half is returned as z1
// and the low half as z0.
TEXT ·mulWW(SB),NOSPLIT,$0
	MOVQ x+0(FP), AX
	MULQ y+8(FP)		// DX:AX = x * y
	MOVQ AX, z0+24(FP)	// z0 = low word
	MOVQ DX, z1+16(FP)	// z1 = high word
	RET


// func divWW(x1, x0, y Word) (q, r Word)
//
// Divides the double-word value x1:x0 (x1 is the high word) by y.
// DIVQ takes the dividend in DX:AX and leaves the quotient in AX and the
// remainder in DX.
// NOTE(review): DIVQ faults when y is zero or the quotient does not fit in
// one word; presumably callers guarantee x1 < y -- confirm against arith.go.
TEXT ·divWW(SB),NOSPLIT,$0
	MOVQ x0+8(FP), AX	// low word of the dividend
	MOVQ x1+0(FP), DX	// high word of the dividend
	DIVQ y+16(FP)		// AX = quotient, DX = remainder
	MOVQ DX, r+32(FP)
	MOVQ AX, q+24(FP)
	RET

// Carry handling used by the vector routines in this file:
// the carry bit is saved with SBBQ Rx, Rx (Rx becomes -1 if the carry was
// set, 0 otherwise) and restored with ADDQ Rx, Rx (the carry is set if Rx
// was -1, cleared otherwise).
// This is faster than using rotate instructions.

// func addVV(z, x, y []Word) (c Word)
//
// Word-wise addition with carry propagation: z[i] = x[i] + y[i] + carry for
// i in [0, len(z)). The final carry (0 or 1) is returned in c.
// Only len(z) is read; presumably callers guarantee x and y are at least
// that long -- confirm against arith.go.
//
// Between iterations the carry lives in CX as 0 or -1: SBBQ CX, CX captures
// CF, and ADDQ CX, CX puts it back into CF.
TEXT ·addVV(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10	// R10 = &z[0]
	MOVQ x+24(FP), R8	// R8  = &x[0]
	MOVQ y+48(FP), R9	// R9  = &y[0]
	MOVQ z_len+8(FP), DI	// n = len(z)

	MOVQ $0, SI		// i = 0
	MOVQ $0, CX		// c = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL add_fixup		// fewer than 4 words: skip the unrolled loop

add4:	// n >= 0, i.e. at least 4 words remain
	// regular loop body unrolled 4x
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	ADCQ 0(R9)(SI*8), R11
	ADCQ 8(R9)(SI*8), R12
	ADCQ 16(R9)(SI*8), R13
	ADCQ 24(R9)(SI*8), R14
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	SBBQ CX, CX		// save CF

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE add4		// if n >= 0 keep going 4 at a time

add_fixup:
	ADDQ $4, DI		// n += 4 (undo the bias from above)
	JLE add_done		// if n <= 0 nothing is left

add1:	// n > 0, one word per iteration
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	ADCQ 0(R9)(SI*8), R11
	MOVQ R11, 0(R10)(SI*8)
	SBBQ CX, CX		// save CF

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG add1			// if n > 0 go around again

add_done:
	NEGQ CX			// turn the saved -1/0 into 1/0
	MOVQ CX, c+72(FP)	// return c
	RET


// func subVV(z, x, y []Word) (c Word)
//
// Word-wise subtraction with borrow propagation: z[i] = x[i] - y[i] - borrow
// for i in [0, len(z)). The final borrow (0 or 1) is returned in c.
// Structurally identical to addVV, with SBBQ in place of ADCQ.
TEXT ·subVV(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10	// R10 = &z[0]
	MOVQ x+24(FP), R8	// R8  = &x[0]
	MOVQ y+48(FP), R9	// R9  = &y[0]
	MOVQ z_len+8(FP), DI	// n = len(z)

	MOVQ $0, SI		// i = 0
	MOVQ $0, CX		// c = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL sub_fixup		// fewer than 4 words: skip the unrolled loop

sub4:	// n >= 0, i.e. at least 4 words remain
	// regular loop body unrolled 4x
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	SBBQ 0(R9)(SI*8), R11
	SBBQ 8(R9)(SI*8), R12
	SBBQ 16(R9)(SI*8), R13
	SBBQ 24(R9)(SI*8), R14
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	SBBQ CX, CX		// save CF

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE sub4		// if n >= 0 keep going 4 at a time

sub_fixup:
	ADDQ $4, DI		// n += 4 (undo the bias from above)
	JLE sub_done		// if n <= 0 nothing is left

sub1:	// n > 0, one word per iteration
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	SBBQ 0(R9)(SI*8), R11
	MOVQ R11, 0(R10)(SI*8)
	SBBQ CX, CX		// save CF

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG sub1			// if n > 0 go around again

sub_done:
	NEGQ CX			// turn the saved -1/0 into 1/0
	MOVQ CX, c+72(FP)	// return c
	RET


// func addVW(z, x []Word, y Word) (c Word)
//
// Adds the single word y into the vector x, writing the result to z and
// returning the carry out of the top word. The running carry c lives in CX
// as a plain 0/1 value (it starts out as y itself), so every step is
// "add c, then recompute c from CF".
TEXT ·addVW(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10	// R10 = &z[0]
	MOVQ x+24(FP), R8	// R8  = &x[0]
	MOVQ y+48(FP), CX	// c = y
	MOVQ z_len+8(FP), DI	// n = len(z)

	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL addw_fixup		// fewer than 4 words: skip the unrolled loop

addw4:	// n >= 0, i.e. at least 4 words remain
	// regular loop body unrolled 4x
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	ADDQ CX, R11		// lowest word absorbs c ...
	ADCQ $0, R12		// ... and the carry ripples upward
	ADCQ $0, R13
	ADCQ $0, R14
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE addw4		// if n >= 0 keep going 4 at a time

addw_fixup:
	ADDQ $4, DI		// n += 4 (undo the bias from above)
	JLE addw_done		// if n <= 0 nothing is left

addw1:	// n > 0, one word per iteration
	ADDQ 0(R8)(SI*8), CX	// CX = x[i] + c
	MOVQ CX, 0(R10)(SI*8)	// z[i] = x[i] + c
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG addw1		// if n > 0 go around again

addw_done:
	MOVQ CX, c+56(FP)	// return c
	RET


// func subVW(z, x []Word, y Word) (c Word)
//
// Subtracts the single word y from the vector x, writing the result to z and
// returning the borrow out of the top word.
// Structurally identical to addVW, with SUBQ/SBBQ in place of ADDQ/ADCQ.
TEXT ·subVW(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10	// R10 = &z[0]
	MOVQ x+24(FP), R8	// R8  = &x[0]
	MOVQ y+48(FP), CX	// c = y
	MOVQ z_len+8(FP), DI	// n = len(z)

	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL subw_fixup		// fewer than 4 words: skip the unrolled loop

subw4:	// n >= 0, i.e. at least 4 words remain
	// regular loop body unrolled 4x
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	SUBQ CX, R11		// lowest word gives up c ...
	SBBQ $0, R12		// ... and the borrow ripples upward
	SBBQ $0, R13
	SBBQ $0, R14
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE subw4		// if n >= 0 keep going 4 at a time

subw_fixup:
	ADDQ $4, DI		// n += 4 (undo the bias from above)
	JLE subw_done		// if n <= 0 nothing is left

subw1:	// n > 0, one word per iteration
	MOVQ 0(R8)(SI*8), R11
	SUBQ CX, R11		// R11 = x[i] - c
	MOVQ R11, 0(R10)(SI*8)	// z[i] = x[i] - c
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG subw1		// if n > 0 go around again

subw_done:
	MOVQ CX, c+56(FP)	// return c
	RET


// func shlVU(z, x []Word, s uint) (c Word)
//
// Shifts the vector x left by s bits into z, walking from the most
// significant word down so that each word can pull in the top bits of the
// word below it. c receives the bits shifted out of x[n-1]. In the comments
// below, ŝ denotes the complementary shift count (word size minus s).
// For len(z) == 0 nothing is written and c is 0.
TEXT ·shlVU(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), BX	// i = len(z)
	SUBQ $1, BX		// i--
	JL shl_empty		// i < 0	(n <= 0)

	// n > 0
	MOVQ s+48(FP), CX	// shift count must be in CX
	MOVQ x+24(FP), R8	// R8  = &x[0]
	MOVQ z+0(FP), R10	// R10 = &z[0]
	MOVQ (R8)(BX*8), AX	// w1 = x[n-1]
	MOVQ $0, DX
	SHLQ CX, DX:AX		// DX = w1>>ŝ
	MOVQ DX, c+56(FP)	// c = bits shifted out of the top word

	CMPQ BX, $0
	JLE shl_last		// i <= 0: single-word vector

	// i > 0
shl_loop:
	MOVQ AX, DX		// w = w1
	MOVQ -8(R8)(BX*8), AX	// w1 = x[i-1]
	SHLQ CX, DX:AX		// DX = w<<s | w1>>ŝ
	MOVQ DX, (R10)(BX*8)	// z[i] = w<<s | w1>>ŝ
	SUBQ $1, BX		// i--
	JG shl_loop		// i > 0

	// i <= 0
shl_last:
	SHLQ CX, AX		// w1<<s
	MOVQ AX, (R10)		// z[0] = w1<<s
	RET

shl_empty:
	MOVQ $0, c+56(FP)
	RET


// func shrVU(z, x []Word, s uint) (c Word)
//
// Shifts the vector x right by s bits into z, walking from the least
// significant word up so that each word can pull in the low bits of the
// word above it. c receives the bits shifted out of x[0]. In the comments
// below, ŝ denotes the complementary shift count (word size minus s).
// For len(z) == 0 nothing is written and c is 0.
TEXT ·shrVU(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), R11	// n = len(z)
	SUBQ $1, R11		// n--; R11 now holds the index of the last word
	JL shr_empty		// n < 0	(n <= 0)

	// n > 0
	MOVQ s+48(FP), CX	// shift count must be in CX
	MOVQ x+24(FP), R8	// R8  = &x[0]
	MOVQ z+0(FP), R10	// R10 = &z[0]
	MOVQ (R8), AX		// w1 = x[0]
	MOVQ $0, DX
	SHRQ CX, DX:AX		// DX = w1<<ŝ
	MOVQ DX, c+56(FP)	// c = bits shifted out of the bottom word

	MOVQ $0, BX		// i = 0
	JMP shr_test

	// i < n-1
shr_loop:
	MOVQ AX, DX		// w = w1
	MOVQ 8(R8)(BX*8), AX	// w1 = x[i+1]
	SHRQ CX, DX:AX		// DX = w>>s | w1<<ŝ
	MOVQ DX, (R10)(BX*8)	// z[i] = w>>s | w1<<ŝ
	ADDQ $1, BX		// i++

shr_test:
	CMPQ BX, R11
	JL shr_loop		// i < n-1

	// i >= n-1
shr_last:
	SHRQ CX, AX		// w1>>s
	MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
	RET

shr_empty:
	MOVQ $0, c+56(FP)
	RET


// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// Multiplies the vector x by the single word y, adding r into the lowest
// word: for each i, the double-word value x[i]*y + c is formed, its low word
// is stored to z[i] and its high word becomes the next c (c starts as r).
// The final c is returned.
TEXT ·mulAddVWW(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), R11	// n = len(z)
	MOVQ z+0(FP), R10	// R10 = &z[0]
	MOVQ x+24(FP), R8	// R8  = &x[0]
	MOVQ y+48(FP), R9	// R9  = y
	MOVQ r+56(FP), CX	// c = r
	MOVQ $0, BX		// i = 0

	CMPQ R11, $4
	JL mul_test		// fewer than 4 words: one at a time

mul4:	// i+4 <= n
	// regular loop body unrolled 4x
	MOVQ (0*8)(R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ CX, AX		// add incoming carry word
	ADCQ $0, DX
	MOVQ AX, (0*8)(R10)(BX*8)
	MOVQ DX, CX		// high word becomes the next carry
	MOVQ (1*8)(R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (1*8)(R10)(BX*8)
	MOVQ DX, CX
	MOVQ (2*8)(R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (2*8)(R10)(BX*8)
	MOVQ DX, CX
	MOVQ (3*8)(R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (3*8)(R10)(BX*8)
	MOVQ DX, CX
	ADDQ $4, BX		// i += 4

	LEAQ 4(BX), DX		// DX = i + 4 (scratch; LEAQ leaves flags alone)
	CMPQ DX, R11
	JLE mul4		// another full group of 4 fits
	JMP mul_test

mul1:
	MOVQ (R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (R10)(BX*8)
	MOVQ DX, CX
	ADDQ $1, BX		// i++

mul_test:
	CMPQ BX, R11		// i < n
	JL mul1

	MOVQ CX, c+64(FP)
	RET


// func addMulVVW(z, x []Word, y Word) (c Word)
//
// Multiply-accumulate: for each i, the double-word value z[i] + x[i]*y + c
// is formed, its low word is stored back to z[i] and its high word becomes
// the next c (c starts at 0). The final c is returned.
//
// Two implementations are selected at run time by the byte ·support_adx:
// when it equals 1 the MULX/ADCX/ADOX path at label adx is taken, otherwise
// the MULQ-based path directly below.
TEXT ·addMulVVW(SB),NOSPLIT,$0
	CMPB ·support_adx(SB), $1
	JEQ adx
	MOVQ z_len+8(FP), R11	// n = len(z)
	MOVQ z+0(FP), R10	// R10 = &z[0]
	MOVQ x+24(FP), R8	// R8  = &x[0]
	MOVQ y+48(FP), R9	// R9  = y
	MOVQ $0, BX		// i = 0
	MOVQ $0, CX		// c = 0
	MOVQ R11, R12
	ANDQ $-2, R12		// R12 = n rounded down to an even count
	CMPQ R11, $2
	JAE pair
	JMP single_test

pair:	// two words per iteration while i < R12
	MOVQ (R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ (R10)(BX*8), AX	// + z[i]
	ADCQ $0, DX
	ADDQ CX, AX		// + c
	ADCQ $0, DX
	MOVQ DX, CX		// high word becomes the next carry
	MOVQ AX, (R10)(BX*8)

	MOVQ (8)(R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i+1] * y
	ADDQ (8)(R10)(BX*8), AX	// + z[i+1]
	ADCQ $0, DX
	ADDQ CX, AX		// + c
	ADCQ $0, DX
	MOVQ DX, CX
	MOVQ AX, (8)(R10)(BX*8)

	ADDQ $2, BX		// i += 2
	CMPQ BX, R12
	JL pair
	JMP single_test

single:	// one word per iteration
	MOVQ (R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ CX, AX		// + c
	ADCQ $0, DX
	ADDQ AX, (R10)(BX*8)	// z[i] += low word, directly in memory
	ADCQ $0, DX
	MOVQ DX, CX
	ADDQ $1, BX		// i++

single_test:
	CMPQ BX, R11		// i < n
	JL single

	MOVQ CX, c+56(FP)
	RET

adx:
	// MULX takes its implicit multiplicand from DX, so y is loaded there.
	MOVQ z_len+8(FP), R11	// n = len(z)
	MOVQ z+0(FP), R10	// R10 = &z[0]
	MOVQ x+24(FP), R8	// R8  = &x[0]
	MOVQ y+48(FP), DX	// DX  = y
	MOVQ $0, BX		// i = 0
	MOVQ $0, CX		// carry
	CMPQ R11, $8
	JAE adx_loop_header
	CMPQ BX, R11
	JL adx_short
	MOVQ CX, c+56(FP)	// n == 0: return 0
	RET

adx_loop_header:
	MOVQ R11, R13
	ANDQ $-8, R13		// R13 = n rounded down to a multiple of 8
adx_loop:
	// Eight words per iteration. Two independent carry chains are used:
	// ADCXQ chains through CF (adding the previous high word) and ADOXQ
	// chains through OF (adding z[i]). SI/DI and AX/CX alternate as the
	// low/high result pair of successive MULXQs.
	XORQ R9, R9		// unset flags; R9 = 0
	MULXQ (R8), SI, DI
	ADCXQ CX,SI
	ADOXQ (R10), SI
	MOVQ SI,(R10)

	MULXQ 8(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 8(R10), AX
	MOVQ AX, 8(R10)

	MULXQ 16(R8), SI, DI
	ADCXQ CX, SI
	ADOXQ 16(R10), SI
	MOVQ SI, 16(R10)

	MULXQ 24(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 24(R10), AX
	MOVQ AX, 24(R10)

	MULXQ 32(R8), SI, DI
	ADCXQ CX, SI
	ADOXQ 32(R10), SI
	MOVQ SI, 32(R10)

	MULXQ 40(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 40(R10), AX
	MOVQ AX, 40(R10)

	MULXQ 48(R8), SI, DI
	ADCXQ CX, SI
	ADOXQ 48(R10), SI
	MOVQ SI, 48(R10)

	MULXQ 56(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 56(R10), AX
	MOVQ AX, 56(R10)

	// Fold the pending CF and OF into the outgoing carry word (R9 is 0).
	ADCXQ R9, CX
	ADOXQ R9, CX

	ADDQ $64, R8		// advance both pointers by 8 words
	ADDQ $64, R10
	ADDQ $8, BX		// i += 8

	CMPQ BX, R13
	JL adx_loop
	// The tail loop indexes from the slice bases with BX, so reload them.
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	CMPQ BX, R11
	JL adx_short
	MOVQ CX, c+56(FP)
	RET

adx_short:	// one word per iteration
	MULXQ (R8)(BX*8), SI, DI	// DI:SI = x[i] * y
	ADDQ CX, SI		// + c
	ADCQ $0, DI
	ADDQ SI, (R10)(BX*8)	// z[i] += low word, directly in memory
	ADCQ $0, DI
	MOVQ DI, CX
	ADDQ $1, BX		// i++

	CMPQ BX, R11
	JL adx_short

	MOVQ CX, c+56(FP)
	RET



// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
//
// Divides the vector x, extended at the top by the word xn, by the single
// word y. Runs from the most significant word down: each DIVQ divides
// DX:AX (running remainder : x[i]) by y, stores the quotient word to z[i]
// and leaves the new remainder in DX. The final remainder is returned in r.
// NOTE(review): DIVQ faults when y is zero or a quotient does not fit in
// one word; presumably callers guarantee xn < y -- confirm against arith.go.
TEXT ·divWVW(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), BX	// i = len(z)
	MOVQ z+0(FP), R10	// R10 = &z[0]
	MOVQ x+32(FP), R8	// R8  = &x[0]
	MOVQ y+56(FP), R9	// R9  = y
	MOVQ xn+24(FP), DX	// r = xn
	JMP div_next

div_word:
	MOVQ (R8)(BX*8), AX	// AX = x[i]
	DIVQ R9			// AX = quotient, DX = remainder
	MOVQ AX, (R10)(BX*8)	// z[i] = quotient

div_next:
	SUBQ $1, BX		// i--
	JGE div_word		// i >= 0

	MOVQ DX, r+64(FP)
	RET