// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !math_big_pure_go

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
//
// Conventions used throughout this file:
//   - Every vector routine takes its element count n from len(z) only
//     (z_len+8(FP)); the lengths of the other slices are never read.
//     NOTE(review): callers are presumably required to pass slices of
//     matching length - confirm against the Go declarations.
//   - All routines are NOSPLIT with a zero-size local frame; arguments
//     and results are accessed through the FP pseudo-register.

// func mulWW(x, y Word) (z1, z0 Word)
//
// mulWW computes the full 128-bit product x*y and returns its high
// word in z1 and its low word in z0.
TEXT ·mulWW(SB),NOSPLIT,$0
	MOVQ x+0(FP), AX
	MULQ y+8(FP)		// DX:AX = AX * y
	MOVQ DX, z1+16(FP)	// high word
	MOVQ AX, z0+24(FP)	// low word
	RET


// The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
// This is faster than using rotate instructions.

// func addVV(z, x, y []Word) (c Word)
//
// addVV stores x[i] + y[i] + carry into z[i] for 0 <= i < len(z),
// propagating the carry from word to word, and returns the final
// carry (0 or 1) in c.
//
// Register use: DI = n (remaining count), SI = i, R8 = &x[0],
// R9 = &y[0], R10 = &z[0], CX = saved carry (0 or -1 inside the loops).
TEXT ·addVV(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z+0(FP), R10

	MOVQ $0, CX		// c = 0
	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V1			// if n < 0 goto V1 (fewer than 4 words left)

U1:	// n >= 0: at least 4 words remain
	// regular loop body unrolled 4x
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	ADCQ 0(R9)(SI*8), R11
	ADCQ 8(R9)(SI*8), R12
	ADCQ 16(R9)(SI*8), R13
	ADCQ 24(R9)(SI*8), R14
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	SBBQ CX, CX		// save CF (MOVQ does not modify flags)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U1			// if n >= 0 goto U1

V1:	ADDQ $4, DI		// n += 4 (undo the bias; n is now the tail count)
	JLE E1			// if n <= 0 goto E1

L1:	// n > 0: one word at a time
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	ADCQ 0(R9)(SI*8), R11
	MOVQ R11, 0(R10)(SI*8)
	SBBQ CX, CX		// save CF

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L1			// if n > 0 goto L1

E1:	NEGQ CX			// convert saved carry from 0/-1 to 0/1
	MOVQ CX, c+72(FP)	// return c
	RET


// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SBBQ instead of ADCQ and label names)
//
// subVV stores x[i] - y[i] - borrow into z[i] for 0 <= i < len(z),
// propagating the borrow from word to word, and returns the final
// borrow (0 or 1) in c.
TEXT ·subVV(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z+0(FP), R10

	MOVQ $0, CX		// c = 0
	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V2			// if n < 0 goto V2 (fewer than 4 words left)

U2:	// n >= 0: at least 4 words remain
	// regular loop body unrolled 4x
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	SBBQ 0(R9)(SI*8), R11
	SBBQ 8(R9)(SI*8), R12
	SBBQ 16(R9)(SI*8), R13
	SBBQ 24(R9)(SI*8), R14
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	SBBQ CX, CX		// save CF

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U2			// if n >= 0 goto U2

V2:	ADDQ $4, DI		// n += 4 (undo the bias; n is now the tail count)
	JLE E2			// if n <= 0 goto E2

L2:	// n > 0: one word at a time
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	SBBQ 0(R9)(SI*8), R11
	MOVQ R11, 0(R10)(SI*8)
	SBBQ CX, CX		// save CF

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L2			// if n > 0 goto L2

E2:	NEGQ CX			// convert saved borrow from 0/-1 to 0/1
	MOVQ CX, c+72(FP)	// return c
	RET


// func addVW(z, x []Word, y Word) (c Word)
//
// addVW adds the single word y to the vector x, storing the result in
// z, and returns the final carry (0 or 1) in c. When len(z) > 32 it
// jumps to ·addVWlarge (defined outside this file) before touching any
// argument, so that routine sees the same argument frame.
//
// Unlike addVV, the carry is kept in CX as 0/1 between steps: each step
// adds CX to the lowest word and then rebuilds CX from CF with
// SBBQ/NEGQ.
TEXT ·addVW(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	CMPQ DI, $32
	JG large		// if n > 32 use the large-vector routine
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), CX	// c = y
	MOVQ z+0(FP), R10

	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V3			// if n < 0 goto V3 (fewer than 4 words left)

U3:	// n >= 0: at least 4 words remain
	// regular loop body unrolled 4x
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	ADDQ CX, R11		// x[i] + c
	ADCQ $0, R12		// ripple the carry through the next 3 words
	ADCQ $0, R13
	ADCQ $0, R14
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U3			// if n >= 0 goto U3

V3:	ADDQ $4, DI		// n += 4 (undo the bias; n is now the tail count)
	JLE E3			// if n <= 0 goto E3

L3:	// n > 0: one word at a time
	ADDQ 0(R8)(SI*8), CX	// c += x[i]
	MOVQ CX, 0(R10)(SI*8)	// z[i] = low word of the sum
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L3			// if n > 0 goto L3

E3:	MOVQ CX, c+56(FP)	// return c
	RET
large:
	JMP ·addVWlarge(SB)


// func subVW(z, x []Word, y Word) (c Word)
// (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
//
// subVW subtracts the single word y from the vector x, storing the
// result in z, and returns the final borrow (0 or 1) in c. When
// len(z) > 32 it jumps to ·subVWlarge (defined outside this file)
// before touching any argument.
TEXT ·subVW(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	CMPQ DI, $32
	JG large		// if n > 32 use the large-vector routine
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), CX	// c = y
	MOVQ z+0(FP), R10

	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V4			// if n < 0 goto V4 (fewer than 4 words left)

U4:	// n >= 0: at least 4 words remain
	// regular loop body unrolled 4x
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	SUBQ CX, R11		// x[i] - c
	SBBQ $0, R12		// ripple the borrow through the next 3 words
	SBBQ $0, R13
	SBBQ $0, R14
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U4			// if n >= 0 goto U4

V4:	ADDQ $4, DI		// n += 4 (undo the bias; n is now the tail count)
	JLE E4			// if n <= 0 goto E4

L4:	// n > 0: one word at a time
	MOVQ 0(R8)(SI*8), R11
	SUBQ CX, R11		// x[i] - c
	MOVQ R11, 0(R10)(SI*8)
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L4			// if n > 0 goto L4

E4:	MOVQ CX, c+56(FP)	// return c
	RET
large:
	JMP ·subVWlarge(SB)


// func shlVU(z, x []Word, s uint) (c Word)
//
// shlVU stores x shifted left by s bits into z and returns in c the
// bits shifted out of the most significant word x[n-1]. It walks from
// the highest index down to 0, reading x[i-1] before writing z[i].
// For n == 0 it only stores c = 0.
//
// In the comments below ŝ is the complementary shift count.
// NOTE(review): presumably ŝ = 64 - s with 0 <= s < 64, as the
// double-shift instruction implies - confirm against arith.go.
TEXT ·shlVU(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), BX	// i = n
	SUBQ $1, BX		// i--
	JL X8b			// i < 0	(n <= 0)

	// n > 0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ s+48(FP), CX
	MOVQ (R8)(BX*8), AX	// w1 = x[n-1]
	MOVQ $0, DX
	SHLQ CX, AX, DX		// w1>>ŝ (DX receives the top s bits of AX)
	MOVQ DX, c+56(FP)

	CMPQ BX, $0
	JLE X8a			// i <= 0

	// i > 0
L8:	MOVQ AX, DX		// w = w1
	MOVQ -8(R8)(BX*8), AX	// w1 = x[i-1]
	SHLQ CX, AX, DX		// w<<s | w1>>ŝ
	MOVQ DX, (R10)(BX*8)	// z[i] = w<<s | w1>>ŝ
	SUBQ $1, BX		// i--
	JG L8			// i > 0

	// i <= 0
X8a:	SHLQ CX, AX		// w1<<s
	MOVQ AX, (R10)		// z[0] = w1<<s
	RET

X8b:	MOVQ $0, c+56(FP)
	RET


// func shrVU(z, x []Word, s uint) (c Word)
//
// shrVU stores x shifted right by s bits into z and returns in c the
// bits shifted out of the least significant word x[0], placed at the
// top of the word. It walks from index 0 upward, reading x[i+1] before
// writing z[i]. For n == 0 it only stores c = 0.
//
// In the comments below ŝ is the complementary shift count.
// NOTE(review): presumably ŝ = 64 - s with 0 <= s < 64 - confirm
// against arith.go.
TEXT ·shrVU(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), R11
	SUBQ $1, R11		// n--
	JL X9b			// n < 0	(n <= 0)

	// n > 0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ s+48(FP), CX
	MOVQ (R8), AX		// w1 = x[0]
	MOVQ $0, DX
	SHRQ CX, AX, DX		// w1<<ŝ (DX receives the low s bits of AX)
	MOVQ DX, c+56(FP)

	MOVQ $0, BX		// i = 0
	JMP E9

	// i < n-1
L9:	MOVQ AX, DX		// w = w1
	MOVQ 8(R8)(BX*8), AX	// w1 = x[i+1]
	SHRQ CX, AX, DX		// w>>s | w1<<ŝ
	MOVQ DX, (R10)(BX*8)	// z[i] = w>>s | w1<<ŝ
	ADDQ $1, BX		// i++

E9:	CMPQ BX, R11
	JL L9			// i < n-1

	// i >= n-1
X9a:	SHRQ CX, AX		// w1>>s
	MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
	RET

X9b:	MOVQ $0, c+56(FP)
	RET


// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// mulAddVWW computes x[i]*y + c for 0 <= i < len(z), stores the low
// word of each result in z[i] and carries the high word into the next
// step. The carry starts as r and its final value is returned in c.
//
// Register use: R10 = &z[0], R8 = &x[0], R9 = y, CX = c, R11 = n,
// BX = i. AX and DX are clobbered by every MULQ.
TEXT ·mulAddVWW(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ r+56(FP), CX	// c = r
	MOVQ z_len+8(FP), R11
	MOVQ $0, BX		// i = 0

	CMPQ R11, $4
	JL E5			// if n < 4 skip the unrolled loop

U5:	// i+4 <= n
	// regular loop body unrolled 4x
	MOVQ (0*8)(R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ CX, AX		// add incoming carry to the low word
	ADCQ $0, DX		// and propagate into the high word
	MOVQ AX, (0*8)(R10)(BX*8)
	MOVQ DX, CX		// high word is the next carry
	MOVQ (1*8)(R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (1*8)(R10)(BX*8)
	MOVQ DX, CX
	MOVQ (2*8)(R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (2*8)(R10)(BX*8)
	MOVQ DX, CX
	MOVQ (3*8)(R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (3*8)(R10)(BX*8)
	MOVQ DX, CX
	ADDQ $4, BX		// i += 4

	LEAQ 4(BX), DX		// DX is free here: the carry was already moved to CX
	CMPQ DX, R11
	JLE U5			// if i+4 <= n goto U5
	JMP E5

L5:	MOVQ (R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (R10)(BX*8)
	MOVQ DX, CX
	ADDQ $1, BX		// i++

E5:	CMPQ BX, R11		// i < n
	JL L5

	MOVQ CX, c+64(FP)
	RET


// func addMulVVW(z, x []Word, y Word) (c Word)
//
// addMulVVW adds x[i]*y + c to z[i] in place for 0 <= i < len(z),
// carrying the high word of each step into the next, and returns the
// final carry in c. The carry starts at 0.
//
// Two implementations are selected at run time by the byte
// ·support_adx (defined outside this file):
//   - ·support_adx == 1: the "adx" path, using MULXQ/ADCXQ/ADOXQ,
//     unrolled 8x, with a one-word-at-a-time tail (adx_short).
//   - otherwise: a MULQ loop unrolled 2x (A6) with a one-word tail (L6).
TEXT ·addMulVVW(SB),NOSPLIT,$0
	CMPB ·support_adx(SB), $1
	JEQ adx
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z_len+8(FP), R11
	MOVQ $0, BX		// i = 0
	MOVQ $0, CX		// c = 0
	MOVQ R11, R12
	ANDQ $-2, R12		// R12 = n rounded down to a multiple of 2
	CMPQ R11, $2
	JAE A6			// if n >= 2 run the 2x unrolled loop
	JMP E6

A6:
	MOVQ (R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ (R10)(BX*8), AX	// + z[i]
	ADCQ $0, DX
	ADDQ CX, AX		// + c
	ADCQ $0, DX
	MOVQ DX, CX		// high word is the next carry
	MOVQ AX, (R10)(BX*8)

	MOVQ (8)(R8)(BX*8), AX
	MULQ R9
	ADDQ (8)(R10)(BX*8), AX
	ADCQ $0, DX
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ DX, CX
	MOVQ AX, (8)(R10)(BX*8)

	ADDQ $2, BX		// i += 2
	CMPQ BX, R12
	JL A6
	JMP E6

L6:	MOVQ (R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	ADDQ AX, (R10)(BX*8)	// z[i] += low word, directly in memory
	ADCQ $0, DX
	MOVQ DX, CX
	ADDQ $1, BX		// i++

E6:	CMPQ BX, R11		// i < n
	JL L6

	MOVQ CX, c+56(FP)
	RET

adx:
	MOVQ z_len+8(FP), R11
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), DX	// MULXQ takes its implicit multiplicand from DX
	MOVQ $0, BX		// i = 0
	MOVQ $0, CX		// carry
	CMPQ R11, $8
	JAE adx_loop_header	// if n >= 8 run the 8x unrolled loop
	CMPQ BX, R11
	JL adx_short
	MOVQ CX, c+56(FP)	// n == 0: return c = 0
	RET

adx_loop_header:
	MOVQ R11, R13
	ANDQ $-8, R13		// R13 = n rounded down to a multiple of 8
adx_loop:
	// Each step: MULXQ leaves the low word in its first destination and
	// the high word in its second; the low word gets the previous high
	// word via the CF chain (ADCXQ) and z[i] via the OF chain (ADOXQ).
	// SI/DI and AX/CX alternate as the low/high pair. In this loop R8 and
	// R10 are advanced as pointers; BX only counts words.
	XORQ R9, R9		// unset flags (and R9 = 0 for the final fold below)
	MULXQ (R8), SI, DI
	ADCXQ CX, SI
	ADOXQ (R10), SI
	MOVQ SI, (R10)

	MULXQ 8(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 8(R10), AX
	MOVQ AX, 8(R10)

	MULXQ 16(R8), SI, DI
	ADCXQ CX, SI
	ADOXQ 16(R10), SI
	MOVQ SI, 16(R10)

	MULXQ 24(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 24(R10), AX
	MOVQ AX, 24(R10)

	MULXQ 32(R8), SI, DI
	ADCXQ CX, SI
	ADOXQ 32(R10), SI
	MOVQ SI, 32(R10)

	MULXQ 40(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 40(R10), AX
	MOVQ AX, 40(R10)

	MULXQ 48(R8), SI, DI
	ADCXQ CX, SI
	ADOXQ 48(R10), SI
	MOVQ SI, 48(R10)

	MULXQ 56(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 56(R10), AX
	MOVQ AX, 56(R10)

	ADCXQ R9, CX		// fold the pending CF into the carry word
	ADOXQ R9, CX		// fold the pending OF into the carry word

	ADDQ $64, R8		// advance x by 8 words
	ADDQ $64, R10		// advance z by 8 words
	ADDQ $8, BX		// i += 8

	CMPQ BX, R13
	JL adx_loop
	MOVQ z+0(FP), R10	// reload the base pointers: adx_short indexes by BX
	MOVQ x+24(FP), R8
	CMPQ BX, R11
	JL adx_short
	MOVQ CX, c+56(FP)
	RET

adx_short:
	MULXQ (R8)(BX*8), SI, DI	// DI:SI = x[i] * y
	ADDQ CX, SI		// + c
	ADCQ $0, DI
	ADDQ SI, (R10)(BX*8)	// z[i] += low word, directly in memory
	ADCQ $0, DI
	MOVQ DI, CX		// high word is the next carry
	ADDQ $1, BX		// i++

	CMPQ BX, R11
	JL adx_short

	MOVQ CX, c+56(FP)
	RET