// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !math_big_pure_go
// +build !math_big_pure_go

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.

// The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
// This is faster than using rotate instructions.
//
// Saving is required because the loop bookkeeping (ADDQ/SUBQ on the index
// and the counter) overwrites the carry flag between iterations.

// func addVV(z, x, y []Word) (c Word)
//
// addVV computes z[i] = x[i] + y[i] + carry for i in [0, len(z)) and returns
// the final carry (0 or 1) in c. Only len(z) is read; the lengths of x and y
// are not consulted here.
//
// Register use: DI = remaining count n, SI = index i, CX = saved carry
// (0 or -1 inside the loops), R8 = &x[0], R9 = &y[0], R10 = &z[0].
TEXT ·addVV(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z+0(FP), R10

	MOVQ $0, CX		// c = 0
	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V1			// if n < 0 goto V1

U1:	// n >= 0
	// regular loop body unrolled 4x
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	ADCQ 0(R9)(SI*8), R11
	ADCQ 8(R9)(SI*8), R12
	ADCQ 16(R9)(SI*8), R13
	ADCQ 24(R9)(SI*8), R14
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	SBBQ CX, CX		// save CF

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U1			// if n >= 0 goto U1

V1:	ADDQ $4, DI		// n += 4 (undo the bias; n is now the 0..3 leftover words)
	JLE E1			// if n <= 0 goto E1

L1:	// n > 0
	// one word per iteration for the leftover words
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	ADCQ 0(R9)(SI*8), R11
	MOVQ R11, 0(R10)(SI*8)
	SBBQ CX, CX		// save CF

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L1			// if n > 0 goto L1

E1:	NEGQ CX			// saved carry is 0 or -1; negate to get 0 or 1
	MOVQ CX, c+72(FP)	// return c
	RET


// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SBBQ instead of ADCQ and label names)
//
// subVV computes z[i] = x[i] - y[i] - borrow for i in [0, len(z)) and returns
// the final borrow (0 or 1) in c.
TEXT ·subVV(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z+0(FP), R10

	MOVQ $0, CX		// c = 0
	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V2			// if n < 0 goto V2

U2:	// n >= 0
	// regular loop body unrolled 4x
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	SBBQ 0(R9)(SI*8), R11
	SBBQ 8(R9)(SI*8), R12
	SBBQ 16(R9)(SI*8), R13
	SBBQ 24(R9)(SI*8), R14
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	SBBQ CX, CX		// save CF

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U2			// if n >= 0 goto U2

V2:	ADDQ $4, DI		// n += 4 (undo the bias; n is now the 0..3 leftover words)
	JLE E2			// if n <= 0 goto E2

L2:	// n > 0
	// one word per iteration for the leftover words
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	SBBQ 0(R9)(SI*8), R11
	MOVQ R11, 0(R10)(SI*8)
	SBBQ CX, CX		// save CF

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L2			// if n > 0 goto L2

E2:	NEGQ CX			// saved borrow is 0 or -1; negate to get 0 or 1
	MOVQ CX, c+72(FP)	// return c
	RET


// func addVW(z, x []Word, y Word) (c Word)
//
// addVW adds the single word y to the vector x, storing the result in z, and
// returns the carry out of the last word in c. When len(z) > 32 it tail-jumps
// to ·addVWlarge instead. For len(z) == 0 both loops are skipped and c is y.
//
// Unlike addVV, CX holds the carry as 0 or 1 (SBBQ followed by NEGQ) because
// it is added to the next word as an ordinary operand.
TEXT ·addVW(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	CMPQ DI, $32
	JG large		// len(z) > 32: defer to addVWlarge
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), CX	// c = y
	MOVQ z+0(FP), R10

	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V3			// if n < 4 goto V3

U3:	// n >= 0
	// regular loop body unrolled 4x
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	ADDQ CX, R11		// x[i] + c
	ADCQ $0, R12		// propagate the carry through the next three words
	ADCQ $0, R13
	ADCQ $0, R14
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U3			// if n >= 0 goto U3

V3:	ADDQ $4, DI		// n += 4
	JLE E3			// if n <= 0 goto E3

L3:	// n > 0
	ADDQ 0(R8)(SI*8), CX	// c += x[i]; CF is the carry out
	MOVQ CX, 0(R10)(SI*8)	// z[i] = sum (MOVQ leaves the flags untouched)
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L3			// if n > 0 goto L3

E3:	MOVQ CX, c+56(FP)	// return c
	RET
large:
	JMP ·addVWlarge(SB)


// func subVW(z, x []Word, y Word) (c Word)
// (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
//
// subVW subtracts the single word y from the vector x, storing the result in
// z, and returns the borrow out of the last word in c. When len(z) > 32 it
// tail-jumps to ·subVWlarge instead. For len(z) == 0 both loops are skipped
// and c is y.
TEXT ·subVW(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	CMPQ DI, $32
	JG large		// len(z) > 32: defer to subVWlarge
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), CX	// c = y
	MOVQ z+0(FP), R10

	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V4			// if n < 4 goto V4

U4:	// n >= 0
	// regular loop body unrolled 4x
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	SUBQ CX, R11		// x[i] - c
	SBBQ $0, R12		// propagate the borrow through the next three words
	SBBQ $0, R13
	SBBQ $0, R14
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U4			// if n >= 0 goto U4

V4:	ADDQ $4, DI		// n += 4
	JLE E4			// if n <= 0 goto E4

L4:	// n > 0
	MOVQ 0(R8)(SI*8), R11
	SUBQ CX, R11		// x[i] - c; CF is the borrow out
	MOVQ R11, 0(R10)(SI*8)	// z[i] = difference (MOVQ leaves the flags untouched)
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L4			// if n > 0 goto L4

E4:	MOVQ CX, c+56(FP)	// return c
	RET
large:
	JMP ·subVWlarge(SB)


// func shlVU(z, x []Word, s uint) (c Word)
//
// shlVU shifts the vector x left by s bits into z and returns in c the bits
// shifted out of the top word x[n-1], where n = len(z). It walks from the
// most significant word down to z[0]. For n <= 0 it only stores c = 0.
//
// In the comments below, ŝ is the complementary shift count (word size - s),
// and the three-operand SHLQ CX, AX, DX shifts DX left by CX bits, filling
// the vacated low bits from the high bits of AX.
TEXT ·shlVU(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), BX	// i = len(z) (n)
	SUBQ $1, BX		// i--
	JL X8b			// i < 0	(n <= 0)

	// n > 0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ s+48(FP), CX
	MOVQ (R8)(BX*8), AX	// w1 = x[n-1]
	MOVQ $0, DX
	SHLQ CX, AX, DX		// w1>>ŝ
	MOVQ DX, c+56(FP)	// c = bits shifted out of the top word

	CMPQ BX, $0
	JLE X8a			// i <= 0

	// i > 0
L8:	MOVQ AX, DX		// w = w1
	MOVQ -8(R8)(BX*8), AX	// w1 = x[i-1]
	SHLQ CX, AX, DX		// w<<s | w1>>ŝ
	MOVQ DX, (R10)(BX*8)	// z[i] = w<<s | w1>>ŝ
	SUBQ $1, BX		// i--
	JG L8			// i > 0

	// i <= 0
X8a:	SHLQ CX, AX		// w1<<s
	MOVQ AX, (R10)		// z[0] = w1<<s
	RET

X8b:	MOVQ $0, c+56(FP)
	RET


// func shrVU(z, x []Word, s uint) (c Word)
//
// shrVU shifts the vector x right by s bits into z and returns in c the bits
// shifted out of the bottom word x[0], positioned at the top of the word
// (x[0]<<ŝ). It walks from z[0] up to z[n-1], where n = len(z). For n <= 0
// it only stores c = 0.
//
// The three-operand SHRQ CX, AX, DX shifts DX right by CX bits, filling the
// vacated high bits from the low bits of AX.
TEXT ·shrVU(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), R11
	SUBQ $1, R11		// n-- (R11 = len(z)-1 from here on)
	JL X9b			// n-1 < 0	(len(z) <= 0)

	// n > 0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ s+48(FP), CX
	MOVQ (R8), AX		// w1 = x[0]
	MOVQ $0, DX
	SHRQ CX, AX, DX		// w1<<ŝ
	MOVQ DX, c+56(FP)	// c = bits shifted out of the bottom word

	MOVQ $0, BX		// i = 0
	JMP E9

	// i < n-1
L9:	MOVQ AX, DX		// w = w1
	MOVQ 8(R8)(BX*8), AX	// w1 = x[i+1]
	SHRQ CX, AX, DX		// w>>s | w1<<ŝ
	MOVQ DX, (R10)(BX*8)	// z[i] = w>>s | w1<<ŝ
	ADDQ $1, BX		// i++

E9:	CMPQ BX, R11
	JL L9			// i < n-1

	// i >= n-1
X9a:	SHRQ CX, AX		// w1>>s
	MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
	RET

X9b:	MOVQ $0, c+56(FP)
	RET


// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// mulAddVWW computes z = x*y + r word by word: for each i in [0, len(z)) the
// double-word product x[i]*y plus the running carry c (initially r) is
// formed; its low word is stored in z[i] and its high word becomes the next
// c. The final c is returned.
//
// MULQ R9 multiplies AX by R9 and leaves the product in DX:AX.
TEXT ·mulAddVWW(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ r+56(FP), CX	// c = r
	MOVQ z_len+8(FP), R11
	MOVQ $0, BX		// i = 0

	CMPQ R11, $4
	JL E5			// fewer than 4 words: skip the unrolled loop

U5:	// i+4 <= n
	// regular loop body unrolled 4x
	MOVQ (0*8)(R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (0*8)(R10)(BX*8)
	MOVQ DX, CX
	MOVQ (1*8)(R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (1*8)(R10)(BX*8)
	MOVQ DX, CX
	MOVQ (2*8)(R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (2*8)(R10)(BX*8)
	MOVQ DX, CX
	MOVQ (3*8)(R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (3*8)(R10)(BX*8)
	MOVQ DX, CX
	ADDQ $4, BX		// i += 4

	LEAQ 4(BX), DX		// DX = i+4 (DX is free: its value was copied to CX)
	CMPQ DX, R11
	JLE U5			// another full group of 4 fits
	JMP E5

L5:	MOVQ (R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX		// low word += c
	ADCQ $0, DX		// carry into the high word
	MOVQ AX, (R10)(BX*8)
	MOVQ DX, CX		// c = high word
	ADDQ $1, BX		// i++

E5:	CMPQ BX, R11		// i < n
	JL L5

	MOVQ CX, c+64(FP)
	RET


// func addMulVVW(z, x []Word, y Word) (c Word)
//
// addMulVVW computes z += x*y word by word: for each i in [0, len(z)) it
// forms x[i]*y + z[i] + c, stores the low word back into z[i] and keeps the
// high word as the next carry c (initially 0). The final c is returned.
//
// Two implementations are selected at run time by the byte ·support_adx:
// when it is 1 the MULX/ADCX/ADOX path at label adx is used, otherwise the
// MULQ-based path directly below.
TEXT ·addMulVVW(SB),NOSPLIT,$0
	CMPB ·support_adx(SB), $1
	JEQ adx
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z_len+8(FP), R11
	MOVQ $0, BX		// i = 0
	MOVQ $0, CX		// c = 0
	MOVQ R11, R12
	ANDQ $-2, R12		// R12 = n rounded down to a multiple of 2
	CMPQ R11, $2
	JAE A6			// n >= 2: run the 2x unrolled loop
	JMP E6

A6:
	// loop body unrolled 2x, runs while i < R12
	MOVQ (R8)(BX*8), AX
	MULQ R9
	ADDQ (R10)(BX*8), AX	// low word += z[i]
	ADCQ $0, DX
	ADDQ CX, AX		// low word += c
	ADCQ $0, DX
	MOVQ DX, CX		// c = high word
	MOVQ AX, (R10)(BX*8)

	MOVQ (8)(R8)(BX*8), AX
	MULQ R9
	ADDQ (8)(R10)(BX*8), AX
	ADCQ $0, DX
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ DX, CX
	MOVQ AX, (8)(R10)(BX*8)

	ADDQ $2, BX
	CMPQ BX, R12
	JL A6
	JMP E6

L6:	MOVQ (R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX		// low word += c
	ADCQ $0, DX
	ADDQ AX, (R10)(BX*8)	// z[i] += low word
	ADCQ $0, DX
	MOVQ DX, CX		// c = high word
	ADDQ $1, BX		// i++

E6:	CMPQ BX, R11		// i < n
	JL L6

	MOVQ CX, c+56(FP)
	RET

adx:
	// ADX/BMI2 path. y lives in DX because MULXQ takes DX as its implicit
	// multiplicand; MULXQ src, lo, hi writes the low product word to lo and
	// the high word to hi without modifying the flags.
	MOVQ z_len+8(FP), R11
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), DX
	MOVQ $0, BX		// i = 0
	MOVQ $0, CX		// carry
	CMPQ R11, $8
	JAE adx_loop_header	// n >= 8: run the 8x unrolled loop
	CMPQ BX, R11
	JL adx_short		// 0 < n < 8: word-at-a-time loop only
	MOVQ CX, c+56(FP)	// n == 0: return c = 0
	RET

adx_loop_header:
	MOVQ R11, R13
	ANDQ $-8, R13		// R13 = n rounded down to a multiple of 8
adx_loop:
	// Eight words per iteration using two independent carry chains:
	// ADCXQ adds with carry through CF only (previous high word into the
	// current low word) and ADOXQ adds with carry through OF only (z[i]
	// into the result). The high word alternates between DI and CX.
	XORQ R9, R9		// unset flags (clears CF and OF; R9 = 0)
	MULXQ (R8), SI, DI
	ADCXQ CX,SI
	ADOXQ (R10), SI
	MOVQ SI,(R10)

	MULXQ 8(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 8(R10), AX
	MOVQ AX, 8(R10)

	MULXQ 16(R8), SI, DI
	ADCXQ CX, SI
	ADOXQ 16(R10), SI
	MOVQ SI, 16(R10)

	MULXQ 24(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 24(R10), AX
	MOVQ AX, 24(R10)

	MULXQ 32(R8), SI, DI
	ADCXQ CX, SI
	ADOXQ 32(R10), SI
	MOVQ SI, 32(R10)

	MULXQ 40(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 40(R10), AX
	MOVQ AX, 40(R10)

	MULXQ 48(R8), SI, DI
	ADCXQ CX, SI
	ADOXQ 48(R10), SI
	MOVQ SI, 48(R10)

	MULXQ 56(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 56(R10), AX
	MOVQ AX, 56(R10)

	// Fold both pending carries (CF and OF) into the last high word before
	// the pointer/index updates below overwrite the flags. R9 is still 0.
	ADCXQ R9, CX
	ADOXQ R9, CX

	ADDQ $64, R8		// advance x by 8 words
	ADDQ $64, R10		// advance z by 8 words
	ADDQ $8, BX

	CMPQ BX, R13
	JL adx_loop
	// Reload the base pointers: adx_short indexes from the start with BX.
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	CMPQ BX, R11
	JL adx_short
	MOVQ CX, c+56(FP)
	RET

adx_short:
	// one word per iteration for the remaining (n mod 8) words
	MULXQ (R8)(BX*8), SI, DI
	ADDQ CX, SI		// low word += c
	ADCQ $0, DI
	ADDQ SI, (R10)(BX*8)	// z[i] += low word
	ADCQ $0, DI
	MOVQ DI, CX		// c = high word
	ADDQ $1, BX		// i++

	CMPQ BX, R11
	JL adx_short

	MOVQ CX, c+56(FP)
	RET