// File: github.com/ice-blockchain/go/src@v0.0.0-20240403114104-1564d284e521/math/big/arith_amd64.s

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !math_big_pure_go

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
//
// Conventions used throughout this file:
//   - A slice argument occupies 24 bytes of the argument frame (pointer,
//     length, capacity), which is why x follows z at offset 24, and so on.
//   - Every routine takes its element count from len(z) only; the lengths
//     of the other slice arguments are never read here.
//   - In the shift routines, ŝ denotes the complementary shift count 64-s.

// The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
// This is faster than using rotate instructions.

// func addVV(z, x, y []Word) (c Word)
//
// addVV stores x[i] + y[i] + carry into z[i] for i in [0, len(z)),
// propagating the carry from word to word, and returns the final carry
// (0 or 1) in c.
//
// Registers: DI = remaining word count n, SI = index i,
// R8/R9/R10 = base pointers of x/y/z, CX = saved carry (0 or -1 in the loops).
TEXT ·addVV(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z+0(FP), R10

	MOVQ $0, CX		// c = 0
	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V1			// if n < 0 goto V1

U1:	// n >= 0
	// regular loop body unrolled 4x
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	ADCQ 0(R9)(SI*8), R11
	ADCQ 8(R9)(SI*8), R12
	ADCQ 16(R9)(SI*8), R13
	ADCQ 24(R9)(SI*8), R14
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	SBBQ CX, CX		// save CF (the index/counter updates below overwrite it)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U1			// if n >= 0 goto U1

V1:	ADDQ $4, DI		// n += 4 (undo the bias; n is now the 0..3 leftover words)
	JLE E1			// if n <= 0 goto E1

L1:	// n > 0
	// one word per iteration for the leftover words
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	ADCQ 0(R9)(SI*8), R11
	MOVQ R11, 0(R10)(SI*8)
	SBBQ CX, CX		// save CF

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L1			// if n > 0 goto L1

E1:	NEGQ CX			// saved carry is 0 or -1; negate to get 0 or 1
	MOVQ CX, c+72(FP)	// return c
	RET


// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SBBQ instead of ADCQ and label names)
//
// subVV stores x[i] - y[i] - borrow into z[i] for i in [0, len(z)),
// propagating the borrow from word to word, and returns the final borrow
// (0 or 1) in c. Register usage is the same as in addVV.
TEXT ·subVV(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z+0(FP), R10

	MOVQ $0, CX		// c = 0
	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V2			// if n < 0 goto V2

U2:	// n >= 0
	// regular loop body unrolled 4x
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	SBBQ 0(R9)(SI*8), R11
	SBBQ 8(R9)(SI*8), R12
	SBBQ 16(R9)(SI*8), R13
	SBBQ 24(R9)(SI*8), R14
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	SBBQ CX, CX		// save CF (the index/counter updates below overwrite it)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U2			// if n >= 0 goto U2

V2:	ADDQ $4, DI		// n += 4 (undo the bias; n is now the 0..3 leftover words)
	JLE E2			// if n <= 0 goto E2

L2:	// n > 0
	// one word per iteration for the leftover words
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	SBBQ 0(R9)(SI*8), R11
	MOVQ R11, 0(R10)(SI*8)
	SBBQ CX, CX		// save CF

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L2			// if n > 0 goto L2

E2:	NEGQ CX			// saved borrow is 0 or -1; negate to get 0 or 1
	MOVQ CX, c+72(FP)	// return c
	RET


// func addVW(z, x []Word, y Word) (c Word)
//
// addVW stores x[i] + c into z[i] for i in [0, len(z)), where c starts out
// as y and afterwards is the carry out of the previous word, and returns
// the final c. When len(z) > 32 the routine does no work itself and
// tail-jumps to addVWlarge (defined elsewhere) with the argument frame
// untouched.
//
// Registers: DI = remaining word count n, SI = index i,
// R8/R10 = base pointers of x/z, CX = c.
TEXT ·addVW(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	CMPQ DI, $32
	JG large		// len(z) > 32: hand over to addVWlarge
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), CX	// c = y
	MOVQ z+0(FP), R10

	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V3			// if n < 4 goto V3

U3:	// n >= 0
	// regular loop body unrolled 4x
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	ADDQ CX, R11		// add c to the lowest word ...
	ADCQ $0, R12		// ... and ripple the carry through the other three
	ADCQ $0, R13
	ADCQ $0, R14
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U3			// if n >= 0 goto U3

V3:	ADDQ $4, DI		// n += 4
	JLE E3			// if n <= 0 goto E3

L3:	// n > 0
	ADDQ 0(R8)(SI*8), CX	// c += x[i]
	MOVQ CX, 0(R10)(SI*8)	// z[i] = low word of the sum
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L3			// if n > 0 goto L3

E3:	MOVQ CX, c+56(FP)	// return c
	RET
large:
	JMP ·addVWlarge(SB)


// func subVW(z, x []Word, y Word) (c Word)
// (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
//
// subVW stores x[i] - c into z[i] for i in [0, len(z)), where c starts out
// as y and afterwards is the borrow out of the previous word, and returns
// the final c. When len(z) > 32 the routine does no work itself and
// tail-jumps to subVWlarge (defined elsewhere) with the argument frame
// untouched. Register usage is the same as in addVW.
TEXT ·subVW(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	CMPQ DI, $32
	JG large		// len(z) > 32: hand over to subVWlarge
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), CX	// c = y
	MOVQ z+0(FP), R10

	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V4			// if n < 4 goto V4

U4:	// n >= 0
	// regular loop body unrolled 4x
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	SUBQ CX, R11		// subtract c from the lowest word ...
	SBBQ $0, R12		// ... and ripple the borrow through the other three
	SBBQ $0, R13
	SBBQ $0, R14
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U4			// if n >= 0 goto U4

V4:	ADDQ $4, DI		// n += 4
	JLE E4			// if n <= 0 goto E4

L4:	// n > 0
	MOVQ 0(R8)(SI*8), R11
	SUBQ CX, R11		// x[i] - c
	MOVQ R11, 0(R10)(SI*8)
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L4			// if n > 0 goto L4

E4:	MOVQ CX, c+56(FP)	// return c
	RET
large:
	JMP ·subVWlarge(SB)


// func shlVU(z, x []Word, s uint) (c Word)
//
// shlVU stores x shifted left by s bits into z and returns in c the bits
// shifted out of the top word, x[n-1]>>ŝ, where n = len(z). Words are
// processed from index n-1 down to 0. For n == 0 nothing is stored and
// c is 0.
//
// Registers: BX = index i, R8/R10 = base pointers of x/z, CX = s,
// AX = w1 (the word at the lower index), DX = w (the word being produced).
TEXT ·shlVU(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), BX	// i = len(z)
	SUBQ $1, BX		// i--
	JL X8b			// i < 0	(n <= 0)

	// n > 0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ s+48(FP), CX
	MOVQ (R8)(BX*8), AX	// w1 = x[n-1]
	MOVQ $0, DX
	SHLQ CX, AX, DX		// w1>>ŝ (double shift: the top s bits of AX enter the zeroed DX)
	MOVQ DX, c+56(FP)

	CMPQ BX, $0
	JLE X8a			// i <= 0

	// i > 0
L8:	MOVQ AX, DX		// w = w1
	MOVQ -8(R8)(BX*8), AX	// w1 = x[i-1]
	SHLQ CX, AX, DX		// w<<s | w1>>ŝ
	MOVQ DX, (R10)(BX*8)	// z[i] = w<<s | w1>>ŝ
	SUBQ $1, BX		// i--
	JG L8			// i > 0

	// i <= 0
X8a:	SHLQ CX, AX		// w1<<s
	MOVQ AX, (R10)		// z[0] = w1<<s
	RET

X8b:	MOVQ $0, c+56(FP)	// empty z: c = 0
	RET


// func shrVU(z, x []Word, s uint) (c Word)
//
// shrVU stores x shifted right by s bits into z and returns in c the bits
// shifted out of the bottom word, x[0]<<ŝ. Words are processed from index 0
// up to n-1, where n = len(z). For n == 0 nothing is stored and c is 0.
//
// Registers: R11 = n-1, BX = index i, R8/R10 = base pointers of x/z, CX = s,
// AX = w1 (the word at the higher index), DX = w (the word being produced).
TEXT ·shrVU(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), R11
	SUBQ $1, R11		// n--
	JL X9b			// n < 0	(n <= 0)

	// n > 0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ s+48(FP), CX
	MOVQ (R8), AX		// w1 = x[0]
	MOVQ $0, DX
	SHRQ CX, AX, DX		// w1<<ŝ (double shift: the low s bits of AX enter the zeroed DX)
	MOVQ DX, c+56(FP)

	MOVQ $0, BX		// i = 0
	JMP E9

	// i < n-1
L9:	MOVQ AX, DX		// w = w1
	MOVQ 8(R8)(BX*8), AX	// w1 = x[i+1]
	SHRQ CX, AX, DX		// w>>s | w1<<ŝ
	MOVQ DX, (R10)(BX*8)	// z[i] = w>>s | w1<<ŝ
	ADDQ $1, BX		// i++

E9:	CMPQ BX, R11
	JL L9			// i < n-1

	// i >= n-1
X9a:	SHRQ CX, AX		// w1>>s
	MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
	RET

X9b:	MOVQ $0, c+56(FP)	// empty z: c = 0
	RET


// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// mulAddVWW computes x[i]*y + c as a two-word value for each i in
// [0, len(z)), stores the low word in z[i] and keeps the high word as the
// next c. c starts out as r, and the final c is returned.
//
// Registers: R8/R10 = base pointers of x/z, R9 = y, R11 = n, BX = index i,
// CX = c. MULQ leaves the product in DX:AX.
TEXT ·mulAddVWW(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ r+56(FP), CX	// c = r
	MOVQ z_len+8(FP), R11
	MOVQ $0, BX		// i = 0

	CMPQ R11, $4
	JL E5			// fewer than 4 words: single-word loop only

U5:	// i+4 <= n
	// regular loop body unrolled 4x
	MOVQ (0*8)(R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (0*8)(R10)(BX*8)
	MOVQ DX, CX
	MOVQ (1*8)(R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (1*8)(R10)(BX*8)
	MOVQ DX, CX
	MOVQ (2*8)(R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (2*8)(R10)(BX*8)
	MOVQ DX, CX
	MOVQ (3*8)(R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (3*8)(R10)(BX*8)
	MOVQ DX, CX
	ADDQ $4, BX		// i += 4

	LEAQ 4(BX), DX		// DX = i+4 (DX is free here; its value was moved to CX)
	CMPQ DX, R11
	JLE U5			// another full group of 4 fits
	JMP E5

L5:	MOVQ (R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (R10)(BX*8)
	MOVQ DX, CX
	ADDQ $1, BX		// i++

E5:	CMPQ BX, R11		// i < n
	JL L5

	MOVQ CX, c+64(FP)
	RET


// func addMulVVW(z, x []Word, y Word) (c Word)
//
// addMulVVW adds x[i]*y + c to z[i] for each i in [0, len(z)), keeping the
// high word of each step as the next c. c starts at 0, and the final c is
// returned.
//
// Two code paths exist. If the byte variable support_adx (defined
// elsewhere) equals 1, the path at label adx is taken; it uses MULXQ, ADCXQ
// and ADOXQ. Otherwise the MULQ-based path directly below is used.
TEXT ·addMulVVW(SB),NOSPLIT,$0
	CMPB ·support_adx(SB), $1
	JEQ adx
	// MULQ path: R8/R10 = base pointers of x/z, R9 = y, R11 = n,
	// R12 = n rounded down to even, BX = index i, CX = c.
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z_len+8(FP), R11
	MOVQ $0, BX		// i = 0
	MOVQ $0, CX		// c = 0
	MOVQ R11, R12
	ANDQ $-2, R12		// R12 = n with the low bit cleared (bound for the 2x loop)
	CMPQ R11, $2
	JAE A6
	JMP E6

A6:
	// loop body unrolled 2x
	MOVQ (R8)(BX*8), AX
	MULQ R9
	ADDQ (R10)(BX*8), AX
	ADCQ $0, DX
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ DX, CX
	MOVQ AX, (R10)(BX*8)

	MOVQ (8)(R8)(BX*8), AX
	MULQ R9
	ADDQ (8)(R10)(BX*8), AX
	ADCQ $0, DX
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ DX, CX
	MOVQ AX, (8)(R10)(BX*8)

	ADDQ $2, BX
	CMPQ BX, R12
	JL A6
	JMP E6

L6:	MOVQ (R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	ADDQ AX, (R10)(BX*8)
	ADCQ $0, DX
	MOVQ DX, CX
	ADDQ $1, BX		// i++

E6:	CMPQ BX, R11		// i < n
	JL L6

	MOVQ CX, c+56(FP)
	RET

adx:
	// ADX path: DX = y (the implicit multiplicand of MULXQ), R11 = n,
	// BX = index i, CX = c, R8/R10 = pointers into x/z.
	MOVQ z_len+8(FP), R11
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), DX
	MOVQ $0, BX		// i = 0
	MOVQ $0, CX		// carry
	CMPQ R11, $8
	JAE adx_loop_header
	CMPQ BX, R11
	JL adx_short
	MOVQ CX, c+56(FP)
	RET

adx_loop_header:
	MOVQ R11, R13
	ANDQ $-8, R13		// R13 = n rounded down to a multiple of 8
adx_loop:
	// Loop body unrolled 8x. Two carry chains run side by side:
	// ADCXQ adds the previous product's high word using CF, and
	// ADOXQ adds the old z word using OF. The high word alternates
	// between DI and CX from one step to the next.
	XORQ R9, R9		// unset flags (and R9 = 0)
	MULXQ (R8), SI, DI
	ADCXQ CX,SI
	ADOXQ (R10), SI
	MOVQ SI,(R10)

	MULXQ 8(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 8(R10), AX
	MOVQ AX, 8(R10)

	MULXQ 16(R8), SI, DI
	ADCXQ CX, SI
	ADOXQ 16(R10), SI
	MOVQ SI, 16(R10)

	MULXQ 24(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 24(R10), AX
	MOVQ AX, 24(R10)

	MULXQ 32(R8), SI, DI
	ADCXQ CX, SI
	ADOXQ 32(R10), SI
	MOVQ SI, 32(R10)

	MULXQ 40(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 40(R10), AX
	MOVQ AX, 40(R10)

	MULXQ 48(R8), SI, DI
	ADCXQ CX, SI
	ADOXQ 48(R10), SI
	MOVQ SI, 48(R10)

	MULXQ 56(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 56(R10), AX
	MOVQ AX, 56(R10)

	// fold the pending CF and OF into the carry word (R9 is 0)
	ADCXQ R9, CX
	ADOXQ R9, CX

	// this loop advances the pointers themselves: 8 words = 64 bytes
	ADDQ $64, R8
	ADDQ $64, R10
	ADDQ $8, BX

	CMPQ BX, R13
	JL adx_loop
	// reload the base pointers: adx_short addresses by base + i*8
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	CMPQ BX, R11
	JL adx_short
	MOVQ CX, c+56(FP)
	RET

adx_short:
	// one word per iteration for the remaining words
	MULXQ (R8)(BX*8), SI, DI
	ADDQ CX, SI
	ADCQ $0, DI
	ADDQ SI, (R10)(BX*8)
	ADCQ $0, DI
	MOVQ DI, CX
	ADDQ $1, BX		// i++

	CMPQ BX, R11
	JL adx_short

	MOVQ CX, c+56(FP)
	RET

