// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !math_big_pure_go
// +build !math_big_pure_go

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
//
// Snapshot path: github.com/twelsh-aw/go/src@v0.0.0-20230516233729-a56fe86a7c81/math/big/arith_arm64.s

// TODO: Consider re-implementing using Advanced SIMD
// once the assembler supports those instructions.

// func addVV(z, x, y []Word) (c Word)
//
// Computes z[i] = x[i] + y[i] + carry for i in [0, len(z)) and returns the
// final carry in c. The length is consumed as 1 word (if bit 0 is set),
// then 2 words (if bit 1 is set), then 4 words per loop round. The carry is
// kept in the processor carry flag throughout, so only flag-preserving
// instructions (SUB, TBZ, CBZ, loads/stores) are used between the ADCS steps.
TEXT ·addVV(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R0	// R0 = len(z), remaining word count
	MOVD	x+24(FP), R8	// R8 = x pointer
	MOVD	y+48(FP), R9	// R9 = y pointer
	MOVD	z+0(FP), R10	// R10 = z pointer
	ADDS	$0, R0		// clear carry flag
	TBZ	$0, R0, two	// bit 0 of the count clear: no single word to do
	MOVD.P	8(R8), R11	// load x word, post-increment pointer
	MOVD.P	8(R9), R15	// load y word, post-increment pointer
	ADCS	R15, R11
	MOVD.P	R11, 8(R10)
	SUB	$1, R0
two:
	TBZ	$1, R0, loop	// bit 1 of the count clear: no word pair to do
	LDP.P	16(R8), (R11, R12)
	LDP.P	16(R9), (R15, R16)
	ADCS	R15, R11
	ADCS	R16, R12
	STP.P	(R11, R12), 16(R10)
	SUB	$2, R0
loop:
	CBZ	R0, done	// careful not to touch the carry flag
	LDP.P	32(R8), (R11, R12)	// first pair, pointer advances by 4 words
	LDP	-16(R8), (R13, R14)	// second pair, relative to the advanced pointer
	LDP.P	32(R9), (R15, R16)
	LDP	-16(R9), (R17, R19)
	ADCS	R15, R11
	ADCS	R16, R12
	ADCS	R17, R13
	ADCS	R19, R14
	STP.P	(R11, R12), 32(R10)
	STP	(R13, R14), -16(R10)
	SUB	$4, R0
	B	loop
done:
	CSET	HS, R0		// extract carry flag: R0 = 1 if carry set, else 0
	MOVD	R0, c+72(FP)
	RET


// func subVV(z, x, y []Word) (c Word)
//
// Computes z[i] = x[i] - y[i] - borrow for i in [0, len(z)) and returns the
// final borrow in c. Same structure as addVV (1, then 2, then 4 words per
// step). The borrow lives in the carry flag in inverted form: the flag is
// set up front by CMP, and the result is read back with the LO (carry clear)
// condition.
TEXT ·subVV(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R0	// R0 = len(z), remaining word count
	MOVD	x+24(FP), R8	// R8 = x pointer
	MOVD	y+48(FP), R9	// R9 = y pointer
	MOVD	z+0(FP), R10	// R10 = z pointer
	CMP	R0, R0		// set carry flag (no borrow pending)
	TBZ	$0, R0, two	// bit 0 of the count clear: no single word to do
	MOVD.P	8(R8), R11
	MOVD.P	8(R9), R15
	SBCS	R15, R11
	MOVD.P	R11, 8(R10)
	SUB	$1, R0
two:
	TBZ	$1, R0, loop	// bit 1 of the count clear: no word pair to do
	LDP.P	16(R8), (R11, R12)
	LDP.P	16(R9), (R15, R16)
	SBCS	R15, R11
	SBCS	R16, R12
	STP.P	(R11, R12), 16(R10)
	SUB	$2, R0
loop:
	CBZ	R0, done	// careful not to touch the carry flag
	LDP.P	32(R8), (R11, R12)
	LDP	-16(R8), (R13, R14)
	LDP.P	32(R9), (R15, R16)
	LDP	-16(R9), (R17, R19)
	SBCS	R15, R11
	SBCS	R16, R12
	SBCS	R17, R13
	SBCS	R19, R14
	STP.P	(R11, R12), 32(R10)
	STP	(R13, R14), -16(R10)
	SUB	$4, R0
	B	loop
done:
	CSET	LO, R0		// extract borrow: R0 = 1 if carry flag is clear, else 0
	MOVD	R0, c+72(FP)
	RET

// vwOneOp processes one word in addVW/subVW: load a word from (R1), apply
// "instr op1" to it, and store it to (R3); both pointers are post-incremented.
#define vwOneOp(instr, op1)				\
	MOVD.P	8(R1), R4;				\
	instr	op1, R4;				\
	MOVD.P	R4, 8(R3);

// handle the first 1~4 elements before starting iteration in addVW/subVW.
// instr1 is applied with R2 to the first word, instr2 with $0 to the
// following ones. counter is decremented after each of the first three
// words and control jumps to target as soon as it reaches zero; if it never
// does, a fourth word is processed and execution falls through.
#define vwPreIter(instr1, instr2, counter, target)	\
	vwOneOp(instr1, R2);				\
	SUB	$1, counter;				\
	CBZ	counter, target;			\
	vwOneOp(instr2, $0);				\
	SUB	$1, counter;				\
	CBZ	counter, target;			\
	vwOneOp(instr2, $0);				\
	SUB	$1, counter;				\
	CBZ	counter, target;			\
	vwOneOp(instr2, $0);

// do one iteration of add or sub in addVW/subVW: branch to exit if counter
// is zero, otherwise process four words from (R1) into (R3) with
// "instr $0, ..." and decrease counter by 4.
// The exit test uses CBZ; be careful not to touch the carry flag here.
#define vwOneIter(instr, counter, exit)	\
	CBZ	counter, exit;		\
	LDP.P	32(R1), (R4, R5);	\
	LDP	-16(R1), (R6, R7);	\
	instr	$0, R4, R8;		\
	instr	$0, R5, R9;		\
	instr	$0, R6, R10;		\
	instr	$0, R7, R11;		\
	STP.P	(R8, R9), 32(R3);	\
	STP	(R10, R11), -16(R3);	\
	SUB	$4, counter;

// do one iteration of copy in addVW/subVW: branch to exit if counter is
// zero, otherwise copy four words from (R1) to (R3) and decrease counter by 4.
#define vwOneIterCopy(counter, exit)	\
	CBZ	counter, exit;		\
	LDP.P	32(R1), (R4, R5);	\
	LDP	-16(R1), (R6, R7);	\
	STP.P	(R4, R5), 32(R3);	\
	STP	(R6, R7), -16(R3);	\
	SUB	$4, counter;

// func addVW(z, x []Word, y Word) (c Word)
// The 'large' branch handles large 'z'. It checks the carry flag on every iteration
// and switches to copying once there are no more carries to propagate. The copying
// is skipped as well if 'x' and 'z' happen to share the same underlying storage.
// The overhead of the checking and branching is visible when 'z' is small (~5%),
// so a threshold of 32 is used, and the small-sized path is left entirely untouched.
TEXT ·addVW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R3	// R3 = z pointer
	MOVD	z_len+8(FP), R0	// R0 = len(z), remaining word count
	MOVD	x+24(FP), R1	// R1 = x pointer
	MOVD	y+48(FP), R2	// R2 = y
	CMP	$32, R0
	BGE	large		// large-sized 'z' and 'x'
	CBZ	R0, len0	// the length of z is 0: c = y (still in R2)
	MOVD.P	8(R1), R4
	ADDS	R2, R4		// z[0] = x[0] + y, set carry
	MOVD.P	R4, 8(R3)
	SUB	$1, R0
	CBZ	R0, len1	// the length of z is 1
	TBZ	$0, R0, two
	MOVD.P	8(R1), R4	// do it once
	ADCS	$0, R4
	MOVD.P	R4, 8(R3)
	SUB	$1, R0
two:				// do it twice
	TBZ	$1, R0, loop
	LDP.P	16(R1), (R4, R5)
	ADCS	$0, R4, R8	// c, z[i] = x[i] + c
	ADCS	$0, R5, R9
	STP.P	(R8, R9), 16(R3)
	SUB	$2, R0
loop:				// do four times per round
	vwOneIter(ADCS, R0, len1)
	B	loop
len1:
	CSET	HS, R2		// extract carry flag
len0:
	MOVD	R2, c+56(FP)
done:
	RET
large:
	AND	$0x3, R0, R10	// R10 = len(z) mod 4
	AND	$~0x3, R0	// R0 = len(z) rounded down to a multiple of 4
	// unrolling for the first 1~4 elements to avoid saving the carry
	// flag in each step, adjust $R0 if we unrolled 4 elements
	vwPreIter(ADDS, ADCS, R10, add4)
	SUB	$4, R0
add4:
	BCC	copy		// carry clear: nothing left to propagate
	vwOneIter(ADCS, R0, len1)
	B	add4
copy:
	MOVD	ZR, c+56(FP)	// c = 0
	CMP	R1, R3
	BEQ	done		// x and z pointers are equal: no copy needed
copy_4:				// no carry flag, copy the rest
	vwOneIterCopy(R0, done)
	B	copy_4

// func subVW(z, x []Word, y Word) (c Word)
// The 'large' branch handles large 'z'. It checks the carry flag on every iteration
// and switches to copying once there are no more borrows to propagate. The copying
// is skipped as well if 'x' and 'z' happen to share the same underlying storage.
// The overhead of the checking and branching is visible when 'z' is small (~5%),
// so a threshold of 32 is used, and the small-sized path is left entirely untouched.
TEXT ·subVW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R3	// R3 = z pointer
	MOVD	z_len+8(FP), R0	// R0 = len(z), remaining word count
	MOVD	x+24(FP), R1	// R1 = x pointer
	MOVD	y+48(FP), R2	// R2 = y
	CMP	$32, R0
	BGE	large		// large-sized 'z' and 'x'
	CBZ	R0, len0	// the length of z is 0: c = y (still in R2)
	MOVD.P	8(R1), R4
	SUBS	R2, R4		// z[0] = x[0] - y, set carry (carry clear means borrow)
	MOVD.P	R4, 8(R3)
	SUB	$1, R0
	CBZ	R0, len1	// the length of z is 1
	TBZ	$0, R0, two	// do it once
	MOVD.P	8(R1), R4
	SBCS	$0, R4
	MOVD.P	R4, 8(R3)
	SUB	$1, R0
two:				// do it twice
	TBZ	$1, R0, loop
	LDP.P	16(R1), (R4, R5)
	SBCS	$0, R4, R8	// c, z[i] = x[i] - c
	SBCS	$0, R5, R9
	STP.P	(R8, R9), 16(R3)
	SUB	$2, R0
loop:				// do four times per round
	vwOneIter(SBCS, R0, len1)
	B	loop
len1:
	CSET	LO, R2		// extract borrow: R2 = 1 if carry flag is clear, else 0
len0:
	MOVD	R2, c+56(FP)
done:
	RET
large:
	AND	$0x3, R0, R10	// R10 = len(z) mod 4
	AND	$~0x3, R0	// R0 = len(z) rounded down to a multiple of 4
	// unrolling for the first 1~4 elements to avoid saving the carry
	// flag in each step, adjust $R0 if we unrolled 4 elements
	vwPreIter(SUBS, SBCS, R10, sub4)
	SUB	$4, R0
sub4:
	BCS	copy		// carry set: no borrow left to propagate
	vwOneIter(SBCS, R0, len1)
	B	sub4
copy:
	MOVD	ZR, c+56(FP)	// c = 0
	CMP	R1, R3
	BEQ	done		// x and z pointers are equal: no copy needed
copy_4:				// no carry flag, copy the rest
	vwOneIterCopy(R0, done)
	B	copy_4

// func shlVU(z, x []Word, s uint) (c Word)
// This implementation performs the shift from the high word down to the low word,
// which may produce wrong results when the low words of x overlap with the high
// words of z. Pay attention to this situation when calling this function directly.
TEXT ·shlVU(SB),NOSPLIT,$0
	LDP	z+0(FP), (R0, R1)	// R0 = z.ptr, R1 = len(z)
	MOVD	x+24(FP), R2
	MOVD	s+48(FP), R3
	ADD	R1<<3, R0	// R0 = &z[n]
	ADD	R1<<3, R2	// R2 = &x[n]
	CBZ	R1, len0
	CBZ	R3, copy	// if the shift count is 0, just copy x to z
	MOVD	$64, R4
	SUB	R3, R4		// R4 = 64 - s
	// handling the most significant element x[n-1]
	MOVD.W	-8(R2), R6	// pre-decrement the pointer, then load
	LSR	R4, R6, R5	// return value
	LSL	R3, R6, R8	// x[i] << s
	SUB	$1, R1
one:
	TBZ	$0, R1, two
	MOVD.W	-8(R2), R6
	LSR	R4, R6, R7
	ORR	R8, R7
	LSL	R3, R6, R8
	SUB	$1, R1
	MOVD.W	R7, -8(R0)
two:
	TBZ	$1, R1, loop
	LDP.W	-16(R2), (R6, R7)
	LSR	R4, R7, R10
	ORR	R8, R10
	LSL	R3, R7
	LSR	R4, R6, R9
	ORR	R7, R9
	LSL	R3, R6, R8
	SUB	$2, R1
	STP.W	(R9, R10), -16(R0)
loop:
	CBZ	R1, done
	LDP.W	-32(R2), (R10, R11)
	LDP	16(R2), (R12, R13)
	LSR	R4, R13, R23
	ORR	R8, R23		// z[i] = (x[i] << s) | (x[i-1] >> (64 - s))
	LSL	R3, R13
	LSR	R4, R12, R22
	ORR	R13, R22
	LSL	R3, R12
	LSR	R4, R11, R21
	ORR	R12, R21
	LSL	R3, R11
	LSR	R4, R10, R20
	ORR	R11, R20
	LSL	R3, R10, R8
	STP.W	(R20, R21), -32(R0)
	STP	(R22, R23), 16(R0)
	SUB	$4, R1
	B	loop
done:
	MOVD.W	R8, -8(R0)	// the lowest element: z[0] = x[0] << s
	MOVD	R5, c+56(FP)	// the part moved out from x[n-1]
	RET
copy:
	CMP	R0, R2
	BEQ	len0		// x and z pointers are equal: nothing to copy
	TBZ	$0, R1, ctwo
	MOVD.W	-8(R2), R4
	MOVD.W	R4, -8(R0)
	SUB	$1, R1
ctwo:
	TBZ	$1, R1, cloop
	LDP.W	-16(R2), (R4, R5)
	STP.W	(R4, R5), -16(R0)
	SUB	$2, R1
cloop:
	CBZ	R1, len0
	LDP.W	-32(R2), (R4, R5)
	LDP	16(R2), (R6, R7)
	STP.W	(R4, R5), -32(R0)
	STP	(R6, R7), 16(R0)
	SUB	$4, R1
	B	cloop
len0:
	MOVD	$0, c+56(FP)
	RET

// func shrVU(z, x []Word, s uint) (c Word)
// This implementation performs the shift from the low word up to the high word,
// which may produce wrong results when the high words of x overlap with the low
// words of z. Pay attention to this situation when calling this function directly.
TEXT ·shrVU(SB),NOSPLIT,$0
	MOVD	z+0(FP), R0	// R0 = z pointer
	MOVD	z_len+8(FP), R1	// R1 = len(z), remaining word count
	MOVD	x+24(FP), R2	// R2 = x pointer
	MOVD	s+48(FP), R3	// R3 = s
	MOVD	$0, R8
	MOVD	$64, R4
	SUB	R3, R4		// R4 = 64 - s
	CBZ	R1, len0
	CBZ	R3, copy	// if the shift count is 0, just copy x to z

	MOVD.P	8(R2), R20
	LSR	R3, R20, R8	// R8 = x[0] >> s, completed by the next word
	LSL	R4, R20
	MOVD	R20, c+56(FP)	// deal with the first element: c = x[0] << (64 - s)
	SUB	$1, R1

	TBZ	$0, R1, two
	MOVD.P	8(R2), R6
	LSL	R4, R6, R20
	ORR	R8, R20
	LSR	R3, R6, R8
	MOVD.P	R20, 8(R0)
	SUB	$1, R1
two:
	TBZ	$1, R1, loop
	LDP.P	16(R2), (R6, R7)
	LSL	R4, R6, R20
	LSR	R3, R6
	ORR	R8, R20
	LSL	R4, R7, R21
	LSR	R3, R7, R8
	ORR	R6, R21
	STP.P	(R20, R21), 16(R0)
	SUB	$2, R1
loop:
	CBZ	R1, done
	LDP.P	32(R2), (R10, R11)
	LDP	-16(R2), (R12, R13)
	LSL	R4, R10, R20
	LSR	R3, R10
	ORR	R8, R20		// z[i] = (x[i] >> s) | (x[i+1] << (64 - s))
	LSL	R4, R11, R21
	LSR	R3, R11
	ORR	R10, R21
	LSL	R4, R12, R22
	LSR	R3, R12
	ORR	R11, R22
	LSL	R4, R13, R23
	LSR	R3, R13, R8
	ORR	R12, R23
	STP.P	(R20, R21), 32(R0)
	STP	(R22, R23), -16(R0)
	SUB	$4, R1
	B	loop
done:
	MOVD	R8, (R0)	// deal with the last element
	RET
copy:
	CMP	R0, R2
	BEQ	len0		// x and z pointers are equal: nothing to copy
	TBZ	$0, R1, ctwo
	MOVD.P	8(R2), R3
	MOVD.P	R3, 8(R0)
	SUB	$1, R1
ctwo:
	TBZ	$1, R1, cloop
	LDP.P	16(R2), (R4, R5)
	STP.P	(R4, R5), 16(R0)
	SUB	$2, R1
cloop:
	CBZ	R1, len0
	LDP.P	32(R2), (R4, R5)
	LDP	-16(R2), (R6, R7)
	STP.P	(R4, R5), 32(R0)
	STP	(R6, R7), -16(R0)
	SUB	$4, R1
	B	cloop
len0:
	MOVD	$0, c+56(FP)
	RET


// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// Computes z = x*y + r word by word and returns the final high word in c.
// R4 carries the running high word between steps; it starts as r.
TEXT ·mulAddVWW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R1	// R1 = z pointer
	MOVD	z_len+8(FP), R0	// R0 = len(z), remaining word count
	MOVD	x+24(FP), R2	// R2 = x pointer
	MOVD	y+48(FP), R3	// R3 = y
	MOVD	r+56(FP), R4	// R4 = r, then the running carry word
	// c, z = x * y + r
	TBZ	$0, R0, two
	MOVD.P	8(R2), R5
	MUL	R3, R5, R7	// low half of x[i] * y
	UMULH	R3, R5, R8	// high half of x[i] * y
	ADDS	R4, R7
	ADC	$0, R8, R4	// c, z[i] = x[i] * y +  r
	MOVD.P	R7, 8(R1)
	SUB	$1, R0
two:
	TBZ	$1, R0, loop
	LDP.P	16(R2), (R5, R6)
	MUL	R3, R5, R10
	UMULH	R3, R5, R11
	ADDS	R4, R10
	MUL	R3, R6, R12
	UMULH	R3, R6, R13
	ADCS	R12, R11
	ADC	$0, R13, R4

	STP.P	(R10, R11), 16(R1)
	SUB	$2, R0
loop:
	CBZ	R0, done
	LDP.P	32(R2), (R5, R6)
	LDP	-16(R2), (R7, R8)

	MUL	R3, R5, R10
	UMULH	R3, R5, R11
	ADDS	R4, R10
	MUL	R3, R6, R12
	UMULH	R3, R6, R13
	ADCS	R11, R12

	MUL	R3, R7, R14
	UMULH	R3, R7, R15
	ADCS	R13, R14
	MUL	R3, R8, R16
	UMULH	R3, R8, R17
	ADCS	R15, R16
	ADC	$0, R17, R4

	STP.P	(R10, R12), 32(R1)
	STP	(R14, R16), -16(R1)
	SUB	$4, R0
	B	loop
done:
	MOVD	R4, c+64(FP)
	RET


// func addMulVVW(z, x []Word, y Word) (c Word)
//
// Computes z += x*y word by word and returns the final carry word in c.
// R4 carries the running carry word between steps; it starts at 0.
TEXT ·addMulVVW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R1	// R1 = z pointer
	MOVD	z_len+8(FP), R0	// R0 = len(z), remaining word count
	MOVD	x+24(FP), R2	// R2 = x pointer
	MOVD	y+48(FP), R3	// R3 = y
	MOVD	$0, R4		// R4 = running carry word

	TBZ	$0, R0, two

	MOVD.P	8(R2), R5
	MOVD	(R1), R6

	MUL	R5, R3, R7
	UMULH	R5, R3, R8

	ADDS	R7, R6
	ADC	$0, R8, R4

	MOVD.P	R6, 8(R1)
	SUB	$1, R0

two:
	TBZ	$1, R0, loop

	LDP.P	16(R2), (R5, R10)
	LDP	(R1), (R6, R11)

	MUL	R10, R3, R13
	UMULH	R10, R3, R12

	MUL	R5, R3, R7
	UMULH	R5, R3, R8

	ADDS	R4, R6
	ADCS	R13, R11
	ADC	$0, R12

	ADDS	R7, R6
	ADCS	R8, R11
	ADC	$0, R12, R4

	STP.P	(R6, R11), 16(R1)
	SUB	$2, R0

// The main loop of this code operates on a block of 4 words every iteration
// performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9]
// where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next
// 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z.
loop:
	CBZ	R0, done

	LDP.P	16(R2), (R5, R6)
	LDP.P	16(R2), (R7, R8)

	LDP	(R1), (R9, R10)
	ADDS	R4, R9
	MUL	R6, R3, R14
	ADCS	R14, R10
	MUL	R7, R3, R15
	LDP	16(R1), (R11, R12)
	ADCS	R15, R11
	MUL	R8, R3, R16
	ADCS	R16, R12
	UMULH	R8, R3, R20
	ADC	$0, R20

	MUL	R5, R3, R13
	ADDS	R13, R9
	UMULH	R5, R3, R17
	ADCS	R17, R10
	UMULH	R6, R3, R21
	STP.P	(R9, R10), 16(R1)
	ADCS	R21, R11
	UMULH	R7, R3, R19
	ADCS	R19, R12
	STP.P	(R11, R12), 16(R1)
	ADC	$0, R20, R4

	SUB	$4, R0
	B	loop

done:
	MOVD	R4, c+56(FP)
	RET