// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !math_big_pure_go
// +build !math_big_pure_go

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.

// TODO: Consider re-implementing using Advanced SIMD
// once the assembler supports those instructions.

// func mulWW(x, y Word) (z1, z0 Word)
// mulWW multiplies x by y and returns the double-word product:
// z1 is the high half (UMULH) and z0 is the low half (MUL).
TEXT ·mulWW(SB),NOSPLIT,$0
	MOVD	x+0(FP), R0
	MOVD	y+8(FP), R1
	MUL	R0, R1, R2	// R2 = low 64 bits of x*y
	UMULH	R0, R1, R3	// R3 = high 64 bits of x*y (unsigned)
	MOVD	R3, z1+16(FP)
	MOVD	R2, z0+24(FP)
	RET


// func addVV(z, x, y []Word) (c Word)
// addVV stores x[i] + y[i] (with carry propagation) into z[i] for
// i = 0 .. len(z)-1 and returns the final carry (0 or 1) in c.
// Only z_len is read; x and y are presumably at least that long,
// the code does not check.
//
// Register use: R0 = words remaining, R8 = x cursor, R9 = y cursor,
// R10 = z cursor. The carry lives in the C flag for the whole routine,
// so between the ADCS instructions only flag-preserving instructions
// (SUB, TBZ, CBZ, loads/stores, B) are used.
TEXT ·addVV(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R9
	MOVD	z+0(FP), R10
	ADDS	$0, R0		// clear carry flag
	TBZ	$0, R0, two	// bit 0 of the length clear: no single leftover word
	MOVD.P	8(R8), R11
	MOVD.P	8(R9), R15
	ADCS	R15, R11
	MOVD.P	R11, 8(R10)
	SUB	$1, R0
two:
	TBZ	$1, R0, loop	// bit 1 of the length clear: no leftover pair
	LDP.P	16(R8), (R11, R12)
	LDP.P	16(R9), (R15, R16)
	ADCS	R15, R11
	ADCS	R16, R12
	STP.P	(R11, R12), 16(R10)
	SUB	$2, R0
loop:				// R0 is now a multiple of 4: four words per round
	CBZ	R0, done	// careful not to touch the carry flag
	LDP.P	32(R8), (R11, R12)	// words 0,1 of the group; cursor advances past all four
	LDP	-16(R8), (R13, R14)	// words 2,3 of the same group
	LDP.P	32(R9), (R15, R16)
	LDP	-16(R9), (R17, R19)
	ADCS	R15, R11
	ADCS	R16, R12
	ADCS	R17, R13
	ADCS	R19, R14
	STP.P	(R11, R12), 32(R10)
	STP	(R13, R14), -16(R10)
	SUB	$4, R0
	B	loop
done:
	CSET	HS, R0		// extract carry flag
	MOVD	R0, c+72(FP)
	RET


// func subVV(z, x, y []Word) (c Word)
// subVV stores x[i] - y[i] (with borrow propagation) into z[i] for
// i = 0 .. len(z)-1 and returns the final borrow (0 or 1) in c.
// Only z_len is read; x and y are presumably at least that long,
// the code does not check.
//
// Same structure and register use as addVV. On arm64 a set C flag
// means "no borrow", so the flag starts out set and the result is
// extracted with the LO (carry clear) condition.
TEXT ·subVV(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R9
	MOVD	z+0(FP), R10
	CMP	R0, R0		// set carry flag
	TBZ	$0, R0, two	// bit 0 of the length clear: no single leftover word
	MOVD.P	8(R8), R11
	MOVD.P	8(R9), R15
	SBCS	R15, R11
	MOVD.P	R11, 8(R10)
	SUB	$1, R0
two:
	TBZ	$1, R0, loop	// bit 1 of the length clear: no leftover pair
	LDP.P	16(R8), (R11, R12)
	LDP.P	16(R9), (R15, R16)
	SBCS	R15, R11
	SBCS	R16, R12
	STP.P	(R11, R12), 16(R10)
	SUB	$2, R0
loop:				// R0 is now a multiple of 4: four words per round
	CBZ	R0, done	// careful not to touch the carry flag
	LDP.P	32(R8), (R11, R12)	// words 0,1 of the group; cursor advances past all four
	LDP	-16(R8), (R13, R14)	// words 2,3 of the same group
	LDP.P	32(R9), (R15, R16)
	LDP	-16(R9), (R17, R19)
	SBCS	R15, R11
	SBCS	R16, R12
	SBCS	R17, R13
	SBCS	R19, R14
	STP.P	(R11, R12), 32(R10)
	STP	(R13, R14), -16(R10)
	SUB	$4, R0
	B	loop
done:
	CSET	LO, R0		// extract carry flag
	MOVD	R0, c+72(FP)
	RET

// vwOneOp processes one word for addVW/subVW: it loads a word from
// (R1) with post-increment, applies "instr op1" to it in R4, and
// stores R4 to (R3) with post-increment.
#define vwOneOp(instr, op1)				\
	MOVD.P	8(R1), R4;				\
	instr	op1, R4;				\
	MOVD.P	R4, 8(R3);

// handle the first 1~4 elements before starting iteration in addVW/subVW
// instr1 is applied with R2 to the first word; instr2 is applied with $0
// to the following words. counter is decremented after each of the first
// three words and control jumps to target as soon as it reaches zero.
// If counter is 0 on entry it never reaches zero, so all four words are
// processed and execution falls through past the macro.
#define vwPreIter(instr1, instr2, counter, target)	\
	vwOneOp(instr1, R2);				\
	SUB	$1, counter;				\
	CBZ	counter, target;			\
	vwOneOp(instr2, $0);				\
	SUB	$1, counter;				\
	CBZ	counter, target;			\
	vwOneOp(instr2, $0);				\
	SUB	$1, counter;				\
	CBZ	counter, target;			\
	vwOneOp(instr2, $0);

// do one iteration of add or sub in addVW/subVW
// Jumps to exit when counter is zero; otherwise reads four words from
// (R1), applies "instr $0" to each (i.e. only propagates the carry
// flag), writes the results to (R3) and subtracts 4 from counter.
// Scratch registers: R4-R11.
#define vwOneIter(instr, counter, exit)	\
	CBZ	counter, exit;			\	// careful not to touch the carry flag
	LDP.P	32(R1), (R4, R5);		\
	LDP	-16(R1), (R6, R7);		\
	instr	$0, R4, R8;			\
	instr	$0, R5, R9;			\
	instr	$0, R6, R10;			\
	instr	$0, R7, R11;			\
	STP.P	(R8, R9), 32(R3);		\
	STP	(R10, R11), -16(R3);		\
	SUB	$4, counter;

// do one iteration of copy in addVW/subVW
// Jumps to exit when counter is zero; otherwise copies four words
// from (R1) to (R3) unchanged and subtracts 4 from counter.
#define vwOneIterCopy(counter, exit)		\
	CBZ	counter, exit;			\
	LDP.P	32(R1), (R4, R5);		\
	LDP	-16(R1), (R6, R7);		\
	STP.P	(R4, R5), 32(R3);		\
	STP	(R6, R7), -16(R3);		\
	SUB	$4, counter;

// func addVW(z, x []Word, y Word) (c Word)
// The 'large' branch handles large 'z'. It checks the carry flag on every iteration
// and switches to copy if we are done with carries.
// The copying is skipped as well
// if 'x' and 'z' happen to share the same underlying storage.
// The overhead of the checking and branching is visible when 'z' is small (~5%),
// so a threshold of 32 words is used and the small-size path is left entirely untouched.
//
// addVW stores x[i] + (carry) into z[i], where the carry into word 0
// is y, and returns the final carry in c. When len(z) == 0 the value
// of y is returned unchanged as c.
//
// Register use: R3 = z cursor, R1 = x cursor, R0 = words remaining,
// R2 = y on entry and the value stored to c on exit.
TEXT ·addVW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R3
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R1
	MOVD	y+48(FP), R2
	CMP	$32, R0
	BGE	large		// large-sized 'z' and 'x'
	CBZ	R0, len0	// the length of z is 0
	MOVD.P	8(R1), R4
	ADDS	R2, R4		// z[0] = x[0] + y, set carry
	MOVD.P	R4, 8(R3)
	SUB	$1, R0
	CBZ	R0, len1	// the length of z is 1
	TBZ	$0, R0, two
	MOVD.P	8(R1), R4	// do it once
	ADCS	$0, R4
	MOVD.P	R4, 8(R3)
	SUB	$1, R0
two:				// do it twice
	TBZ	$1, R0, loop
	LDP.P	16(R1), (R4, R5)
	ADCS	$0, R4, R8	// c, z[i] = x[i] + c
	ADCS	$0, R5, R9
	STP.P	(R8, R9), 16(R3)
	SUB	$2, R0
loop:				// do four times per round
	vwOneIter(ADCS, R0, len1)
	B	loop
len1:
	CSET	HS, R2		// extract carry flag
len0:
	MOVD	R2, c+56(FP)
done:
	RET
large:
	AND	$0x3, R0, R10	// R10 = len % 4
	AND	$~0x3, R0	// R0 = len rounded down to a multiple of 4
	// unrolling for the first 1~4 elements to avoid saving the carry
	// flag in each step, adjust $R0 if we unrolled 4 elements
	vwPreIter(ADDS, ADCS, R10, add4)
	SUB	$4, R0
add4:
	BCC	copy		// carry clear: nothing left to propagate
	vwOneIter(ADCS, R0, len1)
	B	add4
copy:
	MOVD	ZR, c+56(FP)	// the carry died out, so c = 0
	CMP	R1, R3
	BEQ	done		// x and z cursors coincide: remaining words are already in place
copy_4:				// no carry flag, copy the rest
	vwOneIterCopy(R0, done)
	B	copy_4

// func subVW(z, x []Word, y Word) (c Word)
// The 'large' branch handles large 'z'. It checks the carry flag on every iteration
// and switches to copy if we are done with carries. The copying is skipped as well
// if 'x' and 'z' happen to share the same underlying storage.
// The overhead of the checking and branching is visible when 'z' is small (~5%),
// so a threshold of 32 words is used and the small-size path is left entirely untouched.
//
// subVW stores x[i] - (borrow) into z[i], where the borrow into word 0
// is y, and returns the final borrow in c. When len(z) == 0 the value
// of y is returned unchanged as c.
//
// Register use: R3 = z cursor, R1 = x cursor, R0 = words remaining,
// R2 = y on entry and the value stored to c on exit. On arm64 a set
// C flag means "no borrow", hence BCS/LO below where addVW uses BCC/HS.
TEXT ·subVW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R3
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R1
	MOVD	y+48(FP), R2
	CMP	$32, R0
	BGE	large		// large-sized 'z' and 'x'
	CBZ	R0, len0	// the length of z is 0
	MOVD.P	8(R1), R4
	SUBS	R2, R4		// z[0] = x[0] - y, set carry
	MOVD.P	R4, 8(R3)
	SUB	$1, R0
	CBZ	R0, len1	// the length of z is 1
	TBZ	$0, R0, two	// do it once
	MOVD.P	8(R1), R4
	SBCS	$0, R4
	MOVD.P	R4, 8(R3)
	SUB	$1, R0
two:				// do it twice
	TBZ	$1, R0, loop
	LDP.P	16(R1), (R4, R5)
	SBCS	$0, R4, R8	// c, z[i] = x[i] - c
	SBCS	$0, R5, R9
	STP.P	(R8, R9), 16(R3)
	SUB	$2, R0
loop:				// do four times per round
	vwOneIter(SBCS, R0, len1)
	B	loop
len1:
	CSET	LO, R2		// extract carry flag
len0:
	MOVD	R2, c+56(FP)
done:
	RET
large:
	AND	$0x3, R0, R10	// R10 = len % 4
	AND	$~0x3, R0	// R0 = len rounded down to a multiple of 4
	// unrolling for the first 1~4 elements to avoid saving the carry
	// flag in each step, adjust $R0 if we unrolled 4 elements
	vwPreIter(SUBS, SBCS, R10, sub4)
	SUB	$4, R0
sub4:
	BCS	copy		// carry set: no borrow left to propagate
	vwOneIter(SBCS, R0, len1)
	B	sub4
copy:
	MOVD	ZR, c+56(FP)	// the borrow died out, so c = 0
	CMP	R1, R3
	BEQ	done		// x and z cursors coincide: remaining words are already in place
copy_4:				// no carry flag, copy the rest
	vwOneIterCopy(R0, done)
	B	copy_4

// func shlVU(z, x []Word, s uint) (c Word)
// This implementation handles the shift operation from the high word to the low word,
// which may be an error for the case where the low word of x overlaps with the high
// word of z. When calling this function directly, you need to pay attention to this
// situation.
TEXT ·shlVU(SB),NOSPLIT,$0
	// Shifts the len(z)-word vector x left by s bits into z and returns
	// in c the bits shifted out of the top word, x[n-1] >> (64 - s).
	// With len(z) == 0 or s == 0 the result c is 0; for s == 0 x is
	// simply copied to z (skipped when both point at the same memory).
	//
	// Register use: R0 = z cursor, R2 = x cursor (both start one past the
	// end and move downwards), R1 = words remaining, R3 = s, R4 = 64 - s,
	// R5 = return value, R8 = (x[i] << s) still to be merged with the top
	// bits of the next lower word.
	LDP	z+0(FP), (R0, R1)	// R0 = z.ptr, R1 = len(z)
	MOVD	x+24(FP), R2
	MOVD	s+48(FP), R3
	ADD	R1<<3, R0	// R0 = &z[n]
	ADD	R1<<3, R2	// R2 = &x[n]
	CBZ	R1, len0
	CBZ	R3, copy	// if the number of shift is 0, just copy x to z
	MOVD	$64, R4
	SUB	R3, R4		// R4 = 64 - s
	// handling the most significant element x[n-1]
	MOVD.W	-8(R2), R6
	LSR	R4, R6, R5	// return value
	LSL	R3, R6, R8	// x[i] << s
	SUB	$1, R1
one:	TBZ	$0, R1, two	// bit 0 of the remaining count clear: skip the single word
	MOVD.W	-8(R2), R6
	LSR	R4, R6, R7
	ORR	R8, R7
	LSL	R3, R6, R8
	SUB	$1, R1
	MOVD.W	R7, -8(R0)
two:
	TBZ	$1, R1, loop	// bit 1 of the remaining count clear: skip the pair
	LDP.W	-16(R2), (R6, R7)
	LSR	R4, R7, R10
	ORR	R8, R10
	LSL	R3, R7
	LSR	R4, R6, R9
	ORR	R7, R9
	LSL	R3, R6, R8
	SUB	$2, R1
	STP.W	(R9, R10), -16(R0)
loop:				// R1 is now a multiple of 4: four words per round
	CBZ	R1, done
	LDP.W	-32(R2), (R10, R11)	// step the cursor down by four words; load the two lower ones
	LDP	16(R2), (R12, R13)	// load the two upper words of the group
	LSR	R4, R13, R23
	ORR	R8, R23		// z[i] = (x[i] << s) | (x[i-1] >> (64 - s))
	LSL	R3, R13
	LSR	R4, R12, R22
	ORR	R13, R22
	LSL	R3, R12
	LSR	R4, R11, R21
	ORR	R12, R21
	LSL	R3, R11
	LSR	R4, R10, R20
	ORR	R11, R20
	LSL	R3, R10, R8
	STP.W	(R20, R21), -32(R0)
	STP	(R22, R23), 16(R0)
	SUB	$4, R1
	B	loop
done:
	MOVD.W	R8, -8(R0)	// the first element x[0]
	MOVD	R5, c+56(FP)	// the part moved out from x[n-1]
	RET
copy:				// s == 0: plain copy, high word to low word
	CMP	R0, R2
	BEQ	len0		// z and x refer to the same memory: nothing to copy
	TBZ	$0, R1, ctwo
	MOVD.W	-8(R2), R4
	MOVD.W	R4, -8(R0)
	SUB	$1, R1
ctwo:
	TBZ	$1, R1, cloop
	LDP.W	-16(R2), (R4, R5)
	STP.W	(R4, R5), -16(R0)
	SUB	$2, R1
cloop:
	CBZ	R1, len0
	LDP.W	-32(R2), (R4, R5)
	LDP	16(R2), (R6, R7)
	STP.W	(R4, R5), -32(R0)
	STP	(R6, R7), 16(R0)
	SUB	$4, R1
	B	cloop
len0:
	MOVD	$0, c+56(FP)
	RET

// func shrVU(z, x []Word, s uint) (c Word)
// This implementation handles the shift operation from the low word to the high word,
// which may be an error for the case where the high word of x overlaps
// with the low
// word of z. When calling this function directly, you need to pay attention to this
// situation.
//
// shrVU shifts the len(z)-word vector x right by s bits into z and
// returns in c the bits shifted out of the bottom word, x[0] << (64 - s).
// With len(z) == 0 or s == 0 the result c is 0; for s == 0 x is simply
// copied to z (skipped when both point at the same memory).
//
// Register use: R0 = z cursor, R2 = x cursor (both move upwards),
// R1 = words remaining, R3 = s, R4 = 64 - s, R8 = (x[i] >> s) still to
// be merged with the low bits of the next higher word.
TEXT ·shrVU(SB),NOSPLIT,$0
	MOVD	z+0(FP), R0
	MOVD	z_len+8(FP), R1
	MOVD	x+24(FP), R2
	MOVD	s+48(FP), R3
	MOVD	$0, R8
	MOVD	$64, R4
	SUB	R3, R4		// R4 = 64 - s
	CBZ	R1, len0
	CBZ	R3, copy	// if the number of shift is 0, just copy x to z

	MOVD.P	8(R2), R20
	LSR	R3, R20, R8
	LSL	R4, R20
	MOVD	R20, c+56(FP)	// deal with the first element
	SUB	$1, R1

	TBZ	$0, R1, two	// bit 0 of the remaining count clear: skip the single word
	MOVD.P	8(R2), R6
	LSL	R4, R6, R20
	ORR	R8, R20
	LSR	R3, R6, R8
	MOVD.P	R20, 8(R0)
	SUB	$1, R1
two:
	TBZ	$1, R1, loop	// bit 1 of the remaining count clear: skip the pair
	LDP.P	16(R2), (R6, R7)
	LSL	R4, R6, R20
	LSR	R3, R6
	ORR	R8, R20
	LSL	R4, R7, R21
	LSR	R3, R7, R8
	ORR	R6, R21
	STP.P	(R20, R21), 16(R0)
	SUB	$2, R1
loop:				// R1 is now a multiple of 4: four words per round
	CBZ	R1, done
	LDP.P	32(R2), (R10, R11)
	LDP	-16(R2), (R12, R13)
	LSL	R4, R10, R20
	LSR	R3, R10
	ORR	R8, R20		// z[i] = (x[i] >> s) | (x[i+1] << (64 - s))
	LSL	R4, R11, R21
	LSR	R3, R11
	ORR	R10, R21
	LSL	R4, R12, R22
	LSR	R3, R12
	ORR	R11, R22
	LSL	R4, R13, R23
	LSR	R3, R13, R8
	ORR	R12, R23
	STP.P	(R20, R21), 32(R0)
	STP	(R22, R23), -16(R0)
	SUB	$4, R1
	B	loop
done:
	MOVD	R8, (R0)	// deal with the last element
	RET
copy:				// s == 0: plain copy, low word to high word
	CMP	R0, R2
	BEQ	len0		// z and x refer to the same memory: nothing to copy
	TBZ	$0, R1, ctwo
	MOVD.P	8(R2), R3
	MOVD.P	R3, 8(R0)
	SUB	$1, R1
ctwo:
	TBZ	$1, R1, cloop
	LDP.P	16(R2), (R4, R5)
	STP.P	(R4, R5), 16(R0)
	SUB	$2, R1
cloop:
	CBZ	R1, len0
	LDP.P	32(R2), (R4, R5)
	LDP	-16(R2), (R6, R7)
	STP.P	(R4, R5), 32(R0)
	STP	(R6, R7), -16(R0)
	SUB	$4, R1
	B	cloop
len0:
	MOVD	$0, c+56(FP)
	RET


// func mulAddVWW(z, x []Word, y, r Word) (c Word)
// mulAddVWW stores the low word of x[i]*y + carry into z[i] for
// i = 0 .. len(z)-1, where the carry into word 0 is r and the carry
// into each later word is the high part of the previous step. The
// final carry word is returned in c (c = r when len(z) == 0).
//
// Register use: R1 = z cursor, R2 = x cursor, R0 = words remaining,
// R3 = y, R4 = running carry word.
TEXT ·mulAddVWW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R1
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R2
	MOVD	y+48(FP), R3
	MOVD	r+56(FP), R4
	// c, z = x * y + r
	TBZ	$0, R0, two	// bit 0 of the length clear: no single leftover word
	MOVD.P	8(R2), R5
	MUL	R3, R5, R7
	UMULH	R3, R5, R8
	ADDS	R4, R7
	ADC	$0, R8, R4	// c, z[i] = x[i] * y + r
	MOVD.P	R7, 8(R1)
	SUB	$1, R0
two:
	TBZ	$1, R0, loop	// bit 1 of the length clear: no leftover pair
	LDP.P	16(R2), (R5, R6)
	MUL	R3, R5, R10
	UMULH	R3, R5, R11
	ADDS	R4, R10		// low word 0 + incoming carry
	MUL	R3, R6, R12
	UMULH	R3, R6, R13
	ADCS	R12, R11	// high word 0 + low word 1 + carry flag
	ADC	$0, R13, R4	// new carry word = high word 1 + carry flag

	STP.P	(R10, R11), 16(R1)
	SUB	$2, R0
loop:				// R0 is now a multiple of 4: four words per round
	CBZ	R0, done
	LDP.P	32(R2), (R5, R6)
	LDP	-16(R2), (R7, R8)

	MUL	R3, R5, R10
	UMULH	R3, R5, R11
	ADDS	R4, R10		// low word 0 + incoming carry
	MUL	R3, R6, R12
	UMULH	R3, R6, R13
	ADCS	R11, R12	// low word 1 + high word 0 + carry flag

	MUL	R3, R7, R14
	UMULH	R3, R7, R15
	ADCS	R13, R14	// low word 2 + high word 1 + carry flag
	MUL	R3, R8, R16
	UMULH	R3, R8, R17
	ADCS	R15, R16	// low word 3 + high word 2 + carry flag
	ADC	$0, R17, R4	// new carry word = high word 3 + carry flag

	STP.P	(R10, R12), 32(R1)
	STP	(R14, R16), -16(R1)
	SUB	$4, R0
	B	loop
done:
	MOVD	R4, c+64(FP)
	RET


// func addMulVVW(z, x []Word, y Word) (c Word)
// addMulVVW adds x[i]*y (plus the running carry) to z[i] in place for
// i = 0 .. len(z)-1 and returns the final carry word in c
// (c = 0 when len(z) == 0).
//
// Register use: R1 = z cursor, R2 = x cursor, R0 = words remaining,
// R3 = y, R4 = running carry word (starts at 0).
TEXT ·addMulVVW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R1
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R2
	MOVD	y+48(FP), R3
	MOVD	$0, R4

	TBZ	$0, R0, two	// bit 0 of the length clear: no single leftover word

	MOVD.P	8(R2), R5
	MOVD	(R1), R6

	MUL	R5, R3, R7
	UMULH	R5, R3, R8

	ADDS	R7, R6		// z[0] += low word; R4 is still 0 here, so it is not added
	ADC	$0, R8, R4	// carry word = high word + carry flag

	MOVD.P	R6, 8(R1)
	SUB	$1, R0

two:
	TBZ	$1, R0, loop	// bit 1 of the length clear: no leftover pair

	LDP.P	16(R2), (R5, R10)
	LDP	(R1), (R6, R11)

	MUL	R10, R3, R13
	UMULH	R10, R3, R12

	MUL	R5, R3, R7
	UMULH	R5, R3, R8

	// first pass: add the incoming carry and the low half of the
	// second product, collecting the overflow in R12
	ADDS	R4, R6
	ADCS	R13, R11
	ADC	$0, R12

	// second pass: add both halves of the first product
	ADDS	R7, R6
	ADCS	R8, R11
	ADC	$0, R12, R4

	STP.P	(R6, R11), 16(R1)
	SUB	$2, R0

// The main loop of this code operates on a block of 4 words every iteration
// performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9]
// where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next
// 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z.
loop:
	CBZ	R0, done

	LDP.P	16(R2), (R5, R6)
	LDP.P	16(R2), (R7, R8)

	// first carry chain: the incoming carry plus the low halves of the
	// products of words 1..3, with the top overflow collected in R20
	LDP	(R1), (R9, R10)
	ADDS	R4, R9
	MUL	R6, R3, R14
	ADCS	R14, R10
	MUL	R7, R3, R15
	LDP	16(R1), (R11, R12)
	ADCS	R15, R11
	MUL	R8, R3, R16
	ADCS	R16, R12
	UMULH	R8, R3, R20
	ADC	$0, R20

	// second carry chain: the low half of the product of word 0 and the
	// high halves of the products of words 0..2; the new carry goes to R4
	MUL	R5, R3, R13
	ADDS	R13, R9
	UMULH	R5, R3, R17
	ADCS	R17, R10
	UMULH	R6, R3, R21
	STP.P	(R9, R10), 16(R1)
	ADCS	R21, R11
	UMULH	R7, R3, R19
	ADCS	R19, R12
	STP.P	(R11, R12), 16(R1)
	ADC	$0, R20, R4

	SUB	$4, R0
	B	loop

done:
	MOVD	R4, c+56(FP)
	RET