// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !math_big_pure_go
// +build !math_big_pure_go

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
//
// Operand order reminder (Go arm64 assembler): the destination is the
// last operand, e.g. "ADCS R15, R11" computes R11 = R11 + R15 + C and
// "ADCS $0, R4, R8" computes R8 = R4 + 0 + C.

// TODO: Consider re-implementing using Advanced SIMD
// once the assembler supports those instructions.

// func mulWW(x, y Word) (z1, z0 Word)
//
// Full 64x64 -> 128 bit multiply.
//   R0 = x, R1 = y
//   R2 = low  64 bits of x*y (returned as z0)
//   R3 = high 64 bits of x*y (returned as z1)
TEXT ·mulWW(SB),NOSPLIT,$0
	MOVD	x+0(FP), R0
	MOVD	y+8(FP), R1
	MUL	R0, R1, R2
	UMULH	R0, R1, R3
	MOVD	R3, z1+16(FP)
	MOVD	R2, z0+24(FP)
	RET


// func addVV(z, x, y []Word) (c Word)
//
// z[i] = x[i] + y[i] with carry propagation over len(z) words; the final
// carry is returned in c.
//   R0  = remaining word count (initially len(z))
//   R8  = x pointer, R9 = y pointer, R10 = z pointer (all post-incremented)
// The length is consumed as (len&1) words, then (len&2) words, then blocks
// of four. Between the first ADCS and the final CSET only instructions that
// leave the carry flag alone (SUB, TBZ, CBZ, loads/stores) are used.
TEXT ·addVV(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R9
	MOVD	z+0(FP), R10
	ADDS	$0, R0		// clear carry flag
	TBZ	$0, R0, two	// bit 0 of length clear: no single leading word
	MOVD.P	8(R8), R11
	MOVD.P	8(R9), R15
	ADCS	R15, R11
	MOVD.P	R11, 8(R10)
	SUB	$1, R0
two:
	TBZ	$1, R0, loop	// bit 1 of length clear: no leading pair
	LDP.P	16(R8), (R11, R12)
	LDP.P	16(R9), (R15, R16)
	ADCS	R15, R11
	ADCS	R16, R12
	STP.P	(R11, R12), 16(R10)
	SUB	$2, R0
loop:
	CBZ	R0, done	// careful not to touch the carry flag
	LDP.P	32(R8), (R11, R12)
	LDP	-16(R8), (R13, R14)	// second pair of the block just stepped over
	LDP.P	32(R9), (R15, R16)
	LDP	-16(R9), (R17, R19)
	ADCS	R15, R11
	ADCS	R16, R12
	ADCS	R17, R13
	ADCS	R19, R14
	STP.P	(R11, R12), 32(R10)
	STP	(R13, R14), -16(R10)
	SUB	$4, R0
	B	loop
done:
	CSET	HS, R0		// extract carry flag
	MOVD	R0, c+72(FP)
	RET


// func subVV(z, x, y []Word) (c Word)
//
// z[i] = x[i] - y[i] with borrow propagation over len(z) words; the final
// borrow is returned in c.
//   R0  = remaining word count (initially len(z))
//   R8  = x pointer, R9 = y pointer, R10 = z pointer (all post-incremented)
// On arm64 the carry flag is the inverse of borrow: it starts set
// ("no borrow") and c is 1 when the flag ends up clear (LO).
TEXT ·subVV(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R9
	MOVD	z+0(FP), R10
	CMP	R0, R0		// set carry flag
	TBZ	$0, R0, two	// bit 0 of length clear: no single leading word
	MOVD.P	8(R8), R11
	MOVD.P	8(R9), R15
	SBCS	R15, R11
	MOVD.P	R11, 8(R10)
	SUB	$1, R0
two:
	TBZ	$1, R0, loop	// bit 1 of length clear: no leading pair
	LDP.P	16(R8), (R11, R12)
	LDP.P	16(R9), (R15, R16)
	SBCS	R15, R11
	SBCS	R16, R12
	STP.P	(R11, R12), 16(R10)
	SUB	$2, R0
loop:
	CBZ	R0, done	// careful not to touch the carry flag
	LDP.P	32(R8), (R11, R12)
	LDP	-16(R8), (R13, R14)	// second pair of the block just stepped over
	LDP.P	32(R9), (R15, R16)
	LDP	-16(R9), (R17, R19)
	SBCS	R15, R11
	SBCS	R16, R12
	SBCS	R17, R13
	SBCS	R19, R14
	STP.P	(R11, R12), 32(R10)
	STP	(R13, R14), -16(R10)
	SUB	$4, R0
	B	loop
done:
	CSET	LO, R0		// extract carry flag
	MOVD	R0, c+72(FP)
	RET

// Helper macros shared by addVW and subVW. They all assume the register
// assignment used by those two routines: R1 = x pointer, R2 = y,
// R3 = z pointer; R4-R11 are scratch.

// process one word: load x[i], apply "instr op1", store z[i]
#define vwOneOp(instr, op1)				\
	MOVD.P	8(R1), R4;				\
	instr	op1, R4;				\
	MOVD.P	R4, 8(R3);

// handle the first 1~4 elements before starting iteration in addVW/subVW
// The first element uses instr1 with y (R2); the others use instr2 with $0
// so that only the carry/borrow is propagated. 'counter' is expected to be
// len&3: for 1..3 the macro branches to 'target' after that many elements;
// for 0 it never reaches zero, so four elements are processed and control
// falls through to the instruction following the macro.
#define vwPreIter(instr1, instr2, counter, target)	\
	vwOneOp(instr1, R2);				\
	SUB	$1, counter;				\
	CBZ	counter, target;			\
	vwOneOp(instr2, $0);				\
	SUB	$1, counter;				\
	CBZ	counter, target;			\
	vwOneOp(instr2, $0);				\
	SUB	$1, counter;				\
	CBZ	counter, target;			\
	vwOneOp(instr2, $0);

// do one iteration of add or sub in addVW/subVW
// Processes four words, or branches to 'exit' if 'counter' is zero.
#define vwOneIter(instr, counter, exit)	\
	CBZ	counter, exit;		\	// careful not to touch the carry flag
	LDP.P	32(R1), (R4, R5);	\
	LDP	-16(R1), (R6, R7);	\
	instr	$0, R4, R8;		\
	instr	$0, R5, R9;		\
	instr	$0, R6, R10;		\
	instr	$0, R7, R11;		\
	STP.P	(R8, R9), 32(R3);	\
	STP	(R10, R11), -16(R3);	\
	SUB	$4, counter;

// do one iteration of copy in addVW/subVW
// Copies four words from x to z, or branches to 'exit' if 'counter' is zero.
#define vwOneIterCopy(counter, exit)			\
	CBZ	counter, exit;				\
	LDP.P	32(R1), (R4, R5);			\
	LDP	-16(R1), (R6, R7);			\
	STP.P	(R4, R5), 32(R3);			\
	STP	(R6, R7), -16(R3);			\
	SUB	$4, counter;

// func addVW(z, x []Word, y Word) (c Word)
// The 'large' branch handles large 'z'. It checks the carry flag on every iteration
// and switches to copy if we are done with carries. The copying is skipped as well
// if 'x' and 'z' happen to share the same underlying storage.
// The overhead of the checking and branching is visible when 'z' are small (~5%),
// so set a threshold of 32, and remain the small-sized part entirely untouched.
//
//   R0 = remaining word count, R1 = x pointer, R2 = y (later reused for c),
//   R3 = z pointer, R10 = len&3 in the 'large' path.
TEXT ·addVW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R3
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R1
	MOVD	y+48(FP), R2
	CMP	$32, R0
	BGE	large		// large-sized 'z' and 'x'
	CBZ	R0, len0	// the length of z is 0; c = y (still in R2)
	MOVD.P	8(R1), R4
	ADDS	R2, R4		// z[0] = x[0] + y, set carry
	MOVD.P	R4, 8(R3)
	SUB	$1, R0
	CBZ	R0, len1	// the length of z is 1
	TBZ	$0, R0, two
	MOVD.P	8(R1), R4	// do it once
	ADCS	$0, R4
	MOVD.P	R4, 8(R3)
	SUB	$1, R0
two:				// do it twice
	TBZ	$1, R0, loop
	LDP.P	16(R1), (R4, R5)
	ADCS	$0, R4, R8	// c, z[i] = x[i] + c
	ADCS	$0, R5, R9
	STP.P	(R8, R9), 16(R3)
	SUB	$2, R0
loop:				// do four times per round
	vwOneIter(ADCS, R0, len1)
	B	loop
len1:
	CSET	HS, R2		// extract carry flag
len0:
	MOVD	R2, c+56(FP)
done:
	RET
large:
	AND	$0x3, R0, R10	// R10 = len % 4
	AND	$~0x3, R0	// R0 = len rounded down to a multiple of 4
	// unrolling for the first 1~4 elements to avoid saving the carry
	// flag in each step, adjust $R0 if we unrolled 4 elements
	vwPreIter(ADDS, ADCS, R10, add4)
	SUB	$4, R0
add4:
	BCC	copy		// carry clear: nothing left to propagate
	vwOneIter(ADCS, R0, len1)
	B	add4
copy:
	MOVD	ZR, c+56(FP)
	CMP	R1, R3		// both pointers advanced equally, so equal iff x and z alias
	BEQ	done
copy_4:				// no carry flag, copy the rest
	vwOneIterCopy(R0, done)
	B	copy_4

// func subVW(z, x []Word, y Word) (c Word)
// The 'large' branch handles large 'z'. It checks the carry flag on every iteration
// and switches to copy if we are done with carries. The copying is skipped as well
// if 'x' and 'z' happen to share the same underlying storage.
// The overhead of the checking and branching is visible when 'z' are small (~5%),
// so set a threshold of 32, and remain the small-sized part entirely untouched.
//
//   R0 = remaining word count, R1 = x pointer, R2 = y (later reused for c),
//   R3 = z pointer, R10 = len&3 in the 'large' path.
// The carry flag is the inverse of borrow: set means "no borrow".
TEXT ·subVW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R3
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R1
	MOVD	y+48(FP), R2
	CMP	$32, R0
	BGE	large		// large-sized 'z' and 'x'
	CBZ	R0, len0	// the length of z is 0; c = y (still in R2)
	MOVD.P	8(R1), R4
	SUBS	R2, R4		// z[0] = x[0] - y, set carry
	MOVD.P	R4, 8(R3)
	SUB	$1, R0
	CBZ	R0, len1	// the length of z is 1
	TBZ	$0, R0, two	// do it once
	MOVD.P	8(R1), R4
	SBCS	$0, R4
	MOVD.P	R4, 8(R3)
	SUB	$1, R0
two:				// do it twice
	TBZ	$1, R0, loop
	LDP.P	16(R1), (R4, R5)
	SBCS	$0, R4, R8	// c, z[i] = x[i] - c
	SBCS	$0, R5, R9
	STP.P	(R8, R9), 16(R3)
	SUB	$2, R0
loop:				// do four times per round
	vwOneIter(SBCS, R0, len1)
	B	loop
len1:
	CSET	LO, R2		// extract carry flag
len0:
	MOVD	R2, c+56(FP)
done:
	RET
large:
	AND	$0x3, R0, R10	// R10 = len % 4
	AND	$~0x3, R0	// R0 = len rounded down to a multiple of 4
	// unrolling for the first 1~4 elements to avoid saving the carry
	// flag in each step, adjust $R0 if we unrolled 4 elements
	vwPreIter(SUBS, SBCS, R10, sub4)
	SUB	$4, R0
sub4:
	BCS	copy		// carry set: no borrow left to propagate
	vwOneIter(SBCS, R0, len1)
	B	sub4
copy:
	MOVD	ZR, c+56(FP)
	CMP	R1, R3		// both pointers advanced equally, so equal iff x and z alias
	BEQ	done
copy_4:				// no carry flag, copy the rest
	vwOneIterCopy(R0, done)
	B	copy_4

// func shlVU(z, x []Word, s uint) (c Word)
// This implementation handles the shift operation from the high word to the low word,
// which may be an error for the case where the low word of x overlaps with the high
// word of z. When calling this function directly, you need to pay attention to this
// situation.
//
//   R0 = z pointer (starts at &z[n], pre-decremented), R1 = remaining count,
//   R2 = x pointer (starts at &x[n], pre-decremented),
//   R3 = s, R4 = 64 - s,
//   R5 = bits shifted out of x[n-1] (the return value),
//   R8 = (previously loaded word) << s, still waiting for the low bits
//        contributed by the next lower word.
TEXT ·shlVU(SB),NOSPLIT,$0
	LDP	z+0(FP), (R0, R1)	// R0 = z.ptr, R1 = len(z)
	MOVD	x+24(FP), R2
	MOVD	s+48(FP), R3
	ADD	R1<<3, R0	// R0 = &z[n]
	ADD	R1<<3, R2	// R2 = &x[n]
	CBZ	R1, len0
	CBZ	R3, copy	// if the number of shift is 0, just copy x to z
	MOVD	$64, R4
	SUB	R3, R4
	// handling the most significant element x[n-1]
	MOVD.W	-8(R2), R6
	LSR	R4, R6, R5	// return value
	LSL	R3, R6, R8	// x[i] << s
	SUB	$1, R1
	TBZ	$0, R1, two	// do it once if bit 0 of the remaining count is set
	MOVD.W	-8(R2), R6
	LSR	R4, R6, R7
	ORR	R8, R7
	LSL	R3, R6, R8
	SUB	$1, R1
	MOVD.W	R7, -8(R0)
two:
	TBZ	$1, R1, loop	// do it twice if bit 1 of the remaining count is set
	LDP.W	-16(R2), (R6, R7)
	LSR	R4, R7, R10
	ORR	R8, R10
	LSL	R3, R7
	LSR	R4, R6, R9
	ORR	R7, R9
	LSL	R3, R6, R8
	SUB	$2, R1
	STP.W	(R9, R10), -16(R0)
loop:
	CBZ	R1, done
	LDP.W	-32(R2), (R10, R11)
	LDP	16(R2), (R12, R13)	// upper pair of the block just stepped over
	LSR	R4, R13, R23
	ORR	R8, R23		// z[i] = (x[i] << s) | (x[i-1] >> (64 - s))
	LSL	R3, R13
	LSR	R4, R12, R22
	ORR	R13, R22
	LSL	R3, R12
	LSR	R4, R11, R21
	ORR	R12, R21
	LSL	R3, R11
	LSR	R4, R10, R20
	ORR	R11, R20
	LSL	R3, R10, R8
	STP.W	(R20, R21), -32(R0)
	STP	(R22, R23), 16(R0)
	SUB	$4, R1
	B	loop
done:
	MOVD.W	R8, -8(R0)	// the first element: z[0] = x[0] << s
	MOVD	R5, c+56(FP)	// the part moved out from x[n-1]
	RET
copy:
	CMP	R0, R2		// same end address: x and z alias, nothing to copy
	BEQ	len0
	TBZ	$0, R1, ctwo
	MOVD.W	-8(R2), R4
	MOVD.W	R4, -8(R0)
	SUB	$1, R1
ctwo:
	TBZ	$1, R1, cloop
	LDP.W	-16(R2), (R4, R5)
	STP.W	(R4, R5), -16(R0)
	SUB	$2, R1
cloop:
	CBZ	R1, len0
	LDP.W	-32(R2), (R4, R5)
	LDP	16(R2), (R6, R7)
	STP.W	(R4, R5), -32(R0)
	STP	(R6, R7), 16(R0)
	SUB	$4, R1
	B	cloop
len0:
	MOVD	$0, c+56(FP)
	RET

// func shrVU(z, x []Word, s uint) (c Word)
// This implementation handles the shift operation from the low word to the high word,
// which may be an error for the case where the high word of x overlaps with the low
// word of z. When calling this function directly, you need to pay attention to this
// situation.
//
//   R0 = z pointer (post-incremented), R1 = remaining count,
//   R2 = x pointer (post-incremented),
//   R3 = s, R4 = 64 - s,
//   R8 = (previously loaded word) >> s, still waiting for the high bits
//        contributed by the next higher word.
TEXT ·shrVU(SB),NOSPLIT,$0
	MOVD	z+0(FP), R0
	MOVD	z_len+8(FP), R1
	MOVD	x+24(FP), R2
	MOVD	s+48(FP), R3
	MOVD	$64, R4
	SUB	R3, R4
	CBZ	R1, len0
	CBZ	R3, copy	// if the number of shift is 0, just copy x to z

	MOVD.P	8(R2), R20
	LSR	R3, R20, R8
	LSL	R4, R20
	MOVD	R20, c+56(FP)	// deal with the first element: c = x[0] << (64 - s)
	SUB	$1, R1

	TBZ	$0, R1, two	// do it once if bit 0 of the remaining count is set
	MOVD.P	8(R2), R6
	LSL	R4, R6, R20
	ORR	R8, R20
	LSR	R3, R6, R8
	MOVD.P	R20, 8(R0)
	SUB	$1, R1
two:
	TBZ	$1, R1, loop	// do it twice if bit 1 of the remaining count is set
	LDP.P	16(R2), (R6, R7)
	LSL	R4, R6, R20
	LSR	R3, R6
	ORR	R8, R20
	LSL	R4, R7, R21
	LSR	R3, R7, R8
	ORR	R6, R21
	STP.P	(R20, R21), 16(R0)
	SUB	$2, R1
loop:
	CBZ	R1, done
	LDP.P	32(R2), (R10, R11)
	LDP	-16(R2), (R12, R13)	// second pair of the block just stepped over
	LSL	R4, R10, R20
	LSR	R3, R10
	ORR	R8, R20		// z[i] = (x[i] >> s) | (x[i+1] << (64 - s))
	LSL	R4, R11, R21
	LSR	R3, R11
	ORR	R10, R21
	LSL	R4, R12, R22
	LSR	R3, R12
	ORR	R11, R22
	LSL	R4, R13, R23
	LSR	R3, R13, R8
	ORR	R12, R23
	STP.P	(R20, R21), 32(R0)
	STP	(R22, R23), -16(R0)
	SUB	$4, R1
	B	loop
done:
	MOVD	R8, (R0)	// deal with the last element: z[n-1] = x[n-1] >> s
	RET
copy:
	CMP	R0, R2		// same start address: x and z alias, nothing to copy
	BEQ	len0
	TBZ	$0, R1, ctwo
	MOVD.P	8(R2), R3
	MOVD.P	R3, 8(R0)
	SUB	$1, R1
ctwo:
	TBZ	$1, R1, cloop
	LDP.P	16(R2), (R4, R5)
	STP.P	(R4, R5), 16(R0)
	SUB	$2, R1
cloop:
	CBZ	R1, len0
	LDP.P	32(R2), (R4, R5)
	LDP	-16(R2), (R6, R7)
	STP.P	(R4, R5), 32(R0)
	STP	(R6, R7), -16(R0)
	SUB	$4, R1
	B	cloop
len0:
	MOVD	$0, c+56(FP)
	RET


// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// z = x*y + r over len(z) words; the word carried out of the top is
// returned in c.
//   R0 = remaining word count, R1 = z pointer, R2 = x pointer,
//   R3 = y, R4 = running carry word (initially r).
// Each step adds the incoming carry word to the low product and folds the
// resulting carry flag into the high product, which becomes the next R4.
TEXT ·mulAddVWW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R1
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R2
	MOVD	y+48(FP), R3
	MOVD	r+56(FP), R4
	// c, z = x * y + r
	TBZ	$0, R0, two
	MOVD.P	8(R2), R5
	MUL	R3, R5, R7
	UMULH	R3, R5, R8
	ADDS	R4, R7
	ADC	$0, R8, R4	// c, z[i] = x[i] * y +  r
	MOVD.P	R7, 8(R1)
	SUB	$1, R0
two:
	TBZ	$1, R0, loop
	LDP.P	16(R2), (R5, R6)
	MUL	R3, R5, R10
	UMULH	R3, R5, R11
	ADDS	R4, R10
	MUL	R3, R6, R12
	UMULH	R3, R6, R13
	ADCS	R12, R11	// second word = hi(x[i]*y) + lo(x[i+1]*y) + carry
	ADC	$0, R13, R4

	STP.P	(R10, R11), 16(R1)
	SUB	$2, R0
loop:
	CBZ	R0, done
	LDP.P	32(R2), (R5, R6)
	LDP	-16(R2), (R7, R8)	// second pair of the block just stepped over

	MUL	R3, R5, R10
	UMULH	R3, R5, R11
	ADDS	R4, R10
	MUL	R3, R6, R12
	UMULH	R3, R6, R13
	ADCS	R11, R12

	MUL	R3, R7, R14
	UMULH	R3, R7, R15
	ADCS	R13, R14
	MUL	R3, R8, R16
	UMULH	R3, R8, R17
	ADCS	R15, R16
	ADC	$0, R17, R4

	STP.P	(R10, R12), 32(R1)
	STP	(R14, R16), -16(R1)
	SUB	$4, R0
	B	loop
done:
	MOVD	R4, c+64(FP)
	RET


// func addMulVVW(z, x []Word, y Word) (c Word)
//
// z += x*y over len(z) words; the word carried out of the top is
// returned in c.
//   R0 = remaining word count, R1 = z pointer, R2 = x pointer,
//   R3 = y, R4 = running carry word (initially 0).
TEXT ·addMulVVW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R1
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R2
	MOVD	y+48(FP), R3
	MOVD	$0, R4

	TBZ	$0, R0, two

	MOVD.P	8(R2), R5
	MOVD	(R1), R6

	MUL	R5, R3, R7
	UMULH	R5, R3, R8

	ADDS	R7, R6		// z[0] += lo(x[0]*y)
	ADC	$0, R8, R4	// carry word = hi(x[0]*y) + carry flag

	MOVD.P	R6, 8(R1)
	SUB	$1, R0

two:
	TBZ	$1, R0, loop

	LDP.P	16(R2), (R5, R10)
	LDP	(R1), (R6, R11)

	MUL	R10, R3, R13
	UMULH	R10, R3, R12

	MUL	R5, R3, R7
	UMULH	R5, R3, R8

	// first chain: add the incoming carry word and lo(x[i+1]*y)
	ADDS	R4, R6
	ADCS	R13, R11
	ADC	$0, R12

	// second chain: add lo(x[i]*y) and hi(x[i]*y)
	ADDS	R7, R6
	ADCS	R8, R11
	ADC	$0, R12, R4

	STP.P	(R6, R11), 16(R1)
	SUB	$2, R0

	// The main loop of this code operates on a block of 4 words every iteration
	// performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9]
	// where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next
	// 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z.
loop:
	CBZ	R0, done

	LDP.P	16(R2), (R5, R6)
	LDP.P	16(R2), (R7, R8)

	LDP	(R1), (R9, R10)
	ADDS	R4, R9
	MUL	R6, R3, R14
	ADCS	R14, R10
	MUL	R7, R3, R15
	LDP	16(R1), (R11, R12)
	ADCS	R15, R11
	MUL	R8, R3, R16
	ADCS	R16, R12
	UMULH	R8, R3, R20
	ADC	$0, R20

	MUL	R5, R3, R13
	ADDS	R13, R9
	UMULH	R5, R3, R17
	ADCS	R17, R10
	UMULH	R6, R3, R21
	STP.P	(R9, R10), 16(R1)
	ADCS	R21, R11
	UMULH	R7, R3, R19
	ADCS	R19, R12
	STP.P	(R11, R12), 16(R1)
	ADC	$0, R20, R4

	SUB	$4, R0
	B	loop

done:
	MOVD	R4, c+56(FP)
	RET