// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !math_big_pure_go

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.

// TODO: Consider re-implementing using Advanced SIMD
// once the assembler supports those instructions.

// func mulWW(x, y Word) (z1, z0 Word)
//
// mulWW returns the full 128-bit product of x and y:
// z1 is the high 64 bits (UMULH) and z0 is the low 64 bits (MUL).
TEXT ·mulWW(SB),NOSPLIT,$0
	MOVD	x+0(FP), R0
	MOVD	y+8(FP), R1
	MUL	R0, R1, R2		// R2 = low word of x*y
	UMULH	R0, R1, R3		// R3 = high word of x*y
	MOVD	R3, z1+16(FP)
	MOVD	R2, z0+24(FP)
	RET


// func divWW(x1, x0, y Word) (q, r Word)
//
// divWW has no assembly implementation in this file; it branches straight to
// divWW_g, which is defined elsewhere (presumably the generic Go version in
// arith.go; confirm against that file).
TEXT ·divWW(SB),NOSPLIT,$0
	B	·divWW_g(SB) // ARM64 has no multiword division


// func addVV(z, x, y []Word) (c Word)
//
// addVV sets z[i] = x[i] + y[i] + carry-in for every i in [0, len(z)), where
// the carry-in of word 0 is 0 and each later word receives the carry out of
// the previous one. The final carry (0 or 1) is returned in c.
// Only len(z) is read; the lengths of x and y are not checked here.
//
// Registers: R0 = words remaining, R8 = x pointer, R9 = y pointer,
// R10 = z pointer. The count is consumed as one word (if bit 0 is set), then
// two words (if bit 1 is set), then four words per loop iteration.
// The running carry lives in the processor carry flag from ADDS until CSET,
// so no flag-setting instruction may appear in between.
TEXT ·addVV(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R9
	MOVD	z+0(FP), R10
	ADDS	$0, R0		// clear carry flag
	TBZ	$0, R0, two	// bit 0 of the count clear: skip the one-word step
	MOVD.P	8(R8), R11	// load x word, post-increment pointer
	MOVD.P	8(R9), R15	// load y word, post-increment pointer
	ADCS	R15, R11
	MOVD.P	R11, 8(R10)	// store z word, post-increment pointer
	SUB	$1, R0
two:
	TBZ	$1, R0, loop	// bit 1 of the count clear: skip the two-word step
	LDP.P	16(R8), (R11, R12)
	LDP.P	16(R9), (R15, R16)
	ADCS	R15, R11
	ADCS	R16, R12
	STP.P	(R11, R12), 16(R10)
	SUB	$2, R0
loop:
	CBZ	R0, done	// careful not to touch the carry flag
	LDP.P	32(R8), (R11, R12)	// words 0,1 of this group; pointer advances by all 4 words
	LDP	-16(R8), (R13, R14)	// words 2,1+2 of this group, addressed back from the advanced pointer
	LDP.P	32(R9), (R15, R16)
	LDP	-16(R9), (R17, R19)
	ADCS	R15, R11
	ADCS	R16, R12
	ADCS	R17, R13
	ADCS	R19, R14
	STP.P	(R11, R12), 32(R10)
	STP	(R13, R14), -16(R10)
	SUB	$4, R0
	B	loop
done:
	CSET	HS, R0		// extract carry flag
	MOVD	R0, c+72(FP)
	RET


// func subVV(z, x, y []Word) (c Word)
//
// subVV sets z[i] = x[i] - y[i] - borrow-in for every i in [0, len(z)), with
// no borrow into word 0, and returns the final borrow (0 or 1) in c.
// Only len(z) is read; the lengths of x and y are not checked here.
//
// Same structure and register roles as addVV: R0 = words remaining,
// R8 = x pointer, R9 = y pointer, R10 = z pointer; steps of 1, 2, then 4
// words. SBCS treats a set carry flag as "no borrow", which is why the flag
// is set up front and the result is taken from the LO (carry clear)
// condition at the end.
TEXT ·subVV(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R9
	MOVD	z+0(FP), R10
	CMP	R0, R0		// set carry flag
	TBZ	$0, R0, two	// bit 0 of the count clear: skip the one-word step
	MOVD.P	8(R8), R11
	MOVD.P	8(R9), R15
	SBCS	R15, R11
	MOVD.P	R11, 8(R10)
	SUB	$1, R0
two:
	TBZ	$1, R0, loop	// bit 1 of the count clear: skip the two-word step
	LDP.P	16(R8), (R11, R12)
	LDP.P	16(R9), (R15, R16)
	SBCS	R15, R11
	SBCS	R16, R12
	STP.P	(R11, R12), 16(R10)
	SUB	$2, R0
loop:
	CBZ	R0, done	// careful not to touch the carry flag
	LDP.P	32(R8), (R11, R12)
	LDP	-16(R8), (R13, R14)
	LDP.P	32(R9), (R15, R16)
	LDP	-16(R9), (R17, R19)
	SBCS	R15, R11
	SBCS	R16, R12
	SBCS	R17, R13
	SBCS	R19, R14
	STP.P	(R11, R12), 32(R10)
	STP	(R13, R14), -16(R10)
	SUB	$4, R0
	B	loop
done:
	CSET	LO, R0		// extract carry flag
	MOVD	R0, c+72(FP)
	RET


// func addVW(z, x []Word, y Word) (c Word)
//
// addVW adds the single word y to the vector x: z[0] = x[0] + y, and every
// later z[i] = x[i] + carry from the previous word. The final carry (0 or 1)
// is returned in c. When len(z) is 0 nothing is loaded or stored and y
// itself is returned as c.
// Only len(z) is read; the length of x is not checked here.
//
// Registers: R0 = words remaining, R1 = x pointer, R3 = z pointer, R2 = y
// (reused for the return value). After the first word the carry lives in the
// carry flag until CSET, so no flag-setting instruction may appear in between.
TEXT ·addVW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R3
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R1
	MOVD	y+48(FP), R2
	CBZ	R0, len0	// the length of z is 0
	MOVD.P	8(R1), R4
	ADDS	R2, R4		// z[0] = x[0] + y, set carry
	MOVD.P	R4, 8(R3)
	SUB	$1, R0
	CBZ	R0, len1	// the length of z is 1
	TBZ	$0, R0, two
	MOVD.P	8(R1), R4	// do it once
	ADCS	$0, R4
	MOVD.P	R4, 8(R3)
	SUB	$1, R0
two:				// do it twice
	TBZ	$1, R0, loop
	LDP.P	16(R1), (R4, R5)
	ADCS	$0, R4, R8	// c, z[i] = x[i] + c
	ADCS	$0, R5, R9
	STP.P	(R8, R9), 16(R3)
	SUB	$2, R0
loop:				// do four times per round
	CBZ	R0, len1	// careful not to touch the carry flag
	LDP.P	32(R1), (R4, R5)
	LDP	-16(R1), (R6, R7)
	ADCS	$0, R4, R8
	ADCS	$0, R5, R9
	ADCS	$0, R6, R10
	ADCS	$0, R7, R11
	STP.P	(R8, R9), 32(R3)
	STP	(R10, R11), -16(R3)
	SUB	$4, R0
	B	loop
len1:
	CSET	HS, R2		// extract carry flag
len0:
	MOVD	R2, c+56(FP)	// on the len0 path R2 still holds y
	RET

// func subVW(z, x []Word, y Word) (c Word)
//
// subVW subtracts the single word y from the vector x: z[0] = x[0] - y, and
// every later z[i] = x[i] - borrow from the previous word. The final borrow
// (0 or 1) is returned in c. When len(z) is 0 nothing is loaded or stored and
// y itself is returned as c.
// Only len(z) is read; the length of x is not checked here.
//
// Register roles match addVW: R0 = words remaining, R1 = x pointer,
// R3 = z pointer, R2 = y (reused for the return value). The borrow is held
// in the carry flag (carry clear = borrow) until CSET.
TEXT ·subVW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R3
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R1
	MOVD	y+48(FP), R2
	CBZ	R0, len0	// the length of z is 0
	MOVD.P	8(R1), R4
	SUBS	R2, R4		// z[0] = x[0] - y, set carry
	MOVD.P	R4, 8(R3)
	SUB	$1, R0
	CBZ	R0, len1	// the length of z is 1
	TBZ	$0, R0, two	// do it once
	MOVD.P	8(R1), R4
	SBCS	$0, R4
	MOVD.P	R4, 8(R3)
	SUB	$1, R0
two:				// do it twice
	TBZ	$1, R0, loop
	LDP.P	16(R1), (R4, R5)
	SBCS	$0, R4, R8	// c, z[i] = x[i] - c
	SBCS	$0, R5, R9
	STP.P	(R8, R9), 16(R3)
	SUB	$2, R0
loop:				// do four times per round
	CBZ	R0, len1	// careful not to touch the carry flag
	LDP.P	32(R1), (R4, R5)
	LDP	-16(R1), (R6, R7)
	SBCS	$0, R4, R8
	SBCS	$0, R5, R9
	SBCS	$0, R6, R10
	SBCS	$0, R7, R11
	STP.P	(R8, R9), 32(R3)
	STP	(R10, R11), -16(R3)
	SUB	$4, R0
	B	loop
len1:
	CSET	LO, R2		// extract carry flag
len0:
	MOVD	R2, c+56(FP)	// on the len0 path R2 still holds y
	RET

// func shlVU(z, x []Word, s uint) (c Word)
// This implementation handles the shift operation from the high word to the low word,
// which may be an error for the case where the low word of x overlaps with the high
// word of z. When calling this function directly, you need to pay attention to this
// situation.
TEXT ·shlVU(SB),NOSPLIT,$0
	// Computes z = x << s over len(z) words, walking from the most
	// significant word down to word 0, and returns in c the bits shifted
	// out of the top word, x[n-1] >> (64 - s).
	// If len(z) is 0, or s is 0, c is 0; for s == 0 the words of x are
	// copied to z unless both already start at the same address.
	//
	// Registers: R0 / R2 = z / x pointers, started one past the last word
	// and pre-decremented; R1 = words remaining; R3 = s; R4 = 64 - s;
	// R5 = return value; R8 = (word just read) << s, still waiting to be
	// OR-ed with the top bits of the next lower word.
	LDP	z+0(FP), (R0, R1)	// R0 = z.ptr, R1 = len(z)
	MOVD	x+24(FP), R2
	MOVD	s+48(FP), R3
	ADD	R1<<3, R0	// R0 = &z[n]
	ADD	R1<<3, R2	// R2 = &x[n]
	CBZ	R1, len0
	CBZ	R3, copy	// if the number of shift is 0, just copy x to z
	MOVD	$64, R4
	SUB	R3, R4
	// handling the most significant element x[n-1]
	MOVD.W	-8(R2), R6
	LSR	R4, R6, R5	// return value
	LSL	R3, R6, R8	// x[i] << s
	SUB	$1, R1
	// Note: nothing in this function branches to the label "one".
one:	TBZ	$0, R1, two
	MOVD.W	-8(R2), R6
	LSR	R4, R6, R7
	ORR	R8, R7
	LSL	R3, R6, R8
	SUB	$1, R1
	MOVD.W	R7, -8(R0)
two:
	TBZ	$1, R1, loop
	LDP.W	-16(R2), (R6, R7)
	LSR	R4, R7, R10
	ORR	R8, R10
	LSL	R3, R7
	LSR	R4, R6, R9
	ORR	R7, R9
	LSL	R3, R6, R8
	SUB	$2, R1
	STP.W	(R9, R10), -16(R0)
loop:
	CBZ	R1, done
	LDP.W	-32(R2), (R10, R11)	// step x back 4 words; load the two lowest of the group
	LDP	16(R2), (R12, R13)	// load the two highest of the group
	LSR	R4, R13, R23
	ORR	R8, R23		// z[i] = (x[i] << s) | (x[i-1] >> (64 - s))
	LSL	R3, R13
	LSR	R4, R12, R22
	ORR	R13, R22
	LSL	R3, R12
	LSR	R4, R11, R21
	ORR	R12, R21
	LSL	R3, R11
	LSR	R4, R10, R20
	ORR	R11, R20
	LSL	R3, R10, R8	// carry the lowest word's shifted bits into the next round
	STP.W	(R20, R21), -32(R0)
	STP	(R22, R23), 16(R0)
	SUB	$4, R1
	B	loop
done:
	MOVD.W	R8, -8(R0)	// the first element x[0]
	MOVD	R5, c+56(FP)	// the part moved out from x[n-1]
	RET
copy:
	CMP	R0, R2
	BEQ	len0		// z and x end at the same address: nothing to copy
	TBZ	$0, R1, ctwo
	MOVD.W	-8(R2), R4
	MOVD.W	R4, -8(R0)
	SUB	$1, R1
ctwo:
	TBZ	$1, R1, cloop
	LDP.W	-16(R2), (R4, R5)
	STP.W	(R4, R5), -16(R0)
	SUB	$2, R1
cloop:
	CBZ	R1, len0
	LDP.W	-32(R2), (R4, R5)
	LDP	16(R2), (R6, R7)
	STP.W	(R4, R5), -32(R0)
	STP	(R6, R7), 16(R0)
	SUB	$4, R1
	B	cloop
len0:
	MOVD	$0, c+56(FP)
	RET

// func shrVU(z, x []Word, s uint) (c Word)
// This implementation handles the shift operation from the low word to the high word,
// which may be an error for the case where the high word of x overlaps with the low
// word of z. When calling this function directly, you need to pay attention to this
// situation.
//
// shrVU computes z = x >> s over len(z) words, walking from word 0 upwards,
// and returns in c the bits shifted out of the bottom word, x[0] << (64 - s).
// If len(z) is 0, or s is 0, c is 0; for s == 0 the words of x are copied to
// z unless both already start at the same address.
//
// Registers: R0 / R2 = z / x pointers (post-incremented); R1 = words
// remaining; R3 = s; R4 = 64 - s; R8 = (word just read) >> s, still waiting
// to be OR-ed with the low bits of the next higher word.
TEXT ·shrVU(SB),NOSPLIT,$0
	MOVD	z+0(FP), R0
	MOVD	z_len+8(FP), R1
	MOVD	x+24(FP), R2
	MOVD	s+48(FP), R3
	MOVD	$0, R8
	MOVD	$64, R4
	SUB	R3, R4
	CBZ	R1, len0
	CBZ	R3, copy	// if the number of shift is 0, just copy x to z

	MOVD.P	8(R2), R20
	LSR	R3, R20, R8
	LSL	R4, R20
	MOVD	R20, c+56(FP)	// deal with the first element
	SUB	$1, R1

	TBZ	$0, R1, two
	MOVD.P	8(R2), R6
	LSL	R4, R6, R20
	ORR	R8, R20
	LSR	R3, R6, R8
	MOVD.P	R20, 8(R0)
	SUB	$1, R1
two:
	TBZ	$1, R1, loop
	LDP.P	16(R2), (R6, R7)
	LSL	R4, R6, R20
	LSR	R3, R6
	ORR	R8, R20
	LSL	R4, R7, R21
	LSR	R3, R7, R8
	ORR	R6, R21
	STP.P	(R20, R21), 16(R0)
	SUB	$2, R1
loop:
	CBZ	R1, done
	LDP.P	32(R2), (R10, R11)
	LDP	-16(R2), (R12, R13)
	LSL	R4, R10, R20
	LSR	R3, R10
	ORR	R8, R20		// z[i] = (x[i] >> s) | (x[i+1] << (64 - s))
	LSL	R4, R11, R21
	LSR	R3, R11
	ORR	R10, R21
	LSL	R4, R12, R22
	LSR	R3, R12
	ORR	R11, R22
	LSL	R4, R13, R23
	LSR	R3, R13, R8	// carry the highest word's shifted bits into the next round
	ORR	R12, R23
	STP.P	(R20, R21), 32(R0)
	STP	(R22, R23), -16(R0)
	SUB	$4, R1
	B	loop
done:
	MOVD	R8, (R0)	// deal with the last element
	RET
copy:
	CMP	R0, R2
	BEQ	len0		// z and x start at the same address: nothing to copy
	TBZ	$0, R1, ctwo
	MOVD.P	8(R2), R3
	MOVD.P	R3, 8(R0)
	SUB	$1, R1
ctwo:
	TBZ	$1, R1, cloop
	LDP.P	16(R2), (R4, R5)
	STP.P	(R4, R5), 16(R0)
	SUB	$2, R1
cloop:
	CBZ	R1, len0
	LDP.P	32(R2), (R4, R5)
	LDP	-16(R2), (R6, R7)
	STP.P	(R4, R5), 32(R0)
	STP	(R6, R7), -16(R0)
	SUB	$4, R1
	B	cloop
len0:
	MOVD	$0, c+56(FP)
	RET


// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// mulAddVWW computes z = x*y + r over len(z) words: each z[i] is the low
// word of x[i]*y plus the carry word from the previous position (r for
// word 0), and the final carry word is returned in c. When len(z) is 0,
// c is r. Only len(z) is read; the length of x is not checked here.
//
// Registers: R0 = words remaining, R1 = z pointer, R2 = x pointer, R3 = y,
// R4 = running carry word (r on entry, c on exit). Words are consumed in
// steps of 1 (if bit 0 of the count is set), 2 (if bit 1 is set), then 4 per
// loop iteration.
TEXT ·mulAddVWW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R1
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R2
	MOVD	y+48(FP), R3
	MOVD	r+56(FP), R4
	// c, z = x * y + r
	TBZ	$0, R0, two
	MOVD.P	8(R2), R5
	MUL	R3, R5, R7
	UMULH	R3, R5, R8
	ADDS	R4, R7
	ADC	$0, R8, R4	// c, z[i] = x[i] * y +  r
	MOVD.P	R7, 8(R1)
	SUB	$1, R0
two:
	TBZ	$1, R0, loop
	LDP.P	16(R2), (R5, R6)
	MUL	R3, R5, R10
	UMULH	R3, R5, R11
	ADDS	R4, R10
	MUL	R3, R6, R12
	UMULH	R3, R6, R13
	ADCS	R12, R11	// second result word = high(x[i]*y) + low(x[i+1]*y) + carry
	ADC	$0, R13, R4

	STP.P	(R10, R11), 16(R1)
	SUB	$2, R0
loop:
	CBZ	R0, done
	LDP.P	32(R2), (R5, R6)
	LDP	-16(R2), (R7, R8)

	// Each result word is low(this product) + high(previous product) + carry;
	// MUL and UMULH leave the flags alone, so the carry chain started by
	// ADDS survives the interleaving.
	MUL	R3, R5, R10
	UMULH	R3, R5, R11
	ADDS	R4, R10
	MUL	R3, R6, R12
	UMULH	R3, R6, R13
	ADCS	R11, R12

	MUL	R3, R7, R14
	UMULH	R3, R7, R15
	ADCS	R13, R14
	MUL	R3, R8, R16
	UMULH	R3, R8, R17
	ADCS	R15, R16
	ADC	$0, R17, R4	// carry word for the next group

	STP.P	(R10, R12), 32(R1)
	STP	(R14, R16), -16(R1)
	SUB	$4, R0
	B	loop
done:
	MOVD	R4, c+64(FP)
	RET


// func addMulVVW(z, x []Word, y Word) (c Word)
//
// addMulVVW computes z += x*y over len(z) words and returns the final carry
// word in c (0 when len(z) is 0). Only len(z) is read; the length of x is
// not checked here.
//
// Registers: R0 = words remaining, R1 = z pointer, R2 = x pointer, R3 = y,
// R4 = running carry word. Words are consumed in steps of 1 (if bit 0 of the
// count is set), 2 (if bit 1 is set), then 4 per loop iteration.
TEXT ·addMulVVW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R1
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R2
	MOVD	y+48(FP), R3
	MOVD	$0, R4

	TBZ	$0, R0, two

	MOVD.P	8(R2), R5
	MOVD	(R1), R6

	MUL	R5, R3, R7
	UMULH	R5, R3, R8

	ADDS	R7, R6		// z[0] += low(x[0]*y)
	ADC	$0, R8, R4	// carry word = high(x[0]*y) + carry

	MOVD.P	R6, 8(R1)
	SUB	$1, R0

two:
	TBZ	$1, R0, loop

	LDP.P	16(R2), (R5, R10)
	LDP	(R1), (R6, R11)

	MUL	R10, R3, R13
	UMULH	R10, R3, R12

	MUL	R5, R3, R7
	UMULH	R5, R3, R8

	// First pass: add the incoming carry word and the low half of the
	// second product; R12 collects the top word.
	ADDS	R4, R6
	ADCS	R13, R11
	ADC	$0, R12

	// Second pass: add both halves of the first product.
	ADDS	R7, R6
	ADCS	R8, R11
	ADC	$0, R12, R4

	STP.P	(R6, R11), 16(R1)
	SUB	$2, R0

// The main loop of this code operates on a block of 4 words every iteration
// performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9]
// where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next
// 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z.
loop:
	CBZ	R0, done

	LDP.P	16(R2), (R5, R6)
	LDP.P	16(R2), (R7, R8)

	// First carry chain: incoming carry word plus the low halves of the
	// products of x words 1..3; R20 collects the top word.
	LDP	(R1), (R9, R10)
	ADDS	R4, R9
	MUL	R6, R3, R14
	ADCS	R14, R10
	MUL	R7, R3, R15
	LDP	16(R1), (R11, R12)
	ADCS	R15, R11
	MUL	R8, R3, R16
	ADCS	R16, R12
	UMULH	R8, R3, R20
	ADC	$0, R20

	// Second carry chain: low half of the product of x word 0 plus the
	// high halves of the products of x words 0..2. The stores are
	// interleaved; they do not affect the carry flag.
	MUL	R5, R3, R13
	ADDS	R13, R9
	UMULH	R5, R3, R17
	ADCS	R17, R10
	UMULH	R6, R3, R21
	STP.P	(R9, R10), 16(R1)
	ADCS	R21, R11
	UMULH	R7, R3, R19
	ADCS	R19, R12
	STP.P	(R11, R12), 16(R1)
	ADC	$0, R20, R4	// carry word for the next block

	SUB	$4, R0
	B	loop

done:
	MOVD	R4, c+56(FP)
	RET

// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
//
// divWVW has no assembly implementation in this file; it branches straight
// to divWVW_g, which is defined elsewhere (presumably the generic Go version
// in arith.go; confirm against that file).
TEXT ·divWVW(SB),NOSPLIT,$0
	B	·divWVW_g(SB)