// Source: github.com/megatontech/mynoteforgo@v0.0.0-20200507084910-5d0c6ea6e890/源码/math/big/arith_arm64.s

// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !math_big_pure_go

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
//
// Conventions shared by the vector routines below:
//   - Slice arguments are laid out as (ptr, len, cap), 8 bytes each, so the
//     second slice of a signature starts at offset 24 and the third at 48.
//   - Only the length of z is read; x (and y) are assumed to hold at least
//     len(z) words (NOTE(review): nothing here checks that - confirm callers).
//   - Lengths are consumed as: one word if bit 0 of the remaining count is
//     set, two words if bit 1 is set, then four words per loop iteration.
//   - Carry chains live in the processor carry flag; the loop bookkeeping
//     uses SUB/CBZ/TBZ precisely because they leave that flag untouched.

// TODO: Consider re-implementing using Advanced SIMD
// once the assembler supports those instructions.

// func mulWW(x, y Word) (z1, z0 Word)
// Computes the double-word product x*y: z1 receives the high word (UMULH)
// and z0 the low word (MUL).
TEXT ·mulWW(SB),NOSPLIT,$0
	MOVD	x+0(FP), R0
	MOVD	y+8(FP), R1
	MUL	R0, R1, R2	// low 64 bits of the product
	UMULH	R0, R1, R3	// high 64 bits of the product
	MOVD	R3, z1+16(FP)
	MOVD	R2, z0+24(FP)
	RET


// func divWW(x1, x0, y Word) (q, r Word)
// Tail-jumps to divWW_g, which is defined outside this file.
TEXT ·divWW(SB),NOSPLIT,$0
	B	·divWW_g(SB)	// ARM64 has no multiword division


// func addVV(z, x, y []Word) (c Word)
// Stores x[i] + y[i] + carry into z[i] for i in [0, len(z)) and returns the
// final carry (0 or 1) in c.
TEXT ·addVV(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R9
	MOVD	z+0(FP), R10
	ADDS	$0, R0		// clear carry flag
	TBZ	$0, R0, two	// even count: no single leading word
	MOVD.P	8(R8), R11
	MOVD.P	8(R9), R15
	ADCS	R15, R11
	MOVD.P	R11, 8(R10)
	SUB	$1, R0
two:
	TBZ	$1, R0, loop	// count is a multiple of 4: go straight to the main loop
	LDP.P	16(R8), (R11, R12)
	LDP.P	16(R9), (R15, R16)
	ADCS	R15, R11
	ADCS	R16, R12
	STP.P	(R11, R12), 16(R10)
	SUB	$2, R0
loop:
	CBZ	R0, done	// careful not to touch the carry flag
	LDP.P	32(R8), (R11, R12)
	LDP	-16(R8), (R13, R14)	// pointer was already advanced by 32: these are words 2 and 3
	LDP.P	32(R9), (R15, R16)
	LDP	-16(R9), (R17, R19)
	ADCS	R15, R11
	ADCS	R16, R12
	ADCS	R17, R13
	ADCS	R19, R14
	STP.P	(R11, R12), 32(R10)
	STP	(R13, R14), -16(R10)
	SUB	$4, R0
	B	loop
done:
	CSET	HS, R0		// extract carry flag
	MOVD	R0, c+72(FP)
	RET


// func subVV(z, x, y []Word) (c Word)
// Stores x[i] - y[i] - borrow into z[i] for i in [0, len(z)) and returns the
// final borrow (0 or 1) in c.
TEXT ·subVV(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R9
	MOVD	z+0(FP), R10
	CMP	R0, R0		// set carry flag (carry set means "no borrow" for SBCS)
	TBZ	$0, R0, two
	MOVD.P	8(R8), R11
	MOVD.P	8(R9), R15
	SBCS	R15, R11
	MOVD.P	R11, 8(R10)
	SUB	$1, R0
two:
	TBZ	$1, R0, loop
	LDP.P	16(R8), (R11, R12)
	LDP.P	16(R9), (R15, R16)
	SBCS	R15, R11
	SBCS	R16, R12
	STP.P	(R11, R12), 16(R10)
	SUB	$2, R0
loop:
	CBZ	R0, done	// careful not to touch the carry flag
	LDP.P	32(R8), (R11, R12)
	LDP	-16(R8), (R13, R14)
	LDP.P	32(R9), (R15, R16)
	LDP	-16(R9), (R17, R19)
	SBCS	R15, R11
	SBCS	R16, R12
	SBCS	R17, R13
	SBCS	R19, R14
	STP.P	(R11, R12), 32(R10)
	STP	(R13, R14), -16(R10)
	SUB	$4, R0
	B	loop
done:
	CSET	LO, R0		// extract carry flag: carry clear means a borrow occurred
	MOVD	R0, c+72(FP)
	RET


// func addVW(z, x []Word, y Word) (c Word)
// Adds the single word y to the vector x, storing the result in z, and
// returns the final carry in c. When len(z) == 0 nothing is stored and
// c is y itself.
TEXT ·addVW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R3
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R1
	MOVD	y+48(FP), R2
	CBZ	R0, len0	// the length of z is 0
	MOVD.P	8(R1), R4
	ADDS	R2, R4		// z[0] = x[0] + y, set carry
	MOVD.P	R4, 8(R3)
	SUB	$1, R0
	CBZ	R0, len1	// the length of z is 1
	TBZ	$0, R0, two
	MOVD.P	8(R1), R4	// do it once
	ADCS	$0, R4
	MOVD.P	R4, 8(R3)
	SUB	$1, R0
two:				// do it twice
	TBZ	$1, R0, loop
	LDP.P	16(R1), (R4, R5)
	ADCS	$0, R4, R8	// c, z[i] = x[i] + c
	ADCS	$0, R5, R9
	STP.P	(R8, R9), 16(R3)
	SUB	$2, R0
loop:				// do four times per round
	CBZ	R0, len1	// careful not to touch the carry flag
	LDP.P	32(R1), (R4, R5)
	LDP	-16(R1), (R6, R7)
	ADCS	$0, R4, R8
	ADCS	$0, R5, R9
	ADCS	$0, R6, R10
	ADCS	$0, R7, R11
	STP.P	(R8, R9), 32(R3)
	STP	(R10, R11), -16(R3)
	SUB	$4, R0
	B	loop
len1:
	CSET	HS, R2		// extract carry flag
len0:
	MOVD	R2, c+56(FP)	// on the len0 path R2 still holds y
	RET

// func subVW(z, x []Word, y Word) (c Word)
// Subtracts the single word y from the vector x, storing the result in z,
// and returns the final borrow in c. When len(z) == 0 nothing is stored and
// c is y itself.
TEXT ·subVW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R3
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R1
	MOVD	y+48(FP), R2
	CBZ	R0, len0	// the length of z is 0
	MOVD.P	8(R1), R4
	SUBS	R2, R4		// z[0] = x[0] - y, set carry
	MOVD.P	R4, 8(R3)
	SUB	$1, R0
	CBZ	R0, len1	// the length of z is 1
	TBZ	$0, R0, two	// do it once
	MOVD.P	8(R1), R4
	SBCS	$0, R4
	MOVD.P	R4, 8(R3)
	SUB	$1, R0
two:				// do it twice
	TBZ	$1, R0, loop
	LDP.P	16(R1), (R4, R5)
	SBCS	$0, R4, R8	// c, z[i] = x[i] - c
	SBCS	$0, R5, R9
	STP.P	(R8, R9), 16(R3)
	SUB	$2, R0
loop:				// do four times per round
	CBZ	R0, len1	// careful not to touch the carry flag
	LDP.P	32(R1), (R4, R5)
	LDP	-16(R1), (R6, R7)
	SBCS	$0, R4, R8
	SBCS	$0, R5, R9
	SBCS	$0, R6, R10
	SBCS	$0, R7, R11
	STP.P	(R8, R9), 32(R3)
	STP	(R10, R11), -16(R3)
	SUB	$4, R0
	B	loop
len1:
	CSET	LO, R2		// extract carry flag: carry clear means a borrow occurred
len0:
	MOVD	R2, c+56(FP)	// on the len0 path R2 still holds y
	RET


// func shlVU(z, x []Word, s uint) (c Word)
// Shifts the vector x left by s bits into z and returns in c the bits shifted
// out of the most significant word, x[len(z)-1] >> (64 - s). For s == 0 the
// words are copied unchanged and c is 0; for len(z) == 0 nothing is stored
// and c is 0.
//
// The words are processed from the most significant one down to the least
// significant one. That order is what makes the routine safe when z and x
// overlap with z starting at the same or at a higher address than x (for
// example an in-place shift): every word of x is read before the store that
// could overwrite it. The earlier low-to-high implementation clobbered
// not-yet-read words of x in that situation and returned wrong results.
// The opposite overlap (z starting below x) is NOT supported.
//
// NOTE(review): s is presumably always in [0, 64) - nothing here enforces
// it; confirm against the callers.
//
// Register use: R0/R2 walk z/x downwards from one-past-the-end, R1 is the
// number of words still to load, R3 = s, R4 = 64 - s, R5 = return value,
// R8 = (most recently loaded word) << s, waiting to be combined with the
// top bits of the next lower word.
TEXT ·shlVU(SB),NOSPLIT,$0
	MOVD	z+0(FP), R0
	MOVD	z_len+8(FP), R1
	MOVD	x+24(FP), R2
	MOVD	s+48(FP), R3
	ADD	R1<<3, R0	// R0 = &z[n]
	ADD	R1<<3, R2	// R2 = &x[n]
	CBZ	R1, len0
	CBZ	R3, copy	// if the number of shift is 0, just copy x to z
	MOVD	$64, R4
	SUB	R3, R4		// R4 = 64 - s

	// the most significant element x[n-1]: its top bits are the result
	MOVD.W	-8(R2), R6
	LSR	R4, R6, R5	// return value
	LSL	R3, R6, R8	// x[n-1] << s, completed by the next lower word
	SUB	$1, R1

	TBZ	$0, R1, two
	MOVD.W	-8(R2), R6
	LSR	R4, R6, R7
	ORR	R8, R7		// z[i] = (x[i] << s) | (x[i-1] >> (64 - s))
	LSL	R3, R6, R8
	SUB	$1, R1
	MOVD.W	R7, -8(R0)
two:
	TBZ	$1, R1, loop
	LDP.W	-16(R2), (R6, R7)	// R6 = lower word, R7 = higher word
	LSR	R4, R7, R10
	ORR	R8, R10
	LSL	R3, R7
	LSR	R4, R6, R9
	ORR	R7, R9
	LSL	R3, R6, R8
	SUB	$2, R1
	STP.W	(R9, R10), -16(R0)
loop:
	CBZ	R1, done
	LDP.W	-32(R2), (R10, R11)	// all four loads happen before any store of this round
	LDP	16(R2), (R12, R13)
	LSR	R4, R13, R23
	ORR	R8, R23		// z[i] = (x[i] << s) | (x[i-1] >> (64 - s))
	LSL	R3, R13
	LSR	R4, R12, R22
	ORR	R13, R22
	LSL	R3, R12
	LSR	R4, R11, R21
	ORR	R12, R21
	LSL	R3, R11
	LSR	R4, R10, R20
	ORR	R11, R20
	LSL	R3, R10, R8
	STP.W	(R20, R21), -32(R0)
	STP	(R22, R23), 16(R0)
	SUB	$4, R1
	B	loop
done:
	MOVD.W	R8, -8(R0)	// the least significant element: z[0] = x[0] << s
	MOVD	R5, c+56(FP)	// the part moved out from the most significant element
	RET
copy:
	CMP	R0, R2
	BEQ	len0		// z and x are the same memory: nothing to copy
	TBZ	$0, R1, ctwo
	MOVD.W	-8(R2), R4
	MOVD.W	R4, -8(R0)
	SUB	$1, R1
ctwo:
	TBZ	$1, R1, cloop
	LDP.W	-16(R2), (R4, R5)
	STP.W	(R4, R5), -16(R0)
	SUB	$2, R1
cloop:
	CBZ	R1, len0
	LDP.W	-32(R2), (R4, R5)
	LDP	16(R2), (R6, R7)
	STP.W	(R4, R5), -32(R0)
	STP	(R6, R7), 16(R0)
	SUB	$4, R1
	B	cloop
len0:
	MOVD	$0, c+56(FP)
	RET


// func shrVU(z, x []Word, s uint) (c Word)
// Shifts the vector x right by s bits into z and returns in c the bits
// shifted out of the least significant word, x[0] << (64 - s). For s == 0
// the words are copied unchanged and c is 0; for len(z) == 0 nothing is
// stored and c is 0.
//
// The words are processed from the least significant one upwards, so an
// overlap of z and x is only safe when z starts at the same or at a lower
// address than x.
//
// Register use: R3 = s, R4 = 64 - s, R8 = (most recently loaded word) >> s,
// waiting to be combined with the low bits of the next higher word.
TEXT ·shrVU(SB),NOSPLIT,$0
	MOVD	z+0(FP), R0
	MOVD	z_len+8(FP), R1
	MOVD	x+24(FP), R2
	MOVD	s+48(FP), R3
	MOVD	$0, R8
	MOVD	$64, R4
	SUB	R3, R4
	CBZ	R1, len0
	CBZ	R3, copy	// if the number of shift is 0, just copy x to z

	MOVD.P	8(R2), R20
	LSR	R3, R20, R8
	LSL	R4, R20
	MOVD	R20, c+56(FP)	// deal with the first element
	SUB	$1, R1

	TBZ	$0, R1, two
	MOVD.P	8(R2), R6
	LSL	R4, R6, R20
	ORR	R8, R20
	LSR	R3, R6, R8
	MOVD.P	R20, 8(R0)
	SUB	$1, R1
two:
	TBZ	$1, R1, loop
	LDP.P	16(R2), (R6, R7)
	LSL	R4, R6, R20
	LSR	R3, R6
	ORR	R8, R20
	LSL	R4, R7, R21
	LSR	R3, R7, R8
	ORR	R6, R21
	STP.P	(R20, R21), 16(R0)
	SUB	$2, R1
loop:
	CBZ	R1, done
	LDP.P	32(R2), (R10, R11)
	LDP	-16(R2), (R12, R13)
	LSL	R4, R10, R20
	LSR	R3, R10
	ORR	R8, R20		// z[i] = (x[i] >> s) | (x[i+1] << (64 - s))
	LSL	R4, R11, R21
	LSR	R3, R11
	ORR	R10, R21
	LSL	R4, R12, R22
	LSR	R3, R12
	ORR	R11, R22
	LSL	R4, R13, R23
	LSR	R3, R13, R8
	ORR	R12, R23
	STP.P	(R20, R21), 32(R0)
	STP	(R22, R23), -16(R0)
	SUB	$4, R1
	B	loop
done:
	MOVD	R8, (R0)	// deal with the last element
	RET
copy:
	TBZ	$0, R1, ctwo
	MOVD.P	8(R2), R3	// s is 0 here, so R3 is free to serve as a temporary
	MOVD.P	R3, 8(R0)
	SUB	$1, R1
ctwo:
	TBZ	$1, R1, cloop
	LDP.P	16(R2), (R4, R5)
	STP.P	(R4, R5), 16(R0)
	SUB	$2, R1
cloop:
	CBZ	R1, len0
	LDP.P	32(R2), (R4, R5)
	LDP	-16(R2), (R6, R7)
	STP.P	(R4, R5), 32(R0)
	STP	(R6, R7), -16(R0)
	SUB	$4, R1
	B	cloop
len0:
	MOVD	$0, c+56(FP)
	RET


// func mulAddVWW(z, x []Word, y, r Word) (c Word)
// Stores the low word of x[i]*y + carry into z[i] for i in [0, len(z)),
// where the carry starts out as r and is afterwards the high word of the
// previous step. The final carry is returned in c.
TEXT ·mulAddVWW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R1
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R2
	MOVD	y+48(FP), R3
	MOVD	r+56(FP), R4
loop:
	CBZ	R0, done
	MOVD.P	8(R2), R5
	UMULH	R5, R3, R7	// high word of x[i]*y
	MUL	R5, R3, R6	// low word of x[i]*y
	ADDS	R4, R6		// add the incoming carry word
	ADC	$0, R7		// propagate the overflow into the high word
	MOVD.P	R6, 8(R1)
	MOVD	R7, R4		// the high word is the carry for the next element
	SUB	$1, R0
	B	loop
done:
	MOVD	R4, c+64(FP)
	RET


// func addMulVVW(z, x []Word, y Word) (c Word)
// Adds x[i]*y plus the running carry to z[i] in place for i in [0, len(z))
// and returns the final carry word in c.
TEXT ·addMulVVW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R1
	MOVD	z_len+8(FP), R0
	MOVD	x+24(FP), R2
	MOVD	y+48(FP), R3
	MOVD	$0, R4		// running carry word

	TBZ	$0, R0, two

	MOVD.P	8(R2), R5
	MOVD	(R1), R6

	MUL	R5, R3, R7
	UMULH	R5, R3, R8

	ADDS	R7, R6
	ADC	$0, R8, R4

	MOVD.P	R6, 8(R1)
	SUB	$1, R0

two:
	TBZ	$1, R0, loop

	LDP.P	16(R2), (R5, R10)
	LDP	(R1), (R6, R11)

	MUL	R10, R3, R13
	UMULH	R10, R3, R12

	MUL	R5, R3, R7
	UMULH	R5, R3, R8

	// first chain: incoming carry and the low word of the second product
	ADDS	R4, R6
	ADCS	R13, R11
	ADC	$0, R12

	// second chain: both words of the first product
	ADDS	R7, R6
	ADCS	R8, R11
	ADC	$0, R12, R4

	STP.P	(R6, R11), 16(R1)
	SUB	$2, R0

// The main loop of this code operates on a block of 4 words every iteration
// performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9]
// where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next
// 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z.
loop:
	CBZ	R0, done

	LDP.P	16(R2), (R5, R6)
	LDP.P	16(R2), (R7, R8)

	LDP	(R1), (R9, R10)
	ADDS	R4, R9
	MUL	R6, R3, R14
	ADCS	R14, R10
	MUL	R7, R3, R15
	LDP	16(R1), (R11, R12)
	ADCS	R15, R11
	MUL	R8, R3, R16
	ADCS	R16, R12
	UMULH	R8, R3, R20
	ADC	$0, R20

	MUL	R5, R3, R13
	ADDS	R13, R9
	UMULH	R5, R3, R17
	ADCS	R17, R10
	UMULH	R6, R3, R21
	STP.P	(R9, R10), 16(R1)
	ADCS	R21, R11
	UMULH	R7, R3, R19
	ADCS	R19, R12
	STP.P	(R11, R12), 16(R1)
	ADC	$0, R20, R4

	SUB	$4, R0
	B	loop

done:
	MOVD	R4, c+56(FP)
	RET

// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
// Tail-jumps to divWVW_g, which is defined outside this file.
TEXT ·divWVW(SB),NOSPLIT,$0
	B	·divWVW_g(SB)