// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Source: github.com/c9s/go@v0.0.0-20180120015821-984e81f64e0c/src/math/big/arith_amd64.s

// +build !math_big_pure_go

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.

// func mulWW(x, y Word) (z1, z0 Word)
//
// Full-width unsigned product of two words: MULQ multiplies AX by its
// operand and leaves the result in DX:AX, so z1 receives the high word (DX)
// and z0 the low word (AX).
TEXT ·mulWW(SB),NOSPLIT,$0
	MOVQ x+0(FP), AX
	MULQ y+8(FP)		// DX:AX = x * y
	MOVQ AX, z0+24(FP)	// low word
	MOVQ DX, z1+16(FP)	// high word
	RET


// func divWW(x1, x0, y Word) (q, r Word)
//
// Divides the two-word value x1:x0 (x1 is the high word) by y: DIVQ takes
// the dividend in DX:AX and leaves the quotient in AX and the remainder in DX.
// NOTE(review): no check is made here that y != 0 or that the quotient fits
// in one word; presumably callers guarantee x1 < y - confirm against arith.go.
TEXT ·divWW(SB),NOSPLIT,$0
	MOVQ x0+8(FP), AX	// low word of the dividend
	MOVQ x1+0(FP), DX	// high word of the dividend
	DIVQ y+16(FP)		// AX = quotient, DX = remainder
	MOVQ AX, q+24(FP)
	MOVQ DX, r+32(FP)
	RET

// Carry handling used by the vector routines in this file:
// SBBQ Rx, Rx saves the carry flag (Rx becomes -1 if CF was set, 0 otherwise)
// and ADDQ Rx, Rx restores it (CF is set if Rx was -1, cleared if it was 0).
// This is faster than using rotate instructions.

// func addVV(z, x, y []Word) (c Word)
//
// z[i] = x[i] + y[i] + carry for i = 0 .. len(z)-1; c is the final carry
// (0 or 1). Only len(z) is read, so x and y are presumably at least that
// long - the caller is responsible for it.
//
// Registers: R10 = &z[0], R8 = &x[0], R9 = &y[0], SI = i, DI = words left,
// CX = saved carry (0 or -1 between the add chains).
TEXT ·addVV(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9

	MOVQ $0, SI		// i = 0
	MOVQ $0, CX		// no carry yet

	// Replace the JL below with JMP to bypass the 4x unrolled loop.
	SUBQ $4, DI		// n -= 4
	JL addtail		// fewer than 4 words: skip the unrolled loop

add4:	// at least 4 words remain
	ADDQ CX, CX		// CF = saved carry
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	ADCQ 0(R9)(SI*8), R11
	ADCQ 8(R9)(SI*8), R12
	ADCQ 16(R9)(SI*8), R13
	ADCQ 24(R9)(SI*8), R14
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	SBBQ CX, CX		// CX = -CF (the index updates below clobber CF)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE add4		// another full group of 4 is available

addtail:
	ADDQ $4, DI		// undo the bias: n = words still to do (0..3)
	JLE addret		// nothing left

add1:	// one word per iteration
	ADDQ CX, CX		// CF = saved carry
	MOVQ 0(R8)(SI*8), R11
	ADCQ 0(R9)(SI*8), R11
	MOVQ R11, 0(R10)(SI*8)
	SBBQ CX, CX		// CX = -CF

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG add1			// more words remain

addret:
	NEGQ CX			// -1/0 -> 1/0
	MOVQ CX, c+72(FP)	// return c
	RET


// func subVV(z, x, y []Word) (c Word)
//
// z[i] = x[i] - y[i] - borrow for i = 0 .. len(z)-1; c is the final borrow
// (0 or 1). Structurally identical to addVV with SBBQ in place of ADCQ.
//
// Registers: R10 = &z[0], R8 = &x[0], R9 = &y[0], SI = i, DI = words left,
// CX = saved borrow (0 or -1 between the subtract chains).
TEXT ·subVV(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9

	MOVQ $0, SI		// i = 0
	MOVQ $0, CX		// no borrow yet

	// Replace the JL below with JMP to bypass the 4x unrolled loop.
	SUBQ $4, DI		// n -= 4
	JL subtail		// fewer than 4 words: skip the unrolled loop

sub4:	// at least 4 words remain
	ADDQ CX, CX		// CF = saved borrow
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	SBBQ 0(R9)(SI*8), R11
	SBBQ 8(R9)(SI*8), R12
	SBBQ 16(R9)(SI*8), R13
	SBBQ 24(R9)(SI*8), R14
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	SBBQ CX, CX		// CX = -CF (the index updates below clobber CF)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE sub4		// another full group of 4 is available

subtail:
	ADDQ $4, DI		// undo the bias: n = words still to do (0..3)
	JLE subret		// nothing left

sub1:	// one word per iteration
	ADDQ CX, CX		// CF = saved borrow
	MOVQ 0(R8)(SI*8), R11
	SBBQ 0(R9)(SI*8), R11
	MOVQ R11, 0(R10)(SI*8)
	SBBQ CX, CX		// CX = -CF

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG sub1			// more words remain

subret:
	NEGQ CX			// -1/0 -> 1/0
	MOVQ CX, c+72(FP)	// return c
	RET


// func addVW(z, x []Word, y Word) (c Word)
//
// Adds the single word y to the vector x: z[i] = x[i] + c, where c starts
// out as y and afterwards is the carry (0 or 1) out of the previous word.
// The final c is returned. Only len(z) is read.
//
// Registers: R10 = &z[0], R8 = &x[0], SI = i, DI = words left, CX = c.
TEXT ·addVW(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), CX	// c = y

	MOVQ $0, SI		// i = 0

	// Replace the JL below with JMP to bypass the 4x unrolled loop.
	SUBQ $4, DI		// n -= 4
	JL addwtail		// fewer than 4 words: skip the unrolled loop

addw4:	// at least 4 words remain
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	ADDQ CX, R11		// x[i] + c
	ADCQ $0, R12		// ripple the carry through the next three words
	ADCQ $0, R13
	ADCQ $0, R14
	SBBQ CX, CX		// CX = -CF
	NEGQ CX			// c = 0 or 1
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE addw4		// another full group of 4 is available

addwtail:
	ADDQ $4, DI		// undo the bias: n = words still to do (0..3)
	JLE addwret		// nothing left

addw1:	// one word per iteration
	ADDQ 0(R8)(SI*8), CX	// c += x[i]
	MOVQ CX, 0(R10)(SI*8)	// z[i] = low word of the sum
	SBBQ CX, CX		// CX = -CF
	NEGQ CX			// c = 0 or 1

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG addw1		// more words remain

addwret:
	MOVQ CX, c+56(FP)	// return c
	RET


// func subVW(z, x []Word, y Word) (c Word)
//
// Subtracts the single word y from the vector x: z[i] = x[i] - c, where c
// starts out as y and afterwards is the borrow (0 or 1) out of the previous
// word. The final c is returned. Mirrors addVW with SUBQ/SBBQ.
//
// Registers: R10 = &z[0], R8 = &x[0], SI = i, DI = words left, CX = c.
TEXT ·subVW(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), CX	// c = y

	MOVQ $0, SI		// i = 0

	// Replace the JL below with JMP to bypass the 4x unrolled loop.
	SUBQ $4, DI		// n -= 4
	JL subwtail		// fewer than 4 words: skip the unrolled loop

subw4:	// at least 4 words remain
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	SUBQ CX, R11		// x[i] - c
	SBBQ $0, R12		// ripple the borrow through the next three words
	SBBQ $0, R13
	SBBQ $0, R14
	SBBQ CX, CX		// CX = -CF
	NEGQ CX			// c = 0 or 1
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE subw4		// another full group of 4 is available

subwtail:
	ADDQ $4, DI		// undo the bias: n = words still to do (0..3)
	JLE subwret		// nothing left

subw1:	// one word per iteration
	MOVQ 0(R8)(SI*8), R11
	SUBQ CX, R11		// x[i] - c
	MOVQ R11, 0(R10)(SI*8)
	SBBQ CX, CX		// CX = -CF
	NEGQ CX			// c = 0 or 1

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG subw1		// more words remain

subwret:
	MOVQ CX, c+56(FP)	// return c
	RET


// func shlVU(z, x []Word, s uint) (c Word)
//
// Shifts the vector x left by s bits into z, walking from the most
// significant word down to word 0, and returns in c the bits shifted out of
// the top word. With ŝ = 64 - s, each double-register shift computes
// DX = DX<<s | AX>>ŝ. For len(z) == 0 nothing is written and c = 0.
//
// Registers: R10 = &z[0], R8 = &x[0], CX = s, BX = i, AX = w1, DX = w.
TEXT ·shlVU(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), BX	// i = len(z)
	SUBQ $1, BX		// i = n-1
	JL shlnone		// n <= 0: empty vector

	// n > 0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ s+48(FP), CX
	MOVQ (R8)(BX*8), AX	// w1 = x[n-1]
	MOVQ $0, DX
	SHLQ CX, DX:AX		// DX = w1>>ŝ, the bits leaving the top word
	MOVQ DX, c+56(FP)

	CMPQ BX, $0
	JLE shllast		// single word: only z[0] is left to write

	// i > 0
shlloop:
	MOVQ AX, DX		// w = w1
	MOVQ -8(R8)(BX*8), AX	// w1 = x[i-1]
	SHLQ CX, DX:AX		// DX = w<<s | w1>>ŝ
	MOVQ DX, (R10)(BX*8)	// z[i] = w<<s | w1>>ŝ
	SUBQ $1, BX		// i--
	JG shlloop		// continue while i > 0

	// i <= 0
shllast:
	SHLQ CX, AX		// w1<<s; no lower word supplies fill bits
	MOVQ AX, (R10)		// z[0] = w1<<s
	RET

shlnone:
	MOVQ $0, c+56(FP)	// empty vector: c = 0
	RET


// func shrVU(z, x []Word, s uint) (c Word)
//
// Shifts the vector x right by s bits into z, walking from word 0 up to the
// most significant word, and returns in c the bits shifted out of word 0
// (left-aligned in the word). With ŝ = 64 - s, each double-register shift
// computes DX = DX>>s | AX<<ŝ. For len(z) == 0 nothing is written and c = 0.
//
// Registers: R10 = &z[0], R8 = &x[0], CX = s, BX = i, R11 = n-1,
// AX = w1, DX = w.
TEXT ·shrVU(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), R11
	SUBQ $1, R11		// R11 = n-1
	JL shrnone		// n <= 0: empty vector

	// n > 0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ s+48(FP), CX
	MOVQ (R8), AX		// w1 = x[0]
	MOVQ $0, DX
	SHRQ CX, DX:AX		// DX = w1<<ŝ, the bits leaving word 0
	MOVQ DX, c+56(FP)

	MOVQ $0, BX		// i = 0
	JMP shrtest

	// i < n-1
shrloop:
	MOVQ AX, DX		// w = w1
	MOVQ 8(R8)(BX*8), AX	// w1 = x[i+1]
	SHRQ CX, DX:AX		// DX = w>>s | w1<<ŝ
	MOVQ DX, (R10)(BX*8)	// z[i] = w>>s | w1<<ŝ
	ADDQ $1, BX		// i++

shrtest:
	CMPQ BX, R11
	JL shrloop		// continue while i < n-1

	// i >= n-1
shrlast:
	SHRQ CX, AX		// w1>>s; no higher word supplies fill bits
	MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
	RET

shrnone:
	MOVQ $0, c+56(FP)	// empty vector: c = 0
	RET


// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// Multiplies the vector x by the word y and adds r: starting with c = r,
// each step forms the two-word value x[i]*y + c, stores its low word in z[i]
// and keeps its high word as the next c. The final c is returned.
// Only len(z) is read.
//
// Registers: R10 = &z[0], R8 = &x[0], R9 = y, R11 = n, BX = i, CX = c;
// AX and DX are used by MULQ.
TEXT ·mulAddVWW(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10
	MOVQ z_len+8(FP), R11
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ r+56(FP), CX	// c = r
	MOVQ $0, BX		// i = 0

	CMPQ R11, $4
	JL multest		// fewer than 4 words: one at a time only

mul4:	// i+4 <= n: four words per iteration
	MOVQ (0*8)(R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ CX, AX		// add the incoming carry word
	ADCQ $0, DX
	MOVQ AX, (0*8)(R10)(BX*8)
	MOVQ DX, CX		// high word becomes the next carry
	MOVQ (1*8)(R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (1*8)(R10)(BX*8)
	MOVQ DX, CX
	MOVQ (2*8)(R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (2*8)(R10)(BX*8)
	MOVQ DX, CX
	MOVQ (3*8)(R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (3*8)(R10)(BX*8)
	MOVQ DX, CX
	ADDQ $4, BX		// i += 4

	LEAQ 4(BX), DX		// DX = i+4 (scratch; the carry already lives in CX)
	CMPQ DX, R11
	JLE mul4		// another full group of 4 fits
	JMP multest

mul1:	// one word per iteration
	MOVQ (R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (R10)(BX*8)
	MOVQ DX, CX
	ADDQ $1, BX		// i++

multest:
	CMPQ BX, R11
	JL mul1			// continue while i < n

	MOVQ CX, c+64(FP)	// return c
	RET


// func addMulVVW(z, x []Word, y Word) (c Word)
//
// Multiply-accumulate into z: starting with c = 0, each step forms the
// two-word value z[i] + x[i]*y + c, stores its low word back in z[i] and
// keeps its high word as the next c. The final c is returned.
// Words are handled two per iteration while a pair is available, then one
// at a time. Only len(z) is read.
//
// Registers: R10 = &z[0], R8 = &x[0], R9 = y, R11 = n, R12 = n rounded down
// to an even count, BX = i, CX = c; AX and DX are used by MULQ.
TEXT ·addMulVVW(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10
	MOVQ z_len+8(FP), R11
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ $0, CX		// c = 0
	MOVQ $0, BX		// i = 0
	MOVQ R11, R12
	ANDQ $-2, R12		// R12 = n with the lowest bit cleared
	CMPQ R11, $2
	JAE am2			// at least one pair available
	JMP amtest

am2:	// two words per iteration
	MOVQ (R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ (R10)(BX*8), AX	// + z[i]
	ADCQ $0, DX
	ADDQ CX, AX		// + incoming carry word
	ADCQ $0, DX
	MOVQ DX, CX		// high word becomes the next carry
	MOVQ AX, (R10)(BX*8)

	MOVQ (8)(R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i+1] * y
	ADDQ (8)(R10)(BX*8), AX	// + z[i+1]
	ADCQ $0, DX
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ DX, CX
	MOVQ AX, (8)(R10)(BX*8)

	ADDQ $2, BX		// i += 2
	CMPQ BX, R12
	JL am2			// another pair remains
	JMP amtest

am1:	// one word per iteration
	MOVQ (R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ CX, AX		// + incoming carry word
	ADCQ $0, DX
	ADDQ AX, (R10)(BX*8)	// z[i] += low word, directly in memory
	ADCQ $0, DX
	MOVQ DX, CX
	ADDQ $1, BX		// i++

amtest:
	CMPQ BX, R11
	JL am1			// continue while i < n

	MOVQ CX, c+56(FP)	// return c
	RET


// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
//
// Divides the vector x by the word y, most significant word first. The
// running remainder r starts as xn; for i = len(z)-1 down to 0 the two-word
// value r:x[i] is divided by y, the quotient is stored in z[i] and the
// remainder carries into the next step. The final remainder is returned.
// NOTE(review): DIVQ is issued without checking y != 0 or xn < y; presumably
// the caller guarantees both - confirm against arith.go.
//
// Registers: R10 = &z[0], R8 = &x[0], R9 = y, BX = i, DX = r, AX = x[i] / q.
TEXT ·divWVW(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10
	MOVQ z_len+8(FP), BX	// i = len(z)
	MOVQ xn+24(FP), DX	// r = xn
	MOVQ x+32(FP), R8
	MOVQ y+56(FP), R9
	JMP divnext

divloop:
	MOVQ (R8)(BX*8), AX	// DX:AX = r:x[i]
	DIVQ R9			// AX = quotient, DX = remainder
	MOVQ AX, (R10)(BX*8)	// z[i] = quotient

divnext:
	SUBQ $1, BX		// i--
	JGE divloop		// continue while i >= 0

	MOVQ DX, r+64(FP)	// return r
	RET