// Source: github.com/flyinox/gosm@v0.0.0-20171117061539-16768cb62077/src/math/big/arith_amd64.s

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !math_big_pure_go

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
//
// Conventions used throughout this file:
//   - Every routine is declared NOSPLIT with a $0 frame, so all arguments
//     and results are accessed through FP-relative offsets.
//   - A []Word argument occupies 24 bytes on the frame; only the data
//     pointer (offset +0) and, for z, the length (offset +8) are read here.
//   - NOTE(review): no routine reads len(x) or len(y); presumably callers
//     guarantee these are at least len(z) — confirm against arith.go.

// func mulWW(x, y Word) (z1, z0 Word)
//
// Multiplies x by y and returns the double-word product: the high word
// (DX after MULQ) in z1 and the low word (AX after MULQ) in z0.
TEXT ·mulWW(SB),NOSPLIT,$0
	MOVQ x+0(FP), AX
	MULQ y+8(FP)		// DX:AX = x * y
	MOVQ DX, z1+16(FP)	// high word
	MOVQ AX, z0+24(FP)	// low word
	RET


// func divWW(x1, x0, y Word) (q, r Word)
//
// Divides the double word x1:x0 (placed in DX:AX) by y with one DIVQ and
// returns the quotient (AX) in q and the remainder (DX) in r.
// NOTE(review): nothing here guards against y == 0 or a quotient that does
// not fit in one word; presumably callers ensure x1 < y — confirm.
TEXT ·divWW(SB),NOSPLIT,$0
	MOVQ x1+0(FP), DX	// high word of the dividend
	MOVQ x0+8(FP), AX	// low word of the dividend
	DIVQ y+16(FP)		// AX = quotient, DX = remainder
	MOVQ AX, q+24(FP)
	MOVQ DX, r+32(FP)
	RET

// The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
// This is faster than using rotate instructions.
//
// CAUTION: Note that MOVQ $0, Rx is translated to XORQ Rx, Rx which clears the carry bit!
// For that reason registers are only zeroed before a carry chain starts,
// never between the "restore CF" and "save CF" points of a loop body.

// func addVV(z, x, y []Word) (c Word)
//
// Computes z[i] = x[i] + y[i] + carry for i in [0, len(z)) and returns the
// final carry (0 or 1) in c.
//
// Register use:
//   DI  = n, number of words still to process (biased by -4 in the unrolled loop)
//   SI  = i, current word index
//   R8  = &x[0], R9 = &y[0], R10 = &z[0]
//   CX  = saved carry (0 or -1 between iterations)
//   R11-R14 = scratch words
TEXT ·addVV(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z+0(FP), R10

	MOVQ $0, CX		// c = 0
	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V1			// if n < 0 goto V1 (fewer than 4 words in total)

U1:	// n >= 0
	// regular loop body unrolled 4x
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	ADCQ 0(R9)(SI*8), R11
	ADCQ 8(R9)(SI*8), R12
	ADCQ 16(R9)(SI*8), R13
	ADCQ 24(R9)(SI*8), R14
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	SBBQ CX, CX		// save CF (the ADDQ/SUBQ below overwrite the flags)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U1			// if n >= 0 goto U1

V1:	ADDQ $4, DI		// n += 4 (undo the bias; n is now the 0..3 leftover words)
	JLE E1			// if n <= 0 goto E1

L1:	// n > 0
	// one word per iteration for the remaining words
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	ADCQ 0(R9)(SI*8), R11
	MOVQ R11, 0(R10)(SI*8)
	SBBQ CX, CX		// save CF

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L1			// if n > 0 goto L1

E1:	NEGQ CX			// saved carry is 0 or -1; negate to obtain 0 or 1
	MOVQ CX, c+72(FP)	// return c
	RET


// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SBBQ instead of ADCQ and label names)
//
// Computes z[i] = x[i] - y[i] - borrow for i in [0, len(z)) and returns the
// final borrow (0 or 1) in c. Register use is identical to addVV.
TEXT ·subVV(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z+0(FP), R10

	MOVQ $0, CX		// c = 0
	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V2			// if n < 0 goto V2 (fewer than 4 words in total)

U2:	// n >= 0
	// regular loop body unrolled 4x
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	SBBQ 0(R9)(SI*8), R11
	SBBQ 8(R9)(SI*8), R12
	SBBQ 16(R9)(SI*8), R13
	SBBQ 24(R9)(SI*8), R14
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	SBBQ CX, CX		// save CF (the ADDQ/SUBQ below overwrite the flags)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U2			// if n >= 0 goto U2

V2:	ADDQ $4, DI		// n += 4 (undo the bias; n is now the 0..3 leftover words)
	JLE E2			// if n <= 0 goto E2

L2:	// n > 0
	// one word per iteration for the remaining words
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	SBBQ 0(R9)(SI*8), R11
	MOVQ R11, 0(R10)(SI*8)
	SBBQ CX, CX		// save CF

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L2			// if n > 0 goto L2

E2:	NEGQ CX			// saved borrow is 0 or -1; negate to obtain 0 or 1
	MOVQ CX, c+72(FP)	// return c
	RET


// func addVW(z, x []Word, y Word) (c Word)
//
// Computes z = x + y for the single word y, propagating the carry through
// all len(z) words, and returns the final carry in c.
//
// Unlike addVV, the carry is kept in CX as 0 or 1 (note the NEGQ right after
// each SBBQ) and is added in as an ordinary operand, so no "restore CF" step
// is needed. CX starts out holding y itself, which is what makes the first
// addition add y. For len(z) == 0 the result c is therefore y unchanged.
TEXT ·addVW(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), CX	// c = y
	MOVQ z+0(FP), R10

	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V3			// if fewer than 4 words (n-4 < 0) goto V3

U3:	// n >= 0
	// regular loop body unrolled 4x
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	ADDQ CX, R11		// add c (y on the first pass) to the lowest word
	ADCQ $0, R12		// ripple the carry through the next three words
	ADCQ $0, R13
	ADCQ $0, R14
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U3			// if n >= 0 goto U3

V3:	ADDQ $4, DI		// n += 4 (undo the bias; n is now the 0..3 leftover words)
	JLE E3			// if n <= 0 goto E3

L3:	// n > 0
	ADDQ 0(R8)(SI*8), CX	// c += x[i]
	MOVQ CX, 0(R10)(SI*8)	// z[i] = low word of the sum
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L3			// if n > 0 goto L3

E3:	MOVQ CX, c+56(FP)	// return c
	RET


// func subVW(z, x []Word, y Word) (c Word)
// (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
//
// Computes z = x - y for the single word y, propagating the borrow through
// all len(z) words, and returns the final borrow in c. As in addVW, CX holds
// y initially and 0 or 1 afterwards; for len(z) == 0 the result c is y.
TEXT ·subVW(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), CX	// c = y
	MOVQ z+0(FP), R10

	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V4			// if fewer than 4 words (n-4 < 0) goto V4

U4:	// n >= 0
	// regular loop body unrolled 4x
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	SUBQ CX, R11		// subtract c (y on the first pass) from the lowest word
	SBBQ $0, R12		// ripple the borrow through the next three words
	SBBQ $0, R13
	SBBQ $0, R14
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U4			// if n >= 0 goto U4

V4:	ADDQ $4, DI		// n += 4 (undo the bias; n is now the 0..3 leftover words)
	JLE E4			// if n <= 0 goto E4

L4:	// n > 0
	MOVQ 0(R8)(SI*8), R11
	SUBQ CX, R11		// x[i] - c
	MOVQ R11, 0(R10)(SI*8)
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L4			// if n > 0 goto L4

E4:	MOVQ CX, c+56(FP)	// return c
	RET


// func shlVU(z, x []Word, s uint) (c Word)
//
// Shifts the vector x left by s bits into z and returns in c the bits
// shifted out of the most significant word x[n-1]. Returns c = 0 without
// touching z when len(z) == 0.
//
// The loop walks from the most significant word down to word 0, using the
// double-register form of SHLQ to pull the top bits of the next lower word
// into the current one. In the comments below, ŝ stands for the
// complementary shift count (word size minus s).
// NOTE(review): presumably callers keep s below the word size — confirm.
TEXT ·shlVU(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), BX	// i = len(z)
	SUBQ $1, BX		// i--
	JL X8b			// i < 0	(n <= 0)

	// n > 0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ s+48(FP), CX
	MOVQ (R8)(BX*8), AX	// w1 = x[n-1]
	MOVQ $0, DX
	SHLQ CX, DX:AX		// w1>>ŝ (DX was 0, so it receives only the bits shifted out of w1)
	MOVQ DX, c+56(FP)

	CMPQ BX, $0
	JLE X8a			// i <= 0

	// i > 0
L8:	MOVQ AX, DX		// w = w1
	MOVQ -8(R8)(BX*8), AX	// w1 = x[i-1]
	SHLQ CX, DX:AX		// w<<s | w1>>ŝ
	MOVQ DX, (R10)(BX*8)	// z[i] = w<<s | w1>>ŝ
	SUBQ $1, BX		// i--
	JG L8			// i > 0

	// i <= 0
X8a:	SHLQ CX, AX		// w1<<s
	MOVQ AX, (R10)		// z[0] = w1<<s
	RET

X8b:	MOVQ $0, c+56(FP)	// empty vector: nothing shifted out
	RET


// func shrVU(z, x []Word, s uint) (c Word)
//
// Shifts the vector x right by s bits into z and returns in c the bits
// shifted out of the least significant word x[0], positioned at the top of
// the word. Returns c = 0 without touching z when len(z) == 0.
//
// The loop walks from word 0 up to word n-2, using the double-register form
// of SHRQ to pull the low bits of the next higher word into the current one;
// the top word z[n-1] is then produced with a plain shift. In the comments
// below, ŝ stands for the complementary shift count (word size minus s).
// NOTE(review): presumably callers keep s below the word size — confirm.
TEXT ·shrVU(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), R11
	SUBQ $1, R11		// n--
	JL X9b			// n < 0	(n <= 0)

	// n > 0 (R11 now holds n-1, the index of the top word)
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ s+48(FP), CX
	MOVQ (R8), AX		// w1 = x[0]
	MOVQ $0, DX
	SHRQ CX, DX:AX		// w1<<ŝ (DX was 0, so it receives only the bits shifted out of w1)
	MOVQ DX, c+56(FP)

	MOVQ $0, BX		// i = 0
	JMP E9			// test the loop condition before the first iteration

	// i < n-1
L9:	MOVQ AX, DX		// w = w1
	MOVQ 8(R8)(BX*8), AX	// w1 = x[i+1]
	SHRQ CX, DX:AX		// w>>s | w1<<ŝ
	MOVQ DX, (R10)(BX*8)	// z[i] = w>>s | w1<<ŝ
	ADDQ $1, BX		// i++

E9:	CMPQ BX, R11
	JL L9			// i < n-1

	// i >= n-1
X9a:	SHRQ CX, AX		// w1>>s
	MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
	RET

X9b:	MOVQ $0, c+56(FP)	// empty vector: nothing shifted out
	RET


// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// Computes z = x*y + r for the single words y and r and returns the final
// carry word in c. For each word: DX:AX = x[i]*y, the running carry c is
// added to the low word (with the overflow rippled into DX), the low word is
// stored to z[i] and the high word becomes the next c.
//
// Register use:
//   R10 = &z[0], R8 = &x[0], R9 = y, R11 = n = len(z)
//   BX  = i, CX = c (starts as r), DX:AX = product
TEXT ·mulAddVWW(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ r+56(FP), CX	// c = r
	MOVQ z_len+8(FP), R11
	MOVQ $0, BX		// i = 0

	CMPQ R11, $4
	JL E5			// fewer than 4 words: skip the unrolled loop

U5:	// i+4 <= n
	// regular loop body unrolled 4x
	MOVQ (0*8)(R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ CX, AX		// low word += c
	ADCQ $0, DX		// carry into the high word
	MOVQ AX, (0*8)(R10)(BX*8)
	MOVQ DX, CX		// c = high word
	MOVQ (1*8)(R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (1*8)(R10)(BX*8)
	MOVQ DX, CX
	MOVQ (2*8)(R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (2*8)(R10)(BX*8)
	MOVQ DX, CX
	MOVQ (3*8)(R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (3*8)(R10)(BX*8)
	MOVQ DX, CX
	ADDQ $4, BX		// i += 4

	LEAQ 4(BX), DX		// DX = i+4 (DX is free: its value was just copied to CX)
	CMPQ DX, R11
	JLE U5			// another full group of 4 fits: i+4 <= n
	JMP E5

L5:	// one word per iteration for the remaining words
	MOVQ (R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ CX, AX		// low word += c
	ADCQ $0, DX		// carry into the high word
	MOVQ AX, (R10)(BX*8)
	MOVQ DX, CX		// c = high word
	ADDQ $1, BX		// i++

E5:	CMPQ BX, R11		// i < n
	JL L5

	MOVQ CX, c+64(FP)
	RET


// func addMulVVW(z, x []Word, y Word) (c Word)
//
// Computes z += x*y for the single word y and returns the final carry word
// in c. For each word: DX:AX = x[i]*y; both the old z[i] and the running
// carry c are added to the low word, each overflow is rippled into DX, the
// low word is stored back to z[i] and DX becomes the next c.
//
// Register use:
//   R10 = &z[0], R8 = &x[0], R9 = y, R11 = n = len(z)
//   R12 = n rounded down to an even number (bound for the 2x unrolled loop)
//   BX  = i, CX = c, DX:AX = product
TEXT ·addMulVVW(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z_len+8(FP), R11
	MOVQ $0, BX		// i = 0
	MOVQ $0, CX		// c = 0
	MOVQ R11, R12
	ANDQ $-2, R12		// R12 = n with the lowest bit cleared
	CMPQ R11, $2
	JAE A6			// n >= 2: enter the 2x unrolled loop
	JMP E6

A6:	// loop body unrolled 2x, runs while i < R12
	MOVQ (R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ (R10)(BX*8), AX	// low word += z[i]
	ADCQ $0, DX
	ADDQ CX, AX		// low word += c
	ADCQ $0, DX
	MOVQ DX, CX		// c = high word
	MOVQ AX, (R10)(BX*8)	// z[i] = low word

	MOVQ (8)(R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i+1] * y
	ADDQ (8)(R10)(BX*8), AX	// low word += z[i+1]
	ADCQ $0, DX
	ADDQ CX, AX		// low word += c
	ADCQ $0, DX
	MOVQ DX, CX		// c = high word
	MOVQ AX, (8)(R10)(BX*8)	// z[i+1] = low word

	ADDQ $2, BX		// i += 2
	CMPQ BX, R12
	JL A6
	JMP E6

L6:	// one word per iteration for the remaining word(s)
	MOVQ (R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ CX, AX		// low word += c
	ADCQ $0, DX
	ADDQ AX, (R10)(BX*8)	// z[i] += low word (added directly in memory)
	ADCQ $0, DX
	MOVQ DX, CX		// c = high word
	ADDQ $1, BX		// i++

E6:	CMPQ BX, R11		// i < n
	JL L6

	MOVQ CX, c+56(FP)
	RET


// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
//
// Divides the multi-word value formed by xn followed by x (most significant
// first) by the single word y, storing the quotient words in z and returning
// the final remainder in r. The loop runs from i = len(z)-1 down to 0; the
// remainder of each DIVQ stays in DX and becomes the high word of the next
// dividend. For len(z) == 0 the result r is xn unchanged.
// NOTE(review): nothing here guards against y == 0 or a quotient overflow;
// presumably callers ensure xn < y — confirm.
TEXT ·divWVW(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10
	MOVQ xn+24(FP), DX	// r = xn
	MOVQ x+32(FP), R8
	MOVQ y+56(FP), R9
	MOVQ z_len+8(FP), BX	// i = len(z)
	JMP E7			// decrement and test before the first iteration

L7:	MOVQ (R8)(BX*8), AX	// low word of the dividend = x[i]
	DIVQ R9			// AX = (r:x[i]) / y, DX = r = (r:x[i]) % y
	MOVQ AX, (R10)(BX*8)	// z[i] = quotient word

E7:	SUBQ $1, BX		// i--
	JGE L7			// i >= 0

	MOVQ DX, r+64(FP)
	RET