// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !math_big_pure_go

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.

// func mulWW(x, y Word) (z1, z0 Word)
//
// Unsigned 64x64 -> 128 bit multiply: z1 receives the high word and
// z0 the low word of x*y.
TEXT ·mulWW(SB),NOSPLIT,$0
	MOVQ x+0(FP), AX
	MULQ y+8(FP)		// DX:AX = AX * y
	MOVQ DX, z1+16(FP)	// high word
	MOVQ AX, z0+24(FP)	// low word
	RET


// func divWW(x1, x0, y Word) (q, r Word)
//
// Unsigned 128/64 bit divide: the dividend is x1:x0 (x1 is the high
// word), q receives the quotient and r the remainder.
//
// DIVQ raises a hardware divide error when y == 0 or when the quotient
// does not fit in one word (x1 >= y). No check is made here, so the
// caller is responsible for guaranteeing x1 < y.
TEXT ·divWW(SB),NOSPLIT,$0
	MOVQ x1+0(FP), DX
	MOVQ x0+8(FP), AX
	DIVQ y+16(FP)		// AX = DX:AX / y, DX = DX:AX % y
	MOVQ AX, q+24(FP)
	MOVQ DX, r+32(FP)
	RET

// The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
// This is faster than using rotate instructions.
//
// CAUTION: Note that MOVQ $0, Rx is translated to XORQ Rx, Rx which clears the carry bit!
// func addVV(z, x, y []Word) (c Word)
//
// z[i] = x[i] + y[i] + carry for 0 <= i < len(z); c is the carry out of
// the last word (0 or 1). Only len(z) is read; x and y are indexed with
// the same i, so they are presumably at least as long as z (not checked
// here).
//
// Between iterations the carry lives in CX as 0 or -1: SBBQ CX, CX
// captures CF, ADDQ CX, CX re-creates it. The loop counter updates
// (ADDQ/SUBQ) clobber CF, which is why it has to be parked in CX.
TEXT ·addVV(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z+0(FP), R10

	MOVQ $0, CX		// c = 0
	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V1			// if n < 0 goto V1

U1:	// n >= 0
	// regular loop body unrolled 4x
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	ADCQ 0(R9)(SI*8), R11
	ADCQ 8(R9)(SI*8), R12
	ADCQ 16(R9)(SI*8), R13
	ADCQ 24(R9)(SI*8), R14
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	SBBQ CX, CX		// save CF

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U1			// if n >= 0 goto U1

V1:	ADDQ $4, DI		// n += 4 (undo the bias; n = words left, 0..3)
	JLE E1			// if n <= 0 goto E1

L1:	// n > 0
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	ADCQ 0(R9)(SI*8), R11
	MOVQ R11, 0(R10)(SI*8)
	SBBQ CX, CX		// save CF

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L1			// if n > 0 goto L1

E1:	NEGQ CX			// 0/-1 -> 0/1
	MOVQ CX, c+72(FP)	// return c
	RET


// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SBBQ instead of ADCQ and label names)
//
// z[i] = x[i] - y[i] - borrow for 0 <= i < len(z); c is the borrow out
// of the last word (0 or 1). The borrow is parked in CX as 0 or -1
// between iterations, exactly like the carry in addVV.
TEXT ·subVV(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z+0(FP), R10

	MOVQ $0, CX		// c = 0
	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V2			// if n < 0 goto V2

U2:	// n >= 0
	// regular loop body unrolled 4x
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	SBBQ 0(R9)(SI*8), R11
	SBBQ 8(R9)(SI*8), R12
	SBBQ 16(R9)(SI*8), R13
	SBBQ 24(R9)(SI*8), R14
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	SBBQ CX, CX		// save CF

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U2			// if n >= 0 goto U2

V2:	ADDQ $4, DI		// n += 4 (undo the bias; n = words left, 0..3)
	JLE E2			// if n <= 0 goto E2

L2:	// n > 0
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	SBBQ 0(R9)(SI*8), R11
	MOVQ R11, 0(R10)(SI*8)
	SBBQ CX, CX		// save CF

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L2			// if n > 0 goto L2

E2:	NEGQ CX			// 0/-1 -> 0/1
	MOVQ CX, c+72(FP)	// return c
	RET


// func addVW(z, x []Word, y Word) (c Word)
//
// z = x + y where y is a single word added at index 0; c is the carry
// out of the last word. If len(z) <= 0 nothing is stored and c = y.
//
// Unlike addVV, the running carry is kept in CX as a plain 0/1 value
// (SBBQ followed by NEGQ), because it is added as a data operand to the
// first word of the next group rather than restored into CF.
TEXT ·addVW(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), CX	// c = y
	MOVQ z+0(FP), R10

	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V3			// if n < 0 goto V3

U3:	// n >= 0
	// regular loop body unrolled 4x
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	ADDQ CX, R11
	ADCQ $0, R12
	ADCQ $0, R13
	ADCQ $0, R14
	SBBQ CX, CX		// save CF
	NEGQ CX			// 0/-1 -> 0/1
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U3			// if n >= 0 goto U3

V3:	ADDQ $4, DI		// n += 4 (undo the bias; n = words left, 0..3)
	JLE E3			// if n <= 0 goto E3

L3:	// n > 0
	ADDQ 0(R8)(SI*8), CX	// CX = x[i] + c
	MOVQ CX, 0(R10)(SI*8)
	SBBQ CX, CX		// save CF
	NEGQ CX			// 0/-1 -> 0/1

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L3			// if n > 0 goto L3

E3:	MOVQ CX, c+56(FP)	// return c
	RET


// func subVW(z, x []Word, y Word) (c Word)
// (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
//
// z = x - y where y is a single word subtracted at index 0; c is the
// borrow out of the last word. If len(z) <= 0 nothing is stored and
// c = y.
TEXT ·subVW(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), CX	// c = y
	MOVQ z+0(FP), R10

	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V4			// if n < 0 goto V4

U4:	// n >= 0
	// regular loop body unrolled 4x
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	SUBQ CX, R11
	SBBQ $0, R12
	SBBQ $0, R13
	SBBQ $0, R14
	SBBQ CX, CX		// save CF
	NEGQ CX			// 0/-1 -> 0/1
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U4			// if n >= 0 goto U4

V4:	ADDQ $4, DI		// n += 4 (undo the bias; n = words left, 0..3)
	JLE E4			// if n <= 0 goto E4

L4:	// n > 0
	MOVQ 0(R8)(SI*8), R11
	SUBQ CX, R11		// R11 = x[i] - c
	MOVQ R11, 0(R10)(SI*8)
	SBBQ CX, CX		// save CF
	NEGQ CX			// 0/-1 -> 0/1

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L4			// if n > 0 goto L4

E4:	MOVQ CX, c+56(FP)	// return c
	RET


// func shlVU(z, x []Word, s uint) (c Word)
//
// z = x << s, processed from the most significant word down to word 0.
// c receives the bits shifted out of the top word, x[n-1]>>ŝ, where ŝ
// denotes the complementary shift count (word size - s). For
// len(z) <= 0 nothing is stored and c = 0.
//
// SHLQ CX, DX:AX is the double-word shift: DX is shifted left by CX and
// the vacated low bits are filled from the high bits of AX.
TEXT ·shlVU(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), BX	// i = len(z)
	SUBQ $1, BX		// i--
	JL X8b			// i < 0	(n <= 0)

	// n > 0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ s+48(FP), CX
	MOVQ (R8)(BX*8), AX	// w1 = x[n-1]
	MOVQ $0, DX
	SHLQ CX, DX:AX		// w1>>ŝ
	MOVQ DX, c+56(FP)

	CMPQ BX, $0
	JLE X8a			// i <= 0

	// i > 0
L8:	MOVQ AX, DX		// w = w1
	MOVQ -8(R8)(BX*8), AX	// w1 = x[i-1]
	SHLQ CX, DX:AX		// w<<s | w1>>ŝ
	MOVQ DX, (R10)(BX*8)	// z[i] = w<<s | w1>>ŝ
	SUBQ $1, BX		// i--
	JG L8			// i > 0

	// i <= 0
X8a:	SHLQ CX, AX		// w1<<s
	MOVQ AX, (R10)		// z[0] = w1<<s
	RET

X8b:	MOVQ $0, c+56(FP)
	RET


// func shrVU(z, x []Word, s uint) (c Word)
//
// z = x >> s, processed from word 0 up to the most significant word.
// c receives the bits shifted out of the bottom word, x[0]<<ŝ, where ŝ
// denotes the complementary shift count (word size - s). For
// len(z) <= 0 nothing is stored and c = 0.
//
// SHRQ CX, DX:AX is the double-word shift: DX is shifted right by CX
// and the vacated high bits are filled from the low bits of AX.
TEXT ·shrVU(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), R11
	SUBQ $1, R11		// n--	(R11 = index of the last word)
	JL X9b			// n < 0	(len(z) <= 0)

	// n > 0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ s+48(FP), CX
	MOVQ (R8), AX		// w1 = x[0]
	MOVQ $0, DX
	SHRQ CX, DX:AX		// w1<<ŝ
	MOVQ DX, c+56(FP)

	MOVQ $0, BX		// i = 0
	JMP E9

	// i < n-1
L9:	MOVQ AX, DX		// w = w1
	MOVQ 8(R8)(BX*8), AX	// w1 = x[i+1]
	SHRQ CX, DX:AX		// w>>s | w1<<ŝ
	MOVQ DX, (R10)(BX*8)	// z[i] = w>>s | w1<<ŝ
	ADDQ $1, BX		// i++

E9:	CMPQ BX, R11
	JL L9			// i < n-1

	// i >= n-1
X9a:	SHRQ CX, AX		// w1>>s
	MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
	RET

X9b:	MOVQ $0, c+56(FP)
	RET


// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// For 0 <= i < len(z): (c, z[i]) = x[i]*y + c, with c starting at r.
// The final high word is returned in c; for len(z) <= 0 this is r.
TEXT ·mulAddVWW(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ r+56(FP), CX	// c = r
	MOVQ z_len+8(FP), R11
	MOVQ $0, BX		// i = 0
	JMP E5

L5:	MOVQ (R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ CX, AX		// add incoming carry word to the low half
	ADCQ $0, DX		// and propagate into the high half
	MOVQ AX, (R10)(BX*8)
	MOVQ DX, CX		// c = high half
	ADDQ $1, BX		// i++

E5:	CMPQ BX, R11		// i < n
	JL L5

	MOVQ CX, c+64(FP)
	RET


// func addMulVVW(z, x []Word, y Word) (c Word)
//
// For 0 <= i < len(z): (c, z[i]) = z[i] + x[i]*y + c, with c starting
// at 0. The final carry word is returned in c.
TEXT ·addMulVVW(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z_len+8(FP), R11
	MOVQ $0, BX		// i = 0
	MOVQ $0, CX		// c = 0
	JMP E6

L6:	MOVQ (R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ CX, AX		// add incoming carry word to the low half
	ADCQ $0, DX
	ADDQ AX, (R10)(BX*8)	// z[i] += low half
	ADCQ $0, DX		// carry from the memory add goes into the high half
	MOVQ DX, CX		// c = high half
	ADDQ $1, BX		// i++

E6:	CMPQ BX, R11		// i < n
	JL L6

	MOVQ CX, c+56(FP)
	RET


// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
//
// Divides the multi-word value x, extended by the extra high word xn,
// by the single word y. Working from i = len(z)-1 down to 0:
// (z[i], r) = (r:x[i]) / y, with r starting at xn. The final remainder
// is returned in r.
//
// DIVQ raises a hardware divide error when y == 0 or when a quotient
// does not fit in one word; nothing is checked here, so the caller
// presumably guarantees xn < y.
TEXT ·divWVW(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10
	MOVQ xn+24(FP), DX	// r = xn
	MOVQ x+32(FP), R8
	MOVQ y+56(FP), R9
	MOVQ z_len+8(FP), BX	// i = len(z)
	JMP E7

L7:	MOVQ (R8)(BX*8), AX
	DIVQ R9			// AX = DX:AX / y, DX = DX:AX % y
	MOVQ AX, (R10)(BX*8)

E7:	SUBQ $1, BX		// i--
	JGE L7			// i >= 0

	MOVQ DX, r+64(FP)
	RET

// func bitLen(x Word) (n int)
//
// Returns the number of bits needed to represent x: the index of the
// highest set bit plus one, or 0 when x == 0. BSRQ sets ZF when its
// source operand is zero (its destination is then not meaningful), so
// that case is handled separately.
TEXT ·bitLen(SB),NOSPLIT,$0
	BSRQ x+0(FP), AX
	JZ Z1
	ADDQ $1, AX
	MOVQ AX, n+8(FP)
	RET

Z1:	MOVQ $0, n+8(FP)
	RET