// Source: github.com/hongwozai/go-src-1.4.3@v0.0.0-20191127132709-dc3fce3dbccb/src/math/big/arith_amd64.s

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
//
// Conventions visible in the code below:
//   - A Word is 8 bytes (all vector accesses are scaled by 8).
//   - A slice argument occupies 24 bytes of the argument frame; only the
//     data pointer (offset +0) and, for z, the length (offset +8) are read.
//   - Every vector routine iterates len(z) times; the lengths of the other
//     slice arguments are never loaded.
//     NOTE(review): presumably callers guarantee the other slices are at
//     least len(z) long - confirm against the Go callers.

// Literal instruction for MOVQ $0, CX.
// (MOVQ $0, reg is translated to XORQ reg, reg and clears CF.)
// The bytes below are emitted verbatim (REX.W, opcode 0xc7, ModRM 0xc1,
// imm32 0) so that CX can be cleared in the middle of a carry chain
// without disturbing the carry flag.
#define ZERO_CX BYTE $0x48; \
		BYTE $0xc7; \
		BYTE $0xc1; \
		BYTE $0x00; \
		BYTE $0x00; \
		BYTE $0x00; \
		BYTE $0x00

// func mulWW(x, y Word) (z1, z0 Word)
//
// mulWW computes the full double-word product x*y.
// z1 receives the high word, z0 the low word.
TEXT ·mulWW(SB),NOSPLIT,$0
	MOVQ x+0(FP), AX
	MULQ y+8(FP)		// DX:AX = AX * y (unsigned)
	MOVQ DX, z1+16(FP)	// high word
	MOVQ AX, z0+24(FP)	// low word
	RET


// func divWW(x1, x0, y Word) (q, r Word)
//
// divWW divides the double-word value x1:x0 by y with a single DIVQ and
// returns the quotient q and remainder r.
// NOTE(review): nothing here checks y or x1. The hardware divide raises a
// divide error when the divisor is zero or the quotient does not fit in
// one word, so callers presumably guarantee x1 < y - confirm.
TEXT ·divWW(SB),NOSPLIT,$0
	MOVQ x1+0(FP), DX	// high word of dividend
	MOVQ x0+8(FP), AX	// low word of dividend
	DIVQ y+16(FP)		// AX = DX:AX / y, DX = DX:AX % y
	MOVQ AX, q+24(FP)
	MOVQ DX, r+32(FP)
	RET


// func addVV(z, x, y []Word) (c Word)
//
// addVV sets z[i] = x[i] + y[i] (with carry propagation) for
// 0 <= i < len(z) and returns the final carry c (0 or 1).
//
// Registers: DI = remaining count n, SI = index i, R8 = &x[0],
// R9 = &y[0], R10 = &z[0], CX = carry c.
//
// The carry is parked in bit 0 of CX between iterations because the
// ADDQ/SUBQ instructions that maintain i and n overwrite CF.
// RCRQ $1, CX rotates bit 0 of CX into CF before the ADCQ chain and
// RCLQ $1, CX rotates the resulting CF back into bit 0 afterwards
// (the stale CF that RCRQ rotated into bit 63 is rotated out again).
TEXT ·addVV(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z+0(FP), R10

	MOVQ $0, CX		// c = 0
	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V1			// if n < 0 goto V1

U1:	// n >= 0
	// regular loop body unrolled 4x
	RCRQ $1, CX		// CF = c
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	ADCQ 0(R9)(SI*8), R11
	ADCQ 8(R9)(SI*8), R12
	ADCQ 16(R9)(SI*8), R13
	ADCQ 24(R9)(SI*8), R14
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	RCLQ $1, CX		// c = CF

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U1			// if n >= 0 goto U1

V1:	ADDQ $4, DI		// n += 4 (undo the bias; 0..3 words remain)
	JLE E1			// if n <= 0 goto E1

L1:	// n > 0
	// one word per iteration for the remaining tail
	RCRQ $1, CX		// CF = c
	MOVQ 0(R8)(SI*8), R11
	ADCQ 0(R9)(SI*8), R11
	MOVQ R11, 0(R10)(SI*8)
	RCLQ $1, CX		// c = CF

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L1			// if n > 0 goto L1

E1:	MOVQ CX, c+72(FP)	// return c
	RET


// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SBBQ instead of ADCQ and label names)
//
// subVV sets z[i] = x[i] - y[i] (with borrow propagation) for
// 0 <= i < len(z) and returns the final borrow c (0 or 1).
// Register use and the RCRQ/RCLQ borrow-parking scheme match addVV.
TEXT ·subVV(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z+0(FP), R10

	MOVQ $0, CX		// c = 0
	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V2			// if n < 0 goto V2

U2:	// n >= 0
	// regular loop body unrolled 4x
	RCRQ $1, CX		// CF = c
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	SBBQ 0(R9)(SI*8), R11
	SBBQ 8(R9)(SI*8), R12
	SBBQ 16(R9)(SI*8), R13
	SBBQ 24(R9)(SI*8), R14
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	RCLQ $1, CX		// c = CF

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U2			// if n >= 0 goto U2

V2:	ADDQ $4, DI		// n += 4 (undo the bias; 0..3 words remain)
	JLE E2			// if n <= 0 goto E2

L2:	// n > 0
	// one word per iteration for the remaining tail
	RCRQ $1, CX		// CF = c
	MOVQ 0(R8)(SI*8), R11
	SBBQ 0(R9)(SI*8), R11
	MOVQ R11, 0(R10)(SI*8)
	RCLQ $1, CX		// c = CF

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L2			// if n > 0 goto L2

E2:	MOVQ CX, c+72(FP)	// return c
	RET


// func addVW(z, x []Word, y Word) (c Word)
//
// addVW adds the single word y to the vector x, storing the result in z
// for 0 <= i < len(z), and returns the final carry c.
// If len(z) <= 0 nothing is stored and y itself is returned as c.
//
// Registers: DI = remaining count n, SI = index i, R8 = &x[0],
// R10 = &z[0], CX = running carry (initialized to y).
TEXT ·addVW(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), CX	// c = y
	MOVQ z+0(FP), R10

	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V3			// if n < 0 (fewer than 4 words left) goto V3

U3:	// n >= 0
	// regular loop body unrolled 4x
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	ADDQ CX, R11		// add incoming carry word; CF = carry out
	ZERO_CX			// CX = 0 without touching CF
	ADCQ $0, R12		// ripple the carry through the next words
	ADCQ $0, R13
	ADCQ $0, R14
	SETCS CX		// c = CF (writes only the low byte; the rest of CX was zeroed above)
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U3			// if n >= 0 goto U3

V3:	ADDQ $4, DI		// n += 4 (undo the bias; 0..3 words remain)
	JLE E3			// if n <= 0 goto E3

L3:	// n > 0
	ADDQ 0(R8)(SI*8), CX	// CX = x[i] + c; CF = carry out
	MOVQ CX, 0(R10)(SI*8)	// z[i] = x[i] + c
	ZERO_CX			// CX = 0 without touching CF
	RCLQ $1, CX		// c = CF

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L3			// if n > 0 goto L3

E3:	MOVQ CX, c+56(FP)	// return c
	RET


// func subVW(z, x []Word, y Word) (c Word)
// (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
//
// subVW subtracts the single word y from the vector x, storing the result
// in z for 0 <= i < len(z), and returns the final borrow c.
// If len(z) <= 0 nothing is stored and y itself is returned as c.
// Unlike addVW, the tail loop loads x[i] into R11 first, since the
// subtraction must be x[i] - c rather than c - x[i].
TEXT ·subVW(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), CX	// c = y
	MOVQ z+0(FP), R10

	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V4			// if n < 0 (fewer than 4 words left) goto V4

U4:	// n >= 0
	// regular loop body unrolled 4x
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	SUBQ CX, R11		// subtract incoming borrow word; CF = borrow out
	ZERO_CX			// CX = 0 without touching CF
	SBBQ $0, R12		// ripple the borrow through the next words
	SBBQ $0, R13
	SBBQ $0, R14
	SETCS CX		// c = CF (writes only the low byte; the rest of CX was zeroed above)
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U4			// if n >= 0 goto U4

V4:	ADDQ $4, DI		// n += 4 (undo the bias; 0..3 words remain)
	JLE E4			// if n <= 0 goto E4

L4:	// n > 0
	MOVQ 0(R8)(SI*8), R11
	SUBQ CX, R11		// R11 = x[i] - c; CF = borrow out
	MOVQ R11, 0(R10)(SI*8)	// z[i] = x[i] - c
	ZERO_CX			// CX = 0 without touching CF
	RCLQ $1, CX		// c = CF

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L4			// if n > 0 goto L4

E4:	MOVQ CX, c+56(FP)	// return c
	RET


// func shlVU(z, x []Word, s uint) (c Word)
//
// shlVU shifts the vector x left by s bits into z, walking from the
// highest index down to 0, and returns in c the bits shifted out of
// x[len(z)-1]. For len(z) <= 0 it stores nothing and returns 0.
// In the comments below, ŝ is the complementary shift count (64 - s).
// NOTE(review): s is used directly as the CX shift count and is not range
// checked here; presumably callers keep s below the word size - confirm.
TEXT ·shlVU(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), BX	// i = len(z)
	SUBQ $1, BX		// i--
	JL X8b			// i < 0	(n <= 0)

	// n > 0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ s+48(FP), CX
	MOVQ (R8)(BX*8), AX	// w1 = x[n-1]
	MOVQ $0, DX
	SHLQ CX, DX:AX		// w1>>ŝ
	MOVQ DX, c+56(FP)	// c = bits shifted out of the top word

	CMPQ BX, $0
	JLE X8a			// i <= 0

	// i > 0
L8:	MOVQ AX, DX		// w = w1
	MOVQ -8(R8)(BX*8), AX	// w1 = x[i-1]
	SHLQ CX, DX:AX		// w<<s | w1>>ŝ
	MOVQ DX, (R10)(BX*8)	// z[i] = w<<s | w1>>ŝ
	SUBQ $1, BX		// i--
	JG L8			// i > 0

	// i <= 0
X8a:	SHLQ CX, AX		// w1<<s
	MOVQ AX, (R10)		// z[0] = w1<<s
	RET

X8b:	MOVQ $0, c+56(FP)
	RET


// func shrVU(z, x []Word, s uint) (c Word)
//
// shrVU shifts the vector x right by s bits into z, walking from index 0
// upward, and returns in c the bits shifted out of x[0] (positioned at
// the top of the word). For len(z) <= 0 it stores nothing and returns 0.
// In the comments below, ŝ is the complementary shift count (64 - s).
// NOTE(review): s is used directly as the CX shift count and is not range
// checked here; presumably callers keep s below the word size - confirm.
TEXT ·shrVU(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), R11
	SUBQ $1, R11		// n--
	JL X9b			// n < 0	(n <= 0)

	// n > 0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ s+48(FP), CX
	MOVQ (R8), AX		// w1 = x[0]
	MOVQ $0, DX
	SHRQ CX, DX:AX		// w1<<ŝ
	MOVQ DX, c+56(FP)	// c = bits shifted out of the bottom word

	MOVQ $0, BX		// i = 0
	JMP E9

	// i < n-1
L9:	MOVQ AX, DX		// w = w1
	MOVQ 8(R8)(BX*8), AX	// w1 = x[i+1]
	SHRQ CX, DX:AX		// w>>s | w1<<ŝ
	MOVQ DX, (R10)(BX*8)	// z[i] = w>>s | w1<<ŝ
	ADDQ $1, BX		// i++

E9:	CMPQ BX, R11		// R11 holds n-1 here
	JL L9			// i < n-1

	// i >= n-1
X9a:	SHRQ CX, AX		// w1>>s
	MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
	RET

X9b:	MOVQ $0, c+56(FP)
	RET


// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// mulAddVWW computes x[i]*y + c for 0 <= i < len(z), starting with c = r;
// the low word of each result is stored in z[i] and the high word becomes
// the carry into the next element. The final carry is returned.
//
// Registers: R10 = &z[0], R8 = &x[0], R9 = y, CX = c, R11 = n, BX = i.
TEXT ·mulAddVWW(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ r+56(FP), CX	// c = r
	MOVQ z_len+8(FP), R11
	MOVQ $0, BX		// i = 0
	JMP E5

L5:	MOVQ (R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ CX, AX		// add carry-in to the low word
	ADCQ $0, DX		// propagate into the high word
	MOVQ AX, (R10)(BX*8)	// z[i] = low word
	MOVQ DX, CX		// c = high word
	ADDQ $1, BX		// i++

E5:	CMPQ BX, R11		// i < n
	JL L5

	MOVQ CX, c+64(FP)
	RET


// func addMulVVW(z, x []Word, y Word) (c Word)
//
// addMulVVW adds x[i]*y (plus the running carry, initially 0) into z[i]
// for 0 <= i < len(z) and returns the final carry.
//
// Registers: R10 = &z[0], R8 = &x[0], R9 = y, CX = c, R11 = n, BX = i.
TEXT ·addMulVVW(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z_len+8(FP), R11
	MOVQ $0, BX		// i = 0
	MOVQ $0, CX		// c = 0
	JMP E6

L6:	MOVQ (R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ CX, AX		// add carry-in to the low word
	ADCQ $0, DX		// propagate into the high word
	ADDQ AX, (R10)(BX*8)	// z[i] += low word
	ADCQ $0, DX		// carry out of z[i] goes into the high word
	MOVQ DX, CX		// c = high word
	ADDQ $1, BX		// i++

E6:	CMPQ BX, R11		// i < n
	JL L6

	MOVQ CX, c+56(FP)
	RET


// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
//
// divWVW divides the vector x, with xn as the incoming high word
// (initial remainder), by the single word y. It walks from index
// len(z)-1 down to 0, storing each quotient word in z[i] and feeding the
// remainder into the next step; the final remainder is returned.
// NOTE(review): as with divWW, DIVQ raises a divide error if y is zero or
// a quotient does not fit in one word, so callers presumably guarantee
// xn < y - confirm.
TEXT ·divWVW(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10
	MOVQ xn+24(FP), DX	// r = xn
	MOVQ x+32(FP), R8
	MOVQ y+56(FP), R9
	MOVQ z_len+8(FP), BX	// i = len(z)
	JMP E7

L7:	MOVQ (R8)(BX*8), AX
	DIVQ R9			// AX = r:x[i] / y, DX = r:x[i] % y
	MOVQ AX, (R10)(BX*8)	// z[i] = quotient word

E7:	SUBQ $1, BX		// i--
	JGE L7			// i >= 0

	MOVQ DX, r+64(FP)
	RET

// func bitLen(x Word) (n int)
//
// bitLen returns the number of bits needed to represent x:
// the index of the highest set bit plus one, or 0 when x == 0.
TEXT ·bitLen(SB),NOSPLIT,$0
	BSRQ x+0(FP), AX	// AX = index of highest set bit; ZF set if x == 0
	JZ Z1
	ADDQ $1, AX
	MOVQ AX, n+8(FP)
	RET

Z1:	MOVQ $0, n+8(FP)
	RET