// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !math_big_pure_go,ppc64 !math_big_pure_go,ppc64le

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
//
// Conventions used throughout this file:
//   - R0 is used as a zero source operand (e.g. "MOVD R0, R4" clears R4).
//   - The carry between words is kept in the CA bit. Length counters are
//     only updated with ADD $imm and tested with CMP so that the CA chain
//     started by ADDC/SUBC is carried through the ADDE/SUBE/ADDZE that follow.
//   - Each vector routine peels the first element, then runs a 4-way
//     unrolled loop counted by CTR, then handles up to 3 leftover elements.
//   - MOVDU is the "with update" form: besides the load/store it advances
//     the base register by the displacement, which is how the pointers move.

// func mulWW(x, y Word) (z1, z0 Word)
// Returns the double-word product x*y: z1 is the high-order word,
// z0 the low-order word.
TEXT ·mulWW(SB), NOSPLIT, $0
	MOVD   x+0(FP), R4
	MOVD   y+8(FP), R5
	MULHDU R4, R5, R6     // R6 = High-order(x*y)
	MULLD  R4, R5, R7     // R7 = Low-order(x*y)
	MOVD   R6, z1+16(FP)
	MOVD   R7, z0+24(FP)
	RET

// func addVV(z, x, y []Word) (c Word)
// z[i] = x[i] + y[i] for all i, carrying
// The number of elements processed is len(z); c is the final carry
// (0 when len(z) is 0).
TEXT ·addVV(SB), NOSPLIT, $0
	MOVD  z_len+8(FP), R7   // R7 = z_len
	MOVD  x+24(FP), R8      // R8 = x[]
	MOVD  y+48(FP), R9      // R9 = y[]
	MOVD  z+0(FP), R10      // R10 = z[]

	// If z_len = 0, we are done
	CMP   R0, R7
	MOVD  R0, R4            // R4 = c = 0
	BEQ   done

	// Process the first iteration out of the loop so we can
	// use MOVDU and avoid 3 index registers updates.
	MOVD  0(R8), R11        // R11 = x[0]
	MOVD  0(R9), R12        // R12 = y[0]
	ADD   $-1, R7           // R7 = z_len - 1 (elements still to do)
	ADDC  R12, R11, R15     // R15 = x[0] + y[0], set CA
	CMP   R0, R7
	MOVD  R15, 0(R10)       // z[0]
	BEQ   final             // If z_len was 1, we are done

	SRD   $2, R7, R5        // R5 = (z_len-1)/4 = number of 4-word groups
	CMP   R0, R5
	MOVD  R5, CTR           // Set up loop counter
	BEQ   tail              // If R5 = 0, we can't use the loop

	// Process 4 elements per iteration. Unrolling this loop
	// means a performance trade-off: we will lose performance
	// for small values of z_len (0.90x in the worst case), but
	// gain significant performance as z_len increases (up to
	// 1.45x).
loop:
	MOVD  8(R8), R11        // R11 = x[i]
	MOVD  16(R8), R12       // R12 = x[i+1]
	MOVD  24(R8), R14       // R14 = x[i+2]
	MOVDU 32(R8), R15       // R15 = x[i+3], R8 += 32
	MOVD  8(R9), R16        // R16 = y[i]
	MOVD  16(R9), R17       // R17 = y[i+1]
	MOVD  24(R9), R18       // R18 = y[i+2]
	MOVDU 32(R9), R19       // R19 = y[i+3], R9 += 32
	ADDE  R11, R16, R20     // R20 = x[i] + y[i] + CA
	ADDE  R12, R17, R21     // R21 = x[i+1] + y[i+1] + CA
	ADDE  R14, R18, R22     // R22 = x[i+2] + y[i+2] + CA
	ADDE  R15, R19, R23     // R23 = x[i+3] + y[i+3] + CA
	MOVD  R20, 8(R10)       // z[i]
	MOVD  R21, 16(R10)      // z[i+1]
	MOVD  R22, 24(R10)      // z[i+2]
	MOVDU R23, 32(R10)      // z[i+3], R10 += 32
	ADD   $-4, R7           // R7 -= 4 (elements still to do)
	BC    16, 0, loop       // bdnz

	// We may have more elements to read
	CMP   R0, R7
	BEQ   final

	// Process the remaining elements, one at a time
	// (at most 3 are left at this point).
tail:
	MOVDU 8(R8), R11        // R11 = x[i]
	MOVDU 8(R9), R16        // R16 = y[i]
	ADD   $-1, R7           // R7 -= 1
	ADDE  R11, R16, R20     // R20 = x[i] + y[i] + CA
	CMP   R0, R7
	MOVDU R20, 8(R10)       // z[i]
	BEQ   final             // If R7 = 0, we are done

	MOVDU 8(R8), R11
	MOVDU 8(R9), R16
	ADD   $-1, R7
	ADDE  R11, R16, R20
	CMP   R0, R7
	MOVDU R20, 8(R10)
	BEQ   final

	// Last possible element: no pointer update or count needed.
	MOVD  8(R8), R11
	MOVD  8(R9), R16
	ADDE  R11, R16, R20
	MOVD  R20, 8(R10)

final:
	ADDZE R4                // Capture CA: R4 = 0 + CA

done:
	MOVD  R4, c+72(FP)
	RET

// func subVV(z, x, y []Word) (c Word)
// z[i] = x[i] - y[i] for all i, propagating the borrow
// The number of elements processed is len(z); c is the final borrow
// (0 when len(z) is 0).
//
// CA holds the inverted borrow here: the result is formed as CA XOR 1
// at the "final" label below.
TEXT ·subVV(SB), NOSPLIT, $0
	MOVD  z_len+8(FP), R7   // R7 = z_len
	MOVD  x+24(FP), R8      // R8 = x[]
	MOVD  y+48(FP), R9      // R9 = y[]
	MOVD  z+0(FP), R10      // R10 = z[]

	// If z_len = 0, we are done
	CMP   R0, R7
	MOVD  R0, R4            // R4 = c = 0
	BEQ   done

	// Process the first iteration out of the loop so we can
	// use MOVDU and avoid 3 index registers updates.
	MOVD  0(R8), R11        // R11 = x[0]
	MOVD  0(R9), R12        // R12 = y[0]
	ADD   $-1, R7           // R7 = z_len - 1 (elements still to do)
	SUBC  R12, R11, R15     // R15 = x[0] - y[0], set CA
	CMP   R0, R7
	MOVD  R15, 0(R10)       // z[0]
	BEQ   final             // If z_len was 1, we are done

	SRD   $2, R7, R5        // R5 = (z_len-1)/4 = number of 4-word groups
	CMP   R0, R5
	MOVD  R5, CTR           // Set up loop counter
	BEQ   tail              // If R5 = 0, we can't use the loop

	// Process 4 elements per iteration. Unrolling this loop
	// means a performance trade-off: we will lose performance
	// for small values of z_len (0.92x in the worst case), but
	// gain significant performance as z_len increases (up to
	// 1.45x).
loop:
	MOVD  8(R8), R11        // R11 = x[i]
	MOVD  16(R8), R12       // R12 = x[i+1]
	MOVD  24(R8), R14       // R14 = x[i+2]
	MOVDU 32(R8), R15       // R15 = x[i+3], R8 += 32
	MOVD  8(R9), R16        // R16 = y[i]
	MOVD  16(R9), R17       // R17 = y[i+1]
	MOVD  24(R9), R18       // R18 = y[i+2]
	MOVDU 32(R9), R19       // R19 = y[i+3], R9 += 32
	SUBE  R16, R11, R20     // R20 = x[i] - y[i] - borrow (borrow = CA^1)
	SUBE  R17, R12, R21     // R21 = x[i+1] - y[i+1] - borrow
	SUBE  R18, R14, R22     // R22 = x[i+2] - y[i+2] - borrow
	SUBE  R19, R15, R23     // R23 = x[i+3] - y[i+3] - borrow
	MOVD  R20, 8(R10)       // z[i]
	MOVD  R21, 16(R10)      // z[i+1]
	MOVD  R22, 24(R10)      // z[i+2]
	MOVDU R23, 32(R10)      // z[i+3], R10 += 32
	ADD   $-4, R7           // R7 -= 4 (elements still to do)
	BC    16, 0, loop       // bdnz

	// We may have more elements to read
	CMP   R0, R7
	BEQ   final

	// Process the remaining elements, one at a time
	// (at most 3 are left at this point).
tail:
	MOVDU 8(R8), R11        // R11 = x[i]
	MOVDU 8(R9), R16        // R16 = y[i]
	ADD   $-1, R7           // R7 -= 1
	SUBE  R16, R11, R20     // R20 = x[i] - y[i] - borrow
	CMP   R0, R7
	MOVDU R20, 8(R10)       // z[i]
	BEQ   final             // If R7 = 0, we are done

	MOVDU 8(R8), R11
	MOVDU 8(R9), R16
	ADD   $-1, R7
	SUBE  R16, R11, R20
	CMP   R0, R7
	MOVDU R20, 8(R10)
	BEQ   final

	// Last possible element: no pointer update or count needed.
	MOVD  8(R8), R11
	MOVD  8(R9), R16
	SUBE  R16, R11, R20
	MOVD  R20, 8(R10)

final:
	ADDZE R4                // R4 = 0 + CA
	XOR   $1, R4            // c = CA ^ 1 (convert CA into a borrow)

done:
	MOVD  R4, c+72(FP)
	RET

// func addVW(z, x []Word, y Word) (c Word)
// z[i] = x[i] + carry for all i, where the carry into element 0 is y.
// The number of elements processed is len(z); c is the final carry.
// When len(z) is 0 the result c is y itself.
TEXT ·addVW(SB), NOSPLIT, $0
	MOVD  z+0(FP), R10      // R10 = z[]
	MOVD  x+24(FP), R8      // R8 = x[]
	MOVD  y+48(FP), R4      // R4 = y = c
	MOVD  z_len+8(FP), R11  // R11 = z_len

	CMP   R0, R11           // If z_len is zero, return
	BEQ   done

	// We will process the first iteration out of the loop so we capture
	// the value of c. In the subsequent iterations, we will rely on the
	// value of CA set here.
	MOVD  0(R8), R20        // R20 = x[0]
	ADD   $-1, R11          // R11 = z_len - 1 (elements still to do)
	ADDC  R20, R4, R6       // R6 = x[0] + c, set CA
	CMP   R0, R11           // If z_len was 1, we are done
	MOVD  R6, 0(R10)        // z[0]
	BEQ   final

	// We will read 4 elements per iteration
	SRD   $2, R11, R9       // R9 = (z_len-1)/4 = number of 4-word groups
	DCBT  (R8)              // cache-touch hint for the block at x
	CMP   R0, R9
	MOVD  R9, CTR           // Set up the loop counter
	BEQ   tail              // If R9 = 0, we can't use the loop

loop:
	MOVD  8(R8), R20        // R20 = x[i]
	MOVD  16(R8), R21       // R21 = x[i+1]
	MOVD  24(R8), R22       // R22 = x[i+2]
	MOVDU 32(R8), R23       // R23 = x[i+3], R8 += 32
	ADDZE R20, R24          // R24 = x[i] + CA
	ADDZE R21, R25          // R25 = x[i+1] + CA
	ADDZE R22, R26          // R26 = x[i+2] + CA
	ADDZE R23, R27          // R27 = x[i+3] + CA
	MOVD  R24, 8(R10)       // z[i]
	MOVD  R25, 16(R10)      // z[i+1]
	MOVD  R26, 24(R10)      // z[i+2]
	MOVDU R27, 32(R10)      // z[i+3], R10 += 32
	ADD   $-4, R11          // R11 -= 4 (elements still to do)
	BC    16, 0, loop       // bdnz

	// We may have some elements to read
	CMP   R0, R11
	BEQ   final

	// Process the remaining elements, one at a time
	// (at most 3 are left at this point).
tail:
	MOVDU 8(R8), R20        // R20 = x[i]
	ADDZE R20, R24          // R24 = x[i] + CA
	ADD   $-1, R11          // R11 -= 1
	MOVDU R24, 8(R10)       // z[i]
	CMP   R0, R11
	BEQ   final

	MOVDU 8(R8), R20
	ADDZE R20, R24
	ADD   $-1, R11
	MOVDU R24, 8(R10)
	CMP   R0, R11
	BEQ   final

	// Last possible element: no pointer update or count needed.
	MOVD  8(R8), R20
	ADDZE R20, R24
	MOVD  R24, 8(R10)

final:
	ADDZE R0, R4            // c = 0 + CA

done:
	MOVD  R4, c+56(FP)
	RET

// func subVW(z, x []Word, y Word) (c Word)
// z[i] = x[i] - borrow for all i, where the borrow into element 0 is y.
// The number of elements processed is len(z); c is the final borrow.
// When len(z) is 0 the result c is y itself.
TEXT ·subVW(SB), NOSPLIT, $0
	MOVD  z+0(FP), R10      // R10 = z[]
	MOVD  x+24(FP), R8      // R8 = x[]
	MOVD  y+48(FP), R4      // R4 = y = c
	MOVD  z_len+8(FP), R11  // R11 = z_len

	CMP   R0, R11           // If z_len is zero, return
	BEQ   done

	// We will process the first iteration out of the loop so we capture
	// the value of c. In the subsequent iterations, we will rely on the
	// value of CA set here.
	MOVD  0(R8), R20        // R20 = x[0]
	ADD   $-1, R11          // R11 = z_len - 1 (elements still to do)
	SUBC  R4, R20, R6       // R6 = x[0] - c, set CA
	CMP   R0, R11           // If z_len was 1, we are done
	MOVD  R6, 0(R10)        // z[0]
	BEQ   final

	// We will read 4 elements per iteration
	SRD   $2, R11, R9       // R9 = (z_len-1)/4 = number of 4-word groups
	DCBT  (R8)              // cache-touch hint for the block at x
	CMP   R0, R9
	MOVD  R9, CTR           // Set up the loop counter
	BEQ   tail              // If R9 = 0, we can't use the loop

	// The loop here is almost the same as the one used in s390x, but
	// we don't need to capture CA every iteration because we've already
	// done that above.
loop:
	MOVD  8(R8), R20        // R20 = x[i]
	MOVD  16(R8), R21       // R21 = x[i+1]
	MOVD  24(R8), R22       // R22 = x[i+2]
	MOVDU 32(R8), R23       // R23 = x[i+3], R8 += 32
	SUBE  R0, R20           // R20 = x[i] - 0 - borrow (borrow = CA^1)
	SUBE  R0, R21           // R21 = x[i+1] - 0 - borrow
	SUBE  R0, R22           // R22 = x[i+2] - 0 - borrow
	SUBE  R0, R23           // R23 = x[i+3] - 0 - borrow
	MOVD  R20, 8(R10)       // z[i]
	MOVD  R21, 16(R10)      // z[i+1]
	MOVD  R22, 24(R10)      // z[i+2]
	MOVDU R23, 32(R10)      // z[i+3], R10 += 32
	ADD   $-4, R11          // R11 -= 4 (elements still to do)
	BC    16, 0, loop       // bdnz

	// We may have some elements to read
	CMP   R0, R11
	BEQ   final

	// Process the remaining elements, one at a time
	// (at most 3 are left at this point).
tail:
	MOVDU 8(R8), R20        // R20 = x[i]
	SUBE  R0, R20           // R20 = x[i] - 0 - borrow
	ADD   $-1, R11          // R11 -= 1
	MOVDU R20, 8(R10)       // z[i]
	CMP   R0, R11
	BEQ   final

	MOVDU 8(R8), R20
	SUBE  R0, R20
	ADD   $-1, R11
	MOVDU R20, 8(R10)
	CMP   R0, R11
	BEQ   final

	// Last possible element: no pointer update or count needed.
	MOVD  8(R8), R20
	SUBE  R0, R20
	MOVD  R20, 8(R10)

final:
	// Capture CA
	SUBE  R4, R4            // R4 = R4 - R4 - borrow = 0 or -1
	NEG   R4, R4            // c = 0 or 1

done:
	MOVD  R4, c+56(FP)
	RET

// shlVU has no assembly implementation in this file; it branches
// straight to shlVU_g, which is defined elsewhere (presumably the
// generic Go version in arith.go - confirm).
TEXT ·shlVU(SB), NOSPLIT, $0
	BR ·shlVU_g(SB)

// shrVU has no assembly implementation in this file; it branches
// straight to shrVU_g, which is defined elsewhere (presumably the
// generic Go version in arith.go - confirm).
TEXT ·shrVU(SB), NOSPLIT, $0
	BR ·shrVU_g(SB)

// func mulAddVWW(z, x []Word, y, r Word) (c Word)
// For each i, the double-word value x[i]*y + c is computed; its low-order
// word is stored in z[i] and its high-order word becomes the next c.
// The initial c is r. The number of elements processed is len(z).
// When len(z) is 0 the result c is r itself.
TEXT ·mulAddVWW(SB), NOSPLIT, $0
	MOVD   z+0(FP), R10      // R10 = z[]
	MOVD   x+24(FP), R8      // R8 = x[]
	MOVD   y+48(FP), R9      // R9 = y
	MOVD   r+56(FP), R4      // R4 = r = c
	MOVD   z_len+8(FP), R11  // R11 = z_len

	CMP    R0, R11           // If z_len is zero, return
	BEQ    done

	// First element is handled outside the loop so that the loop and
	// tail can use the update forms with an 8-byte pre-increment.
	MOVD   0(R8), R20        // R20 = x[0]
	ADD    $-1, R11          // R11 = z_len - 1 (elements still to do)
	MULLD  R9, R20, R6       // R6 = z0 = Low-order(x[i]*y)
	MULHDU R9, R20, R7       // R7 = z1 = High-order(x[i]*y)
	ADDC   R4, R6            // R6 = z0 + r
	ADDZE  R7                // R7 = z1 + CA
	CMP    R0, R11
	MOVD   R7, R4            // R4 = c
	MOVD   R6, 0(R10)        // z[i]
	BEQ    done              // If z_len was 1, we are done

	// We will read 4 elements per iteration
	SRD    $2, R11, R14      // R14 = (z_len-1)/4 = number of 4-word groups
	DCBT   (R8)              // cache-touch hint for the block at x
	CMP    R0, R14
	MOVD   R14, CTR          // Set up the loop counter
	BEQ    tail              // If R14 = 0, we can't use the loop

loop:
	MOVD   8(R8), R20        // R20 = x[i]
	MOVD   16(R8), R21       // R21 = x[i+1]
	MOVD   24(R8), R22       // R22 = x[i+2]
	MOVDU  32(R8), R23       // R23 = x[i+3], R8 += 32
	MULLD  R9, R20, R24      // R24 = z0[i]
	MULHDU R9, R20, R20      // R20 = z1[i]
	ADDC   R4, R24           // R24 = z0[i] + c
	ADDZE  R20               // R20 = z1[i] + CA
	MULLD  R9, R21, R25      // R25 = z0[i+1]
	MULHDU R9, R21, R21      // R21 = z1[i+1]
	ADDC   R20, R25          // R25 = z0[i+1] + carry word from element i
	ADDZE  R21               // R21 = z1[i+1] + CA
	MULLD  R9, R22, R26      // R26 = z0[i+2]
	MULHDU R9, R22, R22      // R22 = z1[i+2]
	MULLD  R9, R23, R27      // R27 = z0[i+3]
	MULHDU R9, R23, R23      // R23 = z1[i+3]
	ADDC   R21, R26          // R26 = z0[i+2] + carry word from element i+1
	ADDZE  R22               // R22 = z1[i+2] + CA
	MOVD   R24, 8(R10)       // z[i]
	MOVD   R25, 16(R10)      // z[i+1]
	ADDC   R22, R27          // R27 = z0[i+3] + carry word from element i+2
	ADDZE  R23, R4           // update carry: R4 = z1[i+3] + CA
	MOVD   R26, 24(R10)      // z[i+2]
	MOVDU  R27, 32(R10)      // z[i+3], R10 += 32
	ADD    $-4, R11          // R11 -= 4 (elements still to do)
	BC     16, 0, loop       // bdnz

	// We may have some elements to read
	CMP    R0, R11
	BEQ    done

	// Process the remaining elements, one at a time
	// (at most 3 are left at this point).
tail:
	MOVDU  8(R8), R20        // R20 = x[i]
	MULLD  R9, R20, R24      // R24 = z0[i]
	MULHDU R9, R20, R25      // R25 = z1[i]
	ADD    $-1, R11          // R11 -= 1
	ADDC   R4, R24           // R24 = z0[i] + c
	ADDZE  R25               // R25 = z1[i] + CA
	MOVDU  R24, 8(R10)       // z[i]
	CMP    R0, R11
	MOVD   R25, R4           // R4 = c
	BEQ    done              // If R11 = 0, we are done

	MOVDU  8(R8), R20
	MULLD  R9, R20, R24
	MULHDU R9, R20, R25
	ADD    $-1, R11
	ADDC   R4, R24
	ADDZE  R25
	MOVDU  R24, 8(R10)
	CMP    R0, R11
	MOVD   R25, R4
	BEQ    done

	// Last possible element: no pointer update needed.
	MOVD   8(R8), R20
	MULLD  R9, R20, R24
	MULHDU R9, R20, R25
	ADD    $-1, R11          // R11 is not read again after this point
	ADDC   R4, R24
	ADDZE  R25
	MOVD   R24, 8(R10)
	MOVD   R25, R4

done:
	MOVD   R4, c+64(FP)
	RET

// func addMulVVW(z, x []Word, y Word) (c Word)
// For each i, the double-word value z[i] + x[i]*y + c is computed; its
// low-order word is stored back in z[i] and its high-order word becomes
// the next c. The initial c is 0. The number of elements processed is
// len(z). Unlike the routines above this one is a plain indexed loop
// with no unrolling.
TEXT ·addMulVVW(SB), NOSPLIT, $0
	MOVD   z+0(FP), R10      // R10 = z[]
	MOVD   x+24(FP), R8      // R8 = x[]
	MOVD   y+48(FP), R9      // R9 = y
	MOVD   z_len+8(FP), R22  // R22 = z_len

	MOVD   R0, R3            // R3 will be the index register (byte offset)
	CMP    R0, R22
	MOVD   R0, R4            // R4 = c = 0
	MOVD   R22, CTR          // Initialize loop counter
	BEQ    done              // If z_len is zero, return c = 0

loop:
	MOVD   (R8)(R3), R20     // Load x[i]
	MOVD   (R10)(R3), R21    // Load z[i]
	MULLD  R9, R20, R6       // R6 = Low-order(x[i]*y)
	MULHDU R9, R20, R7       // R7 = High-order(x[i]*y)
	ADDC   R21, R6           // R6 = z0
	ADDZE  R7                // R7 = z1
	ADDC   R4, R6            // R6 = z0 + c + 0
	ADDZE  R7, R4            // c = z1 + CA
	MOVD   R6, (R10)(R3)     // Store z[i]
	ADD    $8, R3            // Advance the byte offset by one word
	BC     16, 0, loop       // bdnz

done:
	MOVD   R4, c+56(FP)
	RET