// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !math_big_pure_go && (ppc64 || ppc64le)

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
//
// Conventions used throughout:
//   - R0 is used as the constant zero.
//   - Slice arguments occupy three words (pointer, length, capacity),
//     which is why x follows z at offset 24 and y follows x at offset 48.
//   - The carry between elements lives in the XER CA bit. None of the
//     instructions placed between two carry-consuming instructions
//     (loads, stores, ADD, CMP, branches) modify CA, so their order
//     relative to ADDC/ADDE/ADDZE/SUBC/SUBE must be preserved.

// func addVV(z, x, y []Word) (c Word)
// z[i] = x[i] + y[i] for all i, carrying
//
// Returns the carry out of the most significant word (0 or 1).
// Only z_len elements are processed; the lengths of x and y are not read.
TEXT ·addVV(SB), NOSPLIT, $0
	MOVD  z_len+8(FP), R7   // R7 = z_len
	MOVD  x+24(FP), R8      // R8 = x[]
	MOVD  y+48(FP), R9      // R9 = y[]
	MOVD  z+0(FP), R10      // R10 = z[]

	// If z_len = 0, we are done
	CMP   R0, R7
	MOVD  R0, R4            // R4 = c = 0
	BEQ   done

	// Process the first iteration out of the loop so we can
	// use MOVDU and avoid 3 index registers updates.
	MOVD  0(R8), R11        // R11 = x[i]
	MOVD  0(R9), R12        // R12 = y[i]
	ADD   $-1, R7           // R7 = z_len - 1 (elements remaining)
	ADDC  R12, R11, R15     // R15 = x[i] + y[i], set CA
	CMP   R0, R7
	MOVD  R15, 0(R10)       // z[i]
	BEQ   final             // If z_len was 1, we are done

	SRD   $2, R7, R5        // R5 = remaining/4 = (z_len-1)/4
	CMP   R0, R5
	MOVD  R5, CTR           // Set up loop counter
	BEQ   tail              // If R5 = 0, we can't use the loop

	// Process 4 elements per iteration. Unrolling this loop
	// means a performance trade-off: we will lose performance
	// for small values of z_len (0.90x in the worst case), but
	// gain significant performance as z_len increases (up to
	// 1.45x).

	PCALIGN $16
loop:
	MOVD  8(R8), R11        // R11 = x[i]
	MOVD  16(R8), R12       // R12 = x[i+1]
	MOVD  24(R8), R14       // R14 = x[i+2]
	MOVDU 32(R8), R15       // R15 = x[i+3], R8 += 32
	MOVD  8(R9), R16        // R16 = y[i]
	MOVD  16(R9), R17       // R17 = y[i+1]
	MOVD  24(R9), R18       // R18 = y[i+2]
	MOVDU 32(R9), R19       // R19 = y[i+3], R9 += 32
	ADDE  R11, R16, R20     // R20 = x[i] + y[i] + CA
	ADDE  R12, R17, R21     // R21 = x[i+1] + y[i+1] + CA
	ADDE  R14, R18, R22     // R22 = x[i+2] + y[i+2] + CA
	ADDE  R15, R19, R23     // R23 = x[i+3] + y[i+3] + CA
	MOVD  R20, 8(R10)       // z[i]
	MOVD  R21, 16(R10)      // z[i+1]
	MOVD  R22, 24(R10)      // z[i+2]
	MOVDU R23, 32(R10)      // z[i+3], R10 += 32
	ADD   $-4, R7           // R7 = remaining - 4
	BC    16, 0, loop       // bdnz

	// We may have more elements to read
	CMP   R0, R7
	BEQ   final

	// Process the remaining elements, one at a time.
	// At most three remain, so the tail is fully unrolled.
tail:
	MOVDU 8(R8), R11        // R11 = x[i]
	MOVDU 8(R9), R16        // R16 = y[i]
	ADD   $-1, R7           // R7 = remaining - 1
	ADDE  R11, R16, R20     // R20 = x[i] + y[i] + CA
	CMP   R0, R7
	MOVDU R20, 8(R10)       // z[i]
	BEQ   final             // If R7 = 0, we are done

	MOVDU 8(R8), R11
	MOVDU 8(R9), R16
	ADD   $-1, R7
	ADDE  R11, R16, R20
	CMP   R0, R7
	MOVDU R20, 8(R10)
	BEQ   final

	MOVD  8(R8), R11
	MOVD  8(R9), R16
	ADDE  R11, R16, R20
	MOVD  R20, 8(R10)

final:
	ADDZE R4                // Capture CA: R4 = 0 + CA

done:
	MOVD  R4, c+72(FP)
	RET

// func subVV(z, x, y []Word) (c Word)
// z[i] = x[i] - y[i] for all i, borrowing
//
// Returns the borrow out of the most significant word (0 or 1).
// CA holds the inverse of the borrow: CA = 1 means "no borrow".
TEXT ·subVV(SB), NOSPLIT, $0
	MOVD  z_len+8(FP), R7   // R7 = z_len
	MOVD  x+24(FP), R8      // R8 = x[]
	MOVD  y+48(FP), R9      // R9 = y[]
	MOVD  z+0(FP), R10      // R10 = z[]

	// If z_len = 0, we are done
	CMP   R0, R7
	MOVD  R0, R4            // R4 = c = 0
	BEQ   done

	// Process the first iteration out of the loop so we can
	// use MOVDU and avoid 3 index registers updates.
	MOVD  0(R8), R11        // R11 = x[i]
	MOVD  0(R9), R12        // R12 = y[i]
	ADD   $-1, R7           // R7 = z_len - 1 (elements remaining)
	SUBC  R12, R11, R15     // R15 = x[i] - y[i], set CA
	CMP   R0, R7
	MOVD  R15, 0(R10)       // z[i]
	BEQ   final             // If z_len was 1, we are done

	SRD   $2, R7, R5        // R5 = remaining/4 = (z_len-1)/4
	CMP   R0, R5
	MOVD  R5, CTR           // Set up loop counter
	BEQ   tail              // If R5 = 0, we can't use the loop

	// Process 4 elements per iteration. Unrolling this loop
	// means a performance trade-off: we will lose performance
	// for small values of z_len (0.92x in the worst case), but
	// gain significant performance as z_len increases (up to
	// 1.45x).

	PCALIGN $16
loop:
	MOVD  8(R8), R11        // R11 = x[i]
	MOVD  16(R8), R12       // R12 = x[i+1]
	MOVD  24(R8), R14       // R14 = x[i+2]
	MOVDU 32(R8), R15       // R15 = x[i+3], R8 += 32
	MOVD  8(R9), R16        // R16 = y[i]
	MOVD  16(R9), R17       // R17 = y[i+1]
	MOVD  24(R9), R18       // R18 = y[i+2]
	MOVDU 32(R9), R19       // R19 = y[i+3], R9 += 32
	SUBE  R16, R11, R20     // R20 = x[i] - y[i] - (1 - CA)
	SUBE  R17, R12, R21     // R21 = x[i+1] - y[i+1] - (1 - CA)
	SUBE  R18, R14, R22     // R22 = x[i+2] - y[i+2] - (1 - CA)
	SUBE  R19, R15, R23     // R23 = x[i+3] - y[i+3] - (1 - CA)
	MOVD  R20, 8(R10)       // z[i]
	MOVD  R21, 16(R10)      // z[i+1]
	MOVD  R22, 24(R10)      // z[i+2]
	MOVDU R23, 32(R10)      // z[i+3], R10 += 32
	ADD   $-4, R7           // R7 = remaining - 4
	BC    16, 0, loop       // bdnz

	// We may have more elements to read
	CMP   R0, R7
	BEQ   final

	// Process the remaining elements, one at a time.
	// At most three remain, so the tail is fully unrolled.
tail:
	MOVDU 8(R8), R11        // R11 = x[i]
	MOVDU 8(R9), R16        // R16 = y[i]
	ADD   $-1, R7           // R7 = remaining - 1
	SUBE  R16, R11, R20     // R20 = x[i] - y[i] - (1 - CA)
	CMP   R0, R7
	MOVDU R20, 8(R10)       // z[i]
	BEQ   final             // If R7 = 0, we are done

	MOVDU 8(R8), R11
	MOVDU 8(R9), R16
	ADD   $-1, R7
	SUBE  R16, R11, R20
	CMP   R0, R7
	MOVDU R20, 8(R10)
	BEQ   final

	MOVD  8(R8), R11
	MOVD  8(R9), R16
	SUBE  R16, R11, R20
	MOVD  R20, 8(R10)

final:
	ADDZE R4                // R4 = CA
	XOR   $1, R4            // c = borrow = 1 - CA

done:
	MOVD  R4, c+72(FP)
	RET

// func addVW(z, x []Word, y Word) (c Word)
// z = x + y, where y is a single word added at the least significant
// element and the carry is propagated through the rest of x.
//
// Returns the final carry (0 or 1). For z_len = 0 the routine
// returns y unchanged in c.
TEXT ·addVW(SB), NOSPLIT, $0
	MOVD z+0(FP), R10       // R10 = z[]
	MOVD x+24(FP), R8       // R8 = x[]
	MOVD y+48(FP), R4       // R4 = y = c
	MOVD z_len+8(FP), R11   // R11 = z_len

	CMP   R0, R11           // If z_len is zero, return
	BEQ   done

	// We will process the first iteration out of the loop so we capture
	// the value of c. In the subsequent iterations, we will rely on the
	// value of CA set here.
	MOVD  0(R8), R20        // R20 = x[i]
	ADD   $-1, R11          // R11 = z_len - 1 (elements remaining)
	ADDC  R20, R4, R6       // R6 = x[i] + c
	CMP   R0, R11           // If z_len was 1, we are done
	MOVD  R6, 0(R10)        // z[i]
	BEQ   final

	// We will read 4 elements per iteration
	SRD   $2, R11, R9       // R9 = remaining/4 = (z_len-1)/4
	DCBT  (R8)              // cache-touch hint for x
	CMP   R0, R9
	MOVD  R9, CTR           // Set up the loop counter
	BEQ   tail              // If R9 = 0, we can't use the loop
	PCALIGN $16

loop:
	MOVD  8(R8), R20        // R20 = x[i]
	MOVD  16(R8), R21       // R21 = x[i+1]
	MOVD  24(R8), R22       // R22 = x[i+2]
	MOVDU 32(R8), R23       // R23 = x[i+3], R8 += 32
	ADDZE R20, R24          // R24 = x[i] + CA
	ADDZE R21, R25          // R25 = x[i+1] + CA
	ADDZE R22, R26          // R26 = x[i+2] + CA
	ADDZE R23, R27          // R27 = x[i+3] + CA
	MOVD  R24, 8(R10)       // z[i]
	MOVD  R25, 16(R10)      // z[i+1]
	MOVD  R26, 24(R10)      // z[i+2]
	MOVDU R27, 32(R10)      // z[i+3], R10 += 32
	ADD   $-4, R11          // R11 = remaining - 4
	BC    16, 0, loop       // bdnz

	// We may have some elements to read
	CMP   R0, R11
	BEQ   final

	// At most three elements remain; handle them one at a time.
tail:
	MOVDU 8(R8), R20
	ADDZE R20, R24
	ADD   $-1, R11
	MOVDU R24, 8(R10)
	CMP   R0, R11
	BEQ   final

	MOVDU 8(R8), R20
	ADDZE R20, R24
	ADD   $-1, R11
	MOVDU R24, 8(R10)
	CMP   R0, R11
	BEQ   final

	MOVD  8(R8), R20
	ADDZE R20, R24
	MOVD  R24, 8(R10)

final:
	ADDZE R0, R4            // c = CA
done:
	MOVD  R4, c+56(FP)
	RET

// func subVW(z, x []Word, y Word) (c Word)
// z = x - y, where y is a single word subtracted at the least significant
// element and the borrow is propagated through the rest of x.
//
// Returns the final borrow (0 or 1). For z_len = 0 the routine
// returns y unchanged in c.
TEXT ·subVW(SB), NOSPLIT, $0
	MOVD  z+0(FP), R10      // R10 = z[]
	MOVD  x+24(FP), R8      // R8 = x[]
	MOVD  y+48(FP), R4      // R4 = y = c
	MOVD  z_len+8(FP), R11  // R11 = z_len

	CMP   R0, R11           // If z_len is zero, return
	BEQ   done

	// We will process the first iteration out of the loop so we capture
	// the value of c. In the subsequent iterations, we will rely on the
	// value of CA set here.
	MOVD  0(R8), R20        // R20 = x[i]
	ADD   $-1, R11          // R11 = z_len - 1 (elements remaining)
	SUBC  R4, R20, R6       // R6 = x[i] - c
	CMP   R0, R11           // If z_len was 1, we are done
	MOVD  R6, 0(R10)        // z[i]
	BEQ   final

	// We will read 4 elements per iteration
	SRD   $2, R11, R9       // R9 = remaining/4 = (z_len-1)/4
	DCBT  (R8)              // cache-touch hint for x
	CMP   R0, R9
	MOVD  R9, CTR           // Set up the loop counter
	BEQ   tail              // If R9 = 0, we can't use the loop

	// The loop here is almost the same as the one used in s390x, but
	// we don't need to capture CA every iteration because we've already
	// done that above.

	PCALIGN $16
loop:
	MOVD  8(R8), R20        // R20 = x[i]
	MOVD  16(R8), R21       // R21 = x[i+1]
	MOVD  24(R8), R22       // R22 = x[i+2]
	MOVDU 32(R8), R23       // R23 = x[i+3], R8 += 32
	SUBE  R0, R20           // R20 = x[i] - (1 - CA)
	SUBE  R0, R21           // R21 = x[i+1] - (1 - CA)
	SUBE  R0, R22           // R22 = x[i+2] - (1 - CA)
	SUBE  R0, R23           // R23 = x[i+3] - (1 - CA)
	MOVD  R20, 8(R10)       // z[i]
	MOVD  R21, 16(R10)      // z[i+1]
	MOVD  R22, 24(R10)      // z[i+2]
	MOVDU R23, 32(R10)      // z[i+3], R10 += 32
	ADD   $-4, R11          // R11 = remaining - 4
	BC    16, 0, loop       // bdnz

	// We may have some elements to read
	CMP   R0, R11
	BEQ   final

	// At most three elements remain; handle them one at a time.
tail:
	MOVDU 8(R8), R20
	SUBE  R0, R20
	ADD   $-1, R11
	MOVDU R20, 8(R10)
	CMP   R0, R11
	BEQ   final

	MOVDU 8(R8), R20
	SUBE  R0, R20
	ADD   $-1, R11
	MOVDU R20, 8(R10)
	CMP   R0, R11
	BEQ   final

	MOVD  8(R8), R20
	SUBE  R0, R20
	MOVD  R20, 8(R10)

final:
	// Capture CA
	SUBE  R4, R4            // R4 = R4 - R4 - (1 - CA) = CA - 1 (0 or -1)
	NEG   R4, R4            // c = borrow = 1 - CA

done:
	MOVD  R4, c+56(FP)
	RET

//func shlVU(z, x []Word, s uint) (c Word)
// z = x << s, processed from the most significant word down.
// Returns c = x[len(z)-1] >> (_W-s), the bits shifted out at the top.
// For s == 0 the routine copies min(len(x), len(z)) words and returns 0.
TEXT ·shlVU(SB), NOSPLIT, $0
	MOVD    z+0(FP), R3
	MOVD    x+24(FP), R6
	MOVD    s+48(FP), R9
	MOVD    z_len+8(FP), R4
	MOVD    x_len+32(FP), R7
	CMP     R9, R0          // s==0 copy(z,x)
	BEQ     zeroshift
	CMP     R4, R0          // len(z)==0 return
	BEQ     done

	ADD     $-1, R4, R5     // len(z)-1
	SUBC    R9, $64, R4     // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)
	SLD     $3, R5, R7
	ADD     R6, R7, R15     // save starting address &x[len(z)-1]
	ADD     R3, R7, R16     // save starting address &z[len(z)-1]
	MOVD    (R6)(R7), R14
	SRD     R4, R14, R7     // compute x[len(z)-1]>>ŝ into R7
	CMP     R5, R0          // iterate from i=len(z)-1 to 0
	BEQ     loopexit        // Already at end?
	MOVD    0(R15),R10      // x[i]
	PCALIGN $16
shloop:
	SLD     R9, R10, R10    // x[i]<<s
	MOVDU   -8(R15), R14    // R14 = x[i-1], R15 = &x[i-1]
	SRD     R4, R14, R11    // x[i-1]>>ŝ
	OR      R11, R10, R10
	MOVD    R10, 0(R16)     // z[i]=x[i]<<s | x[i-1]>>ŝ
	MOVD    R14, R10        // reuse x[i-1] for next iteration
	ADD     $-8, R16        // i--
	CMP     R15, R6         // &x[i-1]>&x[0]?
	BGT     shloop
loopexit:
	MOVD    0(R6), R4
	SLD     R9, R4, R4
	MOVD    R4, 0(R3)       // z[0]=x[0]<<s
	MOVD    R7, c+56(FP)    // store pre-computed x[len(z)-1]>>ŝ into c
	RET

zeroshift:
	CMP     R6, R0          // x is null, nothing to copy
	BEQ     done
	CMP     R6, R3          // if x is same as z, nothing to copy
	BEQ     done
	CMP     R7, R4
	ISEL    $0, R7, R4, R7  // Take the lower bound of lengths of x,z
	CMP     R7, R0          // Nothing to copy when that length is 0. Both copy
	BEQ     done            // loops below test at the bottom and would move one word.
	SLD     $3, R7, R7      // R7 = number of bytes to copy
	SUB     R6, R3, R11     // dest - src
	CMPU    R11, R7, CR2    // < len?
	BLT     CR2, backward   // there is overlap, copy backwards
	MOVD    $0, R14
	// shlVU processes backwards, but added a forward copy option
	// since it's faster on POWER
repeat:
	MOVD    (R6)(R14), R15  // Copy 8 bytes at a time
	MOVD    R15, (R3)(R14)
	ADD     $8, R14
	CMP     R14, R7         // More 8 bytes left?
	BLT     repeat
	BR      done
backward:
	ADD     $-8,R7, R14     // R14 = byte offset of the last word
repeatback:
	MOVD    (R6)(R14), R15  // copy x into z backwards
	MOVD    R15, (R3)(R14)  // copy 8 bytes at a time
	SUB     $8, R14
	CMP     R14, $-8        // More 8 bytes left?
	BGT     repeatback

done:
	MOVD    R0, c+56(FP)    // c=0
	RET

//func shrVU(z, x []Word, s uint) (c Word)
// z = x >> s, processed from the least significant word up.
// Returns c = x[0] << (_W-s), the bits shifted out at the bottom.
// For s == 0 the routine copies min(len(x), len(z)) words and returns 0.
TEXT ·shrVU(SB), NOSPLIT, $0
	MOVD    z+0(FP), R3
	MOVD    x+24(FP), R6
	MOVD    s+48(FP), R9
	MOVD    z_len+8(FP), R4
	MOVD    x_len+32(FP), R7

	CMP     R9, R0          // s==0, copy(z,x)
	BEQ     zeroshift
	CMP     R4, R0          // len(z)==0 return
	BEQ     done
	SUBC    R9, $64, R5     // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)

	MOVD    0(R6), R7
	SLD     R5, R7, R7      // compute x[0]<<ŝ
	MOVD    $1, R8          // iterate from i=1 to i<len(z)
	CMP     R8, R4
	BGE     loopexit        // Already at end?

	// vectorize if len(z) is >=3, else jump to scalar loop
	CMP     R4, $3
	BLT     scalar
	MTVSRD  R9, VS38        // s
	VSPLTB  $7, V6, V4      // splat the shift count s across V4
	MTVSRD  R5, VS39        // ŝ
	VSPLTB  $7, V7, V2      // splat the shift count ŝ across V2
	ADD     $-2, R4, R16    // R16 = len(z)-2
	PCALIGN $16
loopback:
	ADD     $-1, R8, R10
	SLD     $3, R10         // R10 = byte offset of element i-1
	LXVD2X  (R6)(R10), VS32 // load x[i-1], x[i]
	SLD     $3, R8, R12     // R12 = byte offset of element i
	LXVD2X  (R6)(R12), VS33 // load x[i], x[i+1]

	VSRD    V0, V4, V3      // x[i-1]>>s, x[i]>>s
	VSLD    V1, V2, V5      // x[i]<<ŝ, x[i+1]<<ŝ
	VOR     V3, V5, V5      // Or(|) the two registers together
	STXVD2X VS37, (R3)(R10) // store into z[i-1] and z[i]
	ADD     $2, R8          // Done processing 2 entries, i and i+1
	CMP     R8, R16         // Are there at least a couple of more entries left?
	BLE     loopback
	CMP     R8, R4          // Are we at the last element?
	BEQ     loopexit
scalar:
	ADD     $-1, R8, R10
	SLD     $3, R10
	MOVD    (R6)(R10),R11
	SRD     R9, R11, R11    // x[len(z)-2] >> s
	SLD     $3, R8, R12
	MOVD    (R6)(R12), R12
	SLD     R5, R12, R12    // x[len(z)-1]<<ŝ
	OR      R12, R11, R11   // x[len(z)-2]>>s | x[len(z)-1]<<ŝ
	MOVD    R11, (R3)(R10)  // z[len(z)-2]=x[len(z)-2]>>s | x[len(z)-1]<<ŝ
loopexit:
	ADD     $-1, R4
	SLD     $3, R4
	MOVD    (R6)(R4), R5
	SRD     R9, R5, R5      // x[len(z)-1]>>s
	MOVD    R5, (R3)(R4)    // z[len(z)-1]=x[len(z)-1]>>s
	MOVD    R7, c+56(FP)    // store pre-computed x[0]<<ŝ into c
	RET

zeroshift:
	CMP     R6, R0          // x is null, nothing to copy
	BEQ     done
	CMP     R6, R3          // if x is same as z, nothing to copy
	BEQ     done
	CMP     R7, R4
	ISEL    $0, R7, R4, R7  // Take the lower bounds of lengths of x, z
	CMP     R7, R0          // Nothing to copy when that length is 0. The copy
	BEQ     done            // loop below tests at the bottom and would move one word.
	SLD     $3, R7, R7      // R7 = number of bytes to copy
	MOVD    $0, R14
repeat:
	MOVD    (R6)(R14), R15  // copy 8 bytes at a time
	MOVD    R15, (R3)(R14)  // shrVU processes bytes only forwards
	ADD     $8, R14
	CMP     R14, R7         // More 8 bytes left?
	BLT     repeat
done:
	MOVD    R0, c+56(FP)
	RET

// func mulAddVWW(z, x []Word, y, r Word) (c Word)
// z = x*y + r, where y and r are single words.
//
// For each element, the low word of x[i]*y plus the running carry c is
// stored in z[i] and the high word (plus the carry out of that addition)
// becomes the next c. Returns the final c; for z_len = 0 that is r.
TEXT ·mulAddVWW(SB), NOSPLIT, $0
	MOVD    z+0(FP), R10      // R10 = z[]
	MOVD    x+24(FP), R8      // R8 = x[]
	MOVD    y+48(FP), R9      // R9 = y
	MOVD    r+56(FP), R4      // R4 = r = c
	MOVD    z_len+8(FP), R11  // R11 = z_len

	CMP     R0, R11
	BEQ     done

	// First element is handled outside the loop so the loop and tail
	// can use pre-incrementing (MOVDU) addressing.
	MOVD    0(R8), R20
	ADD     $-1, R11          // R11 = z_len - 1 (elements remaining)
	MULLD   R9, R20, R6       // R6 = z0 = Low-order(x[i]*y)
	MULHDU  R9, R20, R7       // R7 = z1 = High-order(x[i]*y)
	ADDC    R4, R6            // R6 = z0 + r
	ADDZE   R7                // R7 = z1 + CA
	CMP     R0, R11
	MOVD    R7, R4            // R4 = c
	MOVD    R6, 0(R10)        // z[i]
	BEQ     done

	// We will read 4 elements per iteration
	SRD     $2, R11, R14      // R14 = remaining/4 = (z_len-1)/4
	DCBT    (R8)              // cache-touch hint for x
	CMP     R0, R14
	MOVD    R14, CTR          // Set up the loop counter
	BEQ     tail              // If R14 = 0, we can't use the loop
	PCALIGN $16

	// The multiplies for elements i+2 and i+3 are issued ahead of the
	// carry additions that consume them; each ADDC/ADDZE pair stays
	// adjacent so CA is consumed before it is overwritten.
loop:
	MOVD    8(R8), R20        // R20 = x[i]
	MOVD    16(R8), R21       // R21 = x[i+1]
	MOVD    24(R8), R22       // R22 = x[i+2]
	MOVDU   32(R8), R23       // R23 = x[i+3], R8 += 32
	MULLD   R9, R20, R24      // R24 = z0[i]
	MULHDU  R9, R20, R20      // R20 = z1[i]
	ADDC    R4, R24           // R24 = z0[i] + c
	ADDZE   R20               // R20 = z1[i] + CA
	MULLD   R9, R21, R25      // R25 = z0[i+1]
	MULHDU  R9, R21, R21      // R21 = z1[i+1]
	ADDC    R20, R25          // R25 = z0[i+1] + z1[i]
	ADDZE   R21               // R21 = z1[i+1] + CA
	MULLD   R9, R22, R26      // R26 = z0[i+2]
	MULHDU  R9, R22, R22      // R22 = z1[i+2]
	MULLD   R9, R23, R27      // R27 = z0[i+3]
	MULHDU  R9, R23, R23      // R23 = z1[i+3]
	ADDC    R21, R26          // R26 = z0[i+2] + z1[i+1]
	ADDZE   R22               // R22 = z1[i+2] + CA
	MOVD    R24, 8(R10)       // z[i]
	MOVD    R25, 16(R10)      // z[i+1]
	ADDC    R22, R27          // R27 = z0[i+3] + z1[i+2]
	ADDZE   R23,R4            // update carry
	MOVD    R26, 24(R10)      // z[i+2]
	MOVDU   R27, 32(R10)      // z[i+3], R10 += 32
	ADD     $-4, R11          // R11 = remaining - 4
	BC      16, 0, loop       // bdnz

	// We may have some elements to read
	CMP   R0, R11
	BEQ   done

	// Process the remaining elements, one at a time.
	// At most three remain, so the tail is fully unrolled.
tail:
	MOVDU   8(R8), R20        // R20 = x[i]
	MULLD   R9, R20, R24      // R24 = z0[i]
	MULHDU  R9, R20, R25      // R25 = z1[i]
	ADD     $-1, R11          // R11 = remaining - 1
	ADDC    R4, R24
	ADDZE   R25
	MOVDU   R24, 8(R10)       // z[i]
	CMP     R0, R11
	MOVD    R25, R4           // R4 = c
	BEQ     done              // If R11 = 0, we are done

	MOVDU   8(R8), R20
	MULLD   R9, R20, R24
	MULHDU  R9, R20, R25
	ADD     $-1, R11
	ADDC    R4, R24
	ADDZE   R25
	MOVDU   R24, 8(R10)
	CMP     R0, R11
	MOVD    R25, R4
	BEQ     done

	MOVD    8(R8), R20
	MULLD   R9, R20, R24
	MULHDU  R9, R20, R25
	ADD     $-1, R11
	ADDC    R4, R24
	ADDZE   R25
	MOVD    R24, 8(R10)
	MOVD    R25, R4

done:
	MOVD    R4, c+64(FP)
	RET

// func addMulVVW(z, x []Word, y Word) (c Word)
// z += x*y, where y is a single word.
//
// For each element, z[i] and the running carry c are added to the low
// word of x[i]*y; the high word plus both carries out becomes the next c.
// Returns the final c (0 when z_len = 0).
TEXT ·addMulVVW(SB), NOSPLIT, $0
	MOVD	z+0(FP), R3	// R3 = z[]
	MOVD	x+24(FP), R4	// R4 = x[]
	MOVD	y+48(FP), R5	// R5 = y
	MOVD	z_len+8(FP), R6	// R6 = z_len

	CMP	R6, $4
	MOVD	R0, R9		// R9 = c = 0
	BLT	tail		// fewer than 4 elements: tail loop only
	SRD	$2, R6, R7	// R7 = z_len/4
	MOVD	R7, CTR		// Initialize loop counter
	PCALIGN	$16

loop:
	MOVD	0(R4), R14	// x[i]
	MOVD	8(R4), R16	// x[i+1]
	MOVD	16(R4), R18	// x[i+2]
	MOVD	24(R4), R20	// x[i+3]
	MOVD	0(R3), R15	// z[i]
	MOVD	8(R3), R17	// z[i+1]
	MOVD	16(R3), R19	// z[i+2]
	MOVD	24(R3), R21	// z[i+3]
	MULLD	R5, R14, R10	// low x[i]*y
	MULHDU	R5, R14, R11	// high x[i]*y
	ADDC	R15, R10	// low += z[i]
	ADDZE	R11		// high += CA
	ADDC	R9, R10		// low += c
	ADDZE	R11, R9		// c = high + CA
	MULLD	R5, R16, R14	// low x[i+1]*y
	MULHDU	R5, R16, R15	// high x[i+1]*y
	ADDC	R17, R14
	ADDZE	R15
	ADDC	R9, R14
	ADDZE	R15, R9
	MULLD	R5, R18, R16	// low x[i+2]*y
	MULHDU	R5, R18, R17	// high x[i+2]*y
	ADDC	R19, R16
	ADDZE	R17
	ADDC	R9, R16
	ADDZE	R17, R9
	MULLD	R5, R20, R18	// low x[i+3]*y
	MULHDU	R5, R20, R19	// high x[i+3]*y
	ADDC	R21, R18
	ADDZE	R19
	ADDC	R9, R18
	ADDZE	R19, R9
	MOVD	R10, 0(R3)	// z[i]
	MOVD	R14, 8(R3)	// z[i+1]
	MOVD	R16, 16(R3)	// z[i+2]
	MOVD	R18, 24(R3)	// z[i+3]
	ADD	$32, R3
	ADD	$32, R4
	BDNZ	loop

	ANDCC	$3, R6		// R6 = z_len % 4 (elements left for the tail)
tail:
	CMP	R0, R6
	BEQ	done
	MOVD	R6, CTR
	PCALIGN	$16
tailloop:
	MOVD	0(R4), R14	// x[i]
	MOVD	0(R3), R15	// z[i]
	MULLD	R5, R14, R10	// low x[i]*y
	MULHDU	R5, R14, R11	// high x[i]*y
	ADDC	R15, R10	// low += z[i]
	ADDZE	R11		// high += CA
	ADDC	R9, R10		// low += c
	ADDZE	R11, R9		// c = high + CA
	MOVD	R10, 0(R3)	// z[i]
	ADD	$8, R3
	ADD	$8, R4
	BDNZ	tailloop

done:
	MOVD	R9, c+56(FP)
	RET