// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !math_big_pure_go && (ppc64 || ppc64le)
// +build !math_big_pure_go
// +build ppc64 ppc64le

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
//
// Conventions used throughout this file:
//   - R0 is used as a constant zero.
//   - CA is the carry bit consumed by ADDE/ADDZE/SUBE and produced by
//     ADDC/SUBC. CMP and ADD (immediate) are interleaved freely with the
//     carry chains because they do not modify CA.
//   - "BC 16, 0, label" is bdnz: decrement CTR and branch if CTR != 0.
//   - Labels (loop, tail, final, done, ...) are local to each TEXT block.

// func addVV(z, x, y []Word) (c Word)
// z[i] = x[i] + y[i] for all i, carrying
//
// Only z_len is consulted for the element count. The returned c is the
// final value of CA (0 when z_len is 0).
TEXT ·addVV(SB), NOSPLIT, $0
	MOVD  z_len+8(FP), R7   // R7 = z_len
	MOVD  x+24(FP), R8      // R8 = x[]
	MOVD  y+48(FP), R9      // R9 = y[]
	MOVD  z+0(FP), R10      // R10 = z[]

	// If z_len = 0, we are done
	CMP   R0, R7
	MOVD  R0, R4            // c = 0 for the empty case
	BEQ   done

	// Process the first iteration out of the loop so we can
	// use MOVDU and avoid 3 index registers updates.
	MOVD  0(R8), R11        // R11 = x[i]
	MOVD  0(R9), R12        // R12 = y[i]
	ADD   $-1, R7           // R7 = z_len - 1
	ADDC  R12, R11, R15     // R15 = x[i] + y[i], set CA
	CMP   R0, R7
	MOVD  R15, 0(R10)       // z[i]
	BEQ   final             // If z_len was 1, we are done

	SRD   $2, R7, R5        // R5 = (remaining elements)/4
	CMP   R0, R5
	MOVD  R5, CTR           // Set up loop counter
	BEQ   tail              // If R5 = 0, we can't use the loop

	// Process 4 elements per iteration. Unrolling this loop
	// means a performance trade-off: we will lose performance
	// for small values of z_len (0.90x in the worst case), but
	// gain significant performance as z_len increases (up to
	// 1.45x).

	PCALIGN $16
loop:
	MOVD  8(R8), R11        // R11 = x[i]
	MOVD  16(R8), R12       // R12 = x[i+1]
	MOVD  24(R8), R14       // R14 = x[i+2]
	MOVDU 32(R8), R15       // R15 = x[i+3]; R8 advances by 32
	MOVD  8(R9), R16        // R16 = y[i]
	MOVD  16(R9), R17       // R17 = y[i+1]
	MOVD  24(R9), R18       // R18 = y[i+2]
	MOVDU 32(R9), R19       // R19 = y[i+3]; R9 advances by 32
	ADDE  R11, R16, R20     // R20 = x[i] + y[i] + CA
	ADDE  R12, R17, R21     // R21 = x[i+1] + y[i+1] + CA
	ADDE  R14, R18, R22     // R22 = x[i+2] + y[i+2] + CA
	ADDE  R15, R19, R23     // R23 = x[i+3] + y[i+3] + CA
	MOVD  R20, 8(R10)       // z[i]
	MOVD  R21, 16(R10)      // z[i+1]
	MOVD  R22, 24(R10)      // z[i+2]
	MOVDU R23, 32(R10)      // z[i+3]; R10 advances by 32
	ADD   $-4, R7           // R7 = remaining - 4
	BC    16, 0, loop       // bdnz

	// We may have more elements to read
	CMP   R0, R7
	BEQ   final

	// Process the remaining elements, one at a time
	// (at most three are left at this point).
tail:
	MOVDU 8(R8), R11        // R11 = x[i]
	MOVDU 8(R9), R16        // R16 = y[i]
	ADD   $-1, R7           // R7 = remaining - 1
	ADDE  R11, R16, R20     // R20 = x[i] + y[i] + CA
	CMP   R0, R7
	MOVDU R20, 8(R10)       // z[i]
	BEQ   final             // If R7 = 0, we are done

	MOVDU 8(R8), R11
	MOVDU 8(R9), R16
	ADD   $-1, R7
	ADDE  R11, R16, R20
	CMP   R0, R7
	MOVDU R20, 8(R10)
	BEQ   final

	MOVD  8(R8), R11
	MOVD  8(R9), R16
	ADDE  R11, R16, R20
	MOVD  R20, 8(R10)

final:
	ADDZE R4                // Capture CA (R4 was 0, so R4 = CA)

done:
	MOVD  R4, c+72(FP)
	RET

// func subVV(z, x, y []Word) (c Word)
// z[i] = x[i] - y[i] for all i, carrying
//
// Only z_len is consulted for the element count. The subtract-with-carry
// chain leaves the inverse of the borrow in CA, so the result returned
// in c is CA XOR 1 (0 when z_len is 0).
TEXT ·subVV(SB), NOSPLIT, $0
	MOVD  z_len+8(FP), R7   // R7 = z_len
	MOVD  x+24(FP), R8      // R8 = x[]
	MOVD  y+48(FP), R9      // R9 = y[]
	MOVD  z+0(FP), R10      // R10 = z[]

	// If z_len = 0, we are done
	CMP   R0, R7
	MOVD  R0, R4            // c = 0 for the empty case
	BEQ   done

	// Process the first iteration out of the loop so we can
	// use MOVDU and avoid 3 index registers updates.
	MOVD  0(R8), R11        // R11 = x[i]
	MOVD  0(R9), R12        // R12 = y[i]
	ADD   $-1, R7           // R7 = z_len - 1
	SUBC  R12, R11, R15     // R15 = x[i] - y[i], set CA
	CMP   R0, R7
	MOVD  R15, 0(R10)       // z[i]
	BEQ   final             // If z_len was 1, we are done

	SRD   $2, R7, R5        // R5 = (remaining elements)/4
	CMP   R0, R5
	MOVD  R5, CTR           // Set up loop counter
	BEQ   tail              // If R5 = 0, we can't use the loop

	// Process 4 elements per iteration. Unrolling this loop
	// means a performance trade-off: we will lose performance
	// for small values of z_len (0.92x in the worst case), but
	// gain significant performance as z_len increases (up to
	// 1.45x).

	PCALIGN $16
loop:
	MOVD  8(R8), R11        // R11 = x[i]
	MOVD  16(R8), R12       // R12 = x[i+1]
	MOVD  24(R8), R14       // R14 = x[i+2]
	MOVDU 32(R8), R15       // R15 = x[i+3]; R8 advances by 32
	MOVD  8(R9), R16        // R16 = y[i]
	MOVD  16(R9), R17       // R17 = y[i+1]
	MOVD  24(R9), R18       // R18 = y[i+2]
	MOVDU 32(R9), R19       // R19 = y[i+3]; R9 advances by 32
	SUBE  R16, R11, R20     // R20 = x[i] - y[i] + CA
	SUBE  R17, R12, R21     // R21 = x[i+1] - y[i+1] + CA
	SUBE  R18, R14, R22     // R22 = x[i+2] - y[i+2] + CA
	SUBE  R19, R15, R23     // R23 = x[i+3] - y[i+3] + CA
	MOVD  R20, 8(R10)       // z[i]
	MOVD  R21, 16(R10)      // z[i+1]
	MOVD  R22, 24(R10)      // z[i+2]
	MOVDU R23, 32(R10)      // z[i+3]; R10 advances by 32
	ADD   $-4, R7           // R7 = remaining - 4
	BC    16, 0, loop       // bdnz

	// We may have more elements to read
	CMP   R0, R7
	BEQ   final

	// Process the remaining elements, one at a time
	// (at most three are left at this point).
tail:
	MOVDU 8(R8), R11        // R11 = x[i]
	MOVDU 8(R9), R16        // R16 = y[i]
	ADD   $-1, R7           // R7 = remaining - 1
	SUBE  R16, R11, R20     // R20 = x[i] - y[i] + CA
	CMP   R0, R7
	MOVDU R20, 8(R10)       // z[i]
	BEQ   final             // If R7 = 0, we are done

	MOVDU 8(R8), R11
	MOVDU 8(R9), R16
	ADD   $-1, R7
	SUBE  R16, R11, R20
	CMP   R0, R7
	MOVDU R20, 8(R10)
	BEQ   final

	MOVD  8(R8), R11
	MOVD  8(R9), R16
	SUBE  R16, R11, R20
	MOVD  R20, 8(R10)

final:
	ADDZE R4                // R4 = CA (R4 was 0)
	XOR   $1, R4            // c = CA ^ 1: invert CA to obtain the borrow

done:
	MOVD  R4, c+72(FP)
	RET

// func addVW(z, x []Word, y Word) (c Word)
// z[i] = x[i] + carry for all i, where the initial carry is y.
//
// Only z_len is consulted for the element count. When z_len is 0 the
// function returns y unchanged; otherwise it returns the final CA.
TEXT ·addVW(SB), NOSPLIT, $0
	MOVD  z+0(FP), R10      // R10 = z[]
	MOVD  x+24(FP), R8      // R8 = x[]
	MOVD  y+48(FP), R4      // R4 = y = c
	MOVD  z_len+8(FP), R11  // R11 = z_len

	CMP   R0, R11           // If z_len is zero, return
	BEQ   done

	// We will process the first iteration out of the loop so we capture
	// the value of c. In the subsequent iterations, we will rely on the
	// value of CA set here.
	MOVD  0(R8), R20        // R20 = x[i]
	ADD   $-1, R11          // R11 = z_len - 1
	ADDC  R20, R4, R6       // R6 = x[i] + c
	CMP   R0, R11           // If z_len was 1, we are done
	MOVD  R6, 0(R10)        // z[i]
	BEQ   final

	// We will read 4 elements per iteration
	SRD   $2, R11, R9       // R9 = (remaining elements)/4
	DCBT  (R8)
	CMP   R0, R9
	MOVD  R9, CTR           // Set up the loop counter
	BEQ   tail              // If R9 = 0, we can't use the loop
	PCALIGN $16

loop:
	MOVD  8(R8), R20        // R20 = x[i]
	MOVD  16(R8), R21       // R21 = x[i+1]
	MOVD  24(R8), R22       // R22 = x[i+2]
	MOVDU 32(R8), R23       // R23 = x[i+3]; R8 advances by 32
	ADDZE R20, R24          // R24 = x[i] + CA
	ADDZE R21, R25          // R25 = x[i+1] + CA
	ADDZE R22, R26          // R26 = x[i+2] + CA
	ADDZE R23, R27          // R27 = x[i+3] + CA
	MOVD  R24, 8(R10)       // z[i]
	MOVD  R25, 16(R10)      // z[i+1]
	MOVD  R26, 24(R10)      // z[i+2]
	MOVDU R27, 32(R10)      // z[i+3]; R10 advances by 32
	ADD   $-4, R11          // R11 = remaining - 4
	BC    16, 0, loop       // bdnz

	// We may have some elements to read
	CMP   R0, R11
	BEQ   final

	// At most three elements are left; handle them one at a time.
tail:
	MOVDU 8(R8), R20
	ADDZE R20, R24
	ADD   $-1, R11
	MOVDU R24, 8(R10)
	CMP   R0, R11
	BEQ   final

	MOVDU 8(R8), R20
	ADDZE R20, R24
	ADD   $-1, R11
	MOVDU R24, 8(R10)
	CMP   R0, R11
	BEQ   final

	MOVD  8(R8), R20
	ADDZE R20, R24
	MOVD  R24, 8(R10)

final:
	ADDZE R0, R4            // c = CA
done:
	MOVD  R4, c+56(FP)
	RET

// func subVW(z, x []Word, y Word) (c Word)
// z[i] = x[i] - borrow for all i, where the initial borrow is y.
//
// Only z_len is consulted for the element count. When z_len is 0 the
// function returns y unchanged; otherwise it returns the final borrow
// (0 or 1) derived from CA.
TEXT ·subVW(SB), NOSPLIT, $0
	MOVD  z+0(FP), R10      // R10 = z[]
	MOVD  x+24(FP), R8      // R8 = x[]
	MOVD  y+48(FP), R4      // R4 = y = c
	MOVD  z_len+8(FP), R11  // R11 = z_len

	CMP   R0, R11           // If z_len is zero, return
	BEQ   done

	// We will process the first iteration out of the loop so we capture
	// the value of c. In the subsequent iterations, we will rely on the
	// value of CA set here.
	MOVD  0(R8), R20        // R20 = x[i]
	ADD   $-1, R11          // R11 = z_len - 1
	SUBC  R4, R20, R6       // R6 = x[i] - c
	CMP   R0, R11           // If z_len was 1, we are done
	MOVD  R6, 0(R10)        // z[i]
	BEQ   final

	// We will read 4 elements per iteration
	SRD   $2, R11, R9       // R9 = (remaining elements)/4
	DCBT  (R8)
	CMP   R0, R9
	MOVD  R9, CTR           // Set up the loop counter
	BEQ   tail              // If R9 = 0, we can't use the loop

	// The loop here is almost the same as the one used in s390x, but
	// we don't need to capture CA every iteration because we've already
	// done that above.

	PCALIGN $16
loop:
	MOVD  8(R8), R20        // R20 = x[i]
	MOVD  16(R8), R21       // R21 = x[i+1]
	MOVD  24(R8), R22       // R22 = x[i+2]
	MOVDU 32(R8), R23       // R23 = x[i+3]; R8 advances by 32
	SUBE  R0, R20           // subtract zero with carry: propagates the borrow
	SUBE  R0, R21
	SUBE  R0, R22
	SUBE  R0, R23
	MOVD  R20, 8(R10)       // z[i]
	MOVD  R21, 16(R10)      // z[i+1]
	MOVD  R22, 24(R10)      // z[i+2]
	MOVDU R23, 32(R10)      // z[i+3]; R10 advances by 32
	ADD   $-4, R11          // R11 = remaining - 4
	BC    16, 0, loop       // bdnz

	// We may have some elements to read
	CMP   R0, R11
	BEQ   final

	// At most three elements are left; handle them one at a time.
tail:
	MOVDU 8(R8), R20
	SUBE  R0, R20
	ADD   $-1, R11
	MOVDU R20, 8(R10)
	CMP   R0, R11
	BEQ   final

	MOVDU 8(R8), R20
	SUBE  R0, R20
	ADD   $-1, R11
	MOVDU R20, 8(R10)
	CMP   R0, R11
	BEQ   final

	MOVD  8(R8), R20
	SUBE  R0, R20
	MOVD  R20, 8(R10)

final:
	// Capture CA
	SUBE  R4, R4            // R4 = R4 - R4 with carry: 0 if CA set, -1 otherwise
	NEG   R4, R4            // c = 0 or 1

done:
	MOVD  R4, c+56(FP)
	RET

//func shlVU(z, x []Word, s uint) (c Word)
//
// For s != 0: shifts x left by s bits into z, working from the highest
// index down to 0, and returns x[len(z)-1]>>ŝ (ŝ = 64-s) in c.
// For s == 0: copies min(len(x), len(z)) words from x to z (choosing the
// copy direction based on overlap) and returns c = 0.
// For len(z) == 0 nothing is written and c = 0.
TEXT ·shlVU(SB), NOSPLIT, $0
	MOVD    z+0(FP), R3
	MOVD    x+24(FP), R6
	MOVD    s+48(FP), R9
	MOVD    z_len+8(FP), R4
	MOVD    x_len+32(FP), R7
	CMP     R9, R0          // s==0 copy(z,x)
	BEQ     zeroshift
	CMP     R4, R0          // len(z)==0 return
	BEQ     done

	ADD     $-1, R4, R5     // len(z)-1
	SUBC    R9, $64, R4     // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)
	SLD     $3, R5, R7
	ADD     R6, R7, R15     // save starting address &x[len(z)-1]
	ADD     R3, R7, R16     // save starting address &z[len(z)-1]
	MOVD    (R6)(R7), R14
	SRD     R4, R14, R7     // compute x[len(z)-1]>>ŝ into R7
	CMP     R5, R0          // iterate from i=len(z)-1 to 0
	BEQ     loopexit        // Already at end?
	MOVD    0(R15), R10     // x[i]
	PCALIGN $16
shloop:
	SLD     R9, R10, R10    // x[i]<<s
	MOVDU   -8(R15), R14    // R14 = x[i-1]; R15 steps down one word
	SRD     R4, R14, R11    // x[i-1]>>ŝ
	OR      R11, R10, R10
	MOVD    R10, 0(R16)     // z[i]=x[i]<<s | x[i-1]>>ŝ
	MOVD    R14, R10        // reuse x[i-1] for next iteration
	ADD     $-8, R16        // i--
	CMP     R15, R6         // &x[i-1]>&x[0]?
	BGT     shloop
loopexit:
	MOVD    0(R6), R4
	SLD     R9, R4, R4
	MOVD    R4, 0(R3)       // z[0]=x[0]<<s
	MOVD    R7, c+56(FP)    // store pre-computed x[len(z)-1]>>ŝ into c
	RET

zeroshift:
	CMP     R6, R0          // x is null, nothing to copy
	BEQ     done
	CMP     R6, R3          // if x is same as z, nothing to copy
	BEQ     done
	CMP     R7, R4
	ISEL    $0, R7, R4, R7  // Take the lower bound of lengths of x,z
	SLD     $3, R7, R7      // R7 = number of bytes to copy
	// The copy loops below move one word before testing the count, so
	// an empty copy must be filtered out here to avoid touching memory
	// outside x and z.
	CMP     R7, R0          // nothing to copy?
	BEQ     done
	SUB     R6, R3, R11     // dest - src
	CMPU    R11, R7, CR2    // < len?
	BLT     CR2, backward   // there is overlap, copy backwards
	MOVD    $0, R14
	// shlVU processes backwards, but added a forward copy option
	// since its faster on POWER
repeat:
	MOVD    (R6)(R14), R15  // Copy 8 bytes at a time
	MOVD    R15, (R3)(R14)
	ADD     $8, R14
	CMP     R14, R7         // More 8 bytes left?
	BLT     repeat
	BR      done
backward:
	ADD     $-8, R7, R14    // byte offset of the last word
repeatback:
	MOVD    (R6)(R14), R15  // copy x into z backwards
	MOVD    R15, (R3)(R14)  // copy 8 bytes at a time
	SUB     $8, R14
	CMP     R14, $-8        // More 8 bytes left?
	BGT     repeatback

done:
	MOVD    R0, c+56(FP)    // c=0
	RET

//func shrVU(z, x []Word, s uint) (c Word)
//
// For s != 0: shifts x right by s bits into z, working from index 0
// upwards, and returns x[0]<<ŝ (ŝ = 64-s) in c. Pairs of words are
// processed with vector instructions when len(z) >= 3.
// For s == 0: copies min(len(x), len(z)) words from x to z (forwards
// only) and returns c = 0.
// For len(z) == 0 nothing is written and c = 0.
TEXT ·shrVU(SB), NOSPLIT, $0
	MOVD    z+0(FP), R3
	MOVD    x+24(FP), R6
	MOVD    s+48(FP), R9
	MOVD    z_len+8(FP), R4
	MOVD    x_len+32(FP), R7

	CMP     R9, R0          // s==0, copy(z,x)
	BEQ     zeroshift
	CMP     R4, R0          // len(z)==0 return
	BEQ     done
	SUBC    R9, $64, R5     // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)

	MOVD    0(R6), R7
	SLD     R5, R7, R7      // compute x[0]<<ŝ
	MOVD    $1, R8          // iterate from i=1 to i<len(z)
	CMP     R8, R4
	BGE     loopexit        // Already at end?

	// vectorize if len(z) is >=3, else jump to scalar loop
	CMP     R4, $3
	BLT     scalar
	MTVSRD  R9, VS38        // s
	VSPLTB  $7, V6, V4
	MTVSRD  R5, VS39        // ŝ
	VSPLTB  $7, V7, V2
	ADD     $-2, R4, R16    // R16 = len(z)-2, the last i with a full pair left
	PCALIGN $16
loopback:
	ADD     $-1, R8, R10
	SLD     $3, R10         // R10 = byte offset of x[i-1]
	LXVD2X  (R6)(R10), VS32 // load x[i-1], x[i]
	SLD     $3, R8, R12     // R12 = byte offset of x[i]
	LXVD2X  (R6)(R12), VS33 // load x[i], x[i+1]

	VSRD    V0, V4, V3      // x[i-1]>>s, x[i]>>s
	VSLD    V1, V2, V5      // x[i]<<ŝ, x[i+1]<<ŝ
	VOR     V3, V5, V5      // Or(|) the two registers together
	STXVD2X VS37, (R3)(R10) // store into z[i-1] and z[i]
	ADD     $2, R8          // Done processing 2 entries, i and i+1
	CMP     R8, R16         // Are there at least a couple of more entries left?
	BLE     loopback
	CMP     R8, R4          // Are we at the last element?
	BEQ     loopexit
scalar:
	ADD     $-1, R8, R10
	SLD     $3, R10
	MOVD    (R6)(R10), R11
	SRD     R9, R11, R11    // x[len(z)-2] >> s
	SLD     $3, R8, R12
	MOVD    (R6)(R12), R12
	SLD     R5, R12, R12    // x[len(z)-1]<<ŝ
	OR      R12, R11, R11   // x[len(z)-2]>>s | x[len(z)-1]<<ŝ
	MOVD    R11, (R3)(R10)  // z[len(z)-2]=x[len(z)-2]>>s | x[len(z)-1]<<ŝ
loopexit:
	ADD     $-1, R4
	SLD     $3, R4
	MOVD    (R6)(R4), R5
	SRD     R9, R5, R5      // x[len(z)-1]>>s
	MOVD    R5, (R3)(R4)    // z[len(z)-1]=x[len(z)-1]>>s
	MOVD    R7, c+56(FP)    // store pre-computed x[0]<<ŝ into c
	RET

zeroshift:
	CMP     R6, R0          // x is null, nothing to copy
	BEQ     done
	CMP     R6, R3          // if x is same as z, nothing to copy
	BEQ     done
	CMP     R7, R4
	ISEL    $0, R7, R4, R7  // Take the lower bounds of lengths of x, z
	SLD     $3, R7, R7      // R7 = number of bytes to copy
	// The copy loop below moves one word before testing the count, so
	// an empty copy must be filtered out here to avoid touching memory
	// outside x and z.
	CMP     R7, R0          // nothing to copy?
	BEQ     done
	MOVD    $0, R14
repeat:
	MOVD    (R6)(R14), R15  // copy 8 bytes at a time
	MOVD    R15, (R3)(R14)  // shrVU processes bytes only forwards
	ADD     $8, R14
	CMP     R14, R7         // More 8 bytes left?
	BLT     repeat
done:
	MOVD    R0, c+56(FP)
	RET

// func mulAddVWW(z, x []Word, y, r Word) (c Word)
// z[i] = low word of (x[i]*y + carry) for all i, where the initial carry
// is r and each step's carry is the high word of that sum.
//
// Only z_len is consulted for the element count. The final carry is
// returned in c; when z_len is 0, c = r.
TEXT ·mulAddVWW(SB), NOSPLIT, $0
	MOVD    z+0(FP), R10      // R10 = z[]
	MOVD    x+24(FP), R8      // R8 = x[]
	MOVD    y+48(FP), R9      // R9 = y
	MOVD    r+56(FP), R4      // R4 = r = c
	MOVD    z_len+8(FP), R11  // R11 = z_len

	CMP     R0, R11
	BEQ     done

	// First element is handled outside the loop so that the MOVDU
	// forms with an 8-byte pre-increment can be used afterwards.
	MOVD    0(R8), R20
	ADD     $-1, R11
	MULLD   R9, R20, R6       // R6 = z0 = Low-order(x[i]*y)
	MULHDU  R9, R20, R7       // R7 = z1 = High-order(x[i]*y)
	ADDC    R4, R6            // R6 = z0 + r
	ADDZE   R7                // R7 = z1 + CA
	CMP     R0, R11
	MOVD    R7, R4            // R4 = c
	MOVD    R6, 0(R10)        // z[i]
	BEQ     done

	// We will read 4 elements per iteration
	SRD     $2, R11, R14      // R14 = (remaining elements)/4
	DCBT    (R8)
	CMP     R0, R14
	MOVD    R14, CTR          // Set up the loop counter
	BEQ     tail              // If R14 = 0, we can't use the loop
	PCALIGN $16

loop:
	MOVD    8(R8), R20        // R20 = x[i]
	MOVD    16(R8), R21       // R21 = x[i+1]
	MOVD    24(R8), R22       // R22 = x[i+2]
	MOVDU   32(R8), R23       // R23 = x[i+3]; R8 advances by 32
	MULLD   R9, R20, R24      // R24 = z0[i]
	MULHDU  R9, R20, R20      // R20 = z1[i]
	ADDC    R4, R24           // R24 = z0[i] + c
	ADDZE   R20               // R20 = z1[i] + CA
	MULLD   R9, R21, R25      // R25 = z0[i+1]
	MULHDU  R9, R21, R21      // R21 = z1[i+1]
	ADDC    R20, R25          // R25 = z0[i+1] + carry word from i
	ADDZE   R21               // R21 = z1[i+1] + CA
	MULLD   R9, R22, R26      // R26 = z0[i+2]
	MULHDU  R9, R22, R22      // R22 = z1[i+2]
	MULLD   R9, R23, R27      // R27 = z0[i+3]
	MULHDU  R9, R23, R23      // R23 = z1[i+3]
	ADDC    R21, R26          // R26 = z0[i+2] + carry word from i+1
	ADDZE   R22               // R22 = z1[i+2] + CA
	MOVD    R24, 8(R10)       // z[i]
	MOVD    R25, 16(R10)      // z[i+1]
	ADDC    R22, R27          // R27 = z0[i+3] + carry word from i+2
	ADDZE   R23, R4           // update carry
	MOVD    R26, 24(R10)      // z[i+2]
	MOVDU   R27, 32(R10)      // z[i+3]; R10 advances by 32
	ADD     $-4, R11          // R11 = remaining - 4
	BC      16, 0, loop       // bdnz

	// We may have some elements to read
	CMP     R0, R11
	BEQ     done

	// Process the remaining elements, one at a time
	// (at most three are left at this point).
tail:
	MOVDU   8(R8), R20        // R20 = x[i]
	MULLD   R9, R20, R24      // R24 = z0[i]
	MULHDU  R9, R20, R25      // R25 = z1[i]
	ADD     $-1, R11          // R11 = remaining - 1
	ADDC    R4, R24
	ADDZE   R25
	MOVDU   R24, 8(R10)       // z[i]
	CMP     R0, R11
	MOVD    R25, R4           // R4 = c
	BEQ     done              // If R11 = 0, we are done

	MOVDU   8(R8), R20
	MULLD   R9, R20, R24
	MULHDU  R9, R20, R25
	ADD     $-1, R11
	ADDC    R4, R24
	ADDZE   R25
	MOVDU   R24, 8(R10)
	CMP     R0, R11
	MOVD    R25, R4
	BEQ     done

	MOVD    8(R8), R20
	MULLD   R9, R20, R24
	MULHDU  R9, R20, R25
	ADD     $-1, R11
	ADDC    R4, R24
	ADDZE   R25
	MOVD    R24, 8(R10)
	MOVD    R25, R4

done:
	MOVD    R4, c+64(FP)
	RET

// func addMulVVW(z, x []Word, y Word) (c Word)
// z[i] = low word of (z[i] + x[i]*y + carry) for all i, starting with a
// carry of 0; each step's carry is the high word of that sum.
//
// Only z_len is consulted for the element count. The final carry is
// returned in c (0 when z_len is 0).
TEXT ·addMulVVW(SB), NOSPLIT, $0
	MOVD    z+0(FP), R10      // R10 = z[]
	MOVD    x+24(FP), R8      // R8 = x[]
	MOVD    y+48(FP), R9      // R9 = y
	MOVD    z_len+8(FP), R22  // R22 = z_len

	MOVD    R0, R3            // R3 will be the index register
	CMP     R0, R22
	MOVD    R0, R4            // R4 = c = 0
	MOVD    R22, CTR          // Initialize loop counter
	BEQ     done
	PCALIGN $16

loop:
	MOVD    (R8)(R3), R20     // Load x[i]
	MOVD    (R10)(R3), R21    // Load z[i]
	MULLD   R9, R20, R6       // R6 = Low-order(x[i]*y)
	MULHDU  R9, R20, R7       // R7 = High-order(x[i]*y)
	ADDC    R21, R6           // R6 = z0
	ADDZE   R7                // R7 = z1
	ADDC    R4, R6            // R6 = z0 + c + 0
	ADDZE   R7, R4            // c += z1
	MOVD    R6, (R10)(R3)     // Store z[i]
	ADD     $8, R3            // advance the byte index to the next word
	BC      16, 0, loop       // bdnz

done:
	MOVD    R4, c+56(FP)
	RET