// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !math_big_pure_go && (ppc64 || ppc64le)
// +build !math_big_pure_go
// +build ppc64 ppc64le

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
//
// Conventions used throughout this file:
//   - R0 is used as a constant zero register.
//   - Multi-word carries are chained through the CA bit. The ADD (immediate)
//     and CMP instructions interleaved with the carry chain only touch
//     general registers and CR0, so CA survives between ADDC/ADDE/SUBC/SUBE.
//   - MOVDU is the "update" form: it performs the access at base+offset and
//     then writes base+offset back into the base register.
//   - "BC 16, 0, label" is bdnz: decrement CTR and branch while CTR != 0.

// func addVV(z, x, y []Word) (c Word)
// z[i] = x[i] + y[i] for all i, carrying
//
// The first word is handled on its own (ADDC, which starts the carry chain),
// then groups of four words are processed in an unrolled loop, and finally
// up to three leftover words are handled one at a time. c is the final carry.
TEXT ·addVV(SB), NOSPLIT, $0
	MOVD	z_len+8(FP), R7	// R7 = z_len
	MOVD	x+24(FP), R8	// R8 = x[]
	MOVD	y+48(FP), R9	// R9 = y[]
	MOVD	z+0(FP), R10	// R10 = z[]

	// If z_len = 0, we are done
	CMP	R0, R7
	MOVD	R0, R4	// R4 = c = 0
	BEQ	done

	// Process the first iteration out of the loop so we can
	// use MOVDU and avoid 3 index registers updates.
	MOVD	0(R8), R11	// R11 = x[0]
	MOVD	0(R9), R12	// R12 = y[0]
	ADD	$-1, R7	// R7 = z_len - 1 (words remaining)
	ADDC	R12, R11, R15	// R15 = x[0] + y[0], set CA
	CMP	R0, R7
	MOVD	R15, 0(R10)	// z[0]
	BEQ	final	// If z_len was 1, we are done

	SRD	$2, R7, R5	// R5 = (z_len-1)/4 = number of 4-word groups left
	CMP	R0, R5
	MOVD	R5, CTR	// Set up loop counter
	BEQ	tail	// If R5 = 0, we can't use the loop

	// Process 4 elements per iteration. Unrolling this loop
	// means a performance trade-off: we will lose performance
	// for small values of z_len (0.90x in the worst case), but
	// gain significant performance as z_len increases (up to
	// 1.45x).
	//
	// R8, R9 and R10 always point at the last word already processed,
	// hence the offsets 8..32 and the MOVDU on the fourth access.
loop:
	MOVD	8(R8), R11	// R11 = x[i]
	MOVD	16(R8), R12	// R12 = x[i+1]
	MOVD	24(R8), R14	// R14 = x[i+2]
	MOVDU	32(R8), R15	// R15 = x[i+3]
	MOVD	8(R9), R16	// R16 = y[i]
	MOVD	16(R9), R17	// R17 = y[i+1]
	MOVD	24(R9), R18	// R18 = y[i+2]
	MOVDU	32(R9), R19	// R19 = y[i+3]
	ADDE	R11, R16, R20	// R20 = x[i] + y[i] + CA
	ADDE	R12, R17, R21	// R21 = x[i+1] + y[i+1] + CA
	ADDE	R14, R18, R22	// R22 = x[i+2] + y[i+2] + CA
	ADDE	R15, R19, R23	// R23 = x[i+3] + y[i+3] + CA
	MOVD	R20, 8(R10)	// z[i]
	MOVD	R21, 16(R10)	// z[i+1]
	MOVD	R22, 24(R10)	// z[i+2]
	MOVDU	R23, 32(R10)	// z[i+3]
	ADD	$-4, R7	// 4 fewer words remaining
	BC	16, 0, loop	// bdnz

	// We may have more elements to read
	CMP	R0, R7
	BEQ	final

	// Process the remaining elements (at most 3), one at a time
tail:
	MOVDU	8(R8), R11	// R11 = x[i]
	MOVDU	8(R9), R16	// R16 = y[i]
	ADD	$-1, R7	// one fewer word remaining
	ADDE	R11, R16, R20	// R20 = x[i] + y[i] + CA
	CMP	R0, R7
	MOVDU	R20, 8(R10)	// z[i]
	BEQ	final	// If R7 = 0, we are done

	MOVDU	8(R8), R11
	MOVDU	8(R9), R16
	ADD	$-1, R7
	ADDE	R11, R16, R20
	CMP	R0, R7
	MOVDU	R20, 8(R10)
	BEQ	final

	// Third and last possible leftover word: no pointer update needed.
	MOVD	8(R8), R11
	MOVD	8(R9), R16
	ADDE	R11, R16, R20
	MOVD	R20, 8(R10)

final:
	ADDZE	R4	// Capture CA: R4 = 0 + CA

done:
	MOVD	R4, c+72(FP)
	RET

// func subVV(z, x, y []Word) (c Word)
// z[i] = x[i] - y[i] for all i, carrying
//
// Same structure as addVV. On this architecture subtraction is computed as
// x + ^y + CA, so CA = 1 means "no borrow"; the returned c is the borrow,
// i.e. the inverse of the final CA.
TEXT ·subVV(SB), NOSPLIT, $0
	MOVD	z_len+8(FP), R7	// R7 = z_len
	MOVD	x+24(FP), R8	// R8 = x[]
	MOVD	y+48(FP), R9	// R9 = y[]
	MOVD	z+0(FP), R10	// R10 = z[]

	// If z_len = 0, we are done
	CMP	R0, R7
	MOVD	R0, R4	// R4 = c = 0
	BEQ	done

	// Process the first iteration out of the loop so we can
	// use MOVDU and avoid 3 index registers updates.
	MOVD	0(R8), R11	// R11 = x[0]
	MOVD	0(R9), R12	// R12 = y[0]
	ADD	$-1, R7	// R7 = z_len - 1 (words remaining)
	SUBC	R12, R11, R15	// R15 = x[0] - y[0], set CA
	CMP	R0, R7
	MOVD	R15, 0(R10)	// z[0]
	BEQ	final	// If z_len was 1, we are done

	SRD	$2, R7, R5	// R5 = (z_len-1)/4 = number of 4-word groups left
	CMP	R0, R5
	MOVD	R5, CTR	// Set up loop counter
	BEQ	tail	// If R5 = 0, we can't use the loop

	// Process 4 elements per iteration. Unrolling this loop
	// means a performance trade-off: we will lose performance
	// for small values of z_len (0.92x in the worst case), but
	// gain significant performance as z_len increases (up to
	// 1.45x).
loop:
	MOVD	8(R8), R11	// R11 = x[i]
	MOVD	16(R8), R12	// R12 = x[i+1]
	MOVD	24(R8), R14	// R14 = x[i+2]
	MOVDU	32(R8), R15	// R15 = x[i+3]
	MOVD	8(R9), R16	// R16 = y[i]
	MOVD	16(R9), R17	// R17 = y[i+1]
	MOVD	24(R9), R18	// R18 = y[i+2]
	MOVDU	32(R9), R19	// R19 = y[i+3]
	SUBE	R16, R11, R20	// R20 = x[i] - y[i] - borrow
	SUBE	R17, R12, R21	// R21 = x[i+1] - y[i+1] - borrow
	SUBE	R18, R14, R22	// R22 = x[i+2] - y[i+2] - borrow
	SUBE	R19, R15, R23	// R23 = x[i+3] - y[i+3] - borrow
	MOVD	R20, 8(R10)	// z[i]
	MOVD	R21, 16(R10)	// z[i+1]
	MOVD	R22, 24(R10)	// z[i+2]
	MOVDU	R23, 32(R10)	// z[i+3]
	ADD	$-4, R7	// 4 fewer words remaining
	BC	16, 0, loop	// bdnz

	// We may have more elements to read
	CMP	R0, R7
	BEQ	final

	// Process the remaining elements (at most 3), one at a time
tail:
	MOVDU	8(R8), R11	// R11 = x[i]
	MOVDU	8(R9), R16	// R16 = y[i]
	ADD	$-1, R7	// one fewer word remaining
	SUBE	R16, R11, R20	// R20 = x[i] - y[i] - borrow
	CMP	R0, R7
	MOVDU	R20, 8(R10)	// z[i]
	BEQ	final	// If R7 = 0, we are done

	MOVDU	8(R8), R11
	MOVDU	8(R9), R16
	ADD	$-1, R7
	SUBE	R16, R11, R20
	CMP	R0, R7
	MOVDU	R20, 8(R10)
	BEQ	final

	// Third and last possible leftover word: no pointer update needed.
	MOVD	8(R8), R11
	MOVD	8(R9), R16
	SUBE	R16, R11, R20
	MOVD	R20, 8(R10)

final:
	ADDZE	R4	// R4 = 0 + CA
	XOR	$1, R4	// c = borrow = CA ^ 1

done:
	MOVD	R4, c+72(FP)
	RET

// func addVW(z, x []Word, y Word) (c Word)
//
// z[i] = x[i] + carry for all i, where the carry into word 0 is y.
// After the first word only the CA bit has to be propagated, so the
// remaining words use ADDZE. c is the final carry.
TEXT ·addVW(SB), NOSPLIT, $0
	MOVD	z+0(FP), R10	// R10 = z[]
	MOVD	x+24(FP), R8	// R8 = x[]
	MOVD	y+48(FP), R4	// R4 = y = c
	MOVD	z_len+8(FP), R11	// R11 = z_len

	CMP	R0, R11	// If z_len is zero, return (c = y)
	BEQ	done

	// We will process the first iteration out of the loop so we capture
	// the value of c. In the subsequent iterations, we will rely on the
	// value of CA set here.
	MOVD	0(R8), R20	// R20 = x[0]
	ADD	$-1, R11	// R11 = z_len - 1 (words remaining)
	ADDC	R20, R4, R6	// R6 = x[0] + c, set CA
	CMP	R0, R11	// If z_len was 1, we are done
	MOVD	R6, 0(R10)	// z[0]
	BEQ	final

	// We will read 4 elements per iteration
	SRD	$2, R11, R9	// R9 = (z_len-1)/4 = number of 4-word groups left
	DCBT	(R8)	// cache-touch hint for x
	CMP	R0, R9
	MOVD	R9, CTR	// Set up the loop counter
	BEQ	tail	// If R9 = 0, we can't use the loop

loop:
	MOVD	8(R8), R20	// R20 = x[i]
	MOVD	16(R8), R21	// R21 = x[i+1]
	MOVD	24(R8), R22	// R22 = x[i+2]
	MOVDU	32(R8), R23	// R23 = x[i+3]
	ADDZE	R20, R24	// R24 = x[i] + CA
	ADDZE	R21, R25	// R25 = x[i+1] + CA
	ADDZE	R22, R26	// R26 = x[i+2] + CA
	ADDZE	R23, R27	// R27 = x[i+3] + CA
	MOVD	R24, 8(R10)	// z[i]
	MOVD	R25, 16(R10)	// z[i+1]
	MOVD	R26, 24(R10)	// z[i+2]
	MOVDU	R27, 32(R10)	// z[i+3]
	ADD	$-4, R11	// 4 fewer words remaining
	BC	16, 0, loop	// bdnz

	// We may have some elements to read
	CMP	R0, R11
	BEQ	final

	// Process the remaining elements (at most 3), one at a time
tail:
	MOVDU	8(R8), R20	// R20 = x[i]
	ADDZE	R20, R24	// R24 = x[i] + CA
	ADD	$-1, R11	// one fewer word remaining
	MOVDU	R24, 8(R10)	// z[i]
	CMP	R0, R11
	BEQ	final

	MOVDU	8(R8), R20
	ADDZE	R20, R24
	ADD	$-1, R11
	MOVDU	R24, 8(R10)
	CMP	R0, R11
	BEQ	final

	// Third and last possible leftover word: no pointer update needed.
	MOVD	8(R8), R20
	ADDZE	R20, R24
	MOVD	R24, 8(R10)

final:
	ADDZE	R0, R4	// c = 0 + CA
done:
	MOVD	R4, c+56(FP)
	RET

// func subVW(z, x []Word, y Word) (c Word)
//
// z[i] = x[i] - borrow for all i, where the borrow into word 0 is y.
// After the first word only the borrow (inverse of CA) has to be
// propagated, which is done with "SUBE R0, Rx". c is the final borrow.
TEXT ·subVW(SB), NOSPLIT, $0
	MOVD	z+0(FP), R10	// R10 = z[]
	MOVD	x+24(FP), R8	// R8 = x[]
	MOVD	y+48(FP), R4	// R4 = y = c
	MOVD	z_len+8(FP), R11	// R11 = z_len

	CMP	R0, R11	// If z_len is zero, return (c = y)
	BEQ	done

	// We will process the first iteration out of the loop so we capture
	// the value of c. In the subsequent iterations, we will rely on the
	// value of CA set here.
	MOVD	0(R8), R20	// R20 = x[0]
	ADD	$-1, R11	// R11 = z_len - 1 (words remaining)
	SUBC	R4, R20, R6	// R6 = x[0] - c, set CA
	CMP	R0, R11	// If z_len was 1, we are done
	MOVD	R6, 0(R10)	// z[0]
	BEQ	final

	// We will read 4 elements per iteration
	SRD	$2, R11, R9	// R9 = (z_len-1)/4 = number of 4-word groups left
	DCBT	(R8)	// cache-touch hint for x
	CMP	R0, R9
	MOVD	R9, CTR	// Set up the loop counter
	BEQ	tail	// If R9 = 0, we can't use the loop

	// The loop here is almost the same as the one used in s390x, but
	// we don't need to capture CA every iteration because we've already
	// done that above.
loop:
	MOVD	8(R8), R20	// R20 = x[i]
	MOVD	16(R8), R21	// R21 = x[i+1]
	MOVD	24(R8), R22	// R22 = x[i+2]
	MOVDU	32(R8), R23	// R23 = x[i+3]
	SUBE	R0, R20	// R20 = x[i] - borrow
	SUBE	R0, R21	// R21 = x[i+1] - borrow
	SUBE	R0, R22	// R22 = x[i+2] - borrow
	SUBE	R0, R23	// R23 = x[i+3] - borrow
	MOVD	R20, 8(R10)	// z[i]
	MOVD	R21, 16(R10)	// z[i+1]
	MOVD	R22, 24(R10)	// z[i+2]
	MOVDU	R23, 32(R10)	// z[i+3]
	ADD	$-4, R11	// 4 fewer words remaining
	BC	16, 0, loop	// bdnz

	// We may have some elements to read
	CMP	R0, R11
	BEQ	final

	// Process the remaining elements (at most 3), one at a time
tail:
	MOVDU	8(R8), R20	// R20 = x[i]
	SUBE	R0, R20	// R20 = x[i] - borrow
	ADD	$-1, R11	// one fewer word remaining
	MOVDU	R20, 8(R10)	// z[i]
	CMP	R0, R11
	BEQ	final

	MOVDU	8(R8), R20
	SUBE	R0, R20
	ADD	$-1, R11
	MOVDU	R20, 8(R10)
	CMP	R0, R11
	BEQ	final

	// Third and last possible leftover word: no pointer update needed.
	MOVD	8(R8), R20
	SUBE	R0, R20
	MOVD	R20, 8(R10)

final:
	// Capture CA
	SUBE	R4, R4	// R4 = R4 + ^R4 + CA = CA - 1 (0 if no borrow, -1 if borrow)
	NEG	R4, R4	// c = borrow = 1 - CA

done:
	MOVD	R4, c+56(FP)
	RET

//func shlVU(z, x []Word, s uint) (c Word)
//
// For s != 0: z[i] = x[i]<<s | x[i-1]>>ŝ for i = len(z)-1 down to 1,
// z[0] = x[0]<<s, and c = x[len(z)-1]>>ŝ, where ŝ = 64-s.
// For s == 0: copies min(len(x), len(z)) words from x to z (choosing the
// copy direction so that overlapping slices are handled) and returns c = 0.
TEXT ·shlVU(SB), NOSPLIT, $0
	MOVD	z+0(FP), R3	// R3 = z[]
	MOVD	x+24(FP), R6	// R6 = x[]
	MOVD	s+48(FP), R9	// R9 = s
	MOVD	z_len+8(FP), R4	// R4 = len(z)
	MOVD	x_len+32(FP), R7	// R7 = len(x)
	CMP	R9, R0	// s==0 copy(z,x)
	BEQ	zeroshift
	CMP	R4, R0	// len(z)==0 return
	BEQ	done

	ADD	$-1, R4, R5	// len(z)-1
	SUBC	R9, $64, R4	// ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)
	SLD	$3, R5, R7	// byte offset of the last word
	ADD	R6, R7, R15	// save starting address &x[len(z)-1]
	ADD	R3, R7, R16	// save starting address &z[len(z)-1]
	MOVD	(R6)(R7), R14
	SRD	R4, R14, R7	// compute x[len(z)-1]>>ŝ into R7
	CMP	R5, R0	// iterate from i=len(z)-1 to 0
	BEQ	loopexit	// Already at end?
	MOVD	0(R15), R10	// x[i]
shloop:
	SLD	R9, R10, R10	// x[i]<<s
	MOVDU	-8(R15), R14	// R14 = x[i-1], R15 = &x[i-1]
	SRD	R4, R14, R11	// x[i-1]>>ŝ
	OR	R11, R10, R10
	MOVD	R10, 0(R16)	// z[i]=x[i]<<s | x[i-1]>>ŝ
	MOVD	R14, R10	// reuse x[i-1] for next iteration
	ADD	$-8, R16	// i--
	CMP	R15, R6	// &x[i-1]>&x[0]?
	BGT	shloop
loopexit:
	MOVD	0(R6), R4
	SLD	R9, R4, R4
	MOVD	R4, 0(R3)	// z[0]=x[0]<<s
	MOVD	R7, c+56(FP)	// store pre-computed x[len(z)-1]>>ŝ into c
	RET

zeroshift:
	CMP	R6, R0	// x is null, nothing to copy
	BEQ	done
	CMP	R6, R3	// if x is same as z, nothing to copy
	BEQ	done
	CMP	R7, R4
	ISEL	$0, R7, R4, R7	// Take the lower bound of lengths of x,z
	SLD	$3, R7, R7	// R7 = number of bytes to copy
	CMP	R7, R0	// Nothing to copy: the loops below always move at
	BEQ	done	// least one word, so an empty copy must be skipped here
	SUB	R6, R3, R11	// dest - src
	CMPU	R11, R7, CR2	// < len?
	BLT	CR2, backward	// there is overlap, copy backwards
	MOVD	$0, R14
	// shlVU processes backwards, but added a forward copy option
	// since its faster on POWER
repeat:
	MOVD	(R6)(R14), R15	// Copy 8 bytes at a time
	MOVD	R15, (R3)(R14)
	ADD	$8, R14
	CMP	R14, R7	// More 8 bytes left?
	BLT	repeat
	BR	done
backward:
	ADD	$-8, R7, R14	// byte offset of the last word to copy
repeatback:
	MOVD	(R6)(R14), R15	// copy x into z backwards
	MOVD	R15, (R3)(R14)	// copy 8 bytes at a time
	SUB	$8, R14
	CMP	R14, $-8	// More 8 bytes left?
	BGT	repeatback

done:
	MOVD	R0, c+56(FP)	// c=0
	RET

//func shrVU(z, x []Word, s uint) (c Word)
//
// For s != 0: z[i-1] = x[i-1]>>s | x[i]<<ŝ for i = 1 to len(z)-1,
// z[len(z)-1] = x[len(z)-1]>>s, and c = x[0]<<ŝ, where ŝ = 64-s.
// Two words per iteration are processed with VSX instructions when
// len(z) >= 3 (VS32+n names the same register as Vn).
// For s == 0: copies min(len(x), len(z)) words from x to z, front to
// back, and returns c = 0.
TEXT ·shrVU(SB), NOSPLIT, $0
	MOVD	z+0(FP), R3	// R3 = z[]
	MOVD	x+24(FP), R6	// R6 = x[]
	MOVD	s+48(FP), R9	// R9 = s
	MOVD	z_len+8(FP), R4	// R4 = len(z)
	MOVD	x_len+32(FP), R7	// R7 = len(x)

	CMP	R9, R0	// s==0, copy(z,x)
	BEQ	zeroshift
	CMP	R4, R0	// len(z)==0 return
	BEQ	done
	SUBC	R9, $64, R5	// ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)

	MOVD	0(R6), R7
	SLD	R5, R7, R7	// compute x[0]<<ŝ
	MOVD	$1, R8	// iterate from i=1 to i<len(z)
	CMP	R8, R4
	BGE	loopexit	// Already at end?

	// vectorize if len(z) is >=3, else jump to scalar loop
	CMP	R4, $3
	BLT	scalar
	MTVSRD	R9, VS38	// s
	VSPLTB	$7, V6, V4	// V4 = s replicated into every byte
	MTVSRD	R5, VS39	// ŝ
	VSPLTB	$7, V7, V2	// V2 = ŝ replicated into every byte
	ADD	$-2, R4, R16	// R16 = len(z)-2, last i with i+1 still in range
	PCALIGN	$16
loopback:
	ADD	$-1, R8, R10
	SLD	$3, R10	// R10 = byte offset of x[i-1]
	LXVD2X	(R6)(R10), VS32	// load x[i-1], x[i]
	SLD	$3, R8, R12	// R12 = byte offset of x[i]
	LXVD2X	(R6)(R12), VS33	// load x[i], x[i+1]

	VSRD	V0, V4, V3	// x[i-1]>>s, x[i]>>s
	VSLD	V1, V2, V5	// x[i]<<ŝ, x[i+1]<<ŝ
	VOR	V3, V5, V5	// Or(|) the two registers together
	STXVD2X	VS37, (R3)(R10)	// store into z[i-1] and z[i]
	ADD	$2, R8	// Done processing 2 entries, i and i+1
	CMP	R8, R16	// Are there at least a couple of more entries left?
	BLE	loopback
	CMP	R8, R4	// Are we at the last element?
	BEQ	loopexit
scalar:
	// Here R8 = len(z)-1: one (z[i-1], x[i]) pair is left to combine.
	ADD	$-1, R8, R10
	SLD	$3, R10
	MOVD	(R6)(R10), R11
	SRD	R9, R11, R11	// x[len(z)-2] >> s
	SLD	$3, R8, R12
	MOVD	(R6)(R12), R12
	SLD	R5, R12, R12	// x[len(z)-1]<<ŝ
	OR	R12, R11, R11	// x[len(z)-2]>>s | x[len(z)-1]<<ŝ
	MOVD	R11, (R3)(R10)	// z[len(z)-2]=x[len(z)-2]>>s | x[len(z)-1]<<ŝ
loopexit:
	ADD	$-1, R4
	SLD	$3, R4	// R4 = byte offset of the last word
	MOVD	(R6)(R4), R5
	SRD	R9, R5, R5	// x[len(z)-1]>>s
	MOVD	R5, (R3)(R4)	// z[len(z)-1]=x[len(z)-1]>>s
	MOVD	R7, c+56(FP)	// store pre-computed x[0]<<ŝ into c
	RET

zeroshift:
	CMP	R6, R0	// x is null, nothing to copy
	BEQ	done
	CMP	R6, R3	// if x is same as z, nothing to copy
	BEQ	done
	CMP	R7, R4
	ISEL	$0, R7, R4, R7	// Take the lower bounds of lengths of x, z
	SLD	$3, R7, R7	// R7 = number of bytes to copy
	CMP	R7, R0	// Nothing to copy: the loop below always moves at
	BEQ	done	// least one word, so an empty copy must be skipped here
	MOVD	$0, R14
repeat:
	MOVD	(R6)(R14), R15	// copy 8 bytes at a time
	MOVD	R15, (R3)(R14)	// shrVU processes bytes only forwards
	ADD	$8, R14
	CMP	R14, R7	// More 8 bytes left?
	BLT	repeat
done:
	MOVD	R0, c+56(FP)	// c=0
	RET

// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// z[i] = low word of (x[i]*y + c), with c starting as r and then being the
// high word of the previous step. The returned c is the high word of the
// last step (or r itself when z is empty).
TEXT ·mulAddVWW(SB), NOSPLIT, $0
	MOVD	z+0(FP), R10	// R10 = z[]
	MOVD	x+24(FP), R8	// R8 = x[]
	MOVD	y+48(FP), R9	// R9 = y
	MOVD	r+56(FP), R4	// R4 = r = c
	MOVD	z_len+8(FP), R11	// R11 = z_len

	CMP	R0, R11	// If z_len is zero, return (c = r)
	BEQ	done

	// First word, handled outside the loop so the loop and tail can use
	// the update-form accesses at offset 8.
	MOVD	0(R8), R20	// R20 = x[0]
	ADD	$-1, R11	// R11 = z_len - 1 (words remaining)
	MULLD	R9, R20, R6	// R6 = z0 = Low-order(x[i]*y)
	MULHDU	R9, R20, R7	// R7 = z1 = High-order(x[i]*y)
	ADDC	R4, R6	// R6 = z0 + r
	ADDZE	R7	// R7 = z1 + CA
	CMP	R0, R11
	MOVD	R7, R4	// R4 = c
	MOVD	R6, 0(R10)	// z[i]
	BEQ	done

	// We will read 4 elements per iteration
	SRD	$2, R11, R14	// R14 = (z_len-1)/4 = number of 4-word groups left
	DCBT	(R8)	// cache-touch hint for x
	CMP	R0, R14
	MOVD	R14, CTR	// Set up the loop counter
	BEQ	tail	// If R14 = 0, we can't use the loop

	// The multiplies of the third and fourth words are issued before the
	// carry of the second word is consumed; each ADDC/ADDZE pair below is
	// self-contained (ADDC sets CA, the following ADDZE folds it into the
	// high word), so the pairs must stay adjacent and in this order.
loop:
	MOVD	8(R8), R20	// R20 = x[i]
	MOVD	16(R8), R21	// R21 = x[i+1]
	MOVD	24(R8), R22	// R22 = x[i+2]
	MOVDU	32(R8), R23	// R23 = x[i+3]
	MULLD	R9, R20, R24	// R24 = z0[i]
	MULHDU	R9, R20, R20	// R20 = z1[i]
	ADDC	R4, R24	// R24 = z0[i] + c
	ADDZE	R20	// R20 = z1[i] + CA
	MULLD	R9, R21, R25	// R25 = z0[i+1]
	MULHDU	R9, R21, R21	// R21 = z1[i+1]
	ADDC	R20, R25	// R25 = z0[i+1] + carry word from i
	ADDZE	R21	// R21 = z1[i+1] + CA
	MULLD	R9, R22, R26	// R26 = z0[i+2]
	MULHDU	R9, R22, R22	// R22 = z1[i+2]
	MULLD	R9, R23, R27	// R27 = z0[i+3]
	MULHDU	R9, R23, R23	// R23 = z1[i+3]
	ADDC	R21, R26	// R26 = z0[i+2] + carry word from i+1
	ADDZE	R22	// R22 = z1[i+2] + CA
	MOVD	R24, 8(R10)	// z[i]
	MOVD	R25, 16(R10)	// z[i+1]
	ADDC	R22, R27	// R27 = z0[i+3] + carry word from i+2
	ADDZE	R23, R4	// update carry: R4 = z1[i+3] + CA
	MOVD	R26, 24(R10)	// z[i+2]
	MOVDU	R27, 32(R10)	// z[i+3]
	ADD	$-4, R11	// 4 fewer words remaining
	BC	16, 0, loop	// bdnz

	// We may have some elements to read
	CMP	R0, R11
	BEQ	done

	// Process the remaining elements (at most 3), one at a time
tail:
	MOVDU	8(R8), R20	// R20 = x[i]
	MULLD	R9, R20, R24	// R24 = z0[i]
	MULHDU	R9, R20, R25	// R25 = z1[i]
	ADD	$-1, R11	// one fewer word remaining
	ADDC	R4, R24	// R24 = z0[i] + c
	ADDZE	R25	// R25 = z1[i] + CA
	MOVDU	R24, 8(R10)	// z[i]
	CMP	R0, R11
	MOVD	R25, R4	// R4 = c
	BEQ	done	// If R11 = 0, we are done

	MOVDU	8(R8), R20
	MULLD	R9, R20, R24
	MULHDU	R9, R20, R25
	ADD	$-1, R11
	ADDC	R4, R24
	ADDZE	R25
	MOVDU	R24, 8(R10)
	CMP	R0, R11
	MOVD	R25, R4
	BEQ	done

	// Third and last possible leftover word.
	MOVD	8(R8), R20
	MULLD	R9, R20, R24
	MULHDU	R9, R20, R25
	ADD	$-1, R11	// R11 is not read again after this point
	ADDC	R4, R24
	ADDZE	R25
	MOVD	R24, 8(R10)
	MOVD	R25, R4

done:
	MOVD	R4, c+64(FP)
	RET

// func addMulVVW(z, x []Word, y Word) (c Word)
//
// z[i] = low word of (z[i] + x[i]*y + c) for all i, with c starting at 0
// and then being the high word of the previous step. Returns the final c.
// This routine is not unrolled: one word per CTR iteration, indexed by R3.
TEXT ·addMulVVW(SB), NOSPLIT, $0
	MOVD	z+0(FP), R10	// R10 = z[]
	MOVD	x+24(FP), R8	// R8 = x[]
	MOVD	y+48(FP), R9	// R9 = y
	MOVD	z_len+8(FP), R22	// R22 = z_len

	MOVD	R0, R3	// R3 will be the index register (byte offset)
	CMP	R0, R22
	MOVD	R0, R4	// R4 = c = 0
	MOVD	R22, CTR	// Initialize loop counter
	BEQ	done	// If z_len is zero, return c = 0

loop:
	MOVD	(R8)(R3), R20	// Load x[i]
	MOVD	(R10)(R3), R21	// Load z[i]
	MULLD	R9, R20, R6	// R6 = Low-order(x[i]*y)
	MULHDU	R9, R20, R7	// R7 = High-order(x[i]*y)
	ADDC	R21, R6	// R6 = z0 = Low-order(x[i]*y) + z[i]
	ADDZE	R7	// R7 = z1 = High-order(x[i]*y) + CA
	ADDC	R4, R6	// R6 = z0 + c
	ADDZE	R7, R4	// R4 = c = z1 + CA
	MOVD	R6, (R10)(R3)	// Store z[i]
	ADD	$8, R3	// advance to the next word
	BC	16, 0, loop	// bdnz

done:
	MOVD	R4, c+56(FP)
	RET