// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !math_big_pure_go,s390x

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
//
// Conventions used throughout this file:
//   - R0 is kept at zero inside the loops and used as a zero operand.
//   - Labels are local to the enclosing TEXT block, so names such as
//     U1, L4 or vectorimpl are reused from function to function.
//   - The conditional branch that closes each loop tests the condition
//     code left by the SUB that immediately precedes it.

// hasVectorFacility stores a 1-byte result: 1 if the facility bit tested
// below is set and a vector instruction round-trips a value, 0 otherwise.
// It uses 24 bytes of stack as the STFLE output buffer.
TEXT ·hasVectorFacility(SB),NOSPLIT,$24-1
	MOVD	$x-24(SP), R1
	XC	$24, 0(R1), 0(R1)	// clear the storage
	MOVD	$2, R0			// R0 is the number of double words stored -1
	WORD	$0xB2B01000		// STFLE 0(R1)
	XOR	R0, R0			// reset the value of R0
	MOVBZ	z-8(SP), R1		// first byte of the third doubleword (bits 128..135)
	AND	$0x40, R1		// isolate bit 129
	BEQ	novector
vectorinstalled:
	// check if the vector instruction has been enabled
	VLEIB	$0, $0xF, V16
	VLGVB	$0, V16, R1
	CMPBNE	R1, $0xF, novector
	MOVB	$1, ret+0(FP)		// have vx
	RET
novector:
	MOVB	$0, ret+0(FP)		// no vx
	RET

// mulWW stores the 128-bit product x*y as z1 (high word) and z0 (low word).
// NOTE(review): the result is read from R10:R11, not from R4; this presumably
// relies on the assembler's MULHDU expansion leaving high:low in its
// temporaries R10/R11 - confirm against cmd/internal/obj/s390x.
TEXT ·mulWW(SB),NOSPLIT,$0
	MOVD	x+0(FP), R3
	MOVD	y+8(FP), R4
	MULHDU	R3, R4
	MOVD	R10, z1+16(FP)
	MOVD	R11, z0+24(FP)
	RET

// func divWW(x1, x0, y Word) (q, r Word)
// Divides the 128-bit value x1:x0 (held in the pair R10:R11) by y.
TEXT ·divWW(SB),NOSPLIT,$0
	MOVD	x1+0(FP), R10
	MOVD	x0+8(FP), R11
	MOVD	y+16(FP), R5
	WORD	$0xb98700a5		// dlgr r10,r5
	MOVD	R11, q+24(FP)		// quotient
	MOVD	R10, r+32(FP)		// remainder
	RET

// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
// func addVV(z, x, y []Word) (c Word)
//
// addVV branches through the code pointer stored in addvectorfacility.
// The pointer initially refers to addVV_check, which reads ·hasVX (defined
// outside this file), overwrites the pointer with addVV_vec or addVV_novec
// and then branches to the selected implementation.
TEXT ·addVV(SB),NOSPLIT,$0
	MOVD	addvectorfacility+0x00(SB),R1
	BR	(R1)

TEXT ·addVV_check(SB),NOSPLIT, $0
	MOVB	·hasVX(SB), R1
	CMPBEQ	R1, $1, vectorimpl	// vectorfacility = 1, vector supported
	MOVD	$addvectorfacility+0x00(SB), R1
	MOVD	$·addVV_novec(SB), R2
	MOVD	R2, 0(R1)		// remember the choice for later calls
	BR	·addVV_novec(SB)
vectorimpl:
	MOVD	$addvectorfacility+0x00(SB), R1
	MOVD	$·addVV_vec(SB), R2
	MOVD	R2, 0(R1)		// remember the choice for later calls
	BR	·addVV_vec(SB)

GLOBL addvectorfacility+0x00(SB), NOPTR, $8
DATA addvectorfacility+0x00(SB)/8, $·addVV_check(SB)

// addVV_vec is the vector implementation of addVV.
// R3 = remaining word count (biased, see below), R10 = byte offset i*8,
// R8/R9/R2 = x/y/z base, R5/R6/R7 = running x/y/z pointers for the 16x loop.
// In the scalar loops the carry is kept in R4 as 0 or -1 between iterations.
TEXT ·addVV_vec(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R3
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R9
	MOVD	z+0(FP), R2

	MOVD	$0, R4			// c = 0
	MOVD	$0, R0			// make sure it's zero
	MOVD	$0, R10			// i = 0

	SUB	$4, R3			// n -= 4
	BLT	v1			// fewer than 4 words: one-word loop only
	SUB	$12, R3			// n -= 12 (n is now len-16)
	BLT	A1			// fewer than 16 words: skip the vector loop

	MOVD	R8, R5
	MOVD	R9, R6
	MOVD	R2, R7
	// n >= 0
	// regular loop body unrolled 16x
	VZERO	V0			// c = 0
UU1:	VLM	0(R5), V1, V4		// 64 bytes of x into V1..V4
	ADD	$64, R5
	// VPDI $0x4 flips the doublewords to big-endian order
	VPDI	$0x4,V1,V1,V1
	VPDI	$0x4,V2,V2,V2

	VLM	0(R6), V9, V12		// 64 bytes of y into V9..V12
	ADD	$64, R6
	VPDI	$0x4,V9,V9,V9
	VPDI	$0x4,V10,V10,V10

	// each pair below produces the carry-out (VACCCQ) and the sum (VACQ)
	// of one 128-bit add; the carry is chained V0 -> V25 -> ... -> V31 -> V0
	VACCCQ	V1, V9, V0, V25
	VACQ	V1, V9, V0, V17
	VACCCQ	V2, V10, V25, V26
	VACQ	V2, V10, V25, V18

	VLM	0(R5), V5, V6		// 32 bytes of x into V5..V6
	VLM	0(R6), V13, V14		// 32 bytes of y into V13..V14
	ADD	$32, R5
	ADD	$32, R6

	VPDI	$0x4,V3,V3,V3
	VPDI	$0x4,V4,V4,V4
	VPDI	$0x4,V11,V11,V11
	VPDI	$0x4,V12,V12,V12

	VACCCQ	V3, V11, V26, V27
	VACQ	V3, V11, V26, V19
	VACCCQ	V4, V12, V27, V28
	VACQ	V4, V12, V27, V20

	VLM	0(R5), V7, V8		// 32 bytes of x into V7..V8
	VLM	0(R6), V15, V16		// 32 bytes of y into V15..V16
	ADD	$32, R5
	ADD	$32, R6

	VPDI	$0x4,V5,V5,V5
	VPDI	$0x4,V6,V6,V6
	VPDI	$0x4,V13,V13,V13
	VPDI	$0x4,V14,V14,V14

	VACCCQ	V5, V13, V28, V29
	VACQ	V5, V13, V28, V21
	VACCCQ	V6, V14, V29, V30
	VACQ	V6, V14, V29, V22

	VPDI	$0x4,V7,V7,V7
	VPDI	$0x4,V8,V8,V8
	VPDI	$0x4,V15,V15,V15
	VPDI	$0x4,V16,V16,V16

	VACCCQ	V7, V15, V30, V31
	VACQ	V7, V15, V30, V23
	VACCCQ	V8, V16, V31, V0	// V0 has carry-over
	VACQ	V8, V16, V31, V24

	// flip the sums back before storing
	VPDI	$0x4,V17,V17,V17
	VPDI	$0x4,V18,V18,V18
	VPDI	$0x4,V19,V19,V19
	VPDI	$0x4,V20,V20,V20
	VPDI	$0x4,V21,V21,V21
	VPDI	$0x4,V22,V22,V22
	VPDI	$0x4,V23,V23,V23
	VPDI	$0x4,V24,V24,V24
	VSTM	V17, V24, 0(R7)		// 128 bytes into z
	ADD	$128, R7
	ADD	$128, R10		// i += 16
	SUB	$16, R3			// n -= 16
	BGE	UU1			// if n >= 0 goto UU1
	VLGVG	$1, V0, R4		// put cf into R4
	NEG	R4, R4			// save cf (0 or -1)

A1:	ADD	$12, R3			// n += 12 (n is now remaining-4)
	BLT	v1			// if n < 0 goto v1

U1:	// n >= 0
	// regular loop body unrolled 4x
	MOVD	0(R8)(R10*1), R5
	MOVD	8(R8)(R10*1), R6
	MOVD	16(R8)(R10*1), R7
	MOVD	24(R8)(R10*1), R1
	ADDC	R4, R4			// restore CF
	MOVD	0(R9)(R10*1), R11
	ADDE	R11, R5
	MOVD	8(R9)(R10*1), R11
	ADDE	R11, R6
	MOVD	16(R9)(R10*1), R11
	ADDE	R11, R7
	MOVD	24(R9)(R10*1), R11
	ADDE	R11, R1
	MOVD	R0, R4
	ADDE	R4, R4			// save CF
	NEG	R4, R4
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R6, 8(R2)(R10*1)
	MOVD	R7, 16(R2)(R10*1)
	MOVD	R1, 24(R2)(R10*1)

	ADD	$32, R10		// i += 4
	SUB	$4, R3			// n -= 4
	BGE	U1			// if n >= 0 goto U1

v1:	ADD	$4, R3			// n += 4
	BLE	E1			// if n <= 0 goto E1

L1:	// n > 0
	ADDC	R4, R4			// restore CF
	MOVD	0(R8)(R10*1), R5
	MOVD	0(R9)(R10*1), R11
	ADDE	R11, R5
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R0, R4
	ADDE	R4, R4			// save CF
	NEG	R4, R4

	ADD	$8, R10			// i++
	SUB	$1, R3			// n--
	BGT	L1			// if n > 0 goto L1

E1:	NEG	R4, R4			// -1/0 -> 1/0
	MOVD	R4, c+72(FP)		// return c
	RET

// addVV_novec is the scalar implementation of addVV: a 4x unrolled
// add-with-carry loop followed by a one-word-at-a-time loop.
// The carry is kept in R4 as 0 or -1 between iterations.
TEXT ·addVV_novec(SB),NOSPLIT,$0
novec:
	MOVD	z_len+8(FP), R3
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R9
	MOVD	z+0(FP), R2

	MOVD	$0, R4			// c = 0
	MOVD	$0, R0			// make sure it's zero
	MOVD	$0, R10			// i = 0

	SUB	$4, R3			// n -= 4
	BLT	v1n			// if n < 0 goto v1n
U1n:	// n >= 0
	// regular loop body unrolled 4x
	MOVD	0(R8)(R10*1), R5
	MOVD	8(R8)(R10*1), R6
	MOVD	16(R8)(R10*1), R7
	MOVD	24(R8)(R10*1), R1
	ADDC	R4, R4			// restore CF
	MOVD	0(R9)(R10*1), R11
	ADDE	R11, R5
	MOVD	8(R9)(R10*1), R11
	ADDE	R11, R6
	MOVD	16(R9)(R10*1), R11
	ADDE	R11, R7
	MOVD	24(R9)(R10*1), R11
	ADDE	R11, R1
	MOVD	R0, R4
	ADDE	R4, R4			// save CF
	NEG	R4, R4
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R6, 8(R2)(R10*1)
	MOVD	R7, 16(R2)(R10*1)
	MOVD	R1, 24(R2)(R10*1)

	ADD	$32, R10		// i += 4
	SUB	$4, R3			// n -= 4
	BGE	U1n			// if n >= 0 goto U1n

v1n:	ADD	$4, R3			// n += 4
	BLE	E1n			// if n <= 0 goto E1n

L1n:	// n > 0
	ADDC	R4, R4			// restore CF
	MOVD	0(R8)(R10*1), R5
	MOVD	0(R9)(R10*1), R11
	ADDE	R11, R5
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R0, R4
	ADDE	R4, R4			// save CF
	NEG	R4, R4

	ADD	$8, R10			// i++
	SUB	$1, R3			// n--
	BGT	L1n			// if n > 0 goto L1n

E1n:	NEG	R4, R4			// -1/0 -> 1/0
	MOVD	R4, c+72(FP)		// return c
	RET

// subVV branches through the code pointer stored in subvectorfacility,
// which subVV_check replaces with subVV_vec or subVV_novec on first use.
TEXT ·subVV(SB),NOSPLIT,$0
	MOVD	subvectorfacility+0x00(SB),R1
	BR	(R1)

TEXT ·subVV_check(SB),NOSPLIT,$0
	MOVB	·hasVX(SB), R1
	CMPBEQ	R1, $1, vectorimpl	// vectorfacility = 1, vector supported
	MOVD	$subvectorfacility+0x00(SB), R1
	MOVD	$·subVV_novec(SB), R2
	MOVD	R2, 0(R1)		// remember the choice for later calls
	BR	·subVV_novec(SB)
vectorimpl:
	MOVD	$subvectorfacility+0x00(SB), R1
	MOVD	$·subVV_vec(SB), R2
	MOVD	R2, 0(R1)		// remember the choice for later calls
	BR	·subVV_vec(SB)

GLOBL subvectorfacility+0x00(SB), NOPTR, $8
DATA subvectorfacility+0x00(SB)/8, $·subVV_check(SB)

// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
//
// In the scalar loops the borrow is kept in R4 as 0 (no borrow) or -1.
TEXT ·subVV_vec(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R3
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R9
	MOVD	z+0(FP), R2
	MOVD	$0, R4			// c = 0
	MOVD	$0, R0			// make sure it's zero
	MOVD	$0, R10			// i = 0

	SUB	$4, R3			// n -= 4
	BLT	v1			// if n < 0 goto v1
	SUB	$12, R3			// n -= 12 (n is now len-16)
	BLT	A1			// if n < 0 goto A1

	MOVD	R8, R5
	MOVD	R9, R6
	MOVD	R2, R7

	// n >= 0
	// regular loop body unrolled 16x
	VZERO	V0			// cf = 0
	MOVD	$1, R4			// for 390 subtraction cf starts as 1 (no borrow)
	VLVGG	$1, R4, V0		// put carry into V0

UU1:	VLM	0(R5), V1, V4		// 64 bytes of x into V1..V4
	ADD	$64, R5
	// VPDI $0x4 flips the doublewords to big-endian order
	VPDI	$0x4,V1,V1,V1
	VPDI	$0x4,V2,V2,V2

	VLM	0(R6), V9, V12		// 64 bytes of y into V9..V12
	ADD	$64, R6
	VPDI	$0x4,V9,V9,V9
	VPDI	$0x4,V10,V10,V10

	// borrow indication is chained V0 -> V25 -> ... -> V31 -> V0
	VSBCBIQ	V1, V9, V0, V25
	VSBIQ	V1, V9, V0, V17
	VSBCBIQ	V2, V10, V25, V26
	VSBIQ	V2, V10, V25, V18

	VLM	0(R5), V5, V6		// 32 bytes of x into V5..V6
	VLM	0(R6), V13, V14		// 32 bytes of y into V13..V14
	ADD	$32, R5
	ADD	$32, R6

	VPDI	$0x4,V3,V3,V3
	VPDI	$0x4,V4,V4,V4
	VPDI	$0x4,V11,V11,V11
	VPDI	$0x4,V12,V12,V12

	VSBCBIQ	V3, V11, V26, V27
	VSBIQ	V3, V11, V26, V19
	VSBCBIQ	V4, V12, V27, V28
	VSBIQ	V4, V12, V27, V20

	VLM	0(R5), V7, V8		// 32 bytes of x into V7..V8
	VLM	0(R6), V15, V16		// 32 bytes of y into V15..V16
	ADD	$32, R5
	ADD	$32, R6

	VPDI	$0x4,V5,V5,V5
	VPDI	$0x4,V6,V6,V6
	VPDI	$0x4,V13,V13,V13
	VPDI	$0x4,V14,V14,V14

	VSBCBIQ	V5, V13, V28, V29
	VSBIQ	V5, V13, V28, V21
	VSBCBIQ	V6, V14, V29, V30
	VSBIQ	V6, V14, V29, V22

	VPDI	$0x4,V7,V7,V7
	VPDI	$0x4,V8,V8,V8
	VPDI	$0x4,V15,V15,V15
	VPDI	$0x4,V16,V16,V16

	VSBCBIQ	V7, V15, V30, V31
	VSBIQ	V7, V15, V30, V23
	VSBCBIQ	V8, V16, V31, V0	// V0 has carry-over
	VSBIQ	V8, V16, V31, V24

	// flip the differences back before storing
	VPDI	$0x4,V17,V17,V17
	VPDI	$0x4,V18,V18,V18
	VPDI	$0x4,V19,V19,V19
	VPDI	$0x4,V20,V20,V20
	VPDI	$0x4,V21,V21,V21
	VPDI	$0x4,V22,V22,V22
	VPDI	$0x4,V23,V23,V23
	VPDI	$0x4,V24,V24,V24
	VSTM	V17, V24, 0(R7)		// 128 bytes into z
	ADD	$128, R7
	ADD	$128, R10		// i += 16
	SUB	$16, R3			// n -= 16
	BGE	UU1			// if n >= 0 goto UU1
	VLGVG	$1, V0, R4		// put cf into R4
	SUB	$1, R4			// save cf: 1 (no borrow) -> 0, 0 (borrow) -> -1

A1:	ADD	$12, R3			// n += 12 (n is now remaining-4)
	BLT	v1			// if n < 0 goto v1

U1:	// n >= 0
	// regular loop body unrolled 4x
	MOVD	0(R8)(R10*1), R5
	MOVD	8(R8)(R10*1), R6
	MOVD	16(R8)(R10*1), R7
	MOVD	24(R8)(R10*1), R1
	MOVD	R0, R11
	SUBC	R4, R11			// restore CF
	MOVD	0(R9)(R10*1), R11
	SUBE	R11, R5
	MOVD	8(R9)(R10*1), R11
	SUBE	R11, R6
	MOVD	16(R9)(R10*1), R11
	SUBE	R11, R7
	MOVD	24(R9)(R10*1), R11
	SUBE	R11, R1
	MOVD	R0, R4
	SUBE	R4, R4			// save CF
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R6, 8(R2)(R10*1)
	MOVD	R7, 16(R2)(R10*1)
	MOVD	R1, 24(R2)(R10*1)

	ADD	$32, R10		// i += 4
	SUB	$4, R3			// n -= 4
	BGE	U1			// if n >= 0 goto U1

v1:	ADD	$4, R3			// n += 4
	BLE	E1			// if n <= 0 goto E1

L1:	// n > 0
	MOVD	R0, R11
	SUBC	R4, R11			// restore CF
	MOVD	0(R8)(R10*1), R5
	MOVD	0(R9)(R10*1), R11
	SUBE	R11, R5
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R0, R4
	SUBE	R4, R4			// save CF

	ADD	$8, R10			// i++
	SUB	$1, R3			// n--
	BGT	L1			// if n > 0 goto L1

E1:	NEG	R4, R4			// -1/0 -> 1/0
	MOVD	R4, c+72(FP)		// return c
	RET

// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
//
// subVV_novec is the scalar implementation of subVV.
TEXT ·subVV_novec(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R3
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R9
	MOVD	z+0(FP), R2

	MOVD	$0, R4			// c = 0
	MOVD	$0, R0			// make sure it's zero
	MOVD	$0, R10			// i = 0

	SUB	$4, R3			// n -= 4
	BLT	v1			// if n < 0 goto v1

U1:	// n >= 0
	// regular loop body unrolled 4x
	MOVD	0(R8)(R10*1), R5
	MOVD	8(R8)(R10*1), R6
	MOVD	16(R8)(R10*1), R7
	MOVD	24(R8)(R10*1), R1
	MOVD	R0, R11
	SUBC	R4, R11			// restore CF
	MOVD	0(R9)(R10*1), R11
	SUBE	R11, R5
	MOVD	8(R9)(R10*1), R11
	SUBE	R11, R6
	MOVD	16(R9)(R10*1), R11
	SUBE	R11, R7
	MOVD	24(R9)(R10*1), R11
	SUBE	R11, R1
	MOVD	R0, R4
	SUBE	R4, R4			// save CF
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R6, 8(R2)(R10*1)
	MOVD	R7, 16(R2)(R10*1)
	MOVD	R1, 24(R2)(R10*1)

	ADD	$32, R10		// i += 4
	SUB	$4, R3			// n -= 4
	BGE	U1			// if n >= 0 goto U1

v1:	ADD	$4, R3			// n += 4
	BLE	E1			// if n <= 0 goto E1

L1:	// n > 0
	MOVD	R0, R11
	SUBC	R4, R11			// restore CF
	MOVD	0(R8)(R10*1), R5
	MOVD	0(R9)(R10*1), R11
	SUBE	R11, R5
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R0, R4
	SUBE	R4, R4			// save CF

	ADD	$8, R10			// i++
	SUB	$1, R3			// n--
	BGT	L1			// if n > 0 goto L1

E1:	NEG	R4, R4			// -1/0 -> 1/0
	MOVD	R4, c+72(FP)		// return c
	RET

// addVW branches through the code pointer stored in addwvectorfacility,
// which addVW_check replaces with addVW_vec or addVW_novec on first use.
TEXT ·addVW(SB),NOSPLIT,$0
	MOVD	addwvectorfacility+0x00(SB),R1
	BR	(R1)

TEXT ·addVW_check(SB),NOSPLIT,$0
	MOVB	·hasVX(SB), R1
	CMPBEQ	R1, $1, vectorimpl	// vectorfacility = 1, vector supported
	MOVD	$addwvectorfacility+0x00(SB), R1
	MOVD	$·addVW_novec(SB), R2
	MOVD	R2, 0(R1)		// remember the choice for later calls
	BR	·addVW_novec(SB)
vectorimpl:
	MOVD	$addwvectorfacility+0x00(SB), R1
	MOVD	$·addVW_vec(SB), R2
	MOVD	R2, 0(R1)		// remember the choice for later calls
	BR	·addVW_vec(SB)

GLOBL addwvectorfacility+0x00(SB), NOPTR, $8
DATA addwvectorfacility+0x00(SB)/8, $·addVW_check(SB)

// func addVW_vec(z, x []Word, y Word) (c Word)
// R4 holds the word still to be added (y at first, then the 0/1 carry).
TEXT ·addVW_vec(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R3
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R4		// c = y
	MOVD	z+0(FP), R2

	MOVD	$0, R0			// make sure it's zero
	MOVD	$0, R10			// i = 0
	MOVD	R8, R5
	MOVD	R2, R7

	SUB	$4, R3			// n -= 4
	BLT	v10			// if n < 0 goto v10
	SUB	$12, R3			// n -= 12 (n is now len-16)
	BLT	A10			// if n < 0 goto A10

	// n >= 0
	// regular loop body unrolled 16x
	VZERO	V0			// prepare V0 to be final carry register
	VZERO	V9			// to ensure upper half is zero
	VLVGG	$1, R4, V9		// V9 = y for the very first add only
UU1:	VLM	0(R5), V1, V4		// 64-bytes into V1..V4
	ADD	$64, R5
	// VPDI $0x4 flips the doublewords to big-endian order
	VPDI	$0x4,V1,V1,V1
	VPDI	$0x4,V2,V2,V2

	VACCCQ	V1, V9, V0, V25
	VACQ	V1, V9, V0, V17
	VZERO	V9			// from here on only the carry propagates
	VACCCQ	V2, V9, V25, V26
	VACQ	V2, V9, V25, V18

	VLM	0(R5), V5, V6		// 32-bytes into V5..V6
	ADD	$32, R5

	VPDI	$0x4,V3,V3,V3
	VPDI	$0x4,V4,V4,V4

	VACCCQ	V3, V9, V26, V27
	VACQ	V3, V9, V26, V19
	VACCCQ	V4, V9, V27, V28
	VACQ	V4, V9, V27, V20

	VLM	0(R5), V7, V8		// 32-bytes into V7..V8
	ADD	$32, R5

	VPDI	$0x4,V5,V5,V5
	VPDI	$0x4,V6,V6,V6

	VACCCQ	V5, V9, V28, V29
	VACQ	V5, V9, V28, V21
	VACCCQ	V6, V9, V29, V30
	VACQ	V6, V9, V29, V22

	VPDI	$0x4,V7,V7,V7
	VPDI	$0x4,V8,V8,V8

	VACCCQ	V7, V9, V30, V31
	VACQ	V7, V9, V30, V23
	VACCCQ	V8, V9, V31, V0		// V0 has carry-over
	VACQ	V8, V9, V31, V24

	// flip the sums back before storing
	VPDI	$0x4,V17,V17,V17
	VPDI	$0x4,V18,V18,V18
	VPDI	$0x4,V19,V19,V19
	VPDI	$0x4,V20,V20,V20
	VPDI	$0x4,V21,V21,V21
	VPDI	$0x4,V22,V22,V22
	VPDI	$0x4,V23,V23,V23
	VPDI	$0x4,V24,V24,V24
	VSTM	V17, V24, 0(R7)		// 128-bytes into z
	ADD	$128, R7
	ADD	$128, R10		// i += 16
	SUB	$16, R3			// n -= 16
	BGE	UU1			// if n >= 0 goto UU1
	VLGVG	$1, V0, R4		// put cf into R4 in case we branch to v10

A10:	ADD	$12, R3			// n += 12 (n is now remaining-4)
	BLT	v10			// if n < 0 goto v10

U4:	// n >= 0
	// regular loop body unrolled 4x
	MOVD	0(R8)(R10*1), R5
	MOVD	8(R8)(R10*1), R6
	MOVD	16(R8)(R10*1), R7
	MOVD	24(R8)(R10*1), R1
	ADDC	R4, R5
	ADDE	R0, R6
	ADDE	R0, R7
	ADDE	R0, R1
	ADDE	R0, R0			// R0 = carry out (R0 was zero)
	MOVD	R0, R4			// save CF
	SUB	R0, R0			// R0 back to zero
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R6, 8(R2)(R10*1)
	MOVD	R7, 16(R2)(R10*1)
	MOVD	R1, 24(R2)(R10*1)

	ADD	$32, R10		// i += 4 -> i +=32
	SUB	$4, R3			// n -= 4
	BGE	U4			// if n >= 0 goto U4

v10:	ADD	$4, R3			// n += 4
	BLE	E10			// if n <= 0 goto E10

L4:	// n > 0
	MOVD	0(R8)(R10*1), R5
	ADDC	R4, R5
	ADDE	R0, R0			// R0 = carry out
	MOVD	R0, R4			// save CF
	SUB	R0, R0			// R0 back to zero
	MOVD	R5, 0(R2)(R10*1)

	ADD	$8, R10			// i++
	SUB	$1, R3			// n--
	BGT	L4			// if n > 0 goto L4

E10:	MOVD	R4, c+56(FP)		// return c
	RET

// addVW_novec is the scalar implementation of addVW.
TEXT ·addVW_novec(SB),NOSPLIT,$0
//DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0)
	MOVD	z_len+8(FP), R3
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R4		// c = y
	MOVD	z+0(FP), R2
	MOVD	$0, R0			// make sure it's 0
	MOVD	$0, R10			// i = 0

	SUB	$4, R3			// n -= 4
	BLT	v4			// if n < 0 goto v4

U4:	// n >= 0
	// regular loop body unrolled 4x
	MOVD	0(R8)(R10*1), R5
	MOVD	8(R8)(R10*1), R6
	MOVD	16(R8)(R10*1), R7
	MOVD	24(R8)(R10*1), R1
	ADDC	R4, R5
	ADDE	R0, R6
	ADDE	R0, R7
	ADDE	R0, R1
	ADDE	R0, R0			// R0 = carry out (R0 was zero)
	MOVD	R0, R4			// save CF
	SUB	R0, R0			// R0 back to zero
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R6, 8(R2)(R10*1)
	MOVD	R7, 16(R2)(R10*1)
	MOVD	R1, 24(R2)(R10*1)

	ADD	$32, R10		// i += 4 -> i +=32
	SUB	$4, R3			// n -= 4
	BGE	U4			// if n >= 0 goto U4

v4:	ADD	$4, R3			// n += 4
	BLE	E4			// if n <= 0 goto E4

L4:	// n > 0
	MOVD	0(R8)(R10*1), R5
	ADDC	R4, R5
	ADDE	R0, R0			// R0 = carry out
	MOVD	R0, R4			// save CF
	SUB	R0, R0			// R0 back to zero
	MOVD	R5, 0(R2)(R10*1)

	ADD	$8, R10			// i++
	SUB	$1, R3			// n--
	BGT	L4			// if n > 0 goto L4

E4:	MOVD	R4, c+56(FP)		// return c
	RET

// subVW branches through the code pointer stored in subwvectorfacility,
// which subVW_check replaces with subVW_vec or subVW_novec on first use.
TEXT ·subVW(SB),NOSPLIT,$0
	MOVD	subwvectorfacility+0x00(SB),R1
	BR	(R1)

TEXT ·subVW_check(SB),NOSPLIT,$0
	MOVB	·hasVX(SB), R1
	CMPBEQ	R1, $1, vectorimpl	// vectorfacility = 1, vector supported
	MOVD	$subwvectorfacility+0x00(SB), R1
	MOVD	$·subVW_novec(SB), R2
	MOVD	R2, 0(R1)		// remember the choice for later calls
	BR	·subVW_novec(SB)
vectorimpl:
	MOVD	$subwvectorfacility+0x00(SB), R1
	MOVD	$·subVW_vec(SB), R2
	MOVD	R2, 0(R1)		// remember the choice for later calls
	BR	·subVW_vec(SB)

GLOBL subwvectorfacility+0x00(SB), NOPTR, $8
DATA subwvectorfacility+0x00(SB)/8, $·subVW_check(SB)

// func subVW(z, x []Word, y Word) (c Word)
// R4 holds the word still to be subtracted (y at first, then the 0/1 borrow).
TEXT ·subVW_vec(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R3
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R4		// c = y
	MOVD	z+0(FP), R2

	MOVD	$0, R0			// make sure it's zero
	MOVD	$0, R10			// i = 0
	MOVD	R8, R5
	MOVD	R2, R7

	SUB	$4, R3			// n -= 4
	BLT	v11			// if n < 0 goto v11
	SUB	$12, R3			// n -= 12 (n is now len-16)
	BLT	A11			// if n < 0 goto A11

	VZERO	V0
	MOVD	$1, R6			// prepare V0 to be final carry register
	VLVGG	$1, R6, V0		// borrow is initially "no borrow"
	VZERO	V9			// to ensure upper half is zero
	VLVGG	$1, R4, V9		// V9 = y for the very first subtract only

	// n >= 0
	// regular loop body unrolled 16x
UU1:	VLM	0(R5), V1, V4		// 64-bytes into V1..V4
	ADD	$64, R5
	// VPDI $0x4 flips the doublewords to big-endian order
	VPDI	$0x4,V1,V1,V1
	VPDI	$0x4,V2,V2,V2

	VSBCBIQ	V1, V9, V0, V25
	VSBIQ	V1, V9, V0, V17
	VZERO	V9			// from here on only the borrow propagates
	VSBCBIQ	V2, V9, V25, V26
	VSBIQ	V2, V9, V25, V18

	VLM	0(R5), V5, V6		// 32-bytes into V5..V6
	ADD	$32, R5

	VPDI	$0x4,V3,V3,V3
	VPDI	$0x4,V4,V4,V4

	VSBCBIQ	V3, V9, V26, V27
	VSBIQ	V3, V9, V26, V19
	VSBCBIQ	V4, V9, V27, V28
	VSBIQ	V4, V9, V27, V20

	VLM	0(R5), V7, V8		// 32-bytes into V7..V8
	ADD	$32, R5

	VPDI	$0x4,V5,V5,V5
	VPDI	$0x4,V6,V6,V6

	VSBCBIQ	V5, V9, V28, V29
	VSBIQ	V5, V9, V28, V21
	VSBCBIQ	V6, V9, V29, V30
	VSBIQ	V6, V9, V29, V22

	VPDI	$0x4,V7,V7,V7
	VPDI	$0x4,V8,V8,V8

	VSBCBIQ	V7, V9, V30, V31
	VSBIQ	V7, V9, V30, V23
	VSBCBIQ	V8, V9, V31, V0		// V0 has carry-over
	VSBIQ	V8, V9, V31, V24

	// flip the differences back before storing
	VPDI	$0x4,V17,V17,V17
	VPDI	$0x4,V18,V18,V18
	VPDI	$0x4,V19,V19,V19
	VPDI	$0x4,V20,V20,V20
	VPDI	$0x4,V21,V21,V21
	VPDI	$0x4,V22,V22,V22
	VPDI	$0x4,V23,V23,V23
	VPDI	$0x4,V24,V24,V24
	VSTM	V17, V24, 0(R7)		// 128-bytes into z
	ADD	$128, R7
	ADD	$128, R10		// i += 16
	SUB	$16, R3			// n -= 16
	BGE	UU1			// if n >= 0 goto UU1
	VLGVG	$1, V0, R4		// put cf into R4 in case we branch to v11
	SUB	$1, R4			// save cf
	NEG	R4, R4			// R4 = 1 - cf, i.e. the borrow as 0 or 1
A11:	ADD	$12, R3			// n += 12 (n is now remaining-4)
	BLT	v11			// if n < 0 goto v11

U4:	// n >= 0
	// regular loop body unrolled 4x
	MOVD	0(R8)(R10*1), R5
	MOVD	8(R8)(R10*1), R6
	MOVD	16(R8)(R10*1), R7
	MOVD	24(R8)(R10*1), R1
	SUBC	R4, R5			//SLGR  -> SUBC
	SUBE	R0, R6			//SLBGR -> SUBE
	SUBE	R0, R7
	SUBE	R0, R1
	SUBE	R4, R4			// save CF
	NEG	R4, R4
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R6, 8(R2)(R10*1)
	MOVD	R7, 16(R2)(R10*1)
	MOVD	R1, 24(R2)(R10*1)

	ADD	$32, R10		// i += 4 -> i +=32
	SUB	$4, R3			// n -= 4
	BGE	U4			// if n >= 0 goto U4

v11:	ADD	$4, R3			// n += 4
	BLE	E11			// if n <= 0 goto E11

L4:	// n > 0
	MOVD	0(R8)(R10*1), R5
	SUBC	R4, R5
	SUBE	R4, R4			// save CF
	NEG	R4, R4
	MOVD	R5, 0(R2)(R10*1)

	ADD	$8, R10			// i++
	SUB	$1, R3			// n--
	BGT	L4			// if n > 0 goto L4

E11:	MOVD	R4, c+56(FP)		// return c
	RET

//DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0)
// func subVW(z, x []Word, y Word) (c Word)
// (same as addVW except for SUBC/SUBE instead of ADDC/ADDE and label names)
//
// subVW_novec is the scalar implementation of subVW.
TEXT ·subVW_novec(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R3
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R4		// c = y
	MOVD	z+0(FP), R2
	MOVD	$0, R0			// make sure it's 0
	MOVD	$0, R10			// i = 0

	SUB	$4, R3			// n -= 4
	BLT	v4			// if n < 0 goto v4

U4:	// n >= 0
	// regular loop body unrolled 4x
	MOVD	0(R8)(R10*1), R5
	MOVD	8(R8)(R10*1), R6
	MOVD	16(R8)(R10*1), R7
	MOVD	24(R8)(R10*1), R1
	SUBC	R4, R5			//SLGR  -> SUBC
	SUBE	R0, R6			//SLBGR -> SUBE
	SUBE	R0, R7
	SUBE	R0, R1
	SUBE	R4, R4			// save CF
	NEG	R4, R4
	MOVD	R5, 0(R2)(R10*1)
	MOVD	R6, 8(R2)(R10*1)
	MOVD	R7, 16(R2)(R10*1)
	MOVD	R1, 24(R2)(R10*1)

	ADD	$32, R10		// i += 4 -> i +=32
	SUB	$4, R3			// n -= 4
	BGE	U4			// if n >= 0 goto U4

v4:	ADD	$4, R3			// n += 4
	BLE	E4			// if n <= 0 goto E4

L4:	// n > 0
	MOVD	0(R8)(R10*1), R5
	SUBC	R4, R5
	SUBE	R4, R4			// save CF
	NEG	R4, R4
	MOVD	R5, 0(R2)(R10*1)

	ADD	$8, R10			// i++
	SUB	$1, R3			// n--
	BGT	L4			// if n > 0 goto L4

E4:	MOVD	R4, c+56(FP)		// return c
	RET

// func shlVU(z, x []Word, s uint) (c Word)
// Shifts x left by s bits into z and returns the bits shifted out of the
// top word. Words are processed from the highest index down to index 0.
// R5 = i*8 (byte offset of the current word), R4 = s, R7 = 64-s.
TEXT ·shlVU(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R5
	MOVD	$0, R0
	SUB	$1, R5			// n--
	BLT	X8b			// n < 0        (n <= 0)

	// n > 0
	MOVD	s+48(FP), R4
	CMPBEQ	R0, R4, Z80		// handle 0 case beq
	MOVD	$64, R6
	CMPBEQ	R6, R4, Z864		// handle 64 case beq
	MOVD	z+0(FP), R2
	MOVD	x+24(FP), R8
	SLD	$3, R5			// R5 = (n-1)*8
	SUB	R4, R6, R7		// R7 = 64-s
	MOVD	(R8)(R5*1), R10		// w1 = x[n-1]
	SRD	R7, R10, R3
	MOVD	R3, c+56(FP)		// c = w1>>(64-s)

	MOVD	$0, R1			// (R1 is not read on this path)
	BR	E8

	// i > 0
L8:	MOVD	R10, R3			// w = w1
	MOVD	-8(R8)(R5*1), R10	// w1 = x[i-1]

	SLD	R4, R3			// w<<s | w1>>(64-s)
	SRD	R7, R10, R6
	OR	R6, R3
	MOVD	R3, (R2)(R5*1)		// z[i] = w<<s | w1>>(64-s)
	SUB	$8, R5			// i--

E8:	CMPBGT	R5, R0, L8		// i > 0

	// i == 0
X8a:	SLD	R4, R10			// w1<<s
	MOVD	R10, (R2)		// z[0] = w1<<s
	RET

X8b:	MOVD	R0, c+56(FP)
	RET

	// s == 0: plain copy of x into z, c = 0. The copy runs from the
	// highest word down to word 0, like the shifting loop above, so a z
	// that overlaps x at a higher address is not overwritten before it
	// has been read.
Z80:	MOVD	z+0(FP), R2
	MOVD	x+24(FP), R8
	SLD	$3, R5			// R5 = (n-1)*8
	MOVD	R0, c+56(FP)		// c = 0
	BR	E8Z

	// i > 0
L8Z:	MOVD	(R8)(R5*1), R3
	MOVD	R3, (R2)(R5*1)		// z[i] = x[i]
	SUB	$8, R5			// i--

E8Z:	CMPBGT	R5, R0, L8Z		// i > 0

	// i == 0
	MOVD	(R8), R3
	MOVD	R3, (R2)		// z[0] = x[0]
	RET

	// s == 64: every word moves up by one position, c = x[n-1], z[0] = 0
Z864:	MOVD	z+0(FP), R2
	MOVD	x+24(FP), R8
	SLD	$3, R5			// R5 = (n-1)*8
	MOVD	(R8)(R5*1), R3		// w1 = x[n-1]
	MOVD	R3, c+56(FP)		// c = x[n-1]

	BR	E864

	// i > 0
L864:	MOVD	-8(R8)(R5*1), R3

	MOVD	R3, (R2)(R5*1)		// z[i] = x[i-1]
	SUB	$8, R5			// i--

E864:	CMPBGT	R5, R0, L864		// i > 0

	MOVD	R0, (R2)		// z[0] = 0
	RET

// CX = R4, r8 = r8, r10 = r2 , r11 = r5, DX = r3, AX = r10 , BX = R1 , 64-count = r7 (R0 set to 0) temp = R6
// func shrVU(z, x []Word, s uint) (c Word)
// Shifts x right by s bits into z and returns the bits shifted out of the
// bottom word. Words are processed from index 0 upwards.
// R1 = i*8, R5 = (n-1)*8, R4 = s, R7 = 64-s.
TEXT ·shrVU(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R5
	MOVD	$0, R0
	SUB	$1, R5			// n--
	BLT	X9b			// n < 0        (n <= 0)

	// n > 0
	MOVD	s+48(FP), R4
	CMPBEQ	R0, R4, ZB0		// handle 0 case beq
	MOVD	$64, R6
	CMPBEQ	R6, R4, ZB64		// handle 64 case beq
	MOVD	z+0(FP), R2
	MOVD	x+24(FP), R8
	SLD	$3, R5			// R5 = (n-1)*8
	SUB	R4, R6, R7		// R7 = 64-s
	MOVD	(R8), R10		// w1 = x[0]
	SLD	R7, R10, R3
	MOVD	R3, c+56(FP)		// c = w1<<(64-s)

	MOVD	$0, R1			// i = 0
	BR	E9

	// i < n-1
L9:	MOVD	R10, R3			// w = w1
	MOVD	8(R8)(R1*1), R10	// w1 = x[i+1]

	SRD	R4, R3			// w>>s | w1<<(64-s)
	SLD	R7, R10, R6
	OR	R6, R3
	MOVD	R3, (R2)(R1*1)		// z[i] = w>>s | w1<<(64-s)
	ADD	$8, R1			// i++

E9:	CMPBLT	R1, R5, L9		// i < n-1

	// i >= n-1
X9a:	SRD	R4, R10			// w1>>s
	MOVD	R10, (R2)(R5*1)		// z[n-1] = w1>>s
	RET

X9b:	MOVD	R0, c+56(FP)
	RET

	// s == 0: plain copy of x into z from index 0 upwards, c = 0
ZB0:	MOVD	z+0(FP), R2
	MOVD	x+24(FP), R8
	SLD	$3, R5			// R5 = (n-1)*8

	MOVD	(R8), R10		// w1 = x[0]
	MOVD	$0, R3
	MOVD	R3, c+56(FP)		// c = 0

	MOVD	$0, R1			// i = 0
	BR	E9Z

	// i < n-1
L9Z:	MOVD	R10, R3			// w = w1
	MOVD	8(R8)(R1*1), R10	// w1 = x[i+1]

	MOVD	R3, (R2)(R1*1)		// z[i] = w
	ADD	$8, R1			// i++

E9Z:	CMPBLT	R1, R5, L9Z		// i < n-1

	// i >= n-1
	MOVD	R10, (R2)(R5*1)		// z[n-1] = w1
	RET

	// s == 64: every word moves down by one position, c = x[0], z[n-1] = 0
ZB64:	MOVD	z+0(FP), R2
	MOVD	x+24(FP), R8
	SLD	$3, R5			// R5 = (n-1)*8
	MOVD	(R8), R3		// w1 = x[0]
	MOVD	R3, c+56(FP)		// c = x[0]

	MOVD	$0, R1			// i = 0
	BR	E964

	// i < n-1
L964:	MOVD	8(R8)(R1*1), R3		// w1 = x[i+1]

	MOVD	R3, (R2)(R1*1)		// z[i] = x[i+1]
	ADD	$8, R1			// i++

E964:	CMPBLT	R1, R5, L964		// i < n-1

	// i >= n-1
	MOVD	$0, R10
	MOVD	R10, (R2)(R5*1)		// z[n-1] = 0
	RET

// CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, DX = r3, AX = r6 , BX = R1 , (R0 set to 0) + use R11 + use R7 for i
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
// For each i: (c, z[i]) = x[i]*y + c, starting with c = r.
// After MULHDU the high product word is in R6 and the low word is taken
// from R11 (see the note on mulWW).
TEXT ·mulAddVWW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R2
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R9
	MOVD	r+56(FP), R4		// c = r
	MOVD	z_len+8(FP), R5
	MOVD	$0, R1			// i*8 = 0
	MOVD	$0, R7			// i = 0
	MOVD	$0, R0			// make sure it's zero
	BR	E5

L5:	MOVD	(R8)(R1*1), R6
	MULHDU	R9, R6
	ADDC	R4, R11			// add c to low order bits
	ADDE	R0, R6			// propagate the carry into the high word
	MOVD	R11, (R2)(R1*1)
	MOVD	R6, R4			// c = high word
	ADD	$8, R1			// i*8 + 8
	ADD	$1, R7			// i++

E5:	CMPBLT	R7, R5, L5		// i < n

	MOVD	R4, c+64(FP)
	RET

// func addMulVVW(z, x []Word, y Word) (c Word)
// CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1 , (R0 set to 0) + use R11 + use R7 for i
// For each i: (c, z[i]) = z[i] + x[i]*y + c, starting with c = 0.
// The A6 loop handles two words per iteration up to R12 = n rounded down
// to an even count; the L6 loop handles the remaining word, if any.
TEXT ·addMulVVW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R2
	MOVD	x+24(FP), R8
	MOVD	y+48(FP), R9
	MOVD	z_len+8(FP), R5

	MOVD	$0, R1			// i*8 = 0
	MOVD	$0, R7			// i = 0
	MOVD	$0, R0			// make sure it's zero
	MOVD	$0, R4			// c = 0

	MOVD	R5, R12
	AND	$-2, R12		// R12 = n with the low bit cleared
	CMPBGE	R5, $2, A6
	BR	E6

A6:	MOVD	(R8)(R1*1), R6
	MULHDU	R9, R6
	MOVD	(R2)(R1*1), R10
	ADDC	R10, R11		// add z[i] to low order bits
	ADDE	R0, R6
	ADDC	R4, R11			// add c to low order bits
	ADDE	R0, R6
	MOVD	R6, R4			// c = high word
	MOVD	R11, (R2)(R1*1)

	MOVD	(8)(R8)(R1*1), R6
	MULHDU	R9, R6
	MOVD	(8)(R2)(R1*1), R10
	ADDC	R10, R11		// add z[i+1] to low order bits
	ADDE	R0, R6
	ADDC	R4, R11			// add c to low order bits
	ADDE	R0, R6
	MOVD	R6, R4			// c = high word
	MOVD	R11, (8)(R2)(R1*1)

	ADD	$16, R1			// i*8 + 16
	ADD	$2, R7			// i += 2

	CMPBLT	R7, R12, A6
	BR	E6

L6:	MOVD	(R8)(R1*1), R6
	MULHDU	R9, R6
	MOVD	(R2)(R1*1), R10
	ADDC	R10, R11		// add z[i] to low order bits
	ADDE	R0, R6
	ADDC	R4, R11			// add c to low order bits
	ADDE	R0, R6
	MOVD	R6, R4			// c = high word
	MOVD	R11, (R2)(R1*1)

	ADD	$8, R1			// i*8 + 8
	ADD	$1, R7			// i++

E6:	CMPBLT	R7, R5, L6		// i < n

	MOVD	R4, c+56(FP)
	RET

// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
// CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1(*8) , (R0 set to 0) + use R11 + use R7 for i
// Walks x from the highest index down; at each step the pair R10:R11
// (running remainder : x[i]) is divided by y, the quotient is stored in
// z[i] and the remainder stays in R10 for the next step.
TEXT ·divWVW(SB),NOSPLIT,$0
	MOVD	z+0(FP), R2
	MOVD	xn+24(FP), R10		// r = xn
	MOVD	x+32(FP), R8
	MOVD	y+56(FP), R9
	MOVD	z_len+8(FP), R7		// i = len(z)
	SLD	$3, R7, R1		// i*8
	MOVD	$0, R0			// make sure it's zero
	BR	E7

L7:	MOVD	(R8)(R1*1), R11
	WORD	$0xB98700A9		// DLGR R10,R9
	MOVD	R11, (R2)(R1*1)		// z[i] = quotient

E7:	SUB	$1, R7			// i--
	SUB	$8, R1
	BGE	L7			// i >= 0

	MOVD	R10, r+64(FP)
	RET