// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !math_big_pure_go,s390x

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.

TEXT ·mulWW(SB), NOSPLIT, $0
	MOVD x+0(FP), R3
	MOVD y+8(FP), R4
	MULHDU R3, R4
	MOVD R10, z1+16(FP)
	MOVD R11, z0+24(FP)
	RET
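
// For reference, mulWW returns the full 128-bit product of x and y as a
// (high, low) pair of words. A rough Go sketch of the same semantics using
// math/bits (illustrative only, not the source of this assembly):
//
//	func mulWW(x, y Word) (z1, z0 Word) {
//		hi, lo := bits.Mul64(uint64(x), uint64(y))
//		return Word(hi), Word(lo)
//	}
//
// MULHDU above leaves the high half of the product in R10 and the low half
// in R11, which are stored to z1 and z0 respectively.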

// DI = R3, CX = R4, SI = r10, r8 = r8, r9 = r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
// func addVV(z, x, y []Word) (c Word)

TEXT ·addVV(SB), NOSPLIT, $0
	MOVD addvectorfacility+0x00(SB), R1
	BR (R1)

TEXT ·addVV_check(SB), NOSPLIT, $0
	MOVB ·hasVX(SB), R1
	CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported
	MOVD $addvectorfacility+0x00(SB), R1
	MOVD $·addVV_novec(SB), R2
	MOVD R2, 0(R1)

	// MOVD $·addVV_novec(SB), 0(R1)
	BR ·addVV_novec(SB)

vectorimpl:
	MOVD $addvectorfacility+0x00(SB), R1
	MOVD $·addVV_vec(SB), R2
	MOVD R2, 0(R1)

	// MOVD $·addVV_vec(SB), 0(R1)
	BR ·addVV_vec(SB)

GLOBL addvectorfacility+0x00(SB), NOPTR, $8
DATA addvectorfacility+0x00(SB)/8, $·addVV_check(SB)
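
// addVV dispatches through the addvectorfacility function pointer above. The
// pointer starts out pointing at addVV_check, which tests ·hasVX once,
// overwrites the pointer with either addVV_vec or addVV_novec, and branches
// to the chosen routine; every later call goes straight to the selected
// implementation. A rough Go sketch of this self-patching dispatch pattern
// (illustrative only; the names below are hypothetical):
//
//	var addVVImpl func(z, x, y []Word) Word = addVVCheck
//
//	func addVVCheck(z, x, y []Word) Word {
//		if hasVX {
//			addVVImpl = addVVVec
//		} else {
//			addVVImpl = addVVNovec
//		}
//		return addVVImpl(z, x, y)
//	}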

TEXT ·addVV_vec(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2

	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUB $4, R3
	BLT v1
	SUB $12, R3 // n -= 16
	BLT A1      // if n < 0 goto A1

	MOVD R8, R5
	MOVD R9, R6
	MOVD R2, R7

	// n >= 0
	// regular loop body unrolled 16x
	VZERO V0 // c = 0

UU1:
	VLM 0(R5), V1, V4 // 64-bytes into V1..V8
	ADD $64, R5
	VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
	VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order

	VLM 0(R6), V9, V12 // 64-bytes into V9..V16
	ADD $64, R6
	VPDI $0x4, V9, V9, V9    // flip the doublewords to big-endian order
	VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order

	VACCCQ V1, V9, V0, V25
	VACQ V1, V9, V0, V17
	VACCCQ V2, V10, V25, V26
	VACQ V2, V10, V25, V18

	VLM 0(R5), V5, V6   // 32-bytes into V1..V8
	VLM 0(R6), V13, V14 // 32-bytes into V9..V16
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V3, V3, V3    // flip the doublewords to big-endian order
	VPDI $0x4, V4, V4, V4    // flip the doublewords to big-endian order
	VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
	VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order

	VACCCQ V3, V11, V26, V27
	VACQ V3, V11, V26, V19
	VACCCQ V4, V12, V27, V28
	VACQ V4, V12, V27, V20

	VLM 0(R5), V7, V8   // 32-bytes into V1..V8
	VLM 0(R6), V15, V16 // 32-bytes into V9..V16
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V5, V5, V5    // flip the doublewords to big-endian order
	VPDI $0x4, V6, V6, V6    // flip the doublewords to big-endian order
	VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
	VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order

	VACCCQ V5, V13, V28, V29
	VACQ V5, V13, V28, V21
	VACCCQ V6, V14, V29, V30
	VACQ V6, V14, V29, V22

	VPDI $0x4, V7, V7, V7    // flip the doublewords to big-endian order
	VPDI $0x4, V8, V8, V8    // flip the doublewords to big-endian order
	VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
	VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order

	VACCCQ V7, V15, V30, V31
	VACQ V7, V15, V30, V23
	VACCCQ V8, V16, V31, V0 // V0 has carry-over
	VACQ V8, V16, V31, V24

	VPDI $0x4, V17, V17, V17 // flip the doublewords to big-endian order
	VPDI $0x4, V18, V18, V18 // flip the doublewords to big-endian order
	VPDI $0x4, V19, V19, V19 // flip the doublewords to big-endian order
	VPDI $0x4, V20, V20, V20 // flip the doublewords to big-endian order
	VPDI $0x4, V21, V21, V21 // flip the doublewords to big-endian order
	VPDI $0x4, V22, V22, V22 // flip the doublewords to big-endian order
	VPDI $0x4, V23, V23, V23 // flip the doublewords to big-endian order
	VPDI $0x4, V24, V24, V24 // flip the doublewords to big-endian order
	VSTM V17, V24, 0(R7) // 128-bytes into z
	ADD $128, R7
	ADD $128, R10 // i += 16
	SUB $16, R3   // n -= 16
	BGE UU1       // if n >= 0 goto UU1
	VLGVG $1, V0, R4 // put cf into R4
	NEG R4, R4       // save cf

A1:
	ADD $12, R3 // n += 16

	// s/JL/JMP/ below to disable the unrolled loop
	BLT v1 // if n < 0 goto v1

U1: // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	ADDC R4, R4 // restore CF
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD 8(R9)(R10*1), R11
	ADDE R11, R6
	MOVD 16(R9)(R10*1), R11
	ADDE R11, R7
	MOVD 24(R9)(R10*1), R11
	ADDE R11, R1
	MOVD R0, R4
	ADDE R4, R4 // save CF
	NEG R4, R4
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1       // if n >= 0 goto U1

v1:
	ADD $4, R3 // n += 4
	BLE E1     // if n <= 0 goto E1

L1: // n > 0
	ADDC R4, R4 // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	ADDE R4, R4 // save CF
	NEG R4, R4

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1      // if n > 0 goto L1

E1:
	NEG R4, R4
	MOVD R4, c+72(FP) // return c
	RET

TEXT ·addVV_novec(SB), NOSPLIT, $0
novec:
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2

	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUB $4, R3 // n -= 4
	BLT v1n    // if n < 0 goto v1n

U1n: // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	ADDC R4, R4 // restore CF
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD 8(R9)(R10*1), R11
	ADDE R11, R6
	MOVD 16(R9)(R10*1), R11
	ADDE R11, R7
	MOVD 24(R9)(R10*1), R11
	ADDE R11, R1
	MOVD R0, R4
	ADDE R4, R4 // save CF
	NEG R4, R4
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1n      // if n >= 0 goto U1n

v1n:
	ADD $4, R3 // n += 4
	BLE E1n    // if n <= 0 goto E1n

L1n: // n > 0
	ADDC R4, R4 // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	ADDE R4, R4 // save CF
	NEG R4, R4

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1n     // if n > 0 goto L1n

E1n:
	NEG R4, R4
	MOVD R4, c+72(FP) // return c
	RET
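
// Both addVV_vec and addVV_novec perform the same word-by-word addition with
// carry propagation as the generic code in arith.go; the vector path handles
// 16 words per iteration with VACQ/VACCCQ, and the scalar paths use
// ADDC/ADDE. A rough sketch of the word-level semantics using math/bits
// (illustrative only, not the generic implementation itself):
//
//	func addVV(z, x, y []Word) (c Word) {
//		for i := range z {
//			zi, cc := bits.Add(uint(x[i]), uint(y[i]), uint(c))
//			z[i] = Word(zi)
//			c = Word(cc)
//		}
//		return c
//	}
//
// Between iterations the scalar loops keep the pending carry in R4 as 0 or -1
// (hence the NEG after "save CF"), so that "ADDC R4, R4" regenerates the
// carry flag at the top of the next iteration.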

TEXT ·subVV(SB), NOSPLIT, $0
	MOVD subvectorfacility+0x00(SB), R1
	BR (R1)

TEXT ·subVV_check(SB), NOSPLIT, $0
	MOVB ·hasVX(SB), R1
	CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported
	MOVD $subvectorfacility+0x00(SB), R1
	MOVD $·subVV_novec(SB), R2
	MOVD R2, 0(R1)

	// MOVD $·subVV_novec(SB), 0(R1)
	BR ·subVV_novec(SB)

vectorimpl:
	MOVD $subvectorfacility+0x00(SB), R1
	MOVD $·subVV_vec(SB), R2
	MOVD R2, 0(R1)

	// MOVD $·subVV_vec(SB), 0(R1)
	BR ·subVV_vec(SB)

GLOBL subvectorfacility+0x00(SB), NOPTR, $8
DATA subvectorfacility+0x00(SB)/8, $·subVV_check(SB)

// DI = R3, CX = R4, SI = r10, r8 = r8, r9 = r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
TEXT ·subVV_vec(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2
	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUB $4, R3  // n -= 4
	BLT v1      // if n < 0 goto v1
	SUB $12, R3 // n -= 16
	BLT A1      // if n < 0 goto A1

	MOVD R8, R5
	MOVD R9, R6
	MOVD R2, R7

	// n >= 0
	// regular loop body unrolled 16x
	VZERO V0         // cf = 0
	MOVD $1, R4      // for 390 subtraction cf starts as 1 (no borrow)
	VLVGG $1, R4, V0 // put carry into V0

UU1:
	VLM 0(R5), V1, V4 // 64-bytes into V1..V8
	ADD $64, R5
	VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
	VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order

	VLM 0(R6), V9, V12 // 64-bytes into V9..V16
	ADD $64, R6
	VPDI $0x4, V9, V9, V9    // flip the doublewords to big-endian order
	VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order

	VSBCBIQ V1, V9, V0, V25
	VSBIQ V1, V9, V0, V17
	VSBCBIQ V2, V10, V25, V26
	VSBIQ V2, V10, V25, V18

	VLM 0(R5), V5, V6   // 32-bytes into V1..V8
	VLM 0(R6), V13, V14 // 32-bytes into V9..V16
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V3, V3, V3    // flip the doublewords to big-endian order
	VPDI $0x4, V4, V4, V4    // flip the doublewords to big-endian order
	VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
	VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order

	VSBCBIQ V3, V11, V26, V27
	VSBIQ V3, V11, V26, V19
	VSBCBIQ V4, V12, V27, V28
	VSBIQ V4, V12, V27, V20

	VLM 0(R5), V7, V8   // 32-bytes into V1..V8
	VLM 0(R6), V15, V16 // 32-bytes into V9..V16
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V5, V5, V5    // flip the doublewords to big-endian order
	VPDI $0x4, V6, V6, V6    // flip the doublewords to big-endian order
	VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
	VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order

	VSBCBIQ V5, V13, V28, V29
	VSBIQ V5, V13, V28, V21
	VSBCBIQ V6, V14, V29, V30
	VSBIQ V6, V14, V29, V22

	VPDI $0x4, V7, V7, V7    // flip the doublewords to big-endian order
	VPDI $0x4, V8, V8, V8    // flip the doublewords to big-endian order
	VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
	VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order

	VSBCBIQ V7, V15, V30, V31
	VSBIQ V7, V15, V30, V23
	VSBCBIQ V8, V16, V31, V0 // V0 has carry-over
	VSBIQ V8, V16, V31, V24

	VPDI $0x4, V17, V17, V17 // flip the doublewords to big-endian order
	VPDI $0x4, V18, V18, V18 // flip the doublewords to big-endian order
	VPDI $0x4, V19, V19, V19 // flip the doublewords to big-endian order
	VPDI $0x4, V20, V20, V20 // flip the doublewords to big-endian order
	VPDI $0x4, V21, V21, V21 // flip the doublewords to big-endian order
	VPDI $0x4, V22, V22, V22 // flip the doublewords to big-endian order
	VPDI $0x4, V23, V23, V23 // flip the doublewords to big-endian order
	VPDI $0x4, V24, V24, V24 // flip the doublewords to big-endian order
	VSTM V17, V24, 0(R7) // 128-bytes into z
	ADD $128, R7
	ADD $128, R10 // i += 16
	SUB $16, R3   // n -= 16
	BGE UU1       // if n >= 0 goto UU1
	VLGVG $1, V0, R4 // put cf into R4
	SUB $1, R4       // save cf

A1:
	ADD $12, R3 // n += 16
	BLT v1      // if n < 0 goto v1

U1: // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	MOVD R0, R11
	SUBC R4, R11 // restore CF
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD 8(R9)(R10*1), R11
	SUBE R11, R6
	MOVD 16(R9)(R10*1), R11
	SUBE R11, R7
	MOVD 24(R9)(R10*1), R11
	SUBE R11, R1
	MOVD R0, R4
	SUBE R4, R4 // save CF
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1       // if n >= 0 goto U1

v1:
	ADD $4, R3 // n += 4
	BLE E1     // if n <= 0 goto E1

L1: // n > 0
	MOVD R0, R11
	SUBC R4, R11 // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	SUBE R4, R4 // save CF

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1      // if n > 0 goto L1

E1:
	NEG R4, R4
	MOVD R4, c+72(FP) // return c
	RET
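
// subVV_vec mirrors addVV_vec with VSBIQ/VSBCBIQ in place of VACQ/VACCCQ.
// Note the z/Architecture borrow convention: a logical subtraction yields a
// carry of 1 when there is no borrow, which is why the vector carry is seeded
// with 1 above and "SUB $1, R4" converts it back into a plain borrow after
// the unrolled loop. A rough sketch of the word-level semantics using
// math/bits (illustrative only):
//
//	func subVV(z, x, y []Word) (c Word) {
//		for i := range z {
//			zi, bb := bits.Sub(uint(x[i]), uint(y[i]), uint(c))
//			z[i] = Word(zi)
//			c = Word(bb)
//		}
//		return c
//	}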

// DI = R3, CX = R4, SI = r10, r8 = r8, r9 = r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
TEXT ·subVV_novec(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2

	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUB $4, R3 // n -= 4
	BLT v1     // if n < 0 goto v1

U1: // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	MOVD R0, R11
	SUBC R4, R11 // restore CF
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD 8(R9)(R10*1), R11
	SUBE R11, R6
	MOVD 16(R9)(R10*1), R11
	SUBE R11, R7
	MOVD 24(R9)(R10*1), R11
	SUBE R11, R1
	MOVD R0, R4
	SUBE R4, R4 // save CF
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1       // if n >= 0 goto U1

v1:
	ADD $4, R3 // n += 4
	BLE E1     // if n <= 0 goto E1

L1: // n > 0
	MOVD R0, R11
	SUBC R4, R11 // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	SUBE R4, R4 // save CF

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1      // if n > 0 goto L1

E1:
	NEG R4, R4
	MOVD R4, c+72(FP) // return c
	RET

TEXT ·addVW(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R5 // length of z
	MOVD x+24(FP), R6
	MOVD y+48(FP), R7 // c = y
	MOVD z+0(FP), R8

	CMPBEQ R5, $0, returnC // if len(z) == 0, we can have an early return

	// Add the first two words, and determine which path (copy path or loop path) to take based on the carry flag.
	ADDC 0(R6), R7
	MOVD R7, 0(R8)
	CMPBEQ R5, $1, returnResult // len(z) == 1
	MOVD $0, R9
	ADDE 8(R6), R9
	MOVD R9, 8(R8)
	CMPBEQ R5, $2, returnResult // len(z) == 2

	// Update the counters
	MOVD $16, R12    // i = 2
	MOVD $-2(R5), R5 // n = n - 2

loopOverEachWord:
	BRC $12, copySetup // carry = 0, copy the rest
	MOVD $1, R9

	// Originally we used the carry flag generated in the previous iteration
	// (i.e. ADDE could be used here to do the addition). However, since we
	// already know the carry is 1 (otherwise we would have gone to the copy
	// section), we can use ADDC here so the current iteration does not depend
	// on the carry flag generated in the previous iteration. This can help
	// branch prediction.
	ADDC 0(R6)(R12*1), R9
	MOVD R9, 0(R8)(R12*1) // z[i] = x[i] + c

	MOVD $8(R12), R12          // i++
	BRCTG R5, loopOverEachWord // n--

	// Return the current carry value
returnResult:
	MOVD $0, R0
	ADDE R0, R0
	MOVD R0, c+56(FP)
	RET

	// Update the position of x(R6) and z(R8) based on the current counter
	// value and perform the copy. Assuming that x and z either do not overlap
	// at all or point to the same memory region, we can use a faster copy
	// based only on MVC here. The implementation below has three copy loops,
	// copying one word, 4 words, and 32 words at a time. Per benchmarking,
	// this is faster than calling runtime·memmove.
copySetup:
	ADD R12, R6
	ADD R12, R8

	CMPBGE R5, $4, mediumLoop

smallLoop: // unrolled loop that copies one word at a time when n < 4
	CMPBEQ R5, $0, returnZero
	MVC $8, 0(R6), 0(R8)
	CMPBEQ R5, $1, returnZero
	MVC $8, 8(R6), 8(R8)
	CMPBEQ R5, $2, returnZero
	MVC $8, 16(R6), 16(R8)

returnZero:
	MOVD $0, c+56(FP) // return 0 as carry
	RET

mediumLoop:
	CMPBLT R5, $4, smallLoop
	CMPBLT R5, $32, mediumLoopBody

largeLoop: // copy 256 bytes at a time
	MVC $256, 0(R6), 0(R8)
	MOVD $256(R6), R6
	MOVD $256(R8), R8
	MOVD $-32(R5), R5
	CMPBGE R5, $32, largeLoop
	BR mediumLoop

mediumLoopBody: // copy 32 bytes at a time
	MVC $32, 0(R6), 0(R8)
	MOVD $32(R6), R6
	MOVD $32(R8), R8
	MOVD $-4(R5), R5
	CMPBGE R5, $4, mediumLoopBody
	BR smallLoop

returnC:
	MOVD R7, c+56(FP)
	RET
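
// addVW adds the single word y to the vector x. As soon as the carry becomes
// zero, the remaining words of x are copied to z unchanged, which is why the
// loop above branches to the MVC-based copy when BRC sees a clear carry. A
// rough Go sketch of that strategy using math/bits (illustrative only):
//
//	func addVW(z, x []Word, y Word) (c Word) {
//		c = y
//		for i, xi := range x {
//			if c == 0 {
//				copy(z[i:], x[i:])
//				return 0
//			}
//			zi, cc := bits.Add(uint(xi), uint(c), 0)
//			z[i] = Word(zi)
//			c = Word(cc)
//		}
//		return c
//	}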

TEXT ·subVW(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R5
	MOVD x+24(FP), R6
	MOVD y+48(FP), R7 // the borrow bit passed in
	MOVD z+0(FP), R8
	MOVD $0, R0 // R0 is a temporary used during computation; make sure it is zero

	CMPBEQ R5, $0, returnC // if len(z) == 0, we can have an early return

	// Subtract the first two words, and determine which path (copy path or loop path) to take based on the borrow flag.
	MOVD 0(R6), R9
	SUBC R7, R9
	MOVD R9, 0(R8)
	CMPBEQ R5, $1, returnResult
	MOVD 8(R6), R9
	SUBE R0, R9
	MOVD R9, 8(R8)
	CMPBEQ R5, $2, returnResult

	// Update the counters
	MOVD $16, R12    // i = 2
	MOVD $-2(R5), R5 // n = n - 2

loopOverEachWord:
	BRC $3, copySetup // no borrow, copy the rest
	MOVD 0(R6)(R12*1), R9

	// Originally we used the borrow flag generated in the previous iteration
	// (i.e. SUBE could be used here to do the subtraction). However, since we
	// already know the borrow is 1 (otherwise we would have gone to the copy
	// section), we can use SUBC here so the current iteration does not depend
	// on the borrow flag generated in the previous iteration. This can help
	// branch prediction.
	SUBC $1, R9
	MOVD R9, 0(R8)(R12*1) // z[i] = x[i] - 1

	MOVD $8(R12), R12          // i++
	BRCTG R5, loopOverEachWord // n--

	// Return the current borrow value
returnResult:
	SUBE R0, R0
	NEG R0, R0
	MOVD R0, c+56(FP)
	RET

	// Update the position of x(R6) and z(R8) based on the current counter
	// value and perform the copy. Assuming that x and z either do not overlap
	// at all or point to the same memory region, we can use a faster copy
	// based only on MVC here. The implementation below has three copy loops,
	// copying one word, 4 words, and 32 words at a time. Per benchmarking,
	// this is faster than calling runtime·memmove.
copySetup:
	ADD R12, R6
	ADD R12, R8

	CMPBGE R5, $4, mediumLoop

smallLoop: // unrolled loop that copies one word at a time when n < 4
	CMPBEQ R5, $0, returnZero
	MVC $8, 0(R6), 0(R8)
	CMPBEQ R5, $1, returnZero
	MVC $8, 8(R6), 8(R8)
	CMPBEQ R5, $2, returnZero
	MVC $8, 16(R6), 16(R8)

returnZero:
	MOVD $0, c+56(FP) // return 0 as borrow
	RET

mediumLoop:
	CMPBLT R5, $4, smallLoop
	CMPBLT R5, $32, mediumLoopBody

largeLoop: // copy 256 bytes at a time
	MVC $256, 0(R6), 0(R8)
	MOVD $256(R6), R6
	MOVD $256(R8), R8
	MOVD $-32(R5), R5
	CMPBGE R5, $32, largeLoop
	BR mediumLoop

mediumLoopBody: // copy 32 bytes at a time
	MVC $32, 0(R6), 0(R8)
	MOVD $32(R6), R6
	MOVD $32(R8), R8
	MOVD $-4(R5), R5
	CMPBGE R5, $4, mediumLoopBody
	BR smallLoop

returnC:
	MOVD R7, c+56(FP)
	RET

// func shlVU(z, x []Word, s uint) (c Word)
TEXT ·shlVU(SB), NOSPLIT, $0
	BR ·shlVU_g(SB)

// func shrVU(z, x []Word, s uint) (c Word)
TEXT ·shrVU(SB), NOSPLIT, $0
	BR ·shrVU_g(SB)

// CX = R4, r8 = r8, r9 = r9, r10 = r2, r11 = r5, DX = r3, AX = r6, BX = R1, (R0 set to 0) + use R11 + use R7 for i
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
TEXT ·mulAddVWW(SB), NOSPLIT, $0
	MOVD z+0(FP), R2
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD r+56(FP), R4 // c = r
	MOVD z_len+8(FP), R5
	MOVD $0, R1 // i*8 = 0
	MOVD $0, R7 // i = 0
	MOVD $0, R0 // make sure it's zero
	BR E5

L5:
	MOVD (R8)(R1*1), R6
	MULHDU R9, R6
	ADDC R4, R11 // add to low order bits
	ADDE R0, R6
	MOVD R11, (R2)(R1*1)
	MOVD R6, R4
	ADD $8, R1 // i*8 += 8
	ADD $1, R7 // i++

E5:
	CMPBLT R7, R5, L5 // i < n

	MOVD R4, c+64(FP)
	RET
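
// mulAddVWW computes z = x*y + r for single words y and r, carrying the high
// half of each product into the next word. A rough sketch of the per-word
// step using math/bits (illustrative only):
//
//	func mulAddVWW(z, x []Word, y, r Word) (c Word) {
//		c = r
//		for i := range z {
//			hi, lo := bits.Mul(uint(x[i]), uint(y))
//			lo, carry := bits.Add(lo, uint(c), 0)
//			z[i] = Word(lo)
//			c = Word(hi + carry)
//		}
//		return c
//	}
//
// In the loop above, the low half of the product ends up in R11 and the high
// half in R6; ADDC/ADDE fold the incoming carry (R4) into that pair.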

// func addMulVVW(z, x []Word, y Word) (c Word)
// CX = R4, r8 = r8, r9 = r9, r10 = r2, r11 = r5, AX = r11, DX = R6, r12 = r12, BX = R1, (R0 set to 0) + use R11 + use R7 for i
TEXT ·addMulVVW(SB), NOSPLIT, $0
	MOVD z+0(FP), R2
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z_len+8(FP), R5

	MOVD $0, R1 // i*8 = 0
	MOVD $0, R7 // i = 0
	MOVD $0, R0 // make sure it's zero
	MOVD $0, R4 // c = 0

	MOVD R5, R12
	AND $-2, R12
	CMPBGE R5, $2, A6
	BR E6

A6:
	MOVD (R8)(R1*1), R6
	MULHDU R9, R6
	MOVD (R2)(R1*1), R10
	ADDC R10, R11 // add to low order bits
	ADDE R0, R6
	ADDC R4, R11
	ADDE R0, R6
	MOVD R6, R4
	MOVD R11, (R2)(R1*1)

	MOVD (8)(R8)(R1*1), R6
	MULHDU R9, R6
	MOVD (8)(R2)(R1*1), R10
	ADDC R10, R11 // add to low order bits
	ADDE R0, R6
	ADDC R4, R11
	ADDE R0, R6
	MOVD R6, R4
	MOVD R11, (8)(R2)(R1*1)

	ADD $16, R1 // i*8 += 16
	ADD $2, R7  // i += 2

	CMPBLT R7, R12, A6
	BR E6

L6:
	MOVD (R8)(R1*1), R6
	MULHDU R9, R6
	MOVD (R2)(R1*1), R10
	ADDC R10, R11 // add to low order bits
	ADDE R0, R6
	ADDC R4, R11
	ADDE R0, R6
	MOVD R6, R4
	MOVD R11, (R2)(R1*1)

	ADD $8, R1 // i*8 += 8
	ADD $1, R7 // i++

E6:
	CMPBLT R7, R5, L6 // i < n

	MOVD R4, c+56(FP)
	RET
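
// addMulVVW computes z += x*y for a single word y, i.e. a multiply-accumulate
// over the whole vector, with the main loop unrolled 2x (A6) and a 1x tail
// (L6). A rough sketch of the per-word step using math/bits (illustrative
// only):
//
//	func addMulVVW(z, x []Word, y Word) (c Word) {
//		for i := range z {
//			hi, lo := bits.Mul(uint(x[i]), uint(y))
//			lo, cc := bits.Add(lo, uint(z[i]), 0)
//			hi += cc
//			lo, cc = bits.Add(lo, uint(c), 0)
//			hi += cc
//			z[i] = Word(lo)
//			c = Word(hi)
//		}
//		return c
//	}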