github.com/mtsmfm/go/src@v0.0.0-20221020090648-44bdcb9f8fde/math/big/arith_s390x.s

// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !math_big_pure_go
// +build !math_big_pure_go

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.

// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
// func addVV(z, x, y []Word) (c Word)

TEXT ·addVV(SB), NOSPLIT, $0
	MOVD addvectorfacility+0x00(SB), R1
	BR   (R1)

TEXT ·addVV_check(SB), NOSPLIT, $0
	MOVB   ·hasVX(SB), R1
	CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported
	MOVD   $addvectorfacility+0x00(SB), R1
	MOVD   $·addVV_novec(SB), R2
	MOVD   R2, 0(R1)

	// MOVD $·addVV_novec(SB), 0(R1)
	BR ·addVV_novec(SB)

vectorimpl:
	MOVD $addvectorfacility+0x00(SB), R1
	MOVD $·addVV_vec(SB), R2
	MOVD R2, 0(R1)

	// MOVD $·addVV_vec(SB), 0(R1)
	BR ·addVV_vec(SB)

GLOBL addvectorfacility+0x00(SB), NOPTR, $8
DATA addvectorfacility+0x00(SB)/8, $·addVV_check(SB)
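// addVV dispatches through the addvectorfacility pointer above: it initially
// points at addVV_check, which tests ·hasVX once, patches the pointer to the
// best implementation, and branches to it, so subsequent calls go straight to
// the chosen routine. A rough Go-level sketch of the same self-patching idea
// (the identifiers addVVImpl, addVVCheck, addVVVec and addVVNovec are
// hypothetical and for illustration only):
//
//	var addVVImpl func(z, x, y []Word) (c Word) = addVVCheck
//
//	func addVVCheck(z, x, y []Word) (c Word) {
//		if hasVX {
//			addVVImpl = addVVVec
//		} else {
//			addVVImpl = addVVNovec
//		}
//		return addVVImpl(z, x, y)
//	}
//
//	func addVV(z, x, y []Word) (c Word) { return addVVImpl(z, x, y) }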
TEXT ·addVV_vec(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2

	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// s/BLT/BR/ below to disable the unrolled loop
	SUB $4, R3  // n -= 4
	BLT v1      // if n < 0 goto v1
	SUB $12, R3 // n -= 16
	BLT A1      // if n < 0 goto A1

	MOVD R8, R5
	MOVD R9, R6
	MOVD R2, R7

	// n >= 0
	// regular loop body unrolled 16x
	VZERO V0 // c = 0

UU1:
	VLM  0(R5), V1, V4 // 64-bytes into V1..V4
	ADD  $64, R5
	VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
	VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order

	VLM  0(R6), V9, V12 // 64-bytes into V9..V12
	ADD  $64, R6
	VPDI $0x4, V9, V9, V9    // flip the doublewords to big-endian order
	VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order

	VACCCQ V1, V9, V0, V25
	VACQ   V1, V9, V0, V17
	VACCCQ V2, V10, V25, V26
	VACQ   V2, V10, V25, V18

	VLM 0(R5), V5, V6   // 32-bytes into V5..V6
	VLM 0(R6), V13, V14 // 32-bytes into V13..V14
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V3, V3, V3    // flip the doublewords to big-endian order
	VPDI $0x4, V4, V4, V4    // flip the doublewords to big-endian order
	VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
	VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order

	VACCCQ V3, V11, V26, V27
	VACQ   V3, V11, V26, V19
	VACCCQ V4, V12, V27, V28
	VACQ   V4, V12, V27, V20

	VLM 0(R5), V7, V8   // 32-bytes into V7..V8
	VLM 0(R6), V15, V16 // 32-bytes into V15..V16
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V5, V5, V5    // flip the doublewords to big-endian order
	VPDI $0x4, V6, V6, V6    // flip the doublewords to big-endian order
	VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
	VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order

	VACCCQ V5, V13, V28, V29
	VACQ   V5, V13, V28, V21
	VACCCQ V6, V14, V29, V30
	VACQ   V6, V14, V29, V22

	VPDI $0x4, V7, V7, V7    // flip the doublewords to big-endian order
	VPDI $0x4, V8, V8, V8    // flip the doublewords to big-endian order
	VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
	VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order

	VACCCQ V7, V15, V30, V31
	VACQ   V7, V15, V30, V23
	VACCCQ V8, V16, V31, V0 // V0 has carry-over
	VACQ   V8, V16, V31, V24

	VPDI  $0x4, V17, V17, V17 // flip the doublewords to big-endian order
	VPDI  $0x4, V18, V18, V18 // flip the doublewords to big-endian order
	VPDI  $0x4, V19, V19, V19 // flip the doublewords to big-endian order
	VPDI  $0x4, V20, V20, V20 // flip the doublewords to big-endian order
	VPDI  $0x4, V21, V21, V21 // flip the doublewords to big-endian order
	VPDI  $0x4, V22, V22, V22 // flip the doublewords to big-endian order
	VPDI  $0x4, V23, V23, V23 // flip the doublewords to big-endian order
	VPDI  $0x4, V24, V24, V24 // flip the doublewords to big-endian order
	VSTM  V17, V24, 0(R7)     // 128-bytes into z
	ADD   $128, R7
	ADD   $128, R10  // i += 16
	SUB   $16, R3    // n -= 16
	BGE   UU1        // if n >= 0 goto UU1
	VLGVG $1, V0, R4 // put cf into R4
	NEG   R4, R4     // save cf

A1:
	ADD $12, R3 // n += 16

	// s/BLT/BR/ below to disable the unrolled loop
	BLT v1 // if n < 0 goto v1

U1: // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	ADDC R4, R4 // restore CF
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD 8(R9)(R10*1), R11
	ADDE R11, R6
	MOVD 16(R9)(R10*1), R11
	ADDE R11, R7
	MOVD 24(R9)(R10*1), R11
	ADDE R11, R1
	MOVD R0, R4
	ADDE R4, R4 // save CF
	NEG  R4, R4
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1       // if n >= 0 goto U1

v1:
	ADD $4, R3 // n += 4
	BLE E1     // if n <= 0 goto E1

L1: // n > 0
	ADDC R4, R4 // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	ADDE R4, R4 // save CF
	NEG  R4, R4

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1      // if n > 0 goto L1

E1:
	NEG  R4, R4
	MOVD R4, c+72(FP) // return c
	RET

TEXT ·addVV_novec(SB), NOSPLIT, $0
novec:
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2

	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// s/BLT/BR/ below to disable the unrolled loop
	SUB $4, R3 // n -= 4
	BLT v1n    // if n < 0 goto v1n

U1n: // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	ADDC R4, R4 // restore CF
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD 8(R9)(R10*1), R11
	ADDE R11, R6
	MOVD 16(R9)(R10*1), R11
	ADDE R11, R7
	MOVD 24(R9)(R10*1), R11
	ADDE R11, R1
	MOVD R0, R4
	ADDE R4, R4 // save CF
	NEG  R4, R4
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1n      // if n >= 0 goto U1n

v1n:
	ADD $4, R3 // n += 4
	BLE E1n    // if n <= 0 goto E1n

L1n: // n > 0
	ADDC R4, R4 // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	ADDE R4, R4 // save CF
	NEG  R4, R4

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1n     // if n > 0 goto L1n

E1n:
	NEG  R4, R4
	MOVD R4, c+72(FP) // return c
	RET
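// For reference, addVV_vec and addVV_novec above compute the same
// carry-propagating word addition that arith.go expresses generically; the
// vector path consumes 16 words per iteration (VACQ/VACCCQ) and the scalar
// paths 4 words per iteration (ADDC/ADDE), carrying the flag across
// iterations in R4. A rough Go sketch of the semantics (assuming math/bits
// and math/big's 64-bit Word; illustration only):
//
//	func addVV(z, x, y []Word) (c Word) {
//		for i := range z {
//			zi, carry := bits.Add64(uint64(x[i]), uint64(y[i]), uint64(c))
//			z[i] = Word(zi)   // low 64 bits of x[i] + y[i] + carry-in
//			c = Word(carry)   // carry-out feeds the next word
//		}
//		return c
//	}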
TEXT ·subVV(SB), NOSPLIT, $0
	MOVD subvectorfacility+0x00(SB), R1
	BR   (R1)

TEXT ·subVV_check(SB), NOSPLIT, $0
	MOVB   ·hasVX(SB), R1
	CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported
	MOVD   $subvectorfacility+0x00(SB), R1
	MOVD   $·subVV_novec(SB), R2
	MOVD   R2, 0(R1)

	// MOVD $·subVV_novec(SB), 0(R1)
	BR ·subVV_novec(SB)

vectorimpl:
	MOVD $subvectorfacility+0x00(SB), R1
	MOVD $·subVV_vec(SB), R2
	MOVD R2, 0(R1)

	// MOVD $·subVV_vec(SB), 0(R1)
	BR ·subVV_vec(SB)

GLOBL subvectorfacility+0x00(SB), NOPTR, $8
DATA subvectorfacility+0x00(SB)/8, $·subVV_check(SB)

// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
TEXT ·subVV_vec(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2
	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// s/BLT/BR/ below to disable the unrolled loop
	SUB $4, R3  // n -= 4
	BLT v1      // if n < 0 goto v1
	SUB $12, R3 // n -= 16
	BLT A1      // if n < 0 goto A1

	MOVD R8, R5
	MOVD R9, R6
	MOVD R2, R7

	// n >= 0
	// regular loop body unrolled 16x
	VZERO V0         // cf = 0
	MOVD  $1, R4     // for 390 subtraction cf starts as 1 (no borrow)
	VLVGG $1, R4, V0 // put carry into V0

UU1:
	VLM  0(R5), V1, V4 // 64-bytes into V1..V4
	ADD  $64, R5
	VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
	VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order

	VLM  0(R6), V9, V12 // 64-bytes into V9..V12
	ADD  $64, R6
	VPDI $0x4, V9, V9, V9    // flip the doublewords to big-endian order
	VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order

	VSBCBIQ V1, V9, V0, V25
	VSBIQ   V1, V9, V0, V17
	VSBCBIQ V2, V10, V25, V26
	VSBIQ   V2, V10, V25, V18

	VLM 0(R5), V5, V6   // 32-bytes into V5..V6
	VLM 0(R6), V13, V14 // 32-bytes into V13..V14
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V3, V3, V3    // flip the doublewords to big-endian order
	VPDI $0x4, V4, V4, V4    // flip the doublewords to big-endian order
	VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
	VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order

	VSBCBIQ V3, V11, V26, V27
	VSBIQ   V3, V11, V26, V19
	VSBCBIQ V4, V12, V27, V28
	VSBIQ   V4, V12, V27, V20

	VLM 0(R5), V7, V8   // 32-bytes into V7..V8
	VLM 0(R6), V15, V16 // 32-bytes into V15..V16
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V5, V5, V5    // flip the doublewords to big-endian order
	VPDI $0x4, V6, V6, V6    // flip the doublewords to big-endian order
	VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
	VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order

	VSBCBIQ V5, V13, V28, V29
	VSBIQ   V5, V13, V28, V21
	VSBCBIQ V6, V14, V29, V30
	VSBIQ   V6, V14, V29, V22

	VPDI $0x4, V7, V7, V7    // flip the doublewords to big-endian order
	VPDI $0x4, V8, V8, V8    // flip the doublewords to big-endian order
	VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
	VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order

	VSBCBIQ V7, V15, V30, V31
	VSBIQ   V7, V15, V30, V23
	VSBCBIQ V8, V16, V31, V0 // V0 has carry-over
	VSBIQ   V8, V16, V31, V24

	VPDI  $0x4, V17, V17, V17 // flip the doublewords to big-endian order
	VPDI  $0x4, V18, V18, V18 // flip the doublewords to big-endian order
	VPDI  $0x4, V19, V19, V19 // flip the doublewords to big-endian order
	VPDI  $0x4, V20, V20, V20 // flip the doublewords to big-endian order
	VPDI  $0x4, V21, V21, V21 // flip the doublewords to big-endian order
	VPDI  $0x4, V22, V22, V22 // flip the doublewords to big-endian order
	VPDI  $0x4, V23, V23, V23 // flip the doublewords to big-endian order
	VPDI  $0x4, V24, V24, V24 // flip the doublewords to big-endian order
	VSTM  V17, V24, 0(R7)     // 128-bytes into z
	ADD   $128, R7
	ADD   $128, R10  // i += 16
	SUB   $16, R3    // n -= 16
	BGE   UU1        // if n >= 0 goto UU1
	VLGVG $1, V0, R4 // put cf into R4
	SUB   $1, R4     // save cf

A1:
	ADD $12, R3 // n += 16
	BLT v1      // if n < 0 goto v1

U1: // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	MOVD R0, R11
	SUBC R4, R11 // restore CF
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD 8(R9)(R10*1), R11
	SUBE R11, R6
	MOVD 16(R9)(R10*1), R11
	SUBE R11, R7
	MOVD 24(R9)(R10*1), R11
	SUBE R11, R1
	MOVD R0, R4
	SUBE R4, R4 // save CF
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1       // if n >= 0 goto U1

v1:
	ADD $4, R3 // n += 4
	BLE E1     // if n <= 0 goto E1

L1: // n > 0
	MOVD R0, R11
	SUBC R4, R11 // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	SUBE R4, R4 // save CF

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1      // if n > 0 goto L1

E1:
	NEG  R4, R4
	MOVD R4, c+72(FP) // return c
	RET
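// Note the s390x borrow convention used above: after a subtraction the carry
// flag is 1 when no borrow occurred and 0 when it did, which is why the
// vector path seeds V0 with 1 (VLVGG) and converts the extracted flag with
// SUB $1, and why the scalar epilogue negates the saved flag before returning
// it. Semantically subVV matches the generic borrow-propagating loop; a rough
// Go sketch (math/bits, 64-bit Word assumed; illustration only):
//
//	func subVV(z, x, y []Word) (c Word) {
//		for i := range z {
//			zi, borrow := bits.Sub64(uint64(x[i]), uint64(y[i]), uint64(c))
//			z[i] = Word(zi)
//			c = Word(borrow) // 0 or 1
//		}
//		return c
//	}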
// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
TEXT ·subVV_novec(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2

	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// s/BLT/BR/ below to disable the unrolled loop
	SUB $4, R3 // n -= 4
	BLT v1     // if n < 0 goto v1

U1: // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	MOVD R0, R11
	SUBC R4, R11 // restore CF
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD 8(R9)(R10*1), R11
	SUBE R11, R6
	MOVD 16(R9)(R10*1), R11
	SUBE R11, R7
	MOVD 24(R9)(R10*1), R11
	SUBE R11, R1
	MOVD R0, R4
	SUBE R4, R4 // save CF
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1       // if n >= 0 goto U1

v1:
	ADD $4, R3 // n += 4
	BLE E1     // if n <= 0 goto E1

L1: // n > 0
	MOVD R0, R11
	SUBC R4, R11 // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	SUBE R4, R4 // save CF

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1      // if n > 0 goto L1

E1:
	NEG  R4, R4
	MOVD R4, c+72(FP) // return c
	RET

TEXT ·addVW(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R5 // length of z
	MOVD x+24(FP), R6
	MOVD y+48(FP), R7    // c = y
	MOVD z+0(FP), R8

	CMPBEQ R5, $0, returnC // if len(z) == 0, we can have an early return

	// Add the first two words, and determine which path (copy path or loop path) to take based on the carry flag.
	ADDC   0(R6), R7
	MOVD   R7, 0(R8)
	CMPBEQ R5, $1, returnResult // len(z) == 1
	MOVD   $0, R9
	ADDE   8(R6), R9
	MOVD   R9, 8(R8)
	CMPBEQ R5, $2, returnResult // len(z) == 2

	// Update the counters
	MOVD $16, R12    // i = 2
	MOVD $-2(R5), R5 // n = n - 2

loopOverEachWord:
	BRC  $12, copySetup // carry = 0, copy the rest
	MOVD $1, R9

	// Originally we used the carry flag generated in the previous iteration
	// (i.e., ADDE could be used here to do the addition). However, since we
	// already know the carry is 1 (otherwise we would have gone to the copy
	// section), we can use ADDC here so the current iteration does not depend
	// on the carry flag generated in the previous iteration. This shortens the
	// loop-carried dependency chain and can help once the branch is predicted.
	ADDC 0(R6)(R12*1), R9
	MOVD R9, 0(R8)(R12*1) // z[i] = x[i] + c

	MOVD  $8(R12), R12         // i++
	BRCTG R5, loopOverEachWord // n--

	// Return the current carry value
returnResult:
	MOVD $0, R0
	ADDE R0, R0
	MOVD R0, c+56(FP)
	RET

	// Update the positions of x(R6) and z(R8) based on the current counter value and perform the copying.
	// Assuming that x and z either do not overlap or point to exactly the same
	// memory region, we can use a faster copy based solely on MVC here.
	// In the following implementation, we have three copy loops, copying a word, 4 words, and
	// 32 words at a time. Via benchmarking, this implementation is faster than calling runtime·memmove.
copySetup:
	ADD R12, R6
	ADD R12, R8

	CMPBGE R5, $4, mediumLoop

smallLoop: // unrolled copy of the remaining words when n < 4
	CMPBEQ R5, $0, returnZero
	MVC    $8, 0(R6), 0(R8)
	CMPBEQ R5, $1, returnZero
	MVC    $8, 8(R6), 8(R8)
	CMPBEQ R5, $2, returnZero
	MVC    $8, 16(R6), 16(R8)

returnZero:
	MOVD $0, c+56(FP) // return 0 as carry
	RET

mediumLoop:
	CMPBLT R5, $4, smallLoop
	CMPBLT R5, $32, mediumLoopBody

largeLoop: // Copying 256 bytes at a time.
	MVC    $256, 0(R6), 0(R8)
	MOVD   $256(R6), R6
	MOVD   $256(R8), R8
	MOVD   $-32(R5), R5
	CMPBGE R5, $32, largeLoop
	BR     mediumLoop

mediumLoopBody: // Copying 32 bytes at a time
	MVC    $32, 0(R6), 0(R8)
	MOVD   $32(R6), R6
	MOVD   $32(R8), R8
	MOVD   $-4(R5), R5
	CMPBGE R5, $4, mediumLoopBody
	BR     smallLoop

returnC:
	MOVD R7, c+56(FP)
	RET
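// addVW above stops doing arithmetic as soon as the carry dies out: once the
// carry flag is clear, the remaining words of x are already the result, so
// the tail is copied with the MVC loops above instead of being re-added.
// A rough Go sketch of that strategy (math/bits, 64-bit Word assumed; the MVC
// path additionally assumes z and x either alias exactly or do not overlap,
// whereas copy below handles overlap in general):
//
//	func addVW(z, x []Word, y Word) (c Word) {
//		c = y
//		for i := range z {
//			if c == 0 {
//				copy(z[i:], x[i:]) // what the copySetup/MVC loops do
//				return 0
//			}
//			zi, carry := bits.Add64(uint64(x[i]), uint64(c), 0)
//			z[i] = Word(zi)
//			c = Word(carry)
//		}
//		return c
//	}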
TEXT ·subVW(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R5
	MOVD x+24(FP), R6
	MOVD y+48(FP), R7 // The borrow bit passed in
	MOVD z+0(FP), R8
	MOVD $0, R0       // R0 is a temporary variable used during computation. Ensure it has zero in it.

	CMPBEQ R5, $0, returnC // len(z) == 0, have an early return

	// Subtract the first two words, and determine which path (copy path or loop path) to take based on the borrow flag.
	MOVD   0(R6), R9
	SUBC   R7, R9
	MOVD   R9, 0(R8)
	CMPBEQ R5, $1, returnResult
	MOVD   8(R6), R9
	SUBE   R0, R9
	MOVD   R9, 8(R8)
	CMPBEQ R5, $2, returnResult

	// Update the counters
	MOVD $16, R12    // i = 2
	MOVD $-2(R5), R5 // n = n - 2

loopOverEachWord:
	BRC  $3, copySetup // no borrow, copy the rest
	MOVD 0(R6)(R12*1), R9

	// Originally we used the borrow flag generated in the previous iteration
	// (i.e., SUBE could be used here to do the subtraction). However, since we
	// already know the borrow is 1 (otherwise we would have gone to the copy
	// section), we can use SUBC here so the current iteration does not depend
	// on the borrow flag generated in the previous iteration. This shortens the
	// loop-carried dependency chain and can help once the branch is predicted.
	SUBC $1, R9
	MOVD R9, 0(R8)(R12*1) // z[i] = x[i] - 1

	MOVD  $8(R12), R12         // i++
	BRCTG R5, loopOverEachWord // n--

	// Return the current borrow value
returnResult:
	SUBE R0, R0
	NEG  R0, R0
	MOVD R0, c+56(FP)
	RET

	// Update the positions of x(R6) and z(R8) based on the current counter value and perform the copying.
	// Assuming that x and z either do not overlap or point to exactly the same
	// memory region, we can use a faster copy based solely on MVC here.
	// In the following implementation, we have three copy loops, copying a word, 4 words, and
	// 32 words at a time. Via benchmarking, this implementation is faster than calling runtime·memmove.
copySetup:
	ADD R12, R6
	ADD R12, R8

	CMPBGE R5, $4, mediumLoop

smallLoop: // unrolled copy of the remaining words when n < 4
	CMPBEQ R5, $0, returnZero
	MVC    $8, 0(R6), 0(R8)
	CMPBEQ R5, $1, returnZero
	MVC    $8, 8(R6), 8(R8)
	CMPBEQ R5, $2, returnZero
	MVC    $8, 16(R6), 16(R8)

returnZero:
	MOVD $0, c+56(FP) // return 0 as borrow
	RET

mediumLoop:
	CMPBLT R5, $4, smallLoop
	CMPBLT R5, $32, mediumLoopBody

largeLoop: // Copying 256 bytes at a time
	MVC    $256, 0(R6), 0(R8)
	MOVD   $256(R6), R6
	MOVD   $256(R8), R8
	MOVD   $-32(R5), R5
	CMPBGE R5, $32, largeLoop
	BR     mediumLoop

mediumLoopBody: // Copying 32 bytes at a time
	MVC    $32, 0(R6), 0(R8)
	MOVD   $32(R6), R6
	MOVD   $32(R8), R8
	MOVD   $-4(R5), R5
	CMPBGE R5, $4, mediumLoopBody
	BR     smallLoop

returnC:
	MOVD R7, c+56(FP)
	RET
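// subVW mirrors addVW: once the borrow dies out, the remaining words of x are
// copied through unchanged via the same MVC copy loops. A rough Go sketch
// (math/bits, 64-bit Word assumed; illustration only):
//
//	func subVW(z, x []Word, y Word) (c Word) {
//		c = y // outstanding borrow
//		for i := range z {
//			if c == 0 {
//				copy(z[i:], x[i:]) // what the copySetup/MVC loops do
//				return 0
//			}
//			zi, borrow := bits.Sub64(uint64(x[i]), uint64(c), 0)
//			z[i] = Word(zi)
//			c = Word(borrow)
//		}
//		return c
//	}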
// func shlVU(z, x []Word, s uint) (c Word)
TEXT ·shlVU(SB), NOSPLIT, $0
	BR ·shlVU_g(SB)

// func shrVU(z, x []Word, s uint) (c Word)
TEXT ·shrVU(SB), NOSPLIT, $0
	BR ·shrVU_g(SB)

// CX = R4, r8 = r8, r9=r9, r10 = r2, r11 = r5, DX = r3, AX = r6, BX = R1, (R0 set to 0) + use R11 + use R7 for i
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
TEXT ·mulAddVWW(SB), NOSPLIT, $0
	MOVD z+0(FP), R2
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD r+56(FP), R4 // c = r
	MOVD z_len+8(FP), R5
	MOVD $0, R1 // i*8 = 0
	MOVD $0, R7 // i = 0
	MOVD $0, R0 // make sure it's zero
	BR   E5

L5:
	MOVD   (R8)(R1*1), R6
	MULHDU R9, R6
	ADDC   R4, R11 // add to low order bits
	ADDE   R0, R6
	MOVD   R11, (R2)(R1*1)
	MOVD   R6, R4
	ADD    $8, R1 // i*8 + 8
	ADD    $1, R7 // i++

E5:
	CMPBLT R7, R5, L5 // i < n

	MOVD R4, c+64(FP)
	RET
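// For reference, the loop above computes z = x*y + r one word at a time:
// MULHDU leaves the low half of x[i]*y in R11 and the high half in R6, the
// carry word c (R4) is added into the low half, and the high half becomes the
// next carry. A rough Go sketch of the same semantics (math/bits, 64-bit Word
// assumed; illustration only):
//
//	func mulAddVWW(z, x []Word, y, r Word) (c Word) {
//		c = r
//		for i := range z {
//			hi, lo := bits.Mul64(uint64(x[i]), uint64(y))
//			lo, carry := bits.Add64(lo, uint64(c), 0)
//			z[i] = Word(lo)
//			c = Word(hi + carry) // cannot overflow: hi <= 2^64 - 2
//		}
//		return c
//	}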
// func addMulVVW(z, x []Word, y Word) (c Word)
// CX = R4, r8 = r8, r9=r9, r10 = r2, r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1, (R0 set to 0) + use R11 + use R7 for i
TEXT ·addMulVVW(SB), NOSPLIT, $0
	MOVD z+0(FP), R2
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z_len+8(FP), R5

	MOVD $0, R1 // i*8 = 0
	MOVD $0, R7 // i = 0
	MOVD $0, R0 // make sure it's zero
	MOVD $0, R4 // c = 0

	MOVD   R5, R12
	AND    $-2, R12
	CMPBGE R5, $2, A6
	BR     E6

A6:
	MOVD   (R8)(R1*1), R6
	MULHDU R9, R6
	MOVD   (R2)(R1*1), R10
	ADDC   R10, R11 // add to low order bits
	ADDE   R0, R6
	ADDC   R4, R11
	ADDE   R0, R6
	MOVD   R6, R4
	MOVD   R11, (R2)(R1*1)

	MOVD   (8)(R8)(R1*1), R6
	MULHDU R9, R6
	MOVD   (8)(R2)(R1*1), R10
	ADDC   R10, R11 // add to low order bits
	ADDE   R0, R6
	ADDC   R4, R11
	ADDE   R0, R6
	MOVD   R6, R4
	MOVD   R11, (8)(R2)(R1*1)

	ADD $16, R1 // i*8 += 16
	ADD $2, R7  // i += 2

	CMPBLT R7, R12, A6
	BR     E6

L6:
	MOVD   (R8)(R1*1), R6
	MULHDU R9, R6
	MOVD   (R2)(R1*1), R10
	ADDC   R10, R11 // add to low order bits
	ADDE   R0, R6
	ADDC   R4, R11
	ADDE   R0, R6
	MOVD   R6, R4
	MOVD   R11, (R2)(R1*1)

	ADD $8, R1 // i*8 + 8
	ADD $1, R7 // i++

E6:
	CMPBLT R7, R5, L6 // i < n

	MOVD R4, c+56(FP)
	RET
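// For reference, addMulVVW accumulates z += x*y word by word, folding the
// high half of each product and the carries from the two additions into the
// next iteration's carry; the A6 block is the same body unrolled twice.
// A rough Go sketch (math/bits, 64-bit Word assumed; illustration only):
//
//	func addMulVVW(z, x []Word, y Word) (c Word) {
//		for i := range z {
//			hi, lo := bits.Mul64(uint64(x[i]), uint64(y))
//			lo, cc := bits.Add64(lo, uint64(z[i]), 0)
//			hi += cc
//			lo, cc = bits.Add64(lo, uint64(c), 0)
//			z[i] = Word(lo)
//			c = Word(hi + cc)
//		}
//		return c
//	}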