// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !math_big_pure_go

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
//
// The add/sub entry points dispatch through a function-pointer slot
// (addvectorfacility/subvectorfacility): the first call lands in the
// *_check variant, which probes ·hasVX and patches the slot to either
// the vector (VX) or scalar (novec) implementation for all later calls.

// Register mapping relative to the amd64 version of this code:
// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11

// func addVV(z, x, y []Word) (c Word)
// Indirect jump through the dispatch slot (initially ·addVV_check).
TEXT ·addVV(SB), NOSPLIT, $0
	MOVD addvectorfacility+0x00(SB), R1
	BR   (R1)

// One-time dispatcher: selects the VX or scalar addVV implementation,
// stores it in the slot, and tail-calls it.
TEXT ·addVV_check(SB), NOSPLIT, $0
	MOVB   ·hasVX(SB), R1
	CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported
	MOVD   $addvectorfacility+0x00(SB), R1
	MOVD   $·addVV_novec(SB), R2
	MOVD   R2, 0(R1)

	// MOVD $·addVV_novec(SB), 0(R1)
	BR ·addVV_novec(SB)

vectorimpl:
	MOVD $addvectorfacility+0x00(SB), R1
	MOVD $·addVV_vec(SB), R2
	MOVD R2, 0(R1)

	// MOVD $·addVV_vec(SB), 0(R1)
	BR ·addVV_vec(SB)

GLOBL addvectorfacility+0x00(SB), NOPTR, $8
DATA addvectorfacility+0x00(SB)/8, $·addVV_check(SB)

// Vector-facility implementation of addVV: 16x-unrolled 128-bit vector
// adds (VACQ/VACCCQ carry chain), then a 4x-unrolled scalar loop, then a
// single-word scalar loop for the tail. The carry is kept as 0/-1 in R4
// between scalar iterations so ADDC can re-materialize the CPU carry flag.
TEXT ·addVV_vec(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2

	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUB $4, R3  // n -= 4
	BLT v1      // if n < 0 goto v1
	SUB $12, R3 // n -= 12 (n -= 16 in total)
	BLT A1      // if n < 0 goto A1

	MOVD R8, R5
	MOVD R9, R6
	MOVD R2, R7

	// n >= 0
	// regular loop body unrolled 16x
	VZERO V0 // c = 0

UU1:
	VLM  0(R5), V1, V4 // 64-bytes into V1..V4
	ADD  $64, R5
	VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
	VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order

	VLM  0(R6), V9, V12 // 64-bytes into V9..V12
	ADD  $64, R6
	VPDI $0x4, V9, V9, V9    // flip the doublewords to big-endian order
	VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order

	VACCCQ V1, V9, V0, V25  // next carry-out
	VACQ   V1, V9, V0, V17  // 128-bit add with carry-in
	VACCCQ V2, V10, V25, V26
	VACQ   V2, V10, V25, V18

	VLM 0(R5), V5, V6   // 32-bytes into V5..V6
	VLM 0(R6), V13, V14 // 32-bytes into V13..V14
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V3, V3, V3    // flip the doublewords to big-endian order
	VPDI $0x4, V4, V4, V4    // flip the doublewords to big-endian order
	VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
	VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order

	VACCCQ V3, V11, V26, V27
	VACQ   V3, V11, V26, V19
	VACCCQ V4, V12, V27, V28
	VACQ   V4, V12, V27, V20

	VLM 0(R5), V7, V8   // 32-bytes into V7..V8
	VLM 0(R6), V15, V16 // 32-bytes into V15..V16
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V5, V5, V5    // flip the doublewords to big-endian order
	VPDI $0x4, V6, V6, V6    // flip the doublewords to big-endian order
	VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
	VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order

	VACCCQ V5, V13, V28, V29
	VACQ   V5, V13, V28, V21
	VACCCQ V6, V14, V29, V30
	VACQ   V6, V14, V29, V22

	VPDI $0x4, V7, V7, V7    // flip the doublewords to big-endian order
	VPDI $0x4, V8, V8, V8    // flip the doublewords to big-endian order
	VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
	VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order

	VACCCQ V7, V15, V30, V31
	VACQ   V7, V15, V30, V23
	VACCCQ V8, V16, V31, V0 // V0 has carry-over
	VACQ   V8, V16, V31, V24

	VPDI $0x4, V17, V17, V17 // flip the doublewords to big-endian order
	VPDI $0x4, V18, V18, V18 // flip the doublewords to big-endian order
	VPDI $0x4, V19, V19, V19 // flip the doublewords to big-endian order
	VPDI $0x4, V20, V20, V20 // flip the doublewords to big-endian order
	VPDI $0x4, V21, V21, V21 // flip the doublewords to big-endian order
	VPDI $0x4, V22, V22, V22 // flip the doublewords to big-endian order
	VPDI $0x4, V23, V23, V23 // flip the doublewords to big-endian order
	VPDI $0x4, V24, V24, V24 // flip the doublewords to big-endian order
	VSTM V17, V24, 0(R7)     // 128-bytes into z
	ADD  $128, R7
	ADD  $128, R10 // i += 16
	SUB  $16, R3   // n -= 16
	BGE  UU1       // if n >= 0 goto UU1
	VLGVG $1, V0, R4 // put cf into R4
	NEG   R4, R4     // save cf (as 0 or -1, the form ADDC below expects)

A1:
	ADD $12, R3 // n += 12 (undo the second decrement above)

	// s/JL/JMP/ below to disable the unrolled loop
	BLT v1 // if n < 0 goto v1

U1:	// n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	ADDC R4, R4 // restore CF (-1 + -1 sets carry; 0 + 0 clears it)
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD 8(R9)(R10*1), R11
	ADDE R11, R6
	MOVD 16(R9)(R10*1), R11
	ADDE R11, R7
	MOVD 24(R9)(R10*1), R11
	ADDE R11, R1
	MOVD R0, R4
	ADDE R4, R4 // save CF
	NEG  R4, R4 // keep carry as 0/-1 for the next restore
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1       // if n >= 0 goto U1

v1:
	ADD $4, R3 // n += 4
	BLE E1     // if n <= 0 goto E1

L1:	// n > 0
	ADDC R4, R4 // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	ADDE R4, R4 // save CF
	NEG  R4, R4

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1      // if n > 0 goto L1

E1:
	NEG  R4, R4       // convert 0/-1 carry back to 0/1
	MOVD R4, c+72(FP) // return c
	RET

// Scalar (no vector facility) implementation of addVV: 4x-unrolled
// carry-chained loop plus a single-word tail loop.
TEXT ·addVV_novec(SB), NOSPLIT, $0
novec:
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2

	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUB $4, R3 // n -= 4
	BLT v1n    // if n < 0 goto v1n

U1n:	// n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	ADDC R4, R4 // restore CF
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD 8(R9)(R10*1), R11
	ADDE R11, R6
	MOVD 16(R9)(R10*1), R11
	ADDE R11, R7
	MOVD 24(R9)(R10*1), R11
	ADDE R11, R1
	MOVD R0, R4
	ADDE R4, R4 // save CF
	NEG  R4, R4
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1n      // if n >= 0 goto U1n

v1n:
	ADD $4, R3 // n += 4
	BLE E1n    // if n <= 0 goto E1n

L1n:	// n > 0
	ADDC R4, R4 // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	ADDE R4, R4 // save CF
	NEG  R4, R4

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1n     // if n > 0 goto L1n

E1n:
	NEG  R4, R4
	MOVD R4, c+72(FP) // return c
	RET

// func subVV(z, x, y []Word) (c Word)
// Indirect jump through the dispatch slot (initially ·subVV_check).
TEXT ·subVV(SB), NOSPLIT, $0
	MOVD subvectorfacility+0x00(SB), R1
	BR   (R1)

// One-time dispatcher: selects the VX or scalar subVV implementation,
// stores it in the slot, and tail-calls it.
TEXT ·subVV_check(SB), NOSPLIT, $0
	MOVB   ·hasVX(SB), R1
	CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported
	MOVD   $subvectorfacility+0x00(SB), R1
	MOVD   $·subVV_novec(SB), R2
	MOVD   R2, 0(R1)

	// MOVD $·subVV_novec(SB), 0(R1)
	BR ·subVV_novec(SB)

vectorimpl:
	MOVD $subvectorfacility+0x00(SB), R1
	MOVD $·subVV_vec(SB), R2
	MOVD R2, 0(R1)

	// MOVD $·subVV_vec(SB), 0(R1)
	BR ·subVV_vec(SB)

GLOBL subvectorfacility+0x00(SB), NOPTR, $8
DATA subvectorfacility+0x00(SB)/8, $·subVV_check(SB)

// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
// Note: on s390 the subtraction "carry" is really a not-borrow bit, so the
// vector chain is seeded with 1 and the final value is converted with SUB $1.
TEXT ·subVV_vec(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2
	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUB $4, R3  // n -= 4
	BLT v1      // if n < 0 goto v1
	SUB $12, R3 // n -= 12 (n -= 16 in total)
	BLT A1      // if n < 0 goto A1

	MOVD R8, R5
	MOVD R9, R6
	MOVD R2, R7

	// n >= 0
	// regular loop body unrolled 16x
	VZERO V0         // cf = 0
	MOVD  $1, R4     // for 390 subtraction cf starts as 1 (no borrow)
	VLVGG $1, R4, V0 // put carry into V0

UU1:
	VLM  0(R5), V1, V4 // 64-bytes into V1..V4
	ADD  $64, R5
	VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
	VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order

	VLM  0(R6), V9, V12 // 64-bytes into V9..V12
	ADD  $64, R6
	VPDI $0x4, V9, V9, V9    // flip the doublewords to big-endian order
	VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order

	VSBCBIQ V1, V9, V0, V25  // next borrow-out
	VSBIQ   V1, V9, V0, V17  // 128-bit subtract with borrow-in
	VSBCBIQ V2, V10, V25, V26
	VSBIQ   V2, V10, V25, V18

	VLM 0(R5), V5, V6   // 32-bytes into V5..V6
	VLM 0(R6), V13, V14 // 32-bytes into V13..V14
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V3, V3, V3    // flip the doublewords to big-endian order
	VPDI $0x4, V4, V4, V4    // flip the doublewords to big-endian order
	VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
	VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order

	VSBCBIQ V3, V11, V26, V27
	VSBIQ   V3, V11, V26, V19
	VSBCBIQ V4, V12, V27, V28
	VSBIQ   V4, V12, V27, V20

	VLM 0(R5), V7, V8   // 32-bytes into V7..V8
	VLM 0(R6), V15, V16 // 32-bytes into V15..V16
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V5, V5, V5    // flip the doublewords to big-endian order
	VPDI $0x4, V6, V6, V6    // flip the doublewords to big-endian order
	VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
	VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order

	VSBCBIQ V5, V13, V28, V29
	VSBIQ   V5, V13, V28, V21
	VSBCBIQ V6, V14, V29, V30
	VSBIQ   V6, V14, V29, V22

	VPDI $0x4, V7, V7, V7    // flip the doublewords to big-endian order
	VPDI $0x4, V8, V8, V8    // flip the doublewords to big-endian order
	VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
	VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order

	VSBCBIQ V7, V15, V30, V31
	VSBIQ   V7, V15, V30, V23
	VSBCBIQ V8, V16, V31, V0 // V0 has carry-over
	VSBIQ   V8, V16, V31, V24

	VPDI $0x4, V17, V17, V17 // flip the doublewords to big-endian order
	VPDI $0x4, V18, V18, V18 // flip the doublewords to big-endian order
	VPDI $0x4, V19, V19, V19 // flip the doublewords to big-endian order
	VPDI $0x4, V20, V20, V20 // flip the doublewords to big-endian order
	VPDI $0x4, V21, V21, V21 // flip the doublewords to big-endian order
	VPDI $0x4, V22, V22, V22 // flip the doublewords to big-endian order
	VPDI $0x4, V23, V23, V23 // flip the doublewords to big-endian order
	VPDI $0x4, V24, V24, V24 // flip the doublewords to big-endian order
	VSTM V17, V24, 0(R7)     // 128-bytes into z
	ADD  $128, R7
	ADD  $128, R10 // i += 16
	SUB  $16, R3   // n -= 16
	BGE  UU1       // if n >= 0 goto UU1
	VLGVG $1, V0, R4 // put cf into R4
	SUB   $1, R4     // save cf (convert not-borrow 0/1 to 0/-1)

A1:
	ADD $12, R3 // n += 12 (undo the second decrement above)
	BLT v1      // if n < 0 goto v1

U1:	// n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	MOVD R0, R11
	SUBC R4, R11 // restore CF (0 - 0/-1 sets the borrow indicator)
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD 8(R9)(R10*1), R11
	SUBE R11, R6
	MOVD 16(R9)(R10*1), R11
	SUBE R11, R7
	MOVD 24(R9)(R10*1), R11
	SUBE R11, R1
	MOVD R0, R4
	SUBE R4, R4 // save CF (0 if no borrow, -1 if borrow)
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1       // if n >= 0 goto U1

v1:
	ADD $4, R3 // n += 4
	BLE E1     // if n <= 0 goto E1

L1:	// n > 0
	MOVD R0, R11
	SUBC R4, R11 // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	SUBE R4, R4 // save CF

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1      // if n > 0 goto L1

E1:
	NEG  R4, R4       // convert 0/-1 borrow to 0/1
	MOVD R4, c+72(FP) // return c
	RET

// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
// func subVV(z, x, y []Word) (c Word)
// (same as addVV_novec except for SUBC/SUBE instead of ADDC/ADDE and label names)
TEXT ·subVV_novec(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2

	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUB $4, R3 // n -= 4
	BLT v1     // if n < 0 goto v1

U1:	// n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	MOVD R0, R11
	SUBC R4, R11 // restore CF
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD 8(R9)(R10*1), R11
	SUBE R11, R6
	MOVD 16(R9)(R10*1), R11
	SUBE R11, R7
	MOVD 24(R9)(R10*1), R11
	SUBE R11, R1
	MOVD R0, R4
	SUBE R4, R4 // save CF
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1       // if n >= 0 goto U1

v1:
	ADD $4, R3 // n += 4
	BLE E1     // if n <= 0 goto E1

L1:	// n > 0
	MOVD R0, R11
	SUBC R4, R11 // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	SUBE R4, R4 // save CF

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1      // if n > 0 goto L1

E1:
	NEG  R4, R4
	MOVD R4, c+72(FP) // return c
	RET

// func addVW(z, x []Word, y Word) (c Word)
// Adds the single word y into x, writing z. Once the carry dies out the
// remaining words are just copied with MVC (fast path).
TEXT ·addVW(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R5 // length of z
	MOVD x+24(FP), R6
	MOVD y+48(FP), R7 // c = y
	MOVD z+0(FP), R8

	CMPBEQ R5, $0, returnC // if len(z) == 0, we can have an early return

	// Add the first two words, and determine which path (copy path or loop path) to take based on the carry flag.
	ADDC   0(R6), R7
	MOVD   R7, 0(R8)
	CMPBEQ R5, $1, returnResult // len(z) == 1
	MOVD   $0, R9
	ADDE   8(R6), R9
	MOVD   R9, 8(R8)
	CMPBEQ R5, $2, returnResult // len(z) == 2

	// Update the counters
	MOVD $16, R12    // i = 2
	MOVD $-2(R5), R5 // n = n - 2

loopOverEachWord:
	BRC  $12, copySetup // carry = 0, copy the rest
	MOVD $1, R9

	// Originally we used the carry flag generated in the previous iteration
	// (i.e: ADDE could be used here to do the addition). However, since we
	// already know carry is 1 (otherwise we will go to copy section), we can use
	// ADDC here so the current iteration does not depend on the carry flag
	// generated in the previous iteration. This could be useful when branch prediction happens.
	ADDC 0(R6)(R12*1), R9
	MOVD R9, 0(R8)(R12*1) // z[i] = x[i] + c

	MOVD  $8(R12), R12         // i++
	BRCTG R5, loopOverEachWord // n--

	// Return the current carry value
returnResult:
	MOVD $0, R0
	ADDE R0, R0
	MOVD R0, c+56(FP)
	RET

	// Update position of x(R6) and z(R8) based on the current counter value and perform copying.
	// With the assumption that x and z will not overlap with each other or x and z will
	// point to same memory region, we can use a faster version of copy using only MVC here.
	// In the following implementation, we have three copy loops, each copying a word, 4 words, and
	// 32 words at a time. Via benchmarking, this implementation is faster than calling runtime·memmove.
copySetup:
	ADD R12, R6
	ADD R12, R8

	CMPBGE R5, $4, mediumLoop

smallLoop:	// does a loop unrolling to copy word when n < 4
	CMPBEQ R5, $0, returnZero
	MVC    $8, 0(R6), 0(R8)
	CMPBEQ R5, $1, returnZero
	MVC    $8, 8(R6), 8(R8)
	CMPBEQ R5, $2, returnZero
	MVC    $8, 16(R6), 16(R8)

returnZero:
	MOVD $0, c+56(FP) // return 0 as carry
	RET

mediumLoop:
	CMPBLT R5, $4, smallLoop
	CMPBLT R5, $32, mediumLoopBody

largeLoop:	// Copying 256 bytes at a time.
	MVC    $256, 0(R6), 0(R8)
	MOVD   $256(R6), R6
	MOVD   $256(R8), R8
	MOVD   $-32(R5), R5
	CMPBGE R5, $32, largeLoop
	BR     mediumLoop

mediumLoopBody:	// Copying 32 bytes at a time
	MVC    $32, 0(R6), 0(R8)
	MOVD   $32(R6), R6
	MOVD   $32(R8), R8
	MOVD   $-4(R5), R5
	CMPBGE R5, $4, mediumLoopBody
	BR     smallLoop

returnC:
	MOVD R7, c+56(FP)
	RET

// func subVW(z, x []Word, y Word) (c Word)
// Subtracts the single word y from x, writing z. Once the borrow dies out
// the remaining words are just copied with MVC (fast path).
TEXT ·subVW(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R5
	MOVD x+24(FP), R6
	MOVD y+48(FP), R7 // The borrow bit passed in
	MOVD z+0(FP), R8
	MOVD $0, R0 // R0 is a temporary variable used during computation. Ensure it has zero in it.

	CMPBEQ R5, $0, returnC // len(z) == 0, have an early return

	// Subtract the first two words, and determine which path (copy path or loop path) to take based on the borrow flag
	MOVD   0(R6), R9
	SUBC   R7, R9
	MOVD   R9, 0(R8)
	CMPBEQ R5, $1, returnResult
	MOVD   8(R6), R9
	SUBE   R0, R9
	MOVD   R9, 8(R8)
	CMPBEQ R5, $2, returnResult

	// Update the counters
	MOVD $16, R12    // i = 2
	MOVD $-2(R5), R5 // n = n - 2

loopOverEachWord:
	BRC  $3, copySetup // no borrow, copy the rest
	MOVD 0(R6)(R12*1), R9

	// Originally we used the borrow flag generated in the previous iteration
	// (i.e: SUBE could be used here to do the subtraction). However, since we
	// already know borrow is 1 (otherwise we will go to copy section), we can
	// use SUBC here so the current iteration does not depend on the borrow flag
	// generated in the previous iteration. This could be useful when branch prediction happens.
	SUBC $1, R9
	MOVD R9, 0(R8)(R12*1) // z[i] = x[i] - 1

	MOVD  $8(R12), R12         // i++
	BRCTG R5, loopOverEachWord // n--

	// return the current borrow value
returnResult:
	SUBE R0, R0
	NEG  R0, R0
	MOVD R0, c+56(FP)
	RET

	// Update position of x(R6) and z(R8) based on the current counter value and perform copying.
	// With the assumption that x and z will not overlap with each other or x and z will
	// point to same memory region, we can use a faster version of copy using only MVC here.
	// In the following implementation, we have three copy loops, each copying a word, 4 words, and
	// 32 words at a time. Via benchmarking, this implementation is faster than calling runtime·memmove.
copySetup:
	ADD R12, R6
	ADD R12, R8

	CMPBGE R5, $4, mediumLoop

smallLoop:	// does a loop unrolling to copy word when n < 4
	CMPBEQ R5, $0, returnZero
	MVC    $8, 0(R6), 0(R8)
	CMPBEQ R5, $1, returnZero
	MVC    $8, 8(R6), 8(R8)
	CMPBEQ R5, $2, returnZero
	MVC    $8, 16(R6), 16(R8)

returnZero:
	MOVD $0, c+56(FP) // return 0 as borrow
	RET

mediumLoop:
	CMPBLT R5, $4, smallLoop
	CMPBLT R5, $32, mediumLoopBody

largeLoop:	// Copying 256 bytes at a time
	MVC    $256, 0(R6), 0(R8)
	MOVD   $256(R6), R6
	MOVD   $256(R8), R8
	MOVD   $-32(R5), R5
	CMPBGE R5, $32, largeLoop
	BR     mediumLoop

mediumLoopBody:	// Copying 32 bytes at a time
	MVC    $32, 0(R6), 0(R8)
	MOVD   $32(R6), R6
	MOVD   $32(R8), R8
	MOVD   $-4(R5), R5
	CMPBGE R5, $4, mediumLoopBody
	BR     smallLoop

returnC:
	MOVD R7, c+56(FP)
	RET

// func shlVU(z, x []Word, s uint) (c Word)
// No assembly implementation; falls back to the generic Go version.
TEXT ·shlVU(SB), NOSPLIT, $0
	BR ·shlVU_g(SB)

// func shrVU(z, x []Word, s uint) (c Word)
// No assembly implementation; falls back to the generic Go version.
TEXT ·shrVU(SB), NOSPLIT, $0
	BR ·shrVU_g(SB)

// CX = R4, r8 = r8, r9=r9, r10 = r2, r11 = r5, DX = r3, AX = r6, BX = R1, (R0 set to 0) + use R11 + use R7 for i
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
// z[i] = x[i]*y + c, carrying the high half of each product into the next word.
// NOTE(review): MULHDU appears to leave the low half of the product in R11
// (see "add to low order bits" below) — implicit register pairing; confirm
// against the Go s390x assembler's MLGR mapping before touching this.
TEXT ·mulAddVWW(SB), NOSPLIT, $0
	MOVD z+0(FP), R2
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD r+56(FP), R4 // c = r
	MOVD z_len+8(FP), R5
	MOVD $0, R1 // i*8 = 0 (byte offset)
	MOVD $0, R7 // i = 0
	MOVD $0, R0 // make sure it's zero
	BR   E5

L5:
	MOVD   (R8)(R1*1), R6
	MULHDU R9, R6  // R6 = high(x[i]*y); low half goes to R11
	ADDC   R4, R11 // add to low order bits
	ADDE   R0, R6
	MOVD   R11, (R2)(R1*1)
	MOVD   R6, R4
	ADD    $8, R1 // i*8 += 8
	ADD    $1, R7 // i++

E5:
	CMPBLT R7, R5, L5 // i < n

	MOVD R4, c+64(FP)
	RET

// func addMulVVW(z, x []Word, y Word) (c Word)
// z[i] += x[i]*y + c. Main loop is unrolled 2x (A6); L6 handles the odd tail.
// CX = R4, r8 = r8, r9=r9, r10 = r2, r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1, (R0 set to 0) + use R11 + use R7 for i
TEXT ·addMulVVW(SB), NOSPLIT, $0
	MOVD z+0(FP), R2
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z_len+8(FP), R5

	MOVD $0, R1 // i*8 = 0 (byte offset)
	MOVD $0, R7 // i = 0
	MOVD $0, R0 // make sure it's zero
	MOVD $0, R4 // c = 0

	MOVD   R5, R12
	AND    $-2, R12 // R12 = n rounded down to a multiple of 2
	CMPBGE R5, $2, A6
	BR     E6

A6:
	MOVD   (R8)(R1*1), R6
	MULHDU R9, R6 // high half in R6, low half in R11
	MOVD   (R2)(R1*1), R10
	ADDC   R10, R11 // add to low order bits
	ADDE   R0, R6
	ADDC   R4, R11
	ADDE   R0, R6
	MOVD   R6, R4
	MOVD   R11, (R2)(R1*1)

	MOVD   (8)(R8)(R1*1), R6
	MULHDU R9, R6
	MOVD   (8)(R2)(R1*1), R10
	ADDC   R10, R11 // add to low order bits
	ADDE   R0, R6
	ADDC   R4, R11
	ADDE   R0, R6
	MOVD   R6, R4
	MOVD   R11, (8)(R2)(R1*1)

	ADD $16, R1 // i*8 += 16
	ADD $2, R7  // i += 2

	CMPBLT R7, R12, A6
	BR     E6

L6:
	MOVD   (R8)(R1*1), R6
	MULHDU R9, R6
	MOVD   (R2)(R1*1), R10
	ADDC   R10, R11 // add to low order bits
	ADDE   R0, R6
	ADDC   R4, R11
	ADDE   R0, R6
	MOVD   R6, R4
	MOVD   R11, (R2)(R1*1)

	ADD $8, R1 // i*8 += 8
	ADD $1, R7 // i++

E6:
	CMPBLT R7, R5, L6 // i < n

	MOVD R4, c+56(FP)
	RET