github.com/m10x/go/src@v0.0.0-20220112094212-ba61592315da/internal/bytealg/index_ppc64x.s (about) 1 // Copyright 2021 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // This is an implementation based on the s390x 6 // implementation. 7 8 // Find a separator with 2 <= len <= 32 within a string. 9 // Separators with lengths of 2, 3 or 4 are handled 10 // specially. 11 12 // This works on power8 and above. The loads and 13 // compares are done in big endian order 14 // since that allows the use of VCLZD, and allows 15 // the same implementation to work on big and little 16 // endian platforms with minimal conditional changes. 17 18 // NOTE: There is a power9 implementation that 19 // improves performance by 10-15% on little 20 // endian for some of the benchmarks, but 21 // work is still needed for a big endian 22 // implementation on power9. 23 24 //go:build ppc64 || ppc64le 25 26 #include "go_asm.h" 27 #include "textflag.h" 28 29 // Needed to swap LXVD2X loads to the correct 30 // byte order to work on POWER8. 31 32 #ifdef GOARCH_ppc64 33 DATA byteswap<>+0(SB)/8, $0x0001020304050607 34 DATA byteswap<>+8(SB)/8, $0x08090a0b0c0d0e0f 35 #else 36 DATA byteswap<>+0(SB)/8, $0x0706050403020100 37 DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908 38 #endif 39 40 // Load bytes in big endian order. Address 41 // alignment does not need checking. 
// VLOADSWAP(ptr, idx, vr, vsr): fetch the 16 bytes at ptr+idx with LXVD2X
// into vsr, then run VPERM over vr with the SWAP permute control so the
// bytes end up in big endian order. Callers pass the V and VS names of the
// same register (for example V0 and V0_).
#define VLOADSWAP(ptr, idx, vr, vsr) \
	LXVD2X	(ptr)(idx), vsr; \
	VPERM	vr, vr, SWAP, vr

GLOBL byteswap<>+0(SB), RODATA, $16

// ·Index: entry point for a byte-slice haystack and byte-slice separator.
// Puts the operands where the shared bodies expect them
// (R3=&a[0], R4=len(a), R5=&b[0], R6=len(b)) and tail-branches to the
// POWER9 body when the CPU feature byte equals 1 on ppc64le, otherwise to
// the generic body.
TEXT ·Index<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
#ifndef GOEXPERIMENT_regabiargs
	// Stack-based arguments: pull everything from the frame.
	MOVD	a_base+0(FP), R3	// R3 = &a[0]
	MOVD	a_len+8(FP), R4		// R4 = len(a)
	MOVD	b_base+24(FP), R5	// R5 = &b[0]
	MOVD	b_len+32(FP), R6	// R6 = len(b)
	MOVD	$ret+48(FP), R14	// R14 = address of the result slot
#else
	// Register arguments: R3/R4 already hold &a[0]/len(a).
	// Order matters: R6 must be copied out before it is overwritten.
	MOVD	R6, R5			// R5 = &b[0]
	MOVD	R7, R6			// R6 = len(b)
#endif

#ifdef GOARCH_ppc64le
	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7
	CMP	R7, $1
	BEQ	power9			// feature byte == 1: use the POWER9 body
#endif
	BR	indexbody<>(SB)
#ifdef GOARCH_ppc64le
power9:
	BR	indexbodyp9<>(SB)
#endif

// ·IndexString: same dispatch as ·Index, for string operands. With
// register arguments nothing needs to be moved before dispatching.
TEXT ·IndexString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
#ifndef GOEXPERIMENT_regabiargs
	MOVD	a_base+0(FP), R3	// R3 = string data
	MOVD	a_len+8(FP), R4		// R4 = len(a)
	MOVD	b_base+16(FP), R5	// R5 = separator data
	MOVD	b_len+24(FP), R6	// R6 = len(b)
	MOVD	$ret+32(FP), R14	// R14 = address of the result slot
#endif

#ifdef GOARCH_ppc64le
	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7
	CMP	R7, $1
	BEQ	power9			// feature byte == 1: use the POWER9 body
#endif
	BR	indexbody<>(SB)
#ifdef GOARCH_ppc64le
power9:
	BR	indexbodyp9<>(SB)
#endif

// Register map shared by the search bodies.
//   s   = string being searched, sep = string being searched for
//   R3  = &s[0]            R4 = len(s)
//   R5  = &sep[0]          R6 = len(sep)
//   R14 = &ret (result slot, stack-argument builds only)
//   R7  = working address inside s
//   R16 = constant 16      R17 = constant 17
//   R18 = constant 18      R19 = constant 1
//   R26 = LASTBYTE         R27 = LASTSTR (last start address to try)
//   R8, R9 = scratch
//   V0  = sep, left justified and zero filled
//   CR4 = result of comparing len(sep) with 16

#define SEPMASK V17
#define LASTBYTE R26
#define LASTSTR R27
#define ONES V20
#define SWAP V21
#define V0_ VS32
#define V1_ VS33
#define V2_ VS34
#define V3_ VS35
#define V4_ VS36
#define V5_ VS37
#define V6_ VS38
#define V7_ VS39
#define V8_ VS40
#define V9_ VS41
#define SWAP_ VS53

// indexbody<> returns the byte index of the first occurrence of sep in s,
// or -1. All 16-byte loads go through VLOADSWAP so that bytes are compared
// in big endian order.
//
// In:  R3=&s[0], R4=len(s), R5=&sep[0], R6=len(sep);
//      R14=&ret when GOEXPERIMENT_regabiargs is not defined.
// Out: result in R3 (GOEXPERIMENT_regabiargs) or stored at (R14).
//
// Returns -1 when len(sep) > len(s) or len(sep) == 0. len(sep) > 32 is not
// implemented and deliberately faults at index33plus.
TEXT indexbody<>(SB), NOSPLIT|NOFRAME, $0
	CMP	R6, R4			// Compare lengths
	BGT	notfound		// If sep len is > string, notfound
	ADD	R4, R3, LASTBYTE	// LASTBYTE=&s[len(s)] (one past the final byte)
	SUB	R6, LASTBYTE, LASTSTR	// LASTSTR=&s[len(s)-len(sep)] (last valid start index)
	CMP	R6, $0			// Check sep len
	BEQ	notfound		// sep len 0 -- not found
	MOVD	R3, R7			// Copy of string addr
	MOVD	$16, R16		// Index value 16
	MOVD	$17, R17		// Index value 17 (not read in this routine)
	MOVD	$18, R18		// Index value 18 (not read in this routine)
	MOVD	$1, R19			// Index value 1
	MOVD	$byteswap<>+00(SB), R8
	VSPLTISB $0xFF, ONES		// splat all 1s
	LXVD2X	(R8)(R0), SWAP_		// Set up swap string

	CMP	R6, $16, CR4		// CR4 for len(sep) >= 16
	VOR	ONES, ONES, SEPMASK	// Set up full SEPMASK
	BGE	CR4, loadge16		// Load for len(sep) >= 16
	SUB	R6, R16, R9		// 16-len of sep
	SLD	$3, R9			// Set up for VSLO
	MTVSRD	R9, V9_			// Set up for VSLO
	VSLDOI	$8, V9, V9, V9		// Set up for VSLO
	VSLO	ONES, V9, SEPMASK	// Mask for separator len(sep) < 16

loadge16:
	ANDCC	$15, R5, R9		// Find byte offset of sep
	ADD	R9, R6, R10		// Add sep len
	CMP	R10, $16		// Check if sep len+offset > 16
	BGT	sepcross16		// Sep crosses 16 byte boundary

	RLDICR	$0, R5, $59, R8		// Adjust addr to 16 byte container
	VLOADSWAP(R8, R0, V0, V0_)	// Load 16 bytes @R8 into V0
	SLD	$3, R9			// Set up shift count for VSLO
	MTVSRD	R9, V8_			// Set up shift count for VSLO
	VSLDOI	$8, V8, V8, V8
	VSLO	V0, V8, V0		// Shift by start byte

	VAND	V0, SEPMASK, V0		// Mask separator (< 16)
	BR	index2plus

sepcross16:
	VLOADSWAP(R5, R0, V0, V0_)	// Load 16 bytes @R5 into V0

	VAND	V0, SEPMASK, V0		// mask out separator
	BLE	CR4, index2to16		// len(sep) <= 16: generic masked compare
	BR	index17plus		// Handle sep > 16

index2plus:
	CMP	R6, $2			// Check length of sep
	BNE	index3plus		// If not 2, check for 3
	ADD	$16, R7, R9		// Check if next 16 bytes past last
	CMP	R9, LASTBYTE		// compare with last
	BGE	index2to16		// 2 <= len(string) <= 16
	MOVD	$0xff00, R21		// Mask for later
	MTVSRD	R21, V25		// Move to Vreg
	VSPLTH	$3, V25, V31		// Splat mask
	VSPLTH	$0, V0, V1		// Splat 1st 2 bytes of sep
	VSPLTISB $0, V10		// Clear V10

	// First case: 2 byte separator
	// V1: 2 byte separator splatted
	// V2: 16 bytes at addr
	// V3: 16 bytes at addr+1
	// Compare 2 byte separator at start
	// and at start+1. Use VSEL to combine
	// those results to find the first
	// matching start byte, returning
	// that value when found. Loop as
	// long as len(string) > 16
index2loop2:
	VLOADSWAP(R7, R19, V3, V3_)	// Load 16 bytes @R7+1 into V3

index2loop:
	VLOADSWAP(R7, R0, V2, V2_)	// Load 16 bytes @R7 into V2
	VCMPEQUH V1, V2, V5		// Search for sep
	VCMPEQUH V1, V3, V6		// Search for sep offset by 1
	VSEL	V6, V5, V31, V7		// merge even and odd indices
	VCLZD	V7, V18			// find index of first match
	MFVSRD	V18, R25		// get first value
	CMP	R25, $64		// Found if < 64
	BLT	foundR25		// Return byte index where found
	VSLDOI	$8, V18, V18, V18	// Adjust 2nd value
	MFVSRD	V18, R25		// get second value
	CMP	R25, $64		// Found if < 64
	ADD	$64, R25		// Update byte offset (does not touch CR0)
	BLT	foundR25		// Return value
	ADD	$16, R7			// R7+=16 Update string pointer
	ADD	$17, R7, R9		// R9=R7+17 since loop unrolled
	CMP	R9, LASTBYTE		// Compare addr+17 against last byte
	BLT	index2loop2		// If < last, continue loop
	CMP	R7, LASTBYTE		// Compare updated addr against end of string
	BLT	index2to16		// If < 16 handle specially
	VLOADSWAP(R7, R0, V3, V3_)	// Load 16 bytes @R7 into V3
	VSLDOI	$1, V3, V10, V3		// Shift left by 1 byte
	BR	index2loop

index3plus:
	CMP	R6, $3			// Check if sep == 3
	BNE	index4plus		// If not check larger
	ADD	$19, R7, R9		// Find bytes for use in this loop
	CMP	R9, LASTBYTE		// Compare against last byte
	BGE	index2to16		// Remaining string 2<=len<=16
	MOVD	$0xff00, R21		// Set up mask for upcoming loop
	MTVSRD	R21, V25		// Move mask to Vreg
	VSPLTH	$3, V25, V31		// Splat mask
	VSPLTH	$0, V0, V1		// Splat 1st two bytes of sep
	VSPLTB	$2, V0, V8		// Splat 3rd byte of sep

	// Loop to process 3 byte separator.
	// string[0:16] is in V2
	// string[2:18] is loaded into V3, then
	// left justified so V3 holds string[16:18]
	// sep[0:2] splatted in V1
	// sep[2] splatted in V8
	// Build vectors at string, string+1
	// and string+2. Compare string, string+1
	// against first 2 bytes of separator
	// splatted, and string+2 against 3rd
	// byte splatted. Merge the results with
	// VSEL to find the first byte of a match.

	// Special handling for last 16 bytes if the
	// string fits in 16 byte multiple.
index3loop2:
	MOVD	$2, R21			// Set up index for 2
	VSPLTISB $0, V10		// Clear V10
	VLOADSWAP(R7, R21, V3, V3_)	// Load 16 bytes @R7+2 into V3
	VSLDOI	$14, V3, V10, V3	// Left justify next 2 bytes

index3loop:
	VLOADSWAP(R7, R0, V2, V2_)	// Load with correct order
	VSLDOI	$1, V2, V3, V4		// string[1:17]
	VSLDOI	$2, V2, V3, V9		// string[2:18]
	VCMPEQUH V1, V2, V5		// compare hw even indices
	VCMPEQUH V1, V4, V6		// compare hw odd indices
	VCMPEQUB V8, V9, V10		// compare 3rd to last byte
	VSEL	V6, V5, V31, V7		// Find 1st matching byte using mask
	VAND	V7, V10, V7		// AND matched bytes with matched 3rd byte
	VCLZD	V7, V18			// Find first nonzero indexes
	MFVSRD	V18, R25		// Move 1st doubleword
	CMP	R25, $64		// If < 64 found
	BLT	foundR25		// Return matching index
	VSLDOI	$8, V18, V18, V18	// Move value
	MFVSRD	V18, R25		// Move 2nd doubleword
	CMP	R25, $64		// If < 64 found
	ADD	$64, R25		// Update byte index (does not touch CR0)
	BLT	foundR25		// Return matching index
	ADD	$16, R7			// R7+=16 string ptr
	ADD	$19, R7, R9		// Number of string bytes for loop
	CMP	R9, LASTBYTE		// Compare against last byte of string
	BLT	index3loop2		// If within, continue this loop
	CMP	R7, LASTSTR		// Compare against last start byte
	BLT	index2to16		// Process remainder
	VSPLTISB $0, V3			// Special case for last 16 bytes
	BR	index3loop		// Continue this loop

	// Loop to process 4 byte separator
	// string[0:16] in V2
	// string[3:19] is loaded into V3, then
	// left justified so V3 holds string[16:19]
	// sep[0:4] splatted in V1
	// Set up vectors with strings at offsets
	// 0, 1, 2, 3 and compare against the 4 byte
	// separator also splatted. Use VSEL with the
	// compare results to find the first byte where
	// a separator match is found.
index4plus:
	CMP	R6, $4			// Check if 4 byte separator
	BNE	index5plus		// If not next higher
	ADD	$20, R7, R9		// Check string size to load
	CMP	R9, LASTBYTE		// Verify string length
	BGE	index2to16		// If not large enough, process remaining

	// Set up masks for use with VSEL
	MOVD	$0xff, R21		// Set up mask 0xff000000ff000000...
	SLD	$24, R21
	MTVSRD	R21, V10
	VSPLTW	$1, V10, V29
	VSLDOI	$2, V29, V29, V30	// Mask 0x0000ff000000ff00...
	MOVD	$0xffff, R21
	SLD	$16, R21
	MTVSRD	R21, V10
	VSPLTW	$1, V10, V31		// Mask 0xffff0000ffff0000...
	VSPLTW	$0, V0, V1		// Splat 1st word of separator

index4loop:
	VLOADSWAP(R7, R0, V2, V2_)	// Load 16 bytes @R7 into V2

next4:
	VSPLTISB $0, V10		// Clear
	MOVD	$3, R9			// Number of bytes beyond 16
	VLOADSWAP(R7, R9, V3, V3_)	// Load 16 bytes @R7+3 into V3
	VSLDOI	$13, V3, V10, V3	// Shift left last 3 bytes
	VSLDOI	$1, V2, V3, V4		// V4=(V2:V3)<<1
	VSLDOI	$2, V2, V3, V9		// V9=(V2:V3)<<2
	VSLDOI	$3, V2, V3, V10		// V10=(V2:V3)<<3
	VCMPEQUW V1, V2, V5		// compare index 0, 4, ... with sep
	VCMPEQUW V1, V4, V6		// compare index 1, 5, ... with sep
	VCMPEQUW V1, V9, V11		// compare index 2, 6, ... with sep
	VCMPEQUW V1, V10, V12		// compare index 3, 7, ... with sep
	VSEL	V6, V5, V29, V13	// merge index 0, 1, 4, 5, using mask
	VSEL	V12, V11, V30, V14	// merge index 2, 3, 6, 7, using mask
	VSEL	V14, V13, V31, V7	// final merge
	VCLZD	V7, V18			// Find first index for each half
	MFVSRD	V18, R25		// Isolate value
	CMP	R25, $64		// If < 64, found
	BLT	foundR25		// Return found index
	VSLDOI	$8, V18, V18, V18	// Move for MFVSRD
	MFVSRD	V18, R25		// Isolate other value
	CMP	R25, $64		// If < 64, found
	ADD	$64, R25		// Update index for high doubleword (does not touch CR0)
	BLT	foundR25		// Return found index
	ADD	$16, R7			// R7+=16 for next string
	ADD	$20, R7, R9		// R7+20 for all bytes to load
	CMP	R9, LASTBYTE		// Past end? Maybe check for extra?
	BLT	index4loop		// If not, continue loop
	CMP	R7, LASTSTR		// Check remainder
	BLE	index2to16		// Process remainder
	BR	notfound		// Not found

index5plus:
	CMP	R6, $16			// Check for sep > 16
	BGT	index17plus		// Handle large sep

	// Assumption is that the separator is smaller than the string at this point
index2to16:
	CMP	R7, LASTSTR		// Compare last start byte
	BGT	notfound		// last takes len(sep) into account

	ADD	$16, R7, R9		// Check for last byte of string
	CMP	R9, LASTBYTE
	BGT	index2to16tail

	// At least 16 bytes of string left
	// Mask the number of bytes in sep
index2to16loop:
	VLOADSWAP(R7, R0, V1, V1_)	// Load 16 bytes @R7 into V1

compare:
	VAND	V1, SEPMASK, V2		// Mask out sep size
	VCMPEQUBCC V0, V2, V3		// Compare masked string
	BLT	CR6, found		// All equal
	ADD	$1, R7			// Update ptr to next byte
	CMP	R7, LASTSTR		// Still less than last start byte
	BGT	notfound		// Not found
	ADD	$16, R7, R9		// Verify remaining bytes
	CMP	R9, LASTBYTE		// More than 16 left?
	BLT	index2to16loop		// Try again

	// At most 16 bytes remaining in string
	// Separator >= 2
index2to16tail:
	VSPLTISB $0, V25		// Zero fill for the VSLDOI in index2to16next, on both load paths
	ADD	R3, R4, R9		// End of string
	SUB	R7, R9, R9		// Number of bytes left
	ANDCC	$15, R7, R10		// 16 byte offset
	ADD	R10, R9, R11		// offset + len
	CMP	R11, $16		// > 16?
	BLE	short			// Does not cross 16 bytes
	VLOADSWAP(R7, R0, V1, V1_)	// Load 16 bytes @R7 into V1
	BR	index2to16next		// Continue on

short:
	RLDICR	$0, R7, $59, R9		// Adjust addr to 16 byte container
	VLOADSWAP(R9, R0, V1, V1_)	// Load 16 bytes @R9 into V1
	SLD	$3, R10			// Set up shift
	MTVSRD	R10, V8_		// Set up shift
	VSLDOI	$8, V8, V8, V8
	VSLO	V1, V8, V1		// Shift by start byte

index2to16next:
	VAND	V1, SEPMASK, V2		// Just compare size of sep
	VCMPEQUBCC V0, V2, V3		// Compare sep and partial string
	BLT	CR6, found		// Found
	ADD	$1, R7			// Not found, try next partial string
	CMP	R7, LASTSTR		// Check for end of string
	BGT	notfound		// If at end, then not found
	VSLDOI	$1, V1, V25, V1		// Shift string left by 1 byte
	BR	index2to16next		// Check the next partial string

index17plus:
	CMP	R6, $32			// Check if 16 < len(sep) <= 32
	BGT	index33plus
	SUB	$16, R6, R9		// Extra > 16
	// NOTE(review): every other VSLO count in this file is built with
	// MTVSRD followed by VSLDOI $8; here the count is placed with SLD $56
	// and no VSLDOI. The same V9 is applied to both V1 and V3, so the two
	// sides stay comparable, but confirm which shift is actually intended.
	SLD	$56, R9, R10		// Shift to use in VSLO
	MTVSRD	R10, V9_		// Set up for VSLO
	VLOADSWAP(R5, R9, V1, V1_)	// Load 16 bytes @R5+R9 into V1
	VSLO	V1, V9, V1		// Shift left
	VSPLTISB $0xff, V7		// Splat 1s

index17to32loop:
	VLOADSWAP(R7, R0, V2, V2_)	// Load 16 bytes @R7 into V2

next17:
	VLOADSWAP(R7, R9, V3, V3_)	// Load 16 bytes @R7+R9 into V3
	VSLO	V3, V9, V3		// Shift left
	VCMPEQUB V0, V2, V4		// Compare first 16 bytes
	VCMPEQUB V1, V3, V5		// Compare extra over 16 bytes
	VAND	V4, V5, V6		// Check if both equal
	VCMPEQUBCC V6, V7, V8		// All equal?
	BLT	CR6, found		// Yes
	ADD	$1, R7			// On to next byte
	CMP	R7, LASTSTR		// Check if last start byte
	BGT	notfound		// If too high, not found
	BR	index17to32loop		// Continue

notfound:
#ifdef GOEXPERIMENT_regabiargs
	MOVD	$-1, R3			// Return -1 if not found
#else
	MOVD	$-1, R8			// Return -1 if not found
	MOVD	R8, (R14)
#endif
	RET

index33plus:
	MOVD	$0, (R0)		// Case not implemented
	RET				// Crash before return

foundR25:
	SRD	$3, R25			// Convert from bits to bytes
	ADD	R25, R7			// Add to current string address
	SUB	R3, R7			// Subtract from start of string
#ifdef GOEXPERIMENT_regabiargs
	MOVD	R7, R3			// Return byte where found
#else
	MOVD	R7, (R14)		// Return byte where found
#endif
	RET

found:
	SUB	R3, R7			// Return byte where found
#ifdef GOEXPERIMENT_regabiargs
	MOVD	R7, R3
#else
	MOVD	R7, (R14)
#endif
	RET

// indexbodyp9<> is the same search as indexbody<>, using LXVB16X for the
// 16-byte loads (no VLOADSWAP/SWAP setup), MFVSRLD to read the second
// doubleword, and MTVSRWS to build the word masks.
//
// In:  R3=&s[0], R4=len(s), R5=&sep[0], R6=len(sep);
//      R14=&ret when GOEXPERIMENT_regabiargs is not defined.
// Out: result in R3 (GOEXPERIMENT_regabiargs) or stored at (R14).
//
// Returns -1 when len(sep) > len(s) or len(sep) == 0. len(sep) > 32 is not
// implemented and deliberately faults at index33plus.
TEXT indexbodyp9<>(SB), NOSPLIT|NOFRAME, $0
	CMP	R6, R4			// Compare lengths
	BGT	notfound		// If sep len is > string, notfound
	ADD	R4, R3, LASTBYTE	// LASTBYTE=&s[len(s)] (one past the final byte)
	SUB	R6, LASTBYTE, LASTSTR	// LASTSTR=&s[len(s)-len(sep)] (last valid start index)
	CMP	R6, $0			// Check sep len
	BEQ	notfound		// sep len 0 -- not found
	MOVD	R3, R7			// Copy of string addr
	MOVD	$16, R16		// Index value 16
	MOVD	$17, R17		// Index value 17 (not read in this routine)
	MOVD	$18, R18		// Index value 18 (not read in this routine)
	MOVD	$1, R19			// Index value 1
	VSPLTISB $0xFF, ONES		// splat all 1s

	CMP	R6, $16, CR4		// CR4 for len(sep) >= 16
	VOR	ONES, ONES, SEPMASK	// Set up full SEPMASK
	BGE	CR4, loadge16		// Load for len(sep) >= 16
	SUB	R6, R16, R9		// 16-len of sep
	SLD	$3, R9			// Set up for VSLO
	MTVSRD	R9, V9_			// Set up for VSLO
	VSLDOI	$8, V9, V9, V9		// Set up for VSLO
	VSLO	ONES, V9, SEPMASK	// Mask for separator len(sep) < 16

loadge16:
	ANDCC	$15, R5, R9		// Find byte offset of sep
	ADD	R9, R6, R10		// Add sep len
	CMP	R10, $16		// Check if sep len+offset > 16
	BGT	sepcross16		// Sep crosses 16 byte boundary

	RLDICR	$0, R5, $59, R8		// Adjust addr to 16 byte container
	LXVB16X	(R8)(R0), V0_		// Load 16 bytes @R8 into V0
	SLD	$3, R9			// Set up shift count for VSLO
	MTVSRD	R9, V8_			// Set up shift count for VSLO
	VSLDOI	$8, V8, V8, V8
	VSLO	V0, V8, V0		// Shift by start byte

	VAND	V0, SEPMASK, V0		// Mask separator (< 16)
	BR	index2plus

sepcross16:
	LXVB16X	(R5)(R0), V0_		// Load 16 bytes @R5 into V0

	VAND	V0, SEPMASK, V0		// mask out separator
	BLE	CR4, index2to16		// len(sep) <= 16: generic masked compare
	BR	index17plus		// Handle sep > 16

index2plus:
	CMP	R6, $2			// Check length of sep
	BNE	index3plus		// If not 2, check for 3
	ADD	$16, R7, R9		// Check if next 16 bytes past last
	CMP	R9, LASTBYTE		// compare with last
	BGE	index2to16		// 2 <= len(string) <= 16
	MOVD	$0xff00, R21		// Mask for later
	MTVSRD	R21, V25		// Move to Vreg
	VSPLTH	$3, V25, V31		// Splat mask
	VSPLTH	$0, V0, V1		// Splat 1st 2 bytes of sep
	VSPLTISB $0, V10		// Clear V10

	// First case: 2 byte separator
	// V1: 2 byte separator splatted
	// V2: 16 bytes at addr
	// V3: 16 bytes at addr+1
	// Compare 2 byte separator at start
	// and at start+1. Use VSEL to combine
	// those results to find the first
	// matching start byte, returning
	// that value when found. Loop as
	// long as len(string) > 16
index2loop2:
	LXVB16X	(R7)(R19), V3_		// Load 16 bytes @R7+1 into V3

index2loop:
	LXVB16X	(R7)(R0), V2_		// Load 16 bytes @R7 into V2
	VCMPEQUH V1, V2, V5		// Search for sep
	VCMPEQUH V1, V3, V6		// Search for sep offset by 1
	VSEL	V6, V5, V31, V7		// merge even and odd indices
	VCLZD	V7, V18			// find index of first match
	MFVSRD	V18, R25		// get first value
	CMP	R25, $64		// Found if < 64
	BLT	foundR25		// Return byte index where found

	MFVSRLD	V18, R25		// get second value
	CMP	R25, $64		// Found if < 64
	ADD	$64, R25		// Update byte offset (does not touch CR0)
	BLT	foundR25		// Return value
	ADD	$16, R7			// R7+=16 Update string pointer
	ADD	$17, R7, R9		// R9=R7+17 since loop unrolled
	CMP	R9, LASTBYTE		// Compare addr+17 against last byte
	BLT	index2loop2		// If < last, continue loop
	CMP	R7, LASTBYTE		// Compare updated addr against end of string
	BLT	index2to16		// If < 16 handle specially
	LXVB16X	(R7)(R0), V3_		// Load 16 bytes @R7 into V3
	VSLDOI	$1, V3, V10, V3		// Shift left by 1 byte
	BR	index2loop

index3plus:
	CMP	R6, $3			// Check if sep == 3
	BNE	index4plus		// If not check larger
	ADD	$19, R7, R9		// Find bytes for use in this loop
	CMP	R9, LASTBYTE		// Compare against last byte
	BGE	index2to16		// Remaining string 2<=len<=16
	MOVD	$0xff00, R21		// Set up mask for upcoming loop
	MTVSRD	R21, V25		// Move mask to Vreg
	VSPLTH	$3, V25, V31		// Splat mask
	VSPLTH	$0, V0, V1		// Splat 1st two bytes of sep
	VSPLTB	$2, V0, V8		// Splat 3rd byte of sep

	// Loop to process 3 byte separator.
	// string[0:16] is in V2
	// string[2:18] is loaded into V3, then
	// left justified so V3 holds string[16:18]
	// sep[0:2] splatted in V1
	// sep[2] splatted in V8
	// Build vectors at string, string+1
	// and string+2. Compare string, string+1
	// against first 2 bytes of separator
	// splatted, and string+2 against 3rd
	// byte splatted. Merge the results with
	// VSEL to find the first byte of a match.

	// Special handling for last 16 bytes if the
	// string fits in 16 byte multiple.
index3loop2:
	MOVD	$2, R21			// Set up index for 2
	VSPLTISB $0, V10		// Clear V10
	LXVB16X	(R7)(R21), V3_		// Load 16 bytes @R7+2 into V3
	VSLDOI	$14, V3, V10, V3	// Left justify next 2 bytes

index3loop:
	LXVB16X	(R7)(R0), V2_		// Load 16 bytes @R7
	VSLDOI	$1, V2, V3, V4		// string[1:17]
	VSLDOI	$2, V2, V3, V9		// string[2:18]
	VCMPEQUH V1, V2, V5		// compare hw even indices
	VCMPEQUH V1, V4, V6		// compare hw odd indices
	VCMPEQUB V8, V9, V10		// compare 3rd to last byte
	VSEL	V6, V5, V31, V7		// Find 1st matching byte using mask
	VAND	V7, V10, V7		// AND matched bytes with matched 3rd byte
	VCLZD	V7, V18			// Find first nonzero indexes
	MFVSRD	V18, R25		// Move 1st doubleword
	CMP	R25, $64		// If < 64 found
	BLT	foundR25		// Return matching index

	MFVSRLD	V18, R25		// Move 2nd doubleword
	CMP	R25, $64		// If < 64 found
	ADD	$64, R25		// Update byte index (does not touch CR0)
	BLT	foundR25		// Return matching index
	ADD	$16, R7			// R7+=16 string ptr
	ADD	$19, R7, R9		// Number of string bytes for loop
	CMP	R9, LASTBYTE		// Compare against last byte of string
	BLT	index3loop2		// If within, continue this loop
	CMP	R7, LASTSTR		// Compare against last start byte
	BLT	index2to16		// Process remainder
	VSPLTISB $0, V3			// Special case for last 16 bytes
	BR	index3loop		// Continue this loop

	// Loop to process 4 byte separator
	// string[0:16] in V2
	// string[3:19] is loaded into V3, then
	// left justified so V3 holds string[16:19]
	// sep[0:4] splatted in V1
	// Set up vectors with strings at offsets
	// 0, 1, 2, 3 and compare against the 4 byte
	// separator also splatted. Use VSEL with the
	// compare results to find the first byte where
	// a separator match is found.
index4plus:
	CMP	R6, $4			// Check if 4 byte separator
	BNE	index5plus		// If not next higher
	ADD	$20, R7, R9		// Check string size to load
	CMP	R9, LASTBYTE		// Verify string length
	BGE	index2to16		// If not large enough, process remaining

	// Set up masks for use with VSEL
	MOVD	$0xff, R21		// Set up mask 0xff000000ff000000...
	SLD	$24, R21
	MTVSRWS	R21, V29

	VSLDOI	$2, V29, V29, V30	// Mask 0x0000ff000000ff00...
	MOVD	$0xffff, R21
	SLD	$16, R21
	MTVSRWS	R21, V31		// Mask 0xffff0000ffff0000...

	VSPLTW	$0, V0, V1		// Splat 1st word of separator

index4loop:
	LXVB16X	(R7)(R0), V2_		// Load 16 bytes @R7 into V2

next4:
	VSPLTISB $0, V10		// Clear
	MOVD	$3, R9			// Number of bytes beyond 16
	LXVB16X	(R7)(R9), V3_		// Load 16 bytes @R7+3 into V3
	VSLDOI	$13, V3, V10, V3	// Shift left last 3 bytes
	VSLDOI	$1, V2, V3, V4		// V4=(V2:V3)<<1
	VSLDOI	$2, V2, V3, V9		// V9=(V2:V3)<<2
	VSLDOI	$3, V2, V3, V10		// V10=(V2:V3)<<3
	VCMPEQUW V1, V2, V5		// compare index 0, 4, ... with sep
	VCMPEQUW V1, V4, V6		// compare index 1, 5, ... with sep
	VCMPEQUW V1, V9, V11		// compare index 2, 6, ... with sep
	VCMPEQUW V1, V10, V12		// compare index 3, 7, ... with sep
	VSEL	V6, V5, V29, V13	// merge index 0, 1, 4, 5, using mask
	VSEL	V12, V11, V30, V14	// merge index 2, 3, 6, 7, using mask
	VSEL	V14, V13, V31, V7	// final merge
	VCLZD	V7, V18			// Find first index for each half
	MFVSRD	V18, R25		// Isolate value
	CMP	R25, $64		// If < 64, found
	BLT	foundR25		// Return found index

	MFVSRLD	V18, R25		// Isolate other value
	CMP	R25, $64		// If < 64, found
	ADD	$64, R25		// Update index for high doubleword (does not touch CR0)
	BLT	foundR25		// Return found index
	ADD	$16, R7			// R7+=16 for next string
	ADD	$20, R7, R9		// R7+20 for all bytes to load
	CMP	R9, LASTBYTE		// Past end? Maybe check for extra?
	BLT	index4loop		// If not, continue loop
	CMP	R7, LASTSTR		// Check remainder
	BLE	index2to16		// Process remainder
	BR	notfound		// Not found

index5plus:
	CMP	R6, $16			// Check for sep > 16
	BGT	index17plus		// Handle large sep

	// Assumption is that the separator is smaller than the string at this point
index2to16:
	CMP	R7, LASTSTR		// Compare last start byte
	BGT	notfound		// last takes len(sep) into account

	ADD	$16, R7, R9		// Check for last byte of string
	CMP	R9, LASTBYTE
	BGT	index2to16tail

	// At least 16 bytes of string left
	// Mask the number of bytes in sep
index2to16loop:
	LXVB16X	(R7)(R0), V1_		// Load 16 bytes @R7 into V1

compare:
	VAND	V1, SEPMASK, V2		// Mask out sep size
	VCMPEQUBCC V0, V2, V3		// Compare masked string
	BLT	CR6, found		// All equal
	ADD	$1, R7			// Update ptr to next byte
	CMP	R7, LASTSTR		// Still less than last start byte
	BGT	notfound		// Not found
	ADD	$16, R7, R9		// Verify remaining bytes
	CMP	R9, LASTBYTE		// More than 16 left?
	BLT	index2to16loop		// Try again

	// At most 16 bytes remaining in string
	// Separator >= 2
index2to16tail:
	VSPLTISB $0, V25		// Zero fill for the VSLDOI in index2to16next, on both load paths
	ADD	R3, R4, R9		// End of string
	SUB	R7, R9, R9		// Number of bytes left
	ANDCC	$15, R7, R10		// 16 byte offset
	ADD	R10, R9, R11		// offset + len
	CMP	R11, $16		// > 16?
	BLE	short			// Does not cross 16 bytes
	LXVB16X	(R7)(R0), V1_		// Load 16 bytes @R7 into V1
	BR	index2to16next		// Continue on

short:
	RLDICR	$0, R7, $59, R9		// Adjust addr to 16 byte container
	LXVB16X	(R9)(R0), V1_		// Load 16 bytes @R9 into V1
	SLD	$3, R10			// Set up shift
	MTVSRD	R10, V8_		// Set up shift
	VSLDOI	$8, V8, V8, V8
	VSLO	V1, V8, V1		// Shift by start byte

index2to16next:
	VAND	V1, SEPMASK, V2		// Just compare size of sep
	VCMPEQUBCC V0, V2, V3		// Compare sep and partial string
	BLT	CR6, found		// Found
	ADD	$1, R7			// Not found, try next partial string
	CMP	R7, LASTSTR		// Check for end of string
	BGT	notfound		// If at end, then not found
	VSLDOI	$1, V1, V25, V1		// Shift string left by 1 byte
	BR	index2to16next		// Check the next partial string

index17plus:
	CMP	R6, $32			// Check if 16 < len(sep) <= 32
	BGT	index33plus
	SUB	$16, R6, R9		// Extra > 16
	// NOTE(review): every other VSLO count in this file is built with
	// MTVSRD followed by VSLDOI $8; here the count is placed with SLD $56
	// and no VSLDOI. The same V9 is applied to both V1 and V3, so the two
	// sides stay comparable, but confirm which shift is actually intended.
	SLD	$56, R9, R10		// Shift to use in VSLO
	MTVSRD	R10, V9_		// Set up for VSLO
	LXVB16X	(R5)(R9), V1_		// Load 16 bytes @R5+R9 into V1
	VSLO	V1, V9, V1		// Shift left
	VSPLTISB $0xff, V7		// Splat 1s

index17to32loop:
	LXVB16X	(R7)(R0), V2_		// Load 16 bytes @R7 into V2

next17:
	LXVB16X	(R7)(R9), V3_		// Load 16 bytes @R7+R9 into V3
	VSLO	V3, V9, V3		// Shift left
	VCMPEQUB V0, V2, V4		// Compare first 16 bytes
	VCMPEQUB V1, V3, V5		// Compare extra over 16 bytes
	VAND	V4, V5, V6		// Check if both equal
	VCMPEQUBCC V6, V7, V8		// All equal?
	BLT	CR6, found		// Yes
	ADD	$1, R7			// On to next byte
	CMP	R7, LASTSTR		// Check if last start byte
	BGT	notfound		// If too high, not found
	BR	index17to32loop		// Continue

notfound:
#ifdef GOEXPERIMENT_regabiargs
	MOVD	$-1, R3			// Return -1 if not found
#else
	MOVD	$-1, R8			// Return -1 if not found
	MOVD	R8, (R14)
#endif
	RET

index33plus:
	MOVD	$0, (R0)		// Case not implemented
	RET				// Crash before return

foundR25:
	SRD	$3, R25			// Convert from bits to bytes
	ADD	R25, R7			// Add to current string address
	SUB	R3, R7			// Subtract from start of string
#ifdef GOEXPERIMENT_regabiargs
	MOVD	R7, R3			// Return byte where found
#else
	MOVD	R7, (R14)		// Return byte where found
#endif
	RET

found:
	SUB	R3, R7			// Return byte where found
#ifdef GOEXPERIMENT_regabiargs
	MOVD	R7, R3
#else
	MOVD	R7, (R14)
#endif
	RET