// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Origin: github.com/hxx258456/ccgo@v0.0.5-0.20230213014102-48b35f46f66f/internal/bytealg/index_ppc64x.s

// This is an implementation based on the s390x
// implementation.

// Find a separator with 2 <= len <= 32 within a string.
// Separators with lengths of 2, 3 or 4 are handled
// specially.

// This works on power8 and above. The loads and
// compares are done in big endian order
// since that allows the use of VCLZD, and allows
// the same implementation to work on big and little
// endian platforms with minimal conditional changes.

// NOTE: There is a power9 implementation that
// improves performance by 10-15% on little
// endian for some of the benchmarks, but
// work is still needed for a big endian
// implementation on power9.

//go:build ppc64 || ppc64le
// +build ppc64 ppc64le

#include "go_asm.h"
#include "textflag.h"

// Needed to swap LXVD2X loads to the correct
// byte order to work on POWER8.

#ifdef GOARCH_ppc64
DATA byteswap<>+0(SB)/8, $0x0001020304050607
DATA byteswap<>+8(SB)/8, $0x08090a0b0c0d0e0f
#else
DATA byteswap<>+0(SB)/8, $0x0706050403020100
DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
#endif

// Load bytes in big endian order. Address
// alignment does not need checking.
// vreg and vsreg must name the same physical register
// (Vn and its VSR alias, see the Vn_ defines below), and
// SWAP must already hold the byteswap<> permute control.
#define VLOADSWAP(base, index, vreg, vsreg) \
	LXVD2X (base)(index), vsreg;  \
	VPERM  vreg, vreg, SWAP, vreg

GLOBL byteswap<>+0(SB), RODATA, $16

// Index entry point: the operands are read from a_base+0, a_len+8,
// b_base+24 and b_len+32, and the result is written through R14 to
// ret+48(FP).
// Loads the arguments into the registers expected by the index bodies
// and tail-branches to the POWER9 body when running on ppc64le with
// internal/cpu reporting HasPOWER9, otherwise to the POWER8 body.
TEXT ·Index(SB), NOSPLIT|NOFRAME, $0-56
	MOVD a_base+0(FP), R3  // R3 = byte array pointer
	MOVD a_len+8(FP), R4   // R4 = length
	MOVD b_base+24(FP), R5 // R5 = separator pointer
	MOVD b_len+32(FP), R6  // R6 = separator length
	MOVD $ret+48(FP), R14  // R14 = &ret

#ifdef GOARCH_ppc64le
	MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7
	CMP   R7, $1
	BNE   power8
	BR    indexbodyp9<>(SB)

#endif
power8:
	BR indexbody<>(SB)

// IndexString entry point: the operands are read from a_base+0,
// a_len+8, b_base+16 and b_len+24, and the result is written through
// R14 to ret+32(FP).
// Same dispatch as ·Index: POWER9 body on ppc64le when HasPOWER9 is
// set, POWER8 body otherwise.
TEXT ·IndexString(SB), NOSPLIT|NOFRAME, $0-40
	MOVD a_base+0(FP), R3  // R3 = string
	MOVD a_len+8(FP), R4   // R4 = length
	MOVD b_base+16(FP), R5 // R5 = separator pointer
	MOVD b_len+24(FP), R6  // R6 = separator length
	MOVD $ret+32(FP), R14  // R14 = &ret

#ifdef GOARCH_ppc64le
	MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7
	CMP   R7, $1
	BNE   power8
	BR    indexbodyp9<>(SB) // POWER9 detected: use the POWER9 body, as ·Index does

#endif
power8:
	BR indexbody<>(SB)

// s: string we are searching
// sep: string to search for
// R3=&s[0], R4=len(s)
// R5=&sep[0], R6=len(sep)
// R14=&ret (index where sep found)
// R7=working addr of string
// R16=index value 16
// R17=index value 17
// R18=index value 18
// R19=index value 1
// R26=LASTBYTE of string
// R27=LASTSTR last start byte to compare with sep
// R8, R9 scratch
// V0=sep left justified zero fill
// CR4=sep length >= 16

#define SEPMASK V17
#define LASTBYTE R26
#define LASTSTR R27
#define ONES V20
#define SWAP V21
#define V0_ VS32
#define V1_ VS33
#define V2_ VS34
#define V3_ VS35
#define V4_ VS36
#define V5_ VS37
#define V6_ VS38
#define V7_ VS39
#define V8_ VS40
#define V9_ VS41
#define SWAP_ VS53

// indexbody is the POWER8 search body shared by ·Index and ·IndexString.
// In:  R3=&s[0], R4=len(s), R5=&sep[0], R6=len(sep), R14=&ret.
// Out: *R14 = byte index of the first match, or -1 if there is none
//      (also -1 when len(sep) == 0 or len(sep) > len(s)).
// len(sep) > 32 is not implemented: that path stores to address 0 to
// force a crash, so callers are presumably expected to keep
// len(sep) <= 32 -- TODO confirm against the Go-side MaxLen.
TEXT indexbody<>(SB), NOSPLIT|NOFRAME, $0
	CMP      R6, R4                // Compare lengths
	BGT      notfound              // If sep len is > string, notfound
	ADD      R4, R3, LASTBYTE      // find last byte addr
	SUB      R6, LASTBYTE, LASTSTR // LAST=&s[len(s)-len(sep)] (last valid start index)
	CMP      R6, $0                // Check sep len
	BEQ      notfound              // sep len 0 -- not found
	MOVD     R3, R7                // Copy of string addr
	MOVD     $16, R16              // Index value 16
	MOVD     $17, R17              // Index value 17
	MOVD     $18, R18              // Index value 18
	MOVD     $1, R19               // Index value 1
	MOVD     $byteswap<>+00(SB), R8
	VSPLTISB $0xFF, ONES           // splat all 1s
	LXVD2X   (R8)(R0), SWAP_       // Set up swap string

	CMP    R6, $16, CR4        // CR4 for len(sep) >= 16
	VOR    ONES, ONES, SEPMASK // Set up full SEPMASK
	BGE    CR4, loadge16       // Load for len(sep) >= 16
	SUB    R6, R16, R9         // 16-len of sep
	SLD    $3, R9              // Set up for VSLO
	MTVSRD R9, V9_             // Set up for VSLO
	VSLDOI $8, V9, V9, V9      // Set up for VSLO
	VSLO   ONES, V9, SEPMASK   // Mask for separator len(sep) < 16

loadge16:
	ANDCC $15, R5, R9 // Find byte offset of sep
	ADD   R9, R6, R10 // Add sep len
	CMP   R10, $16    // Check if sep len+offset >= 16
	BGE   sepcross16  // Sep crosses 16 byte boundary

	RLDICR $0, R5, $59, R8     // Adjust addr to 16 byte container
	VLOADSWAP(R8, R0, V0, V0_) // Load 16 bytes @R8 into V0
	SLD    $3, R9              // Set up shift count for VSLO
	MTVSRD R9, V8_             // Set up shift count for VSLO
	VSLDOI $8, V8, V8, V8
	VSLO   V0, V8, V0          // Shift by start byte

	VAND V0, SEPMASK, V0 // Mask separator (< 16)
	BR   index2plus

sepcross16:
	VLOADSWAP(R5, R0, V0, V0_) // Load 16 bytes @R5 into V0

	VAND V0, SEPMASK, V0 // mask out separator
	BLE  CR4, index2to16
	BR   index17plus     // Handle sep > 16

index2plus:
	CMP      R6, $2       // Check length of sep
	BNE      index3plus   // If not 2, check for 3
	ADD      $16, R7, R9  // Check if next 16 bytes past last
	CMP      R9, LASTBYTE // compare with last
	BGE      index2to16   // 2 <= len(string) <= 16
	MOVD     $0xff00, R21 // Mask for later
	MTVSRD   R21, V25     // Move to Vreg
	VSPLTH   $3, V25, V31 // Splat mask
	VSPLTH   $0, V0, V1   // Splat 1st 2 bytes of sep
	VSPLTISB $0, V10      // Clear V10

	// First case: 2 byte separator
	// V1: 2 byte separator splatted
	// V2: 16 bytes at addr
	// V3: 16 bytes at addr+1
	// Compare 2 byte separator at start
	// and at start+1. Use VSEL to combine
	// those results to find the first
	// matching start byte, returning
	// that value when found. Loop as
	// long as len(string) > 16
index2loop2:
	VLOADSWAP(R7, R19, V3, V3_) // Load 16 bytes @R7+1 into V3

index2loop:
	VLOADSWAP(R7, R0, V2, V2_) // Load 16 bytes @R7 into V2
	VCMPEQUH V1, V2, V5        // Search for sep
	VCMPEQUH V1, V3, V6        // Search for sep offset by 1
	VSEL     V6, V5, V31, V7   // merge even and odd indices
	VCLZD    V7, V18           // find index of first match
	MFVSRD   V18, R25          // get first value
	CMP      R25, $64          // Found if < 64
	BLT      foundR25          // Return byte index where found
	VSLDOI   $8, V18, V18, V18 // Adjust 2nd value
	MFVSRD   V18, R25          // get second value
	CMP      R25, $64          // Found if < 64
	ADD      $64, R25          // Update byte offset
	BLT      foundR25          // Return value
	ADD      $16, R7           // R7+=16 Update string pointer
	ADD      $17, R7, R9       // R9=R7+17 since loop unrolled
	CMP      R9, LASTBYTE      // Compare addr+17 against last byte
	BLT      index2loop2       // If < last, continue loop
	CMP      R7, LASTBYTE      // Compare addr+16 against last byte
	BLT      index2to16        // If < 16 handle specially
	VLOADSWAP(R7, R0, V3, V3_) // Load 16 bytes @R7 into V3
	VSLDOI   $1, V3, V10, V3   // Shift left by 1 byte
	BR       index2loop

index3plus:
	CMP    R6, $3       // Check if sep == 3
	BNE    index4plus   // If not check larger
	ADD    $19, R7, R9  // Find bytes for use in this loop
	CMP    R9, LASTBYTE // Compare against last byte
	BGE    index2to16   // Remaining string 2<=len<=16
	MOVD   $0xff00, R21 // Set up mask for upcoming loop
	MTVSRD R21, V25     // Move mask to Vreg
	VSPLTH $3, V25, V31 // Splat mask
	VSPLTH $0, V0, V1   // Splat 1st two bytes of sep
	VSPLTB $2, V0, V8   // Splat 3rd byte of sep

	// Loop to process 3 byte separator.
	// string[0:16] is in V2
	// string[16:18] is left justified in V3
	// sep[0:2] splatted in V1
	// sep[2] splatted in V8
	// Build vectors at string, string+1
	// and string+2. Compare string, string+1
	// against first 2 bytes of separator
	// splatted, and string+2 against 3rd
	// byte splatted. Merge the results with
	// VSEL to find the first byte of a match.

	// Special handling for last 16 bytes if the
	// string fits in 16 byte multiple.
index3loop2:
	MOVD     $2, R21            // Set up index for 2
	VSPLTISB $0, V10            // Clear V10
	VLOADSWAP(R7, R21, V3, V3_) // Load 16 bytes @R7+2 into V3
	VSLDOI   $14, V3, V10, V3   // Left justify next 2 bytes

index3loop:
	VLOADSWAP(R7, R0, V2, V2_) // Load with correct order
	VSLDOI   $1, V2, V3, V4    // string[1:17]
	VSLDOI   $2, V2, V3, V9    // string[2:18]
	VCMPEQUH V1, V2, V5        // compare hw even indices
	VCMPEQUH V1, V4, V6        // compare hw odd indices
	VCMPEQUB V8, V9, V10       // compare 3rd to last byte
	VSEL     V6, V5, V31, V7   // Find 1st matching byte using mask
	VAND     V7, V10, V7       // AND matched bytes with matched 3rd byte
	VCLZD    V7, V18           // Find first nonzero indexes
	MFVSRD   V18, R25          // Move 1st doubleword
	CMP      R25, $64          // If < 64 found
	BLT      foundR25          // Return matching index
	VSLDOI   $8, V18, V18, V18 // Move value
	MFVSRD   V18, R25          // Move 2nd doubleword
	CMP      R25, $64          // If < 64 found
	ADD      $64, R25          // Update byte index
	BLT      foundR25          // Return matching index
	ADD      $16, R7           // R7+=16 string ptr
	ADD      $19, R7, R9       // Number of string bytes for loop
	CMP      R9, LASTBYTE      // Compare against last byte of string
	BLT      index3loop2       // If within, continue this loop
	CMP      R7, LASTSTR       // Compare against last start byte
	BLT      index2to16        // Process remainder
	VSPLTISB $0, V3            // Special case for last 16 bytes
	BR       index3loop        // Continue this loop

	// Loop to process 4 byte separator
	// string[0:16] in V2
	// string[16:19] left justified in V3
	// sep[0:4] splatted in V1
	// Set up vectors with strings at offsets
	// 0, 1, 2, 3 and compare against the 4 byte
	// separator also splatted. Use VSEL with the
	// compare results to find the first byte where
	// a separator match is found.
index4plus:
	CMP  R6, $4       // Check if 4 byte separator
	BNE  index5plus   // If not next higher
	ADD  $20, R7, R9  // Check string size to load
	CMP  R9, LASTBYTE // Verify string length
	BGE  index2to16   // If not large enough, process remaining
	MOVD $2, R15      // Set up index

	// Set up masks for use with VSEL
	MOVD   $0xff, R21        // Set up mask 0xff000000ff000000...
	SLD    $24, R21
	MTVSRD R21, V10
	VSPLTW $1, V10, V29
	VSLDOI $2, V29, V29, V30 // Mask 0x0000ff000000ff00...
	MOVD   $0xffff, R21
	SLD    $16, R21
	MTVSRD R21, V10
	VSPLTW $1, V10, V31      // Mask 0xffff0000ffff0000...
	VSPLTW $0, V0, V1        // Splat 1st word of separator

index4loop:
	VLOADSWAP(R7, R0, V2, V2_) // Load 16 bytes @R7 into V2

next4:
	VSPLTISB $0, V10            // Clear
	MOVD     $3, R9             // Number of bytes beyond 16
	VLOADSWAP(R7, R9, V3, V3_)  // Load 16 bytes @R7+3 into V3
	VSLDOI   $13, V3, V10, V3   // Shift left last 3 bytes
	VSLDOI   $1, V2, V3, V4     // V4=(V2:V3)<<1
	VSLDOI   $2, V2, V3, V9     // V9=(V2:V3)<<2
	VSLDOI   $3, V2, V3, V10    // V10=(V2:V3)<<3
	VCMPEQUW V1, V2, V5         // compare index 0, 4, ... with sep
	VCMPEQUW V1, V4, V6         // compare index 1, 5, ... with sep
	VCMPEQUW V1, V9, V11        // compare index 2, 6, ... with sep
	VCMPEQUW V1, V10, V12       // compare index 3, 7, ... with sep
	VSEL     V6, V5, V29, V13   // merge index 0, 1, 4, 5, using mask
	VSEL     V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask
	VSEL     V14, V13, V31, V7  // final merge
	VCLZD    V7, V18            // Find first index for each half
	MFVSRD   V18, R25           // Isolate value
	CMP      R25, $64           // If < 64, found
	BLT      foundR25           // Return found index
	VSLDOI   $8, V18, V18, V18  // Move for MFVSRD
	MFVSRD   V18, R25           // Isolate other value
	CMP      R25, $64           // If < 64, found
	ADD      $64, R25           // Update index for high doubleword
	BLT      foundR25           // Return found index
	ADD      $16, R7            // R7+=16 for next string
	ADD      $20, R7, R9        // R7+20 for all bytes to load
	CMP      R9, LASTBYTE       // Past end? Maybe check for extra?
	BLT      index4loop         // If not, continue loop
	CMP      R7, LASTSTR        // Check remainder
	BLE      index2to16         // Process remainder
	BR       notfound           // Not found

index5plus:
	CMP R6, $16     // Check for sep > 16
	BGT index17plus // Handle large sep

	// Assumption is that the separator is smaller than the string at this point
index2to16:
	CMP R7, LASTSTR // Compare last start byte
	BGT notfound    // last takes len(sep) into account

	ADD $16, R7, R9    // Check for last byte of string
	CMP R9, LASTBYTE
	BGT index2to16tail

	// At least 16 bytes of string left
	// Mask the number of bytes in sep
index2to16loop:
	VLOADSWAP(R7, R0, V1, V1_) // Load 16 bytes @R7 into V1

compare:
	VAND       V1, SEPMASK, V2 // Mask out sep size
	VCMPEQUBCC V0, V2, V3      // Compare masked string
	BLT        CR6, found      // All equal
	ADD        $1, R7          // Update ptr to next byte
	CMP        R7, LASTSTR     // Still less than last start byte
	BGT        notfound        // Not found
	ADD        $16, R7, R9     // Verify remaining bytes
	CMP        R9, LASTBYTE    // At least 16
	BLT        index2to16loop  // Try again

	// Less than 16 bytes remaining in string
	// Separator >= 2
index2to16tail:
	ADD   R3, R4, R9           // End of string
	SUB   R7, R9, R9           // Number of bytes left
	ANDCC $15, R7, R10         // 16 byte offset
	ADD   R10, R9, R11         // offset + len
	CMP   R11, $16             // offset + len <= 16?
	BLE   short                // Does not cross 16 bytes
	VLOADSWAP(R7, R0, V1, V1_) // Load 16 bytes @R7 into V1
	BR    index2to16next       // Continue on

short:
	RLDICR   $0, R7, $59, R9   // Adjust addr to 16 byte container
	VLOADSWAP(R9, R0, V1, V1_) // Load 16 bytes @R9 into V1
	SLD      $3, R10           // Set up shift
	MTVSRD   R10, V8_          // Set up shift
	VSLDOI   $8, V8, V8, V8
	VSLO     V1, V8, V1        // Shift by start byte
	VSPLTISB $0, V25           // Clear for later use

index2to16next:
	VAND       V1, SEPMASK, V2 // Just compare size of sep
	VCMPEQUBCC V0, V2, V3      // Compare sep and partial string
	BLT        CR6, found      // Found
	ADD        $1, R7          // Not found, try next partial string
	CMP        R7, LASTSTR     // Check for end of string
	BGT        notfound        // If at end, then not found
	VSLDOI     $1, V1, V25, V1 // Shift string left by 1 byte
	BR         index2to16next  // Check the next partial string

index17plus:
	CMP      R6, $32           // Check if 16 < len(sep) <= 32
	BGT      index33plus
	SUB      $16, R6, R9       // Extra > 16
	SLD      $56, R9, R10      // Shift to use in VSLO
	MTVSRD   R10, V9_          // Set up for VSLO
	VLOADSWAP(R5, R9, V1, V1_) // Load 16 bytes @R5+R9 into V1
	VSLO     V1, V9, V1        // Shift left
	VSPLTISB $0xff, V7         // Splat 1s
	VSPLTISB $0, V27           // Splat 0

index17to32loop:
	VLOADSWAP(R7, R0, V2, V2_) // Load 16 bytes @R7 into V2

next17:
	VLOADSWAP(R7, R9, V3, V3_) // Load 16 bytes @R7+R9 into V3
	VSLO       V3, V9, V3      // Shift left
	VCMPEQUB   V0, V2, V4      // Compare first 16 bytes
	VCMPEQUB   V1, V3, V5      // Compare extra over 16 bytes
	VAND       V4, V5, V6      // Check if both equal
	VCMPEQUBCC V6, V7, V8      // All equal?
	BLT        CR6, found      // Yes
	ADD        $1, R7          // On to next byte
	CMP        R7, LASTSTR     // Check if last start byte
	BGT        notfound        // If too high, not found
	BR         index17to32loop // Continue

notfound:
	MOVD $-1, R8   // Return -1 if not found
	MOVD R8, (R14)
	RET

index33plus:
	MOVD $0, (R0) // Case not implemented
	RET           // Crash before return

foundR25:
	SRD  $3, R25   // Convert from bits to bytes
	ADD  R25, R7   // Add to current string address
	SUB  R3, R7    // Subtract from start of string
	MOVD R7, (R14) // Return byte where found
	RET

found:
	SUB  R3, R7    // Return byte where found
	MOVD R7, (R14)
	RET

// indexbodyp9 is the POWER9 search body (reached only on ppc64le when
// HasPOWER9 is set). It follows the same algorithm and register
// contract as indexbody, but loads with LXVB16X instead of the
// LXVD2X+VPERM VLOADSWAP macro, reads the second doubleword with
// MFVSRLD, and builds the 4-byte-separator masks with MTVSRWS.
// In:  R3=&s[0], R4=len(s), R5=&sep[0], R6=len(sep), R14=&ret.
// Out: *R14 = byte index of the first match, or -1 if there is none.
// len(sep) > 32 is not implemented and deliberately crashes.
TEXT indexbodyp9<>(SB), NOSPLIT|NOFRAME, $0
	CMP      R6, R4                // Compare lengths
	BGT      notfound              // If sep len is > string, notfound
	ADD      R4, R3, LASTBYTE      // find last byte addr
	SUB      R6, LASTBYTE, LASTSTR // LAST=&s[len(s)-len(sep)] (last valid start index)
	CMP      R6, $0                // Check sep len
	BEQ      notfound              // sep len 0 -- not found
	MOVD     R3, R7                // Copy of string addr
	MOVD     $16, R16              // Index value 16
	MOVD     $17, R17              // Index value 17
	MOVD     $18, R18              // Index value 18
	MOVD     $1, R19               // Index value 1
	VSPLTISB $0xFF, ONES           // splat all 1s

	CMP    R6, $16, CR4        // CR4 for len(sep) >= 16
	VOR    ONES, ONES, SEPMASK // Set up full SEPMASK
	BGE    CR4, loadge16       // Load for len(sep) >= 16
	SUB    R6, R16, R9         // 16-len of sep
	SLD    $3, R9              // Set up for VSLO
	MTVSRD R9, V9_             // Set up for VSLO
	VSLDOI $8, V9, V9, V9      // Set up for VSLO
	VSLO   ONES, V9, SEPMASK   // Mask for separator len(sep) < 16

loadge16:
	ANDCC $15, R5, R9 // Find byte offset of sep
	ADD   R9, R6, R10 // Add sep len
	CMP   R10, $16    // Check if sep len+offset >= 16
	BGE   sepcross16  // Sep crosses 16 byte boundary

	RLDICR  $0, R5, $59, R8 // Adjust addr to 16 byte container
	LXVB16X (R8)(R0), V0_   // Load 16 bytes @R8 into V0
	SLD     $3, R9          // Set up shift count for VSLO
	MTVSRD  R9, V8_         // Set up shift count for VSLO
	VSLDOI  $8, V8, V8, V8
	VSLO    V0, V8, V0      // Shift by start byte

	VAND V0, SEPMASK, V0 // Mask separator (< 16)
	BR   index2plus

sepcross16:
	LXVB16X (R5)(R0), V0_ // Load 16 bytes @R5 into V0

	VAND V0, SEPMASK, V0 // mask out separator
	BLE  CR4, index2to16
	BR   index17plus     // Handle sep > 16

index2plus:
	CMP      R6, $2       // Check length of sep
	BNE      index3plus   // If not 2, check for 3
	ADD      $16, R7, R9  // Check if next 16 bytes past last
	CMP      R9, LASTBYTE // compare with last
	BGE      index2to16   // 2 <= len(string) <= 16
	MOVD     $0xff00, R21 // Mask for later
	MTVSRD   R21, V25     // Move to Vreg
	VSPLTH   $3, V25, V31 // Splat mask
	VSPLTH   $0, V0, V1   // Splat 1st 2 bytes of sep
	VSPLTISB $0, V10      // Clear V10

	// First case: 2 byte separator
	// V1: 2 byte separator splatted
	// V2: 16 bytes at addr
	// V3: 16 bytes at addr+1
	// Compare 2 byte separator at start
	// and at start+1. Use VSEL to combine
	// those results to find the first
	// matching start byte, returning
	// that value when found. Loop as
	// long as len(string) > 16
index2loop2:
	LXVB16X (R7)(R19), V3_ // Load 16 bytes @R7+1 into V3

index2loop:
	LXVB16X  (R7)(R0), V2_   // Load 16 bytes @R7 into V2
	VCMPEQUH V1, V2, V5      // Search for sep
	VCMPEQUH V1, V3, V6      // Search for sep offset by 1
	VSEL     V6, V5, V31, V7 // merge even and odd indices
	VCLZD    V7, V18         // find index of first match
	MFVSRD   V18, R25        // get first value
	CMP      R25, $64        // Found if < 64
	BLT      foundR25        // Return byte index where found

	MFVSRLD V18, R25        // get second value
	CMP     R25, $64        // Found if < 64
	ADD     $64, R25        // Update byte offset
	BLT     foundR25        // Return value
	ADD     $16, R7         // R7+=16 Update string pointer
	ADD     $17, R7, R9     // R9=R7+17 since loop unrolled
	CMP     R9, LASTBYTE    // Compare addr+17 against last byte
	BLT     index2loop2     // If < last, continue loop
	CMP     R7, LASTBYTE    // Compare addr+16 against last byte
	BLT     index2to16      // If < 16 handle specially
	LXVB16X (R7)(R0), V3_   // Load 16 bytes @R7 into V3
	VSLDOI  $1, V3, V10, V3 // Shift left by 1 byte
	BR      index2loop

index3plus:
	CMP    R6, $3       // Check if sep == 3
	BNE    index4plus   // If not check larger
	ADD    $19, R7, R9  // Find bytes for use in this loop
	CMP    R9, LASTBYTE // Compare against last byte
	BGE    index2to16   // Remaining string 2<=len<=16
	MOVD   $0xff00, R21 // Set up mask for upcoming loop
	MTVSRD R21, V25     // Move mask to Vreg
	VSPLTH $3, V25, V31 // Splat mask
	VSPLTH $0, V0, V1   // Splat 1st two bytes of sep
	VSPLTB $2, V0, V8   // Splat 3rd byte of sep

	// Loop to process 3 byte separator.
	// string[0:16] is in V2
	// string[16:18] is left justified in V3
	// sep[0:2] splatted in V1
	// sep[2] splatted in V8
	// Build vectors at string, string+1
	// and string+2. Compare string, string+1
	// against first 2 bytes of separator
	// splatted, and string+2 against 3rd
	// byte splatted. Merge the results with
	// VSEL to find the first byte of a match.

	// Special handling for last 16 bytes if the
	// string fits in 16 byte multiple.
index3loop2:
	MOVD     $2, R21          // Set up index for 2
	VSPLTISB $0, V10          // Clear V10
	LXVB16X  (R7)(R21), V3_   // Load 16 bytes @R7+2 into V3
	VSLDOI   $14, V3, V10, V3 // Left justify next 2 bytes

index3loop:
	LXVB16X  (R7)(R0), V2_   // Load 16 bytes @R7
	VSLDOI   $1, V2, V3, V4  // string[1:17]
	VSLDOI   $2, V2, V3, V9  // string[2:18]
	VCMPEQUH V1, V2, V5      // compare hw even indices
	VCMPEQUH V1, V4, V6      // compare hw odd indices
	VCMPEQUB V8, V9, V10     // compare 3rd to last byte
	VSEL     V6, V5, V31, V7 // Find 1st matching byte using mask
	VAND     V7, V10, V7     // AND matched bytes with matched 3rd byte
	VCLZD    V7, V18         // Find first nonzero indexes
	MFVSRD   V18, R25        // Move 1st doubleword
	CMP      R25, $64        // If < 64 found
	BLT      foundR25        // Return matching index

	MFVSRLD  V18, R25     // Move 2nd doubleword
	CMP      R25, $64     // If < 64 found
	ADD      $64, R25     // Update byte index
	BLT      foundR25     // Return matching index
	ADD      $16, R7      // R7+=16 string ptr
	ADD      $19, R7, R9  // Number of string bytes for loop
	CMP      R9, LASTBYTE // Compare against last byte of string
	BLT      index3loop2  // If within, continue this loop
	CMP      R7, LASTSTR  // Compare against last start byte
	BLT      index2to16   // Process remainder
	VSPLTISB $0, V3       // Special case for last 16 bytes
	BR       index3loop   // Continue this loop

	// Loop to process 4 byte separator
	// string[0:16] in V2
	// string[16:19] left justified in V3
	// sep[0:4] splatted in V1
	// Set up vectors with strings at offsets
	// 0, 1, 2, 3 and compare against the 4 byte
	// separator also splatted. Use VSEL with the
	// compare results to find the first byte where
	// a separator match is found.
index4plus:
	CMP  R6, $4       // Check if 4 byte separator
	BNE  index5plus   // If not next higher
	ADD  $20, R7, R9  // Check string size to load
	CMP  R9, LASTBYTE // Verify string length
	BGE  index2to16   // If not large enough, process remaining
	MOVD $2, R15      // Set up index

	// Set up masks for use with VSEL
	MOVD    $0xff, R21 // Set up mask 0xff000000ff000000...
	SLD     $24, R21
	MTVSRWS R21, V29

	VSLDOI  $2, V29, V29, V30 // Mask 0x0000ff000000ff00...
	MOVD    $0xffff, R21
	SLD     $16, R21
	MTVSRWS R21, V31          // Mask 0xffff0000ffff0000...

	VSPLTW $0, V0, V1 // Splat 1st word of separator

index4loop:
	LXVB16X (R7)(R0), V2_ // Load 16 bytes @R7 into V2

next4:
	VSPLTISB $0, V10            // Clear
	MOVD     $3, R9             // Number of bytes beyond 16
	LXVB16X  (R7)(R9), V3_      // Load 16 bytes @R7+3 into V3
	VSLDOI   $13, V3, V10, V3   // Shift left last 3 bytes
	VSLDOI   $1, V2, V3, V4     // V4=(V2:V3)<<1
	VSLDOI   $2, V2, V3, V9     // V9=(V2:V3)<<2
	VSLDOI   $3, V2, V3, V10    // V10=(V2:V3)<<3
	VCMPEQUW V1, V2, V5         // compare index 0, 4, ... with sep
	VCMPEQUW V1, V4, V6         // compare index 1, 5, ... with sep
	VCMPEQUW V1, V9, V11        // compare index 2, 6, ... with sep
	VCMPEQUW V1, V10, V12       // compare index 3, 7, ... with sep
	VSEL     V6, V5, V29, V13   // merge index 0, 1, 4, 5, using mask
	VSEL     V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask
	VSEL     V14, V13, V31, V7  // final merge
	VCLZD    V7, V18            // Find first index for each half
	MFVSRD   V18, R25           // Isolate value
	CMP      R25, $64           // If < 64, found
	BLT      foundR25           // Return found index

	MFVSRLD V18, R25     // Isolate other value
	CMP     R25, $64     // If < 64, found
	ADD     $64, R25     // Update index for high doubleword
	BLT     foundR25     // Return found index
	ADD     $16, R7      // R7+=16 for next string
	ADD     $20, R7, R9  // R7+20 for all bytes to load
	CMP     R9, LASTBYTE // Past end? Maybe check for extra?
	BLT     index4loop   // If not, continue loop
	CMP     R7, LASTSTR  // Check remainder
	BLE     index2to16   // Process remainder
	BR      notfound     // Not found

index5plus:
	CMP R6, $16     // Check for sep > 16
	BGT index17plus // Handle large sep

	// Assumption is that the separator is smaller than the string at this point
index2to16:
	CMP R7, LASTSTR // Compare last start byte
	BGT notfound    // last takes len(sep) into account

	ADD $16, R7, R9    // Check for last byte of string
	CMP R9, LASTBYTE
	BGT index2to16tail

	// At least 16 bytes of string left
	// Mask the number of bytes in sep
index2to16loop:
	LXVB16X (R7)(R0), V1_ // Load 16 bytes @R7 into V1

compare:
	VAND       V1, SEPMASK, V2 // Mask out sep size
	VCMPEQUBCC V0, V2, V3      // Compare masked string
	BLT        CR6, found      // All equal
	ADD        $1, R7          // Update ptr to next byte
	CMP        R7, LASTSTR     // Still less than last start byte
	BGT        notfound        // Not found
	ADD        $16, R7, R9     // Verify remaining bytes
	CMP        R9, LASTBYTE    // At least 16
	BLT        index2to16loop  // Try again

	// Less than 16 bytes remaining in string
	// Separator >= 2
index2to16tail:
	ADD     R3, R4, R9     // End of string
	SUB     R7, R9, R9     // Number of bytes left
	ANDCC   $15, R7, R10   // 16 byte offset
	ADD     R10, R9, R11   // offset + len
	CMP     R11, $16       // offset + len <= 16?
	BLE     short          // Does not cross 16 bytes
	LXVB16X (R7)(R0), V1_  // Load 16 bytes @R7 into V1
	BR      index2to16next // Continue on

short:
	RLDICR   $0, R7, $59, R9 // Adjust addr to 16 byte container
	LXVB16X  (R9)(R0), V1_   // Load 16 bytes @R9 into V1
	SLD      $3, R10         // Set up shift
	MTVSRD   R10, V8_        // Set up shift
	VSLDOI   $8, V8, V8, V8
	VSLO     V1, V8, V1      // Shift by start byte
	VSPLTISB $0, V25         // Clear for later use

index2to16next:
	VAND       V1, SEPMASK, V2 // Just compare size of sep
	VCMPEQUBCC V0, V2, V3      // Compare sep and partial string
	BLT        CR6, found      // Found
	ADD        $1, R7          // Not found, try next partial string
	CMP        R7, LASTSTR     // Check for end of string
	BGT        notfound        // If at end, then not found
	VSLDOI     $1, V1, V25, V1 // Shift string left by 1 byte
	BR         index2to16next  // Check the next partial string

index17plus:
	CMP      R6, $32       // Check if 16 < len(sep) <= 32
	BGT      index33plus
	SUB      $16, R6, R9   // Extra > 16
	SLD      $56, R9, R10  // Shift to use in VSLO
	MTVSRD   R10, V9_      // Set up for VSLO
	LXVB16X  (R5)(R9), V1_ // Load 16 bytes @R5+R9 into V1
	VSLO     V1, V9, V1    // Shift left
	VSPLTISB $0xff, V7     // Splat 1s
	VSPLTISB $0, V27       // Splat 0

index17to32loop:
	LXVB16X (R7)(R0), V2_ // Load 16 bytes @R7 into V2

next17:
	LXVB16X    (R7)(R9), V3_   // Load 16 bytes @R7+R9 into V3
	VSLO       V3, V9, V3      // Shift left
	VCMPEQUB   V0, V2, V4      // Compare first 16 bytes
	VCMPEQUB   V1, V3, V5      // Compare extra over 16 bytes
	VAND       V4, V5, V6      // Check if both equal
	VCMPEQUBCC V6, V7, V8      // All equal?
	BLT        CR6, found      // Yes
	ADD        $1, R7          // On to next byte
	CMP        R7, LASTSTR     // Check if last start byte
	BGT        notfound        // If too high, not found
	BR         index17to32loop // Continue

notfound:
	MOVD $-1, R8   // Return -1 if not found
	MOVD R8, (R14)
	RET

index33plus:
	MOVD $0, (R0) // Case not implemented
	RET           // Crash before return

foundR25:
	SRD  $3, R25   // Convert from bits to bytes
	ADD  R25, R7   // Add to current string address
	SUB  R3, R7    // Subtract from start of string
	MOVD R7, (R14) // Return byte where found
	RET

found:
	SUB  R3, R7    // Return byte where found
	MOVD R7, (R14)
	RET