// SPDX-License-Identifier: Apache-2.0
// Copyright 2023 The Prime Citizens
//
// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This is an implementation based on the s390x
// implementation.

// Find a separator with 2 <= len <= 32 within a string.
// Separators with lengths of 2, 3 or 4 are handled
// specially.

// This works on power8 and above. The loads and
// compares are done in big endian order
// since that allows the use of VCLZD, and allows
// the same implementation to work on big and little
// endian platforms with minimal conditional changes.

// NOTE: There is a power9 implementation that
// improves performance by 10-15% on little
// endian for some of the benchmarks.
// Unrolled index2to16 loop by 4 on ppc64le/power9
// Work is still needed for a big endian
// implementation on power9.

//go:build pcz && (ppc64 || ppc64le)

#include "textflag.h"

// Needed to swap LXVD2X loads to the correct
// byte order to work on POWER8.
// byteswap<> is a 16-byte permute control vector; the two variants
// below are selected at assembly time by target endianness.

#ifdef GOARCH_ppc64
DATA byteswap<>+0(SB)/8, $0x0001020304050607
DATA byteswap<>+8(SB)/8, $0x08090a0b0c0d0e0f
#else
DATA byteswap<>+0(SB)/8, $0x0706050403020100
DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
#endif

// Load bytes in big endian order. Address
// alignment does not need checking.
// VLOADSWAP(base, index, vreg, vsreg) loads 16 bytes from base+index
// with LXVD2X and permutes them through SWAP so that the first byte in
// memory ends up as the leftmost byte of vreg.
// SWAP must already hold byteswap<> (see indexbody<>).
#define VLOADSWAP(base, index, vreg, vsreg) \
	LXVD2X (base)(index), vsreg; \
	VPERM  vreg, vreg, SWAP, vreg

GLOBL byteswap<>+0(SB), RODATA, $16

// indexSlice is the entry point for searching a byte slice.
// In:  R3 = &s[0], R4 = len(s), R6 = &sep[0], R7 = len(sep)
// Out: R3 = byte index of the first match, or -1 if there is none
// The separator pointer/length are moved into R5/R6, which is where
// indexbody<> and indexbodyp9<> expect them, and control is passed on
// with a tail branch (no frame, no return here).
TEXT ·indexSlice<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
	// R3 = byte array pointer
	// R4 = length
	MOVD R6, R5             // R5 = separator pointer
	MOVD R7, R6             // R6 = separator length

#ifdef GOARCH_ppc64le
	// Little endian only: pick the POWER9 body when the flag is 1.
	MOVBZ ·isPOWER9(SB), R7
	CMP   R7, $1
	BNE   power8
	BR    indexbodyp9<>(SB)
#endif
power8:
	BR indexbody<>(SB)

// index is the entry point for searching a string.
// In:  R3 = &s[0], R4 = len(s), R5 = &sep[0], R6 = len(sep)
// Out: R3 = byte index of the first match, or -1 if there is none
// The arguments are already in the registers the bodies expect, so
// this only selects the implementation and tail branches to it.
TEXT ·index<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
	// R3 = string
	// R4 = length
	// R5 = separator pointer
	// R6 = separator length

#ifdef GOARCH_ppc64le
	// Little endian only: pick the POWER9 body when the flag is 1.
	MOVBZ ·isPOWER9(SB), R7
	CMP   R7, $1
	BNE   power8
	BR    indexbodyp9<>(SB)

#endif
power8:
	BR indexbody<>(SB)

// s: string we are searching
// sep: string to search for
// R3=&s[0], R4=len(s)
// R5=&sep[0], R6=len(sep)
// R3=result on return (index where sep found, or -1)
// R7=working addr of string
// R16=index value 16
// R19=index value 1
// R25=bit index of a match produced by VCLZD (see foundR25)
// R26=LASTBYTE of string (&s[0]+len(s), one past the final byte)
// R27=LASTSTR last start byte to compare with sep
// R8, R9, R10, R11, R21 scratch
// V0=sep left justified zero fill
// CR4=result of comparing len(sep) with 16

#define SEPMASK V17
#define LASTBYTE R26
#define LASTSTR R27
#define ONES V20
#define SWAP V21
#define SWAP_ VS53

// indexbody<> is the POWER8 (and big endian) search body.
// It returns -1 when len(sep) > len(s) or len(sep) == 0, uses dedicated
// vector loops for len(sep) of 2, 3 and 4, a byte-stepping masked compare
// for len(sep) <= 16, and a two-vector compare for 16 < len(sep) <= 32.
// len(sep) > 32 is not implemented and faults deliberately (index33plus).
TEXT indexbody<>(SB), NOSPLIT|NOFRAME, $0
	CMP      R6, R4                 // Compare lengths
	BGT      notfound               // If sep len is > string, notfound
	ADD      R4, R3, LASTBYTE       // find last byte addr
	SUB      R6, LASTBYTE, LASTSTR  // LAST=&s[len(s)-len(sep)] (last valid start index)
	CMP      R6, $0                 // Check sep len
	BEQ      notfound               // sep len 0 -- not found
	MOVD     R3, R7                 // Copy of string addr
	MOVD     $16, R16               // Index value 16
	MOVD     $1, R19                // Index value 1
	MOVD     $byteswap<>+00(SB), R8
	VSPLTISB $0xFF, ONES            // splat all 1s
	LXVD2X   (R8)(R0), SWAP_        // Set up swap string

	CMP    R6, $16, CR4        // CR4 for len(sep) >= 16
	VOR    ONES, ONES, SEPMASK // Set up full SEPMASK
	BGE    CR4, loadge16       // Load for len(sep) >= 16
	SUB    R6, R16, R9         // 16-len of sep
	SLD    $3, R9              // Set up for VSLO
	MTVSRD R9, V9              // Set up for VSLO
	VSLDOI $8, V9, V9, V9      // Set up for VSLO
	VSLO   ONES, V9, SEPMASK   // Mask for separator len(sep) < 16

loadge16:
	ANDCC $15, R5, R9 // Find byte offset of sep
	ADD   R9, R6, R10 // Add sep len
	CMP   R10, $16    // Check if sep len+offset > 16
	BGT   sepcross16  // Sep crosses 16 byte boundary

	// sep lies inside one aligned 16 byte container: load the
	// container and shift sep to the left edge, so nothing outside
	// that container is touched.
	RLDICR $0, R5, $59, R8        // Adjust addr to 16 byte container
	VLOADSWAP(R8, R0, V0, V0)     // Load 16 bytes @R8 into V0
	SLD    $3, R9                 // Set up shift count for VSLO
	MTVSRD R9, V8                 // Set up shift count for VSLO
	VSLDOI $8, V8, V8, V8
	VSLO   V0, V8, V0             // Shift by start byte

	VAND V0, SEPMASK, V0 // Mask separator (< 16)
	BR   index2plus

sepcross16:
	VLOADSWAP(R5, R0, V0, V0) // Load 16 bytes @R5 into V0

	VAND V0, SEPMASK, V0 // mask out separator
	BLE  CR4, index2to16 // len(sep) <= 16: generic masked compare
	BR   index17plus     // Handle sep > 16

index2plus:
	CMP      R6, $2       // Check length of sep
	BNE      index3plus   // If not 2, check for 3
	ADD      $16, R7, R9  // Check if next 16 bytes past last
	CMP      R9, LASTBYTE // compare with last
	BGE      index2to16   // 2 <= len(string) <= 16
	MOVD     $0xff00, R21 // Mask for later
	MTVSRD   R21, V25     // Move to Vreg
	VSPLTH   $3, V25, V31 // Splat mask
	VSPLTH   $0, V0, V1   // Splat 1st 2 bytes of sep
	VSPLTISB $0, V10      // Clear V10

	// First case: 2 byte separator
	// V1: 2 byte separator splatted
	// V2: 16 bytes at addr
	// V3: 16 bytes at addr+1
	// Compare 2 byte separator at start
	// and at start+1.  Use VSEL to combine
	// those results to find the first
	// matching start byte, returning
	// that value when found. Loop as
	// long as len(string) > 16
index2loop2:
	VLOADSWAP(R7, R19, V3, V3) // Load 16 bytes @R7+1 into V3

index2loop:
	VLOADSWAP(R7, R0, V2, V2) // Load 16 bytes @R7 into V2
	VCMPEQUH V1, V2, V5        // Search for sep
	VCMPEQUH V1, V3, V6        // Search for sep offset by 1
	VSEL     V6, V5, V31, V7   // merge even and odd indices
	VCLZD    V7, V18           // find index of first match
	MFVSRD   V18, R25          // get first value
	CMP      R25, $64          // Found if < 64
	BLT      foundR25          // Return byte index where found
	VSLDOI   $8, V18, V18, V18 // Adjust 2nd value
	MFVSRD   V18, R25          // get second value
	CMP      R25, $64          // Found if < 64
	ADD      $64, R25          // Update byte offset
	BLT      foundR25          // Return value
	ADD      $16, R7           // R7+=16 Update string pointer
	ADD      $17, R7, R9       // R9=R7+17 since loop unrolled
	CMP      R9, LASTBYTE      // Compare addr+17 against last byte
	BLT      index2loop2       // If < last, continue loop
	CMP      R7, LASTBYTE      // Compare updated addr against last byte
	BLT      index2to16        // If < 16 handle specially
	// NOTE(review): the loop is only entered/continued with more than
	// 16 bytes left, so at least one byte remains here and the branch
	// above looks always taken; the code below appears unreachable.
	VLOADSWAP(R7, R0, V3, V3) // Load 16 bytes @R7 into V3
	VSLDOI   $1, V3, V10, V3   // Shift left by 1 byte
	BR       index2loop

index3plus:
	CMP    R6, $3       // Check if sep == 3
	BNE    index4plus   // If not check larger
	ADD    $19, R7, R9  // Find bytes for use in this loop
	CMP    R9, LASTBYTE // Compare against last byte
	BGE    index2to16   // Too short for this loop, use generic path
	MOVD   $0xff00, R21 // Set up mask for upcoming loop
	MTVSRD R21, V25     // Move mask to Vreg
	VSPLTH $3, V25, V31 // Splat mask
	VSPLTH $0, V0, V1   // Splat 1st two bytes of sep
	VSPLTB $2, V0, V8   // Splat 3rd byte of sep

	// Loop to process 3 byte separator.
	// string[0:16] is in V2
	// string[16:18] is left justified in V3
	// sep[0:2] splatted in V1
	// sep[2] splatted in V8
	// Build vectors at string, string+1
	// and string+2.  Compare string, string+1
	// against first 2 bytes of separator
	// splatted, and string+2 against 3rd
	// byte splatted.  Merge the results with
	// VSEL to find the first byte of a match.

	// Special handling for last 16 bytes if the
	// string fits in 16 byte multiple.
index3loop2:
	MOVD     $2, R21          // Set up index for 2
	VSPLTISB $0, V10          // Clear V10
	VLOADSWAP(R7, R21, V3, V3)// Load 16 bytes @R7+2 into V3
	VSLDOI   $14, V3, V10, V3 // Left justify next 2 bytes

index3loop:
	VLOADSWAP(R7, R0, V2, V2) // Load with correct order
	VSLDOI   $1, V2, V3, V4    // string[1:17]
	VSLDOI   $2, V2, V3, V9    // string[2:18]
	VCMPEQUH V1, V2, V5        // compare hw even indices
	VCMPEQUH V1, V4, V6        // compare hw odd indices
	VCMPEQUB V8, V9, V10       // compare 3rd to last byte
	VSEL     V6, V5, V31, V7   // Find 1st matching byte using mask
	VAND     V7, V10, V7       // AND matched bytes with matched 3rd byte
	VCLZD    V7, V18           // Find first nonzero indexes
	MFVSRD   V18, R25          // Move 1st doubleword
	CMP      R25, $64          // If < 64 found
	BLT      foundR25          // Return matching index
	VSLDOI   $8, V18, V18, V18 // Move value
	MFVSRD   V18, R25          // Move 2nd doubleword
	CMP      R25, $64          // If < 64 found
	ADD      $64, R25          // Update byte index
	BLT      foundR25          // Return matching index
	ADD      $16, R7           // R7+=16 string ptr
	ADD      $19, R7, R9       // Number of string bytes for loop
	CMP      R9, LASTBYTE      // Compare against last byte of string
	BLT      index3loop2       // If within, continue this loop
	CMP      R7, LASTSTR       // Compare against last start byte
	BLT      index2to16        // Process remainder
	VSPLTISB $0, V3            // Special case for last 16 bytes
	BR       index3loop        // Continue this loop

	// Loop to process 4 byte separator
	// string[0:16] in V2
	// string[16:19] left justified in V3
	// sep[0:4] splatted in V1
	// Set up vectors with strings at offsets
	// 0, 1, 2, 3 and compare against the 4 byte
	// separator also splatted.  Use VSEL with the
	// compare results to find the first byte where
	// a separator match is found.
index4plus:
	CMP  R6, $4       // Check if 4 byte separator
	BNE  index5plus   // If not next higher
	ADD  $20, R7, R9  // Check string size to load
	CMP  R9, LASTBYTE // Verify string length
	BGE  index2to16   // If not large enough, process remaining

	// Set up masks for use with VSEL
	MOVD   $0xff, R21         // Set up mask 0xff000000ff000000...
	SLD    $24, R21
	MTVSRD R21, V10
	VSPLTW $1, V10, V29
	VSLDOI $2, V29, V29, V30  // Mask 0x0000ff000000ff00...
	MOVD   $0xffff, R21
	SLD    $16, R21
	MTVSRD R21, V10
	VSPLTW $1, V10, V31       // Mask 0xffff0000ffff0000...
	VSPLTW $0, V0, V1         // Splat 1st word of separator

index4loop:
	VLOADSWAP(R7, R0, V2, V2) // Load 16 bytes @R7 into V2

next4:
	VSPLTISB $0, V10            // Clear
	MOVD     $3, R9             // Number of bytes beyond 16
	VLOADSWAP(R7, R9, V3, V3)   // Load 16 bytes @R7+3 into V3
	VSLDOI   $13, V3, V10, V3   // Shift left last 3 bytes
	VSLDOI   $1, V2, V3, V4     // V4=(V2:V3)<<1
	VSLDOI   $2, V2, V3, V9     // V9=(V2:V3)<<2
	VSLDOI   $3, V2, V3, V10    // V10=(V2:V3)<<3
	VCMPEQUW V1, V2, V5         // compare index 0, 4, ... with sep
	VCMPEQUW V1, V4, V6         // compare index 1, 5, ... with sep
	VCMPEQUW V1, V9, V11        // compare index 2, 6, ... with sep
	VCMPEQUW V1, V10, V12       // compare index 3, 7, ... with sep
	VSEL     V6, V5, V29, V13   // merge index 0, 1, 4, 5, using mask
	VSEL     V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask
	VSEL     V14, V13, V31, V7  // final merge
	VCLZD    V7, V18            // Find first index for each half
	MFVSRD   V18, R25           // Isolate value
	CMP      R25, $64           // If < 64, found
	BLT      foundR25           // Return found index
	VSLDOI   $8, V18, V18, V18  // Move for MFVSRD
	MFVSRD   V18, R25           // Isolate other value
	CMP      R25, $64           // If < 64, found
	ADD      $64, R25           // Update index for high doubleword
	BLT      foundR25           // Return found index
	ADD      $16, R7            // R7+=16 for next string
	ADD      $20, R7, R9        // R7+20 for all bytes to load
	CMP      R9, LASTBYTE       // Past end? Maybe check for extra?
	BLT      index4loop         // If not, continue loop
	CMP      R7, LASTSTR        // Check remainder
	BLE      index2to16         // Process remainder
	BR       notfound           // Not found

index5plus:
	CMP R6, $16     // Check for sep > 16
	BGT index17plus // Handle large sep

	// Assumption is that the separator is smaller than the string at this point
index2to16:
	CMP R7, LASTSTR // Compare last start byte
	BGT notfound    // last takes len(sep) into account

	ADD $16, R7, R9 // Check for last byte of string
	CMP R9, LASTBYTE
	BGT index2to16tail

	// At least 16 bytes of string left
	// Mask the number of bytes in sep
index2to16loop:
	VLOADSWAP(R7, R0, V1, V1) // Load 16 bytes @R7 into V1

compare:
	VAND       V1, SEPMASK, V2 // Mask out sep size
	VCMPEQUBCC V0, V2, V3      // Compare masked string
	BLT        CR6, found      // All equal
	ADD        $1, R7          // Update ptr to next byte
	CMP        R7, LASTSTR     // Still less than last start byte
	BGT        notfound        // Not found
	ADD        $16, R7, R9     // Verify remaining bytes
	CMP        R9, LASTBYTE    // At least 16
	BLT        index2to16loop  // Try again

	// Less than 16 bytes remaining in string
	// Separator >= 2
index2to16tail:
	ADD   R3, R4, R9     // End of string
	SUB   R7, R9, R9     // Number of bytes left
	ANDCC $15, R7, R10   // 16 byte offset
	ADD   R10, R9, R11   // offset + len
	CMP   R11, $16       // offset + len > 16?
	BLE   short          // Does not cross 16 bytes
	VLOADSWAP(R7, R0, V1, V1) // Load 16 bytes @R7 into V1
	BR    index2to16next // Continue on

short:
	// Remaining string sits inside one aligned 16 byte container:
	// load the container and shift the string to the left edge.
	RLDICR   $0, R7, $59, R9 // Adjust addr to 16 byte container
	VLOADSWAP(R9, R0, V1, V1)// Load 16 bytes @R9 into V1
	SLD      $3, R10         // Set up shift
	MTVSRD   R10, V8         // Set up shift
	VSLDOI   $8, V8, V8, V8
	VSLO     V1, V8, V1      // Shift by start byte
	VSPLTISB $0, V25         // Clear for later use

index2to16next:
	VAND       V1, SEPMASK, V2 // Just compare size of sep
	VCMPEQUBCC V0, V2, V3      // Compare sep and partial string
	BLT        CR6, found      // Found
	ADD        $1, R7          // Not found, try next partial string
	CMP        R7, LASTSTR     // Check for end of string
	BGT        notfound        // If at end, then not found
	VSLDOI     $1, V1, V25, V1 // Shift string left by 1 byte
	BR         index2to16next  // Check the next partial string

index17plus:
	CMP      R6, $32      // Check if 16 < len(sep) <= 32
	BGT      index33plus
	SUB      $16, R6, R9  // Extra > 16
	SLD      $56, R9, R10 // Shift to use in VSLO
	MTVSRD   R10, V9      // Set up for VSLO
	VLOADSWAP(R5, R9, V1, V1)// Load 16 bytes @R5+R9 into V1
	VSLO     V1, V9, V1   // Shift left
	VSPLTISB $0xff, V7    // Splat 1s

index17to32loop:
	VLOADSWAP(R7, R0, V2, V2) // Load 16 bytes @R7 into V2

next17:
	VLOADSWAP(R7, R9, V3, V3) // Load 16 bytes @R7+R9 into V3
	VSLO       V3, V9, V3      // Shift left
	VCMPEQUB   V0, V2, V4      // Compare first 16 bytes
	VCMPEQUB   V1, V3, V5      // Compare extra over 16 bytes
	VAND       V4, V5, V6      // Check if both equal
	VCMPEQUBCC V6, V7, V8      // All equal?
	BLT        CR6, found      // Yes
	ADD        $1, R7          // On to next byte
	CMP        R7, LASTSTR     // Check if last start byte
	BGT        notfound        // If too high, not found
	BR         index17to32loop // Continue

notfound:
	MOVD $-1, R3 // Return -1 if not found
	RET

index33plus:
	MOVD $0, (R0) // Case not implemented
	RET           // Crash before return

foundR25:
	SRD  $3, R25 // Convert from bits to bytes
	ADD  R25, R7 // Add to current string address
	SUB  R3, R7  // Subtract from start of string
	MOVD R7, R3  // Return byte where found
	RET

found:
	SUB  R3, R7 // Return byte where found
	MOVD R7, R3
	RET

// indexbodyp9<> is the POWER9 little endian search body.
// Register usage and results match indexbody<>: R3=&s[0], R4=len(s),
// R5=&sep[0], R6=len(sep); R3 returns the match index or -1.
// Differences visible here: loads use LXVB16X (so no SWAP permute is
// needed), MFVSRLD/MTVSRWS replace the shift-and-move sequences, and
// the len(sep) <= 16 loop tests 4 start positions per iteration.
TEXT indexbodyp9<>(SB), NOSPLIT|NOFRAME, $0
	CMP      R6, R4                // Compare lengths
	BGT      notfound              // If sep len is > string, notfound
	ADD      R4, R3, LASTBYTE      // find last byte addr
	SUB      R6, LASTBYTE, LASTSTR // LAST=&s[len(s)-len(sep)] (last valid start index)
	CMP      R6, $0                // Check sep len
	BEQ      notfound              // sep len 0 -- not found
	MOVD     R3, R7                // Copy of string addr
	MOVD     $16, R16              // Index value 16
	MOVD     $1, R19               // Index value 1
	VSPLTISB $0xFF, ONES           // splat all 1s

	CMP    R6, $16, CR4        // CR4 for len(sep) >= 16
	VOR    ONES, ONES, SEPMASK // Set up full SEPMASK
	BGE    CR4, loadge16       // Load for len(sep) >= 16
	SUB    R6, R16, R9         // 16-len of sep
	SLD    $3, R9              // Set up for VSLO
	MTVSRD R9, V9              // Set up for VSLO
	VSLDOI $8, V9, V9, V9      // Set up for VSLO
	VSLO   ONES, V9, SEPMASK   // Mask for separator len(sep) < 16

loadge16:
	ANDCC $15, R5, R9 // Find byte offset of sep
	ADD   R9, R6, R10 // Add sep len
	CMP   R10, $16    // Check if sep len+offset > 16
	BGT   sepcross16  // Sep crosses 16 byte boundary

	// sep lies inside one aligned 16 byte container: load the
	// container and shift sep to the left edge.
	RLDICR  $0, R5, $59, R8 // Adjust addr to 16 byte container
	LXVB16X (R8)(R0), V0    // Load 16 bytes @R8 into V0
	SLD     $3, R9          // Set up shift count for VSLO
	MTVSRD  R9, V8          // Set up shift count for VSLO
	VSLDOI  $8, V8, V8, V8
	VSLO    V0, V8, V0      // Shift by start byte

	VAND V0, SEPMASK, V0 // Mask separator (< 16)
	BR   index2plus

sepcross16:
	LXVB16X (R5)(R0), V0 // Load 16 bytes @R5 into V0

	VAND V0, SEPMASK, V0 // mask out separator
	BLE  CR4, index2to16 // len(sep) <= 16: generic masked compare
	BR   index17plus     // Handle sep > 16

index2plus:
	CMP      R6, $2       // Check length of sep
	BNE      index3plus   // If not 2, check for 3
	ADD      $16, R7, R9  // Check if next 16 bytes past last
	CMP      R9, LASTBYTE // compare with last
	BGE      index2to16   // 2 <= len(string) <= 16
	MOVD     $0xff00, R21 // Mask for later
	MTVSRD   R21, V25     // Move to Vreg
	VSPLTH   $3, V25, V31 // Splat mask
	VSPLTH   $0, V0, V1   // Splat 1st 2 bytes of sep
	VSPLTISB $0, V10      // Clear V10

	// First case: 2 byte separator
	// V1: 2 byte separator splatted
	// V2: 16 bytes at addr
	// V3: 16 bytes at addr+1
	// Compare 2 byte separator at start
	// and at start+1.  Use VSEL to combine
	// those results to find the first
	// matching start byte, returning
	// that value when found. Loop as
	// long as len(string) > 16
index2loop2:
	LXVB16X (R7)(R19), V3 // Load 16 bytes @R7+1 into V3

index2loop:
	LXVB16X  (R7)(R0), V2    // Load 16 bytes @R7 into V2
	VCMPEQUH V1, V2, V5      // Search for sep
	VCMPEQUH V1, V3, V6      // Search for sep offset by 1
	VSEL     V6, V5, V31, V7 // merge even and odd indices
	VCLZD    V7, V18         // find index of first match
	MFVSRD   V18, R25        // get first value
	CMP      R25, $64        // Found if < 64
	BLT      foundR25        // Return byte index where found

	MFVSRLD V18, R25     // get second value
	CMP     R25, $64     // Found if < 64
	ADD     $64, R25     // Update byte offset
	BLT     foundR25     // Return value
	ADD     $16, R7      // R7+=16 Update string pointer
	ADD     $17, R7, R9  // R9=R7+17 since loop unrolled
	CMP     R9, LASTBYTE // Compare addr+17 against last byte
	BLT     index2loop2  // If < last, continue loop
	CMP     R7, LASTBYTE // Compare updated addr against last byte
	BLT     index2to16   // If < 16 handle specially
	// NOTE(review): the loop is only entered/continued with more than
	// 16 bytes left, so at least one byte remains here and the branch
	// above looks always taken; the code below appears unreachable.
	LXVB16X (R7)(R0), V3 // Load 16 bytes @R7 into V3
	VSLDOI  $1, V3, V10, V3 // Shift left by 1 byte
	BR      index2loop

index3plus:
	CMP    R6, $3       // Check if sep == 3
	BNE    index4plus   // If not check larger
	ADD    $19, R7, R9  // Find bytes for use in this loop
	CMP    R9, LASTBYTE // Compare against last byte
	BGE    index2to16   // Too short for this loop, use generic path
	MOVD   $0xff00, R21 // Set up mask for upcoming loop
	MTVSRD R21, V25     // Move mask to Vreg
	VSPLTH $3, V25, V31 // Splat mask
	VSPLTH $0, V0, V1   // Splat 1st two bytes of sep
	VSPLTB $2, V0, V8   // Splat 3rd byte of sep

	// Loop to process 3 byte separator.
	// string[0:16] is in V2
	// string[16:18] is left justified in V3
	// sep[0:2] splatted in V1
	// sep[2] splatted in V8
	// Build vectors at string, string+1
	// and string+2.  Compare string, string+1
	// against first 2 bytes of separator
	// splatted, and string+2 against 3rd
	// byte splatted.  Merge the results with
	// VSEL to find the first byte of a match.

	// Special handling for last 16 bytes if the
	// string fits in 16 byte multiple.
index3loop2:
	MOVD     $2, R21          // Set up index for 2
	VSPLTISB $0, V10          // Clear V10
	LXVB16X  (R7)(R21), V3    // Load 16 bytes @R7+2 into V3
	VSLDOI   $14, V3, V10, V3 // Left justify next 2 bytes

index3loop:
	LXVB16X  (R7)(R0), V2    // Load 16 bytes @R7
	VSLDOI   $1, V2, V3, V4  // string[1:17]
	VSLDOI   $2, V2, V3, V9  // string[2:18]
	VCMPEQUH V1, V2, V5      // compare hw even indices
	VCMPEQUH V1, V4, V6      // compare hw odd indices
	VCMPEQUB V8, V9, V10     // compare 3rd to last byte
	VSEL     V6, V5, V31, V7 // Find 1st matching byte using mask
	VAND     V7, V10, V7     // AND matched bytes with matched 3rd byte
	VCLZD    V7, V18         // Find first nonzero indexes
	MFVSRD   V18, R25        // Move 1st doubleword
	CMP      R25, $64        // If < 64 found
	BLT      foundR25        // Return matching index

	MFVSRLD  V18, R25     // Move 2nd doubleword
	CMP      R25, $64     // If < 64 found
	ADD      $64, R25     // Update byte index
	BLT      foundR25     // Return matching index
	ADD      $16, R7      // R7+=16 string ptr
	ADD      $19, R7, R9  // Number of string bytes for loop
	CMP      R9, LASTBYTE // Compare against last byte of string
	BLT      index3loop2  // If within, continue this loop
	CMP      R7, LASTSTR  // Compare against last start byte
	BLT      index2to16   // Process remainder
	VSPLTISB $0, V3       // Special case for last 16 bytes
	BR       index3loop   // Continue this loop

	// Loop to process 4 byte separator
	// string[0:16] in V2
	// string[16:19] left justified in V3
	// sep[0:4] splatted in V1
	// Set up vectors with strings at offsets
	// 0, 1, 2, 3 and compare against the 4 byte
	// separator also splatted.  Use VSEL with the
	// compare results to find the first byte where
	// a separator match is found.
index4plus:
	CMP R6, $4       // Check if 4 byte separator
	BNE index5plus   // If not next higher
	ADD $20, R7, R9  // Check string size to load
	CMP R9, LASTBYTE // Verify string length
	BGE index2to16   // If not large enough, process remaining

	// Set up masks for use with VSEL
	MOVD    $0xff, R21 // Set up mask 0xff000000ff000000...
	SLD     $24, R21
	MTVSRWS R21, V29

	VSLDOI  $2, V29, V29, V30 // Mask 0x0000ff000000ff00...
	MOVD    $0xffff, R21
	SLD     $16, R21
	MTVSRWS R21, V31          // Mask 0xffff0000ffff0000...

	VSPLTW $0, V0, V1 // Splat 1st word of separator

index4loop:
	LXVB16X (R7)(R0), V2 // Load 16 bytes @R7 into V2

next4:
	VSPLTISB $0, V10            // Clear
	MOVD     $3, R9             // Number of bytes beyond 16
	LXVB16X  (R7)(R9), V3       // Load 16 bytes @R7+3 into V3
	VSLDOI   $13, V3, V10, V3   // Shift left last 3 bytes
	VSLDOI   $1, V2, V3, V4     // V4=(V2:V3)<<1
	VSLDOI   $2, V2, V3, V9     // V9=(V2:V3)<<2
	VSLDOI   $3, V2, V3, V10    // V10=(V2:V3)<<3
	VCMPEQUW V1, V2, V5         // compare index 0, 4, ... with sep
	VCMPEQUW V1, V4, V6         // compare index 1, 5, ... with sep
	VCMPEQUW V1, V9, V11        // compare index 2, 6, ... with sep
	VCMPEQUW V1, V10, V12       // compare index 3, 7, ... with sep
	VSEL     V6, V5, V29, V13   // merge index 0, 1, 4, 5, using mask
	VSEL     V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask
	VSEL     V14, V13, V31, V7  // final merge
	VCLZD    V7, V18            // Find first index for each half
	MFVSRD   V18, R25           // Isolate value
	CMP      R25, $64           // If < 64, found
	BLT      foundR25           // Return found index

	MFVSRLD V18, R25     // Isolate other value
	CMP     R25, $64     // If < 64, found
	ADD     $64, R25     // Update index for high doubleword
	BLT     foundR25     // Return found index
	ADD     $16, R7      // R7+=16 for next string
	ADD     $20, R7, R9  // R7+20 for all bytes to load
	CMP     R9, LASTBYTE // Past end? Maybe check for extra?
	BLT     index4loop   // If not, continue loop
	CMP     R7, LASTSTR  // Check remainder
	BLE     index2to16   // Process remainder
	BR      notfound     // Not found

index5plus:
	CMP R6, $16     // Check for sep > 16
	BGT index17plus // Handle large sep

	// Assumption is that the separator is smaller than the string at this point
index2to16:
	CMP R7, LASTSTR // Compare last start byte
	BGT notfound    // last takes len(sep) into account

	ADD $19, R7, R9 // To check 4 indices per iteration, need at least 16+3 bytes
	CMP R9, LASTBYTE
	// At least 16 bytes of string left
	// Mask the number of bytes in sep
	VSPLTISB $0, V10 // Clear
	BGT      index2to16tail

	MOVD    $3, R17 // Number of bytes beyond 16
	PCALIGN $32
index2to16loop:
	LXVB16X (R7)(R0), V1  // Load next 16 bytes of string into V1 from R7
	LXVB16X (R7)(R17), V5 // Load next 16 bytes of string into V5 from R7+3

	VSLDOI     $13, V5, V10, V2 // Shift left last 3 bytes
	VSLDOI     $1, V1, V2, V3   // V3=(V1:V2)<<1
	VSLDOI     $2, V1, V2, V4   // V4=(V1:V2)<<2
	VAND       V1, SEPMASK, V8  // Mask out sep size 0th index
	VAND       V3, SEPMASK, V9  // Mask out sep size 1st index
	VAND       V4, SEPMASK, V11 // Mask out sep size 2nd index
	VAND       V5, SEPMASK, V12 // Mask out sep size 3rd index
	VCMPEQUBCC V0, V8, V8       // compare masked string
	BLT        CR6, found       // All equal while comparing 0th index
	VCMPEQUBCC V0, V9, V9       // compare masked string
	BLT        CR6, found2      // All equal while comparing 1st index
	VCMPEQUBCC V0, V11, V11     // compare masked string
	BLT        CR6, found3      // All equal while comparing 2nd index
	VCMPEQUBCC V0, V12, V12     // compare masked string
	BLT        CR6, found4      // All equal while comparing 3rd index

	ADD $4, R7         // Update ptr to next 4 bytes
	CMP R7, LASTSTR    // Still less than last start byte
	BGT notfound       // Not found
	ADD $19, R7, R9    // Verify remaining bytes
	CMP R9, LASTBYTE   // length of string at least 19
	BLE index2to16loop // Try again, else do post processing and jump to index2to16next

	// <19 bytes left, post process the remaining string
index2to16tail:
	ADD     R3, R4, R9     // End of string
	SUB     R7, R9, R9     // Number of bytes left
	ANDCC   $15, R7, R10   // 16 byte offset
	ADD     R10, R9, R11   // offset + len
	CMP     R11, $16       // offset + len > 16?
	BLE     short          // Does not cross 16 bytes
	LXVB16X (R7)(R0), V1   // Load 16 bytes @R7 into V1
	CMP     R9, $16        // Post-processing of unrolled loop
	BLE     index2to16next // continue to index2to16next if <= 16 bytes
	SUB     R16, R9, R10   // R9 should be 18 or 17 hence R10 is 1 or 2
	LXVB16X (R7)(R10), V9  // Load the 16 bytes ending at the last string byte
	CMP     R10, $1        // string length is 17, compare 1 more byte
	BNE     extra2         // string length is 18, compare 2 more bytes
	VSLDOI     $15, V9, V10, V25 // V25 = 17th byte, zero filled
	VAND       V1, SEPMASK, V2   // Just compare size of sep
	VCMPEQUBCC V0, V2, V3        // Compare sep and partial string
	BLT        CR6, found        // Found
	ADD        $1, R7            // Not found, try next partial string
	CMP        R7, LASTSTR       // Check for end of string
	BGT        notfound          // If at end, then not found
	VSLDOI     $1, V1, V25, V1   // Shift string left by 1 byte
	BR         index2to16next    // go to remainder loop
extra2:
	VSLDOI     $14, V9, V10, V25 // V25 = 17th and 18th bytes, zero filled
	VAND       V1, SEPMASK, V2   // Just compare size of sep
	VCMPEQUBCC V0, V2, V3        // Compare sep and partial string
	BLT        CR6, found        // Found
	ADD        $1, R7            // Not found, try next partial string
	CMP        R7, LASTSTR       // Check for end of string
	BGT        notfound          // If at end, then not found
	VOR        V1, V1, V4        // save remaining string
	VSLDOI     $1, V1, V25, V1   // Shift string left by 1 byte for 17th byte
	VAND       V1, SEPMASK, V2   // Just compare size of sep
	VCMPEQUBCC V0, V2, V3        // Compare sep and partial string
	BLT        CR6, found        // Found
	ADD        $1, R7            // Not found, try next partial string
	CMP        R7, LASTSTR       // Check for end of string
	BGT        notfound          // If at end, then not found
	VSLDOI     $2, V4, V25, V1   // Shift saved string left by 2 bytes for 18th byte
	BR         index2to16next    // Check the remaining partial string in index2to16next

short:
	// Remaining string sits inside one aligned 16 byte container:
	// load the container and shift the string to the left edge.
	RLDICR  $0, R7, $59, R9 // Adjust addr to 16 byte container
	LXVB16X (R9)(R0), V1    // Load 16 bytes @R9 into V1
	SLD     $3, R10         // Set up shift
	MTVSRD  R10, V8         // Set up shift
	VSLDOI  $8, V8, V8, V8
	VSLO    V1, V8, V1      // Shift by start byte
	PCALIGN $32
index2to16next:
	VAND       V1, SEPMASK, V2 // Just compare size of sep
	VCMPEQUBCC V0, V2, V3      // Compare sep and partial string
	BLT        CR6, found      // Found
	ADD        $1, R7          // Not found, try next partial string
	CMP        R7, LASTSTR     // Check for end of string
	BGT        notfound        // If at end, then not found
	VSLDOI     $1, V1, V10, V1 // Shift string left by 1 byte
	BR         index2to16next  // Check the next partial string

index17plus:
	CMP      R6, $32      // Check if 16 < len(sep) <= 32
	BGT      index33plus
	SUB      $16, R6, R9  // Extra > 16
	SLD      $56, R9, R10 // Shift to use in VSLO
	MTVSRD   R10, V9      // Set up for VSLO
	LXVB16X  (R5)(R9), V1 // Load 16 bytes @R5+R9 into V1
	VSLO     V1, V9, V1   // Shift left
	VSPLTISB $0xff, V7    // Splat 1s

index17to32loop:
	LXVB16X (R7)(R0), V2 // Load 16 bytes @R7 into V2

next17:
	LXVB16X    (R7)(R9), V3    // Load 16 bytes @R7+R9 into V3
	VSLO       V3, V9, V3      // Shift left
	VCMPEQUB   V0, V2, V4      // Compare first 16 bytes
	VCMPEQUB   V1, V3, V5      // Compare extra over 16 bytes
	VAND       V4, V5, V6      // Check if both equal
	VCMPEQUBCC V6, V7, V8      // All equal?
	BLT        CR6, found      // Yes
	ADD        $1, R7          // On to next byte
	CMP        R7, LASTSTR     // Check if last start byte
	BGT        notfound        // If too high, not found
	BR         index17to32loop // Continue

notfound:
	MOVD $-1, R3 // Return -1 if not found
	RET

index33plus:
	MOVD $0, (R0) // Case not implemented
	RET           // Crash before return

foundR25:
	SRD  $3, R25 // Convert from bits to bytes
	ADD  R25, R7 // Add to current string address
	SUB  R3, R7  // Subtract from start of string
	MOVD R7, R3  // Return byte where found
	RET
	// found4/found3/found2 fall through into each other so that R7 is
	// advanced by 3, 2 or 1 before the common return sequence.
found4:
	ADD $1, R7 // found from unrolled loop at index 3
found3:
	ADD $1, R7 // found from unrolled loop at index 2
found2:
	ADD $1, R7 // found from unrolled loop at index 1
found:         // found at index 0
	SUB  R3, R7 // Return byte where found
	MOVD R7, R3
	RET