// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This is an implementation based on the s390x
// implementation.

// Find a separator with 2 <= len <= 32 within a string.
// Separators with lengths of 2, 3 or 4 are handled
// specially.

// This works on power8 and above. The loads and
// compares are done in big endian order
// since that allows the use of VCLZD, and allows
// the same implementation to work on big and little
// endian platforms with minimal conditional changes.

// NOTE: There is a power9 implementation that
// improves performance by 10-15% on little
// endian for some of the benchmarks.
// Unrolled index2to16 loop by 4 on ppc64le/power9
// Work is still needed for a big endian
// implementation on power9.

//go:build ppc64 || ppc64le

#include "go_asm.h"
#include "textflag.h"

// Needed to swap LXVD2X loads to the correct
// byte order to work on POWER8.

#ifdef GOARCH_ppc64
DATA byteswap<>+0(SB)/8, $0x0001020304050607
DATA byteswap<>+8(SB)/8, $0x08090a0b0c0d0e0f
#else
DATA byteswap<>+0(SB)/8, $0x0706050403020100
DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
#endif

// Load bytes in big endian order. Address
// alignment does not need checking.
// The macro expects SWAP (defined below) to hold the
// byteswap<> permute control when it is expanded.
#define VLOADSWAP(base, index, vreg, vsreg) \
	LXVD2X	(base)(index), vsreg;  \
	VPERM	vreg, vreg, SWAP, vreg

GLOBL byteswap<>+0(SB), RODATA, $16

// ·Index is the entry point for searching a byte slice.
// The separator pointer and length arrive in R6 and R7 and are
// moved down to R5 and R6 so that both entry points hand the
// same register layout to the shared body.
// NOTE(review): R5 presumably holds the slice capacity on entry
// (ABIInternal slice argument) -- confirm against the Go ABI doc.
// On ppc64le the POWER9 feature flag selects indexbodyp9,
// otherwise indexbody is used.
TEXT ·Index<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
	// R3 = byte array pointer
	// R4 = length
	MOVD	R6, R5		// R5 = separator pointer
	MOVD	R7, R6		// R6 = separator length

#ifdef GOARCH_ppc64le
	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7
	CMP	R7, $1
	BNE	power8
	BR	indexbodyp9<>(SB)
#endif
power8:
	BR	indexbody<>(SB)

// ·IndexString is the entry point for searching a string.
// The arguments are already in the layout the bodies expect.
// On ppc64le the POWER9 feature flag selects indexbodyp9,
// otherwise indexbody is used.
TEXT ·IndexString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
	// R3 = string
	// R4 = length
	// R5 = separator pointer
	// R6 = separator length

#ifdef GOARCH_ppc64le
	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7
	CMP	R7, $1
	BNE	power8
	BR	indexbodyp9<>(SB)

#endif
power8:
	BR	indexbody<>(SB)

	// s: string we are searching
	// sep: string to search for
	// R3=&s[0], R4=len(s)
	// R5=&sep[0], R6=len(sep)
	// R3=result on return (index where sep found, or -1)
	// R7=working addr of string
	// R16=index value 16
	// R17=index value 17
	// R18=index value 18
	// R19=index value 1
	// R26=LASTBYTE of string
	// R27=LASTSTR last start byte to compare with sep
	// R8, R9 scratch
	// V0=sep left justified zero fill
	// CR4=sep length >= 16

#define SEPMASK V17
#define LASTBYTE R26
#define LASTSTR R27
#define ONES V20
#define SWAP V21
#define SWAP_ VS53

// indexbody is the POWER8 search body (also used on big endian).
// It returns the byte index of the first match in R3, or -1 when
// len(sep) is 0, len(sep) > len(s), or no match exists.
// len(sep) > 32 is not implemented and deliberately faults.
TEXT indexbody<>(SB), NOSPLIT|NOFRAME, $0
	CMP	R6, R4			// Compare lengths
	BGT	notfound		// If sep len is > string, notfound
	ADD	R4, R3, LASTBYTE	// find last byte addr
	SUB	R6, LASTBYTE, LASTSTR	// LAST=&s[len(s)-len(sep)] (last valid start index)
	CMP	R6, $0			// Check sep len
	BEQ	notfound		// sep len 0 -- not found
	MOVD	R3, R7			// Copy of string addr
	MOVD	$16, R16		// Index value 16
	MOVD	$17, R17		// Index value 17
	MOVD	$18, R18		// Index value 18
	MOVD	$1, R19			// Index value 1
	MOVD	$byteswap<>+00(SB), R8
	VSPLTISB $0xFF, ONES		// splat all 1s
	LXVD2X	(R8)(R0), SWAP_		// Set up swap string

	CMP	R6, $16, CR4		// CR4 for len(sep) >= 16
	VOR	ONES, ONES, SEPMASK	// Set up full SEPMASK
	BGE	CR4, loadge16		// Load for len(sep) >= 16
	SUB	R6, R16, R9		// 16-len of sep
	SLD	$3, R9			// Set up for VSLO
	MTVSRD	R9, V9			// Set up for VSLO
	VSLDOI	$8, V9, V9, V9		// Set up for VSLO
	VSLO	ONES, V9, SEPMASK	// Mask for separator len(sep) < 16

loadge16:
	ANDCC	$15, R5, R9		// Find byte offset of sep
	ADD	R9, R6, R10		// Add sep len
	CMP	R10, $16		// Check if sep len+offset > 16
	BGT	sepcross16		// Sep crosses 16 byte boundary

	RLDICR	$0, R5, $59, R8		// Adjust addr to 16 byte container
	VLOADSWAP(R8, R0, V0, V0)	// Load 16 bytes @R8 into V0
	SLD	$3, R9			// Set up shift count for VSLO
	MTVSRD	R9, V8			// Set up shift count for VSLO
	VSLDOI	$8, V8, V8, V8
	VSLO	V0, V8, V0		// Shift by start byte

	VAND	V0, SEPMASK, V0		// Mask separator (< 16)
	BR	index2plus

sepcross16:
	VLOADSWAP(R5, R0, V0, V0)	// Load 16 bytes @R5 into V0

	VAND	V0, SEPMASK, V0		// mask out separator
	BLE	CR4, index2to16
	BR	index17plus		// Handle sep > 16

index2plus:
	CMP	R6, $2			// Check length of sep
	BNE	index3plus		// If not 2, check for 3
	ADD	$16, R7, R9		// Check if next 16 bytes past last
	CMP	R9, LASTBYTE		// compare with last
	BGE	index2to16		// 2 <= len(string) <= 16
	MOVD	$0xff00, R21		// Mask for later
	MTVSRD	R21, V25		// Move to Vreg
	VSPLTH	$3, V25, V31		// Splat mask
	VSPLTH	$0, V0, V1		// Splat 1st 2 bytes of sep
	VSPLTISB $0, V10		// Clear V10

	// First case: 2 byte separator
	// V1: 2 byte separator splatted
	// V2: 16 bytes at addr
	// V3: 16 bytes at addr+1
	// Compare 2 byte separator at start
	// and at start+1. Use VSEL to combine
	// those results to find the first
	// matching start byte, returning
	// that value when found. Loop as
	// long as len(string) > 16
index2loop2:
	VLOADSWAP(R7, R19, V3, V3)	// Load 16 bytes @R7+1 into V3

index2loop:
	VLOADSWAP(R7, R0, V2, V2)	// Load 16 bytes @R7 into V2
	VCMPEQUH V1, V2, V5		// Search for sep
	VCMPEQUH V1, V3, V6		// Search for sep offset by 1
	VSEL	V6, V5, V31, V7		// merge even and odd indices
	VCLZD	V7, V18			// find index of first match
	MFVSRD	V18, R25		// get first value
	CMP	R25, $64		// Found if < 64
	BLT	foundR25		// Return byte index where found
	VSLDOI	$8, V18, V18, V18	// Adjust 2nd value
	MFVSRD	V18, R25		// get second value
	CMP	R25, $64		// Found if < 64
	ADD	$64, R25		// Update byte offset
	BLT	foundR25		// Return value
	ADD	$16, R7			// R7+=16 Update string pointer
	ADD	$17, R7, R9		// R9=R7+17 since loop unrolled
	CMP	R9, LASTBYTE		// Compare addr+17 against last byte
	BLT	index2loop2		// If < last, continue loop
	CMP	R7, LASTBYTE		// Compare updated addr against last byte
	BLT	index2to16		// Bytes remain: handle specially
	VLOADSWAP(R7, R0, V3, V3)	// Load 16 bytes @R7 into V3
	VSLDOI	$1, V3, V10, V3		// Shift left by 1 byte
	BR	index2loop

index3plus:
	CMP	R6, $3			// Check if sep == 3
	BNE	index4plus		// If not check larger
	ADD	$19, R7, R9		// Find bytes for use in this loop
	CMP	R9, LASTBYTE		// Compare against last byte
	BGE	index2to16		// Remaining string 2<=len<=16
	MOVD	$0xff00, R21		// Set up mask for upcoming loop
	MTVSRD	R21, V25		// Move mask to Vreg
	VSPLTH	$3, V25, V31		// Splat mask
	VSPLTH	$0, V0, V1		// Splat 1st two bytes of sep
	VSPLTB	$2, V0, V8		// Splat 3rd byte of sep

	// Loop to process 3 byte separator.
	// string[0:16] is in V2
	// string[16:18] is left justified in V3
	// sep[0:2] splatted in V1
	// sep[2] splatted in V8
	// Build vectors at string, string+1
	// and string+2. Compare string, string+1
	// against first 2 bytes of separator
	// splatted, and string+2 against 3rd
	// byte splatted. Merge the results with
	// VSEL to find the first byte of a match.

	// Special handling for last 16 bytes if the
	// string fits in 16 byte multiple.
index3loop2:
	MOVD	$2, R21			// Set up index for 2
	VSPLTISB $0, V10		// Clear V10
	VLOADSWAP(R7, R21, V3, V3)	// Load 16 bytes @R7+2 into V3
	VSLDOI	$14, V3, V10, V3	// Left justify next 2 bytes

index3loop:
	VLOADSWAP(R7, R0, V2, V2)	// Load with correct order
	VSLDOI	$1, V2, V3, V4		// string[1:17]
	VSLDOI	$2, V2, V3, V9		// string[2:18]
	VCMPEQUH V1, V2, V5		// compare hw even indices
	VCMPEQUH V1, V4, V6		// compare hw odd indices
	VCMPEQUB V8, V9, V10		// compare 3rd to last byte
	VSEL	V6, V5, V31, V7		// Find 1st matching byte using mask
	VAND	V7, V10, V7		// AND matched bytes with matched 3rd byte
	VCLZD	V7, V18			// Find first nonzero indexes
	MFVSRD	V18, R25		// Move 1st doubleword
	CMP	R25, $64		// If < 64 found
	BLT	foundR25		// Return matching index
	VSLDOI	$8, V18, V18, V18	// Move value
	MFVSRD	V18, R25		// Move 2nd doubleword
	CMP	R25, $64		// If < 64 found
	ADD	$64, R25		// Update byte index
	BLT	foundR25		// Return matching index
	ADD	$16, R7			// R7+=16 string ptr
	ADD	$19, R7, R9		// Number of string bytes for loop
	CMP	R9, LASTBYTE		// Compare against last byte of string
	BLT	index3loop2		// If within, continue this loop
	CMP	R7, LASTSTR		// Compare against last start byte
	BLT	index2to16		// Process remainder
	VSPLTISB $0, V3			// Special case for last 16 bytes
	BR	index3loop		// Continue this loop

	// Loop to process 4 byte separator
	// string[0:16] in V2
	// string[16:19] left justified in V3
	// sep[0:4] splatted in V1
	// Set up vectors with strings at offsets
	// 0, 1, 2, 3 and compare against the 4 byte
	// separator also splatted. Use VSEL with the
	// compare results to find the first byte where
	// a separator match is found.
index4plus:
	CMP	R6, $4			// Check if 4 byte separator
	BNE	index5plus		// If not next higher
	ADD	$20, R7, R9		// Check string size to load
	CMP	R9, LASTBYTE		// Verify string length
	BGE	index2to16		// If not large enough, process remaining

	// Set up masks for use with VSEL
	MOVD	$0xff, R21		// Set up mask 0xff000000ff000000...
	SLD	$24, R21
	MTVSRD	R21, V10
	VSPLTW	$1, V10, V29
	VSLDOI	$2, V29, V29, V30	// Mask 0x0000ff000000ff00...
	MOVD	$0xffff, R21
	SLD	$16, R21
	MTVSRD	R21, V10
	VSPLTW	$1, V10, V31		// Mask 0xffff0000ffff0000...
	VSPLTW	$0, V0, V1		// Splat 1st word of separator

index4loop:
	VLOADSWAP(R7, R0, V2, V2)	// Load 16 bytes @R7 into V2

next4:
	VSPLTISB $0, V10		// Clear
	MOVD	$3, R9			// Number of bytes beyond 16
	VLOADSWAP(R7, R9, V3, V3)	// Load 16 bytes @R7+3 into V3
	VSLDOI	$13, V3, V10, V3	// Shift left last 3 bytes
	VSLDOI	$1, V2, V3, V4		// V4=(V2:V3)<<1
	VSLDOI	$2, V2, V3, V9		// V9=(V2:V3)<<2
	VSLDOI	$3, V2, V3, V10		// V10=(V2:V3)<<3
	VCMPEQUW V1, V2, V5		// compare index 0, 4, ... with sep
	VCMPEQUW V1, V4, V6		// compare index 1, 5, ... with sep
	VCMPEQUW V1, V9, V11		// compare index 2, 6, ... with sep
	VCMPEQUW V1, V10, V12		// compare index 3, 7, ... with sep
	VSEL	V6, V5, V29, V13	// merge index 0, 1, 4, 5, using mask
	VSEL	V12, V11, V30, V14	// merge index 2, 3, 6, 7, using mask
	VSEL	V14, V13, V31, V7	// final merge
	VCLZD	V7, V18			// Find first index for each half
	MFVSRD	V18, R25		// Isolate value
	CMP	R25, $64		// If < 64, found
	BLT	foundR25		// Return found index
	VSLDOI	$8, V18, V18, V18	// Move for MFVSRD
	MFVSRD	V18, R25		// Isolate other value
	CMP	R25, $64		// If < 64, found
	ADD	$64, R25		// Update index for high doubleword
	BLT	foundR25		// Return found index
	ADD	$16, R7			// R7+=16 for next string
	ADD	$20, R7, R9		// R7+20 for all bytes to load
	CMP	R9, LASTBYTE		// Past end? Maybe check for extra?
	BLT	index4loop		// If not, continue loop
	CMP	R7, LASTSTR		// Check remainder
	BLE	index2to16		// Process remainder
	BR	notfound		// Not found

index5plus:
	CMP	R6, $16			// Check for sep > 16
	BGT	index17plus		// Handle large sep

	// Assumption is that the separator is smaller than the string at this point
index2to16:
	CMP	R7, LASTSTR		// Compare last start byte
	BGT	notfound		// last takes len(sep) into account

	ADD	$16, R7, R9		// Check for last byte of string
	CMP	R9, LASTBYTE
	BGT	index2to16tail

	// At least 16 bytes of string left
	// Mask the number of bytes in sep
index2to16loop:
	VLOADSWAP(R7, R0, V1, V1)	// Load 16 bytes @R7 into V1

compare:
	VAND	V1, SEPMASK, V2		// Mask out sep size
	VCMPEQUBCC V0, V2, V3		// Compare masked string
	BLT	CR6, found		// All equal
	ADD	$1, R7			// Update ptr to next byte
	CMP	R7, LASTSTR		// Still less than last start byte
	BGT	notfound		// Not found
	ADD	$16, R7, R9		// Verify remaining bytes
	CMP	R9, LASTBYTE		// At least 16
	BLT	index2to16loop		// Try again

	// Less than 16 bytes remaining in string
	// Separator >= 2
index2to16tail:
	ADD	R3, R4, R9		// End of string
	SUB	R7, R9, R9		// Number of bytes left
	ANDCC	$15, R7, R10		// 16 byte offset
	ADD	R10, R9, R11		// offset + len
	CMP	R11, $16		// > 16?
	BLE	short			// Does not cross 16 bytes
	VLOADSWAP(R7, R0, V1, V1)	// Load 16 bytes @R7 into V1
	BR	index2to16next		// Continue on

short:
	RLDICR	$0, R7, $59, R9		// Adjust addr to 16 byte container
	VLOADSWAP(R9, R0, V1, V1)	// Load 16 bytes @R9 into V1
	SLD	$3, R10			// Set up shift
	MTVSRD	R10, V8			// Set up shift
	VSLDOI	$8, V8, V8, V8
	VSLO	V1, V8, V1		// Shift by start byte
	VSPLTISB $0, V25		// Clear for later use

index2to16next:
	VAND	V1, SEPMASK, V2		// Just compare size of sep
	VCMPEQUBCC V0, V2, V3		// Compare sep and partial string
	BLT	CR6, found		// Found
	ADD	$1, R7			// Not found, try next partial string
	CMP	R7, LASTSTR		// Check for end of string
	BGT	notfound		// If at end, then not found
	VSLDOI	$1, V1, V25, V1		// Shift string left by 1 byte
	BR	index2to16next		// Check the next partial string

index17plus:
	CMP	R6, $32			// Check if 17 <= len(sep) <= 32
	BGT	index33plus
	SUB	$16, R6, R9		// Extra > 16
	SLD	$56, R9, R10		// Shift to use in VSLO
	MTVSRD	R10, V9			// Set up for VSLO
	VLOADSWAP(R5, R9, V1, V1)	// Load 16 bytes @R5+R9 into V1
	VSLO	V1, V9, V1		// Shift left
	VSPLTISB $0xff, V7		// Splat 1s
	VSPLTISB $0, V27		// Splat 0

index17to32loop:
	VLOADSWAP(R7, R0, V2, V2)	// Load 16 bytes @R7 into V2

next17:
	VLOADSWAP(R7, R9, V3, V3)	// Load 16 bytes @R7+R9 into V3
	VSLO	V3, V9, V3		// Shift left
	VCMPEQUB V0, V2, V4		// Compare first 16 bytes
	VCMPEQUB V1, V3, V5		// Compare extra over 16 bytes
	VAND	V4, V5, V6		// Check if both equal
	VCMPEQUBCC V6, V7, V8		// All equal?
	BLT	CR6, found		// Yes
	ADD	$1, R7			// On to next byte
	CMP	R7, LASTSTR		// Check if last start byte
	BGT	notfound		// If too high, not found
	BR	index17to32loop		// Continue

notfound:
	MOVD	$-1, R3			// Return -1 if not found
	RET

index33plus:
	MOVD	$0, (R0)		// Case not implemented
	RET				// Crash before return

foundR25:
	SRD	$3, R25			// Convert from bits to bytes
	ADD	R25, R7			// Add to current string address
	SUB	R3, R7			// Subtract from start of string
	MOVD	R7, R3			// Return byte where found
	RET

found:
	SUB	R3, R7			// Return byte where found
	MOVD	R7, R3
	RET

// indexbodyp9 is the POWER9 (little endian) search body.
// It takes the same register inputs as indexbody and returns
// the byte index of the first match in R3, or -1 when
// len(sep) is 0, len(sep) > len(s), or no match exists.
// len(sep) > 32 is not implemented and deliberately faults.
// When built with GOPPC64_power10, R14 holds len(sep)<<56 and
// LXVLL length-limited loads replace the SEPMASK masking.
TEXT indexbodyp9<>(SB), NOSPLIT|NOFRAME, $0
	CMP	R6, R4			// Compare lengths
	BGT	notfound		// If sep len is > string, notfound
	ADD	R4, R3, LASTBYTE	// find last byte addr
	SUB	R6, LASTBYTE, LASTSTR	// LAST=&s[len(s)-len(sep)] (last valid start index)
	CMP	R6, $0			// Check sep len
	BEQ	notfound		// sep len 0 -- not found
	MOVD	R3, R7			// Copy of string addr
#ifndef GOPPC64_power10
	MOVD	$16, R16		// Index value 16
	MOVD	$17, R17		// Index value 17
	MOVD	$18, R18		// Index value 18
	VSPLTISB $0xFF, ONES		// splat all 1s
	VOR	ONES, ONES, SEPMASK	// Set up full SEPMASK
#else
	SLD	$56, R6, R14		// Set up separator length for LXVLL
#endif
	MOVD	$1, R19			// Index value 1
	CMP	R6, $16, CR4		// CR4 for len(sep) >= 16
	BGE	CR4, loadge16		// Load for len(sep) >= 16
#ifndef GOPPC64_power10
	SUB	R6, R16, R9		// 16-len of sep
	SLD	$3, R9			// Set up for VSLO
	MTVSRD	R9, V9			// Set up for VSLO
	VSLDOI	$8, V9, V9, V9		// Set up for VSLO
	VSLO	ONES, V9, SEPMASK	// Mask for separator len(sep) < 16
#endif
loadge16:
	ANDCC	$15, R5, R9		// Find byte offset of sep
	ADD	R9, R6, R10		// Add sep len
	CMP	R10, $16		// Check if sep len+offset > 16
	BGT	sepcross16		// Sep crosses 16 byte boundary
#ifdef GOPPC64_power10
	LXVLL	R5, R14, V0		// Load separator
#else
	RLDICR	$0, R5, $59, R8		// Adjust addr to 16 byte container
	LXVB16X	(R8)(R0), V0		// Load 16 bytes @R8 into V0
	SLD	$3, R9			// Set up shift count for VSLO
	MTVSRD	R9, V8			// Set up shift count for VSLO
	VSLDOI	$8, V8, V8, V8
	VSLO	V0, V8, V0		// Shift by start byte
	VAND	V0, SEPMASK, V0		// Mask separator (< 16)
#endif
	BR	index2plus
sepcross16:
#ifdef GOPPC64_power10
	LXVLL	R5, R14, V0		// Load separator
#else
	LXVB16X	(R5)(R0), V0		// Load 16 bytes @R5 into V0
	VAND	V0, SEPMASK, V0		// mask out separator
#endif
	BLE	CR4, index2to16
	BR	index17plus		// Handle sep > 16

index2plus:
	CMP	R6, $2			// Check length of sep
	BNE	index3plus		// If not 2, check for 3
	ADD	$16, R7, R9		// Check if next 16 bytes past last
	CMP	R9, LASTBYTE		// compare with last
	BGE	index2to16		// 2 <= len(string) <= 16
	MOVD	$0xff00, R21		// Mask for later
	MTVSRD	R21, V25		// Move to Vreg
	VSPLTH	$3, V25, V31		// Splat mask
	VSPLTH	$0, V0, V1		// Splat 1st 2 bytes of sep
	VSPLTISB $0, V10		// Clear V10

	// First case: 2 byte separator
	// V1: 2 byte separator splatted
	// V2: 16 bytes at addr
	// V3: 16 bytes at addr+1
	// Compare 2 byte separator at start
	// and at start+1. Use VSEL to combine
	// those results to find the first
	// matching start byte, returning
	// that value when found. Loop as
	// long as len(string) > 16
index2loop2:
	LXVB16X	(R7)(R19), V3		// Load 16 bytes @R7+1 into V3

index2loop:
	LXVB16X	(R7)(R0), V2		// Load 16 bytes @R7 into V2
	VCMPEQUH V1, V2, V5		// Search for sep
	VCMPEQUH V1, V3, V6		// Search for sep offset by 1
	VSEL	V6, V5, V31, V7		// merge even and odd indices
	VCLZD	V7, V18			// find index of first match
	MFVSRD	V18, R25		// get first value
	CMP	R25, $64		// Found if < 64
	BLT	foundR25		// Return byte index where found

	MFVSRLD	V18, R25		// get second value
	CMP	R25, $64		// Found if < 64
	ADD	$64, R25		// Update byte offset
	BLT	foundR25		// Return value
	ADD	$16, R7			// R7+=16 Update string pointer
	ADD	$17, R7, R9		// R9=R7+17 since loop unrolled
	CMP	R9, LASTBYTE		// Compare addr+17 against last byte
	BLT	index2loop2		// If < last, continue loop
	CMP	R7, LASTBYTE		// Compare updated addr against last byte
	BLT	index2to16		// Bytes remain: handle specially
	LXVB16X	(R7)(R0), V3		// Load 16 bytes @R7 into V3
	VSLDOI	$1, V3, V10, V3		// Shift left by 1 byte
	BR	index2loop

index3plus:
	CMP	R6, $3			// Check if sep == 3
	BNE	index4plus		// If not check larger
	ADD	$19, R7, R9		// Find bytes for use in this loop
	CMP	R9, LASTBYTE		// Compare against last byte
	BGE	index2to16		// Remaining string 2<=len<=16
	MOVD	$0xff00, R21		// Set up mask for upcoming loop
	MTVSRD	R21, V25		// Move mask to Vreg
	VSPLTH	$3, V25, V31		// Splat mask
	VSPLTH	$0, V0, V1		// Splat 1st two bytes of sep
	VSPLTB	$2, V0, V8		// Splat 3rd byte of sep

	// Loop to process 3 byte separator.
	// string[0:16] is in V2
	// string[16:18] is left justified in V3
	// sep[0:2] splatted in V1
	// sep[2] splatted in V8
	// Build vectors at string, string+1
	// and string+2. Compare string, string+1
	// against first 2 bytes of separator
	// splatted, and string+2 against 3rd
	// byte splatted. Merge the results with
	// VSEL to find the first byte of a match.

	// Special handling for last 16 bytes if the
	// string fits in 16 byte multiple.
index3loop2:
	MOVD	$2, R21			// Set up index for 2
	VSPLTISB $0, V10		// Clear V10
	LXVB16X	(R7)(R21), V3		// Load 16 bytes @R7+2 into V3
	VSLDOI	$14, V3, V10, V3	// Left justify next 2 bytes

index3loop:
	LXVB16X	(R7)(R0), V2		// Load 16 bytes @R7
	VSLDOI	$1, V2, V3, V4		// string[1:17]
	VSLDOI	$2, V2, V3, V9		// string[2:18]
	VCMPEQUH V1, V2, V5		// compare hw even indices
	VCMPEQUH V1, V4, V6		// compare hw odd indices
	VCMPEQUB V8, V9, V10		// compare 3rd to last byte
	VSEL	V6, V5, V31, V7		// Find 1st matching byte using mask
	VAND	V7, V10, V7		// AND matched bytes with matched 3rd byte
	VCLZD	V7, V18			// Find first nonzero indexes
	MFVSRD	V18, R25		// Move 1st doubleword
	CMP	R25, $64		// If < 64 found
	BLT	foundR25		// Return matching index

	MFVSRLD	V18, R25		// Move 2nd doubleword
	CMP	R25, $64		// If < 64 found
	ADD	$64, R25		// Update byte index
	BLT	foundR25		// Return matching index
	ADD	$16, R7			// R7+=16 string ptr
	ADD	$19, R7, R9		// Number of string bytes for loop
	CMP	R9, LASTBYTE		// Compare against last byte of string
	BLT	index3loop2		// If within, continue this loop
	CMP	R7, LASTSTR		// Compare against last start byte
	BLT	index2to16		// Process remainder
	VSPLTISB $0, V3			// Special case for last 16 bytes
	BR	index3loop		// Continue this loop

	// Loop to process 4 byte separator
	// string[0:16] in V2
	// string[16:19] left justified in V3
	// sep[0:4] splatted in V1
	// Set up vectors with strings at offsets
	// 0, 1, 2, 3 and compare against the 4 byte
	// separator also splatted. Use VSEL with the
	// compare results to find the first byte where
	// a separator match is found.
index4plus:
	CMP	R6, $4			// Check if 4 byte separator
	BNE	index5plus		// If not next higher
	ADD	$20, R7, R9		// Check string size to load
	CMP	R9, LASTBYTE		// Verify string length
	BGE	index2to16		// If not large enough, process remaining

	// Set up masks for use with VSEL
	MOVD	$0xff, R21		// Set up mask 0xff000000ff000000...
	SLD	$24, R21
	MTVSRWS	R21, V29

	VSLDOI	$2, V29, V29, V30	// Mask 0x0000ff000000ff00...
	MOVD	$0xffff, R21
	SLD	$16, R21
	MTVSRWS	R21, V31		// Mask 0xffff0000ffff0000...

	VSPLTW	$0, V0, V1		// Splat 1st word of separator

index4loop:
	LXVB16X	(R7)(R0), V2		// Load 16 bytes @R7 into V2

next4:
	VSPLTISB $0, V10		// Clear
	MOVD	$3, R9			// Number of bytes beyond 16
	LXVB16X	(R7)(R9), V3		// Load 16 bytes @R7+3 into V3
	VSLDOI	$13, V3, V10, V3	// Shift left last 3 bytes
	VSLDOI	$1, V2, V3, V4		// V4=(V2:V3)<<1
	VSLDOI	$2, V2, V3, V9		// V9=(V2:V3)<<2
	VSLDOI	$3, V2, V3, V10		// V10=(V2:V3)<<3
	VCMPEQUW V1, V2, V5		// compare index 0, 4, ... with sep
	VCMPEQUW V1, V4, V6		// compare index 1, 5, ... with sep
	VCMPEQUW V1, V9, V11		// compare index 2, 6, ... with sep
	VCMPEQUW V1, V10, V12		// compare index 3, 7, ... with sep
	VSEL	V6, V5, V29, V13	// merge index 0, 1, 4, 5, using mask
	VSEL	V12, V11, V30, V14	// merge index 2, 3, 6, 7, using mask
	VSEL	V14, V13, V31, V7	// final merge
	VCLZD	V7, V18			// Find first index for each half
	MFVSRD	V18, R25		// Isolate value
	CMP	R25, $64		// If < 64, found
	BLT	foundR25		// Return found index

	MFVSRLD	V18, R25		// Isolate other value
	CMP	R25, $64		// If < 64, found
	ADD	$64, R25		// Update index for high doubleword
	BLT	foundR25		// Return found index
	ADD	$16, R7			// R7+=16 for next string
	ADD	$20, R7, R9		// R7+20 for all bytes to load
	CMP	R9, LASTBYTE		// Past end? Maybe check for extra?
	BLT	index4loop		// If not, continue loop
	CMP	R7, LASTSTR		// Check remainder
	BLE	index2to16		// Process remainder
	BR	notfound		// Not found

index5plus:
	CMP	R6, $16			// Check for sep > 16
	BGT	index17plus		// Handle large sep

	// Assumption is that the separator is smaller than the string at this point
index2to16:
	CMP	R7, LASTSTR		// Compare last start byte
	BGT	notfound		// last takes len(sep) into account

	ADD	$19, R7, R9		// To check 4 indices per iteration, need at least 16+3 bytes
	CMP	R9, LASTBYTE
	// The vector clear below sits between the compare and its
	// branch; the BGT still tests the CMP above.
	VSPLTISB $0, V10		// Clear
	BGT	index2to16tail		// Fewer than 19 bytes left

	// At least 19 bytes of string left
	// Mask the number of bytes in sep
#ifdef GOPPC64_power10
	ADD	$3,R7, R17		// Base+3
	ADD	$2,R7, R8		// Base+2
	ADD	$1,R7, R10		// Base+1
#else
	MOVD	$3, R17			// Number of bytes beyond 16
#endif
	PCALIGN	$16

index2to16loop:

#ifdef GOPPC64_power10
	LXVLL	R7, R14, V8		// Load next len(sep) bytes of string from Base
	LXVLL	R10, R14, V9		// Load next len(sep) bytes of string from Base+1
	LXVLL	R8, R14, V11		// Load next len(sep) bytes of string from Base+2
	LXVLL	R17,R14, V12		// Load next len(sep) bytes of string from Base+3
#else
	LXVB16X	(R7)(R0), V1		// Load next 16 bytes of string into V1 from R7
	LXVB16X	(R7)(R17), V5		// Load next 16 bytes of string into V5 from R7+3

	VSLDOI	$13, V5, V10, V2	// Shift left last 3 bytes
	VSLDOI	$1, V1, V2, V3		// V3=(V1:V2)<<1
	VSLDOI	$2, V1, V2, V4		// V4=(V1:V2)<<2
	VAND	V1, SEPMASK, V8		// Mask out sep size 0th index
	VAND	V3, SEPMASK, V9		// Mask out sep size 1st index
	VAND	V4, SEPMASK, V11	// Mask out sep size 2nd index
	VAND	V5, SEPMASK, V12	// Mask out sep size 3rd index
#endif
	VCMPEQUBCC V0, V8, V8		// compare masked string
	BLT	CR6, found		// All equal while comparing 0th index
	VCMPEQUBCC V0, V9, V9		// compare masked string
	BLT	CR6, found2		// All equal while comparing 1st index
	VCMPEQUBCC V0, V11, V11		// compare masked string
	BLT	CR6, found3		// All equal while comparing 2nd index
	VCMPEQUBCC V0, V12, V12		// compare masked string
	BLT	CR6, found4		// All equal while comparing 3rd index

	ADD	$4, R7			// Update ptr to next 4 bytes
#ifdef GOPPC64_power10
	ADD	$4, R17			// Update ptr to next 4 bytes
	ADD	$4, R8			// Update ptr to next 4 bytes
	ADD	$4, R10			// Update ptr to next 4 bytes
#endif
	CMP	R7, LASTSTR		// Still less than last start byte
	BGT	notfound		// Not found
	ADD	$19, R7, R9		// Verify remaining bytes
	CMP	R9, LASTBYTE		// length of string at least 19
	BLE	index2to16loop		// Try again, else do post processing and jump to index2to16next
	PCALIGN	$32
	// <19 bytes left, post process the remaining string
index2to16tail:
#ifdef GOPPC64_power10
index2to16next_p10:
	LXVLL	R7,R14, V1		// Load len(sep) bytes @R7 into V1
	VCMPEQUBCC V1, V0, V3		// Compare sep and partial string
	BLT	CR6, found		// Found
	ADD	$1, R7			// Not found, try next partial string
	CMP	R7, LASTSTR		// Check for end of string
	BLE	index2to16next_p10	// Start positions remain, keep looking
	BR	notfound		// Past last start byte, not found
#else
	ADD	R3, R4, R9		// End of string
	SUB	R7, R9, R9		// Number of bytes left
	ANDCC	$15, R7, R10		// 16 byte offset
	ADD	R10, R9, R11		// offset + len
	CMP	R11, $16		// > 16?
	BLE	short			// Does not cross 16 bytes
	LXVB16X	(R7)(R0), V1		// Load 16 bytes @R7 into V1
	CMP	R9, $16			// Post-processing of unrolled loop
	BLE	index2to16next		// continue to index2to16next if <= 16 bytes
	SUB	R16, R9, R10		// R9 should be 18 or 17 hence R10 is 1 or 2
	LXVB16X	(R7)(R10), V9		// Load 16 bytes ending at the end of the string
	CMP	R10, $1			// string length is 17, compare 1 more byte
	BNE	extra2			// string length is 18, compare 2 more bytes
	VSLDOI	$15, V9, V10, V25	// V25 = 17th byte, left justified
	VAND	V1, SEPMASK, V2		// Just compare size of sep
	VCMPEQUBCC V0, V2, V3		// Compare sep and partial string
	BLT	CR6, found		// Found
	ADD	$1, R7			// Not found, try next partial string
	CMP	R7, LASTSTR		// Check for end of string
	BGT	notfound		// If at end, then not found
	VSLDOI	$1, V1, V25, V1		// Shift string left by 1 byte
	BR	index2to16next		// go to remainder loop
extra2:
	VSLDOI	$14, V9, V10, V25	// V25 = 17th and 18th bytes, left justified
	VAND	V1, SEPMASK, V2		// Just compare size of sep
	VCMPEQUBCC V0, V2, V3		// Compare sep and partial string
	BLT	CR6, found		// Found
	ADD	$1, R7			// Not found, try next partial string
	CMP	R7, LASTSTR		// Check for end of string
	BGT	notfound		// If at end, then not found
	VOR	V1, V1, V4		// save remaining string
	VSLDOI	$1, V1, V25, V1		// Shift string left by 1 byte for 17th byte
	VAND	V1, SEPMASK, V2		// Just compare size of sep
	VCMPEQUBCC V0, V2, V3		// Compare sep and partial string
	BLT	CR6, found		// Found
	ADD	$1, R7			// Not found, try next partial string
	CMP	R7, LASTSTR		// Check for end of string
	BGT	notfound		// If at end, then not found
	VSLDOI	$2, V4, V25, V1		// Shift saved string left by 2 bytes for 18th byte
	BR	index2to16next		// Check the remaining partial string in index2to16next

short:
	RLDICR	$0, R7, $59, R9		// Adjust addr to 16 byte container
	LXVB16X	(R9)(R0), V1		// Load 16 bytes @R9 into V1
	SLD	$3, R10			// Set up shift
	MTVSRD	R10, V8			// Set up shift
	VSLDOI	$8, V8, V8, V8
	VSLO	V1, V8, V1		// Shift by start byte
	PCALIGN	$16
index2to16next:
	VAND	V1, SEPMASK, V2		// Just compare size of sep
	VCMPEQUBCC V0, V2, V3		// Compare sep and partial string
	BLT	CR6, found		// Found
	ADD	$1, R7			// Not found, try next partial string
	CMP	R7, LASTSTR		// Check for end of string
	BGT	notfound		// If at end, then not found
	VSLDOI	$1, V1, V10, V1		// Shift string left by 1 byte
	BR	index2to16next		// Check the next partial string
#endif // Tail processing if GOPPC64!=power10

index17plus:
	CMP	R6, $32			// Check if 17 <= len(sep) <= 32
	BGT	index33plus
	SUB	$16, R6, R9		// Extra > 16
	SLD	$56, R9, R10		// Shift to use in VSLO
	MTVSRD	R10, V9			// Set up for VSLO
	LXVB16X	(R5)(R9), V1		// Load 16 bytes @R5+R9 into V1
	VSLO	V1, V9, V1		// Shift left
	VSPLTISB $0xff, V7		// Splat 1s
	VSPLTISB $0, V27		// Splat 0

index17to32loop:
	LXVB16X	(R7)(R0), V2		// Load 16 bytes @R7 into V2

next17:
	LXVB16X	(R7)(R9), V3		// Load 16 bytes @R7+R9 into V3
	VSLO	V3, V9, V3		// Shift left
	VCMPEQUB V0, V2, V4		// Compare first 16 bytes
	VCMPEQUB V1, V3, V5		// Compare extra over 16 bytes
	VAND	V4, V5, V6		// Check if both equal
	VCMPEQUBCC V6, V7, V8		// All equal?
	BLT	CR6, found		// Yes
	ADD	$1, R7			// On to next byte
	CMP	R7, LASTSTR		// Check if last start byte
	BGT	notfound		// If too high, not found
	BR	index17to32loop		// Continue

notfound:
	MOVD	$-1, R3			// Return -1 if not found
	RET

index33plus:
	MOVD	$0, (R0)		// Case not implemented
	RET				// Crash before return

foundR25:
	SRD	$3, R25			// Convert from bits to bytes
	ADD	R25, R7			// Add to current string address
	SUB	R3, R7			// Subtract from start of string
	MOVD	R7, R3			// Return byte where found
	RET
found4:
	ADD	$1, R7			// found from unrolled loop at index 3
found3:
	ADD	$1, R7			// found from unrolled loop at index 2
found2:
	ADD	$1, R7			// found from unrolled loop at index 1
found:					// found at index 0
	SUB	R3, R7			// Return byte where found
	MOVD	R7, R3
	RET