github.com/JimmyHuang454/JLS-go@v0.0.0-20230831150107-90d536585ba0/internal/bytealg/index_ppc64x.s (about)

     1  // Copyright 2021 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This is an implementation based on the s390x
     6  // implementation.
     7  
     8  // Find a separator with 2 <= len <= 32 within a string.
     9  // Separators with lengths of 2, 3 or 4 are handled
    10  // specially.
    11  
    12  // This works on power8 and above. The loads and
    13  // compares are done in big endian order
    14  // since that allows the use of VCLZD, and allows
    15  // the same implementation to work on big and little
    16  // endian platforms with minimal conditional changes.
    17  
    18  // NOTE: There is a power9 implementation that
    19  // improves performance by 10-15% on little
    20  // endian for some of the benchmarks.
    21  // The index2to16 loop is unrolled by 4 on ppc64le/power9.
    22  // Work is still needed for a big endian
    23  // implementation on power9.
    24  
    25  //go:build ppc64 || ppc64le
    26  
    27  #include "go_asm.h"
    28  #include "textflag.h"
    29  
    30  // byteswap<> is the 16-byte VPERM control vector that indexbody loads into SWAP.
    31  // VLOADSWAP uses it to put LXVD2X loads into the correct byte order on POWER8.
    32  // GOARCH_ppc64 builds store ascending byte indices; all other builds store each doubleword's indices in descending order.
    33  #ifdef GOARCH_ppc64
    34  DATA byteswap<>+0(SB)/8, $0x0001020304050607
    35  DATA byteswap<>+8(SB)/8, $0x08090a0b0c0d0e0f
    36  #else
    37  DATA byteswap<>+0(SB)/8, $0x0706050403020100
    38  DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
    39  #endif
    40  
    41  // VLOADSWAP: load 16 bytes from base+index with LXVD2X into vsreg, then VPERM vreg through SWAP so the
    42  // bytes are in big endian order. Address alignment does not need checking. SWAP must already hold byteswap<>.
    43  #define VLOADSWAP(base, index, vreg, vsreg) \
    44  	LXVD2X (base)(index), vsreg;  \
    45  	VPERM  vreg, vreg, SWAP, vreg
    46  
    47  GLOBL byteswap<>+0(SB), RODATA, $16
    48  
    49  TEXT ·Index<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
    50  	// Entry point for Index. On entry: R3 = byte array pointer,
    51  	// R4 = length, R6 = separator pointer, R7 = separator length.
    52  	MOVD R6, R5             // R5 = separator pointer (must be copied before R6 is overwritten)
    53  	MOVD R7, R6             // R6 = separator length; R7 is now free for scratch use
    54  	// On ppc64le builds choose the POWER9 body when the feature byte equals 1; every other case uses indexbody.
    55  #ifdef GOARCH_ppc64le
    56  	MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7 // R7 = byte at offset const_offsetPPC64HasPOWER9 of internal/cpu.PPC64
    57  	CMP   R7, $1                                                // Is the flag byte 1?
    58  	BNE   power8                                                // No: use the POWER8 body
    59  	BR    indexbodyp9<>(SB)                                     // Yes: branch (not call) to the POWER9 body
    60  #endif
    61  power8:
    62  	BR indexbody<>(SB) // Branch (not call) to the POWER8 body, whose RET returns to our caller
    63  
    64  TEXT ·IndexString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
    65  	// Entry point for IndexString. R3 = string
    66  	// R4 = length
    67  	// R5 = separator pointer
    68  	// R6 = separator length (registers already match what the shared bodies expect, so nothing is moved)
    69  	// On ppc64le builds choose the POWER9 body when the feature byte equals 1; every other case uses indexbody.
    70  #ifdef GOARCH_ppc64le
    71  	MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7 // R7 = byte at offset const_offsetPPC64HasPOWER9 of internal/cpu.PPC64
    72  	CMP   R7, $1                                                // Is the flag byte 1?
    73  	BNE   power8                                                // No: use the POWER8 body
    74  	BR    indexbodyp9<>(SB)                                     // Yes: branch (not call) to the POWER9 body
    75  
    76  #endif
    77  power8:
    78  	BR indexbody<>(SB) // Branch (not call) to the POWER8 body, whose RET returns to our caller
    79  
    80  	// indexbody<>: POWER8 search body shared by Index and IndexString. s: string we are searching
    81  	// sep: string to search for
    82  	// R3=&s[0], R4=len(s)
    83  	// R5=&sep[0], R6=len(sep)
    84  	// R3=result on return: byte index of the first match, or -1 if there is none
    85  	// R7=working addr of string
    86  	// R16=index value 16
    87  	// R17=index value 17
    88  	// R18=index value 18
    89  	// R19=index value 1
    90  	// R26=LASTBYTE: &s[len(s)], the address one past the end of the string
    91  	// R27=LASTSTR: &s[len(s)-len(sep)], last start byte to compare with sep
    92  	// R8, R9, R10, R11, R21, R25 scratch
    93  	// V0=sep left justified zero fill (the first 16 bytes of sep when len(sep) >= 16)
    94  	// CR4=result of comparing len(sep) with 16
    95  
    96  #define SEPMASK V17
    97  #define LASTBYTE R26
    98  #define LASTSTR R27
    99  #define ONES V20
   100  #define SWAP V21
   101  #define SWAP_ VS53
   102  TEXT indexbody<>(SB), NOSPLIT|NOFRAME, $0
   103  	CMP      R6, R4                 // Compare lengths
   104  	BGT      notfound               // If sep len is > string, notfound
   105  	ADD      R4, R3, LASTBYTE       // LASTBYTE=&s[len(s)], one past the final byte
   106  	SUB      R6, LASTBYTE, LASTSTR  // LASTSTR=&s[len(s)-len(sep)] (last valid start index)
   107  	CMP      R6, $0                 // Check sep len
   108  	BEQ      notfound               // sep len 0 -- not found
   109  	MOVD     R3, R7                 // Copy of string addr
   110  	MOVD     $16, R16               // Index value 16
   111  	MOVD     $17, R17               // Index value 17
   112  	MOVD     $18, R18               // Index value 18
   113  	MOVD     $1, R19                // Index value 1
   114  	MOVD     $byteswap<>+00(SB), R8 // R8 = address of the byteswap<> table
   115  	VSPLTISB $0xFF, ONES            // splat all 1s
   116  	LXVD2X   (R8)(R0), SWAP_        // Load the table into SWAP (via its VSX name) for VLOADSWAP
   117  
   118  	CMP    R6, $16, CR4        // CR4 for len(sep) >= 16
   119  	VOR    ONES, ONES, SEPMASK // Set up full SEPMASK
   120  	BGE    CR4, loadge16       // len(sep) >= 16: keep the full mask
   121  	SUB    R6, R16, R9         // R9 = 16-len(sep), the bytes to mask off
   122  	SLD    $3, R9              // R9 *= 8: byte count as a bit count for VSLO
   123  	MTVSRD R9, V9              // Move the count into V9
   124  	VSLDOI $8, V9, V9, V9      // Rotate V9 by 8 bytes to position the count for VSLO
   125  	VSLO   ONES, V9, SEPMASK   // SEPMASK = 0xFF in the first len(sep) bytes, 0 after (len(sep) < 16)
   126  
   127  loadge16:
   128  	ANDCC $15, R5, R9 // R9 = byte offset of sep within its 16 byte block
   129  	ADD   R9, R6, R10 // Add sep len
   130  	CMP   R10, $16    // Check if sep len+offset > 16
   131  	BGT   sepcross16  // Sep crosses 16 byte boundary
   132  
   133  	RLDICR $0, R5, $59, R8 // R8 = R5 with the low 4 bits cleared (16 byte container)
   134  	VLOADSWAP(R8, R0, V0, V0) // Load 16 bytes @R8 into V0
   135  	SLD    $3, R9          // Offset in bits: shift count for VSLO
   136  	MTVSRD R9, V8         // Set up shift count for VSLO
   137  	VSLDOI $8, V8, V8, V8  // Rotate V8 by 8 bytes to position the count for VSLO
   138  	VSLO   V0, V8, V0      // Shift by start byte so sep[0] is leftmost
   139  
   140  	VAND V0, SEPMASK, V0 // Mask separator (< 16)
   141  	BR   index2plus      // Dispatch on len(sep)
   142  
   143  sepcross16:
   144  	VLOADSWAP(R5, R0, V0, V0)  // Load 16 bytes @R5 into V0
   145  
   146  	VAND V0, SEPMASK, V0 // mask out separator
   147  	BLE  CR4, index2to16 // len(sep) <= 16: generic compare loop
   148  	BR   index17plus     // Handle sep > 16
   149  
   150  index2plus:
   151  	CMP      R6, $2       // Check length of sep
   152  	BNE      index3plus   // If not 2, check for 3
   153  	ADD      $16, R7, R9  // Check if next 16 bytes past last
   154  	CMP      R9, LASTBYTE // compare with last
   155  	BGE      index2to16   // 2 <= len(string) <= 16
   156  	MOVD     $0xff00, R21 // Mask for later
   157  	MTVSRD   R21, V25     // Move to Vreg
   158  	VSPLTH   $3, V25, V31 // Splat mask: V31 = 0xff00 in every halfword
   159  	VSPLTH   $0, V0, V1   // Splat 1st 2 bytes of sep
   160  	VSPLTISB $0, V10      // Clear V10
   161  
   162  	// First case: 2 byte separator
   163  	// V1: 2 byte separator splatted
   164  	// V2: 16 bytes at addr
   165  	// V3: 16 bytes at addr+1
   166  	// Compare 2 byte separator at start
   167  	// and at start+1. Use VSEL to combine
   168  	// those results to find the first
   169  	// matching start byte, returning
   170  	// that value when found. Loop as
   171  	// long as len(string) > 16
   172  index2loop2:
   173  	VLOADSWAP(R7, R19, V3, V3) // Load 16 bytes @R7+1 into V3
   174  
   175  index2loop:
   176  	VLOADSWAP(R7, R0, V2, V2)  // Load 16 bytes @R7 into V2
   177  	VCMPEQUH V1, V2, V5        // Search for sep
   178  	VCMPEQUH V1, V3, V6        // Search for sep offset by 1
   179  	VSEL     V6, V5, V31, V7   // merge even and odd indices
   180  	VCLZD    V7, V18           // find index of first match
   181  	MFVSRD   V18, R25          // get first value
   182  	CMP      R25, $64          // Found if < 64
   183  	BLT      foundR25          // Return byte index where found
   184  	VSLDOI   $8, V18, V18, V18 // Adjust 2nd value
   185  	MFVSRD   V18, R25          // get second value
   186  	CMP      R25, $64          // Found if < 64
   187  	ADD      $64, R25          // Update byte offset (the branch below still uses the CMP result)
   188  	BLT      foundR25          // Return value
   189  	ADD      $16, R7           // R7+=16 Update string pointer
   190  	ADD      $17, R7, R9       // R9=R7+17 since loop unrolled
   191  	CMP      R9, LASTBYTE      // Compare addr+17 against last byte
   192  	BLT      index2loop2       // If < last, continue loop
   193  	CMP      R7, LASTBYTE      // Compare the advanced pointer against the end of the string
   194  	BLT      index2to16        // Bytes remain: finish there. NOTE(review): seems always taken, since R7+16 < LASTBYTE held at the loop head -- confirm
   195  	VLOADSWAP(R7, R0, V3, V3) // Load 16 bytes @R7 into V3
   196  	VSLDOI   $1, V3, V10, V3   // Shift left by 1 byte
   197  	BR       index2loop
   198  
   199  index3plus:
   200  	CMP    R6, $3       // Check if sep == 3
   201  	BNE    index4plus   // If not check larger
   202  	ADD    $19, R7, R9  // Find bytes for use in this loop
   203  	CMP    R9, LASTBYTE // Compare against last byte
   204  	BGE    index2to16   // No more than 19 bytes of string: use the generic loop
   205  	MOVD   $0xff00, R21 // Set up mask for upcoming loop
   206  	MTVSRD R21, V25     // Move mask to Vreg
   207  	VSPLTH $3, V25, V31 // Splat mask: V31 = 0xff00 in every halfword
   208  	VSPLTH $0, V0, V1   // Splat 1st two bytes of sep
   209  	VSPLTB $2, V0, V8   // Splat 3rd byte of sep
   210  
   211  	// Loop to process 3 byte separator.
   212  	// string[0:16] is in V2
   213  	// string[16:18], left justified, is in V3
   214  	// sep[0:2] splatted in V1
   215  	// sep[2] splatted in V8
   216  	// Load vectors at string, string+1
   217  	// and string+2. Compare string, string+1
   218  	// against first 2 bytes of separator
   219  	// splatted, and string+2 against 3rd
   220  	// byte splatted. Merge the results with
   221  	// VSEL to find the first byte of a match.
   222  
   223  	// Special handling for last 16 bytes if the
   224  	// string fits in 16 byte multiple.
   225  index3loop2:
   226  	MOVD     $2, R21          // Set up index for 2
   227  	VSPLTISB $0, V10          // Clear V10
   228  	VLOADSWAP(R7, R21, V3, V3)// Load 16 bytes @R7+2 into V3
   229  	VSLDOI   $14, V3, V10, V3 // Left justify next 2 bytes
   230  
   231  index3loop:
   232  	VLOADSWAP(R7, R0, V2, V2)  // Load with correct order
   233  	VSLDOI   $1, V2, V3, V4    // string[1:17]
   234  	VSLDOI   $2, V2, V3, V9    // string[2:18]
   235  	VCMPEQUH V1, V2, V5        // compare hw even indices
   236  	VCMPEQUH V1, V4, V6        // compare hw odd indices
   237  	VCMPEQUB V8, V9, V10       // compare 3rd to last byte
   238  	VSEL     V6, V5, V31, V7   // Find 1st matching byte using mask
   239  	VAND     V7, V10, V7       // AND matched bytes with matched 3rd byte
   240  	VCLZD    V7, V18           // Find first nonzero indexes
   241  	MFVSRD   V18, R25          // Move 1st doubleword
   242  	CMP      R25, $64          // If < 64 found
   243  	BLT      foundR25          // Return matching index
   244  	VSLDOI   $8, V18, V18, V18 // Move value
   245  	MFVSRD   V18, R25          // Move 2nd doubleword
   246  	CMP      R25, $64          // If < 64 found
   247  	ADD      $64, R25          // Update byte index (the branch below still uses the CMP result)
   248  	BLT      foundR25          // Return matching index
   249  	ADD      $16, R7           // R7+=16 string ptr
   250  	ADD      $19, R7, R9       // Number of string bytes for loop
   251  	CMP      R9, LASTBYTE      // Compare against last byte of string
   252  	BLT      index3loop2       // If within, continue this loop
   253  	CMP      R7, LASTSTR       // Compare against last start byte
   254  	BLT      index2to16        // Process remainder. NOTE(review): seems always taken, since R7+19 < LASTBYTE held at the loop head -- confirm
   255  	VSPLTISB $0, V3            // Special case for last 16 bytes
   256  	BR       index3loop        // Continue this loop
   257  
   258  	// Loop to process 4 byte separator
   259  	// string[0:16] in V2
   260  	// string[16:19], left justified, in V3
   261  	// sep[0:4] splatted in V1
   262  	// Set up vectors with strings at offsets
   263  	// 0, 1, 2, 3 and compare against the 4 byte
   264  	// separator also splatted. Use VSEL with the
   265  	// compare results to find the first byte where
   266  	// a separator match is found.
   267  index4plus:
   268  	CMP  R6, $4       // Check if 4 byte separator
   269  	BNE  index5plus   // If not next higher
   270  	ADD  $20, R7, R9  // Check string size to load
   271  	CMP  R9, LASTBYTE // Verify string length
   272  	BGE  index2to16   // If not large enough, process remaining
   273  	MOVD $2, R15      // R15 = 2 (not referenced again in this body)
   274  
   275  	// Set up masks for use with VSEL
   276  	MOVD   $0xff, R21        // Set up mask 0xff000000ff000000...
   277  	SLD    $24, R21          // R21 = 0xff000000
   278  	MTVSRD R21, V10          // Move to Vreg
   279  	VSPLTW $1, V10, V29      // V29 = 0xff000000 in every word
   280  	VSLDOI $2, V29, V29, V30 // Mask 0x0000ff000000ff00...
   281  	MOVD   $0xffff, R21
   282  	SLD    $16, R21          // R21 = 0xffff0000
   283  	MTVSRD R21, V10          // Move to Vreg
   284  	VSPLTW $1, V10, V31      // Mask 0xffff0000ffff0000...
   285  	VSPLTW $0, V0, V1        // Splat 1st word of separator
   286  
   287  index4loop:
   288  	VLOADSWAP(R7, R0, V2, V2)   // Load 16 bytes @R7 into V2
   289  
   290  next4:
   291  	VSPLTISB $0, V10            // Clear
   292  	MOVD     $3, R9             // Number of bytes beyond 16
   293  	VLOADSWAP(R7, R9, V3, V3)   // Load 16 bytes @R7+3 into V3
   294  	VSLDOI   $13, V3, V10, V3   // Shift left last 3 bytes
   295  	VSLDOI   $1, V2, V3, V4     // V4=(V2:V3)<<1
   296  	VSLDOI   $2, V2, V3, V9     // V9=(V2:V3)<<2
   297  	VSLDOI   $3, V2, V3, V10    // V10=(V2:V3)<<3
   298  	VCMPEQUW V1, V2, V5         // compare index 0, 4, ... with sep
   299  	VCMPEQUW V1, V4, V6         // compare index 1, 5, ... with sep
   300  	VCMPEQUW V1, V9, V11        // compare index 2, 6, ... with sep
   301  	VCMPEQUW V1, V10, V12       // compare index 3, 7, ... with sep
   302  	VSEL     V6, V5, V29, V13   // merge index 0, 1, 4, 5, using mask
   303  	VSEL     V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask
   304  	VSEL     V14, V13, V31, V7  // final merge
   305  	VCLZD    V7, V18            // Find first index for each half
   306  	MFVSRD   V18, R25           // Isolate value
   307  	CMP      R25, $64           // If < 64, found
   308  	BLT      foundR25           // Return found index
   309  	VSLDOI   $8, V18, V18, V18  // Move for MFVSRD
   310  	MFVSRD   V18, R25           // Isolate other value
   311  	CMP      R25, $64           // If < 64, found
   312  	ADD      $64, R25           // Update index for high doubleword (the branch below still uses the CMP result)
   313  	BLT      foundR25           // Return found index
   314  	ADD      $16, R7            // R7+=16 for next string
   315  	ADD      $20, R7, R9        // R9=R7+20 for all bytes to load
   316  	CMP      R9, LASTBYTE       // Past end? Maybe check for extra?
   317  	BLT      index4loop         // If not, continue loop
   318  	CMP      R7, LASTSTR        // Check remainder
   319  	BLE      index2to16         // Process remainder. NOTE(review): seems always taken, since R7+20 < LASTBYTE held at the loop head -- confirm
   320  	BR       notfound           // Not found
   321  
   322  index5plus:
   323  	CMP R6, $16     // Check for sep > 16
   324  	BGT index17plus // Handle large sep
   325  
   326  	// Assumption is that the separator is smaller than the string at this point
   327  index2to16:
   328  	CMP R7, LASTSTR // Compare last start byte
   329  	BGT notfound    // last takes len(sep) into account
   330  
   331  	ADD $16, R7, R9    // Check for last byte of string
   332  	CMP R9, LASTBYTE
   333  	BGT index2to16tail // Fewer than 16 bytes of string left: use the tail code
   334  
   335  	// At least 16 bytes of string left
   336  	// Mask the number of bytes in sep
   337  index2to16loop:
   338  	VLOADSWAP(R7, R0, V1, V1)  // Load 16 bytes @R7 into V1
   339  
   340  compare:
   341  	VAND       V1, SEPMASK, V2 // Mask out sep size
   342  	VCMPEQUBCC V0, V2, V3      // Compare masked string
   343  	BLT        CR6, found      // All equal
   344  	ADD        $1, R7          // Update ptr to next byte
   345  	CMP        R7, LASTSTR     // Still less than last start byte
   346  	BGT        notfound        // Not found
   347  	ADD        $16, R7, R9     // Verify remaining bytes
   348  	CMP        R9, LASTBYTE    // More than 16 left?
   349  	BLT        index2to16loop  // Try again
   350  
   351  	// 16 or fewer bytes remaining in string
   352  	// Separator >= 2
   353  index2to16tail:
   354  	ADD   R3, R4, R9     // End of string
   355  	SUB   R7, R9, R9     // Number of bytes left
   356  	ANDCC $15, R7, R10   // 16 byte offset
   357  	ADD   R10, R9, R11   // offset + len
   358  	CMP   R11, $16       // > 16?
   359  	BLE   short          // Does not cross 16 bytes
   360  	VLOADSWAP(R7, R0, V1, V1) // Load 16 bytes @R7 into V1 (the string itself extends into the next 16 byte block)
   361  	BR    index2to16next // Continue on
   362  
   363  short:
   364  	RLDICR   $0, R7, $59, R9 // Adjust addr to 16 byte container
   365  	VLOADSWAP(R9, R0, V1, V1)// Load 16 bytes @R9 into V1
   366  	SLD      $3, R10         // Offset in bits: shift count for VSLO
   367  	MTVSRD   R10, V8         // Set up shift
   368  	VSLDOI   $8, V8, V8, V8  // Rotate V8 by 8 bytes to position the count for VSLO
   369  	VSLO     V1, V8, V1      // Shift by start byte
   370  	VSPLTISB $0, V25         // Clear for later use
   371  
   372  index2to16next:
   373  	VAND       V1, SEPMASK, V2 // Just compare size of sep
   374  	VCMPEQUBCC V0, V2, V3      // Compare sep and partial string
   375  	BLT        CR6, found      // Found
   376  	ADD        $1, R7          // Not found, try next partial string
   377  	CMP        R7, LASTSTR     // Check for end of string
   378  	BGT        notfound        // If at end, then not found
   379  	VSLDOI     $1, V1, V25, V1 // Shift string left by 1 byte
   380  	BR         index2to16next  // Check the next partial string
   381  
   382  index17plus:
   383  	CMP      R6, $32      // Check if 17 <= len(sep) <= 32
   384  	BGT      index33plus
   385  	SUB      $16, R6, R9  // R9 = len(sep)-16, the extra bytes beyond 16
   386  	SLD      $56, R9, R10 // Shift to use in VSLO
   387  	MTVSRD   R10, V9      // Set up for VSLO (built differently from the SLD $3 + VSLDOI $8 setups above -- TODO confirm the intended shift)
   388  	VLOADSWAP(R5, R9, V1, V1)// Load 16 bytes @R5+R9 into V1 (the last 16 bytes of sep)
   389  	VSLO     V1, V9, V1   // Shift left
   390  	VSPLTISB $0xff, V7    // Splat 1s
   391  	VSPLTISB $0, V27      // Splat 0 (V27 is not referenced again in this body)
   392  
   393  index17to32loop:
   394  	VLOADSWAP(R7, R0, V2, V2)  // Load 16 bytes @R7 into V2
   395  
   396  next17:
   397  	VLOADSWAP(R7, R9, V3, V3)  // Load 16 bytes @R7+R9 into V3
   398  	VSLO       V3, V9, V3      // Shift left (same control as used for V1)
   399  	VCMPEQUB   V0, V2, V4      // Compare first 16 bytes
   400  	VCMPEQUB   V1, V3, V5      // Compare extra over 16 bytes
   401  	VAND       V4, V5, V6      // Check if both equal
   402  	VCMPEQUBCC V6, V7, V8      // All equal?
   403  	BLT        CR6, found      // Yes
   404  	ADD        $1, R7          // On to next byte
   405  	CMP        R7, LASTSTR     // Check if last start byte
   406  	BGT        notfound        // If too high, not found
   407  	BR         index17to32loop // Continue
   408  
   409  notfound:
   410  	MOVD $-1, R3   // Return -1 if not found
   411  	RET
   412  
   413  index33plus:
   414  	MOVD $0, (R0) // Case not implemented (len(sep) > 32)
   415  	RET           // Crash before return
   416  
   417  foundR25:
   418  	SRD  $3, R25   // Convert from bits to bytes
   419  	ADD  R25, R7   // Add to current string address
   420  	SUB  R3, R7    // Subtract from start of string
   421  	MOVD R7, R3    // Return byte where found
   422  	RET
   423  
   424  found:
   425  	SUB  R3, R7    // R7 = match address - start of string
   426  	MOVD R7, R3    // Return byte where found
   427  	RET
   428  
   429  TEXT indexbodyp9<>(SB), NOSPLIT|NOFRAME, $0
   430  	CMP      R6, R4                // Compare lengths
   431  	BGT      notfound              // If sep len is > string, notfound
   432  	ADD      R4, R3, LASTBYTE      // find last byte addr
   433  	SUB      R6, LASTBYTE, LASTSTR // LAST=&s[len(s)-len(sep)] (last valid start index)
   434  	CMP      R6, $0                // Check sep len
   435  	BEQ      notfound              // sep len 0 -- not found
   436  	MOVD     R3, R7                // Copy of string addr
   437  	MOVD     $16, R16              // Index value 16
   438  	MOVD     $17, R17              // Index value 17
   439  	MOVD     $18, R18              // Index value 18
   440  	MOVD     $1, R19               // Index value 1
   441  	VSPLTISB $0xFF, ONES           // splat all 1s
   442  
   443  	CMP    R6, $16, CR4        // CR4 for len(sep) >= 16
   444  	VOR    ONES, ONES, SEPMASK // Set up full SEPMASK
   445  	BGE    CR4, loadge16       // Load for len(sep) >= 16
   446  	SUB    R6, R16, R9         // 16-len of sep
   447  	SLD    $3, R9              // Set up for VSLO
   448  	MTVSRD R9, V9              // Set up for VSLO
   449  	VSLDOI $8, V9, V9, V9      // Set up for VSLO
   450  	VSLO   ONES, V9, SEPMASK   // Mask for separator len(sep) < 16
   451  
   452  loadge16:
   453  	ANDCC $15, R5, R9 // Find byte offset of sep
   454  	ADD   R9, R6, R10 // Add sep len
   455  	CMP   R10, $16    // Check if sep len+offset > 16
   456  	BGT   sepcross16  // Sep crosses 16 byte boundary
   457  
   458  	RLDICR  $0, R5, $59, R8 // Adjust addr to 16 byte container
   459  	LXVB16X (R8)(R0), V0    // Load 16 bytes @R8 into V0
   460  	SLD     $3, R9          // Set up shift count for VSLO
   461  	MTVSRD  R9, V8          // Set up shift count for VSLO
   462  	VSLDOI  $8, V8, V8, V8
   463  	VSLO    V0, V8, V0      // Shift by start byte
   464  
   465  	VAND V0, SEPMASK, V0 // Mask separator (< 16)
   466  	BR   index2plus
   467  
   468  sepcross16:
   469  	LXVB16X (R5)(R0), V0 // Load 16 bytes @R5 into V0
   470  
   471  	VAND V0, SEPMASK, V0 // mask out separator
   472  	BLE  CR4, index2to16
   473  	BR   index17plus     // Handle sep > 16
   474  
   475  index2plus:
   476  	CMP      R6, $2       // Check length of sep
   477  	BNE      index3plus   // If not 2, check for 3
   478  	ADD      $16, R7, R9  // Check if next 16 bytes past last
   479  	CMP      R9, LASTBYTE // compare with last
   480  	BGE      index2to16   // 2 <= len(string) <= 16
   481  	MOVD     $0xff00, R21 // Mask for later
   482  	MTVSRD   R21, V25     // Move to Vreg
   483  	VSPLTH   $3, V25, V31 // Splat mask
   484  	VSPLTH   $0, V0, V1   // Splat 1st 2 bytes of sep
   485  	VSPLTISB $0, V10      // Clear V10
   486  
   487  	// First case: 2 byte separator
   488  	// V1: 2 byte separator splatted
   489  	// V2: 16 bytes at addr
   490  	// V4: 16 bytes at addr+1
   491  	// Compare 2 byte separator at start
   492  	// and at start+1. Use VSEL to combine
   493  	// those results to find the first
   494  	// matching start byte, returning
   495  	// that value when found. Loop as
   496  	// long as len(string) > 16
   497  index2loop2:
   498  	LXVB16X (R7)(R19), V3  // Load 16 bytes @R7+1 into V3
   499  
   500  index2loop:
   501  	LXVB16X  (R7)(R0), V2    // Load 16 bytes @R7 into V2
   502  	VCMPEQUH V1, V2, V5      // Search for sep
   503  	VCMPEQUH V1, V3, V6      // Search for sep offset by 1
   504  	VSEL     V6, V5, V31, V7 // merge even and odd indices
   505  	VCLZD    V7, V18         // find index of first match
   506  	MFVSRD   V18, R25        // get first value
   507  	CMP      R25, $64        // Found if < 64
   508  	BLT      foundR25        // Return byte index where found
   509  
   510  	MFVSRLD V18, R25        // get second value
   511  	CMP     R25, $64        // Found if < 64
   512  	ADD     $64, R25        // Update byte offset
   513  	BLT     foundR25        // Return value
   514  	ADD     $16, R7         // R7+=16 Update string pointer
   515  	ADD     $17, R7, R9     // R9=F7+17 since loop unrolled
   516  	CMP     R9, LASTBYTE    // Compare addr+17 against last byte
   517  	BLT     index2loop2     // If < last, continue loop
   518  	CMP     R7, LASTBYTE    // Compare addr+16 against last byte
   519  	BLT     index2to16      // If < 16 handle specially
   520  	LXVB16X (R7)(R0), V3    // Load 16 bytes @R7 into V3
   521  	VSLDOI  $1, V3, V10, V3 // Shift left by 1 byte
   522  	BR      index2loop
   523  
   524  index3plus:
   525  	CMP    R6, $3       // Check if sep == 3
   526  	BNE    index4plus   // If not check larger
   527  	ADD    $19, R7, R9  // Find bytes for use in this loop
   528  	CMP    R9, LASTBYTE // Compare against last byte
   529  	BGE    index2to16   // Remaining string 2<=len<=16
   530  	MOVD   $0xff00, R21 // Set up mask for upcoming loop
   531  	MTVSRD R21, V25     // Move mask to Vreg
   532  	VSPLTH $3, V25, V31 // Splat mask
   533  	VSPLTH $0, V0, V1   // Splat 1st two bytes of sep
   534  	VSPLTB $2, V0, V8   // Splat 3rd byte of sep
   535  
   536  	// Loop to process 3 byte separator.
   537  	// string[0:16] is in V2
   538  	// string[2:18] is in V3
   539  	// sep[0:2] splatted in V1
   540  	// sec[3] splatted in v8
   541  	// Load vectors at string, string+1
   542  	// and string+2. Compare string, string+1
   543  	// against first 2 bytes of separator
   544  	// splatted, and string+2 against 3rd
   545  	// byte splatted. Merge the results with
   546  	// VSEL to find the first byte of a match.
   547  
   548  	// Special handling for last 16 bytes if the
   549  	// string fits in 16 byte multiple.
   550  index3loop2:
   551  	MOVD     $2, R21          // Set up index for 2
   552  	VSPLTISB $0, V10          // Clear V10
   553  	LXVB16X  (R7)(R21), V3    // Load 16 bytes @R7+2 into V3
   554  	VSLDOI   $14, V3, V10, V3 // Left justify next 2 bytes
   555  
   556  index3loop:
   557  	LXVB16X  (R7)(R0), V2    // Load 16 bytes @R7
   558  	VSLDOI   $1, V2, V3, V4  // string[1:17]
   559  	VSLDOI   $2, V2, V3, V9  // string[2:18]
   560  	VCMPEQUH V1, V2, V5      // compare hw even indices
   561  	VCMPEQUH V1, V4, V6      // compare hw odd indices
   562  	VCMPEQUB V8, V9, V10     // compare 3rd to last byte
   563  	VSEL     V6, V5, V31, V7 // Find 1st matching byte using mask
   564  	VAND     V7, V10, V7     // AND matched bytes with matched 3rd byte
   565  	VCLZD    V7, V18         // Find first nonzero indexes
   566  	MFVSRD   V18, R25        // Move 1st doubleword
   567  	CMP      R25, $64        // If < 64 found
   568  	BLT      foundR25        // Return matching index
   569  
   570  	MFVSRLD  V18, R25     // Move 2nd doubleword
   571  	CMP      R25, $64     // If < 64 found
   572  	ADD      $64, R25     // Update byte index
   573  	BLT      foundR25     // Return matching index
   574  	ADD      $16, R7      // R7+=16 string ptr
   575  	ADD      $19, R7, R9  // Number of string bytes for loop
   576  	CMP      R9, LASTBYTE // Compare against last byte of string
   577  	BLT      index3loop2  // If within, continue this loop
   578  	CMP      R7, LASTSTR  // Compare against last start byte
   579  	BLT      index2to16   // Process remainder
   580  	VSPLTISB $0, V3       // Special case for last 16 bytes
   581  	BR       index3loop   // Continue this loop
   582  
   583  	// Loop to process 4 byte separator
   584  	// string[0:16] in V2
   585  	// string[3:16] in V3
   586  	// sep[0:4] splatted in V1
   587  	// Set up vectors with strings at offsets
   588  	// 0, 1, 2, 3 and compare against the 4 byte
   589  	// separator also splatted. Use VSEL with the
   590  	// compare results to find the first byte where
   591  	// a separator match is found.
   592  index4plus:
   593  	CMP  R6, $4       // Check if 4 byte separator
   594  	BNE  index5plus   // If not next higher
   595  	ADD  $20, R7, R9  // Check string size to load
   596  	CMP  R9, LASTBYTE // Verify string length
   597  	BGE  index2to16   // If not large enough, process remaining
   598  
   599  	// Set up masks for use with VSEL
   600  	MOVD    $0xff, R21 // Set up mask 0xff000000ff000000...
   601  	SLD     $24, R21
   602  	MTVSRWS R21, V29
   603  
   604  	VSLDOI  $2, V29, V29, V30 // Mask 0x0000ff000000ff00...
   605  	MOVD    $0xffff, R21
   606  	SLD     $16, R21
   607  	MTVSRWS R21, V31
   608  
   609  	VSPLTW $0, V0, V1 // Splat 1st word of separator
   610  
   611  index4loop:
   612  	LXVB16X (R7)(R0), V2  // Load 16 bytes @R7 into V2
   613  
   614  next4:
   615  	VSPLTISB $0, V10            // Clear
   616  	MOVD     $3, R9             // Number of bytes beyond 16
   617  	LXVB16X  (R7)(R9), V3       // Load 16 bytes @R7 into V3
   618  	VSLDOI   $13, V3, V10, V3   // Shift left last 3 bytes
   619  	VSLDOI   $1, V2, V3, V4     // V4=(V2:V3)<<1
   620  	VSLDOI   $2, V2, V3, V9     // V9=(V2:V3)<<2
   621  	VSLDOI   $3, V2, V3, V10    // V10=(V2:v3)<<3
   622  	VCMPEQUW V1, V2, V5         // compare index 0, 4, ... with sep
   623  	VCMPEQUW V1, V4, V6         // compare index 1, 5, ... with sep
   624  	VCMPEQUW V1, V9, V11        // compare index 2, 6, ... with sep
   625  	VCMPEQUW V1, V10, V12       // compare index 3, 7, ... with sep
   626  	VSEL     V6, V5, V29, V13   // merge index 0, 1, 4, 5, using mask
   627  	VSEL     V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask
   628  	VSEL     V14, V13, V31, V7  // final merge
   629  	VCLZD    V7, V18            // Find first index for each half
   630  	MFVSRD   V18, R25           // Isolate value
   631  	CMP      R25, $64           // If < 64, found
   632  	BLT      foundR25           // Return found index
   633  
   634  	MFVSRLD V18, R25     // Isolate other value
   635  	CMP     R25, $64     // If < 64, found
   636  	ADD     $64, R25     // Update index for high doubleword
   637  	BLT     foundR25     // Return found index
   638  	ADD     $16, R7      // R7+=16 for next string
   639  	ADD     $20, R7, R9  // R+20 for all bytes to load
   640  	CMP     R9, LASTBYTE // Past end? Maybe check for extra?
   641  	BLT     index4loop   // If not, continue loop
   642  	CMP     R7, LASTSTR  // Check remainder
   643  	BLE     index2to16   // Process remainder
   644  	BR      notfound     // Not found
   645  
   646  index5plus:
   647  	CMP R6, $16     // Check for sep > 16
   648  	BGT index17plus // Handle large sep
   649  
   650  	// Assumption is that the separator is smaller than the string at this point
   651  index2to16:
   652  	CMP R7, LASTSTR // Compare last start byte
   653  	BGT notfound    // last takes len(sep) into account
   654  
   655  	ADD $19, R7, R9    // To check 4 indices per iteration, need at least 16+3 bytes
   656  	CMP R9, LASTBYTE
   657  	// At least 16 bytes of string left
   658  	// Mask the number of bytes in sep
   659  	VSPLTISB $0, V10            // Clear
   660  	BGT index2to16tail
   661  
   662  	MOVD     $3, R17            // Number of bytes beyond 16
   663  	PCALIGN  $32
   664  index2to16loop:
   665  	LXVB16X  (R7)(R0), V1       // Load next 16 bytes of string into V1 from R7
   666  	LXVB16X  (R7)(R17), V5      // Load next 16 bytes of string into V5 from R7+3
   667  
   668  	VSLDOI   $13, V5, V10, V2  // Shift left last 3 bytes
   669  	VSLDOI  $1, V1, V2, V3     // V3=(V1:V2)<<1
   670  	VSLDOI  $2, V1, V2, V4     // V4=(V1:V2)<<2
   671  	VAND    V1, SEPMASK, V8    // Mask out sep size 0th index
   672  	VAND    V3, SEPMASK, V9    // Mask out sep size 1st index
   673  	VAND    V4, SEPMASK, V11   // Mask out sep size 2nd index
   674  	VAND    V5, SEPMASK, V12   // Mask out sep size 3rd index
   675  	VCMPEQUBCC      V0, V8, V8 // compare masked string
   676  	BLT     CR6, found         // All equal while comparing 0th index
   677  	VCMPEQUBCC      V0, V9, V9 // compare masked string
   678  	BLT     CR6, found2        // All equal while comparing 1st index
   679  	VCMPEQUBCC      V0, V11, V11    // compare masked string
   680  	BLT     CR6, found3        // All equal while comparing 2nd index
   681  	VCMPEQUBCC      V0, V12, V12    // compare masked string
   682  	BLT     CR6, found4        // All equal while comparing 3rd index
   683  
   684  	ADD        $4, R7          // Update ptr to next 4 bytes
   685  	CMP        R7, LASTSTR     // Still less than last start byte
   686  	BGT        notfound        // Not found
   687  	ADD        $19, R7, R9     // Verify remaining bytes
   688  	CMP        R9, LASTBYTE    // length of string at least 19
   689  	BLE        index2to16loop  // Try again, else do post processing and jump to index2to16next
   690  
   691  	// <19 bytes left, post process the remaining string
        	// If the remaining bytes fit inside one 16-byte-aligned container, the
        	// `short` path is used. Otherwise 16 bytes are loaded at R7 and, when 17
        	// or 18 bytes remain, the extra 1 or 2 trailing bytes are fetched into
        	// V25 so they can be shifted into V1 before joining index2to16next.
   692  index2to16tail:
   693  	ADD     R3, R4, R9         // R9 = R3 + R4: end of string
   694  	SUB     R7, R9, R9         // R9 = end - R7: number of bytes left
   695  	ANDCC   $15, R7, R10       // R10 = R7 & 15: offset of R7 within its 16-byte container
   696  	ADD     R10, R9, R11       // R11 = offset + len
   697  	CMP     R11, $16           // offset + len > 16?
   698  	BLE     short              // No: remaining bytes do not cross a 16-byte boundary
   699  	LXVB16X (R7)(R0), V1       // Load 16 bytes @R7 into V1
   700  	CMP     R9, $16            // More than 16 bytes left (remainder of the unrolled loop)?
   701  	BLE     index2to16next     // No: V1 already holds everything, continue in index2to16next
   702  	SUB     R16, R9, R10       // R10 = R9 - R16; R9 should be 17 or 18 here, so R10 is 1 or 2 (R16 presumably holds 16; it is set outside this view)
   703  	LXVB16X (R7)(R10), V9      // V9 = 16 bytes at R7+R10 (the final 16 bytes of the string when R10 = R9-16)
   704  	CMP     R10, $1            // string length is 17, compare 1 more byte
   705  	BNE     extra2             // string length is 18, compare 2 more bytes
   706  	VSLDOI  $15, V9, V10, V25  // V25 = last byte of V9 (the 17th byte) followed by bytes of V10 (zeroed earlier)
   707  	VAND       V1, SEPMASK, V2 // Just compare size of sep
   708  	VCMPEQUBCC V0, V2, V3      // Compare sep and partial string
   709  	BLT        CR6, found      // Found
   710  	ADD        $1, R7          // Not found, try next partial string
   711  	CMP        R7, LASTSTR     // Check for end of string
   712  	BGT        notfound        // If at end, then not found
   713  	VSLDOI     $1, V1, V25, V1 // V1 = (V1:V25)<<1: drop the first byte, append the 17th byte
   714  	BR         index2to16next  // go to remainder loop
   715  extra2:
   716  	VSLDOI  $14, V9, V10, V25  // V25 = last 2 bytes of V9 (17th and 18th bytes) followed by bytes of V10
   717  	VAND       V1, SEPMASK, V2 // Just compare size of sep
   718  	VCMPEQUBCC V0, V2, V3      // Compare sep and partial string
   719  	BLT        CR6, found      // Found
   720  	ADD        $1, R7          // Not found, try next partial string
   721  	CMP        R7, LASTSTR     // Check for end of string
   722  	BGT        notfound        // If at end, then not found
   723  	VOR        V1, V1, V4      // V4 = copy of the unshifted 16 bytes, needed for the 2-byte shift below
   724  	VSLDOI     $1, V1, V25, V1 // V1 = (V1:V25)<<1: shift left by 1 byte, bringing in the 17th byte
   725  	VAND       V1, SEPMASK, V2 // Just compare size of sep
   726  	VCMPEQUBCC V0, V2, V3      // Compare sep and partial string
   727  	BLT        CR6, found      // Found
   728  	ADD        $1, R7          // Not found, try next partial string
   729  	CMP        R7, LASTSTR     // Check for end of string
   730  	BGT        notfound        // If at end, then not found
   731  	VSLDOI     $2, V4, V25, V1 // V1 = (V4:V25)<<2: saved string shifted left by 2 bytes, bringing in the 17th and 18th bytes
   732  	BR         index2to16next  // Check the remaining partial string in index2to16next
   733  
   734  short:
        	// The remaining bytes do not cross a 16-byte boundary: load the aligned
        	// 16-byte container that holds R7 and shift the string start to byte 0,
        	// instead of loading 16 bytes starting at R7.
   735  	RLDICR   $0, R7, $59, R9   // R9 = R7 with the low 4 bits cleared (start of the 16 byte container)
   736  	LXVB16X  (R9)(R0), V1      // Load 16 bytes @R9 into V1
   737  	SLD      $3, R10           // R10 *= 8: the 16-byte offset from index2to16tail, converted to a bit count
   738  	MTVSRD   R10, V8           // Move the shift count into V8
   739  	VSLDOI   $8, V8, V8, V8    // Rotate V8 by 8 bytes so the count sits in the other doubleword
   740  	VSLO     V1, V8, V1        // Shift by start byte
   741  	PCALIGN  $32
        	// Byte-at-a-time search on the 16 bytes held in V1: compare, advance R7
        	// by one, shift V1 left by one byte (filling from V10), and repeat until
        	// a match is found or R7 passes LASTSTR.
   742  index2to16next:
   743  	VAND       V1, SEPMASK, V2 // Just compare size of sep
   744  	VCMPEQUBCC V0, V2, V3      // Compare sep and partial string
   745  	BLT        CR6, found      // Found
   746  	ADD        $1, R7          // Not found, try next partial string
   747  	CMP        R7, LASTSTR     // Check for end of string
   748  	BGT        notfound        // If at end, then not found
   749  	VSLDOI     $1, V1, V10, V1 // V1 = (V1:V10)<<1: shift string left by 1 byte
   750  	BR         index2to16next  // Check the next partial string
   751  
   752  index17plus:
        	// Separators longer than 16 bytes, up to 32. Two 16-byte windows are
        	// compared per candidate position: V0 against the string at R7, and
        	// V1 (sep bytes at R5+R9) against the string at R7+R9, where
        	// R9 = len(sep) - 16. A match requires both windows to be equal.
   753  	CMP      R6, $32       // len(sep) > 32?
   754  	BGT      index33plus   // Yes: not implemented
   755  	SUB      $16, R6, R9   // R9 = len(sep) - 16: bytes of sep beyond the first 16
   756  	SLD      $56, R9, R10  // Shift to use in VSLO
   757  	MTVSRD   R10, V9       // Set up for VSLO
        	// NOTE(review): the `short` path rotates its count with VSLDOI $8 after
        	// MTVSRD before using VSLO; this path does not. Confirm the shift amount
        	// VSLO actually applies here.
   758  	LXVB16X  (R5)(R9), V1  // Load 16 bytes @R5+R9 into V1
   759  	VSLO     V1, V9, V1    // Shift left
   760  	VSPLTISB $0xff, V7     // Splat 1s: all-equal pattern for the final compare
   761  	VSPLTISB $0, V27       // Splat 0 (V27 is not referenced on the rest of this path)
   762  
   763  index17to32loop:
   764  	LXVB16X (R7)(R0), V2  // Load 16 bytes @R7 into V2
   765  
   766  next17:
   767  	LXVB16X    (R7)(R9), V3    // Load 16 bytes @R7+R9 into V3
   768  	VSLO       V3, V9, V3      // Shift left, same amount as applied to V1
   769  	VCMPEQUB   V0, V2, V4      // Compare first 16 bytes
   770  	VCMPEQUB   V1, V3, V5      // Compare extra over 16 bytes
   771  	VAND       V4, V5, V6      // V6 = bytes that are equal in both compares
   772  	VCMPEQUBCC V6, V7, V8      // All equal? (V6 is all 1s)
   773  	BLT        CR6, found      // Yes
   774  	ADD        $1, R7          // On to next byte
   775  	CMP        R7, LASTSTR     // Check if last start byte
   776  	BGT        notfound        // If too high, not found
   777  	BR         index17to32loop // Continue
   778  
   779  notfound:              // Common exit when the search ends without a match
   780  	MOVD $-1, R3   // Return -1 if not found
   781  	RET
   782  
   783  index33plus:           // len(sep) > 32: not handled by this routine
   784  	MOVD $0, (R0) // Case not implemented: store through R0 to force a fault (R0 presumably holds 0)
   785  	RET           // Crash before return
   786  
   787  foundR25:              // Exit with a match whose position past R7 is given in R25 as a bit count (no branch to this label is visible in this part of the file)
   788  	SRD  $3, R25   // Convert from bits to bytes
   789  	ADD  R25, R7   // R7 += R25: address of the match
   790  	SUB  R3, R7    // R7 -= R3: offset of the match from the start of the string
   791  	MOVD R7, R3    // Return byte where found
   792  	RET
   793  found4:
        	// Fall-through chain: entering at found4/found3/found2/found adds
        	// 3/2/1/0 to R7 before the offset from R3 is returned in R3.
   794  	ADD $1, R7     // found from unrolled loop at index 3
   795  found3:
   796  	ADD $1, R7     // found from unrolled loop at index 2
   797  found2:
   798  	ADD $1, R7     // found from unrolled loop at index 1
   799  found:                 // found at index 0
   800  	SUB  R3, R7    // R7 -= R3: byte offset of the match from the start of the string
   801  	MOVD R7, R3    // Return byte where found
   802  	RET