github.com/hxx258456/ccgo@v0.0.5-0.20230213014102-48b35f46f66f/internal/bytealg/index_ppc64x.s (about)

     1  // Copyright 2021 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This is an implementation based on the s390x
     6  // implementation.
     7  
     8  // Find a separator with 2 <= len <= 32 within a string.
     9  // Separators with lengths of 2, 3 or 4 are handled
    10  // specially.
    11  
    12  // This works on power8 and above. The loads and
    13  // compares are done in big endian order
    14  // since that allows the use of VCLZD, and allows
    15  // the same implementation to work on big and little
    16  // endian platforms with minimal conditional changes.
    17  
    18  // NOTE: There is a power9 implementation that
    19  // improves performance by 10-15% on little
    20  // endian for some of the benchmarks, but
    21  // work is still needed for a big endian
    22  // implementation on power9.
    23  
    24  //go:build ppc64 || ppc64le
    25  // +build ppc64 ppc64le
    26  
    27  #include "go_asm.h"
    28  #include "textflag.h"
    29  
    30  // Permute control that is loaded into SWAP and used by VLOADSWAP
    31  // to put LXVD2X loads in the correct byte order on POWER8.
    32  
    33  #ifdef GOARCH_ppc64
    34  DATA byteswap<>+0(SB)/8, $0x0001020304050607
    35  DATA byteswap<>+8(SB)/8, $0x08090a0b0c0d0e0f
    36  #else
    37  DATA byteswap<>+0(SB)/8, $0x0706050403020100
    38  DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
    39  #endif
    40  
    41  // Load 16 bytes at base+index into vsreg, then permute vreg with SWAP to get big endian order.
    42  // vreg/vsreg are passed as pairs such as V0/V0_. Address alignment does not need checking.
    43  #define VLOADSWAP(base, index, vreg, vsreg) \
    44  	LXVD2X (base)(index), vsreg;  \
    45  	VPERM  vreg, vreg, SWAP, vreg
    46  
    47  GLOBL byteswap<>+0(SB), RODATA, $16
    48  
    49  TEXT ·Index(SB), NOSPLIT|NOFRAME, $0-56
    50  	MOVD $ret+48(FP), R14  // R14 = address of the result slot
    51  	MOVD b_len+32(FP), R6  // R6 = len(sep)
    52  	MOVD b_base+24(FP), R5 // R5 = &sep[0]
    53  	MOVD a_len+8(FP), R4   // R4 = len(s)
    54  	MOVD a_base+0(FP), R3  // R3 = &s[0]
    55  
    56  #ifdef GOARCH_ppc64le
    57  	// Little endian only: take the POWER9 body when the CPU flag is set.
    58  	MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7
    59  	CMP   R7, $1
    60  	BNE   usepower8
    61  	BR    indexbodyp9<>(SB)
    62  #endif
    63  usepower8:
    64  	BR indexbody<>(SB)
    65  
    66  TEXT ·IndexString(SB), NOSPLIT|NOFRAME, $0-40
    67  	MOVD a_base+0(FP), R3  // R3 = string
    68  	MOVD a_len+8(FP), R4   // R4 = length
    69  	MOVD b_base+16(FP), R5 // R5 = separator pointer
    70  	MOVD b_len+24(FP), R6  // R6 = separator length
    71  	MOVD $ret+32(FP), R14  // R14 = &ret
    72  
    73  #ifdef GOARCH_ppc64le
    74  	MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7 // R7 = HasPOWER9 flag
    75  	CMP   R7, $1
    76  	BNE   power8            // No POWER9: fall through to the generic body
    77  	BR    indexbodyp9<>(SB) // POWER9: use the POWER9 body, matching ·Index
    78  
    79  #endif
    80  power8:
    81  	BR indexbody<>(SB)
    82  
    83  	// Register usage in indexbody<> (R3-R6 and R14 are set by the callers above).
    84  	// s: string we are searching, sep: string to search for
    85  	// R3=&s[0], R4=len(s)
    86  	// R5=&sep[0], R6=len(sep)
    87  	// R14=&ret (where the index of sep, or -1, is stored)
    88  	// R7=working addr of string (current candidate start)
    89  	// R16=index value 16
    90  	// R17=index value 17 (loaded but not referenced afterwards)
    91  	// R18=index value 18 (loaded but not referenced afterwards)
    92  	// R19=index value 1
    93  	// R26=LASTBYTE, &s[len(s)] (one past the last byte of string)
    94  	// R27=LASTSTR, &s[len(s)-len(sep)], last start byte to compare with sep
    95  	// R8, R9, R10, R11, R21, R25 scratch
    96  	// V0=first 16 bytes of sep, left justified, zero filled when len(sep) < 16
    97  	// CR4=result of comparing len(sep) with 16
    98  
    99  #define SEPMASK V17
   100  #define LASTBYTE R26
   101  #define LASTSTR R27
   102  #define ONES V20
   103  #define SWAP V21
   104  #define V0_ VS32
   105  #define V1_ VS33
   106  #define V2_ VS34
   107  #define V3_ VS35
   108  #define V4_ VS36
   109  #define V5_ VS37
   110  #define V6_ VS38
   111  #define V7_ VS39
   112  #define V8_ VS40
   113  #define V9_ VS41
   114  #define SWAP_ VS53
   115  TEXT indexbody<>(SB), NOSPLIT|NOFRAME, $0 // Vector search using VLOADSWAP loads; stores index or -1 through R14
   116  	CMP      R6, R4                 // Compare lengths
   117  	BGT      notfound               // If sep len is > string, notfound
   118  	ADD      R4, R3, LASTBYTE       // LASTBYTE=&s[len(s)], one past the last byte
   119  	SUB      R6, LASTBYTE, LASTSTR  // LAST=&s[len(s)-len(sep)] (last valid start index)
   120  	CMP      R6, $0                 // Check sep len
   121  	BEQ      notfound               // sep len 0 -- not found
   122  	MOVD     R3, R7                 // Copy of string addr
   123  	MOVD     $16, R16               // Index value 16
   124  	MOVD     $17, R17               // Index value 17
   125  	MOVD     $18, R18               // Index value 18
   126  	MOVD     $1, R19                // Index value 1
   127  	MOVD     $byteswap<>+00(SB), R8 // Address of the permute control
   128  	VSPLTISB $0xFF, ONES            // splat all 1s
   129  	LXVD2X   (R8)(R0), SWAP_        // Set up swap string
   130  
   131  	CMP    R6, $16, CR4        // CR4 for len(sep) >= 16
   132  	VOR    ONES, ONES, SEPMASK // Set up full SEPMASK
   133  	BGE    CR4, loadge16       // Load for len(sep) >= 16
   134  	SUB    R6, R16, R9         // 16-len of sep
   135  	SLD    $3, R9              // Set up for VSLO
   136  	MTVSRD R9, V9_             // Set up for VSLO
   137  	VSLDOI $8, V9, V9, V9      // Set up for VSLO
   138  	VSLO   ONES, V9, SEPMASK   // Mask for separator len(sep) < 16
   139  
   140  loadge16:
   141  	ANDCC $15, R5, R9 // Find byte offset of sep
   142  	ADD   R9, R6, R10 // Add sep len
   143  	CMP   R10, $16    // Check if sep len+offset >= 16
   144  	BGE   sepcross16  // Sep reaches or crosses the 16 byte boundary
   145  
   146  	RLDICR $0, R5, $59, R8 // Adjust addr to 16 byte container
   147  	VLOADSWAP(R8, R0, V0, V0_)// Load 16 bytes @R8 into V0
   148  	SLD    $3, R9          // Set up shift count for VSLO
   149  	MTVSRD R9, V8_         // Set up shift count for VSLO
   150  	VSLDOI $8, V8, V8, V8
   151  	VSLO   V0, V8, V0      // Shift by start byte
   152  
   153  	VAND V0, SEPMASK, V0 // Mask separator (< 16)
   154  	BR   index2plus
   155  
   156  sepcross16:
   157  	VLOADSWAP(R5, R0, V0, V0_) // Load 16 bytes @R5 into V0
   158  
   159  	VAND V0, SEPMASK, V0 // mask out separator
   160  	BLE  CR4, index2to16 // len(sep) <= 16
   161  	BR   index17plus     // Handle sep > 16
   162  
   163  index2plus:
   164  	CMP      R6, $2       // Check length of sep
   165  	BNE      index3plus   // If not 2, check for 3
   166  	ADD      $16, R7, R9  // Check if next 16 bytes past last
   167  	CMP      R9, LASTBYTE // compare with last
   168  	BGE      index2to16   // 2 <= len(string) <= 16
   169  	MOVD     $0xff00, R21 // Mask for later
   170  	MTVSRD   R21, V25     // Move to Vreg
   171  	VSPLTH   $3, V25, V31 // Splat mask
   172  	VSPLTH   $0, V0, V1   // Splat 1st 2 bytes of sep
   173  	VSPLTISB $0, V10      // Clear V10
   174  
   175  	// First case: 2 byte separator
   176  	// V1: 2 byte separator splatted
   177  	// V2: 16 bytes at addr
   178  	// V3: 16 bytes at addr+1
   179  	// Compare 2 byte separator at start
   180  	// and at start+1. Use VSEL to combine
   181  	// those results to find the first
   182  	// matching start byte, returning
   183  	// that value when found. Loop as
   184  	// long as len(string) > 16
   185  index2loop2:
   186  	VLOADSWAP(R7, R19, V3, V3_) // Load 16 bytes @R7+1 into V3
   187  
   188  index2loop:
   189  	VLOADSWAP(R7, R0, V2, V2_) // Load 16 bytes @R7 into V2
   190  	VCMPEQUH V1, V2, V5        // Search for sep
   191  	VCMPEQUH V1, V3, V6        // Search for sep offset by 1
   192  	VSEL     V6, V5, V31, V7   // merge even and odd indices
   193  	VCLZD    V7, V18           // find index of first match
   194  	MFVSRD   V18, R25          // get first value
   195  	CMP      R25, $64          // Found if < 64
   196  	BLT      foundR25          // Return byte index where found
   197  	VSLDOI   $8, V18, V18, V18 // Adjust 2nd value
   198  	MFVSRD   V18, R25          // get second value
   199  	CMP      R25, $64          // Found if < 64
   200  	ADD      $64, R25          // Update byte offset
   201  	BLT      foundR25          // Return value
   202  	ADD      $16, R7           // R7+=16 Update string pointer
   203  	ADD      $17, R7, R9       // R9=R7+17 since loop unrolled
   204  	CMP      R9, LASTBYTE      // Compare addr+17 against last byte
   205  	BLT      index2loop2       // If < last, continue loop
   206  	CMP      R7, LASTBYTE      // Compare addr+16 against last byte
   207  	BLT      index2to16        // If < 16 handle specially
   208  	VLOADSWAP(R7, R0, V3, V3_) // Load 16 bytes @R7 into V3
   209  	VSLDOI   $1, V3, V10, V3   // Shift left by 1 byte
   210  	BR       index2loop
   211  
   212  index3plus:
   213  	CMP    R6, $3       // Check if sep == 3
   214  	BNE    index4plus   // If not check larger
   215  	ADD    $19, R7, R9  // Find bytes for use in this loop
   216  	CMP    R9, LASTBYTE // Compare against last byte
   217  	BGE    index2to16   // Remaining string 2<=len<=16
   218  	MOVD   $0xff00, R21 // Set up mask for upcoming loop
   219  	MTVSRD R21, V25     // Move mask to Vreg
   220  	VSPLTH $3, V25, V31 // Splat mask
   221  	VSPLTH $0, V0, V1   // Splat 1st two bytes of sep
   222  	VSPLTB $2, V0, V8   // Splat 3rd byte of sep
   223  
   224  	// Loop to process 3 byte separator.
   225  	// string[0:16] is in V2
   226  	// string[16:18] is left justified in V3 (zero on the final pass)
   227  	// sep[0:2] splatted in V1
   228  	// sep[2] splatted in V8
   229  	// Load vectors at string, string+1
   230  	// and string+2. Compare string, string+1
   231  	// against first 2 bytes of separator
   232  	// splatted, and string+2 against 3rd
   233  	// byte splatted. Merge the results with
   234  	// VSEL to find the first byte of a match.
   235  
   236  	// Special handling for last 16 bytes if the
   237  	// string fits in 16 byte multiple.
   238  index3loop2:
   239  	MOVD     $2, R21          // Set up index for 2
   240  	VSPLTISB $0, V10          // Clear V10
   241  	VLOADSWAP(R7, R21, V3, V3_)// Load 16 bytes @R7+2 into V3
   242  	VSLDOI   $14, V3, V10, V3 // Left justify next 2 bytes
   243  
   244  index3loop:
   245  	VLOADSWAP(R7, R0, V2, V2_) // Load with correct order
   246  	VSLDOI   $1, V2, V3, V4    // string[1:17]
   247  	VSLDOI   $2, V2, V3, V9    // string[2:18]
   248  	VCMPEQUH V1, V2, V5        // compare hw even indices
   249  	VCMPEQUH V1, V4, V6        // compare hw odd indices
   250  	VCMPEQUB V8, V9, V10       // compare 3rd to last byte
   251  	VSEL     V6, V5, V31, V7   // Find 1st matching byte using mask
   252  	VAND     V7, V10, V7       // AND matched bytes with matched 3rd byte
   253  	VCLZD    V7, V18           // Find first nonzero indexes
   254  	MFVSRD   V18, R25          // Move 1st doubleword
   255  	CMP      R25, $64          // If < 64 found
   256  	BLT      foundR25          // Return matching index
   257  	VSLDOI   $8, V18, V18, V18 // Move value
   258  	MFVSRD   V18, R25          // Move 2nd doubleword
   259  	CMP      R25, $64          // If < 64 found
   260  	ADD      $64, R25          // Update byte index
   261  	BLT      foundR25          // Return matching index
   262  	ADD      $16, R7           // R7+=16 string ptr
   263  	ADD      $19, R7, R9       // Number of string bytes for loop
   264  	CMP      R9, LASTBYTE      // Compare against last byte of string
   265  	BLT      index3loop2       // If within, continue this loop
   266  	CMP      R7, LASTSTR       // Compare against last start byte
   267  	BLT      index2to16        // Process remainder
   268  	VSPLTISB $0, V3            // Special case for last 16 bytes
   269  	BR       index3loop        // Continue this loop
   270  
   271  	// Loop to process 4 byte separator
   272  	// string[0:16] in V2
   273  	// string[16:19] left justified in V3
   274  	// sep[0:4] splatted in V1
   275  	// Set up vectors with strings at offsets
   276  	// 0, 1, 2, 3 and compare against the 4 byte
   277  	// separator also splatted. Use VSEL with the
   278  	// compare results to find the first byte where
   279  	// a separator match is found.
   280  index4plus:
   281  	CMP  R6, $4       // Check if 4 byte separator
   282  	BNE  index5plus   // If not next higher
   283  	ADD  $20, R7, R9  // Check string size to load
   284  	CMP  R9, LASTBYTE // Verify string length
   285  	BGE  index2to16   // If not large enough, process remaining
   286  	MOVD $2, R15      // Set up index (R15 is not referenced afterwards)
   287  
   288  	// Set up masks for use with VSEL
   289  	MOVD   $0xff, R21        // Set up mask 0xff000000ff000000...
   290  	SLD    $24, R21
   291  	MTVSRD R21, V10
   292  	VSPLTW $1, V10, V29
   293  	VSLDOI $2, V29, V29, V30 // Mask 0x0000ff000000ff00...
   294  	MOVD   $0xffff, R21
   295  	SLD    $16, R21
   296  	MTVSRD R21, V10
   297  	VSPLTW $1, V10, V31      // Mask 0xffff0000ffff0000...
   298  	VSPLTW $0, V0, V1        // Splat 1st word of separator
   299  
   300  index4loop:
   301  	VLOADSWAP(R7, R0, V2, V2_) // Load 16 bytes @R7 into V2
   302  
   303  next4:
   304  	VSPLTISB $0, V10            // Clear
   305  	MOVD     $3, R9             // Number of bytes beyond 16
   306  	VLOADSWAP(R7, R9, V3, V3_)  // Load 16 bytes @R7+3 into V3
   307  	VSLDOI   $13, V3, V10, V3   // Shift left last 3 bytes
   308  	VSLDOI   $1, V2, V3, V4     // V4=(V2:V3)<<1
   309  	VSLDOI   $2, V2, V3, V9     // V9=(V2:V3)<<2
   310  	VSLDOI   $3, V2, V3, V10    // V10=(V2:V3)<<3
   311  	VCMPEQUW V1, V2, V5         // compare index 0, 4, ... with sep
   312  	VCMPEQUW V1, V4, V6         // compare index 1, 5, ... with sep
   313  	VCMPEQUW V1, V9, V11        // compare index 2, 6, ... with sep
   314  	VCMPEQUW V1, V10, V12       // compare index 3, 7, ... with sep
   315  	VSEL     V6, V5, V29, V13   // merge index 0, 1, 4, 5, using mask
   316  	VSEL     V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask
   317  	VSEL     V14, V13, V31, V7  // final merge
   318  	VCLZD    V7, V18            // Find first index for each half
   319  	MFVSRD   V18, R25           // Isolate value
   320  	CMP      R25, $64           // If < 64, found
   321  	BLT      foundR25           // Return found index
   322  	VSLDOI   $8, V18, V18, V18  // Move for MFVSRD
   323  	MFVSRD   V18, R25           // Isolate other value
   324  	CMP      R25, $64           // If < 64, found
   325  	ADD      $64, R25           // Update index for high doubleword
   326  	BLT      foundR25           // Return found index
   327  	ADD      $16, R7            // R7+=16 for next string
   328  	ADD      $20, R7, R9        // R7+20 for all bytes to load
   329  	CMP      R9, LASTBYTE       // Past end? Maybe check for extra?
   330  	BLT      index4loop         // If not, continue loop
   331  	CMP      R7, LASTSTR        // Check remainder
   332  	BLE      index2to16         // Process remainder
   333  	BR       notfound           // Not found
   334  
   335  index5plus:
   336  	CMP R6, $16     // Check for sep > 16
   337  	BGT index17plus // Handle large sep
   338  
   339  	// Assumption is that the separator is smaller than the string at this point
   340  index2to16:
   341  	CMP R7, LASTSTR // Compare last start byte
   342  	BGT notfound    // last takes len(sep) into account
   343  
   344  	ADD $16, R7, R9    // Check for last byte of string
   345  	CMP R9, LASTBYTE
   346  	BGT index2to16tail
   347  
   348  	// At least 16 bytes of string left
   349  	// Mask the number of bytes in sep
   350  index2to16loop:
   351  	VLOADSWAP(R7, R0, V1, V1_) // Load 16 bytes @R7 into V1
   352  
   353  compare:
   354  	VAND       V1, SEPMASK, V2 // Mask out sep size
   355  	VCMPEQUBCC V0, V2, V3      // Compare masked string
   356  	BLT        CR6, found      // All equal
   357  	ADD        $1, R7          // Update ptr to next byte
   358  	CMP        R7, LASTSTR     // Still less than last start byte
   359  	BGT        notfound        // Not found
   360  	ADD        $16, R7, R9     // Verify remaining bytes
   361  	CMP        R9, LASTBYTE    // At least 16
   362  	BLT        index2to16loop  // Try again
   363  
   364  	// Less than 16 bytes remaining in string
   365  	// Separator >= 2
   366  index2to16tail:
   367  	ADD   R3, R4, R9     // End of string
   368  	SUB   R7, R9, R9     // Number of bytes left
   369  	ANDCC $15, R7, R10   // 16 byte offset
   370  	ADD   R10, R9, R11   // offset + len
   371  	CMP   R11, $16       // > 16?
   372  	BLE   short          // Does not cross 16 bytes
   373  	VLOADSWAP(R7, R0, V1, V1_)// Load 16 bytes @R7 into V1
   374  	BR    index2to16next // Continue on
   375  
   376  short:
   377  	RLDICR   $0, R7, $59, R9 // Adjust addr to 16 byte container
   378  	VLOADSWAP(R9, R0, V1, V1_)// Load 16 bytes @R9 into V1
   379  	SLD      $3, R10         // Set up shift
   380  	MTVSRD   R10, V8_        // Set up shift
   381  	VSLDOI   $8, V8, V8, V8
   382  	VSLO     V1, V8, V1      // Shift by start byte
   383  	VSPLTISB $0, V25         // Clear for later use
   384  
   385  index2to16next:
   386  	VAND       V1, SEPMASK, V2 // Just compare size of sep
   387  	VCMPEQUBCC V0, V2, V3      // Compare sep and partial string
   388  	BLT        CR6, found      // Found
   389  	ADD        $1, R7          // Not found, try next partial string
   390  	CMP        R7, LASTSTR     // Check for end of string
   391  	BGT        notfound        // If at end, then not found
   392  	VSLDOI     $1, V1, V25, V1 // Shift string left by 1 byte
   393  	BR         index2to16next  // Check the next partial string
   394  
   395  index17plus:
   396  	CMP      R6, $32      // Check if 17 <= len(sep) <= 32
   397  	BGT      index33plus
   398  	SUB      $16, R6, R9  // Extra > 16
   399  	SLD      $56, R9, R10 // Shift to use in VSLO
   400  	MTVSRD   R10, V9_     // Set up for VSLO
   401  	VLOADSWAP(R5, R9, V1, V1_)// Load 16 bytes @R5+R9 into V1
   402  	VSLO     V1, V9, V1   // Shift left
   403  	VSPLTISB $0xff, V7    // Splat 1s
   404  	VSPLTISB $0, V27      // Splat 0 (V27 is not referenced afterwards)
   405  
   406  index17to32loop:
   407  	VLOADSWAP(R7, R0, V2, V2_) // Load 16 bytes @R7 into V2
   408  
   409  next17:
   410  	VLOADSWAP(R7, R9, V3, V3_) // Load 16 bytes @R7+R9 into V3
   411  	VSLO       V3, V9, V3      // Shift left
   412  	VCMPEQUB   V0, V2, V4      // Compare first 16 bytes
   413  	VCMPEQUB   V1, V3, V5      // Compare extra over 16 bytes
   414  	VAND       V4, V5, V6      // Check if both equal
   415  	VCMPEQUBCC V6, V7, V8      // All equal?
   416  	BLT        CR6, found      // Yes
   417  	ADD        $1, R7          // On to next byte
   418  	CMP        R7, LASTSTR     // Check if last start byte
   419  	BGT        notfound        // If too high, not found
   420  	BR         index17to32loop // Continue
   421  
   422  notfound:
   423  	MOVD $-1, R8   // Return -1 if not found
   424  	MOVD R8, (R14)
   425  	RET
   426  
   427  index33plus:
   428  	MOVD $0, (R0) // Case not implemented (len(sep) > 32)
   429  	RET           // Crash before return
   430  
   431  foundR25:
   432  	SRD  $3, R25   // Convert from bits to bytes
   433  	ADD  R25, R7   // Add to current string address
   434  	SUB  R3, R7    // Subtract from start of string
   435  	MOVD R7, (R14) // Return byte where found
   436  	RET
   437  
   438  found:
   439  	SUB  R3, R7    // Return byte where found
   440  	MOVD R7, (R14)
   441  	RET
   442  
   443  TEXT indexbodyp9<>(SB), NOSPLIT|NOFRAME, $0 // Same search as indexbody<>, using LXVB16X/MFVSRLD/MTVSRWS
   444  	CMP      R6, R4                // Compare lengths
   445  	BGT      notfound              // If sep len is > string, notfound
   446  	ADD      R4, R3, LASTBYTE      // LASTBYTE=&s[len(s)], one past the last byte
   447  	SUB      R6, LASTBYTE, LASTSTR // LAST=&s[len(s)-len(sep)] (last valid start index)
   448  	CMP      R6, $0                // Check sep len
   449  	BEQ      notfound              // sep len 0 -- not found
   450  	MOVD     R3, R7                // Copy of string addr
   451  	MOVD     $16, R16              // Index value 16
   452  	MOVD     $17, R17              // Index value 17 (R17 is not referenced afterwards)
   453  	MOVD     $18, R18              // Index value 18 (R18 is not referenced afterwards)
   454  	MOVD     $1, R19               // Index value 1
   455  	VSPLTISB $0xFF, ONES           // splat all 1s
   456  
   457  	CMP    R6, $16, CR4        // CR4 for len(sep) >= 16
   458  	VOR    ONES, ONES, SEPMASK // Set up full SEPMASK
   459  	BGE    CR4, loadge16       // Load for len(sep) >= 16
   460  	SUB    R6, R16, R9         // 16-len of sep
   461  	SLD    $3, R9              // Set up for VSLO
   462  	MTVSRD R9, V9_             // Set up for VSLO
   463  	VSLDOI $8, V9, V9, V9      // Set up for VSLO
   464  	VSLO   ONES, V9, SEPMASK   // Mask for separator len(sep) < 16
   465  
   466  loadge16:
   467  	ANDCC $15, R5, R9 // Find byte offset of sep
   468  	ADD   R9, R6, R10 // Add sep len
   469  	CMP   R10, $16    // Check if sep len+offset >= 16
   470  	BGE   sepcross16  // Sep reaches or crosses the 16 byte boundary
   471  
   472  	RLDICR  $0, R5, $59, R8 // Adjust addr to 16 byte container
   473  	LXVB16X (R8)(R0), V0_   // Load 16 bytes @R8 into V0
   474  	SLD     $3, R9          // Set up shift count for VSLO
   475  	MTVSRD  R9, V8_         // Set up shift count for VSLO
   476  	VSLDOI  $8, V8, V8, V8
   477  	VSLO    V0, V8, V0      // Shift by start byte
   478  
   479  	VAND V0, SEPMASK, V0 // Mask separator (< 16)
   480  	BR   index2plus
   481  
   482  sepcross16:
   483  	LXVB16X (R5)(R0), V0_ // Load 16 bytes @R5 into V0
   484  
   485  	VAND V0, SEPMASK, V0 // mask out separator
   486  	BLE  CR4, index2to16 // len(sep) <= 16
   487  	BR   index17plus     // Handle sep > 16
   488  
   489  index2plus:
   490  	CMP      R6, $2       // Check length of sep
   491  	BNE      index3plus   // If not 2, check for 3
   492  	ADD      $16, R7, R9  // Check if next 16 bytes past last
   493  	CMP      R9, LASTBYTE // compare with last
   494  	BGE      index2to16   // 2 <= len(string) <= 16
   495  	MOVD     $0xff00, R21 // Mask for later
   496  	MTVSRD   R21, V25     // Move to Vreg
   497  	VSPLTH   $3, V25, V31 // Splat mask
   498  	VSPLTH   $0, V0, V1   // Splat 1st 2 bytes of sep
   499  	VSPLTISB $0, V10      // Clear V10
   500  
   501  	// First case: 2 byte separator
   502  	// V1: 2 byte separator splatted
   503  	// V2: 16 bytes at addr
   504  	// V3: 16 bytes at addr+1
   505  	// Compare 2 byte separator at start
   506  	// and at start+1. Use VSEL to combine
   507  	// those results to find the first
   508  	// matching start byte, returning
   509  	// that value when found. Loop as
   510  	// long as len(string) > 16
   511  index2loop2:
   512  	LXVB16X (R7)(R19), V3_ // Load 16 bytes @R7+1 into V3
   513  
   514  index2loop:
   515  	LXVB16X  (R7)(R0), V2_   // Load 16 bytes @R7 into V2
   516  	VCMPEQUH V1, V2, V5      // Search for sep
   517  	VCMPEQUH V1, V3, V6      // Search for sep offset by 1
   518  	VSEL     V6, V5, V31, V7 // merge even and odd indices
   519  	VCLZD    V7, V18         // find index of first match
   520  	MFVSRD   V18, R25        // get first value
   521  	CMP      R25, $64        // Found if < 64
   522  	BLT      foundR25        // Return byte index where found
   523  
   524  	MFVSRLD V18, R25        // get second value
   525  	CMP     R25, $64        // Found if < 64
   526  	ADD     $64, R25        // Update byte offset
   527  	BLT     foundR25        // Return value
   528  	ADD     $16, R7         // R7+=16 Update string pointer
   529  	ADD     $17, R7, R9     // R9=R7+17 since loop unrolled
   530  	CMP     R9, LASTBYTE    // Compare addr+17 against last byte
   531  	BLT     index2loop2     // If < last, continue loop
   532  	CMP     R7, LASTBYTE    // Compare addr+16 against last byte
   533  	BLT     index2to16      // If < 16 handle specially
   534  	LXVB16X (R7)(R0), V3_   // Load 16 bytes @R7 into V3
   535  	VSLDOI  $1, V3, V10, V3 // Shift left by 1 byte
   536  	BR      index2loop
   537  
   538  index3plus:
   539  	CMP    R6, $3       // Check if sep == 3
   540  	BNE    index4plus   // If not check larger
   541  	ADD    $19, R7, R9  // Find bytes for use in this loop
   542  	CMP    R9, LASTBYTE // Compare against last byte
   543  	BGE    index2to16   // Remaining string 2<=len<=16
   544  	MOVD   $0xff00, R21 // Set up mask for upcoming loop
   545  	MTVSRD R21, V25     // Move mask to Vreg
   546  	VSPLTH $3, V25, V31 // Splat mask
   547  	VSPLTH $0, V0, V1   // Splat 1st two bytes of sep
   548  	VSPLTB $2, V0, V8   // Splat 3rd byte of sep
   549  
   550  	// Loop to process 3 byte separator.
   551  	// string[0:16] is in V2
   552  	// string[16:18] is left justified in V3 (zero on the final pass)
   553  	// sep[0:2] splatted in V1
   554  	// sep[2] splatted in V8
   555  	// Load vectors at string, string+1
   556  	// and string+2. Compare string, string+1
   557  	// against first 2 bytes of separator
   558  	// splatted, and string+2 against 3rd
   559  	// byte splatted. Merge the results with
   560  	// VSEL to find the first byte of a match.
   561  
   562  	// Special handling for last 16 bytes if the
   563  	// string fits in 16 byte multiple.
   564  index3loop2:
   565  	MOVD     $2, R21          // Set up index for 2
   566  	VSPLTISB $0, V10          // Clear V10
   567  	LXVB16X  (R7)(R21), V3_   // Load 16 bytes @R7+2 into V3
   568  	VSLDOI   $14, V3, V10, V3 // Left justify next 2 bytes
   569  
   570  index3loop:
   571  	LXVB16X  (R7)(R0), V2_   // Load 16 bytes @R7
   572  	VSLDOI   $1, V2, V3, V4  // string[1:17]
   573  	VSLDOI   $2, V2, V3, V9  // string[2:18]
   574  	VCMPEQUH V1, V2, V5      // compare hw even indices
   575  	VCMPEQUH V1, V4, V6      // compare hw odd indices
   576  	VCMPEQUB V8, V9, V10     // compare 3rd to last byte
   577  	VSEL     V6, V5, V31, V7 // Find 1st matching byte using mask
   578  	VAND     V7, V10, V7     // AND matched bytes with matched 3rd byte
   579  	VCLZD    V7, V18         // Find first nonzero indexes
   580  	MFVSRD   V18, R25        // Move 1st doubleword
   581  	CMP      R25, $64        // If < 64 found
   582  	BLT      foundR25        // Return matching index
   583  
   584  	MFVSRLD  V18, R25     // Move 2nd doubleword
   585  	CMP      R25, $64     // If < 64 found
   586  	ADD      $64, R25     // Update byte index
   587  	BLT      foundR25     // Return matching index
   588  	ADD      $16, R7      // R7+=16 string ptr
   589  	ADD      $19, R7, R9  // Number of string bytes for loop
   590  	CMP      R9, LASTBYTE // Compare against last byte of string
   591  	BLT      index3loop2  // If within, continue this loop
   592  	CMP      R7, LASTSTR  // Compare against last start byte
   593  	BLT      index2to16   // Process remainder
   594  	VSPLTISB $0, V3       // Special case for last 16 bytes
   595  	BR       index3loop   // Continue this loop
   596  
   597  	// Loop to process 4 byte separator
   598  	// string[0:16] in V2
   599  	// string[16:19] left justified in V3
   600  	// sep[0:4] splatted in V1
   601  	// Set up vectors with strings at offsets
   602  	// 0, 1, 2, 3 and compare against the 4 byte
   603  	// separator also splatted. Use VSEL with the
   604  	// compare results to find the first byte where
   605  	// a separator match is found.
   606  index4plus:
   607  	CMP  R6, $4       // Check if 4 byte separator
   608  	BNE  index5plus   // If not next higher
   609  	ADD  $20, R7, R9  // Check string size to load
   610  	CMP  R9, LASTBYTE // Verify string length
   611  	BGE  index2to16   // If not large enough, process remaining
   612  	MOVD $2, R15      // Set up index (R15 is not referenced afterwards)
   613  
   614  	// Set up masks for use with VSEL
   615  	MOVD    $0xff, R21 // Set up mask 0xff000000ff000000...
   616  	SLD     $24, R21
   617  	MTVSRWS R21, V29
   618  
   619  	VSLDOI  $2, V29, V29, V30 // Mask 0x0000ff000000ff00...
   620  	MOVD    $0xffff, R21
   621  	SLD     $16, R21
   622  	MTVSRWS R21, V31
   623  
   624  	VSPLTW $0, V0, V1 // Splat 1st word of separator
   625  
   626  index4loop:
   627  	LXVB16X (R7)(R0), V2_ // Load 16 bytes @R7 into V2
   628  
   629  next4:
   630  	VSPLTISB $0, V10            // Clear
   631  	MOVD     $3, R9             // Number of bytes beyond 16
   632  	LXVB16X  (R7)(R9), V3_      // Load 16 bytes @R7+3 into V3
   633  	VSLDOI   $13, V3, V10, V3   // Shift left last 3 bytes
   634  	VSLDOI   $1, V2, V3, V4     // V4=(V2:V3)<<1
   635  	VSLDOI   $2, V2, V3, V9     // V9=(V2:V3)<<2
   636  	VSLDOI   $3, V2, V3, V10    // V10=(V2:V3)<<3
   637  	VCMPEQUW V1, V2, V5         // compare index 0, 4, ... with sep
   638  	VCMPEQUW V1, V4, V6         // compare index 1, 5, ... with sep
   639  	VCMPEQUW V1, V9, V11        // compare index 2, 6, ... with sep
   640  	VCMPEQUW V1, V10, V12       // compare index 3, 7, ... with sep
   641  	VSEL     V6, V5, V29, V13   // merge index 0, 1, 4, 5, using mask
   642  	VSEL     V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask
   643  	VSEL     V14, V13, V31, V7  // final merge
   644  	VCLZD    V7, V18            // Find first index for each half
   645  	MFVSRD   V18, R25           // Isolate value
   646  	CMP      R25, $64           // If < 64, found
   647  	BLT      foundR25           // Return found index
   648  
   649  	MFVSRLD V18, R25     // Isolate other value
   650  	CMP     R25, $64     // If < 64, found
   651  	ADD     $64, R25     // Update index for high doubleword
   652  	BLT     foundR25     // Return found index
   653  	ADD     $16, R7      // R7+=16 for next string
   654  	ADD     $20, R7, R9  // R7+20 for all bytes to load
   655  	CMP     R9, LASTBYTE // Past end? Maybe check for extra?
   656  	BLT     index4loop   // If not, continue loop
   657  	CMP     R7, LASTSTR  // Check remainder
   658  	BLE     index2to16   // Process remainder
   659  	BR      notfound     // Not found
   660  
   661  index5plus:
   662  	CMP R6, $16     // Check for sep > 16
   663  	BGT index17plus // Handle large sep
   664  
   665  	// Assumption is that the separator is smaller than the string at this point
   666  index2to16:
   667  	CMP R7, LASTSTR // Compare last start byte
   668  	BGT notfound    // last takes len(sep) into account
   669  
   670  	ADD $16, R7, R9    // Check for last byte of string
   671  	CMP R9, LASTBYTE
   672  	BGT index2to16tail
   673  
   674  	// At least 16 bytes of string left
   675  	// Mask the number of bytes in sep
   676  index2to16loop:
   677  	LXVB16X (R7)(R0), V1_ // Load 16 bytes @R7 into V1
   678  
   679  compare:
   680  	VAND       V1, SEPMASK, V2 // Mask out sep size
   681  	VCMPEQUBCC V0, V2, V3      // Compare masked string
   682  	BLT        CR6, found      // All equal
   683  	ADD        $1, R7          // Update ptr to next byte
   684  	CMP        R7, LASTSTR     // Still less than last start byte
   685  	BGT        notfound        // Not found
   686  	ADD        $16, R7, R9     // Verify remaining bytes
   687  	CMP        R9, LASTBYTE    // At least 16
   688  	BLT        index2to16loop  // Try again
   689  
   690  	// Less than 16 bytes remaining in string
   691  	// Separator >= 2
   692  index2to16tail:
   693  	ADD     R3, R4, R9     // End of string
   694  	SUB     R7, R9, R9     // Number of bytes left
   695  	ANDCC   $15, R7, R10   // 16 byte offset
   696  	ADD     R10, R9, R11   // offset + len
   697  	CMP     R11, $16       // > 16?
   698  	BLE     short          // Does not cross 16 bytes
   699  	LXVB16X (R7)(R0), V1_  // Load 16 bytes @R7 into V1
   700  	BR      index2to16next // Continue on
   701  
   702  short:
   703  	RLDICR   $0, R7, $59, R9 // Adjust addr to 16 byte container
   704  	LXVB16X  (R9)(R0), V1_   // Load 16 bytes @R9 into V1
   705  	SLD      $3, R10         // Set up shift
   706  	MTVSRD   R10, V8_        // Set up shift
   707  	VSLDOI   $8, V8, V8, V8
   708  	VSLO     V1, V8, V1      // Shift by start byte
   709  	VSPLTISB $0, V25         // Clear for later use
   710  
   711  index2to16next:
   712  	VAND       V1, SEPMASK, V2 // Just compare size of sep
   713  	VCMPEQUBCC V0, V2, V3      // Compare sep and partial string
   714  	BLT        CR6, found      // Found
   715  	ADD        $1, R7          // Not found, try next partial string
   716  	CMP        R7, LASTSTR     // Check for end of string
   717  	BGT        notfound        // If at end, then not found
   718  	VSLDOI     $1, V1, V25, V1 // Shift string left by 1 byte
   719  	BR         index2to16next  // Check the next partial string
   720  
   721  index17plus:
   722  	CMP      R6, $32       // Check if 17 <= len(sep) <= 32
   723  	BGT      index33plus
   724  	SUB      $16, R6, R9   // Extra > 16
   725  	SLD      $56, R9, R10  // Shift to use in VSLO
   726  	MTVSRD   R10, V9_      // Set up for VSLO
   727  	LXVB16X  (R5)(R9), V1_ // Load 16 bytes @R5+R9 into V1
   728  	VSLO     V1, V9, V1    // Shift left
   729  	VSPLTISB $0xff, V7     // Splat 1s
   730  	VSPLTISB $0, V27       // Splat 0 (V27 is not referenced afterwards)
   731  
   732  index17to32loop:
   733  	LXVB16X (R7)(R0), V2_ // Load 16 bytes @R7 into V2
   734  
   735  next17:
   736  	LXVB16X    (R7)(R9), V3_   // Load 16 bytes @R7+R9 into V3
   737  	VSLO       V3, V9, V3      // Shift left
   738  	VCMPEQUB   V0, V2, V4      // Compare first 16 bytes
   739  	VCMPEQUB   V1, V3, V5      // Compare extra over 16 bytes
   740  	VAND       V4, V5, V6      // Check if both equal
   741  	VCMPEQUBCC V6, V7, V8      // All equal?
   742  	BLT        CR6, found      // Yes
   743  	ADD        $1, R7          // On to next byte
   744  	CMP        R7, LASTSTR     // Check if last start byte
   745  	BGT        notfound        // If too high, not found
   746  	BR         index17to32loop // Continue
   747  
   748  notfound:
   749  	MOVD $-1, R8   // Return -1 if not found
   750  	MOVD R8, (R14)
   751  	RET
   752  
   753  index33plus:
   754  	MOVD $0, (R0) // Case not implemented (len(sep) > 32)
   755  	RET           // Crash before return
   756  
   757  foundR25:
   758  	SRD  $3, R25   // Convert from bits to bytes
   759  	ADD  R25, R7   // Add to current string address
   760  	SUB  R3, R7    // Subtract from start of string
   761  	MOVD R7, (R14) // Return byte where found
   762  	RET
   763  
   764  found:
   765  	SUB  R3, R7    // Return byte where found
   766  	MOVD R7, (R14)
   767  	RET
   768