github.com/primecitizens/pcz/std@v0.2.1/core/bytealg/indexbyte_ppc64x.s (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright 2023 The Prime Citizens
     3  // 
     4  // Copyright 2018 The Go Authors. All rights reserved.
     5  // Use of this source code is governed by a BSD-style
     6  // license that can be found in the LICENSE file.
     7  
     8  //go:build pcz && (ppc64 || ppc64le)
     9  
    10  #include "textflag.h"
    11  
    12  TEXT ·IndexSliceByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
    13  	// R3 = byte array pointer
    14  	// R4 = length
    15  	MOVD R6, R5 // R5 = byte (indexbytebody expects the byte in R5; it arrives in R6 for the slice form)
    16  	MOVBZ ·isPOWER9(SB), R16 // R16 = 1 on POWER9, 0 otherwise; selects code path in indexbytebody
    17  	BR indexbytebody<>(SB) // tail-branch into the shared body; result returned in R3
    18  
    19  TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
    20  	// R3 = string
    21  	// R4 = length
    22  	// R5 = byte (already in the register indexbytebody expects)
    23  	MOVBZ ·isPOWER9(SB), R16 // R16 = 1 on POWER9, 0 otherwise; selects code path in indexbytebody
    24  	BR indexbytebody<>(SB) // tail-branch into the shared body; result returned in R3
    25  
    26  // R3 = addr of string
    27  // R4 = len of string
    28  // R5 = byte to find
    29  // R16 = 1 if running on a POWER9 system, 0 otherwise
    30  // On exit:
    31  // R3 = return value
    32  TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
    33  	MOVD R3,R17 // Save base address for calculating the index later.
    34  	RLDICR $0,R3,$60,R8 // Align address to doubleword boundary in R8.
    35  	RLDIMI $8,R5,$48,R5 // Replicating the byte across the register.
    36  	ADD R4,R3,R7 // Last acceptable address in R7.
    37  
    38  	RLDIMI $16,R5,$32,R5
    39  	CMPU R4,$32 // Check if it's a small string (<32 bytes). Those will be processed differently.
    40  	MOVD $-1,R9
    41  	RLWNM $3,R3,$26,$28,R6 // shift amount for mask (r3&0x7)*8
    42  	RLDIMI $32,R5,$0,R5 // R5 now holds the target byte replicated in all 8 byte lanes.
    43  	MOVD R7,R10 // Save last acceptable address in R10 for later.
    44  	ADD $-1,R7,R7 // R7 = last valid byte address (inclusive).
    45  #ifdef GOARCH_ppc64le
    46  	SLD R6,R9,R9 // Prepare mask for Little Endian
    47  #else
    48  	SRD R6,R9,R9 // Same for Big Endian
    49  #endif
    50  	BLT small_string // Jump to the small string case if it's <32 bytes.
    51  	CMP R16,$1 // optimize for power8 v power9
    52  	BNE power8
    53  	VSPLTISB $3,V10 // Use V10 as control for VBPERMQ
    54  	MTVRD R5,V1 // Move the replicated byte pattern into vector register V1.
    55  	LVSL (R0+R0),V11 // set up the permute vector such that V10 has {0x78, .., 0x8, 0x0}
    56  	VSLB V11,V10,V10 // to extract the first bit of match result into GPR
    57  	VSPLTB $7,V1,V1 // Replicate byte across V1
    58  	CMP R4,$64 // At least 64 bytes? If not, handle the 32-63 byte case at cmp32.
    59  	MOVD $16,R11
    60  	MOVD R3,R8
    61  	BLT cmp32
    62  	MOVD $32,R12
    63  	MOVD $48,R6
    64  
    65  loop64:
    66  	LXVB16X (R0)(R8),V2 // scan 64 bytes at a time
    67  	VCMPEQUBCC V2,V1,V6
    68  	BNE CR6,foundat0 // match found at R8, jump out
    69  
    70  	LXVB16X (R8)(R11),V2
    71  	VCMPEQUBCC V2,V1,V6
    72  	BNE CR6,foundat1 // match found at R8+16 bytes, jump out
    73  
    74  	LXVB16X (R8)(R12),V2
    75  	VCMPEQUBCC V2,V1,V6
    76  	BNE CR6,foundat2 // match found at R8+32 bytes, jump out
    77  
    78  	LXVB16X (R8)(R6),V2
    79  	VCMPEQUBCC V2,V1,V6
    80  	BNE CR6,foundat3 // match found at R8+48 bytes, jump out
    81  	ADD $64,R8
    82  	ADD $-64,R4
    83  	CMP R4,$64 // >=64 bytes left to scan?
    84  	BGE loop64
    85  	CMP R4,$32
    86  	BLT rem // jump to rem if there are < 32 bytes left
    87  cmp32:
    88  	LXVB16X (R0)(R8),V2 // 32-63 bytes left
    89  	VCMPEQUBCC V2,V1,V6
    90  	BNE CR6,foundat0 // match found at R8
    91  
    92  	LXVB16X (R11)(R8),V2
    93  	VCMPEQUBCC V2,V1,V6
    94  	BNE CR6,foundat1 // match found at R8+16
    95  
    96  	ADD $32,R8
    97  	ADD $-32,R4
    98  rem:
    99  	RLDICR $0,R8,$60,R8 // align address to reuse code for tail end processing
   100  	BR small_string
   101  
   102  foundat3:
   103  	ADD $16,R8
   104  foundat2:
   105  	ADD $16,R8
   106  foundat1:
   107  	ADD $16,R8
   108  foundat0:
   109  	// Compress the result into a single doubleword and
   110  	// move it to a GPR for the final calculation.
   111  	VBPERMQ V6,V10,V6
   112  	MFVRD V6,R3
   113  	// count leading zeroes upto the match that ends up in low 16 bits
   114  	// in both endian modes, compute index by subtracting the number by 16
   115  	CNTLZW R3,R11
   116  	ADD $-16,R11
   117  	ADD R8,R11,R3 // Calculate byte address
   118  	SUB R17,R3 // Index = match address - saved base address.
   119  	RET
   120  power8:
   121  	// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
   122  	// in V0, V1 and V10, then branch to the preloop.
   123  	ANDCC $63,R3,R11
   124  	BEQ CR0,qw_align
   125  	RLDICL $0,R3,$61,R11 // R11 = R3 & 7 (byte offset into the aligned doubleword).
   126  
   127  	MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
   128  	CMPB R12,R5,R3 // Check for a match.
   129  	AND R9,R3,R3 // Mask bytes below s_base
   130  	RLDICR $0,R7,$60,R7 // Last doubleword in R7
   131  	CMPU R3,$0,CR7 // If we have a match, jump to the final computation
   132  	BNE CR7,done
   133  	ADD $8,R8,R8
   134  	ADD $-8,R4,R4
   135  	ADD R4,R11,R4 // NOTE(review): re-adds the sub-doubleword offset R11 so R4 tracks length relative to the aligned pointer R8.
   136  
   137  	// Check for quadword alignment
   138  	ANDCC $15,R8,R11
   139  	BEQ CR0,qw_align
   140  
   141  	// Not aligned, so handle the next doubleword
   142  	MOVD 0(R8),R12
   143  	CMPB R12,R5,R3
   144  	CMPU R3,$0,CR7
   145  	BNE CR7,done
   146  	ADD $8,R8,R8
   147  	ADD $-8,R4,R4
   148  
   149  	// Either quadword aligned or 64-byte at this point. We can use LVX.
   150  qw_align:
   151  
   152  	// Set up auxiliary data for the vectorized algorithm.
   153  	VSPLTISB  $0,V0 // Replicate 0 across V0
   154  	VSPLTISB  $3,V10 // Use V10 as control for VBPERMQ
   155  	MTVRD   R5,V1 // Move the replicated byte pattern into V1.
   156  	LVSL   (R0+R0),V11
   157  	VSLB   V11,V10,V10
   158  	VSPLTB   $7,V1,V1 // Replicate byte across V1
   159  	CMPU   R4, $64 // If len ≤ 64, don't use the vectorized loop
   160  	BLE   tail
   161  
   162  	// We will load 4 quardwords per iteration in the loop, so check for
   163  	// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
   164  	ANDCC   $63,R8,R11
   165  	BEQ   CR0,preloop
   166  
   167  	// Not 64-byte aligned. Load one quadword at a time until aligned.
   168  	LVX     (R8+R0),V4
   169  	VCMPEQUBCC  V1,V4,V6 // Check for byte in V4
   170  	BNE     CR6,found_qw_align
   171  	ADD     $16,R8,R8
   172  	ADD     $-16,R4,R4
   173  
   174  	ANDCC     $63,R8,R11
   175  	BEQ     CR0,preloop
   176  	LVX     (R8+R0),V4
   177  	VCMPEQUBCC  V1,V4,V6 // Check for byte in V4
   178  	BNE     CR6,found_qw_align
   179  	ADD     $16,R8,R8
   180  	ADD     $-16,R4,R4
   181  
   182  	ANDCC     $63,R8,R11
   183  	BEQ     CR0,preloop
   184  	LVX     (R8+R0),V4
   185  	VCMPEQUBCC  V1,V4,V6 // Check for byte in V4
   186  	BNE     CR6,found_qw_align
   187  	ADD     $-16,R4,R4
   188  	ADD     $16,R8,R8
   189  
   190  	// 64-byte aligned. Prepare for the main loop.
   191  preloop:
   192  	CMPU R4,$64
   193  	BLE tail       // If len ≤ 64, don't use the vectorized loop
   194  
   195  	// We are now aligned to a 64-byte boundary. We will load 4 quadwords
   196  	// per loop iteration. The last doubleword is in R10, so our loop counter
   197  	// starts at (R10-R8)/64.
   198  	SUB R8,R10,R6
   199  	SRD $6,R6,R9      // Loop counter in R9
   200  	MOVD R9,CTR // CTR drives the bdnz (BC 16,0) at the loop bottom.
   201  
   202  	ADD $-64,R8,R8   // Adjust index for loop entry
   203  	MOVD $16,R11      // Load offsets for the vector loads
   204  	MOVD $32,R9
   205  	MOVD $48,R7
   206  
   207  	// Main loop we will load 64 bytes per iteration
   208  loop:
   209  	ADD     $64,R8,R8       // Fuse addi+lvx for performance
   210  	LVX     (R8+R0),V2       // Load 4 16-byte vectors
   211  	LVX     (R8+R11),V3
   212  	VCMPEQUB    V1,V2,V6       // Look for byte in each vector
   213  	VCMPEQUB    V1,V3,V7
   214  
   215  	LVX     (R8+R9),V4
   216  	LVX     (R8+R7),V5
   217  	VCMPEQUB    V1,V4,V8
   218  	VCMPEQUB    V1,V5,V9
   219  
   220  	VOR     V6,V7,V11       // Compress the result in a single vector
   221  	VOR     V8,V9,V12
   222  	VOR     V11,V12,V13
   223  	VCMPEQUBCC  V0,V13,V14       // Check for byte
   224  	BGE     CR6,found
   225  	BC     16,0,loop       // bdnz loop
   226  
   227  	// Handle the tailing bytes or R4 ≤ 64
   228  	RLDICL $0,R6,$58,R4 // R4 = R6 & 63 (tail byte count left after the 64-byte blocks).
   229  	ADD $64,R8,R8
   230  tail:
   231  	CMPU     R4,$0
   232  	BEQ     notfound
   233  	LVX     (R8+R0),V4
   234  	VCMPEQUBCC  V1,V4,V6
   235  	BNE     CR6,found_qw_align
   236  	ADD     $16,R8,R8
   237  	CMPU     R4,$16,CR6
   238  	BLE     CR6,notfound
   239  	ADD     $-16,R4,R4
   240  
   241  	LVX     (R8+R0),V4
   242  	VCMPEQUBCC  V1,V4,V6
   243  	BNE     CR6,found_qw_align
   244  	ADD     $16,R8,R8
   245  	CMPU     R4,$16,CR6
   246  	BLE     CR6,notfound
   247  	ADD     $-16,R4,R4
   248  
   249  	LVX     (R8+R0),V4
   250  	VCMPEQUBCC  V1,V4,V6
   251  	BNE     CR6,found_qw_align
   252  	ADD     $16,R8,R8
   253  	CMPU     R4,$16,CR6
   254  	BLE     CR6,notfound
   255  	ADD     $-16,R4,R4
   256  
   257  	LVX     (R8+R0),V4
   258  	VCMPEQUBCC  V1,V4,V6
   259  	BNE     CR6,found_qw_align
   260  
   261  notfound:
   262  	MOVD $-1, R3 // Byte not present: return -1.
   263  	RET
   264  
   265  found:
   266  	// We will now compress the results into a single doubleword,
   267  	// so it can be moved to a GPR for the final index calculation.
   268  
   269  	// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
   270  	// first bit of each byte into bits 48-63.
   271  	VBPERMQ   V6,V10,V6
   272  	VBPERMQ   V7,V10,V7
   273  	VBPERMQ   V8,V10,V8
   274  	VBPERMQ   V9,V10,V9
   275  
   276  	// Shift each 16-bit component into its correct position for
   277  	// merging into a single doubleword.
   278  #ifdef GOARCH_ppc64le
   279  	VSLDOI   $2,V7,V7,V7
   280  	VSLDOI   $4,V8,V8,V8
   281  	VSLDOI   $6,V9,V9,V9
   282  #else
   283  	VSLDOI   $6,V6,V6,V6
   284  	VSLDOI   $4,V7,V7,V7
   285  	VSLDOI   $2,V8,V8,V8
   286  #endif
   287  
   288  	// Merge V6-V9 into a single doubleword and move to a GPR.
   289  	VOR V6,V7,V11
   290  	VOR V8,V9,V4
   291  	VOR V4,V11,V4
   292  	MFVRD V4,R3
   293  
   294  #ifdef GOARCH_ppc64le
   295  	ADD   $-1,R3,R11
   296  	ANDN   R3,R11,R11
   297  	POPCNTD   R11,R11 // Count trailing zeros (Little Endian).
   298  #else
   299  	CNTLZD R3,R11 // Count leading zeros (Big Endian).
   300  #endif
   301  	ADD R8,R11,R3 // Calculate byte address
   302  
   303  return:
   304  	SUB R17, R3 // Index = match address - saved base address.
   305  	RET
   306  
   307  found_qw_align:
   308  	// Use the same algorithm as above. Compress the result into
   309  	// a single doubleword and move it to a GPR for the final
   310  	// calculation.
   311  	VBPERMQ   V6,V10,V6
   312  
   313  #ifdef GOARCH_ppc64le
   314  	MFVRD   V6,R3
   315  	ADD   $-1,R3,R11
   316  	ANDN   R3,R11,R11
   317  	POPCNTD   R11,R11
   318  #else
   319  	VSLDOI   $6,V6,V6,V6
   320  	MFVRD   V6,R3
   321  	CNTLZD   R3,R11
   322  #endif
   323  	ADD   R8,R11,R3
   324  	CMPU   R11,R4 // Reject a "match" that lies past the remaining length.
   325  	BLT   return
   326  	BR   notfound
   327  	PCALIGN   $16
   328  
   329  done:
   330  	ADD $-1,R10,R6 // R6 = last valid address (R10 is one past the end).
   331  	// Offset of last index for the final
   332  	// doubleword comparison
   333  	RLDICL $0,R6,$61,R6
   334  	// At this point, R3 has 0xFF in the same position as the byte we are
   335  	// looking for in the doubleword. Use that to calculate the exact index
   336  	// of the byte.
   337  #ifdef GOARCH_ppc64le
   338  	ADD $-1,R3,R11
   339  	ANDN R3,R11,R11
   340  	POPCNTD R11,R11 // Count trailing zeros (Little Endian).
   341  #else
   342  	CNTLZD R3,R11 // Count leading zeros (Big Endian).
   343  #endif
   344  	CMPU R8,R7 // Check if we are at the last doubleword.
   345  	SRD $3,R11 // Convert trailing zeros to bytes.
   346  	ADD R11,R8,R3
   347  	CMPU R11,R6,CR7 // If at the last doubleword, check the byte offset.
   348  	BNE return
   349  	BLE CR7,return
   350  	BR notfound
   351  
   352  small_string:
   353  	// process string of length < 32 bytes
   354  	// We unroll this loop for better performance.
   355  	CMPU R4,$0 // Check for length=0
   356  	BEQ notfound
   357  
   358  	MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
   359  	CMPB R12,R5,R3 // Check for a match.
   360  	AND R9,R3,R3 // Mask bytes below s_base.
   361  	CMPU R3,$0,CR7 // If we have a match, jump to the final computation.
   362  	RLDICR $0,R7,$60,R7 // Last doubleword in R7.
   363  	CMPU R8,R7
   364  	BNE CR7,done
   365  	BEQ notfound // Hit length.
   366  
   367  	MOVDU 8(R8),R12 // Advance to and load the next doubleword (R8 updated).
   368  	CMPB R12,R5,R3
   369  	CMPU R3,$0,CR6
   370  	CMPU R8,R7
   371  	BNE CR6,done
   372  	BEQ notfound
   373  
   374  	MOVDU 8(R8),R12
   375  	CMPB R12,R5,R3
   376  	CMPU R3,$0,CR6
   377  	CMPU R8,R7
   378  	BNE CR6,done
   379  	BEQ notfound
   380  
   381  	MOVDU 8(R8),R12
   382  	CMPB R12,R5,R3
   383  	CMPU R3,$0,CR6
   384  	CMPU R8,R7
   385  	BNE CR6,done
   386  	BEQ notfound
   387  
   388  	MOVDU 8(R8),R12
   389  	CMPB R12,R5,R3
   390  	CMPU R3,$0,CR6
   391  	BNE CR6,done
   392  	BR notfound
   393
   393