// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ppc64 || ppc64le

#include "go_asm.h"
#include "textflag.h"

// func IndexByte(b []byte, c byte) int
// ABIInternal entry: R3 = base pointer, R4 = length, R6 = byte to find.
// Normalizes the byte into R5 and the POWER9 capability flag into R16,
// then tail-calls the shared body.
TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
	// R3 = byte array pointer
	// R4 = length
	MOVD	R6, R5		// R5 = byte
	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
	BR	indexbytebody<>(SB)

// func IndexByteString(s string, c byte) int
// ABIInternal entry: R3 = string base, R4 = length, R5 = byte to find.
TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
	// R3 = string
	// R4 = length
	// R5 = byte
	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
	BR	indexbytebody<>(SB)

// Shared search body for IndexByte and IndexByteString.
// On entry:
// R3 = addr of string
// R4 = len of string
// R5 = byte to find
// R16 = 1 if running on a POWER9 system, 0 otherwise
// On exit:
// R3 = return value (index of first match, or -1 if not found)
TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
	MOVD	R3,R17		// Save base address for calculating the index later.
	RLDICR	$0,R3,$60,R8	// Align address to doubleword boundary in R8.
	RLDIMI	$8,R5,$48,R5	// Start replicating the byte across the register
				// (this and the two RLDIMIs below fill all 8 byte lanes of R5).
	ADD	R4,R3,R7	// Last acceptable address in R7.

	RLDIMI	$16,R5,$32,R5
	CMPU	R4,$32		// Check if it's a small string (≤32 bytes). Those will be processed differently.
	MOVD	$-1,R9		// All-ones mask; trimmed below to hide bytes before the base.
	RLWNM	$3,R3,$26,$28,R6	// shift amount for mask: (r3&0x7)*8
	RLDIMI	$32,R5,$0,R5	// R5 now holds the target byte replicated in every byte lane.
	MOVD	R7,R10		// Save last acceptable address in R10 for later.
	ADD	$-1,R7,R7
#ifdef GOARCH_ppc64le
	SLD	R6,R9,R9	// Prepare mask for Little Endian
#else
	SRD	R6,R9,R9	// Same for Big Endian
#endif
	BLT	small_string	// Jump to the small string case if it's <32 bytes.
	CMP	R16,$1		// optimize for power8 v power9
	BNE	power8
	// POWER9 path: uses LXVB16X (byte-ordered vector loads), so no
	// alignment fixup is needed before the vector loop.
	VSPLTISB	$3,V10	// Use V10 as control for VBPERMQ
	MTVRD	R5,V1
	LVSL	(R0+R0),V11	// set up the permute vector such that V10 has {0x78, .., 0x8, 0x0}
	VSLB	V11,V10,V10	// to extract the first bit of match result into GPR
	VSPLTB	$7,V1,V1	// Replicate byte across V1
	CMP	R4,$64
	MOVD	$16,R11
	MOVD	R3,R8
	BLT	cmp32
	MOVD	$32,R12
	MOVD	$48,R6

loop64:
	LXVB16X	(R0)(R8),V2	// scan 64 bytes at a time
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0	// match found at R8, jump out

	LXVB16X	(R8)(R11),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat1	// match found at R8+16 bytes, jump out

	LXVB16X	(R8)(R12),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat2	// match found at R8+32 bytes, jump out

	LXVB16X	(R8)(R6),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat3	// match found at R8+48 bytes, jump out
	ADD	$64,R8
	ADD	$-64,R4
	CMP	R4,$64		// >=64 bytes left to scan?
	BGE	loop64
	CMP	R4,$32
	BLT	rem		// jump to rem if there are < 32 bytes left
cmp32:
	LXVB16X	(R0)(R8),V2	// 32-63 bytes left
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0	// match found at R8

	LXVB16X	(R11)(R8),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat1	// match found at R8+16

	ADD	$32,R8
	ADD	$-32,R4
rem:
	RLDICR	$0,R8,$60,R8	// align address to reuse code for tail end processing
	BR	small_string

	// Fall-through ladder: each label adds 16 to reach the quadword
	// in which the match was detected.
foundat3:
	ADD	$16,R8
foundat2:
	ADD	$16,R8
foundat1:
	ADD	$16,R8
foundat0:
	// Compress the result into a single doubleword and
	// move it to a GPR for the final calculation.
	VBPERMQ	V6,V10,V6
	MFVRD	V6,R3
	// Count leading zeroes up to the match, which ends up in the low 16 bits
	// in both endian modes; compute the index by subtracting 16 from that count.
	CNTLZW	R3,R11
	ADD	$-16,R11
	ADD	R8,R11,R3	// Calculate byte address
	SUB	R17,R3		// Convert address to index relative to the saved base.
	RET

	// POWER8 path: LVX requires 16-byte-aligned loads, so scalar CMPB
	// doublewords are used to reach alignment first.
power8:
	// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
	// in V0, V1 and V10, then branch to the preloop.
	ANDCC	$63,R3,R11
	BEQ	CR0,qw_align
	RLDICL	$0,R3,$61,R11	// R11 = R3 & 7: bytes the base sits past doubleword alignment.

	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
	CMPB	R12,R5,R3	// Check for a match (0xFF in each matching byte lane).
	AND	R9,R3,R3	// Mask bytes below s_base
	RLDICR	$0,R7,$60,R7	// Last doubleword in R7
	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation
	BNE	CR7,done
	ADD	$8,R8,R8
	ADD	$-8,R4,R4
	ADD	R4,R11,R4	// Adjust remaining length for the bytes skipped before the aligned base.

	// Check for quadword alignment
	ANDCC	$15,R8,R11
	BEQ	CR0,qw_align

	// Not aligned, so handle the next doubleword
	MOVD	0(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR7
	BNE	CR7,done
	ADD	$8,R8,R8
	ADD	$-8,R4,R4

	// Either quadword aligned or 64-byte aligned at this point. We can use LVX.
qw_align:

	// Set up auxiliary data for the vectorized algorithm.
	VSPLTISB	$0,V0	// Replicate 0 across V0
	VSPLTISB	$3,V10	// Use V10 as control for VBPERMQ
	MTVRD	R5,V1
	LVSL	(R0+R0),V11
	VSLB	V11,V10,V10
	VSPLTB	$7,V1,V1	// Replicate byte across V1
	CMPU	R4, $64		// If len ≤ 64, don't use the vectorized loop
	BLE	tail

	// We will load 4 quadwords per iteration in the loop, so check for
	// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
	ANDCC	$63,R8,R11
	BEQ	CR0,preloop

	// Not 64-byte aligned. Load one quadword at a time until aligned
	// (at most three iterations, unrolled below).
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	ADD	$-16,R4,R4

	ANDCC	$63,R8,R11
	BEQ	CR0,preloop
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	ADD	$-16,R4,R4

	ANDCC	$63,R8,R11
	BEQ	CR0,preloop
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
	BNE	CR6,found_qw_align
	ADD	$-16,R4,R4
	ADD	$16,R8,R8

	// 64-byte aligned. Prepare for the main loop.
preloop:
	CMPU	R4,$64
	BLE	tail		// If len ≤ 64, don't use the vectorized loop

	// We are now aligned to a 64-byte boundary. We will load 4 quadwords
	// per loop iteration. The last doubleword is in R10, so our loop counter
	// starts at (R10-R8)/64.
	SUB	R8,R10,R6
	SRD	$6,R6,R9	// Loop counter in R9
	MOVD	R9,CTR

	ADD	$-64,R8,R8	// Adjust index for loop entry
	MOVD	$16,R11		// Load offsets for the vector loads
	MOVD	$32,R9
	MOVD	$48,R7

	// Main loop: we will load 64 bytes per iteration
loop:
	ADD	$64,R8,R8	// Fuse addi+lvx for performance
	LVX	(R8+R0),V2	// Load 4 16-byte vectors
	LVX	(R8+R11),V3
	VCMPEQUB	V1,V2,V6	// Look for byte in each vector
	VCMPEQUB	V1,V3,V7

	LVX	(R8+R9),V4
	LVX	(R8+R7),V5
	VCMPEQUB	V1,V4,V8
	VCMPEQUB	V1,V5,V9

	VOR	V6,V7,V11	// Compress the result in a single vector
	VOR	V8,V9,V12
	VOR	V11,V12,V13
	VCMPEQUBCC	V0,V13,V14	// Check for byte
	BGE	CR6,found
	BC	16,0,loop	// bdnz loop

	// Handle the trailing bytes, or R4 ≤ 64
	RLDICL	$0,R6,$58,R4	// R4 = remaining length: low 6 bits of (R10-R8) computed above.
	ADD	$64,R8,R8
tail:
	// Scan up to four remaining quadwords, one at a time.
	CMPU	R4,$0
	BEQ	notfound
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	CMPU	R4,$16,CR6
	BLE	CR6,notfound
	ADD	$-16,R4,R4

	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	CMPU	R4,$16,CR6
	BLE	CR6,notfound
	ADD	$-16,R4,R4

	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	CMPU	R4,$16,CR6
	BLE	CR6,notfound
	ADD	$-16,R4,R4

	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align

notfound:
	MOVD	$-1, R3
	RET

found:
	// We will now compress the results into a single doubleword,
	// so it can be moved to a GPR for the final index calculation.

	// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
	// first bit of each byte into bits 48-63.
	VBPERMQ	V6,V10,V6
	VBPERMQ	V7,V10,V7
	VBPERMQ	V8,V10,V8
	VBPERMQ	V9,V10,V9

	// Shift each 16-bit component into its correct position for
	// merging into a single doubleword.
#ifdef GOARCH_ppc64le
	VSLDOI	$2,V7,V7,V7
	VSLDOI	$4,V8,V8,V8
	VSLDOI	$6,V9,V9,V9
#else
	VSLDOI	$6,V6,V6,V6
	VSLDOI	$4,V7,V7,V7
	VSLDOI	$2,V8,V8,V8
#endif

	// Merge V6-V9 into a single doubleword and move to a GPR.
	VOR	V6,V7,V11
	VOR	V8,V9,V4
	VOR	V4,V11,V4
	MFVRD	V4,R3

#ifdef GOARCH_ppc64le
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11
	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
#else
	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
#endif
	ADD	R8,R11,R3	// Calculate byte address

return:
	SUB	R17, R3		// Convert address to index relative to the saved base.
	RET

found_qw_align:
	// Use the same algorithm as above. Compress the result into
	// a single doubleword and move it to a GPR for the final
	// calculation.
	VBPERMQ	V6,V10,V6

#ifdef GOARCH_ppc64le
	MFVRD	V6,R3
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11
	POPCNTD	R11,R11
#else
	VSLDOI	$6,V6,V6,V6
	MFVRD	V6,R3
	CNTLZD	R3,R11
#endif
	ADD	R8,R11,R3
	CMPU	R11,R4		// Match may lie past the end of the string:
	BLT	return		// only valid if its offset is within the remaining length.
	BR	notfound
	PCALIGN	$16

done:
	// Scalar (CMPB) match epilogue.
	ADD	$-1,R10,R6
	// Offset of last valid byte within the final
	// doubleword, for the last-doubleword comparison.
	RLDICL	$0,R6,$61,R6
	// At this point, R3 has 0xFF in the same position as the byte we are
	// looking for in the doubleword. Use that to calculate the exact index
	// of the byte.
#ifdef GOARCH_ppc64le
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11
	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
#else
	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
#endif
	CMPU	R8,R7		// Check if we are at the last doubleword.
	SRD	$3,R11		// Convert trailing zeros to bytes.
	ADD	R11,R8,R3
	CMPU	R11,R6,CR7	// If at the last doubleword, check the byte offset.
	BNE	return		// Not the last doubleword: any match is valid.
	BLE	CR7,return	// Last doubleword: valid only if the offset is in range.
	BR	notfound

small_string:
	// Process a string of length < 32 bytes, one doubleword at a time.
	// We unroll this loop for better performance.
	CMPU	R4,$0		// Check for length=0
	BEQ	notfound

	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
	CMPB	R12,R5,R3	// Check for a match.
	AND	R9,R3,R3	// Mask bytes below s_base.
	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation.
	RLDICR	$0,R7,$60,R7	// Last doubleword in R7.
	CMPU	R8,R7
	BNE	CR7,done
	BEQ	notfound	// Hit length.

	MOVDU	8(R8),R12	// Pre-incrementing load: advances R8 by 8, then loads.
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	BNE	CR6,done
	BR	notfound