// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build ppc64 ppc64le

#include "go_asm.h"
#include "textflag.h"

// func IndexByte(b []byte, c byte) int
// All four entry points below only marshal arguments into the shared
// register contract and branch to indexbytebody<>:
//   R3  = base pointer, R4 = length, R5 = byte to find, R14 = &ret.
TEXT ·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
	MOVD	b_base+0(FP), R3	// R3 = byte array pointer
	MOVD	b_len+8(FP), R4		// R4 = length
	MOVBZ	c+24(FP), R5		// R5 = byte
	MOVD	$ret+32(FP), R14	// R14 = &ret
	BR	indexbytebody<>(SB)

// func IndexByteString(s string, c byte) int
TEXT ·IndexByteString(SB),NOSPLIT|NOFRAME,$0-32
	MOVD	s_base+0(FP), R3	// R3 = string
	MOVD	s_len+8(FP), R4		// R4 = length
	MOVBZ	c+16(FP), R5		// R5 = byte
	MOVD	$ret+24(FP), R14	// R14 = &ret
	BR	indexbytebody<>(SB)

// Linkname-style alias used by package bytes; shares ·IndexByte's stack map.
TEXT bytes·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
	FUNCDATA $0, ·IndexByte·args_stackmap(SB)
	MOVD	b_base+0(FP), R3	// R3 = byte array pointer
	MOVD	b_len+8(FP), R4		// R4 = length
	MOVBZ	c+24(FP), R5		// R5 = byte
	MOVD	$ret+32(FP), R14	// R14 = &ret
	BR	indexbytebody<>(SB)

// Linkname-style alias used by package strings; shares ·IndexByteString's stack map.
TEXT strings·IndexByte(SB),NOSPLIT|NOFRAME,$0-32
	FUNCDATA $0, ·IndexByteString·args_stackmap(SB)
	MOVD	s_base+0(FP), R3	// R3 = string
	MOVD	s_len+8(FP), R4		// R4 = length
	MOVBZ	c+16(FP), R5		// R5 = byte
	MOVD	$ret+24(FP), R14	// R14 = &ret
	BR	indexbytebody<>(SB)

// indexbytebody is the shared search body.
// In:  R3 = base pointer, R4 = length, R5 = byte to find, R14 = &result.
// Out: index of first match (or -1) is stored through R14; nothing is
//      returned in registers. Clobbers R3-R12, R17, CTR, CR0/CR6/CR7,
//      and V0-V14 on the vector paths.
// Strategy: align to a doubleword and scan with CMPB; once quadword/64-byte
// aligned, use AltiVec (LVX + VCMPEQUB) to scan 64 bytes per iteration;
// strings of <=32 bytes take a fully scalar unrolled path (small_string).
TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
	MOVD	R3,R17		// Save base address for calculating the index later.
	RLDICR	$0,R3,$60,R8	// Align address to doubleword boundary in R8.
	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
	ADD	R4,R3,R7	// Last acceptable address in R7.
	DCBT	(R8)		// Prepare cache line.

	RLDIMI	$16,R5,$32,R5
	CMPU	R4,$32		// Check if it's a small string (<=32 bytes). Those will be processed differently.
	MOVD	$-1,R9
	// Raw encoding: the assembler of this era lacks this rlwinm form.
	WORD	$0x54661EB8	// Calculate padding in R6 (rlwinm r6,r3,3,26,28).
	RLDIMI	$32,R5,$0,R5	// R5 now has the target byte replicated in all 8 byte lanes.
	MOVD	R7,R10		// Save last acceptable address in R10 for later.
	ADD	$-1,R7,R7
#ifdef GOARCH_ppc64le
	SLD	R6,R9,R9	// Prepare mask for Little Endian
#else
	SRD	R6,R9,R9	// Same for Big Endian
#endif
	BLE	small_string	// Jump to the small string case if it's <=32 bytes.

	// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
	// in V0, V1 and V10, then branch to the preloop.
	ANDCC	$63,R3,R11
	BEQ	CR0,qw_align
	RLDICL	$0,R3,$61,R11	// R11 = offset of base within its doubleword.

	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
	CMPB	R12,R5,R3	// Check for a match: R3 gets 0xFF in every matching byte lane.
	AND	R9,R3,R3	// Mask bytes below s_base
	RLDICL	$0,R7,$61,R6	// length-1
	RLDICR	$0,R7,$60,R7	// Last doubleword in R7
	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation
	BNE	CR7,done
	ADD	$8,R8,R8
	ADD	$-8,R4,R4
	ADD	R4,R11,R4	// Re-add the leading bytes we skipped so R4 counts from R8.

	// Check for quadword alignment
	ANDCC	$15,R8,R11
	BEQ	CR0,qw_align

	// Not aligned, so handle the next doubleword
	MOVD	0(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR7
	BNE	CR7,done
	ADD	$8,R8,R8
	ADD	$-8,R4,R4

	// Either quadword or 64-byte aligned at this point. We can use LVX.
qw_align:

	// Set up auxiliary data for the vectorized algorithm.
	VSPLTISB	$0,V0	// Replicate 0 across V0
	VSPLTISB	$3,V10	// Use V10 as control for VBPERMQ
	MTVRD	R5,V1
	LVSL	(R0+R0),V11	// V11 = 0,1,2,...,15 (permute identity, used to build VBPERMQ control).
	VSLB	V11,V10,V10
	VSPLTB	$7,V1,V1	// Replicate byte across V1
	CMPU	R4, $64		// If len <= 64, don't use the vectorized loop
	BLE	tail

	// We will load 4 quadwords per iteration in the loop, so check for
	// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
	ANDCC	$63,R8,R11
	BEQ	CR0,preloop

	// Not 64-byte aligned. Load one quadword at a time until aligned.
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	ADD	$-16,R4,R4

	ANDCC	$63,R8,R11
	BEQ	CR0,preloop
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	ADD	$-16,R4,R4

	ANDCC	$63,R8,R11
	BEQ	CR0,preloop
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
	BNE	CR6,found_qw_align
	ADD	$-16,R4,R4
	ADD	$16,R8,R8

	// 64-byte aligned. Prepare for the main loop.
preloop:
	CMPU	R4,$64
	BLE	tail		// If len <= 64, don't use the vectorized loop

	// We are now aligned to a 64-byte boundary. We will load 4 quadwords
	// per loop iteration. The last doubleword is in R10, so our loop counter
	// starts at (R10-R8)/64.
	SUB	R8,R10,R6
	SRD	$6,R6,R9	// Loop counter in R9
	MOVD	R9,CTR

	ADD	$-64,R8,R8	// Adjust index for loop entry
	MOVD	$16,R11		// Load offsets for the vector loads
	MOVD	$32,R9
	MOVD	$48,R7

	// Main loop: we will load 64 bytes per iteration
loop:
	ADD	$64,R8,R8	// Fuse addi+lvx for performance
	LVX	(R8+R0),V2	// Load 4 16-byte vectors
	LVX	(R8+R11),V3
	VCMPEQUB	V1,V2,V6	// Look for byte in each vector
	VCMPEQUB	V1,V3,V7

	LVX	(R8+R9),V4
	LVX	(R8+R7),V5
	VCMPEQUB	V1,V4,V8
	VCMPEQUB	V1,V5,V9

	VOR	V6,V7,V11	// Compress the result in a single vector
	VOR	V8,V9,V12
	VOR	V11,V12,V13
	VCMPEQUBCC	V0,V13,V14	// Check for byte (any nonzero lane means a match somewhere)
	BGE	CR6,found
	BC	16,0,loop	// bdnz loop: decrement CTR, branch while nonzero

	// Handle the trailing bytes or R4 <= 64
	RLDICL	$0,R6,$58,R4	// R4 = remaining length mod 64.
	ADD	$64,R8,R8
tail:
	CMPU	R4,$0
	BEQ	notfound
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	CMPU	R4,$16,CR6
	BLE	CR6,notfound
	ADD	$-16,R4,R4

	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	CMPU	R4,$16,CR6
	BLE	CR6,notfound
	ADD	$-16,R4,R4

	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	CMPU	R4,$16,CR6
	BLE	CR6,notfound
	ADD	$-16,R4,R4

	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align

notfound:
	MOVD	$-1,R3		// No match: store -1 through &ret.
	MOVD	R3,(R14)
	RET

found:
	// We will now compress the results into a single doubleword,
	// so it can be moved to a GPR for the final index calculation.

	// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
	// first bit of each byte into bits 48-63.
	VBPERMQ	V6,V10,V6
	VBPERMQ	V7,V10,V7
	VBPERMQ	V8,V10,V8
	VBPERMQ	V9,V10,V9

	// Shift each 16-bit component into its correct position for
	// merging into a single doubleword.
#ifdef GOARCH_ppc64le
	VSLDOI	$2,V7,V7,V7
	VSLDOI	$4,V8,V8,V8
	VSLDOI	$6,V9,V9,V9
#else
	VSLDOI	$6,V6,V6,V6
	VSLDOI	$4,V7,V7,V7
	VSLDOI	$2,V8,V8,V8
#endif

	// Merge V6-V9 into a single doubleword and move to a GPR.
	VOR	V6,V7,V11
	VOR	V8,V9,V4
	VOR	V4,V11,V4
	MFVRD	V4,R3

#ifdef GOARCH_ppc64le
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11	// Isolate lowest set bit: R11 = (R3-1) &^ R3.
	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
#else
	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
#endif
	ADD	R8,R11,R3	// Calculate byte address

return:
	SUB	R17,R3		// Index = match address - saved base address.
	MOVD	R3,(R14)
	RET

found_qw_align:
	// Use the same algorithm as above. Compress the result into
	// a single doubleword and move it to a GPR for the final
	// calculation.
	VBPERMQ	V6,V10,V6

#ifdef GOARCH_ppc64le
	MFVRD	V6,R3
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11
	POPCNTD	R11,R11
#else
	VSLDOI	$6,V6,V6,V6
	MFVRD	V6,R3
	CNTLZD	R3,R11
#endif
	ADD	R8,R11,R3
	CMPU	R11,R4		// Reject matches in padding bytes past the end.
	BLT	return
	BR	notfound

done:
	// At this point, R3 has 0xFF in the same position as the byte we are
	// looking for in the doubleword. Use that to calculate the exact index
	// of the byte.
#ifdef GOARCH_ppc64le
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11
	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
#else
	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
#endif
	CMPU	R8,R7		// Check if we are at the last doubleword.
	SRD	$3,R11		// Convert trailing zeros to bytes.
	ADD	R11,R8,R3
	CMPU	R11,R6,CR7	// If at the last doubleword, check the byte offset.
	BNE	return
	BLE	CR7,return
	BR	notfound

small_string:
	// Scalar path for strings of <=32 bytes, unrolled for better performance.
	CMPU	R4,$0		// Check for length=0
	BEQ	notfound

	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
	CMPB	R12,R5,R3	// Check for a match.
	AND	R9,R3,R3	// Mask bytes below s_base.
	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation.
	RLDICL	$0,R7,$61,R6	// length-1
	RLDICR	$0,R7,$60,R7	// Last doubleword in R7.
	CMPU	R8,R7
	BNE	CR7,done
	BEQ	notfound	// Hit length.

	MOVDU	8(R8),R12	// Pre-increment load: advances R8 by 8 and loads.
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	BNE	CR6,done
	BR	notfound