github.com/sandwichdev/go-internals@v0.0.0-20210605002614-12311ac6b2c5/bytealg/indexbyte_arm64.s

// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

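// func IndexByte(b []byte, c byte) int
// (Go prototype inferred from the 40-byte argument frame; matches the standard bytealg signature.)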
TEXT ·IndexByte(SB),NOSPLIT,$0-40
	MOVD	b_base+0(FP), R0
	MOVD	b_len+8(FP), R2
	MOVBU	c+24(FP), R1
	MOVD	$ret+32(FP), R8
	B	indexbytebody<>(SB)

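// func IndexByteString(s string, c byte) int
// (Go prototype inferred from the 32-byte argument frame; matches the standard bytealg signature.)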
TEXT ·IndexByteString(SB),NOSPLIT,$0-32
	MOVD	s_base+0(FP), R0
	MOVD	s_len+8(FP), R2
	MOVBU	c+16(FP), R1
	MOVD	$ret+24(FP), R8
	B	indexbytebody<>(SB)

// input:
//   R0: data
//   R1: byte to search
//   R2: data len
//   R8: address to put result
TEXT indexbytebody<>(SB),NOSPLIT,$0
	// Core algorithm:
	// For each 32-byte chunk we calculate a 64-bit syndrome value,
	// with two bits per byte. For each tuple, bit 0 is set if the
	// relevant byte matched the requested character and bit 1 is
	// not used (faster than using a 32-bit syndrome). Since the bits
	// in the syndrome reflect exactly the order in which things occur
	// in the original string, counting trailing zeros allows us to
	// identify exactly which byte has matched.
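	// For example, if the first match in a 32-byte chunk is at byte
	// offset 5, bit 2*5 = 10 is the lowest set bit of the syndrome;
	// the trailing-zero count (done below with RBIT+CLZ) returns 10,
	// and a right shift by one recovers the byte offset 5.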

	CBZ	R2, fail
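	// Save the original data pointer; the result index is computed
	// against it at the end (SUB R11 in tail).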
	MOVD	R0, R11
	// Magic constant 0x40100401 allows us to identify
	// which lane matches the requested byte.
	// 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24))
	// Different bytes have different bit masks (i.e., 1, 4, 16, 64)
	MOVD	$0x40100401, R5
	VMOV	R1, V0.B16
	// Work with aligned 32-byte chunks
	BIC	$0x1f, R0, R3
	VMOV	R5, V5.S4
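	// V0 now holds the target byte in all 16 lanes, and V5 holds the
	// repeating byte pattern 0x01, 0x04, 0x10, 0x40, so lane i of a
	// comparison result is masked down to 1<<(2*(i mod 4)).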
	ANDS	$0x1f, R0, R9
	AND	$0x1f, R2, R10
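	// R3 is the 32-byte-aligned base, R9 the misalignment of the start
	// pointer, and R10 the length modulo 32. The flags from ANDS make
	// the BEQ below take the aligned fast path when R9 == 0.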
	BEQ	loop

	// Input string is not 32-byte aligned. We calculate the
	// syndrome value for the aligned 32-byte block containing
	// the first bytes and mask off the irrelevant part.
	VLD1.P	(R3), [V1.B16, V2.B16]
	SUB	$0x20, R9, R4
	ADDS	R4, R2, R2
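	// R2 is now the number of bytes remaining beyond this first aligned
	// block (len + R9 - 32); the flags set by ADDS drive the BLS below,
	// which detects when the first block is also the last.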
	VCMEQ	V0.B16, V1.B16, V3.B16
	VCMEQ	V0.B16, V2.B16, V4.B16
	VAND	V5.B16, V3.B16, V3.B16
	VAND	V5.B16, V4.B16, V4.B16
	VADDP	V4.B16, V3.B16, V6.B16 // 256->128
	VADDP	V6.B16, V6.B16, V6.B16 // 128->64
	VMOV	V6.D[0], R6
	// Clear the irrelevant lower bits
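	// R4 = 2*R9: shifting the syndrome right and then left by R4 zeroes
	// the two bits belonging to each of the R9 bytes that precede the
	// real start of the data.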
	LSL	$1, R9, R4
	LSR	R4, R6, R6
	LSL	R4, R6, R6
	// The first block can also be the last
	BLS	masklast
	// Have we found something already?
	CBNZ	R6, tail

loop:
	VLD1.P	(R3), [V1.B16, V2.B16]
	SUBS	$0x20, R2, R2
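	// R2 counts the bytes still to examine. The flags from this SUBS are
	// used by the BLS below and by the BHS in the end block; no later
	// instruction in the loop modifies the condition flags.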
	VCMEQ	V0.B16, V1.B16, V3.B16
	VCMEQ	V0.B16, V2.B16, V4.B16
	// If we're out of data we finish regardless of the result
	BLS	end
	// Use a fast check for the termination condition
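	// OR the two comparison vectors and fold the 128-bit result down to
	// 64 bits; a non-zero R6 means at least one byte in this 32-byte
	// block matched, without paying for the full syndrome yet.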
	VORR	V4.B16, V3.B16, V6.B16
	VADDP	V6.D2, V6.D2, V6.D2
	VMOV	V6.D[0], R6
	// We're not out of data, loop if we haven't found the character
	CBZ	R6, loop

end:
	// Termination condition found, let's calculate the syndrome value
	VAND	V5.B16, V3.B16, V3.B16
	VAND	V5.B16, V4.B16, V4.B16
	VADDP	V4.B16, V3.B16, V6.B16
	VADDP	V6.B16, V6.B16, V6.B16
	VMOV	V6.D[0], R6
	// Only do the clear for the last possible block, which holds fewer than 32 bytes
	// Condition flags come from SUBS in the loop
	BHS	tail

masklast:
	// Clear the irrelevant upper bits
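	// (R9+R10) & 0x1f is the number of valid bytes in the final aligned
	// block; R4 becomes twice the number of bytes past the end of the
	// data, and the shift pair below zeroes those high syndrome bits.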
	ADD	R9, R10, R4
	AND	$0x1f, R4, R4
	SUB	$0x20, R4, R4
	NEG	R4<<1, R4
	LSL	R4, R6, R6
	LSR	R4, R6, R6

tail:
	// Check that we have found a character
	CBZ	R6, fail
	// Count the trailing zeros using bit reversing
	RBIT	R6, R6
	// Compensate the last post-increment
	SUB	$0x20, R3, R3
	// And count the leading zeros
	CLZ	R6, R6
	// R6 is twice the offset into the fragment
	ADD	R6>>1, R3, R0
	// Compute the offset result
	SUB	R11, R0, R0
	MOVD	R0, (R8)
	RET

fail:
	MOVD	$-1, R0
	MOVD	R0, (R8)
	RET