github.com/hikaru7719/go@v0.0.0-20181025140707-c8b2ac68906a/src/internal/bytealg/indexbyte_arm64.s (about)

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "textflag.h"
     6  
     7  TEXT ·IndexByte(SB),NOSPLIT,$0-40 // first index of byte c in slice b, or -1 (search done by indexbytebody<>)
     8  	MOVD	$ret+32(FP), R8 // R8 = address of the result slot in the caller's frame
     9  	MOVBU	c+24(FP), R1 // R1 = byte to look for, zero-extended
    10  	MOVD	b_len+8(FP), R2 // R2 = number of bytes in b
    11  	MOVD	b_base+0(FP), R0 // R0 = pointer to the first byte of b
    12  	B	indexbytebody<>(SB) // tail branch: the shared body stores the result and returns
    13  
    14  TEXT ·IndexByteString(SB),NOSPLIT,$0-32 // first index of byte c in string s, or -1 (search done by indexbytebody<>)
    15  	MOVD	$ret+24(FP), R8 // R8 = address of the result slot in the caller's frame
    16  	MOVBU	c+16(FP), R1 // R1 = byte to look for, zero-extended
    17  	MOVD	s_len+8(FP), R2 // R2 = number of bytes in s
    18  	MOVD	s_base+0(FP), R0 // R0 = pointer to the first byte of s
    19  	B	indexbytebody<>(SB) // tail branch: the shared body stores the result and returns
    20  
    21  TEXT bytes·IndexByte(SB),NOSPLIT,$0-40 // entry point under the bytes package name; same argument layout as ·IndexByte
    22  	FUNCDATA $0, ·IndexByte·args_stackmap(SB) // reuse the argument stack map of ·IndexByte
    23  	MOVD	$ret+32(FP), R8 // R8 = address of the result slot in the caller's frame
    24  	MOVBU	c+24(FP), R1 // R1 = byte to look for, zero-extended
    25  	MOVD	b_len+8(FP), R2 // R2 = number of bytes in b
    26  	MOVD	b_base+0(FP), R0 // R0 = pointer to the first byte of b
    27  	B	indexbytebody<>(SB) // tail branch: the shared body stores the result and returns
    28  
    29  TEXT strings·IndexByte(SB),NOSPLIT,$0-32 // entry point under the strings package name; same argument layout as ·IndexByteString
    30  	FUNCDATA $0, ·IndexByteString·args_stackmap(SB) // reuse the argument stack map of ·IndexByteString
    31  	MOVD	$ret+24(FP), R8 // R8 = address of the result slot in the caller's frame
    32  	MOVBU	c+16(FP), R1 // R1 = byte to look for, zero-extended
    33  	MOVD	s_len+8(FP), R2 // R2 = number of bytes in s
    34  	MOVD	s_base+0(FP), R0 // R0 = pointer to the first byte of s
    35  	B	indexbytebody<>(SB) // tail branch: the shared body stores the result and returns
    36  
    37  // input:
    38  //   R0: data
    39  //   R1: byte to search
    40  //   R2: data len
    41  //   R8: address to put result
        // output:
        //   The index (relative to R0) of the first byte equal to R1 is stored
        //   at (R8); -1 is stored when R2 is zero or when no byte matches.
        // registers written: R0, R2-R6, R9-R11, V0-V6 and the condition flags.
    42  TEXT indexbytebody<>(SB),NOSPLIT,$0
    43  	// Core algorithm:
    44  	// For each 32-byte chunk we calculate a 64-bit syndrome value,
    45  	// with two bits per byte. For each tuple, bit 0 is set if the
    46  	// relevant byte matched the requested character and bit 1 is
    47  	// not used (faster than using a 32bit syndrome). Since the bits
    48  	// in the syndrome reflect exactly the order in which things occur
    49  	// in the original string, counting trailing zeros allows us to
    50  	// identify exactly which byte has matched.
    51  
    52  	CBZ	R2, fail // empty input: nothing can match
    53  	MOVD	R0, R11 // keep the original start; subtracted at tail to form the index
    54  	// Magic constant 0x40100401 allows us to identify
    55  	// which lane matches the requested byte.
    56  	// 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24))
    57  	// Different bytes have different bit masks (i.e: 1, 4, 16, 64)
    58  	MOVD	$0x40100401, R5
    59  	VMOV	R1, V0.B16 // replicate the search byte into all 16 byte lanes of V0
    60  	// Work with aligned 32-byte chunks
    61  	BIC	$0x1f, R0, R3 // R3 = R0 rounded down to a 32-byte boundary
    62  	VMOV	R5, V5.S4 // replicate the magic constant into every 32-bit lane of V5
    63  	ANDS	$0x1f, R0, R9 // R9 = R0 mod 32 (start misalignment); sets the flags for BEQ
    64  	AND	$0x1f, R2, R10 // R10 = len mod 32, consumed at masklast
    65  	BEQ	loop // start is already aligned: no head masking needed
    66  
    67  	// Input string is not 32-byte aligned. We calculate the
    68  	// syndrome value for the aligned 32 bytes block containing
    69  	// the first bytes and mask off the irrelevant part.
        	// The load starts at the aligned address in R3, so it also reads the
        	// R9 bytes that precede R0 inside the same 32-byte block.
    70  	VLD1.P	(R3), [V1.B16, V2.B16] // load 32 bytes; R3 is post-incremented by 32
    71  	SUB	$0x20, R9, R4 // R4 = R9 - 32 = -(bytes of this block at or after R0)
    72  	ADDS	R4, R2, R2 // R2 = len - (32 - R9); flags are tested by BLS masklast below
    73  	VCMEQ	V0.B16, V1.B16, V3.B16 // lanes become 0xff where the byte matches, 0 elsewhere
    74  	VCMEQ	V0.B16, V2.B16, V4.B16
    75  	VAND	V5.B16, V3.B16, V3.B16 // keep one distinct bit (1, 4, 16 or 64) per matching byte
    76  	VAND	V5.B16, V4.B16, V4.B16
    77  	VADDP	V4.B16, V3.B16, V6.B16 // 256->128
    78  	VADDP	V6.B16, V6.B16, V6.B16 // 128->64
    79  	VMOV	V6.D[0], R6 // R6 = 64-bit syndrome, two bits per input byte
    80  	// Clear the irrelevant lower bits
    81  	LSL	$1, R9, R4 // R4 = 2*R9: syndrome bits that belong to bytes before R0
    82  	LSR	R4, R6, R6
    83  	LSL	R4, R6, R6
    84  	// The first block can also be the last
    85  	BLS	masklast // flags from ADDS: taken when len <= 32 - R9
    86  	// Have we found something already?
    87  	CBNZ	R6, tail
    88  
    89  loop:
    90  	VLD1.P	(R3), [V1.B16, V2.B16] // load the next aligned 32 bytes; R3 += 32
    91  	SUBS	$0x20, R2, R2 // R2 -= 32; flags are tested by BLS here and by BHS at end
    92  	VCMEQ	V0.B16, V1.B16, V3.B16
    93  	VCMEQ	V0.B16, V2.B16, V4.B16
    94  	// If we're out of data we finish regardless of the result
    95  	BLS	end // taken when at most 32 bytes remained before the SUBS
    96  	// Use a fast check for the termination condition
    97  	VORR	V4.B16, V3.B16, V6.B16 // merge the match masks of both 16-byte halves
    98  	VADDP	V6.D2, V6.D2, V6.D2 // fold 128 bits into 64
    99  	VMOV	V6.D[0], R6 // R6 != 0 when some lane matched
   100  	// We're not out of data, loop if we haven't found the character
   101  	CBZ	R6, loop
   102  
   103  end:
   104  	// Termination condition found, let's calculate the syndrome value
   105  	VAND	V5.B16, V3.B16, V3.B16
   106  	VAND	V5.B16, V4.B16, V4.B16
   107  	VADDP	V4.B16, V3.B16, V6.B16 // 256->128
   108  	VADDP	V6.B16, V6.B16, V6.B16 // 128->64
   109  	VMOV	V6.D[0], R6 // R6 = syndrome of the block just loaded
   110  	// Only do the clear for the last possible block with less than 32 bytes
   111  	// Condition flags come from SUBS in the loop
   112  	BHS	tail // taken when the block just processed held a full 32 bytes of data
   113  
   114  masklast:
   115  	// Clear the irrelevant upper bits
   116  	ADD	R9, R10, R4 // R4 = start misalignment + len mod 32
   117  	AND	$0x1f, R4, R4 // R4 = (start + len) mod 32 = data bytes in the last block
   118  	SUB	$0x20, R4, R4 // R4 -= 32
   119  	NEG	R4<<1, R4 // R4 = 2*(32 - data bytes): syndrome bits past the end of data
        	// NOTE(review): when the data ends exactly on a 32-byte boundary R4 is 64
        	// here; the shifts below then must leave R6 unchanged, which presumably
        	// relies on register shifts using the amount modulo 64 - confirm in the ISA.
   120  	LSL	R4, R6, R6
   121  	LSR	R4, R6, R6
   122  
   123  tail:
   124  	// Check that we have found a character
   125  	CBZ	R6, fail
   126  	// Count the trailing zeros using bit reversing
   127  	RBIT	R6, R6
   128  	// Compensate the last post-increment
   129  	SUB	$0x20, R3, R3 // R3 = start of the block the syndrome describes
   130  	// And count the leading zeros
   131  	CLZ	R6, R6
   132  	// R6 is twice the offset into the fragment
   133  	ADD	R6>>1, R3, R0 // R0 = address of the first matching byte
   134  	// Compute the offset result
   135  	SUB	R11, R0, R0 // R0 = index relative to the original start saved in R11
   136  	MOVD	R0, (R8) // store the index through the result pointer
   137  	RET
   138  
   139  fail:
   140  	MOVD	$-1, R0 // not found (or empty input)
   141  	MOVD	R0, (R8) // store -1 through the result pointer
   142  	RET
   142  	RET