github.com/SandwichDev/go-internals@v0.0.0-20210605002614-12311ac6b2c5/bytealg/count_arm64.s (about)

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "textflag.h"
     7  
     8  TEXT ·Count(SB),NOSPLIT,$0-40	// argument marshalling only; the counting is done by countbytebody<>
     9  	MOVD	$ret+32(FP), R8	// R8 = address of the result slot (the address, not its contents)
    10  	MOVBU	c+24(FP), R1	// R1 = byte to count, zero-extended
    11  	MOVD	b_len+8(FP), R2	// R2 = number of bytes to scan
    12  	MOVD	b_base+0(FP), R0	// R0 = pointer to the first byte
    13  	B	countbytebody<>(SB)	// branch without link: countbytebody<> stores the count through R8 and returns
    14  
    15  TEXT ·CountString(SB),NOSPLIT,$0-32	// argument marshalling only; the counting is done by countbytebody<>
    16  	MOVD	$ret+24(FP), R8	// R8 = address of the result slot (the address, not its contents)
    17  	MOVBU	c+16(FP), R1	// R1 = byte to count, zero-extended
    18  	MOVD	s_len+8(FP), R2	// R2 = number of bytes to scan
    19  	MOVD	s_base+0(FP), R0	// R0 = pointer to the first byte
    20  	B	countbytebody<>(SB)	// branch without link: countbytebody<> stores the count through R8 and returns
    21  
    22  // countbytebody counts the bytes in [R0, R0+R2) that are equal to R1.
    23  //   R0: data pointer (modified: advanced as the data is scanned)
    24  //   R2: data length in bytes (modified: counted down)
    25  //   R1: byte to find, zero-extended (the scalar loops compare the whole register)
    26  //   R8: address that receives the 64-bit count; R3, R5, R6, R9, R11 and V0-V8 are scratch
    27  TEXT countbytebody<>(SB),NOSPLIT,$0
    28  	// R11 = running count of matching bytes
    29  	MOVD	$0, R11
    30  	// Empty input: store the zero count and return
    31  	CBZ	R2, done
    32  	CMP	$0x20, R2
    33  	// Fewer than 32 bytes (unsigned compare): scan byte by byte in tail
    34  	BLO	tail
    35  	ANDS	$0x1f, R0, R9	// R9 = R0 mod 32; Z is set when R0 is already 32-byte aligned
    36  	BEQ	chunk
    37  	// Unaligned head: scan single bytes up to the next 32-byte boundary (R3)
    38  	BIC	$0x1f, R0, R3
    39  	ADD	$0x20, R3	// boundary is at most 31 bytes away and R2 >= 32 here, so R2 stays nonzero
    40  head_loop:
    41  	MOVBU.P	1(R0), R5	// R5 = byte at R0, then R0++ (post-increment)
    42  	CMP	R5, R1
    43  	CINC	EQ, R11, R11	// R11++ only when the byte matched
    44  	SUB	$1, R2, R2
    45  	CMP	R0, R3
    46  	BNE	head_loop	// repeat until R0 reaches the boundary
    47  	// R0 is 32-byte aligned from here on; R2 = bytes still to scan
    48  chunk:
    49  	BIC	$0x1f, R2, R9	// R9 = R2 rounded down to a multiple of 32
    50  	// No full 32-byte chunk remains: scan what is left in tail
    51  	CBZ	R9, tail
    52  	// R3 = end of 32-byte chunks
    53  	ADD	R0, R9, R3
    54  	MOVD	$1, R5
    55  	VMOV	R5, V5.B16	// V5 = 0x01 in every byte lane (mask for the VANDs below)
    56  	// R2 = length of tail
    57  	SUB	R9, R2, R2
    58  	// Duplicate R1 (byte to search) to 16 1-byte elements of V0
    59  	VMOV	R1, V0.B16
    60  	// Clear the low 64-bit element of V7 and V8
    61  	VEOR	V7.B8, V7.B8, V7.B8
    62  	VEOR	V8.B8, V8.B8, V8.B8
    63  	// Count the target byte in each 32-byte chunk
    64  chunk_loop:
    65  	VLD1.P	(R0), [V1.B16, V2.B16]	// load 32 bytes, R0 advances past them
    66  	CMP	R0, R3	// flags are consumed by the BNE at the loop bottom; nothing in between rewrites them
    67  	VCMEQ	V0.B16, V1.B16, V3.B16	// matching lanes become 0xff, all others 0x00
    68  	VCMEQ	V0.B16, V2.B16, V4.B16
    69  	// Keep only bit 0 so that every matching lane holds exactly 1
    70  	VAND	V5.B16, V3.B16, V3.B16
    71  	VAND	V5.B16, V4.B16, V4.B16
    72  	// Sum the lanes that match the requested byte
    73  	VADDP	V4.B16, V3.B16, V6.B16 // 32B->16B
    74  	VUADDLV	V6.B16, V7	// V7 = sum of the 16 lanes = matches in this chunk (0..32)
    75  	// Accumulate the count in low 64-bit element of V8 when inside the loop
    76  	VADD	V7, V8
    77  	BNE	chunk_loop
    78  	VMOV	V8.D[0], R6	// fold the vector total into the scalar count
    79  	ADD	R6, R11, R11
    80  	CBZ	R2, done
    81  tail:
    82  	// Scan the remaining R2 bytes (1..31) one at a time; R2 is never zero on entry
    83  	MOVBU.P	1(R0), R5
    84  	SUB	$1, R2, R2
    85  	CMP	R5, R1
    86  	CINC	EQ, R11, R11
    87  	CBNZ	R2, tail
    88  done:
    89  	MOVD	R11, (R8)	// store the count at the address supplied in R8
    90  	RET