github.com/likebike/go--@v0.0.0-20190911215757-0bd925d16e96/go/src/bytes/bytes_arm64.s (about)

     1  // Copyright 2017 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "textflag.h"
     6  
     7  // countByte(s []byte, c byte) int: returns the number of bytes in s that are equal to c.
     8  TEXT bytes·countByte(SB),NOSPLIT,$0-40 // $0-40: no local frame; 40 bytes of arguments plus result
     9  	MOVD	s_base+0(FP), R0 // R0 = address of the next byte of s to examine
    10  	MOVD	s_len+8(FP), R2 // R2 = number of bytes not yet examined
    11  	MOVBU	c+24(FP), R1 // R1 = c, zero-extended
    12  	// R11 = running count of bytes equal to c
    13  	MOVD	$0, R11
    14  	// Short path for an empty slice: return 0 without reading any memory
    15  	CBZ	R2, done
    16  	CMP	$0x20, R2
    17  	// Fewer than 32 bytes in total: go straight to the byte-by-byte tail loop
    18  	BLO	tail
    19  	ANDS	$0x1f, R0, R9 // R9 = low 5 bits of the address; Z is set when R0 is 32-byte aligned
    20  	BEQ	chunk
    21  	// Unaligned head: count byte by byte until R0 reaches the next 32-byte boundary
    22  	BIC	$0x1f, R0, R3 // R3 = R0 with the low 5 bits cleared
    23  	ADD	$0x20, R3 // R3 = first 32-byte boundary above R0 (end of the head)
    24  head_loop:
    25  	MOVBU.P	1(R0), R5 // R5 = byte at R0, then R0 += 1
    26  	CMP	R5, R1
    27  	CINC	EQ, R11, R11 // R11++ when the loaded byte equals c
    28  	SUB	$1, R2, R2 // one byte fewer left to examine
    29  	CMP	R0, R3
    30  	BNE	head_loop // repeat until R0 reaches the boundary held in R3
    31  	// Process whole 32-byte chunks; R0 is 32-byte aligned from here on
    32  chunk:
    33  	BIC	$0x1f, R2, R9 // R9 = remaining length rounded down to a multiple of 32
    34  	// No whole 32-byte chunk is left (only possible after the head loop): finish in the tail loop
    35  	CBZ	R9, tail
    36  	// R3 = address just past the last whole 32-byte chunk
    37  	ADD	R0, R9, R3
    38  	MOVD	$1, R5
    39  	VMOV	R5, V5.B16 // V5 = 0x01 in each of the 16 byte lanes (mask applied after the compares)
    40  	// R2 = number of bytes left for the tail after all whole chunks
    41  	SUB	R9, R2, R2
    42  	// Duplicate R1 (the byte to search for) into all 16 byte lanes of V0
    43  	VMOV	R1, V0.B16
    44  	// Clear the low 64-bit element of V7 and V8 (V8 is the vector-loop accumulator)
    45  	VEOR	V7.B8, V7.B8, V7.B8
    46  	VEOR	V8.B8, V8.B8, V8.B8
    47  	// Count occurrences of the target byte, one 32-byte chunk per iteration
    48  chunk_loop:
    49  	VLD1.P	(R0), [V1.B16, V2.B16] // load 32 bytes into V1:V2 and advance R0 past them
    50  	CMP	R0, R3 // sets the flags tested by the BNE at the bottom; only vector instructions sit in between
    51  	VCMEQ	V0.B16, V1.B16, V3.B16 // V3 lane = 0xFF where the V1 lane equals c, else 0x00
    52  	VCMEQ	V0.B16, V2.B16, V4.B16 // same for the second 16 bytes, result in V4
    53  	// Keep only bit 0 of every lane so that each match contributes exactly 1
    54  	VAND	V5.B16, V3.B16, V3.B16
    55  	VAND	V5.B16, V4.B16, V4.B16
    56  	// Count the lanes that matched the requested byte
    57  	VADDP	V4.B16, V3.B16, V6.B16 // pairwise add of adjacent lanes: 32 lanes -> 16 lanes
    58  	VUADDLV	V6.B16, V7 // V7 = sum of the 16 lanes = matches in this chunk
    59  	// Accumulate the per-chunk count into the low 64-bit element of V8
    60  	VADD	V7, V8
    61  	BNE	chunk_loop // flags come from CMP R0, R3 above: loop until R0 reaches R3
    62  	VMOV	V8.D[0], R6 // R6 = total matches found by the vector loop
    63  	ADD	R6, R11, R11 // fold the vector total into the running count
    64  	CBZ	R2, done // no bytes left over after the whole chunks
    65  tail:
    66  	// Byte-by-byte loop over the last R2 bytes; every path to this label arrives with 0 < R2 < 32, which the decrement-then-test loop relies on
    67  	MOVBU.P	1(R0), R5 // R5 = byte at R0, then R0 += 1
    68  	SUB	$1, R2, R2
    69  	CMP	R5, R1
    70  	CINC	EQ, R11, R11 // R11++ when the loaded byte equals c
    71  	CBNZ	R2, tail // CBNZ tests R2 directly, so the flags from CMP are not needed here
    72  done:
    73  	MOVD	R11, ret+32(FP) // store the final count as the function result
    74  	RET