github.com/primecitizens/pcz/std@v0.2.1/core/bytealg/count_arm64.s (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright 2023 The Prime Citizens
     3  // 
     4  // Copyright 2018 The Go Authors. All rights reserved.
     5  // Use of this source code is governed by a BSD-style
     6  // license that can be found in the LICENSE file.
     7  
     8  //go:build pcz && arm64
     9  
    10  #include "textflag.h"
    11  
    12  TEXT ·CountSlice(SB),NOSPLIT,$0-40 // entry stub: load the slice b and byte c into the registers countbytebody expects
    13  	MOVD $ret+32(FP), R8 // R8 = address of the result slot (not its value)
    14  	MOVBU c+24(FP), R1 // R1 = byte to count, loaded as an unsigned byte
    15  	MOVD b_len+8(FP), R2 // R2 = number of bytes in b
    16  	MOVD b_base+0(FP), R0 // R0 = pointer to the first byte of b
    17  	B countbytebody<>(SB) // plain branch, not a call: the shared body stores the count via R8 and returns
    18  
    19  TEXT ·Count(SB),NOSPLIT,$0-32 // entry stub: load the two-word (pointer, length) argument s and byte c for countbytebody
    20  	MOVD $ret+24(FP), R8 // R8 = address of the result slot (not its value)
    21  	MOVBU c+16(FP), R1 // R1 = byte to count, loaded as an unsigned byte
    22  	MOVD s_len+8(FP), R2 // R2 = number of bytes in s
    23  	MOVD s_base+0(FP), R0 // R0 = pointer to the first byte of s
    24  	B countbytebody<>(SB) // plain branch, not a call: the shared body stores the count via R8 and returns
    25  
    26  // countbytebody: count the bytes equal to R1 in the R2 bytes starting at R0 and store the total at (R8). input:
    27  //   R0: data (advanced as bytes and chunks are consumed)
    28  //   R2: data len (counts down; holds the tail length once the chunk loop is set up)
    29  //   R1: byte to find
    30  //   R8: address to put result
    31  TEXT countbytebody<>(SB),NOSPLIT,$0 // scratch: R3, R5, R6, R9, R11, V0-V8 and the condition flags
    32  	// R11 = count of byte to search
    33  	MOVD $0, R11
    34  	// short path to handle 0-byte case
    35  	CBZ R2, done // len == 0: store the zero count and return
    36  	CMP $0x20, R2
    37  	// jump directly to tail if length < 32
    38  	BLO tail // unsigned compare; R2 is non-zero here, which the tail loop relies on
    39  	ANDS $0x1f, R0, R9 // R9 = R0 & 31; Z is set when R0 is already 32-byte aligned
    40  	BEQ chunk
    41  	// Work with not 32-byte aligned head
    42  	BIC $0x1f, R0, R3 // R3 = R0 rounded down to a 32-byte boundary
    43  	ADD $0x20, R3 // R3 = first 32-byte boundary above R0, i.e. the end of the head
    44  head_loop:
    45  	MOVBU.P 1(R0), R5 // R5 = byte at R0, then R0 += 1
    46  	CMP R5, R1
    47  	CINC EQ, R11, R11 // count++ when the byte matches
    48  	SUB $1, R2, R2
    49  	CMP R0, R3
    50  	BNE head_loop // the head is at most 31 bytes and R2 was >= 32 on entry, so R2 >= 1 on exit
    51  	// Work with 32-byte aligned chunks
    52  chunk:
    53  	BIC $0x1f, R2, R9 // R9 = remaining length rounded down to a multiple of 32
    54  	// The first chunk can also be the last
    55  	CBZ R9, tail // fewer than 32 bytes remain (only possible after the head loop, where R2 >= 1)
    56  	// R3 = end of 32-byte chunks
    57  	ADD R0, R9, R3
    58  	MOVD $1, R5
    59  	VMOV R5, V5.B16 // V5 = 0x01 in every byte lane, the mask used by VAND below
    60  	// R2 = length of tail
    61  	SUB R9, R2, R2
    62  	// Duplicate R1 (byte to search) to 16 1-byte elements of V0
    63  	VMOV R1, V0.B16
    64  	// Clear the low 64-bit element of V7 and V8
    65  	VEOR V7.B8, V7.B8, V7.B8
    66  	VEOR V8.B8, V8.B8, V8.B8 // V8 is the running match total for the chunk loop
    67  	// Count the target byte in 32-byte chunk
    68  chunk_loop:
    69  	VLD1.P (R0), [V1.B16, V2.B16] // load the next 32 bytes into V1/V2; post-indexed, so R0 moves on to the following chunk
    70  	CMP R0, R3 // flags are consumed by the BNE at the bottom of the loop; nothing in between may change them
    71  	VCMEQ V0.B16, V1.B16, V3.B16 // lanes equal to the target become 0xFF, all others 0
    72  	VCMEQ V0.B16, V2.B16, V4.B16
    73  	// Clear the higher 7 bits
    74  	VAND V5.B16, V3.B16, V3.B16 // 0xFF -> 1, so each lane holds a 0/1 match flag
    75  	VAND V5.B16, V4.B16, V4.B16
    76  	// Count lanes match the requested byte
    77  	VADDP V4.B16, V3.B16, V6.B16 // 32B->16B: pairwise add, each result lane is 0..2
    78  	VUADDLV V6.B16, V7 // V7 = sum of the 16 lanes = matches in this chunk (0..32)
    79  	// Accumulate the count in low 64-bit element of V8 when inside the loop
    80  	VADD V7, V8
    81  	BNE chunk_loop // loop until R0 reaches R3 (uses the flags from the CMP above)
    82  	VMOV V8.D[0], R6 // R6 = matches found by the chunk loop
    83  	ADD R6, R11, R11
    84  	CBZ R2, done // no tail bytes left
    85  tail:
    86  	// Work with tail shorter than 32 bytes
    87  	MOVBU.P 1(R0), R5 // R5 = byte at R0, then R0 += 1
    88  	SUB $1, R2, R2 // decremented before the test, so R2 must be non-zero on entry to tail
    89  	CMP R5, R1
    90  	CINC EQ, R11, R11 // count++ when the byte matches
    91  	CBNZ R2, tail
    92  done:
    93  	MOVD R11, (R8) // store the total through the result pointer
    94  	RET