// SPDX-License-Identifier: Apache-2.0
// Copyright 2023 The Prime Citizens
//
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build pcz && arm64

#include "textflag.h"

// Number of bytes consumed by one vector iteration, and the mask that
// selects the address/length bits below that size.
#define CHUNK_SIZE 32
#define CHUNK_MASK 31

// CountSlice entry stub.
// Frame slots read: b_base+0, b_len+8, c+24; the result slot is ret+32.
// (Presumably declared in Go as a func taking a byte slice and a byte and
// returning an int -- confirm against the Go declaration.)
// Loads the arguments into the registers countbytebody expects and
// branches there; countbytebody writes the result and returns.
TEXT ·CountSlice(SB),NOSPLIT,$0-40
	MOVBU	c+24(FP), R1
	MOVD	b_base+0(FP), R0
	MOVD	b_len+8(FP), R2
	MOVD	$ret+32(FP), R8
	B	countbytebody<>(SB)

// Count entry stub.
// Frame slots read: s_base+0, s_len+8, c+16; the result slot is ret+24.
// (Presumably the string flavour of CountSlice -- confirm against the Go
// declaration.)
TEXT ·Count(SB),NOSPLIT,$0-32
	MOVBU	c+16(FP), R1
	MOVD	s_base+0(FP), R0
	MOVD	s_len+8(FP), R2
	MOVD	$ret+24(FP), R8
	B	countbytebody<>(SB)

// countbytebody counts how many bytes equal to R1 occur in the R2 bytes
// starting at R0 and stores that count at the address held in R8.
//
// In:
//   R0: pointer to the data
//   R1: byte value to look for
//   R2: number of bytes
//   R8: address that receives the result
// Scratch (overwritten): R0, R2, R3, R5, R6, R9, R11, V0-V8, condition flags.
//
// Shape of the algorithm:
//   1. Fewer than 32 bytes: count one byte at a time.
//   2. Otherwise step byte-wise until R0 is 32-byte aligned,
//   3. then compare 32 bytes per iteration with vector instructions,
//   4. and finish any leftover bytes one at a time.
TEXT countbytebody<>(SB),NOSPLIT,$0
	// R11 is the running match count.
	MOVD	ZR, R11
	// Nothing to scan: store 0 and return.
	CBZ	R2, finish
	// Inputs shorter than one chunk are handled entirely by the byte loop.
	CMP	$CHUNK_SIZE, R2
	BLO	trailing
	// Pointer already on a 32-byte boundary: go straight to the vector part.
	ANDS	$CHUNK_MASK, R0, R9
	BEQ	aligned
	// R3 = next 32-byte boundary above R0.
	BIC	$CHUNK_MASK, R0, R3
	ADD	$CHUNK_SIZE, R3, R3
align_head:
	// Consume single bytes until R0 reaches the boundary in R3.
	MOVBU.P	1(R0), R5
	SUB	$1, R2, R2
	CMP	R5, R1
	CINC	EQ, R11, R11
	CMP	R0, R3
	BNE	align_head
aligned:
	// R9 = remaining length rounded down to a multiple of 32.
	BIC	$CHUNK_MASK, R2, R9
	// After the head there may be less than a full chunk left.
	CBZ	R9, trailing
	// R2 = bytes left over once all full chunks are consumed.
	SUB	R9, R2, R2
	// R3 = address just past the last full chunk.
	ADD	R0, R9, R3
	// V0 = the searched byte replicated into all 16 lanes.
	VMOV	R1, V0.B16
	// V5 = 0x01 in every lane; masks a compare result down to 0 or 1.
	MOVD	$1, R5
	VMOV	R5, V5.B16
	// Zero the low 64 bits of the accumulator V8 and of the temporary V7.
	VEOR	V8.B8, V8.B8, V8.B8
	VEOR	V7.B8, V7.B8, V7.B8
block_loop:
	// Load 32 bytes and advance R0 past them.
	VLD1.P	(R0), [V1.B16, V2.B16]
	// Flags for the loop branch; the vector instructions below leave them alone.
	CMP	R0, R3
	// Each lane becomes all-ones on a match, then is reduced to 1 or 0.
	VCMEQ	V0.B16, V1.B16, V3.B16
	VAND	V5.B16, V3.B16, V3.B16
	VCMEQ	V0.B16, V2.B16, V4.B16
	VAND	V5.B16, V4.B16, V4.B16
	// Pairwise add folds the two 16-byte results into one (32B -> 16B).
	VADDP	V4.B16, V3.B16, V6.B16
	// Sum all lanes of V6 into V7: the match count of this chunk.
	VUADDLV	V6.B16, V7
	// Add the chunk count to the low 64-bit element of V8.
	VADD	V7, V8
	BNE	block_loop
	// Fold the vector total into the scalar count.
	VMOV	V8.D[0], R6
	ADD	R6, R11, R11
	// No leftover bytes: skip the byte loop.
	CBZ	R2, finish
trailing:
	// Byte-at-a-time loop; R2 is non-zero whenever this label is entered.
	MOVBU.P	1(R0), R5
	CMP	R5, R1
	CINC	EQ, R11, R11
	SUB	$1, R2, R2
	CBNZ	R2, trailing
finish:
	// Write the count through the result pointer.
	MOVD	R11, (R8)
	RET