// Source: github.com/bir3/gocompiler@v0.9.2202/src/internal/bytealg/count_arm64.s

// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

// Count entry point for a byte slice.
// Frame layout used below: b_base at 0(FP), b_len at 8(FP), c at 24(FP),
// result slot at 32(FP). Arguments are moved into the registers that
// countbytebody expects, then control is transferred to it with a plain
// branch; countbytebody stores the result and returns.
TEXT ·Count(SB),NOSPLIT,$0-40
	MOVBU	c+24(FP), R1		// R1 = byte to look for
	MOVD	b_base+0(FP), R0	// R0 = data pointer
	MOVD	b_len+8(FP), R2		// R2 = number of bytes
	MOVD	$ret+32(FP), R8		// R8 = address of the result slot
	B	countbytebody<>(SB)

// Count entry point for a string.
// Frame layout used below: s_base at 0(FP), s_len at 8(FP), c at 16(FP),
// result slot at 24(FP). Same register hand-off as ·Count.
TEXT ·CountString(SB),NOSPLIT,$0-32
	MOVBU	c+16(FP), R1		// R1 = byte to look for
	MOVD	s_base+0(FP), R0	// R0 = data pointer
	MOVD	s_len+8(FP), R2		// R2 = number of bytes
	MOVD	$ret+24(FP), R8		// R8 = address of the result slot
	B	countbytebody<>(SB)

// countbytebody counts how many of the R2 bytes starting at R0 equal R1
// and writes that count to the address held in R8.
//
// Inputs:
//   R0: data
//   R1: byte to find
//   R2: data length
//   R8: address that receives the result
//
// Registers written here: R0, R2, R3, R5, R6, R9, R11, V0-V8, flags.
//
// Shape of the algorithm:
//   1. length 0          -> store 0.
//   2. length < 32       -> byte-at-a-time loop only.
//   3. otherwise         -> byte loop up to the next 32-byte boundary,
//                           then 32 bytes per iteration with vector compares,
//                           then a byte loop for whatever is left.
TEXT countbytebody<>(SB),NOSPLIT,$0
	// R11 is the running match count.
	MOVD	ZR, R11
	// Nothing to scan: go store the zero count.
	CBZ	R2, store
	// Fewer than 32 bytes: the scalar tail loop handles everything.
	CMP	$32, R2
	BLO	tail_byte
	// Already on a 32-byte boundary? Then skip the head loop.
	ANDS	$31, R0, R9
	BEQ	aligned
	// R3 = first 32-byte boundary above R0; the head loop stops there.
	BIC	$31, R0, R3
	ADD	$32, R3, R3
	PCALIGN	$16
head_byte:
	// One byte per iteration until R0 reaches the boundary in R3.
	MOVBU.P	1(R0), R5
	SUB	$1, R2			// one fewer byte remaining (does not touch flags)
	CMP	R5, R1
	CINC	EQ, R11, R11		// count++ when the byte matches
	CMP	R0, R3
	BNE	head_byte

aligned:
	// R9 = remaining length rounded down to a multiple of 32.
	BIC	$31, R2, R9
	// No full 32-byte chunk left: finish in the tail loop.
	CBZ	R9, tail_byte
	// R2 = bytes left over for the tail once the chunks are done.
	SUB	R9, R2
	// R3 = address just past the last full chunk.
	ADD	R9, R0, R3
	// V0 = the target byte replicated into all 16 byte lanes.
	VMOV	R1, V0.B16
	// V5 = 0x01 replicated into all 16 byte lanes (per-lane mask).
	MOVD	$1, R5
	VMOV	R5, V5.B16
	// Zero the accumulator (V8) and the per-iteration sum (V7).
	VEOR	V8.B8, V8.B8, V8.B8
	VEOR	V7.B8, V7.B8, V7.B8
	PCALIGN	$16
vec_loop:
	// Load the next chunk into V1/V2 and advance R0 past it.
	VLD1.P	(R0), [V1.B16, V2.B16]
	// Set the loop-exit flags now; the vector ops below leave them alone.
	CMP	R0, R3
	// Compare each lane with the target, then mask the result down to
	// its lowest bit so a matching lane holds 1 and any other lane 0.
	VCMEQ	V0.B16, V1.B16, V3.B16
	VAND	V5.B16, V3.B16, V3.B16
	VCMEQ	V0.B16, V2.B16, V4.B16
	VAND	V5.B16, V4.B16, V4.B16
	// Fold the two 16-lane results into one vector of pairwise sums ...
	VADDP	V4.B16, V3.B16, V6.B16	// 32B->16B
	// ... and add all of its lanes together into V7.
	VUADDLV	V6.B16, V7
	// Accumulate this chunk's matches into V8.
	VADD	V7, V8
	BNE	vec_loop

	// Merge the vector total into the scalar count.
	VMOV	V8.D[0], R6
	ADD	R6, R11, R11
	// No leftover bytes: done.
	CBZ	R2, store
tail_byte:
	// One byte per iteration for the final R2 (< 32, > 0) bytes.
	MOVBU.P	1(R0), R5
	CMP	R5, R1
	CINC	EQ, R11, R11		// count++ when the byte matches
	SUB	$1, R2
	CBNZ	R2, tail_byte
store:
	MOVD	R11, (R8)
	RET