// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// countByte(s []byte, c byte) int
//
// Counts how many bytes of s are equal to c and stores the result in ret.
//
// Frame layout ($0-40): no local frame, 40 bytes of arguments/results:
//   s_base+0(FP), s_len+8(FP), c+24(FP), ret+32(FP).
//
// Strategy:
//   1. Inputs shorter than 32 bytes are handled entirely by the byte loop
//      at "tail".
//   2. Otherwise, bytes are consumed one at a time until R0 is 32-byte
//      aligned ("head_loop").
//   3. Whole 32-byte chunks are compared with vector instructions
//      ("chunk_loop").
//   4. Whatever is left (< 32 bytes) is finished by the byte loop at "tail".
//
// Register usage:
//   R0  = current read pointer into s (post-incremented by every load)
//   R1  = byte to search for
//   R2  = number of bytes still to be processed
//   R3  = stop address for head_loop, later for chunk_loop
//   R5  = scratch: byte just loaded; also the constant 1 used to build V5
//   R6  = scratch: vector total moved back to a general register
//   R9  = scratch: misalignment of R0, later byte size of all whole chunks
//   R11 = running match count (the return value)
//   V0  = search byte replicated into all 16 byte lanes
//   V5  = 0x01 replicated into all 16 byte lanes
//   V7  = match count of the current 32-byte chunk
//   V8  = match count accumulated over all chunks
TEXT bytes·countByte(SB),NOSPLIT,$0-40
	MOVD	s_base+0(FP), R0
	MOVD	s_len+8(FP), R2
	MOVBU	c+24(FP), R1
	// R11 = count of byte to search
	MOVD	$0, R11
	// short path to handle 0-byte case
	CBZ	R2, done
	CMP	$0x20, R2
	// jump directly to tail if length < 32
	BLO	tail
	// R9 = R0 & 0x1f; a zero result means R0 is already 32-byte aligned
	ANDS	$0x1f, R0, R9
	BEQ	chunk
	// Work with not 32-byte aligned head
	// R3 = (R0 &^ 0x1f) + 0x20, the first 32-byte boundary above R0
	BIC	$0x1f, R0, R3
	ADD	$0x20, R3
head_loop:
	// Load one byte and advance R0
	MOVBU.P	1(R0), R5
	CMP	R5, R1
	// R11++ if the loaded byte equals the search byte
	CINC	EQ, R11, R11
	SUB	$1, R2, R2
	// Keep going until R0 reaches the aligned address in R3
	CMP	R0, R3
	BNE	head_loop
	// Work with 32-byte aligned chunks
chunk:
	// R9 = R2 &^ 0x1f, the number of bytes covered by whole 32-byte chunks
	BIC	$0x1f, R2, R9
	// The first chunk can also be the last
	CBZ	R9, tail
	// R3 = end of 32-byte chunks
	ADD	R0, R9, R3
	// Duplicate the constant 1 to 16 1-byte elements of V5
	MOVD	$1, R5
	VMOV	R5, V5.B16
	// R2 = length of tail
	SUB	R9, R2, R2
	// Duplicate R1 (byte to search) to 16 1-byte elements of V0
	VMOV	R1, V0.B16
	// Clear the low 64-bit element of V7 and V8
	VEOR	V7.B8, V7.B8, V7.B8
	VEOR	V8.B8, V8.B8, V8.B8
	// Count the target byte in 32-byte chunk
chunk_loop:
	// Load 32 bytes into V1/V2 and advance R0 past them
	VLD1.P	(R0), [V1.B16, V2.B16]
	// Loop-termination test; the result is consumed by the BNE at the
	// bottom of the loop, with only vector instructions in between.
	CMP	R0, R3
	// Compare every lane of V1/V2 against the search byte
	VCMEQ	V0.B16, V1.B16, V3.B16
	VCMEQ	V0.B16, V2.B16, V4.B16
	// Clear the higher 7 bits
	// (each lane becomes 1 for a match, 0 otherwise)
	VAND	V5.B16, V3.B16, V3.B16
	VAND	V5.B16, V4.B16, V4.B16
	// Count lanes match the requested byte
	VADDP	V4.B16, V3.B16, V6.B16	// 32B->16B
	// Sum the 16 lanes of V6 into V7
	VUADDLV	V6.B16, V7
	// Accumulate the count in low 64-bit element of V8 when inside the loop
	VADD	V7, V8
	BNE	chunk_loop
	// Fold the vector total into the scalar count
	VMOV	V8.D[0], R6
	ADD	R6, R11, R11
	// Nothing left after the chunks: return
	CBZ	R2, done
tail:
	// Work with tail shorter than 32 bytes
	// Every path into this loop arrives with R2 != 0: the zero-length case
	// left via "done" at the top, head_loop consumes at most 31 of at least
	// 32 bytes, and the chunk path checks R2 just above.
	MOVBU.P	1(R0), R5
	SUB	$1, R2, R2
	CMP	R5, R1
	CINC	EQ, R11, R11
	CBNZ	R2, tail
done:
	MOVD	R11, ret+32(FP)
	RET