// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// func IndexByte(b []byte, c byte) int
//
// Thin wrapper: loads the slice header and search byte from the
// frame, stashes the result-slot address in R8, and tail-branches
// into the shared body (which writes the result itself and RETs).
TEXT ·IndexByte(SB),NOSPLIT,$0-40
	MOVD	b_base+0(FP), R0
	MOVD	b_len+8(FP), R2
	MOVBU	c+24(FP), R1
	MOVD	$ret+32(FP), R8
	B	indexbytebody<>(SB)

// func IndexByteString(s string, c byte) int
//
// Same as IndexByte but for a string header (no capacity word, so
// the byte argument and result live at smaller frame offsets).
TEXT ·IndexByteString(SB),NOSPLIT,$0-32
	MOVD	s_base+0(FP), R0
	MOVD	s_len+8(FP), R2
	MOVBU	c+16(FP), R1
	MOVD	$ret+24(FP), R8
	B	indexbytebody<>(SB)

// input:
//   R0: data
//   R1: byte to search
//   R2: data len
//   R8: address to put result
TEXT indexbytebody<>(SB),NOSPLIT,$0
	// Core algorithm:
	// For each 32-byte chunk we calculate a 64-bit syndrome value,
	// with two bits per byte. For each tuple, bit 0 is set if the
	// relevant byte matched the requested character and bit 1 is
	// not used (faster than using a 32bit syndrome). Since the bits
	// in the syndrome reflect exactly the order in which things occur
	// in the original string, counting trailing zeros allows to
	// identify exactly which byte has matched.
	//
	// Register roles inside the body:
	//   R3:  32-byte-aligned cursor into the data (post-incremented by VLD1.P)
	//   R5:  magic constant for building the syndrome
	//   R6:  64-bit syndrome of the current 32-byte block
	//   R9:  misalignment of R0 within its 32-byte block (0..31)
	//   R10: len mod 32 (used to mask the tail block)
	//   R11: original data pointer, kept to turn an address into an index

	CBZ	R2, fail		// zero length: no match possible
	MOVD	R0, R11			// remember start of data for final subtraction
	// Magic constant 0x40100401 allows us to identify
	// which lane matches the requested byte.
	// 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24))
	// Different bytes have different bit masks (i.e: 1, 4, 16, 64)
	MOVD	$0x40100401, R5
	VMOV	R1, V0.B16		// splat the search byte across all 16 lanes
	// Work with aligned 32-byte chunks
	BIC	$0x1f, R0, R3		// round data pointer down to 32-byte boundary
	VMOV	R5, V5.S4		// splat magic constant across 4 words
	ANDS	$0x1f, R0, R9		// R9 = misalignment; sets Z flag if already aligned
	AND	$0x1f, R2, R10		// R10 = len mod 32, for masking the last block
	BEQ	loop			// aligned input: skip the head fixup (Z from ANDS)

	// Input string is not 32-byte aligned. We calculate the
	// syndrome value for the aligned 32 bytes block containing
	// the first bytes and mask off the irrelevant part.
	VLD1.P	(R3), [V1.B16, V2.B16]	// load aligned 32-byte block; R3 += 32
	SUB	$0x20, R9, R4		// R4 = misalignment - 32 (negative)
	ADDS	R4, R2, R2		// fold head bytes into remaining count;
					// C clear (borrow) iff this was also the last block
	VCMEQ	V0.B16, V1.B16, V3.B16	// per-byte 0xff/0x00 match masks...
	VCMEQ	V0.B16, V2.B16, V4.B16
	VAND	V5.B16, V3.B16, V3.B16	// ...reduced to one distinct bit per lane
	VAND	V5.B16, V4.B16, V4.B16
	VADDP	V4.B16, V3.B16, V6.B16	// 256->128
	VADDP	V6.B16, V6.B16, V6.B16	// 128->64
	VMOV	V6.D[0], R6		// R6 = 64-bit syndrome, 2 bits per data byte
	// Clear the irrelevant lower bits: shift the syndrome down past the
	// 2*misalignment garbage bits, then back up, zero-filling.
	LSL	$1, R9, R4
	LSR	R4, R6, R6
	LSL	R4, R6, R6
	// The first block can also be the last (flags still from ADDS above)
	BLS	masklast
	// Have we found something already?
	CBNZ	R6, tail

loop:
	VLD1.P	(R3), [V1.B16, V2.B16]	// next 32-byte block; R3 += 32
	SUBS	$0x20, R2, R2		// consume 32 bytes; flags tell us if data ran out
	VCMEQ	V0.B16, V1.B16, V3.B16
	VCMEQ	V0.B16, V2.B16, V4.B16
	// If we're out of data we finish regardless of the result
	BLS	end
	// Use a fast check for the termination condition: any nonzero byte in
	// the raw (un-magicked) OR of the two compare masks means a hit.
	VORR	V4.B16, V3.B16, V6.B16
	VADDP	V6.D2, V6.D2, V6.D2
	VMOV	V6.D[0], R6
	// We're not out of data, loop if we haven't found the character
	CBZ	R6, loop

end:
	// Termination condition found, let's calculate the syndrome value
	// (same reduction as in the head: lane bit masks, pairwise adds, to R6).
	VAND	V5.B16, V3.B16, V3.B16
	VAND	V5.B16, V4.B16, V4.B16
	VADDP	V4.B16, V3.B16, V6.B16
	VADDP	V6.B16, V6.B16, V6.B16
	VMOV	V6.D[0], R6
	// Only do the clear for the last possible block with less than 32 bytes
	// Condition flags come from SUBS in the loop: C set means we did not
	// run past the end, so no masking is needed.
	BHS	tail

masklast:
	// Clear the irrelevant upper bits: compute how many bytes past the end
	// of the data this block extends, and shift the syndrome up then back
	// down by 2 bits per such byte.
	ADD	R9, R10, R4		// (misalignment + len%32) ...
	AND	$0x1f, R4, R4		// ... mod 32 = valid bytes in final block
	SUB	$0x20, R4, R4		// minus 32 = -(bytes to discard)
	NEG	R4<<1, R4		// R4 = 2 * bytes-to-discard (syndrome bits)
	LSL	R4, R6, R6
	LSR	R4, R6, R6

tail:
	// Check that we have found a character
	CBZ	R6, fail
	// Count the trailing zeros using bit reversing
	RBIT	R6, R6
	// Compensate the last post-increment: R3 points past the block the
	// match is in, so step back to its start.
	SUB	$0x20, R3, R3
	// And count the leading zeros
	CLZ	R6, R6
	// R6 is twice the offset into the fragment (2 syndrome bits per byte)
	ADD	R6>>1, R3, R0
	// Compute the offset result relative to the original data pointer
	SUB	R11, R0, R0
	MOVD	R0, (R8)
	RET

fail:
	MOVD	$-1, R0			// not found: index is -1
	MOVD	R0, (R8)
	RET