// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// ·IndexByte is the byte-slice entry point.
// Arguments are read from the frame at b_base+0, b_len+8 and c+24; the
// address of the result slot (ret+32) is passed on in R8.
// It only loads registers and branches (without link) to indexbytebody,
// whose RET therefore returns directly to this function's caller.
TEXT ·IndexByte(SB),NOSPLIT,$0-40
	MOVD	b_base+0(FP), R0	// R0 = data pointer
	MOVD	b_len+8(FP), R2		// R2 = data length
	MOVBU	c+24(FP), R1		// R1 = byte to search for (zero-extended)
	MOVD	$ret+32(FP), R8		// R8 = address of the result slot
	B	indexbytebody<>(SB)

// ·IndexByteString is the string entry point.
// Same as ·IndexByte, but the frame holds s_base+0, s_len+8, c+16 and
// the result slot is ret+24.
TEXT ·IndexByteString(SB),NOSPLIT,$0-32
	MOVD	s_base+0(FP), R0	// R0 = data pointer
	MOVD	s_len+8(FP), R2		// R2 = data length
	MOVBU	c+16(FP), R1		// R1 = byte to search for (zero-extended)
	MOVD	$ret+24(FP), R8		// R8 = address of the result slot
	B	indexbytebody<>(SB)

// bytes·IndexByte provides the bytes package symbol with the same frame
// layout as ·IndexByte. It reuses ·IndexByte's argument stack map.
TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
	FUNCDATA $0, ·IndexByte·args_stackmap(SB)
	MOVD	b_base+0(FP), R0
	MOVD	b_len+8(FP), R2
	MOVBU	c+24(FP), R1
	MOVD	$ret+32(FP), R8
	B	indexbytebody<>(SB)

// strings·IndexByte provides the strings package symbol with the same
// frame layout as ·IndexByteString. It reuses ·IndexByteString's argument
// stack map.
TEXT strings·IndexByte(SB),NOSPLIT,$0-32
	FUNCDATA $0, ·IndexByteString·args_stackmap(SB)
	MOVD	s_base+0(FP), R0
	MOVD	s_len+8(FP), R2
	MOVBU	c+16(FP), R1
	MOVD	$ret+24(FP), R8
	B	indexbytebody<>(SB)

// indexbytebody stores at (R8) the offset of the first occurrence of the
// byte in R1 within the R2 bytes starting at R0, or -1 if there is none
// (including when R2 is zero).
//
// input:
//   R0: data
//   R1: byte to search
//   R2: data len
//   R8: address to put result
//
// register roles inside the routine:
//   R3:  pointer to the current 32-byte aligned chunk (post-incremented by each load)
//   R4:  scratch (length adjustment, shift amounts)
//   R5:  magic constant 0x40100401
//   R6:  syndrome / fast-check value
//   R9:  data & 0x1f, the offset of the data inside its first aligned chunk
//   R10: len & 0x1f
//   R11: copy of the original data pointer, used to turn an address into an offset
//   V0:  search byte in all 16 byte lanes
//   V1, V2: the 32 bytes of the current chunk
//   V3, V4: per-byte compare results for V1, V2
//   V5:  magic constant in all four 32-bit lanes
//   V6:  reduction of V3/V4
//
// The routine always loads whole aligned 32-byte chunks, so bytes before
// the start and after the end of the data are read and then masked out of
// the syndrome.
TEXT indexbytebody<>(SB),NOSPLIT,$0
	// Core algorithm:
	// For each 32-byte chunk we calculate a 64-bit syndrome value,
	// with two bits per byte. For each tuple, bit 0 is set if the
	// relevant byte matched the requested character and bit 1 is
	// not used (faster than using a 32bit syndrome). Since the bits
	// in the syndrome reflect exactly the order in which things occur
	// in the original string, counting trailing zeros lets us
	// identify exactly which byte has matched.

	// Empty input: report -1.
	CBZ	R2, fail
	MOVD	R0, R11
	// Magic constant 0x40100401 allows us to identify
	// which lane matches the requested byte.
	// 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24))
	// Different bytes have different bit masks (i.e: 1, 4, 16, 64)
	MOVD	$0x40100401, R5
	VMOV	R1, V0.B16
	// Work with aligned 32-byte chunks
	BIC	$0x1f, R0, R3
	VMOV	R5, V5.S4
	// ANDS sets the flags tested by BEQ below; the AND in between
	// does not touch them.
	ANDS	$0x1f, R0, R9
	AND	$0x1f, R2, R10
	BEQ	loop

	// Input string is not 32-byte aligned. We calculate the
	// syndrome value for the aligned 32 bytes block containing
	// the first bytes and mask off the irrelevant part.
	VLD1.P	(R3), [V1.B16, V2.B16]
	// R2 = len - (32 - R9): bytes left after this first chunk.
	// The flags set by ADDS are consumed by BLS masklast below; none of
	// the instructions in between modify them.
	SUB	$0x20, R9, R4
	ADDS	R4, R2, R2
	VCMEQ	V0.B16, V1.B16, V3.B16
	VCMEQ	V0.B16, V2.B16, V4.B16
	VAND	V5.B16, V3.B16, V3.B16
	VAND	V5.B16, V4.B16, V4.B16
	VADDP	V4.B16, V3.B16, V6.B16 // 256->128
	VADDP	V6.B16, V6.B16, V6.B16 // 128->64
	VMOV	V6.D[0], R6
	// Clear the irrelevant lower bits
	// (two syndrome bits for each of the R9 bytes preceding the data).
	LSL	$1, R9, R4
	LSR	R4, R6, R6
	LSL	R4, R6, R6
	// The first block can also be the last
	BLS	masklast
	// Have we found something already?
	CBNZ	R6, tail

loop:
	VLD1.P	(R3), [V1.B16, V2.B16]
	SUBS	$0x20, R2, R2
	VCMEQ	V0.B16, V1.B16, V3.B16
	VCMEQ	V0.B16, V2.B16, V4.B16
	// If we're out of data we finish regardless of the result
	BLS	end
	// Use a fast check for the termination condition
	VORR	V4.B16, V3.B16, V6.B16
	VADDP	V6.D2, V6.D2, V6.D2
	VMOV	V6.D[0], R6
	// We're not out of data, loop if we haven't found the character
	CBZ	R6, loop

end:
	// Termination condition found, let's calculate the syndrome value
	VAND	V5.B16, V3.B16, V3.B16
	VAND	V5.B16, V4.B16, V4.B16
	VADDP	V4.B16, V3.B16, V6.B16
	VADDP	V6.B16, V6.B16, V6.B16
	VMOV	V6.D[0], R6
	// Only do the clear for the last possible block with less than 32 bytes
	// Condition flags come from SUBS in the loop
	BHS	tail

masklast:
	// Clear the irrelevant upper bits
	// R4 = 2 * (32 - ((R9 + R10) & 0x1f)): two syndrome bits for each
	// byte of the chunk that lies past the end of the data.
	// NOTE(review): when (R9 + R10) & 0x1f is zero R4 becomes 64; this
	// relies on register-specified shifts using the amount modulo 64,
	// so that nothing is cleared in that case.
	ADD	R9, R10, R4
	AND	$0x1f, R4, R4
	SUB	$0x20, R4, R4
	NEG	R4<<1, R4
	LSL	R4, R6, R6
	LSR	R4, R6, R6

tail:
	// Check that we have found a character
	CBZ	R6, fail
	// Count the trailing zeros using bit reversing
	RBIT	R6, R6
	// Compensate the last post-increment
	SUB	$0x20, R3, R3
	// And count the leading zeros
	CLZ	R6, R6
	// R6 is twice the offset into the fragment
	ADD	R6>>1, R3, R0
	// Compute the offset result
	SUB	R11, R0, R0
	MOVD	R0, (R8)
	RET

fail:
	// No match (or empty input): store -1 as the result.
	MOVD	$-1, R0
	MOVD	R0, (R8)
	RET