// SPDX-License-Identifier: Apache-2.0
// Copyright 2023 The Prime Citizens
//
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build pcz && arm64

#include "textflag.h"

// func IndexSliceByte(b []byte, c byte) int
// Trampoline: loads the slice header and search byte into the registers
// indexbytebody expects, then tail-branches into the shared body.
TEXT ·IndexSliceByte(SB),NOSPLIT,$0-40
	MOVD	b_base+0(FP), R0      // R0 = data pointer
	MOVD	b_len+8(FP), R2       // R2 = length
	MOVBU	c+24(FP), R1          // R1 = byte to find (zero-extended)
	MOVD	$ret+32(FP), R8       // R8 = address of the result slot
	B	indexbytebody<>(SB)

// func IndexByte(s string, c byte) int
// Same as IndexSliceByte but for a string header (no capacity word,
// hence the different frame offsets).
TEXT ·IndexByte(SB),NOSPLIT,$0-32
	MOVD	s_base+0(FP), R0      // R0 = data pointer
	MOVD	s_len+8(FP), R2       // R2 = length
	MOVBU	c+16(FP), R1          // R1 = byte to find (zero-extended)
	MOVD	$ret+24(FP), R8       // R8 = address of the result slot
	B	indexbytebody<>(SB)

// input:
//   R0: data
//   R1: byte to search
//   R2: data len
//   R8: address to put result
//
// Writes the index of the first occurrence (or -1) to (R8) and returns.
TEXT indexbytebody<>(SB),NOSPLIT,$0
	// Core algorithm:
	// For each 32-byte chunk we calculate a 64-bit syndrome value,
	// with two bits per byte. For each tuple, bit 0 is set if the
	// relevant byte matched the requested character and bit 1 is
	// not used (faster than using a 32bit syndrome). Since the bits
	// in the syndrome reflect exactly the order in which things occur
	// in the original string, counting trailing zeros allows to
	// identify exactly which byte has matched.
	//
	// Register roles throughout the body:
	//   R3  = aligned read cursor (post-incremented by VLD1.P)
	//   R5  = magic constant for syndrome lane encoding
	//   R6  = 64-bit syndrome of the current 32-byte chunk
	//   R9  = misalignment of R0 within its 32-byte block (0..31)
	//   R10 = data length mod 32
	//   R11 = original data pointer (kept to compute the final index)

	CBZ	R2, fail              // zero-length input: no match
	MOVD	R0, R11
	// Magic constant 0x40100401 allows us to identify
	// which lane matches the requested byte.
	// 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24))
	// Different bytes have different bit masks (i.e: 1, 4, 16, 64)
	MOVD	$0x40100401, R5
	VMOV	R1, V0.B16            // splat search byte across all 16 lanes
	// Work with aligned 32-byte chunks
	BIC	$0x1f, R0, R3         // R3 = data rounded down to 32-byte boundary
	VMOV	R5, V5.S4             // splat magic constant across 4 words
	ANDS	$0x1f, R0, R9         // R9 = misalignment; sets Z if already aligned
	AND	$0x1f, R2, R10        // R10 = len mod 32, used by masklast
	BEQ	loop                  // aligned input skips the head fixup

	// Input string is not 32-byte aligned. We calculate the
	// syndrome value for the aligned 32 bytes block containing
	// the first bytes and mask off the irrelevant part.
	VLD1.P	(R3), [V1.B16, V2.B16]
	SUB	$0x20, R9, R4         // R4 = misalignment - 32 (negative)
	ADDS	R4, R2, R2            // fold head bytes into remaining count;
	                              // flags tell us whether this first block
	                              // is also the last (consumed by BLS below)
	VCMEQ	V0.B16, V1.B16, V3.B16 // per-lane 0xFF where byte matches
	VCMEQ	V0.B16, V2.B16, V4.B16
	VAND	V5.B16, V3.B16, V3.B16 // keep one distinct bit per lane
	VAND	V5.B16, V4.B16, V4.B16
	VADDP	V4.B16, V3.B16, V6.B16 // 256->128
	VADDP	V6.B16, V6.B16, V6.B16 // 128->64
	VMOV	V6.D[0], R6           // R6 = 64-bit syndrome, 2 bits per byte
	// Clear the irrelevant lower bits (bytes before the real start)
	LSL	$1, R9, R4            // 2 syndrome bits per byte of misalignment
	LSR	R4, R6, R6
	LSL	R4, R6, R6
	// The first block can also be the last (flags still from ADDS above)
	BLS	masklast
	// Have we found something already?
	CBNZ	R6, tail

loop:
	VLD1.P	(R3), [V1.B16, V2.B16]
	SUBS	$0x20, R2, R2         // consume 32 bytes; flags live until BLS/BHS
	VCMEQ	V0.B16, V1.B16, V3.B16
	VCMEQ	V0.B16, V2.B16, V4.B16
	// If we're out of data we finish regardless of the result
	BLS	end
	// Use a fast check for the termination condition:
	// any nonzero lane in V3|V4 means at least one match somewhere.
	VORR	V4.B16, V3.B16, V6.B16
	VADDP	V6.D2, V6.D2, V6.D2
	VMOV	V6.D[0], R6
	// We're not out of data, loop if we haven't found the character
	CBZ	R6, loop

end:
	// Termination condition found, let's calculate the syndrome value
	VAND	V5.B16, V3.B16, V3.B16
	VAND	V5.B16, V4.B16, V4.B16
	VADDP	V4.B16, V3.B16, V6.B16 // 256->128
	VADDP	V6.B16, V6.B16, V6.B16 // 128->64
	VMOV	V6.D[0], R6
	// Only do the clear for the last possible block with less than 32 bytes.
	// Condition flags come from SUBS in the loop: HS means a full 32 bytes
	// were valid, so no upper-bit masking is needed.
	BHS	tail

masklast:
	// Clear the irrelevant upper bits: bytes past the end of the data
	// within the final (partial) 32-byte block must not register matches.
	ADD	R9, R10, R4           // total tail bytes = misalignment + len%32
	AND	$0x1f, R4, R4         // valid bytes in the last block
	SUB	$0x20, R4, R4         // -(invalid byte count)
	NEG	R4<<1, R4             // 2 syndrome bits per invalid byte
	LSL	R4, R6, R6            // shift garbage bits out the top...
	LSR	R4, R6, R6            // ...and shift back, zero-filled

tail:
	// Check that we have found a character
	CBZ	R6, fail
	// Count the trailing zeros using bit reversing
	RBIT	R6, R6
	// Compensate the last post-increment (R3 points past the chunk)
	SUB	$0x20, R3, R3
	// And count the leading zeros (of the reversed value = trailing zeros)
	CLZ	R6, R6
	// R6 is twice the offset into the fragment (2 syndrome bits per byte)
	ADD	R6>>1, R3, R0         // absolute address of the match
	// Compute the offset result relative to the original pointer
	SUB	R11, R0, R0
	MOVD	R0, (R8)
	RET

fail:
	MOVD	$-1, R0
	MOVD	R0, (R8)
	RET