github.com/primecitizens/pcz/std@v0.2.1/core/bytealg/indexbyte_arm64.s

github.com/primecitizens/pcz/std@v0.2.1/core/bytealg/indexbyte_arm64.s (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright 2023 The Prime Citizens
     3  // 
     4  // Copyright 2018 The Go Authors. All rights reserved.
     5  // Use of this source code is governed by a BSD-style
     6  // license that can be found in the LICENSE file.
     7  
     8  //go:build pcz && arm64
     9  
    10  #include "textflag.h"
    11  
    12  TEXT ·IndexSliceByte(SB),NOSPLIT,$0-40
        	// Thin entry point: copy the frame arguments into the registers that
        	// indexbytebody<> takes as input, then branch (without link) to it.
        	// The shared body stores the result through R8 and executes RET.
        	// The four loads are independent of one another.
    13  	MOVD $ret+32(FP), R8	// R8 = address of the result slot in the frame
    14  	MOVBU c+24(FP), R1	// R1 = byte to search for, zero-extended
    15  	MOVD b_len+8(FP), R2	// R2 = number of bytes to scan
    16  	MOVD b_base+0(FP), R0	// R0 = pointer to the first byte of the data
    17  	B indexbytebody<>(SB)
    18  
    19  TEXT ·IndexByte(SB),NOSPLIT,$0-32
        	// Same trampoline as the slice variant, but with the frame offsets of
        	// a (s_base, s_len) pair followed by the byte: load the inputs that
        	// indexbytebody<> expects and branch to it without link. The body
        	// writes the result through R8 and executes RET.
    20  	MOVD $ret+24(FP), R8	// R8 = address of the result slot in the frame
    21  	MOVBU c+16(FP), R1	// R1 = byte to search for, zero-extended
    22  	MOVD s_len+8(FP), R2	// R2 = number of bytes to scan
    23  	MOVD s_base+0(FP), R0	// R0 = pointer to the first byte of the data
    24  	B indexbytebody<>(SB)
    25  
    26  // input:
    27  //   R0: data
    28  //   R1: byte to search
    29  //   R2: data len
    30  //   R8: address to put result
        //
        // Stores into the 8-byte slot at (R8) the offset, relative to R0, of the
        // first byte equal to R1, or -1 when no byte matches or R2 is zero, and
        // then executes RET. The entry points above reach this body with a plain
        // B (no link).
        //
        // The data is always loaded as whole 32-byte-aligned blocks, so bytes
        // that lie outside [R0, R0+R2) but inside the first or last aligned block
        // are read and their syndrome bits are masked off afterwards.
        //
        // Registers written: R0, R2-R6, R9-R11, V0-V6 and the condition flags.
        // R1 and R8 are only read.
    31  TEXT indexbytebody<>(SB),NOSPLIT,$0
    32  	// Core algorithm:
    33  	// For each 32-byte chunk we calculate a 64-bit syndrome value,
    34  	// with two bits per byte. For each tuple, bit 0 is set if the
    35  	// relevant byte matched the requested character and bit 1 is
    36  	// not used (faster than using a 32bit syndrome). Since the bits
    37  	// in the syndrome reflect exactly the order in which things occur
    38  	// in the original string, counting trailing zeros allows us to
    39  	// identify exactly which byte has matched.
    40  
    41  	CBZ R2, fail	// empty input: report -1
    42  	MOVD R0, R11	// R11 = original data pointer, used for the final offset
    43  	// Magic constant 0x40100401 allows us to identify
    44  	// which lane matches the requested byte.
    45  	// 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24))
    46  	// Different bytes have different bit masks (i.e: 1, 4, 16, 64)
    47  	MOVD $0x40100401, R5
    48  	VMOV R1, V0.B16	// V0 = search byte replicated into all 16 byte lanes
    49  	// Work with aligned 32-byte chunks
    50  	BIC $0x1f, R0, R3	// R3 = data pointer rounded down to a 32-byte boundary
    51  	VMOV R5, V5.S4	// V5 = magic constant replicated into all four 32-bit lanes
    52  	ANDS $0x1f, R0, R9	// R9 = data & 31 (misalignment); Z is set when already aligned
    53  	AND $0x1f, R2, R10	// R10 = len & 31; non-flag-setting, so Z from ANDS survives
    54  	BEQ loop	// aligned start: no head block to mask
    55  
    56  	// Input string is not 32-byte aligned. We calculate the
    57  	// syndrome value for the aligned 32 bytes block containing
    58  	// the first bytes and mask off the irrelevant part.
    59  	VLD1.P (R3), [V1.B16, V2.B16]	// load 32 bytes; R3 is post-incremented past them
    60  	SUB $0x20, R9, R4	// R4 = misalignment - 32
    61  	ADDS R4, R2, R2	// R2 = len - (32 - misalignment); flags are consumed by BLS below
    62  	VCMEQ V0.B16, V1.B16, V3.B16	// V3 = per-byte match mask for bytes 0..15
    63  	VCMEQ V0.B16, V2.B16, V4.B16	// V4 = per-byte match mask for bytes 16..31
    64  	VAND V5.B16, V3.B16, V3.B16	// keep one distinct bit (1, 4, 16 or 64) per matching byte
    65  	VAND V5.B16, V4.B16, V4.B16
    66  	VADDP V4.B16, V3.B16, V6.B16 // 256->128
    67  	VADDP V6.B16, V6.B16, V6.B16 // 128->64
    68  	VMOV V6.D[0], R6	// R6 = 64-bit syndrome, two bits per input byte
    69  	// Clear the irrelevant lower bits
    70  	LSL $1, R9, R4	// two syndrome bits per byte => shift count is 2*misalignment
    71  	LSR R4, R6, R6
    72  	LSL R4, R6, R6
    73  	// The first block can also be the last
        	// (flags are still those of the ADDS above: none of the instructions
        	// in between is a flag-setting form)
    74  	BLS masklast
    75  	// Have we found something already?
    76  	CBNZ R6, tail
    77  
    78  loop:
    79  	VLD1.P (R3), [V1.B16, V2.B16]	// next aligned 32-byte block; R3 post-incremented
    80  	SUBS $0x20, R2, R2	// R2 -= 32; the flags drive BLS here and BHS after "end"
    81  	VCMEQ V0.B16, V1.B16, V3.B16
    82  	VCMEQ V0.B16, V2.B16, V4.B16
    83  	// If we're out of data we finish regardless of the result
    84  	BLS end
    85  	// Use a fast check for the termination condition
    86  	VORR V4.B16, V3.B16, V6.B16	// combine both match masks
    87  	VADDP V6.D2, V6.D2, V6.D2	// fold the two 64-bit halves into one
    88  	VMOV V6.D[0], R6	// non-zero iff some byte in this block matched
    89  	// We're not out of data, loop if we haven't found the character
    90  	CBZ R6, loop
    91  
    92  end:
    93  	// Termination condition found, let's calculate the syndrome value
    94  	VAND V5.B16, V3.B16, V3.B16
    95  	VAND V5.B16, V4.B16, V4.B16
    96  	VADDP V4.B16, V3.B16, V6.B16
    97  	VADDP V6.B16, V6.B16, V6.B16
    98  	VMOV V6.D[0], R6
    99  	// Only do the clear for the last possible block with less than 32 bytes
   100  	// Condition flags come from SUBS in the loop
        	// (HS means the SUBS did not borrow, i.e. this block lies entirely
        	// inside the data and needs no masking)
   101  	BHS tail
   102  
   103  masklast:
   104  	// Clear the irrelevant upper bits
   105  	ADD R9, R10, R4	// R4 = misalignment + (len & 31)
   106  	AND $0x1f, R4, R4	// R4 = (misalignment + len) & 31 = valid bytes in this block
   107  	SUB $0x20, R4, R4	// R4 = valid - 32
   108  	NEG R4<<1, R4	// R4 = 2*(32 - valid) = syndrome bits that belong to bytes past the end
        	// NOTE(review): when the data ends exactly on a 32-byte boundary the
        	// count is 64; this relies on arm64 register shifts using the count
        	// modulo 64, so nothing is cleared in that case.
   109  	LSL R4, R6, R6
   110  	LSR R4, R6, R6
   111  
   112  tail:
   113  	// Check that we have found a character
   114  	CBZ R6, fail
   115  	// Count the trailing zeros using bit reversing
   116  	RBIT R6, R6
   117  	// Compensate the last post-increment
   118  	SUB $0x20, R3, R3	// R3 = start of the block the syndrome in R6 describes
   119  	// And count the leading zeros
   120  	CLZ R6, R6
   121  	// R6 is twice the offset into the fragment
   122  	ADD R6>>1, R3, R0	// R0 = address of the first matching byte
   123  	// Compute the offset result
   124  	SUB R11, R0, R0	// R0 = match address - original data pointer
   125  	MOVD R0, (R8)	// store the index into the caller's result slot
   126  	RET
   127  
   128  fail:
        	// Not found (or empty input): the result is -1.
   129  	MOVD $-1, R0
   130  	MOVD R0, (R8)
   131  	RET
   131  	RET