github.com/primecitizens/pcz/std@v0.2.1/core/bytealg/index_arm64.s (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright 2023 The Prime Citizens
     3  // 
     4  // Copyright 2018 The Go Authors. All rights reserved.
     5  // Use of this source code is governed by a BSD-style
     6  // license that can be found in the LICENSE file.
     7  
     8  //go:build pcz && arm64
     9  
    10  #include "textflag.h"
    11  
    12  TEXT ·indexSlice(SB),NOSPLIT,$0-56 // wrapper: unpack arguments a and b, then tail-branch to indexbody<>
    13  	MOVD $ret+48(FP), R9 // R9 = address of the result slot in the caller's frame
    14  	MOVD b_len+32(FP), R3 // R3 = needle length
    15  	MOVD b_base+24(FP), R2 // R2 = needle data pointer
    16  	MOVD a_len+8(FP), R1 // R1 = haystack length
    17  	MOVD a_base+0(FP), R0 // R0 = haystack data pointer
    18  	B indexbody<>(SB) // plain branch, no link: indexbody<> stores the result via R9 and its RET returns to our caller
    19  
    19  
    20  TEXT ·index(SB),NOSPLIT,$0-40 // wrapper: unpack arguments a and b, then tail-branch to indexbody<>
    21  	MOVD $ret+32(FP), R9 // R9 = address of the result slot in the caller's frame
    22  	MOVD b_len+24(FP), R3 // R3 = needle length
    23  	MOVD b_base+16(FP), R2 // R2 = needle data pointer
    24  	MOVD a_len+8(FP), R1 // R1 = haystack length
    25  	MOVD a_base+0(FP), R0 // R0 = haystack data pointer
    26  	B indexbody<>(SB) // plain branch, no link: indexbody<> stores the result via R9 and its RET returns to our caller
    27  
    27  
    28  // input:
    29  //   R0: haystack (pointer to its first byte)
    30  //   R1: length of haystack
    31  //   R2: needle (pointer to its first byte; called 'sep' below)
    32  //   R3: length of needle (2 <= len <= 32)
    33  //   R9: address to put result (offset of the first match, or -1)
    34  TEXT indexbody<>(SB),NOSPLIT,$0-56
    35  	// main idea is to load 'sep' into separate register(s)
    36  	// so that it is not reloaded from memory
    37  	// for subsequent substring comparisons
    38  	SUB R3, R1, R4 // R4 = len(haystack) - len(sep)
    39  	// R4 contains the start of the last substring for comparison
    40  	ADD R0, R4, R4
    41  	ADD $1, R0, R8 // R8 = haystack + 1: every loop has already advanced R0 by 1 when it reaches found
    42  
    43  	CMP $8, R3
    44  	BHI greater_8 // len(sep) > 8
    45  	TBZ $3, R3, len_2_7 // len(sep) <= 8 here, so bit 3 clear means len(sep) < 8
    46  len_8:
    47  	// R5 contains 8-byte of sep
    48  	MOVD (R2), R5
    49  loop_8:
    50  	// R6 contains substring for comparison
    51  	CMP R4, R0
    52  	BHI not_found // R0 is past the last candidate start (unsigned compare)
    53  	MOVD.P 1(R0), R6 // load 8 bytes at R0, then R0 += 1
    54  	CMP R5, R6
    55  	BNE loop_8
    56  	B found
    57  len_2_7:
    58  	TBZ $2, R3, len_2_3 // bit 2 clear: len(sep) is 2 or 3
    59  	TBZ $1, R3, len_4_5 // bit 1 clear: len(sep) is 4 or 5
    60  	TBZ $0, R3, len_6 // bit 0 clear: len(sep) is 6; otherwise fall through with len(sep) == 7
    61  len_7:
    62  	// R5 and R6 contain 7-byte of sep
    63  	MOVWU (R2), R5 // sep[0:4]
    64  	// 1-byte overlap with R5
    65  	MOVWU 3(R2), R6 // sep[3:7]
    66  loop_7:
    67  	CMP R4, R0
    68  	BHI not_found
    69  	MOVWU.P 1(R0), R3 // R3 is reused as scratch from here on; the length is not read again
    70  	CMP R5, R3
    71  	BNE loop_7
    72  	MOVWU 2(R0), R3 // R0 is already start+1, so this reads candidate[3:7]
    73  	CMP R6, R3
    74  	BNE loop_7
    75  	B found
    76  len_6:
    77  	// R5 and R6 contain 6-byte of sep
    78  	MOVWU (R2), R5 // sep[0:4]
    79  	MOVHU 4(R2), R6 // sep[4:6]
    80  loop_6:
    81  	CMP R4, R0
    82  	BHI not_found
    83  	MOVWU.P 1(R0), R3 // candidate[0:4], then R0 += 1
    84  	CMP R5, R3
    85  	BNE loop_6
    86  	MOVHU 3(R0), R3 // R0 is already start+1, so this reads candidate[4:6]
    87  	CMP R6, R3
    88  	BNE loop_6
    89  	B found
    90  len_4_5:
    91  	TBZ $0, R3, len_4 // bit 0 clear: len(sep) is 4; otherwise fall through with len(sep) == 5
    92  len_5:
    93  	// R5 and R7 contain 5-byte of sep
    94  	MOVWU (R2), R5 // sep[0:4]
    95  	MOVBU 4(R2), R7 // sep[4]
    96  loop_5:
    97  	CMP R4, R0
    98  	BHI not_found
    99  	MOVWU.P 1(R0), R3 // candidate[0:4], then R0 += 1
   100  	CMP R5, R3
   101  	BNE loop_5
   102  	MOVBU 3(R0), R3 // R0 is already start+1, so this reads candidate[4]
   103  	CMP R7, R3
   104  	BNE loop_5
   105  	B found
   106  len_4:
   107  	// R5 contains 4-byte of sep
   108  	MOVWU (R2), R5
   109  loop_4:
   110  	CMP R4, R0
   111  	BHI not_found
   112  	MOVWU.P 1(R0), R6 // candidate[0:4], then R0 += 1
   113  	CMP R5, R6
   114  	BNE loop_4
   115  	B found
   116  len_2_3:
   117  	TBZ $0, R3, len_2 // bit 0 clear: len(sep) is 2; otherwise fall through with len(sep) == 3
   118  len_3:
   119  	// R6 and R7 contain 3-byte of sep
   120  	MOVHU (R2), R6 // sep[0:2]
   121  	MOVBU 2(R2), R7 // sep[2]
   122  loop_3:
   123  	CMP R4, R0
   124  	BHI not_found
   125  	MOVHU.P 1(R0), R3 // candidate[0:2], then R0 += 1
   126  	CMP R6, R3
   127  	BNE loop_3
   128  	MOVBU 1(R0), R3 // R0 is already start+1, so this reads candidate[2]
   129  	CMP R7, R3
   130  	BNE loop_3
   131  	B found
   132  len_2:
   133  	// R5 contains 2-byte of sep
   134  	MOVHU (R2), R5
   135  loop_2:
   136  	CMP R4, R0
   137  	BHI not_found
   138  	MOVHU.P 1(R0), R6 // candidate[0:2], then R0 += 1
   139  	CMP R5, R6
   140  	BNE loop_2 // on a match, fall through into found
   141  found:
   142  	SUB R8, R0, R0 // R0 = (match start + 1) - (haystack + 1) = offset of the match
   143  	MOVD R0, (R9) // store the offset through the result pointer
   144  	RET
   145  not_found:
   146  	MOVD $-1, R0
   147  	MOVD R0, (R9) // store -1 through the result pointer
   148  	RET
   149  greater_8:
   150  	SUB $9, R3, R11 // len(sep) - 9, offset of R0 for last 8 bytes
   151  	CMP $16, R3
   152  	BHI greater_16 // len(sep) > 16
   153  len_9_16:
   154  	MOVD.P 8(R2), R5 // R5 contains the first 8-byte of sep
   155  	SUB $16, R3, R7 // len(sep) - 16, offset of R2 for last 8 bytes
   156  	MOVD (R2)(R7), R6 // R6 contains the last 8-byte of sep
   157  loop_9_16:
   158  	// search the first 8 bytes first
   159  	CMP R4, R0
   160  	BHI not_found
   161  	MOVD.P 1(R0), R7 // R7 is reused as scratch; candidate[0:8], then R0 += 1
   162  	CMP R5, R7
   163  	BNE loop_9_16
   164  	MOVD (R0)(R11), R7 // (start + 1) + (len(sep) - 9) = address of the candidate's last 8 bytes
   165  	CMP R6, R7 // compare the last 8 bytes
   166  	BNE loop_9_16
   167  	B found
   168  greater_16:
   169  	CMP $24, R3
   170  	BHI len_25_32 // len(sep) > 24
   171  len_17_24:
   172  	LDP.P 16(R2), (R5, R6)	// R5 and R6 contain the first 16-byte of sep
   173  	SUB $24, R3, R10 // len(sep) - 24
   174  	MOVD (R2)(R10), R7 // R7 contains the last 8-byte of sep
   175  loop_17_24:
   176  	// search the first 16 bytes first
   177  	CMP R4, R0
   178  	BHI not_found
   179  	MOVD.P 1(R0), R10 // R10 is reused as scratch; candidate[0:8], then R0 += 1
   180  	CMP R5, R10
   181  	BNE loop_17_24
   182  	MOVD 7(R0), R10 // R0 is already start+1, so this reads candidate[8:16]
   183  	CMP R6, R10
   184  	BNE loop_17_24
   185  	MOVD (R0)(R11), R10 // (start + 1) + (len(sep) - 9) = address of the candidate's last 8 bytes
   186  	CMP R7, R10 // compare the last 8 bytes
   187  	BNE loop_17_24
   188  	B found
   189  len_25_32:
   190  	LDP.P 16(R2), (R5, R6) // sep[0:16], then R2 += 16
   191  	MOVD.P 8(R2), R7 // R5, R6 and R7 contain the first 24-byte of sep
   192  	SUB $32, R3, R12 // len(sep) - 32
   193  	MOVD (R2)(R12), R10 // R10 contains the last 8-byte of sep
   194  loop_25_32:
   195  	// search the first 24 bytes first
   196  	CMP R4, R0
   197  	BHI not_found
   198  	MOVD.P 1(R0), R12 // R12 is reused as scratch; candidate[0:8], then R0 += 1
   199  	CMP R5, R12
   200  	BNE loop_25_32
   201  	MOVD 7(R0), R12 // R0 is already start+1, so this reads candidate[8:16]
   202  	CMP R6, R12
   203  	BNE loop_25_32
   204  	MOVD 15(R0), R12 // candidate[16:24]
   205  	CMP R7, R12
   206  	BNE loop_25_32
   207  	MOVD (R0)(R11), R12 // (start + 1) + (len(sep) - 9) = address of the candidate's last 8 bytes
   208  	CMP R10, R12 // compare the last 8 bytes
   209  	BNE loop_25_32
   210  	B found