github.com/primecitizens/pcz/std@v0.2.1/core/bytealg/index_amd64.s (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright 2023 The Prime Citizens
     3  // 
     4  // Copyright 2018 The Go Authors. All rights reserved.
     5  // Use of this source code is governed by a BSD-style
     6  // license that can be found in the LICENSE file.
     7  
     8  //go:build pcz && amd64
     9  
    10  #include "textflag.h"
    11  
    12  TEXT ·indexSlice(SB),NOSPLIT,$0-56
        	// Entry trampoline: move the stack arguments into the registers that
        	// indexbody<> expects, then tail-jump to it (indexbody<> stores the
        	// result and returns to our caller).
        	// Frame layout used here: a at 0(FP), b at 24(FP), result at 48(FP) —
        	// presumably func indexSlice(a, b []byte) int; the Go declaration is
        	// not part of this file.
    13  	LEAQ ret+48(FP), R11   // R11 = address of the result slot
    14  	MOVQ b_len+32(FP), AX  // AX  = length of b, the string being searched for
    15  	MOVQ b_base+24(FP), R8 // R8  = pointer to b
    16  	MOVQ a_len+8(FP), DX   // DX  = length of a, the string being searched in
    17  	MOVQ a_base+0(FP), DI  // DI  = pointer to a; indexbody<> advances it as a cursor
    18  	MOVQ DI, R10           // R10 = untouched copy of a's base, subtracted on success to form the index
    19  	JMP  indexbody<>(SB)
    20  
    21  TEXT ·index(SB),NOSPLIT,$0-40
        	// Entry trampoline: same register setup as ·indexSlice, but with the
        	// second argument at 16(FP) and the result at 32(FP) — presumably
        	// func index(a, b string) int; the Go declaration is not part of this
        	// file. Control is handed to indexbody<>, which stores the result and
        	// returns to our caller.
    22  	LEAQ ret+32(FP), R11   // R11 = address of the result slot
    23  	MOVQ b_len+24(FP), AX  // AX  = length of b, the string being searched for
    24  	MOVQ b_base+16(FP), R8 // R8  = pointer to b
    25  	MOVQ a_len+8(FP), DX   // DX  = length of a, the string being searched in
    26  	MOVQ a_base+0(FP), DI  // DI  = pointer to a; indexbody<> advances it as a cursor
    27  	MOVQ DI, R10           // R10 = untouched copy of a's base, subtracted on success to form the index
    28  	JMP  indexbody<>(SB)
    29  
    30  // indexbody<> is the shared search routine reached from ·indexSlice and ·index.
    31  // It scans the text (DI, DX) for the first position at which the pattern
    32  // (R8, AX) occurs and stores the result through R11: the byte offset of the
    33  // match relative to R10, or -1 when there is no match.
    34  //
    35  // Register contract on entry:
        // AX: length of string, that we are searching for
        // DX: length of string, in which we are searching
        // DI: pointer to string, in which we are searching
        // R8: pointer to string, that we are searching for
        // R10: copy of the original DI (set by the entry points; used at success)
        // R11: address, where to put return value
        // Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them
        //
        // The code dispatches on AX to a specialised loop per pattern length. Lengths
        // that are not a native load size (3, 5-7, 9-15, 17-31, 33 and up) are handled
        // by comparing two overlapping loads: one at the start of the pattern and one
        // ending at its last byte.
        // NOTE(review): the AX <= 2 path always compares two bytes, and the final
        // AVX2 path never checks an upper bound on AX; callers presumably guarantee
        // a suitable length range — confirm against the Go-side callers.
    36  TEXT indexbody<>(SB),NOSPLIT,$0
    37  	CMPQ AX, DX // pattern longer than text (unsigned) => no match possible
    38  	JA   fail
    39  	CMPQ DX, $16 // text of 16+ bytes: consider the PCMPESTRI path first
    40  	JAE  sse42
    41  no_sse42:
        	// --- pattern length <= 2: single 2-byte compare per position ---
    42  	CMPQ AX, $2
    43  	JA   _3_or_more
    44  	MOVW (R8), R8              // R8 = the 2 pattern bytes
    45  	LEAQ -1(DI)(DX*1), DX      // DX = DI + len - 1: one past the last start position
    46  loop2:
    47  	MOVW (DI), SI
    48  	CMPW SI,R8
    49  	JZ   success
    50  	ADDQ $1,DI                 // slide the window by one byte
    51  	CMPQ DI,DX
    52  	JB   loop2
    53  	JMP  fail
    54  _3_or_more:
        	// --- pattern length 3: two overlapping 2-byte compares (bytes 0-1, 1-2) ---
    55  	CMPQ AX, $3
    56  	JA   _4_or_more
    57  	MOVW 1(R8), BX             // BX = pattern bytes 1..2
    58  	MOVW (R8), R8              // R8 = pattern bytes 0..1
    59  	LEAQ -2(DI)(DX*1), DX      // DX = DI + len - 2: one past the last start position
    60  loop3:
    61  	MOVW (DI), SI
    62  	CMPW SI,R8
    63  	JZ   partial_success3
    64  	ADDQ $1,DI
    65  	CMPQ DI,DX
    66  	JB   loop3
    67  	JMP  fail
    68  partial_success3:
        	// first two bytes matched; check bytes 1..2 before declaring success
    69  	MOVW 1(DI), SI
    70  	CMPW SI,BX
    71  	JZ   success
    72  	ADDQ $1,DI
    73  	CMPQ DI,DX
    74  	JB   loop3
    75  	JMP  fail
    76  _4_or_more:
        	// --- pattern length 4: single 4-byte compare per position ---
    77  	CMPQ AX, $4
    78  	JA   _5_or_more
    79  	MOVL (R8), R8
    80  	LEAQ -3(DI)(DX*1), DX      // DX = DI + len - 3: one past the last start position
    81  loop4:
    82  	MOVL (DI), SI
    83  	CMPL SI,R8
    84  	JZ   success
    85  	ADDQ $1,DI
    86  	CMPQ DI,DX
    87  	JB   loop4
    88  	JMP  fail
    89  _5_or_more:
        	// --- pattern length 5..7: first 4 bytes, then last 4 bytes (overlapping) ---
    90  	CMPQ AX, $7
    91  	JA   _8_or_more
    92  	LEAQ 1(DI)(DX*1), DX
    93  	SUBQ AX, DX                // DX = DI + len(text) + 1 - len(pattern): end of start positions
    94  	MOVL -4(R8)(AX*1), BX      // BX = last 4 pattern bytes
    95  	MOVL (R8), R8              // R8 = first 4 pattern bytes
    96  loop5to7:
    97  	MOVL (DI), SI
    98  	CMPL SI,R8
    99  	JZ   partial_success5to7
   100  	ADDQ $1,DI
   101  	CMPQ DI,DX
   102  	JB   loop5to7
   103  	JMP  fail
   104  partial_success5to7:
        	// head matched; compare the 4 bytes ending at DI + AX
   105  	MOVL -4(AX)(DI*1), SI
   106  	CMPL SI,BX
   107  	JZ   success
   108  	ADDQ $1,DI
   109  	CMPQ DI,DX
   110  	JB   loop5to7
   111  	JMP  fail
   112  _8_or_more:
        	// --- pattern length 8: single 8-byte compare per position ---
   113  	CMPQ AX, $8
   114  	JA   _9_or_more
   115  	MOVQ (R8), R8
   116  	LEAQ -7(DI)(DX*1), DX      // DX = DI + len - 7: one past the last start position
   117  loop8:
   118  	MOVQ (DI), SI
   119  	CMPQ SI,R8
   120  	JZ   success
   121  	ADDQ $1,DI
   122  	CMPQ DI,DX
   123  	JB   loop8
   124  	JMP  fail
   125  _9_or_more:
        	// --- pattern length 9..15: first 8 bytes, then last 8 bytes (overlapping) ---
        	// Also the entry point from the sse42 check when AX >= 12.
   126  	CMPQ AX, $15
   127  	JA   _16_or_more
   128  	LEAQ 1(DI)(DX*1), DX
   129  	SUBQ AX, DX                // DX = DI + len(text) + 1 - len(pattern)
   130  	MOVQ -8(R8)(AX*1), BX      // BX = last 8 pattern bytes
   131  	MOVQ (R8), R8              // R8 = first 8 pattern bytes
   132  loop9to15:
   133  	MOVQ (DI), SI
   134  	CMPQ SI,R8
   135  	JZ   partial_success9to15
   136  	ADDQ $1,DI
   137  	CMPQ DI,DX
   138  	JB   loop9to15
   139  	JMP  fail
   140  partial_success9to15:
        	// head matched; compare the 8 bytes ending at DI + AX
   141  	MOVQ -8(AX)(DI*1), SI
   142  	CMPQ SI,BX
   143  	JZ   success
   144  	ADDQ $1,DI
   145  	CMPQ DI,DX
   146  	JB   loop9to15
   147  	JMP  fail
   148  _16_or_more:
        	// --- pattern length 16: one 16-byte SSE compare per position ---
   149  	CMPQ  AX, $16
   150  	JA    _17_or_more
   151  	MOVOU (R8), X1             // X1 = the 16 pattern bytes
   152  	LEAQ  -15(DI)(DX*1), DX    // DX = DI + len - 15: one past the last start position
   153  loop16:
   154  	MOVOU    (DI), X2
   155  	PCMPEQB  X1, X2            // per-byte equality mask
   156  	PMOVMSKB X2, SI            // one bit per byte
   157  	CMPQ     SI, $0xffff       // all 16 bits set => all 16 bytes equal
   158  	JE       success
   159  	ADDQ     $1,DI
   160  	CMPQ     DI,DX
   161  	JB       loop16
   162  	JMP      fail
   163  _17_or_more:
        	// --- pattern length 17..31: first 16 bytes, then last 16 bytes (overlapping) ---
   164  	CMPQ  AX, $31
   165  	JA    _32_or_more
   166  	LEAQ  1(DI)(DX*1), DX
   167  	SUBQ  AX, DX               // DX = DI + len(text) + 1 - len(pattern)
   168  	MOVOU -16(R8)(AX*1), X0    // X0 = last 16 pattern bytes
   169  	MOVOU (R8), X1             // X1 = first 16 pattern bytes
   170  loop17to31:
   171  	MOVOU    (DI), X2
   172  	PCMPEQB  X1,X2
   173  	PMOVMSKB X2, SI
   174  	CMPQ     SI, $0xffff
   175  	JE       partial_success17to31
   176  	ADDQ     $1,DI
   177  	CMPQ     DI,DX
   178  	JB       loop17to31
   179  	JMP      fail
   180  partial_success17to31:
        	// head matched; compare the 16 bytes ending at DI + AX
   181  	MOVOU    -16(AX)(DI*1), X3
   182  	PCMPEQB  X0, X3
   183  	PMOVMSKB X3, SI
   184  	CMPQ     SI, $0xffff
   185  	JE       success
   186  	ADDQ     $1,DI
   187  	CMPQ     DI,DX
   188  	JB       loop17to31
   189  	JMP      fail
   190  // We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
   191  // So no need to check cpuid
        // NOTE(review): nothing below verifies AVX2 support or an upper bound on AX;
        // both are assumed to be enforced by the caller — confirm on the Go side.
   192  _32_or_more:
        	// --- pattern length 32: one 32-byte AVX2 compare per position ---
   193  	CMPQ    AX, $32
   194  	JA      _33_to_63
   195  	VMOVDQU (R8), Y1           // Y1 = the 32 pattern bytes
   196  	LEAQ    -31(DI)(DX*1), DX  // DX = DI + len - 31: one past the last start position
   197  loop32:
   198  	VMOVDQU   (DI), Y2
   199  	VPCMPEQB  Y1, Y2, Y3
   200  	VPMOVMSKB Y3, SI
   201  	CMPL      SI, $0xffffffff  // all 32 bits set => all 32 bytes equal
   202  	JE        success_avx2
   203  	ADDQ      $1,DI
   204  	CMPQ      DI,DX
   205  	JB        loop32
   206  	JMP       fail_avx2
   207  _33_to_63:
        	// --- pattern length 33 and up: first 32 bytes, then last 32 bytes ---
        	// Only these two 32-byte windows are compared, so every pattern byte is
        	// covered only while AX <= 64.
   208  	LEAQ    1(DI)(DX*1), DX
   209  	SUBQ    AX, DX             // DX = DI + len(text) + 1 - len(pattern)
   210  	VMOVDQU -32(R8)(AX*1), Y0  // Y0 = last 32 pattern bytes
   211  	VMOVDQU (R8), Y1           // Y1 = first 32 pattern bytes
   212  loop33to63:
   213  	VMOVDQU   (DI), Y2
   214  	VPCMPEQB  Y1, Y2, Y3
   215  	VPMOVMSKB Y3, SI
   216  	CMPL      SI, $0xffffffff
   217  	JE        partial_success33to63
   218  	ADDQ      $1,DI
   219  	CMPQ      DI,DX
   220  	JB        loop33to63
   221  	JMP       fail_avx2
   222  partial_success33to63:
        	// head matched; compare the 32 bytes ending at DI + AX
   223  	VMOVDQU   -32(AX)(DI*1), Y3
   224  	VPCMPEQB  Y0, Y3, Y4
   225  	VPMOVMSKB Y4, SI
   226  	CMPL      SI, $0xffffffff
   227  	JE        success_avx2
   228  	ADDQ      $1,DI
   229  	CMPQ      DI,DX
   230  	JB        loop33to63
        	// loop exhausted: fall through into fail_avx2
   231  fail_avx2:
   232  	VZEROUPPER                 // clear upper YMM halves before leaving the AVX2 paths
        	// falls through into fail
   233  fail:
   234  	MOVQ $-1, (R11)            // result = -1: pattern not found
   235  	RET
   236  success_avx2:
   237  	VZEROUPPER                 // clear upper YMM halves, then share the common success exit
   238  	JMP        success
   239  sse42:
        	// Reached when the text is at least 16 bytes long (DX >= 16).
   240  	CMPB ·hasSSE42(SB), $1     // without the hasSSE42 flag, use the plain paths above
   241  	JNE  no_sse42
   242  	CMPQ AX, $12
   243  	// PCMPESTRI is slower than normal compare,
   244  	// so using it makes sense only if we advance 4+ bytes per compare
   245  	// This value was determined experimentally and is the ~same
   246  	// on Nehalem (first with SSE42) and Haswell.
   247  	JAE   _9_or_more           // pattern length >= 12: use the plain compare paths
        	// AX < 12 from here on, but the MOVOU below still loads 16 bytes from R8.
        	// Skip the PCMPESTRI path when bits 4..11 of R8+16 are all zero, i.e. R8
        	// lies within the last 16 bytes before a 4096-byte boundary — presumably
        	// so the over-long load cannot touch the following page.
   248  	LEAQ  16(R8), SI
   249  	TESTW $0xff0, SI
   250  	JEQ   no_sse42
   251  	MOVOU (R8), X1             // X1 = pattern (only the first AX bytes are significant)
   252  	LEAQ  -15(DI)(DX*1), SI    // SI = DI + len - 15: one past the last 16-byte window start
   253  	MOVQ  $16, R9
   254  	SUBQ  AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
   255  loop_sse42:
   256  	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
   257  	// for equality (bits 2,3 are 11)
   258  	// result is not masked or inverted (bits 4,5 are 00)
   259  	// and corresponds to first matching byte (bit 6 is 0)
   260  	PCMPESTRI $0x0c, (DI), X1
   261  	// CX == 16 means no match,
   262  	// CX > R9 means partial match at the end of the string,
   263  	// otherwise sep starts at offset CX within the 16-byte window at (DI)
   264  	CMPQ      CX, R9
   265  	JBE       sse42_success
   266  	ADDQ      R9, DI           // advance by 16 - len(sep); a match straddling the window end is re-examined
   267  	CMPQ      DI, SI
   268  	JB        loop_sse42
        	// Tail: one last compare on the final 16 bytes of the text (SI - 1).
   269  	PCMPESTRI $0x0c, -1(SI), X1
   270  	CMPQ      CX, R9
   271  	JA        fail
   272  	LEAQ      -1(SI), DI       // DI = start of the final window
   273  sse42_success:
   274  	ADDQ CX, DI                // DI = address of the match inside the window
   275  success:
   276  	SUBQ R10, DI               // convert match address to an offset from the text base
   277  	MOVQ DI, (R11)             // store the index as the result
   278  	RET