// SPDX-License-Identifier: Apache-2.0
// Copyright 2023 The Prime Citizens
//
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build pcz && amd64

#include "textflag.h"

// ·indexSlice loads its arguments from a frame laid out as two 24-byte
// values (base/len at offsets 0/8 and 24/32) with an 8-byte result at
// offset 48, then tail-jumps into indexbody.
// NOTE(review): presumably declared in Go as
// func indexSlice(a, b []byte) int -- confirm against the Go declaration.
TEXT ·indexSlice(SB),NOSPLIT,$0-56
	MOVQ	a_base+0(FP), DI
	MOVQ	a_len+8(FP), DX
	MOVQ	b_base+24(FP), R8
	MOVQ	b_len+32(FP), AX
	MOVQ	DI, R10			// remember the start of a; success subtracts it
	LEAQ	ret+48(FP), R11		// indexbody stores the result through R11
	JMP	indexbody<>(SB)

// ·index is the same entry stub for a frame laid out as two 16-byte
// values (base/len at offsets 0/8 and 16/24) with an 8-byte result at
// offset 32.
// NOTE(review): presumably declared in Go as
// func index(a, b string) int -- confirm against the Go declaration.
TEXT ·index(SB),NOSPLIT,$0-40
	MOVQ	a_base+0(FP), DI
	MOVQ	a_len+8(FP), DX
	MOVQ	b_base+16(FP), R8
	MOVQ	b_len+24(FP), AX
	MOVQ	DI, R10			// remember the start of a; success subtracts it
	LEAQ	ret+32(FP), R11		// indexbody stores the result through R11
	JMP	indexbody<>(SB)

// indexbody searches for the first occurrence of the needle inside the
// haystack and stores the result through R11: the byte offset of the
// match relative to R10, or -1 when there is none.
//
// Register contract on entry:
//   AX:  length of the needle (the string we are searching for)
//   DX:  length of the haystack (the string we are searching in)
//   DI:  pointer to the haystack
//   R8:  pointer to the needle
//   R10: copy of the original haystack pointer (set by the entry stubs)
//   R11: address where the return value is written
// The lengths live in DX and AX because PCMPESTRI implicitly consumes them.
//
// The body dispatches on the needle length. Exact-size cases (2, 4, 8, 16,
// 32 bytes) compare one load per candidate position; ranged cases compare
// the first and the last k bytes of the candidate window, which overlap.
// In every loop DX is rewritten to point one past the last valid start.
//
// NOTE(review): needle lengths 0 and 1 would take the 2-byte path below;
// presumably callers guarantee len(needle) >= 2 -- confirm.
TEXT indexbody<>(SB),NOSPLIT,$0
	CMPQ	AX, DX
	JA	fail			// needle longer than haystack
	CMPQ	DX, $16
	JAE	sse42			// haystack has at least 16 bytes: consider PCMPESTRI

no_sse42:
	CMPQ	AX, $2
	JA	_3_or_more
	MOVW	(R8), R8		// the whole needle as one 16-bit word
	LEAQ	-1(DI)(DX*1), DX	// DX = haystack + len - 1
loop2:
	MOVW	(DI), SI
	CMPW	SI, R8
	JEQ	success
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop2
	JMP	fail

_3_or_more:
	CMPQ	AX, $3
	JA	_4_or_more
	MOVW	1(R8), BX		// needle bytes 1..2
	MOVW	(R8), R8		// needle bytes 0..1
	LEAQ	-2(DI)(DX*1), DX	// DX = haystack + len - 2
loop3:
	MOVW	(DI), SI
	CMPW	SI, R8
	JEQ	partial_success3
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop3
	JMP	fail
partial_success3:
	MOVW	1(DI), SI		// first word matched; check the overlapping second word
	CMPW	SI, BX
	JEQ	success
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop3
	JMP	fail

_4_or_more:
	CMPQ	AX, $4
	JA	_5_or_more
	MOVL	(R8), R8		// the whole needle as one 32-bit word
	LEAQ	-3(DI)(DX*1), DX	// DX = haystack + len - 3
loop4:
	MOVL	(DI), SI
	CMPL	SI, R8
	JEQ	success
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop4
	JMP	fail

_5_or_more:
	CMPQ	AX, $7
	JA	_8_or_more
	LEAQ	1(DI)(DX*1), DX
	SUBQ	AX, DX			// DX = haystack + len(haystack) - len(needle) + 1
	MOVL	-4(R8)(AX*1), BX	// last 4 bytes of the needle
	MOVL	(R8), R8		// first 4 bytes of the needle
loop5to7:
	MOVL	(DI), SI
	CMPL	SI, R8
	JEQ	partial_success5to7
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop5to7
	JMP	fail
partial_success5to7:
	MOVL	-4(AX)(DI*1), SI	// last 4 bytes of the candidate window
	CMPL	SI, BX
	JEQ	success
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop5to7
	JMP	fail

_8_or_more:
	CMPQ	AX, $8
	JA	_9_or_more
	MOVQ	(R8), R8		// the whole needle as one 64-bit word
	LEAQ	-7(DI)(DX*1), DX	// DX = haystack + len - 7
loop8:
	MOVQ	(DI), SI
	CMPQ	SI, R8
	JEQ	success
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop8
	JMP	fail

_9_or_more:
	CMPQ	AX, $15
	JA	_16_or_more
	LEAQ	1(DI)(DX*1), DX
	SUBQ	AX, DX			// DX = haystack + len(haystack) - len(needle) + 1
	MOVQ	-8(R8)(AX*1), BX	// last 8 bytes of the needle
	MOVQ	(R8), R8		// first 8 bytes of the needle
loop9to15:
	MOVQ	(DI), SI
	CMPQ	SI, R8
	JEQ	partial_success9to15
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop9to15
	JMP	fail
partial_success9to15:
	MOVQ	-8(AX)(DI*1), SI	// last 8 bytes of the candidate window
	CMPQ	SI, BX
	JEQ	success
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop9to15
	JMP	fail

_16_or_more:
	CMPQ	AX, $16
	JA	_17_or_more
	MOVOU	(R8), X1		// the whole needle in one 16-byte vector
	LEAQ	-15(DI)(DX*1), DX	// DX = haystack + len - 15
loop16:
	MOVOU	(DI), X2
	PCMPEQB	X1, X2
	PMOVMSKB	X2, SI
	CMPQ	SI, $0xffff		// all 16 byte lanes compared equal
	JEQ	success
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop16
	JMP	fail

_17_or_more:
	CMPQ	AX, $31
	JA	_32_or_more
	LEAQ	1(DI)(DX*1), DX
	SUBQ	AX, DX			// DX = haystack + len(haystack) - len(needle) + 1
	MOVOU	-16(R8)(AX*1), X0	// last 16 bytes of the needle
	MOVOU	(R8), X1		// first 16 bytes of the needle
loop17to31:
	MOVOU	(DI), X2
	PCMPEQB	X1, X2
	PMOVMSKB	X2, SI
	CMPQ	SI, $0xffff
	JEQ	partial_success17to31
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop17to31
	JMP	fail
partial_success17to31:
	MOVOU	-16(AX)(DI*1), X3	// last 16 bytes of the candidate window
	PCMPEQB	X0, X3
	PMOVMSKB	X3, SI
	CMPQ	SI, $0xffff
	JEQ	success
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop17to31
	JMP	fail

// We can get here only when AVX2 is enabled and the cutoff for
// indexShortStr is set to 63, so there is no need to check cpuid.
_32_or_more:
	CMPQ	AX, $32
	JA	_33_to_63
	VMOVDQU	(R8), Y1		// the whole needle in one 32-byte vector
	LEAQ	-31(DI)(DX*1), DX	// DX = haystack + len - 31
loop32:
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPMOVMSKB	Y3, SI
	CMPL	SI, $0xffffffff		// all 32 byte lanes compared equal
	JEQ	success_avx2
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop32
	JMP	fail_avx2

_33_to_63:
	LEAQ	1(DI)(DX*1), DX
	SUBQ	AX, DX			// DX = haystack + len(haystack) - len(needle) + 1
	VMOVDQU	-32(R8)(AX*1), Y0	// last 32 bytes of the needle
	VMOVDQU	(R8), Y1		// first 32 bytes of the needle
loop33to63:
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPMOVMSKB	Y3, SI
	CMPL	SI, $0xffffffff
	JEQ	partial_success33to63
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop33to63
	JMP	fail_avx2
partial_success33to63:
	VMOVDQU	-32(AX)(DI*1), Y3	// last 32 bytes of the candidate window
	VPCMPEQB	Y0, Y3, Y4
	VPMOVMSKB	Y4, SI
	CMPL	SI, $0xffffffff
	JEQ	success_avx2
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop33to63
	// Haystack exhausted: fall through into fail_avx2.

fail_avx2:
	VZEROUPPER
fail:
	MOVQ	$-1, (R11)
	RET

success_avx2:
	VZEROUPPER
	JMP	success

sse42:
	CMPB	·hasSSE42(SB), $1
	JNE	no_sse42
	// PCMPESTRI is slower than normal compare,
	// so using it makes sense only if we advance 4+ bytes per compare.
	// This value was determined experimentally and is the ~same
	// on Nehalem (first with SSE42) and Haswell.
	CMPQ	AX, $12
	JAE	_9_or_more
	// Fall back to the plain loops when bits 4..11 of (needle+16) are all
	// zero. NOTE(review): presumably this keeps the 16-byte load of the
	// needle below from running across a 4 KiB page boundary -- confirm.
	LEAQ	16(R8), SI
	TESTW	$0xff0, SI
	JEQ	no_sse42
	MOVOU	(R8), X1
	LEAQ	-15(DI)(DX*1), SI	// SI = haystack + len - 15
	MOVQ	$16, R9
	SUBQ	AX, R9			// We advance by 16-len(sep) each iteration, so precalculate it into R9
loop_sse42:
	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
	// for equality (bits 2,3 are 11)
	// result is not masked or inverted (bits 4,5 are 00)
	// and corresponds to first matching byte (bit 6 is 0)
	PCMPESTRI	$0x0c, (DI), X1
	// CX == 16 means no match,
	// CX > R9 means partial match at the end of the string,
	// otherwise sep is at offset CX from X1 start
	CMPQ	CX, R9
	JBE	sse42_success
	ADDQ	R9, DI
	CMPQ	DI, SI
	JB	loop_sse42
	// One last probe over the 16 bytes starting at SI-1.
	PCMPESTRI	$0x0c, -1(SI), X1
	CMPQ	CX, R9
	JA	fail
	LEAQ	-1(SI), DI
sse42_success:
	ADDQ	CX, DI			// DI = address of the match
success:
	SUBQ	R10, DI			// convert the address into an offset from the haystack start
	MOVQ	DI, (R11)
	RET