// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Provenance: github.com/SandwichDev/go-internals@v0.0.0-20210605002614-12311ac6b2c5/bytealg/index_amd64.s

#include "go_asm.h"
#include "textflag.h"

// ·Index is the entry point for searching one byte sequence inside another.
//
// Frame layout ($0-56): a_base+0, a_len+8, b_base+24, b_len+32, ret+48.
// The 24-byte stride between a and b matches two slice headers
// (base, len, cap) followed by a word-sized result.
// NOTE(review): presumably declared in Go as func Index(a, b []byte) int;
// confirm against the package's Go stub.
//
// It only loads the registers that indexbody<> expects and tail-jumps to it;
// indexbody<> stores the result through R11 and returns to Index's caller.
TEXT ·Index(SB),NOSPLIT,$0-56
	MOVQ a_base+0(FP), DI   // DI = pointer to the data being searched
	MOVQ a_len+8(FP), DX    // DX = length of the data being searched
	MOVQ b_base+24(FP), R8  // R8 = pointer to the pattern
	MOVQ b_len+32(FP), AX   // AX = length of the pattern
	MOVQ DI, R10            // R10 = start of the data, kept to turn a match pointer into an index
	LEAQ ret+48(FP), R11    // R11 = address of the result slot
	JMP  indexbody<>(SB)

// ·IndexString is the same entry point for arguments laid out as two-word
// (base, len) pairs.
//
// Frame layout ($0-40): a_base+0, a_len+8, b_base+16, b_len+24, ret+32.
// NOTE(review): presumably declared in Go as func IndexString(a, b string) int;
// confirm against the package's Go stub.
TEXT ·IndexString(SB),NOSPLIT,$0-40
	MOVQ a_base+0(FP), DI   // DI = pointer to the data being searched
	MOVQ a_len+8(FP), DX    // DX = length of the data being searched
	MOVQ b_base+16(FP), R8  // R8 = pointer to the pattern
	MOVQ b_len+24(FP), AX   // AX = length of the pattern
	MOVQ DI, R10            // R10 = start of the data, kept to turn a match pointer into an index
	LEAQ ret+32(FP), R11    // R11 = address of the result slot
	JMP  indexbody<>(SB)

// indexbody<> is the shared search routine. Register contract on entry:
//
// AX: length of the string that we are searching for (the pattern)
// DX: length of the string in which we are searching (the text)
// DI: pointer to the string in which we are searching
// R8: pointer to the string that we are searching for
// R10: pointer to the start of the text (same as DI on entry); used at
//      `success` to convert the matching address into an index
// R11: address where the return value is stored
// Note: We want the lengths in DX and AX because PCMPESTRI implicitly consumes them.
//
// Result: the byte offset of the first match is written to (R11), or -1 if
// there is none.
//
// The routine dispatches on the pattern length. Every size class sets DX to
// "one past the last valid start address" and then slides DI forward one byte
// at a time. For lengths that are not exactly 2/4/8/16/32 the pattern is
// checked with two overlapping fixed-width compares: the first W bytes and the
// last W bytes.
//
// NOTE(review): the <=2 path always compares two bytes, so callers presumably
// guarantee a pattern length of at least 2 — confirm against the caller.
TEXT indexbody<>(SB),NOSPLIT,$0
	CMPQ AX, DX
	JA   fail               // pattern longer than text: cannot match
	CMPQ DX, $16
	JAE  sse42              // text has at least one full 16-byte window: consider PCMPESTRI
no_sse42:
	// ---- pattern length <= 2: one 16-bit compare per position ----
	CMPQ AX, $2
	JA   _3_or_more
	MOVW (R8), R8           // R8 = the two pattern bytes
	LEAQ -1(DI)(DX*1), DX   // DX = text+len-1 = one past the last 2-byte start
loop2:
	MOVW (DI), SI
	CMPW SI,R8
	JZ   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB   loop2
	JMP  fail
_3_or_more:
	// ---- pattern length 3: bytes [0,2) and [1,3) as two 16-bit compares ----
	CMPQ AX, $3
	JA   _4_or_more
	MOVW 1(R8), BX          // BX = pattern bytes 1..2
	MOVW (R8), R8           // R8 = pattern bytes 0..1
	LEAQ -2(DI)(DX*1), DX   // DX = one past the last 3-byte start
loop3:
	MOVW (DI), SI
	CMPW SI,R8
	JZ   partial_success3
	ADDQ $1,DI
	CMPQ DI,DX
	JB   loop3
	JMP  fail
partial_success3:
	// First two bytes matched; verify the overlapping last two.
	MOVW 1(DI), SI
	CMPW SI,BX
	JZ   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB   loop3
	JMP  fail
_4_or_more:
	// ---- pattern length 4: one 32-bit compare per position ----
	CMPQ AX, $4
	JA   _5_or_more
	MOVL (R8), R8
	LEAQ -3(DI)(DX*1), DX   // DX = one past the last 4-byte start
loop4:
	MOVL (DI), SI
	CMPL SI,R8
	JZ   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB   loop4
	JMP  fail
_5_or_more:
	// ---- pattern length 5..7: first 4 bytes + last 4 bytes (overlapping) ----
	CMPQ AX, $7
	JA   _8_or_more
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX             // DX = text+len+1-patlen = one past the last valid start
	MOVL -4(R8)(AX*1), BX   // BX = last 4 pattern bytes
	MOVL (R8), R8           // R8 = first 4 pattern bytes
loop5to7:
	MOVL (DI), SI
	CMPL SI,R8
	JZ   partial_success5to7
	ADDQ $1,DI
	CMPQ DI,DX
	JB   loop5to7
	JMP  fail
partial_success5to7:
	// Head matched; compare the 4 bytes ending at DI+patlen.
	MOVL -4(AX)(DI*1), SI
	CMPL SI,BX
	JZ   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB   loop5to7
	JMP  fail
_8_or_more:
	// ---- pattern length 8: one 64-bit compare per position ----
	CMPQ AX, $8
	JA   _9_or_more
	MOVQ (R8), R8
	LEAQ -7(DI)(DX*1), DX   // DX = one past the last 8-byte start
loop8:
	MOVQ (DI), SI
	CMPQ SI,R8
	JZ   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB   loop8
	JMP  fail
_9_or_more:
	// ---- pattern length 9..15: first 8 bytes + last 8 bytes (overlapping) ----
	// Also the target of the sse42 path when the pattern is 12 bytes or longer.
	CMPQ AX, $15
	JA   _16_or_more
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX             // DX = one past the last valid start
	MOVQ -8(R8)(AX*1), BX   // BX = last 8 pattern bytes
	MOVQ (R8), R8           // R8 = first 8 pattern bytes
loop9to15:
	MOVQ (DI), SI
	CMPQ SI,R8
	JZ   partial_success9to15
	ADDQ $1,DI
	CMPQ DI,DX
	JB   loop9to15
	JMP  fail
partial_success9to15:
	// Head matched; compare the 8 bytes ending at DI+patlen.
	MOVQ -8(AX)(DI*1), SI
	CMPQ SI,BX
	JZ   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB   loop9to15
	JMP  fail
_16_or_more:
	// ---- pattern length 16: one 128-bit compare per position ----
	CMPQ AX, $16
	JA   _17_or_more
	MOVOU (R8), X1
	LEAQ -15(DI)(DX*1), DX  // DX = one past the last 16-byte start
loop16:
	MOVOU (DI), X2
	PCMPEQB X1, X2          // per-byte equality mask
	PMOVMSKB X2, SI         // collapse to 16 bits, one per byte
	CMPQ SI, $0xffff        // all 16 bytes equal?
	JE   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB   loop16
	JMP  fail
_17_or_more:
	// ---- pattern length 17..31: first 16 bytes + last 16 bytes (overlapping) ----
	CMPQ AX, $31
	JA   _32_or_more
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX             // DX = one past the last valid start
	MOVOU -16(R8)(AX*1), X0 // X0 = last 16 pattern bytes
	MOVOU (R8), X1          // X1 = first 16 pattern bytes
loop17to31:
	MOVOU (DI), X2
	PCMPEQB X1,X2
	PMOVMSKB X2, SI
	CMPQ SI, $0xffff
	JE   partial_success17to31
	ADDQ $1,DI
	CMPQ DI,DX
	JB   loop17to31
	JMP  fail
partial_success17to31:
	// Head matched; compare the 16 bytes ending at DI+patlen.
	MOVOU -16(AX)(DI*1), X3
	PCMPEQB X0, X3
	PMOVMSKB X3, SI
	CMPQ SI, $0xffff
	JE   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB   loop17to31
	JMP  fail
// We can get here only when AVX2 is enabled and the cutoff for indexShortStr is set to 63,
// so there is no need to check cpuid.
// The AVX2 paths leave through fail_avx2 / success_avx2, which execute
// VZEROUPPER before storing the result.
_32_or_more:
	// ---- pattern length 32: one 256-bit compare per position ----
	CMPQ AX, $32
	JA   _33_to_63
	VMOVDQU (R8), Y1
	LEAQ -31(DI)(DX*1), DX  // DX = one past the last 32-byte start
loop32:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPMOVMSKB Y3, SI        // 32-bit mask, one bit per byte
	CMPL SI, $0xffffffff    // all 32 bytes equal?
	JE   success_avx2
	ADDQ $1,DI
	CMPQ DI,DX
	JB   loop32
	JMP  fail_avx2
_33_to_63:
	// ---- pattern length 33 and up: first 32 bytes + last 32 bytes (overlapping) ----
	// There is no upper-bound check here. Two 32-byte windows cover at most
	// 64 bytes, so this path relies on the cutoff mentioned above.
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX             // DX = one past the last valid start
	VMOVDQU -32(R8)(AX*1), Y0 // Y0 = last 32 pattern bytes
	VMOVDQU (R8), Y1        // Y1 = first 32 pattern bytes
loop33to63:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPMOVMSKB Y3, SI
	CMPL SI, $0xffffffff
	JE   partial_success33to63
	ADDQ $1,DI
	CMPQ DI,DX
	JB   loop33to63
	JMP  fail_avx2
partial_success33to63:
	// Head matched; compare the 32 bytes ending at DI+patlen.
	VMOVDQU -32(AX)(DI*1), Y3
	VPCMPEQB Y0, Y3, Y4
	VPMOVMSKB Y4, SI
	CMPL SI, $0xffffffff
	JE   success_avx2
	ADDQ $1,DI
	CMPQ DI,DX
	JB   loop33to63
	// Search exhausted: fall through into fail_avx2.
fail_avx2:
	VZEROUPPER
	// Falls through into fail.
fail:
	MOVQ $-1, (R11)         // no match: result = -1
	RET
success_avx2:
	VZEROUPPER
	JMP  success
sse42:
	// Reached when the text is at least 16 bytes long.
	CMPB internal∕cpu·X86+const_offsetX86HasSSE42(SB), $1
	JNE  no_sse42           // feature flag not set: use the generic paths
	CMPQ AX, $12
	// PCMPESTRI is slower than a normal compare,
	// so using it makes sense only if we advance 4+ bytes per compare.
	// This value was determined experimentally and is the ~same
	// on Nehalem (first with SSE42) and Haswell.
	JAE  _9_or_more
	// The pattern is shorter than 12 bytes, but MOVOU below reads 16 bytes
	// from R8. If bits 4..11 of R8+16 are all zero, that read would reach a
	// 4096-byte boundary, so take the generic path instead.
	// NOTE(review): presumably this guards against touching an unmapped
	// page past the pattern — confirm.
	LEAQ 16(R8), SI
	TESTW $0xff0, SI
	JEQ  no_sse42
	MOVOU (R8), X1          // X1 = pattern (only the first AX bytes are significant)
	LEAQ -15(DI)(DX*1), SI  // SI = one past the last address where a full 16-byte window fits
	MOVQ $16, R9
	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
loop_sse42:
	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
	// for equality (bits 2,3 are 11)
	// result is not masked or inverted (bits 4,5 are 00)
	// and corresponds to first matching byte (bit 6 is 0)
	// AX and DX are the implicit length operands. DX is not updated as DI
	// advances; it is >= 16 on this path.
	PCMPESTRI $0x0c, (DI), X1
	// CX == 16 means no match,
	// CX > R9 means partial match at the end of the string,
	// otherwise sep is at offset CX from X1 start
	CMPQ CX, R9
	JBE  sse42_success
	ADDQ R9, DI
	CMPQ DI, SI
	JB   loop_sse42
	// Final window: the last 16 bytes of the text, starting at SI-1.
	PCMPESTRI $0x0c, -1(SI), X1
	CMPQ CX, R9
	JA   fail
	LEAQ -1(SI), DI         // DI = start of that final window
sse42_success:
	ADDQ CX, DI             // DI = address of the match inside the window
success:
	SUBQ R10, DI            // convert the match address into an offset from the text start
	MOVQ DI, (R11)
	RET