// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Listing origin: github.com/bir3/gocompiler@v0.9.2202/src/internal/bytealg/index_amd64.s

#include "go_asm.h"
#include "textflag.h"

// ·Index is the entry stub for the variant whose second operand lives at
// offsets +24/+32 and whose result slot is at +48 (56 bytes of arguments
// and results, no local frame).
// NOTE(review): the 8-byte gap after each len suggests a and b are slices
// (base, len, cap) - confirm against the Go declaration.
//
// It loads the operands into the registers expected by indexbody<> and
// tail-jumps there; indexbody<> writes the result through R11 and returns
// directly to the caller.
TEXT ·Index(SB),NOSPLIT,$0-56
	MOVQ a_base+0(FP), DI	// DI = start of the data being searched
	MOVQ a_len+8(FP), DX	// DX = its length
	MOVQ b_base+24(FP), R8	// R8 = start of the pattern
	MOVQ b_len+32(FP), AX	// AX = pattern length
	MOVQ DI, R10	// R10 keeps the original start; used to turn the match pointer into an offset
	LEAQ ret+48(FP), R11	// R11 = address of the result slot
	JMP indexbody<>(SB)

// ·IndexString is the entry stub for the variant whose second operand lives
// at offsets +16/+24 and whose result slot is at +32 (40 bytes of arguments
// and results, no local frame).
// NOTE(review): the layout suggests a and b are strings (base, len) -
// confirm against the Go declaration.
//
// Register setup is identical to ·Index; only the frame offsets differ.
TEXT ·IndexString(SB),NOSPLIT,$0-40
	MOVQ a_base+0(FP), DI	// DI = start of the data being searched
	MOVQ a_len+8(FP), DX	// DX = its length
	MOVQ b_base+16(FP), R8	// R8 = start of the pattern
	MOVQ b_len+24(FP), AX	// AX = pattern length
	MOVQ DI, R10	// R10 keeps the original start; used to turn the match pointer into an offset
	LEAQ ret+32(FP), R11	// R11 = address of the result slot
	JMP indexbody<>(SB)

// indexbody<> finds the first occurrence of the pattern in the searched data
// and stores the result through R11: the byte offset of the match (match
// pointer minus R10), or -1 when there is no match.
//
// Inputs:
// AX: length of string, that we are searching for
// DX: length of string, in which we are searching
// DI: pointer to string, in which we are searching
// R8: pointer to string, that we are searching for
// R10: copy of the initial DI (set by the entry stubs), read at success
// R11: address, where to put return value
// Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them
//
// Registers written here: BX, CX (by PCMPESTRI), DX, SI, DI, R8, R9,
// X0-X3 and, on the AVX2 paths, Y0-Y4.
//
// Structure: the code dispatches on the pattern length (AX) to a loop
// specialised for that size class. Every loop advances DI one byte at a time
// and stops when DI reaches a precomputed end pointer (one past the last
// position where the pattern could still start). Size classes that are not a
// power of two compare the first and the last N bytes of the pattern as two
// overlapping loads.
// NOTE(review): every AX <= 2 takes the 2-byte path, so callers presumably
// guarantee a pattern length of at least 2 - confirm.
TEXT indexbody<>(SB),NOSPLIT,$0
	// A pattern longer than the searched data can never match.
	CMPQ AX, DX
	JA fail
	// With at least 16 bytes of data, try the PCMPESTRI-based path first.
	CMPQ DX, $16
	JAE sse42
no_sse42:
	// ---- pattern length <= 2: compare one 16-bit word per position ----
	CMPQ AX, $2
	JA _3_or_more
	MOVW (R8), R8	// R8 now holds the pattern bytes themselves, not the pointer
	LEAQ -1(DI)(DX*1), DX	// DX = end pointer: start + len - 1
	PCALIGN $16	// align the loop head to a 16-byte boundary
loop2:
	MOVW (DI), SI
	CMPW SI,R8
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop2
	JMP fail
_3_or_more:
	// ---- pattern length 3: bytes 0-1 in R8, bytes 1-2 in BX ----
	CMPQ AX, $3
	JA _4_or_more
	MOVW 1(R8), BX
	MOVW (R8), R8
	LEAQ -2(DI)(DX*1), DX	// DX = end pointer: start + len - 2
loop3:
	MOVW (DI), SI
	CMPW SI,R8
	JZ partial_success3
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop3
	JMP fail
partial_success3:
	// First two bytes matched; check the overlapping word at offset 1.
	MOVW 1(DI), SI
	CMPW SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop3
	JMP fail
_4_or_more:
	// ---- pattern length 4: one 32-bit compare per position ----
	CMPQ AX, $4
	JA _5_or_more
	MOVL (R8), R8
	LEAQ -3(DI)(DX*1), DX	// DX = end pointer: start + len - 3
loop4:
	MOVL (DI), SI
	CMPL SI,R8
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop4
	JMP fail
_5_or_more:
	// ---- pattern length 5..7: first 4 bytes in R8, last 4 bytes in BX ----
	CMPQ AX, $7
	JA _8_or_more
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX	// DX = end pointer: start + len + 1 - len(pattern)
	MOVL -4(R8)(AX*1), BX
	MOVL (R8), R8
loop5to7:
	MOVL (DI), SI
	CMPL SI,R8
	JZ partial_success5to7
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop5to7
	JMP fail
partial_success5to7:
	// Head matched; compare the 4 bytes ending at DI + len(pattern).
	MOVL -4(AX)(DI*1), SI
	CMPL SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop5to7
	JMP fail
_8_or_more:
	// ---- pattern length 8: one 64-bit compare per position ----
	CMPQ AX, $8
	JA _9_or_more
	MOVQ (R8), R8
	LEAQ -7(DI)(DX*1), DX	// DX = end pointer: start + len - 7
loop8:
	MOVQ (DI), SI
	CMPQ SI,R8
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop8
	JMP fail
_9_or_more:
	// ---- pattern length 9..15: first 8 bytes in R8, last 8 bytes in BX ----
	// Also the entry point used by the sse42 block for pattern length >= 12;
	// longer patterns fall through the checks below to the wider size classes.
	CMPQ AX, $15
	JA _16_or_more
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX	// DX = end pointer: start + len + 1 - len(pattern)
	MOVQ -8(R8)(AX*1), BX
	MOVQ (R8), R8
loop9to15:
	MOVQ (DI), SI
	CMPQ SI,R8
	JZ partial_success9to15
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop9to15
	JMP fail
partial_success9to15:
	// Head matched; compare the 8 bytes ending at DI + len(pattern).
	MOVQ -8(AX)(DI*1), SI
	CMPQ SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop9to15
	JMP fail
_16_or_more:
	// ---- pattern length 16: one 128-bit compare per position ----
	CMPQ AX, $16
	JA _17_or_more
	MOVOU (R8), X1
	LEAQ -15(DI)(DX*1), DX	// DX = end pointer: start + len - 15
loop16:
	MOVOU (DI), X2
	PCMPEQB X1, X2
	PMOVMSKB X2, SI
	CMPQ SI, $0xffff	// all 16 mask bits set <=> all 16 bytes equal
	JE success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop16
	JMP fail
_17_or_more:
	// ---- pattern length 17..31: first 16 bytes in X1, last 16 bytes in X0 ----
	CMPQ AX, $31
	JA _32_or_more
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX	// DX = end pointer: start + len + 1 - len(pattern)
	MOVOU -16(R8)(AX*1), X0
	MOVOU (R8), X1
loop17to31:
	MOVOU (DI), X2
	PCMPEQB X1,X2
	PMOVMSKB X2, SI
	CMPQ SI, $0xffff
	JE partial_success17to31
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop17to31
	JMP fail
partial_success17to31:
	// Head matched; compare the 16 bytes ending at DI + len(pattern).
	MOVOU -16(AX)(DI*1), X3
	PCMPEQB X0, X3
	PMOVMSKB X3, SI
	CMPQ SI, $0xffff
	JE success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop17to31
	JMP fail
// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
// So no need to check cpuid
_32_or_more:
	// ---- pattern length 32: one 256-bit compare per position ----
	// The AVX2 paths leave through success_avx2 / fail_avx2, which execute
	// VZEROUPPER before finishing.
	CMPQ AX, $32
	JA _33_to_63
	VMOVDQU (R8), Y1
	LEAQ -31(DI)(DX*1), DX	// DX = end pointer: start + len - 31
loop32:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPMOVMSKB Y3, SI
	CMPL SI, $0xffffffff	// all 32 mask bits set <=> all 32 bytes equal
	JE success_avx2
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop32
	JMP fail_avx2
_33_to_63:
	// ---- pattern length > 32: first 32 bytes in Y1, last 32 bytes in Y0 ----
	// There is no upper-bound check here; the two overlapping 32-byte loads
	// only cover the whole pattern while its length is at most 64 (see the
	// cutoff noted above _32_or_more).
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX	// DX = end pointer: start + len + 1 - len(pattern)
	VMOVDQU -32(R8)(AX*1), Y0
	VMOVDQU (R8), Y1
loop33to63:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPMOVMSKB Y3, SI
	CMPL SI, $0xffffffff
	JE partial_success33to63
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop33to63
	JMP fail_avx2
partial_success33to63:
	// Head matched; compare the 32 bytes ending at DI + len(pattern).
	VMOVDQU -32(AX)(DI*1), Y3
	VPCMPEQB Y0, Y3, Y4
	VPMOVMSKB Y4, SI
	CMPL SI, $0xffffffff
	JE success_avx2
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop33to63
	// Search exhausted: fall through into fail_avx2.
fail_avx2:
	VZEROUPPER	// zero the upper halves of the YMM registers before leaving AVX2 code
fail:
	// No match: result = -1.
	MOVQ $-1, (R11)
	RET
success_avx2:
	VZEROUPPER	// zero the upper halves of the YMM registers before leaving AVX2 code
	JMP success
sse42:
	// Reached only when the searched data is at least 16 bytes long.
	// Unless the build defines hasSSE42, check the CPU feature flag at run
	// time and fall back to the generic loops when it is not set.
#ifndef hasSSE42
	CMPB internal∕cpu·X86+const_offsetX86HasSSE42(SB), $1
	JNE no_sse42
#endif
	CMPQ AX, $12
	// PCMPESTRI is slower than normal compare,
	// so using it makes sense only if we advance 4+ bytes per compare
	// This value was determined experimentally and is the ~same
	// on Nehalem (first with SSE42) and Haswell.
	JAE _9_or_more
	// The pattern is shorter than 12 bytes here, yet MOVOU (R8) below reads a
	// full 16 bytes from it. Bail out to the generic loops when bits 4..11
	// of R8+16 are all zero, i.e. when (R8+16) mod 4096 < 16.
	// NOTE(review): presumably this keeps the over-read from crossing into
	// the next 4 KiB page - confirm.
	LEAQ 16(R8), SI
	TESTW $0xff0, SI
	JEQ no_sse42
	MOVOU (R8), X1
	LEAQ -15(DI)(DX*1), SI	// SI = start + len - 15: one past the last position of a full 16-byte window
	MOVQ $16, R9
	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
	PCALIGN $16	// align the loop head to a 16-byte boundary
loop_sse42:
	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
	// for equality (bits 2,3 are 11)
	// result is not masked or inverted (bits 4,5 are 00)
	// and corresponds to first matching byte (bit 6 is 0)
	PCMPESTRI $0x0c, (DI), X1
	// CX == 16 means no match,
	// CX > R9 means partial match at the end of the string,
	// otherwise sep starts at offset CX within the 16-byte window at (DI)
	CMPQ CX, R9
	JBE sse42_success
	ADDQ R9, DI
	CMPQ DI, SI
	JB loop_sse42
	// DI moved past the last full window: run one final compare on the
	// window starting at SI-1, which covers the last 16 bytes of the data.
	PCMPESTRI $0x0c, -1(SI), X1
	CMPQ CX, R9
	JA fail
	LEAQ -1(SI), DI	// make DI the base of that final window for the offset math below
sse42_success:
	ADDQ CX, DI	// DI = address of the match inside the window
success:
	// Convert the match pointer into an offset from the original start (R10)
	// and store it as the result.
	SUBQ R10, DI
	MOVQ DI, (R11)
	RET