// github.com/bir3/gocompiler@v0.9.2202/src/internal/bytealg/compare_amd64.s

// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "asm_amd64.h"
#include "textflag.h"

TEXT ·Compare<ABIInternal>(SB),NOSPLIT,$0-56
	// AX = a_base (want in SI)
	// BX = a_len (want in BX)
	// CX = a_cap (unused)
	// DI = b_base (want in DI)
	// SI = b_len (want in DX)
	// R8 = b_cap (unused)
	MOVQ	SI, DX
	MOVQ	AX, SI
	JMP	cmpbody<>(SB)

TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT,$0-40
	// AX = a_base (want in SI)
	// BX = a_len (want in BX)
	// CX = b_base (want in DI)
	// DI = b_len (want in DX)
	MOVQ	AX, SI
	MOVQ	DI, DX
	MOVQ	CX, DI
	JMP	cmpbody<>(SB)

// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
// output:
//   AX = output (-1/0/1)
TEXT cmpbody<>(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8	// R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

	CMPQ	R8, $63
	JBE	loop
#ifndef hasAVX2
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JEQ	big_loop_avx2
	JMP	big_loop
#else
	JMP	big_loop_avx2
#endif
loop:
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX	// convert EQ to NE
	JNE	diff16	// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI
	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX	// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX
	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
	RET

// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
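	// diff8 below recovers the ordering from those 8-byte words. BSWAPQ
	// reverses the bytes so that memory order matches bit significance,
	// which means the highest set bit of the XOR (found with BSRQ) belongs
	// to the first byte that differs. Shifting a's word right by that bit
	// index leaves a's bit of the difference in bit 0, and LEAQ -1(AX*2)
	// maps that 0/1 to -1/+1.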
diff8:
	BSWAPQ	AX	// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX	// index of highest bit difference
	SHRQ	CX, AX	// move a's bit to bottom
	ANDQ	$1, AX	// mask bit
	LEAQ	-1(AX*2), AX	// 1/0 => +1/-1
	RET

// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX		// - bits left (== 64 - bits left mod 64)
	JEQ	allsame

	// Load bytes of a into the high bytes of SI. If the low byte of the
	// address is above 0xf8, a straight 8-byte load could cross into an
	// unmapped page, so instead load the 8 bytes that end at the last byte
	// to be compared and realign with the shifts below.
	CMPB	SI, $0xf8
	JA	si_high
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI

	// Load bytes of b into the high bytes of DI, the same way.
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI	// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI	// find bit differences
	JEQ	allsame
	BSRQ	DI, CX	// index of highest bit difference
	SHRQ	CX, SI	// move a's bit to bottom
	ANDQ	$1, SI	// mask bit
	LEAQ	-1(SI*2), AX	// 1/0 => +1/-1
	RET

allsame:
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX	// 1 if alen > blen
	SETEQ	CX	// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	RET

// This works for >= 64 bytes of data.
#ifndef hasAVX2
big_loop:
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JBE	loop
	JMP	big_loop
#endif

// Compare 64 bytes per loop iteration.
// Loop is unrolled and uses AVX2.
big_loop_avx2:
	VMOVDQU	(SI), Y2
	VMOVDQU	(DI), Y3
	VMOVDQU	32(SI), Y4
	VMOVDQU	32(DI), Y5
	VPCMPEQB	Y2, Y3, Y0
	VPMOVMSKB	Y0, AX
	XORL	$0xffffffff, AX
	JNE	diff32_avx2
	VPCMPEQB	Y4, Y5, Y6
	VPMOVMSKB	Y6, AX
	XORL	$0xffffffff, AX
	JNE	diff64_avx2

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JB	big_loop_avx2_exit
	JMP	big_loop_avx2

// Avoid the AVX->SSE transition penalty and search the first 32 bytes of the 64-byte chunk.
diff32_avx2:
	VZEROUPPER
	JMP	diff16

// Same as diff32_avx2, but for the last 32 bytes.
diff64_avx2:
	VZEROUPPER
	JMP	diff48

// For a remainder of <64 bytes, jump to the normal loop.
big_loop_avx2_exit:
	VZEROUPPER
	JMP	loop
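
// For reference, cmpbody computes the same -1/0/+1 result as this portable Go
// sketch (an illustration only; "compare" is a hypothetical name, not part of
// this package's API):
//
//	func compare(a, b []byte) int {
//		n := len(a)
//		if len(b) < n {
//			n = len(b)
//		}
//		for i := 0; i < n; i++ {
//			if a[i] != b[i] {
//				if a[i] < b[i] {
//					return -1
//				}
//				return +1
//			}
//		}
//		// The shared prefix is identical: the shorter operand sorts first.
//		if len(a) < len(b) {
//			return -1
//		}
//		if len(a) > len(b) {
//			return +1
//		}
//		return 0
//	}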