// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

// Compare(a, b []byte) int — lexicographic comparison, result -1/0/+1 in AX.
// ABIInternal: slice arguments arrive in registers; this stub only shuffles
// them into cmpbody's expected registers.
TEXT ·Compare<ABIInternal>(SB),NOSPLIT,$0-56
	// AX = a_base (want in SI)
	// BX = a_len  (want in BX)
	// CX = a_cap  (unused)
	// DI = b_base (want in DI)
	// SI = b_len  (want in DX)
	// R8 = b_cap  (unused)
	MOVQ	SI, DX
	MOVQ	AX, SI
	JMP	cmpbody<>(SB)

// cmpstring(a, b string) int — same comparison for strings (no cap words).
TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT,$0-40
	// AX = a_base (want in SI)
	// BX = a_len  (want in BX)
	// CX = b_base (want in DI)
	// DI = b_len  (want in DX)
	MOVQ	AX, SI
	MOVQ	DI, DX
	MOVQ	CX, DI
	JMP	cmpbody<>(SB)

// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
// output:
//   AX = output (-1/0/1)
TEXT cmpbody<>(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame			// same base pointer: result depends only on lengths
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8			// R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

	CMPQ	R8, $63
	JBE	loop
	// >= 64 bytes: take the AVX2 path when the CPU supports it.
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JEQ	big_loop_avx2
	JMP	big_loop

	// 16 bytes per iteration via SSE2 byte-equality compare.
loop:
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX		// convert EQ to NE
	JNE	diff16			// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

	// Entry points from big_loop: the first 1-3 16-byte chunks were equal,
	// so advance SI/DI past them before indexing into the differing chunk.
diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI
	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX			// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX			// 1 if a's byte > b's byte (unsigned), else 0
	LEAQ	-1(AX*2), AX		// convert 1/0 to +1/-1
	RET

	// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	// Compare the final 8 bytes; this load may overlap bytes already
	// checked above, which is harmless since those bytes were equal.
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
diff8:
	BSWAPQ	AX			// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX			// index of highest bit difference
	SHRQ	CX, AX			// move a's bit to bottom
	ANDQ	$1, AX			// mask bit
	LEAQ	-1(AX*2), AX		// 1/0 => +1/-1
	RET

	// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX		// bytes left -> bits left
	NEGQ	CX			// -bits left (== 64 - bits left mod 64)
	JEQ	allsame			// zero bytes to compare: decide on lengths alone

	// load bytes of a into high bytes of SI
	// NOTE(review): the 0xf8 check appears to guard an 8-byte load that
	// could run past the end of the buffer into an unmapped page; in that
	// case load the 8 bytes ending at the last valid byte instead.
	CMPB	SI, $0xf8
	JA	si_high
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI			// zero out the bytes beyond the R8 in common

	// load bytes of b into high bytes of DI (same guard as for a above)
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI			// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI			// find bit differences
	JEQ	allsame
	BSRQ	DI, CX			// index of highest bit difference
	SHRQ	CX, SI			// move a's bit to bottom
	ANDQ	$1, SI			// mask bit
	LEAQ	-1(SI*2), AX		// 1/0 => +1/-1
	RET

	// All compared bytes were equal; result is sign(alen - blen).
allsame:
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX			// 1 if alen > blen
	SETEQ	CX			// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	RET

	// this works for >= 64 bytes of data.
big_loop:
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX		// convert EQ to NE
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JBE	loop			// <= 64 bytes left: finish in the 16-byte loop
	JMP	big_loop

	// Compare 64-bytes per loop iteration.
	// Loop is unrolled and uses AVX2.
big_loop_avx2:
	VMOVDQU	(SI), Y2
	VMOVDQU	(DI), Y3
	VMOVDQU	32(SI), Y4
	VMOVDQU	32(DI), Y5
	VPCMPEQB Y2, Y3, Y0
	VPMOVMSKB Y0, AX
	XORL	$0xffffffff, AX		// convert EQ to NE (32-bit mask)
	JNE	diff32_avx2
	VPCMPEQB Y4, Y5, Y6
	VPMOVMSKB Y6, AX
	XORL	$0xffffffff, AX
	JNE	diff64_avx2

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JB	big_loop_avx2_exit
	JMP	big_loop_avx2

	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
diff32_avx2:
	VZEROUPPER
	JMP	diff16

	// Same as diff32_avx2, but for last 32 bytes.
diff64_avx2:
	VZEROUPPER
	JMP	diff48

	// For <64 bytes remainder jump to normal loop.
big_loop_avx2_exit:
	VZEROUPPER
	JMP	loop