// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Origin: github.com/SandwichDev/go-internals@v0.0.0-20210605002614-12311ac6b2c5/bytealg/compare_amd64.s

#include "go_asm.h"
#include "textflag.h"

// Compare is the byte-slice entry point.
// It loads the base pointer and length of both slice arguments, takes the
// address of the result slot, and tail-jumps into cmpbody, which stores
// -1, 0 or +1 into that slot.
//
// Frame layout ($0-56): a at 0(FP) (base, len, cap), b at 24(FP)
// (base, len, cap), ret at 48(FP). The cap words are not read.
TEXT ·Compare(SB),NOSPLIT,$0-56
	MOVQ	a_base+0(FP), SI
	MOVQ	a_len+8(FP), BX
	MOVQ	b_base+24(FP), DI
	MOVQ	b_len+32(FP), DX
	LEAQ	ret+48(FP), R9
	JMP	cmpbody<>(SB)

// runtime·cmpstring is the string entry point.
// Same register setup as Compare, but with the two-word string layout.
//
// Frame layout ($0-40): a at 0(FP) (base, len), b at 16(FP) (base, len),
// ret at 32(FP).
TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
	MOVQ	a_base+0(FP), SI
	MOVQ	a_len+8(FP), BX
	MOVQ	b_base+16(FP), DI
	MOVQ	b_len+24(FP), DX
	LEAQ	ret+32(FP), R9
	JMP	cmpbody<>(SB)

// cmpbody compares the first min(alen, blen) bytes of a and b and, if they
// are all equal, falls back to comparing the lengths.
//
// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
//   R9 = address of output word (stores -1/0/1 here)
//
// Registers written here: AX, BX, CX, SI, DI, R8, X0, X1 and, on the AVX2
// path, Y0 and Y2-Y6.
TEXT cmpbody<>(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame			// same base pointer: only the lengths can differ
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8			// R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

	CMPQ	R8, $63
	JBE	loop			// 8..63 bytes: 16-byte SSE loop
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JEQ	big_loop_avx2
	JMP	big_loop

	// Compare 16 bytes per iteration while more than 16 bytes remain.
loop:
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX			// one bit per byte, set where the bytes are equal
	XORQ	$0xffff, AX		// convert EQ to NE
	JNE	diff16			// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

	// The diffNN entry points advance SI/DI to the start of the chunk that
	// AX describes and then fall through (or jump) to diff16.
diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI
	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX			// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX			// AX = 1 if a's byte is above b's byte (unsigned)
	LEAQ	-1(AX*2), AX		// convert 1/0 to +1/-1
	MOVQ	AX, (R9)
	RET

	// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	// Compare the 8 bytes that end at the end of the common prefix.
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
diff8:
	BSWAPQ	AX			// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX			// index of highest bit difference
	SHRQ	CX, AX			// move a's bit to bottom
	ANDQ	$1, AX			// mask bit
	LEAQ	-1(AX*2), AX		// 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

	// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX		// bytes left -> bits left
	NEGQ	CX			// -(bits left) (== 64 - bits left, mod 64)
	JEQ	allsame			// nothing to compare: decide on lengths

	// load bytes of a into high bytes of SI
	// If the low byte of the address is above 0xf8, an 8-byte load starting
	// at SI would cross a 256-byte boundary; in that case load the 8 bytes
	// that end at a+R8 instead and shift the wanted bytes down first.
	CMPB	SI, $0xf8
	JA	si_high
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI			// wanted bytes now on top, zeros below

	// load bytes of b into high bytes of DI (same scheme as for a)
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI			// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI			// find bit differences
	JEQ	allsame
	BSRQ	DI, CX			// index of highest bit difference
	SHRQ	CX, SI			// move a's bit to bottom
	ANDQ	$1, SI			// mask bit
	LEAQ	-1(SI*2), AX		// 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

	// All compared bytes are equal: the result is the sign of alen - blen.
allsame:
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX			// 1 if alen > blen
	SETEQ	CX			// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	MOVQ	AX, (R9)
	RET

	// this works for >= 64 bytes of data.
	// Four unrolled 16-byte SSE compares per iteration.
big_loop:
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JBE	loop			// 64 or fewer bytes left: finish in the 16-byte loop
	JMP	big_loop

	// Compare 64-bytes per loop iteration.
	// Loop is unrolled and uses AVX2.
big_loop_avx2:
	VMOVDQU	(SI), Y2
	VMOVDQU	(DI), Y3
	VMOVDQU	32(SI), Y4
	VMOVDQU	32(DI), Y5
	VPCMPEQB Y2, Y3, Y0
	VPMOVMSKB Y0, AX
	XORL	$0xffffffff, AX		// 32-bit NE mask for bytes 0..31
	JNE	diff32_avx2
	VPCMPEQB Y4, Y5, Y6
	VPMOVMSKB Y6, AX
	XORL	$0xffffffff, AX		// 32-bit NE mask for bytes 32..63
	JNE	diff64_avx2

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JB	big_loop_avx2_exit
	JMP	big_loop_avx2

	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
diff32_avx2:
	VZEROUPPER
	JMP	diff16

	// Same as diff32_avx2, but for last 32 bytes.
diff64_avx2:
	VZEROUPPER
	JMP	diff48

	// For <64 bytes remainder jump to normal loop.
big_loop_avx2_exit:
	VZEROUPPER
	JMP	loop