// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

// Compare(a, b []byte) int — three-way byte-slice comparison.
// All three entry points load the same register contract and tail-jump
// into the shared cmpbody<> routine below.
TEXT ·Compare(SB),NOSPLIT,$0-56
	MOVQ	a_base+0(FP), SI
	MOVQ	a_len+8(FP), BX
	MOVQ	b_base+24(FP), DI
	MOVQ	b_len+32(FP), DX
	LEAQ	ret+48(FP), R9
	JMP	cmpbody<>(SB)

TEXT bytes·Compare(SB),NOSPLIT,$0-56
	FUNCDATA $0, ·Compare·args_stackmap(SB)
	MOVQ	a_base+0(FP), SI
	MOVQ	a_len+8(FP), BX
	MOVQ	b_base+24(FP), DI
	MOVQ	b_len+32(FP), DX
	LEAQ	ret+48(FP), R9
	JMP	cmpbody<>(SB)

// cmpstring compares two strings; strings have no capacity word, so the
// frame layout (and FP offsets) differ from the slice entry points above.
TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
	MOVQ	a_base+0(FP), SI
	MOVQ	a_len+8(FP), BX
	MOVQ	b_base+16(FP), DI
	MOVQ	b_len+24(FP), DX
	LEAQ	ret+32(FP), R9
	JMP	cmpbody<>(SB)

// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
//   R9 = address of output word (stores -1/0/1 here)
TEXT cmpbody<>(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8	// R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

	CMPQ	R8, $63
	JBE	loop
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JEQ	big_loop_avx2
	JMP	big_loop

// 16 bytes at a time with SSE; used for 9..63 bytes and as the
// remainder loop after the 64-byte big loops below.
loop:
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX	// convert EQ to NE
	JNE	diff16		// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

// Entry points from big_loop: advance to the 16-byte chunk that held
// the mismatch, then fall through to the common diff16 handler.
diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI

	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX	// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX
	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
	MOVQ	AX, (R9)
	RET

	// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	// Compare the last 8 bytes; for 9..16 remaining this overlaps the
	// first-8 comparison above, which is harmless.
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
diff8:
	BSWAPQ	AX	// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX	// index of highest bit difference
	SHRQ	CX, AX	// move a's bit to bottom
	ANDQ	$1, AX	// mask bit
	LEAQ	-1(AX*2), AX	// 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

	// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX		// - bits left (== 64 - bits left mod 64)
	JEQ	allsame

	// load bytes of a into high bytes of SI
	// (an 8-byte load at SI is safe unless it would cross a page
	// boundary, i.e. the low address byte is above 0xf8)
	CMPB	SI, $0xf8
	JA	si_high
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI

	// load bytes of b into high bytes of DI (same page-cross dance)
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI	// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI	// find bit differences
	JEQ	allsame
	BSRQ	DI, CX	// index of highest bit difference
	SHRQ	CX, SI	// move a's bit to bottom
	ANDQ	$1, SI	// mask bit
	LEAQ	-1(SI*2), AX	// 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

// The common prefix (all min(alen, blen) bytes) matched, or a and b share
// a base pointer: the shorter operand sorts first.
allsame:
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX	// 1 if alen > blen
	SETEQ	CX	// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	MOVQ	AX, (R9)
	RET

// this works for >= 64 bytes of data.
big_loop:
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JBE	loop
	JMP	big_loop

// Compare 64-bytes per loop iteration.
// Loop is unrolled and uses AVX2.
big_loop_avx2:
	VMOVDQU	(SI), Y2
	VMOVDQU	(DI), Y3
	VMOVDQU	32(SI), Y4
	VMOVDQU	32(DI), Y5
	VPCMPEQB Y2, Y3, Y0
	VPMOVMSKB Y0, AX
	XORL	$0xffffffff, AX
	JNE	diff32_avx2
	VPCMPEQB Y4, Y5, Y6
	VPMOVMSKB Y6, AX
	XORL	$0xffffffff, AX
	JNE	diff64_avx2

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JB	big_loop_avx2_exit
	JMP	big_loop_avx2

	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
diff32_avx2:
	VZEROUPPER
	JMP	diff16

	// Same as diff32_avx2, but for last 32 bytes.
diff64_avx2:
	VZEROUPPER
	JMP	diff48

	// For <64 bytes remainder jump to normal loop.
big_loop_avx2_exit:
	VZEROUPPER
	JMP	loop