// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

// Compare entry point. Both operands arrive as (base, len, cap) triples.
// This stub only moves the operands into the registers cmpbody expects
// (SI/BX for a, DI/DX for b) and then tail-jumps to cmpbody, which
// produces the -1/0/+1 result.
TEXT ·Compare<ABIInternal>(SB),NOSPLIT,$0-56
#ifdef GOEXPERIMENT_regabiargs
	// AX = a_base (want in SI)
	// BX = a_len  (want in BX)
	// CX = a_cap  (unused)
	// DI = b_base (want in DI)
	// SI = b_len  (want in DX)
	// R8 = b_cap  (unused)
	MOVQ	SI, DX	// save b_len first: SI is overwritten by the next move
	MOVQ	AX, SI
#else
	MOVQ	a_base+0(FP), SI
	MOVQ	a_len+8(FP), BX
	MOVQ	b_base+24(FP), DI
	MOVQ	b_len+32(FP), DX
	LEAQ	ret+48(FP), R9	// cmpbody stores the result through R9
#endif
	JMP	cmpbody<>(SB)

// cmpstring entry point. Both operands arrive as (base, len) pairs.
// Same job as the stub above: shuffle the operands into cmpbody's
// register layout and tail-jump to it.
TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT,$0-40
#ifdef GOEXPERIMENT_regabiargs
	// AX = a_base (want in SI)
	// BX = a_len  (want in BX)
	// CX = b_base (want in DI)
	// DI = b_len  (want in DX)
	MOVQ	AX, SI
	MOVQ	DI, DX	// read b_len out of DI before DI is overwritten below
	MOVQ	CX, DI
#else
	MOVQ	a_base+0(FP), SI
	MOVQ	a_len+8(FP), BX
	MOVQ	b_base+16(FP), DI
	MOVQ	b_len+24(FP), DX
	LEAQ	ret+32(FP), R9	// cmpbody stores the result through R9
#endif
	JMP	cmpbody<>(SB)

// Shared comparison body.
//
// Compares the first min(alen, blen) bytes of a and b as unsigned bytes.
// The first differing byte decides the result (+1 if a's byte is larger,
// -1 otherwise). If no byte differs, the lengths decide (allsame).
//
// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
#ifndef GOEXPERIMENT_regabiargs
//   R9 = address of output word (stores -1/0/1 here)
#else
// output:
//   AX = output (-1/0/1)
#endif
TEXT cmpbody<>(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame	// same base pointer: contents match, only the lengths matter
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

	CMPQ	R8, $63
	JBE	loop	// fewer than 64 bytes: use the 16-byte loop
	// 64 bytes or more: pick the 64-byte loop, AVX2 variant if the flag byte is 1.
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JEQ	big_loop_avx2
	JMP	big_loop

	// Compare 16 bytes per iteration while more than 16 bytes remain.
loop:
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX	// convert EQ to NE
	JNE	diff16	// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

	// diff64/diff48/diff32 advance SI and DI to the 16-byte chunk that
	// produced the mask in AX, then continue at diff16.
diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI
	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX	// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX	// AX = 1 if a's byte is above b's byte (unsigned), else 0
	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
#ifndef GOEXPERIMENT_regabiargs
	MOVQ	AX, (R9)
#endif
	RET

	// 0 through 16 bytes left, alen>=8, blen>=8
	// If more than 8 bytes remain, the first 8 are compared, then the last 8
	// (the two windows may overlap). The load at -8(...)(R8*1) ends exactly
	// at the last byte to compare; this path is only reached when R8 was at
	// least 8 on entry, and SI/DI have only advanced by what was subtracted
	// from R8 since.
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
	// After the byte swap the first byte in memory is the most significant,
	// so the highest differing bit belongs to the first differing byte.
diff8:
	BSWAPQ	AX	// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX	// index of highest bit difference
	SHRQ	CX, AX	// move a's bit to bottom
	ANDQ	$1, AX	// mask bit
	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
#ifndef GOEXPERIMENT_regabiargs
	MOVQ	AX, (R9)
#endif
	RET

	// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX		//  - bits left (== 64 - bits left mod 64)
	JEQ	allsame	// zero bytes to compare: the lengths decide

	// load bytes of a into high bytes of SI
	// If the low byte of the address is above 0xf8, an 8-byte load starting
	// at (SI) would cross the next 256-byte boundary, so the 8 bytes ending
	// at the last valid byte are loaded instead and shifted down first.
	// NOTE(review): presumably this avoids reading into an unmapped page - confirm.
	CMPB	SI, $0xf8
	JA	si_high
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI	// valid bytes end up in the high bytes, the rest is zero

	// load bytes of b into high bytes of DI
	// Same address check and fallback as for a above.
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI	// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI	// find bit differences
	JEQ	allsame
	BSRQ	DI, CX	// index of highest bit difference
	SHRQ	CX, SI	// move a's bit to bottom
	ANDQ	$1, SI	// mask bit
	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
#ifndef GOEXPERIMENT_regabiargs
	MOVQ	AX, (R9)
#endif
	RET

	// No differing byte was found in the compared bytes: order by length.
allsame:
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX	// 1 if alen > blen
	SETEQ	CX	// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
#ifndef GOEXPERIMENT_regabiargs
	MOVQ	AX, (R9)
#endif
	RET

	// this works for >= 64 bytes of data.
	// Four 16-byte compares per iteration; each mismatch exit jumps to the
	// diffNN label that matches the offset of the chunk just compared.
big_loop:
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JBE	loop	// 64 or fewer bytes left: finish in the 16-byte loop
	JMP	big_loop

	// Compare 64-bytes per loop iteration.
	// Loop is unrolled and uses AVX2.
big_loop_avx2:
	VMOVDQU	(SI), Y2
	VMOVDQU	(DI), Y3
	VMOVDQU	32(SI), Y4
	VMOVDQU	32(DI), Y5
	VPCMPEQB	Y2, Y3, Y0
	VPMOVMSKB	Y0, AX
	XORL	$0xffffffff, AX	// convert EQ to NE for the first 32 bytes
	JNE	diff32_avx2
	VPCMPEQB	Y4, Y5, Y6
	VPMOVMSKB	Y6, AX
	XORL	$0xffffffff, AX	// convert EQ to NE for the last 32 bytes
	JNE	diff64_avx2

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JB	big_loop_avx2_exit
	JMP	big_loop_avx2

	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
	// AX holds a 32-bit mask here; diff16 indexes the first set bit from SI/DI.
diff32_avx2:
	VZEROUPPER
	JMP	diff16

	// Same as diff32_avx2, but for last 32 bytes.
	// diff48 first advances SI/DI by 32 so the mask indexes the right chunk.
diff64_avx2:
	VZEROUPPER
	JMP	diff48

	// For <64 bytes remainder jump to normal loop.
big_loop_avx2_exit:
	VZEROUPPER
	JMP	loop