// SPDX-License-Identifier: Apache-2.0
// Copyright 2023 The Prime Citizens
//
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build pcz && amd64

#include "textflag.h"

// func Bytes(a, b []byte) int
//
// Three-way lexicographic compare of two byte slices; returns -1/0/+1 in AX.
// Arguments arrive in ABIInternal registers; this stub only shuffles them
// into the registers cmpbody expects, then tail-jumps.
TEXT ·Bytes<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
	// AX = a_base (want in SI)
	// BX = a_len  (want in BX)
	// CX = a_cap  (unused)
	// DI = b_base (want in DI)
	// SI = b_len  (want in DX)
	// R8 = b_cap  (unused)
	MOVQ	SI, DX
	MOVQ	AX, SI
	JMP	cmpbody<>(SB)

// func String(a, b string) int
//
// Same as Bytes but for strings (no cap words); shuffles the ABIInternal
// argument registers into cmpbody's expected layout and tail-jumps.
TEXT ·String<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
	// AX = a_base (want in SI)
	// BX = a_len  (want in BX)
	// CX = b_base (want in DI)
	// DI = b_len  (want in DX)
	MOVQ	AX, SI
	MOVQ	DI, DX
	MOVQ	CX, DI
	JMP	cmpbody<>(SB)

// cmpbody is the shared compare kernel.
//
// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
// output:
//   AX = output (-1/0/1)
//
// Strategy: compare min(alen, blen) bytes — 16 at a time with SSE2, 64 at a
// time for large inputs (AVX2 when available) — and on the first difference
// return the sign of the differing byte pair. If all compared bytes match,
// the shorter operand sorts first (allsame).
TEXT cmpbody<>(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame		// identical base pointers: only lengths can differ
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8		// R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

	CMPQ	R8, $63
	JBE	loop		// medium: 16-byte SSE2 chunks
	CMPB	·hasAVX2(SB), $1
	JEQ	big_loop_avx2
	JMP	big_loop

// 16 bytes per iteration while more than 16 bytes remain.
loop:
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX	// convert EQ to NE
	JNE	diff16		// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

// Entry stubs from big_loop: advance SI/DI to the 16-byte lane that holds
// the difference, then share diff16's tail.
diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI
	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX		// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX		// 1 if a's byte > b's byte (unsigned)
	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
	RET

	// 0 through 16 bytes left, alen>=8, blen>=8
	// (two possibly-overlapping 8-byte loads cover the remainder)
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	MOVQ	-8(SI)(R8*1), AX	// last 8 bytes of the remainder
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
diff8:
	BSWAPQ	AX		// reverse order of bytes: big-endian makes the
	BSWAPQ	CX		// first differing byte the most significant
	XORQ	AX, CX
	BSRQ	CX, CX		// index of highest bit difference
	SHRQ	CX, AX		// move a's bit to bottom
	ANDQ	$1, AX		// mask bit: 1 iff a's byte is greater
	LEAQ	-1(AX*2), AX	// 1/0 => +1/-1
	RET

	// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX		// -bits left (== 64 - bits left mod 64)
	JEQ	allsame		// R8 == 0: nothing to compare

	// Load bytes of a into high bytes of SI.
	// If the low byte of the address is above 0xf8, an 8-byte load at (SI)
	// could cross into an unmapped page; instead load the 8 bytes ENDING at
	// SI+R8 and shift out the leading garbage.
	CMPB	SI, $0xf8
	JA	si_high
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI		// discard the bytes before the operand
si_finish:
	SHLQ	CX, SI		// zero the bytes past the operand's length

	// Load bytes of b into high bytes of DI, same page-crossing guard.
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI		// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI		// find bit differences
	JEQ	allsame
	BSRQ	DI, CX		// index of highest bit difference
	SHRQ	CX, SI		// move a's bit to bottom
	ANDQ	$1, SI		// mask bit
	LEAQ	-1(SI*2), AX	// 1/0 => +1/-1
	RET

	// All compared bytes equal: result is sign(alen - blen),
	// computed branchlessly as 2*(alen>blen) + (alen==blen) - 1.
allsame:
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX		// 1 if alen > blen
	SETEQ	CX		// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	RET

	// this works for >= 64 bytes of data.
	// SSE2 path: four 16-byte compares per iteration; each failing lane
	// jumps to a stub that re-bases SI/DI before the shared diff16 tail.
big_loop:
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JBE	loop		// <= 64 bytes left: finish with the 16-byte loop
	JMP	big_loop

	// Compare 64-bytes per loop iteration.
	// Loop is unrolled and uses AVX2.
big_loop_avx2:
	VMOVDQU	(SI), Y2
	VMOVDQU	(DI), Y3
	VMOVDQU	32(SI), Y4
	VMOVDQU	32(DI), Y5
	VPCMPEQB Y2, Y3, Y0
	VPMOVMSKB Y0, AX
	XORL	$0xffffffff, AX
	JNE	diff32_avx2
	VPCMPEQB Y4, Y5, Y6
	VPMOVMSKB Y6, AX
	XORL	$0xffffffff, AX
	JNE	diff64_avx2

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JB	big_loop_avx2_exit
	JMP	big_loop_avx2

	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
	// (diff16 scans forward from SI/DI, so the 32-bit mask works unchanged.)
diff32_avx2:
	VZEROUPPER
	JMP diff16

	// Same as diff32_avx2, but for last 32 bytes.
diff64_avx2:
	VZEROUPPER
	JMP diff48

	// For <64 bytes remainder jump to normal loop.
big_loop_avx2_exit:
	VZEROUPPER
	JMP loop