// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// hasBMI2 is a one-byte flag. The PEXT-based decoders read it and take
// their byte-at-a-time slow path when it is zero. Nothing in this file
// stores to it; presumably Go code sets it from queryBMI2 (verify
// against the package's Go source).
GLOBL ·hasBMI2(SB),NOPTR,$1

// queryBMI2 stores 1 in its one-byte result if CPUID reports BMI2
// support (leaf 7, sub-leaf 0, EBX bit 8) and 0 otherwise.
// The Go prototype is declared elsewhere; the $0-1 frame implies a
// single one-byte result (presumably func queryBMI2() bool).
//
// CPUID overwrites AX, BX, CX and DX.
TEXT ·queryBMI2(SB),NOSPLIT,$0-1
	// Leaf 0 returns the highest supported basic leaf in EAX. Leaf 7
	// must not be trusted unless the CPU actually implements it.
	XORL	AX, AX
	CPUID
	CMPL	AX, $7
	JCS	unsupported		// Max basic leaf < 7 (unsigned)

	MOVL	$7, AX			// Leaf 7
	XORL	CX, CX			// Sub-leaf 0
	CPUID
	// Bit 8 of EBX indicates BMI2 support.
	BTL	$8, BX
	SETCS	ret+0(FP)
	RET

unsupported:
	MOVB	$0, ret+0(FP)
	RET

// decodeVarintAsmLoop is a hand-coded byte decoding loop with some
// clever tricks.
//
// Arguments (from the frame layout): buf_base+0, buf_len+8.
// Results: x+24 (decoded value), n+32 (number of bytes consumed).
// The Go prototype is declared elsewhere; presumably
// func(buf []byte) (x uint64, n int).
//
// Each byte contributes its low 7 bits, least significant group first;
// a byte with bit 7 clear ends the varint. At most 10 bytes are
// examined. If the buffer ends (or 10 bytes pass) before a final byte
// is seen, both results are 0. Bits shifted beyond bit 63 are discarded
// without any overflow indication.
//
// Registers: BX = buf pointer, AX = min(len, 10), SI = index,
// CX = shift, DX = accumulated value, DI = current byte.
TEXT ·decodeVarintAsmLoop(SB),NOSPLIT,$0-40
	MOVQ	buf_base+0(FP), BX	// Pointer
	MOVQ	buf_len+8(FP), AX	// Length
	MOVL	$10, CX
	CMPQ	AX, CX
	CMOVLGT	CX, AX			// Length is at most 10
	XORL	SI, SI			// Index
	XORL	CX, CX			// Shift
	XORL	DX, DX			// Value

loop:
	CMPL	SI, AX			// (fused with JEQ)
	JEQ	bad			// Reached end of buffer or >10 bytes

	MOVBLZX	(SI)(BX*1), DI		// Load next byte
	INCL	SI
	// This could be a BTRL $7, DI, but this is simpler and
	// just as fast thanks to macro-op fusion.
	TESTL	$0x80, DI		// Is bit 7 set? (fused with JZ)
	JZ	last
	ANDL	$0x7f, DI		// Clear bit 7
	SHLQ	CL, DI			// value |= byte << shift
	ORQ	DI, DX
	ADDL	$7, CX			// shift += 7
	JMP	loop

last:
	SHLQ	CL, DI			// Final value |= byte << shift
	ORQ	DI, DX
	// Return decoded value and length.
	MOVQ	DX, x+24(FP)
	MOVQ	SI, n+32(FP)
	RET

bad:
	MOVQ	$0, x+24(FP)
	MOVQ	$0, n+32(FP)
	RET

// decodeVarintAsmBMI2 uses the BMI2 PEXT instruction to extract 7
// bits from each byte in one instruction.
//
// Arguments and results use the same frame layout as the loop decoder:
// buf_base+0, buf_len+8 in; x+24, n+32 out.
//
// The fast path is taken only when hasBMI2 is nonzero, at least 8
// bytes are available, and the varint ends within the first 8 bytes.
// Everything else goes through the byte-at-a-time slow path, which
// returns 0, 0 if no final byte is found within min(len, 10) bytes.
//
// NOTE(review): the fast path also executes BEXTR, a BMI1 instruction,
// while only hasBMI2 is tested. Confirm that every target reporting
// BMI2 also implements BMI1.
TEXT ·decodeVarintAsmBMI2(SB),NOSPLIT,$0-40
	MOVQ	buf_base+0(FP), BX
	MOVQ	buf_len+8(FP), CX

	// Take the slow path if there's no BMI2 or there are fewer
	// than 8 bytes available.
	MOVBLZX	·hasBMI2(SB), AX
	TESTB	AL, AL
	JEQ	slowpath
	CMPQ	CX, $8
	JLT	slowpath

	// Load 8 bytes from buf.
	MOVQ	(BX), AX

	// Extract the continuation bits into BX.
	// An SSE register is used rather than an MMX register so that no
	// EMMS is needed afterwards. MOVQ zeroes the upper 8 bytes of X0,
	// so mask bits 8-15 are always 0.
	MOVQ	AX, X0
	PMOVMSKB	X0, BX
	// Compute byte length - 1 of varint into BX: the index of the
	// first byte whose continuation bit is clear. Bits 8 and up of BX
	// are set after the NOT, so the result is 8 if all eight
	// continuation bits were set.
	NOTL	BX
	BSFL	BX, BX
	// If it's more than 8 bytes, take the slow path.
	CMPL	BX, $8
	JGE	slowpath
	// Extract the relevant bytes from the input.
	INCL	BX
	MOVQ	BX, CX
	SHLQ	$(3+8), CX		// CX[15:8] = (byte len * 8); CX[7:0] = 0
	BEXTRQ	CX, AX, AX		// Requires BMI1
	// Extract the low 7 bits from each byte of the input.
	MOVQ	$0x7f7f7f7f7f7f7f7f, DI
	PEXTQ	DI, AX, DX		// Requires BMI2
	// Return decoded value and length.
	MOVQ	DX, x+24(FP)
	MOVQ	BX, n+32(FP)
	RET

slowpath:
	// Consume buffer one byte at a time.
	// TODO: Could merge with some of the above registers better.
	MOVQ	buf_base+0(FP), BX	// Pointer
	MOVQ	buf_len+8(FP), AX	// Length
	MOVQ	$10, CX
	CMPQ	AX, CX
	CMOVQGT	CX, AX			// Length is at most 10
	XORQ	SI, SI			// Index
	XORQ	CX, CX			// Shift
	XORQ	DX, DX			// Value

loop:
	CMPQ	SI, AX
	JEQ	bad			// Reached end of buffer or >10 bytes

	MOVBLZX	(SI)(BX*1), DI		// Load next byte
	INCQ	SI
	BTRL	$7, DI			// Is bit 7 set? Clear bit 7.
	JNC	last			// If not set, this is the final byte
	SHLQ	CL, DI			// value |= byte << shift
	ORQ	DI, DX
	ADDQ	$7, CX			// shift += 7
	JMP	loop

last:
	SHLQ	CL, DI			// value |= byte << shift
	ORQ	DI, DX
	// Return decoded value and length.
	MOVQ	DX, x+24(FP)
	MOVQ	SI, n+32(FP)
	RET

bad:
	MOVQ	$0, x+24(FP)
	MOVQ	$0, n+32(FP)
	RET

// The other two also use PEXT, but use different tricks to extract
// the length and set up the mask. They turned out to be slower than
// the one above, but are historically interesting.

// extract is a read-only table of eight PEXT masks. Entry i
// (0 <= i <= 7) selects the low 7 bits of each of the first i+1 bytes
// of a 64-bit word.
DATA extract<>+0x00(SB)/8,$0x000000000000007f
DATA extract<>+0x08(SB)/8,$0x0000000000007f7f
DATA extract<>+0x10(SB)/8,$0x00000000007f7f7f
DATA extract<>+0x18(SB)/8,$0x000000007f7f7f7f
DATA extract<>+0x20(SB)/8,$0x0000007f7f7f7f7f
DATA extract<>+0x28(SB)/8,$0x00007f7f7f7f7f7f
DATA extract<>+0x30(SB)/8,$0x007f7f7f7f7f7f7f
DATA extract<>+0x38(SB)/8,$0x7f7f7f7f7f7f7f7f
GLOBL extract<>(SB),(NOPTR+RODATA),$(8*8)

// decodeVarintAsm1 decodes a varint with PEXT, taking the extraction
// mask from the extract<> lookup table.
//
// Arguments (from the frame layout): buf_base+0, buf_len+8.
// Results: x+24 (decoded value), n+32 (number of bytes consumed).
// The Go prototype is declared elsewhere; presumably
// func(buf []byte) (x uint64, n int).
//
// The fast path is taken only when ·hasBMI2 is nonzero, at least 8
// bytes are available, and the varint ends within the first 8 bytes.
// Otherwise the byte-at-a-time slow path runs; it examines at most
// min(len, 10) bytes and returns 0, 0 if no final byte (bit 7 clear)
// is found.
TEXT ·decodeVarintAsm1(SB),NOSPLIT,$0-40
	// Take the slow path if there's no BMI2 or there are fewer
	// than 8 bytes available.
	MOVBLZX	·hasBMI2(SB), AX
	TESTB	AL, AL
	JEQ	slowpath
	MOVQ	buf_len+8(FP), AX
	CMPQ	AX, $8
	JLT	slowpath

	// Load 8 bytes from buf.
	MOVQ	buf_base+0(FP), AX
	MOVQ	(AX), AX

	// Extract the continuation bits into BX.
	// An SSE register is used rather than an MMX register so that no
	// EMMS is needed afterwards. MOVQ zeroes the upper 8 bytes of X0,
	// so mask bits 8-15 are always 0.
	MOVQ	AX, X0
	PMOVMSKB	X0, BX
	// Compute byte length - 1 of varint into BX: the index of the
	// first byte whose continuation bit is clear. Bits 8 and up of BX
	// are set after the NOT, so the result is 8 if all eight
	// continuation bits were set.
	NOTL	BX
	BSFL	BX, BX
	// If it's more than 8 bytes, take the slow path.
	CMPL	BX, $8
	JGE	slowpath
	// Extract the value into DX using a mask lookup table.
	MOVQ	$extract<>(SB), CX
	MOVQ	(CX)(BX*8), DX		// Mask for (BX+1) bytes
	PEXTQ	DX, AX, DX		// Requires BMI2
	// Return decoded value and length.
	MOVQ	DX, x+24(FP)
	INCL	BX			// Index of final byte -> byte count
	MOVQ	BX, n+32(FP)
	RET

slowpath:
	// Consume buffer one byte at a time.
	// TODO: Could merge with some of the above registers better.
	MOVQ	buf_base+0(FP), BX	// Pointer
	MOVQ	buf_len+8(FP), AX	// Length
	MOVQ	$10, CX
	CMPQ	AX, CX
	CMOVQGT	CX, AX			// Length is at most 10
	XORQ	SI, SI			// Index
	XORQ	CX, CX			// Shift
	XORQ	DX, DX			// Value

loop:
	CMPQ	SI, AX
	JEQ	bad			// Reached end of buffer or >10 bytes

	MOVBLZX	(SI)(BX*1), DI		// Load next byte
	INCQ	SI
	BTRL	$7, DI			// Is bit 7 set? Clear bit 7.
	JNC	last			// If not set, this is the final byte
	SHLQ	CL, DI			// value |= byte << shift
	ORQ	DI, DX
	ADDQ	$7, CX			// shift += 7
	JMP	loop

last:
	SHLQ	CL, DI			// value |= byte << shift
	ORQ	DI, DX
	// Return decoded value and length.
	MOVQ	DX, x+24(FP)
	MOVQ	SI, n+32(FP)
	RET

bad:
	MOVQ	$0, x+24(FP)
	MOVQ	$0, n+32(FP)
	RET

// decodeVarintAsm2 decodes a varint with PEXT, deriving both the
// length and the byte mask from the position of the lowest clear
// continuation bit (BSF plus BLSMSK) instead of a lookup table.
//
// Arguments and results use the frame layout buf_base+0, buf_len+8 in;
// x+24, n+32 out.
//
// The fast path is taken only when ·hasBMI2 is nonzero, at least 8
// bytes are available, and at least one of the first 8 bytes has its
// continuation bit clear. Otherwise the byte-at-a-time slow path runs;
// it examines at most min(len, 10) bytes and returns 0, 0 if no final
// byte is found.
//
// NOTE(review): the fast path executes BLSMSK, a BMI1 instruction,
// while only hasBMI2 is tested, and it overwrites R14. Confirm both
// are acceptable for the Go versions and CPUs this package targets.
TEXT ·decodeVarintAsm2(SB),NOSPLIT,$0-40
	MOVQ	buf_base+0(FP), BX
	MOVQ	buf_len+8(FP), CX

	// Take the slow path if there's no BMI2 or there are fewer
	// than 8 bytes available.
	MOVBLZX	·hasBMI2(SB), AX
	TESTB	AL, AL
	JEQ	slowpath
	CMPQ	CX, $8
	JLT	slowpath

	// Load 8 bytes from buf.
	MOVQ	(BX), AX

	// Get continuation bit mask into DX. After the OR and NOT, DX
	// has bit 8k+7 set exactly when byte k has its continuation bit
	// clear; all other bits are 0.
	MOVQ	$0x7f7f7f7f7f7f7f7f, DI
	MOVQ	AX, DX
	ORQ	DI, DX
	NOTQ	DX
	// Compute bit length of varint into CX (bit index of the lowest
	// clear continuation bit: 7, 15, ...).
	BSFQ	DX, CX
	// If all continuation bits are set, take the slow path.
	JZ	slowpath
	// Compute bit extraction mask into R14: all bits up to and
	// including the lowest set bit of DX.
	//BLSMSKQ DX, R14		// Requires BMI1
	BYTE $0xc4; BYTE $0xe2; BYTE $0x88; BYTE $0xf3; BYTE $0xd2
	// Mask the value.
	ANDQ	R14, AX
	// Extract the bits.
	PEXTQ	DI, AX, DX		// Requires BMI2

	// Compute byte length. 7=>1, 15=>2, etc.
	INCQ	CX
	SHRQ	$3, CX

	// Return decoded value and length.
	MOVQ	DX, x+24(FP)
	MOVQ	CX, n+32(FP)
	RET

slowpath:
	// Consume buffer one byte at a time.
	// TODO: Could merge with some of the above registers better.
	MOVQ	buf_base+0(FP), BX	// Pointer
	MOVQ	buf_len+8(FP), AX	// Length
	MOVQ	$10, CX
	CMPQ	AX, CX
	CMOVQGT	CX, AX			// Length is at most 10
	XORQ	SI, SI			// Index
	XORQ	CX, CX			// Shift
	XORQ	DX, DX			// Value

loop:
	CMPQ	SI, AX
	JEQ	bad			// Reached end of buffer or >10 bytes

	MOVBLZX	(SI)(BX*1), DI		// Load next byte
	INCQ	SI
	BTRL	$7, DI			// Is bit 7 set? Clear bit 7.
	JNC	last			// If not set, this is the final byte
	SHLQ	CL, DI			// value |= byte << shift
	ORQ	DI, DX
	ADDQ	$7, CX			// shift += 7
	JMP	loop

last:
	SHLQ	CL, DI			// value |= byte << shift
	ORQ	DI, DX
	// Return decoded value and length.
	MOVQ	DX, x+24(FP)
	MOVQ	SI, n+32(FP)
	RET

bad:
	MOVQ	$0, x+24(FP)
	MOVQ	$0, n+32(FP)
	RET