github.com/aclements/go-misc@v0.0.0-20240129233631-2f6ede80790c/varint/asm_amd64.s (about)

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "textflag.h"
     6  
     7  GLOBL	·hasBMI2(SB),NOPTR,$1
     8  
     9  TEXT ·queryBMI2(SB),NOSPLIT,$0-1
    10  	// TODO: Check validity of query.
    11  	MOVQ	$0x07, AX
    12  	MOVQ	$0, CX
    13  	CPUID
    14  	// Bit 8 of EBX indicates BMI2 support.
    15  	BTQ	$8, BX
    16  	SETCS	ret+0(FP)
    17  	RET
    18  
    19  // Hand-coded byte decoding loop with some clever tricks.
    20  TEXT ·decodeVarintAsmLoop(SB),NOSPLIT,$0-40
    21  	MOVQ	buf_base+0(FP), BX	// Pointer
    22  	MOVQ	buf_len+8(FP), AX	// Length
    23  	MOVL	$10, CX
    24  	CMPQ	AX, CX
    25  	CMOVLGT	CX, AX		// Length is at most 10
    26  	XORL	SI, SI		// Index
    27  	XORL	CX, CX		// Shift
    28  	XORL	DX, DX		// Value
    29  
    30  loop:
    31  	CMPL	SI, AX		// (fused with JEQ)
    32  	JEQ	bad		// Reached end of buffer or >10 bytes
    33  
    34  	MOVBLZX	(SI)(BX*1), DI	// Load next byte
    35  	INCL	SI
    36  	// This could be a BTRL $7, DI, but this is simpler and
    37  	// just as fast thanks to macro-op fusion.
    38  	TESTL	$0x80, DI	// Is bit 7 set? (fused with JZ)
    39  	JZ	last
    40  	ANDL	$0x7f, DI	// Clear bit 7
    41  	SHLQ	CL, DI		// value |= value << shift
    42  	ORQ	DI, DX
    43  	ADDL	$7, CX		// shift += 7
    44  	JMP	loop
    45  
    46  last:
    47  	SHLQ	CL, DI		// Final value |= value << shift
    48  	ORQ	DI, DX
    49  	// Return decoded value and length.
    50  	MOVQ	DX, x+24(FP)
    51  	MOVQ	SI, n+32(FP)
    52  	RET
    53  
    54  bad:
    55  	MOVQ	$0, x+24(FP)
    56  	MOVQ	$0, n+32(FP)
    57  	RET
    58  
    59  // decodeVarintAsmBMI2 uses the BMI2 PEXT instruction to extract 7
    60  // bits from each byte in one instruction.
    61  TEXT ·decodeVarintAsmBMI2(SB),NOSPLIT,$0-40
    62  	MOVQ	buf_base+0(FP), BX
    63  	MOVQ	buf_len+8(FP), CX
    64  
    65  	// Take the slow path if there's no BMI2 or there are fewer
    66  	// than 8 bytes available.
    67  	MOVBLZX	·hasBMI2(SB), AX
    68  	TESTB	AL, AL
    69  	JEQ	slowpath
    70  	CMPQ	CX, $8
    71  	JLT	slowpath
    72  
    73  	// Load 8 bytes from buf.
    74  	MOVQ	(BX), AX
    75  
    76  	// Extract the continuation bits into BX.
    77  	MOVQ	AX, M0
    78  	PMOVMSKB	M0, BX
    79  	// Compute byte length - 1 of varint into BX.
    80  	NOTL	BX
    81  	BSFL	BX, BX
    82  	// If it's more than 8 bytes, take the slow path.
    83  	CMPL	BX, $8
    84  	JGE	slowpath
    85  	// Extract the relevant bytes from the input.
    86  	INCL	BX
    87  	MOVQ	BX, CX
    88  	SHLQ	$(3+8), CX	// CX[15:8] = (byte len * 8); CX[7:0] = 0
    89  	BEXTRQ	CX, AX, AX	// Requires BMI1
    90  	// Extract the low 7 bits from each byte of the input.
    91  	MOVQ	$0x7f7f7f7f7f7f7f7f, DI
    92  	PEXTQ	DI, AX, DX	// Requires BMI2
    93  	// Return decoded value and length.
    94  	MOVQ	DX, x+24(FP)
    95  	MOVQ	BX, n+32(FP)
    96  	RET
    97  
    98  slowpath:
    99  	// Consume buffer one byte at a time.
   100  	// TODO: Could merge with some of the above registers better.
   101  	MOVQ	buf_base+0(FP), BX	// Pointer
   102  	MOVQ	buf_len+8(FP), AX	// Length
   103  	MOVQ	$10, CX
   104  	CMPQ	AX, CX
   105  	CMOVQGT	CX, AX		// Length is at most 10
   106  	XORQ	SI, SI		// Index
   107  	XORQ	CX, CX		// Shift
   108  	XORQ	DX, DX		// Value
   109  
   110  loop:
   111  	CMPQ	SI, AX
   112  	JEQ	bad		// Reached end of buffer or >10 bytes
   113  
   114  	MOVBLZX	(SI)(BX*1), DI	// Load next byte
   115  	INCQ	SI
   116  	BTRL	$7, DI		// Is bit 7 set? Clear bit 7.
   117  	JNC	last		// If not set, this is the final byte
   118  	SHLQ	CL, DI		// value |= value << shift
   119  	ORQ	DI, DX
   120  	ADDQ	$7, CX		// shift += 7
   121  	JMP	loop
   122  
   123  last:
   124  	SHLQ	CL, DI		// value |= value << shift
   125  	ORQ	DI, DX
   126  	// Return decoded value and length.
   127  	MOVQ	DX, x+24(FP)
   128  	MOVQ	SI, n+32(FP)
   129  	RET
   130  
   131  bad:
   132  	MOVQ	$0, x+24(FP)
   133  	MOVQ	$0, n+32(FP)
   134  	RET
   135  
   136  // The other two also use PEXT, but use different tricks to extract
   137  // the length and set up the mask. They turned out to be slower than
   138  // the one above, but are historically interesting.
   139  
   140  DATA extract<>+0x00(SB)/8,$0x000000000000007f
   141  DATA extract<>+0x08(SB)/8,$0x0000000000007f7f
   142  DATA extract<>+0x10(SB)/8,$0x00000000007f7f7f
   143  DATA extract<>+0x18(SB)/8,$0x000000007f7f7f7f
   144  DATA extract<>+0x20(SB)/8,$0x0000007f7f7f7f7f
   145  DATA extract<>+0x28(SB)/8,$0x00007f7f7f7f7f7f
   146  DATA extract<>+0x30(SB)/8,$0x007f7f7f7f7f7f7f
   147  DATA extract<>+0x38(SB)/8,$0x7f7f7f7f7f7f7f7f
   148  GLOBL extract<>(SB),(NOPTR+RODATA),$(8*8)
   149  
   150  TEXT ·decodeVarintAsm1(SB),NOSPLIT,$0-40
   151  	// Take the slow path if there's no BMI2 or there are fewer
   152  	// than 8 bytes available.
   153  	MOVBLZX	·hasBMI2(SB), AX
   154  	TESTB	AL, AL
   155  	JEQ	slowpath
   156  	MOVQ	buf_len+8(FP), AX
   157  	CMPQ	AX, $8
   158  	JLT	slowpath
   159  
   160  	// Load 8 bytes from buf.
   161  	MOVQ	buf_base+0(FP), AX
   162  	MOVQ	(AX), AX
   163  
   164  	// Extract the continuation bits into BX.
   165  	MOVQ	AX, M0
   166  	PMOVMSKB	M0, BX
   167  	// Compute byte length - 1 of varint into BX.
   168  	NOTL	BX
   169  	BSFL	BX, BX
   170  	// If it's more than 8 bytes, take the slow path.
   171  	CMPL	BX, $8
   172  	JGE	slowpath
   173  	// Extract the value into DX using a mask lookup table.
   174  	MOVQ	$extract<>(SB), CX
   175  	MOVQ	(CX)(BX*8), DX
   176  	PEXTQ	DX, AX, DX	// Requires BMI2
   177  	// Return decoded value and length.
   178  	MOVQ	DX, x+24(FP)
   179  	INCL	BX
   180  	MOVQ	BX, n+32(FP)
   181  	RET
   182  
   183  slowpath:
   184  	// Consume buffer one byte at a time.
   185  	// TODO: Could merge with some of the above registers better.
   186  	MOVQ	buf_base+0(FP), BX	// Pointer
   187  	MOVQ	buf_len+8(FP), AX	// Length
   188  	MOVQ	$10, CX
   189  	CMPQ	AX, CX
   190  	CMOVQGT	CX, AX		// Length is at most 10
   191  	XORQ	SI, SI		// Index
   192  	XORQ	CX, CX		// Shift
   193  	XORQ	DX, DX		// Value
   194  
   195  loop:
   196  	CMPQ	SI, AX
   197  	JEQ	bad		// Reached end of buffer or >10 bytes
   198  
   199  	MOVBLZX	(SI)(BX*1), DI	// Load next byte
   200  	INCQ	SI
   201  	BTRL	$7, DI		// Is bit 7 set? Clear bit 7.
   202  	JNC	last		// If not set, this is the final byte
   203  	SHLQ	CL, DI		// value |= value << shift
   204  	ORQ	DI, DX
   205  	ADDQ	$7, CX		// shift += 7
   206  	JMP	loop
   207  
   208  last:
   209  	SHLQ	CL, DI		// value |= value << shift
   210  	ORQ	DI, DX
   211  	// Return decoded value and length.
   212  	MOVQ	DX, x+24(FP)
   213  	MOVQ	SI, n+32(FP)
   214  	RET
   215  
   216  bad:
   217  	MOVQ	$0, x+24(FP)
   218  	MOVQ	$0, n+32(FP)
   219  	RET
   220  
   221  TEXT ·decodeVarintAsm2(SB),NOSPLIT,$0-40
   222  	MOVQ	buf_base+0(FP), BX
   223  	MOVQ	buf_len+8(FP), CX
   224  
   225  	// Take the slow path if there's no BMI2 or there are fewer
   226  	// than 8 bytes available.
   227  	MOVBLZX	·hasBMI2(SB), AX
   228  	TESTB	AL, AL
   229  	JEQ	slowpath
   230  	CMPQ	CX, $8
   231  	JLT	slowpath
   232  
   233  	// Load 8 bytes from buf.
   234  	MOVQ	(BX), AX
   235  
   236  	// Get continuation bit mask into DX.
   237  	MOVQ	$0x7f7f7f7f7f7f7f7f, DI
   238  	MOVQ	AX, DX
   239  	ORQ	DI, DX
   240  	// Compute bit length of varint into CX.
   241  	NOTQ	DX
   242  	BSFQ	DX, CX
   243  	// If all continuation bits are set, take the slow path.
   244  	JZ	slowpath
   245  	// Compute bit extraction mask into R14.
   246  	//BLSMSKQ	DX, R14		// Requires BMI1
   247  	BYTE $0xc4; BYTE $0xe2; BYTE $0x88; BYTE $0xf3; BYTE $0xd2
   248  	// Mask the value.
   249  	ANDQ	R14, AX
   250  	// Extract the bits.
   251  	PEXTQ	DI, AX, DX	// Requires BMI2
   252  
   253  	// Compute byte length. 7=>1, 15=>2, etc.
   254  	INCQ	CX
   255  	SHRQ	$3, CX
   256  
   257  	// Return decoded value and length.
   258  	MOVQ	DX, x+24(FP)
   259  	MOVQ	CX, n+32(FP)
   260  	RET
   261  
   262  slowpath:
   263  	// Consume buffer one byte at a time.
   264  	// TODO: Could merge with some of the above registers better.
   265  	MOVQ	buf_base+0(FP), BX	// Pointer
   266  	MOVQ	buf_len+8(FP), AX	// Length
   267  	MOVQ	$10, CX
   268  	CMPQ	AX, CX
   269  	CMOVQGT	CX, AX		// Length is at most 10
   270  	XORQ	SI, SI		// Index
   271  	XORQ	CX, CX		// Shift
   272  	XORQ	DX, DX		// Value
   273  
   274  loop:
   275  	CMPQ	SI, AX
   276  	JEQ	bad		// Reached end of buffer or >10 bytes
   277  
   278  	MOVBLZX	(SI)(BX*1), DI	// Load next byte
   279  	INCQ	SI
   280  	BTRL	$7, DI		// Is bit 7 set? Clear bit 7.
   281  	JNC	last		// If not set, this is the final byte
   282  	SHLQ	CL, DI		// value |= value << shift
   283  	ORQ	DI, DX
   284  	ADDQ	$7, CX		// shift += 7
   285  	JMP	loop
   286  
   287  last:
   288  	SHLQ	CL, DI		// value |= value << shift
   289  	ORQ	DI, DX
   290  	// Return decoded value and length.
   291  	MOVQ	DX, x+24(FP)
   292  	MOVQ	SI, n+32(FP)
   293  	RET
   294  
   295  bad:
   296  	MOVQ	$0, x+24(FP)
   297  	MOVQ	$0, n+32(FP)
   298  	RET