github.com/benz9527/xboot@v0.0.0-20240504061247-c23f15593274/lib/kv/simd/asm.go (about)

     1  package main
     2  
     3  import (
     4  	. "github.com/mmcloughlin/avo/build"
     5  	. "github.com/mmcloughlin/avo/operand"
     6  )
     7  
     8  /*
     9  	SIMD
    10  	1. Load group (A), uint8 * 16
    11  	---------------------------------------------     ------------
    12  	| 01010111 | 11111111 | 00110110 | 11111111 | ... | 11111111 |
    13  	---------------------------------------------     ------------
    14  
    15  	2. Set comparable 0b110110 (B), uint8 * 16
    16  	---------------------------------------------     ------------
    17  	| 00110110 | 00110110 | 00110110 | 00110110 | ... | 00110110 |
    18  	---------------------------------------------     ------------
    19  
    20  	3. Compare A and B, uint8 * 16
    21  	---------------------------------------------     ------------
    22  	| 00000000 | 00000000 | 11111111 | 00000000 | ... | 00000000 |
    23  	---------------------------------------------     ------------
    24  	                       (success!)
    25  
    26  	4. Mask values
    27  	---------------------------------------------     ------------
    28  	|    0     |    0     |    1     |    0     | ... |    0     |
    29  	---------------------------------------------     ------------
    30  	                         (true)
    31  */
    32  
    33  func main() {
    34  	ConstraintExpr("amd64")
    35  	ConstraintExpr("!nosimd")
    36  
    37  	TEXT("Fast16WayHashMatch", NOSPLIT, "func(md *[16]int8, hash int8) uint16")
    38  	Doc("Fast16WayHashMatch performs a 16-way linear probing of short hash (h2, metadata) list by SSE instructions",
    39  		"short hash list must be an aligned pointer")
    40  
    41  	// The AX store the md pointer address.
    42  	Comment("Move the pointer of md to register AX")
    43  	mem := Mem{Base: Load(Param("md"), GP64())}
    44  
    45  	// Assume that hash is 0b01100110.
    46  	// After extended into 32 bits, it becomes 0x00 00 00 66
    47  	Comment("Move the hash value (int8) from mem to register CX then extend the size to int32")
    48  	h := Load(Param("hash"), GP32())
    49  	mask := GP32()
    50  
    51  	// XMM 128 bits register
    52  	x0, x1 := XMM(), XMM()
    53  
    54  	// After movd instruction, X0/128bits becomes 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 66.
    55  	// windows storage order is little endian order.
    56  	// 0x1234 => low address: 0x32; high address: 0x12.
    57  	// So the asm read index 0 byte from low address.
    58  	// In X0, we load the index 0 byte will be 0x66.
    59  	Comment("Copy hash value from register CX to XMM register X0")
    60  	Comment("XMM registers are used by SSE or AVX instructions")
    61  	MOVD(h, x0)
    62  
    63  	// Parallel XOR
    64  	Comment("Clear the XMM register X1")
    65  	PXOR(x1, x1)
    66  
    67  	// https://www.felixcloutier.com/x86/pshufb
    68  	// PSHUFB xmm1, xmm2/m128
    69  	// Shuffle bytes in xmm1 according to contents of xmm2/m128.
    70  	// xmm2 is the shuffle control mask.
    71  	// If the most significant bit (bit[7]) of each byte of the shuffle
    72  	// control mask (xmm2) is set, then constant zero is written in the
    73  	// result byte.
    74  	//
    75  	// for i = 0 to 15 {
    76  	//   if (SRC2[(i * 8)+7] = 1) then
    77  	//     DEST[(i*8)+7..(i*8)+0] := 0;
    78  	//     else
    79  	//     index[3..0] := SRC2[(i*8)+3 .. (i*8)+0];
    80  	//     DEST[(i*8)+7..(i*8)+0] := SRC1[(index*8+7)..(index*8+0)];
    81  	//   endif
    82  	// }
    83  	// DEST[MAXVL-1:128] := 0
    84  	//
    85  	// But in go plan9 asm, the xmm2 is the destination register.
    86  	// x1 all bits are zero, like 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00.
    87  	// After the pshufb, the x0 will be 0x66 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66.
    88  	Comment("Packed Shuffle Bytes instruction, let hash value in register X0 xor with X1 by byte to generate mask to X0")
    89  	PSHUFB(x1, x0)
    90  
    91  	// Plan9 MOVOU (move vector of oct-words unaligned)
    92  	// An SSE instruction to load 128-bit data from memory to register.
    93  	// The mem is AX actually.
    94  	Comment("Load the metadata from memory to register X1")
    95  	Comment("(AX) means de-reference of address value in AX")
    96  	MOVOU(mem, x1)
    97  
    98  	Comment("Packed Compare for Equal Byte instruction, compare X1 and X0 by byte then store into X0")
    99  	Comment("The same byte are 0xFF. Otherwise, they are 0x00")
   100  	PCMPEQB(x1, x0)
   101  
   102  	Comment("Packed Move with Mask Signed Byte, Extract X0 hi part and convert into int16 then store into AX")
   103  	Comment("The X0 lo part is unused usually")
   104  	Comment("Now the each bit of AX mapping to the each hash of metadata array whether equals to target")
   105  	PMOVMSKB(x0, mask)
   106  
   107  	Comment("Copy the AX value to mem then return")
   108  	Store(mask.As16(), ReturnIndex(0))
   109  	RET()
   110  
   111  	Generate()
   112  }