github.com/songzhibin97/gkit@v1.2.13/sys/xxhash3/internal/avo/avx.go (about)

     1  //go:build ignore
     2  // +build ignore
     3  
     4  package avo
     5  
     6  func AVX2() {
     7  
     8  	primeData := GLOBL("prime_avx", RODATA|NOPTR)
     9  	DATA(0, U32(2654435761))
    10  	TEXT("accumAVX2", NOSPLIT, "func(acc *[8]uint64, xinput, xsecret *byte, len uint64)")
    11  
    12  	acc := Mem{Base: Load(Param("acc"), GP64())}
    13  	xinput := Mem{Base: Load(Param("xinput"), GP64())}
    14  	xsecret := Mem{Base: Load(Param("xsecret"), GP64())}
    15  	skey := Mem{Base: Load(Param("xsecret"), GP64())}
    16  	plen := Load(Param("len"), GP64())
    17  	prime := YMM()
    18  	a := [...]VecVirtual{YMM(), YMM()}
    19  
    20  	VMOVDQU(acc.Offset(0x00), a[0])
    21  	VMOVDQU(acc.Offset(0x20), a[1])
    22  	VPBROADCASTQ(primeData, prime)
    23  
    24  	// Loops over block, process 16*8*8=1024 bytes of data each iteration
    25  	Label("accumBlock")
    26  	{
    27  		CMPQ(plen, U32(1024))
    28  		JLE(LabelRef("accumStripe"))
    29  
    30  		for i := 0; i < 8; i++ {
    31  			y0, y1, y2, y3, y4, y5, y6, y7, y8 := YMM(), YMM(), YMM(), YMM(), YMM(), YMM(), YMM(), YMM(), YMM()
    32  			//data_vec    = xinput[i]
    33  			VMOVDQU(xinput.Offset(128*i), y0)
    34  			//key_vec     = xsecret[i]
    35  			VMOVDQU(xsecret.Offset(16*i), y1)
    36  			VMOVDQU(xinput.Offset(128*i+0x20), y3)
    37  			VMOVDQU(xsecret.Offset(16*i+0x20), y4)
    38  			VMOVDQU(xinput.Offset(128*i+0x40), y5)
    39  			VMOVDQU(xsecret.Offset(16*i+0x8), y6)
    40  			VMOVDQU(xinput.Offset(128*i+0x60), y7)
    41  			VMOVDQU(xsecret.Offset(16*i+0x28), y8)
    42  
    43  			// data_key    = data_vec ^ key_vec
    44  			VPXOR(y0, y1, y1)
    45  			// data_key_lo = data_key >> 32
    46  			VPSRLQ(Imm(0x20), y1, y2)
    47  			VPSHUFD(Imm(0x4e), y0, y0)
    48  			// product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff)
    49  			VPMULUDQ(y1, y2, y2)
    50  			// xacc[i] += swap(data_vec)
    51  			VPADDQ(a[0], y0, a[0])
    52  			// xacc[i] += product
    53  			VPADDQ(a[0], y2, a[0])
    54  
    55  			VPXOR(y3, y4, y4)
    56  			VPSRLQ(Imm(0x20), y4, y2)
    57  			VPSHUFD(Imm(0x4e), y3, y3)
    58  			VPMULUDQ(y4, y2, y2)
    59  			VPADDQ(a[1], y3, a[1])
    60  			VPADDQ(a[1], y2, a[1])
    61  
    62  			VPXOR(y5, y6, y6)
    63  			VPSRLQ(Imm(0x20), y6, y2)
    64  			VPSHUFD(Imm(0x4e), y5, y5)
    65  			VPMULUDQ(y6, y2, y2)
    66  			VPADDQ(a[0], y5, a[0])
    67  			VPADDQ(a[0], y2, a[0])
    68  
    69  			VPXOR(y7, y8, y8)
    70  			VPSRLQ(Imm(0x20), y8, y2)
    71  			VPSHUFD(Imm(0x4e), y7, y7)
    72  			VPMULUDQ(y8, y2, y2)
    73  			VPADDQ(a[1], y7, a[1])
    74  			VPADDQ(a[1], y2, a[1])
    75  		}
    76  
    77  		ADDQ(U32(16*64), xinput.Base)
    78  		SUBQ(U32(16*64), plen)
    79  
    80  		y0, y1 := YMM(), YMM()
    81  
    82  		// xacc[i] ^= (xacc[i] >> 47)
    83  		VPSRLQ(Imm(0x2f), a[0], y0)
    84  		VPXOR(a[0], y0, y0)
    85  		// xacc[i] ^= xsecret
    86  		VPXOR(xsecret.Offset(0x80), y0, y0)
    87  		VPMULUDQ(prime, y0, y1)
    88  		// xacc[i] *= prime;
    89  		VPSRLQ(Imm(0x20), y0, y0)
    90  		VPMULUDQ(prime, y0, y0)
    91  		VPSLLQ(Imm(0x20), y0, y0)
    92  		VPADDQ(y1, y0, a[0])
    93  
    94  		VPSRLQ(Imm(0x2f), a[1], y0)
    95  		VPXOR(a[1], y0, y0)
    96  		VPXOR(xsecret.Offset(0xa0), y0, y0)
    97  		VPMULUDQ(prime, y0, y1)
    98  		VPSRLQ(Imm(0x20), y0, y0)
    99  		VPMULUDQ(prime, y0, y0)
   100  		VPSLLQ(Imm(0x20), y0, y0)
   101  		VPADDQ(y1, y0, a[1])
   102  		JMP(LabelRef("accumBlock"))
   103  	}
   104  
   105  	// last partial block (64 bytes)
   106  	Label("accumStripe")
   107  	{
   108  		CMPQ(plen, Imm(64))
   109  		JLE(LabelRef("accumLastStripe"))
   110  
   111  		y0, y1, y2, y3, y4 := YMM(), YMM(), YMM(), YMM(), YMM()
   112  		VMOVDQU(xinput.Offset(0), y0)
   113  		VMOVDQU(skey.Offset(0), y1)
   114  		VMOVDQU(xinput.Offset(0x20), y3)
   115  		VMOVDQU(skey.Offset(0x20), y4)
   116  
   117  		// data_key    = data_vec ^ key_vec
   118  		VPXOR(y0, y1, y1)
   119  		// data_key_lo = data_key >> 32
   120  		VPSRLQ(Imm(0x20), y1, y2)
   121  		VPSHUFD(Imm(0x4e), y0, y0)
   122  		// product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff)
   123  		VPMULUDQ(y1, y2, y2)
   124  		// xacc[i] += swap(data_vec)
   125  		VPADDQ(a[0], y0, a[0])
   126  		// xacc[i] += product
   127  		VPADDQ(a[0], y2, a[0])
   128  
   129  		VPXOR(y3, y4, y4)
   130  		VPSRLQ(Imm(0x20), y4, y2)
   131  		VPMULUDQ(y4, y2, y2)
   132  		VPSHUFD(Imm(0x4e), y3, y3)
   133  		VPADDQ(a[1], y3, a[1])
   134  		VPADDQ(a[1], y2, a[1])
   135  
   136  		ADDQ(U32(64), xinput.Base)
   137  		SUBQ(U32(64), plen)
   138  		ADDQ(U32(8), skey.Base)
   139  
   140  		JMP(LabelRef("accumStripe"))
   141  	}
   142  
   143  	// last stripe 16 bytes
   144  	Label("accumLastStripe")
   145  	{
   146  		CMPQ(plen, Imm(0))
   147  		JE(LabelRef("return"))
   148  
   149  		SUBQ(Imm(64), xinput.Base)
   150  		ADDQ(plen, xinput.Base)
   151  
   152  		y0, y1, y2, y3, y4 := YMM(), YMM(), YMM(), YMM(), YMM()
   153  		VMOVDQU(xinput.Offset(0), y0)
   154  		VMOVDQU(xsecret.Offset(0x79), y1)
   155  		VMOVDQU(xinput.Offset(0x20), y3)
   156  		VMOVDQU(xsecret.Offset(0x79+0x20), y4)
   157  
   158  		// data_key    = data_vec ^ key_vec
   159  		VPXOR(y0, y1, y1)
   160  		// data_key_lo = data_key >> 32
   161  		VPSRLQ(Imm(0x20), y1, y2)
   162  		VPSHUFD(Imm(0x4e), y0, y0)
   163  		// product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff)
   164  		VPMULUDQ(y1, y2, y2)
   165  		// xacc[i] += swap(data_vec)
   166  		VPADDQ(a[0], y0, a[0])
   167  		// xacc[i] += product
   168  		VPADDQ(a[0], y2, a[0])
   169  
   170  		VPXOR(y3, y4, y4)
   171  		VPSRLQ(Imm(0x20), y4, y2)
   172  		VPMULUDQ(y4, y2, y2)
   173  		VPSHUFD(Imm(0x4e), y3, y3)
   174  		VPADDQ(a[1], y3, a[1])
   175  		VPADDQ(a[1], y2, a[1])
   176  	}
   177  
   178  	Label("return")
   179  	{
   180  		VMOVDQU(a[0], acc.Offset(0x00))
   181  		VMOVDQU(a[1], acc.Offset(0x20))
   182  		RET()
   183  	}
   184  
   185  	Generate()
   186  }