github.com/songzhibin97/gkit@v1.2.13/sys/xxhash3/internal/avo/sse.go (about)

     1  //go:build ignore
     2  // +build ignore
     3  
     4  package avo
     5  
     6  func SSE2() {
     7  
     8  	primeData := GLOBL("prime_sse", RODATA|NOPTR)
     9  	DATA(0, U32(2654435761))
    10  	DATA(4, U32(2654435761))
    11  	DATA(8, U32(2654435761))
    12  	DATA(12, U32(2654435761))
    13  
    14  	TEXT("accumSSE2", NOSPLIT, "func(acc *[8]uint64, xinput, xsecret *byte, len uint64)")
    15  
    16  	acc := Mem{Base: Load(Param("acc"), GP64())}
    17  	xinput := Mem{Base: Load(Param("xinput"), GP64())}
    18  	xsecret := Mem{Base: Load(Param("xsecret"), GP64())}
    19  	skey := Mem{Base: Load(Param("xsecret"), GP64())}
    20  	plen := Load(Param("len"), GP64())
    21  	prime := XMM()
    22  	a := [4]VecVirtual{XMM(), XMM(), XMM(), XMM()}
    23  
    24  	MOVOU(acc.Offset(0x00), a[0])
    25  	MOVOU(acc.Offset(0x10), a[1])
    26  	MOVOU(acc.Offset(0x20), a[2])
    27  	MOVOU(acc.Offset(0x30), a[3])
    28  	MOVOU(primeData, prime)
    29  
    30  	// Loops over block, process 16*8*8=1024 bytes of data each iteration
    31  	Label("accumBlock")
    32  	{
    33  		CMPQ(plen, U32(1024))
    34  		JLE(LabelRef("accumStripe"))
    35  
    36  		for i := 0; i < 0x10; i++ {
    37  			for n := 0; n < 4; n++ {
    38  				x0, x1, x2 := XMM(), XMM(), XMM()
    39  				// data_vec    = xinput[i]
    40  				MOVOU(xinput.Offset(0x40*i+n*0x10), x0)
    41  				// key_vec     = xsecret[i]
    42  				MOVOU(xsecret.Offset(8*i+n*0x10), x1)
    43  				// data_key    = data_vec ^ key_vec
    44  				PXOR(x0, x1)
    45  				// product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff)
    46  				PSHUFD(Imm(0x31), x1, x2)
    47  				PMULULQ(x1, x2)
    48  				// xacc[i] += swap(data_vec)
    49  				PSHUFD(Imm(0x4e), x0, x0)
    50  				PADDQ(x0, a[n])
    51  				// xacc[i] += product
    52  				PADDQ(x2, a[n])
    53  			}
    54  		}
    55  		ADDQ(U32(0x10*0x40), xinput.Base)
    56  		SUBQ(U32(0x10*0x40), plen)
    57  
    58  		// scramble xacc
    59  		for n := 0; n < 4; n++ {
    60  			x0 := XMM()
    61  			MOVOU(a[n], x0)
    62  			// xacc[i] ^= (xacc[i] >> 47)
    63  			PSRLQ(Imm(0x2f), a[n])
    64  			PXOR(x0, a[n])
    65  			// xacc[i] ^= xsecret
    66  			PXOR(xsecret.Offset(8*0x10+n*0x10), a[n])
    67  			PSHUFD(Imm(0xf5), a[n], x0)
    68  			PMULULQ(prime, x0)
    69  			// xacc[i] *= prime;
    70  			PSLLQ(Imm(0x20), x0)
    71  			PMULULQ(prime, a[n])
    72  			PADDQ(x0, a[n])
    73  		}
    74  		JMP(LabelRef("accumBlock"))
    75  	}
    76  
    77  	// last partial block (64 bytes)
    78  	Label("accumStripe")
    79  	{
    80  		CMPQ(plen, Imm(0x40))
    81  		JLE(LabelRef("accumLastStripe"))
    82  
    83  		for n := 0; n < 4; n++ {
    84  			x0, x1, x2 := XMM(), XMM(), XMM()
    85  			// data_vec    = xinput[i]
    86  			MOVOU(xinput.Offset(n*0x10), x0)
    87  			// key_vec     = xsecret[i]
    88  			MOVOU(skey.Offset(n*0x10), x1)
    89  			// data_key    = data_vec ^ key_vec
    90  			PXOR(x0, x1)
    91  			// product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff)
    92  			PSHUFD(Imm(0x31), x1, x2)
    93  			PMULULQ(x1, x2)
    94  			// xacc[i] += swap(data_vec)
    95  			PSHUFD(Imm(0x4e), x0, x0)
    96  			PADDQ(x0, a[n])
    97  			// xacc[i] += product
    98  			PADDQ(x2, a[n])
    99  		}
   100  		ADDQ(U32(0x40), xinput.Base)
   101  		SUBQ(U32(0x40), plen)
   102  		ADDQ(U32(8), skey.Base)
   103  
   104  		JMP(LabelRef("accumStripe"))
   105  	}
   106  
   107  	// last stripe 16 bytes
   108  	Label("accumLastStripe")
   109  	{
   110  		CMPQ(plen, Imm(0))
   111  		JE(LabelRef("return"))
   112  
   113  		SUBQ(Imm(0x40), xinput.Base)
   114  		ADDQ(plen, xinput.Base)
   115  
   116  		for n := 0; n < 4; n++ {
   117  			x0, x1, x2 := XMM(), XMM(), XMM()
   118  			// data_vec    = xinput[i]
   119  			MOVOU(xinput.Offset(n*0x10), x0)
   120  			// key_vec     = xsecret[i]
   121  			MOVOU(xsecret.Offset(121+n*0x10), x1)
   122  			// data_key    = data_vec ^ key_vec
   123  			PXOR(x0, x1)
   124  			// product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff)
   125  			PSHUFD(Imm(0x31), x1, x2)
   126  			PMULULQ(x1, x2)
   127  			// xacc[i] += swap(data_vec)
   128  			PSHUFD(Imm(0x4e), x0, x0)
   129  			PADDQ(x0, a[n])
   130  			// xacc[i] += product
   131  			PADDQ(x2, a[n])
   132  		}
   133  	}
   134  
   135  	Label("return")
   136  	{
   137  		MOVOU(a[0], acc.Offset(0x00))
   138  		MOVOU(a[1], acc.Offset(0x10))
   139  		MOVOU(a[2], acc.Offset(0x20))
   140  		MOVOU(a[3], acc.Offset(0x30))
   141  		RET()
   142  	}
   143  
   144  	Generate()
   145  }