github.com/songzhibin97/gkit@v1.2.13/sys/xxhash3/internal/avo/sse.go (about) 1 //go:build ignore 2 // +build ignore 3 4 package avo 5 6 func SSE2() { 7 8 primeData := GLOBL("prime_sse", RODATA|NOPTR) 9 DATA(0, U32(2654435761)) 10 DATA(4, U32(2654435761)) 11 DATA(8, U32(2654435761)) 12 DATA(12, U32(2654435761)) 13 14 TEXT("accumSSE2", NOSPLIT, "func(acc *[8]uint64, xinput, xsecret *byte, len uint64)") 15 16 acc := Mem{Base: Load(Param("acc"), GP64())} 17 xinput := Mem{Base: Load(Param("xinput"), GP64())} 18 xsecret := Mem{Base: Load(Param("xsecret"), GP64())} 19 skey := Mem{Base: Load(Param("xsecret"), GP64())} 20 plen := Load(Param("len"), GP64()) 21 prime := XMM() 22 a := [4]VecVirtual{XMM(), XMM(), XMM(), XMM()} 23 24 MOVOU(acc.Offset(0x00), a[0]) 25 MOVOU(acc.Offset(0x10), a[1]) 26 MOVOU(acc.Offset(0x20), a[2]) 27 MOVOU(acc.Offset(0x30), a[3]) 28 MOVOU(primeData, prime) 29 30 // Loops over block, process 16*8*8=1024 bytes of data each iteration 31 Label("accumBlock") 32 { 33 CMPQ(plen, U32(1024)) 34 JLE(LabelRef("accumStripe")) 35 36 for i := 0; i < 0x10; i++ { 37 for n := 0; n < 4; n++ { 38 x0, x1, x2 := XMM(), XMM(), XMM() 39 // data_vec = xinput[i] 40 MOVOU(xinput.Offset(0x40*i+n*0x10), x0) 41 // key_vec = xsecret[i] 42 MOVOU(xsecret.Offset(8*i+n*0x10), x1) 43 // data_key = data_vec ^ key_vec 44 PXOR(x0, x1) 45 // product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff) 46 PSHUFD(Imm(0x31), x1, x2) 47 PMULULQ(x1, x2) 48 // xacc[i] += swap(data_vec) 49 PSHUFD(Imm(0x4e), x0, x0) 50 PADDQ(x0, a[n]) 51 // xacc[i] += product 52 PADDQ(x2, a[n]) 53 } 54 } 55 ADDQ(U32(0x10*0x40), xinput.Base) 56 SUBQ(U32(0x10*0x40), plen) 57 58 // scramble xacc 59 for n := 0; n < 4; n++ { 60 x0 := XMM() 61 MOVOU(a[n], x0) 62 // xacc[i] ^= (xacc[i] >> 47) 63 PSRLQ(Imm(0x2f), a[n]) 64 PXOR(x0, a[n]) 65 // xacc[i] ^= xsecret 66 PXOR(xsecret.Offset(8*0x10+n*0x10), a[n]) 67 PSHUFD(Imm(0xf5), a[n], x0) 68 PMULULQ(prime, x0) 69 // xacc[i] *= prime; 70 PSLLQ(Imm(0x20), x0) 71 PMULULQ(prime, a[n]) 72 PADDQ(x0, a[n]) 73 } 74 JMP(LabelRef("accumBlock")) 75 } 76 77 // last partial block (64 bytes) 78 Label("accumStripe") 79 { 80 CMPQ(plen, Imm(0x40)) 81 JLE(LabelRef("accumLastStripe")) 82 83 for n := 0; n < 4; n++ { 84 x0, x1, x2 := XMM(), XMM(), XMM() 85 // data_vec = xinput[i] 86 MOVOU(xinput.Offset(n*0x10), x0) 87 // key_vec = xsecret[i] 88 MOVOU(skey.Offset(n*0x10), x1) 89 // data_key = data_vec ^ key_vec 90 PXOR(x0, x1) 91 // product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff) 92 PSHUFD(Imm(0x31), x1, x2) 93 PMULULQ(x1, x2) 94 // xacc[i] += swap(data_vec) 95 PSHUFD(Imm(0x4e), x0, x0) 96 PADDQ(x0, a[n]) 97 // xacc[i] += product 98 PADDQ(x2, a[n]) 99 } 100 ADDQ(U32(0x40), xinput.Base) 101 SUBQ(U32(0x40), plen) 102 ADDQ(U32(8), skey.Base) 103 104 JMP(LabelRef("accumStripe")) 105 } 106 107 // last stripe 16 bytes 108 Label("accumLastStripe") 109 { 110 CMPQ(plen, Imm(0)) 111 JE(LabelRef("return")) 112 113 SUBQ(Imm(0x40), xinput.Base) 114 ADDQ(plen, xinput.Base) 115 116 for n := 0; n < 4; n++ { 117 x0, x1, x2 := XMM(), XMM(), XMM() 118 // data_vec = xinput[i] 119 MOVOU(xinput.Offset(n*0x10), x0) 120 // key_vec = xsecret[i] 121 MOVOU(xsecret.Offset(121+n*0x10), x1) 122 // data_key = data_vec ^ key_vec 123 PXOR(x0, x1) 124 // product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff) 125 PSHUFD(Imm(0x31), x1, x2) 126 PMULULQ(x1, x2) 127 // xacc[i] += swap(data_vec) 128 PSHUFD(Imm(0x4e), x0, x0) 129 PADDQ(x0, a[n]) 130 // xacc[i] += product 131 PADDQ(x2, a[n]) 132 } 133 } 134 135 Label("return") 136 { 137 MOVOU(a[0], acc.Offset(0x00)) 138 MOVOU(a[1], acc.Offset(0x10)) 139 MOVOU(a[2], acc.Offset(0x20)) 140 MOVOU(a[3], acc.Offset(0x30)) 141 RET() 142 } 143 144 Generate() 145 }