github.com/songzhibin97/go-baseutils@v0.0.2-0.20240302024150-487d8ce9c082/sys/xxhash3/internal/avo/avx.go (about) 1 //go:build ignore 2 // +build ignore 3 4 package avo 5 6 func AVX2() { 7 8 primeData := GLOBL("prime_avx", RODATA|NOPTR) 9 DATA(0, U32(2654435761)) 10 TEXT("accumAVX2", NOSPLIT, "func(acc *[8]uint64, xinput, xsecret *byte, len uint64)") 11 12 acc := Mem{Base: Load(Param("acc"), GP64())} 13 xinput := Mem{Base: Load(Param("xinput"), GP64())} 14 xsecret := Mem{Base: Load(Param("xsecret"), GP64())} 15 skey := Mem{Base: Load(Param("xsecret"), GP64())} 16 plen := Load(Param("len"), GP64()) 17 prime := YMM() 18 a := [...]VecVirtual{YMM(), YMM()} 19 20 VMOVDQU(acc.Offset(0x00), a[0]) 21 VMOVDQU(acc.Offset(0x20), a[1]) 22 VPBROADCASTQ(primeData, prime) 23 24 // Loops over block, process 16*8*8=1024 bytes of data each iteration 25 Label("accumBlock") 26 { 27 CMPQ(plen, U32(1024)) 28 JLE(LabelRef("accumStripe")) 29 30 for i := 0; i < 8; i++ { 31 y0, y1, y2, y3, y4, y5, y6, y7, y8 := YMM(), YMM(), YMM(), YMM(), YMM(), YMM(), YMM(), YMM(), YMM() 32 //data_vec = xinput[i] 33 VMOVDQU(xinput.Offset(128*i), y0) 34 //key_vec = xsecret[i] 35 VMOVDQU(xsecret.Offset(16*i), y1) 36 VMOVDQU(xinput.Offset(128*i+0x20), y3) 37 VMOVDQU(xsecret.Offset(16*i+0x20), y4) 38 VMOVDQU(xinput.Offset(128*i+0x40), y5) 39 VMOVDQU(xsecret.Offset(16*i+0x8), y6) 40 VMOVDQU(xinput.Offset(128*i+0x60), y7) 41 VMOVDQU(xsecret.Offset(16*i+0x28), y8) 42 43 // data_key = data_vec ^ key_vec 44 VPXOR(y0, y1, y1) 45 // data_key_lo = data_key >> 32 46 VPSRLQ(Imm(0x20), y1, y2) 47 VPSHUFD(Imm(0x4e), y0, y0) 48 // product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff) 49 VPMULUDQ(y1, y2, y2) 50 // xacc[i] += swap(data_vec) 51 VPADDQ(a[0], y0, a[0]) 52 // xacc[i] += product 53 VPADDQ(a[0], y2, a[0]) 54 55 VPXOR(y3, y4, y4) 56 VPSRLQ(Imm(0x20), y4, y2) 57 VPSHUFD(Imm(0x4e), y3, y3) 58 VPMULUDQ(y4, y2, y2) 59 VPADDQ(a[1], y3, a[1]) 60 VPADDQ(a[1], y2, a[1]) 61 62 VPXOR(y5, y6, y6) 63 VPSRLQ(Imm(0x20), y6, y2) 64 VPSHUFD(Imm(0x4e), y5, y5) 65 VPMULUDQ(y6, y2, y2) 66 VPADDQ(a[0], y5, a[0]) 67 VPADDQ(a[0], y2, a[0]) 68 69 VPXOR(y7, y8, y8) 70 VPSRLQ(Imm(0x20), y8, y2) 71 VPSHUFD(Imm(0x4e), y7, y7) 72 VPMULUDQ(y8, y2, y2) 73 VPADDQ(a[1], y7, a[1]) 74 VPADDQ(a[1], y2, a[1]) 75 } 76 77 ADDQ(U32(16*64), xinput.Base) 78 SUBQ(U32(16*64), plen) 79 80 y0, y1 := YMM(), YMM() 81 82 // xacc[i] ^= (xacc[i] >> 47) 83 VPSRLQ(Imm(0x2f), a[0], y0) 84 VPXOR(a[0], y0, y0) 85 // xacc[i] ^= xsecret 86 VPXOR(xsecret.Offset(0x80), y0, y0) 87 VPMULUDQ(prime, y0, y1) 88 // xacc[i] *= prime; 89 VPSRLQ(Imm(0x20), y0, y0) 90 VPMULUDQ(prime, y0, y0) 91 VPSLLQ(Imm(0x20), y0, y0) 92 VPADDQ(y1, y0, a[0]) 93 94 VPSRLQ(Imm(0x2f), a[1], y0) 95 VPXOR(a[1], y0, y0) 96 VPXOR(xsecret.Offset(0xa0), y0, y0) 97 VPMULUDQ(prime, y0, y1) 98 VPSRLQ(Imm(0x20), y0, y0) 99 VPMULUDQ(prime, y0, y0) 100 VPSLLQ(Imm(0x20), y0, y0) 101 VPADDQ(y1, y0, a[1]) 102 JMP(LabelRef("accumBlock")) 103 } 104 105 // last partial block (64 bytes) 106 Label("accumStripe") 107 { 108 CMPQ(plen, Imm(64)) 109 JLE(LabelRef("accumLastStripe")) 110 111 y0, y1, y2, y3, y4 := YMM(), YMM(), YMM(), YMM(), YMM() 112 VMOVDQU(xinput.Offset(0), y0) 113 VMOVDQU(skey.Offset(0), y1) 114 VMOVDQU(xinput.Offset(0x20), y3) 115 VMOVDQU(skey.Offset(0x20), y4) 116 117 // data_key = data_vec ^ key_vec 118 VPXOR(y0, y1, y1) 119 // data_key_lo = data_key >> 32 120 VPSRLQ(Imm(0x20), y1, y2) 121 VPSHUFD(Imm(0x4e), y0, y0) 122 // product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff) 123 VPMULUDQ(y1, y2, y2) 124 // xacc[i] += swap(data_vec) 125 VPADDQ(a[0], y0, a[0]) 126 // xacc[i] += product 127 VPADDQ(a[0], y2, a[0]) 128 129 VPXOR(y3, y4, y4) 130 VPSRLQ(Imm(0x20), y4, y2) 131 VPMULUDQ(y4, y2, y2) 132 VPSHUFD(Imm(0x4e), y3, y3) 133 VPADDQ(a[1], y3, a[1]) 134 VPADDQ(a[1], y2, a[1]) 135 136 ADDQ(U32(64), xinput.Base) 137 SUBQ(U32(64), plen) 138 ADDQ(U32(8), skey.Base) 139 140 JMP(LabelRef("accumStripe")) 141 } 142 143 // last stripe 16 bytes 144 Label("accumLastStripe") 145 { 146 CMPQ(plen, Imm(0)) 147 JE(LabelRef("return")) 148 149 SUBQ(Imm(64), xinput.Base) 150 ADDQ(plen, xinput.Base) 151 152 y0, y1, y2, y3, y4 := YMM(), YMM(), YMM(), YMM(), YMM() 153 VMOVDQU(xinput.Offset(0), y0) 154 VMOVDQU(xsecret.Offset(0x79), y1) 155 VMOVDQU(xinput.Offset(0x20), y3) 156 VMOVDQU(xsecret.Offset(0x79+0x20), y4) 157 158 // data_key = data_vec ^ key_vec 159 VPXOR(y0, y1, y1) 160 // data_key_lo = data_key >> 32 161 VPSRLQ(Imm(0x20), y1, y2) 162 VPSHUFD(Imm(0x4e), y0, y0) 163 // product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff) 164 VPMULUDQ(y1, y2, y2) 165 // xacc[i] += swap(data_vec) 166 VPADDQ(a[0], y0, a[0]) 167 // xacc[i] += product 168 VPADDQ(a[0], y2, a[0]) 169 170 VPXOR(y3, y4, y4) 171 VPSRLQ(Imm(0x20), y4, y2) 172 VPMULUDQ(y4, y2, y2) 173 VPSHUFD(Imm(0x4e), y3, y3) 174 VPADDQ(a[1], y3, a[1]) 175 VPADDQ(a[1], y2, a[1]) 176 } 177 178 Label("return") 179 { 180 VMOVDQU(a[0], acc.Offset(0x00)) 181 VMOVDQU(a[1], acc.Offset(0x20)) 182 RET() 183 } 184 185 Generate() 186 }