// Code generated by command: go run gen.go -avx2 -out ./avx2.s. DO NOT EDIT.

// NOTE(review): this file is marked as generated. Two hand fixes are applied
// below (the 8-byte prime_avx constant and the VZEROUPPER before RET); they
// should be mirrored in gen.go (not visible here) or they will be lost the
// next time the file is regenerated.

#include "textflag.h"

// prime_avx is the 32-bit multiplier used by the scramble step.
// It is stored as a zero-extended quadword because VPBROADCASTQ performs a
// 64-bit load; declaring only 4 bytes would make that load read past the end
// of the symbol. Only the low 32 bits of each lane are consumed (VPMULUDQ).
DATA prime_avx<>+0(SB)/8, $0x000000009e3779b1
GLOBL prime_avx<>(SB), RODATA|NOPTR, $8

// func accumAVX2(acc *[8]uint64, xinput *byte, xsecret *byte, len uint64)
// Requires: AVX, AVX2
//
// accumAVX2 folds len bytes starting at xinput into the eight 64-bit
// accumulators at acc, keyed by the bytes at xsecret.
//
// Register use:
//   AX      acc pointer
//   CX      current input pointer
//   DX      secret base (never advanced)
//   BX      secret cursor for the per-stripe loop (advanced 8 bytes per stripe)
//   SI      remaining length
//   Y1, Y2  acc[0..3] and acc[4..7]
//   Y0      broadcast multiplier while in accumBlock; reused as scratch afterwards
//
// Phases:
//   accumBlock       while len > 1024: 16 stripes of 64 bytes, then scramble
//   accumStripe      while len > 64:   one 64-byte stripe per iteration
//   accumLastStripe  if len != 0:      the 64 bytes that END at the input end
//
// Per 32-byte half stripe: k = data ^ secret; acc += swap64(data) + lo32(k)*hi32(k),
// where swap64 exchanges the two quadwords inside each 128-bit lane.
//
// Memory accessed: secret bytes 0..191 (the furthest load is 160(DX), 32 bytes
// wide). The last-stripe path reads 64 bytes starting at (input end - 64), so
// it touches bytes before xinput when the original len is below 64.
// NOTE(review): presumably callers only use this for long inputs and a secret
// of at least 192 bytes - confirm against the Go call site.
// The length compares are signed (JLE), so len is assumed to be < 1<<63.
TEXT ·accumAVX2(SB), NOSPLIT, $0-32
	MOVQ acc+0(FP), AX
	MOVQ xinput+8(FP), CX
	MOVQ xsecret+16(FP), DX
	MOVQ xsecret+16(FP), BX
	MOVQ len+24(FP), SI
	VMOVDQU (AX), Y1
	VMOVDQU 32(AX), Y2
	VPBROADCASTQ prime_avx<>+0(SB), Y0

accumBlock:
	// Process full 1024-byte blocks only while more than 1024 bytes remain,
	// so at least one byte is always left for the tail handling below.
	CMPQ SI, $0x00000400
	JLE accumStripe

	// Stripes 0-1: input 0..127, secret offsets 0 and 8.
	VMOVDQU (CX), Y3
	VMOVDQU (DX), Y4
	VMOVDQU 32(CX), Y6
	VMOVDQU 32(DX), Y7
	VMOVDQU 64(CX), Y8
	VMOVDQU 8(DX), Y9
	VMOVDQU 96(CX), Y10
	VMOVDQU 40(DX), Y11
	VPXOR Y3, Y4, Y4
	VPSRLQ $0x20, Y4, Y5
	VPSHUFD $0x4e, Y3, Y3
	VPMULUDQ Y4, Y5, Y5
	VPADDQ Y1, Y3, Y1
	VPADDQ Y1, Y5, Y1
	VPXOR Y6, Y7, Y7
	VPSRLQ $0x20, Y7, Y5
	VPSHUFD $0x4e, Y6, Y6
	VPMULUDQ Y7, Y5, Y5
	VPADDQ Y2, Y6, Y2
	VPADDQ Y2, Y5, Y2
	VPXOR Y8, Y9, Y9
	VPSRLQ $0x20, Y9, Y5
	VPSHUFD $0x4e, Y8, Y8
	VPMULUDQ Y9, Y5, Y5
	VPADDQ Y1, Y8, Y1
	VPADDQ Y1, Y5, Y1
	VPXOR Y10, Y11, Y11
	VPSRLQ $0x20, Y11, Y5
	VPSHUFD $0x4e, Y10, Y10
	VPMULUDQ Y11, Y5, Y5
	VPADDQ Y2, Y10, Y2
	VPADDQ Y2, Y5, Y2

	// Stripes 2-3: input 128..255, secret offsets 16 and 24.
	VMOVDQU 128(CX), Y3
	VMOVDQU 16(DX), Y4
	VMOVDQU 160(CX), Y6
	VMOVDQU 48(DX), Y7
	VMOVDQU 192(CX), Y8
	VMOVDQU 24(DX), Y9
	VMOVDQU 224(CX), Y10
	VMOVDQU 56(DX), Y11
	VPXOR Y3, Y4, Y4
	VPSRLQ $0x20, Y4, Y5
	VPSHUFD $0x4e, Y3, Y3
	VPMULUDQ Y4, Y5, Y5
	VPADDQ Y1, Y3, Y1
	VPADDQ Y1, Y5, Y1
	VPXOR Y6, Y7, Y7
	VPSRLQ $0x20, Y7, Y5
	VPSHUFD $0x4e, Y6, Y6
	VPMULUDQ Y7, Y5, Y5
	VPADDQ Y2, Y6, Y2
	VPADDQ Y2, Y5, Y2
	VPXOR Y8, Y9, Y9
	VPSRLQ $0x20, Y9, Y5
	VPSHUFD $0x4e, Y8, Y8
	VPMULUDQ Y9, Y5, Y5
	VPADDQ Y1, Y8, Y1
	VPADDQ Y1, Y5, Y1
	VPXOR Y10, Y11, Y11
	VPSRLQ $0x20, Y11, Y5
	VPSHUFD $0x4e, Y10, Y10
	VPMULUDQ Y11, Y5, Y5
	VPADDQ Y2, Y10, Y2
	VPADDQ Y2, Y5, Y2

	// Stripes 4-5: input 256..383, secret offsets 32 and 40.
	VMOVDQU 256(CX), Y3
	VMOVDQU 32(DX), Y4
	VMOVDQU 288(CX), Y6
	VMOVDQU 64(DX), Y7
	VMOVDQU 320(CX), Y8
	VMOVDQU 40(DX), Y9
	VMOVDQU 352(CX), Y10
	VMOVDQU 72(DX), Y11
	VPXOR Y3, Y4, Y4
	VPSRLQ $0x20, Y4, Y5
	VPSHUFD $0x4e, Y3, Y3
	VPMULUDQ Y4, Y5, Y5
	VPADDQ Y1, Y3, Y1
	VPADDQ Y1, Y5, Y1
	VPXOR Y6, Y7, Y7
	VPSRLQ $0x20, Y7, Y5
	VPSHUFD $0x4e, Y6, Y6
	VPMULUDQ Y7, Y5, Y5
	VPADDQ Y2, Y6, Y2
	VPADDQ Y2, Y5, Y2
	VPXOR Y8, Y9, Y9
	VPSRLQ $0x20, Y9, Y5
	VPSHUFD $0x4e, Y8, Y8
	VPMULUDQ Y9, Y5, Y5
	VPADDQ Y1, Y8, Y1
	VPADDQ Y1, Y5, Y1
	VPXOR Y10, Y11, Y11
	VPSRLQ $0x20, Y11, Y5
	VPSHUFD $0x4e, Y10, Y10
	VPMULUDQ Y11, Y5, Y5
	VPADDQ Y2, Y10, Y2
	VPADDQ Y2, Y5, Y2

	// Stripes 6-7: input 384..511, secret offsets 48 and 56.
	VMOVDQU 384(CX), Y3
	VMOVDQU 48(DX), Y4
	VMOVDQU 416(CX), Y6
	VMOVDQU 80(DX), Y7
	VMOVDQU 448(CX), Y8
	VMOVDQU 56(DX), Y9
	VMOVDQU 480(CX), Y10
	VMOVDQU 88(DX), Y11
	VPXOR Y3, Y4, Y4
	VPSRLQ $0x20, Y4, Y5
	VPSHUFD $0x4e, Y3, Y3
	VPMULUDQ Y4, Y5, Y5
	VPADDQ Y1, Y3, Y1
	VPADDQ Y1, Y5, Y1
	VPXOR Y6, Y7, Y7
	VPSRLQ $0x20, Y7, Y5
	VPSHUFD $0x4e, Y6, Y6
	VPMULUDQ Y7, Y5, Y5
	VPADDQ Y2, Y6, Y2
	VPADDQ Y2, Y5, Y2
	VPXOR Y8, Y9, Y9
	VPSRLQ $0x20, Y9, Y5
	VPSHUFD $0x4e, Y8, Y8
	VPMULUDQ Y9, Y5, Y5
	VPADDQ Y1, Y8, Y1
	VPADDQ Y1, Y5, Y1
	VPXOR Y10, Y11, Y11
	VPSRLQ $0x20, Y11, Y5
	VPSHUFD $0x4e, Y10, Y10
	VPMULUDQ Y11, Y5, Y5
	VPADDQ Y2, Y10, Y2
	VPADDQ Y2, Y5, Y2

	// Stripes 8-9: input 512..639, secret offsets 64 and 72.
	VMOVDQU 512(CX), Y3
	VMOVDQU 64(DX), Y4
	VMOVDQU 544(CX), Y6
	VMOVDQU 96(DX), Y7
	VMOVDQU 576(CX), Y8
	VMOVDQU 72(DX), Y9
	VMOVDQU 608(CX), Y10
	VMOVDQU 104(DX), Y11
	VPXOR Y3, Y4, Y4
	VPSRLQ $0x20, Y4, Y5
	VPSHUFD $0x4e, Y3, Y3
	VPMULUDQ Y4, Y5, Y5
	VPADDQ Y1, Y3, Y1
	VPADDQ Y1, Y5, Y1
	VPXOR Y6, Y7, Y7
	VPSRLQ $0x20, Y7, Y5
	VPSHUFD $0x4e, Y6, Y6
	VPMULUDQ Y7, Y5, Y5
	VPADDQ Y2, Y6, Y2
	VPADDQ Y2, Y5, Y2
	VPXOR Y8, Y9, Y9
	VPSRLQ $0x20, Y9, Y5
	VPSHUFD $0x4e, Y8, Y8
	VPMULUDQ Y9, Y5, Y5
	VPADDQ Y1, Y8, Y1
	VPADDQ Y1, Y5, Y1
	VPXOR Y10, Y11, Y11
	VPSRLQ $0x20, Y11, Y5
	VPSHUFD $0x4e, Y10, Y10
	VPMULUDQ Y11, Y5, Y5
	VPADDQ Y2, Y10, Y2
	VPADDQ Y2, Y5, Y2

	// Stripes 10-11: input 640..767, secret offsets 80 and 88.
	VMOVDQU 640(CX), Y3
	VMOVDQU 80(DX), Y4
	VMOVDQU 672(CX), Y6
	VMOVDQU 112(DX), Y7
	VMOVDQU 704(CX), Y8
	VMOVDQU 88(DX), Y9
	VMOVDQU 736(CX), Y10
	VMOVDQU 120(DX), Y11
	VPXOR Y3, Y4, Y4
	VPSRLQ $0x20, Y4, Y5
	VPSHUFD $0x4e, Y3, Y3
	VPMULUDQ Y4, Y5, Y5
	VPADDQ Y1, Y3, Y1
	VPADDQ Y1, Y5, Y1
	VPXOR Y6, Y7, Y7
	VPSRLQ $0x20, Y7, Y5
	VPSHUFD $0x4e, Y6, Y6
	VPMULUDQ Y7, Y5, Y5
	VPADDQ Y2, Y6, Y2
	VPADDQ Y2, Y5, Y2
	VPXOR Y8, Y9, Y9
	VPSRLQ $0x20, Y9, Y5
	VPSHUFD $0x4e, Y8, Y8
	VPMULUDQ Y9, Y5, Y5
	VPADDQ Y1, Y8, Y1
	VPADDQ Y1, Y5, Y1
	VPXOR Y10, Y11, Y11
	VPSRLQ $0x20, Y11, Y5
	VPSHUFD $0x4e, Y10, Y10
	VPMULUDQ Y11, Y5, Y5
	VPADDQ Y2, Y10, Y2
	VPADDQ Y2, Y5, Y2

	// Stripes 12-13: input 768..895, secret offsets 96 and 104.
	VMOVDQU 768(CX), Y3
	VMOVDQU 96(DX), Y4
	VMOVDQU 800(CX), Y6
	VMOVDQU 128(DX), Y7
	VMOVDQU 832(CX), Y8
	VMOVDQU 104(DX), Y9
	VMOVDQU 864(CX), Y10
	VMOVDQU 136(DX), Y11
	VPXOR Y3, Y4, Y4
	VPSRLQ $0x20, Y4, Y5
	VPSHUFD $0x4e, Y3, Y3
	VPMULUDQ Y4, Y5, Y5
	VPADDQ Y1, Y3, Y1
	VPADDQ Y1, Y5, Y1
	VPXOR Y6, Y7, Y7
	VPSRLQ $0x20, Y7, Y5
	VPSHUFD $0x4e, Y6, Y6
	VPMULUDQ Y7, Y5, Y5
	VPADDQ Y2, Y6, Y2
	VPADDQ Y2, Y5, Y2
	VPXOR Y8, Y9, Y9
	VPSRLQ $0x20, Y9, Y5
	VPSHUFD $0x4e, Y8, Y8
	VPMULUDQ Y9, Y5, Y5
	VPADDQ Y1, Y8, Y1
	VPADDQ Y1, Y5, Y1
	VPXOR Y10, Y11, Y11
	VPSRLQ $0x20, Y11, Y5
	VPSHUFD $0x4e, Y10, Y10
	VPMULUDQ Y11, Y5, Y5
	VPADDQ Y2, Y10, Y2
	VPADDQ Y2, Y5, Y2

	// Stripes 14-15: input 896..1023, secret offsets 112 and 120.
	VMOVDQU 896(CX), Y3
	VMOVDQU 112(DX), Y4
	VMOVDQU 928(CX), Y6
	VMOVDQU 144(DX), Y7
	VMOVDQU 960(CX), Y8
	VMOVDQU 120(DX), Y9
	VMOVDQU 992(CX), Y10
	VMOVDQU 152(DX), Y11
	VPXOR Y3, Y4, Y4
	VPSRLQ $0x20, Y4, Y5
	VPSHUFD $0x4e, Y3, Y3
	VPMULUDQ Y4, Y5, Y5
	VPADDQ Y1, Y3, Y1
	VPADDQ Y1, Y5, Y1
	VPXOR Y6, Y7, Y7
	VPSRLQ $0x20, Y7, Y5
	VPSHUFD $0x4e, Y6, Y6
	VPMULUDQ Y7, Y5, Y5
	VPADDQ Y2, Y6, Y2
	VPADDQ Y2, Y5, Y2
	VPXOR Y8, Y9, Y9
	VPSRLQ $0x20, Y9, Y5
	VPSHUFD $0x4e, Y8, Y8
	VPMULUDQ Y9, Y5, Y5
	VPADDQ Y1, Y8, Y1
	VPADDQ Y1, Y5, Y1
	VPXOR Y10, Y11, Y11
	VPSRLQ $0x20, Y11, Y5
	VPSHUFD $0x4e, Y10, Y10
	VPMULUDQ Y11, Y5, Y5
	VPADDQ Y2, Y10, Y2
	VPADDQ Y2, Y5, Y2

	// Advance past the 1024-byte block just consumed.
	ADDQ $0x00000400, CX
	SUBQ $0x00000400, SI

	// Scramble acc[0..3]: x = acc ^ (acc >> 47) ^ secret[128..159], then
	// acc = x * prime (mod 2^64), built from two 32x32 multiplies:
	// lo32(x)*prime + ((hi32(x)*prime) << 32).
	VPSRLQ $0x2f, Y1, Y3
	VPXOR Y1, Y3, Y3
	VPXOR 128(DX), Y3, Y3
	VPMULUDQ Y0, Y3, Y4
	VPSRLQ $0x20, Y3, Y3
	VPMULUDQ Y0, Y3, Y3
	VPSLLQ $0x20, Y3, Y3
	VPADDQ Y4, Y3, Y1

	// Scramble acc[4..7] the same way with secret[160..191].
	VPSRLQ $0x2f, Y2, Y3
	VPXOR Y2, Y3, Y3
	VPXOR 160(DX), Y3, Y3
	VPMULUDQ Y0, Y3, Y4
	VPSRLQ $0x20, Y3, Y3
	VPMULUDQ Y0, Y3, Y3
	VPSLLQ $0x20, Y3, Y3
	VPADDQ Y4, Y3, Y2
	JMP accumBlock

accumStripe:
	// One 64-byte stripe per iteration while more than 64 bytes remain.
	// Y0 is free to use as scratch here: accumBlock is never re-entered.
	CMPQ SI, $0x40
	JLE accumLastStripe
	VMOVDQU (CX), Y0
	VMOVDQU (BX), Y3
	VMOVDQU 32(CX), Y5
	VMOVDQU 32(BX), Y6
	VPXOR Y0, Y3, Y3
	VPSRLQ $0x20, Y3, Y4
	VPSHUFD $0x4e, Y0, Y0
	VPMULUDQ Y3, Y4, Y4
	VPADDQ Y1, Y0, Y1
	VPADDQ Y1, Y4, Y1
	VPXOR Y5, Y6, Y6
	VPSRLQ $0x20, Y6, Y4
	VPMULUDQ Y6, Y4, Y4
	VPSHUFD $0x4e, Y5, Y5
	VPADDQ Y2, Y5, Y2
	VPADDQ Y2, Y4, Y2
	ADDQ $0x00000040, CX
	SUBQ $0x00000040, SI
	// The secret window slides by 8 bytes per stripe.
	ADDQ $0x00000008, BX
	JMP accumStripe

accumLastStripe:
	// Nothing left: just write the accumulators back.
	CMPQ SI, $0x00
	JE return

	// Re-point CX at the final 64 bytes of the input (CX + SI - 64); this
	// window may overlap bytes already consumed. It is keyed with the secret
	// at fixed offset 121 (and 121+32 for the upper half).
	SUBQ $0x40, CX
	ADDQ SI, CX
	VMOVDQU (CX), Y0
	VMOVDQU 121(DX), Y3
	VMOVDQU 32(CX), Y5
	VMOVDQU 153(DX), Y6
	VPXOR Y0, Y3, Y3
	VPSRLQ $0x20, Y3, Y4
	VPSHUFD $0x4e, Y0, Y0
	VPMULUDQ Y3, Y4, Y4
	VPADDQ Y1, Y0, Y1
	VPADDQ Y1, Y4, Y1
	VPXOR Y5, Y6, Y6
	VPSRLQ $0x20, Y6, Y4
	VPMULUDQ Y6, Y4, Y4
	VPSHUFD $0x4e, Y5, Y5
	VPADDQ Y2, Y5, Y2
	VPADDQ Y2, Y4, Y2

return:
	VMOVDQU Y1, (AX)
	VMOVDQU Y2, 32(AX)
	// Clear the upper YMM halves so that legacy-SSE code executed after the
	// return does not incur AVX/SSE transition penalties.
	VZEROUPPER
	RET