// Copyright 2021 ByteDance Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Code generated by command: go run gen.go -avx2 -out ./avx2.s. DO NOT EDIT.

// NOTE(review): this file is generated. The two changes relative to the
// generator output (8-byte prime constant, VZEROUPPER before RET) should be
// mirrored in gen.go so that they survive regeneration.

#include "textflag.h"

// 32-bit multiplier used by the scramble step, stored zero-extended to a full
// quadword. VPBROADCASTQ below loads 8 bytes from this symbol, so the symbol
// must be 8 bytes wide; VPMULUDQ only consumes the low 32 bits of each lane.
DATA prime_avx<>+0(SB)/8, $0x000000009e3779b1
GLOBL prime_avx<>(SB), RODATA|NOPTR, $8

// func accumAVX2(acc *[8]uint64, xinput *byte, xsecret *byte, len uint64)
// Requires: AVX, AVX2
//
// Folds len bytes at xinput into the eight 64-bit accumulators at acc, keyed
// by the bytes at xsecret, and writes the accumulators back to acc.
//
// Register roles:
//   AX     acc pointer (loaded on entry, stored at return)
//   CX     input cursor
//   DX     xsecret base (never advanced)
//   BX     xsecret cursor for the per-stripe tail loop (advanced 8 per stripe)
//   SI     bytes remaining
//   Y0     prime broadcast to every quadword (block loop); scratch afterwards
//   Y1     acc[0..3]
//   Y2     acc[4..7]
//   Y3-Y11 scratch
//
// Memory touched, as visible from the offsets below: the block loop reads
// xsecret[0..192); the last-stripe step reads xsecret[121..185) and the
// 64 bytes that end at xinput+len.
// NOTE(review): the length compares use JLE (signed); this is equivalent to an
// unsigned compare only while len < 1<<63 - presumably always true for a Go
// slice length, confirm against callers.
TEXT ·accumAVX2(SB), NOSPLIT, $0-32
	MOVQ         acc+0(FP), AX
	MOVQ         xinput+8(FP), CX
	MOVQ         xsecret+16(FP), DX
	MOVQ         xsecret+16(FP), BX
	MOVQ         len+24(FP), SI
	VMOVDQU      (AX), Y1
	VMOVDQU      32(AX), Y2
	VPBROADCASTQ prime_avx<>+0(SB), Y0

	// Block loop: while more than 1024 bytes remain, consume 16 stripes of
	// 64 bytes (unrolled two stripes per group), then scramble Y1/Y2.
	// Stripe k reads input at 64*k(CX) and secret at 8*k(DX).
	// Per 32-byte half:  key = data ^ secret
	//                    acc += swap64(data) + lo32(key) * hi32(key)
	// where VPSHUFD $0x4e swaps the two quadwords of each 128-bit lane.
accumBlock:
	CMPQ SI, $0x00000400
	JLE  accumStripe

	// Stripes 0 and 1.
	VMOVDQU  (CX), Y3
	VMOVDQU  (DX), Y4
	VMOVDQU  32(CX), Y6
	VMOVDQU  32(DX), Y7
	VMOVDQU  64(CX), Y8
	VMOVDQU  8(DX), Y9
	VMOVDQU  96(CX), Y10
	VMOVDQU  40(DX), Y11
	VPXOR    Y3, Y4, Y4
	VPSRLQ   $0x20, Y4, Y5
	VPSHUFD  $0x4e, Y3, Y3
	VPMULUDQ Y4, Y5, Y5
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y6, Y7, Y7
	VPSRLQ   $0x20, Y7, Y5
	VPSHUFD  $0x4e, Y6, Y6
	VPMULUDQ Y7, Y5, Y5
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y5, Y2
	VPXOR    Y8, Y9, Y9
	VPSRLQ   $0x20, Y9, Y5
	VPSHUFD  $0x4e, Y8, Y8
	VPMULUDQ Y9, Y5, Y5
	VPADDQ   Y1, Y8, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y10, Y11, Y11
	VPSRLQ   $0x20, Y11, Y5
	VPSHUFD  $0x4e, Y10, Y10
	VPMULUDQ Y11, Y5, Y5
	VPADDQ   Y2, Y10, Y2
	VPADDQ   Y2, Y5, Y2

	// Stripes 2 and 3.
	VMOVDQU  128(CX), Y3
	VMOVDQU  16(DX), Y4
	VMOVDQU  160(CX), Y6
	VMOVDQU  48(DX), Y7
	VMOVDQU  192(CX), Y8
	VMOVDQU  24(DX), Y9
	VMOVDQU  224(CX), Y10
	VMOVDQU  56(DX), Y11
	VPXOR    Y3, Y4, Y4
	VPSRLQ   $0x20, Y4, Y5
	VPSHUFD  $0x4e, Y3, Y3
	VPMULUDQ Y4, Y5, Y5
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y6, Y7, Y7
	VPSRLQ   $0x20, Y7, Y5
	VPSHUFD  $0x4e, Y6, Y6
	VPMULUDQ Y7, Y5, Y5
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y5, Y2
	VPXOR    Y8, Y9, Y9
	VPSRLQ   $0x20, Y9, Y5
	VPSHUFD  $0x4e, Y8, Y8
	VPMULUDQ Y9, Y5, Y5
	VPADDQ   Y1, Y8, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y10, Y11, Y11
	VPSRLQ   $0x20, Y11, Y5
	VPSHUFD  $0x4e, Y10, Y10
	VPMULUDQ Y11, Y5, Y5
	VPADDQ   Y2, Y10, Y2
	VPADDQ   Y2, Y5, Y2

	// Stripes 4 and 5.
	VMOVDQU  256(CX), Y3
	VMOVDQU  32(DX), Y4
	VMOVDQU  288(CX), Y6
	VMOVDQU  64(DX), Y7
	VMOVDQU  320(CX), Y8
	VMOVDQU  40(DX), Y9
	VMOVDQU  352(CX), Y10
	VMOVDQU  72(DX), Y11
	VPXOR    Y3, Y4, Y4
	VPSRLQ   $0x20, Y4, Y5
	VPSHUFD  $0x4e, Y3, Y3
	VPMULUDQ Y4, Y5, Y5
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y6, Y7, Y7
	VPSRLQ   $0x20, Y7, Y5
	VPSHUFD  $0x4e, Y6, Y6
	VPMULUDQ Y7, Y5, Y5
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y5, Y2
	VPXOR    Y8, Y9, Y9
	VPSRLQ   $0x20, Y9, Y5
	VPSHUFD  $0x4e, Y8, Y8
	VPMULUDQ Y9, Y5, Y5
	VPADDQ   Y1, Y8, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y10, Y11, Y11
	VPSRLQ   $0x20, Y11, Y5
	VPSHUFD  $0x4e, Y10, Y10
	VPMULUDQ Y11, Y5, Y5
	VPADDQ   Y2, Y10, Y2
	VPADDQ   Y2, Y5, Y2

	// Stripes 6 and 7.
	VMOVDQU  384(CX), Y3
	VMOVDQU  48(DX), Y4
	VMOVDQU  416(CX), Y6
	VMOVDQU  80(DX), Y7
	VMOVDQU  448(CX), Y8
	VMOVDQU  56(DX), Y9
	VMOVDQU  480(CX), Y10
	VMOVDQU  88(DX), Y11
	VPXOR    Y3, Y4, Y4
	VPSRLQ   $0x20, Y4, Y5
	VPSHUFD  $0x4e, Y3, Y3
	VPMULUDQ Y4, Y5, Y5
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y6, Y7, Y7
	VPSRLQ   $0x20, Y7, Y5
	VPSHUFD  $0x4e, Y6, Y6
	VPMULUDQ Y7, Y5, Y5
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y5, Y2
	VPXOR    Y8, Y9, Y9
	VPSRLQ   $0x20, Y9, Y5
	VPSHUFD  $0x4e, Y8, Y8
	VPMULUDQ Y9, Y5, Y5
	VPADDQ   Y1, Y8, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y10, Y11, Y11
	VPSRLQ   $0x20, Y11, Y5
	VPSHUFD  $0x4e, Y10, Y10
	VPMULUDQ Y11, Y5, Y5
	VPADDQ   Y2, Y10, Y2
	VPADDQ   Y2, Y5, Y2

	// Stripes 8 and 9.
	VMOVDQU  512(CX), Y3
	VMOVDQU  64(DX), Y4
	VMOVDQU  544(CX), Y6
	VMOVDQU  96(DX), Y7
	VMOVDQU  576(CX), Y8
	VMOVDQU  72(DX), Y9
	VMOVDQU  608(CX), Y10
	VMOVDQU  104(DX), Y11
	VPXOR    Y3, Y4, Y4
	VPSRLQ   $0x20, Y4, Y5
	VPSHUFD  $0x4e, Y3, Y3
	VPMULUDQ Y4, Y5, Y5
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y6, Y7, Y7
	VPSRLQ   $0x20, Y7, Y5
	VPSHUFD  $0x4e, Y6, Y6
	VPMULUDQ Y7, Y5, Y5
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y5, Y2
	VPXOR    Y8, Y9, Y9
	VPSRLQ   $0x20, Y9, Y5
	VPSHUFD  $0x4e, Y8, Y8
	VPMULUDQ Y9, Y5, Y5
	VPADDQ   Y1, Y8, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y10, Y11, Y11
	VPSRLQ   $0x20, Y11, Y5
	VPSHUFD  $0x4e, Y10, Y10
	VPMULUDQ Y11, Y5, Y5
	VPADDQ   Y2, Y10, Y2
	VPADDQ   Y2, Y5, Y2

	// Stripes 10 and 11.
	VMOVDQU  640(CX), Y3
	VMOVDQU  80(DX), Y4
	VMOVDQU  672(CX), Y6
	VMOVDQU  112(DX), Y7
	VMOVDQU  704(CX), Y8
	VMOVDQU  88(DX), Y9
	VMOVDQU  736(CX), Y10
	VMOVDQU  120(DX), Y11
	VPXOR    Y3, Y4, Y4
	VPSRLQ   $0x20, Y4, Y5
	VPSHUFD  $0x4e, Y3, Y3
	VPMULUDQ Y4, Y5, Y5
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y6, Y7, Y7
	VPSRLQ   $0x20, Y7, Y5
	VPSHUFD  $0x4e, Y6, Y6
	VPMULUDQ Y7, Y5, Y5
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y5, Y2
	VPXOR    Y8, Y9, Y9
	VPSRLQ   $0x20, Y9, Y5
	VPSHUFD  $0x4e, Y8, Y8
	VPMULUDQ Y9, Y5, Y5
	VPADDQ   Y1, Y8, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y10, Y11, Y11
	VPSRLQ   $0x20, Y11, Y5
	VPSHUFD  $0x4e, Y10, Y10
	VPMULUDQ Y11, Y5, Y5
	VPADDQ   Y2, Y10, Y2
	VPADDQ   Y2, Y5, Y2

	// Stripes 12 and 13.
	VMOVDQU  768(CX), Y3
	VMOVDQU  96(DX), Y4
	VMOVDQU  800(CX), Y6
	VMOVDQU  128(DX), Y7
	VMOVDQU  832(CX), Y8
	VMOVDQU  104(DX), Y9
	VMOVDQU  864(CX), Y10
	VMOVDQU  136(DX), Y11
	VPXOR    Y3, Y4, Y4
	VPSRLQ   $0x20, Y4, Y5
	VPSHUFD  $0x4e, Y3, Y3
	VPMULUDQ Y4, Y5, Y5
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y6, Y7, Y7
	VPSRLQ   $0x20, Y7, Y5
	VPSHUFD  $0x4e, Y6, Y6
	VPMULUDQ Y7, Y5, Y5
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y5, Y2
	VPXOR    Y8, Y9, Y9
	VPSRLQ   $0x20, Y9, Y5
	VPSHUFD  $0x4e, Y8, Y8
	VPMULUDQ Y9, Y5, Y5
	VPADDQ   Y1, Y8, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y10, Y11, Y11
	VPSRLQ   $0x20, Y11, Y5
	VPSHUFD  $0x4e, Y10, Y10
	VPMULUDQ Y11, Y5, Y5
	VPADDQ   Y2, Y10, Y2
	VPADDQ   Y2, Y5, Y2

	// Stripes 14 and 15.
	VMOVDQU  896(CX), Y3
	VMOVDQU  112(DX), Y4
	VMOVDQU  928(CX), Y6
	VMOVDQU  144(DX), Y7
	VMOVDQU  960(CX), Y8
	VMOVDQU  120(DX), Y9
	VMOVDQU  992(CX), Y10
	VMOVDQU  152(DX), Y11
	VPXOR    Y3, Y4, Y4
	VPSRLQ   $0x20, Y4, Y5
	VPSHUFD  $0x4e, Y3, Y3
	VPMULUDQ Y4, Y5, Y5
	VPADDQ   Y1, Y3, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y6, Y7, Y7
	VPSRLQ   $0x20, Y7, Y5
	VPSHUFD  $0x4e, Y6, Y6
	VPMULUDQ Y7, Y5, Y5
	VPADDQ   Y2, Y6, Y2
	VPADDQ   Y2, Y5, Y2
	VPXOR    Y8, Y9, Y9
	VPSRLQ   $0x20, Y9, Y5
	VPSHUFD  $0x4e, Y8, Y8
	VPMULUDQ Y9, Y5, Y5
	VPADDQ   Y1, Y8, Y1
	VPADDQ   Y1, Y5, Y1
	VPXOR    Y10, Y11, Y11
	VPSRLQ   $0x20, Y11, Y5
	VPSHUFD  $0x4e, Y10, Y10
	VPMULUDQ Y11, Y5, Y5
	VPADDQ   Y2, Y10, Y2
	VPADDQ   Y2, Y5, Y2

	// One full 1024-byte block consumed.
	ADDQ $0x00000400, CX
	SUBQ $0x00000400, SI

	// Scramble acc[0..3]: t = acc ^ (acc >> 47) ^ secret[128..160);
	// acc = lo32(t)*prime + ((hi32(t)*prime) << 32), i.e. t*prime mod 2^64.
	VPSRLQ   $0x2f, Y1, Y3
	VPXOR    Y1, Y3, Y3
	VPXOR    128(DX), Y3, Y3
	VPMULUDQ Y0, Y3, Y4
	VPSRLQ   $0x20, Y3, Y3
	VPMULUDQ Y0, Y3, Y3
	VPSLLQ   $0x20, Y3, Y3
	VPADDQ   Y4, Y3, Y1

	// Scramble acc[4..7] the same way with secret[160..192).
	VPSRLQ   $0x2f, Y2, Y3
	VPXOR    Y2, Y3, Y3
	VPXOR    160(DX), Y3, Y3
	VPMULUDQ Y0, Y3, Y4
	VPSRLQ   $0x20, Y3, Y3
	VPMULUDQ Y0, Y3, Y3
	VPSLLQ   $0x20, Y3, Y3
	VPADDQ   Y4, Y3, Y2
	JMP      accumBlock

	// Tail loop: while more than 64 bytes remain, consume one stripe at a
	// time. BX walks the secret 8 bytes per stripe starting from its base.
	// Y0 is reused as scratch from here on; the prime is no longer needed.
accumStripe:
	CMPQ     SI, $0x40
	JLE      accumLastStripe
	VMOVDQU  (CX), Y0
	VMOVDQU  (BX), Y3
	VMOVDQU  32(CX), Y5
	VMOVDQU  32(BX), Y6
	VPXOR    Y0, Y3, Y3
	VPSRLQ   $0x20, Y3, Y4
	VPSHUFD  $0x4e, Y0, Y0
	VPMULUDQ Y3, Y4, Y4
	VPADDQ   Y1, Y0, Y1
	VPADDQ   Y1, Y4, Y1
	VPXOR    Y5, Y6, Y6
	VPSRLQ   $0x20, Y6, Y4
	VPMULUDQ Y6, Y4, Y4
	VPSHUFD  $0x4e, Y5, Y5
	VPADDQ   Y2, Y5, Y2
	VPADDQ   Y2, Y4, Y2
	ADDQ     $0x00000040, CX
	SUBQ     $0x00000040, SI
	ADDQ     $0x00000008, BX
	JMP      accumStripe

	// Final stripe: if any bytes remain (1..64), rewind the cursor so the
	// 64-byte load ends exactly at the end of the input, and key it with
	// the secret at fixed offset 121.
accumLastStripe:
	CMPQ     SI, $0x00
	JE       return
	SUBQ     $0x40, CX
	ADDQ     SI, CX
	VMOVDQU  (CX), Y0
	VMOVDQU  121(DX), Y3
	VMOVDQU  32(CX), Y5
	VMOVDQU  153(DX), Y6
	VPXOR    Y0, Y3, Y3
	VPSRLQ   $0x20, Y3, Y4
	VPSHUFD  $0x4e, Y0, Y0
	VPMULUDQ Y3, Y4, Y4
	VPADDQ   Y1, Y0, Y1
	VPADDQ   Y1, Y4, Y1
	VPXOR    Y5, Y6, Y6
	VPSRLQ   $0x20, Y6, Y4
	VPMULUDQ Y6, Y4, Y4
	VPSHUFD  $0x4e, Y5, Y5
	VPADDQ   Y2, Y5, Y2
	VPADDQ   Y2, Y4, Y2

return:
	VMOVDQU Y1, (AX)
	VMOVDQU Y2, 32(AX)

	// Clear the upper YMM halves so non-VEX SSE code executed after this
	// function does not incur AVX/SSE transition penalties.
	VZEROUPPER
	RET