github.com/cloudflare/circl@v1.5.0/pke/kyber/internal/common/amd64.s (about) 1 // Code generated by command: go run src.go -out ../amd64.s -stubs ../stubs_amd64.go -pkg common. DO NOT EDIT. 2 3 //go:build amd64 && !purego 4 5 #include "textflag.h" 6 7 // func addAVX2(p *[256]int16, a *[256]int16, b *[256]int16) 8 // Requires: AVX, AVX2 9 TEXT ·addAVX2(SB), NOSPLIT, $0-24 10 MOVQ p+0(FP), AX 11 MOVQ a+8(FP), CX 12 MOVQ b+16(FP), DX 13 VMOVDQU (CX), Y0 14 VMOVDQU 32(CX), Y2 15 VMOVDQU 64(CX), Y4 16 VMOVDQU 96(CX), Y6 17 VMOVDQU 128(CX), Y8 18 VMOVDQU 160(CX), Y10 19 VMOVDQU 192(CX), Y12 20 VMOVDQU 224(CX), Y14 21 VMOVDQU (DX), Y1 22 VMOVDQU 32(DX), Y3 23 VMOVDQU 64(DX), Y5 24 VMOVDQU 96(DX), Y7 25 VMOVDQU 128(DX), Y9 26 VMOVDQU 160(DX), Y11 27 VMOVDQU 192(DX), Y13 28 VMOVDQU 224(DX), Y15 29 VPADDW Y0, Y1, Y1 30 VPADDW Y2, Y3, Y3 31 VPADDW Y4, Y5, Y5 32 VPADDW Y6, Y7, Y7 33 VPADDW Y8, Y9, Y9 34 VPADDW Y10, Y11, Y11 35 VPADDW Y12, Y13, Y13 36 VPADDW Y14, Y15, Y15 37 VMOVDQU Y1, (AX) 38 VMOVDQU Y3, 32(AX) 39 VMOVDQU Y5, 64(AX) 40 VMOVDQU Y7, 96(AX) 41 VMOVDQU Y9, 128(AX) 42 VMOVDQU Y11, 160(AX) 43 VMOVDQU Y13, 192(AX) 44 VMOVDQU Y15, 224(AX) 45 VMOVDQU 256(CX), Y0 46 VMOVDQU 288(CX), Y2 47 VMOVDQU 320(CX), Y4 48 VMOVDQU 352(CX), Y6 49 VMOVDQU 384(CX), Y8 50 VMOVDQU 416(CX), Y10 51 VMOVDQU 448(CX), Y12 52 VMOVDQU 480(CX), Y14 53 VMOVDQU 256(DX), Y1 54 VMOVDQU 288(DX), Y3 55 VMOVDQU 320(DX), Y5 56 VMOVDQU 352(DX), Y7 57 VMOVDQU 384(DX), Y9 58 VMOVDQU 416(DX), Y11 59 VMOVDQU 448(DX), Y13 60 VMOVDQU 480(DX), Y15 61 VPADDW Y0, Y1, Y1 62 VPADDW Y2, Y3, Y3 63 VPADDW Y4, Y5, Y5 64 VPADDW Y6, Y7, Y7 65 VPADDW Y8, Y9, Y9 66 VPADDW Y10, Y11, Y11 67 VPADDW Y12, Y13, Y13 68 VPADDW Y14, Y15, Y15 69 VMOVDQU Y1, 256(AX) 70 VMOVDQU Y3, 288(AX) 71 VMOVDQU Y5, 320(AX) 72 VMOVDQU Y7, 352(AX) 73 VMOVDQU Y9, 384(AX) 74 VMOVDQU Y11, 416(AX) 75 VMOVDQU Y13, 448(AX) 76 VMOVDQU Y15, 480(AX) 77 RET 78 79 // func subAVX2(p *[256]int16, a *[256]int16, b *[256]int16) 80 // Requires: AVX, AVX2 81 TEXT ·subAVX2(SB), NOSPLIT, $0-24 82 MOVQ p+0(FP), AX 83 MOVQ a+8(FP), CX 84 MOVQ b+16(FP), DX 85 VMOVDQU (CX), Y0 86 VMOVDQU 32(CX), Y2 87 VMOVDQU 64(CX), Y4 88 VMOVDQU 96(CX), Y6 89 VMOVDQU 128(CX), Y8 90 VMOVDQU 160(CX), Y10 91 VMOVDQU 192(CX), Y12 92 VMOVDQU 224(CX), Y14 93 VMOVDQU (DX), Y1 94 VMOVDQU 32(DX), Y3 95 VMOVDQU 64(DX), Y5 96 VMOVDQU 96(DX), Y7 97 VMOVDQU 128(DX), Y9 98 VMOVDQU 160(DX), Y11 99 VMOVDQU 192(DX), Y13 100 VMOVDQU 224(DX), Y15 101 VPSUBW Y1, Y0, Y1 102 VPSUBW Y3, Y2, Y3 103 VPSUBW Y5, Y4, Y5 104 VPSUBW Y7, Y6, Y7 105 VPSUBW Y9, Y8, Y9 106 VPSUBW Y11, Y10, Y11 107 VPSUBW Y13, Y12, Y13 108 VPSUBW Y15, Y14, Y15 109 VMOVDQU Y1, (AX) 110 VMOVDQU Y3, 32(AX) 111 VMOVDQU Y5, 64(AX) 112 VMOVDQU Y7, 96(AX) 113 VMOVDQU Y9, 128(AX) 114 VMOVDQU Y11, 160(AX) 115 VMOVDQU Y13, 192(AX) 116 VMOVDQU Y15, 224(AX) 117 VMOVDQU 256(CX), Y0 118 VMOVDQU 288(CX), Y2 119 VMOVDQU 320(CX), Y4 120 VMOVDQU 352(CX), Y6 121 VMOVDQU 384(CX), Y8 122 VMOVDQU 416(CX), Y10 123 VMOVDQU 448(CX), Y12 124 VMOVDQU 480(CX), Y14 125 VMOVDQU 256(DX), Y1 126 VMOVDQU 288(DX), Y3 127 VMOVDQU 320(DX), Y5 128 VMOVDQU 352(DX), Y7 129 VMOVDQU 384(DX), Y9 130 VMOVDQU 416(DX), Y11 131 VMOVDQU 448(DX), Y13 132 VMOVDQU 480(DX), Y15 133 VPSUBW Y1, Y0, Y1 134 VPSUBW Y3, Y2, Y3 135 VPSUBW Y5, Y4, Y5 136 VPSUBW Y7, Y6, Y7 137 VPSUBW Y9, Y8, Y9 138 VPSUBW Y11, Y10, Y11 139 VPSUBW Y13, Y12, Y13 140 VPSUBW Y15, Y14, Y15 141 VMOVDQU Y1, 256(AX) 142 VMOVDQU Y3, 288(AX) 143 VMOVDQU Y5, 320(AX) 144 VMOVDQU Y7, 352(AX) 145 VMOVDQU Y9, 384(AX) 146 VMOVDQU Y11, 416(AX) 147 VMOVDQU Y13, 448(AX) 148 VMOVDQU Y15, 480(AX) 149 RET 150 151 // func nttAVX2(p *[256]int16) 152 // Requires: AVX, AVX2 153 TEXT ·nttAVX2(SB), NOSPLIT, $0-8 154 MOVQ p+0(FP), AX 155 LEAQ ·ZetasAVX2+0(SB), CX 156 MOVL $0x00000d01, DX 157 VMOVD DX, X0 158 VPBROADCASTW X0, Y15 159 VPBROADCASTW (CX), Y0 160 VPBROADCASTW 2(CX), Y1 161 VMOVDQU (AX), Y7 162 VMOVDQU 32(AX), Y8 163 VMOVDQU 64(AX), Y9 164 VMOVDQU 96(AX), Y10 165 VMOVDQU 256(AX), Y11 166 VMOVDQU 288(AX), Y12 167 VMOVDQU 320(AX), Y13 168 VMOVDQU 352(AX), Y14 169 VPMULLW Y11, Y0, Y2 170 VPMULLW Y12, Y0, Y3 171 VPMULLW Y13, Y0, Y4 172 VPMULLW Y14, Y0, Y5 173 VPMULHW Y11, Y1, Y11 174 VPMULHW Y12, Y1, Y12 175 VPMULHW Y13, Y1, Y13 176 VPMULHW Y14, Y1, Y14 177 VPMULHW Y2, Y15, Y2 178 VPMULHW Y3, Y15, Y3 179 VPMULHW Y4, Y15, Y4 180 VPMULHW Y5, Y15, Y5 181 VPSUBW Y2, Y11, Y2 182 VPSUBW Y3, Y12, Y3 183 VPSUBW Y4, Y13, Y4 184 VPSUBW Y5, Y14, Y5 185 VPSUBW Y2, Y7, Y11 186 VPSUBW Y3, Y8, Y12 187 VPSUBW Y4, Y9, Y13 188 VPSUBW Y5, Y10, Y14 189 VPADDW Y2, Y7, Y7 190 VPADDW Y3, Y8, Y8 191 VPADDW Y4, Y9, Y9 192 VPADDW Y5, Y10, Y10 193 VMOVDQU Y7, (AX) 194 VMOVDQU Y8, 32(AX) 195 VMOVDQU Y9, 64(AX) 196 VMOVDQU Y10, 96(AX) 197 VMOVDQU Y11, 256(AX) 198 VMOVDQU Y12, 288(AX) 199 VMOVDQU Y13, 320(AX) 200 VMOVDQU Y14, 352(AX) 201 VMOVDQU 128(AX), Y7 202 VMOVDQU 160(AX), Y8 203 VMOVDQU 192(AX), Y9 204 VMOVDQU 224(AX), Y10 205 VMOVDQU 384(AX), Y11 206 VMOVDQU 416(AX), Y12 207 VMOVDQU 448(AX), Y13 208 VMOVDQU 480(AX), Y14 209 VPMULLW Y11, Y0, Y2 210 VPMULLW Y12, Y0, Y3 211 VPMULLW Y13, Y0, Y4 212 VPMULLW Y14, Y0, Y5 213 VPMULHW Y11, Y1, Y11 214 VPMULHW Y12, Y1, Y12 215 VPMULHW Y13, Y1, Y13 216 VPMULHW Y14, Y1, Y14 217 VPMULHW Y2, Y15, Y2 218 VPMULHW Y3, Y15, Y3 219 VPMULHW Y4, Y15, Y4 220 VPMULHW Y5, Y15, Y5 221 VPSUBW Y2, Y11, Y2 222 VPSUBW Y3, Y12, Y3 223 VPSUBW Y4, Y13, Y4 224 VPSUBW Y5, Y14, Y5 225 VPSUBW Y2, Y7, Y11 226 VPSUBW Y3, Y8, Y12 227 VPSUBW Y4, Y9, Y13 228 VPSUBW Y5, Y10, Y14 229 VPADDW Y2, Y7, Y7 230 VPADDW Y3, Y8, Y8 231 VPADDW Y4, Y9, Y9 232 VPADDW Y5, Y10, Y10 233 VMOVDQU Y7, 128(AX) 234 VMOVDQU Y8, 160(AX) 235 VMOVDQU Y9, 192(AX) 236 VMOVDQU Y10, 224(AX) 237 VMOVDQU Y11, 384(AX) 238 VMOVDQU Y12, 416(AX) 239 VMOVDQU Y13, 448(AX) 240 VMOVDQU Y14, 480(AX) 241 VPBROADCASTW 4(CX), Y0 242 VPBROADCASTW 6(CX), Y1 243 VMOVDQU (AX), Y7 244 VMOVDQU 32(AX), Y8 245 VMOVDQU 64(AX), Y9 246 VMOVDQU 96(AX), Y10 247 VMOVDQU 128(AX), Y11 248 VMOVDQU 160(AX), Y12 249 VMOVDQU 192(AX), Y13 250 VMOVDQU 224(AX), Y14 251 VPMULLW Y11, Y0, Y2 252 VPMULLW Y12, Y0, Y3 253 VPMULLW Y13, Y0, Y4 254 VPMULLW Y14, Y0, Y5 255 VPMULHW Y11, Y1, Y11 256 VPMULHW Y12, Y1, Y12 257 VPMULHW Y13, Y1, Y13 258 VPMULHW Y14, Y1, Y14 259 VPMULHW Y2, Y15, Y2 260 VPMULHW Y3, Y15, Y3 261 VPMULHW Y4, Y15, Y4 262 VPMULHW Y5, Y15, Y5 263 VPSUBW Y2, Y11, Y2 264 VPSUBW Y3, Y12, Y3 265 VPSUBW Y4, Y13, Y4 266 VPSUBW Y5, Y14, Y5 267 VPSUBW Y2, Y7, Y11 268 VPSUBW Y3, Y8, Y12 269 VPSUBW Y4, Y9, Y13 270 VPSUBW Y5, Y10, Y14 271 VPADDW Y2, Y7, Y7 272 VPADDW Y3, Y8, Y8 273 VPADDW Y4, Y9, Y9 274 VPADDW Y5, Y10, Y10 275 VPBROADCASTW 12(CX), Y0 276 VPBROADCASTW 14(CX), Y1 277 VPBROADCASTW 16(CX), Y2 278 VPBROADCASTW 18(CX), Y3 279 VPMULLW Y9, Y0, Y4 280 VPMULLW Y10, Y0, Y5 281 VPMULLW Y13, Y2, Y6 282 VPMULLW Y14, Y2, Y0 283 VPMULHW Y9, Y1, Y9 284 VPMULHW Y10, Y1, Y10 285 VPMULHW Y13, Y3, Y13 286 VPMULHW Y14, Y3, Y14 287 VPMULHW Y4, Y15, Y4 288 VPMULHW Y5, Y15, Y5 289 VPMULHW Y6, Y15, Y6 290 VPMULHW Y0, Y15, Y0 291 VPSUBW Y4, Y9, Y4 292 VPSUBW Y5, Y10, Y5 293 VPSUBW Y6, Y13, Y6 294 VPSUBW Y0, Y14, Y0 295 VPSUBW Y4, Y7, Y9 296 VPSUBW Y5, Y8, Y10 297 VPSUBW Y6, Y11, Y13 298 VPSUBW Y0, Y12, Y14 299 VPADDW Y4, Y7, Y7 300 VPADDW Y5, Y8, Y8 301 VPADDW Y6, Y11, Y11 302 VPADDW Y0, Y12, Y12 303 VMOVDQU 32(CX), Y0 304 VMOVDQU 64(CX), Y1 305 VMOVDQU 96(CX), Y2 306 VMOVDQU 128(CX), Y3 307 VPERM2I128 $0x20, Y9, Y7, Y4 308 VPERM2I128 $0x31, Y9, Y7, Y9 309 VMOVDQA Y4, Y7 310 VPERM2I128 $0x20, Y10, Y8, Y4 311 VPERM2I128 $0x31, Y10, Y8, Y10 312 VMOVDQA Y4, Y8 313 VPERM2I128 $0x20, Y13, Y11, Y4 314 VPERM2I128 $0x31, Y13, Y11, Y13 315 VMOVDQA Y4, Y11 316 VPERM2I128 $0x20, Y14, Y12, Y4 317 VPERM2I128 $0x31, Y14, Y12, Y14 318 VMOVDQA Y4, Y12 319 VPMULLW Y8, Y0, Y4 320 VPMULLW Y10, Y0, Y5 321 VPMULLW Y12, Y2, Y6 322 VPMULLW Y14, Y2, Y0 323 VPMULHW Y8, Y1, Y8 324 VPMULHW Y10, Y1, Y10 325 VPMULHW Y12, Y3, Y12 326 VPMULHW Y14, Y3, Y14 327 VPMULHW Y4, Y15, Y4 328 VPMULHW Y5, Y15, Y5 329 VPMULHW Y6, Y15, Y6 330 VPMULHW Y0, Y15, Y0 331 VPSUBW Y4, Y8, Y4 332 VPSUBW Y5, Y10, Y5 333 VPSUBW Y6, Y12, Y6 334 VPSUBW Y0, Y14, Y0 335 VPSUBW Y4, Y7, Y8 336 VPSUBW Y5, Y9, Y10 337 VPSUBW Y6, Y11, Y12 338 VPSUBW Y0, Y13, Y14 339 VPADDW Y4, Y7, Y7 340 VPADDW Y5, Y9, Y9 341 VPADDW Y6, Y11, Y11 342 VPADDW Y0, Y13, Y13 343 VMOVDQU 288(CX), Y0 344 VMOVDQU 320(CX), Y1 345 VMOVDQU 352(CX), Y2 346 VMOVDQU 384(CX), Y3 347 VPUNPCKLQDQ Y8, Y7, Y4 348 VPUNPCKHQDQ Y8, Y7, Y8 349 VMOVDQA Y4, Y7 350 VPUNPCKLQDQ Y10, Y9, Y4 351 VPUNPCKHQDQ Y10, Y9, Y10 352 VMOVDQA Y4, Y9 353 VPUNPCKLQDQ Y12, Y11, Y4 354 VPUNPCKHQDQ Y12, Y11, Y12 355 VMOVDQA Y4, Y11 356 VPUNPCKLQDQ Y14, Y13, Y4 357 VPUNPCKHQDQ Y14, Y13, Y14 358 VMOVDQA Y4, Y13 359 VPMULLW Y9, Y0, Y4 360 VPMULLW Y10, Y0, Y5 361 VPMULLW Y13, Y2, Y6 362 VPMULLW Y14, Y2, Y0 363 VPMULHW Y9, Y1, Y9 364 VPMULHW Y10, Y1, Y10 365 VPMULHW Y13, Y3, Y13 366 VPMULHW Y14, Y3, Y14 367 VPMULHW Y4, Y15, Y4 368 VPMULHW Y5, Y15, Y5 369 VPMULHW Y6, Y15, Y6 370 VPMULHW Y0, Y15, Y0 371 VPSUBW Y4, Y9, Y4 372 VPSUBW Y5, Y10, Y5 373 VPSUBW Y6, Y13, Y6 374 VPSUBW Y0, Y14, Y0 375 VPSUBW Y4, Y7, Y9 376 VPSUBW Y5, Y8, Y10 377 VPSUBW Y6, Y11, Y13 378 VPSUBW Y0, Y12, Y14 379 VPADDW Y4, Y7, Y7 380 VPADDW Y5, Y8, Y8 381 VPADDW Y6, Y11, Y11 382 VPADDW Y0, Y12, Y12 383 VMOVDQU 544(CX), Y0 384 VMOVDQU 576(CX), Y1 385 VMOVDQU 608(CX), Y2 386 VMOVDQU 640(CX), Y3 387 VMOVSLDUP Y9, Y4 388 VPBLENDD $0xaa, Y4, Y7, Y4 389 VPSRLQ $0x20, Y7, Y7 390 VPBLENDD $0xaa, Y9, Y7, Y9 391 VMOVDQA Y4, Y7 392 VMOVSLDUP Y10, Y4 393 VPBLENDD $0xaa, Y4, Y8, Y4 394 VPSRLQ $0x20, Y8, Y8 395 VPBLENDD $0xaa, Y10, Y8, Y10 396 VMOVDQA Y4, Y8 397 VMOVSLDUP Y13, Y4 398 VPBLENDD $0xaa, Y4, Y11, Y4 399 VPSRLQ $0x20, Y11, Y11 400 VPBLENDD $0xaa, Y13, Y11, Y13 401 VMOVDQA Y4, Y11 402 VMOVSLDUP Y14, Y4 403 VPBLENDD $0xaa, Y4, Y12, Y4 404 VPSRLQ $0x20, Y12, Y12 405 VPBLENDD $0xaa, Y14, Y12, Y14 406 VMOVDQA Y4, Y12 407 VPMULLW Y8, Y0, Y4 408 VPMULLW Y10, Y0, Y5 409 VPMULLW Y12, Y2, Y6 410 VPMULLW Y14, Y2, Y0 411 VPMULHW Y8, Y1, Y8 412 VPMULHW Y10, Y1, Y10 413 VPMULHW Y12, Y3, Y12 414 VPMULHW Y14, Y3, Y14 415 VPMULHW Y4, Y15, Y4 416 VPMULHW Y5, Y15, Y5 417 VPMULHW Y6, Y15, Y6 418 VPMULHW Y0, Y15, Y0 419 VPSUBW Y4, Y8, Y4 420 VPSUBW Y5, Y10, Y5 421 VPSUBW Y6, Y12, Y6 422 VPSUBW Y0, Y14, Y0 423 VPSUBW Y4, Y7, Y8 424 VPSUBW Y5, Y9, Y10 425 VPSUBW Y6, Y11, Y12 426 VPSUBW Y0, Y13, Y14 427 VPADDW Y4, Y7, Y7 428 VPADDW Y5, Y9, Y9 429 VPADDW Y6, Y11, Y11 430 VPADDW Y0, Y13, Y13 431 VMOVDQU 800(CX), Y0 432 VMOVDQU 832(CX), Y1 433 VMOVDQU 864(CX), Y2 434 VMOVDQU 896(CX), Y3 435 VPSLLD $0x10, Y8, Y4 436 VPBLENDW $0xaa, Y4, Y7, Y4 437 VPSRLD $0x10, Y7, Y7 438 VPBLENDW $0xaa, Y8, Y7, Y8 439 VMOVDQA Y4, Y7 440 VPSLLD $0x10, Y10, Y4 441 VPBLENDW $0xaa, Y4, Y9, Y4 442 VPSRLD $0x10, Y9, Y9 443 VPBLENDW $0xaa, Y10, Y9, Y10 444 VMOVDQA Y4, Y9 445 VPSLLD $0x10, Y12, Y4 446 VPBLENDW $0xaa, Y4, Y11, Y4 447 VPSRLD $0x10, Y11, Y11 448 VPBLENDW $0xaa, Y12, Y11, Y12 449 VMOVDQA Y4, Y11 450 VPSLLD $0x10, Y14, Y4 451 VPBLENDW $0xaa, Y4, Y13, Y4 452 VPSRLD $0x10, Y13, Y13 453 VPBLENDW $0xaa, Y14, Y13, Y14 454 VMOVDQA Y4, Y13 455 VPMULLW Y9, Y0, Y4 456 VPMULLW Y10, Y0, Y5 457 VPMULLW Y13, Y2, Y6 458 VPMULLW Y14, Y2, Y0 459 VPMULHW Y9, Y1, Y9 460 VPMULHW Y10, Y1, Y10 461 VPMULHW Y13, Y3, Y13 462 VPMULHW Y14, Y3, Y14 463 VPMULHW Y4, Y15, Y4 464 VPMULHW Y5, Y15, Y5 465 VPMULHW Y6, Y15, Y6 466 VPMULHW Y0, Y15, Y0 467 VPSUBW Y4, Y9, Y4 468 VPSUBW Y5, Y10, Y5 469 VPSUBW Y6, Y13, Y6 470 VPSUBW Y0, Y14, Y0 471 VPSUBW Y4, Y7, Y9 472 VPSUBW Y5, Y8, Y10 473 VPSUBW Y6, Y11, Y13 474 VPSUBW Y0, Y12, Y14 475 VPADDW Y4, Y7, Y7 476 VPADDW Y5, Y8, Y8 477 VPADDW Y6, Y11, Y11 478 VPADDW Y0, Y12, Y12 479 VMOVDQU Y7, (AX) 480 VMOVDQU Y8, 32(AX) 481 VMOVDQU Y9, 64(AX) 482 VMOVDQU Y10, 96(AX) 483 VMOVDQU Y11, 128(AX) 484 VMOVDQU Y12, 160(AX) 485 VMOVDQU Y13, 192(AX) 486 VMOVDQU Y14, 224(AX) 487 VPBROADCASTW 8(CX), Y0 488 VPBROADCASTW 10(CX), Y1 489 VMOVDQU 256(AX), Y7 490 VMOVDQU 288(AX), Y8 491 VMOVDQU 320(AX), Y9 492 VMOVDQU 352(AX), Y10 493 VMOVDQU 384(AX), Y11 494 VMOVDQU 416(AX), Y12 495 VMOVDQU 448(AX), Y13 496 VMOVDQU 480(AX), Y14 497 VPMULLW Y11, Y0, Y2 498 VPMULLW Y12, Y0, Y3 499 VPMULLW Y13, Y0, Y4 500 VPMULLW Y14, Y0, Y5 501 VPMULHW Y11, Y1, Y11 502 VPMULHW Y12, Y1, Y12 503 VPMULHW Y13, Y1, Y13 504 VPMULHW Y14, Y1, Y14 505 VPMULHW Y2, Y15, Y2 506 VPMULHW Y3, Y15, Y3 507 VPMULHW Y4, Y15, Y4 508 VPMULHW Y5, Y15, Y5 509 VPSUBW Y2, Y11, Y2 510 VPSUBW Y3, Y12, Y3 511 VPSUBW Y4, Y13, Y4 512 VPSUBW Y5, Y14, Y5 513 VPSUBW Y2, Y7, Y11 514 VPSUBW Y3, Y8, Y12 515 VPSUBW Y4, Y9, Y13 516 VPSUBW Y5, Y10, Y14 517 VPADDW Y2, Y7, Y7 518 VPADDW Y3, Y8, Y8 519 VPADDW Y4, Y9, Y9 520 VPADDW Y5, Y10, Y10 521 VPBROADCASTW 20(CX), Y0 522 VPBROADCASTW 22(CX), Y1 523 VPBROADCASTW 24(CX), Y2 524 VPBROADCASTW 26(CX), Y3 525 VPMULLW Y9, Y0, Y4 526 VPMULLW Y10, Y0, Y5 527 VPMULLW Y13, Y2, Y6 528 VPMULLW Y14, Y2, Y0 529 VPMULHW Y9, Y1, Y9 530 VPMULHW Y10, Y1, Y10 531 VPMULHW Y13, Y3, Y13 532 VPMULHW Y14, Y3, Y14 533 VPMULHW Y4, Y15, Y4 534 VPMULHW Y5, Y15, Y5 535 VPMULHW Y6, Y15, Y6 536 VPMULHW Y0, Y15, Y0 537 VPSUBW Y4, Y9, Y4 538 VPSUBW Y5, Y10, Y5 539 VPSUBW Y6, Y13, Y6 540 VPSUBW Y0, Y14, Y0 541 VPSUBW Y4, Y7, Y9 542 VPSUBW Y5, Y8, Y10 543 VPSUBW Y6, Y11, Y13 544 VPSUBW Y0, Y12, Y14 545 VPADDW Y4, Y7, Y7 546 VPADDW Y5, Y8, Y8 547 VPADDW Y6, Y11, Y11 548 VPADDW Y0, Y12, Y12 549 VMOVDQU 160(CX), Y0 550 VMOVDQU 192(CX), Y1 551 VMOVDQU 224(CX), Y2 552 VMOVDQU 256(CX), Y3 553 VPERM2I128 $0x20, Y9, Y7, Y4 554 VPERM2I128 $0x31, Y9, Y7, Y9 555 VMOVDQA Y4, Y7 556 VPERM2I128 $0x20, Y10, Y8, Y4 557 VPERM2I128 $0x31, Y10, Y8, Y10 558 VMOVDQA Y4, Y8 559 VPERM2I128 $0x20, Y13, Y11, Y4 560 VPERM2I128 $0x31, Y13, Y11, Y13 561 VMOVDQA Y4, Y11 562 VPERM2I128 $0x20, Y14, Y12, Y4 563 VPERM2I128 $0x31, Y14, Y12, Y14 564 VMOVDQA Y4, Y12 565 VPMULLW Y8, Y0, Y4 566 VPMULLW Y10, Y0, Y5 567 VPMULLW Y12, Y2, Y6 568 VPMULLW Y14, Y2, Y0 569 VPMULHW Y8, Y1, Y8 570 VPMULHW Y10, Y1, Y10 571 VPMULHW Y12, Y3, Y12 572 VPMULHW Y14, Y3, Y14 573 VPMULHW Y4, Y15, Y4 574 VPMULHW Y5, Y15, Y5 575 VPMULHW Y6, Y15, Y6 576 VPMULHW Y0, Y15, Y0 577 VPSUBW Y4, Y8, Y4 578 VPSUBW Y5, Y10, Y5 579 VPSUBW Y6, Y12, Y6 580 VPSUBW Y0, Y14, Y0 581 VPSUBW Y4, Y7, Y8 582 VPSUBW Y5, Y9, Y10 583 VPSUBW Y6, Y11, Y12 584 VPSUBW Y0, Y13, Y14 585 VPADDW Y4, Y7, Y7 586 VPADDW Y5, Y9, Y9 587 VPADDW Y6, Y11, Y11 588 VPADDW Y0, Y13, Y13 589 VMOVDQU 416(CX), Y0 590 VMOVDQU 448(CX), Y1 591 VMOVDQU 480(CX), Y2 592 VMOVDQU 512(CX), Y3 593 VPUNPCKLQDQ Y8, Y7, Y4 594 VPUNPCKHQDQ Y8, Y7, Y8 595 VMOVDQA Y4, Y7 596 VPUNPCKLQDQ Y10, Y9, Y4 597 VPUNPCKHQDQ Y10, Y9, Y10 598 VMOVDQA Y4, Y9 599 VPUNPCKLQDQ Y12, Y11, Y4 600 VPUNPCKHQDQ Y12, Y11, Y12 601 VMOVDQA Y4, Y11 602 VPUNPCKLQDQ Y14, Y13, Y4 603 VPUNPCKHQDQ Y14, Y13, Y14 604 VMOVDQA Y4, Y13 605 VPMULLW Y9, Y0, Y4 606 VPMULLW Y10, Y0, Y5 607 VPMULLW Y13, Y2, Y6 608 VPMULLW Y14, Y2, Y0 609 VPMULHW Y9, Y1, Y9 610 VPMULHW Y10, Y1, Y10 611 VPMULHW Y13, Y3, Y13 612 VPMULHW Y14, Y3, Y14 613 VPMULHW Y4, Y15, Y4 614 VPMULHW Y5, Y15, Y5 615 VPMULHW Y6, Y15, Y6 616 VPMULHW Y0, Y15, Y0 617 VPSUBW Y4, Y9, Y4 618 VPSUBW Y5, Y10, Y5 619 VPSUBW Y6, Y13, Y6 620 VPSUBW Y0, Y14, Y0 621 VPSUBW Y4, Y7, Y9 622 VPSUBW Y5, Y8, Y10 623 VPSUBW Y6, Y11, Y13 624 VPSUBW Y0, Y12, Y14 625 VPADDW Y4, Y7, Y7 626 VPADDW Y5, Y8, Y8 627 VPADDW Y6, Y11, Y11 628 VPADDW Y0, Y12, Y12 629 VMOVDQU 672(CX), Y0 630 VMOVDQU 704(CX), Y1 631 VMOVDQU 736(CX), Y2 632 VMOVDQU 768(CX), Y3 633 VMOVSLDUP Y9, Y4 634 VPBLENDD $0xaa, Y4, Y7, Y4 635 VPSRLQ $0x20, Y7, Y7 636 VPBLENDD $0xaa, Y9, Y7, Y9 637 VMOVDQA Y4, Y7 638 VMOVSLDUP Y10, Y4 639 VPBLENDD $0xaa, Y4, Y8, Y4 640 VPSRLQ $0x20, Y8, Y8 641 VPBLENDD $0xaa, Y10, Y8, Y10 642 VMOVDQA Y4, Y8 643 VMOVSLDUP Y13, Y4 644 VPBLENDD $0xaa, Y4, Y11, Y4 645 VPSRLQ $0x20, Y11, Y11 646 VPBLENDD $0xaa, Y13, Y11, Y13 647 VMOVDQA Y4, Y11 648 VMOVSLDUP Y14, Y4 649 VPBLENDD $0xaa, Y4, Y12, Y4 650 VPSRLQ $0x20, Y12, Y12 651 VPBLENDD $0xaa, Y14, Y12, Y14 652 VMOVDQA Y4, Y12 653 VPMULLW Y8, Y0, Y4 654 VPMULLW Y10, Y0, Y5 655 VPMULLW Y12, Y2, Y6 656 VPMULLW Y14, Y2, Y0 657 VPMULHW Y8, Y1, Y8 658 VPMULHW Y10, Y1, Y10 659 VPMULHW Y12, Y3, Y12 660 VPMULHW Y14, Y3, Y14 661 VPMULHW Y4, Y15, Y4 662 VPMULHW Y5, Y15, Y5 663 VPMULHW Y6, Y15, Y6 664 VPMULHW Y0, Y15, Y0 665 VPSUBW Y4, Y8, Y4 666 VPSUBW Y5, Y10, Y5 667 VPSUBW Y6, Y12, Y6 668 VPSUBW Y0, Y14, Y0 669 VPSUBW Y4, Y7, Y8 670 VPSUBW Y5, Y9, Y10 671 VPSUBW Y6, Y11, Y12 672 VPSUBW Y0, Y13, Y14 673 VPADDW Y4, Y7, Y7 674 VPADDW Y5, Y9, Y9 675 VPADDW Y6, Y11, Y11 676 VPADDW Y0, Y13, Y13 677 VMOVDQU 928(CX), Y0 678 VMOVDQU 960(CX), Y1 679 VMOVDQU 992(CX), Y2 680 VMOVDQU 1024(CX), Y3 681 VPSLLD $0x10, Y8, Y4 682 VPBLENDW $0xaa, Y4, Y7, Y4 683 VPSRLD $0x10, Y7, Y7 684 VPBLENDW $0xaa, Y8, Y7, Y8 685 VMOVDQA Y4, Y7 686 VPSLLD $0x10, Y10, Y4 687 VPBLENDW $0xaa, Y4, Y9, Y4 688 VPSRLD $0x10, Y9, Y9 689 VPBLENDW $0xaa, Y10, Y9, Y10 690 VMOVDQA Y4, Y9 691 VPSLLD $0x10, Y12, Y4 692 VPBLENDW $0xaa, Y4, Y11, Y4 693 VPSRLD $0x10, Y11, Y11 694 VPBLENDW $0xaa, Y12, Y11, Y12 695 VMOVDQA Y4, Y11 696 VPSLLD $0x10, Y14, Y4 697 VPBLENDW $0xaa, Y4, Y13, Y4 698 VPSRLD $0x10, Y13, Y13 699 VPBLENDW $0xaa, Y14, Y13, Y14 700 VMOVDQA Y4, Y13 701 VPMULLW Y9, Y0, Y4 702 VPMULLW Y10, Y0, Y5 703 VPMULLW Y13, Y2, Y6 704 VPMULLW Y14, Y2, Y0 705 VPMULHW Y9, Y1, Y9 706 VPMULHW Y10, Y1, Y10 707 VPMULHW Y13, Y3, Y13 708 VPMULHW Y14, Y3, Y14 709 VPMULHW Y4, Y15, Y4 710 VPMULHW Y5, Y15, Y5 711 VPMULHW Y6, Y15, Y6 712 VPMULHW Y0, Y15, Y0 713 VPSUBW Y4, Y9, Y4 714 VPSUBW Y5, Y10, Y5 715 VPSUBW Y6, Y13, Y6 716 VPSUBW Y0, Y14, Y0 717 VPSUBW Y4, Y7, Y9 718 VPSUBW Y5, Y8, Y10 719 VPSUBW Y6, Y11, Y13 720 VPSUBW Y0, Y12, Y14 721 VPADDW Y4, Y7, Y7 722 VPADDW Y5, Y8, Y8 723 VPADDW Y6, Y11, Y11 724 VPADDW Y0, Y12, Y12 725 VMOVDQU Y7, 256(AX) 726 VMOVDQU Y8, 288(AX) 727 VMOVDQU Y9, 320(AX) 728 VMOVDQU Y10, 352(AX) 729 VMOVDQU Y11, 384(AX) 730 VMOVDQU Y12, 416(AX) 731 VMOVDQU Y13, 448(AX) 732 VMOVDQU Y14, 480(AX) 733 RET 734 735 // func invNttAVX2(p *[256]int16) 736 // Requires: AVX, AVX2 737 TEXT ·invNttAVX2(SB), NOSPLIT, $0-8 738 MOVQ p+0(FP), AX 739 LEAQ ·ZetasAVX2+0(SB), CX 740 MOVL $0x00000d01, DX 741 VMOVD DX, X0 742 VPBROADCASTW X0, Y15 743 VMOVDQU (AX), Y7 744 VMOVDQU 32(AX), Y8 745 VMOVDQU 64(AX), Y9 746 VMOVDQU 96(AX), Y10 747 VMOVDQU 128(AX), Y11 748 VMOVDQU 160(AX), Y12 749 VMOVDQU 192(AX), Y13 750 VMOVDQU 224(AX), Y14 751 VMOVDQU 1056(CX), Y0 752 VMOVDQU 1088(CX), Y1 753 VMOVDQU 1120(CX), Y2 754 VMOVDQU 1152(CX), Y3 755 VPSUBW Y7, Y9, Y4 756 VPSUBW Y8, Y10, Y5 757 VPSUBW Y11, Y13, Y6 758 VPADDW Y7, Y9, Y7 759 VPADDW Y8, Y10, Y8 760 VPADDW Y11, Y13, Y11 761 VPMULLW Y4, Y0, Y9 762 VPMULLW Y5, Y0, Y10 763 VPSUBW Y12, Y14, Y0 764 VPMULLW Y6, Y2, Y13 765 VPADDW Y12, Y14, Y12 766 VPMULLW Y0, Y2, Y14 767 VPMULHW Y4, Y1, Y4 768 VPMULHW Y5, Y1, Y5 769 VPMULHW Y6, Y3, Y6 770 VPMULHW Y0, Y3, Y0 771 VPMULHW Y9, Y15, Y9 772 VPMULHW Y10, Y15, Y10 773 VPMULHW Y13, Y15, Y13 774 VPMULHW Y14, Y15, Y14 775 VPSUBW Y9, Y4, Y9 776 VPSUBW Y10, Y5, Y10 777 VPSUBW Y13, Y6, Y13 778 VPSUBW Y14, Y0, Y14 779 VMOVDQU 1312(CX), Y0 780 VMOVDQU 1344(CX), Y1 781 VMOVDQU 1376(CX), Y2 782 VMOVDQU 1408(CX), Y3 783 VPSLLD $0x10, Y8, Y4 784 VPBLENDW $0xaa, Y4, Y7, Y4 785 VPSRLD $0x10, Y7, Y7 786 VPBLENDW $0xaa, Y8, Y7, Y8 787 VMOVDQA Y4, Y7 788 VPSLLD $0x10, Y10, Y4 789 VPBLENDW $0xaa, Y4, Y9, Y4 790 VPSRLD $0x10, Y9, Y9 791 VPBLENDW $0xaa, Y10, Y9, Y10 792 VMOVDQA Y4, Y9 793 VPSLLD $0x10, Y12, Y4 794 VPBLENDW $0xaa, Y4, Y11, Y4 795 VPSRLD $0x10, Y11, Y11 796 VPBLENDW $0xaa, Y12, Y11, Y12 797 VMOVDQA Y4, Y11 798 VPSLLD $0x10, Y14, Y4 799 VPBLENDW $0xaa, Y4, Y13, Y4 800 VPSRLD $0x10, Y13, Y13 801 VPBLENDW $0xaa, Y14, Y13, Y14 802 VMOVDQA Y4, Y13 803 VPSUBW Y7, Y8, Y4 804 VPSUBW Y9, Y10, Y5 805 VPSUBW Y11, Y12, Y6 806 VPADDW Y7, Y8, Y7 807 VPADDW Y9, Y10, Y9 808 VPADDW Y11, Y12, Y11 809 VPMULLW Y4, Y0, Y8 810 VPMULLW Y5, Y0, Y10 811 VPSUBW Y13, Y14, Y0 812 VPMULLW Y6, Y2, Y12 813 VPADDW Y13, Y14, Y13 814 VPMULLW Y0, Y2, Y14 815 VPMULHW Y4, Y1, Y4 816 VPMULHW Y5, Y1, Y5 817 VPMULHW Y6, Y3, Y6 818 VPMULHW Y0, Y3, Y0 819 VPMULHW Y8, Y15, Y8 820 VPMULHW Y10, Y15, Y10 821 VPMULHW Y12, Y15, Y12 822 VPMULHW Y14, Y15, Y14 823 VPSUBW Y8, Y4, Y8 824 VPSUBW Y10, Y5, Y10 825 VPSUBW Y12, Y6, Y12 826 VPSUBW Y14, Y0, Y14 827 VMOVDQU 1568(CX), Y0 828 VMOVDQU 1600(CX), Y1 829 VMOVDQU 1632(CX), Y2 830 VMOVDQU 1664(CX), Y3 831 VMOVSLDUP Y9, Y4 832 VPBLENDD $0xaa, Y4, Y7, Y4 833 VPSRLQ $0x20, Y7, Y7 834 VPBLENDD $0xaa, Y9, Y7, Y9 835 VMOVDQA Y4, Y7 836 VMOVSLDUP Y10, Y4 837 VPBLENDD $0xaa, Y4, Y8, Y4 838 VPSRLQ $0x20, Y8, Y8 839 VPBLENDD $0xaa, Y10, Y8, Y10 840 VMOVDQA Y4, Y8 841 VMOVSLDUP Y13, Y4 842 VPBLENDD $0xaa, Y4, Y11, Y4 843 VPSRLQ $0x20, Y11, Y11 844 VPBLENDD $0xaa, Y13, Y11, Y13 845 VMOVDQA Y4, Y11 846 VMOVSLDUP Y14, Y4 847 VPBLENDD $0xaa, Y4, Y12, Y4 848 VPSRLQ $0x20, Y12, Y12 849 VPBLENDD $0xaa, Y14, Y12, Y14 850 VMOVDQA Y4, Y12 851 VPSUBW Y7, Y9, Y4 852 VPSUBW Y8, Y10, Y5 853 VPSUBW Y11, Y13, Y6 854 VPADDW Y7, Y9, Y7 855 VPADDW Y8, Y10, Y8 856 VPADDW Y11, Y13, Y11 857 VPMULLW Y4, Y0, Y9 858 VPMULLW Y5, Y0, Y10 859 VPSUBW Y12, Y14, Y0 860 VPMULLW Y6, Y2, Y13 861 VPADDW Y12, Y14, Y12 862 VPMULLW Y0, Y2, Y14 863 VPMULHW Y4, Y1, Y4 864 VPMULHW Y5, Y1, Y5 865 VPMULHW Y6, Y3, Y6 866 VPMULHW Y0, Y3, Y0 867 VPMULHW Y9, Y15, Y9 868 VPMULHW Y10, Y15, Y10 869 VPMULHW Y13, Y15, Y13 870 VPMULHW Y14, Y15, Y14 871 VPSUBW Y9, Y4, Y9 872 VPSUBW Y10, Y5, Y10 873 VPSUBW Y13, Y6, Y13 874 VPSUBW Y14, Y0, Y14 875 MOVL $0x00004ebf, DX 876 VMOVD DX, X0 877 VPBROADCASTW X0, Y4 878 VPMULHW Y4, Y7, Y5 879 VPSRAW $0x0a, Y5, Y5 880 VPMULLW Y15, Y5, Y5 881 VPSUBW Y5, Y7, Y7 882 VPMULHW Y4, Y11, Y5 883 VPSRAW $0x0a, Y5, Y5 884 VPMULLW Y15, Y5, Y5 885 VPSUBW Y5, Y11, Y11 886 VMOVDQU 1824(CX), Y0 887 VMOVDQU 1856(CX), Y1 888 VMOVDQU 1888(CX), Y2 889 VMOVDQU 1920(CX), Y3 890 VPUNPCKLQDQ Y8, Y7, Y4 891 VPUNPCKHQDQ Y8, Y7, Y8 892 VMOVDQA Y4, Y7 893 VPUNPCKLQDQ Y10, Y9, Y4 894 VPUNPCKHQDQ Y10, Y9, Y10 895 VMOVDQA Y4, Y9 896 VPUNPCKLQDQ Y12, Y11, Y4 897 VPUNPCKHQDQ Y12, Y11, Y12 898 VMOVDQA Y4, Y11 899 VPUNPCKLQDQ Y14, Y13, Y4 900 VPUNPCKHQDQ Y14, Y13, Y14 901 VMOVDQA Y4, Y13 902 VPSUBW Y7, Y8, Y4 903 VPSUBW Y9, Y10, Y5 904 VPSUBW Y11, Y12, Y6 905 VPADDW Y7, Y8, Y7 906 VPADDW Y9, Y10, Y9 907 VPADDW Y11, Y12, Y11 908 VPMULLW Y4, Y0, Y8 909 VPMULLW Y5, Y0, Y10 910 VPSUBW Y13, Y14, Y0 911 VPMULLW Y6, Y2, Y12 912 VPADDW Y13, Y14, Y13 913 VPMULLW Y0, Y2, Y14 914 VPMULHW Y4, Y1, Y4 915 VPMULHW Y5, Y1, Y5 916 VPMULHW Y6, Y3, Y6 917 VPMULHW Y0, Y3, Y0 918 VPMULHW Y8, Y15, Y8 919 VPMULHW Y10, Y15, Y10 920 VPMULHW Y12, Y15, Y12 921 VPMULHW Y14, Y15, Y14 922 VPSUBW Y8, Y4, Y8 923 VPSUBW Y10, Y5, Y10 924 VPSUBW Y12, Y6, Y12 925 VPSUBW Y14, Y0, Y14 926 VPBROADCASTW 2080(CX), Y0 927 VPBROADCASTW 2082(CX), Y1 928 VPBROADCASTW 2084(CX), Y2 929 VPBROADCASTW 2086(CX), Y3 930 VPERM2I128 $0x20, Y9, Y7, Y4 931 VPERM2I128 $0x31, Y9, Y7, Y9 932 VMOVDQA Y4, Y7 933 VPERM2I128 $0x20, Y10, Y8, Y4 934 VPERM2I128 $0x31, Y10, Y8, Y10 935 VMOVDQA Y4, Y8 936 VPERM2I128 $0x20, Y13, Y11, Y4 937 VPERM2I128 $0x31, Y13, Y11, Y13 938 VMOVDQA Y4, Y11 939 VPERM2I128 $0x20, Y14, Y12, Y4 940 VPERM2I128 $0x31, Y14, Y12, Y14 941 VMOVDQA Y4, Y12 942 VPSUBW Y7, Y9, Y4 943 VPSUBW Y8, Y10, Y5 944 VPSUBW Y11, Y13, Y6 945 VPADDW Y7, Y9, Y7 946 VPADDW Y8, Y10, Y8 947 VPADDW Y11, Y13, Y11 948 VPMULLW Y4, Y0, Y9 949 VPMULLW Y5, Y0, Y10 950 VPSUBW Y12, Y14, Y0 951 VPMULLW Y6, Y2, Y13 952 VPADDW Y12, Y14, Y12 953 VPMULLW Y0, Y2, Y14 954 VPMULHW Y4, Y1, Y4 955 VPMULHW Y5, Y1, Y5 956 VPMULHW Y6, Y3, Y6 957 VPMULHW Y0, Y3, Y0 958 VPMULHW Y9, Y15, Y9 959 VPMULHW Y10, Y15, Y10 960 VPMULHW Y13, Y15, Y13 961 VPMULHW Y14, Y15, Y14 962 VPSUBW Y9, Y4, Y9 963 VPSUBW Y10, Y5, Y10 964 VPSUBW Y13, Y6, Y13 965 VPSUBW Y14, Y0, Y14 966 MOVL $0x00004ebf, DX 967 VMOVD DX, X0 968 VPBROADCASTW X0, Y4 969 VPMULHW Y4, Y7, Y5 970 VPSRAW $0x0a, Y5, Y5 971 VPMULLW Y15, Y5, Y5 972 VPSUBW Y5, Y7, Y7 973 VPMULHW Y4, Y11, Y5 974 VPSRAW $0x0a, Y5, Y5 975 VPMULLW Y15, Y5, Y5 976 VPSUBW Y5, Y11, Y11 977 VPBROADCASTW 2096(CX), Y0 978 VPBROADCASTW 2098(CX), Y1 979 VPSUBW Y7, Y11, Y4 980 VPSUBW Y8, Y12, Y5 981 VPSUBW Y9, Y13, Y6 982 VPADDW Y7, Y11, Y7 983 VPADDW Y8, Y12, Y8 984 VPADDW Y9, Y13, Y9 985 VPMULLW Y4, Y0, Y11 986 VPMULLW Y5, Y0, Y12 987 VPSUBW Y10, Y14, Y2 988 VPMULLW Y6, Y0, Y13 989 VPADDW Y10, Y14, Y10 990 VPMULLW Y2, Y0, Y14 991 VPMULHW Y4, Y1, Y4 992 VPMULHW Y5, Y1, Y5 993 VPMULHW Y6, Y1, Y6 994 VPMULHW Y2, Y1, Y2 995 VPMULHW Y11, Y15, Y11 996 VPMULHW Y12, Y15, Y12 997 VPMULHW Y13, Y15, Y13 998 VPMULHW Y14, Y15, Y14 999 VPSUBW Y11, Y4, Y11 1000 VPSUBW Y12, Y5, Y12 1001 VPSUBW Y13, Y6, Y13 1002 VPSUBW Y14, Y2, Y14 1003 VMOVDQU Y7, (AX) 1004 VMOVDQU Y8, 32(AX) 1005 VMOVDQU Y9, 64(AX) 1006 VMOVDQU Y10, 96(AX) 1007 VMOVDQU Y11, 128(AX) 1008 VMOVDQU Y12, 160(AX) 1009 VMOVDQU Y13, 192(AX) 1010 VMOVDQU Y14, 224(AX) 1011 VMOVDQU 256(AX), Y7 1012 VMOVDQU 288(AX), Y8 1013 VMOVDQU 320(AX), Y9 1014 VMOVDQU 352(AX), Y10 1015 VMOVDQU 384(AX), Y11 1016 VMOVDQU 416(AX), Y12 1017 VMOVDQU 448(AX), Y13 1018 VMOVDQU 480(AX), Y14 1019 VMOVDQU 1184(CX), Y0 1020 VMOVDQU 1216(CX), Y1 1021 VMOVDQU 1248(CX), Y2 1022 VMOVDQU 1280(CX), Y3 1023 VPSUBW Y7, Y9, Y4 1024 VPSUBW Y8, Y10, Y5 1025 VPSUBW Y11, Y13, Y6 1026 VPADDW Y7, Y9, Y7 1027 VPADDW Y8, Y10, Y8 1028 VPADDW Y11, Y13, Y11 1029 VPMULLW Y4, Y0, Y9 1030 VPMULLW Y5, Y0, Y10 1031 VPSUBW Y12, Y14, Y0 1032 VPMULLW Y6, Y2, Y13 1033 VPADDW Y12, Y14, Y12 1034 VPMULLW Y0, Y2, Y14 1035 VPMULHW Y4, Y1, Y4 1036 VPMULHW Y5, Y1, Y5 1037 VPMULHW Y6, Y3, Y6 1038 VPMULHW Y0, Y3, Y0 1039 VPMULHW Y9, Y15, Y9 1040 VPMULHW Y10, Y15, Y10 1041 VPMULHW Y13, Y15, Y13 1042 VPMULHW Y14, Y15, Y14 1043 VPSUBW Y9, Y4, Y9 1044 VPSUBW Y10, Y5, Y10 1045 VPSUBW Y13, Y6, Y13 1046 VPSUBW Y14, Y0, Y14 1047 VMOVDQU 1440(CX), Y0 1048 VMOVDQU 1472(CX), Y1 1049 VMOVDQU 1504(CX), Y2 1050 VMOVDQU 1536(CX), Y3 1051 VPSLLD $0x10, Y8, Y4 1052 VPBLENDW $0xaa, Y4, Y7, Y4 1053 VPSRLD $0x10, Y7, Y7 1054 VPBLENDW $0xaa, Y8, Y7, Y8 1055 VMOVDQA Y4, Y7 1056 VPSLLD $0x10, Y10, Y4 1057 VPBLENDW $0xaa, Y4, Y9, Y4 1058 VPSRLD $0x10, Y9, Y9 1059 VPBLENDW $0xaa, Y10, Y9, Y10 1060 VMOVDQA Y4, Y9 1061 VPSLLD $0x10, Y12, Y4 1062 VPBLENDW $0xaa, Y4, Y11, Y4 1063 VPSRLD $0x10, Y11, Y11 1064 VPBLENDW $0xaa, Y12, Y11, Y12 1065 VMOVDQA Y4, Y11 1066 VPSLLD $0x10, Y14, Y4 1067 VPBLENDW $0xaa, Y4, Y13, Y4 1068 VPSRLD $0x10, Y13, Y13 1069 VPBLENDW $0xaa, Y14, Y13, Y14 1070 VMOVDQA Y4, Y13 1071 VPSUBW Y7, Y8, Y4 1072 VPSUBW Y9, Y10, Y5 1073 VPSUBW Y11, Y12, Y6 1074 VPADDW Y7, Y8, Y7 1075 VPADDW Y9, Y10, Y9 1076 VPADDW Y11, Y12, Y11 1077 VPMULLW Y4, Y0, Y8 1078 VPMULLW Y5, Y0, Y10 1079 VPSUBW Y13, Y14, Y0 1080 VPMULLW Y6, Y2, Y12 1081 VPADDW Y13, Y14, Y13 1082 VPMULLW Y0, Y2, Y14 1083 VPMULHW Y4, Y1, Y4 1084 VPMULHW Y5, Y1, Y5 1085 VPMULHW Y6, Y3, Y6 1086 VPMULHW Y0, Y3, Y0 1087 VPMULHW Y8, Y15, Y8 1088 VPMULHW Y10, Y15, Y10 1089 VPMULHW Y12, Y15, Y12 1090 VPMULHW Y14, Y15, Y14 1091 VPSUBW Y8, Y4, Y8 1092 VPSUBW Y10, Y5, Y10 1093 VPSUBW Y12, Y6, Y12 1094 VPSUBW Y14, Y0, Y14 1095 VMOVDQU 1696(CX), Y0 1096 VMOVDQU 1728(CX), Y1 1097 VMOVDQU 1760(CX), Y2 1098 VMOVDQU 1792(CX), Y3 1099 VMOVSLDUP Y9, Y4 1100 VPBLENDD $0xaa, Y4, Y7, Y4 1101 VPSRLQ $0x20, Y7, Y7 1102 VPBLENDD $0xaa, Y9, Y7, Y9 1103 VMOVDQA Y4, Y7 1104 VMOVSLDUP Y10, Y4 1105 VPBLENDD $0xaa, Y4, Y8, Y4 1106 VPSRLQ $0x20, Y8, Y8 1107 VPBLENDD $0xaa, Y10, Y8, Y10 1108 VMOVDQA Y4, Y8 1109 VMOVSLDUP Y13, Y4 1110 VPBLENDD $0xaa, Y4, Y11, Y4 1111 VPSRLQ $0x20, Y11, Y11 1112 VPBLENDD $0xaa, Y13, Y11, Y13 1113 VMOVDQA Y4, Y11 1114 VMOVSLDUP Y14, Y4 1115 VPBLENDD $0xaa, Y4, Y12, Y4 1116 VPSRLQ $0x20, Y12, Y12 1117 VPBLENDD $0xaa, Y14, Y12, Y14 1118 VMOVDQA Y4, Y12 1119 VPSUBW Y7, Y9, Y4 1120 VPSUBW Y8, Y10, Y5 1121 VPSUBW Y11, Y13, Y6 1122 VPADDW Y7, Y9, Y7 1123 VPADDW Y8, Y10, Y8 1124 VPADDW Y11, Y13, Y11 1125 VPMULLW Y4, Y0, Y9 1126 VPMULLW Y5, Y0, Y10 1127 VPSUBW Y12, Y14, Y0 1128 VPMULLW Y6, Y2, Y13 1129 VPADDW Y12, Y14, Y12 1130 VPMULLW Y0, Y2, Y14 1131 VPMULHW Y4, Y1, Y4 1132 VPMULHW Y5, Y1, Y5 1133 VPMULHW Y6, Y3, Y6 1134 VPMULHW Y0, Y3, Y0 1135 VPMULHW Y9, Y15, Y9 1136 VPMULHW Y10, Y15, Y10 1137 VPMULHW Y13, Y15, Y13 1138 VPMULHW Y14, Y15, Y14 1139 VPSUBW Y9, Y4, Y9 1140 VPSUBW Y10, Y5, Y10 1141 VPSUBW Y13, Y6, Y13 1142 VPSUBW Y14, Y0, Y14 1143 MOVL $0x00004ebf, DX 1144 VMOVD DX, X0 1145 VPBROADCASTW X0, Y4 1146 VPMULHW Y4, Y7, Y5 1147 VPSRAW $0x0a, Y5, Y5 1148 VPMULLW Y15, Y5, Y5 1149 VPSUBW Y5, Y7, Y7 1150 VPMULHW Y4, Y11, Y5 1151 VPSRAW $0x0a, Y5, Y5 1152 VPMULLW Y15, Y5, Y5 1153 VPSUBW Y5, Y11, Y11 1154 VMOVDQU 1952(CX), Y0 1155 VMOVDQU 1984(CX), Y1 1156 VMOVDQU 2016(CX), Y2 1157 VMOVDQU 2048(CX), Y3 1158 VPUNPCKLQDQ Y8, Y7, Y4 1159 VPUNPCKHQDQ Y8, Y7, Y8 1160 VMOVDQA Y4, Y7 1161 VPUNPCKLQDQ Y10, Y9, Y4 1162 VPUNPCKHQDQ Y10, Y9, Y10 1163 VMOVDQA Y4, Y9 1164 VPUNPCKLQDQ Y12, Y11, Y4 1165 VPUNPCKHQDQ Y12, Y11, Y12 1166 VMOVDQA Y4, Y11 1167 VPUNPCKLQDQ Y14, Y13, Y4 1168 VPUNPCKHQDQ Y14, Y13, Y14 1169 VMOVDQA Y4, Y13 1170 VPSUBW Y7, Y8, Y4 1171 VPSUBW Y9, Y10, Y5 1172 VPSUBW Y11, Y12, Y6 1173 VPADDW Y7, Y8, Y7 1174 VPADDW Y9, Y10, Y9 1175 VPADDW Y11, Y12, Y11 1176 VPMULLW Y4, Y0, Y8 1177 VPMULLW Y5, Y0, Y10 1178 VPSUBW Y13, Y14, Y0 1179 VPMULLW Y6, Y2, Y12 1180 VPADDW Y13, Y14, Y13 1181 VPMULLW Y0, Y2, Y14 1182 VPMULHW Y4, Y1, Y4 1183 VPMULHW Y5, Y1, Y5 1184 VPMULHW Y6, Y3, Y6 1185 VPMULHW Y0, Y3, Y0 1186 VPMULHW Y8, Y15, Y8 1187 VPMULHW Y10, Y15, Y10 1188 VPMULHW Y12, Y15, Y12 1189 VPMULHW Y14, Y15, Y14 1190 VPSUBW Y8, Y4, Y8 1191 VPSUBW Y10, Y5, Y10 1192 VPSUBW Y12, Y6, Y12 1193 VPSUBW Y14, Y0, Y14 1194 VPBROADCASTW 2088(CX), Y0 1195 VPBROADCASTW 2090(CX), Y1 1196 VPBROADCASTW 2092(CX), Y2 1197 VPBROADCASTW 2094(CX), Y3 1198 VPERM2I128 $0x20, Y9, Y7, Y4 1199 VPERM2I128 $0x31, Y9, Y7, Y9 1200 VMOVDQA Y4, Y7 1201 VPERM2I128 $0x20, Y10, Y8, Y4 1202 VPERM2I128 $0x31, Y10, Y8, Y10 1203 VMOVDQA Y4, Y8 1204 VPERM2I128 $0x20, Y13, Y11, Y4 1205 VPERM2I128 $0x31, Y13, Y11, Y13 1206 VMOVDQA Y4, Y11 1207 VPERM2I128 $0x20, Y14, Y12, Y4 1208 VPERM2I128 $0x31, Y14, Y12, Y14 1209 VMOVDQA Y4, Y12 1210 VPSUBW Y7, Y9, Y4 1211 VPSUBW Y8, Y10, Y5 1212 VPSUBW Y11, Y13, Y6 1213 VPADDW Y7, Y9, Y7 1214 VPADDW Y8, Y10, Y8 1215 VPADDW Y11, Y13, Y11 1216 VPMULLW Y4, Y0, Y9 1217 VPMULLW Y5, Y0, Y10 1218 VPSUBW Y12, Y14, Y0 1219 VPMULLW Y6, Y2, Y13 1220 VPADDW Y12, Y14, Y12 1221 VPMULLW Y0, Y2, Y14 1222 VPMULHW Y4, Y1, Y4 1223 VPMULHW Y5, Y1, Y5 1224 VPMULHW Y6, Y3, Y6 1225 VPMULHW Y0, Y3, Y0 1226 VPMULHW Y9, Y15, Y9 1227 VPMULHW Y10, Y15, Y10 1228 VPMULHW Y13, Y15, Y13 1229 VPMULHW Y14, Y15, Y14 1230 VPSUBW Y9, Y4, Y9 1231 VPSUBW Y10, Y5, Y10 1232 VPSUBW Y13, Y6, Y13 1233 VPSUBW Y14, Y0, Y14 1234 MOVL $0x00004ebf, DX 1235 VMOVD DX, X0 1236 VPBROADCASTW X0, Y4 1237 VPMULHW Y4, Y7, Y5 1238 VPSRAW $0x0a, Y5, Y5 1239 VPMULLW Y15, Y5, Y5 1240 VPSUBW Y5, Y7, Y7 1241 VPMULHW Y4, Y11, Y5 1242 VPSRAW $0x0a, Y5, Y5 1243 VPMULLW Y15, Y5, Y5 1244 VPSUBW Y5, Y11, Y11 1245 VPBROADCASTW 2100(CX), Y0 1246 VPBROADCASTW 2102(CX), Y1 1247 VPSUBW Y7, Y11, Y4 1248 VPSUBW Y8, Y12, Y5 1249 VPSUBW Y9, Y13, Y6 1250 VPADDW Y7, Y11, Y7 1251 VPADDW Y8, Y12, Y8 1252 VPADDW Y9, Y13, Y9 1253 VPMULLW Y4, Y0, Y11 1254 VPMULLW Y5, Y0, Y12 1255 VPSUBW Y10, Y14, Y2 1256 VPMULLW Y6, Y0, Y13 1257 VPADDW Y10, Y14, Y10 1258 VPMULLW Y2, Y0, Y14 1259 VPMULHW Y4, Y1, Y4 1260 VPMULHW Y5, Y1, Y5 1261 VPMULHW Y6, Y1, Y6 1262 VPMULHW Y2, Y1, Y2 1263 VPMULHW Y11, Y15, Y11 1264 VPMULHW Y12, Y15, Y12 1265 VPMULHW Y13, Y15, Y13 1266 VPMULHW Y14, Y15, Y14 1267 VPSUBW Y11, Y4, Y11 1268 VPSUBW Y12, Y5, Y12 1269 VPSUBW Y13, Y6, Y13 1270 VPSUBW Y14, Y2, Y14 1271 VMOVDQU Y7, 256(AX) 1272 VMOVDQU Y8, 288(AX) 1273 VMOVDQU Y9, 320(AX) 1274 VMOVDQU Y10, 352(AX) 1275 VMOVDQU Y11, 384(AX) 1276 VMOVDQU Y12, 416(AX) 1277 VMOVDQU Y13, 448(AX) 1278 VMOVDQU Y14, 480(AX) 1279 VPBROADCASTW 2104(CX), Y0 1280 VPBROADCASTW 2106(CX), Y1 1281 VMOVDQU (AX), Y7 1282 VMOVDQU 32(AX), Y8 1283 VMOVDQU 64(AX), Y9 1284 VMOVDQU 96(AX), Y10 1285 VMOVDQU 256(AX), Y11 1286 VMOVDQU 288(AX), Y12 1287 VMOVDQU 320(AX), Y13 1288 VMOVDQU 352(AX), Y14 1289 VPSUBW Y7, Y11, Y2 1290 VPSUBW Y8, Y12, Y3 1291 VPSUBW Y9, Y13, Y4 1292 VPADDW Y7, Y11, Y7 1293 VPADDW Y8, Y12, Y8 1294 VPADDW Y9, Y13, Y9 1295 VPMULLW Y2, Y0, Y11 1296 VPMULLW Y3, Y0, Y12 1297 VPSUBW Y10, Y14, Y5 1298 VPMULLW Y4, Y0, Y13 1299 VPADDW Y10, Y14, Y10 1300 VPMULLW Y5, Y0, Y14 1301 VPMULHW Y2, Y1, Y2 1302 VPMULHW Y3, Y1, Y3 1303 VPMULHW Y4, Y1, Y4 1304 VPMULHW Y5, Y1, Y5 1305 VPMULHW Y11, Y15, Y11 1306 VPMULHW Y12, Y15, Y12 1307 VPMULHW Y13, Y15, Y13 1308 VPMULHW Y14, Y15, Y14 1309 VPSUBW Y11, Y2, Y11 1310 VPSUBW Y12, Y3, Y12 1311 VPSUBW Y13, Y4, Y13 1312 VPSUBW Y14, Y5, Y14 1313 MOVL $0xffffd8a1, DX 1314 VMOVD DX, X0 1315 VPBROADCASTW X0, Y0 1316 MOVL $0x000005a1, DX 1317 VMOVD DX, X1 1318 VPBROADCASTW X1, Y1 1319 VPMULLW Y7, Y0, Y2 1320 VPMULLW Y8, Y0, Y3 1321 VPMULLW Y9, Y0, Y4 1322 VPMULLW Y10, Y0, Y5 1323 VPMULHW Y7, Y1, Y7 1324 VPMULHW Y8, Y1, Y8 1325 VPMULHW Y9, Y1, Y9 1326 VPMULHW Y10, Y1, Y10 1327 VPMULHW Y2, Y15, Y2 1328 VPMULHW Y3, Y15, Y3 1329 VPMULHW Y4, Y15, Y4 1330 VPMULHW Y5, Y15, Y5 1331 VPSUBW Y2, Y7, Y7 1332 VPSUBW Y3, Y8, Y8 1333 VPSUBW Y4, Y9, Y9 1334 VPSUBW Y5, Y10, Y10 1335 VPMULLW Y11, Y0, Y2 1336 VPMULLW Y12, Y0, Y3 1337 VPMULLW Y13, Y0, Y4 1338 VPMULLW Y14, Y0, Y5 1339 VPMULHW Y11, Y1, Y11 1340 VPMULHW Y12, Y1, Y12 1341 VPMULHW Y13, Y1, Y13 1342 VPMULHW Y14, Y1, Y14 1343 VPMULHW Y2, Y15, Y2 1344 VPMULHW Y3, Y15, Y3 1345 VPMULHW Y4, Y15, Y4 1346 VPMULHW Y5, Y15, Y5 1347 VPSUBW Y2, Y11, Y11 1348 VPSUBW Y3, Y12, Y12 1349 VPSUBW Y4, Y13, Y13 1350 VPSUBW Y5, Y14, Y14 1351 VMOVDQU Y7, (AX) 1352 VMOVDQU Y8, 32(AX) 1353 VMOVDQU Y9, 64(AX) 1354 VMOVDQU Y10, 96(AX) 1355 VMOVDQU Y11, 256(AX) 1356 VMOVDQU Y12, 288(AX) 1357 VMOVDQU Y13, 320(AX) 1358 VMOVDQU Y14, 352(AX) 1359 VPBROADCASTW 2104(CX), Y0 1360 VPBROADCASTW 2106(CX), Y1 1361 VMOVDQU 128(AX), Y7 1362 VMOVDQU 160(AX), Y8 1363 VMOVDQU 192(AX), Y9 1364 VMOVDQU 224(AX), Y10 1365 VMOVDQU 384(AX), Y11 1366 VMOVDQU 416(AX), Y12 1367 VMOVDQU 448(AX), Y13 1368 VMOVDQU 480(AX), Y14 1369 VPSUBW Y7, Y11, Y2 1370 VPSUBW Y8, Y12, Y3 1371 VPSUBW Y9, Y13, Y4 1372 VPADDW Y7, Y11, Y7 1373 VPADDW Y8, Y12, Y8 1374 VPADDW Y9, Y13, Y9 1375 VPMULLW Y2, Y0, Y11 1376 VPMULLW Y3, Y0, Y12 1377 VPSUBW Y10, Y14, Y5 1378 VPMULLW Y4, Y0, Y13 1379 VPADDW Y10, Y14, Y10 1380 VPMULLW Y5, Y0, Y14 1381 VPMULHW Y2, Y1, Y2 1382 VPMULHW Y3, Y1, Y3 1383 VPMULHW Y4, Y1, Y4 1384 VPMULHW Y5, Y1, Y5 1385 VPMULHW Y11, Y15, Y11 1386 VPMULHW Y12, Y15, Y12 1387 VPMULHW Y13, Y15, Y13 1388 VPMULHW Y14, Y15, Y14 1389 VPSUBW Y11, Y2, Y11 1390 VPSUBW Y12, Y3, Y12 1391 VPSUBW Y13, Y4, Y13 1392 VPSUBW Y14, Y5, Y14 1393 MOVL $0xffffd8a1, CX 1394 VMOVD CX, X0 1395 VPBROADCASTW X0, Y0 1396 MOVL $0x000005a1, CX 1397 VMOVD CX, X1 1398 VPBROADCASTW X1, Y1 1399 VPMULLW Y7, Y0, Y2 1400 VPMULLW Y8, Y0, Y3 1401 VPMULLW Y9, Y0, Y4 1402 VPMULLW Y10, Y0, Y5 1403 VPMULHW Y7, Y1, Y7 1404 VPMULHW Y8, Y1, Y8 1405 VPMULHW Y9, Y1, Y9 1406 VPMULHW Y10, Y1, Y10 1407 VPMULHW Y2, Y15, Y2 1408 VPMULHW Y3, Y15, Y3 1409 VPMULHW Y4, Y15, Y4 1410 VPMULHW Y5, Y15, Y5 1411 VPSUBW Y2, Y7, Y7 1412 VPSUBW Y3, Y8, Y8 1413 VPSUBW Y4, Y9, Y9 1414 VPSUBW Y5, Y10, Y10 1415 VPMULLW Y11, Y0, Y2 1416 VPMULLW Y12, Y0, Y3 1417 VPMULLW Y13, Y0, Y4 1418 VPMULLW Y14, Y0, Y5 1419 VPMULHW Y11, Y1, Y11 1420 VPMULHW Y12, Y1, Y12 1421 VPMULHW Y13, Y1, Y13 1422 VPMULHW Y14, Y1, Y14 1423 VPMULHW Y2, Y15, Y2 1424 VPMULHW Y3, Y15, Y3 1425 VPMULHW Y4, Y15, Y4 1426 VPMULHW Y5, Y15, Y5 1427 VPSUBW Y2, Y11, Y11 1428 VPSUBW Y3, Y12, Y12 1429 VPSUBW Y4, Y13, Y13 1430 VPSUBW Y5, Y14, Y14 1431 VMOVDQU Y7, 128(AX) 1432 VMOVDQU Y8, 160(AX) 1433 VMOVDQU Y9, 192(AX) 1434 VMOVDQU Y10, 224(AX) 1435 VMOVDQU Y11, 384(AX) 1436 VMOVDQU Y12, 416(AX) 1437 VMOVDQU Y13, 448(AX) 1438 VMOVDQU Y14, 480(AX) 1439 RET 1440 1441 // func mulHatAVX2(p *[256]int16, a *[256]int16, b *[256]int16) 1442 // Requires: AVX, AVX2 1443 TEXT ·mulHatAVX2(SB), NOSPLIT, $8-24 1444 MOVQ p+0(FP), AX 1445 MOVQ a+8(FP), CX 1446 MOVQ b+16(FP), DX 1447 LEAQ ·ZetasAVX2+0(SB), BX 1448 MOVL $0xfffff301, SI 1449 VMOVD SI, X0 1450 VPBROADCASTW X0, Y14 1451 MOVL $0x00000d01, SI 1452 VMOVD SI, X0 1453 VPBROADCASTW X0, Y15 1454 VMOVDQU (CX), Y0 1455 VMOVDQU 32(CX), Y1 1456 VMOVDQU 64(CX), Y2 1457 VMOVDQU 96(CX), Y3 1458 VMOVDQU (DX), Y4 1459 VMOVDQU 32(DX), Y5 1460 VMOVDQU 64(DX), Y6 1461 VMOVDQU 96(DX), Y7 1462 VPMULLW Y1, Y5, Y8 1463 VPMULLW Y0, Y4, Y9 1464 VPMULLW Y0, Y5, Y10 1465 VPMULLW Y1, Y4, Y11 1466 VPMULLW Y8, Y14, Y8 1467 VPMULLW Y9, Y14, Y9 1468 VPMULLW Y10, Y14, Y10 1469 VPMULLW Y11, Y14, Y11 1470 VPMULHW Y1, Y5, Y12 1471 VPMULHW Y0, Y4, Y13 1472 VPMULHW Y0, Y5, Y0 1473 VPMULHW Y1, Y4, Y1 1474 VMOVDQA Y12, Y4 1475 VMOVDQA Y13, Y5 1476 VPMULHW Y8, Y15, Y8 1477 VPMULHW Y9, Y15, Y9 1478 VPMULHW Y10, Y15, Y10 1479 VPMULHW Y11, Y15, Y11 1480 VPSUBW Y8, Y4, Y4 1481 VPSUBW Y9, Y5, Y5 1482 VPSUBW Y10, Y0, Y0 1483 VPSUBW Y11, Y1, Y1 1484 VMOVDQU 800(BX), Y12 1485 VMOVDQU 832(BX), Y13 1486 VPMULLW Y4, Y12, Y8 1487 VPMULHW Y4, Y13, Y4 1488 VPMULHW Y8, Y15, Y8 1489 VPSUBW Y8, Y4, Y4 1490 VPADDW Y4, Y5, Y4 1491 VPADDW Y0, Y1, Y5 1492 VPMULLW Y3, Y7, Y8 1493 VPMULLW Y2, Y6, Y9 1494 VPMULLW Y2, Y7, Y10 1495 VPMULLW Y3, Y6, Y11 1496 VPMULLW Y8, Y14, Y8 1497 VPMULLW Y9, Y14, Y9 1498 VPMULLW Y10, Y14, Y10 1499 VPMULLW Y11, Y14, Y11 1500 VPMULHW Y3, Y7, Y12 1501 VPMULHW Y2, Y6, Y13 1502 VPMULHW Y2, Y7, Y2 1503 VPMULHW Y3, Y6, Y3 1504 VMOVDQA Y12, Y6 1505 VMOVDQA Y13, Y7 1506 VPMULHW Y8, Y15, Y8 1507 VPMULHW Y9, Y15, Y9 1508 VPMULHW Y10, Y15, Y10 1509 VPMULHW Y11, Y15, Y11 1510 VPSUBW Y8, Y6, Y6 1511 VPSUBW Y9, Y7, Y7 1512 VPSUBW Y10, Y2, Y2 1513 VPSUBW Y11, Y3, Y3 1514 VMOVDQU 800(BX), Y12 1515 VMOVDQU 832(BX), Y13 1516 VPMULLW Y6, Y12, Y8 1517 VPMULHW Y6, Y13, Y6 1518 VPMULHW Y8, Y15, Y8 1519 VPSUBW Y8, Y6, Y6 1520 VPSUBW Y6, Y7, Y6 1521 VPADDW Y2, Y3, Y7 1522 VMOVDQU Y4, (AX) 1523 VMOVDQU Y5, 32(AX) 1524 VMOVDQU Y6, 64(AX) 1525 VMOVDQU Y7, 96(AX) 1526 VMOVDQU 128(CX), Y0 1527 VMOVDQU 160(CX), Y1 1528 VMOVDQU 192(CX), Y2 1529 VMOVDQU 224(CX), Y3 1530 VMOVDQU 128(DX), Y4 1531 VMOVDQU 160(DX), Y5 1532 VMOVDQU 192(DX), Y6 1533 VMOVDQU 224(DX), Y7 1534 VPMULLW Y1, Y5, Y8 1535 VPMULLW Y0, Y4, Y9 1536 VPMULLW Y0, Y5, Y10 1537 VPMULLW Y1, Y4, Y11 1538 VPMULLW Y8, Y14, Y8 1539 VPMULLW Y9, Y14, Y9 1540 VPMULLW Y10, Y14, Y10 1541 VPMULLW Y11, Y14, Y11 1542 VPMULHW Y1, Y5, Y12 1543 VPMULHW Y0, Y4, Y13 1544 VPMULHW Y0, Y5, Y0 1545 VPMULHW Y1, Y4, Y1 1546 VMOVDQA Y12, Y4 1547 VMOVDQA Y13, Y5 1548 VPMULHW Y8, Y15, Y8 1549 VPMULHW Y9, Y15, Y9 1550 VPMULHW Y10, Y15, Y10 1551 VPMULHW Y11, Y15, Y11 1552 VPSUBW Y8, Y4, Y4 1553 VPSUBW Y9, Y5, Y5 1554 VPSUBW Y10, Y0, Y0 1555 VPSUBW Y11, Y1, Y1 1556 VMOVDQU 864(BX), Y12 1557 VMOVDQU 896(BX), Y13 1558 VPMULLW Y4, Y12, Y8 1559 VPMULHW Y4, Y13, Y4 1560 VPMULHW Y8, Y15, Y8 1561 VPSUBW Y8, Y4, Y4 1562 VPADDW Y4, Y5, Y4 1563 VPADDW Y0, Y1, Y5 1564 VPMULLW Y3, Y7, Y8 1565 VPMULLW Y2, Y6, Y9 1566 VPMULLW Y2, Y7, Y10 1567 VPMULLW Y3, Y6, Y11 1568 VPMULLW Y8, Y14, Y8 1569 VPMULLW Y9, Y14, Y9 1570 VPMULLW Y10, Y14, Y10 1571 VPMULLW Y11, Y14, Y11 1572 VPMULHW Y3, Y7, Y12 1573 VPMULHW Y2, Y6, Y13 1574 VPMULHW Y2, Y7, Y2 1575 VPMULHW Y3, Y6, Y3 1576 VMOVDQA Y12, Y6 1577 VMOVDQA Y13, Y7 1578 VPMULHW Y8, Y15, Y8 1579 VPMULHW Y9, Y15, Y9 1580 VPMULHW Y10, Y15, Y10 1581 VPMULHW Y11, Y15, Y11 1582 VPSUBW Y8, Y6, Y6 1583 VPSUBW Y9, Y7, Y7 1584 VPSUBW Y10, Y2, Y2 1585 VPSUBW Y11, Y3, Y3 1586 VMOVDQU 864(BX), Y12 1587 VMOVDQU 896(BX), Y13 1588 VPMULLW Y6, Y12, Y8 1589 VPMULHW Y6, Y13, Y6 1590 VPMULHW Y8, Y15, Y8 1591 VPSUBW Y8, Y6, Y6 1592 VPSUBW Y6, Y7, Y6 1593 VPADDW Y2, Y3, Y7 1594 VMOVDQU Y4, 128(AX) 1595 VMOVDQU Y5, 160(AX) 1596 VMOVDQU Y6, 192(AX) 1597 VMOVDQU Y7, 224(AX) 1598 VMOVDQU 256(CX), Y0 1599 VMOVDQU 288(CX), Y1 1600 VMOVDQU 320(CX), Y2 1601 VMOVDQU 352(CX), Y3 1602 VMOVDQU 256(DX), Y4 1603 VMOVDQU 288(DX), Y5 1604 VMOVDQU 320(DX), Y6 1605 VMOVDQU 352(DX), Y7 1606 VPMULLW Y1, Y5, Y8 1607 VPMULLW Y0, Y4, Y9 1608 VPMULLW Y0, Y5, Y10 1609 VPMULLW Y1, Y4, Y11 1610 VPMULLW Y8, Y14, Y8 1611 VPMULLW Y9, Y14, Y9 1612 VPMULLW Y10, Y14, Y10 1613 VPMULLW Y11, Y14, Y11 1614 VPMULHW Y1, Y5, Y12 1615 VPMULHW Y0, Y4, Y13 1616 VPMULHW Y0, Y5, Y0 1617 VPMULHW Y1, Y4, Y1 1618 VMOVDQA Y12, Y4 1619 VMOVDQA Y13, Y5 1620 VPMULHW Y8, Y15, Y8 1621 VPMULHW Y9, Y15, Y9 1622 VPMULHW Y10, Y15, Y10 1623 VPMULHW Y11, Y15, Y11 1624 VPSUBW Y8, Y4, Y4 1625 VPSUBW Y9, Y5, Y5 1626 VPSUBW Y10, Y0, Y0 1627 VPSUBW Y11, Y1, Y1 1628 VMOVDQU 928(BX), Y12 1629 VMOVDQU 960(BX), Y13 1630 VPMULLW Y4, Y12, Y8 1631 VPMULHW Y4, Y13, Y4 1632 VPMULHW Y8, Y15, Y8 1633 VPSUBW Y8, Y4, Y4 1634 VPADDW Y4, Y5, Y4 1635 VPADDW Y0, Y1, Y5 1636 VPMULLW Y3, Y7, Y8 1637 VPMULLW Y2, Y6, Y9 1638 VPMULLW Y2, Y7, Y10 1639 VPMULLW Y3, Y6, Y11 1640 VPMULLW Y8, Y14, Y8 1641 VPMULLW Y9, Y14, Y9 1642 VPMULLW Y10, Y14, Y10 1643 VPMULLW Y11, Y14, Y11 1644 VPMULHW Y3, Y7, Y12 1645 VPMULHW Y2, Y6, Y13 1646 VPMULHW Y2, Y7, Y2 1647 VPMULHW Y3, Y6, Y3 1648 VMOVDQA Y12, Y6 1649 VMOVDQA Y13, Y7 1650 VPMULHW Y8, Y15, Y8 1651 VPMULHW Y9, Y15, Y9 1652 VPMULHW Y10, Y15, Y10 1653 VPMULHW Y11, Y15, Y11 1654 VPSUBW Y8, Y6, Y6 1655 VPSUBW Y9, Y7, Y7 1656 VPSUBW Y10, Y2, Y2 1657 VPSUBW Y11, Y3, Y3 1658 VMOVDQU 928(BX), Y12 1659 VMOVDQU 960(BX), Y13 1660 VPMULLW Y6, Y12, Y8 1661 VPMULHW Y6, Y13, Y6 1662 VPMULHW Y8, Y15, Y8 1663 VPSUBW Y8, Y6, Y6 1664 VPSUBW Y6, Y7, Y6 1665 VPADDW Y2, Y3, Y7 1666 VMOVDQU Y4, 256(AX) 1667 VMOVDQU Y5, 288(AX) 1668 VMOVDQU Y6, 320(AX) 1669 VMOVDQU Y7, 352(AX) 1670 VMOVDQU 384(CX), Y0 1671 VMOVDQU 416(CX), Y1 1672 VMOVDQU 448(CX), Y2 1673 VMOVDQU 480(CX), Y3 1674 VMOVDQU 384(DX), Y4 1675 VMOVDQU 416(DX), Y5 1676 VMOVDQU 448(DX), Y6 1677 VMOVDQU 480(DX), Y7 1678 VPMULLW Y1, Y5, Y8 1679 VPMULLW Y0, Y4, Y9 1680 VPMULLW Y0, Y5, Y10 1681 VPMULLW Y1, Y4, Y11 1682 VPMULLW Y8, Y14, Y8 1683 VPMULLW Y9, Y14, Y9 1684 VPMULLW Y10, Y14, Y10 1685 VPMULLW Y11, Y14, Y11 1686 VPMULHW Y1, Y5, Y12 1687 VPMULHW Y0, Y4, Y13 1688 VPMULHW Y0, Y5, Y0 1689 VPMULHW Y1, Y4, Y1 1690 VMOVDQA Y12, Y4 1691 VMOVDQA Y13, Y5 1692 VPMULHW Y8, Y15, Y8 1693 VPMULHW Y9, Y15, Y9 1694 VPMULHW Y10, Y15, Y10 1695 VPMULHW Y11, Y15, Y11 1696 VPSUBW Y8, Y4, Y4 1697 VPSUBW Y9, Y5, Y5 1698 VPSUBW Y10, Y0, Y0 1699 VPSUBW Y11, Y1, Y1 1700 VMOVDQU 992(BX), Y12 1701 VMOVDQU 1024(BX), Y13 1702 VPMULLW Y4, Y12, Y8 1703 VPMULHW Y4, Y13, Y4 1704 VPMULHW Y8, Y15, Y8 1705 VPSUBW Y8, Y4, Y4 1706 VPADDW Y4, Y5, Y4 1707 VPADDW Y0, Y1, Y5 1708 VPMULLW Y3, Y7, Y8 1709 VPMULLW Y2, Y6, Y9 1710 VPMULLW Y2, Y7, Y10 1711 VPMULLW Y3, Y6, Y11 1712 VPMULLW Y8, Y14, Y8 1713 VPMULLW Y9, Y14, Y9 1714 VPMULLW Y10, Y14, Y10 1715 VPMULLW Y11, Y14, Y11 1716 VPMULHW Y3, Y7, Y12 1717 VPMULHW Y2, Y6, Y13 1718 VPMULHW Y2, Y7, Y2 1719 VPMULHW Y3, Y6, Y3 1720 VMOVDQA Y12, Y6 1721 VMOVDQA Y13, Y7 1722 VPMULHW Y8, Y15, Y8 1723 VPMULHW Y9, Y15, Y9 1724 VPMULHW Y10, Y15, Y10 1725 VPMULHW Y11, Y15, Y11 1726 VPSUBW Y8, Y6, Y6 1727 VPSUBW Y9, Y7, Y7 1728 VPSUBW Y10, Y2, Y2 1729 VPSUBW Y11, Y3, Y3 1730 VMOVDQU 992(BX), Y12 1731 VMOVDQU 1024(BX), Y13 1732 VPMULLW Y6, Y12, Y8 1733 VPMULHW Y6, Y13, Y6 1734 VPMULHW Y8, Y15, Y8 1735 VPSUBW Y8, Y6, Y6 1736 VPSUBW Y6, Y7, Y6 1737 VPADDW Y2, Y3, Y7 1738 VMOVDQU Y4, 384(AX) 1739 VMOVDQU Y5, 416(AX) 1740 VMOVDQU Y6, 448(AX) 1741 VMOVDQU Y7, 480(AX) 1742 RET 1743 1744 // func detangleAVX2(p *[256]int16) 1745 // Requires: AVX, AVX2 1746 TEXT ·detangleAVX2(SB), NOSPLIT, $0-8 1747 MOVQ p+0(FP), AX 1748 VMOVDQU (AX), Y0 1749 VMOVDQU 32(AX), Y1 1750 VMOVDQU 64(AX), Y2 1751 VMOVDQU 96(AX), Y3 1752 VMOVDQU 128(AX), Y4 1753 VMOVDQU 160(AX), Y5 1754 VMOVDQU 192(AX), Y6 1755 VMOVDQU 224(AX), Y7 1756 VPSLLD $0x10, Y1, Y8 1757 VPBLENDW $0xaa, Y8, Y0, Y8 1758 VPSRLD $0x10, Y0, Y0 1759 VPBLENDW $0xaa, Y1, Y0, Y1 1760 VMOVDQA Y8, Y0 1761 VPSLLD $0x10, Y3, Y8 1762 VPBLENDW $0xaa, Y8, Y2, Y8 1763 VPSRLD $0x10, Y2, Y2 1764 VPBLENDW $0xaa, Y3, Y2, Y3 1765 VMOVDQA Y8, Y2 1766 VPSLLD $0x10, Y5, Y8 1767 VPBLENDW $0xaa, Y8, Y4, Y8 1768 VPSRLD $0x10, Y4, Y4 1769 VPBLENDW $0xaa, Y5, Y4, Y5 1770 VMOVDQA Y8, Y4 1771 VPSLLD $0x10, Y7, Y8 1772 VPBLENDW $0xaa, Y8, Y6, Y8 1773 VPSRLD $0x10, Y6, Y6 1774 VPBLENDW $0xaa, Y7, Y6, Y7 1775 VMOVDQA Y8, Y6 1776 VMOVSLDUP Y2, Y8 1777 VPBLENDD $0xaa, Y8, Y0, Y8 1778 VPSRLQ $0x20, Y0, Y0 1779 VPBLENDD $0xaa, Y2, Y0, Y2 1780 VMOVDQA Y8, Y0 1781 VMOVSLDUP Y3, Y8 1782 VPBLENDD $0xaa, Y8, Y1, Y8 1783 VPSRLQ $0x20, Y1, Y1 1784 VPBLENDD $0xaa, Y3, Y1, Y3 1785 VMOVDQA Y8, Y1 1786 VMOVSLDUP Y6, Y8 1787 VPBLENDD $0xaa, Y8, Y4, Y8 1788 VPSRLQ $0x20, Y4, Y4 1789 VPBLENDD $0xaa, Y6, Y4, Y6 1790 VMOVDQA Y8, Y4 1791 VMOVSLDUP Y7, Y8 1792 VPBLENDD $0xaa, Y8, Y5, Y8 1793 VPSRLQ $0x20, Y5, Y5 1794 VPBLENDD $0xaa, Y7, Y5, Y7 1795 VMOVDQA Y8, Y5 1796 VPUNPCKLQDQ Y1, Y0, Y8 1797 VPUNPCKHQDQ Y1, Y0, Y1 1798 VMOVDQA Y8, Y0 1799 VPUNPCKLQDQ Y3, Y2, Y8 1800 VPUNPCKHQDQ Y3, Y2, Y3 1801 VMOVDQA Y8, Y2 1802 VPUNPCKLQDQ Y5, Y4, Y8 1803 VPUNPCKHQDQ Y5, Y4, Y5 1804 VMOVDQA Y8, Y4 1805 VPUNPCKLQDQ Y7, Y6, Y8 1806 VPUNPCKHQDQ Y7, Y6, Y7 1807 VMOVDQA Y8, Y6 1808 VPERM2I128 $0x20, Y2, Y0, Y8 1809 VPERM2I128 $0x31, Y2, Y0, Y2 1810 VMOVDQA Y8, Y0 1811 VPERM2I128 $0x20, Y3, Y1, Y8 1812 VPERM2I128 $0x31, Y3, Y1, Y3 1813 VMOVDQA Y8, Y1 1814 VPERM2I128 $0x20, Y6, Y4, Y8 1815 VPERM2I128 $0x31, Y6, Y4, Y6 1816 VMOVDQA Y8, Y4 1817 VPERM2I128 $0x20, Y7, Y5, Y8 1818 VPERM2I128 $0x31, Y7, Y5, Y7 1819 VMOVDQA Y8, Y5 1820 VMOVDQU Y0, (AX) 1821 VMOVDQU Y1, 32(AX) 1822 VMOVDQU Y2, 64(AX) 1823 VMOVDQU Y3, 96(AX) 1824 VMOVDQU Y4, 128(AX) 1825 VMOVDQU Y5, 160(AX) 1826 VMOVDQU Y6, 192(AX) 1827 VMOVDQU Y7, 224(AX) 1828 VMOVDQU 256(AX), Y0 1829 VMOVDQU 288(AX), Y1 1830 VMOVDQU 320(AX), Y2 1831 VMOVDQU 352(AX), Y3 1832 VMOVDQU 384(AX), Y4 1833 VMOVDQU 416(AX), Y5 1834 VMOVDQU 448(AX), Y6 1835 VMOVDQU 480(AX), Y7 1836 VPSLLD $0x10, Y1, Y8 1837 VPBLENDW $0xaa, Y8, Y0, Y8 1838 VPSRLD $0x10, Y0, Y0 1839 VPBLENDW $0xaa, Y1, Y0, Y1 1840 VMOVDQA Y8, Y0 1841 VPSLLD $0x10, Y3, Y8 1842 VPBLENDW $0xaa, Y8, Y2, Y8 1843 VPSRLD $0x10, Y2, Y2 1844 VPBLENDW $0xaa, Y3, Y2, Y3 1845 VMOVDQA Y8, Y2 1846 VPSLLD $0x10, Y5, Y8 1847 VPBLENDW $0xaa, Y8, Y4, Y8 1848 VPSRLD $0x10, Y4, Y4 1849 VPBLENDW $0xaa, Y5, Y4, Y5 1850 VMOVDQA Y8, Y4 1851 VPSLLD $0x10, Y7, Y8 1852 VPBLENDW $0xaa, Y8, Y6, Y8 1853 VPSRLD $0x10, Y6, Y6 1854 VPBLENDW $0xaa, Y7, Y6, Y7 1855 VMOVDQA Y8, Y6 1856 VMOVSLDUP Y2, Y8 1857 VPBLENDD $0xaa, Y8, Y0, Y8 1858 VPSRLQ $0x20, Y0, Y0 1859 VPBLENDD $0xaa, Y2, Y0, Y2 1860 VMOVDQA Y8, Y0 1861 VMOVSLDUP Y3, Y8 1862 VPBLENDD $0xaa, Y8, Y1, Y8 1863 VPSRLQ $0x20, Y1, Y1 1864 VPBLENDD $0xaa, Y3, Y1, Y3 1865 VMOVDQA Y8, Y1 1866 VMOVSLDUP Y6, Y8 1867 VPBLENDD $0xaa, Y8, Y4, Y8 1868 VPSRLQ $0x20, Y4, Y4 1869 VPBLENDD $0xaa, Y6, Y4, Y6 1870 VMOVDQA Y8, Y4 1871 VMOVSLDUP Y7, Y8 1872 VPBLENDD $0xaa, Y8, Y5, Y8 1873 VPSRLQ $0x20, Y5, Y5 1874 VPBLENDD $0xaa, Y7, Y5, Y7 1875 VMOVDQA Y8, Y5 1876 VPUNPCKLQDQ Y1, Y0, Y8 1877 VPUNPCKHQDQ Y1, Y0, Y1 1878 VMOVDQA Y8, Y0 1879 VPUNPCKLQDQ Y3, Y2, Y8 1880 VPUNPCKHQDQ Y3, Y2, Y3 1881 VMOVDQA Y8, Y2 1882 VPUNPCKLQDQ Y5, Y4, Y8 1883 VPUNPCKHQDQ Y5, Y4, Y5 1884 VMOVDQA Y8, Y4 1885 VPUNPCKLQDQ Y7, Y6, Y8 1886 VPUNPCKHQDQ Y7, Y6, Y7 1887 VMOVDQA Y8, Y6 1888 VPERM2I128 $0x20, Y2, Y0, Y8 1889 VPERM2I128 $0x31, Y2, Y0, Y2 1890 VMOVDQA Y8, Y0 1891 VPERM2I128 $0x20, Y3, Y1, Y8 1892 VPERM2I128 $0x31, Y3, Y1, Y3 1893 VMOVDQA Y8, Y1 1894 VPERM2I128 $0x20, Y6, Y4, Y8 1895 VPERM2I128 $0x31, Y6, Y4, Y6 1896 VMOVDQA Y8, Y4 1897 VPERM2I128 $0x20, Y7, Y5, Y8 1898 VPERM2I128 $0x31, Y7, Y5, Y7 1899 VMOVDQA Y8, Y5 1900 VMOVDQU Y0, 256(AX) 1901 VMOVDQU Y1, 288(AX) 1902 VMOVDQU Y2, 320(AX) 1903 VMOVDQU Y3, 352(AX) 1904 VMOVDQU Y4, 384(AX) 1905 VMOVDQU Y5, 416(AX) 1906 VMOVDQU Y6, 448(AX) 1907 VMOVDQU Y7, 480(AX) 1908 RET 1909 1910 // func tangleAVX2(p *[256]int16) 1911 // Requires: AVX, AVX2 1912 TEXT ·tangleAVX2(SB), NOSPLIT, $0-8 1913 MOVQ p+0(FP), AX 1914 VMOVDQU (AX), Y0 1915 VMOVDQU 32(AX), Y1 1916 VMOVDQU 64(AX), Y2 1917 VMOVDQU 96(AX), Y3 1918 VMOVDQU 128(AX), Y4 1919 VMOVDQU 160(AX), Y5 1920 VMOVDQU 192(AX), Y6 1921 VMOVDQU 224(AX), Y7 1922 VPERM2I128 $0x20, Y2, Y0, Y8 1923 VPERM2I128 $0x31, Y2, Y0, Y2 1924 VMOVDQA Y8, Y0 1925 VPERM2I128 $0x20, Y3, Y1, Y8 1926 VPERM2I128 $0x31, Y3, Y1, Y3 1927 VMOVDQA Y8, Y1 1928 VPERM2I128 $0x20, Y6, Y4, Y8 1929 VPERM2I128 $0x31, Y6, Y4, Y6 1930 VMOVDQA Y8, Y4 1931 VPERM2I128 $0x20, Y7, Y5, Y8 1932 VPERM2I128 $0x31, Y7, Y5, Y7 1933 VMOVDQA Y8, Y5 1934 VPUNPCKLQDQ Y1, Y0, Y8 1935 VPUNPCKHQDQ Y1, Y0, Y1 1936 VMOVDQA Y8, Y0 1937 VPUNPCKLQDQ Y3, Y2, Y8 1938 VPUNPCKHQDQ Y3, Y2, Y3 1939 VMOVDQA Y8, Y2 1940 VPUNPCKLQDQ Y5, Y4, Y8 1941 VPUNPCKHQDQ Y5, Y4, Y5 1942 VMOVDQA Y8, Y4 1943 VPUNPCKLQDQ Y7, Y6, Y8 1944 VPUNPCKHQDQ Y7, Y6, Y7 1945 VMOVDQA Y8, Y6 1946 VMOVSLDUP Y2, Y8 1947 VPBLENDD $0xaa, Y8, Y0, Y8 1948 VPSRLQ $0x20, Y0, Y0 1949 VPBLENDD $0xaa, Y2, Y0, Y2 1950 VMOVDQA Y8, Y0 1951 VMOVSLDUP Y3, Y8 1952 VPBLENDD $0xaa, Y8, Y1, Y8 1953 VPSRLQ $0x20, Y1, Y1 1954 VPBLENDD $0xaa, Y3, Y1, Y3 1955 VMOVDQA Y8, Y1 1956 VMOVSLDUP Y6, Y8 1957 VPBLENDD $0xaa, Y8, Y4, Y8 1958 VPSRLQ $0x20, Y4, Y4 1959 VPBLENDD $0xaa, Y6, Y4, Y6 1960 VMOVDQA Y8, Y4 1961 VMOVSLDUP Y7, Y8 1962 VPBLENDD $0xaa, Y8, Y5, Y8 1963 VPSRLQ $0x20, Y5, Y5 1964 VPBLENDD $0xaa, Y7, Y5, Y7 1965 VMOVDQA Y8, Y5 1966 VPSLLD $0x10, Y1, Y8 1967 VPBLENDW $0xaa, Y8, Y0, Y8 1968 VPSRLD $0x10, Y0, Y0 1969 VPBLENDW $0xaa, Y1, Y0, Y1 1970 VMOVDQA Y8, Y0 1971 VPSLLD $0x10, Y3, Y8 1972 VPBLENDW $0xaa, Y8, Y2, Y8 1973 VPSRLD $0x10, Y2, Y2 1974 VPBLENDW $0xaa, Y3, Y2, Y3 1975 VMOVDQA Y8, Y2 1976 VPSLLD $0x10, Y5, Y8 1977 VPBLENDW $0xaa, Y8, Y4, Y8 1978 VPSRLD $0x10, Y4, Y4 1979 VPBLENDW $0xaa, Y5, Y4, Y5 1980 VMOVDQA Y8, Y4 1981 VPSLLD $0x10, Y7, Y8 1982 VPBLENDW $0xaa, Y8, Y6, Y8 1983 VPSRLD $0x10, Y6, Y6 1984 VPBLENDW $0xaa, Y7, Y6, Y7 1985 VMOVDQA Y8, Y6 1986 VMOVDQU Y0, (AX) 1987 VMOVDQU Y1, 32(AX) 1988 VMOVDQU Y2, 64(AX) 1989 VMOVDQU Y3, 96(AX) 1990 VMOVDQU Y4, 128(AX) 1991 VMOVDQU Y5, 160(AX) 1992 VMOVDQU Y6, 192(AX) 1993 VMOVDQU Y7, 224(AX) 1994 VMOVDQU 256(AX), Y0 1995 VMOVDQU 288(AX), Y1 1996 VMOVDQU 320(AX), Y2 1997 VMOVDQU 352(AX), Y3 1998 VMOVDQU 384(AX), Y4 1999 VMOVDQU 416(AX), Y5 2000 VMOVDQU 448(AX), Y6 2001 VMOVDQU 480(AX), Y7 2002 VPERM2I128 $0x20, Y2, Y0, Y8 2003 VPERM2I128 $0x31, Y2, Y0, Y2 2004 VMOVDQA Y8, Y0 2005 VPERM2I128 $0x20, Y3, Y1, Y8 2006 VPERM2I128 $0x31, Y3, Y1, Y3 2007 VMOVDQA Y8, Y1 2008 VPERM2I128 $0x20, Y6, Y4, Y8 2009 VPERM2I128 $0x31, Y6, Y4, Y6 2010 VMOVDQA Y8, Y4 2011 VPERM2I128 $0x20, Y7, Y5, Y8 2012 VPERM2I128 $0x31, Y7, Y5, Y7 2013 VMOVDQA Y8, Y5 2014 VPUNPCKLQDQ Y1, Y0, Y8 2015 VPUNPCKHQDQ Y1, Y0, Y1 2016 VMOVDQA Y8, Y0 2017 VPUNPCKLQDQ Y3, Y2, Y8 2018 VPUNPCKHQDQ Y3, Y2, Y3 2019 VMOVDQA Y8, Y2 2020 VPUNPCKLQDQ Y5, Y4, Y8 2021 VPUNPCKHQDQ Y5, Y4, Y5 2022 VMOVDQA Y8, Y4 2023 VPUNPCKLQDQ Y7, Y6, Y8 2024 VPUNPCKHQDQ Y7, Y6, Y7 2025 VMOVDQA Y8, Y6 2026 VMOVSLDUP Y2, Y8 2027 VPBLENDD $0xaa, Y8, Y0, Y8 2028 VPSRLQ $0x20, Y0, Y0 2029 VPBLENDD $0xaa, Y2, Y0, Y2 2030 VMOVDQA Y8, Y0 2031 VMOVSLDUP Y3, Y8 2032 VPBLENDD $0xaa, Y8, Y1, Y8 2033 VPSRLQ $0x20, Y1, Y1 2034 VPBLENDD $0xaa, Y3, Y1, Y3 2035 VMOVDQA Y8, Y1 2036 VMOVSLDUP Y6, Y8 2037 VPBLENDD $0xaa, Y8, Y4, Y8 2038 VPSRLQ $0x20, Y4, Y4 2039 VPBLENDD $0xaa, Y6, Y4, Y6 2040 VMOVDQA Y8, Y4 2041 VMOVSLDUP Y7, Y8 2042 VPBLENDD $0xaa, Y8, Y5, Y8 2043 VPSRLQ $0x20, Y5, Y5 2044 VPBLENDD $0xaa, Y7, Y5, Y7 2045 VMOVDQA Y8, Y5 2046 VPSLLD $0x10, Y1, Y8 2047 VPBLENDW $0xaa, Y8, Y0, Y8 2048 VPSRLD $0x10, Y0, Y0 2049 VPBLENDW $0xaa, Y1, Y0, Y1 2050 VMOVDQA Y8, Y0 2051 VPSLLD $0x10, Y3, Y8 2052 VPBLENDW $0xaa, Y8, Y2, Y8 2053 VPSRLD $0x10, Y2, Y2 2054 VPBLENDW $0xaa, Y3, Y2, Y3 2055 VMOVDQA Y8, Y2 2056 VPSLLD $0x10, Y5, Y8 2057 VPBLENDW $0xaa, Y8, Y4, Y8 2058 VPSRLD $0x10, Y4, Y4 2059 VPBLENDW $0xaa, Y5, Y4, Y5 2060 VMOVDQA Y8, Y4 2061 VPSLLD $0x10, Y7, Y8 2062 VPBLENDW $0xaa, Y8, Y6, Y8 2063 VPSRLD $0x10, Y6, Y6 2064 VPBLENDW $0xaa, Y7, Y6, Y7 2065 VMOVDQA Y8, Y6 2066 VMOVDQU Y0, 256(AX) 2067 VMOVDQU Y1, 288(AX) 2068 VMOVDQU Y2, 320(AX) 2069 VMOVDQU Y3, 352(AX) 2070 VMOVDQU Y4, 384(AX) 2071 VMOVDQU Y5, 416(AX) 2072 VMOVDQU Y6, 448(AX) 2073 VMOVDQU Y7, 480(AX) 2074 RET 2075 2076 // func barrettReduceAVX2(p *[256]int16) 2077 // Requires: AVX, AVX2 2078 TEXT ·barrettReduceAVX2(SB), NOSPLIT, $0-8 2079 MOVQ p+0(FP), AX 2080 MOVL $0x00000d01, CX 2081 VMOVD CX, X0 2082 VPBROADCASTW X0, Y9 2083 MOVL $0x00004ebf, CX 2084 VMOVD CX, X0 2085 VPBROADCASTW X0, Y8 2086 VMOVDQU (AX), Y0 2087 VMOVDQU 32(AX), Y1 2088 VMOVDQU 64(AX), Y2 2089 VMOVDQU 96(AX), Y3 2090 VPMULHW Y8, Y0, Y4 2091 VPMULHW Y8, Y1, Y5 2092 VPMULHW Y8, Y2, Y6 2093 VPMULHW Y8, Y3, Y7 2094 VPSRAW $0x0a, Y4, Y4 2095 VPSRAW $0x0a, Y5, Y5 2096 VPSRAW $0x0a, Y6, Y6 2097 VPSRAW $0x0a, Y7, Y7 2098 VPMULLW Y9, Y4, Y4 2099 VPMULLW Y9, Y5, Y5 2100 VPMULLW Y9, Y6, Y6 2101 VPMULLW Y9, Y7, Y7 2102 VPSUBW Y4, Y0, Y0 2103 VPSUBW Y5, Y1, Y1 2104 VPSUBW Y6, Y2, Y2 2105 VPSUBW Y7, Y3, Y3 2106 VMOVDQU Y0, (AX) 2107 VMOVDQU Y1, 32(AX) 2108 VMOVDQU Y2, 64(AX) 2109 VMOVDQU Y3, 96(AX) 2110 VMOVDQU 128(AX), Y0 2111 VMOVDQU 160(AX), Y1 2112 VMOVDQU 192(AX), Y2 2113 VMOVDQU 224(AX), Y3 2114 VPMULHW Y8, Y0, Y4 2115 VPMULHW Y8, Y1, Y5 2116 VPMULHW Y8, Y2, Y6 2117 VPMULHW Y8, Y3, Y7 2118 VPSRAW $0x0a, Y4, Y4 2119 VPSRAW $0x0a, Y5, Y5 2120 VPSRAW $0x0a, Y6, Y6 2121 VPSRAW $0x0a, Y7, Y7 2122 VPMULLW Y9, Y4, Y4 2123 VPMULLW Y9, Y5, Y5 2124 VPMULLW Y9, Y6, Y6 2125 VPMULLW Y9, Y7, Y7 2126 VPSUBW Y4, Y0, Y0 2127 VPSUBW Y5, Y1, Y1 2128 VPSUBW Y6, Y2, Y2 2129 VPSUBW Y7, Y3, Y3 2130 VMOVDQU Y0, 128(AX) 2131 VMOVDQU Y1, 160(AX) 2132 VMOVDQU Y2, 192(AX) 2133 VMOVDQU Y3, 224(AX) 2134 VMOVDQU 256(AX), Y0 2135 VMOVDQU 288(AX), Y1 2136 VMOVDQU 320(AX), Y2 2137 VMOVDQU 352(AX), Y3 2138 VPMULHW Y8, Y0, Y4 2139 VPMULHW Y8, Y1, Y5 2140 VPMULHW Y8, Y2, Y6 2141 VPMULHW Y8, Y3, Y7 2142 VPSRAW $0x0a, Y4, Y4 2143 VPSRAW $0x0a, Y5, Y5 2144 VPSRAW $0x0a, Y6, Y6 2145 VPSRAW $0x0a, Y7, Y7 2146 VPMULLW Y9, Y4, Y4 2147 VPMULLW Y9, Y5, Y5 2148 VPMULLW Y9, Y6, Y6 2149 VPMULLW Y9, Y7, Y7 2150 VPSUBW Y4, Y0, Y0 2151 VPSUBW Y5, Y1, Y1 2152 VPSUBW Y6, Y2, Y2 2153 VPSUBW Y7, Y3, Y3 2154 VMOVDQU Y0, 256(AX) 2155 VMOVDQU Y1, 288(AX) 2156 VMOVDQU Y2, 320(AX) 2157 VMOVDQU Y3, 352(AX) 2158 VMOVDQU 384(AX), Y0 2159 VMOVDQU 416(AX), Y1 2160 VMOVDQU 448(AX), Y2 2161 VMOVDQU 480(AX), Y3 2162 VPMULHW Y8, Y0, Y4 2163 VPMULHW Y8, Y1, Y5 2164 VPMULHW Y8, Y2, Y6 2165 VPMULHW Y8, Y3, Y7 2166 VPSRAW $0x0a, Y4, Y4 2167 VPSRAW $0x0a, Y5, Y5 2168 VPSRAW $0x0a, Y6, Y6 2169 VPSRAW $0x0a, Y7, Y7 2170 VPMULLW Y9, Y4, Y4 2171 VPMULLW Y9, Y5, Y5 2172 VPMULLW Y9, Y6, Y6 2173 VPMULLW Y9, Y7, Y7 2174 VPSUBW Y4, Y0, Y0 2175 VPSUBW Y5, Y1, Y1 2176 VPSUBW Y6, Y2, Y2 2177 VPSUBW Y7, Y3, Y3 2178 VMOVDQU Y0, 384(AX) 2179 VMOVDQU Y1, 416(AX) 2180 VMOVDQU Y2, 448(AX) 2181 VMOVDQU Y3, 480(AX) 2182 RET 2183 2184 // func normalizeAVX2(p *[256]int16) 2185 // Requires: AVX, AVX2 2186 TEXT ·normalizeAVX2(SB), NOSPLIT, $0-8 2187 MOVQ p+0(FP), AX 2188 MOVL $0x00000d01, CX 2189 VMOVD CX, X0 2190 VPBROADCASTW X0, Y9 2191 MOVL $0x00004ebf, CX 2192 VMOVD CX, X0 2193 VPBROADCASTW X0, Y8 2194 VMOVDQU (AX), Y0 2195 VMOVDQU 32(AX), Y1 2196 VMOVDQU 64(AX), Y2 2197 VMOVDQU 96(AX), Y3 2198 VPMULHW Y8, Y0, Y4 2199 VPMULHW Y8, Y1, Y5 2200 VPMULHW Y8, Y2, Y6 2201 VPMULHW Y8, Y3, Y7 2202 VPSRAW $0x0a, Y4, Y4 2203 VPSRAW $0x0a, Y5, Y5 2204 VPSRAW $0x0a, Y6, Y6 2205 VPSRAW $0x0a, Y7, Y7 2206 VPMULLW Y9, Y4, Y4 2207 VPMULLW Y9, Y5, Y5 2208 VPMULLW Y9, Y6, Y6 2209 VPMULLW Y9, Y7, Y7 2210 VPSUBW Y4, Y0, Y0 2211 VPSUBW Y5, Y1, Y1 2212 VPSUBW Y6, Y2, Y2 2213 VPSUBW Y7, Y3, Y3 2214 VPSUBW Y9, Y0, Y0 2215 VPSUBW Y9, Y1, Y1 2216 VPSUBW Y9, Y2, Y2 2217 VPSUBW Y9, Y3, Y3 2218 VPSRAW $0x0f, Y0, Y4 2219 VPSRAW $0x0f, Y1, Y5 2220 VPSRAW $0x0f, Y2, Y6 2221 VPSRAW $0x0f, Y3, Y7 2222 VPAND Y4, Y9, Y4 2223 VPAND Y5, Y9, Y5 2224 VPAND Y6, Y9, Y6 2225 VPAND Y7, Y9, Y7 2226 VPADDW Y0, Y4, Y0 2227 VPADDW Y1, Y5, Y1 2228 VPADDW Y2, Y6, Y2 2229 VPADDW Y3, Y7, Y3 2230 VMOVDQU Y0, (AX) 2231 VMOVDQU Y1, 32(AX) 2232 VMOVDQU Y2, 64(AX) 2233 VMOVDQU Y3, 96(AX) 2234 VMOVDQU 128(AX), Y0 2235 VMOVDQU 160(AX), Y1 2236 VMOVDQU 192(AX), Y2 2237 VMOVDQU 224(AX), Y3 2238 VPMULHW Y8, Y0, Y4 2239 VPMULHW Y8, Y1, Y5 2240 VPMULHW Y8, Y2, Y6 2241 VPMULHW Y8, Y3, Y7 2242 VPSRAW $0x0a, Y4, Y4 2243 VPSRAW $0x0a, Y5, Y5 2244 VPSRAW $0x0a, Y6, Y6 2245 VPSRAW $0x0a, Y7, Y7 2246 VPMULLW Y9, Y4, Y4 2247 VPMULLW Y9, Y5, Y5 2248 VPMULLW Y9, Y6, Y6 2249 VPMULLW Y9, Y7, Y7 2250 VPSUBW Y4, Y0, Y0 2251 VPSUBW Y5, Y1, Y1 2252 VPSUBW Y6, Y2, Y2 2253 VPSUBW Y7, Y3, Y3 2254 VPSUBW Y9, Y0, Y0 2255 VPSUBW Y9, Y1, Y1 2256 VPSUBW Y9, Y2, Y2 2257 VPSUBW Y9, Y3, Y3 2258 VPSRAW $0x0f, Y0, Y4 2259 VPSRAW $0x0f, Y1, Y5 2260 VPSRAW $0x0f, Y2, Y6 2261 VPSRAW $0x0f, Y3, Y7 2262 VPAND Y4, Y9, Y4 2263 VPAND Y5, Y9, Y5 2264 VPAND Y6, Y9, Y6 2265 VPAND Y7, Y9, Y7 2266 VPADDW Y0, Y4, Y0 2267 VPADDW Y1, Y5, Y1 2268 VPADDW Y2, Y6, Y2 2269 VPADDW Y3, Y7, Y3 2270 VMOVDQU Y0, 128(AX) 2271 VMOVDQU Y1, 160(AX) 2272 VMOVDQU Y2, 192(AX) 2273 VMOVDQU Y3, 224(AX) 2274 VMOVDQU 256(AX), Y0 2275 VMOVDQU 288(AX), Y1 2276 VMOVDQU 320(AX), Y2 2277 VMOVDQU 352(AX), Y3 2278 VPMULHW Y8, Y0, Y4 2279 VPMULHW Y8, Y1, Y5 2280 VPMULHW Y8, Y2, Y6 2281 VPMULHW Y8, Y3, Y7 2282 VPSRAW $0x0a, Y4, Y4 2283 VPSRAW $0x0a, Y5, Y5 2284 VPSRAW $0x0a, Y6, Y6 2285 VPSRAW $0x0a, Y7, Y7 2286 VPMULLW Y9, Y4, Y4 2287 VPMULLW Y9, Y5, Y5 2288 VPMULLW Y9, Y6, Y6 2289 VPMULLW Y9, Y7, Y7 2290 VPSUBW Y4, Y0, Y0 2291 VPSUBW Y5, Y1, Y1 2292 VPSUBW Y6, Y2, Y2 2293 VPSUBW Y7, Y3, Y3 2294 VPSUBW Y9, Y0, Y0 2295 VPSUBW Y9, Y1, Y1 2296 VPSUBW Y9, Y2, Y2 2297 VPSUBW Y9, Y3, Y3 2298 VPSRAW $0x0f, Y0, Y4 2299 VPSRAW $0x0f, Y1, Y5 2300 VPSRAW $0x0f, Y2, Y6 2301 VPSRAW $0x0f, Y3, Y7 2302 VPAND Y4, Y9, Y4 2303 VPAND Y5, Y9, Y5 2304 VPAND Y6, Y9, Y6 2305 VPAND Y7, Y9, Y7 2306 VPADDW Y0, Y4, Y0 2307 VPADDW Y1, Y5, Y1 2308 VPADDW Y2, Y6, Y2 2309 VPADDW Y3, Y7, Y3 2310 VMOVDQU Y0, 256(AX) 2311 VMOVDQU Y1, 288(AX) 2312 VMOVDQU Y2, 320(AX) 2313 VMOVDQU Y3, 352(AX) 2314 VMOVDQU 384(AX), Y0 2315 VMOVDQU 416(AX), Y1 2316 VMOVDQU 448(AX), Y2 2317 VMOVDQU 480(AX), Y3 2318 VPMULHW Y8, Y0, Y4 2319 VPMULHW Y8, Y1, Y5 2320 VPMULHW Y8, Y2, Y6 2321 VPMULHW Y8, Y3, Y7 2322 VPSRAW $0x0a, Y4, Y4 2323 VPSRAW $0x0a, Y5, Y5 2324 VPSRAW $0x0a, Y6, Y6 2325 VPSRAW $0x0a, Y7, Y7 2326 VPMULLW Y9, Y4, Y4 2327 VPMULLW Y9, Y5, Y5 2328 VPMULLW Y9, Y6, Y6 2329 VPMULLW Y9, Y7, Y7 2330 VPSUBW Y4, Y0, Y0 2331 VPSUBW Y5, Y1, Y1 2332 VPSUBW Y6, Y2, Y2 2333 VPSUBW Y7, Y3, Y3 2334 VPSUBW Y9, Y0, Y0 2335 VPSUBW Y9, Y1, Y1 2336 VPSUBW Y9, Y2, Y2 2337 VPSUBW Y9, Y3, Y3 2338 VPSRAW $0x0f, Y0, Y4 2339 VPSRAW $0x0f, Y1, Y5 2340 VPSRAW $0x0f, Y2, Y6 2341 VPSRAW $0x0f, Y3, Y7 2342 VPAND Y4, Y9, Y4 2343 VPAND Y5, Y9, Y5 2344 VPAND Y6, Y9, Y6 2345 VPAND Y7, Y9, Y7 2346 VPADDW Y0, Y4, Y0 2347 VPADDW Y1, Y5, Y1 2348 VPADDW Y2, Y6, Y2 2349 VPADDW Y3, Y7, Y3 2350 VMOVDQU Y0, 384(AX) 2351 VMOVDQU Y1, 416(AX) 2352 VMOVDQU Y2, 448(AX) 2353 VMOVDQU Y3, 480(AX) 2354 RET