// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// SHA512 block routine. See sha512block.go for Go equivalent.
//
// The algorithm is detailed in FIPS 180-4:
//
//  http://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
//
// Wt = Mt; for 0 <= t <= 15
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
//
// a = H0
// b = H1
// c = H2
// d = H3
// e = H4
// f = H5
// g = H6
// h = H7
//
// for t = 0 to 79 {
//    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
//    T2 = BIGSIGMA0(a) + Maj(a,b,c)
//    h = g
//    g = f
//    f = e
//    e = d + T1
//    d = c
//    c = b
//    b = a
//    a = T1 + T2
// }
//
// H0 = a + H0
// H1 = b + H1
// H2 = c + H2
// H3 = d + H3
// H4 = e + H4
// H5 = f + H5
// H6 = g + H6
// H7 = h + H7

// Wt = Mt; for 0 <= t <= 15
#define MSGSCHEDULE0(index) \
	MOVQ	(index*8)(SI), AX; \
	BSWAPQ	AX; \
	MOVQ	AX, (index*8)(BP)

// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
//   SIGMA0(x) = ROTR(1,x) XOR ROTR(8,x) XOR SHR(7,x)
//   SIGMA1(x) = ROTR(19,x) XOR ROTR(61,x) XOR SHR(6,x)
#define MSGSCHEDULE1(index) \
	MOVQ	((index-2)*8)(BP), AX; \
	MOVQ	AX, CX; \
	RORQ	$19, AX; \
	MOVQ	CX, DX; \
	RORQ	$61, CX; \
	SHRQ	$6, DX; \
	MOVQ	((index-15)*8)(BP), BX; \
	XORQ	CX, AX; \
	MOVQ	BX, CX; \
	XORQ	DX, AX; \
	RORQ	$1, BX; \
	MOVQ	CX, DX; \
	SHRQ	$7, DX; \
	RORQ	$8, CX; \
	ADDQ	((index-7)*8)(BP), AX; \
	XORQ	CX, BX; \
	XORQ	DX, BX; \
	ADDQ	((index-16)*8)(BP), BX; \
	ADDQ	BX, AX; \
	MOVQ	AX, ((index)*8)(BP)

// Calculate T1 in AX - uses AX, CX and DX registers.
// h is also used as an accumulator. Wt is passed in AX.
//   T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
//   BIGSIGMA1(x) = ROTR(14,x) XOR ROTR(18,x) XOR ROTR(41,x)
//   Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
#define SHA512T1(const, e, f, g, h) \
	MOVQ	$const, DX; \
	ADDQ	AX, h; \
	MOVQ	e, AX; \
	ADDQ	DX, h; \
	MOVQ	e, CX; \
	RORQ	$14, AX; \
	MOVQ	e, DX; \
	RORQ	$18, CX; \
	XORQ	CX, AX; \
	MOVQ	e, CX; \
	RORQ	$41, DX; \
	ANDQ	f, CX; \
	XORQ	AX, DX; \
	MOVQ	e, AX; \
	NOTQ	AX; \
	ADDQ	DX, h; \
	ANDQ	g, AX; \
	XORQ	CX, AX; \
	ADDQ	h, AX

// Calculate T2 in BX - uses BX, CX, DX and DI registers.
//   T2 = BIGSIGMA0(a) + Maj(a, b, c)
//   BIGSIGMA0(x) = ROTR(28,x) XOR ROTR(34,x) XOR ROTR(39,x)
//   Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
#define SHA512T2(a, b, c) \
	MOVQ	a, DI; \
	MOVQ	c, BX; \
	RORQ	$28, DI; \
	MOVQ	a, DX; \
	ANDQ	b, BX; \
	RORQ	$34, DX; \
	MOVQ	a, CX; \
	ANDQ	c, CX; \
	XORQ	DX, DI; \
	XORQ	CX, BX; \
	MOVQ	a, DX; \
	MOVQ	b, CX; \
	RORQ	$39, DX; \
	ANDQ	a, CX; \
	XORQ	CX, BX; \
	XORQ	DX, DI; \
	ADDQ	DI, BX
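
// For reference, one full round (SHA512T1 + SHA512T2 plus the register
// updates in SHA512ROUND below) corresponds roughly to the following Go.
// This is an illustrative sketch only; the real Go fallback lives in
// sha512block.go, and w, k and a..h are placeholder names:
//
//	t1 := h + ((e>>14 | e<<50) ^ (e>>18 | e<<46) ^ (e>>41 | e<<23)) +
//		((e & f) ^ (^e & g)) + k[t] + w[t]
//	t2 := ((a>>28 | a<<36) ^ (a>>34 | a<<30) ^ (a>>39 | a<<25)) +
//		((a & b) ^ (a & c) ^ (b & c))
//	d += t1
//	h = t1 + t2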

// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
// The values for e and a are stored in d and h, ready for rotation.
#define SHA512ROUND(index, const, a, b, c, d, e, f, g, h) \
	SHA512T1(const, e, f, g, h); \
	SHA512T2(a, b, c); \
	MOVQ	BX, h; \
	ADDQ	AX, d; \
	ADDQ	AX, h

#define SHA512ROUND0(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE0(index); \
	SHA512ROUND(index, const, a, b, c, d, e, f, g, h)

#define SHA512ROUND1(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE1(index); \
	SHA512ROUND(index, const, a, b, c, d, e, f, g, h)

TEXT ·blockAMD64(SB),0,$648-32
	MOVQ	p_base+8(FP), SI
	MOVQ	p_len+16(FP), DX
	SHRQ	$7, DX
	SHLQ	$7, DX

	LEAQ	(SI)(DX*1), DI
	MOVQ	DI, 640(SP)
	CMPQ	SI, DI
	JEQ	end

	MOVQ	dig+0(FP), BP
	MOVQ	(0*8)(BP), R8	// a = H0
	MOVQ	(1*8)(BP), R9	// b = H1
	MOVQ	(2*8)(BP), R10	// c = H2
	MOVQ	(3*8)(BP), R11	// d = H3
	MOVQ	(4*8)(BP), R12	// e = H4
	MOVQ	(5*8)(BP), R13	// f = H5
	MOVQ	(6*8)(BP), R14	// g = H6
	MOVQ	(7*8)(BP), R15	// h = H7

loop:
	MOVQ	SP, BP	// message schedule

	SHA512ROUND0(0, 0x428a2f98d728ae22, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND0(1, 0x7137449123ef65cd, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND0(2, 0xb5c0fbcfec4d3b2f, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND0(3, 0xe9b5dba58189dbbc, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND0(4, 0x3956c25bf348b538, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND0(5, 0x59f111f1b605d019, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND0(6, 0x923f82a4af194f9b, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND0(7, 0xab1c5ed5da6d8118, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND0(8, 0xd807aa98a3030242, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND0(9, 0x12835b0145706fbe, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND0(10, 0x243185be4ee4b28c, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND0(11, 0x550c7dc3d5ffb4e2, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND0(12, 0x72be5d74f27b896f, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND0(13, 0x80deb1fe3b1696b1, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND0(14, 0x9bdc06a725c71235, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND0(15, 0xc19bf174cf692694, R9, R10, R11, R12, R13, R14, R15, R8)
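
	// Rounds 16-79 extend the message schedule in place (MSGSCHEDULE1)
	// instead of loading and byte-swapping message words directly.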
	SHA512ROUND1(16, 0xe49b69c19ef14ad2, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(17, 0xefbe4786384f25e3, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(18, 0x0fc19dc68b8cd5b5, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(19, 0x240ca1cc77ac9c65, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(20, 0x2de92c6f592b0275, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(21, 0x4a7484aa6ea6e483, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(22, 0x5cb0a9dcbd41fbd4, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(23, 0x76f988da831153b5, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(24, 0x983e5152ee66dfab, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(25, 0xa831c66d2db43210, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(26, 0xb00327c898fb213f, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(27, 0xbf597fc7beef0ee4, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(28, 0xc6e00bf33da88fc2, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(29, 0xd5a79147930aa725, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(30, 0x06ca6351e003826f, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(31, 0x142929670a0e6e70, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(32, 0x27b70a8546d22ffc, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(33, 0x2e1b21385c26c926, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(34, 0x4d2c6dfc5ac42aed, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(35, 0x53380d139d95b3df, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(36, 0x650a73548baf63de, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(37, 0x766a0abb3c77b2a8, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(38, 0x81c2c92e47edaee6, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(39, 0x92722c851482353b, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(40, 0xa2bfe8a14cf10364, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(41, 0xa81a664bbc423001, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(42, 0xc24b8b70d0f89791, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(43, 0xc76c51a30654be30, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(44, 0xd192e819d6ef5218, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(45, 0xd69906245565a910, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(46, 0xf40e35855771202a, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(47, 0x106aa07032bbd1b8, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(48, 0x19a4c116b8d2d0c8, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(49, 0x1e376c085141ab53, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(50, 0x2748774cdf8eeb99, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(51, 0x34b0bcb5e19b48a8, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(52, 0x391c0cb3c5c95a63, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(53, 0x4ed8aa4ae3418acb, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(54, 0x5b9cca4f7763e373, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(55, 0x682e6ff3d6b2b8a3, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(56, 0x748f82ee5defb2fc, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(57, 0x78a5636f43172f60, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(58, 0x84c87814a1f0ab72, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(59, 0x8cc702081a6439ec, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(60, 0x90befffa23631e28, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(61, 0xa4506cebde82bde9, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(62, 0xbef9a3f7b2c67915, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(63, 0xc67178f2e372532b, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(64, 0xca273eceea26619c, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(65, 0xd186b8c721c0c207, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(66, 0xeada7dd6cde0eb1e, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(67, 0xf57d4f7fee6ed178, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(68, 0x06f067aa72176fba, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(69, 0x0a637dc5a2c898a6, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(70, 0x113f9804bef90dae, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(71, 0x1b710b35131c471b, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(72, 0x28db77f523047d84, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(73, 0x32caab7b40c72493, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(74, 0x3c9ebe0a15c9bebc, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(75, 0x431d67c49c100d4c, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(76, 0x4cc5d4becb3e42b6, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(77, 0x597f299cfc657e2a, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(78, 0x5fcb6fab3ad6faec, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(79, 0x6c44198c4a475817, R9, R10, R11, R12, R13, R14, R15, R8)

	MOVQ	dig+0(FP), BP
	ADDQ	(0*8)(BP), R8	// H0 = a + H0
	MOVQ	R8, (0*8)(BP)
	ADDQ	(1*8)(BP), R9	// H1 = b + H1
	MOVQ	R9, (1*8)(BP)
	ADDQ	(2*8)(BP), R10	// H2 = c + H2
	MOVQ	R10, (2*8)(BP)
	ADDQ	(3*8)(BP), R11	// H3 = d + H3
	MOVQ	R11, (3*8)(BP)
	ADDQ	(4*8)(BP), R12	// H4 = e + H4
	MOVQ	R12, (4*8)(BP)
	ADDQ	(5*8)(BP), R13	// H5 = f + H5
	MOVQ	R13, (5*8)(BP)
	ADDQ	(6*8)(BP), R14	// H6 = g + H6
	MOVQ	R14, (6*8)(BP)
	ADDQ	(7*8)(BP), R15	// H7 = h + H7
	MOVQ	R15, (7*8)(BP)

	ADDQ	$128, SI
	CMPQ	SI, 640(SP)
	JB	loop

end:
	RET

// Version below is based on "Fast SHA512 Implementations on Intel
// Architecture Processors" white paper
// http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-sha512-implementations-ia-processors-paper.pdf
// AVX2 version by Intel, same algorithm in Linux kernel:
// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha512-avx2-asm.S

// James Guilford <james.guilford@intel.com>
// Kirk Yap <kirk.s.yap@intel.com>
// Tim Chen <tim.c.chen@linux.intel.com>
// David Cote <david.m.cote@intel.com>
// Aleksey Sidorov <aleksey.sidorov@intel.com>

#define YFER_SIZE (4*8)
#define SRND_SIZE (1*8)
#define INP_SIZE (1*8)

#define frame_YFER (0)
#define frame_SRND (frame_YFER + YFER_SIZE)
#define frame_INP (frame_SRND + SRND_SIZE)
#define frame_INPEND (frame_INP + INP_SIZE)

#define addm(p1, p2) \
	ADDQ	p1, p2; \
	MOVQ	p2, p1

#define COPY_YMM_AND_BSWAP(p1, p2, p3) \
	VMOVDQU	p2, p1; \
	VPSHUFB	p3, p1, p1

#define MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) \
	VPERM2F128	$0x3, YSRC2, YSRC1, YDST; \
	VPALIGNR	$RVAL, YSRC2, YDST, YDST

DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x00(SB)/8, $0x0001020304050607
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x08(SB)/8, $0x08090a0b0c0d0e0f
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x10(SB)/8, $0x1011121314151617
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x18(SB)/8, $0x18191a1b1c1d1e1f

GLOBL PSHUFFLE_BYTE_FLIP_MASK<>(SB), (NOPTR+RODATA), $32

DATA MASK_YMM_LO<>+0x00(SB)/8, $0x0000000000000000
DATA MASK_YMM_LO<>+0x08(SB)/8, $0x0000000000000000
DATA MASK_YMM_LO<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA MASK_YMM_LO<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF

GLOBL MASK_YMM_LO<>(SB), (NOPTR+RODATA), $32

TEXT ·blockAVX2(SB), NOSPLIT, $56-32
	MOVQ	dig+0(FP), SI
	MOVQ	p_base+8(FP), DI
	MOVQ	p_len+16(FP), DX

	SHRQ	$7, DX
	SHLQ	$7, DX

	JZ	done_hash
	ADDQ	DI, DX
	MOVQ	DX, frame_INPEND(SP)

	MOVQ	(0*8)(SI), AX
	MOVQ	(1*8)(SI), BX
	MOVQ	(2*8)(SI), CX
	MOVQ	(3*8)(SI), R8
	MOVQ	(4*8)(SI), DX
	MOVQ	(5*8)(SI), R9
	MOVQ	(6*8)(SI), R10
	MOVQ	(7*8)(SI), R11

	MOVQ	$PSHUFFLE_BYTE_FLIP_MASK<>(SB), R12
	VMOVDQU	(R12), Y9

loop0:
	MOVQ	·_K+0(SB), BP

	// byte swap first 16 qwords
	COPY_YMM_AND_BSWAP(Y4, (0*32)(DI), Y9)
	COPY_YMM_AND_BSWAP(Y5, (1*32)(DI), Y9)
	COPY_YMM_AND_BSWAP(Y6, (2*32)(DI), Y9)
	COPY_YMM_AND_BSWAP(Y7, (3*32)(DI), Y9)

	MOVQ	DI, frame_INP(SP)

	// schedule 64 input qwords, by doing 4 iterations of 16 rounds each
	MOVQ	$4, frame_SRND(SP)
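
	// Each loop1 iteration handles 16 rounds in four groups. Each group
	// adds the round constants to one schedule register (Y4-Y7), parks the
	// resulting Kt+Wt sums in frame_YFER for the interleaved scalar
	// rounds, and extends that register by four new schedule qwords.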

loop1:
	VPADDQ	(BP), Y4, Y0
	VMOVDQU	Y0, frame_YFER(SP)

	MY_VPALIGNR(Y0, Y7, Y6, 8)

	VPADDQ	Y4, Y0, Y0

	MY_VPALIGNR(Y1, Y5, Y4, 8)

	VPSRLQ	$1, Y1, Y2
	VPSLLQ	$(64-1), Y1, Y3
	VPOR	Y2, Y3, Y3

	VPSRLQ	$7, Y1, Y8

	MOVQ	AX, DI
	RORXQ	$41, DX, R13
	RORXQ	$18, DX, R14
	ADDQ	frame_YFER(SP), R11
	ORQ	CX, DI
	MOVQ	R9, R15
	RORXQ	$34, AX, R12

	XORQ	R14, R13
	XORQ	R10, R15
	RORXQ	$14, DX, R14

	ANDQ	DX, R15
	XORQ	R14, R13
	RORXQ	$39, AX, R14
	ADDQ	R11, R8

	ANDQ	BX, DI
	XORQ	R12, R14
	RORXQ	$28, AX, R12

	XORQ	R10, R15
	XORQ	R12, R14
	MOVQ	AX, R12
	ANDQ	CX, R12

	ADDQ	R13, R15
	ORQ	R12, DI
	ADDQ	R14, R11

	ADDQ	R15, R8

	ADDQ	R15, R11
	ADDQ	DI, R11

	VPSRLQ	$8, Y1, Y2
	VPSLLQ	$(64-8), Y1, Y1
	VPOR	Y2, Y1, Y1

	VPXOR	Y8, Y3, Y3
	VPXOR	Y1, Y3, Y1

	VPADDQ	Y1, Y0, Y0

	VPERM2F128	$0x0, Y0, Y0, Y4

	MOVQ	$MASK_YMM_LO<>(SB), R13

	VPAND	(R13), Y0, Y0

	VPERM2F128	$0x11, Y7, Y7, Y2
	VPSRLQ	$6, Y2, Y8

	MOVQ	R11, DI
	RORXQ	$41, R8, R13
	RORXQ	$18, R8, R14
	ADDQ	1*8+frame_YFER(SP), R10
	ORQ	BX, DI

	MOVQ	DX, R15
	RORXQ	$34, R11, R12
	XORQ	R14, R13
	XORQ	R9, R15

	RORXQ	$14, R8, R14
	XORQ	R14, R13
	RORXQ	$39, R11, R14
	ANDQ	R8, R15
	ADDQ	R10, CX

	ANDQ	AX, DI
	XORQ	R12, R14

	RORXQ	$28, R11, R12
	XORQ	R9, R15

	XORQ	R12, R14
	MOVQ	R11, R12
	ANDQ	BX, R12
	ADDQ	R13, R15

	ORQ	R12, DI
	ADDQ	R14, R10

	ADDQ	R15, CX
	ADDQ	R15, R10
	ADDQ	DI, R10

	VPSRLQ	$19, Y2, Y3
	VPSLLQ	$(64-19), Y2, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8
	VPSRLQ	$61, Y2, Y3
	VPSLLQ	$(64-61), Y2, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8

	VPADDQ	Y8, Y4, Y4

	VPSRLQ	$6, Y4, Y8

	MOVQ	R10, DI
	RORXQ	$41, CX, R13
	ADDQ	2*8+frame_YFER(SP), R9

	RORXQ	$18, CX, R14
	ORQ	AX, DI
	MOVQ	R8, R15
	XORQ	DX, R15

	RORXQ	$34, R10, R12
	XORQ	R14, R13
	ANDQ	CX, R15

	RORXQ	$14, CX, R14
	ADDQ	R9, BX
	ANDQ	R11, DI

	XORQ	R14, R13
	RORXQ	$39, R10, R14
	XORQ	DX, R15

	XORQ	R12, R14
	RORXQ	$28, R10, R12

	XORQ	R12, R14
	MOVQ	R10, R12
	ANDQ	AX, R12
	ADDQ	R13, R15

	ORQ	R12, DI
	ADDQ	R14, R9
	ADDQ	R15, BX
	ADDQ	R15, R9

	ADDQ	DI, R9

	VPSRLQ	$19, Y4, Y3
	VPSLLQ	$(64-19), Y4, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8
	VPSRLQ	$61, Y4, Y3
	VPSLLQ	$(64-61), Y4, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8

	VPADDQ	Y8, Y0, Y2

	VPBLENDD	$0xF0, Y2, Y4, Y4

	MOVQ	R9, DI
	RORXQ	$41, BX, R13
	RORXQ	$18, BX, R14
	ADDQ	3*8+frame_YFER(SP), DX
	ORQ	R11, DI

	MOVQ	CX, R15
	RORXQ	$34, R9, R12
	XORQ	R14, R13
	XORQ	R8, R15

	RORXQ	$14, BX, R14
	ANDQ	BX, R15
	ADDQ	DX, AX
	ANDQ	R10, DI

	XORQ	R14, R13
	XORQ	R8, R15

	RORXQ	$39, R9, R14
	ADDQ	R13, R15

	XORQ	R12, R14
	ADDQ	R15, AX

	RORXQ	$28, R9, R12

	XORQ	R12, R14
	MOVQ	R9, R12
	ANDQ	R11, R12
	ORQ	R12, DI

	ADDQ	R14, DX
	ADDQ	R15, DX
	ADDQ	DI, DX
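
	// Second group of this iteration: the same schedule-and-round
	// pattern, now extending Y5 against the constants at 1*32(BP).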
	VPADDQ	1*32(BP), Y5, Y0
	VMOVDQU	Y0, frame_YFER(SP)

	MY_VPALIGNR(Y0, Y4, Y7, 8)

	VPADDQ	Y5, Y0, Y0

	MY_VPALIGNR(Y1, Y6, Y5, 8)

	VPSRLQ	$1, Y1, Y2
	VPSLLQ	$(64-1), Y1, Y3
	VPOR	Y2, Y3, Y3

	VPSRLQ	$7, Y1, Y8

	MOVQ	DX, DI
	RORXQ	$41, AX, R13
	RORXQ	$18, AX, R14
	ADDQ	frame_YFER(SP), R8
	ORQ	R10, DI
	MOVQ	BX, R15
	RORXQ	$34, DX, R12

	XORQ	R14, R13
	XORQ	CX, R15
	RORXQ	$14, AX, R14

	ANDQ	AX, R15
	XORQ	R14, R13
	RORXQ	$39, DX, R14
	ADDQ	R8, R11

	ANDQ	R9, DI
	XORQ	R12, R14
	RORXQ	$28, DX, R12

	XORQ	CX, R15
	XORQ	R12, R14
	MOVQ	DX, R12
	ANDQ	R10, R12

	ADDQ	R13, R15
	ORQ	R12, DI
	ADDQ	R14, R8

	ADDQ	R15, R11

	ADDQ	R15, R8
	ADDQ	DI, R8

	VPSRLQ	$8, Y1, Y2
	VPSLLQ	$(64-8), Y1, Y1
	VPOR	Y2, Y1, Y1

	VPXOR	Y8, Y3, Y3
	VPXOR	Y1, Y3, Y1

	VPADDQ	Y1, Y0, Y0

	VPERM2F128	$0x0, Y0, Y0, Y5

	MOVQ	$MASK_YMM_LO<>(SB), R13
	VPAND	(R13), Y0, Y0

	VPERM2F128	$0x11, Y4, Y4, Y2
	VPSRLQ	$6, Y2, Y8

	MOVQ	R8, DI
	RORXQ	$41, R11, R13
	RORXQ	$18, R11, R14
	ADDQ	1*8+frame_YFER(SP), CX
	ORQ	R9, DI

	MOVQ	AX, R15
	RORXQ	$34, R8, R12
	XORQ	R14, R13
	XORQ	BX, R15

	RORXQ	$14, R11, R14
	XORQ	R14, R13
	RORXQ	$39, R8, R14
	ANDQ	R11, R15
	ADDQ	CX, R10

	ANDQ	DX, DI
	XORQ	R12, R14

	RORXQ	$28, R8, R12
	XORQ	BX, R15

	XORQ	R12, R14
	MOVQ	R8, R12
	ANDQ	R9, R12
	ADDQ	R13, R15

	ORQ	R12, DI
	ADDQ	R14, CX

	ADDQ	R15, R10
	ADDQ	R15, CX
	ADDQ	DI, CX

	VPSRLQ	$19, Y2, Y3
	VPSLLQ	$(64-19), Y2, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8
	VPSRLQ	$61, Y2, Y3
	VPSLLQ	$(64-61), Y2, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8

	VPADDQ	Y8, Y5, Y5

	VPSRLQ	$6, Y5, Y8

	MOVQ	CX, DI
	RORXQ	$41, R10, R13
	ADDQ	2*8+frame_YFER(SP), BX

	RORXQ	$18, R10, R14
	ORQ	DX, DI
	MOVQ	R11, R15
	XORQ	AX, R15

	RORXQ	$34, CX, R12
	XORQ	R14, R13
	ANDQ	R10, R15

	RORXQ	$14, R10, R14
	ADDQ	BX, R9
	ANDQ	R8, DI

	XORQ	R14, R13
	RORXQ	$39, CX, R14
	XORQ	AX, R15

	XORQ	R12, R14
	RORXQ	$28, CX, R12

	XORQ	R12, R14
	MOVQ	CX, R12
	ANDQ	DX, R12
	ADDQ	R13, R15

	ORQ	R12, DI
	ADDQ	R14, BX
	ADDQ	R15, R9
	ADDQ	R15, BX

	ADDQ	DI, BX

	VPSRLQ	$19, Y5, Y3
	VPSLLQ	$(64-19), Y5, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8
	VPSRLQ	$61, Y5, Y3
	VPSLLQ	$(64-61), Y5, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8

	VPADDQ	Y8, Y0, Y2

	VPBLENDD	$0xF0, Y2, Y5, Y5

	MOVQ	BX, DI
	RORXQ	$41, R9, R13
	RORXQ	$18, R9, R14
	ADDQ	3*8+frame_YFER(SP), AX
	ORQ	R8, DI

	MOVQ	R10, R15
	RORXQ	$34, BX, R12
	XORQ	R14, R13
	XORQ	R11, R15

	RORXQ	$14, R9, R14
	ANDQ	R9, R15
	ADDQ	AX, DX
	ANDQ	CX, DI

	XORQ	R14, R13
	XORQ	R11, R15

	RORXQ	$39, BX, R14
	ADDQ	R13, R15

	XORQ	R12, R14
	ADDQ	R15, DX

	RORXQ	$28, BX, R12

	XORQ	R12, R14
	MOVQ	BX, R12
	ANDQ	R8, R12
	ORQ	R12, DI

	ADDQ	R14, AX
	ADDQ	R15, AX
	ADDQ	DI, AX
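
	// Third group: extend Y6 against the constants at 2*32(BP).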
	VPADDQ	2*32(BP), Y6, Y0
	VMOVDQU	Y0, frame_YFER(SP)

	MY_VPALIGNR(Y0, Y5, Y4, 8)

	VPADDQ	Y6, Y0, Y0

	MY_VPALIGNR(Y1, Y7, Y6, 8)

	VPSRLQ	$1, Y1, Y2
	VPSLLQ	$(64-1), Y1, Y3
	VPOR	Y2, Y3, Y3

	VPSRLQ	$7, Y1, Y8

	MOVQ	AX, DI
	RORXQ	$41, DX, R13
	RORXQ	$18, DX, R14
	ADDQ	frame_YFER(SP), R11
	ORQ	CX, DI
	MOVQ	R9, R15
	RORXQ	$34, AX, R12

	XORQ	R14, R13
	XORQ	R10, R15
	RORXQ	$14, DX, R14

	ANDQ	DX, R15
	XORQ	R14, R13
	RORXQ	$39, AX, R14
	ADDQ	R11, R8

	ANDQ	BX, DI
	XORQ	R12, R14
	RORXQ	$28, AX, R12

	XORQ	R10, R15
	XORQ	R12, R14
	MOVQ	AX, R12
	ANDQ	CX, R12

	ADDQ	R13, R15
	ORQ	R12, DI
	ADDQ	R14, R11

	ADDQ	R15, R8

	ADDQ	R15, R11
	ADDQ	DI, R11

	VPSRLQ	$8, Y1, Y2
	VPSLLQ	$(64-8), Y1, Y1
	VPOR	Y2, Y1, Y1

	VPXOR	Y8, Y3, Y3
	VPXOR	Y1, Y3, Y1

	VPADDQ	Y1, Y0, Y0

	VPERM2F128	$0x0, Y0, Y0, Y6

	MOVQ	$MASK_YMM_LO<>(SB), R13
	VPAND	(R13), Y0, Y0

	VPERM2F128	$0x11, Y5, Y5, Y2
	VPSRLQ	$6, Y2, Y8

	MOVQ	R11, DI
	RORXQ	$41, R8, R13
	RORXQ	$18, R8, R14
	ADDQ	1*8+frame_YFER(SP), R10
	ORQ	BX, DI

	MOVQ	DX, R15
	RORXQ	$34, R11, R12
	XORQ	R14, R13
	XORQ	R9, R15

	RORXQ	$14, R8, R14
	XORQ	R14, R13
	RORXQ	$39, R11, R14
	ANDQ	R8, R15
	ADDQ	R10, CX

	ANDQ	AX, DI
	XORQ	R12, R14

	RORXQ	$28, R11, R12
	XORQ	R9, R15

	XORQ	R12, R14
	MOVQ	R11, R12
	ANDQ	BX, R12
	ADDQ	R13, R15

	ORQ	R12, DI
	ADDQ	R14, R10

	ADDQ	R15, CX
	ADDQ	R15, R10
	ADDQ	DI, R10

	VPSRLQ	$19, Y2, Y3
	VPSLLQ	$(64-19), Y2, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8
	VPSRLQ	$61, Y2, Y3
	VPSLLQ	$(64-61), Y2, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8

	VPADDQ	Y8, Y6, Y6

	VPSRLQ	$6, Y6, Y8

	MOVQ	R10, DI
	RORXQ	$41, CX, R13
	ADDQ	2*8+frame_YFER(SP), R9

	RORXQ	$18, CX, R14
	ORQ	AX, DI
	MOVQ	R8, R15
	XORQ	DX, R15

	RORXQ	$34, R10, R12
	XORQ	R14, R13
	ANDQ	CX, R15

	RORXQ	$14, CX, R14
	ADDQ	R9, BX
	ANDQ	R11, DI

	XORQ	R14, R13
	RORXQ	$39, R10, R14
	XORQ	DX, R15

	XORQ	R12, R14
	RORXQ	$28, R10, R12

	XORQ	R12, R14
	MOVQ	R10, R12
	ANDQ	AX, R12
	ADDQ	R13, R15

	ORQ	R12, DI
	ADDQ	R14, R9
	ADDQ	R15, BX
	ADDQ	R15, R9

	ADDQ	DI, R9

	VPSRLQ	$19, Y6, Y3
	VPSLLQ	$(64-19), Y6, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8
	VPSRLQ	$61, Y6, Y3
	VPSLLQ	$(64-61), Y6, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8

	VPADDQ	Y8, Y0, Y2

	VPBLENDD	$0xF0, Y2, Y6, Y6

	MOVQ	R9, DI
	RORXQ	$41, BX, R13
	RORXQ	$18, BX, R14
	ADDQ	3*8+frame_YFER(SP), DX
	ORQ	R11, DI

	MOVQ	CX, R15
	RORXQ	$34, R9, R12
	XORQ	R14, R13
	XORQ	R8, R15

	RORXQ	$14, BX, R14
	ANDQ	BX, R15
	ADDQ	DX, AX
	ANDQ	R10, DI

	XORQ	R14, R13
	XORQ	R8, R15

	RORXQ	$39, R9, R14
	ADDQ	R13, R15

	XORQ	R12, R14
	ADDQ	R15, AX

	RORXQ	$28, R9, R12

	XORQ	R12, R14
	MOVQ	R9, R12
	ANDQ	R11, R12
	ORQ	R12, DI

	ADDQ	R14, DX
	ADDQ	R15, DX
	ADDQ	DI, DX

	VPADDQ	3*32(BP), Y7, Y0
	VMOVDQU	Y0, frame_YFER(SP)
	ADDQ	$(4*32), BP

	MY_VPALIGNR(Y0, Y6, Y5, 8)

	VPADDQ	Y7, Y0, Y0

	MY_VPALIGNR(Y1, Y4, Y7, 8)

	VPSRLQ	$1, Y1, Y2
	VPSLLQ	$(64-1), Y1, Y3
	VPOR	Y2, Y3, Y3

	VPSRLQ	$7, Y1, Y8

	MOVQ	DX, DI
	RORXQ	$41, AX, R13
	RORXQ	$18, AX, R14
	ADDQ	frame_YFER(SP), R8
	ORQ	R10, DI
	MOVQ	BX, R15
	RORXQ	$34, DX, R12

	XORQ	R14, R13
	XORQ	CX, R15
	RORXQ	$14, AX, R14

	ANDQ	AX, R15
	XORQ	R14, R13
	RORXQ	$39, DX, R14
	ADDQ	R8, R11

	ANDQ	R9, DI
	XORQ	R12, R14
	RORXQ	$28, DX, R12

	XORQ	CX, R15
	XORQ	R12, R14
	MOVQ	DX, R12
	ANDQ	R10, R12

	ADDQ	R13, R15
	ORQ	R12, DI
	ADDQ	R14, R8

	ADDQ	R15, R11

	ADDQ	R15, R8
	ADDQ	DI, R8

	VPSRLQ	$8, Y1, Y2
	VPSLLQ	$(64-8), Y1, Y1
	VPOR	Y2, Y1, Y1

	VPXOR	Y8, Y3, Y3
	VPXOR	Y1, Y3, Y1

	VPADDQ	Y1, Y0, Y0

	VPERM2F128	$0x0, Y0, Y0, Y7

	MOVQ	$MASK_YMM_LO<>(SB), R13
	VPAND	(R13), Y0, Y0

	VPERM2F128	$0x11, Y6, Y6, Y2
	VPSRLQ	$6, Y2, Y8

	MOVQ	R8, DI
	RORXQ	$41, R11, R13
	RORXQ	$18, R11, R14
	ADDQ	1*8+frame_YFER(SP), CX
	ORQ	R9, DI

	MOVQ	AX, R15
	RORXQ	$34, R8, R12
	XORQ	R14, R13
	XORQ	BX, R15

	RORXQ	$14, R11, R14
	XORQ	R14, R13
	RORXQ	$39, R8, R14
	ANDQ	R11, R15
	ADDQ	CX, R10

	ANDQ	DX, DI
	XORQ	R12, R14

	RORXQ	$28, R8, R12
	XORQ	BX, R15

	XORQ	R12, R14
	MOVQ	R8, R12
	ANDQ	R9, R12
	ADDQ	R13, R15

	ORQ	R12, DI
	ADDQ	R14, CX

	ADDQ	R15, R10
	ADDQ	R15, CX
	ADDQ	DI, CX

	VPSRLQ	$19, Y2, Y3
	VPSLLQ	$(64-19), Y2, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8
	VPSRLQ	$61, Y2, Y3
	VPSLLQ	$(64-61), Y2, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8

	VPADDQ	Y8, Y7, Y7

	VPSRLQ	$6, Y7, Y8

	MOVQ	CX, DI
	RORXQ	$41, R10, R13
	ADDQ	2*8+frame_YFER(SP), BX

	RORXQ	$18, R10, R14
	ORQ	DX, DI
	MOVQ	R11, R15
	XORQ	AX, R15

	RORXQ	$34, CX, R12
	XORQ	R14, R13
	ANDQ	R10, R15

	RORXQ	$14, R10, R14
	ADDQ	BX, R9
	ANDQ	R8, DI

	XORQ	R14, R13
	RORXQ	$39, CX, R14
	XORQ	AX, R15

	XORQ	R12, R14
	RORXQ	$28, CX, R12

	XORQ	R12, R14
	MOVQ	CX, R12
	ANDQ	DX, R12
	ADDQ	R13, R15

	ORQ	R12, DI
	ADDQ	R14, BX
	ADDQ	R15, R9
	ADDQ	R15, BX

	ADDQ	DI, BX

	VPSRLQ	$19, Y7, Y3
	VPSLLQ	$(64-19), Y7, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8
	VPSRLQ	$61, Y7, Y3
	VPSLLQ	$(64-61), Y7, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8

	VPADDQ	Y8, Y0, Y2

	VPBLENDD	$0xF0, Y2, Y7, Y7

	MOVQ	BX, DI
	RORXQ	$41, R9, R13
	RORXQ	$18, R9, R14
	ADDQ	3*8+frame_YFER(SP), AX
	ORQ	R8, DI

	MOVQ	R10, R15
	RORXQ	$34, BX, R12
	XORQ	R14, R13
	XORQ	R11, R15

	RORXQ	$14, R9, R14
	ANDQ	R9, R15
	ADDQ	AX, DX
	ANDQ	CX, DI

	XORQ	R14, R13
	XORQ	R11, R15

	RORXQ	$39, BX, R14
	ADDQ	R13, R15

	XORQ	R12, R14
	ADDQ	R15, DX

	RORXQ	$28, BX, R12

	XORQ	R12, R14
	MOVQ	BX, R12
	ANDQ	R8, R12
	ORQ	R12, DI

	ADDQ	R14, AX
	ADDQ	R15, AX
	ADDQ	DI, AX

	SUBQ	$1, frame_SRND(SP)
	JNE	loop1

	MOVQ	$2, frame_SRND(SP)

loop2:
	VPADDQ	(BP), Y4, Y0
	VMOVDQU	Y0, frame_YFER(SP)

	MOVQ	R9, R15
	RORXQ	$41, DX, R13
	RORXQ	$18, DX, R14
	XORQ	R10, R15

	XORQ	R14, R13
	RORXQ	$14, DX, R14
	ANDQ	DX, R15

	XORQ	R14, R13
	RORXQ	$34, AX, R12
	XORQ	R10, R15
	RORXQ	$39, AX, R14
	MOVQ	AX, DI

	XORQ	R12, R14
	RORXQ	$28, AX, R12
	ADDQ	frame_YFER(SP), R11
	ORQ	CX, DI

	XORQ	R12, R14
	MOVQ	AX, R12
	ANDQ	BX, DI
	ANDQ	CX, R12
	ADDQ	R13, R15

	ADDQ	R11, R8
	ORQ	R12, DI
	ADDQ	R14, R11

	ADDQ	R15, R8

	ADDQ	R15, R11
	MOVQ	DX, R15
	RORXQ	$41, R8, R13
	RORXQ	$18, R8, R14
	XORQ	R9, R15

	XORQ	R14, R13
	RORXQ	$14, R8, R14
	ANDQ	R8, R15
	ADDQ	DI, R11

	XORQ	R14, R13
	RORXQ	$34, R11, R12
	XORQ	R9, R15
	RORXQ	$39, R11, R14
	MOVQ	R11, DI

	XORQ	R12, R14
	RORXQ	$28, R11, R12
	ADDQ	8*1+frame_YFER(SP), R10
	ORQ	BX, DI

	XORQ	R12, R14
	MOVQ	R11, R12
	ANDQ	AX, DI
	ANDQ	BX, R12
	ADDQ	R13, R15

	ADDQ	R10, CX
	ORQ	R12, DI
	ADDQ	R14, R10

	ADDQ	R15, CX

	ADDQ	R15, R10
	MOVQ	R8, R15
	RORXQ	$41, CX, R13
	RORXQ	$18, CX, R14
	XORQ	DX, R15

	XORQ	R14, R13
	RORXQ	$14, CX, R14
	ANDQ	CX, R15
	ADDQ	DI, R10

	XORQ	R14, R13
	RORXQ	$34, R10, R12
	XORQ	DX, R15
	RORXQ	$39, R10, R14
	MOVQ	R10, DI

	XORQ	R12, R14
	RORXQ	$28, R10, R12
	ADDQ	8*2+frame_YFER(SP), R9
	ORQ	AX, DI

	XORQ	R12, R14
	MOVQ	R10, R12
	ANDQ	R11, DI
	ANDQ	AX, R12
	ADDQ	R13, R15

	ADDQ	R9, BX
	ORQ	R12, DI
	ADDQ	R14, R9

	ADDQ	R15, BX

	ADDQ	R15, R9
	MOVQ	CX, R15
	RORXQ	$41, BX, R13
	RORXQ	$18, BX, R14
	XORQ	R8, R15

	XORQ	R14, R13
	RORXQ	$14, BX, R14
	ANDQ	BX, R15
	ADDQ	DI, R9

	XORQ	R14, R13
	RORXQ	$34, R9, R12
	XORQ	R8, R15
	RORXQ	$39, R9, R14
	MOVQ	R9, DI

	XORQ	R12, R14
	RORXQ	$28, R9, R12
	ADDQ	8*3+frame_YFER(SP), DX
	ORQ	R11, DI

	XORQ	R12, R14
	MOVQ	R9, R12
	ANDQ	R10, DI
	ANDQ	R11, R12
	ADDQ	R13, R15

	ADDQ	DX, AX
	ORQ	R12, DI
	ADDQ	R14, DX

	ADDQ	R15, AX

	ADDQ	R15, DX

	ADDQ	DI, DX

	VPADDQ	1*32(BP), Y5, Y0
	VMOVDQU	Y0, frame_YFER(SP)
	ADDQ	$(2*32), BP

	MOVQ	BX, R15
	RORXQ	$41, AX, R13
	RORXQ	$18, AX, R14
	XORQ	CX, R15

	XORQ	R14, R13
	RORXQ	$14, AX, R14
	ANDQ	AX, R15

	XORQ	R14, R13
	RORXQ	$34, DX, R12
	XORQ	CX, R15
	RORXQ	$39, DX, R14
	MOVQ	DX, DI

	XORQ	R12, R14
	RORXQ	$28, DX, R12
	ADDQ	frame_YFER(SP), R8
	ORQ	R10, DI

	XORQ	R12, R14
	MOVQ	DX, R12
	ANDQ	R9, DI
	ANDQ	R10, R12
	ADDQ	R13, R15

	ADDQ	R8, R11
	ORQ	R12, DI
	ADDQ	R14, R8

	ADDQ	R15, R11

	ADDQ	R15, R8
	MOVQ	AX, R15
	RORXQ	$41, R11, R13
	RORXQ	$18, R11, R14
	XORQ	BX, R15

	XORQ	R14, R13
	RORXQ	$14, R11, R14
	ANDQ	R11, R15
	ADDQ	DI, R8

	XORQ	R14, R13
	RORXQ	$34, R8, R12
	XORQ	BX, R15
	RORXQ	$39, R8, R14
	MOVQ	R8, DI

	XORQ	R12, R14
	RORXQ	$28, R8, R12
	ADDQ	8*1+frame_YFER(SP), CX
	ORQ	R9, DI

	XORQ	R12, R14
	MOVQ	R8, R12
	ANDQ	DX, DI
	ANDQ	R9, R12
	ADDQ	R13, R15

	ADDQ	CX, R10
	ORQ	R12, DI
	ADDQ	R14, CX

	ADDQ	R15, R10

	ADDQ	R15, CX
	MOVQ	R11, R15
	RORXQ	$41, R10, R13
	RORXQ	$18, R10, R14
	XORQ	AX, R15

	XORQ	R14, R13
	RORXQ	$14, R10, R14
	ANDQ	R10, R15
	ADDQ	DI, CX

	XORQ	R14, R13
	RORXQ	$34, CX, R12
	XORQ	AX, R15
	RORXQ	$39, CX, R14
	MOVQ	CX, DI

	XORQ	R12, R14
	RORXQ	$28, CX, R12
	ADDQ	8*2+frame_YFER(SP), BX
	ORQ	DX, DI

	XORQ	R12, R14
	MOVQ	CX, R12
	ANDQ	R8, DI
	ANDQ	DX, R12
	ADDQ	R13, R15

	ADDQ	BX, R9
	ORQ	R12, DI
	ADDQ	R14, BX

	ADDQ	R15, R9

	ADDQ	R15, BX
	MOVQ	R10, R15
	RORXQ	$41, R9, R13
	RORXQ	$18, R9, R14
	XORQ	R11, R15

	XORQ	R14, R13
	RORXQ	$14, R9, R14
	ANDQ	R9, R15
	ADDQ	DI, BX

	XORQ	R14, R13
	RORXQ	$34, BX, R12
	XORQ	R11, R15
	RORXQ	$39, BX, R14
	MOVQ	BX, DI

	XORQ	R12, R14
	RORXQ	$28, BX, R12
	ADDQ	8*3+frame_YFER(SP), AX
	ORQ	R8, DI

	XORQ	R12, R14
	MOVQ	BX, R12
	ANDQ	CX, DI
	ANDQ	R8, R12
	ADDQ	R13, R15

	ADDQ	AX, DX
	ORQ	R12, DI
	ADDQ	R14, AX

	ADDQ	R15, DX

	ADDQ	R15, AX

	ADDQ	DI, AX

	VMOVDQU	Y6, Y4
	VMOVDQU	Y7, Y5

	SUBQ	$1, frame_SRND(SP)
	JNE	loop2
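
	// Add the working variables back into the digest at (SI):
	// H[i] += working value, then store the sum back.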
	addm(8*0(SI),AX)
	addm(8*1(SI),BX)
	addm(8*2(SI),CX)
	addm(8*3(SI),R8)
	addm(8*4(SI),DX)
	addm(8*5(SI),R9)
	addm(8*6(SI),R10)
	addm(8*7(SI),R11)

	MOVQ	frame_INP(SP), DI
	ADDQ	$128, DI
	CMPQ	DI, frame_INPEND(SP)
	JNE	loop0

done_hash:
	VZEROUPPER
	RET

// func checkAVX2() bool
// returns whether both AVX2 and BMI2 are supported (the AVX2 path above
// also relies on the BMI2 RORXQ instruction)
TEXT ·checkAVX2(SB), NOSPLIT, $0
	MOVB	runtime·support_avx2(SB), AX
	CMPB	AX, $0
	JNE	check_bmi2
	MOVB	AX, ret+0(FP)
	RET
check_bmi2:
	MOVB	runtime·support_bmi2(SB), AX
	MOVB	AX, ret+0(FP)
	RET
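
// For reference, the Go side dispatches between the two implementations
// roughly as follows (an illustrative sketch; the actual declarations live
// in the package's amd64 Go file and may differ in detail):
//
//	var useAVX2 = checkAVX2()
//
//	func block(dig *digest, p []byte) {
//		if useAVX2 {
//			blockAVX2(dig, p)
//		} else {
//			blockAMD64(dig, p)
//		}
//	}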