// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "textflag.h"

// SHA512 block routine. See sha512block.go for Go equivalent.
//
// The algorithm is detailed in FIPS 180-4:
//
//	https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
//
// Wt = Mt; for 0 <= t <= 15
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
//
// a = H0
// b = H1
// c = H2
// d = H3
// e = H4
// f = H5
// g = H6
// h = H7
//
// for t = 0 to 79 {
//	T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
//	T2 = BIGSIGMA0(a) + Maj(a,b,c)
//	h = g
//	g = f
//	f = e
//	e = d + T1
//	d = c
//	c = b
//	b = a
//	a = T1 + T2
// }
//
// H0 = a + H0
// H1 = b + H1
// H2 = c + H2
// H3 = d + H3
// H4 = e + H4
// H5 = f + H5
// H6 = g + H6
// H7 = h + H7

// Wt = Mt; for 0 <= t <= 15
#define MSGSCHEDULE0(index) \
	MOVQ	(index*8)(SI), AX; \
	BSWAPQ	AX; \
	MOVQ	AX, (index*8)(BP)

// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
// SIGMA0(x) = ROTR(1,x) XOR ROTR(8,x) XOR SHR(7,x)
// SIGMA1(x) = ROTR(19,x) XOR ROTR(61,x) XOR SHR(6,x)
#define MSGSCHEDULE1(index) \
	MOVQ	((index-2)*8)(BP), AX; \
	MOVQ	AX, CX; \
	RORQ	$19, AX; \
	MOVQ	CX, DX; \
	RORQ	$61, CX; \
	SHRQ	$6, DX; \
	MOVQ	((index-15)*8)(BP), BX; \
	XORQ	CX, AX; \
	MOVQ	BX, CX; \
	XORQ	DX, AX; \
	RORQ	$1, BX; \
	MOVQ	CX, DX; \
	SHRQ	$7, DX; \
	RORQ	$8, CX; \
	ADDQ	((index-7)*8)(BP), AX; \
	XORQ	CX, BX; \
	XORQ	DX, BX; \
	ADDQ	((index-16)*8)(BP), BX; \
	ADDQ	BX, AX; \
	MOVQ	AX, ((index)*8)(BP)

// Calculate T1 in AX - uses AX, CX and DX registers.
// h is also used as an accumulator. Wt is passed in AX.
// T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
// BIGSIGMA1(x) = ROTR(14,x) XOR ROTR(18,x) XOR ROTR(41,x)
// Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
#define SHA512T1(const, e, f, g, h) \
	MOVQ	$const, DX; \
	ADDQ	AX, h; \
	MOVQ	e, AX; \
	ADDQ	DX, h; \
	MOVQ	e, CX; \
	RORQ	$14, AX; \
	MOVQ	e, DX; \
	RORQ	$18, CX; \
	XORQ	CX, AX; \
	MOVQ	e, CX; \
	RORQ	$41, DX; \
	ANDQ	f, CX; \
	XORQ	AX, DX; \
	MOVQ	e, AX; \
	NOTQ	AX; \
	ADDQ	DX, h; \
	ANDQ	g, AX; \
	XORQ	CX, AX; \
	ADDQ	h, AX

// Calculate T2 in BX - uses BX, CX, DX and DI registers.
// T2 = BIGSIGMA0(a) + Maj(a, b, c)
// BIGSIGMA0(x) = ROTR(28,x) XOR ROTR(34,x) XOR ROTR(39,x)
// Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
#define SHA512T2(a, b, c) \
	MOVQ	a, DI; \
	MOVQ	c, BX; \
	RORQ	$28, DI; \
	MOVQ	a, DX; \
	ANDQ	b, BX; \
	RORQ	$34, DX; \
	MOVQ	a, CX; \
	ANDQ	c, CX; \
	XORQ	DX, DI; \
	XORQ	CX, BX; \
	MOVQ	a, DX; \
	MOVQ	b, CX; \
	RORQ	$39, DX; \
	ANDQ	a, CX; \
	XORQ	CX, BX; \
	XORQ	DX, DI; \
	ADDQ	DI, BX
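
// For reference, the macros above implement the following helpers in
// plain Go (a sketch with illustrative names, not the identifiers used
// in sha512block.go; bits.RotateLeft64(x, -n) is ROTR(n, x)):
//
//	func ch(x, y, z uint64) uint64  { return (x & y) ^ (^x & z) }
//	func maj(x, y, z uint64) uint64 { return (x & y) ^ (x & z) ^ (y & z) }
//
//	func bigSigma0(x uint64) uint64 {
//		return bits.RotateLeft64(x, -28) ^ bits.RotateLeft64(x, -34) ^ bits.RotateLeft64(x, -39)
//	}
//
//	func bigSigma1(x uint64) uint64 {
//		return bits.RotateLeft64(x, -14) ^ bits.RotateLeft64(x, -18) ^ bits.RotateLeft64(x, -41)
//	}
//
//	func sigma0(x uint64) uint64 {
//		return bits.RotateLeft64(x, -1) ^ bits.RotateLeft64(x, -8) ^ (x >> 7)
//	}
//
//	func sigma1(x uint64) uint64 {
//		return bits.RotateLeft64(x, -19) ^ bits.RotateLeft64(x, -61) ^ (x >> 6)
//	}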

// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
// The values for e and a are stored in d and h, ready for rotation.
#define SHA512ROUND(index, const, a, b, c, d, e, f, g, h) \
	SHA512T1(const, e, f, g, h); \
	SHA512T2(a, b, c); \
	MOVQ	BX, h; \
	ADDQ	AX, d; \
	ADDQ	AX, h

#define SHA512ROUND0(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE0(index); \
	SHA512ROUND(index, const, a, b, c, d, e, f, g, h)

#define SHA512ROUND1(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE1(index); \
	SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
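
// Rather than shuffling values between registers after every round
// (h = g, g = f, ...), the eight working variables stay in R8-R15 and
// only their roles rotate: each successive round call below permutes
// the macro arguments by one position, so the pattern repeats every 8
// rounds. A rough Go sketch of the same trick, reusing the helpers
// sketched above (w and _K assumed in scope; not the actual Go code):
//
//	var r [8]uint64 // a..h live here and are never copied
//	for t := 0; t < 80; t++ {
//		reg := func(role int) *uint64 { return &r[(role-t%8+8)%8] } // role 0=a .. 7=h
//		t1 := *reg(7) + bigSigma1(*reg(4)) + ch(*reg(4), *reg(5), *reg(6)) + _K[t] + w[t]
//		t2 := bigSigma0(*reg(0)) + maj(*reg(0), *reg(1), *reg(2))
//		*reg(3) += t1     // new e lands in the old d slot...
//		*reg(7) = t1 + t2 // ...and new a in the old h slot, ready for rotation
//	}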

TEXT ·blockAMD64(SB),0,$648-32
	MOVQ	p_base+8(FP), SI
	MOVQ	p_len+16(FP), DX
	SHRQ	$7, DX
	SHLQ	$7, DX

	LEAQ	(SI)(DX*1), DI
	MOVQ	DI, 640(SP)
	CMPQ	SI, DI
	JEQ	end

	MOVQ	dig+0(FP), BP
	MOVQ	(0*8)(BP), R8  // a = H0
	MOVQ	(1*8)(BP), R9  // b = H1
	MOVQ	(2*8)(BP), R10 // c = H2
	MOVQ	(3*8)(BP), R11 // d = H3
	MOVQ	(4*8)(BP), R12 // e = H4
	MOVQ	(5*8)(BP), R13 // f = H5
	MOVQ	(6*8)(BP), R14 // g = H6
	MOVQ	(7*8)(BP), R15 // h = H7

loop:
	MOVQ	SP, BP // message schedule

	SHA512ROUND0(0, 0x428a2f98d728ae22, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND0(1, 0x7137449123ef65cd, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND0(2, 0xb5c0fbcfec4d3b2f, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND0(3, 0xe9b5dba58189dbbc, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND0(4, 0x3956c25bf348b538, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND0(5, 0x59f111f1b605d019, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND0(6, 0x923f82a4af194f9b, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND0(7, 0xab1c5ed5da6d8118, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND0(8, 0xd807aa98a3030242, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND0(9, 0x12835b0145706fbe, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND0(10, 0x243185be4ee4b28c, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND0(11, 0x550c7dc3d5ffb4e2, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND0(12, 0x72be5d74f27b896f, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND0(13, 0x80deb1fe3b1696b1, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND0(14, 0x9bdc06a725c71235, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND0(15, 0xc19bf174cf692694, R9, R10, R11, R12, R13, R14, R15, R8)

	SHA512ROUND1(16, 0xe49b69c19ef14ad2, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(17, 0xefbe4786384f25e3, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(18, 0x0fc19dc68b8cd5b5, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(19, 0x240ca1cc77ac9c65, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(20, 0x2de92c6f592b0275, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(21, 0x4a7484aa6ea6e483, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(22, 0x5cb0a9dcbd41fbd4, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(23, 0x76f988da831153b5, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(24, 0x983e5152ee66dfab, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(25, 0xa831c66d2db43210, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(26, 0xb00327c898fb213f, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(27, 0xbf597fc7beef0ee4, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(28, 0xc6e00bf33da88fc2, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(29, 0xd5a79147930aa725, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(30, 0x06ca6351e003826f, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(31, 0x142929670a0e6e70, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(32, 0x27b70a8546d22ffc, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(33, 0x2e1b21385c26c926, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(34, 0x4d2c6dfc5ac42aed, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(35, 0x53380d139d95b3df, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(36, 0x650a73548baf63de, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(37, 0x766a0abb3c77b2a8, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(38, 0x81c2c92e47edaee6, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(39, 0x92722c851482353b, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(40, 0xa2bfe8a14cf10364, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(41, 0xa81a664bbc423001, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(42, 0xc24b8b70d0f89791, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(43, 0xc76c51a30654be30, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(44, 0xd192e819d6ef5218, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(45, 0xd69906245565a910, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(46, 0xf40e35855771202a, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(47, 0x106aa07032bbd1b8, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(48, 0x19a4c116b8d2d0c8, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(49, 0x1e376c085141ab53, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(50, 0x2748774cdf8eeb99, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(51, 0x34b0bcb5e19b48a8, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(52, 0x391c0cb3c5c95a63, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(53, 0x4ed8aa4ae3418acb, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(54, 0x5b9cca4f7763e373, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(55, 0x682e6ff3d6b2b8a3, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(56, 0x748f82ee5defb2fc, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(57, 0x78a5636f43172f60, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(58, 0x84c87814a1f0ab72, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(59, 0x8cc702081a6439ec, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(60, 0x90befffa23631e28, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(61, 0xa4506cebde82bde9, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(62, 0xbef9a3f7b2c67915, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(63, 0xc67178f2e372532b, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(64, 0xca273eceea26619c, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(65, 0xd186b8c721c0c207, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(66, 0xeada7dd6cde0eb1e, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(67, 0xf57d4f7fee6ed178, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(68, 0x06f067aa72176fba, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(69, 0x0a637dc5a2c898a6, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(70, 0x113f9804bef90dae, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(71, 0x1b710b35131c471b, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(72, 0x28db77f523047d84, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(73, 0x32caab7b40c72493, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(74, 0x3c9ebe0a15c9bebc, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(75, 0x431d67c49c100d4c, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(76, 0x4cc5d4becb3e42b6, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(77, 0x597f299cfc657e2a, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(78, 0x5fcb6fab3ad6faec, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(79, 0x6c44198c4a475817, R9, R10, R11, R12, R13, R14, R15, R8)

	MOVQ	dig+0(FP), BP
	ADDQ	(0*8)(BP), R8 // H0 = a + H0
	MOVQ	R8, (0*8)(BP)
	ADDQ	(1*8)(BP), R9 // H1 = b + H1
	MOVQ	R9, (1*8)(BP)
	ADDQ	(2*8)(BP), R10 // H2 = c + H2
	MOVQ	R10, (2*8)(BP)
	ADDQ	(3*8)(BP), R11 // H3 = d + H3
	MOVQ	R11, (3*8)(BP)
	ADDQ	(4*8)(BP), R12 // H4 = e + H4
	MOVQ	R12, (4*8)(BP)
	ADDQ	(5*8)(BP), R13 // H5 = f + H5
	MOVQ	R13, (5*8)(BP)
	ADDQ	(6*8)(BP), R14 // H6 = g + H6
	MOVQ	R14, (6*8)(BP)
	ADDQ	(7*8)(BP), R15 // H7 = h + H7
	MOVQ	R15, (7*8)(BP)

	ADDQ	$128, SI
	CMPQ	SI, 640(SP)
	JB	loop

end:
	RET
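
// The function above is the scalar path: it rounds p_len down to a
// multiple of 128 bytes (SHRQ/SHLQ $7) and consumes one block per pass
// of "loop". In plain Go it amounts to the following sketch (see
// sha512block.go for the real generic implementation; dig.h and _K are
// this package's state array and round-constant table):
//
//	func blockGeneric(dig *digest, p []byte) {
//		var w [80]uint64
//		for len(p) >= 128 {
//			for i := 0; i < 16; i++ {
//				w[i] = binary.BigEndian.Uint64(p[i*8:]) // the BSWAPQ in MSGSCHEDULE0
//			}
//			for t := 16; t < 80; t++ {
//				w[t] = sigma1(w[t-2]) + w[t-7] + sigma0(w[t-15]) + w[t-16]
//			}
//			a, b, c, d, e, f, g, h := dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7]
//			for t := 0; t < 80; t++ {
//				t1 := h + bigSigma1(e) + ch(e, f, g) + _K[t] + w[t]
//				t2 := bigSigma0(a) + maj(a, b, c)
//				h, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2
//			}
//			dig.h[0] += a; dig.h[1] += b; dig.h[2] += c; dig.h[3] += d
//			dig.h[4] += e; dig.h[5] += f; dig.h[6] += g; dig.h[7] += h
//			p = p[128:]
//		}
//	}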

// Version below is based on "Fast SHA512 Implementations on Intel
// Architecture Processors" White-paper
// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-sha512-implementations-ia-processors-paper.pdf
// AVX2 version by Intel, same algorithm in Linux kernel:
// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha512-avx2-asm.S

// James Guilford <james.guilford@intel.com>
// Kirk Yap <kirk.s.yap@intel.com>
// Tim Chen <tim.c.chen@linux.intel.com>
// David Cote <david.m.cote@intel.com>
// Aleksey Sidorov <aleksey.sidorov@intel.com>

#define YFER_SIZE (4*8)
#define SRND_SIZE (1*8)
#define INP_SIZE (1*8)

#define frame_YFER (0)
#define frame_SRND (frame_YFER + YFER_SIZE)
#define frame_INP (frame_SRND + SRND_SIZE)
#define frame_INPEND (frame_INP + INP_SIZE)

#define addm(p1, p2) \
	ADDQ	p1, p2; \
	MOVQ	p2, p1

#define COPY_YMM_AND_BSWAP(p1, p2, p3) \
	VMOVDQU	p2, p1; \
	VPSHUFB	p3, p1, p1

#define MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) \
	VPERM2F128	$0x3, YSRC2, YSRC1, YDST; \
	VPALIGNR	$RVAL, YSRC2, YDST, YDST

DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x00(SB)/8, $0x0001020304050607
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x08(SB)/8, $0x08090a0b0c0d0e0f
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x10(SB)/8, $0x1011121314151617
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x18(SB)/8, $0x18191a1b1c1d1e1f

GLOBL PSHUFFLE_BYTE_FLIP_MASK<>(SB), (NOPTR+RODATA), $32

DATA MASK_YMM_LO<>+0x00(SB)/8, $0x0000000000000000
DATA MASK_YMM_LO<>+0x08(SB)/8, $0x0000000000000000
DATA MASK_YMM_LO<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA MASK_YMM_LO<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF

GLOBL MASK_YMM_LO<>(SB), (NOPTR+RODATA), $32
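
// PSHUFFLE_BYTE_FLIP_MASK reverses the bytes within each 64-bit lane,
// so COPY_YMM_AND_BSWAP turns a little-endian VMOVDQU load into the
// big-endian message qwords SHA-512 expects; per lane it is simply
// (Go sketch):
//
//	w[i] = binary.BigEndian.Uint64(p[i*8:])
//
// MY_VPALIGNR works around VPALIGNR shifting only within 128-bit
// lanes: the VPERM2F128 first realigns the two source halves so a
// 4-qword window spanning YSRC2:YSRC1 can be extracted, which the
// message schedule needs to read W[t-15] and W[t-7].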

TEXT ·blockAVX2(SB), NOSPLIT, $56-32
	MOVQ	dig+0(FP), SI
	MOVQ	p_base+8(FP), DI
	MOVQ	p_len+16(FP), DX

	SHRQ	$7, DX
	SHLQ	$7, DX

	JZ	done_hash
	ADDQ	DI, DX
	MOVQ	DX, frame_INPEND(SP)

	MOVQ	(0*8)(SI), AX
	MOVQ	(1*8)(SI), BX
	MOVQ	(2*8)(SI), CX
	MOVQ	(3*8)(SI), R8
	MOVQ	(4*8)(SI), DX
	MOVQ	(5*8)(SI), R9
	MOVQ	(6*8)(SI), R10
	MOVQ	(7*8)(SI), R11

	VMOVDQU	PSHUFFLE_BYTE_FLIP_MASK<>(SB), Y9

loop0:
	MOVQ	·_K+0(SB), BP

	// byte swap first 16 qwords
	COPY_YMM_AND_BSWAP(Y4, (0*32)(DI), Y9)
	COPY_YMM_AND_BSWAP(Y5, (1*32)(DI), Y9)
	COPY_YMM_AND_BSWAP(Y6, (2*32)(DI), Y9)
	COPY_YMM_AND_BSWAP(Y7, (3*32)(DI), Y9)

	MOVQ	DI, frame_INP(SP)

	// schedule 64 message qwords, by doing 4 iterations of 16 rounds each
	MOVQ	$4, frame_SRND(SP)

loop1:
	VPADDQ	(BP), Y4, Y0
	VMOVDQU	Y0, frame_YFER(SP)

	MY_VPALIGNR(Y0, Y7, Y6, 8)

	VPADDQ	Y4, Y0, Y0

	MY_VPALIGNR(Y1, Y5, Y4, 8)

	VPSRLQ	$1, Y1, Y2
	VPSLLQ	$(64-1), Y1, Y3
	VPOR	Y2, Y3, Y3

	VPSRLQ	$7, Y1, Y8

	MOVQ	AX, DI
	RORXQ	$41, DX, R13
	RORXQ	$18, DX, R14
	ADDQ	frame_YFER(SP), R11
	ORQ	CX, DI
	MOVQ	R9, R15
	RORXQ	$34, AX, R12

	XORQ	R14, R13
	XORQ	R10, R15
	RORXQ	$14, DX, R14

	ANDQ	DX, R15
	XORQ	R14, R13
	RORXQ	$39, AX, R14
	ADDQ	R11, R8

	ANDQ	BX, DI
	XORQ	R12, R14
	RORXQ	$28, AX, R12

	XORQ	R10, R15
	XORQ	R12, R14
	MOVQ	AX, R12
	ANDQ	CX, R12

	ADDQ	R13, R15
	ORQ	R12, DI
	ADDQ	R14, R11

	ADDQ	R15, R8

	ADDQ	R15, R11
	ADDQ	DI, R11

	VPSRLQ	$8, Y1, Y2
	VPSLLQ	$(64-8), Y1, Y1
	VPOR	Y2, Y1, Y1

	VPXOR	Y8, Y3, Y3
	VPXOR	Y1, Y3, Y1

	VPADDQ	Y1, Y0, Y0

	VPERM2F128	$0x0, Y0, Y0, Y4

	VPAND	MASK_YMM_LO<>(SB), Y0, Y0

	VPERM2F128	$0x11, Y7, Y7, Y2
	VPSRLQ	$6, Y2, Y8

	MOVQ	R11, DI
	RORXQ	$41, R8, R13
	RORXQ	$18, R8, R14
	ADDQ	1*8+frame_YFER(SP), R10
	ORQ	BX, DI

	MOVQ	DX, R15
	RORXQ	$34, R11, R12
	XORQ	R14, R13
	XORQ	R9, R15

	RORXQ	$14, R8, R14
	XORQ	R14, R13
	RORXQ	$39, R11, R14
	ANDQ	R8, R15
	ADDQ	R10, CX

	ANDQ	AX, DI
	XORQ	R12, R14

	RORXQ	$28, R11, R12
	XORQ	R9, R15

	XORQ	R12, R14
	MOVQ	R11, R12
	ANDQ	BX, R12
	ADDQ	R13, R15

	ORQ	R12, DI
	ADDQ	R14, R10

	ADDQ	R15, CX
	ADDQ	R15, R10
	ADDQ	DI, R10

	VPSRLQ	$19, Y2, Y3
	VPSLLQ	$(64-19), Y2, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8
	VPSRLQ	$61, Y2, Y3
	VPSLLQ	$(64-61), Y2, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8

	VPADDQ	Y8, Y4, Y4

	VPSRLQ	$6, Y4, Y8

	MOVQ	R10, DI
	RORXQ	$41, CX, R13
	ADDQ	2*8+frame_YFER(SP), R9

	RORXQ	$18, CX, R14
	ORQ	AX, DI
	MOVQ	R8, R15
	XORQ	DX, R15

	RORXQ	$34, R10, R12
	XORQ	R14, R13
	ANDQ	CX, R15

	RORXQ	$14, CX, R14
	ADDQ	R9, BX
	ANDQ	R11, DI

	XORQ	R14, R13
	RORXQ	$39, R10, R14
	XORQ	DX, R15

	XORQ	R12, R14
	RORXQ	$28, R10, R12

	XORQ	R12, R14
	MOVQ	R10, R12
	ANDQ	AX, R12
	ADDQ	R13, R15

	ORQ	R12, DI
	ADDQ	R14, R9
	ADDQ	R15, BX
	ADDQ	R15, R9

	ADDQ	DI, R9

	VPSRLQ	$19, Y4, Y3
	VPSLLQ	$(64-19), Y4, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8
	VPSRLQ	$61, Y4, Y3
	VPSLLQ	$(64-61), Y4, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8

	VPADDQ	Y8, Y0, Y2

	VPBLENDD	$0xF0, Y2, Y4, Y4

	MOVQ	R9, DI
	RORXQ	$41, BX, R13
	RORXQ	$18, BX, R14
	ADDQ	3*8+frame_YFER(SP), DX
	ORQ	R11, DI

	MOVQ	CX, R15
	RORXQ	$34, R9, R12
	XORQ	R14, R13
	XORQ	R8, R15

	RORXQ	$14, BX, R14
	ANDQ	BX, R15
	ADDQ	DX, AX
	ANDQ	R10, DI

	XORQ	R14, R13
	XORQ	R8, R15

	RORXQ	$39, R9, R14
	ADDQ	R13, R15

	XORQ	R12, R14
	ADDQ	R15, AX

	RORXQ	$28, R9, R12

	XORQ	R12, R14
	MOVQ	R9, R12
	ANDQ	R11, R12
	ORQ	R12, DI

	ADDQ	R14, DX
	ADDQ	R15, DX
	ADDQ	DI, DX

	VPADDQ	1*32(BP), Y5, Y0
	VMOVDQU	Y0, frame_YFER(SP)

	MY_VPALIGNR(Y0, Y4, Y7, 8)

	VPADDQ	Y5, Y0, Y0

	MY_VPALIGNR(Y1, Y6, Y5, 8)

	VPSRLQ	$1, Y1, Y2
	VPSLLQ	$(64-1), Y1, Y3
	VPOR	Y2, Y3, Y3

	VPSRLQ	$7, Y1, Y8

	MOVQ	DX, DI
	RORXQ	$41, AX, R13
	RORXQ	$18, AX, R14
	ADDQ	frame_YFER(SP), R8
	ORQ	R10, DI
	MOVQ	BX, R15
	RORXQ	$34, DX, R12

	XORQ	R14, R13
	XORQ	CX, R15
	RORXQ	$14, AX, R14

	ANDQ	AX, R15
	XORQ	R14, R13
	RORXQ	$39, DX, R14
	ADDQ	R8, R11

	ANDQ	R9, DI
	XORQ	R12, R14
	RORXQ	$28, DX, R12

	XORQ	CX, R15
	XORQ	R12, R14
	MOVQ	DX, R12
	ANDQ	R10, R12

	ADDQ	R13, R15
	ORQ	R12, DI
	ADDQ	R14, R8

	ADDQ	R15, R11

	ADDQ	R15, R8
	ADDQ	DI, R8

	VPSRLQ	$8, Y1, Y2
	VPSLLQ	$(64-8), Y1, Y1
	VPOR	Y2, Y1, Y1

	VPXOR	Y8, Y3, Y3
	VPXOR	Y1, Y3, Y1

	VPADDQ	Y1, Y0, Y0

	VPERM2F128	$0x0, Y0, Y0, Y5

	VPAND	MASK_YMM_LO<>(SB), Y0, Y0

	VPERM2F128	$0x11, Y4, Y4, Y2
	VPSRLQ	$6, Y2, Y8

	MOVQ	R8, DI
	RORXQ	$41, R11, R13
	RORXQ	$18, R11, R14
	ADDQ	1*8+frame_YFER(SP), CX
	ORQ	R9, DI

	MOVQ	AX, R15
	RORXQ	$34, R8, R12
	XORQ	R14, R13
	XORQ	BX, R15

	RORXQ	$14, R11, R14
	XORQ	R14, R13
	RORXQ	$39, R8, R14
	ANDQ	R11, R15
	ADDQ	CX, R10

	ANDQ	DX, DI
	XORQ	R12, R14

	RORXQ	$28, R8, R12
	XORQ	BX, R15

	XORQ	R12, R14
	MOVQ	R8, R12
	ANDQ	R9, R12
	ADDQ	R13, R15

	ORQ	R12, DI
	ADDQ	R14, CX

	ADDQ	R15, R10
	ADDQ	R15, CX
	ADDQ	DI, CX

	VPSRLQ	$19, Y2, Y3
	VPSLLQ	$(64-19), Y2, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8
	VPSRLQ	$61, Y2, Y3
	VPSLLQ	$(64-61), Y2, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8

	VPADDQ	Y8, Y5, Y5

	VPSRLQ	$6, Y5, Y8

	MOVQ	CX, DI
	RORXQ	$41, R10, R13
	ADDQ	2*8+frame_YFER(SP), BX

	RORXQ	$18, R10, R14
	ORQ	DX, DI
	MOVQ	R11, R15
	XORQ	AX, R15

	RORXQ	$34, CX, R12
	XORQ	R14, R13
	ANDQ	R10, R15

	RORXQ	$14, R10, R14
	ADDQ	BX, R9
	ANDQ	R8, DI

	XORQ	R14, R13
	RORXQ	$39, CX, R14
	XORQ	AX, R15

	XORQ	R12, R14
	RORXQ	$28, CX, R12

	XORQ	R12, R14
	MOVQ	CX, R12
	ANDQ	DX, R12
	ADDQ	R13, R15

	ORQ	R12, DI
	ADDQ	R14, BX
	ADDQ	R15, R9
	ADDQ	R15, BX

	ADDQ	DI, BX

	VPSRLQ	$19, Y5, Y3
	VPSLLQ	$(64-19), Y5, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8
	VPSRLQ	$61, Y5, Y3
	VPSLLQ	$(64-61), Y5, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8

	VPADDQ	Y8, Y0, Y2

	VPBLENDD	$0xF0, Y2, Y5, Y5

	MOVQ	BX, DI
	RORXQ	$41, R9, R13
	RORXQ	$18, R9, R14
	ADDQ	3*8+frame_YFER(SP), AX
	ORQ	R8, DI

	MOVQ	R10, R15
	RORXQ	$34, BX, R12
	XORQ	R14, R13
	XORQ	R11, R15

	RORXQ	$14, R9, R14
	ANDQ	R9, R15
	ADDQ	AX, DX
	ANDQ	CX, DI

	XORQ	R14, R13
	XORQ	R11, R15

	RORXQ	$39, BX, R14
	ADDQ	R13, R15

	XORQ	R12, R14
	ADDQ	R15, DX

	RORXQ	$28, BX, R12

	XORQ	R12, R14
	MOVQ	BX, R12
	ANDQ	R8, R12
	ORQ	R12, DI

	ADDQ	R14, AX
	ADDQ	R15, AX
	ADDQ	DI, AX
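
	// Second half of the loop1 body follows: the same 4-round/schedule
	// interleaving repeated for Y6 and Y7, so one pass of loop1 performs
	// 16 rounds while scheduling 16 new message qwords. Schematically
	// (comment-only sketch, not real code):
	//
	//	for each Yi in {Y4, Y5, Y6, Y7}:
	//		frame_YFER = K[t..t+3] + W[t..t+3]   // VPADDQ + VMOVDQU spill
	//		4 scalar rounds consume frame_YFER, interleaved with the
	//		vector sigma0/sigma1 pipeline computing W[t+16..t+19] into Yi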

	VPADDQ	2*32(BP), Y6, Y0
	VMOVDQU	Y0, frame_YFER(SP)

	MY_VPALIGNR(Y0, Y5, Y4, 8)

	VPADDQ	Y6, Y0, Y0

	MY_VPALIGNR(Y1, Y7, Y6, 8)

	VPSRLQ	$1, Y1, Y2
	VPSLLQ	$(64-1), Y1, Y3
	VPOR	Y2, Y3, Y3

	VPSRLQ	$7, Y1, Y8

	MOVQ	AX, DI
	RORXQ	$41, DX, R13
	RORXQ	$18, DX, R14
	ADDQ	frame_YFER(SP), R11
	ORQ	CX, DI
	MOVQ	R9, R15
	RORXQ	$34, AX, R12

	XORQ	R14, R13
	XORQ	R10, R15
	RORXQ	$14, DX, R14

	ANDQ	DX, R15
	XORQ	R14, R13
	RORXQ	$39, AX, R14
	ADDQ	R11, R8

	ANDQ	BX, DI
	XORQ	R12, R14
	RORXQ	$28, AX, R12

	XORQ	R10, R15
	XORQ	R12, R14
	MOVQ	AX, R12
	ANDQ	CX, R12

	ADDQ	R13, R15
	ORQ	R12, DI
	ADDQ	R14, R11

	ADDQ	R15, R8

	ADDQ	R15, R11
	ADDQ	DI, R11

	VPSRLQ	$8, Y1, Y2
	VPSLLQ	$(64-8), Y1, Y1
	VPOR	Y2, Y1, Y1

	VPXOR	Y8, Y3, Y3
	VPXOR	Y1, Y3, Y1

	VPADDQ	Y1, Y0, Y0

	VPERM2F128	$0x0, Y0, Y0, Y6

	VPAND	MASK_YMM_LO<>(SB), Y0, Y0

	VPERM2F128	$0x11, Y5, Y5, Y2
	VPSRLQ	$6, Y2, Y8

	MOVQ	R11, DI
	RORXQ	$41, R8, R13
	RORXQ	$18, R8, R14
	ADDQ	1*8+frame_YFER(SP), R10
	ORQ	BX, DI

	MOVQ	DX, R15
	RORXQ	$34, R11, R12
	XORQ	R14, R13
	XORQ	R9, R15

	RORXQ	$14, R8, R14
	XORQ	R14, R13
	RORXQ	$39, R11, R14
	ANDQ	R8, R15
	ADDQ	R10, CX

	ANDQ	AX, DI
	XORQ	R12, R14

	RORXQ	$28, R11, R12
	XORQ	R9, R15

	XORQ	R12, R14
	MOVQ	R11, R12
	ANDQ	BX, R12
	ADDQ	R13, R15

	ORQ	R12, DI
	ADDQ	R14, R10

	ADDQ	R15, CX
	ADDQ	R15, R10
	ADDQ	DI, R10

	VPSRLQ	$19, Y2, Y3
	VPSLLQ	$(64-19), Y2, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8
	VPSRLQ	$61, Y2, Y3
	VPSLLQ	$(64-61), Y2, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8

	VPADDQ	Y8, Y6, Y6

	VPSRLQ	$6, Y6, Y8

	MOVQ	R10, DI
	RORXQ	$41, CX, R13
	ADDQ	2*8+frame_YFER(SP), R9

	RORXQ	$18, CX, R14
	ORQ	AX, DI
	MOVQ	R8, R15
	XORQ	DX, R15

	RORXQ	$34, R10, R12
	XORQ	R14, R13
	ANDQ	CX, R15

	RORXQ	$14, CX, R14
	ADDQ	R9, BX
	ANDQ	R11, DI

	XORQ	R14, R13
	RORXQ	$39, R10, R14
	XORQ	DX, R15

	XORQ	R12, R14
	RORXQ	$28, R10, R12

	XORQ	R12, R14
	MOVQ	R10, R12
	ANDQ	AX, R12
	ADDQ	R13, R15

	ORQ	R12, DI
	ADDQ	R14, R9
	ADDQ	R15, BX
	ADDQ	R15, R9

	ADDQ	DI, R9

	VPSRLQ	$19, Y6, Y3
	VPSLLQ	$(64-19), Y6, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8
	VPSRLQ	$61, Y6, Y3
	VPSLLQ	$(64-61), Y6, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8

	VPADDQ	Y8, Y0, Y2

	VPBLENDD	$0xF0, Y2, Y6, Y6

	MOVQ	R9, DI
	RORXQ	$41, BX, R13
	RORXQ	$18, BX, R14
	ADDQ	3*8+frame_YFER(SP), DX
	ORQ	R11, DI

	MOVQ	CX, R15
	RORXQ	$34, R9, R12
	XORQ	R14, R13
	XORQ	R8, R15

	RORXQ	$14, BX, R14
	ANDQ	BX, R15
	ADDQ	DX, AX
	ANDQ	R10, DI

	XORQ	R14, R13
	XORQ	R8, R15

	RORXQ	$39, R9, R14
	ADDQ	R13, R15

	XORQ	R12, R14
	ADDQ	R15, AX

	RORXQ	$28, R9, R12

	XORQ	R12, R14
	MOVQ	R9, R12
	ANDQ	R11, R12
	ORQ	R12, DI

	ADDQ	R14, DX
	ADDQ	R15, DX
	ADDQ	DI, DX

	VPADDQ	3*32(BP), Y7, Y0
	VMOVDQU	Y0, frame_YFER(SP)
	ADDQ	$(4*32), BP

	MY_VPALIGNR(Y0, Y6, Y5, 8)

	VPADDQ	Y7, Y0, Y0

	MY_VPALIGNR(Y1, Y4, Y7, 8)

	VPSRLQ	$1, Y1, Y2
	VPSLLQ	$(64-1), Y1, Y3
	VPOR	Y2, Y3, Y3

	VPSRLQ	$7, Y1, Y8

	MOVQ	DX, DI
	RORXQ	$41, AX, R13
	RORXQ	$18, AX, R14
	ADDQ	frame_YFER(SP), R8
	ORQ	R10, DI
	MOVQ	BX, R15
	RORXQ	$34, DX, R12

	XORQ	R14, R13
	XORQ	CX, R15
	RORXQ	$14, AX, R14

	ANDQ	AX, R15
	XORQ	R14, R13
	RORXQ	$39, DX, R14
	ADDQ	R8, R11

	ANDQ	R9, DI
	XORQ	R12, R14
	RORXQ	$28, DX, R12

	XORQ	CX, R15
	XORQ	R12, R14
	MOVQ	DX, R12
	ANDQ	R10, R12

	ADDQ	R13, R15
	ORQ	R12, DI
	ADDQ	R14, R8

	ADDQ	R15, R11

	ADDQ	R15, R8
	ADDQ	DI, R8

	VPSRLQ	$8, Y1, Y2
	VPSLLQ	$(64-8), Y1, Y1
	VPOR	Y2, Y1, Y1

	VPXOR	Y8, Y3, Y3
	VPXOR	Y1, Y3, Y1

	VPADDQ	Y1, Y0, Y0

	VPERM2F128	$0x0, Y0, Y0, Y7

	VPAND	MASK_YMM_LO<>(SB), Y0, Y0

	VPERM2F128	$0x11, Y6, Y6, Y2
	VPSRLQ	$6, Y2, Y8

	MOVQ	R8, DI
	RORXQ	$41, R11, R13
	RORXQ	$18, R11, R14
	ADDQ	1*8+frame_YFER(SP), CX
	ORQ	R9, DI

	MOVQ	AX, R15
	RORXQ	$34, R8, R12
	XORQ	R14, R13
	XORQ	BX, R15

	RORXQ	$14, R11, R14
	XORQ	R14, R13
	RORXQ	$39, R8, R14
	ANDQ	R11, R15
	ADDQ	CX, R10

	ANDQ	DX, DI
	XORQ	R12, R14

	RORXQ	$28, R8, R12
	XORQ	BX, R15

	XORQ	R12, R14
	MOVQ	R8, R12
	ANDQ	R9, R12
	ADDQ	R13, R15

	ORQ	R12, DI
	ADDQ	R14, CX

	ADDQ	R15, R10
	ADDQ	R15, CX
	ADDQ	DI, CX

	VPSRLQ	$19, Y2, Y3
	VPSLLQ	$(64-19), Y2, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8
	VPSRLQ	$61, Y2, Y3
	VPSLLQ	$(64-61), Y2, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8

	VPADDQ	Y8, Y7, Y7

	VPSRLQ	$6, Y7, Y8

	MOVQ	CX, DI
	RORXQ	$41, R10, R13
	ADDQ	2*8+frame_YFER(SP), BX

	RORXQ	$18, R10, R14
	ORQ	DX, DI
	MOVQ	R11, R15
	XORQ	AX, R15

	RORXQ	$34, CX, R12
	XORQ	R14, R13
	ANDQ	R10, R15

	RORXQ	$14, R10, R14
	ADDQ	BX, R9
	ANDQ	R8, DI

	XORQ	R14, R13
	RORXQ	$39, CX, R14
	XORQ	AX, R15

	XORQ	R12, R14
	RORXQ	$28, CX, R12

	XORQ	R12, R14
	MOVQ	CX, R12
	ANDQ	DX, R12
	ADDQ	R13, R15

	ORQ	R12, DI
	ADDQ	R14, BX
	ADDQ	R15, R9
	ADDQ	R15, BX

	ADDQ	DI, BX

	VPSRLQ	$19, Y7, Y3
	VPSLLQ	$(64-19), Y7, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8
	VPSRLQ	$61, Y7, Y3
	VPSLLQ	$(64-61), Y7, Y1
	VPOR	Y1, Y3, Y3
	VPXOR	Y3, Y8, Y8

	VPADDQ	Y8, Y0, Y2

	VPBLENDD	$0xF0, Y2, Y7, Y7

	MOVQ	BX, DI
	RORXQ	$41, R9, R13
	RORXQ	$18, R9, R14
	ADDQ	3*8+frame_YFER(SP), AX
	ORQ	R8, DI

	MOVQ	R10, R15
	RORXQ	$34, BX, R12
	XORQ	R14, R13
	XORQ	R11, R15

	RORXQ	$14, R9, R14
	ANDQ	R9, R15
	ADDQ	AX, DX
	ANDQ	CX, DI

	XORQ	R14, R13
	XORQ	R11, R15

	RORXQ	$39, BX, R14
	ADDQ	R13, R15

	XORQ	R12, R14
	ADDQ	R15, DX

	RORXQ	$28, BX, R12

	XORQ	R12, R14
	MOVQ	BX, R12
	ANDQ	R8, R12
	ORQ	R12, DI

	ADDQ	R14, AX
	ADDQ	R15, AX
	ADDQ	DI, AX

	SUBQ	$1, frame_SRND(SP)
	JNE	loop1

	MOVQ	$2, frame_SRND(SP)
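
// Final 16 rounds: the last scheduling pass of loop1 left W64..W79 in
// Y4..Y7, so loop2 only adds the remaining round constants and runs the
// rounds, 8 per iteration, copying Y6/Y7 down to Y4/Y5 in between. In
// rough Go terms (a sketch, reusing the helpers sketched earlier):
//
//	for t := 64; t < 80; t++ {
//		t1 := h + bigSigma1(e) + ch(e, f, g) + _K[t] + w[t]
//		t2 := bigSigma0(a) + maj(a, b, c)
//		h, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2
//	}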
loop2:
	VPADDQ	(BP), Y4, Y0
	VMOVDQU	Y0, frame_YFER(SP)

	MOVQ	R9, R15
	RORXQ	$41, DX, R13
	RORXQ	$18, DX, R14
	XORQ	R10, R15

	XORQ	R14, R13
	RORXQ	$14, DX, R14
	ANDQ	DX, R15

	XORQ	R14, R13
	RORXQ	$34, AX, R12
	XORQ	R10, R15
	RORXQ	$39, AX, R14
	MOVQ	AX, DI

	XORQ	R12, R14
	RORXQ	$28, AX, R12
	ADDQ	frame_YFER(SP), R11
	ORQ	CX, DI

	XORQ	R12, R14
	MOVQ	AX, R12
	ANDQ	BX, DI
	ANDQ	CX, R12
	ADDQ	R13, R15

	ADDQ	R11, R8
	ORQ	R12, DI
	ADDQ	R14, R11

	ADDQ	R15, R8

	ADDQ	R15, R11
	MOVQ	DX, R15
	RORXQ	$41, R8, R13
	RORXQ	$18, R8, R14
	XORQ	R9, R15

	XORQ	R14, R13
	RORXQ	$14, R8, R14
	ANDQ	R8, R15
	ADDQ	DI, R11

	XORQ	R14, R13
	RORXQ	$34, R11, R12
	XORQ	R9, R15
	RORXQ	$39, R11, R14
	MOVQ	R11, DI

	XORQ	R12, R14
	RORXQ	$28, R11, R12
	ADDQ	8*1+frame_YFER(SP), R10
	ORQ	BX, DI

	XORQ	R12, R14
	MOVQ	R11, R12
	ANDQ	AX, DI
	ANDQ	BX, R12
	ADDQ	R13, R15

	ADDQ	R10, CX
	ORQ	R12, DI
	ADDQ	R14, R10

	ADDQ	R15, CX

	ADDQ	R15, R10
	MOVQ	R8, R15
	RORXQ	$41, CX, R13
	RORXQ	$18, CX, R14
	XORQ	DX, R15

	XORQ	R14, R13
	RORXQ	$14, CX, R14
	ANDQ	CX, R15
	ADDQ	DI, R10

	XORQ	R14, R13
	RORXQ	$34, R10, R12
	XORQ	DX, R15
	RORXQ	$39, R10, R14
	MOVQ	R10, DI

	XORQ	R12, R14
	RORXQ	$28, R10, R12
	ADDQ	8*2+frame_YFER(SP), R9
	ORQ	AX, DI

	XORQ	R12, R14
	MOVQ	R10, R12
	ANDQ	R11, DI
	ANDQ	AX, R12
	ADDQ	R13, R15

	ADDQ	R9, BX
	ORQ	R12, DI
	ADDQ	R14, R9

	ADDQ	R15, BX

	ADDQ	R15, R9
	MOVQ	CX, R15
	RORXQ	$41, BX, R13
	RORXQ	$18, BX, R14
	XORQ	R8, R15

	XORQ	R14, R13
	RORXQ	$14, BX, R14
	ANDQ	BX, R15
	ADDQ	DI, R9

	XORQ	R14, R13
	RORXQ	$34, R9, R12
	XORQ	R8, R15
	RORXQ	$39, R9, R14
	MOVQ	R9, DI

	XORQ	R12, R14
	RORXQ	$28, R9, R12
	ADDQ	8*3+frame_YFER(SP), DX
	ORQ	R11, DI

	XORQ	R12, R14
	MOVQ	R9, R12
	ANDQ	R10, DI
	ANDQ	R11, R12
	ADDQ	R13, R15

	ADDQ	DX, AX
	ORQ	R12, DI
	ADDQ	R14, DX

	ADDQ	R15, AX

	ADDQ	R15, DX

	ADDQ	DI, DX

	VPADDQ	1*32(BP), Y5, Y0
	VMOVDQU	Y0, frame_YFER(SP)
	ADDQ	$(2*32), BP

	MOVQ	BX, R15
	RORXQ	$41, AX, R13
	RORXQ	$18, AX, R14
	XORQ	CX, R15

	XORQ	R14, R13
	RORXQ	$14, AX, R14
	ANDQ	AX, R15

	XORQ	R14, R13
	RORXQ	$34, DX, R12
	XORQ	CX, R15
	RORXQ	$39, DX, R14
	MOVQ	DX, DI

	XORQ	R12, R14
	RORXQ	$28, DX, R12
	ADDQ	frame_YFER(SP), R8
	ORQ	R10, DI

	XORQ	R12, R14
	MOVQ	DX, R12
	ANDQ	R9, DI
	ANDQ	R10, R12
	ADDQ	R13, R15

	ADDQ	R8, R11
	ORQ	R12, DI
	ADDQ	R14, R8

	ADDQ	R15, R11

	ADDQ	R15, R8
	MOVQ	AX, R15
	RORXQ	$41, R11, R13
	RORXQ	$18, R11, R14
	XORQ	BX, R15

	XORQ	R14, R13
	RORXQ	$14, R11, R14
	ANDQ	R11, R15
	ADDQ	DI, R8

	XORQ	R14, R13
	RORXQ	$34, R8, R12
	XORQ	BX, R15
	RORXQ	$39, R8, R14
	MOVQ	R8, DI

	XORQ	R12, R14
	RORXQ	$28, R8, R12
	ADDQ	8*1+frame_YFER(SP), CX
	ORQ	R9, DI

	XORQ	R12, R14
	MOVQ	R8, R12
	ANDQ	DX, DI
	ANDQ	R9, R12
	ADDQ	R13, R15

	ADDQ	CX, R10
	ORQ	R12, DI
	ADDQ	R14, CX

	ADDQ	R15, R10

	ADDQ	R15, CX
	MOVQ	R11, R15
	RORXQ	$41, R10, R13
	RORXQ	$18, R10, R14
	XORQ	AX, R15

	XORQ	R14, R13
	RORXQ	$14, R10, R14
	ANDQ	R10, R15
	ADDQ	DI, CX

	XORQ	R14, R13
	RORXQ	$34, CX, R12
	XORQ	AX, R15
	RORXQ	$39, CX, R14
	MOVQ	CX, DI

	XORQ	R12, R14
	RORXQ	$28, CX, R12
	ADDQ	8*2+frame_YFER(SP), BX
	ORQ	DX, DI

	XORQ	R12, R14
	MOVQ	CX, R12
	ANDQ	R8, DI
	ANDQ	DX, R12
	ADDQ	R13, R15

	ADDQ	BX, R9
	ORQ	R12, DI
	ADDQ	R14, BX

	ADDQ	R15, R9

	ADDQ	R15, BX
	MOVQ	R10, R15
	RORXQ	$41, R9, R13
	RORXQ	$18, R9, R14
	XORQ	R11, R15

	XORQ	R14, R13
	RORXQ	$14, R9, R14
	ANDQ	R9, R15
	ADDQ	DI, BX

	XORQ	R14, R13
	RORXQ	$34, BX, R12
	XORQ	R11, R15
	RORXQ	$39, BX, R14
	MOVQ	BX, DI

	XORQ	R12, R14
	RORXQ	$28, BX, R12
	ADDQ	8*3+frame_YFER(SP), AX
	ORQ	R8, DI

	XORQ	R12, R14
	MOVQ	BX, R12
	ANDQ	CX, DI
	ANDQ	R8, R12
	ADDQ	R13, R15

	ADDQ	AX, DX
	ORQ	R12, DI
	ADDQ	R14, AX

	ADDQ	R15, DX

	ADDQ	R15, AX

	ADDQ	DI, AX

	VMOVDQU	Y6, Y4
	VMOVDQU	Y7, Y5

	SUBQ	$1, frame_SRND(SP)
	JNE	loop2

	addm(8*0(SI),AX)
	addm(8*1(SI),BX)
	addm(8*2(SI),CX)
	addm(8*3(SI),R8)
	addm(8*4(SI),DX)
	addm(8*5(SI),R9)
	addm(8*6(SI),R10)
	addm(8*7(SI),R11)

	MOVQ	frame_INP(SP), DI
	ADDQ	$128, DI
	CMPQ	DI, frame_INPEND(SP)
	JNE	loop0

done_hash:
	VZEROUPPER
	RET
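
// AVX2 has no 64-bit vector rotate, which is why every ROTR in the
// vector schedule above is synthesized from two shifts and an OR
// (Go sketch of the identity):
//
//	func rotr(x uint64, n uint) uint64 {
//		return x>>n | x<<(64-n) // VPSRLQ $n + VPSLLQ $(64-n) + VPOR
//	}
//
// which is bits.RotateLeft64(x, -int(n)) in Go; the scalar rounds use
// RORXQ and need no such workaround.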