// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// SHA512 block routine. See sha512block.go for Go equivalent.
//
// The algorithm is detailed in FIPS 180-4:
//
//  http://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
//
// Wt = Mt; for 0 <= t <= 15
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
//
// a = H0
// b = H1
// c = H2
// d = H3
// e = H4
// f = H5
// g = H6
// h = H7
//
// for t = 0 to 79 {
//    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
//    T2 = BIGSIGMA0(a) + Maj(a,b,c)
//    h = g
//    g = f
//    f = e
//    e = d + T1
//    d = c
//    c = b
//    b = a
//    a = T1 + T2
// }
//
// H0 = a + H0
// H1 = b + H1
// H2 = c + H2
// H3 = d + H3
// H4 = e + H4
// H5 = f + H5
// H6 = g + H6
// H7 = h + H7

// Wt = Mt; for 0 <= t <= 15
#define MSGSCHEDULE0(index) \
	MOVQ (index*8)(SI), AX; \
	BSWAPQ AX; \
	MOVQ AX, (index*8)(BP)

// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
// SIGMA0(x) = ROTR(1,x) XOR ROTR(8,x) XOR SHR(7,x)
// SIGMA1(x) = ROTR(19,x) XOR ROTR(61,x) XOR SHR(6,x)
#define MSGSCHEDULE1(index) \
	MOVQ ((index-2)*8)(BP), AX; \
	MOVQ AX, CX; \
	RORQ $19, AX; \
	MOVQ CX, DX; \
	RORQ $61, CX; \
	SHRQ $6, DX; \
	MOVQ ((index-15)*8)(BP), BX; \
	XORQ CX, AX; \
	MOVQ BX, CX; \
	XORQ DX, AX; \
	RORQ $1, BX; \
	MOVQ CX, DX; \
	SHRQ $7, DX; \
	RORQ $8, CX; \
	ADDQ ((index-7)*8)(BP), AX; \
	XORQ CX, BX; \
	XORQ DX, BX; \
	ADDQ ((index-16)*8)(BP), BX; \
	ADDQ BX, AX; \
	MOVQ AX, ((index)*8)(BP)

// Calculate T1 in AX - uses AX, CX and DX registers.
// h is also used as an accumulator. Wt is passed in AX.
// T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
// BIGSIGMA1(x) = ROTR(14,x) XOR ROTR(18,x) XOR ROTR(41,x)
// Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
#define SHA512T1(const, e, f, g, h) \
	MOVQ $const, DX; \
	ADDQ AX, h; \
	MOVQ e, AX; \
	ADDQ DX, h; \
	MOVQ e, CX; \
	RORQ $14, AX; \
	MOVQ e, DX; \
	RORQ $18, CX; \
	XORQ CX, AX; \
	MOVQ e, CX; \
	RORQ $41, DX; \
	ANDQ f, CX; \
	XORQ AX, DX; \
	MOVQ e, AX; \
	NOTQ AX; \
	ADDQ DX, h; \
	ANDQ g, AX; \
	XORQ CX, AX; \
	ADDQ h, AX

// Calculate T2 in BX - uses BX, CX, DX and DI registers.
// T2 = BIGSIGMA0(a) + Maj(a, b, c)
// BIGSIGMA0(x) = ROTR(28,x) XOR ROTR(34,x) XOR ROTR(39,x)
// Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
#define SHA512T2(a, b, c) \
	MOVQ a, DI; \
	MOVQ c, BX; \
	RORQ $28, DI; \
	MOVQ a, DX; \
	ANDQ b, BX; \
	RORQ $34, DX; \
	MOVQ a, CX; \
	ANDQ c, CX; \
	XORQ DX, DI; \
	XORQ CX, BX; \
	MOVQ a, DX; \
	MOVQ b, CX; \
	RORQ $39, DX; \
	ANDQ a, CX; \
	XORQ CX, BX; \
	XORQ DX, DI; \
	ADDQ DI, BX
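// For reference, a rough Go sketch of the T1/T2 math that the two macros
// above implement (illustrative only; the authoritative Go version lives in
// sha512block.go, and these helper names are invented for this sketch):
//
//	import "math/bits"
//
//	func ch(x, y, z uint64) uint64  { return (x & y) ^ (^x & z) }
//	func maj(x, y, z uint64) uint64 { return (x & y) ^ (x & z) ^ (y & z) }
//
//	func bigSigma0(x uint64) uint64 {
//		return bits.RotateLeft64(x, -28) ^ bits.RotateLeft64(x, -34) ^ bits.RotateLeft64(x, -39)
//	}
//
//	func bigSigma1(x uint64) uint64 {
//		return bits.RotateLeft64(x, -14) ^ bits.RotateLeft64(x, -18) ^ bits.RotateLeft64(x, -41)
//	}
//
//	// t1 folds Kt and Wt in, exactly as SHA512T1 accumulates them into h.
//	func t1(h, e, f, g, k, w uint64) uint64 { return h + bigSigma1(e) + ch(e, f, g) + k + w }
//	func t2(a, b, c uint64) uint64          { return bigSigma0(a) + maj(a, b, c) }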
// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
// The values for e and a are stored in d and h, ready for rotation.
#define SHA512ROUND(index, const, a, b, c, d, e, f, g, h) \
	SHA512T1(const, e, f, g, h); \
	SHA512T2(a, b, c); \
	MOVQ BX, h; \
	ADDQ AX, d; \
	ADDQ AX, h

#define SHA512ROUND0(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE0(index); \
	SHA512ROUND(index, const, a, b, c, d, e, f, g, h)

#define SHA512ROUND1(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE1(index); \
	SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
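// The working variables are never physically rotated: each round below
// passes the same eight registers to SHA512ROUND in an order shifted by one.
// In Go terms, a single round is equivalent to (an illustrative sketch
// reusing the helpers above):
//
//	t1 := h + bigSigma1(e) + ch(e, f, g) + k + w
//	t2 := bigSigma0(a) + maj(a, b, c)
//	h, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2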
TEXT ·blockAMD64(SB),0,$648-32
	MOVQ p_base+8(FP), SI
	MOVQ p_len+16(FP), DX
	SHRQ $7, DX
	SHLQ $7, DX	// DX = len(p) &^ 127, i.e. whole 128-byte blocks only

	LEAQ (SI)(DX*1), DI
	MOVQ DI, 640(SP)	// end-of-input pointer, stored past the 80-qword schedule
	CMPQ SI, DI
	JEQ  end

	MOVQ dig+0(FP), BP
	MOVQ (0*8)(BP), R8	// a = H0
	MOVQ (1*8)(BP), R9	// b = H1
	MOVQ (2*8)(BP), R10	// c = H2
	MOVQ (3*8)(BP), R11	// d = H3
	MOVQ (4*8)(BP), R12	// e = H4
	MOVQ (5*8)(BP), R13	// f = H5
	MOVQ (6*8)(BP), R14	// g = H6
	MOVQ (7*8)(BP), R15	// h = H7

loop:
	MOVQ SP, BP	// message schedule

	SHA512ROUND0(0, 0x428a2f98d728ae22, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND0(1, 0x7137449123ef65cd, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND0(2, 0xb5c0fbcfec4d3b2f, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND0(3, 0xe9b5dba58189dbbc, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND0(4, 0x3956c25bf348b538, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND0(5, 0x59f111f1b605d019, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND0(6, 0x923f82a4af194f9b, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND0(7, 0xab1c5ed5da6d8118, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND0(8, 0xd807aa98a3030242, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND0(9, 0x12835b0145706fbe, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND0(10, 0x243185be4ee4b28c, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND0(11, 0x550c7dc3d5ffb4e2, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND0(12, 0x72be5d74f27b896f, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND0(13, 0x80deb1fe3b1696b1, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND0(14, 0x9bdc06a725c71235, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND0(15, 0xc19bf174cf692694, R9, R10, R11, R12, R13, R14, R15, R8)

	SHA512ROUND1(16, 0xe49b69c19ef14ad2, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(17, 0xefbe4786384f25e3, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(18, 0x0fc19dc68b8cd5b5, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(19, 0x240ca1cc77ac9c65, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(20, 0x2de92c6f592b0275, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(21, 0x4a7484aa6ea6e483, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(22, 0x5cb0a9dcbd41fbd4, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(23, 0x76f988da831153b5, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(24, 0x983e5152ee66dfab, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(25, 0xa831c66d2db43210, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(26, 0xb00327c898fb213f, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(27, 0xbf597fc7beef0ee4, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(28, 0xc6e00bf33da88fc2, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(29, 0xd5a79147930aa725, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(30, 0x06ca6351e003826f, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(31, 0x142929670a0e6e70, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(32, 0x27b70a8546d22ffc, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(33, 0x2e1b21385c26c926, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(34, 0x4d2c6dfc5ac42aed, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(35, 0x53380d139d95b3df, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(36, 0x650a73548baf63de, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(37, 0x766a0abb3c77b2a8, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(38, 0x81c2c92e47edaee6, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(39, 0x92722c851482353b, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(40, 0xa2bfe8a14cf10364, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(41, 0xa81a664bbc423001, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(42, 0xc24b8b70d0f89791, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(43, 0xc76c51a30654be30, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(44, 0xd192e819d6ef5218, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(45, 0xd69906245565a910, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(46, 0xf40e35855771202a, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(47, 0x106aa07032bbd1b8, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(48, 0x19a4c116b8d2d0c8, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(49, 0x1e376c085141ab53, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(50, 0x2748774cdf8eeb99, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(51, 0x34b0bcb5e19b48a8, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(52, 0x391c0cb3c5c95a63, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(53, 0x4ed8aa4ae3418acb, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(54, 0x5b9cca4f7763e373, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(55, 0x682e6ff3d6b2b8a3, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(56, 0x748f82ee5defb2fc, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(57, 0x78a5636f43172f60, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(58, 0x84c87814a1f0ab72, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(59, 0x8cc702081a6439ec, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(60, 0x90befffa23631e28, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(61, 0xa4506cebde82bde9, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(62, 0xbef9a3f7b2c67915, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(63, 0xc67178f2e372532b, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(64, 0xca273eceea26619c, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(65, 0xd186b8c721c0c207, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(66, 0xeada7dd6cde0eb1e, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(67, 0xf57d4f7fee6ed178, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(68, 0x06f067aa72176fba, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(69, 0x0a637dc5a2c898a6, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(70, 0x113f9804bef90dae, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(71, 0x1b710b35131c471b, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA512ROUND1(72, 0x28db77f523047d84, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA512ROUND1(73, 0x32caab7b40c72493, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA512ROUND1(74, 0x3c9ebe0a15c9bebc, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA512ROUND1(75, 0x431d67c49c100d4c, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA512ROUND1(76, 0x4cc5d4becb3e42b6, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA512ROUND1(77, 0x597f299cfc657e2a, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA512ROUND1(78, 0x5fcb6fab3ad6faec, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA512ROUND1(79, 0x6c44198c4a475817, R9, R10, R11, R12, R13, R14, R15, R8)

	MOVQ dig+0(FP), BP
	ADDQ (0*8)(BP), R8	// H0 = a + H0
	MOVQ R8, (0*8)(BP)
	ADDQ (1*8)(BP), R9	// H1 = b + H1
	MOVQ R9, (1*8)(BP)
	ADDQ (2*8)(BP), R10	// H2 = c + H2
	MOVQ R10, (2*8)(BP)
	ADDQ (3*8)(BP), R11	// H3 = d + H3
	MOVQ R11, (3*8)(BP)
	ADDQ (4*8)(BP), R12	// H4 = e + H4
	MOVQ R12, (4*8)(BP)
	ADDQ (5*8)(BP), R13	// H5 = f + H5
	MOVQ R13, (5*8)(BP)
	ADDQ (6*8)(BP), R14	// H6 = g + H6
	MOVQ R14, (6*8)(BP)
	ADDQ (7*8)(BP), R15	// H7 = h + H7
	MOVQ R15, (7*8)(BP)

	ADDQ $128, SI
	CMPQ SI, 640(SP)
	JB   loop

end:
	RET
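// Taken together, the routine above corresponds to roughly the following Go
// (an illustrative sketch built from the helpers above; sha512block.go holds
// the authoritative equivalent, and _K is the package's round-constant
// table). The asm interleaves the schedule with the rounds, while this
// sketch computes it up front; the results are identical:
//
//	import "encoding/binary"
//
//	func sigma0(x uint64) uint64 {
//		return bits.RotateLeft64(x, -1) ^ bits.RotateLeft64(x, -8) ^ (x >> 7)
//	}
//
//	func sigma1(x uint64) uint64 {
//		return bits.RotateLeft64(x, -19) ^ bits.RotateLeft64(x, -61) ^ (x >> 6)
//	}
//
//	func blockSketch(dig *digest, p []byte) {
//		var w [80]uint64
//		for len(p) >= 128 {
//			for t := 0; t < 16; t++ {
//				w[t] = binary.BigEndian.Uint64(p[t*8:]) // MSGSCHEDULE0
//			}
//			for t := 16; t < 80; t++ {
//				w[t] = sigma1(w[t-2]) + w[t-7] + sigma0(w[t-15]) + w[t-16] // MSGSCHEDULE1
//			}
//			a, b, c, d := dig.h[0], dig.h[1], dig.h[2], dig.h[3]
//			e, f, g, h := dig.h[4], dig.h[5], dig.h[6], dig.h[7]
//			for t := 0; t < 80; t++ {
//				t1 := h + bigSigma1(e) + ch(e, f, g) + _K[t] + w[t]
//				t2 := bigSigma0(a) + maj(a, b, c)
//				h, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2
//			}
//			dig.h[0] += a
//			dig.h[1] += b
//			dig.h[2] += c
//			dig.h[3] += d
//			dig.h[4] += e
//			dig.h[5] += f
//			dig.h[6] += g
//			dig.h[7] += h
//			p = p[128:]
//		}
//	}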
// The version below is based on the "Fast SHA512 Implementations on Intel
// Architecture Processors" white paper:
// http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-sha512-implementations-ia-processors-paper.pdf
// AVX2 version by Intel; the same algorithm is in the Linux kernel:
// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha512-avx2-asm.S

// James Guilford <james.guilford@intel.com>
// Kirk Yap <kirk.s.yap@intel.com>
// Tim Chen <tim.c.chen@linux.intel.com>
// David Cote <david.m.cote@intel.com>
// Aleksey Sidorov <aleksey.sidorov@intel.com>

#define YFER_SIZE (4*8)
#define SRND_SIZE (1*8)
#define INP_SIZE (1*8)

#define frame_YFER (0)
#define frame_SRND (frame_YFER + YFER_SIZE)
#define frame_INP (frame_SRND + SRND_SIZE)
#define frame_INPEND (frame_INP + INP_SIZE)

#define addm(p1, p2) \
	ADDQ p1, p2; \
	MOVQ p2, p1

#define COPY_YMM_AND_BSWAP(p1, p2, p3) \
	VMOVDQU p2, p1; \
	VPSHUFB p3, p1, p1

#define MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) \
	VPERM2F128 $0x3, YSRC2, YSRC1, YDST; \
	VPALIGNR $RVAL, YSRC2, YDST, YDST

DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x00(SB)/8, $0x0001020304050607
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x08(SB)/8, $0x08090a0b0c0d0e0f
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x10(SB)/8, $0x1011121314151617
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x18(SB)/8, $0x18191a1b1c1d1e1f

GLOBL PSHUFFLE_BYTE_FLIP_MASK<>(SB), (NOPTR+RODATA), $32

DATA MASK_YMM_LO<>+0x00(SB)/8, $0x0000000000000000
DATA MASK_YMM_LO<>+0x08(SB)/8, $0x0000000000000000
DATA MASK_YMM_LO<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA MASK_YMM_LO<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF

GLOBL MASK_YMM_LO<>(SB), (NOPTR+RODATA), $32
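// MY_VPALIGNR(YDST, YSRC1, YSRC2, 8) emulates a 256-bit PALIGNR: reading
// YSRC1:YSRC2 as eight consecutive qwords, it yields the four qwords
// {YSRC2[1], YSRC2[2], YSRC2[3], YSRC1[0]}. With Y4..Y7 holding the last 16
// scheduled qwords, that is how loop1 below gathers the Wt-15 and Wt-7
// vectors (for example, {W1,W2,W3,W4} and {W9,W10,W11,W12} on the first
// iteration). A rough Go model of the qword lanes (illustrative only):
//
//	// myVPALIGNR8 models MY_VPALIGNR with RVAL = 8 on [4]uint64 lanes.
//	func myVPALIGNR8(src1, src2 [4]uint64) [4]uint64 {
//		return [4]uint64{src2[1], src2[2], src2[3], src1[0]}
//	}
//
// PSHUFFLE_BYTE_FLIP_MASK, used by COPY_YMM_AND_BSWAP, reverses the bytes of
// each qword, converting the big-endian message words to host order.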
TEXT ·blockAVX2(SB), NOSPLIT, $56-32
	MOVQ dig+0(FP), SI
	MOVQ p_base+8(FP), DI
	MOVQ p_len+16(FP), DX

	SHRQ $7, DX
	SHLQ $7, DX	// DX = len(p) &^ 127; zero means no full block

	JZ   done_hash
	ADDQ DI, DX
	MOVQ DX, frame_INPEND(SP)

	MOVQ (0*8)(SI), AX
	MOVQ (1*8)(SI), BX
	MOVQ (2*8)(SI), CX
	MOVQ (3*8)(SI), R8
	MOVQ (4*8)(SI), DX
	MOVQ (5*8)(SI), R9
	MOVQ (6*8)(SI), R10
	MOVQ (7*8)(SI), R11

	VMOVDQU PSHUFFLE_BYTE_FLIP_MASK<>(SB), Y9

loop0:
	MOVQ ·_K+0(SB), BP	// BP = &_K[0] (data pointer of the _K slice)

	// byte swap first 16 qwords
	COPY_YMM_AND_BSWAP(Y4, (0*32)(DI), Y9)
	COPY_YMM_AND_BSWAP(Y5, (1*32)(DI), Y9)
	COPY_YMM_AND_BSWAP(Y6, (2*32)(DI), Y9)
	COPY_YMM_AND_BSWAP(Y7, (3*32)(DI), Y9)

	MOVQ DI, frame_INP(SP)

	// schedule the remaining 64 input qwords: 4 iterations, each
	// scheduling 16 qwords and retiring 16 rounds
	MOVQ $4, frame_SRND(SP)

loop1:
	VPADDQ  (BP), Y4, Y0
	VMOVDQU Y0, frame_YFER(SP)

	MY_VPALIGNR(Y0, Y7, Y6, 8)

	VPADDQ Y4, Y0, Y0

	MY_VPALIGNR(Y1, Y5, Y4, 8)

	VPSRLQ $1, Y1, Y2
	VPSLLQ $(64-1), Y1, Y3
	VPOR   Y2, Y3, Y3

	VPSRLQ $7, Y1, Y8

	MOVQ  AX, DI
	RORXQ $41, DX, R13
	RORXQ $18, DX, R14
	ADDQ  frame_YFER(SP), R11
	ORQ   CX, DI
	MOVQ  R9, R15
	RORXQ $34, AX, R12

	XORQ  R14, R13
	XORQ  R10, R15
	RORXQ $14, DX, R14

	ANDQ  DX, R15
	XORQ  R14, R13
	RORXQ $39, AX, R14
	ADDQ  R11, R8

	ANDQ  BX, DI
	XORQ  R12, R14
	RORXQ $28, AX, R12

	XORQ R10, R15
	XORQ R12, R14
	MOVQ AX, R12
	ANDQ CX, R12

	ADDQ R13, R15
	ORQ  R12, DI
	ADDQ R14, R11

	ADDQ R15, R8

	ADDQ R15, R11
	ADDQ DI, R11

	VPSRLQ $8, Y1, Y2
	VPSLLQ $(64-8), Y1, Y1
	VPOR   Y2, Y1, Y1

	VPXOR Y8, Y3, Y3
	VPXOR Y1, Y3, Y1

	VPADDQ Y1, Y0, Y0

	VPERM2F128 $0x0, Y0, Y0, Y4

	VPAND MASK_YMM_LO<>(SB), Y0, Y0

	VPERM2F128 $0x11, Y7, Y7, Y2
	VPSRLQ     $6, Y2, Y8

	MOVQ  R11, DI
	RORXQ $41, R8, R13
	RORXQ $18, R8, R14
	ADDQ  1*8+frame_YFER(SP), R10
	ORQ   BX, DI

	MOVQ  DX, R15
	RORXQ $34, R11, R12
	XORQ  R14, R13
	XORQ  R9, R15

	RORXQ $14, R8, R14
	XORQ  R14, R13
	RORXQ $39, R11, R14
	ANDQ  R8, R15
	ADDQ  R10, CX

	ANDQ AX, DI
	XORQ R12, R14

	RORXQ $28, R11, R12
	XORQ  R9, R15

	XORQ R12, R14
	MOVQ R11, R12
	ANDQ BX, R12
	ADDQ R13, R15

	ORQ  R12, DI
	ADDQ R14, R10

	ADDQ R15, CX
	ADDQ R15, R10
	ADDQ DI, R10

	VPSRLQ $19, Y2, Y3
	VPSLLQ $(64-19), Y2, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8
	VPSRLQ $61, Y2, Y3
	VPSLLQ $(64-61), Y2, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8

	VPADDQ Y8, Y4, Y4

	VPSRLQ $6, Y4, Y8

	MOVQ  R10, DI
	RORXQ $41, CX, R13
	ADDQ  2*8+frame_YFER(SP), R9

	RORXQ $18, CX, R14
	ORQ   AX, DI
	MOVQ  R8, R15
	XORQ  DX, R15

	RORXQ $34, R10, R12
	XORQ  R14, R13
	ANDQ  CX, R15

	RORXQ $14, CX, R14
	ADDQ  R9, BX
	ANDQ  R11, DI

	XORQ  R14, R13
	RORXQ $39, R10, R14
	XORQ  DX, R15

	XORQ  R12, R14
	RORXQ $28, R10, R12

	XORQ R12, R14
	MOVQ R10, R12
	ANDQ AX, R12
	ADDQ R13, R15

	ORQ  R12, DI
	ADDQ R14, R9
	ADDQ R15, BX
	ADDQ R15, R9

	ADDQ DI, R9

	VPSRLQ $19, Y4, Y3
	VPSLLQ $(64-19), Y4, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8
	VPSRLQ $61, Y4, Y3
	VPSLLQ $(64-61), Y4, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8

	VPADDQ Y8, Y0, Y2

	VPBLENDD $0xF0, Y2, Y4, Y4

	MOVQ  R9, DI
	RORXQ $41, BX, R13
	RORXQ $18, BX, R14
	ADDQ  3*8+frame_YFER(SP), DX
	ORQ   R11, DI

	MOVQ  CX, R15
	RORXQ $34, R9, R12
	XORQ  R14, R13
	XORQ  R8, R15

	RORXQ $14, BX, R14
	ANDQ  BX, R15
	ADDQ  DX, AX
	ANDQ  R10, DI

	XORQ R14, R13
	XORQ R8, R15

	RORXQ $39, R9, R14
	ADDQ  R13, R15

	XORQ R12, R14
	ADDQ R15, AX

	RORXQ $28, R9, R12

	XORQ R12, R14
	MOVQ R9, R12
	ANDQ R11, R12
	ORQ  R12, DI

	ADDQ R14, DX
	ADDQ R15, DX
	ADDQ DI, DX
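	// The block above is the first of four per loop1 iteration. The vector
	// instructions extend the message schedule for one YMM register (four
	// Wt values, the same SIGMA0/SIGMA1 math as MSGSCHEDULE1, with
	// VPSRLQ/VPSLLQ/VPOR standing in for RORQ), while the interleaved
	// scalar instructions retire four rounds whose Kt+Wt sums were
	// precomputed into frame_YFER(SP) by the VPADDQ at the top of the
	// block. The rounds use RORX (BMI2), which leaves the flags untouched;
	// per round the math is still just (illustrative Go):
	//
	//	t1 := h + bigSigma1(e) + ch(e, f, g) + kPlusW
	//	t2 := bigSigma0(a) + maj(a, b, c)
	//	e, a = d+t1, t1+t2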
	VPADDQ  1*32(BP), Y5, Y0
	VMOVDQU Y0, frame_YFER(SP)

	MY_VPALIGNR(Y0, Y4, Y7, 8)

	VPADDQ Y5, Y0, Y0

	MY_VPALIGNR(Y1, Y6, Y5, 8)

	VPSRLQ $1, Y1, Y2
	VPSLLQ $(64-1), Y1, Y3
	VPOR   Y2, Y3, Y3

	VPSRLQ $7, Y1, Y8

	MOVQ  DX, DI
	RORXQ $41, AX, R13
	RORXQ $18, AX, R14
	ADDQ  frame_YFER(SP), R8
	ORQ   R10, DI
	MOVQ  BX, R15
	RORXQ $34, DX, R12

	XORQ  R14, R13
	XORQ  CX, R15
	RORXQ $14, AX, R14

	ANDQ  AX, R15
	XORQ  R14, R13
	RORXQ $39, DX, R14
	ADDQ  R8, R11

	ANDQ  R9, DI
	XORQ  R12, R14
	RORXQ $28, DX, R12

	XORQ CX, R15
	XORQ R12, R14
	MOVQ DX, R12
	ANDQ R10, R12

	ADDQ R13, R15
	ORQ  R12, DI
	ADDQ R14, R8

	ADDQ R15, R11

	ADDQ R15, R8
	ADDQ DI, R8

	VPSRLQ $8, Y1, Y2
	VPSLLQ $(64-8), Y1, Y1
	VPOR   Y2, Y1, Y1

	VPXOR Y8, Y3, Y3
	VPXOR Y1, Y3, Y1

	VPADDQ Y1, Y0, Y0

	VPERM2F128 $0x0, Y0, Y0, Y5

	VPAND MASK_YMM_LO<>(SB), Y0, Y0

	VPERM2F128 $0x11, Y4, Y4, Y2
	VPSRLQ     $6, Y2, Y8

	MOVQ  R8, DI
	RORXQ $41, R11, R13
	RORXQ $18, R11, R14
	ADDQ  1*8+frame_YFER(SP), CX
	ORQ   R9, DI

	MOVQ  AX, R15
	RORXQ $34, R8, R12
	XORQ  R14, R13
	XORQ  BX, R15

	RORXQ $14, R11, R14
	XORQ  R14, R13
	RORXQ $39, R8, R14
	ANDQ  R11, R15
	ADDQ  CX, R10

	ANDQ DX, DI
	XORQ R12, R14

	RORXQ $28, R8, R12
	XORQ  BX, R15

	XORQ R12, R14
	MOVQ R8, R12
	ANDQ R9, R12
	ADDQ R13, R15

	ORQ  R12, DI
	ADDQ R14, CX

	ADDQ R15, R10
	ADDQ R15, CX
	ADDQ DI, CX

	VPSRLQ $19, Y2, Y3
	VPSLLQ $(64-19), Y2, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8
	VPSRLQ $61, Y2, Y3
	VPSLLQ $(64-61), Y2, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8

	VPADDQ Y8, Y5, Y5

	VPSRLQ $6, Y5, Y8

	MOVQ  CX, DI
	RORXQ $41, R10, R13
	ADDQ  2*8+frame_YFER(SP), BX

	RORXQ $18, R10, R14
	ORQ   DX, DI
	MOVQ  R11, R15
	XORQ  AX, R15

	RORXQ $34, CX, R12
	XORQ  R14, R13
	ANDQ  R10, R15

	RORXQ $14, R10, R14
	ADDQ  BX, R9
	ANDQ  R8, DI

	XORQ  R14, R13
	RORXQ $39, CX, R14
	XORQ  AX, R15

	XORQ  R12, R14
	RORXQ $28, CX, R12

	XORQ R12, R14
	MOVQ CX, R12
	ANDQ DX, R12
	ADDQ R13, R15

	ORQ  R12, DI
	ADDQ R14, BX
	ADDQ R15, R9
	ADDQ R15, BX

	ADDQ DI, BX

	VPSRLQ $19, Y5, Y3
	VPSLLQ $(64-19), Y5, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8
	VPSRLQ $61, Y5, Y3
	VPSLLQ $(64-61), Y5, Y1
	VPOR   Y1, Y3, Y3
	VPXOR  Y3, Y8, Y8

	VPADDQ Y8, Y0, Y2

	VPBLENDD $0xF0, Y2, Y5, Y5

	MOVQ  BX, DI
	RORXQ $41, R9, R13
	RORXQ $18, R9, R14
	ADDQ  3*8+frame_YFER(SP), AX
	ORQ   R8, DI

	MOVQ  R10, R15
	RORXQ $34, BX, R12
	XORQ  R14, R13
	XORQ  R11, R15

	RORXQ $14, R9, R14
	ANDQ  R9, R15
	ADDQ  AX, DX
	ANDQ  CX, DI

	XORQ R14, R13
	XORQ R11, R15

	RORXQ $39, BX, R14
	ADDQ  R13, R15

	XORQ R12, R14
	ADDQ R15, DX

	RORXQ $28, BX, R12

	XORQ R12, R14
	MOVQ BX, R12
	ANDQ R8, R12
	ORQ  R12, DI

	ADDQ R14, AX
	ADDQ R15, AX
	ADDQ DI, AX
R15, R8 804 805 ADDQ R15, R11 806 ADDQ DI, R11 807 808 VPSRLQ $8, Y1, Y2 809 VPSLLQ $(64-8), Y1, Y1 810 VPOR Y2, Y1, Y1 811 812 VPXOR Y8, Y3, Y3 813 VPXOR Y1, Y3, Y1 814 815 VPADDQ Y1, Y0, Y0 816 817 VPERM2F128 $0x0, Y0, Y0, Y6 818 819 VPAND MASK_YMM_LO<>(SB), Y0, Y0 820 821 VPERM2F128 $0x11, Y5, Y5, Y2 822 VPSRLQ $6, Y2, Y8 823 824 MOVQ R11, DI 825 RORXQ $41, R8, R13 826 RORXQ $18, R8, R14 827 ADDQ 1*8+frame_YFER(SP), R10 828 ORQ BX, DI 829 830 MOVQ DX, R15 831 RORXQ $34, R11, R12 832 XORQ R14, R13 833 XORQ R9, R15 834 835 RORXQ $14, R8, R14 836 XORQ R14, R13 837 RORXQ $39, R11, R14 838 ANDQ R8, R15 839 ADDQ R10, CX 840 841 ANDQ AX, DI 842 XORQ R12, R14 843 844 RORXQ $28, R11, R12 845 XORQ R9, R15 846 847 XORQ R12, R14 848 MOVQ R11, R12 849 ANDQ BX, R12 850 ADDQ R13, R15 851 852 ORQ R12, DI 853 ADDQ R14, R10 854 855 ADDQ R15, CX 856 ADDQ R15, R10 857 ADDQ DI, R10 858 859 VPSRLQ $19, Y2, Y3 860 VPSLLQ $(64-19), Y2, Y1 861 VPOR Y1, Y3, Y3 862 VPXOR Y3, Y8, Y8 863 VPSRLQ $61, Y2, Y3 864 VPSLLQ $(64-61), Y2, Y1 865 VPOR Y1, Y3, Y3 866 VPXOR Y3, Y8, Y8 867 868 VPADDQ Y8, Y6, Y6 869 870 VPSRLQ $6, Y6, Y8 871 872 MOVQ R10, DI 873 RORXQ $41, CX, R13 874 ADDQ 2*8+frame_YFER(SP), R9 875 876 RORXQ $18, CX, R14 877 ORQ AX, DI 878 MOVQ R8, R15 879 XORQ DX, R15 880 881 RORXQ $34, R10, R12 882 XORQ R14, R13 883 ANDQ CX, R15 884 885 RORXQ $14, CX, R14 886 ADDQ R9, BX 887 ANDQ R11, DI 888 889 XORQ R14, R13 890 RORXQ $39, R10, R14 891 XORQ DX, R15 892 893 XORQ R12, R14 894 RORXQ $28, R10, R12 895 896 XORQ R12, R14 897 MOVQ R10, R12 898 ANDQ AX, R12 899 ADDQ R13, R15 900 901 ORQ R12, DI 902 ADDQ R14, R9 903 ADDQ R15, BX 904 ADDQ R15, R9 905 906 ADDQ DI, R9 907 908 VPSRLQ $19, Y6, Y3 909 VPSLLQ $(64-19), Y6, Y1 910 VPOR Y1, Y3, Y3 911 VPXOR Y3, Y8, Y8 912 VPSRLQ $61, Y6, Y3 913 VPSLLQ $(64-61), Y6, Y1 914 VPOR Y1, Y3, Y3 915 VPXOR Y3, Y8, Y8 916 917 VPADDQ Y8, Y0, Y2 918 919 VPBLENDD $0xF0, Y2, Y6, Y6 920 921 MOVQ R9, DI 922 RORXQ $41, BX, R13 923 RORXQ $18, BX, R14 924 ADDQ 3*8+frame_YFER(SP), DX 925 ORQ R11, DI 926 927 MOVQ CX, R15 928 RORXQ $34, R9, R12 929 XORQ R14, R13 930 XORQ R8, R15 931 932 RORXQ $14, BX, R14 933 ANDQ BX, R15 934 ADDQ DX, AX 935 ANDQ R10, DI 936 937 XORQ R14, R13 938 XORQ R8, R15 939 940 RORXQ $39, R9, R14 941 ADDQ R13, R15 942 943 XORQ R12, R14 944 ADDQ R15, AX 945 946 RORXQ $28, R9, R12 947 948 XORQ R12, R14 949 MOVQ R9, R12 950 ANDQ R11, R12 951 ORQ R12, DI 952 953 ADDQ R14, DX 954 ADDQ R15, DX 955 ADDQ DI, DX 956 957 VPADDQ 3*32(BP), Y7, Y0 958 VMOVDQU Y0, frame_YFER(SP) 959 ADDQ $(4*32), BP 960 961 MY_VPALIGNR(Y0, Y6, Y5, 8) 962 963 VPADDQ Y7, Y0, Y0 964 965 MY_VPALIGNR(Y1, Y4, Y7, 8) 966 967 VPSRLQ $1, Y1, Y2 968 VPSLLQ $(64-1), Y1, Y3 969 VPOR Y2, Y3, Y3 970 971 VPSRLQ $7, Y1, Y8 972 973 MOVQ DX, DI 974 RORXQ $41, AX, R13 975 RORXQ $18, AX, R14 976 ADDQ frame_YFER(SP), R8 977 ORQ R10, DI 978 MOVQ BX, R15 979 RORXQ $34, DX, R12 980 981 XORQ R14, R13 982 XORQ CX, R15 983 RORXQ $14, AX, R14 984 985 ANDQ AX, R15 986 XORQ R14, R13 987 RORXQ $39, DX, R14 988 ADDQ R8, R11 989 990 ANDQ R9, DI 991 XORQ R12, R14 992 RORXQ $28, DX, R12 993 994 XORQ CX, R15 995 XORQ R12, R14 996 MOVQ DX, R12 997 ANDQ R10, R12 998 999 ADDQ R13, R15 1000 ORQ R12, DI 1001 ADDQ R14, R8 1002 1003 ADDQ R15, R11 1004 1005 ADDQ R15, R8 1006 ADDQ DI, R8 1007 1008 VPSRLQ $8, Y1, Y2 1009 VPSLLQ $(64-8), Y1, Y1 1010 VPOR Y2, Y1, Y1 1011 1012 VPXOR Y8, Y3, Y3 1013 VPXOR Y1, Y3, Y1 1014 1015 VPADDQ Y1, Y0, Y0 1016 1017 VPERM2F128 $0x0, Y0, Y0, Y7 1018 1019 VPAND MASK_YMM_LO<>(SB), Y0, Y0 1020 1021 VPERM2F128 $0x11, Y6, 
Y6, Y2 1022 VPSRLQ $6, Y2, Y8 1023 1024 MOVQ R8, DI 1025 RORXQ $41, R11, R13 1026 RORXQ $18, R11, R14 1027 ADDQ 1*8+frame_YFER(SP), CX 1028 ORQ R9, DI 1029 1030 MOVQ AX, R15 1031 RORXQ $34, R8, R12 1032 XORQ R14, R13 1033 XORQ BX, R15 1034 1035 RORXQ $14, R11, R14 1036 XORQ R14, R13 1037 RORXQ $39, R8, R14 1038 ANDQ R11, R15 1039 ADDQ CX, R10 1040 1041 ANDQ DX, DI 1042 XORQ R12, R14 1043 1044 RORXQ $28, R8, R12 1045 XORQ BX, R15 1046 1047 XORQ R12, R14 1048 MOVQ R8, R12 1049 ANDQ R9, R12 1050 ADDQ R13, R15 1051 1052 ORQ R12, DI 1053 ADDQ R14, CX 1054 1055 ADDQ R15, R10 1056 ADDQ R15, CX 1057 ADDQ DI, CX 1058 1059 VPSRLQ $19, Y2, Y3 1060 VPSLLQ $(64-19), Y2, Y1 1061 VPOR Y1, Y3, Y3 1062 VPXOR Y3, Y8, Y8 1063 VPSRLQ $61, Y2, Y3 1064 VPSLLQ $(64-61), Y2, Y1 1065 VPOR Y1, Y3, Y3 1066 VPXOR Y3, Y8, Y8 1067 1068 VPADDQ Y8, Y7, Y7 1069 1070 VPSRLQ $6, Y7, Y8 1071 1072 MOVQ CX, DI 1073 RORXQ $41, R10, R13 1074 ADDQ 2*8+frame_YFER(SP), BX 1075 1076 RORXQ $18, R10, R14 1077 ORQ DX, DI 1078 MOVQ R11, R15 1079 XORQ AX, R15 1080 1081 RORXQ $34, CX, R12 1082 XORQ R14, R13 1083 ANDQ R10, R15 1084 1085 RORXQ $14, R10, R14 1086 ADDQ BX, R9 1087 ANDQ R8, DI 1088 1089 XORQ R14, R13 1090 RORXQ $39, CX, R14 1091 XORQ AX, R15 1092 1093 XORQ R12, R14 1094 RORXQ $28, CX, R12 1095 1096 XORQ R12, R14 1097 MOVQ CX, R12 1098 ANDQ DX, R12 1099 ADDQ R13, R15 1100 1101 ORQ R12, DI 1102 ADDQ R14, BX 1103 ADDQ R15, R9 1104 ADDQ R15, BX 1105 1106 ADDQ DI, BX 1107 1108 VPSRLQ $19, Y7, Y3 1109 VPSLLQ $(64-19), Y7, Y1 1110 VPOR Y1, Y3, Y3 1111 VPXOR Y3, Y8, Y8 1112 VPSRLQ $61, Y7, Y3 1113 VPSLLQ $(64-61), Y7, Y1 1114 VPOR Y1, Y3, Y3 1115 VPXOR Y3, Y8, Y8 1116 1117 VPADDQ Y8, Y0, Y2 1118 1119 VPBLENDD $0xF0, Y2, Y7, Y7 1120 1121 MOVQ BX, DI 1122 RORXQ $41, R9, R13 1123 RORXQ $18, R9, R14 1124 ADDQ 3*8+frame_YFER(SP), AX 1125 ORQ R8, DI 1126 1127 MOVQ R10, R15 1128 RORXQ $34, BX, R12 1129 XORQ R14, R13 1130 XORQ R11, R15 1131 1132 RORXQ $14, R9, R14 1133 ANDQ R9, R15 1134 ADDQ AX, DX 1135 ANDQ CX, DI 1136 1137 XORQ R14, R13 1138 XORQ R11, R15 1139 1140 RORXQ $39, BX, R14 1141 ADDQ R13, R15 1142 1143 XORQ R12, R14 1144 ADDQ R15, DX 1145 1146 RORXQ $28, BX, R12 1147 1148 XORQ R12, R14 1149 MOVQ BX, R12 1150 ANDQ R8, R12 1151 ORQ R12, DI 1152 1153 ADDQ R14, AX 1154 ADDQ R15, AX 1155 ADDQ DI, AX 1156 1157 SUBQ $1, frame_SRND(SP) 1158 JNE loop1 1159 1160 MOVQ $2, frame_SRND(SP) 1161 1162 loop2: 1163 VPADDQ (BP), Y4, Y0 1164 VMOVDQU Y0, frame_YFER(SP) 1165 1166 MOVQ R9, R15 1167 RORXQ $41, DX, R13 1168 RORXQ $18, DX, R14 1169 XORQ R10, R15 1170 1171 XORQ R14, R13 1172 RORXQ $14, DX, R14 1173 ANDQ DX, R15 1174 1175 XORQ R14, R13 1176 RORXQ $34, AX, R12 1177 XORQ R10, R15 1178 RORXQ $39, AX, R14 1179 MOVQ AX, DI 1180 1181 XORQ R12, R14 1182 RORXQ $28, AX, R12 1183 ADDQ frame_YFER(SP), R11 1184 ORQ CX, DI 1185 1186 XORQ R12, R14 1187 MOVQ AX, R12 1188 ANDQ BX, DI 1189 ANDQ CX, R12 1190 ADDQ R13, R15 1191 1192 ADDQ R11, R8 1193 ORQ R12, DI 1194 ADDQ R14, R11 1195 1196 ADDQ R15, R8 1197 1198 ADDQ R15, R11 1199 MOVQ DX, R15 1200 RORXQ $41, R8, R13 1201 RORXQ $18, R8, R14 1202 XORQ R9, R15 1203 1204 XORQ R14, R13 1205 RORXQ $14, R8, R14 1206 ANDQ R8, R15 1207 ADDQ DI, R11 1208 1209 XORQ R14, R13 1210 RORXQ $34, R11, R12 1211 XORQ R9, R15 1212 RORXQ $39, R11, R14 1213 MOVQ R11, DI 1214 1215 XORQ R12, R14 1216 RORXQ $28, R11, R12 1217 ADDQ 8*1+frame_YFER(SP), R10 1218 ORQ BX, DI 1219 1220 XORQ R12, R14 1221 MOVQ R11, R12 1222 ANDQ AX, DI 1223 ANDQ BX, R12 1224 ADDQ R13, R15 1225 1226 ADDQ R10, CX 1227 ORQ R12, DI 1228 ADDQ R14, R10 1229 1230 
loop2:
	VPADDQ  (BP), Y4, Y0
	VMOVDQU Y0, frame_YFER(SP)

	MOVQ  R9, R15
	RORXQ $41, DX, R13
	RORXQ $18, DX, R14
	XORQ  R10, R15

	XORQ  R14, R13
	RORXQ $14, DX, R14
	ANDQ  DX, R15

	XORQ  R14, R13
	RORXQ $34, AX, R12
	XORQ  R10, R15
	RORXQ $39, AX, R14
	MOVQ  AX, DI

	XORQ  R12, R14
	RORXQ $28, AX, R12
	ADDQ  frame_YFER(SP), R11
	ORQ   CX, DI

	XORQ R12, R14
	MOVQ AX, R12
	ANDQ BX, DI
	ANDQ CX, R12
	ADDQ R13, R15

	ADDQ R11, R8
	ORQ  R12, DI
	ADDQ R14, R11

	ADDQ R15, R8

	ADDQ  R15, R11
	MOVQ  DX, R15
	RORXQ $41, R8, R13
	RORXQ $18, R8, R14
	XORQ  R9, R15

	XORQ  R14, R13
	RORXQ $14, R8, R14
	ANDQ  R8, R15
	ADDQ  DI, R11

	XORQ  R14, R13
	RORXQ $34, R11, R12
	XORQ  R9, R15
	RORXQ $39, R11, R14
	MOVQ  R11, DI

	XORQ  R12, R14
	RORXQ $28, R11, R12
	ADDQ  8*1+frame_YFER(SP), R10
	ORQ   BX, DI

	XORQ R12, R14
	MOVQ R11, R12
	ANDQ AX, DI
	ANDQ BX, R12
	ADDQ R13, R15

	ADDQ R10, CX
	ORQ  R12, DI
	ADDQ R14, R10

	ADDQ R15, CX

	ADDQ  R15, R10
	MOVQ  R8, R15
	RORXQ $41, CX, R13
	RORXQ $18, CX, R14
	XORQ  DX, R15

	XORQ  R14, R13
	RORXQ $14, CX, R14
	ANDQ  CX, R15
	ADDQ  DI, R10

	XORQ  R14, R13
	RORXQ $34, R10, R12
	XORQ  DX, R15
	RORXQ $39, R10, R14
	MOVQ  R10, DI

	XORQ  R12, R14
	RORXQ $28, R10, R12
	ADDQ  8*2+frame_YFER(SP), R9
	ORQ   AX, DI

	XORQ R12, R14
	MOVQ R10, R12
	ANDQ R11, DI
	ANDQ AX, R12
	ADDQ R13, R15

	ADDQ R9, BX
	ORQ  R12, DI
	ADDQ R14, R9

	ADDQ R15, BX

	ADDQ  R15, R9
	MOVQ  CX, R15
	RORXQ $41, BX, R13
	RORXQ $18, BX, R14
	XORQ  R8, R15

	XORQ  R14, R13
	RORXQ $14, BX, R14
	ANDQ  BX, R15
	ADDQ  DI, R9

	XORQ  R14, R13
	RORXQ $34, R9, R12
	XORQ  R8, R15
	RORXQ $39, R9, R14
	MOVQ  R9, DI

	XORQ  R12, R14
	RORXQ $28, R9, R12
	ADDQ  8*3+frame_YFER(SP), DX
	ORQ   R11, DI

	XORQ R12, R14
	MOVQ R9, R12
	ANDQ R10, DI
	ANDQ R11, R12
	ADDQ R13, R15

	ADDQ DX, AX
	ORQ  R12, DI
	ADDQ R14, DX

	ADDQ R15, AX

	ADDQ R15, DX

	ADDQ DI, DX

	VPADDQ  1*32(BP), Y5, Y0
	VMOVDQU Y0, frame_YFER(SP)
	ADDQ    $(2*32), BP

	MOVQ  BX, R15
	RORXQ $41, AX, R13
	RORXQ $18, AX, R14
	XORQ  CX, R15

	XORQ  R14, R13
	RORXQ $14, AX, R14
	ANDQ  AX, R15

	XORQ  R14, R13
	RORXQ $34, DX, R12
	XORQ  CX, R15
	RORXQ $39, DX, R14
	MOVQ  DX, DI

	XORQ  R12, R14
	RORXQ $28, DX, R12
	ADDQ  frame_YFER(SP), R8
	ORQ   R10, DI

	XORQ R12, R14
	MOVQ DX, R12
	ANDQ R9, DI
	ANDQ R10, R12
	ADDQ R13, R15

	ADDQ R8, R11
	ORQ  R12, DI
	ADDQ R14, R8

	ADDQ R15, R11

	ADDQ  R15, R8
	MOVQ  AX, R15
	RORXQ $41, R11, R13
	RORXQ $18, R11, R14
	XORQ  BX, R15

	XORQ  R14, R13
	RORXQ $14, R11, R14
	ANDQ  R11, R15
	ADDQ  DI, R8

	XORQ  R14, R13
	RORXQ $34, R8, R12
	XORQ  BX, R15
	RORXQ $39, R8, R14
	MOVQ  R8, DI

	XORQ  R12, R14
	RORXQ $28, R8, R12
	ADDQ  8*1+frame_YFER(SP), CX
	ORQ   R9, DI

	XORQ R12, R14
	MOVQ R8, R12
	ANDQ DX, DI
	ANDQ R9, R12
	ADDQ R13, R15

	ADDQ CX, R10
	ORQ  R12, DI
	ADDQ R14, CX

	ADDQ R15, R10

	ADDQ  R15, CX
	MOVQ  R11, R15
	RORXQ $41, R10, R13
	RORXQ $18, R10, R14
	XORQ  AX, R15

	XORQ  R14, R13
	RORXQ $14, R10, R14
	ANDQ  R10, R15
	ADDQ  DI, CX

	XORQ  R14, R13
	RORXQ $34, CX, R12
	XORQ  AX, R15
	RORXQ $39, CX, R14
	MOVQ  CX, DI

	XORQ  R12, R14
	RORXQ $28, CX, R12
	ADDQ  8*2+frame_YFER(SP), BX
	ORQ   DX, DI

	XORQ R12, R14
	MOVQ CX, R12
	ANDQ R8, DI
	ANDQ DX, R12
	ADDQ R13, R15

	ADDQ BX, R9
	ORQ  R12, DI
	ADDQ R14, BX

	ADDQ R15, R9

	ADDQ  R15, BX
	MOVQ  R10, R15
	RORXQ $41, R9, R13
	RORXQ $18, R9, R14
	XORQ  R11, R15

	XORQ  R14, R13
	RORXQ $14, R9, R14
	ANDQ  R9, R15
	ADDQ  DI, BX

	XORQ  R14, R13
	RORXQ $34, BX, R12
	XORQ  R11, R15
	RORXQ $39, BX, R14
	MOVQ  BX, DI

	XORQ  R12, R14
	RORXQ $28, BX, R12
	ADDQ  8*3+frame_YFER(SP), AX
	ORQ   R8, DI

	XORQ R12, R14
	MOVQ BX, R12
	ANDQ CX, DI
	ANDQ R8, R12
	ADDQ R13, R15

	ADDQ AX, DX
	ORQ  R12, DI
	ADDQ R14, AX

	ADDQ R15, DX
	ADDQ R15, AX

	ADDQ DI, AX

	VMOVDQU Y6, Y4
	VMOVDQU Y7, Y5

	SUBQ $1, frame_SRND(SP)
	JNE  loop2

	addm(8*0(SI),AX)
	addm(8*1(SI),BX)
	addm(8*2(SI),CX)
	addm(8*3(SI),R8)
	addm(8*4(SI),DX)
	addm(8*5(SI),R9)
	addm(8*6(SI),R10)
	addm(8*7(SI),R11)

	MOVQ frame_INP(SP), DI
	ADDQ $128, DI
	CMPQ DI, frame_INPEND(SP)
	JNE  loop0

done_hash:
	VZEROUPPER
	RET