github.com/insolar/x-crypto@v0.0.0-20191031140942-75fab8a325f6/sha1/sha1block_amd64.s

// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// AVX2 version by Intel, same algorithm as code in Linux kernel:
// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha1_avx2_x86_64_asm.S
// Authors:
// Ilya Albrekht <ilya.albrekht@intel.com>
// Maxim Locktyukhin <maxim.locktyukhin@intel.com>
// Ronen Zohar <ronen.zohar@intel.com>
// Chandramouli Narayanan <mouli@linux.intel.com>

#include "textflag.h"

// SHA-1 block routine. See sha1block.go for Go equivalent.
//
// There are 80 rounds of 4 types:
//   - rounds 0-15 are type 1 and load data (ROUND1 macro).
//   - rounds 16-19 are type 1 and do not load data (ROUND1x macro).
//   - rounds 20-39 are type 2 and do not load data (ROUND2 macro).
//   - rounds 40-59 are type 3 and do not load data (ROUND3 macro).
//   - rounds 60-79 are type 4 and do not load data (ROUND4 macro).
//
// Each round loads or shuffles the data, then computes a per-round
// function of b, c, d, and then mixes the result into and rotates the
// five registers a, b, c, d, e holding the intermediate results.
//
// The register rotation is implemented by rotating the arguments to
// the round macros instead of by explicit move instructions.
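//
// For orientation, a rough Go-style sketch of what a single type-1 round
// amounts to (sha1block.go holds the actual Go equivalent; w, a..e and k0
// below are illustrative names only, not identifiers from this package):
//
//	w[i] = binary.BigEndian.Uint32(p[i*4:])            // LOAD
//	f := (b & c) | (^b & d)                            // FUNC1
//	t := bits.RotateLeft32(a, 5) + f + e + w[i] + k0   // MIX, k0 = 0x5A827999
//	a, b, c, d, e = t, a, bits.RotateLeft32(b, 30), c, d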

#define LOAD(index) \
	MOVL	(index*4)(SI), R10; \
	BSWAPL	R10; \
	MOVL	R10, (index*4)(SP)

#define SHUFFLE(index) \
	MOVL	(((index)&0xf)*4)(SP), R10; \
	XORL	(((index-3)&0xf)*4)(SP), R10; \
	XORL	(((index-8)&0xf)*4)(SP), R10; \
	XORL	(((index-14)&0xf)*4)(SP), R10; \
	ROLL	$1, R10; \
	MOVL	R10, (((index)&0xf)*4)(SP)

#define FUNC1(a, b, c, d, e) \
	MOVL	d, R9; \
	XORL	c, R9; \
	ANDL	b, R9; \
	XORL	d, R9

#define FUNC2(a, b, c, d, e) \
	MOVL	b, R9; \
	XORL	c, R9; \
	XORL	d, R9

#define FUNC3(a, b, c, d, e) \
	MOVL	b, R8; \
	ORL	c, R8; \
	ANDL	d, R8; \
	MOVL	b, R9; \
	ANDL	c, R9; \
	ORL	R8, R9

#define FUNC4 FUNC2

#define MIX(a, b, c, d, e, const) \
	ROLL	$30, b; \
	ADDL	R9, e; \
	MOVL	a, R8; \
	ROLL	$5, R8; \
	LEAL	const(e)(R10*1), e; \
	ADDL	R8, e

#define ROUND1(a, b, c, d, e, index) \
	LOAD(index); \
	FUNC1(a, b, c, d, e); \
	MIX(a, b, c, d, e, 0x5A827999)

#define ROUND1x(a, b, c, d, e, index) \
	SHUFFLE(index); \
	FUNC1(a, b, c, d, e); \
	MIX(a, b, c, d, e, 0x5A827999)

#define ROUND2(a, b, c, d, e, index) \
	SHUFFLE(index); \
	FUNC2(a, b, c, d, e); \
	MIX(a, b, c, d, e, 0x6ED9EBA1)

#define ROUND3(a, b, c, d, e, index) \
	SHUFFLE(index); \
	FUNC3(a, b, c, d, e); \
	MIX(a, b, c, d, e, 0x8F1BBCDC)

#define ROUND4(a, b, c, d, e, index) \
	SHUFFLE(index); \
	FUNC4(a, b, c, d, e); \
	MIX(a, b, c, d, e, 0xCA62C1D6)

TEXT ·blockAMD64(SB),NOSPLIT,$64-32
	MOVQ	dig+0(FP), BP
	MOVQ	p_base+8(FP), SI
	MOVQ	p_len+16(FP), DX
	SHRQ	$6, DX
	SHLQ	$6, DX

	LEAQ	(SI)(DX*1), DI
	MOVL	(0*4)(BP), AX
	MOVL	(1*4)(BP), BX
	MOVL	(2*4)(BP), CX
	MOVL	(3*4)(BP), DX
	MOVL	(4*4)(BP), BP

	CMPQ	SI, DI
	JEQ	end

loop:
	MOVL	AX, R11
	MOVL	BX, R12
	MOVL	CX, R13
	MOVL	DX, R14
	MOVL	BP, R15

	ROUND1(AX, BX, CX, DX, BP, 0)
	ROUND1(BP, AX, BX, CX, DX, 1)
	ROUND1(DX, BP, AX, BX, CX, 2)
	ROUND1(CX, DX, BP, AX, BX, 3)
	ROUND1(BX, CX, DX, BP, AX, 4)
	ROUND1(AX, BX, CX, DX, BP, 5)
	ROUND1(BP, AX, BX, CX, DX, 6)
	ROUND1(DX, BP, AX, BX, CX, 7)
	ROUND1(CX, DX, BP, AX, BX, 8)
	ROUND1(BX, CX, DX, BP, AX, 9)
	ROUND1(AX, BX, CX, DX, BP, 10)
	ROUND1(BP, AX, BX, CX, DX, 11)
	ROUND1(DX, BP, AX, BX, CX, 12)
	ROUND1(CX, DX, BP, AX, BX, 13)
	ROUND1(BX, CX, DX, BP, AX, 14)
	ROUND1(AX, BX, CX, DX, BP, 15)

	ROUND1x(BP, AX, BX, CX, DX, 16)
	ROUND1x(DX, BP, AX, BX, CX, 17)
	ROUND1x(CX, DX, BP, AX, BX, 18)
	ROUND1x(BX, CX, DX, BP, AX, 19)

	ROUND2(AX, BX, CX, DX, BP, 20)
	ROUND2(BP, AX, BX, CX, DX, 21)
	ROUND2(DX, BP, AX, BX, CX, 22)
	ROUND2(CX, DX, BP, AX, BX, 23)
	ROUND2(BX, CX, DX, BP, AX, 24)
	ROUND2(AX, BX, CX, DX, BP, 25)
	ROUND2(BP, AX, BX, CX, DX, 26)
	ROUND2(DX, BP, AX, BX, CX, 27)
	ROUND2(CX, DX, BP, AX, BX, 28)
	ROUND2(BX, CX, DX, BP, AX, 29)
	ROUND2(AX, BX, CX, DX, BP, 30)
	ROUND2(BP, AX, BX, CX, DX, 31)
	ROUND2(DX, BP, AX, BX, CX, 32)
	ROUND2(CX, DX, BP, AX, BX, 33)
	ROUND2(BX, CX, DX, BP, AX, 34)
	ROUND2(AX, BX, CX, DX, BP, 35)
	ROUND2(BP, AX, BX, CX, DX, 36)
	ROUND2(DX, BP, AX, BX, CX, 37)
	ROUND2(CX, DX, BP, AX, BX, 38)
	ROUND2(BX, CX, DX, BP, AX, 39)

	ROUND3(AX, BX, CX, DX, BP, 40)
	ROUND3(BP, AX, BX, CX, DX, 41)
	ROUND3(DX, BP, AX, BX, CX, 42)
	ROUND3(CX, DX, BP, AX, BX, 43)
	ROUND3(BX, CX, DX, BP, AX, 44)
	ROUND3(AX, BX, CX, DX, BP, 45)
	ROUND3(BP, AX, BX, CX, DX, 46)
	ROUND3(DX, BP, AX, BX, CX, 47)
	ROUND3(CX, DX, BP, AX, BX, 48)
	ROUND3(BX, CX, DX, BP, AX, 49)
	ROUND3(AX, BX, CX, DX, BP, 50)
	ROUND3(BP, AX, BX, CX, DX, 51)
	ROUND3(DX, BP, AX, BX, CX, 52)
	ROUND3(CX, DX, BP, AX, BX, 53)
	ROUND3(BX, CX, DX, BP, AX, 54)
	ROUND3(AX, BX, CX, DX, BP, 55)
	ROUND3(BP, AX, BX, CX, DX, 56)
	ROUND3(DX, BP, AX, BX, CX, 57)
	ROUND3(CX, DX, BP, AX, BX, 58)
	ROUND3(BX, CX, DX, BP, AX, 59)

	ROUND4(AX, BX, CX, DX, BP, 60)
	ROUND4(BP, AX, BX, CX, DX, 61)
	ROUND4(DX, BP, AX, BX, CX, 62)
	ROUND4(CX, DX, BP, AX, BX, 63)
	ROUND4(BX, CX, DX, BP, AX, 64)
	ROUND4(AX, BX, CX, DX, BP, 65)
	ROUND4(BP, AX, BX, CX, DX, 66)
	ROUND4(DX, BP, AX, BX, CX, 67)
	ROUND4(CX, DX, BP, AX, BX, 68)
	ROUND4(BX, CX, DX, BP, AX, 69)
	ROUND4(AX, BX, CX, DX, BP, 70)
	ROUND4(BP, AX, BX, CX, DX, 71)
	ROUND4(DX, BP, AX, BX, CX, 72)
	ROUND4(CX, DX, BP, AX, BX, 73)
	ROUND4(BX, CX, DX, BP, AX, 74)
	ROUND4(AX, BX, CX, DX, BP, 75)
	ROUND4(BP, AX, BX, CX, DX, 76)
	ROUND4(DX, BP, AX, BX, CX, 77)
	ROUND4(CX, DX, BP, AX, BX, 78)
	ROUND4(BX, CX, DX, BP, AX, 79)

	ADDL	R11, AX
	ADDL	R12, BX
	ADDL	R13, CX
	ADDL	R14, DX
	ADDL	R15, BP

	ADDQ	$64, SI
	CMPQ	SI, DI
	JB	loop

end:
	MOVQ	dig+0(FP), DI
	MOVL	AX, (0*4)(DI)
	MOVL	BX, (1*4)(DI)
	MOVL	CX, (2*4)(DI)
	MOVL	DX, (3*4)(DI)
	MOVL	BP, (4*4)(DI)
	RET

// This is the implementation using AVX2, BMI1 and BMI2. It is based on:
// "SHA-1 implementation with Intel(R) AVX2 instruction set extensions"
// From http://software.intel.com/en-us/articles
// (look for improving-the-performance-of-the-secure-hash-algorithm-1)
// This implementation is 2x unrolled, and interleaves vector instructions,
// used to precompute W, with scalar computation of current round
// for optimal scheduling.
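//
// Roughly speaking, the code below consumes the input two 64-byte blocks at
// a time: the scalar rounds for the current pair of blocks are interleaved
// with vector code that precomputes W+K for the next pair into a second
// stack buffer, and the two buffers are swapped (XCHGQ R15, R14) between
// passes.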

// Trivial helper macros.
#define UPDATE_HASH(A,TB,C,D,E) \
	ADDL	(R9), A \
	MOVL	A, (R9) \
	ADDL	4(R9), TB \
	MOVL	TB, 4(R9) \
	ADDL	8(R9), C \
	MOVL	C, 8(R9) \
	ADDL	12(R9), D \
	MOVL	D, 12(R9) \
	ADDL	16(R9), E \
	MOVL	E, 16(R9)

// Helper macros for PRECALC, which does precomputations
#define PRECALC_0(OFFSET) \
	VMOVDQU	OFFSET(R10), X0

#define PRECALC_1(OFFSET) \
	VINSERTI128	$1, OFFSET(R13), Y0, Y0

#define PRECALC_2(YREG) \
	VPSHUFB	Y10, Y0, YREG

#define PRECALC_4(YREG,K_OFFSET) \
	VPADDD	K_OFFSET(R8), YREG, Y0

#define PRECALC_7(OFFSET) \
	VMOVDQU	Y0, (OFFSET*2)(R14)

// Message scheduling pre-compute for rounds 0-15
// R13 is a pointer to even 64-byte block
// R10 is a pointer to odd 64-byte block
// R14 is a pointer to temp buffer
// X0 is used as temp register
// YREG is clobbered as part of computation
// OFFSET chooses 16 byte chunk within a block
// R8 is a pointer to constants block
// K_OFFSET chooses K constants relevant to this round
// X10 holds swap mask
#define PRECALC_00_15(OFFSET,YREG) \
	PRECALC_0(OFFSET) \
	PRECALC_1(OFFSET) \
	PRECALC_2(YREG) \
	PRECALC_4(YREG,0x0) \
	PRECALC_7(OFFSET)

// Helper macros for PRECALC_16_31
#define PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
	VPALIGNR	$8, REG_SUB_16, REG_SUB_12, REG \ // w[i-14]
	VPSRLDQ	$4, REG_SUB_4, Y0 // w[i-3]

#define PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
	VPXOR	REG_SUB_8, REG, REG \
	VPXOR	REG_SUB_16, Y0, Y0

#define PRECALC_18(REG) \
	VPXOR	Y0, REG, REG \
	VPSLLDQ	$12, REG, Y9

#define PRECALC_19(REG) \
	VPSLLD	$1, REG, Y0 \
	VPSRLD	$31, REG, REG

#define PRECALC_20(REG) \
	VPOR	REG, Y0, Y0 \
	VPSLLD	$2, Y9, REG

#define PRECALC_21(REG) \
	VPSRLD	$30, Y9, Y9 \
	VPXOR	REG, Y0, Y0

#define PRECALC_23(REG,K_OFFSET,OFFSET) \
	VPXOR	Y9, Y0, REG \
	VPADDD	K_OFFSET(R8), REG, Y0 \
	VMOVDQU	Y0, (OFFSET)(R14)

// Message scheduling pre-compute for rounds 16-31
// calculating last 32 w[i] values in 8 XMM registers
// pre-calculate K+w[i] values and store to mem
// for later load by ALU add instruction.
// "brute force" vectorization for rounds 16-31 only
// due to w[i]->w[i-3] dependency.
// clobbers 5 input ymm registers REG_SUB*
// uses X0 and X9 as temp registers
// As always, R8 is a pointer to constants block
// and R14 is a pointer to temp buffer
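//
// In scalar terms this stage evaluates the standard SHA-1 schedule; a short
// Go-style sketch (w is illustrative only):
//
//	for i := 16; i < 32; i++ {
//		w[i] = bits.RotateLeft32(w[i-3]^w[i-8]^w[i-14]^w[i-16], 1)
//	}
//
// The w[i-3] term is the dependency mentioned above that limits how far this
// stage can be vectorized.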
#define PRECALC_16_31(REG,REG_SUB_4,REG_SUB_8,REG_SUB_12,REG_SUB_16,K_OFFSET,OFFSET) \
	PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
	PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
	PRECALC_18(REG) \
	PRECALC_19(REG) \
	PRECALC_20(REG) \
	PRECALC_21(REG) \
	PRECALC_23(REG,K_OFFSET,OFFSET)

// Helper macros for PRECALC_32_79
#define PRECALC_32(REG_SUB_8,REG_SUB_4) \
	VPALIGNR	$8, REG_SUB_8, REG_SUB_4, Y0

#define PRECALC_33(REG_SUB_28,REG) \
	VPXOR	REG_SUB_28, REG, REG

#define PRECALC_34(REG_SUB_16) \
	VPXOR	REG_SUB_16, Y0, Y0

#define PRECALC_35(REG) \
	VPXOR	Y0, REG, REG

#define PRECALC_36(REG) \
	VPSLLD	$2, REG, Y0

#define PRECALC_37(REG) \
	VPSRLD	$30, REG, REG \
	VPOR	REG, Y0, REG

#define PRECALC_39(REG,K_OFFSET,OFFSET) \
	VPADDD	K_OFFSET(R8), REG, Y0 \
	VMOVDQU	Y0, (OFFSET)(R14)

// Message scheduling pre-compute for rounds 32-79
// In SHA-1 specification we have:
// w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
// Which is the same as:
// w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
// This allows for more efficient vectorization,
// since w[i]->w[i-3] dependency is broken
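//
// The rewritten recurrence in the same Go-style sketch form (valid once
// i >= 32, so the first 32 schedule values must already be computed):
//
//	for i := 32; i < 80; i++ {
//		w[i] = bits.RotateLeft32(w[i-6]^w[i-16]^w[i-28]^w[i-32], 2)
//	}
//
// The nearest input is now w[i-6], so the values a vector step needs here
// are already final, which is what PRECALC_32_79 exploits.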
#define PRECALC_32_79(REG,REG_SUB_4,REG_SUB_8,REG_SUB_16,REG_SUB_28,K_OFFSET,OFFSET) \
	PRECALC_32(REG_SUB_8,REG_SUB_4) \
	PRECALC_33(REG_SUB_28,REG) \
	PRECALC_34(REG_SUB_16) \
	PRECALC_35(REG) \
	PRECALC_36(REG) \
	PRECALC_37(REG) \
	PRECALC_39(REG,K_OFFSET,OFFSET)

#define PRECALC \
	PRECALC_00_15(0,Y15) \
	PRECALC_00_15(0x10,Y14) \
	PRECALC_00_15(0x20,Y13) \
	PRECALC_00_15(0x30,Y12) \
	PRECALC_16_31(Y8,Y12,Y13,Y14,Y15,0,0x80) \
	PRECALC_16_31(Y7,Y8,Y12,Y13,Y14,0x20,0xa0) \
	PRECALC_16_31(Y5,Y7,Y8,Y12,Y13,0x20,0xc0) \
	PRECALC_16_31(Y3,Y5,Y7,Y8,Y12,0x20,0xe0) \
	PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x20,0x100) \
	PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x20,0x120) \
	PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x40,0x140) \
	PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x40,0x160) \
	PRECALC_32_79(Y8,Y12,Y13,Y15,Y7,0x40,0x180) \
	PRECALC_32_79(Y7,Y8,Y12,Y14,Y5,0x40,0x1a0) \
	PRECALC_32_79(Y5,Y7,Y8,Y13,Y3,0x40,0x1c0) \
	PRECALC_32_79(Y3,Y5,Y7,Y12,Y15,0x60,0x1e0) \
	PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x60,0x200) \
	PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x60,0x220) \
	PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x60,0x240) \
	PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x60,0x260)

// Macros calculating individual rounds have general form
// CALC_ROUND_PRE + PRECALC_ROUND + CALC_ROUND_POST
// CALC_ROUND_{PRE,POST} macros follow

#define CALC_F1_PRE(OFFSET,REG_A,REG_B,REG_C,REG_E) \
	ADDL	OFFSET(R15), REG_E \
	ANDNL	REG_C, REG_A, BP \
	LEAL	(REG_E)(REG_B*1), REG_E \ // Add F from the previous round
	RORXL	$0x1b, REG_A, R12 \
	RORXL	$2, REG_A, REG_B // for next round

// Calculate F for the next round
#define CALC_F1_POST(REG_A,REG_B,REG_E) \
	ANDL	REG_B, REG_A \ // b&c
	XORL	BP, REG_A \ // F1 = (b&c) ^ (~b&d)
	LEAL	(REG_E)(R12*1), REG_E // E += A >>> 5

// Registers are cyclically rotated DX -> AX -> DI -> SI -> BX -> CX
#define CALC_0 \
	MOVL	SI, BX \ // Precalculating first round
	RORXL	$2, SI, SI \
	ANDNL	AX, BX, BP \
	ANDL	DI, BX \
	XORL	BP, BX \
	CALC_F1_PRE(0x0,CX,BX,DI,DX) \
	PRECALC_0(0x80) \
	CALC_F1_POST(CX,SI,DX)

#define CALC_1 \
	CALC_F1_PRE(0x4,DX,CX,SI,AX) \
	PRECALC_1(0x80) \
	CALC_F1_POST(DX,BX,AX)

#define CALC_2 \
	CALC_F1_PRE(0x8,AX,DX,BX,DI) \
	PRECALC_2(Y15) \
	CALC_F1_POST(AX,CX,DI)

#define CALC_3 \
	CALC_F1_PRE(0xc,DI,AX,CX,SI) \
	CALC_F1_POST(DI,DX,SI)

#define CALC_4 \
	CALC_F1_PRE(0x20,SI,DI,DX,BX) \
	PRECALC_4(Y15,0x0) \
	CALC_F1_POST(SI,AX,BX)

#define CALC_5 \
	CALC_F1_PRE(0x24,BX,SI,AX,CX) \
	CALC_F1_POST(BX,DI,CX)

#define CALC_6 \
	CALC_F1_PRE(0x28,CX,BX,DI,DX) \
	CALC_F1_POST(CX,SI,DX)

#define CALC_7 \
	CALC_F1_PRE(0x2c,DX,CX,SI,AX) \
	PRECALC_7(0x0) \
	CALC_F1_POST(DX,BX,AX)

#define CALC_8 \
	CALC_F1_PRE(0x40,AX,DX,BX,DI) \
	PRECALC_0(0x90) \
	CALC_F1_POST(AX,CX,DI)

#define CALC_9 \
	CALC_F1_PRE(0x44,DI,AX,CX,SI) \
	PRECALC_1(0x90) \
	CALC_F1_POST(DI,DX,SI)

#define CALC_10 \
	CALC_F1_PRE(0x48,SI,DI,DX,BX) \
	PRECALC_2(Y14) \
	CALC_F1_POST(SI,AX,BX)

#define CALC_11 \
	CALC_F1_PRE(0x4c,BX,SI,AX,CX) \
	CALC_F1_POST(BX,DI,CX)

#define CALC_12 \
	CALC_F1_PRE(0x60,CX,BX,DI,DX) \
	PRECALC_4(Y14,0x0) \
	CALC_F1_POST(CX,SI,DX)

#define CALC_13 \
	CALC_F1_PRE(0x64,DX,CX,SI,AX) \
	CALC_F1_POST(DX,BX,AX)

#define CALC_14 \
	CALC_F1_PRE(0x68,AX,DX,BX,DI) \
	CALC_F1_POST(AX,CX,DI)

#define CALC_15 \
	CALC_F1_PRE(0x6c,DI,AX,CX,SI) \
	PRECALC_7(0x10) \
	CALC_F1_POST(DI,DX,SI)

#define CALC_16 \
	CALC_F1_PRE(0x80,SI,DI,DX,BX) \
	PRECALC_0(0xa0) \
	CALC_F1_POST(SI,AX,BX)

#define CALC_17 \
	CALC_F1_PRE(0x84,BX,SI,AX,CX) \
	PRECALC_1(0xa0) \
	CALC_F1_POST(BX,DI,CX)

#define CALC_18 \
	CALC_F1_PRE(0x88,CX,BX,DI,DX) \
	PRECALC_2(Y13) \
	CALC_F1_POST(CX,SI,DX)

#define CALC_F2_PRE(OFFSET,REG_A,REG_B,REG_E) \
	ADDL	OFFSET(R15), REG_E \
	LEAL	(REG_E)(REG_B*1), REG_E \ // Add F from the previous round
	RORXL	$0x1b, REG_A, R12 \
	RORXL	$2, REG_A, REG_B // for next round

#define CALC_F2_POST(REG_A,REG_B,REG_C,REG_E) \
	XORL	REG_B, REG_A \
	ADDL	R12, REG_E \
	XORL	REG_C, REG_A

#define CALC_19 \
	CALC_F2_PRE(0x8c,DX,CX,AX) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_20 \
	CALC_F2_PRE(0xa0,AX,DX,DI) \
	PRECALC_4(Y13,0x0) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_21 \
	CALC_F2_PRE(0xa4,DI,AX,SI) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_22 \
	CALC_F2_PRE(0xa8,SI,DI,BX) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_23 \
	CALC_F2_PRE(0xac,BX,SI,CX) \
	PRECALC_7(0x20) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_24 \
	CALC_F2_PRE(0xc0,CX,BX,DX) \
	PRECALC_0(0xb0) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_25 \
	CALC_F2_PRE(0xc4,DX,CX,AX) \
	PRECALC_1(0xb0) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_26 \
	CALC_F2_PRE(0xc8,AX,DX,DI) \
	PRECALC_2(Y12) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_27 \
	CALC_F2_PRE(0xcc,DI,AX,SI) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_28 \
	CALC_F2_PRE(0xe0,SI,DI,BX) \
	PRECALC_4(Y12,0x0) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_29 \
	CALC_F2_PRE(0xe4,BX,SI,CX) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_30 \
	CALC_F2_PRE(0xe8,CX,BX,DX) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_31 \
	CALC_F2_PRE(0xec,DX,CX,AX) \
	PRECALC_7(0x30) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_32 \
	CALC_F2_PRE(0x100,AX,DX,DI) \
	PRECALC_16(Y15,Y14,Y12,Y8) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_33 \
	CALC_F2_PRE(0x104,DI,AX,SI) \
	PRECALC_17(Y15,Y13,Y8) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_34 \
	CALC_F2_PRE(0x108,SI,DI,BX) \
	PRECALC_18(Y8) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_35 \
	CALC_F2_PRE(0x10c,BX,SI,CX) \
	PRECALC_19(Y8) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_36 \
	CALC_F2_PRE(0x120,CX,BX,DX) \
	PRECALC_20(Y8) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_37 \
	CALC_F2_PRE(0x124,DX,CX,AX) \
	PRECALC_21(Y8) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_38 \
	CALC_F2_PRE(0x128,AX,DX,DI) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_F3_PRE(OFFSET,REG_E) \
	ADDL	OFFSET(R15), REG_E

#define CALC_F3_POST(REG_A,REG_B,REG_C,REG_E,REG_TB) \
	LEAL	(REG_E)(REG_TB*1), REG_E \ // Add F from the previous round
	MOVL	REG_B, BP \
	ORL	REG_A, BP \
	RORXL	$0x1b, REG_A, R12 \
	RORXL	$2, REG_A, REG_TB \
	ANDL	REG_C, BP \ // Calculate F for the next round
	ANDL	REG_B, REG_A \
	ORL	BP, REG_A \
	ADDL	R12, REG_E

#define CALC_39 \
	CALC_F3_PRE(0x12c,SI) \
	PRECALC_23(Y8,0x0,0x80) \
	CALC_F3_POST(DI,DX,CX,SI,AX)

#define CALC_40 \
	CALC_F3_PRE(0x140,BX) \
	PRECALC_16(Y14,Y13,Y8,Y7) \
	CALC_F3_POST(SI,AX,DX,BX,DI)

#define CALC_41 \
	CALC_F3_PRE(0x144,CX) \
	PRECALC_17(Y14,Y12,Y7) \
	CALC_F3_POST(BX,DI,AX,CX,SI)

#define CALC_42 \
	CALC_F3_PRE(0x148,DX) \
	PRECALC_18(Y7) \
	CALC_F3_POST(CX,SI,DI,DX,BX)

#define CALC_43 \
	CALC_F3_PRE(0x14c,AX) \
	PRECALC_19(Y7) \
	CALC_F3_POST(DX,BX,SI,AX,CX)

#define CALC_44 \
	CALC_F3_PRE(0x160,DI) \
	PRECALC_20(Y7) \
	CALC_F3_POST(AX,CX,BX,DI,DX)

#define CALC_45 \
	CALC_F3_PRE(0x164,SI) \
	PRECALC_21(Y7) \
	CALC_F3_POST(DI,DX,CX,SI,AX)

#define CALC_46 \
	CALC_F3_PRE(0x168,BX) \
	CALC_F3_POST(SI,AX,DX,BX,DI)

#define CALC_47 \
	CALC_F3_PRE(0x16c,CX) \
	VPXOR	Y9, Y0, Y7 \
	VPADDD	0x20(R8), Y7, Y0 \
	VMOVDQU	Y0, 0xa0(R14) \
	CALC_F3_POST(BX,DI,AX,CX,SI)

#define CALC_48 \
	CALC_F3_PRE(0x180,DX) \
	PRECALC_16(Y13,Y12,Y7,Y5) \
	CALC_F3_POST(CX,SI,DI,DX,BX)

#define CALC_49 \
	CALC_F3_PRE(0x184,AX) \
	PRECALC_17(Y13,Y8,Y5) \
	CALC_F3_POST(DX,BX,SI,AX,CX)

#define CALC_50 \
	CALC_F3_PRE(0x188,DI) \
	PRECALC_18(Y5) \
	CALC_F3_POST(AX,CX,BX,DI,DX)

#define CALC_51 \
	CALC_F3_PRE(0x18c,SI) \
	PRECALC_19(Y5) \
	CALC_F3_POST(DI,DX,CX,SI,AX)

#define CALC_52 \
	CALC_F3_PRE(0x1a0,BX) \
	PRECALC_20(Y5) \
	CALC_F3_POST(SI,AX,DX,BX,DI)

#define CALC_53 \
	CALC_F3_PRE(0x1a4,CX) \
	PRECALC_21(Y5) \
	CALC_F3_POST(BX,DI,AX,CX,SI)

#define CALC_54 \
	CALC_F3_PRE(0x1a8,DX) \
	CALC_F3_POST(CX,SI,DI,DX,BX)

#define CALC_55 \
	CALC_F3_PRE(0x1ac,AX) \
	PRECALC_23(Y5,0x20,0xc0) \
	CALC_F3_POST(DX,BX,SI,AX,CX)

#define CALC_56 \
	CALC_F3_PRE(0x1c0,DI) \
	PRECALC_16(Y12,Y8,Y5,Y3) \
	CALC_F3_POST(AX,CX,BX,DI,DX)

#define CALC_57 \
	CALC_F3_PRE(0x1c4,SI) \
	PRECALC_17(Y12,Y7,Y3) \
	CALC_F3_POST(DI,DX,CX,SI,AX)

#define CALC_58 \
	CALC_F3_PRE(0x1c8,BX) \
	PRECALC_18(Y3) \
	CALC_F3_POST(SI,AX,DX,BX,DI)

#define CALC_59 \
	CALC_F2_PRE(0x1cc,BX,SI,CX) \
	PRECALC_19(Y3) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_60 \
	CALC_F2_PRE(0x1e0,CX,BX,DX) \
	PRECALC_20(Y3) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_61 \
	CALC_F2_PRE(0x1e4,DX,CX,AX) \
	PRECALC_21(Y3) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_62 \
	CALC_F2_PRE(0x1e8,AX,DX,DI) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_63 \
	CALC_F2_PRE(0x1ec,DI,AX,SI) \
	PRECALC_23(Y3,0x20,0xe0) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_64 \
	CALC_F2_PRE(0x200,SI,DI,BX) \
	PRECALC_32(Y5,Y3) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_65 \
	CALC_F2_PRE(0x204,BX,SI,CX) \
	PRECALC_33(Y14,Y15) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_66 \
	CALC_F2_PRE(0x208,CX,BX,DX) \
	PRECALC_34(Y8) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_67 \
	CALC_F2_PRE(0x20c,DX,CX,AX) \
	PRECALC_35(Y15) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_68 \
	CALC_F2_PRE(0x220,AX,DX,DI) \
	PRECALC_36(Y15) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_69 \
	CALC_F2_PRE(0x224,DI,AX,SI) \
	PRECALC_37(Y15) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_70 \
	CALC_F2_PRE(0x228,SI,DI,BX) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_71 \
	CALC_F2_PRE(0x22c,BX,SI,CX) \
	PRECALC_39(Y15,0x20,0x100) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_72 \
	CALC_F2_PRE(0x240,CX,BX,DX) \
	PRECALC_32(Y3,Y15) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_73 \
	CALC_F2_PRE(0x244,DX,CX,AX) \
	PRECALC_33(Y13,Y14) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_74 \
	CALC_F2_PRE(0x248,AX,DX,DI) \
	PRECALC_34(Y7) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_75 \
	CALC_F2_PRE(0x24c,DI,AX,SI) \
	PRECALC_35(Y14) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_76 \
	CALC_F2_PRE(0x260,SI,DI,BX) \
	PRECALC_36(Y14) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_77 \
	CALC_F2_PRE(0x264,BX,SI,CX) \
	PRECALC_37(Y14) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_78 \
	CALC_F2_PRE(0x268,CX,BX,DX) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_79 \
	ADDL	0x26c(R15), AX \
	LEAL	(AX)(CX*1), AX \
	RORXL	$0x1b, DX, R12 \
	PRECALC_39(Y14,0x20,0x120) \
	ADDL	R12, AX

// Similar to CALC_0
#define CALC_80 \
	MOVL	CX, DX \
	RORXL	$2, CX, CX \
	ANDNL	SI, DX, BP \
	ANDL	BX, DX \
	XORL	BP, DX \
	CALC_F1_PRE(0x10,AX,DX,BX,DI) \
	PRECALC_32(Y15,Y14) \
	CALC_F1_POST(AX,CX,DI)

#define CALC_81 \
	CALC_F1_PRE(0x14,DI,AX,CX,SI) \
	PRECALC_33(Y12,Y13) \
	CALC_F1_POST(DI,DX,SI)

#define CALC_82 \
	CALC_F1_PRE(0x18,SI,DI,DX,BX) \
	PRECALC_34(Y5) \
	CALC_F1_POST(SI,AX,BX)

#define CALC_83 \
	CALC_F1_PRE(0x1c,BX,SI,AX,CX) \
	PRECALC_35(Y13) \
	CALC_F1_POST(BX,DI,CX)

#define CALC_84 \
	CALC_F1_PRE(0x30,CX,BX,DI,DX) \
	PRECALC_36(Y13) \
	CALC_F1_POST(CX,SI,DX)

#define CALC_85 \
	CALC_F1_PRE(0x34,DX,CX,SI,AX) \
	PRECALC_37(Y13) \
	CALC_F1_POST(DX,BX,AX)

#define CALC_86 \
	CALC_F1_PRE(0x38,AX,DX,BX,DI) \
	CALC_F1_POST(AX,CX,DI)

#define CALC_87 \
	CALC_F1_PRE(0x3c,DI,AX,CX,SI) \
	PRECALC_39(Y13,0x40,0x140) \
	CALC_F1_POST(DI,DX,SI)

#define CALC_88 \
	CALC_F1_PRE(0x50,SI,DI,DX,BX) \
	PRECALC_32(Y14,Y13) \
	CALC_F1_POST(SI,AX,BX)

#define CALC_89 \
	CALC_F1_PRE(0x54,BX,SI,AX,CX) \
	PRECALC_33(Y8,Y12) \
	CALC_F1_POST(BX,DI,CX)

#define CALC_90 \
	CALC_F1_PRE(0x58,CX,BX,DI,DX) \
	PRECALC_34(Y3) \
	CALC_F1_POST(CX,SI,DX)

#define CALC_91 \
	CALC_F1_PRE(0x5c,DX,CX,SI,AX) \
	PRECALC_35(Y12) \
	CALC_F1_POST(DX,BX,AX)

#define CALC_92 \
	CALC_F1_PRE(0x70,AX,DX,BX,DI) \
	PRECALC_36(Y12) \
	CALC_F1_POST(AX,CX,DI)

#define CALC_93 \
	CALC_F1_PRE(0x74,DI,AX,CX,SI) \
	PRECALC_37(Y12) \
	CALC_F1_POST(DI,DX,SI)

#define CALC_94 \
	CALC_F1_PRE(0x78,SI,DI,DX,BX) \
	CALC_F1_POST(SI,AX,BX)

#define CALC_95 \
	CALC_F1_PRE(0x7c,BX,SI,AX,CX) \
	PRECALC_39(Y12,0x40,0x160) \
	CALC_F1_POST(BX,DI,CX)

#define CALC_96 \
	CALC_F1_PRE(0x90,CX,BX,DI,DX) \
	PRECALC_32(Y13,Y12) \
	CALC_F1_POST(CX,SI,DX)

#define CALC_97 \
	CALC_F1_PRE(0x94,DX,CX,SI,AX) \
	PRECALC_33(Y7,Y8) \
	CALC_F1_POST(DX,BX,AX)

#define CALC_98 \
	CALC_F1_PRE(0x98,AX,DX,BX,DI) \
	PRECALC_34(Y15) \
	CALC_F1_POST(AX,CX,DI)

#define CALC_99 \
	CALC_F2_PRE(0x9c,DI,AX,SI) \
	PRECALC_35(Y8) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_100 \
	CALC_F2_PRE(0xb0,SI,DI,BX) \
	PRECALC_36(Y8) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_101 \
	CALC_F2_PRE(0xb4,BX,SI,CX) \
	PRECALC_37(Y8) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_102 \
	CALC_F2_PRE(0xb8,CX,BX,DX) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_103 \
	CALC_F2_PRE(0xbc,DX,CX,AX) \
	PRECALC_39(Y8,0x40,0x180) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_104 \
	CALC_F2_PRE(0xd0,AX,DX,DI) \
	PRECALC_32(Y12,Y8) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_105 \
	CALC_F2_PRE(0xd4,DI,AX,SI) \
	PRECALC_33(Y5,Y7) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_106 \
	CALC_F2_PRE(0xd8,SI,DI,BX) \
	PRECALC_34(Y14) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_107 \
	CALC_F2_PRE(0xdc,BX,SI,CX) \
	PRECALC_35(Y7) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_108 \
	CALC_F2_PRE(0xf0,CX,BX,DX) \
	PRECALC_36(Y7) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_109 \
	CALC_F2_PRE(0xf4,DX,CX,AX) \
	PRECALC_37(Y7) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_110 \
	CALC_F2_PRE(0xf8,AX,DX,DI) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_111 \
	CALC_F2_PRE(0xfc,DI,AX,SI) \
	PRECALC_39(Y7,0x40,0x1a0) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_112 \
	CALC_F2_PRE(0x110,SI,DI,BX) \
	PRECALC_32(Y8,Y7) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_113 \
	CALC_F2_PRE(0x114,BX,SI,CX) \
	PRECALC_33(Y3,Y5) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_114 \
	CALC_F2_PRE(0x118,CX,BX,DX) \
	PRECALC_34(Y13) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_115 \
	CALC_F2_PRE(0x11c,DX,CX,AX) \
	PRECALC_35(Y5) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_116 \
	CALC_F2_PRE(0x130,AX,DX,DI) \
	PRECALC_36(Y5) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_117 \
	CALC_F2_PRE(0x134,DI,AX,SI) \
	PRECALC_37(Y5) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_118 \
	CALC_F2_PRE(0x138,SI,DI,BX) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_119 \
	CALC_F3_PRE(0x13c,CX) \
	PRECALC_39(Y5,0x40,0x1c0) \
	CALC_F3_POST(BX,DI,AX,CX,SI)

#define CALC_120 \
	CALC_F3_PRE(0x150,DX) \
	PRECALC_32(Y7,Y5) \
	CALC_F3_POST(CX,SI,DI,DX,BX)

#define CALC_121 \
	CALC_F3_PRE(0x154,AX) \
	PRECALC_33(Y15,Y3) \
	CALC_F3_POST(DX,BX,SI,AX,CX)

#define CALC_122 \
	CALC_F3_PRE(0x158,DI) \
	PRECALC_34(Y12) \
	CALC_F3_POST(AX,CX,BX,DI,DX)

#define CALC_123 \
	CALC_F3_PRE(0x15c,SI) \
	PRECALC_35(Y3) \
	CALC_F3_POST(DI,DX,CX,SI,AX)

#define CALC_124 \
	CALC_F3_PRE(0x170,BX) \
	PRECALC_36(Y3) \
	CALC_F3_POST(SI,AX,DX,BX,DI)

#define CALC_125 \
	CALC_F3_PRE(0x174,CX) \
	PRECALC_37(Y3) \
	CALC_F3_POST(BX,DI,AX,CX,SI)

#define CALC_126 \
	CALC_F3_PRE(0x178,DX) \
	CALC_F3_POST(CX,SI,DI,DX,BX)

#define CALC_127 \
	CALC_F3_PRE(0x17c,AX) \
	PRECALC_39(Y3,0x60,0x1e0) \
	CALC_F3_POST(DX,BX,SI,AX,CX)

#define CALC_128 \
	CALC_F3_PRE(0x190,DI) \
	PRECALC_32(Y5,Y3) \
	CALC_F3_POST(AX,CX,BX,DI,DX)

#define CALC_129 \
	CALC_F3_PRE(0x194,SI) \
	PRECALC_33(Y14,Y15) \
	CALC_F3_POST(DI,DX,CX,SI,AX)

#define CALC_130 \
	CALC_F3_PRE(0x198,BX) \
	PRECALC_34(Y8) \
	CALC_F3_POST(SI,AX,DX,BX,DI)

#define CALC_131 \
	CALC_F3_PRE(0x19c,CX) \
	PRECALC_35(Y15) \
	CALC_F3_POST(BX,DI,AX,CX,SI)

#define CALC_132 \
	CALC_F3_PRE(0x1b0,DX) \
	PRECALC_36(Y15) \
	CALC_F3_POST(CX,SI,DI,DX,BX)

#define CALC_133 \
	CALC_F3_PRE(0x1b4,AX) \
	PRECALC_37(Y15) \
	CALC_F3_POST(DX,BX,SI,AX,CX)

#define CALC_134 \
	CALC_F3_PRE(0x1b8,DI) \
	CALC_F3_POST(AX,CX,BX,DI,DX)

#define CALC_135 \
	CALC_F3_PRE(0x1bc,SI) \
	PRECALC_39(Y15,0x60,0x200) \
	CALC_F3_POST(DI,DX,CX,SI,AX)

#define CALC_136 \
	CALC_F3_PRE(0x1d0,BX) \
	PRECALC_32(Y3,Y15) \
	CALC_F3_POST(SI,AX,DX,BX,DI)

#define CALC_137 \
	CALC_F3_PRE(0x1d4,CX) \
	PRECALC_33(Y13,Y14) \
	CALC_F3_POST(BX,DI,AX,CX,SI)

#define CALC_138 \
	CALC_F3_PRE(0x1d8,DX) \
	PRECALC_34(Y7) \
	CALC_F3_POST(CX,SI,DI,DX,BX)

#define CALC_139 \
	CALC_F2_PRE(0x1dc,DX,CX,AX) \
	PRECALC_35(Y14) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_140 \
	CALC_F2_PRE(0x1f0,AX,DX,DI) \
	PRECALC_36(Y14) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_141 \
	CALC_F2_PRE(0x1f4,DI,AX,SI) \
	PRECALC_37(Y14) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_142 \
	CALC_F2_PRE(0x1f8,SI,DI,BX) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_143 \
	CALC_F2_PRE(0x1fc,BX,SI,CX) \
	PRECALC_39(Y14,0x60,0x220) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_144 \
	CALC_F2_PRE(0x210,CX,BX,DX) \
	PRECALC_32(Y15,Y14) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_145 \
	CALC_F2_PRE(0x214,DX,CX,AX) \
	PRECALC_33(Y12,Y13) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_146 \
	CALC_F2_PRE(0x218,AX,DX,DI) \
	PRECALC_34(Y5) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_147 \
	CALC_F2_PRE(0x21c,DI,AX,SI) \
	PRECALC_35(Y13) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_148 \
	CALC_F2_PRE(0x230,SI,DI,BX) \
	PRECALC_36(Y13) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_149 \
	CALC_F2_PRE(0x234,BX,SI,CX) \
	PRECALC_37(Y13) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_150 \
	CALC_F2_PRE(0x238,CX,BX,DX) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_151 \
	CALC_F2_PRE(0x23c,DX,CX,AX) \
	PRECALC_39(Y13,0x60,0x240) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_152 \
	CALC_F2_PRE(0x250,AX,DX,DI) \
	PRECALC_32(Y14,Y13) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_153 \
	CALC_F2_PRE(0x254,DI,AX,SI) \
	PRECALC_33(Y8,Y12) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_154 \
	CALC_F2_PRE(0x258,SI,DI,BX) \
	PRECALC_34(Y3) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_155 \
	CALC_F2_PRE(0x25c,BX,SI,CX) \
	PRECALC_35(Y12) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_156 \
	CALC_F2_PRE(0x270,CX,BX,DX) \
	PRECALC_36(Y12) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_157 \
	CALC_F2_PRE(0x274,DX,CX,AX) \
	PRECALC_37(Y12) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_158 \
	CALC_F2_PRE(0x278,AX,DX,DI) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_159 \
	ADDL	0x27c(R15), SI \
	LEAL	(SI)(AX*1), SI \
	RORXL	$0x1b, DI, R12 \
	PRECALC_39(Y12,0x60,0x260) \
	ADDL	R12, SI


#define CALC \
	MOVL	(R9), CX \
	MOVL	4(R9), SI \
	MOVL	8(R9), DI \
	MOVL	12(R9), AX \
	MOVL	16(R9), DX \
	MOVQ	SP, R14 \
	LEAQ	(2*4*80+32)(SP), R15 \
	PRECALC \ // Precalc WK for first 2 blocks
	XCHGQ	R15, R14 \
loop: \ // this loop is unrolled
	CMPQ	R10, R8 \ // we use R8 value (set below) as a signal of a last block
	JNE	begin \
	VZEROUPPER \
	RET \
begin: \
	CALC_0 \
	CALC_1 \
	CALC_2 \
	CALC_3 \
	CALC_4 \
	CALC_5 \
	CALC_6 \
	CALC_7 \
	CALC_8 \
	CALC_9 \
	CALC_10 \
	CALC_11 \
	CALC_12 \
	CALC_13 \
	CALC_14 \
	CALC_15 \
	CALC_16 \
	CALC_17 \
	CALC_18 \
	CALC_19 \
	CALC_20 \
	CALC_21 \
	CALC_22 \
	CALC_23 \
	CALC_24 \
	CALC_25 \
	CALC_26 \
	CALC_27 \
	CALC_28 \
	CALC_29 \
	CALC_30 \
	CALC_31 \
	CALC_32 \
	CALC_33 \
	CALC_34 \
	CALC_35 \
	CALC_36 \
	CALC_37 \
	CALC_38 \
	CALC_39 \
	CALC_40 \
	CALC_41 \
	CALC_42 \
	CALC_43 \
	CALC_44 \
	CALC_45 \
	CALC_46 \
	CALC_47 \
	CALC_48 \
	CALC_49 \
	CALC_50 \
	CALC_51 \
	CALC_52 \
	CALC_53 \
	CALC_54 \
	CALC_55 \
	CALC_56 \
	CALC_57 \
	CALC_58 \
	CALC_59 \
	ADDQ	$128, R10 \ // move to next even-64-byte block
	CMPQ	R10, R11 \ // is current block the last one?
	CMOVQCC	R8, R10 \ // signal the last iteration smartly
	CALC_60 \
	CALC_61 \
	CALC_62 \
	CALC_63 \
	CALC_64 \
	CALC_65 \
	CALC_66 \
	CALC_67 \
	CALC_68 \
	CALC_69 \
	CALC_70 \
	CALC_71 \
	CALC_72 \
	CALC_73 \
	CALC_74 \
	CALC_75 \
	CALC_76 \
	CALC_77 \
	CALC_78 \
	CALC_79 \
	UPDATE_HASH(AX,DX,BX,SI,DI) \
	CMPQ	R10, R8 \ // is current block the last one?
	JE	loop \
	MOVL	DX, CX \
	CALC_80 \
	CALC_81 \
	CALC_82 \
	CALC_83 \
	CALC_84 \
	CALC_85 \
	CALC_86 \
	CALC_87 \
	CALC_88 \
	CALC_89 \
	CALC_90 \
	CALC_91 \
	CALC_92 \
	CALC_93 \
	CALC_94 \
	CALC_95 \
	CALC_96 \
	CALC_97 \
	CALC_98 \
	CALC_99 \
	CALC_100 \
	CALC_101 \
	CALC_102 \
	CALC_103 \
	CALC_104 \
	CALC_105 \
	CALC_106 \
	CALC_107 \
	CALC_108 \
	CALC_109 \
	CALC_110 \
	CALC_111 \
	CALC_112 \
	CALC_113 \
	CALC_114 \
	CALC_115 \
	CALC_116 \
	CALC_117 \
	CALC_118 \
	CALC_119 \
	CALC_120 \
	CALC_121 \
	CALC_122 \
	CALC_123 \
	CALC_124 \
	CALC_125 \
	CALC_126 \
	CALC_127 \
	CALC_128 \
	CALC_129 \
	CALC_130 \
	CALC_131 \
	CALC_132 \
	CALC_133 \
	CALC_134 \
	CALC_135 \
	CALC_136 \
	CALC_137 \
	CALC_138 \
	CALC_139 \
	ADDQ	$128, R13 \ // move to next even-64-byte block
	CMPQ	R13, R11 \ // is current block the last one?
	CMOVQCC	R8, R10 \
	CALC_140 \
	CALC_141 \
	CALC_142 \
	CALC_143 \
	CALC_144 \
	CALC_145 \
	CALC_146 \
	CALC_147 \
	CALC_148 \
	CALC_149 \
	CALC_150 \
	CALC_151 \
	CALC_152 \
	CALC_153 \
	CALC_154 \
	CALC_155 \
	CALC_156 \
	CALC_157 \
	CALC_158 \
	CALC_159 \
	UPDATE_HASH(SI,DI,DX,CX,BX) \
	MOVL	SI, R12 \ // Reset state for AVX2 reg permutation
	MOVL	DI, SI \
	MOVL	DX, DI \
	MOVL	BX, DX \
	MOVL	CX, AX \
	MOVL	R12, CX \
	XCHGQ	R15, R14 \
	JMP	loop


TEXT ·blockAVX2(SB),$1408-32

	MOVQ	dig+0(FP), DI
	MOVQ	p_base+8(FP), SI
	MOVQ	p_len+16(FP), DX
	SHRQ	$6, DX
	SHLQ	$6, DX

	MOVQ	$K_XMM_AR<>(SB), R8

	MOVQ	DI, R9
	MOVQ	SI, R10
	LEAQ	64(SI), R13

	ADDQ	SI, DX
	ADDQ	$64, DX
	MOVQ	DX, R11

	CMPQ	R13, R11
	CMOVQCC	R8, R13

	VMOVDQU	BSWAP_SHUFB_CTL<>(SB), Y10

	CALC // RET is inside macros
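// Round constants, replicated so a single VPADDD can add the same K to every
// lane: each of the four SHA-1 constants occupies one 32-byte row (eight
// copies).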
DATA K_XMM_AR<>+0x00(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x04(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x08(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x0c(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x10(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x14(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x18(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x1c(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x20(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x24(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x28(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x2c(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x30(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x34(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x38(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x3c(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x40(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x44(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x48(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x4c(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x50(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x54(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x58(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x5c(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x60(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x64(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x68(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x6c(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x70(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x74(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x78(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x7c(SB)/4,$0xca62c1d6
GLOBL K_XMM_AR<>(SB),RODATA,$128
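// VPSHUFB mask (loaded into Y10 above) that byte-swaps each 32-bit word,
// turning the big-endian message words into host-order lane values.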
DATA BSWAP_SHUFB_CTL<>+0x00(SB)/4,$0x00010203
DATA BSWAP_SHUFB_CTL<>+0x04(SB)/4,$0x04050607
DATA BSWAP_SHUFB_CTL<>+0x08(SB)/4,$0x08090a0b
DATA BSWAP_SHUFB_CTL<>+0x0c(SB)/4,$0x0c0d0e0f
DATA BSWAP_SHUFB_CTL<>+0x10(SB)/4,$0x00010203
DATA BSWAP_SHUFB_CTL<>+0x14(SB)/4,$0x04050607
DATA BSWAP_SHUFB_CTL<>+0x18(SB)/4,$0x08090a0b
DATA BSWAP_SHUFB_CTL<>+0x1c(SB)/4,$0x0c0d0e0f
GLOBL BSWAP_SHUFB_CTL<>(SB),RODATA,$32