github.com/ice-blockchain/go/src@v0.0.0-20240403114104-1564d284e521/crypto/sha1/sha1block_amd64.s

// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// AVX2 version by Intel, same algorithm as code in Linux kernel:
// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha1_avx2_x86_64_asm.S
// Authors:
// Ilya Albrekht <ilya.albrekht@intel.com>
// Maxim Locktyukhin <maxim.locktyukhin@intel.com>
// Ronen Zohar <ronen.zohar@intel.com>
// Chandramouli Narayanan <mouli@linux.intel.com>

//go:build !purego

#include "textflag.h"

// SHA-1 block routine. See sha1block.go for Go equivalent.
//
// There are 80 rounds of 4 types:
//   - rounds 0-15 are type 1 and load data (ROUND1 macro).
//   - rounds 16-19 are type 1 and do not load data (ROUND1x macro).
//   - rounds 20-39 are type 2 and do not load data (ROUND2 macro).
//   - rounds 40-59 are type 3 and do not load data (ROUND3 macro).
//   - rounds 60-79 are type 4 and do not load data (ROUND4 macro).
//
// Each round loads or shuffles the data, then computes a per-round
// function of b, c, d, and then mixes the result into and rotates the
// five registers a, b, c, d, e holding the intermediate results.
//
// The register rotation is implemented by rotating the arguments to
// the round macros instead of by explicit move instructions.
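//
// As a reading aid, one type-1 round in Go looks roughly like this
// (a minimal sketch of what ROUND1 = LOAD/FUNC1/MIX expands to, using
// math/bits; the authoritative Go fallback lives in sha1block.go):
//
//	f := ((c ^ d) & b) ^ d                                    // FUNC1, same as (b&c)|(^b&d)
//	e += bits.RotateLeft32(a, 5) + f + w[i&0xf] + 0x5A827999  // MIX
//	b = bits.RotateLeft32(b, 30)
//	// the next round then reads the registers as (e, a, b, c, d)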

#define LOAD(index) \
	MOVL (index*4)(SI), R10; \
	BSWAPL R10; \
	MOVL R10, (index*4)(SP)

#define SHUFFLE(index) \
	MOVL (((index)&0xf)*4)(SP), R10; \
	XORL (((index-3)&0xf)*4)(SP), R10; \
	XORL (((index-8)&0xf)*4)(SP), R10; \
	XORL (((index-14)&0xf)*4)(SP), R10; \
	ROLL $1, R10; \
	MOVL R10, (((index)&0xf)*4)(SP)

#define FUNC1(a, b, c, d, e) \
	MOVL d, R9; \
	XORL c, R9; \
	ANDL b, R9; \
	XORL d, R9

#define FUNC2(a, b, c, d, e) \
	MOVL b, R9; \
	XORL c, R9; \
	XORL d, R9

#define FUNC3(a, b, c, d, e) \
	MOVL b, R8; \
	ORL c, R8; \
	ANDL d, R8; \
	MOVL b, R9; \
	ANDL c, R9; \
	ORL R8, R9

#define FUNC4 FUNC2

#define MIX(a, b, c, d, e, const) \
	ROLL $30, b; \
	ADDL R9, e; \
	MOVL a, R8; \
	ROLL $5, R8; \
	LEAL const(e)(R10*1), e; \
	ADDL R8, e

#define ROUND1(a, b, c, d, e, index) \
	LOAD(index); \
	FUNC1(a, b, c, d, e); \
	MIX(a, b, c, d, e, 0x5A827999)

#define ROUND1x(a, b, c, d, e, index) \
	SHUFFLE(index); \
	FUNC1(a, b, c, d, e); \
	MIX(a, b, c, d, e, 0x5A827999)

#define ROUND2(a, b, c, d, e, index) \
	SHUFFLE(index); \
	FUNC2(a, b, c, d, e); \
	MIX(a, b, c, d, e, 0x6ED9EBA1)

#define ROUND3(a, b, c, d, e, index) \
	SHUFFLE(index); \
	FUNC3(a, b, c, d, e); \
	MIX(a, b, c, d, e, 0x8F1BBCDC)

#define ROUND4(a, b, c, d, e, index) \
	SHUFFLE(index); \
	FUNC4(a, b, c, d, e); \
	MIX(a, b, c, d, e, 0xCA62C1D6)

TEXT ·blockAMD64(SB),NOSPLIT,$64-32
	MOVQ dig+0(FP), BP
	MOVQ p_base+8(FP), SI
	MOVQ p_len+16(FP), DX
	SHRQ $6, DX
	SHLQ $6, DX

	LEAQ (SI)(DX*1), DI
	MOVL (0*4)(BP), AX
	MOVL (1*4)(BP), BX
	MOVL (2*4)(BP), CX
	MOVL (3*4)(BP), DX
	MOVL (4*4)(BP), BP

	CMPQ SI, DI
	JEQ end

loop:
	MOVL AX, R11
	MOVL BX, R12
	MOVL CX, R13
	MOVL DX, R14
	MOVL BP, R15

	ROUND1(AX, BX, CX, DX, BP, 0)
	ROUND1(BP, AX, BX, CX, DX, 1)
	ROUND1(DX, BP, AX, BX, CX, 2)
	ROUND1(CX, DX, BP, AX, BX, 3)
	ROUND1(BX, CX, DX, BP, AX, 4)
	ROUND1(AX, BX, CX, DX, BP, 5)
	ROUND1(BP, AX, BX, CX, DX, 6)
	ROUND1(DX, BP, AX, BX, CX, 7)
	ROUND1(CX, DX, BP, AX, BX, 8)
	ROUND1(BX, CX, DX, BP, AX, 9)
	ROUND1(AX, BX, CX, DX, BP, 10)
	ROUND1(BP, AX, BX, CX, DX, 11)
	ROUND1(DX, BP, AX, BX, CX, 12)
	ROUND1(CX, DX, BP, AX, BX, 13)
	ROUND1(BX, CX, DX, BP, AX, 14)
	ROUND1(AX, BX, CX, DX, BP, 15)

	ROUND1x(BP, AX, BX, CX, DX, 16)
	ROUND1x(DX, BP, AX, BX, CX, 17)
	ROUND1x(CX, DX, BP, AX, BX, 18)
	ROUND1x(BX, CX, DX, BP, AX, 19)

	ROUND2(AX, BX, CX, DX, BP, 20)
	ROUND2(BP, AX, BX, CX, DX, 21)
	ROUND2(DX, BP, AX, BX, CX, 22)
	ROUND2(CX, DX, BP, AX, BX, 23)
	ROUND2(BX, CX, DX, BP, AX, 24)
	ROUND2(AX, BX, CX, DX, BP, 25)
	ROUND2(BP, AX, BX, CX, DX, 26)
	ROUND2(DX, BP, AX, BX, CX, 27)
	ROUND2(CX, DX, BP, AX, BX, 28)
	ROUND2(BX, CX, DX, BP, AX, 29)
	ROUND2(AX, BX, CX, DX, BP, 30)
	ROUND2(BP, AX, BX, CX, DX, 31)
	ROUND2(DX, BP, AX, BX, CX, 32)
	ROUND2(CX, DX, BP, AX, BX, 33)
	ROUND2(BX, CX, DX, BP, AX, 34)
	ROUND2(AX, BX, CX, DX, BP, 35)
	ROUND2(BP, AX, BX, CX, DX, 36)
	ROUND2(DX, BP, AX, BX, CX, 37)
	ROUND2(CX, DX, BP, AX, BX, 38)
	ROUND2(BX, CX, DX, BP, AX, 39)

	ROUND3(AX, BX, CX, DX, BP, 40)
	ROUND3(BP, AX, BX, CX, DX, 41)
	ROUND3(DX, BP, AX, BX, CX, 42)
	ROUND3(CX, DX, BP, AX, BX, 43)
	ROUND3(BX, CX, DX, BP, AX, 44)
	ROUND3(AX, BX, CX, DX, BP, 45)
	ROUND3(BP, AX, BX, CX, DX, 46)
	ROUND3(DX, BP, AX, BX, CX, 47)
	ROUND3(CX, DX, BP, AX, BX, 48)
	ROUND3(BX, CX, DX, BP, AX, 49)
	ROUND3(AX, BX, CX, DX, BP, 50)
	ROUND3(BP, AX, BX, CX, DX, 51)
	ROUND3(DX, BP, AX, BX, CX, 52)
	ROUND3(CX, DX, BP, AX, BX, 53)
	ROUND3(BX, CX, DX, BP, AX, 54)
	ROUND3(AX, BX, CX, DX, BP, 55)
	ROUND3(BP, AX, BX, CX, DX, 56)
	ROUND3(DX, BP, AX, BX, CX, 57)
	ROUND3(CX, DX, BP, AX, BX, 58)
	ROUND3(BX, CX, DX, BP, AX, 59)

	ROUND4(AX, BX, CX, DX, BP, 60)
	ROUND4(BP, AX, BX, CX, DX, 61)
	ROUND4(DX, BP, AX, BX, CX, 62)
	ROUND4(CX, DX, BP, AX, BX, 63)
	ROUND4(BX, CX, DX, BP, AX, 64)
	ROUND4(AX, BX, CX, DX, BP, 65)
	ROUND4(BP, AX, BX, CX, DX, 66)
	ROUND4(DX, BP, AX, BX, CX, 67)
	ROUND4(CX, DX, BP, AX, BX, 68)
	ROUND4(BX, CX, DX, BP, AX, 69)
	ROUND4(AX, BX, CX, DX, BP, 70)
	ROUND4(BP, AX, BX, CX, DX, 71)
	ROUND4(DX, BP, AX, BX, CX, 72)
	ROUND4(CX, DX, BP, AX, BX, 73)
	ROUND4(BX, CX, DX, BP, AX, 74)
	ROUND4(AX, BX, CX, DX, BP, 75)
	ROUND4(BP, AX, BX, CX, DX, 76)
	ROUND4(DX, BP, AX, BX, CX, 77)
	ROUND4(CX, DX, BP, AX, BX, 78)
	ROUND4(BX, CX, DX, BP, AX, 79)

	ADDL R11, AX
	ADDL R12, BX
	ADDL R13, CX
	ADDL R14, DX
	ADDL R15, BP

	ADDQ $64, SI
	CMPQ SI, DI
	JB loop

end:
	MOVQ dig+0(FP), DI
	MOVL AX, (0*4)(DI)
	MOVL BX, (1*4)(DI)
	MOVL CX, (2*4)(DI)
	MOVL DX, (3*4)(DI)
	MOVL BP, (4*4)(DI)
	RET

// This is the implementation using AVX2, BMI1 and BMI2. It is based on:
// "SHA-1 implementation with Intel(R) AVX2 instruction set extensions"
// From http://software.intel.com/en-us/articles
// (look for improving-the-performance-of-the-secure-hash-algorithm-1)
// This implementation is 2x unrolled, and interleaves vector instructions,
// used to precompute W, with scalar computation of current round
// for optimal scheduling.
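//
// In Go terms, the vector half of the interleaving fills a temp buffer with
// W[i]+K values for a 64-byte block (two blocks at a time, one per 128-bit
// lane) while the scalar half consumes the buffer prepared for the previous
// pair. A minimal sketch of what gets precomputed, assuming wk is the temp
// buffer addressed through R14/R15 below:
//
//	K := [4]uint32{0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6}
//	for i := 0; i < 80; i++ {
//		if i >= 16 {
//			// rounds 16-31 use the standard recurrence; rounds 32-79 use the
//			// equivalent form w[i] = (w[i-6]^w[i-16]^w[i-28]^w[i-32]) rol 2
//			w[i] = bits.RotateLeft32(w[i-3]^w[i-8]^w[i-14]^w[i-16], 1)
//		}
//		wk[i] = w[i] + K[i/20]
//	}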

// Trivial helper macros.
#define UPDATE_HASH(A,TB,C,D,E) \
	ADDL (R9), A \
	MOVL A, (R9) \
	ADDL 4(R9), TB \
	MOVL TB, 4(R9) \
	ADDL 8(R9), C \
	MOVL C, 8(R9) \
	ADDL 12(R9), D \
	MOVL D, 12(R9) \
	ADDL 16(R9), E \
	MOVL E, 16(R9)

// Helper macros for PRECALC, which does precomputations
#define PRECALC_0(OFFSET) \
	VMOVDQU OFFSET(R10),X0

#define PRECALC_1(OFFSET) \
	VINSERTI128 $1, OFFSET(R13), Y0, Y0

#define PRECALC_2(YREG) \
	VPSHUFB Y10, Y0, YREG

#define PRECALC_4(YREG,K_OFFSET) \
	VPADDD K_OFFSET(R8), YREG, Y0

#define PRECALC_7(OFFSET) \
	VMOVDQU Y0, (OFFSET*2)(R14)

// Message scheduling pre-compute for rounds 0-15
// R13 is a pointer to even 64-byte block
// R10 is a pointer to odd 64-byte block
// R14 is a pointer to temp buffer
// X0 is used as temp register
// YREG is clobbered as part of computation
// OFFSET chooses 16 byte chunk within a block
// R8 is a pointer to constants block
// K_OFFSET chooses K constants relevant to this round
// X10 holds swap mask
#define PRECALC_00_15(OFFSET,YREG) \
	PRECALC_0(OFFSET) \
	PRECALC_1(OFFSET) \
	PRECALC_2(YREG) \
	PRECALC_4(YREG,0x0) \
	PRECALC_7(OFFSET)

// Helper macros for PRECALC_16_31
#define PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
	VPALIGNR $8, REG_SUB_16, REG_SUB_12, REG \ // w[i-14]
	VPSRLDQ $4, REG_SUB_4, Y0 // w[i-3]

#define PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
	VPXOR REG_SUB_8, REG, REG \
	VPXOR REG_SUB_16, Y0, Y0

#define PRECALC_18(REG) \
	VPXOR Y0, REG, REG \
	VPSLLDQ $12, REG, Y9

#define PRECALC_19(REG) \
	VPSLLD $1, REG, Y0 \
	VPSRLD $31, REG, REG

#define PRECALC_20(REG) \
	VPOR REG, Y0, Y0 \
	VPSLLD $2, Y9, REG

#define PRECALC_21(REG) \
	VPSRLD $30, Y9, Y9 \
	VPXOR REG, Y0, Y0

#define PRECALC_23(REG,K_OFFSET,OFFSET) \
	VPXOR Y9, Y0, REG \
	VPADDD K_OFFSET(R8), REG, Y0 \
	VMOVDQU Y0, (OFFSET)(R14)

// Message scheduling pre-compute for rounds 16-31
// calculating last 32 w[i] values in 8 XMM registers
// pre-calculate K+w[i] values and store to mem
// for later load by ALU add instruction.
// "brute force" vectorization for rounds 16-31 only
// due to w[i]->w[i-3] dependency.
// clobbers 5 input ymm registers REG_SUB*
// uses X0 and X9 as temp registers
// As always, R8 is a pointer to constants block
// and R14 is a pointer to temp buffer
#define PRECALC_16_31(REG,REG_SUB_4,REG_SUB_8,REG_SUB_12,REG_SUB_16,K_OFFSET,OFFSET) \
	PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
	PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
	PRECALC_18(REG) \
	PRECALC_19(REG) \
	PRECALC_20(REG) \
	PRECALC_21(REG) \
	PRECALC_23(REG,K_OFFSET,OFFSET)

// Helper macros for PRECALC_32_79
#define PRECALC_32(REG_SUB_8,REG_SUB_4) \
	VPALIGNR $8, REG_SUB_8, REG_SUB_4, Y0

#define PRECALC_33(REG_SUB_28,REG) \
	VPXOR REG_SUB_28, REG, REG

#define PRECALC_34(REG_SUB_16) \
	VPXOR REG_SUB_16, Y0, Y0

#define PRECALC_35(REG) \
	VPXOR Y0, REG, REG

#define PRECALC_36(REG) \
	VPSLLD $2, REG, Y0

#define PRECALC_37(REG) \
	VPSRLD $30, REG, REG \
	VPOR REG, Y0, REG

#define PRECALC_39(REG,K_OFFSET,OFFSET) \
	VPADDD K_OFFSET(R8), REG, Y0 \
	VMOVDQU Y0, (OFFSET)(R14)

// Message scheduling pre-compute for rounds 32-79
// In SHA-1 specification we have:
// w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
// Which is the same as:
// w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
// This allows for more efficient vectorization,
// since w[i]->w[i-3] dependency is broken
#define PRECALC_32_79(REG,REG_SUB_4,REG_SUB_8,REG_SUB_16,REG_SUB_28,K_OFFSET,OFFSET) \
	PRECALC_32(REG_SUB_8,REG_SUB_4) \
	PRECALC_33(REG_SUB_28,REG) \
	PRECALC_34(REG_SUB_16) \
	PRECALC_35(REG) \
	PRECALC_36(REG) \
	PRECALC_37(REG) \
	PRECALC_39(REG,K_OFFSET,OFFSET)

#define PRECALC \
	PRECALC_00_15(0,Y15) \
	PRECALC_00_15(0x10,Y14) \
	PRECALC_00_15(0x20,Y13) \
	PRECALC_00_15(0x30,Y12) \
	PRECALC_16_31(Y8,Y12,Y13,Y14,Y15,0,0x80) \
	PRECALC_16_31(Y7,Y8,Y12,Y13,Y14,0x20,0xa0) \
	PRECALC_16_31(Y5,Y7,Y8,Y12,Y13,0x20,0xc0) \
	PRECALC_16_31(Y3,Y5,Y7,Y8,Y12,0x20,0xe0) \
	PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x20,0x100) \
	PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x20,0x120) \
	PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x40,0x140) \
	PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x40,0x160) \
	PRECALC_32_79(Y8,Y12,Y13,Y15,Y7,0x40,0x180) \
	PRECALC_32_79(Y7,Y8,Y12,Y14,Y5,0x40,0x1a0) \
	PRECALC_32_79(Y5,Y7,Y8,Y13,Y3,0x40,0x1c0) \
	PRECALC_32_79(Y3,Y5,Y7,Y12,Y15,0x60,0x1e0) \
	PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x60,0x200) \
	PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x60,0x220) \
	PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x60,0x240) \
	PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x60,0x260)

// Macros calculating individual rounds have general form
// CALC_ROUND_PRE + PRECALC_ROUND + CALC_ROUND_POST
// CALC_ROUND_{PRE,POST} macros follow

#define CALC_F1_PRE(OFFSET,REG_A,REG_B,REG_C,REG_E) \
	ADDL OFFSET(R15),REG_E \
	ANDNL REG_C,REG_A,BP \
	LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
	RORXL $0x1b, REG_A, R12 \
	RORXL $2, REG_A, REG_B // for next round

// Calculate F for the next round
#define CALC_F1_POST(REG_A,REG_B,REG_E) \
	ANDL REG_B,REG_A \ // b&c
	XORL BP, REG_A \ // F1 = (b&c) ^ (~b&d)
	LEAL (REG_E)(R12*1), REG_E // E += A >>> 5

// Registers are cyclically rotated DX -> AX -> DI -> SI -> BX -> CX
#define CALC_0 \
	MOVL SI, BX \ // Precalculating first round
	RORXL $2, SI, SI \
	ANDNL AX, BX, BP \
	ANDL DI, BX \
	XORL BP, BX \
	CALC_F1_PRE(0x0,CX,BX,DI,DX) \
	PRECALC_0(0x80) \
	CALC_F1_POST(CX,SI,DX)

#define CALC_1 \
	CALC_F1_PRE(0x4,DX,CX,SI,AX) \
	PRECALC_1(0x80) \
	CALC_F1_POST(DX,BX,AX)

#define CALC_2 \
	CALC_F1_PRE(0x8,AX,DX,BX,DI) \
	PRECALC_2(Y15) \
	CALC_F1_POST(AX,CX,DI)

#define CALC_3 \
	CALC_F1_PRE(0xc,DI,AX,CX,SI) \
	CALC_F1_POST(DI,DX,SI)

#define CALC_4 \
	CALC_F1_PRE(0x20,SI,DI,DX,BX) \
	PRECALC_4(Y15,0x0) \
	CALC_F1_POST(SI,AX,BX)

#define CALC_5 \
	CALC_F1_PRE(0x24,BX,SI,AX,CX) \
	CALC_F1_POST(BX,DI,CX)

#define CALC_6 \
	CALC_F1_PRE(0x28,CX,BX,DI,DX) \
	CALC_F1_POST(CX,SI,DX)

#define CALC_7 \
	CALC_F1_PRE(0x2c,DX,CX,SI,AX) \
	PRECALC_7(0x0) \
	CALC_F1_POST(DX,BX,AX)

#define CALC_8 \
	CALC_F1_PRE(0x40,AX,DX,BX,DI) \
	PRECALC_0(0x90) \
	CALC_F1_POST(AX,CX,DI)

#define CALC_9 \
	CALC_F1_PRE(0x44,DI,AX,CX,SI) \
	PRECALC_1(0x90) \
	CALC_F1_POST(DI,DX,SI)

#define CALC_10 \
	CALC_F1_PRE(0x48,SI,DI,DX,BX) \
	PRECALC_2(Y14) \
	CALC_F1_POST(SI,AX,BX)

#define CALC_11 \
	CALC_F1_PRE(0x4c,BX,SI,AX,CX) \
	CALC_F1_POST(BX,DI,CX)

#define CALC_12 \
	CALC_F1_PRE(0x60,CX,BX,DI,DX) \
	PRECALC_4(Y14,0x0) \
	CALC_F1_POST(CX,SI,DX)

#define CALC_13 \
	CALC_F1_PRE(0x64,DX,CX,SI,AX) \
	CALC_F1_POST(DX,BX,AX)

#define CALC_14 \
	CALC_F1_PRE(0x68,AX,DX,BX,DI) \
	CALC_F1_POST(AX,CX,DI)

#define CALC_15 \
	CALC_F1_PRE(0x6c,DI,AX,CX,SI) \
	PRECALC_7(0x10) \
	CALC_F1_POST(DI,DX,SI)

#define CALC_16 \
	CALC_F1_PRE(0x80,SI,DI,DX,BX) \
	PRECALC_0(0xa0) \
	CALC_F1_POST(SI,AX,BX)

#define CALC_17 \
	CALC_F1_PRE(0x84,BX,SI,AX,CX) \
	PRECALC_1(0xa0) \
	CALC_F1_POST(BX,DI,CX)

#define CALC_18 \
	CALC_F1_PRE(0x88,CX,BX,DI,DX) \
	PRECALC_2(Y13) \
	CALC_F1_POST(CX,SI,DX)

#define CALC_F2_PRE(OFFSET,REG_A,REG_B,REG_E) \
	ADDL OFFSET(R15),REG_E \
	LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
	RORXL $0x1b, REG_A, R12 \
	RORXL $2, REG_A, REG_B // for next round

#define CALC_F2_POST(REG_A,REG_B,REG_C,REG_E) \
	XORL REG_B, REG_A \
	ADDL R12, REG_E \
	XORL REG_C, REG_A

#define CALC_19 \
	CALC_F2_PRE(0x8c,DX,CX,AX) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_20 \
	CALC_F2_PRE(0xa0,AX,DX,DI) \
	PRECALC_4(Y13,0x0) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_21 \
	CALC_F2_PRE(0xa4,DI,AX,SI) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_22 \
	CALC_F2_PRE(0xa8,SI,DI,BX) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_23 \
	CALC_F2_PRE(0xac,BX,SI,CX) \
	PRECALC_7(0x20) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_24 \
	CALC_F2_PRE(0xc0,CX,BX,DX) \
	PRECALC_0(0xb0) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_25 \
	CALC_F2_PRE(0xc4,DX,CX,AX) \
	PRECALC_1(0xb0) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_26 \
	CALC_F2_PRE(0xc8,AX,DX,DI) \
	PRECALC_2(Y12) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_27 \
	CALC_F2_PRE(0xcc,DI,AX,SI) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_28 \
	CALC_F2_PRE(0xe0,SI,DI,BX) \
	PRECALC_4(Y12,0x0) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_29 \
	CALC_F2_PRE(0xe4,BX,SI,CX) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_30 \
	CALC_F2_PRE(0xe8,CX,BX,DX) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_31 \
	CALC_F2_PRE(0xec,DX,CX,AX) \
	PRECALC_7(0x30) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_32 \
	CALC_F2_PRE(0x100,AX,DX,DI) \
	PRECALC_16(Y15,Y14,Y12,Y8) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_33 \
	CALC_F2_PRE(0x104,DI,AX,SI) \
	PRECALC_17(Y15,Y13,Y8) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_34 \
	CALC_F2_PRE(0x108,SI,DI,BX) \
	PRECALC_18(Y8) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_35 \
	CALC_F2_PRE(0x10c,BX,SI,CX) \
	PRECALC_19(Y8) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_36 \
	CALC_F2_PRE(0x120,CX,BX,DX) \
	PRECALC_20(Y8) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_37 \
	CALC_F2_PRE(0x124,DX,CX,AX) \
	PRECALC_21(Y8) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_38 \
	CALC_F2_PRE(0x128,AX,DX,DI) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_F3_PRE(OFFSET,REG_E) \
	ADDL OFFSET(R15),REG_E

#define CALC_F3_POST(REG_A,REG_B,REG_C,REG_E,REG_TB) \
	LEAL (REG_E)(REG_TB*1), REG_E \ // Add F from the previous round
	MOVL REG_B, BP \
	ORL REG_A, BP \
	RORXL $0x1b, REG_A, R12 \
	RORXL $2, REG_A, REG_TB \
	ANDL REG_C, BP \ // Calculate F for the next round
	ANDL REG_B, REG_A \
	ORL BP, REG_A \
	ADDL R12, REG_E

#define CALC_39 \
	CALC_F3_PRE(0x12c,SI) \
	PRECALC_23(Y8,0x0,0x80) \
	CALC_F3_POST(DI,DX,CX,SI,AX)

#define CALC_40 \
	CALC_F3_PRE(0x140,BX) \
	PRECALC_16(Y14,Y13,Y8,Y7) \
	CALC_F3_POST(SI,AX,DX,BX,DI)

#define CALC_41 \
	CALC_F3_PRE(0x144,CX) \
	PRECALC_17(Y14,Y12,Y7) \
	CALC_F3_POST(BX,DI,AX,CX,SI)

#define CALC_42 \
	CALC_F3_PRE(0x148,DX) \
	PRECALC_18(Y7) \
	CALC_F3_POST(CX,SI,DI,DX,BX)

#define CALC_43 \
	CALC_F3_PRE(0x14c,AX) \
	PRECALC_19(Y7) \
	CALC_F3_POST(DX,BX,SI,AX,CX)

#define CALC_44 \
	CALC_F3_PRE(0x160,DI) \
	PRECALC_20(Y7) \
	CALC_F3_POST(AX,CX,BX,DI,DX)

#define CALC_45 \
	CALC_F3_PRE(0x164,SI) \
	PRECALC_21(Y7) \
	CALC_F3_POST(DI,DX,CX,SI,AX)

#define CALC_46 \
	CALC_F3_PRE(0x168,BX) \
	CALC_F3_POST(SI,AX,DX,BX,DI)

#define CALC_47 \
	CALC_F3_PRE(0x16c,CX) \
	VPXOR Y9, Y0, Y7 \
	VPADDD 0x20(R8), Y7, Y0 \
	VMOVDQU Y0, 0xa0(R14) \
	CALC_F3_POST(BX,DI,AX,CX,SI)

#define CALC_48 \
	CALC_F3_PRE(0x180,DX) \
	PRECALC_16(Y13,Y12,Y7,Y5) \
	CALC_F3_POST(CX,SI,DI,DX,BX)

#define CALC_49 \
	CALC_F3_PRE(0x184,AX) \
	PRECALC_17(Y13,Y8,Y5) \
	CALC_F3_POST(DX,BX,SI,AX,CX)

#define CALC_50 \
	CALC_F3_PRE(0x188,DI) \
	PRECALC_18(Y5) \
	CALC_F3_POST(AX,CX,BX,DI,DX)

#define CALC_51 \
	CALC_F3_PRE(0x18c,SI) \
	PRECALC_19(Y5) \
	CALC_F3_POST(DI,DX,CX,SI,AX)

#define CALC_52 \
	CALC_F3_PRE(0x1a0,BX) \
	PRECALC_20(Y5) \
	CALC_F3_POST(SI,AX,DX,BX,DI)

#define CALC_53 \
	CALC_F3_PRE(0x1a4,CX) \
	PRECALC_21(Y5) \
	CALC_F3_POST(BX,DI,AX,CX,SI)

#define CALC_54 \
	CALC_F3_PRE(0x1a8,DX) \
	CALC_F3_POST(CX,SI,DI,DX,BX)

#define CALC_55 \
	CALC_F3_PRE(0x1ac,AX) \
	PRECALC_23(Y5,0x20,0xc0) \
	CALC_F3_POST(DX,BX,SI,AX,CX)

#define CALC_56 \
	CALC_F3_PRE(0x1c0,DI) \
	PRECALC_16(Y12,Y8,Y5,Y3) \
	CALC_F3_POST(AX,CX,BX,DI,DX)

#define CALC_57 \
	CALC_F3_PRE(0x1c4,SI) \
	PRECALC_17(Y12,Y7,Y3) \
	CALC_F3_POST(DI,DX,CX,SI,AX)

#define CALC_58 \
	CALC_F3_PRE(0x1c8,BX) \
	PRECALC_18(Y3) \
	CALC_F3_POST(SI,AX,DX,BX,DI)

#define CALC_59 \
	CALC_F2_PRE(0x1cc,BX,SI,CX) \
	PRECALC_19(Y3) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_60 \
	CALC_F2_PRE(0x1e0,CX,BX,DX) \
	PRECALC_20(Y3) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_61 \
	CALC_F2_PRE(0x1e4,DX,CX,AX) \
	PRECALC_21(Y3) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_62 \
	CALC_F2_PRE(0x1e8,AX,DX,DI) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_63 \
	CALC_F2_PRE(0x1ec,DI,AX,SI) \
	PRECALC_23(Y3,0x20,0xe0) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_64 \
	CALC_F2_PRE(0x200,SI,DI,BX) \
	PRECALC_32(Y5,Y3) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_65 \
	CALC_F2_PRE(0x204,BX,SI,CX) \
	PRECALC_33(Y14,Y15) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_66 \
	CALC_F2_PRE(0x208,CX,BX,DX) \
	PRECALC_34(Y8) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_67 \
	CALC_F2_PRE(0x20c,DX,CX,AX) \
	PRECALC_35(Y15) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_68 \
	CALC_F2_PRE(0x220,AX,DX,DI) \
	PRECALC_36(Y15) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_69 \
	CALC_F2_PRE(0x224,DI,AX,SI) \
	PRECALC_37(Y15) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_70 \
	CALC_F2_PRE(0x228,SI,DI,BX) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_71 \
	CALC_F2_PRE(0x22c,BX,SI,CX) \
	PRECALC_39(Y15,0x20,0x100) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_72 \
	CALC_F2_PRE(0x240,CX,BX,DX) \
	PRECALC_32(Y3,Y15) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_73 \
	CALC_F2_PRE(0x244,DX,CX,AX) \
	PRECALC_33(Y13,Y14) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_74 \
	CALC_F2_PRE(0x248,AX,DX,DI) \
	PRECALC_34(Y7) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_75 \
	CALC_F2_PRE(0x24c,DI,AX,SI) \
	PRECALC_35(Y14) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_76 \
	CALC_F2_PRE(0x260,SI,DI,BX) \
	PRECALC_36(Y14) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_77 \
	CALC_F2_PRE(0x264,BX,SI,CX) \
	PRECALC_37(Y14) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_78 \
	CALC_F2_PRE(0x268,CX,BX,DX) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_79 \
	ADDL 0x26c(R15), AX \
	LEAL (AX)(CX*1), AX \
	RORXL $0x1b, DX, R12 \
	PRECALC_39(Y14,0x20,0x120) \
	ADDL R12, AX

// Similar to CALC_0
#define CALC_80 \
	MOVL CX, DX \
	RORXL $2, CX, CX \
	ANDNL SI, DX, BP \
	ANDL BX, DX \
	XORL BP, DX \
	CALC_F1_PRE(0x10,AX,DX,BX,DI) \
	PRECALC_32(Y15,Y14) \
	CALC_F1_POST(AX,CX,DI)

#define CALC_81 \
	CALC_F1_PRE(0x14,DI,AX,CX,SI) \
	PRECALC_33(Y12,Y13) \
	CALC_F1_POST(DI,DX,SI)

#define CALC_82 \
	CALC_F1_PRE(0x18,SI,DI,DX,BX) \
	PRECALC_34(Y5) \
	CALC_F1_POST(SI,AX,BX)

#define CALC_83 \
	CALC_F1_PRE(0x1c,BX,SI,AX,CX) \
	PRECALC_35(Y13) \
	CALC_F1_POST(BX,DI,CX)

#define CALC_84 \
	CALC_F1_PRE(0x30,CX,BX,DI,DX) \
	PRECALC_36(Y13) \
	CALC_F1_POST(CX,SI,DX)

#define CALC_85 \
	CALC_F1_PRE(0x34,DX,CX,SI,AX) \
	PRECALC_37(Y13) \
	CALC_F1_POST(DX,BX,AX)

#define CALC_86 \
	CALC_F1_PRE(0x38,AX,DX,BX,DI) \
	CALC_F1_POST(AX,CX,DI)

#define CALC_87 \
	CALC_F1_PRE(0x3c,DI,AX,CX,SI) \
	PRECALC_39(Y13,0x40,0x140) \
	CALC_F1_POST(DI,DX,SI)

#define CALC_88 \
	CALC_F1_PRE(0x50,SI,DI,DX,BX) \
	PRECALC_32(Y14,Y13) \
	CALC_F1_POST(SI,AX,BX)

#define CALC_89 \
	CALC_F1_PRE(0x54,BX,SI,AX,CX) \
	PRECALC_33(Y8,Y12) \
	CALC_F1_POST(BX,DI,CX)

#define CALC_90 \
	CALC_F1_PRE(0x58,CX,BX,DI,DX) \
	PRECALC_34(Y3) \
	CALC_F1_POST(CX,SI,DX)

#define CALC_91 \
	CALC_F1_PRE(0x5c,DX,CX,SI,AX) \
	PRECALC_35(Y12) \
	CALC_F1_POST(DX,BX,AX)

#define CALC_92 \
	CALC_F1_PRE(0x70,AX,DX,BX,DI) \
	PRECALC_36(Y12) \
	CALC_F1_POST(AX,CX,DI)

#define CALC_93 \
	CALC_F1_PRE(0x74,DI,AX,CX,SI) \
	PRECALC_37(Y12) \
	CALC_F1_POST(DI,DX,SI)

#define CALC_94 \
	CALC_F1_PRE(0x78,SI,DI,DX,BX) \
	CALC_F1_POST(SI,AX,BX)

#define CALC_95 \
	CALC_F1_PRE(0x7c,BX,SI,AX,CX) \
	PRECALC_39(Y12,0x40,0x160) \
	CALC_F1_POST(BX,DI,CX)

#define CALC_96 \
	CALC_F1_PRE(0x90,CX,BX,DI,DX) \
	PRECALC_32(Y13,Y12) \
	CALC_F1_POST(CX,SI,DX)

#define CALC_97 \
	CALC_F1_PRE(0x94,DX,CX,SI,AX) \
	PRECALC_33(Y7,Y8) \
	CALC_F1_POST(DX,BX,AX)

#define CALC_98 \
	CALC_F1_PRE(0x98,AX,DX,BX,DI) \
	PRECALC_34(Y15) \
	CALC_F1_POST(AX,CX,DI)

#define CALC_99 \
	CALC_F2_PRE(0x9c,DI,AX,SI) \
	PRECALC_35(Y8) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_100 \
	CALC_F2_PRE(0xb0,SI,DI,BX) \
	PRECALC_36(Y8) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_101 \
	CALC_F2_PRE(0xb4,BX,SI,CX) \
	PRECALC_37(Y8) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_102 \
	CALC_F2_PRE(0xb8,CX,BX,DX) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_103 \
	CALC_F2_PRE(0xbc,DX,CX,AX) \
	PRECALC_39(Y8,0x40,0x180) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_104 \
	CALC_F2_PRE(0xd0,AX,DX,DI) \
	PRECALC_32(Y12,Y8) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_105 \
	CALC_F2_PRE(0xd4,DI,AX,SI) \
	PRECALC_33(Y5,Y7) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_106 \
	CALC_F2_PRE(0xd8,SI,DI,BX) \
	PRECALC_34(Y14) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_107 \
	CALC_F2_PRE(0xdc,BX,SI,CX) \
	PRECALC_35(Y7) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_108 \
	CALC_F2_PRE(0xf0,CX,BX,DX) \
	PRECALC_36(Y7) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_109 \
	CALC_F2_PRE(0xf4,DX,CX,AX) \
	PRECALC_37(Y7) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_110 \
	CALC_F2_PRE(0xf8,AX,DX,DI) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_111 \
	CALC_F2_PRE(0xfc,DI,AX,SI) \
	PRECALC_39(Y7,0x40,0x1a0) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_112 \
	CALC_F2_PRE(0x110,SI,DI,BX) \
	PRECALC_32(Y8,Y7) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_113 \
	CALC_F2_PRE(0x114,BX,SI,CX) \
	PRECALC_33(Y3,Y5) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_114 \
	CALC_F2_PRE(0x118,CX,BX,DX) \
	PRECALC_34(Y13) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_115 \
	CALC_F2_PRE(0x11c,DX,CX,AX) \
	PRECALC_35(Y5) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_116 \
	CALC_F2_PRE(0x130,AX,DX,DI) \
	PRECALC_36(Y5) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_117 \
	CALC_F2_PRE(0x134,DI,AX,SI) \
	PRECALC_37(Y5) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_118 \
	CALC_F2_PRE(0x138,SI,DI,BX) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_119 \
	CALC_F3_PRE(0x13c,CX) \
	PRECALC_39(Y5,0x40,0x1c0) \
	CALC_F3_POST(BX,DI,AX,CX,SI)

#define CALC_120 \
	CALC_F3_PRE(0x150,DX) \
	PRECALC_32(Y7,Y5) \
	CALC_F3_POST(CX,SI,DI,DX,BX)

#define CALC_121 \
	CALC_F3_PRE(0x154,AX) \
	PRECALC_33(Y15,Y3) \
	CALC_F3_POST(DX,BX,SI,AX,CX)

#define CALC_122 \
	CALC_F3_PRE(0x158,DI) \
	PRECALC_34(Y12) \
	CALC_F3_POST(AX,CX,BX,DI,DX)

#define CALC_123 \
	CALC_F3_PRE(0x15c,SI) \
	PRECALC_35(Y3) \
	CALC_F3_POST(DI,DX,CX,SI,AX)

#define CALC_124 \
	CALC_F3_PRE(0x170,BX) \
	PRECALC_36(Y3) \
	CALC_F3_POST(SI,AX,DX,BX,DI)

#define CALC_125 \
	CALC_F3_PRE(0x174,CX) \
	PRECALC_37(Y3) \
	CALC_F3_POST(BX,DI,AX,CX,SI)

#define CALC_126 \
	CALC_F3_PRE(0x178,DX) \
	CALC_F3_POST(CX,SI,DI,DX,BX)

#define CALC_127 \
	CALC_F3_PRE(0x17c,AX) \
	PRECALC_39(Y3,0x60,0x1e0) \
	CALC_F3_POST(DX,BX,SI,AX,CX)

#define CALC_128 \
	CALC_F3_PRE(0x190,DI) \
	PRECALC_32(Y5,Y3) \
	CALC_F3_POST(AX,CX,BX,DI,DX)

#define CALC_129 \
	CALC_F3_PRE(0x194,SI) \
	PRECALC_33(Y14,Y15) \
	CALC_F3_POST(DI,DX,CX,SI,AX)

#define CALC_130 \
	CALC_F3_PRE(0x198,BX) \
	PRECALC_34(Y8) \
	CALC_F3_POST(SI,AX,DX,BX,DI)

#define CALC_131 \
	CALC_F3_PRE(0x19c,CX) \
	PRECALC_35(Y15) \
	CALC_F3_POST(BX,DI,AX,CX,SI)

#define CALC_132 \
	CALC_F3_PRE(0x1b0,DX) \
	PRECALC_36(Y15) \
	CALC_F3_POST(CX,SI,DI,DX,BX)

#define CALC_133 \
	CALC_F3_PRE(0x1b4,AX) \
	PRECALC_37(Y15) \
	CALC_F3_POST(DX,BX,SI,AX,CX)

#define CALC_134 \
	CALC_F3_PRE(0x1b8,DI) \
	CALC_F3_POST(AX,CX,BX,DI,DX)

#define CALC_135 \
	CALC_F3_PRE(0x1bc,SI) \
	PRECALC_39(Y15,0x60,0x200) \
	CALC_F3_POST(DI,DX,CX,SI,AX)

#define CALC_136 \
	CALC_F3_PRE(0x1d0,BX) \
	PRECALC_32(Y3,Y15) \
	CALC_F3_POST(SI,AX,DX,BX,DI)

#define CALC_137 \
	CALC_F3_PRE(0x1d4,CX) \
	PRECALC_33(Y13,Y14) \
	CALC_F3_POST(BX,DI,AX,CX,SI)

#define CALC_138 \
	CALC_F3_PRE(0x1d8,DX) \
	PRECALC_34(Y7) \
	CALC_F3_POST(CX,SI,DI,DX,BX)

#define CALC_139 \
	CALC_F2_PRE(0x1dc,DX,CX,AX) \
	PRECALC_35(Y14) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_140 \
	CALC_F2_PRE(0x1f0,AX,DX,DI) \
	PRECALC_36(Y14) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_141 \
	CALC_F2_PRE(0x1f4,DI,AX,SI) \
	PRECALC_37(Y14) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_142 \
	CALC_F2_PRE(0x1f8,SI,DI,BX) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_143 \
	CALC_F2_PRE(0x1fc,BX,SI,CX) \
	PRECALC_39(Y14,0x60,0x220) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_144 \
	CALC_F2_PRE(0x210,CX,BX,DX) \
	PRECALC_32(Y15,Y14) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_145 \
	CALC_F2_PRE(0x214,DX,CX,AX) \
	PRECALC_33(Y12,Y13) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_146 \
	CALC_F2_PRE(0x218,AX,DX,DI) \
	PRECALC_34(Y5) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_147 \
	CALC_F2_PRE(0x21c,DI,AX,SI) \
	PRECALC_35(Y13) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_148 \
	CALC_F2_PRE(0x230,SI,DI,BX) \
	PRECALC_36(Y13) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_149 \
	CALC_F2_PRE(0x234,BX,SI,CX) \
	PRECALC_37(Y13) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_150 \
	CALC_F2_PRE(0x238,CX,BX,DX) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_151 \
	CALC_F2_PRE(0x23c,DX,CX,AX) \
	PRECALC_39(Y13,0x60,0x240) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_152 \
	CALC_F2_PRE(0x250,AX,DX,DI) \
	PRECALC_32(Y14,Y13) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_153 \
	CALC_F2_PRE(0x254,DI,AX,SI) \
	PRECALC_33(Y8,Y12) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_154 \
	CALC_F2_PRE(0x258,SI,DI,BX) \
	PRECALC_34(Y3) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_155 \
	CALC_F2_PRE(0x25c,BX,SI,CX) \
	PRECALC_35(Y12) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_156 \
	CALC_F2_PRE(0x270,CX,BX,DX) \
	PRECALC_36(Y12) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_157 \
	CALC_F2_PRE(0x274,DX,CX,AX) \
	PRECALC_37(Y12) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_158 \
	CALC_F2_PRE(0x278,AX,DX,DI) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_159 \
	ADDL 0x27c(R15),SI \
	LEAL (SI)(AX*1), SI \
	RORXL $0x1b, DI, R12 \
	PRECALC_39(Y12,0x60,0x260) \
	ADDL R12, SI

#define CALC \
	MOVL (R9), CX \
	MOVL 4(R9), SI \
	MOVL 8(R9), DI \
	MOVL 12(R9), AX \
	MOVL 16(R9), DX \
	MOVQ SP, R14 \
	LEAQ (2*4*80+32)(SP), R15 \
	PRECALC \ // Precalc WK for first 2 blocks
	XCHGQ R15, R14 \
loop: \ // this loop is unrolled
	CMPQ R10, R8 \ // we use R8 value (set below) as a signal of the last block
	JNE begin \
	VZEROUPPER \
	RET \
begin: \
	CALC_0 \
	CALC_1 \
	CALC_2 \
	CALC_3 \
	CALC_4 \
	CALC_5 \
	CALC_6 \
	CALC_7 \
	CALC_8 \
	CALC_9 \
	CALC_10 \
	CALC_11 \
	CALC_12 \
	CALC_13 \
	CALC_14 \
	CALC_15 \
	CALC_16 \
	CALC_17 \
	CALC_18 \
	CALC_19 \
	CALC_20 \
	CALC_21 \
	CALC_22 \
	CALC_23 \
	CALC_24 \
	CALC_25 \
	CALC_26 \
	CALC_27 \
	CALC_28 \
	CALC_29 \
	CALC_30 \
	CALC_31 \
	CALC_32 \
	CALC_33 \
	CALC_34 \
	CALC_35 \
	CALC_36 \
	CALC_37 \
	CALC_38 \
	CALC_39 \
	CALC_40 \
	CALC_41 \
	CALC_42 \
	CALC_43 \
	CALC_44 \
	CALC_45 \
	CALC_46 \
	CALC_47 \
	CALC_48 \
	CALC_49 \
	CALC_50 \
	CALC_51 \
	CALC_52 \
	CALC_53 \
	CALC_54 \
	CALC_55 \
	CALC_56 \
	CALC_57 \
	CALC_58 \
	CALC_59 \
	ADDQ $128, R10 \ // move to next even-64-byte block
	CMPQ R10, R11 \ // is current block the last one?
	CMOVQCC R8, R10 \ // signal the last iteration smartly
	CALC_60 \
	CALC_61 \
	CALC_62 \
	CALC_63 \
	CALC_64 \
	CALC_65 \
	CALC_66 \
	CALC_67 \
	CALC_68 \
	CALC_69 \
	CALC_70 \
	CALC_71 \
	CALC_72 \
	CALC_73 \
	CALC_74 \
	CALC_75 \
	CALC_76 \
	CALC_77 \
	CALC_78 \
	CALC_79 \
	UPDATE_HASH(AX,DX,BX,SI,DI) \
	CMPQ R10, R8 \ // is current block the last one?
	JE loop \
	MOVL DX, CX \
	CALC_80 \
	CALC_81 \
	CALC_82 \
	CALC_83 \
	CALC_84 \
	CALC_85 \
	CALC_86 \
	CALC_87 \
	CALC_88 \
	CALC_89 \
	CALC_90 \
	CALC_91 \
	CALC_92 \
	CALC_93 \
	CALC_94 \
	CALC_95 \
	CALC_96 \
	CALC_97 \
	CALC_98 \
	CALC_99 \
	CALC_100 \
	CALC_101 \
	CALC_102 \
	CALC_103 \
	CALC_104 \
	CALC_105 \
	CALC_106 \
	CALC_107 \
	CALC_108 \
	CALC_109 \
	CALC_110 \
	CALC_111 \
	CALC_112 \
	CALC_113 \
	CALC_114 \
	CALC_115 \
	CALC_116 \
	CALC_117 \
	CALC_118 \
	CALC_119 \
	CALC_120 \
	CALC_121 \
	CALC_122 \
	CALC_123 \
	CALC_124 \
	CALC_125 \
	CALC_126 \
	CALC_127 \
	CALC_128 \
	CALC_129 \
	CALC_130 \
	CALC_131 \
	CALC_132 \
	CALC_133 \
	CALC_134 \
	CALC_135 \
	CALC_136 \
	CALC_137 \
	CALC_138 \
	CALC_139 \
	ADDQ $128, R13 \ // move to next even-64-byte block
	CMPQ R13, R11 \ // is current block the last one?
	CMOVQCC R8, R10 \
	CALC_140 \
	CALC_141 \
	CALC_142 \
	CALC_143 \
	CALC_144 \
	CALC_145 \
	CALC_146 \
	CALC_147 \
	CALC_148 \
	CALC_149 \
	CALC_150 \
	CALC_151 \
	CALC_152 \
	CALC_153 \
	CALC_154 \
	CALC_155 \
	CALC_156 \
	CALC_157 \
	CALC_158 \
	CALC_159 \
	UPDATE_HASH(SI,DI,DX,CX,BX) \
	MOVL SI, R12 \ // Reset state for AVX2 reg permutation
	MOVL DI, SI \
	MOVL DX, DI \
	MOVL BX, DX \
	MOVL CX, AX \
	MOVL R12, CX \
	XCHGQ R15, R14 \
	JMP loop

TEXT ·blockAVX2(SB),$1408-32

	MOVQ dig+0(FP), DI
	MOVQ p_base+8(FP), SI
	MOVQ p_len+16(FP), DX
	SHRQ $6, DX
	SHLQ $6, DX

	MOVQ $K_XMM_AR<>(SB), R8

	MOVQ DI, R9
	MOVQ SI, R10
	LEAQ 64(SI), R13

	ADDQ SI, DX
	ADDQ $64, DX
	MOVQ DX, R11

	CMPQ R13, R11
	CMOVQCC R8, R13

	VMOVDQU BSWAP_SHUFB_CTL<>(SB), Y10

	CALC // RET is inside macros

DATA K_XMM_AR<>+0x00(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x04(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x08(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x0c(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x10(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x14(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x18(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x1c(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x20(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x24(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x28(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x2c(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x30(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x34(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x38(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x3c(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x40(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x44(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x48(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x4c(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x50(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x54(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x58(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x5c(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x60(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x64(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x68(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x6c(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x70(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x74(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x78(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x7c(SB)/4,$0xca62c1d6
GLOBL K_XMM_AR<>(SB),RODATA,$128
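
// BSWAP_SHUFB_CTL is the VPSHUFB control mask loaded into Y10 above: every
// 32-bit lane holds the byte indices 3,2,1,0, so the shuffle byte-swaps each
// word of the message block (big-endian load), independently in both 128-bit lanes.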
DATA BSWAP_SHUFB_CTL<>+0x00(SB)/4,$0x00010203
DATA BSWAP_SHUFB_CTL<>+0x04(SB)/4,$0x04050607
DATA BSWAP_SHUFB_CTL<>+0x08(SB)/4,$0x08090a0b
DATA BSWAP_SHUFB_CTL<>+0x0c(SB)/4,$0x0c0d0e0f
DATA BSWAP_SHUFB_CTL<>+0x10(SB)/4,$0x00010203
DATA BSWAP_SHUFB_CTL<>+0x14(SB)/4,$0x04050607
DATA BSWAP_SHUFB_CTL<>+0x18(SB)/4,$0x08090a0b
DATA BSWAP_SHUFB_CTL<>+0x1c(SB)/4,$0x0c0d0e0f
GLOBL BSWAP_SHUFB_CTL<>(SB),RODATA,$32
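
// For reference, a sketch of how the Go side might dispatch between the two
// routines defined in this file. The feature test and the tail-trimming are
// assumptions for illustration only (the real wiring lives in sha1block_amd64.go):
// blockAVX2 works on two blocks per iteration and precomputes the schedule for
// the next pair, so the caller keeps the final blocks for blockAMD64.
//
//	var useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI1 && cpu.X86.HasBMI2
//
//	func block(dig *digest, p []byte) {
//		if useAVX2 && len(p) >= 256 {
//			// Leave at least the last two blocks for blockAMD64 so the
//			// AVX2 prefetch never reads past the end of p.
//			safeLen := len(p) - 128
//			if safeLen%128 != 0 {
//				safeLen -= 64
//			}
//			blockAVX2(dig, p[:safeLen])
//			blockAMD64(dig, p[safeLen:])
//		} else {
//			blockAMD64(dig, p)
//		}
//	}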