// gitee.com/ks-custle/core-gm@v0.0.0-20230922171213-b83bdd97b62c/sm3/sm3block_amd64.s

#include "textflag.h"

// Wt = Mt; for 0 <= t <= 3
#define MSGSCHEDULE0(index) \
	MOVL (index*4)(SI), AX; \
	BSWAPL AX; \
	MOVL AX, (index*4)(BP)

// Wt+4 = Mt+4; for 0 <= t <= 11
#define MSGSCHEDULE01(index) \
	MOVL ((index+4)*4)(SI), AX; \
	BSWAPL AX; \
	MOVL AX, ((index+4)*4)(BP)

// x = Wt-12 XOR Wt-5 XOR ROTL(15, Wt+1)
// p1(x) = x XOR ROTL(15, x) XOR ROTL(23, x)
// Wt+4 = p1(x) XOR ROTL(7, Wt-9) XOR Wt-2
// for 12 <= t <= 63
#define MSGSCHEDULE1(index) \
	MOVL ((index+1)*4)(BP), AX; \
	ROLL $15, AX; \
	MOVL ((index-12)*4)(BP), BX; \
	XORL BX, AX; \
	MOVL ((index-5)*4)(BP), BX; \
	XORL BX, AX; \
	MOVL AX, BX; \
	ROLL $15, BX; \
	MOVL AX, CX; \
	ROLL $23, CX; \
	XORL BX, AX; \
	XORL CX, AX; \
	MOVL ((index-9)*4)(BP), BX; \
	ROLL $7, BX; \
	MOVL ((index-2)*4)(BP), CX; \
	XORL BX, AX; \
	XORL CX, AX; \
	MOVL AX, ((index+4)*4)(BP)

// Calculate ss1 in BX
// x = ROTL(12, a) + e + ROTL(index, const)
// ret = ROTL(7, x)
#define SM3SS1(const, a, e) \
	MOVL a, BX; \
	ROLL $12, BX; \
	ADDL e, BX; \
	ADDL $const, BX; \
	ROLL $7, BX

// Calculate tt1 in CX
// ret = (a XOR b XOR c) + d + (ROTL(12, a) XOR ss1) + (Wt XOR Wt+4)
#define SM3TT10(index, a, b, c, d) \
	MOVL a, CX; \
	MOVL b, DX; \
	XORL CX, DX; \
	MOVL c, DI; \
	XORL DI, DX; \ // (a XOR b XOR c)
	ADDL d, DX; \ // (a XOR b XOR c) + d
	MOVL ((index)*4)(BP), DI; \ // Wt
	XORL DI, AX; \ // Wt XOR Wt+4
	ADDL AX, DX; \
	ROLL $12, CX; \
	XORL BX, CX; \ // ROTL(12, a) XOR ss1
	ADDL DX, CX // (a XOR b XOR c) + d + (ROTL(12, a) XOR ss1)

// Calculate tt2 in BX
// ret = (e XOR f XOR g) + h + ss1 + Wt
#define SM3TT20(e, f, g, h) \
	ADDL h, DI; \ // Wt + h
	ADDL BX, DI; \ // Wt + h + ss1
	MOVL e, BX; \
	MOVL f, DX; \
	XORL DX, BX; \ // e XOR f
	MOVL g, DX; \
	XORL DX, BX; \ // e XOR f XOR g
	ADDL DI, BX // (e XOR f XOR g) + Wt + h + ss1

// Calculate tt1 in CX, used DX, DI
// ret = ((a AND b) OR (a AND c) OR (b AND c)) + d + (ROTL(12, a) XOR ss1) + (Wt XOR Wt+4)
#define SM3TT11(index, a, b, c, d) \
	MOVL a, CX; \
	MOVL b, DX; \
	ANDL CX, DX; \ // a AND b
	MOVL c, DI; \
	ANDL DI, CX; \ // a AND c
	ORL DX, CX; \ // (a AND b) OR (a AND c)
	MOVL b, DX; \
	ANDL DI, DX; \ // b AND c
	ORL CX, DX; \ // (a AND b) OR (a AND c) OR (b AND c)
	ADDL d, DX; \
	MOVL a, CX; \
	ROLL $12, CX; \
	XORL BX, CX; \
	ADDL DX, CX; \ // ((a AND b) OR (a AND c) OR (b AND c)) + d + (ROTL(12, a) XOR ss1)
	MOVL ((index)*4)(BP), DI; \
	XORL DI, AX; \ // Wt XOR Wt+4
	ADDL AX, CX

// Calculate tt2 in BX
// ret = ((e AND f) OR (NOT(e) AND g)) + h + ss1 + Wt
#define SM3TT21(e, f, g, h) \
	ADDL h, DI; \ // Wt + h
	ADDL BX, DI; \ // h + ss1 + Wt
	MOVL e, BX; \
	MOVL f, DX; \
	ANDL BX, DX; \ // e AND f
	NOTL BX; \ // NOT(e)
	MOVL g, AX; \
	ANDL AX, BX; \ // NOT(e) AND g
	ORL DX, BX; \
	ADDL DI, BX

#define COPYRESULT(b, d, f, h) \
	ROLL $9, b; \
	MOVL CX, h; \ // a = tt1
	ROLL $19, f; \
	MOVL BX, CX; \
	ROLL $9, CX; \
	XORL BX, CX; \ // tt2 XOR ROTL(9, tt2)
	ROLL $17, BX; \
	XORL BX, CX; \ // tt2 XOR ROTL(9, tt2) XOR ROTL(17, tt2)
	MOVL CX, d // e = tt2 XOR ROTL(9, tt2) XOR ROTL(17, tt2)

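// Reference sketch only (not assembled): the macros above and the SM3ROUNDx
// macros below combine into one SM3 round roughly as the following Go-like
// pseudocode. rotl, FF, GG and T are notation taken from the comments, not
// symbols defined in this file; the constants passed to the round macros are
// the pre-rotated T values.
//
//	ss1 := rotl(rotl(a, 12)+e+T, 7)
//	ss2 := ss1 ^ rotl(a, 12)
//	tt1 := FF(a, b, c) + d + ss2 + (W[t] ^ W[t+4])
//	tt2 := GG(e, f, g) + h + ss1 + W[t]
//	d, c, b, a = c, rotl(b, 9), a, tt1
//	h, g, f, e = g, rotl(f, 19), e, tt2^rotl(tt2, 9)^rotl(tt2, 17)
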
#define SM3ROUND0(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE01(index); \
	SM3SS1(const, a, e); \
	SM3TT10(index, a, b, c, d); \
	SM3TT20(e, f, g, h); \
	COPYRESULT(b, d, f, h)

#define SM3ROUND1(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE1(index); \
	SM3SS1(const, a, e); \
	SM3TT10(index, a, b, c, d); \
	SM3TT20(e, f, g, h); \
	COPYRESULT(b, d, f, h)

#define SM3ROUND2(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE1(index); \
	SM3SS1(const, a, e); \
	SM3TT11(index, a, b, c, d); \
	SM3TT21(e, f, g, h); \
	COPYRESULT(b, d, f, h)

// Definitions for AVX2 version

// xorm (mem), reg
// Xor reg to mem using reg-mem xor and store
#define xorm(P1, P2) \
	XORL P2, P1; \
	MOVL P1, P2

#define XDWORD0 Y4
#define XDWORD1 Y5
#define XDWORD2 Y6
#define XDWORD3 Y7

#define XWORD0 X4
#define XWORD1 X5
#define XWORD2 X6
#define XWORD3 X7

#define XTMP0 Y0
#define XTMP1 Y1
#define XTMP2 Y2
#define XTMP3 Y3
#define XTMP4 Y8
#define XTMP5 Y11

#define XFER Y9

#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
#define X_BYTE_FLIP_MASK X13

#define NUM_BYTES DX
#define INP DI

#define CTX SI // Beginning of digest in memory (a, b, c, ... , h)

#define a AX
#define b BX
#define c CX
#define d R8
#define e DX
#define f R9
#define g R10
#define h R11

//#define old_h R11

//#define TBL BP

#define SRND SI // SRND is same register as CTX

#define T1 R12

#define y0 R13
#define y1 R14
#define y2 R15
#define y3 DI

// Offsets
#define XFER_SIZE 4*64*4
#define INP_END_SIZE 8
#define INP_SIZE 8

#define _XFER 0
#define _INP_END _XFER + XFER_SIZE
#define _INP _INP_END + INP_END_SIZE
#define STACK_SIZE _INP + INP_SIZE

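// Derived layout of the frame used by the AVX2 code: _XFER = 0,
// _INP_END = 1024, _INP = 1032, STACK_SIZE = 1040 (the TEXT directive below
// reserves $1048). The XFER area stores, per 4-round group, 32 bytes of W
// followed by 32 bytes of W XOR W+4 for two interleaved blocks:
// 16 groups x 64 bytes = 1024 bytes.
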
#define ROUND_AND_SCHED_N_0_0(disp, const, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ############################# RND N + 0 ############################//
	RORXL $(-12), a, y0; \ // y0 = a <<< 12
	MOVL e, y1; \
	ADDL $const, y1; \
	VPALIGNR $12, XDWORD0, XDWORD1, XTMP0; \ // XTMP0 = W[-13] = {w6,w5,w4,w3}
	ADDL y0, y1; \ // y1 = a <<< 12 + e + T
	RORXL $(-7), y1, y2; \ // y2 = SS1
	VPSLLD $7, XTMP0, XTMP1; \
	XORL y2, y0 \ // y0 = SS2
	ADDL (disp + 0*4)(SP)(SRND*1), y2; \ // y2 = SS1 + W
	VPSRLD $(32-7), XTMP0, XTMP0; \
	ADDL h, y2; \ // y2 = h + SS1 + W
	ADDL (disp + 0*4 + 32)(SP)(SRND*1), y0; \ // y0 = SS2 + W'
	ADDL d, y0; \ // y0 = d + SS2 + W'
	VPOR XTMP0, XTMP1, XTMP1; \ // XTMP1 = W[-13] rol 7
	; \
	MOVL a, y1; \
	XORL b, y1; \
	XORL c, y1; \
	VPALIGNR $8, XDWORD2, XDWORD3, XTMP0; \ // XTMP0 = W[-6] = {w13,w12,w11,w10}
	ADDL y1, y0; \ // y0 = FF(a, b, c) + d + SS2 + W' = tt1
	; \
	MOVL e, y1; \
	VPXOR XTMP1, XTMP0, XTMP0; \ // XTMP0 = W[-6] XOR (W[-13] rol 7)
	XORL f, y1; \
	XORL g, y1; \
	ADDL y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	VPALIGNR $12, XDWORD1, XDWORD2, XTMP1; \ // XTMP1 = W[-9] = {w10,w9,w8,w7}
	; \
	ROLL $9, b; \
	ROLL $19, f; \
	MOVL y0, h; \ // h = tt1
	; \
	RORXL $(-9), y2, y0; \
	VPXOR XDWORD0, XTMP1, XTMP1; \ // XTMP1 = W[-9] XOR W[-16]
	RORXL $(-17), y2, y1; \
	XORL y0, y2; \
	XORL y1, y2; \
	VPSHUFD $0xA5, XDWORD3, XTMP2; \ // XTMP2 = W[-3] {BBAA} {w14,w14,w13,w13}
	MOVL y2, d // d = P(tt2)

#define ROUND_AND_SCHED_N_0_1(disp, const, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ############################# RND N + 1 ############################//
	RORXL $-12, a, y0; \ // y0 = a <<< 12
	MOVL e, y1; \
	VPSLLQ $15, XTMP2, XTMP2; \ // XTMP2 = W[-3] rol 15 {BxAx}
	ADDL $const, y1; \
	ADDL y0, y1; \ // y1 = a <<< 12 + e + T
	RORXL $-7, y1, y2; \ // y2 = SS1
	VPSHUFB shuff_00BA<>(SB), XTMP2, XTMP2; \ // XTMP2 = W[-3] rol 15 {00BA}
	XORL y2, y0 \ // y0 = SS2
	ADDL (disp + 1*4)(SP)(SRND*1), y2; \ // y2 = SS1 + W
	ADDL h, y2; \ // y2 = h + SS1 + W
	VPXOR XTMP1, XTMP2, XTMP2; \ // XTMP2 = W[-9] XOR W[-16] XOR (W[-3] rol 15) {xxBA}
	ADDL (disp + 1*4 + 32)(SP)(SRND*1), y0; \ // y0 = SS2 + W'
	ADDL d, y0; \ // y0 = d + SS2 + W'
	; \
	MOVL a, y1; \
	XORL b, y1; \
	VPSLLD $15, XTMP2, XTMP3; \
	XORL c, y1; \
	ADDL y1, y0; \ // y0 = FF(a, b, c) + d + SS2 + W' = tt1
	; \
	MOVL e, y1; \
	VPSRLD $(32-15), XTMP2, XTMP4; \
	XORL f, y1; \
	XORL g, y1; \
	ADDL y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	VPOR XTMP3, XTMP4, XTMP4; \ // XTMP4 = XTMP2 rol 15 {xxBA}
	; \
	ROLL $9, b; \
	ROLL $19, f; \
	VPXOR XTMP2, XTMP4, XTMP4; \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {xxBA})
	MOVL y0, h; \ // h = tt1
	; \
	RORXL $-9, y2, y0; \
	RORXL $-17, y2, y1; \
	VPSLLD $23, XTMP2, XTMP3; \
	XORL y0, y2; \
	XORL y1, y2; \
	VPSRLD $(32-23), XTMP2, XTMP5; \
	MOVL y2, d // d = P(tt2)

#define ROUND_AND_SCHED_N_0_2(disp, const, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ############################# RND N + 2 ############################//
	RORXL $-12, a, y0; \ // y0 = a <<< 12
	MOVL e, y1; \
	VPOR XTMP3, XTMP5, XTMP5; \ // XTMP5 = XTMP2 rol 23 {xxBA}
	ADDL $const, y1; \
	ADDL y0, y1; \ // y1 = a <<< 12 + e + T
	RORXL $-7, y1, y2; \ // y2 = SS1
	VPXOR XTMP4, XTMP5, XTMP4; \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {xxBA}) XOR (XTMP2 rol 23 {xxBA})
	XORL y2, y0 \ // y0 = SS2
	ADDL (disp + 2*4)(SP)(SRND*1), y2; \ // y2 = SS1 + W
	ADDL h, y2; \ // y2 = h + SS1 + W
	VPXOR XTMP4, XTMP0, XTMP2; \ // XTMP2 = {..., ..., W[1], W[0]}
	ADDL (disp + 2*4 + 32)(SP)(SRND*1), y0; \ // y0 = SS2 + W'
	ADDL d, y0; \ // y0 = d + SS2 + W'
	; \
	MOVL a, y1; \
	VPALIGNR $12, XDWORD3, XTMP2, XTMP3; \ // XTMP3 = {..., W[1], W[0], w15}
	XORL b, y1; \
	XORL c, y1; \
	ADDL y1, y0; \ // y0 = FF(a, b, c) + d + SS2 + W' = tt1
	VPSHUFD $80, XTMP3, XTMP4; \ // XTMP4 = W[-3] {DDCC}
	; \
	MOVL e, y1; \
	XORL f, y1; \
	XORL g, y1; \
	VPSLLQ $15, XTMP4, XTMP4; \ // XTMP4 = W[-3] rol 15 {DxCx}
	ADDL y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	; \
	ROLL $9, b; \
	ROLL $19, f; \
	VPSHUFB shuff_DC00<>(SB), XTMP4, XTMP4; \ // XTMP4 = W[-3] rol 15 {DC00}
	MOVL y0, h; \ // h = tt1
	; \
	RORXL $-9, y2, y0; \
	RORXL $-17, y2, y1; \
	VPXOR XTMP1, XTMP4, XTMP4; \ // XTMP4 = W[-9] XOR W[-16] XOR (W[-3] rol 15) {DCxx}
	XORL y0, y2; \
	XORL y1, y2; \
	VPSLLD $15, XTMP4, XTMP5; \
	MOVL y2, d // d = P(tt2)

#define ROUND_AND_SCHED_N_0_3(disp, const, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ############################# RND N + 3 ############################//
	RORXL $-12, a, y0; \ // y0 = a <<< 12
	MOVL e, y1; \
	VPSRLD $(32-15), XTMP4, XTMP3; \
	ADDL $const, y1; \
	ADDL y0, y1; \ // y1 = a <<< 12 + e + T
	VPOR XTMP3, XTMP5, XTMP3; \ // XTMP3 = XTMP4 rol 15 {DCxx}
	RORXL $-7, y1, y2; \ // y2 = SS1
	XORL y2, y0 \ // y0 = SS2
	VPXOR XTMP3, XTMP4, XTMP3; \ // XTMP3 = XTMP4 XOR (XTMP4 rol 15 {DCxx})
	ADDL (disp + 3*4)(SP)(SRND*1), y2; \ // y2 = SS1 + W
	ADDL h, y2; \ // y2 = h + SS1 + W
	ADDL (disp + 3*4 + 32)(SP)(SRND*1), y0; \ // y0 = SS2 + W'
	VPSLLD $23, XTMP4, XTMP5; \
	ADDL d, y0; \ // y0 = d + SS2 + W'
	; \
	MOVL a, y1; \
	XORL b, y1; \
	VPSRLD $(32-23), XTMP4, XTMP1; \
	XORL c, y1; \
	ADDL y1, y0; \ // y0 = FF(a, b, c) + d + SS2 + W' = tt1
	; \
	VPOR XTMP1, XTMP5, XTMP1; \ // XTMP1 = XTMP4 rol 23 {DCxx}
	MOVL e, y1; \
	XORL f, y1; \
	VPXOR XTMP3, XTMP1, XTMP1; \ // XTMP1 = XTMP4 XOR (XTMP4 rol 15 {DCxx}) XOR (XTMP4 rol 23 {DCxx})
	XORL g, y1; \
	ADDL y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	; \
	ROLL $9, b; \
	VPXOR XTMP1, XTMP0, XTMP1; \ // XTMP1 = {W[3], W[2], ..., ...}
	ROLL $19, f; \
	MOVL y0, h; \ // h = tt1
	; \
	RORXL $-9, y2, y0; \
	VPALIGNR $8, XTMP1, XTMP2, XTMP3; \ // XTMP3 = {W[1], W[0], W[3], W[2]}
	RORXL $-17, y2, y1; \
	XORL y0, y2; \
	XORL y1, y2; \
	VPSHUFD $0x4E, XTMP3, XDWORD0; \ // XDWORD0 = {W[3], W[2], W[1], W[0]}
	MOVL y2, d // d = P(tt2)

#define ROUND_AND_SCHED_N_1_0(disp, const, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ############################# RND N + 0 ############################//
	RORXL $-12, a, y0; \ // y0 = a <<< 12
	MOVL e, y1; \
	ADDL $const, y1; \
	VPALIGNR $12, XDWORD0, XDWORD1, XTMP0; \ // XTMP0 = W[-13] = {w6,w5,w4,w3}
	ADDL y0, y1; \ // y1 = a <<< 12 + e + T
	RORXL $-7, y1, y2; \ // y2 = SS1
	XORL y2, y0 \ // y0 = SS2
	VPSLLD $7, XTMP0, XTMP1; \
	ADDL (disp + 0*4)(SP)(SRND*1), y2; \ // y2 = SS1 + W
	ADDL h, y2; \ // y2 = h + SS1 + W
	ADDL (disp + 0*4 + 32)(SP)(SRND*1), y0; \ // y0 = SS2 + W'
	ADDL d, y0; \ // y0 = d + SS2 + W'
	VPSRLD $(32-7), XTMP0, XTMP0; \
	; \
	MOVL a, y1; \
	MOVL b, y3; \
	VPOR XTMP0, XTMP1, XTMP1; \ // XTMP1 = W[-13] rol 7
	ANDL y1, y3; \
	MOVL c, T1; \
	ANDL T1, y1; \
	ORL y3, y1; \ // y1 = (a AND b) OR (a AND c)
	VPALIGNR $8, XDWORD2, XDWORD3, XTMP0; \ // XTMP0 = W[-6] = {w13,w12,w11,w10}
	MOVL b, y3; \
	ANDL T1, y3; \
	ORL y3, y1; \ // y1 = (a AND b) OR (a AND c) OR (b AND c)
	ADDL y1, y0; \ // y0 = FF(a, b, c) + d + SS2 + W' = tt1
	; \
	VPXOR XTMP1, XTMP0, XTMP0; \ // XTMP0 = W[-6] XOR (W[-13] rol 7)
	MOVL e, y1; \
	MOVL f, y3; \
	ANDL y1, y3; \ // y3 = e AND f
	NOTL y1; \
	MOVL g, T1; \
	ANDL T1, y1; \ // y1 = NOT(e) AND g
	ORL y3, y1; \ // y1 = (e AND f) OR (NOT(e) AND g)
	VPALIGNR $12, XDWORD1, XDWORD2, XTMP1; \ // XTMP1 = W[-9] = {w10,w9,w8,w7}
	ADDL y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	; \
	ROLL $9, b; \
	ROLL $19, f; \
	VPXOR XDWORD0, XTMP1, XTMP1; \ // XTMP1 = W[-9] XOR W[-16]
	MOVL y0, h; \ // h = tt1
	; \
	RORXL $-9, y2, y0; \
	RORXL $-17, y2, y1; \
	XORL y0, y2; \
	XORL y1, y2; \
	VPSHUFD $0xA5, XDWORD3, XTMP2; \ // XTMP2 = W[-3] {BBAA} {w14,w14,w13,w13}
	MOVL y2, d // d = P(tt2)

#define ROUND_AND_SCHED_N_1_1(disp, const, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ############################# RND N + 1 ############################//
	RORXL $-12, a, y0; \ // y0 = a <<< 12
	MOVL e, y1; \
	ADDL $const, y1; \
	VPSLLQ $15, XTMP2, XTMP2; \ // XTMP2 = W[-3] rol 15 {BxAx}
	ADDL y0, y1; \ // y1 = a <<< 12 + e + T
	RORXL $-7, y1, y2; \ // y2 = SS1
	VPSHUFB shuff_00BA<>(SB), XTMP2, XTMP2; \ // XTMP2 = W[-3] rol 15 {00BA}
	XORL y2, y0 \ // y0 = SS2
	ADDL (disp + 1*4)(SP)(SRND*1), y2; \ // y2 = SS1 + W
	ADDL h, y2; \ // y2 = h + SS1 + W
	VPXOR XTMP1, XTMP2, XTMP2; \ // XTMP2 = W[-9] XOR W[-16] XOR (W[-3] rol 15) {xxBA}
	ADDL (disp + 1*4 + 32)(SP)(SRND*1), y0; \ // y0 = SS2 + W'
	ADDL d, y0; \ // y0 = d + SS2 + W'
	; \
	MOVL a, y1; \
	VPSLLD $15, XTMP2, XTMP3; \
	MOVL b, y3; \
	ANDL y1, y3; \
	MOVL c, T1; \
	ANDL T1, y1; \
	VPSRLD $(32-15), XTMP2, XTMP4; \
	ORL y3, y1; \ // y1 = (a AND b) OR (a AND c)
	MOVL b, y3; \
	ANDL T1, y3; \
	ORL y3, y1; \ // y1 = (a AND b) OR (a AND c) OR (b AND c)
	VPOR XTMP3, XTMP4, XTMP4; \ // XTMP4 = XTMP2 rol 15 {xxBA}
	ADDL y1, y0; \ // y0 = FF(a, b, c) + d + SS2 + W' = tt1
	; \
	MOVL e, y1; \
	MOVL f, y3; \
	ANDL y1, y3; \ // y3 = e AND f
	NOTL y1; \
	MOVL g, T1; \
	VPXOR XTMP2, XTMP4, XTMP4; \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {xxBA})
	ANDL T1, y1; \ // y1 = NOT(e) AND g
	ORL y3, y1; \ // y1 = (e AND f) OR (NOT(e) AND g)
	ADDL y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	; \
	ROLL $9, b; \
	ROLL $19, f; \
	VPSLLD $23, XTMP2, XTMP3; \
	MOVL y0, h; \ // h = tt1
	; \
	RORXL $-9, y2, y0; \
	RORXL $-17, y2, y1; \
	XORL y0, y2; \
	VPSRLD $(32-23), XTMP2, XTMP5; \
	XORL y1, y2; \
	MOVL y2, d // d = P(tt2)

#define ROUND_AND_SCHED_N_1_2(disp, const, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ############################# RND N + 2 ############################//
	RORXL $-12, a, y0; \ // y0 = a <<< 12
	MOVL e, y1; \
	ADDL $const, y1; \
	VPOR XTMP3, XTMP5, XTMP5; \ // XTMP5 = XTMP2 rol 23 {xxBA}
	ADDL y0, y1; \ // y1 = a <<< 12 + e + T
	RORXL $-7, y1, y2; \ // y2 = SS1
	XORL y2, y0 \ // y0 = SS2
	VPXOR XTMP4, XTMP5, XTMP4; \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {xxBA}) XOR (XTMP2 rol 23 {xxBA})
	ADDL (disp + 2*4)(SP)(SRND*1), y2; \ // y2 = SS1 + W
	ADDL h, y2; \ // y2 = h + SS1 + W
	ADDL (disp + 2*4 + 32)(SP)(SRND*1), y0; \ // y0 = SS2 + W'
	VPXOR XTMP4, XTMP0, XTMP2; \ // XTMP2 = {..., ..., W[1], W[0]}
	ADDL d, y0; \ // y0 = d + SS2 + W'
	; \
	MOVL a, y1; \
	MOVL b, y3; \
	VPALIGNR $12, XDWORD3, XTMP2, XTMP3; \ // XTMP3 = {..., W[1], W[0], w15}
	ANDL y1, y3; \
	MOVL c, T1; \
	ANDL T1, y1; \
	ORL y3, y1; \ // y1 = (a AND b) OR (a AND c)
	VPSHUFD $80, XTMP3, XTMP4; \ // XTMP4 = W[-3] {DDCC}
	MOVL b, y3; \
	ANDL T1, y3; \
	ORL y3, y1; \ // y1 = (a AND b) OR (a AND c) OR (b AND c)
	ADDL y1, y0; \ // y0 = FF(a, b, c) + d + SS2 + W' = tt1
	VPSLLQ $15, XTMP4, XTMP4; \ // XTMP4 = W[-3] rol 15 {DxCx}
	; \
	MOVL e, y1; \
	MOVL f, y3; \
	ANDL y1, y3; \ // y3 = e AND f
	NOTL y1; \
	VPSHUFB shuff_DC00<>(SB), XTMP4, XTMP4; \ // XTMP4 = W[-3] rol 15 {DC00}
	MOVL g, T1; \
	ANDL T1, y1; \ // y1 = NOT(e) AND g
	ORL y3, y1; \ // y1 = (e AND f) OR (NOT(e) AND g)
	ADDL y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	VPXOR XTMP1, XTMP4, XTMP4; \ // XTMP4 = W[-9] XOR W[-16] XOR (W[-3] rol 15) {DCxx}
	; \
	ROLL $9, b; \
	ROLL $19, f; \
	MOVL y0, h; \ // h = tt1
	VPSLLD $15, XTMP4, XTMP5; \
	; \
	RORXL $-9, y2, y0; \
	RORXL $-17, y2, y1; \
	XORL y0, y2; \
	XORL y1, y2; \
	VPSRLD $(32-15), XTMP4, XTMP3; \
	MOVL y2, d // d = P(tt2)

#define ROUND_AND_SCHED_N_1_3(disp, const, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ############################# RND N + 3 ############################//
	RORXL $-12, a, y0; \ // y0 = a <<< 12
	MOVL e, y1; \
	ADDL $const, y1; \
	ADDL y0, y1; \ // y1 = a <<< 12 + e + T
	VPOR XTMP3, XTMP5, XTMP3; \ // XTMP3 = XTMP4 rol 15 {DCxx}
	RORXL $-7, y1, y2; \ // y2 = SS1
	XORL y2, y0 \ // y0 = SS2
	ADDL (disp + 3*4)(SP)(SRND*1), y2; \ // y2 = SS1 + W
	VPXOR XTMP3, XTMP4, XTMP3; \ // XTMP3 = XTMP4 XOR (XTMP4 rol 15 {DCxx})
	ADDL h, y2; \ // y2 = h + SS1 + W
	ADDL (disp + 3*4 + 32)(SP)(SRND*1), y0; \ // y0 = SS2 + W'
	ADDL d, y0; \ // y0 = d + SS2 + W'
	; \
	MOVL a, y1; \
	VPSLLD $23, XTMP4, XTMP5; \
	MOVL b, y3; \
	ANDL y1, y3; \
	MOVL c, T1; \
	ANDL T1, y1; \
	VPSRLD $(32-23), XTMP4, XTMP1; \
	ORL y3, y1; \ // y1 = (a AND b) OR (a AND c)
	MOVL b, y3; \
	ANDL T1, y3; \
	ORL y3, y1; \ // y1 = (a AND b) OR (a AND c) OR (b AND c)
	ADDL y1, y0; \ // y0 = FF(a, b, c) + d + SS2 + W' = tt1
	VPOR XTMP1, XTMP5, XTMP1; \ // XTMP1 = XTMP4 rol 23 {DCxx}
	; \
	MOVL e, y1; \
	MOVL f, y3; \
	ANDL y1, y3; \ // y3 = e AND f
	NOTL y1; \
	VPXOR XTMP3, XTMP1, XTMP1; \ // XTMP1 = XTMP4 XOR (XTMP4 rol 15 {DCxx}) XOR (XTMP4 rol 23 {DCxx})
	MOVL g, T1; \
	ANDL T1, y1; \ // y1 = NOT(e) AND g
	ORL y3, y1; \ // y1 = (e AND f) OR (NOT(e) AND g)
	ADDL y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	VPXOR XTMP1, XTMP0, XTMP1; \ // XTMP1 = {W[3], W[2], ..., ...}
	; \
	ROLL $9, b; \
	ROLL $19, f; \
	MOVL y0, h; \ // h = tt1
	VPALIGNR $8, XTMP1, XTMP2, XTMP3; \ // XTMP3 = {W[1], W[0], W[3], W[2]}
	; \
	RORXL $-9, y2, y0; \
	RORXL $-17, y2, y1; \
	XORL y0, y2; \
	XORL y1, y2; \
	VPSHUFD $0x4E, XTMP3, XDWORD0; \ // XDWORD0 = {W[3], W[2], W[1], W[0]}
	MOVL y2, d // d = P(tt2)

#define ROUND_N_0_0(disp, const, a, b, c, d, e, f, g, h) \
	; \ // ############################# RND N + 0 ############################//
	RORXL $-12, a, y0; \ // y0 = a <<< 12
	MOVL e, y1; \
	ADDL $const, y1; \
	ADDL y0, y1; \ // y1 = a <<< 12 + e + T
	RORXL $-7, y1, y2; \ // y2 = SS1
	XORL y2, y0 \ // y0 = SS2
	ADDL (disp + 0*4)(SP)(SRND*1), y2; \ // y2 = SS1 + W
	ADDL h, y2; \ // y2 = h + SS1 + W
	ADDL (disp + 0*4 + 32)(SP)(SRND*1), y0; \ // y0 = SS2 + W'
	ADDL d, y0; \ // y0 = d + SS2 + W'
	; \
	MOVL a, y1; \
	XORL b, y1; \
	XORL c, y1; \
	ADDL y1, y0; \ // y0 = FF(a, b, c) + d + SS2 + W' = tt1
	; \
	MOVL e, y1; \
	XORL f, y1; \
	XORL g, y1; \
	ADDL y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	; \
	ROLL $9, b; \
	ROLL $19, f; \
	MOVL y0, h; \ // h = tt1
	; \
	RORXL $-9, y2, y0; \
	RORXL $-17, y2, y1; \
	XORL y0, y2; \
	XORL y1, y2; \
	MOVL y2, d // d = P(tt2)

#define ROUND_N_0_1(disp, const, a, b, c, d, e, f, g, h) \
	; \ // ############################# RND N + 1 ############################//
	RORXL $-12, a, y0; \ // y0 = a <<< 12
	MOVL e, y1; \
	ADDL $const, y1; \
	ADDL y0, y1; \ // y1 = a <<< 12 + e + T
	RORXL $-7, y1, y2; \ // y2 = SS1
	XORL y2, y0 \ // y0 = SS2
	ADDL (disp + 1*4)(SP)(SRND*1), y2; \ // y2 = SS1 + W
	ADDL h, y2; \ // y2 = h + SS1 + W
	ADDL (disp + 1*4 + 32)(SP)(SRND*1), y0; \ // y0 = SS2 + W'
	ADDL d, y0; \ // y0 = d + SS2 + W'
	; \
	MOVL a, y1; \
	XORL b, y1; \
	XORL c, y1; \
	ADDL y1, y0; \ // y0 = FF(a, b, c) + d + SS2 + W' = tt1
	; \
	MOVL e, y1; \
	XORL f, y1; \
	XORL g, y1; \
	ADDL y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	; \
	ROLL $9, b; \
	ROLL $19, f; \
	MOVL y0, h; \ // h = tt1
	; \
	RORXL $-9, y2, y0; \
	RORXL $-17, y2, y1; \
	XORL y0, y2; \
	XORL y1, y2; \
	MOVL y2, d // d = P(tt2)

#define ROUND_N_0_2(disp, const, a, b, c, d, e, f, g, h) \
	; \ // ############################# RND N + 2 ############################//
	RORXL $-12, a, y0; \ // y0 = a <<< 12
	MOVL e, y1; \
	ADDL $const, y1; \
	ADDL y0, y1; \ // y1 = a <<< 12 + e + T
	RORXL $-7, y1, y2; \ // y2 = SS1
	XORL y2, y0 \ // y0 = SS2
	ADDL (disp + 2*4)(SP)(SRND*1), y2; \ // y2 = SS1 + W
	ADDL h, y2; \ // y2 = h + SS1 + W
	ADDL (disp + 2*4 + 32)(SP)(SRND*1), y0; \ // y0 = SS2 + W'
	ADDL d, y0; \ // y0 = d + SS2 + W'
	; \
	MOVL a, y1; \
	XORL b, y1; \
	XORL c, y1; \
	ADDL y1, y0; \ // y0 = FF(a, b, c) + d + SS2 + W' = tt1
	; \
	MOVL e, y1; \
	XORL f, y1; \
	XORL g, y1; \
	ADDL y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	; \
	ROLL $9, b; \
	ROLL $19, f; \
	MOVL y0, h; \ // h = tt1
	; \
	RORXL $-9, y2, y0; \
	RORXL $-17, y2, y1; \
	XORL y0, y2; \
	XORL y1, y2; \
	MOVL y2, d // d = P(tt2)

#define ROUND_N_0_3(disp, const, a, b, c, d, e, f, g, h) \
	; \ // ############################# RND N + 3 ############################//
	RORXL $-12, a, y0; \ // y0 = a <<< 12
	MOVL e, y1; \
	ADDL $const, y1; \
	ADDL y0, y1; \ // y1 = a <<< 12 + e + T
	RORXL $-7, y1, y2; \ // y2 = SS1
	XORL y2, y0 \ // y0 = SS2
	ADDL (disp + 3*4)(SP)(SRND*1), y2; \ // y2 = SS1 + W
	ADDL h, y2; \ // y2 = h + SS1 + W
	ADDL (disp + 3*4 + 32)(SP)(SRND*1), y0; \ // y0 = SS2 + W'
	ADDL d, y0; \ // y0 = d + SS2 + W'
	; \
	MOVL a, y1; \
	XORL b, y1; \
	XORL c, y1; \
	ADDL y1, y0; \ // y0 = FF(a, b, c) + d + SS2 + W' = tt1
	; \
	MOVL e, y1; \
	XORL f, y1; \
	XORL g, y1; \
	ADDL y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	; \
	ROLL $9, b; \
	ROLL $19, f; \
	MOVL y0, h; \ // h = tt1
	; \
	RORXL $-9, y2, y0; \
	RORXL $-17, y2, y1; \
	XORL y0, y2; \
	XORL y1, y2; \
	MOVL y2, d // d = P(tt2)

#define ROUND_N_1_0(disp, const, a, b, c, d, e, f, g, h) \
	; \ // ############################# RND N + 0 ############################//
	RORXL $-12, a, y0; \ // y0 = a <<< 12
	MOVL e, y1; \
	ADDL $const, y1; \
	ADDL y0, y1; \ // y1 = a <<< 12 + e + T
	RORXL $-7, y1, y2; \ // y2 = SS1
	XORL y2, y0 \ // y0 = SS2
	ADDL (disp + 0*4)(SP)(SRND*1), y2; \ // y2 = SS1 + W
	ADDL h, y2; \ // y2 = h + SS1 + W
	ADDL (disp + 0*4 + 32)(SP)(SRND*1), y0; \ // y0 = SS2 + W'
	ADDL d, y0; \ // y0 = d + SS2 + W'
	; \
	MOVL a, y1; \
	MOVL b, y3; \
	ANDL y1, y3; \
	MOVL c, T1; \
	ANDL T1, y1; \
	ORL y3, y1; \ // y1 = (a AND b) OR (a AND c)
	MOVL b, y3; \
	ANDL T1, y3; \
	ORL y3, y1; \ // y1 = (a AND b) OR (a AND c) OR (b AND c)
	ADDL y1, y0; \ // y0 = FF(a, b, c) + d + SS2 + W' = tt1
	; \
	MOVL e, y1; \
	MOVL f, y3; \
	ANDL y1, y3; \ // y3 = e AND f
	NOTL y1; \
	MOVL g, T1; \
	ANDL T1, y1; \ // y1 = NOT(e) AND g
	ORL y3, y1; \ // y1 = (e AND f) OR (NOT(e) AND g)
	ADDL y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	; \
	ROLL $9, b; \
	ROLL $19, f; \
	MOVL y0, h; \ // h = tt1
	; \
	RORXL $-9, y2, y0; \
	RORXL $-17, y2, y1; \
	XORL y0, y2; \
	XORL y1, y2; \
	MOVL y2, d // d = P(tt2)

#define ROUND_N_1_1(disp, const, a, b, c, d, e, f, g, h) \
	; \ // ############################# RND N + 1 ############################//
	RORXL $-12, a, y0; \ // y0 = a <<< 12
	MOVL e, y1; \
	ADDL $const, y1; \
	ADDL y0, y1; \ // y1 = a <<< 12 + e + T
	RORXL $-7, y1, y2; \ // y2 = SS1
	XORL y2, y0 \ // y0 = SS2
	ADDL (disp + 1*4)(SP)(SRND*1), y2; \ // y2 = SS1 + W
	ADDL h, y2; \ // y2 = h + SS1 + W
	ADDL (disp + 1*4 + 32)(SP)(SRND*1), y0; \ // y0 = SS2 + W'
	ADDL d, y0; \ // y0 = d + SS2 + W'
	; \
	MOVL a, y1; \
	MOVL b, y3; \
	ANDL y1, y3; \
	MOVL c, T1; \
	ANDL T1, y1; \
	ORL y3, y1; \ // y1 = (a AND b) OR (a AND c)
	MOVL b, y3; \
	ANDL T1, y3; \
	ORL y3, y1; \ // y1 = (a AND b) OR (a AND c) OR (b AND c)
	ADDL y1, y0; \ // y0 = FF(a, b, c) + d + SS2 + W' = tt1
	; \
	MOVL e, y1; \
	MOVL f, y3; \
	ANDL y1, y3; \ // y3 = e AND f
	NOTL y1; \
	MOVL g, T1; \
	ANDL T1, y1; \ // y1 = NOT(e) AND g
	ORL y3, y1; \ // y1 = (e AND f) OR (NOT(e) AND g)
	ADDL y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	; \
	ROLL $9, b; \
	ROLL $19, f; \
	MOVL y0, h; \ // h = tt1
	; \
	RORXL $-9, y2, y0; \
	RORXL $-17, y2, y1; \
	XORL y0, y2; \
	XORL y1, y2; \
	MOVL y2, d // d = P(tt2)

#define ROUND_N_1_2(disp, const, a, b, c, d, e, f, g, h) \
	; \ // ############################# RND N + 2 ############################//
	RORXL $-12, a, y0; \ // y0 = a <<< 12
	MOVL e, y1; \
	ADDL $const, y1; \
	ADDL y0, y1; \ // y1 = a <<< 12 + e + T
	RORXL $-7, y1, y2; \ // y2 = SS1
	XORL y2, y0 \ // y0 = SS2
	ADDL (disp + 2*4)(SP)(SRND*1), y2; \ // y2 = SS1 + W
	ADDL h, y2; \ // y2 = h + SS1 + W
	ADDL (disp + 2*4 + 32)(SP)(SRND*1), y0; \ // y0 = SS2 + W'
	ADDL d, y0; \ // y0 = d + SS2 + W'
	; \
	MOVL a, y1; \
	MOVL b, y3; \
	ANDL y1, y3; \
	MOVL c, T1; \
	ANDL T1, y1; \
	ORL y3, y1; \ // y1 = (a AND b) OR (a AND c)
	MOVL b, y3; \
	ANDL T1, y3; \
	ORL y3, y1; \ // y1 = (a AND b) OR (a AND c) OR (b AND c)
	ADDL y1, y0; \ // y0 = FF(a, b, c) + d + SS2 + W' = tt1
	; \
	MOVL e, y1; \
	MOVL f, y3; \
	ANDL y1, y3; \ // y3 = e AND f
	NOTL y1; \
	MOVL g, T1; \
	ANDL T1, y1; \ // y1 = NOT(e) AND g
	ORL y3, y1; \ // y1 = (e AND f) OR (NOT(e) AND g)
	ADDL y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	; \
	ROLL $9, b; \
	ROLL $19, f; \
	MOVL y0, h; \ // h = tt1
	; \
	RORXL $-9, y2, y0; \
	RORXL $-17, y2, y1; \
	XORL y0, y2; \
	XORL y1, y2; \
	MOVL y2, d // d = P(tt2)

#define ROUND_N_1_3(disp, const, a, b, c, d, e, f, g, h) \
	; \ // ############################# RND N + 3 ############################//
	RORXL $-12, a, y0; \ // y0 = a <<< 12
	MOVL e, y1; \
	ADDL $const, y1; \
	ADDL y0, y1; \ // y1 = a <<< 12 + e + T
	RORXL $-7, y1, y2; \ // y2 = SS1
	XORL y2, y0 \ // y0 = SS2
	ADDL (disp + 3*4)(SP)(SRND*1), y2; \ // y2 = SS1 + W
	ADDL h, y2; \ // y2 = h + SS1 + W
	ADDL (disp + 3*4 + 32)(SP)(SRND*1), y0; \ // y0 = SS2 + W'
	ADDL d, y0; \ // y0 = d + SS2 + W'
	; \
	MOVL a, y1; \
	MOVL b, y3; \
	ANDL y1, y3; \
	MOVL c, T1; \
	ANDL T1, y1; \
	ORL y3, y1; \ // y1 = (a AND b) OR (a AND c)
	MOVL b, y3; \
	ANDL T1, y3; \
	ORL y3, y1; \ // y1 = (a AND b) OR (a AND c) OR (b AND c)
	ADDL y1, y0; \ // y0 = FF(a, b, c) + d + SS2 + W' = tt1
	; \
	MOVL e, y1; \
	MOVL f, y3; \
	ANDL y1, y3; \ // y3 = e AND f
	NOTL y1; \
	MOVL g, T1; \
	ANDL T1, y1; \ // y1 = NOT(e) AND g
	ORL y3, y1; \ // y1 = (e AND f) OR (NOT(e) AND g)
	ADDL y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	; \
	ROLL $9, b; \
	ROLL $19, f; \
	MOVL y0, h; \ // h = tt1
	; \
	RORXL $-9, y2, y0; \
	RORXL $-17, y2, y1; \
	XORL y0, y2; \
	XORL y1, y2; \
	MOVL y2, d // d = P(tt2)

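// Entry point for both the generic and the AVX2 implementation. Going by the
// frame arguments (dig+0, p_base+8, p_len+16 and the $1048-32 frame), the Go
// side presumably declares this as
//
//	func block(dig *digest, p []byte)
//
// with ·useAVX2 selecting the AVX2 path at run time; that declaration is an
// assumption, not part of this file.
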
TEXT ·block(SB), 0, $1048-32
	CMPB ·useAVX2(SB), $1
	JE avx2

	MOVQ p_base+8(FP), SI
	MOVQ p_len+16(FP), DX
	SHRQ $6, DX
	SHLQ $6, DX

	LEAQ (SI)(DX*1), DI
	MOVQ DI, 272(SP)
	CMPQ SI, DI
	JEQ end

	MOVQ dig+0(FP), BP
	MOVL (0*4)(BP), R8  // a = H0
	MOVL (1*4)(BP), R9  // b = H1
	MOVL (2*4)(BP), R10 // c = H2
	MOVL (3*4)(BP), R11 // d = H3
	MOVL (4*4)(BP), R12 // e = H4
	MOVL (5*4)(BP), R13 // f = H5
	MOVL (6*4)(BP), R14 // g = H6
	MOVL (7*4)(BP), R15 // h = H7

loop:
	MOVQ SP, BP

	MSGSCHEDULE0(0)
	MSGSCHEDULE0(1)
	MSGSCHEDULE0(2)
	MSGSCHEDULE0(3)

	SM3ROUND0(0, 0x79cc4519, R8, R9, R10, R11, R12, R13, R14, R15)
	SM3ROUND0(1, 0xf3988a32, R15, R8, R9, R10, R11, R12, R13, R14)
	SM3ROUND0(2, 0xe7311465, R14, R15, R8, R9, R10, R11, R12, R13)
	SM3ROUND0(3, 0xce6228cb, R13, R14, R15, R8, R9, R10, R11, R12)
	SM3ROUND0(4, 0x9cc45197, R12, R13, R14, R15, R8, R9, R10, R11)
	SM3ROUND0(5, 0x3988a32f, R11, R12, R13, R14, R15, R8, R9, R10)
	SM3ROUND0(6, 0x7311465e, R10, R11, R12, R13, R14, R15, R8, R9)
	SM3ROUND0(7, 0xe6228cbc, R9, R10, R11, R12, R13, R14, R15, R8)
	SM3ROUND0(8, 0xcc451979, R8, R9, R10, R11, R12, R13, R14, R15)
	SM3ROUND0(9, 0x988a32f3, R15, R8, R9, R10, R11, R12, R13, R14)
	SM3ROUND0(10, 0x311465e7, R14, R15, R8, R9, R10, R11, R12, R13)
	SM3ROUND0(11, 0x6228cbce, R13, R14, R15, R8, R9, R10, R11, R12)

	SM3ROUND1(12, 0xc451979c, R12, R13, R14, R15, R8, R9, R10, R11)
	SM3ROUND1(13, 0x88a32f39, R11, R12, R13, R14, R15, R8, R9, R10)
	SM3ROUND1(14, 0x11465e73, R10, R11, R12, R13, R14, R15, R8, R9)
	SM3ROUND1(15, 0x228cbce6, R9, R10, R11, R12, R13, R14, R15, R8)

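	// Rounds 16-63: SM3ROUND2 switches to the second pair of boolean
	// functions (majority for FF, choose for GG) via SM3TT11/SM3TT21.
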
	SM3ROUND2(16, 0x9d8a7a87, R8, R9, R10, R11, R12, R13, R14, R15)
	SM3ROUND2(17, 0x3b14f50f, R15, R8, R9, R10, R11, R12, R13, R14)
	SM3ROUND2(18, 0x7629ea1e, R14, R15, R8, R9, R10, R11, R12, R13)
	SM3ROUND2(19, 0xec53d43c, R13, R14, R15, R8, R9, R10, R11, R12)
	SM3ROUND2(20, 0xd8a7a879, R12, R13, R14, R15, R8, R9, R10, R11)
	SM3ROUND2(21, 0xb14f50f3, R11, R12, R13, R14, R15, R8, R9, R10)
	SM3ROUND2(22, 0x629ea1e7, R10, R11, R12, R13, R14, R15, R8, R9)
	SM3ROUND2(23, 0xc53d43ce, R9, R10, R11, R12, R13, R14, R15, R8)
	SM3ROUND2(24, 0x8a7a879d, R8, R9, R10, R11, R12, R13, R14, R15)
	SM3ROUND2(25, 0x14f50f3b, R15, R8, R9, R10, R11, R12, R13, R14)
	SM3ROUND2(26, 0x29ea1e76, R14, R15, R8, R9, R10, R11, R12, R13)
	SM3ROUND2(27, 0x53d43cec, R13, R14, R15, R8, R9, R10, R11, R12)
	SM3ROUND2(28, 0xa7a879d8, R12, R13, R14, R15, R8, R9, R10, R11)
	SM3ROUND2(29, 0x4f50f3b1, R11, R12, R13, R14, R15, R8, R9, R10)
	SM3ROUND2(30, 0x9ea1e762, R10, R11, R12, R13, R14, R15, R8, R9)
	SM3ROUND2(31, 0x3d43cec5, R9, R10, R11, R12, R13, R14, R15, R8)
	SM3ROUND2(32, 0x7a879d8a, R8, R9, R10, R11, R12, R13, R14, R15)
	SM3ROUND2(33, 0xf50f3b14, R15, R8, R9, R10, R11, R12, R13, R14)
	SM3ROUND2(34, 0xea1e7629, R14, R15, R8, R9, R10, R11, R12, R13)
	SM3ROUND2(35, 0xd43cec53, R13, R14, R15, R8, R9, R10, R11, R12)
	SM3ROUND2(36, 0xa879d8a7, R12, R13, R14, R15, R8, R9, R10, R11)
	SM3ROUND2(37, 0x50f3b14f, R11, R12, R13, R14, R15, R8, R9, R10)
	SM3ROUND2(38, 0xa1e7629e, R10, R11, R12, R13, R14, R15, R8, R9)
	SM3ROUND2(39, 0x43cec53d, R9, R10, R11, R12, R13, R14, R15, R8)
	SM3ROUND2(40, 0x879d8a7a, R8, R9, R10, R11, R12, R13, R14, R15)
	SM3ROUND2(41, 0xf3b14f5, R15, R8, R9, R10, R11, R12, R13, R14)
	SM3ROUND2(42, 0x1e7629ea, R14, R15, R8, R9, R10, R11, R12, R13)
	SM3ROUND2(43, 0x3cec53d4, R13, R14, R15, R8, R9, R10, R11, R12)
	SM3ROUND2(44, 0x79d8a7a8, R12, R13, R14, R15, R8, R9, R10, R11)
	SM3ROUND2(45, 0xf3b14f50, R11, R12, R13, R14, R15, R8, R9, R10)
	SM3ROUND2(46, 0xe7629ea1, R10, R11, R12, R13, R14, R15, R8, R9)
	SM3ROUND2(47, 0xcec53d43, R9, R10, R11, R12, R13, R14, R15, R8)
	SM3ROUND2(48, 0x9d8a7a87, R8, R9, R10, R11, R12, R13, R14, R15)
	SM3ROUND2(49, 0x3b14f50f, R15, R8, R9, R10, R11, R12, R13, R14)
	SM3ROUND2(50, 0x7629ea1e, R14, R15, R8, R9, R10, R11, R12, R13)
	SM3ROUND2(51, 0xec53d43c, R13, R14, R15, R8, R9, R10, R11, R12)
	SM3ROUND2(52, 0xd8a7a879, R12, R13, R14, R15, R8, R9, R10, R11)
	SM3ROUND2(53, 0xb14f50f3, R11, R12, R13, R14, R15, R8, R9, R10)
	SM3ROUND2(54, 0x629ea1e7, R10, R11, R12, R13, R14, R15, R8, R9)
	SM3ROUND2(55, 0xc53d43ce, R9, R10, R11, R12, R13, R14, R15, R8)
	SM3ROUND2(56, 0x8a7a879d, R8, R9, R10, R11, R12, R13, R14, R15)
	SM3ROUND2(57, 0x14f50f3b, R15, R8, R9, R10, R11, R12, R13, R14)
	SM3ROUND2(58, 0x29ea1e76, R14, R15, R8, R9, R10, R11, R12, R13)
	SM3ROUND2(59, 0x53d43cec, R13, R14, R15, R8, R9, R10, R11, R12)
	SM3ROUND2(60, 0xa7a879d8, R12, R13, R14, R15, R8, R9, R10, R11)
	SM3ROUND2(61, 0x4f50f3b1, R11, R12, R13, R14, R15, R8, R9, R10)
	SM3ROUND2(62, 0x9ea1e762, R10, R11, R12, R13, R14, R15, R8, R9)
	SM3ROUND2(63, 0x3d43cec5, R9, R10, R11, R12, R13, R14, R15, R8)

	MOVQ dig+0(FP), BP

	XORL (0*4)(BP), R8 // H0 = a XOR H0
	MOVL R8, (0*4)(BP)
	XORL (1*4)(BP), R9 // H1 = b XOR H1
	MOVL R9, (1*4)(BP)
	XORL (2*4)(BP), R10 // H2 = c XOR H2
	MOVL R10, (2*4)(BP)
	XORL (3*4)(BP), R11 // H3 = d XOR H3
	MOVL R11, (3*4)(BP)
	XORL (4*4)(BP), R12 // H4 = e XOR H4
	MOVL R12, (4*4)(BP)
	XORL (5*4)(BP), R13 // H5 = f XOR H5
	MOVL R13, (5*4)(BP)
	XORL (6*4)(BP), R14 // H6 = g XOR H6
	MOVL R14, (6*4)(BP)
	XORL (7*4)(BP), R15 // H7 = h XOR H7
	MOVL R15, (7*4)(BP)

	ADDQ $64, SI
	CMPQ SI, 272(SP)
	JB loop

end:
	RET

avx2:
	MOVQ dig+0(FP), CTX // d.h[8]
	MOVQ p_base+8(FP), INP
	MOVQ p_len+16(FP), NUM_BYTES

	LEAQ -64(INP)(NUM_BYTES*1), NUM_BYTES // Pointer to the last block
	MOVQ NUM_BYTES, _INP_END(SP)

	CMPQ NUM_BYTES, INP
	JE avx2_only_one_block

	// Load initial digest
	MOVL 0(CTX), a  // a = H0
	MOVL 4(CTX), b  // b = H1
	MOVL 8(CTX), c  // c = H2
	MOVL 12(CTX), d // d = H3
	MOVL 16(CTX), e // e = H4
	MOVL 20(CTX), f // f = H5
	MOVL 24(CTX), g // g = H6
	MOVL 28(CTX), h // h = H7

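// The AVX2 code below keeps two 64-byte blocks interleaved in the Y registers:
// the low 128-bit lanes hold the words of the current block and the high
// lanes hold the next block, so one pass over avx2_loop1 hashes the first
// block while storing the message schedule for both.
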
avx2_loop0: // each iteration loads two blocks (2 x 512 bits); the second one is processed in avx2_loop3

	VMOVDQU (0*32)(INP), XTMP0
	VMOVDQU (1*32)(INP), XTMP1
	VMOVDQU (2*32)(INP), XTMP2
	VMOVDQU (3*32)(INP), XTMP3

	VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK

	// Apply Byte Flip Mask: LE -> BE
	VPSHUFB BYTE_FLIP_MASK, XTMP0, XTMP0
	VPSHUFB BYTE_FLIP_MASK, XTMP1, XTMP1
	VPSHUFB BYTE_FLIP_MASK, XTMP2, XTMP2
	VPSHUFB BYTE_FLIP_MASK, XTMP3, XTMP3

	// Transpose data into high/low parts
	VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w19, w18, w17, w16; w3, w2, w1, w0
	VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w23, w22, w21, w20; w7, w6, w5, w4
	VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w27, w26, w25, w24; w11, w10, w9, w8
	VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w31, w30, w29, w28; w15, w14, w13, w12

avx2_last_block_enter:
	ADDQ $64, INP
	MOVQ INP, _INP(SP)
	XORQ SRND, SRND

avx2_loop1: // for w0 - w47
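	// Each 4-round group stores W at (_XFER + k*32)(SP)(SRND*1) and
	// W XOR W+4 in the following 32 bytes; the round macros read these back
	// as (disp + i*4) and (disp + i*4 + 32). The "+ 16" displacements used in
	// avx2_loop3 select the high lanes, i.e. the second block.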
	// Do 4 rounds and scheduling
	VMOVDQU XDWORD0, (_XFER + 0*32)(SP)(SRND*1)
	VPXOR XDWORD0, XDWORD1, XFER
	VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0_0(_XFER + 0*32, 0x79cc4519, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_0_1(_XFER + 0*32, 0xf3988a32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_0_2(_XFER + 0*32, 0xe7311465, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_0_3(_XFER + 0*32, 0xce6228cb, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

	// Do 4 rounds and scheduling
	VMOVDQU XDWORD1, (_XFER + 2*32)(SP)(SRND*1)
	VPXOR XDWORD1, XDWORD2, XFER
	VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0_0(_XFER + 2*32, 0x9cc45197, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_0_1(_XFER + 2*32, 0x3988a32f, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_0_2(_XFER + 2*32, 0x7311465e, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_0_3(_XFER + 2*32, 0xe6228cbc, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)

	// Do 4 rounds and scheduling
	VMOVDQU XDWORD2, (_XFER + 4*32)(SP)(SRND*1)
	VPXOR XDWORD2, XDWORD3, XFER
	VMOVDQU XFER, (_XFER + 5*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0_0(_XFER + 4*32, 0xcc451979, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_0_1(_XFER + 4*32, 0x988a32f3, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_0_2(_XFER + 4*32, 0x311465e7, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_0_3(_XFER + 4*32, 0x6228cbce, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)

	// Do 4 rounds and scheduling
	VMOVDQU XDWORD3, (_XFER + 6*32)(SP)(SRND*1)
	VPXOR XDWORD3, XDWORD0, XFER
	VMOVDQU XFER, (_XFER + 7*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0_0(_XFER + 6*32, 0xc451979c, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_0_1(_XFER + 6*32, 0x88a32f39, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_0_2(_XFER + 6*32, 0x11465e73, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_0_3(_XFER + 6*32, 0x228cbce6, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)

	ADDQ $8*32, SRND

	// Do 4 rounds and scheduling
	VMOVDQU XDWORD0, (_XFER + 0*32)(SP)(SRND*1)
	VPXOR XDWORD0, XDWORD1, XFER
	VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_1_0(_XFER + 0*32, 0x9d8a7a87, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_1_1(_XFER + 0*32, 0x3b14f50f, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_1_2(_XFER + 0*32, 0x7629ea1e, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_1_3(_XFER + 0*32, 0xec53d43c, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

	// Do 4 rounds and scheduling
	VMOVDQU XDWORD1, (_XFER + 2*32)(SP)(SRND*1)
	VPXOR XDWORD1, XDWORD2, XFER
	VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_1_0(_XFER + 2*32, 0xd8a7a879, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_1_1(_XFER + 2*32, 0xb14f50f3, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_1_2(_XFER + 2*32, 0x629ea1e7, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_1_3(_XFER + 2*32, 0xc53d43ce, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)

	// Do 4 rounds and scheduling
	VMOVDQU XDWORD2, (_XFER + 4*32)(SP)(SRND*1)
	VPXOR XDWORD2, XDWORD3, XFER
	VMOVDQU XFER, (_XFER + 5*32)(SP)(SRND*1)

	ROUND_AND_SCHED_N_1_0(_XFER + 4*32, 0x8a7a879d, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_1_1(_XFER + 4*32, 0x14f50f3b, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_1_2(_XFER + 4*32, 0x29ea1e76, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_1_3(_XFER + 4*32, 0x53d43cec, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)

	// Do 4 rounds and scheduling
	VMOVDQU XDWORD3, (_XFER + 6*32)(SP)(SRND*1)
	VPXOR XDWORD3, XDWORD0, XFER
	VMOVDQU XFER, (_XFER + 7*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_1_0(_XFER + 6*32, 0xa7a879d8, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_1_1(_XFER + 6*32, 0x4f50f3b1, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_1_2(_XFER + 6*32, 0x9ea1e762, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_1_3(_XFER + 6*32, 0x3d43cec5, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)

	ADDQ $8*32, SRND

	// Do 4 rounds and scheduling
	VMOVDQU XDWORD0, (_XFER + 0*32)(SP)(SRND*1)
	VPXOR XDWORD0, XDWORD1, XFER
	VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_1_0(_XFER + 0*32, 0x7a879d8a, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_1_1(_XFER + 0*32, 0xf50f3b14, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_1_2(_XFER + 0*32, 0xea1e7629, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_1_3(_XFER + 0*32, 0xd43cec53, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

	// Do 4 rounds and scheduling
	VMOVDQU XDWORD1, (_XFER + 2*32)(SP)(SRND*1)
	VPXOR XDWORD1, XDWORD2, XFER
	VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_1_0(_XFER + 2*32, 0xa879d8a7, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_1_1(_XFER + 2*32, 0x50f3b14f, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_1_2(_XFER + 2*32, 0xa1e7629e, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_1_3(_XFER + 2*32, 0x43cec53d, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)

	// Do 4 rounds and scheduling
	VMOVDQU XDWORD2, (_XFER + 4*32)(SP)(SRND*1)
	VPXOR XDWORD2, XDWORD3, XFER
	VMOVDQU XFER, (_XFER + 5*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_1_0(_XFER + 4*32, 0x879d8a7a, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_1_1(_XFER + 4*32, 0xf3b14f5, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_1_2(_XFER + 4*32, 0x1e7629ea, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_1_3(_XFER + 4*32, 0x3cec53d4, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)

	// Do 4 rounds and scheduling
	VMOVDQU XDWORD3, (_XFER + 6*32)(SP)(SRND*1)
	VPXOR XDWORD3, XDWORD0, XFER
	VMOVDQU XFER, (_XFER + 7*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_1_0(_XFER + 6*32, 0x79d8a7a8, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_1_1(_XFER + 6*32, 0xf3b14f50, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_1_2(_XFER + 6*32, 0xe7629ea1, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_1_3(_XFER + 6*32, 0xcec53d43, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)

	ADDQ $8*32, SRND

	// w48 - w63 processed with no scheduling (last 16 rounds)
	// Do 4 rounds and scheduling
	VMOVDQU XDWORD0, (_XFER + 0*32)(SP)(SRND*1)
	VPXOR XDWORD0, XDWORD1, XFER
	VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_1_0(_XFER + 0*32, 0x9d8a7a87, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_1_1(_XFER + 0*32, 0x3b14f50f, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_1_2(_XFER + 0*32, 0x7629ea1e, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_1_3(_XFER + 0*32, 0xec53d43c, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

	// Do 4 rounds
	VMOVDQU XDWORD1, (_XFER + 2*32)(SP)(SRND*1)
	VPXOR XDWORD1, XDWORD2, XFER
	VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1)
	ROUND_N_1_0(_XFER + 2*32, 0xd8a7a879, e, f, g, h, a, b, c, d)
	ROUND_N_1_1(_XFER + 2*32, 0xb14f50f3, d, e, f, g, h, a, b, c)
	ROUND_N_1_2(_XFER + 2*32, 0x629ea1e7, c, d, e, f, g, h, a, b)
	ROUND_N_1_3(_XFER + 2*32, 0xc53d43ce, b, c, d, e, f, g, h, a)

	// Do 4 rounds
	VMOVDQU XDWORD2, (_XFER + 4*32)(SP)(SRND*1)
	VPXOR XDWORD2, XDWORD3, XFER
	VMOVDQU XFER, (_XFER + 5*32)(SP)(SRND*1)
	ROUND_N_1_0(_XFER + 4*32, 0x8a7a879d, a, b, c, d, e, f, g, h)
	ROUND_N_1_1(_XFER + 4*32, 0x14f50f3b, h, a, b, c, d, e, f, g)
	ROUND_N_1_2(_XFER + 4*32, 0x29ea1e76, g, h, a, b, c, d, e, f)
	ROUND_N_1_3(_XFER + 4*32, 0x53d43cec, f, g, h, a, b, c, d, e)

	// Do 4 rounds
	VMOVDQU XDWORD3, (_XFER + 6*32)(SP)(SRND*1)
	VPXOR XDWORD3, XDWORD0, XFER
	VMOVDQU XFER, (_XFER + 7*32)(SP)(SRND*1)
	ROUND_N_1_0(_XFER + 6*32, 0xa7a879d8, e, f, g, h, a, b, c, d)
	ROUND_N_1_1(_XFER + 6*32, 0x4f50f3b1, d, e, f, g, h, a, b, c)
	ROUND_N_1_2(_XFER + 6*32, 0x9ea1e762, c, d, e, f, g, h, a, b)
	ROUND_N_1_3(_XFER + 6*32, 0x3d43cec5, b, c, d, e, f, g, h, a)

	MOVQ dig+0(FP), CTX // d.h[8]
	MOVQ _INP(SP), INP

	xorm( 0(CTX), a)
	xorm( 4(CTX), b)
	xorm( 8(CTX), c)
	xorm( 12(CTX), d)
	xorm( 16(CTX), e)
	xorm( 20(CTX), f)
	xorm( 24(CTX), g)
	xorm( 28(CTX), h)

	CMPQ _INP_END(SP), INP
	JB done_hash

	XORQ SRND, SRND

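// Second block: the message schedule is not recomputed; the rounds below
// reuse the W / W XOR W+4 values stored during avx2_loop1, reading the high
// 128-bit lane of each 32-byte slot via the "+ 16" displacement.
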
avx2_loop3: // Do second block using previously scheduled results
	ROUND_N_0_0(_XFER + 0*32 + 16, 0x79cc4519, a, b, c, d, e, f, g, h)
	ROUND_N_0_1(_XFER + 0*32 + 16, 0xf3988a32, h, a, b, c, d, e, f, g)
	ROUND_N_0_2(_XFER + 0*32 + 16, 0xe7311465, g, h, a, b, c, d, e, f)
	ROUND_N_0_3(_XFER + 0*32 + 16, 0xce6228cb, f, g, h, a, b, c, d, e)

	ROUND_N_0_0(_XFER + 2*32 + 16, 0x9cc45197, e, f, g, h, a, b, c, d)
	ROUND_N_0_1(_XFER + 2*32 + 16, 0x3988a32f, d, e, f, g, h, a, b, c)
	ROUND_N_0_2(_XFER + 2*32 + 16, 0x7311465e, c, d, e, f, g, h, a, b)
	ROUND_N_0_3(_XFER + 2*32 + 16, 0xe6228cbc, b, c, d, e, f, g, h, a)

	ROUND_N_0_0(_XFER + 4*32 + 16, 0xcc451979, a, b, c, d, e, f, g, h)
	ROUND_N_0_1(_XFER + 4*32 + 16, 0x988a32f3, h, a, b, c, d, e, f, g)
	ROUND_N_0_2(_XFER + 4*32 + 16, 0x311465e7, g, h, a, b, c, d, e, f)
	ROUND_N_0_3(_XFER + 4*32 + 16, 0x6228cbce, f, g, h, a, b, c, d, e)

	ROUND_N_0_0(_XFER + 6*32 + 16, 0xc451979c, e, f, g, h, a, b, c, d)
	ROUND_N_0_1(_XFER + 6*32 + 16, 0x88a32f39, d, e, f, g, h, a, b, c)
	ROUND_N_0_2(_XFER + 6*32 + 16, 0x11465e73, c, d, e, f, g, h, a, b)
	ROUND_N_0_3(_XFER + 6*32 + 16, 0x228cbce6, b, c, d, e, f, g, h, a)

	ADDQ $8*32, SRND

	ROUND_N_1_0(_XFER + 0*32 + 16, 0x9d8a7a87, a, b, c, d, e, f, g, h)
	ROUND_N_1_1(_XFER + 0*32 + 16, 0x3b14f50f, h, a, b, c, d, e, f, g)
	ROUND_N_1_2(_XFER + 0*32 + 16, 0x7629ea1e, g, h, a, b, c, d, e, f)
	ROUND_N_1_3(_XFER + 0*32 + 16, 0xec53d43c, f, g, h, a, b, c, d, e)

	ROUND_N_1_0(_XFER + 2*32 + 16, 0xd8a7a879, e, f, g, h, a, b, c, d)
	ROUND_N_1_1(_XFER + 2*32 + 16, 0xb14f50f3, d, e, f, g, h, a, b, c)
	ROUND_N_1_2(_XFER + 2*32 + 16, 0x629ea1e7, c, d, e, f, g, h, a, b)
	ROUND_N_1_3(_XFER + 2*32 + 16, 0xc53d43ce, b, c, d, e, f, g, h, a)

	ROUND_N_1_0(_XFER + 4*32 + 16, 0x8a7a879d, a, b, c, d, e, f, g, h)
	ROUND_N_1_1(_XFER + 4*32 + 16, 0x14f50f3b, h, a, b, c, d, e, f, g)
	ROUND_N_1_2(_XFER + 4*32 + 16, 0x29ea1e76, g, h, a, b, c, d, e, f)
	ROUND_N_1_3(_XFER + 4*32 + 16, 0x53d43cec, f, g, h, a, b, c, d, e)

	ROUND_N_1_0(_XFER + 6*32 + 16, 0xa7a879d8, e, f, g, h, a, b, c, d)
	ROUND_N_1_1(_XFER + 6*32 + 16, 0x4f50f3b1, d, e, f, g, h, a, b, c)
	ROUND_N_1_2(_XFER + 6*32 + 16, 0x9ea1e762, c, d, e, f, g, h, a, b)
	ROUND_N_1_3(_XFER + 6*32 + 16, 0x3d43cec5, b, c, d, e, f, g, h, a)

	ADDQ $8*32, SRND

	ROUND_N_1_0(_XFER + 0*32 + 16, 0x7a879d8a, a, b, c, d, e, f, g, h)
	ROUND_N_1_1(_XFER + 0*32 + 16, 0xf50f3b14, h, a, b, c, d, e, f, g)
	ROUND_N_1_2(_XFER + 0*32 + 16, 0xea1e7629, g, h, a, b, c, d, e, f)
	ROUND_N_1_3(_XFER + 0*32 + 16, 0xd43cec53, f, g, h, a, b, c, d, e)

	ROUND_N_1_0(_XFER + 2*32 + 16, 0xa879d8a7, e, f, g, h, a, b, c, d)
	ROUND_N_1_1(_XFER + 2*32 + 16, 0x50f3b14f, d, e, f, g, h, a, b, c)
	ROUND_N_1_2(_XFER + 2*32 + 16, 0xa1e7629e, c, d, e, f, g, h, a, b)
	ROUND_N_1_3(_XFER + 2*32 + 16, 0x43cec53d, b, c, d, e, f, g, h, a)

	ROUND_N_1_0(_XFER + 4*32 + 16, 0x879d8a7a, a, b, c, d, e, f, g, h)
	ROUND_N_1_1(_XFER + 4*32 + 16, 0xf3b14f5, h, a, b, c, d, e, f, g)
	ROUND_N_1_2(_XFER + 4*32 + 16, 0x1e7629ea, g, h, a, b, c, d, e, f)
	ROUND_N_1_3(_XFER + 4*32 + 16, 0x3cec53d4, f, g, h, a, b, c, d, e)

	ROUND_N_1_0(_XFER + 6*32 + 16, 0x79d8a7a8, e, f, g, h, a, b, c, d)
	ROUND_N_1_1(_XFER + 6*32 + 16, 0xf3b14f50, d, e, f, g, h, a, b, c)
	ROUND_N_1_2(_XFER + 6*32 + 16, 0xe7629ea1, c, d, e, f, g, h, a, b)
	ROUND_N_1_3(_XFER + 6*32 + 16, 0xcec53d43, b, c, d, e, f, g, h, a)

	ADDQ $8*32, SRND

	ROUND_N_1_0(_XFER + 0*32 + 16, 0x9d8a7a87, a, b, c, d, e, f, g, h)
	ROUND_N_1_1(_XFER + 0*32 + 16, 0x3b14f50f, h, a, b, c, d, e, f, g)
	ROUND_N_1_2(_XFER + 0*32 + 16, 0x7629ea1e, g, h, a, b, c, d, e, f)
	ROUND_N_1_3(_XFER + 0*32 + 16, 0xec53d43c, f, g, h, a, b, c, d, e)

	ROUND_N_1_0(_XFER + 2*32 + 16, 0xd8a7a879, e, f, g, h, a, b, c, d)
	ROUND_N_1_1(_XFER + 2*32 + 16, 0xb14f50f3, d, e, f, g, h, a, b, c)
	ROUND_N_1_2(_XFER + 2*32 + 16, 0x629ea1e7, c, d, e, f, g, h, a, b)
	ROUND_N_1_3(_XFER + 2*32 + 16, 0xc53d43ce, b, c, d, e, f, g, h, a)

	ROUND_N_1_0(_XFER + 4*32 + 16, 0x8a7a879d, a, b, c, d, e, f, g, h)
	ROUND_N_1_1(_XFER + 4*32 + 16, 0x14f50f3b, h, a, b, c, d, e, f, g)
	ROUND_N_1_2(_XFER + 4*32 + 16, 0x29ea1e76, g, h, a, b, c, d, e, f)
	ROUND_N_1_3(_XFER + 4*32 + 16, 0x53d43cec, f, g, h, a, b, c, d, e)

	ROUND_N_1_0(_XFER + 6*32 + 16, 0xa7a879d8, e, f, g, h, a, b, c, d)
	ROUND_N_1_1(_XFER + 6*32 + 16, 0x4f50f3b1, d, e, f, g, h, a, b, c)
	ROUND_N_1_2(_XFER + 6*32 + 16, 0x9ea1e762, c, d, e, f, g, h, a, b)
	ROUND_N_1_3(_XFER + 6*32 + 16, 0x3d43cec5, b, c, d, e, f, g, h, a)

	MOVQ dig+0(FP), CTX // d.h[8]
	MOVQ _INP(SP), INP
	ADDQ $64, INP

	xorm( 0(CTX), a)
	xorm( 4(CTX), b)
	xorm( 8(CTX), c)
	xorm( 12(CTX), d)
	xorm( 16(CTX), e)
	xorm( 20(CTX), f)
	xorm( 24(CTX), g)
	xorm( 28(CTX), h)

	CMPQ _INP_END(SP), INP
	JA avx2_loop0
	JB done_hash

avx2_do_last_block:

	VMOVDQU 0(INP), XWORD0
	VMOVDQU 16(INP), XWORD1
	VMOVDQU 32(INP), XWORD2
	VMOVDQU 48(INP), XWORD3

	VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK

	VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
	VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
	VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
	VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3

	JMP avx2_last_block_enter

avx2_only_one_block:
	// Load initial digest
	MOVL 0(CTX), a  // a = H0
	MOVL 4(CTX), b  // b = H1
	MOVL 8(CTX), c  // c = H2
	MOVL 12(CTX), d // d = H3
	MOVL 16(CTX), e // e = H4
	MOVL 20(CTX), f // f = H5
	MOVL 24(CTX), g // g = H6
	MOVL 28(CTX), h // h = H7

	JMP avx2_do_last_block

done_hash:
	VZEROUPPER
	RET

// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
DATA flip_mask<>+0x10(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x18(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), 8, $32

// shuffle BxAx -> 00BA
DATA shuff_00BA<>+0x00(SB)/8, $0x0f0e0d0c07060504
DATA shuff_00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_00BA<>+0x10(SB)/8, $0x0f0e0d0c07060504
DATA shuff_00BA<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
GLOBL shuff_00BA<>(SB), 8, $32

// shuffle DxCx -> DC00
DATA shuff_DC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_DC00<>+0x08(SB)/8, $0x0f0e0d0c07060504
DATA shuff_DC00<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_DC00<>+0x18(SB)/8, $0x0f0e0d0c07060504
GLOBL shuff_DC00<>(SB), 8, $32