github.com/zebozhuang/go@v0.0.0-20200207033046-f8a98f6f5c5d/src/crypto/sha256/sha256block_amd64.s

// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// SHA256 block routine. See sha256block.go for Go equivalent.
//
// The algorithm is detailed in FIPS 180-4:
//
//   http://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf

// The avx2-version is described in an Intel White-Paper:
// "Fast SHA-256 Implementations on Intel Architecture Processors"
// To find it, surf to http://www.intel.com/p/en_US/embedded
// and search for that title.
// AVX2 version by Intel, same algorithm as code in Linux kernel:
// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S
// by
//     James Guilford <james.guilford@intel.com>
//     Kirk Yap <kirk.s.yap@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>

// Wt = Mt; for 0 <= t <= 15
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
//
// a = H0
// b = H1
// c = H2
// d = H3
// e = H4
// f = H5
// g = H6
// h = H7
//
// for t = 0 to 63 {
//    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
//    T2 = BIGSIGMA0(a) + Maj(a,b,c)
//    h = g
//    g = f
//    f = e
//    e = d + T1
//    d = c
//    c = b
//    b = a
//    a = T1 + T2
// }
//
// H0 = a + H0
// H1 = b + H1
// H2 = c + H2
// H3 = d + H3
// H4 = e + H4
// H5 = f + H5
// H6 = g + H6
// H7 = h + H7
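
// Reference sketch: a minimal Go transcription of the pseudocode above, kept
// as a commented-out reading aid for the macros that follow (see
// sha256block.go for the package's real Go implementation). The names below
// (blockSketch, _K, the helper closures) are illustrative assumptions only.
//
//	import (
//		"encoding/binary"
//		"math/bits"
//	)
//
//	func blockSketch(h *[8]uint32, p []byte, _K *[64]uint32) {
//		ch := func(x, y, z uint32) uint32 { return (x & y) ^ (^x & z) }
//		maj := func(x, y, z uint32) uint32 { return (x & y) ^ (x & z) ^ (y & z) }
//		bigSigma0 := func(x uint32) uint32 { return bits.RotateLeft32(x, -2) ^ bits.RotateLeft32(x, -13) ^ bits.RotateLeft32(x, -22) }
//		bigSigma1 := func(x uint32) uint32 { return bits.RotateLeft32(x, -6) ^ bits.RotateLeft32(x, -11) ^ bits.RotateLeft32(x, -25) }
//		sigma0 := func(x uint32) uint32 { return bits.RotateLeft32(x, -7) ^ bits.RotateLeft32(x, -18) ^ (x >> 3) }
//		sigma1 := func(x uint32) uint32 { return bits.RotateLeft32(x, -17) ^ bits.RotateLeft32(x, -19) ^ (x >> 10) }
//
//		var w [64]uint32
//		for len(p) >= 64 {
//			for t := 0; t < 16; t++ {
//				w[t] = binary.BigEndian.Uint32(p[4*t:])
//			}
//			for t := 16; t < 64; t++ {
//				w[t] = sigma1(w[t-2]) + w[t-7] + sigma0(w[t-15]) + w[t-16]
//			}
//			a, b, c, d, e, f, g, hh := h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7]
//			for t := 0; t < 64; t++ {
//				t1 := hh + bigSigma1(e) + ch(e, f, g) + _K[t] + w[t]
//				t2 := bigSigma0(a) + maj(a, b, c)
//				hh, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2
//			}
//			h[0] += a; h[1] += b; h[2] += c; h[3] += d
//			h[4] += e; h[5] += f; h[6] += g; h[7] += hh
//			p = p[64:]
//		}
//	}
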
// Wt = Mt; for 0 <= t <= 15
#define MSGSCHEDULE0(index) \
	MOVL	(index*4)(SI), AX; \
	BSWAPL	AX; \
	MOVL	AX, (index*4)(BP)

// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
//   SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x)
//   SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x)
#define MSGSCHEDULE1(index) \
	MOVL	((index-2)*4)(BP), AX; \
	MOVL	AX, CX; \
	RORL	$17, AX; \
	MOVL	CX, DX; \
	RORL	$19, CX; \
	SHRL	$10, DX; \
	MOVL	((index-15)*4)(BP), BX; \
	XORL	CX, AX; \
	MOVL	BX, CX; \
	XORL	DX, AX; \
	RORL	$7, BX; \
	MOVL	CX, DX; \
	SHRL	$3, DX; \
	RORL	$18, CX; \
	ADDL	((index-7)*4)(BP), AX; \
	XORL	CX, BX; \
	XORL	DX, BX; \
	ADDL	((index-16)*4)(BP), BX; \
	ADDL	BX, AX; \
	MOVL	AX, ((index)*4)(BP)
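
// A rough Go analogue of MSGSCHEDULE0's MOVL+BSWAPL pair, assuming p is the
// current 64-byte block: the message words are big-endian, so per word
//
//	w[i] = binary.BigEndian.Uint32(p[4*i:])
//
// which on a little-endian load amounts to bits.ReverseBytes32.
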
// Calculate T1 in AX - uses AX, CX and DX registers.
// h is also used as an accumulator. Wt is passed in AX.
//   T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
//   BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x)
//   Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
#define SHA256T1(const, e, f, g, h) \
	ADDL	AX, h; \
	MOVL	e, AX; \
	ADDL	$const, h; \
	MOVL	e, CX; \
	RORL	$6, AX; \
	MOVL	e, DX; \
	RORL	$11, CX; \
	XORL	CX, AX; \
	MOVL	e, CX; \
	RORL	$25, DX; \
	ANDL	f, CX; \
	XORL	AX, DX; \
	MOVL	e, AX; \
	NOTL	AX; \
	ADDL	DX, h; \
	ANDL	g, AX; \
	XORL	CX, AX; \
	ADDL	h, AX

// Calculate T2 in BX - uses BX, CX, DX and DI registers.
//   T2 = BIGSIGMA0(a) + Maj(a, b, c)
//   BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x)
//   Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
#define SHA256T2(a, b, c) \
	MOVL	a, DI; \
	MOVL	c, BX; \
	RORL	$2, DI; \
	MOVL	a, DX; \
	ANDL	b, BX; \
	RORL	$13, DX; \
	MOVL	a, CX; \
	ANDL	c, CX; \
	XORL	DX, DI; \
	XORL	CX, BX; \
	MOVL	a, DX; \
	MOVL	b, CX; \
	RORL	$22, DX; \
	ANDL	a, CX; \
	XORL	CX, BX; \
	XORL	DX, DI; \
	ADDL	DI, BX

// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
// The values for e and a are stored in d and h, ready for rotation.
#define SHA256ROUND(index, const, a, b, c, d, e, f, g, h) \
	SHA256T1(const, e, f, g, h); \
	SHA256T2(a, b, c); \
	MOVL	BX, h; \
	ADDL	AX, d; \
	ADDL	AX, h

#define SHA256ROUND0(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE0(index); \
	SHA256ROUND(index, const, a, b, c, d, e, f, g, h)

#define SHA256ROUND1(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE1(index); \
	SHA256ROUND(index, const, a, b, c, d, e, f, g, h)
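
// Reading aid for SHA256ROUND: SHA256T1 leaves T1 in AX and SHA256T2 leaves
// T2 in BX, so the trailing
//
//	MOVL BX, h // h slot <- T2
//	ADDL AX, d // e(next) = d + T1
//	ADDL AX, h // a(next) = T1 + T2
//
// realizes "e = d + T1, a = T1 + T2". The rotation of the remaining working
// variables costs nothing: each SHA256ROUND0/1 call permutes its register
// arguments (round 0 passes R8..R15, round 1 passes R15, R8..R14, and so on),
// as can be seen in the round list inside ·block below.
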
// Definitions for AVX2 version

// addm (mem), reg
// Add reg to mem using reg-mem add and store
#define addm(P1, P2) \
	ADDL	P2, P1; \
	MOVL	P1, P2

#define XDWORD0 Y4
#define XDWORD1 Y5
#define XDWORD2 Y6
#define XDWORD3 Y7

#define XWORD0 X4
#define XWORD1 X5
#define XWORD2 X6
#define XWORD3 X7

#define XTMP0 Y0
#define XTMP1 Y1
#define XTMP2 Y2
#define XTMP3 Y3
#define XTMP4 Y8
#define XTMP5 Y11

#define XFER Y9

#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
#define X_BYTE_FLIP_MASK X13

#define NUM_BYTES DX
#define INP	DI

#define CTX SI // Beginning of digest in memory (a, b, c, ... , h)

#define a AX
#define b BX
#define c CX
#define d R8
#define e DX
#define f R9
#define g R10
#define h R11

#define old_h R11

#define TBL BP

#define SRND SI // SRND is same register as CTX

#define T1 R12

#define y0 R13
#define y1 R14
#define y2 R15
#define y3 DI

// Offsets
#define XFER_SIZE 2*64*4
#define INP_END_SIZE 8
#define INP_SIZE 8
#define TMP_SIZE 4

#define _XFER 0
#define _INP_END _XFER + XFER_SIZE
#define _INP _INP_END + INP_END_SIZE
#define _TMP _INP + INP_SIZE
#define STACK_SIZE _TMP + TMP_SIZE
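
// For reference (the values follow directly from the #defines above):
// addm(mem, reg) behaves like the Go statements
//
//	mem += reg
//	reg = mem
//
// i.e. it folds a working variable into the digest word in memory and
// reloads the register with the new value. The stack offsets work out to
// _XFER = 0 (2*64*4 = 512 bytes of scheduled K+W data for two blocks),
// _INP_END = 512, _INP = 520, _TMP = 528, STACK_SIZE = 532; the TEXT
// directive below reserves a 536-byte frame, which covers this.
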
#define ROUND_AND_SCHED_N_0(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ############################# RND N + 0 ############################//
	MOVL	a, y3; \ // y3 = a // MAJA
	RORXL	$25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL	$11, e, y1; \ // y1 = e >> 11 // S1B
	; \
	ADDL	(disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // disp = k + w
	ORL	c, y3; \ // y3 = a|c // MAJA
	VPALIGNR $4, XDWORD2, XDWORD3, XTMP0; \ // XTMP0 = W[-7]
	MOVL	f, y2; \ // y2 = f // CH
	RORXL	$13, a, T1; \ // T1 = a >> 13 // S0B
	; \
	XORL	y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	XORL	g, y2; \ // y2 = f^g // CH
	VPADDD	XDWORD0, XTMP0, XTMP0; \ // XTMP0 = W[-7] + W[-16]
	RORXL	$6, e, y1; \ // y1 = (e >> 6) // S1
	; \
	ANDL	e, y2; \ // y2 = (f^g)&e // CH
	XORL	y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL	$22, a, y1; \ // y1 = a >> 22 // S0A
	ADDL	h, d; \ // d = k + w + h + d // --
	; \
	ANDL	b, y3; \ // y3 = (a|c)&b // MAJA
	VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15]
	XORL	T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL	$2, a, T1; \ // T1 = (a >> 2) // S0
	; \
	XORL	g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	VPSRLD	$7, XTMP1, XTMP2; \
	XORL	T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL	a, T1; \ // T1 = a // MAJB
	ANDL	c, T1; \ // T1 = a&c // MAJB
	; \
	ADDL	y0, y2; \ // y2 = S1 + CH // --
	VPSLLD	$(32-7), XTMP1, XTMP3; \
	ORL	T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	ADDL	y1, h; \ // h = k + w + h + S0 // --
	; \
	ADDL	y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
	VPOR	XTMP2, XTMP3, XTMP3; \ // XTMP3 = W[-15] ror 7
	; \
	VPSRLD	$18, XTMP1, XTMP2; \
	ADDL	y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	ADDL	y3, h // h = t1 + S0 + MAJ // --

#define ROUND_AND_SCHED_N_1(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ################################### RND N + 1 ############################
	; \
	MOVL	a, y3; \ // y3 = a // MAJA
	RORXL	$25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL	$11, e, y1; \ // y1 = e >> 11 // S1B
	ADDL	(disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL	c, y3; \ // y3 = a|c // MAJA
	; \
	VPSRLD	$3, XTMP1, XTMP4; \ // XTMP4 = W[-15] >> 3
	MOVL	f, y2; \ // y2 = f // CH
	RORXL	$13, a, T1; \ // T1 = a >> 13 // S0B
	XORL	y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	XORL	g, y2; \ // y2 = f^g // CH
	; \
	RORXL	$6, e, y1; \ // y1 = (e >> 6) // S1
	XORL	y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL	$22, a, y1; \ // y1 = a >> 22 // S0A
	ANDL	e, y2; \ // y2 = (f^g)&e // CH
	ADDL	h, d; \ // d = k + w + h + d // --
	; \
	VPSLLD	$(32-18), XTMP1, XTMP1; \
	ANDL	b, y3; \ // y3 = (a|c)&b // MAJA
	XORL	T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	; \
	VPXOR	XTMP1, XTMP3, XTMP3; \
	RORXL	$2, a, T1; \ // T1 = (a >> 2) // S0
	XORL	g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	; \
	VPXOR	XTMP2, XTMP3, XTMP3; \ // XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	XORL	T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL	a, T1; \ // T1 = a // MAJB
	ANDL	c, T1; \ // T1 = a&c // MAJB
	ADDL	y0, y2; \ // y2 = S1 + CH // --
	; \
	VPXOR	XTMP4, XTMP3, XTMP1; \ // XTMP1 = s0
	VPSHUFD	$0xFA, XDWORD3, XTMP2; \ // XTMP2 = W[-2] {BBAA}
	ORL	T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	ADDL	y1, h; \ // h = k + w + h + S0 // --
	; \
	VPADDD	XTMP1, XTMP0, XTMP0; \ // XTMP0 = W[-16] + W[-7] + s0
	ADDL	y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
	ADDL	y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	ADDL	y3, h; \ // h = t1 + S0 + MAJ // --
	; \
	VPSRLD	$10, XTMP2, XTMP4 // XTMP4 = W[-2] >> 10 {BBAA}
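
// Reading aid for the vector arithmetic in RND N+0 / N+1 (a scalar analogue,
// assuming w is the expanding message schedule): AVX2 has no 32-bit rotate,
// so W[-15] ror 7 is built as (W[-15] >> 7) | (W[-15] << 25) via
// VPSRLD/VPSLLD/VPOR, and likewise for ror 18. Per dword, the two rounds
// above compute
//
//	s0 := bits.RotateLeft32(w[i-15], -7) ^ bits.RotateLeft32(w[i-15], -18) ^ (w[i-15] >> 3)
//	xtmp0 := w[i-16] + w[i-7] + s0 // still missing s1, finished in RND N+2 / N+3
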
#define ROUND_AND_SCHED_N_2(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ################################### RND N + 2 ############################
	; \
	MOVL	a, y3; \ // y3 = a // MAJA
	RORXL	$25, e, y0; \ // y0 = e >> 25 // S1A
	ADDL	(disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	; \
	VPSRLQ	$19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xBxA}
	RORXL	$11, e, y1; \ // y1 = e >> 11 // S1B
	ORL	c, y3; \ // y3 = a|c // MAJA
	MOVL	f, y2; \ // y2 = f // CH
	XORL	g, y2; \ // y2 = f^g // CH
	; \
	RORXL	$13, a, T1; \ // T1 = a >> 13 // S0B
	XORL	y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	VPSRLQ	$17, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 {xBxA}
	ANDL	e, y2; \ // y2 = (f^g)&e // CH
	; \
	RORXL	$6, e, y1; \ // y1 = (e >> 6) // S1
	VPXOR	XTMP3, XTMP2, XTMP2; \
	ADDL	h, d; \ // d = k + w + h + d // --
	ANDL	b, y3; \ // y3 = (a|c)&b // MAJA
	; \
	XORL	y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL	$22, a, y1; \ // y1 = a >> 22 // S0A
	VPXOR	XTMP2, XTMP4, XTMP4; \ // XTMP4 = s1 {xBxA}
	XORL	g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	; \
	MOVL	f, _TMP(SP); \
	MOVQ	$shuff_00BA<>(SB), f; \ // f is used to keep SHUF_00BA
	VPSHUFB	(f), XTMP4, XTMP4; \ // XTMP4 = s1 {00BA}
	MOVL	_TMP(SP), f; \ // f is restored
	; \
	XORL	T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL	$2, a, T1; \ // T1 = (a >> 2) // S0
	VPADDD	XTMP4, XTMP0, XTMP0; \ // XTMP0 = {..., ..., W[1], W[0]}
	; \
	XORL	T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL	a, T1; \ // T1 = a // MAJB
	ANDL	c, T1; \ // T1 = a&c // MAJB
	ADDL	y0, y2; \ // y2 = S1 + CH // --
	VPSHUFD	$80, XTMP0, XTMP2; \ // XTMP2 = W[-2] {DDCC}
	; \
	ORL	T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	ADDL	y1, h; \ // h = k + w + h + S0 // --
	ADDL	y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
	ADDL	y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	; \
	ADDL	y3, h // h = t1 + S0 + MAJ // --

#define ROUND_AND_SCHED_N_3(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ################################### RND N + 3 ############################
	; \
	MOVL	a, y3; \ // y3 = a // MAJA
	RORXL	$25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL	$11, e, y1; \ // y1 = e >> 11 // S1B
	ADDL	(disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL	c, y3; \ // y3 = a|c // MAJA
	; \
	VPSRLD	$10, XTMP2, XTMP5; \ // XTMP5 = W[-2] >> 10 {DDCC}
	MOVL	f, y2; \ // y2 = f // CH
	RORXL	$13, a, T1; \ // T1 = a >> 13 // S0B
	XORL	y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	XORL	g, y2; \ // y2 = f^g // CH
	; \
	VPSRLQ	$19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xDxC}
	RORXL	$6, e, y1; \ // y1 = (e >> 6) // S1
	ANDL	e, y2; \ // y2 = (f^g)&e // CH
	ADDL	h, d; \ // d = k + w + h + d // --
	ANDL	b, y3; \ // y3 = (a|c)&b // MAJA
	; \
	VPSRLQ	$17, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 {xDxC}
	XORL	y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	XORL	g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	; \
	VPXOR	XTMP3, XTMP2, XTMP2; \
	RORXL	$22, a, y1; \ // y1 = a >> 22 // S0A
	ADDL	y0, y2; \ // y2 = S1 + CH // --
	; \
	VPXOR	XTMP2, XTMP5, XTMP5; \ // XTMP5 = s1 {xDxC}
	XORL	T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	ADDL	y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
	; \
	RORXL	$2, a, T1; \ // T1 = (a >> 2) // S0
	; \
	MOVL	f, _TMP(SP); \ // Save f
	MOVQ	$shuff_DC00<>(SB), f; \ // SHUF_DC00
	VPSHUFB	(f), XTMP5, XTMP5; \ // XTMP5 = s1 {DC00}
	MOVL	_TMP(SP), f; \ // Restore f
	; \
	VPADDD	XTMP0, XTMP5, XDWORD0; \ // XDWORD0 = {W[3], W[2], W[1], W[0]}
	XORL	T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL	a, T1; \ // T1 = a // MAJB
	ANDL	c, T1; \ // T1 = a&c // MAJB
	ORL	T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	; \
	ADDL	y1, h; \ // h = k + w + h + S0 // --
	ADDL	y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	ADDL	y3, h // h = t1 + S0 + MAJ // --
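
// Reading aid for the s1 part spread over RND N+1..N+3 (a scalar analogue,
// assuming w is the message schedule): W[-2] is expanded to the {BBAA} and
// {DDCC} layouts with VPSHUFD so that VPSRLQ on 64-bit lanes can stand in
// for the missing dword rotates; per dword the net effect is
//
//	s1 := bits.RotateLeft32(w[i-2], -17) ^ bits.RotateLeft32(w[i-2], -19) ^ (w[i-2] >> 10)
//	w[i] = w[i-16] + s0 + w[i-7] + s1 // s0 as in the note after RND N+1 above
//
// The shuff_00BA / shuff_DC00 masks then gather the two halves of the result
// into one register, giving the four new words {W[3], W[2], W[1], W[0]}.
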
#define DO_ROUND_N_0(disp, a, b, c, d, e, f, g, h, old_h) \
	; \ // ################################### RND N + 0 ###########################
	MOVL	f, y2; \ // y2 = f // CH
	RORXL	$25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL	$11, e, y1; \ // y1 = e >> 11 // S1B
	XORL	g, y2; \ // y2 = f^g // CH
	; \
	XORL	y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	RORXL	$6, e, y1; \ // y1 = (e >> 6) // S1
	ANDL	e, y2; \ // y2 = (f^g)&e // CH
	; \
	XORL	y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL	$13, a, T1; \ // T1 = a >> 13 // S0B
	XORL	g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	RORXL	$22, a, y1; \ // y1 = a >> 22 // S0A
	MOVL	a, y3; \ // y3 = a // MAJA
	; \
	XORL	T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL	$2, a, T1; \ // T1 = (a >> 2) // S0
	ADDL	(disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL	c, y3; \ // y3 = a|c // MAJA
	; \
	XORL	T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL	a, T1; \ // T1 = a // MAJB
	ANDL	b, y3; \ // y3 = (a|c)&b // MAJA
	ANDL	c, T1; \ // T1 = a&c // MAJB
	ADDL	y0, y2; \ // y2 = S1 + CH // --
	; \
	ADDL	h, d; \ // d = k + w + h + d // --
	ORL	T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	ADDL	y1, h; \ // h = k + w + h + S0 // --
	ADDL	y2, d // d = k + w + h + d + S1 + CH = d + t1 // --

#define DO_ROUND_N_1(disp, a, b, c, d, e, f, g, h, old_h) \
	; \ // ################################### RND N + 1 ###########################
	ADDL	y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	MOVL	f, y2; \ // y2 = f // CH
	RORXL	$25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL	$11, e, y1; \ // y1 = e >> 11 // S1B
	XORL	g, y2; \ // y2 = f^g // CH
	; \
	XORL	y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	RORXL	$6, e, y1; \ // y1 = (e >> 6) // S1
	ANDL	e, y2; \ // y2 = (f^g)&e // CH
	ADDL	y3, old_h; \ // h = t1 + S0 + MAJ // --
	; \
	XORL	y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL	$13, a, T1; \ // T1 = a >> 13 // S0B
	XORL	g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	RORXL	$22, a, y1; \ // y1 = a >> 22 // S0A
	MOVL	a, y3; \ // y3 = a // MAJA
	; \
	XORL	T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL	$2, a, T1; \ // T1 = (a >> 2) // S0
	ADDL	(disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL	c, y3; \ // y3 = a|c // MAJA
	; \
	XORL	T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL	a, T1; \ // T1 = a // MAJB
	ANDL	b, y3; \ // y3 = (a|c)&b // MAJA
	ANDL	c, T1; \ // T1 = a&c // MAJB
	ADDL	y0, y2; \ // y2 = S1 + CH // --
	; \
	ADDL	h, d; \ // d = k + w + h + d // --
	ORL	T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	ADDL	y1, h; \ // h = k + w + h + S0 // --
	; \
	ADDL	y2, d // d = k + w + h + d + S1 + CH = d + t1 // --

#define DO_ROUND_N_2(disp, a, b, c, d, e, f, g, h, old_h) \
	; \ // ################################### RND N + 2 ##############################
	ADDL	y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	MOVL	f, y2; \ // y2 = f // CH
	RORXL	$25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL	$11, e, y1; \ // y1 = e >> 11 // S1B
	XORL	g, y2; \ // y2 = f^g // CH
	; \
	XORL	y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	RORXL	$6, e, y1; \ // y1 = (e >> 6) // S1
	ANDL	e, y2; \ // y2 = (f^g)&e // CH
	ADDL	y3, old_h; \ // h = t1 + S0 + MAJ // --
	; \
	XORL	y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL	$13, a, T1; \ // T1 = a >> 13 // S0B
	XORL	g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	RORXL	$22, a, y1; \ // y1 = a >> 22 // S0A
	MOVL	a, y3; \ // y3 = a // MAJA
	; \
	XORL	T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL	$2, a, T1; \ // T1 = (a >> 2) // S0
	ADDL	(disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL	c, y3; \ // y3 = a|c // MAJA
	; \
	XORL	T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL	a, T1; \ // T1 = a // MAJB
	ANDL	b, y3; \ // y3 = (a|c)&b // MAJA
	ANDL	c, T1; \ // T1 = a&c // MAJB
	ADDL	y0, y2; \ // y2 = S1 + CH // --
	; \
	ADDL	h, d; \ // d = k + w + h + d // --
	ORL	T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	ADDL	y1, h; \ // h = k + w + h + S0 // --
	; \
	ADDL	y2, d // d = k + w + h + d + S1 + CH = d + t1 // --

#define DO_ROUND_N_3(disp, a, b, c, d, e, f, g, h, old_h) \
	; \ // ################################### RND N + 3 ###########################
	ADDL	y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	MOVL	f, y2; \ // y2 = f // CH
	RORXL	$25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL	$11, e, y1; \ // y1 = e >> 11 // S1B
	XORL	g, y2; \ // y2 = f^g // CH
	; \
	XORL	y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	RORXL	$6, e, y1; \ // y1 = (e >> 6) // S1
	ANDL	e, y2; \ // y2 = (f^g)&e // CH
	ADDL	y3, old_h; \ // h = t1 + S0 + MAJ // --
	; \
	XORL	y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL	$13, a, T1; \ // T1 = a >> 13 // S0B
	XORL	g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	RORXL	$22, a, y1; \ // y1 = a >> 22 // S0A
	MOVL	a, y3; \ // y3 = a // MAJA
	; \
	XORL	T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL	$2, a, T1; \ // T1 = (a >> 2) // S0
	ADDL	(disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL	c, y3; \ // y3 = a|c // MAJA
	; \
	XORL	T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL	a, T1; \ // T1 = a // MAJB
	ANDL	b, y3; \ // y3 = (a|c)&b // MAJA
	ANDL	c, T1; \ // T1 = a&c // MAJB
	ADDL	y0, y2; \ // y2 = S1 + CH // --
	; \
	ADDL	h, d; \ // d = k + w + h + d // --
	ORL	T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	ADDL	y1, h; \ // h = k + w + h + S0 // --
	; \
	ADDL	y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
	; \
	ADDL	y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	; \
	ADDL	y3, h // h = t1 + S0 + MAJ // --
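
// Note on DO_ROUND_N_*: these perform the same round arithmetic as
// ROUND_AND_SCHED_N_* but with no message scheduling, so they serve the last
// 16 rounds and the replay of the second block. To keep the dependency chain
// short, the final additions into h ("t1 + S0" and "+ MAJ") are deferred:
// DO_ROUND_N_1/2/3 finish the previous round's h through the old_h argument
// at their top, and only DO_ROUND_N_3 completes its own h at the bottom.
// That is why the callers below pass, as old_h, the register that held h in
// the preceding round.
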
TEXT ·block(SB), 0, $536-32
	CMPB	·useAVX2(SB), $1
	JE	avx2

	MOVQ	p_base+8(FP), SI
	MOVQ	p_len+16(FP), DX
	SHRQ	$6, DX
	SHLQ	$6, DX

	LEAQ	(SI)(DX*1), DI
	MOVQ	DI, 256(SP)
	CMPQ	SI, DI
	JEQ	end

	MOVQ	dig+0(FP), BP
	MOVL	(0*4)(BP), R8  // a = H0
	MOVL	(1*4)(BP), R9  // b = H1
	MOVL	(2*4)(BP), R10 // c = H2
	MOVL	(3*4)(BP), R11 // d = H3
	MOVL	(4*4)(BP), R12 // e = H4
	MOVL	(5*4)(BP), R13 // f = H5
	MOVL	(6*4)(BP), R14 // g = H6
	MOVL	(7*4)(BP), R15 // h = H7

loop:
	MOVQ	SP, BP

	SHA256ROUND0(0, 0x428a2f98, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND0(1, 0x71374491, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND0(2, 0xb5c0fbcf, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND0(3, 0xe9b5dba5, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND0(4, 0x3956c25b, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND0(5, 0x59f111f1, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND0(6, 0x923f82a4, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND0(7, 0xab1c5ed5, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND0(8, 0xd807aa98, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND0(9, 0x12835b01, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND0(10, 0x243185be, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND0(11, 0x550c7dc3, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND0(12, 0x72be5d74, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND0(13, 0x80deb1fe, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND0(14, 0x9bdc06a7, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND0(15, 0xc19bf174, R9, R10, R11, R12, R13, R14, R15, R8)

	SHA256ROUND1(16, 0xe49b69c1, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(17, 0xefbe4786, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(18, 0x0fc19dc6, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(19, 0x240ca1cc, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(20, 0x2de92c6f, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(21, 0x4a7484aa, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(22, 0x5cb0a9dc, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(23, 0x76f988da, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(24, 0x983e5152, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(25, 0xa831c66d, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(26, 0xb00327c8, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(27, 0xbf597fc7, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(28, 0xc6e00bf3, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(29, 0xd5a79147, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(30, 0x06ca6351, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(31, 0x14292967, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(32, 0x27b70a85, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(33, 0x2e1b2138, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(34, 0x4d2c6dfc, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(35, 0x53380d13, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(36, 0x650a7354, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(37, 0x766a0abb, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(38, 0x81c2c92e, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(39, 0x92722c85, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(40, 0xa2bfe8a1, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(41, 0xa81a664b, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(42, 0xc24b8b70, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(43, 0xc76c51a3, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(44, 0xd192e819, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(45, 0xd6990624, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(46, 0xf40e3585, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(47, 0x106aa070, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(48, 0x19a4c116, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(49, 0x1e376c08, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(50, 0x2748774c, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(51, 0x34b0bcb5, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(52, 0x391c0cb3, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(53, 0x4ed8aa4a, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(54, 0x5b9cca4f, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(55, 0x682e6ff3, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(56, 0x748f82ee, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(57, 0x78a5636f, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(58, 0x84c87814, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(59, 0x8cc70208, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(60, 0x90befffa, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(61, 0xa4506ceb, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(62, 0xbef9a3f7, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(63, 0xc67178f2, R9, R10, R11, R12, R13, R14, R15, R8)

	MOVQ	dig+0(FP), BP
	ADDL	(0*4)(BP), R8  // H0 = a + H0
	MOVL	R8, (0*4)(BP)
	ADDL	(1*4)(BP), R9  // H1 = b + H1
	MOVL	R9, (1*4)(BP)
	ADDL	(2*4)(BP), R10 // H2 = c + H2
	MOVL	R10, (2*4)(BP)
	ADDL	(3*4)(BP), R11 // H3 = d + H3
	MOVL	R11, (3*4)(BP)
	ADDL	(4*4)(BP), R12 // H4 = e + H4
	MOVL	R12, (4*4)(BP)
	ADDL	(5*4)(BP), R13 // H5 = f + H5
	MOVL	R13, (5*4)(BP)
	ADDL	(6*4)(BP), R14 // H6 = g + H6
	MOVL	R14, (6*4)(BP)
	ADDL	(7*4)(BP), R15 // H7 = h + H7
	MOVL	R15, (7*4)(BP)

	ADDQ	$64, SI
	CMPQ	SI, 256(SP)
	JB	loop

end:
	RET
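
// Frame usage of the scalar path above (derived from the code): the 64
// scheduled words W[0..63] live at 0(SP) (256 bytes; BP is pointed at SP
// inside the loop), and the end-of-input pointer computed from p_len is kept
// at 256(SP). The same $536 frame is shared with the AVX2 path below, which
// instead lays it out according to the _XFER/_INP_END/_INP/_TMP offsets.
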
avx2:
	MOVQ	dig+0(FP), CTX // d.h[8]
	MOVQ	p_base+8(FP), INP
	MOVQ	p_len+16(FP), NUM_BYTES

	LEAQ	-64(INP)(NUM_BYTES*1), NUM_BYTES // Pointer to the last block
	MOVQ	NUM_BYTES, _INP_END(SP)

	CMPQ	NUM_BYTES, INP
	JE	avx2_only_one_block

	// Load initial digest
	MOVL	0(CTX), a  // a = H0
	MOVL	4(CTX), b  // b = H1
	MOVL	8(CTX), c  // c = H2
	MOVL	12(CTX), d // d = H3
	MOVL	16(CTX), e // e = H4
	MOVL	20(CTX), f // f = H5
	MOVL	24(CTX), g // g = H6
	MOVL	28(CTX), h // h = H7

avx2_loop0: // each iteration works on one block (512 bits)

	VMOVDQU	(0*32)(INP), XTMP0
	VMOVDQU	(1*32)(INP), XTMP1
	VMOVDQU	(2*32)(INP), XTMP2
	VMOVDQU	(3*32)(INP), XTMP3

	MOVQ	$flip_mask<>(SB), BP // BYTE_FLIP_MASK
	VMOVDQU	(BP), BYTE_FLIP_MASK

	// Apply Byte Flip Mask: LE -> BE
	VPSHUFB	BYTE_FLIP_MASK, XTMP0, XTMP0
	VPSHUFB	BYTE_FLIP_MASK, XTMP1, XTMP1
	VPSHUFB	BYTE_FLIP_MASK, XTMP2, XTMP2
	VPSHUFB	BYTE_FLIP_MASK, XTMP3, XTMP3

	// Transpose data into high/low parts
	VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w3, w2, w1, w0
	VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w7, w6, w5, w4
	VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w11, w10, w9, w8
	VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w15, w14, w13, w12

	MOVQ	$K256<>(SB), TBL // Load address of table with round-specific constants

avx2_last_block_enter:
	ADDQ	$64, INP
	MOVQ	INP, _INP(SP)
	XORQ	SRND, SRND
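
// Layout note for the loads above (derived from the VPERM2I128 selectors,
// assuming a full 128 bytes is readable at INP, which the loop conditions
// guarantee here): the four 32-byte loads cover two consecutive blocks, and
// the $0x20/$0x31 permutes put the current block's words in the low 128-bit
// lane of each XDWORD register and the next block's corresponding words in
// the high lane. The w0..w15 comments refer to the current block. Both
// blocks are therefore scheduled together, and the second one is finished
// later in avx2_loop3 from the saved K+W data. A scalar picture of the byte
// flip itself:
//
//	w[i] = binary.BigEndian.Uint32(block[4*i:]) // what VPSHUFB + flip_mask does per dword
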
avx2_loop1: // for w0 - w47
	// Do 4 rounds and scheduling
	VPADDD	0*32(TBL)(SRND*1), XDWORD0, XFER
	VMOVDQU	XFER, (_XFER + 0*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

	// Do 4 rounds and scheduling
	VPADDD	1*32(TBL)(SRND*1), XDWORD1, XFER
	VMOVDQU	XFER, (_XFER + 1*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)

	// Do 4 rounds and scheduling
	VPADDD	2*32(TBL)(SRND*1), XDWORD2, XFER
	VMOVDQU	XFER, (_XFER + 2*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_1(_XFER + 2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_2(_XFER + 2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_3(_XFER + 2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)

	// Do 4 rounds and scheduling
	VPADDD	3*32(TBL)(SRND*1), XDWORD3, XFER
	VMOVDQU	XFER, (_XFER + 3*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_1(_XFER + 3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_2(_XFER + 3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_3(_XFER + 3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)

	ADDQ	$4*32, SRND
	CMPQ	SRND, $3*4*32
	JB	avx2_loop1

avx2_loop2:
	// w48 - w63 processed without scheduling (last 16 rounds)
	VPADDD	0*32(TBL)(SRND*1), XDWORD0, XFER
	VMOVDQU	XFER, (_XFER + 0*32)(SP)(SRND*1)
	DO_ROUND_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, h)
	DO_ROUND_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, h)
	DO_ROUND_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, g)
	DO_ROUND_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, f)

	VPADDD	1*32(TBL)(SRND*1), XDWORD1, XFER
	VMOVDQU	XFER, (_XFER + 1*32)(SP)(SRND*1)
	DO_ROUND_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, e)
	DO_ROUND_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, d)
	DO_ROUND_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, c)
	DO_ROUND_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, b)

	ADDQ	$2*32, SRND

	VMOVDQU	XDWORD2, XDWORD0
	VMOVDQU	XDWORD3, XDWORD1

	CMPQ	SRND, $4*4*32
	JB	avx2_loop2
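
// Indexing note for the XFER area (derived from the offsets above): each
// VPADDD adds the round constants for both blocks to the freshly scheduled
// words, and the 32-byte result is parked at _XFER + SRND on the stack. The
// scalar halves of the round macros then pull their "k + w" addend out of
// that row four bytes at a time via (disp + n*4)(SP)(SRND*1). The low 16
// bytes of every row belong to the current block; the high 16 bytes are the
// second block's copy, which avx2_loop3 below replays using the same
// displacements plus 16.
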
	MOVQ	dig+0(FP), CTX // d.h[8]
	MOVQ	_INP(SP), INP

	addm(  0(CTX), a)
	addm(  4(CTX), b)
	addm(  8(CTX), c)
	addm( 12(CTX), d)
	addm( 16(CTX), e)
	addm( 20(CTX), f)
	addm( 24(CTX), g)
	addm( 28(CTX), h)

	CMPQ	_INP_END(SP), INP
	JB	done_hash

	XORQ	SRND, SRND

avx2_loop3: // Do second block using previously scheduled results
	DO_ROUND_N_0(_XFER + 0*32 + 16, a, b, c, d, e, f, g, h, a)
	DO_ROUND_N_1(_XFER + 0*32 + 16, h, a, b, c, d, e, f, g, h)
	DO_ROUND_N_2(_XFER + 0*32 + 16, g, h, a, b, c, d, e, f, g)
	DO_ROUND_N_3(_XFER + 0*32 + 16, f, g, h, a, b, c, d, e, f)

	DO_ROUND_N_0(_XFER + 1*32 + 16, e, f, g, h, a, b, c, d, e)
	DO_ROUND_N_1(_XFER + 1*32 + 16, d, e, f, g, h, a, b, c, d)
	DO_ROUND_N_2(_XFER + 1*32 + 16, c, d, e, f, g, h, a, b, c)
	DO_ROUND_N_3(_XFER + 1*32 + 16, b, c, d, e, f, g, h, a, b)

	ADDQ	$2*32, SRND
	CMPQ	SRND, $4*4*32
	JB	avx2_loop3

	MOVQ	dig+0(FP), CTX // d.h[8]
	MOVQ	_INP(SP), INP
	ADDQ	$64, INP

	addm(  0(CTX), a)
	addm(  4(CTX), b)
	addm(  8(CTX), c)
	addm( 12(CTX), d)
	addm( 16(CTX), e)
	addm( 20(CTX), f)
	addm( 24(CTX), g)
	addm( 28(CTX), h)

	CMPQ	_INP_END(SP), INP
	JA	avx2_loop0
	JB	done_hash

avx2_do_last_block:

	VMOVDQU	0(INP), XWORD0
	VMOVDQU	16(INP), XWORD1
	VMOVDQU	32(INP), XWORD2
	VMOVDQU	48(INP), XWORD3

	MOVQ	$flip_mask<>(SB), BP
	VMOVDQU	(BP), X_BYTE_FLIP_MASK

	VPSHUFB	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	VPSHUFB	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	VPSHUFB	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	VPSHUFB	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	MOVQ	$K256<>(SB), TBL

	JMP	avx2_last_block_enter

avx2_only_one_block:
	// Load initial digest
	MOVL	0(CTX), a  // a = H0
	MOVL	4(CTX), b  // b = H1
	MOVL	8(CTX), c  // c = H2
	MOVL	12(CTX), d // d = H3
	MOVL	16(CTX), e // e = H4
	MOVL	20(CTX), f // f = H5
	MOVL	24(CTX), g // g = H6
	MOVL	28(CTX), h // h = H7

	JMP	avx2_do_last_block

done_hash:
	VZEROUPPER
	RET

// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
DATA flip_mask<>+0x10(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x18(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), 8, $32

// shuffle xBxA -> 00BA
DATA shuff_00BA<>+0x00(SB)/8, $0x0b0a090803020100
DATA shuff_00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_00BA<>+0x10(SB)/8, $0x0b0a090803020100
DATA shuff_00BA<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
GLOBL shuff_00BA<>(SB), 8, $32

// shuffle xDxC -> DC00
DATA shuff_DC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_DC00<>+0x08(SB)/8, $0x0b0a090803020100
DATA shuff_DC00<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_DC00<>+0x18(SB)/8, $0x0b0a090803020100
GLOBL shuff_DC00<>(SB), 8, $32
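
// Note on the masks above and the K256 table that follows (a reading aid
// implied by the byte patterns): a PSHUFB index byte with its high bit set
// zeroes that destination byte, so the 0xFF halves of shuff_00BA/shuff_DC00
// blank out two dwords while 0x0b0a090803020100 moves the two valid results
// into the other half. K256 stores every group of four round constants
// twice, once per 128-bit lane, so that a single 32-byte VPADDD row supplies
// identical constants to the current block (low lane) and the next block
// (high lane).
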
// Round specific constants
DATA K256<>+0x00(SB)/4, $0x428a2f98 // k1
DATA K256<>+0x04(SB)/4, $0x71374491 // k2
DATA K256<>+0x08(SB)/4, $0xb5c0fbcf // k3
DATA K256<>+0x0c(SB)/4, $0xe9b5dba5 // k4
DATA K256<>+0x10(SB)/4, $0x428a2f98 // k1
DATA K256<>+0x14(SB)/4, $0x71374491 // k2
DATA K256<>+0x18(SB)/4, $0xb5c0fbcf // k3
DATA K256<>+0x1c(SB)/4, $0xe9b5dba5 // k4

DATA K256<>+0x20(SB)/4, $0x3956c25b // k5 - k8
DATA K256<>+0x24(SB)/4, $0x59f111f1
DATA K256<>+0x28(SB)/4, $0x923f82a4
DATA K256<>+0x2c(SB)/4, $0xab1c5ed5
DATA K256<>+0x30(SB)/4, $0x3956c25b
DATA K256<>+0x34(SB)/4, $0x59f111f1
DATA K256<>+0x38(SB)/4, $0x923f82a4
DATA K256<>+0x3c(SB)/4, $0xab1c5ed5

DATA K256<>+0x40(SB)/4, $0xd807aa98 // k9 - k12
DATA K256<>+0x44(SB)/4, $0x12835b01
DATA K256<>+0x48(SB)/4, $0x243185be
DATA K256<>+0x4c(SB)/4, $0x550c7dc3
DATA K256<>+0x50(SB)/4, $0xd807aa98
DATA K256<>+0x54(SB)/4, $0x12835b01
DATA K256<>+0x58(SB)/4, $0x243185be
DATA K256<>+0x5c(SB)/4, $0x550c7dc3

DATA K256<>+0x60(SB)/4, $0x72be5d74 // k13 - k16
DATA K256<>+0x64(SB)/4, $0x80deb1fe
DATA K256<>+0x68(SB)/4, $0x9bdc06a7
DATA K256<>+0x6c(SB)/4, $0xc19bf174
DATA K256<>+0x70(SB)/4, $0x72be5d74
DATA K256<>+0x74(SB)/4, $0x80deb1fe
DATA K256<>+0x78(SB)/4, $0x9bdc06a7
DATA K256<>+0x7c(SB)/4, $0xc19bf174

DATA K256<>+0x80(SB)/4, $0xe49b69c1 // k17 - k20
DATA K256<>+0x84(SB)/4, $0xefbe4786
DATA K256<>+0x88(SB)/4, $0x0fc19dc6
DATA K256<>+0x8c(SB)/4, $0x240ca1cc
DATA K256<>+0x90(SB)/4, $0xe49b69c1
DATA K256<>+0x94(SB)/4, $0xefbe4786
DATA K256<>+0x98(SB)/4, $0x0fc19dc6
DATA K256<>+0x9c(SB)/4, $0x240ca1cc

DATA K256<>+0xa0(SB)/4, $0x2de92c6f // k21 - k24
DATA K256<>+0xa4(SB)/4, $0x4a7484aa
DATA K256<>+0xa8(SB)/4, $0x5cb0a9dc
DATA K256<>+0xac(SB)/4, $0x76f988da
DATA K256<>+0xb0(SB)/4, $0x2de92c6f
DATA K256<>+0xb4(SB)/4, $0x4a7484aa
DATA K256<>+0xb8(SB)/4, $0x5cb0a9dc
DATA K256<>+0xbc(SB)/4, $0x76f988da

DATA K256<>+0xc0(SB)/4, $0x983e5152 // k25 - k28
DATA K256<>+0xc4(SB)/4, $0xa831c66d
DATA K256<>+0xc8(SB)/4, $0xb00327c8
DATA K256<>+0xcc(SB)/4, $0xbf597fc7
DATA K256<>+0xd0(SB)/4, $0x983e5152
DATA K256<>+0xd4(SB)/4, $0xa831c66d
DATA K256<>+0xd8(SB)/4, $0xb00327c8
DATA K256<>+0xdc(SB)/4, $0xbf597fc7

DATA K256<>+0xe0(SB)/4, $0xc6e00bf3 // k29 - k32
DATA K256<>+0xe4(SB)/4, $0xd5a79147
DATA K256<>+0xe8(SB)/4, $0x06ca6351
DATA K256<>+0xec(SB)/4, $0x14292967
DATA K256<>+0xf0(SB)/4, $0xc6e00bf3
DATA K256<>+0xf4(SB)/4, $0xd5a79147
DATA K256<>+0xf8(SB)/4, $0x06ca6351
DATA K256<>+0xfc(SB)/4, $0x14292967

DATA K256<>+0x100(SB)/4, $0x27b70a85
DATA K256<>+0x104(SB)/4, $0x2e1b2138
DATA K256<>+0x108(SB)/4, $0x4d2c6dfc
DATA K256<>+0x10c(SB)/4, $0x53380d13
DATA K256<>+0x110(SB)/4, $0x27b70a85
DATA K256<>+0x114(SB)/4, $0x2e1b2138
DATA K256<>+0x118(SB)/4, $0x4d2c6dfc
DATA K256<>+0x11c(SB)/4, $0x53380d13

DATA K256<>+0x120(SB)/4, $0x650a7354
DATA K256<>+0x124(SB)/4, $0x766a0abb
DATA K256<>+0x128(SB)/4, $0x81c2c92e
DATA K256<>+0x12c(SB)/4, $0x92722c85
DATA K256<>+0x130(SB)/4, $0x650a7354
DATA K256<>+0x134(SB)/4, $0x766a0abb
DATA K256<>+0x138(SB)/4, $0x81c2c92e
DATA K256<>+0x13c(SB)/4, $0x92722c85

DATA K256<>+0x140(SB)/4, $0xa2bfe8a1
DATA K256<>+0x144(SB)/4, $0xa81a664b
DATA K256<>+0x148(SB)/4, $0xc24b8b70
DATA K256<>+0x14c(SB)/4, $0xc76c51a3
DATA K256<>+0x150(SB)/4, $0xa2bfe8a1
DATA K256<>+0x154(SB)/4, $0xa81a664b
DATA K256<>+0x158(SB)/4, $0xc24b8b70
DATA K256<>+0x15c(SB)/4, $0xc76c51a3

DATA K256<>+0x160(SB)/4, $0xd192e819
DATA K256<>+0x164(SB)/4, $0xd6990624
DATA K256<>+0x168(SB)/4, $0xf40e3585
DATA K256<>+0x16c(SB)/4, $0x106aa070
DATA K256<>+0x170(SB)/4, $0xd192e819
DATA K256<>+0x174(SB)/4, $0xd6990624
DATA K256<>+0x178(SB)/4, $0xf40e3585
DATA K256<>+0x17c(SB)/4, $0x106aa070

DATA K256<>+0x180(SB)/4, $0x19a4c116
DATA K256<>+0x184(SB)/4, $0x1e376c08
DATA K256<>+0x188(SB)/4, $0x2748774c
DATA K256<>+0x18c(SB)/4, $0x34b0bcb5
DATA K256<>+0x190(SB)/4, $0x19a4c116
DATA K256<>+0x194(SB)/4, $0x1e376c08
DATA K256<>+0x198(SB)/4, $0x2748774c
DATA K256<>+0x19c(SB)/4, $0x34b0bcb5

DATA K256<>+0x1a0(SB)/4, $0x391c0cb3
DATA K256<>+0x1a4(SB)/4, $0x4ed8aa4a
DATA K256<>+0x1a8(SB)/4, $0x5b9cca4f
DATA K256<>+0x1ac(SB)/4, $0x682e6ff3
DATA K256<>+0x1b0(SB)/4, $0x391c0cb3
DATA K256<>+0x1b4(SB)/4, $0x4ed8aa4a
DATA K256<>+0x1b8(SB)/4, $0x5b9cca4f
DATA K256<>+0x1bc(SB)/4, $0x682e6ff3

DATA K256<>+0x1c0(SB)/4, $0x748f82ee
DATA K256<>+0x1c4(SB)/4, $0x78a5636f
DATA K256<>+0x1c8(SB)/4, $0x84c87814
DATA K256<>+0x1cc(SB)/4, $0x8cc70208
DATA K256<>+0x1d0(SB)/4, $0x748f82ee
DATA K256<>+0x1d4(SB)/4, $0x78a5636f
DATA K256<>+0x1d8(SB)/4, $0x84c87814
DATA K256<>+0x1dc(SB)/4, $0x8cc70208

DATA K256<>+0x1e0(SB)/4, $0x90befffa
DATA K256<>+0x1e4(SB)/4, $0xa4506ceb
DATA K256<>+0x1e8(SB)/4, $0xbef9a3f7
DATA K256<>+0x1ec(SB)/4, $0xc67178f2
DATA K256<>+0x1f0(SB)/4, $0x90befffa
DATA K256<>+0x1f4(SB)/4, $0xa4506ceb
DATA K256<>+0x1f8(SB)/4, $0xbef9a3f7
DATA K256<>+0x1fc(SB)/4, $0xc67178f2

GLOBL K256<>(SB), (NOPTR + RODATA), $512