github.com/insolar/x-crypto@v0.0.0-20191031140942-75fab8a325f6/sha256/sha256block_amd64.s

// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// SHA256 block routine. See sha256block.go for Go equivalent.
//
// The algorithm is detailed in FIPS 180-4:
//
//  https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf

// The avx2-version is described in an Intel White-Paper:
// "Fast SHA-256 Implementations on Intel Architecture Processors"
// To find it, surf to http://www.intel.com/p/en_US/embedded
// and search for that title.
// AVX2 version by Intel, same algorithm as code in Linux kernel:
// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S
// by
//     James Guilford <james.guilford@intel.com>
//     Kirk Yap <kirk.s.yap@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>

// Wt = Mt; for 0 <= t <= 15
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
//
// a = H0
// b = H1
// c = H2
// d = H3
// e = H4
// f = H5
// g = H6
// h = H7
//
// for t = 0 to 63 {
//    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
//    T2 = BIGSIGMA0(a) + Maj(a,b,c)
//    h = g
//    g = f
//    f = e
//    e = d + T1
//    d = c
//    c = b
//    b = a
//    a = T1 + T2
// }
//
// H0 = a + H0
// H1 = b + H1
// H2 = c + H2
// H3 = d + H3
// H4 = e + H4
// H5 = f + H5
// H6 = g + H6
// H7 = h + H7
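
// For reference, the macros and round calls below implement the following
// Go logic (a rough sketch that mirrors sha256block.go; the identifiers are
// illustrative, not the package's actual ones). It assumes "encoding/binary"
// and "math/bits":
//
//	func blockGeneric(h *[8]uint32, p []byte, k *[64]uint32) {
//		var w [64]uint32
//		for len(p) >= 64 {
//			for t := 0; t < 16; t++ {
//				w[t] = binary.BigEndian.Uint32(p[4*t:]) // Wt = Mt
//			}
//			for t := 16; t < 64; t++ {
//				s0 := bits.RotateLeft32(w[t-15], -7) ^ bits.RotateLeft32(w[t-15], -18) ^ (w[t-15] >> 3)
//				s1 := bits.RotateLeft32(w[t-2], -17) ^ bits.RotateLeft32(w[t-2], -19) ^ (w[t-2] >> 10)
//				w[t] = s1 + w[t-7] + s0 + w[t-16]
//			}
//			a, b, c, d, e, f, g, hh := h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7]
//			for t := 0; t < 64; t++ {
//				S1 := bits.RotateLeft32(e, -6) ^ bits.RotateLeft32(e, -11) ^ bits.RotateLeft32(e, -25)
//				ch := (e & f) ^ (^e & g)
//				t1 := hh + S1 + ch + k[t] + w[t]
//				S0 := bits.RotateLeft32(a, -2) ^ bits.RotateLeft32(a, -13) ^ bits.RotateLeft32(a, -22)
//				maj := (a & b) ^ (a & c) ^ (b & c)
//				t2 := S0 + maj
//				hh, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2
//			}
//			h[0] += a; h[1] += b; h[2] += c; h[3] += d
//			h[4] += e; h[5] += f; h[6] += g; h[7] += hh
//			p = p[64:]
//		}
//	}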

// Wt = Mt; for 0 <= t <= 15
#define MSGSCHEDULE0(index) \
    MOVL (index*4)(SI), AX; \
    BSWAPL AX; \
    MOVL AX, (index*4)(BP)

// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
// SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x)
// SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x)
#define MSGSCHEDULE1(index) \
    MOVL ((index-2)*4)(BP), AX; \
    MOVL AX, CX; \
    RORL $17, AX; \
    MOVL CX, DX; \
    RORL $19, CX; \
    SHRL $10, DX; \
    MOVL ((index-15)*4)(BP), BX; \
    XORL CX, AX; \
    MOVL BX, CX; \
    XORL DX, AX; \
    RORL $7, BX; \
    MOVL CX, DX; \
    SHRL $3, DX; \
    RORL $18, CX; \
    ADDL ((index-7)*4)(BP), AX; \
    XORL CX, BX; \
    XORL DX, BX; \
    ADDL ((index-16)*4)(BP), BX; \
    ADDL BX, AX; \
    MOVL AX, ((index)*4)(BP)

// Calculate T1 in AX - uses AX, CX and DX registers.
// h is also used as an accumulator. Wt is passed in AX.
// T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
// BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x)
// Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
#define SHA256T1(const, e, f, g, h) \
    ADDL AX, h; \
    MOVL e, AX; \
    ADDL $const, h; \
    MOVL e, CX; \
    RORL $6, AX; \
    MOVL e, DX; \
    RORL $11, CX; \
    XORL CX, AX; \
    MOVL e, CX; \
    RORL $25, DX; \
    ANDL f, CX; \
    XORL AX, DX; \
    MOVL e, AX; \
    NOTL AX; \
    ADDL DX, h; \
    ANDL g, AX; \
    XORL CX, AX; \
    ADDL h, AX

// Calculate T2 in BX - uses BX, CX, DX and DI registers.
// T2 = BIGSIGMA0(a) + Maj(a, b, c)
// BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x)
// Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
#define SHA256T2(a, b, c) \
    MOVL a, DI; \
    MOVL c, BX; \
    RORL $2, DI; \
    MOVL a, DX; \
    ANDL b, BX; \
    RORL $13, DX; \
    MOVL a, CX; \
    ANDL c, CX; \
    XORL DX, DI; \
    XORL CX, BX; \
    MOVL a, DX; \
    MOVL b, CX; \
    RORL $22, DX; \
    ANDL a, CX; \
    XORL CX, BX; \
    XORL DX, DI; \
    ADDL DI, BX

// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
// The values for e and a are stored in d and h, ready for rotation.
#define SHA256ROUND(index, const, a, b, c, d, e, f, g, h) \
    SHA256T1(const, e, f, g, h); \
    SHA256T2(a, b, c); \
    MOVL BX, h; \
    ADDL AX, d; \
    ADDL AX, h

#define SHA256ROUND0(index, const, a, b, c, d, e, f, g, h) \
    MSGSCHEDULE0(index); \
    SHA256ROUND(index, const, a, b, c, d, e, f, g, h)

#define SHA256ROUND1(index, const, a, b, c, d, e, f, g, h) \
    MSGSCHEDULE1(index); \
    SHA256ROUND(index, const, a, b, c, d, e, f, g, h)


// Definitions for AVX2 version

// addm (mem), reg
// Add reg to mem using reg-mem add and store
#define addm(P1, P2) \
    ADDL P2, P1; \
    MOVL P1, P2

#define XDWORD0 Y4
#define XDWORD1 Y5
#define XDWORD2 Y6
#define XDWORD3 Y7

#define XWORD0 X4
#define XWORD1 X5
#define XWORD2 X6
#define XWORD3 X7

#define XTMP0 Y0
#define XTMP1 Y1
#define XTMP2 Y2
#define XTMP3 Y3
#define XTMP4 Y8
#define XTMP5 Y11

#define XFER Y9

#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
#define X_BYTE_FLIP_MASK X13

#define NUM_BYTES DX
#define INP DI

#define CTX SI // Beginning of digest in memory (a, b, c, ..., h)

#define a AX
#define b BX
#define c CX
#define d R8
#define e DX
#define f R9
#define g R10
#define h R11

#define old_h R11

#define TBL BP

#define SRND SI // SRND is same register as CTX

#define T1 R12

#define y0 R13
#define y1 R14
#define y2 R15
#define y3 DI

// Offsets
#define XFER_SIZE 2*64*4
#define INP_END_SIZE 8
#define INP_SIZE 8

#define _XFER 0
#define _INP_END _XFER + XFER_SIZE
#define _INP _INP_END + INP_END_SIZE
#define STACK_SIZE _INP + INP_SIZE
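
// With the sizes above, the offsets used by the AVX2 code resolve to:
//
//	_XFER      = 0    (2*64*4 = 512 bytes of saved W+K values, i.e. two
//	                   64-round blocks at 4 bytes per word)
//	_INP_END   = 512  (pointer to the last 64-byte block of the input)
//	_INP       = 520  (pointer to the block currently being processed)
//	STACK_SIZE = 528
//
// The frame declared in the TEXT directive below ($536) is slightly larger
// than STACK_SIZE, leaving a few spare bytes at the top.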

#define ROUND_AND_SCHED_N_0(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
    ; \ // ############################# RND N + 0 ############################//
    MOVL a, y3; \ // y3 = a // MAJA
    RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
    RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
    ; \
    ADDL (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // disp = k + w
    ORL c, y3; \ // y3 = a|c // MAJA
    VPALIGNR $4, XDWORD2, XDWORD3, XTMP0; \ // XTMP0 = W[-7]
    MOVL f, y2; \ // y2 = f // CH
    RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
    ; \
    XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
    XORL g, y2; \ // y2 = f^g // CH
    VPADDD XDWORD0, XTMP0, XTMP0; \ // XTMP0 = W[-7] + W[-16]
    RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
    ; \
    ANDL e, y2; \ // y2 = (f^g)&e // CH
    XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
    RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
    ADDL h, d; \ // d = k + w + h + d // --
    ; \
    ANDL b, y3; \ // y3 = (a|c)&b // MAJA
    VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15]
    XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
    RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
    ; \
    XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
    VPSRLD $7, XTMP1, XTMP2; \
    XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
    MOVL a, T1; \ // T1 = a // MAJB
    ANDL c, T1; \ // T1 = a&c // MAJB
    ; \
    ADDL y0, y2; \ // y2 = S1 + CH // --
    VPSLLD $(32-7), XTMP1, XTMP3; \
    ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
    ADDL y1, h; \ // h = k + w + h + S0 // --
    ; \
    ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
    VPOR XTMP2, XTMP3, XTMP3; \ // XTMP3 = W[-15] ror 7
    ; \
    VPSRLD $18, XTMP1, XTMP2; \
    ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
    ADDL y3, h // h = t1 + S0 + MAJ // --

#define ROUND_AND_SCHED_N_1(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
    ; \ // ################################### RND N + 1 ############################
    ; \
    MOVL a, y3; \ // y3 = a // MAJA
    RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
    RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
    ADDL (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
    ORL c, y3; \ // y3 = a|c // MAJA
    ; \
    VPSRLD $3, XTMP1, XTMP4; \ // XTMP4 = W[-15] >> 3
    MOVL f, y2; \ // y2 = f // CH
    RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
    XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
    XORL g, y2; \ // y2 = f^g // CH
    ; \
    RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
    XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
    RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
    ANDL e, y2; \ // y2 = (f^g)&e // CH
    ADDL h, d; \ // d = k + w + h + d // --
    ; \
    VPSLLD $(32-18), XTMP1, XTMP1; \
    ANDL b, y3; \ // y3 = (a|c)&b // MAJA
    XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
    ; \
    VPXOR XTMP1, XTMP3, XTMP3; \
    RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
    XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
    ; \
    VPXOR XTMP2, XTMP3, XTMP3; \ // XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
    XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
    MOVL a, T1; \ // T1 = a // MAJB
    ANDL c, T1; \ // T1 = a&c // MAJB
    ADDL y0, y2; \ // y2 = S1 + CH // --
    ; \
    VPXOR XTMP4, XTMP3, XTMP1; \ // XTMP1 = s0
    VPSHUFD $0xFA, XDWORD3, XTMP2; \ // XTMP2 = W[-2] {BBAA}
    ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
    ADDL y1, h; \ // h = k + w + h + S0 // --
    ; \
    VPADDD XTMP1, XTMP0, XTMP0; \ // XTMP0 = W[-16] + W[-7] + s0
    ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
    ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
    ADDL y3, h; \ // h = t1 + S0 + MAJ // --
    ; \
    VPSRLD $10, XTMP2, XTMP4 // XTMP4 = W[-2] >> 10 {BBAA}

#define ROUND_AND_SCHED_N_2(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
    ; \ // ################################### RND N + 2 ############################
    ; \
    MOVL a, y3; \ // y3 = a // MAJA
    RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
    ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
    ; \
    VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xBxA}
    RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
    ORL c, y3; \ // y3 = a|c // MAJA
    MOVL f, y2; \ // y2 = f // CH
    XORL g, y2; \ // y2 = f^g // CH
    ; \
    RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
    XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
    VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 {xBxA}
    ANDL e, y2; \ // y2 = (f^g)&e // CH
    ; \
    RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
    VPXOR XTMP3, XTMP2, XTMP2; \
    ADDL h, d; \ // d = k + w + h + d // --
    ANDL b, y3; \ // y3 = (a|c)&b // MAJA
    ; \
    XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
    RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
    VPXOR XTMP2, XTMP4, XTMP4; \ // XTMP4 = s1 {xBxA}
    XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
    ; \
    VPSHUFB shuff_00BA<>(SB), XTMP4, XTMP4; \ // XTMP4 = s1 {00BA}
    ; \
    XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
    RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
    VPADDD XTMP4, XTMP0, XTMP0; \ // XTMP0 = {..., ..., W[1], W[0]}
    ; \
    XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
    MOVL a, T1; \ // T1 = a // MAJB
    ANDL c, T1; \ // T1 = a&c // MAJB
    ADDL y0, y2; \ // y2 = S1 + CH // --
    VPSHUFD $80, XTMP0, XTMP2; \ // XTMP2 = W[-2] {DDCC}
    ; \
    ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
    ADDL y1, h; \ // h = k + w + h + S0 // --
    ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
    ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
    ; \
    ADDL y3, h // h = t1 + S0 + MAJ // --

#define ROUND_AND_SCHED_N_3(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
    ; \ // ################################### RND N + 3 ############################
    ; \
    MOVL a, y3; \ // y3 = a // MAJA
    RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
    RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
    ADDL (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // --
    ORL c, y3; \ // y3 = a|c // MAJA
    ; \
    VPSRLD $10, XTMP2, XTMP5; \ // XTMP5 = W[-2] >> 10 {DDCC}
    MOVL f, y2; \ // y2 = f // CH
    RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
    XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
    XORL g, y2; \ // y2 = f^g // CH
    ; \
    VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xDxC}
    RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
    ANDL e, y2; \ // y2 = (f^g)&e // CH
    ADDL h, d; \ // d = k + w + h + d // --
    ANDL b, y3; \ // y3 = (a|c)&b // MAJA
    ; \
    VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 {xDxC}
    XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
    XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
    ; \
    VPXOR XTMP3, XTMP2, XTMP2; \
    RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
    ADDL y0, y2; \ // y2 = S1 + CH // --
    ; \
    VPXOR XTMP2, XTMP5, XTMP5; \ // XTMP5 = s1 {xDxC}
    XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
    ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
    ; \
    RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
    ; \
    VPSHUFB shuff_DC00<>(SB), XTMP5, XTMP5; \ // XTMP5 = s1 {DC00}
    ; \
    VPADDD XTMP0, XTMP5, XDWORD0; \ // XDWORD0 = {W[3], W[2], W[1], W[0]}
    XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
    MOVL a, T1; \ // T1 = a // MAJB
    ANDL c, T1; \ // T1 = a&c // MAJB
    ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
    ; \
    ADDL y1, h; \ // h = k + w + h + S0 // --
    ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
    ADDL y3, h // h = t1 + S0 + MAJ // --
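
// Note on the macros above and below:
//
// Each ROUND_AND_SCHED_N_k performs one round of the compression function
// and, interleaved with it, roughly a quarter of the vector work that
// derives four new message-schedule words (W[-16] + s0(W[-15]) + W[-7] +
// s1(W[-2])) in both 128-bit lanes; after N_0..N_3 have run, XDWORD0 holds
// the four freshly scheduled words.
//
// The DO_ROUND_N_k macros below perform rounds with no scheduling (they are
// used for the last 16 rounds and for replaying the second block). The final
// additions into h for a given round are deferred: N_1, N_2 and N_3 start by
// folding the previous round's y2/y3 into old_h, and only N_3 completes its
// own h at the end of the group.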

#define DO_ROUND_N_0(disp, a, b, c, d, e, f, g, h, old_h) \
    ; \ // ################################### RND N + 0 ###########################
    MOVL f, y2; \ // y2 = f // CH
    RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
    RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
    XORL g, y2; \ // y2 = f^g // CH
    ; \
    XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
    RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
    ANDL e, y2; \ // y2 = (f^g)&e // CH
    ; \
    XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
    RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
    XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
    RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
    MOVL a, y3; \ // y3 = a // MAJA
    ; \
    XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
    RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
    ADDL (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // --
    ORL c, y3; \ // y3 = a|c // MAJA
    ; \
    XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
    MOVL a, T1; \ // T1 = a // MAJB
    ANDL b, y3; \ // y3 = (a|c)&b // MAJA
    ANDL c, T1; \ // T1 = a&c // MAJB
    ADDL y0, y2; \ // y2 = S1 + CH // --
    ; \
    ADDL h, d; \ // d = k + w + h + d // --
    ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
    ADDL y1, h; \ // h = k + w + h + S0 // --
    ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // --

#define DO_ROUND_N_1(disp, a, b, c, d, e, f, g, h, old_h) \
    ; \ // ################################### RND N + 1 ###########################
    ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
    MOVL f, y2; \ // y2 = f // CH
    RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
    RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
    XORL g, y2; \ // y2 = f^g // CH
    ; \
    XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
    RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
    ANDL e, y2; \ // y2 = (f^g)&e // CH
    ADDL y3, old_h; \ // h = t1 + S0 + MAJ // --
    ; \
    XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
    RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
    XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
    RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
    MOVL a, y3; \ // y3 = a // MAJA
    ; \
    XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
    RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
    ADDL (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
    ORL c, y3; \ // y3 = a|c // MAJA
    ; \
    XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
    MOVL a, T1; \ // T1 = a // MAJB
    ANDL b, y3; \ // y3 = (a|c)&b // MAJA
    ANDL c, T1; \ // T1 = a&c // MAJB
    ADDL y0, y2; \ // y2 = S1 + CH // --
    ; \
    ADDL h, d; \ // d = k + w + h + d // --
    ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
    ADDL y1, h; \ // h = k + w + h + S0 // --
    ; \
    ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // --

#define DO_ROUND_N_2(disp, a, b, c, d, e, f, g, h, old_h) \
    ; \ // ################################### RND N + 2 ##############################
    ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
    MOVL f, y2; \ // y2 = f // CH
    RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
    RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
    XORL g, y2; \ // y2 = f^g // CH
    ; \
    XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
    RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
    ANDL e, y2; \ // y2 = (f^g)&e // CH
    ADDL y3, old_h; \ // h = t1 + S0 + MAJ // --
    ; \
    XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
    RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
    XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
    RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
    MOVL a, y3; \ // y3 = a // MAJA
    ; \
    XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
    RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
    ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
    ORL c, y3; \ // y3 = a|c // MAJA
    ; \
    XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
    MOVL a, T1; \ // T1 = a // MAJB
    ANDL b, y3; \ // y3 = (a|c)&b // MAJA
    ANDL c, T1; \ // T1 = a&c // MAJB
    ADDL y0, y2; \ // y2 = S1 + CH // --
    ; \
    ADDL h, d; \ // d = k + w + h + d // --
    ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
    ADDL y1, h; \ // h = k + w + h + S0 // --
    ; \
    ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // --

#define DO_ROUND_N_3(disp, a, b, c, d, e, f, g, h, old_h) \
    ; \ // ################################### RND N + 3 ###########################
    ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
    MOVL f, y2; \ // y2 = f // CH
    RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
    RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
    XORL g, y2; \ // y2 = f^g // CH
    ; \
    XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
    RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
    ANDL e, y2; \ // y2 = (f^g)&e // CH
    ADDL y3, old_h; \ // h = t1 + S0 + MAJ // --
    ; \
    XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
    RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
    XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
    RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
    MOVL a, y3; \ // y3 = a // MAJA
    ; \
    XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
    RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
    ADDL (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // --
    ORL c, y3; \ // y3 = a|c // MAJA
    ; \
    XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
    MOVL a, T1; \ // T1 = a // MAJB
    ANDL b, y3; \ // y3 = (a|c)&b // MAJA
    ANDL c, T1; \ // T1 = a&c // MAJB
    ADDL y0, y2; \ // y2 = S1 + CH // --
    ; \
    ADDL h, d; \ // d = k + w + h + d // --
    ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
    ADDL y1, h; \ // h = k + w + h + S0 // --
    ; \
    ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
    ; \
    ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
    ; \
    ADDL y3, h // h = t1 + S0 + MAJ // --
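
// ·block dispatches on the ·useAVX2 flag set on the Go side from CPU
// feature detection: if AVX2 is available it jumps to the avx2 path,
// otherwise it falls through to the scalar implementation.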

TEXT ·block(SB), 0, $536-32
    CMPB ·useAVX2(SB), $1
    JE   avx2

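    // Scalar path. SHRQ/SHLQ by 6 round p_len down to a whole number of
    // 64-byte blocks; the resulting end-of-input pointer is parked at
    // 256(SP), just above the 64-word message schedule kept at 0..255(SP).
    // The eight hash words H0..H7 are then loaded into R8..R15.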
    MOVQ p_base+8(FP), SI
    MOVQ p_len+16(FP), DX
    SHRQ $6, DX
    SHLQ $6, DX

    LEAQ (SI)(DX*1), DI
    MOVQ DI, 256(SP)
    CMPQ SI, DI
    JEQ  end

    MOVQ dig+0(FP), BP
    MOVL (0*4)(BP), R8  // a = H0
    MOVL (1*4)(BP), R9  // b = H1
    MOVL (2*4)(BP), R10 // c = H2
    MOVL (3*4)(BP), R11 // d = H3
    MOVL (4*4)(BP), R12 // e = H4
    MOVL (5*4)(BP), R13 // f = H5
    MOVL (6*4)(BP), R14 // g = H6
    MOVL (7*4)(BP), R15 // h = H7

loop:
    MOVQ SP, BP

    SHA256ROUND0(0, 0x428a2f98, R8, R9, R10, R11, R12, R13, R14, R15)
    SHA256ROUND0(1, 0x71374491, R15, R8, R9, R10, R11, R12, R13, R14)
    SHA256ROUND0(2, 0xb5c0fbcf, R14, R15, R8, R9, R10, R11, R12, R13)
    SHA256ROUND0(3, 0xe9b5dba5, R13, R14, R15, R8, R9, R10, R11, R12)
    SHA256ROUND0(4, 0x3956c25b, R12, R13, R14, R15, R8, R9, R10, R11)
    SHA256ROUND0(5, 0x59f111f1, R11, R12, R13, R14, R15, R8, R9, R10)
    SHA256ROUND0(6, 0x923f82a4, R10, R11, R12, R13, R14, R15, R8, R9)
    SHA256ROUND0(7, 0xab1c5ed5, R9, R10, R11, R12, R13, R14, R15, R8)
    SHA256ROUND0(8, 0xd807aa98, R8, R9, R10, R11, R12, R13, R14, R15)
    SHA256ROUND0(9, 0x12835b01, R15, R8, R9, R10, R11, R12, R13, R14)
    SHA256ROUND0(10, 0x243185be, R14, R15, R8, R9, R10, R11, R12, R13)
    SHA256ROUND0(11, 0x550c7dc3, R13, R14, R15, R8, R9, R10, R11, R12)
    SHA256ROUND0(12, 0x72be5d74, R12, R13, R14, R15, R8, R9, R10, R11)
    SHA256ROUND0(13, 0x80deb1fe, R11, R12, R13, R14, R15, R8, R9, R10)
    SHA256ROUND0(14, 0x9bdc06a7, R10, R11, R12, R13, R14, R15, R8, R9)
    SHA256ROUND0(15, 0xc19bf174, R9, R10, R11, R12, R13, R14, R15, R8)

    SHA256ROUND1(16, 0xe49b69c1, R8, R9, R10, R11, R12, R13, R14, R15)
    SHA256ROUND1(17, 0xefbe4786, R15, R8, R9, R10, R11, R12, R13, R14)
    SHA256ROUND1(18, 0x0fc19dc6, R14, R15, R8, R9, R10, R11, R12, R13)
    SHA256ROUND1(19, 0x240ca1cc, R13, R14, R15, R8, R9, R10, R11, R12)
    SHA256ROUND1(20, 0x2de92c6f, R12, R13, R14, R15, R8, R9, R10, R11)
    SHA256ROUND1(21, 0x4a7484aa, R11, R12, R13, R14, R15, R8, R9, R10)
    SHA256ROUND1(22, 0x5cb0a9dc, R10, R11, R12, R13, R14, R15, R8, R9)
    SHA256ROUND1(23, 0x76f988da, R9, R10, R11, R12, R13, R14, R15, R8)
    SHA256ROUND1(24, 0x983e5152, R8, R9, R10, R11, R12, R13, R14, R15)
    SHA256ROUND1(25, 0xa831c66d, R15, R8, R9, R10, R11, R12, R13, R14)
    SHA256ROUND1(26, 0xb00327c8, R14, R15, R8, R9, R10, R11, R12, R13)
    SHA256ROUND1(27, 0xbf597fc7, R13, R14, R15, R8, R9, R10, R11, R12)
    SHA256ROUND1(28, 0xc6e00bf3, R12, R13, R14, R15, R8, R9, R10, R11)
    SHA256ROUND1(29, 0xd5a79147, R11, R12, R13, R14, R15, R8, R9, R10)
    SHA256ROUND1(30, 0x06ca6351, R10, R11, R12, R13, R14, R15, R8, R9)
    SHA256ROUND1(31, 0x14292967, R9, R10, R11, R12, R13, R14, R15, R8)
    SHA256ROUND1(32, 0x27b70a85, R8, R9, R10, R11, R12, R13, R14, R15)
    SHA256ROUND1(33, 0x2e1b2138, R15, R8, R9, R10, R11, R12, R13, R14)
    SHA256ROUND1(34, 0x4d2c6dfc, R14, R15, R8, R9, R10, R11, R12, R13)
    SHA256ROUND1(35, 0x53380d13, R13, R14, R15, R8, R9, R10, R11, R12)
    SHA256ROUND1(36, 0x650a7354, R12, R13, R14, R15, R8, R9, R10, R11)
    SHA256ROUND1(37, 0x766a0abb, R11, R12, R13, R14, R15, R8, R9, R10)
    SHA256ROUND1(38, 0x81c2c92e, R10, R11, R12, R13, R14, R15, R8, R9)
    SHA256ROUND1(39, 0x92722c85, R9, R10, R11, R12, R13, R14, R15, R8)
    SHA256ROUND1(40, 0xa2bfe8a1, R8, R9, R10, R11, R12, R13, R14, R15)
    SHA256ROUND1(41, 0xa81a664b, R15, R8, R9, R10, R11, R12, R13, R14)
    SHA256ROUND1(42, 0xc24b8b70, R14, R15, R8, R9, R10, R11, R12, R13)
    SHA256ROUND1(43, 0xc76c51a3, R13, R14, R15, R8, R9, R10, R11, R12)
    SHA256ROUND1(44, 0xd192e819, R12, R13, R14, R15, R8, R9, R10, R11)
    SHA256ROUND1(45, 0xd6990624, R11, R12, R13, R14, R15, R8, R9, R10)
    SHA256ROUND1(46, 0xf40e3585, R10, R11, R12, R13, R14, R15, R8, R9)
    SHA256ROUND1(47, 0x106aa070, R9, R10, R11, R12, R13, R14, R15, R8)
    SHA256ROUND1(48, 0x19a4c116, R8, R9, R10, R11, R12, R13, R14, R15)
    SHA256ROUND1(49, 0x1e376c08, R15, R8, R9, R10, R11, R12, R13, R14)
    SHA256ROUND1(50, 0x2748774c, R14, R15, R8, R9, R10, R11, R12, R13)
    SHA256ROUND1(51, 0x34b0bcb5, R13, R14, R15, R8, R9, R10, R11, R12)
    SHA256ROUND1(52, 0x391c0cb3, R12, R13, R14, R15, R8, R9, R10, R11)
    SHA256ROUND1(53, 0x4ed8aa4a, R11, R12, R13, R14, R15, R8, R9, R10)
    SHA256ROUND1(54, 0x5b9cca4f, R10, R11, R12, R13, R14, R15, R8, R9)
    SHA256ROUND1(55, 0x682e6ff3, R9, R10, R11, R12, R13, R14, R15, R8)
    SHA256ROUND1(56, 0x748f82ee, R8, R9, R10, R11, R12, R13, R14, R15)
    SHA256ROUND1(57, 0x78a5636f, R15, R8, R9, R10, R11, R12, R13, R14)
    SHA256ROUND1(58, 0x84c87814, R14, R15, R8, R9, R10, R11, R12, R13)
    SHA256ROUND1(59, 0x8cc70208, R13, R14, R15, R8, R9, R10, R11, R12)
    SHA256ROUND1(60, 0x90befffa, R12, R13, R14, R15, R8, R9, R10, R11)
    SHA256ROUND1(61, 0xa4506ceb, R11, R12, R13, R14, R15, R8, R9, R10)
    SHA256ROUND1(62, 0xbef9a3f7, R10, R11, R12, R13, R14, R15, R8, R9)
    SHA256ROUND1(63, 0xc67178f2, R9, R10, R11, R12, R13, R14, R15, R8)

    MOVQ dig+0(FP), BP
    ADDL (0*4)(BP), R8 // H0 = a + H0
    MOVL R8, (0*4)(BP)
    ADDL (1*4)(BP), R9 // H1 = b + H1
    MOVL R9, (1*4)(BP)
    ADDL (2*4)(BP), R10 // H2 = c + H2
    MOVL R10, (2*4)(BP)
    ADDL (3*4)(BP), R11 // H3 = d + H3
    MOVL R11, (3*4)(BP)
    ADDL (4*4)(BP), R12 // H4 = e + H4
    MOVL R12, (4*4)(BP)
    ADDL (5*4)(BP), R13 // H5 = f + H5
    MOVL R13, (5*4)(BP)
    ADDL (6*4)(BP), R14 // H6 = g + H6
    MOVL R14, (6*4)(BP)
    ADDL (7*4)(BP), R15 // H7 = h + H7
    MOVL R15, (7*4)(BP)

    ADDQ $64, SI
    CMPQ SI, 256(SP)
    JB   loop

end:
    RET

avx2:
    MOVQ dig+0(FP), CTX // d.h[8]
    MOVQ p_base+8(FP), INP
    MOVQ p_len+16(FP), NUM_BYTES

    LEAQ -64(INP)(NUM_BYTES*1), NUM_BYTES // Pointer to the last block
    MOVQ NUM_BYTES, _INP_END(SP)

    CMPQ NUM_BYTES, INP
    JE   avx2_only_one_block

    // Load initial digest
    MOVL 0(CTX), a  // a = H0
    MOVL 4(CTX), b  // b = H1
    MOVL 8(CTX), c  // c = H2
    MOVL 12(CTX), d // d = H3
    MOVL 16(CTX), e // e = H4
    MOVL 20(CTX), f // f = H5
    MOVL 24(CTX), g // g = H6
    MOVL 28(CTX), h // h = H7
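
// Each pass through avx2_loop0 loads two 64-byte blocks (4 x 32 bytes),
// byte-swaps them into big-endian word order and transposes them so that
// XDWORD0..XDWORD3 carry the first block's words in their low 128-bit lanes
// and the second block's words in the high lanes. Rounds for the first
// block run in avx2_loop1/avx2_loop2; the second block is then replayed
// from the saved schedule in avx2_loop3.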

avx2_loop0: // each iteration loads and processes two blocks (2 x 512 bit)

    VMOVDQU (0*32)(INP), XTMP0
    VMOVDQU (1*32)(INP), XTMP1
    VMOVDQU (2*32)(INP), XTMP2
    VMOVDQU (3*32)(INP), XTMP3

    VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK

    // Apply Byte Flip Mask: LE -> BE
    VPSHUFB BYTE_FLIP_MASK, XTMP0, XTMP0
    VPSHUFB BYTE_FLIP_MASK, XTMP1, XTMP1
    VPSHUFB BYTE_FLIP_MASK, XTMP2, XTMP2
    VPSHUFB BYTE_FLIP_MASK, XTMP3, XTMP3

    // Transpose data into high/low parts
    VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w3, w2, w1, w0
    VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w7, w6, w5, w4
    VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w11, w10, w9, w8
    VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w15, w14, w13, w12

    MOVQ $K256<>(SB), TBL // Loading address of table with round-specific constants

avx2_last_block_enter:
    ADDQ $64, INP
    MOVQ INP, _INP(SP)
    XORQ SRND, SRND

avx2_loop1: // for w0 - w47
    // Do 4 rounds and scheduling
    VPADDD 0*32(TBL)(SRND*1), XDWORD0, XFER
    VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
    ROUND_AND_SCHED_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
    ROUND_AND_SCHED_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
    ROUND_AND_SCHED_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
    ROUND_AND_SCHED_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

    // Do 4 rounds and scheduling
    VPADDD 1*32(TBL)(SRND*1), XDWORD1, XFER
    VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
    ROUND_AND_SCHED_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
    ROUND_AND_SCHED_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
    ROUND_AND_SCHED_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
    ROUND_AND_SCHED_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)

    // Do 4 rounds and scheduling
    VPADDD 2*32(TBL)(SRND*1), XDWORD2, XFER
    VMOVDQU XFER, (_XFER + 2*32)(SP)(SRND*1)
    ROUND_AND_SCHED_N_0(_XFER + 2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
    ROUND_AND_SCHED_N_1(_XFER + 2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
    ROUND_AND_SCHED_N_2(_XFER + 2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
    ROUND_AND_SCHED_N_3(_XFER + 2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)

    // Do 4 rounds and scheduling
    VPADDD 3*32(TBL)(SRND*1), XDWORD3, XFER
    VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1)
    ROUND_AND_SCHED_N_0(_XFER + 3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
    ROUND_AND_SCHED_N_1(_XFER + 3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
    ROUND_AND_SCHED_N_2(_XFER + 3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
    ROUND_AND_SCHED_N_3(_XFER + 3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)

    ADDQ $4*32, SRND
    CMPQ SRND, $3*4*32
    JB   avx2_loop1

avx2_loop2:
    // w48 - w63 processed with no scheduling (last 16 rounds)
    VPADDD 0*32(TBL)(SRND*1), XDWORD0, XFER
    VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
    DO_ROUND_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, h)
    DO_ROUND_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, h)
    DO_ROUND_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, g)
    DO_ROUND_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, f)

    VPADDD 1*32(TBL)(SRND*1), XDWORD1, XFER
    VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
    DO_ROUND_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, e)
    DO_ROUND_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, d)
    DO_ROUND_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, c)
    DO_ROUND_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, b)

    ADDQ $2*32, SRND

    VMOVDQU XDWORD2, XDWORD0
    VMOVDQU XDWORD3, XDWORD1

    CMPQ SRND, $4*4*32
    JB   avx2_loop2

    MOVQ dig+0(FP), CTX // d.h[8]
    MOVQ _INP(SP), INP

    addm( 0(CTX), a)
    addm( 4(CTX), b)
    addm( 8(CTX), c)
    addm( 12(CTX), d)
    addm( 16(CTX), e)
    addm( 20(CTX), f)
    addm( 24(CTX), g)
    addm( 28(CTX), h)

    CMPQ _INP_END(SP), INP
    JB   done_hash

    XORQ SRND, SRND
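
// avx2_loop3 runs the 64 rounds for the second block. No message scheduling
// is needed: the W+K values for both blocks were written to the _XFER area
// 32 bytes at a time, with the second block's values in the upper 16 bytes
// of each slot, hence the "+ 16" displacement below.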
avx2_loop3: // Do second block using previously scheduled results
    DO_ROUND_N_0(_XFER + 0*32 + 16, a, b, c, d, e, f, g, h, a)
    DO_ROUND_N_1(_XFER + 0*32 + 16, h, a, b, c, d, e, f, g, h)
    DO_ROUND_N_2(_XFER + 0*32 + 16, g, h, a, b, c, d, e, f, g)
    DO_ROUND_N_3(_XFER + 0*32 + 16, f, g, h, a, b, c, d, e, f)

    DO_ROUND_N_0(_XFER + 1*32 + 16, e, f, g, h, a, b, c, d, e)
    DO_ROUND_N_1(_XFER + 1*32 + 16, d, e, f, g, h, a, b, c, d)
    DO_ROUND_N_2(_XFER + 1*32 + 16, c, d, e, f, g, h, a, b, c)
    DO_ROUND_N_3(_XFER + 1*32 + 16, b, c, d, e, f, g, h, a, b)

    ADDQ $2*32, SRND
    CMPQ SRND, $4*4*32
    JB   avx2_loop3

    MOVQ dig+0(FP), CTX // d.h[8]
    MOVQ _INP(SP), INP
    ADDQ $64, INP

    addm( 0(CTX), a)
    addm( 4(CTX), b)
    addm( 8(CTX), c)
    addm( 12(CTX), d)
    addm( 16(CTX), e)
    addm( 20(CTX), f)
    addm( 24(CTX), g)
    addm( 28(CTX), h)

    CMPQ _INP_END(SP), INP
    JA   avx2_loop0
    JB   done_hash

avx2_do_last_block:

    VMOVDQU 0(INP), XWORD0
    VMOVDQU 16(INP), XWORD1
    VMOVDQU 32(INP), XWORD2
    VMOVDQU 48(INP), XWORD3

    VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK

    VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
    VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
    VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
    VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3

    MOVQ $K256<>(SB), TBL

    JMP avx2_last_block_enter

avx2_only_one_block:
    // Load initial digest
    MOVL 0(CTX), a  // a = H0
    MOVL 4(CTX), b  // b = H1
    MOVL 8(CTX), c  // c = H2
    MOVL 12(CTX), d // d = H3
    MOVL 16(CTX), e // e = H4
    MOVL 20(CTX), f // f = H5
    MOVL 24(CTX), g // g = H6
    MOVL 28(CTX), h // h = H7

    JMP avx2_do_last_block

done_hash:
    VZEROUPPER
    RET

// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
DATA flip_mask<>+0x10(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x18(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), 8, $32

// shuffle xBxA -> 00BA
DATA shuff_00BA<>+0x00(SB)/8, $0x0b0a090803020100
DATA shuff_00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_00BA<>+0x10(SB)/8, $0x0b0a090803020100
DATA shuff_00BA<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
GLOBL shuff_00BA<>(SB), 8, $32

// shuffle xDxC -> DC00
DATA shuff_DC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_DC00<>+0x08(SB)/8, $0x0b0a090803020100
DATA shuff_DC00<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_DC00<>+0x18(SB)/8, $0x0b0a090803020100
GLOBL shuff_DC00<>(SB), 8, $32
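
// flip_mask reverses the byte order within each 32-bit word (the input is
// read little-endian, the message words are big-endian); shuff_00BA and
// shuff_DC00 pack the odd-lane s1 results into the low and high halves of a
// schedule vector, as used in ROUND_AND_SCHED_N_2/N_3 above.
//
// The K256 table below stores every group of four round constants twice,
// once per 128-bit lane, so that a single VPADDD adds the constants to the
// corresponding words of both blocks being processed.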

// Round specific constants
DATA K256<>+0x00(SB)/4, $0x428a2f98 // k1
DATA K256<>+0x04(SB)/4, $0x71374491 // k2
DATA K256<>+0x08(SB)/4, $0xb5c0fbcf // k3
DATA K256<>+0x0c(SB)/4, $0xe9b5dba5 // k4
DATA K256<>+0x10(SB)/4, $0x428a2f98 // k1
DATA K256<>+0x14(SB)/4, $0x71374491 // k2
DATA K256<>+0x18(SB)/4, $0xb5c0fbcf // k3
DATA K256<>+0x1c(SB)/4, $0xe9b5dba5 // k4

DATA K256<>+0x20(SB)/4, $0x3956c25b // k5 - k8
DATA K256<>+0x24(SB)/4, $0x59f111f1
DATA K256<>+0x28(SB)/4, $0x923f82a4
DATA K256<>+0x2c(SB)/4, $0xab1c5ed5
DATA K256<>+0x30(SB)/4, $0x3956c25b
DATA K256<>+0x34(SB)/4, $0x59f111f1
DATA K256<>+0x38(SB)/4, $0x923f82a4
DATA K256<>+0x3c(SB)/4, $0xab1c5ed5

DATA K256<>+0x40(SB)/4, $0xd807aa98 // k9 - k12
DATA K256<>+0x44(SB)/4, $0x12835b01
DATA K256<>+0x48(SB)/4, $0x243185be
DATA K256<>+0x4c(SB)/4, $0x550c7dc3
DATA K256<>+0x50(SB)/4, $0xd807aa98
DATA K256<>+0x54(SB)/4, $0x12835b01
DATA K256<>+0x58(SB)/4, $0x243185be
DATA K256<>+0x5c(SB)/4, $0x550c7dc3

DATA K256<>+0x60(SB)/4, $0x72be5d74 // k13 - k16
DATA K256<>+0x64(SB)/4, $0x80deb1fe
DATA K256<>+0x68(SB)/4, $0x9bdc06a7
DATA K256<>+0x6c(SB)/4, $0xc19bf174
DATA K256<>+0x70(SB)/4, $0x72be5d74
DATA K256<>+0x74(SB)/4, $0x80deb1fe
DATA K256<>+0x78(SB)/4, $0x9bdc06a7
DATA K256<>+0x7c(SB)/4, $0xc19bf174

DATA K256<>+0x80(SB)/4, $0xe49b69c1 // k17 - k20
DATA K256<>+0x84(SB)/4, $0xefbe4786
DATA K256<>+0x88(SB)/4, $0x0fc19dc6
DATA K256<>+0x8c(SB)/4, $0x240ca1cc
DATA K256<>+0x90(SB)/4, $0xe49b69c1
DATA K256<>+0x94(SB)/4, $0xefbe4786
DATA K256<>+0x98(SB)/4, $0x0fc19dc6
DATA K256<>+0x9c(SB)/4, $0x240ca1cc

DATA K256<>+0xa0(SB)/4, $0x2de92c6f // k21 - k24
DATA K256<>+0xa4(SB)/4, $0x4a7484aa
DATA K256<>+0xa8(SB)/4, $0x5cb0a9dc
DATA K256<>+0xac(SB)/4, $0x76f988da
DATA K256<>+0xb0(SB)/4, $0x2de92c6f
DATA K256<>+0xb4(SB)/4, $0x4a7484aa
DATA K256<>+0xb8(SB)/4, $0x5cb0a9dc
DATA K256<>+0xbc(SB)/4, $0x76f988da

DATA K256<>+0xc0(SB)/4, $0x983e5152 // k25 - k28
DATA K256<>+0xc4(SB)/4, $0xa831c66d
DATA K256<>+0xc8(SB)/4, $0xb00327c8
DATA K256<>+0xcc(SB)/4, $0xbf597fc7
DATA K256<>+0xd0(SB)/4, $0x983e5152
DATA K256<>+0xd4(SB)/4, $0xa831c66d
DATA K256<>+0xd8(SB)/4, $0xb00327c8
DATA K256<>+0xdc(SB)/4, $0xbf597fc7

DATA K256<>+0xe0(SB)/4, $0xc6e00bf3 // k29 - k32
DATA K256<>+0xe4(SB)/4, $0xd5a79147
DATA K256<>+0xe8(SB)/4, $0x06ca6351
DATA K256<>+0xec(SB)/4, $0x14292967
DATA K256<>+0xf0(SB)/4, $0xc6e00bf3
DATA K256<>+0xf4(SB)/4, $0xd5a79147
DATA K256<>+0xf8(SB)/4, $0x06ca6351
DATA K256<>+0xfc(SB)/4, $0x14292967

DATA K256<>+0x100(SB)/4, $0x27b70a85
DATA K256<>+0x104(SB)/4, $0x2e1b2138
DATA K256<>+0x108(SB)/4, $0x4d2c6dfc
DATA K256<>+0x10c(SB)/4, $0x53380d13
DATA K256<>+0x110(SB)/4, $0x27b70a85
DATA K256<>+0x114(SB)/4, $0x2e1b2138
DATA K256<>+0x118(SB)/4, $0x4d2c6dfc
DATA K256<>+0x11c(SB)/4, $0x53380d13

DATA K256<>+0x120(SB)/4, $0x650a7354
DATA K256<>+0x124(SB)/4, $0x766a0abb
DATA K256<>+0x128(SB)/4, $0x81c2c92e
DATA K256<>+0x12c(SB)/4, $0x92722c85
DATA K256<>+0x130(SB)/4, $0x650a7354
DATA K256<>+0x134(SB)/4, $0x766a0abb
DATA K256<>+0x138(SB)/4, $0x81c2c92e
DATA K256<>+0x13c(SB)/4, $0x92722c85

DATA K256<>+0x140(SB)/4, $0xa2bfe8a1
DATA K256<>+0x144(SB)/4, $0xa81a664b
DATA K256<>+0x148(SB)/4, $0xc24b8b70
DATA K256<>+0x14c(SB)/4, $0xc76c51a3
DATA K256<>+0x150(SB)/4, $0xa2bfe8a1
DATA K256<>+0x154(SB)/4, $0xa81a664b
DATA K256<>+0x158(SB)/4, $0xc24b8b70
DATA K256<>+0x15c(SB)/4, $0xc76c51a3

DATA K256<>+0x160(SB)/4, $0xd192e819
DATA K256<>+0x164(SB)/4, $0xd6990624
DATA K256<>+0x168(SB)/4, $0xf40e3585
DATA K256<>+0x16c(SB)/4, $0x106aa070
DATA K256<>+0x170(SB)/4, $0xd192e819
DATA K256<>+0x174(SB)/4, $0xd6990624
DATA K256<>+0x178(SB)/4, $0xf40e3585
DATA K256<>+0x17c(SB)/4, $0x106aa070

DATA K256<>+0x180(SB)/4, $0x19a4c116
DATA K256<>+0x184(SB)/4, $0x1e376c08
DATA K256<>+0x188(SB)/4, $0x2748774c
DATA K256<>+0x18c(SB)/4, $0x34b0bcb5
DATA K256<>+0x190(SB)/4, $0x19a4c116
DATA K256<>+0x194(SB)/4, $0x1e376c08
DATA K256<>+0x198(SB)/4, $0x2748774c
DATA K256<>+0x19c(SB)/4, $0x34b0bcb5

DATA K256<>+0x1a0(SB)/4, $0x391c0cb3
DATA K256<>+0x1a4(SB)/4, $0x4ed8aa4a
DATA K256<>+0x1a8(SB)/4, $0x5b9cca4f
DATA K256<>+0x1ac(SB)/4, $0x682e6ff3
DATA K256<>+0x1b0(SB)/4, $0x391c0cb3
DATA K256<>+0x1b4(SB)/4, $0x4ed8aa4a
DATA K256<>+0x1b8(SB)/4, $0x5b9cca4f
DATA K256<>+0x1bc(SB)/4, $0x682e6ff3

DATA K256<>+0x1c0(SB)/4, $0x748f82ee
DATA K256<>+0x1c4(SB)/4, $0x78a5636f
DATA K256<>+0x1c8(SB)/4, $0x84c87814
DATA K256<>+0x1cc(SB)/4, $0x8cc70208
DATA K256<>+0x1d0(SB)/4, $0x748f82ee
DATA K256<>+0x1d4(SB)/4, $0x78a5636f
DATA K256<>+0x1d8(SB)/4, $0x84c87814
DATA K256<>+0x1dc(SB)/4, $0x8cc70208

DATA K256<>+0x1e0(SB)/4, $0x90befffa
DATA K256<>+0x1e4(SB)/4, $0xa4506ceb
DATA K256<>+0x1e8(SB)/4, $0xbef9a3f7
DATA K256<>+0x1ec(SB)/4, $0xc67178f2
DATA K256<>+0x1f0(SB)/4, $0x90befffa
DATA K256<>+0x1f4(SB)/4, $0xa4506ceb
DATA K256<>+0x1f8(SB)/4, $0xbef9a3f7
DATA K256<>+0x1fc(SB)/4, $0xc67178f2

GLOBL K256<>(SB), (NOPTR + RODATA), $512