// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// SHA256 block routine. See sha256block.go for Go equivalent.
//
// The algorithm is detailed in FIPS 180-4:
//
//  http://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf

// The avx2-version is described in an Intel White-Paper:
// "Fast SHA-256 Implementations on Intel Architecture Processors"
// To find it, surf to http://www.intel.com/p/en_US/embedded
// and search for that title.
// AVX2 version by Intel, same algorithm as code in Linux kernel:
// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S
// by
//     James Guilford <james.guilford@intel.com>
//     Kirk Yap <kirk.s.yap@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>

// Wt = Mt; for 0 <= t <= 15
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
//
// a = H0
// b = H1
// c = H2
// d = H3
// e = H4
// f = H5
// g = H6
// h = H7
//
// for t = 0 to 63 {
//    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
//    T2 = BIGSIGMA0(a) + Maj(a,b,c)
//    h = g
//    g = f
//    f = e
//    e = d + T1
//    d = c
//    c = b
//    b = a
//    a = T1 + T2
// }
//
// H0 = a + H0
// H1 = b + H1
// H2 = c + H2
// H3 = d + H3
// H4 = e + H4
// H5 = f + H5
// H6 = g + H6
// H7 = h + H7

// Wt = Mt; for 0 <= t <= 15
#define MSGSCHEDULE0(index) \
	MOVL (index*4)(SI), AX; \
	BSWAPL AX; \
	MOVL AX, (index*4)(BP)

// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
//   SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x)
//   SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x)
#define MSGSCHEDULE1(index) \
	MOVL ((index-2)*4)(BP), AX; \
	MOVL AX, CX; \
	RORL $17, AX; \
	MOVL CX, DX; \
	RORL $19, CX; \
	SHRL $10, DX; \
	MOVL ((index-15)*4)(BP), BX; \
	XORL CX, AX; \
	MOVL BX, CX; \
	XORL DX, AX; \
	RORL $7, BX; \
	MOVL CX, DX; \
	SHRL $3, DX; \
	RORL $18, CX; \
	ADDL ((index-7)*4)(BP), AX; \
	XORL CX, BX; \
	XORL DX, BX; \
	ADDL ((index-16)*4)(BP), BX; \
	ADDL BX, AX; \
	MOVL AX, ((index)*4)(BP)
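
// For reference, a minimal Go sketch of the message schedule that
// MSGSCHEDULE0/MSGSCHEDULE1 compute one word at a time (w stands in for the
// 64-word table kept at (BP); the sketch assumes "encoding/binary" and
// "math/bits"; bits.RotateLeft32 with a negative count is a right rotate):
//
//	func schedule(w *[64]uint32, block []byte) {
//		for t := 0; t < 16; t++ { // MSGSCHEDULE0: load + BSWAPL
//			w[t] = binary.BigEndian.Uint32(block[t*4:])
//		}
//		for t := 16; t < 64; t++ { // MSGSCHEDULE1
//			s0 := bits.RotateLeft32(w[t-15], -7) ^ bits.RotateLeft32(w[t-15], -18) ^ (w[t-15] >> 3)
//			s1 := bits.RotateLeft32(w[t-2], -17) ^ bits.RotateLeft32(w[t-2], -19) ^ (w[t-2] >> 10)
//			w[t] = s1 + w[t-7] + s0 + w[t-16]
//		}
//	}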

// Calculate T1 in AX - uses AX, CX and DX registers.
// h is also used as an accumulator. Wt is passed in AX.
//   T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
//   BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x)
//   Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
#define SHA256T1(const, e, f, g, h) \
	ADDL AX, h; \
	MOVL e, AX; \
	ADDL $const, h; \
	MOVL e, CX; \
	RORL $6, AX; \
	MOVL e, DX; \
	RORL $11, CX; \
	XORL CX, AX; \
	MOVL e, CX; \
	RORL $25, DX; \
	ANDL f, CX; \
	XORL AX, DX; \
	MOVL e, AX; \
	NOTL AX; \
	ADDL DX, h; \
	ANDL g, AX; \
	XORL CX, AX; \
	ADDL h, AX

// Calculate T2 in BX - uses BX, CX, DX and DI registers.
//   T2 = BIGSIGMA0(a) + Maj(a, b, c)
//   BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x)
//   Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
#define SHA256T2(a, b, c) \
	MOVL a, DI; \
	MOVL c, BX; \
	RORL $2, DI; \
	MOVL a, DX; \
	ANDL b, BX; \
	RORL $13, DX; \
	MOVL a, CX; \
	ANDL c, CX; \
	XORL DX, DI; \
	XORL CX, BX; \
	MOVL a, DX; \
	MOVL b, CX; \
	RORL $22, DX; \
	ANDL a, CX; \
	XORL CX, BX; \
	XORL DX, DI; \
	ADDL DI, BX

// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
// The values for e and a are stored in d and h, ready for rotation.
#define SHA256ROUND(index, const, a, b, c, d, e, f, g, h) \
	SHA256T1(const, e, f, g, h); \
	SHA256T2(a, b, c); \
	MOVL BX, h; \
	ADDL AX, d; \
	ADDL AX, h

#define SHA256ROUND0(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE0(index); \
	SHA256ROUND(index, const, a, b, c, d, e, f, g, h)

#define SHA256ROUND1(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE1(index); \
	SHA256ROUND(index, const, a, b, c, d, e, f, g, h)
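
// A Go sketch of what one SHA256ROUND works out to (the assembly never moves
// the eight working variables; instead the macro arguments are rotated at
// each call site below; assumes "math/bits"):
//
//	func round(a, b, c, d, e, f, g, h, k, w uint32) (e1, a1 uint32) {
//		bigSigma1 := bits.RotateLeft32(e, -6) ^ bits.RotateLeft32(e, -11) ^ bits.RotateLeft32(e, -25)
//		ch := (e & f) ^ (^e & g)
//		t1 := h + bigSigma1 + ch + k + w // SHA256T1
//		bigSigma0 := bits.RotateLeft32(a, -2) ^ bits.RotateLeft32(a, -13) ^ bits.RotateLeft32(a, -22)
//		maj := (a & b) ^ (a & c) ^ (b & c)
//		t2 := bigSigma0 + maj // SHA256T2
//		return d + t1, t1 + t2 // new e and new a; the other six variables shift down by one
//	}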

// Definitions for AVX2 version

// addm (mem), reg
// Add reg to mem using reg-mem add and store
#define addm(P1, P2) \
	ADDL P2, P1; \
	MOVL P1, P2

#define XDWORD0 Y4
#define XDWORD1 Y5
#define XDWORD2 Y6
#define XDWORD3 Y7

#define XWORD0 X4
#define XWORD1 X5
#define XWORD2 X6
#define XWORD3 X7

#define XTMP0 Y0
#define XTMP1 Y1
#define XTMP2 Y2
#define XTMP3 Y3
#define XTMP4 Y8
#define XTMP5 Y11

#define XFER Y9

#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
#define X_BYTE_FLIP_MASK X13

#define NUM_BYTES DX
#define INP DI

#define CTX SI // Beginning of digest in memory (a, b, c, ... , h)

#define a AX
#define b BX
#define c CX
#define d R8
#define e DX
#define f R9
#define g R10
#define h R11

#define old_h R11

#define TBL BP

#define SRND SI // SRND is same register as CTX

#define T1 R12

#define y0 R13
#define y1 R14
#define y2 R15
#define y3 DI

// Offsets
#define XFER_SIZE 2*64*4
#define INP_END_SIZE 8
#define INP_SIZE 8
#define TMP_SIZE 4

#define _XFER 0
#define _INP_END _XFER + XFER_SIZE
#define _INP _INP_END + INP_END_SIZE
#define _TMP _INP + INP_SIZE
#define STACK_SIZE _TMP + TMP_SIZE

#define ROUND_AND_SCHED_N_0(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ############################# RND N + 0 ############################//
	MOVL a, y3; \ // y3 = a // MAJA
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	; \
	ADDL (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // disp = k + w
	ORL c, y3; \ // y3 = a|c // MAJA
	VPALIGNR $4, XDWORD2, XDWORD3, XTMP0; \ // XTMP0 = W[-7]
	MOVL f, y2; \ // y2 = f // CH
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	XORL g, y2; \ // y2 = f^g // CH
	VPADDD XDWORD0, XTMP0, XTMP0; \ // XTMP0 = W[-7] + W[-16]
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	; \
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	ADDL h, d; \ // d = k + w + h + d // --
	; \
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15]
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	; \
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	VPSRLD $7, XTMP1, XTMP2; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL c, T1; \ // T1 = a&c // MAJB
	; \
	ADDL y0, y2; \ // y2 = S1 + CH // --
	VPSLLD $(32-7), XTMP1, XTMP3; \
	ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	ADDL y1, h; \ // h = k + w + h + S0 // --
	; \
	ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
	VPOR XTMP2, XTMP3, XTMP3; \ // XTMP3 = W[-15] ror 7
	; \
	VPSRLD $18, XTMP1, XTMP2; \
	ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	ADDL y3, h // h = t1 + S0 + MAJ // --

#define ROUND_AND_SCHED_N_1(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ################################### RND N + 1 ############################
	; \
	MOVL a, y3; \ // y3 = a // MAJA
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	ADDL (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL c, y3; \ // y3 = a|c // MAJA
	; \
	VPSRLD $3, XTMP1, XTMP4; \ // XTMP4 = W[-15] >> 3
	MOVL f, y2; \ // y2 = f // CH
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	XORL g, y2; \ // y2 = f^g // CH
	; \
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	ADDL h, d; \ // d = k + w + h + d // --
	; \
	VPSLLD $(32-18), XTMP1, XTMP1; \
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	; \
	VPXOR XTMP1, XTMP3, XTMP3; \
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	; \
	VPXOR XTMP2, XTMP3, XTMP3; \ // XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL c, T1; \ // T1 = a&c // MAJB
	ADDL y0, y2; \ // y2 = S1 + CH // --
	; \
	VPXOR XTMP4, XTMP3, XTMP1; \ // XTMP1 = s0
	VPSHUFD $0xFA, XDWORD3, XTMP2; \ // XTMP2 = W[-2] {BBAA}
	ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	ADDL y1, h; \ // h = k + w + h + S0 // --
	; \
	VPADDD XTMP1, XTMP0, XTMP0; \ // XTMP0 = W[-16] + W[-7] + s0
	ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
	ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	ADDL y3, h; \ // h = t1 + S0 + MAJ // --
	; \
	VPSRLD $10, XTMP2, XTMP4 // XTMP4 = W[-2] >> 10 {BBAA}

#define ROUND_AND_SCHED_N_2(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ################################### RND N + 2 ############################
	; \
	MOVL a, y3; \ // y3 = a // MAJA
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	; \
	VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xBxA}
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	ORL c, y3; \ // y3 = a|c // MAJA
	MOVL f, y2; \ // y2 = f // CH
	XORL g, y2; \ // y2 = f^g // CH
	; \
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 {xBxA}
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	; \
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	VPXOR XTMP3, XTMP2, XTMP2; \
	ADDL h, d; \ // d = k + w + h + d // --
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	VPXOR XTMP2, XTMP4, XTMP4; \ // XTMP4 = s1 {xBxA}
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	; \
	MOVL f, _TMP(SP); \
	MOVQ $shuff_00BA<>(SB), f; \ // f is used to keep SHUF_00BA
	VPSHUFB (f), XTMP4, XTMP4; \ // XTMP4 = s1 {00BA}
	MOVL _TMP(SP), f; \ // f is restored
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	VPADDD XTMP4, XTMP0, XTMP0; \ // XTMP0 = {..., ..., W[1], W[0]}
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL c, T1; \ // T1 = a&c // MAJB
	ADDL y0, y2; \ // y2 = S1 + CH // --
	VPSHUFD $80, XTMP0, XTMP2; \ // XTMP2 = W[-2] {DDCC}
	; \
	ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	ADDL y1, h; \ // h = k + w + h + S0 // --
	ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
	ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	; \
	ADDL y3, h // h = t1 + S0 + MAJ // --
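
// The VPSHUFD $0xFA above leaves each W[-2] word duplicated in both halves of
// a 64-bit lane (the {BBAA} layout), so the VPSRLQ $17/$19 in
// ROUND_AND_SCHED_N_2 deliver 32-bit right-rotations in the low half of each
// lane. A Go sketch of the identity being exploited (assumes "math/bits"):
//
//	func rorVia64(x uint32, r uint) uint32 {
//		d := uint64(x)<<32 | uint64(x) // duplicate x, like VPSHUFD $0xFA
//		return uint32(d >> r)          // low 32 bits == ROTR(x, r)
//	}
//
// For every x, rorVia64(x, 17) == bits.RotateLeft32(x, -17), and likewise for 19.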

#define ROUND_AND_SCHED_N_3(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ################################### RND N + 3 ############################
	; \
	MOVL a, y3; \ // y3 = a // MAJA
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	ADDL (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL c, y3; \ // y3 = a|c // MAJA
	; \
	VPSRLD $10, XTMP2, XTMP5; \ // XTMP5 = W[-2] >> 10 {DDCC}
	MOVL f, y2; \ // y2 = f // CH
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	XORL g, y2; \ // y2 = f^g // CH
	; \
	VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xDxC}
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	ADDL h, d; \ // d = k + w + h + d // --
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	; \
	VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 {xDxC}
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	; \
	VPXOR XTMP3, XTMP2, XTMP2; \
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	ADDL y0, y2; \ // y2 = S1 + CH // --
	; \
	VPXOR XTMP2, XTMP5, XTMP5; \ // XTMP5 = s1 {xDxC}
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
	; \
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	; \
	MOVL f, _TMP(SP); \ // Save f
	MOVQ $shuff_DC00<>(SB), f; \ // SHUF_DC00
	VPSHUFB (f), XTMP5, XTMP5; \ // XTMP5 = s1 {DC00}
	MOVL _TMP(SP), f; \ // Restore f
	; \
	VPADDD XTMP0, XTMP5, XDWORD0; \ // XDWORD0 = {W[3], W[2], W[1], W[0]}
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL c, T1; \ // T1 = a&c // MAJB
	ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	; \
	ADDL y1, h; \ // h = k + w + h + S0 // --
	ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	ADDL y3, h // h = t1 + S0 + MAJ // --

#define DO_ROUND_N_0(disp, a, b, c, d, e, f, g, h, old_h) \
	; \ // ################################### RND N + 0 ###########################
	MOVL f, y2; \ // y2 = f // CH
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	XORL g, y2; \ // y2 = f^g // CH
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	MOVL a, y3; \ // y3 = a // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	ADDL (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL c, y3; \ // y3 = a|c // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	ANDL c, T1; \ // T1 = a&c // MAJB
	ADDL y0, y2; \ // y2 = S1 + CH // --
	; \
	ADDL h, d; \ // d = k + w + h + d // --
	ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	ADDL y1, h; \ // h = k + w + h + S0 // --
	ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // --

#define DO_ROUND_N_1(disp, a, b, c, d, e, f, g, h, old_h) \
	; \ // ################################### RND N + 1 ###########################
	ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	MOVL f, y2; \ // y2 = f // CH
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	XORL g, y2; \ // y2 = f^g // CH
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	ADDL y3, old_h; \ // h = t1 + S0 + MAJ // --
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	MOVL a, y3; \ // y3 = a // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	ADDL (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL c, y3; \ // y3 = a|c // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	ANDL c, T1; \ // T1 = a&c // MAJB
	ADDL y0, y2; \ // y2 = S1 + CH // --
	; \
	ADDL h, d; \ // d = k + w + h + d // --
	ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	ADDL y1, h; \ // h = k + w + h + S0 // --
	; \
	ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // --

#define DO_ROUND_N_2(disp, a, b, c, d, e, f, g, h, old_h) \
	; \ // ################################### RND N + 2 ##############################
	ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	MOVL f, y2; \ // y2 = f // CH
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	XORL g, y2; \ // y2 = f^g // CH
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	ADDL y3, old_h; \ // h = t1 + S0 + MAJ // --
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	MOVL a, y3; \ // y3 = a // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL c, y3; \ // y3 = a|c // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	ANDL c, T1; \ // T1 = a&c // MAJB
	ADDL y0, y2; \ // y2 = S1 + CH // --
	; \
	ADDL h, d; \ // d = k + w + h + d // --
	ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	ADDL y1, h; \ // h = k + w + h + S0 // --
	; \
	ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // --

#define DO_ROUND_N_3(disp, a, b, c, d, e, f, g, h, old_h) \
	; \ // ################################### RND N + 3 ###########################
	ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	MOVL f, y2; \ // y2 = f // CH
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	XORL g, y2; \ // y2 = f^g // CH
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	ADDL y3, old_h; \ // h = t1 + S0 + MAJ // --
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	MOVL a, y3; \ // y3 = a // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	ADDL (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL c, y3; \ // y3 = a|c // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	ANDL c, T1; \ // T1 = a&c // MAJB
	ADDL y0, y2; \ // y2 = S1 + CH // --
	; \
	ADDL h, d; \ // d = k + w + h + d // --
	ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	ADDL y1, h; \ // h = k + w + h + S0 // --
	; \
	ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
	; \
	ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	; \
	ADDL y3, h // h = t1 + S0 + MAJ // --
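
// The routine below takes the AVX2 path only when both AVX2 and BMI2 (needed
// for RORXL) are present, via the runtime's feature flags. Outside the
// runtime the same decision could be sketched with golang.org/x/sys/cpu
// (illustration only, not what this package actually does):
//
//	import "golang.org/x/sys/cpu"
//
//	useAVX2 := cpu.X86.HasAVX2 && cpu.X86.HasBMI2 // mirrors the two CMPB tests below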

TEXT ·block(SB), 0, $536-32
	CMPB runtime·support_avx2(SB), $0
	JE noavx2bmi2
	CMPB runtime·support_bmi2(SB), $1 // check for RORXL instruction
	JE avx2
noavx2bmi2:

	MOVQ p_base+8(FP), SI
	MOVQ p_len+16(FP), DX
	SHRQ $6, DX
	SHLQ $6, DX

	LEAQ (SI)(DX*1), DI
	MOVQ DI, 256(SP)
	CMPQ SI, DI
	JEQ end

	MOVQ dig+0(FP), BP
	MOVL (0*4)(BP), R8  // a = H0
	MOVL (1*4)(BP), R9  // b = H1
	MOVL (2*4)(BP), R10 // c = H2
	MOVL (3*4)(BP), R11 // d = H3
	MOVL (4*4)(BP), R12 // e = H4
	MOVL (5*4)(BP), R13 // f = H5
	MOVL (6*4)(BP), R14 // g = H6
	MOVL (7*4)(BP), R15 // h = H7

loop:
	MOVQ SP, BP

	SHA256ROUND0(0, 0x428a2f98, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND0(1, 0x71374491, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND0(2, 0xb5c0fbcf, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND0(3, 0xe9b5dba5, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND0(4, 0x3956c25b, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND0(5, 0x59f111f1, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND0(6, 0x923f82a4, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND0(7, 0xab1c5ed5, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND0(8, 0xd807aa98, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND0(9, 0x12835b01, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND0(10, 0x243185be, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND0(11, 0x550c7dc3, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND0(12, 0x72be5d74, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND0(13, 0x80deb1fe, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND0(14, 0x9bdc06a7, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND0(15, 0xc19bf174, R9, R10, R11, R12, R13, R14, R15, R8)

	SHA256ROUND1(16, 0xe49b69c1, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(17, 0xefbe4786, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(18, 0x0fc19dc6, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(19, 0x240ca1cc, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(20, 0x2de92c6f, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(21, 0x4a7484aa, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(22, 0x5cb0a9dc, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(23, 0x76f988da, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(24, 0x983e5152, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(25, 0xa831c66d, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(26, 0xb00327c8, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(27, 0xbf597fc7, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(28, 0xc6e00bf3, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(29, 0xd5a79147, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(30, 0x06ca6351, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(31, 0x14292967, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(32, 0x27b70a85, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(33, 0x2e1b2138, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(34, 0x4d2c6dfc, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(35, 0x53380d13, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(36, 0x650a7354, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(37, 0x766a0abb, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(38, 0x81c2c92e, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(39, 0x92722c85, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(40, 0xa2bfe8a1, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(41, 0xa81a664b, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(42, 0xc24b8b70, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(43, 0xc76c51a3, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(44, 0xd192e819, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(45, 0xd6990624, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(46, 0xf40e3585, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(47, 0x106aa070, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(48, 0x19a4c116, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(49, 0x1e376c08, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(50, 0x2748774c, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(51, 0x34b0bcb5, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(52, 0x391c0cb3, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(53, 0x4ed8aa4a, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(54, 0x5b9cca4f, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(55, 0x682e6ff3, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(56, 0x748f82ee, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(57, 0x78a5636f, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(58, 0x84c87814, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(59, 0x8cc70208, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(60, 0x90befffa, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(61, 0xa4506ceb, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(62, 0xbef9a3f7, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(63, 0xc67178f2, R9, R10, R11, R12, R13, R14, R15, R8)

	MOVQ dig+0(FP), BP
	ADDL (0*4)(BP), R8 // H0 = a + H0
	MOVL R8, (0*4)(BP)
	ADDL (1*4)(BP), R9 // H1 = b + H1
	MOVL R9, (1*4)(BP)
	ADDL (2*4)(BP), R10 // H2 = c + H2
	MOVL R10, (2*4)(BP)
	ADDL (3*4)(BP), R11 // H3 = d + H3
	MOVL R11, (3*4)(BP)
	ADDL (4*4)(BP), R12 // H4 = e + H4
	MOVL R12, (4*4)(BP)
	ADDL (5*4)(BP), R13 // H5 = f + H5
	MOVL R13, (5*4)(BP)
	ADDL (6*4)(BP), R14 // H6 = g + H6
	MOVL R14, (6*4)(BP)
	ADDL (7*4)(BP), R15 // H7 = h + H7
	MOVL R15, (7*4)(BP)

	ADDQ $64, SI
	CMPQ SI, 256(SP)
	JB loop

end:
	RET
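
// Tying the scalar pieces together: the code between the "loop" and "end"
// labels above is equivalent to this Go sketch, built from the schedule and
// round sketches shown earlier (_K is assumed to hold the 64 round constants
// that appear as immediates in the SHA256ROUND calls):
//
//	func blockGeneric(dig *[8]uint32, p []byte) {
//		var w [64]uint32
//		for len(p) >= 64 {
//			schedule(&w, p) // the assembly interleaves this with the rounds
//			a, b, c, d, e, f, g, h := dig[0], dig[1], dig[2], dig[3], dig[4], dig[5], dig[6], dig[7]
//			for t := 0; t < 64; t++ {
//				e1, a1 := round(a, b, c, d, e, f, g, h, _K[t], w[t])
//				h, g, f, e, d, c, b, a = g, f, e, e1, c, b, a, a1
//			}
//			dig[0] += a; dig[1] += b; dig[2] += c; dig[3] += d
//			dig[4] += e; dig[5] += f; dig[6] += g; dig[7] += h
//			p = p[64:]
//		}
//	}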

avx2:
	MOVQ dig+0(FP), CTX          // d.h[8]
	MOVQ p_base+8(FP), INP
	MOVQ p_len+16(FP), NUM_BYTES

	LEAQ -64(INP)(NUM_BYTES*1), NUM_BYTES // Pointer to the last block
	MOVQ NUM_BYTES, _INP_END(SP)

	CMPQ NUM_BYTES, INP
	JE avx2_only_one_block

	// Load initial digest
	MOVL 0(CTX), a  // a = H0
	MOVL 4(CTX), b  // b = H1
	MOVL 8(CTX), c  // c = H2
	MOVL 12(CTX), d // d = H3
	MOVL 16(CTX), e // e = H4
	MOVL 20(CTX), f // f = H5
	MOVL 24(CTX), g // g = H6
	MOVL 28(CTX), h // h = H7

avx2_loop0: // at each iteration works with one block (512 bit)

	VMOVDQU (0*32)(INP), XTMP0
	VMOVDQU (1*32)(INP), XTMP1
	VMOVDQU (2*32)(INP), XTMP2
	VMOVDQU (3*32)(INP), XTMP3

	MOVQ $flip_mask<>(SB), BP // BYTE_FLIP_MASK
	VMOVDQU (BP), BYTE_FLIP_MASK

	// Apply Byte Flip Mask: LE -> BE
	VPSHUFB BYTE_FLIP_MASK, XTMP0, XTMP0
	VPSHUFB BYTE_FLIP_MASK, XTMP1, XTMP1
	VPSHUFB BYTE_FLIP_MASK, XTMP2, XTMP2
	VPSHUFB BYTE_FLIP_MASK, XTMP3, XTMP3

	// Transpose data into high/low parts
	VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w3, w2, w1, w0
	VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w7, w6, w5, w4
	VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w11, w10, w9, w8
	VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w15, w14, w13, w12

	MOVQ $K256<>(SB), TBL // Loading address of table with round-specific constants

avx2_last_block_enter:
	ADDQ $64, INP
	MOVQ INP, _INP(SP)
	XORQ SRND, SRND

avx2_loop1: // for w0 - w47
	// Do 4 rounds and scheduling
	VPADDD 0*32(TBL)(SRND*1), XDWORD0, XFER
	VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

	// Do 4 rounds and scheduling
	VPADDD 1*32(TBL)(SRND*1), XDWORD1, XFER
	VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)

	// Do 4 rounds and scheduling
	VPADDD 2*32(TBL)(SRND*1), XDWORD2, XFER
	VMOVDQU XFER, (_XFER + 2*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_1(_XFER + 2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_2(_XFER + 2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_3(_XFER + 2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)

	// Do 4 rounds and scheduling
	VPADDD 3*32(TBL)(SRND*1), XDWORD3, XFER
	VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_1(_XFER + 3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_2(_XFER + 3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_3(_XFER + 3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)

	ADDQ $4*32, SRND
	CMPQ SRND, $3*4*32
	JB avx2_loop1
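
// Each VPADDD/VMOVDQU pair in avx2_loop1 above parks a full 32-byte row of
// w+k in the _XFER area: the two 128-bit lanes of a Y register carry the
// message words of two consecutive blocks, so the low 16 bytes of a row
// belong to the block being hashed now and the high 16 bytes to the next one
// (replayed later by avx2_loop3 at disp+16). A sketch of one row's layout
// (field names are illustrative only):
//
//	type xferRow struct {
//		cur  [4]uint32 // W[t..t+3]+K[t..t+3] for this block, read at (disp + j*4)(SP)(SRND*1)
//		next [4]uint32 // same rounds for the following block, read at disp + 16 in avx2_loop3
//	}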

avx2_loop2:
	// w48 - w63 processed with no scheduling (last 16 rounds)
	VPADDD 0*32(TBL)(SRND*1), XDWORD0, XFER
	VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
	DO_ROUND_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, h)
	DO_ROUND_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, h)
	DO_ROUND_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, g)
	DO_ROUND_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, f)

	VPADDD 1*32(TBL)(SRND*1), XDWORD1, XFER
	VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
	DO_ROUND_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, e)
	DO_ROUND_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, d)
	DO_ROUND_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, c)
	DO_ROUND_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, b)

	ADDQ $2*32, SRND

	VMOVDQU XDWORD2, XDWORD0
	VMOVDQU XDWORD3, XDWORD1

	CMPQ SRND, $4*4*32
	JB avx2_loop2

	MOVQ dig+0(FP), CTX // d.h[8]
	MOVQ _INP(SP), INP

	addm( 0(CTX), a)
	addm( 4(CTX), b)
	addm( 8(CTX), c)
	addm( 12(CTX), d)
	addm( 16(CTX), e)
	addm( 20(CTX), f)
	addm( 24(CTX), g)
	addm( 28(CTX), h)

	CMPQ _INP_END(SP), INP
	JB done_hash

	XORQ SRND, SRND

avx2_loop3: // Do second block using previously scheduled results
	DO_ROUND_N_0(_XFER + 0*32 + 16, a, b, c, d, e, f, g, h, a)
	DO_ROUND_N_1(_XFER + 0*32 + 16, h, a, b, c, d, e, f, g, h)
	DO_ROUND_N_2(_XFER + 0*32 + 16, g, h, a, b, c, d, e, f, g)
	DO_ROUND_N_3(_XFER + 0*32 + 16, f, g, h, a, b, c, d, e, f)

	DO_ROUND_N_0(_XFER + 1*32 + 16, e, f, g, h, a, b, c, d, e)
	DO_ROUND_N_1(_XFER + 1*32 + 16, d, e, f, g, h, a, b, c, d)
	DO_ROUND_N_2(_XFER + 1*32 + 16, c, d, e, f, g, h, a, b, c)
	DO_ROUND_N_3(_XFER + 1*32 + 16, b, c, d, e, f, g, h, a, b)

	ADDQ $2*32, SRND
	CMPQ SRND, $4*4*32
	JB avx2_loop3

	MOVQ dig+0(FP), CTX // d.h[8]
	MOVQ _INP(SP), INP
	ADDQ $64, INP

	addm( 0(CTX), a)
	addm( 4(CTX), b)
	addm( 8(CTX), c)
	addm( 12(CTX), d)
	addm( 16(CTX), e)
	addm( 20(CTX), f)
	addm( 24(CTX), g)
	addm( 28(CTX), h)

	CMPQ _INP_END(SP), INP
	JA avx2_loop0
	JB done_hash

avx2_do_last_block:

	VMOVDQU 0(INP), XWORD0
	VMOVDQU 16(INP), XWORD1
	VMOVDQU 32(INP), XWORD2
	VMOVDQU 48(INP), XWORD3

	MOVQ $flip_mask<>(SB), BP
	VMOVDQU (BP), X_BYTE_FLIP_MASK

	VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
	VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
	VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
	VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3

	MOVQ $K256<>(SB), TBL

	JMP avx2_last_block_enter

avx2_only_one_block:
	// Load initial digest
	MOVL 0(CTX), a  // a = H0
	MOVL 4(CTX), b  // b = H1
	MOVL 8(CTX), c  // c = H2
	MOVL 12(CTX), d // d = H3
	MOVL 16(CTX), e // e = H4
	MOVL 20(CTX), f // f = H5
	MOVL 24(CTX), g // g = H6
	MOVL 28(CTX), h // h = H7

	JMP avx2_do_last_block

done_hash:
	VZEROUPPER
	RET

// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
DATA flip_mask<>+0x10(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x18(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), 8, $32

// shuffle xBxA -> 00BA
DATA shuff_00BA<>+0x00(SB)/8, $0x0b0a090803020100
DATA shuff_00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_00BA<>+0x10(SB)/8, $0x0b0a090803020100
DATA shuff_00BA<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
GLOBL shuff_00BA<>(SB), 8, $32

// shuffle xDxC -> DC00
DATA shuff_DC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_DC00<>+0x08(SB)/8, $0x0b0a090803020100
DATA shuff_DC00<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_DC00<>+0x18(SB)/8, $0x0b0a090803020100
GLOBL shuff_DC00<>(SB), 8, $32
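
// flip_mask, shuff_00BA and shuff_DC00 above are VPSHUFB control masks. A Go
// model of PSHUFB's per-16-byte-lane behaviour, handy for checking them (a
// mask byte with the high bit set selects zero):
//
//	func pshufb(src, mask [16]byte) (dst [16]byte) {
//		for i, m := range mask {
//			if m&0x80 == 0 {
//				dst[i] = src[m&0x0f]
//			}
//		}
//		return
//	}
//
// In memory flip_mask reads 03 02 01 00 07 06 05 04 ..., i.e. it byte-reverses
// every 4-byte word (the LE -> BE flip); shuff_00BA reads 00 01 02 03 08 09 0a 0b
// followed by eight 0xff bytes, i.e. it packs dwords 0 and 2 into the low half
// and zeroes the rest, and shuff_DC00 is its mirror image.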

// Round specific constants
DATA K256<>+0x00(SB)/4, $0x428a2f98 // k1
DATA K256<>+0x04(SB)/4, $0x71374491 // k2
DATA K256<>+0x08(SB)/4, $0xb5c0fbcf // k3
DATA K256<>+0x0c(SB)/4, $0xe9b5dba5 // k4
DATA K256<>+0x10(SB)/4, $0x428a2f98 // k1
DATA K256<>+0x14(SB)/4, $0x71374491 // k2
DATA K256<>+0x18(SB)/4, $0xb5c0fbcf // k3
DATA K256<>+0x1c(SB)/4, $0xe9b5dba5 // k4

DATA K256<>+0x20(SB)/4, $0x3956c25b // k5 - k8
DATA K256<>+0x24(SB)/4, $0x59f111f1
DATA K256<>+0x28(SB)/4, $0x923f82a4
DATA K256<>+0x2c(SB)/4, $0xab1c5ed5
DATA K256<>+0x30(SB)/4, $0x3956c25b
DATA K256<>+0x34(SB)/4, $0x59f111f1
DATA K256<>+0x38(SB)/4, $0x923f82a4
DATA K256<>+0x3c(SB)/4, $0xab1c5ed5

DATA K256<>+0x40(SB)/4, $0xd807aa98 // k9 - k12
DATA K256<>+0x44(SB)/4, $0x12835b01
DATA K256<>+0x48(SB)/4, $0x243185be
DATA K256<>+0x4c(SB)/4, $0x550c7dc3
DATA K256<>+0x50(SB)/4, $0xd807aa98
DATA K256<>+0x54(SB)/4, $0x12835b01
DATA K256<>+0x58(SB)/4, $0x243185be
DATA K256<>+0x5c(SB)/4, $0x550c7dc3

DATA K256<>+0x60(SB)/4, $0x72be5d74 // k13 - k16
DATA K256<>+0x64(SB)/4, $0x80deb1fe
DATA K256<>+0x68(SB)/4, $0x9bdc06a7
DATA K256<>+0x6c(SB)/4, $0xc19bf174
DATA K256<>+0x70(SB)/4, $0x72be5d74
DATA K256<>+0x74(SB)/4, $0x80deb1fe
DATA K256<>+0x78(SB)/4, $0x9bdc06a7
DATA K256<>+0x7c(SB)/4, $0xc19bf174

DATA K256<>+0x80(SB)/4, $0xe49b69c1 // k17 - k20
DATA K256<>+0x84(SB)/4, $0xefbe4786
DATA K256<>+0x88(SB)/4, $0x0fc19dc6
DATA K256<>+0x8c(SB)/4, $0x240ca1cc
DATA K256<>+0x90(SB)/4, $0xe49b69c1
DATA K256<>+0x94(SB)/4, $0xefbe4786
DATA K256<>+0x98(SB)/4, $0x0fc19dc6
DATA K256<>+0x9c(SB)/4, $0x240ca1cc

DATA K256<>+0xa0(SB)/4, $0x2de92c6f // k21 - k24
DATA K256<>+0xa4(SB)/4, $0x4a7484aa
DATA K256<>+0xa8(SB)/4, $0x5cb0a9dc
DATA K256<>+0xac(SB)/4, $0x76f988da
DATA K256<>+0xb0(SB)/4, $0x2de92c6f
DATA K256<>+0xb4(SB)/4, $0x4a7484aa
DATA K256<>+0xb8(SB)/4, $0x5cb0a9dc
DATA K256<>+0xbc(SB)/4, $0x76f988da

DATA K256<>+0xc0(SB)/4, $0x983e5152 // k25 - k28
DATA K256<>+0xc4(SB)/4, $0xa831c66d
DATA K256<>+0xc8(SB)/4, $0xb00327c8
DATA K256<>+0xcc(SB)/4, $0xbf597fc7
DATA K256<>+0xd0(SB)/4, $0x983e5152
DATA K256<>+0xd4(SB)/4, $0xa831c66d
DATA K256<>+0xd8(SB)/4, $0xb00327c8
DATA K256<>+0xdc(SB)/4, $0xbf597fc7

DATA K256<>+0xe0(SB)/4, $0xc6e00bf3 // k29 - k32
DATA K256<>+0xe4(SB)/4, $0xd5a79147
DATA K256<>+0xe8(SB)/4, $0x06ca6351
DATA K256<>+0xec(SB)/4, $0x14292967
DATA K256<>+0xf0(SB)/4, $0xc6e00bf3
DATA K256<>+0xf4(SB)/4, $0xd5a79147
DATA K256<>+0xf8(SB)/4, $0x06ca6351
DATA K256<>+0xfc(SB)/4, $0x14292967

DATA K256<>+0x100(SB)/4, $0x27b70a85
DATA K256<>+0x104(SB)/4, $0x2e1b2138
DATA K256<>+0x108(SB)/4, $0x4d2c6dfc
DATA K256<>+0x10c(SB)/4, $0x53380d13
DATA K256<>+0x110(SB)/4, $0x27b70a85
DATA K256<>+0x114(SB)/4, $0x2e1b2138
DATA K256<>+0x118(SB)/4, $0x4d2c6dfc
DATA K256<>+0x11c(SB)/4, $0x53380d13

DATA K256<>+0x120(SB)/4, $0x650a7354
DATA K256<>+0x124(SB)/4, $0x766a0abb
DATA K256<>+0x128(SB)/4, $0x81c2c92e
DATA K256<>+0x12c(SB)/4, $0x92722c85
DATA K256<>+0x130(SB)/4, $0x650a7354
DATA K256<>+0x134(SB)/4, $0x766a0abb
DATA K256<>+0x138(SB)/4, $0x81c2c92e
DATA K256<>+0x13c(SB)/4, $0x92722c85

DATA K256<>+0x140(SB)/4, $0xa2bfe8a1
DATA K256<>+0x144(SB)/4, $0xa81a664b
DATA K256<>+0x148(SB)/4, $0xc24b8b70
DATA K256<>+0x14c(SB)/4, $0xc76c51a3
DATA K256<>+0x150(SB)/4, $0xa2bfe8a1
DATA K256<>+0x154(SB)/4, $0xa81a664b
DATA K256<>+0x158(SB)/4, $0xc24b8b70
DATA K256<>+0x15c(SB)/4, $0xc76c51a3

DATA K256<>+0x160(SB)/4, $0xd192e819
DATA K256<>+0x164(SB)/4, $0xd6990624
DATA K256<>+0x168(SB)/4, $0xf40e3585
DATA K256<>+0x16c(SB)/4, $0x106aa070
DATA K256<>+0x170(SB)/4, $0xd192e819
DATA K256<>+0x174(SB)/4, $0xd6990624
DATA K256<>+0x178(SB)/4, $0xf40e3585
DATA K256<>+0x17c(SB)/4, $0x106aa070

DATA K256<>+0x180(SB)/4, $0x19a4c116
DATA K256<>+0x184(SB)/4, $0x1e376c08
DATA K256<>+0x188(SB)/4, $0x2748774c
DATA K256<>+0x18c(SB)/4, $0x34b0bcb5
DATA K256<>+0x190(SB)/4, $0x19a4c116
DATA K256<>+0x194(SB)/4, $0x1e376c08
DATA K256<>+0x198(SB)/4, $0x2748774c
DATA K256<>+0x19c(SB)/4, $0x34b0bcb5

DATA K256<>+0x1a0(SB)/4, $0x391c0cb3
DATA K256<>+0x1a4(SB)/4, $0x4ed8aa4a
DATA K256<>+0x1a8(SB)/4, $0x5b9cca4f
DATA K256<>+0x1ac(SB)/4, $0x682e6ff3
DATA K256<>+0x1b0(SB)/4, $0x391c0cb3
DATA K256<>+0x1b4(SB)/4, $0x4ed8aa4a
DATA K256<>+0x1b8(SB)/4, $0x5b9cca4f
DATA K256<>+0x1bc(SB)/4, $0x682e6ff3

DATA K256<>+0x1c0(SB)/4, $0x748f82ee
DATA K256<>+0x1c4(SB)/4, $0x78a5636f
DATA K256<>+0x1c8(SB)/4, $0x84c87814
DATA K256<>+0x1cc(SB)/4, $0x8cc70208
DATA K256<>+0x1d0(SB)/4, $0x748f82ee
DATA K256<>+0x1d4(SB)/4, $0x78a5636f
DATA K256<>+0x1d8(SB)/4, $0x84c87814
DATA K256<>+0x1dc(SB)/4, $0x8cc70208

DATA K256<>+0x1e0(SB)/4, $0x90befffa
DATA K256<>+0x1e4(SB)/4, $0xa4506ceb
DATA K256<>+0x1e8(SB)/4, $0xbef9a3f7
DATA K256<>+0x1ec(SB)/4, $0xc67178f2
DATA K256<>+0x1f0(SB)/4, $0x90befffa
DATA K256<>+0x1f4(SB)/4, $0xa4506ceb
DATA K256<>+0x1f8(SB)/4, $0xbef9a3f7
DATA K256<>+0x1fc(SB)/4, $0xc67178f2

GLOBL K256<>(SB), (NOPTR + RODATA), $512
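
// K256 above is 512 bytes rather than 256 because every 16-byte group of
// round constants is stored twice: each 32-byte row then feeds one VPADDD
// whose two 128-bit lanes add the same four constants to the two blocks being
// hashed. A sketch of how one row relates to the canonical constants (first
// group shown):
//
//	k := [4]uint32{0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5} // k1..k4
//	var row [8]uint32
//	copy(row[0:4], k[:]) // low lane: current block
//	copy(row[4:8], k[:]) // high lane: following block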