// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "textflag.h"

// SHA256 block routine. See sha256block.go for Go equivalent.
//
// The algorithm is detailed in FIPS 180-4:
//
//  https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf

// The avx2-version is described in an Intel White-Paper:
// "Fast SHA-256 Implementations on Intel Architecture Processors"
// To find it, surf to http://www.intel.com/p/en_US/embedded
// and search for that title.
// AVX2 version by Intel, same algorithm as code in Linux kernel:
// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S
// by
//     James Guilford <james.guilford@intel.com>
//     Kirk Yap <kirk.s.yap@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>

// Wt = Mt; for 0 <= t <= 15
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
//
// a = H0
// b = H1
// c = H2
// d = H3
// e = H4
// f = H5
// g = H6
// h = H7
//
// for t = 0 to 63 {
//    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
//    T2 = BIGSIGMA0(a) + Maj(a,b,c)
//    h = g
//    g = f
//    f = e
//    e = d + T1
//    d = c
//    c = b
//    b = a
//    a = T1 + T2
// }
//
// H0 = a + H0
// H1 = b + H1
// H2 = c + H2
// H3 = d + H3
// H4 = e + H4
// H5 = f + H5
// H6 = g + H6
// H7 = h + H7
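
// For orientation, a rough Go sketch of the loop the code below implements (a sketch
// only; the maintained Go version is in sha256block.go, and helper names such as ch,
// maj, bigSigma0/1, sigma0/1 and K are illustrative, not identifiers in this package):
//
//	var w [64]uint32
//	for i := 0; i < 16; i++ {
//		w[i] = binary.BigEndian.Uint32(p[i*4:])
//	}
//	for i := 16; i < 64; i++ {
//		w[i] = sigma1(w[i-2]) + w[i-7] + sigma0(w[i-15]) + w[i-16]
//	}
//	a, b, c, d, e, f, g, h := H[0], H[1], H[2], H[3], H[4], H[5], H[6], H[7]
//	for i := 0; i < 64; i++ {
//		t1 := h + bigSigma1(e) + ch(e, f, g) + K[i] + w[i]
//		t2 := bigSigma0(a) + maj(a, b, c)
//		h, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2
//	}
//	H[0] += a; H[1] += b; H[2] += c; H[3] += d
//	H[4] += e; H[5] += f; H[6] += g; H[7] += h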

// Wt = Mt; for 0 <= t <= 15
#define MSGSCHEDULE0(index) \
	MOVL (index*4)(SI), AX; \
	BSWAPL AX; \
	MOVL AX, (index*4)(BP)

// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
//   SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x)
//   SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x)
#define MSGSCHEDULE1(index) \
	MOVL ((index-2)*4)(BP), AX; \
	MOVL AX, CX; \
	RORL $17, AX; \
	MOVL CX, DX; \
	RORL $19, CX; \
	SHRL $10, DX; \
	MOVL ((index-15)*4)(BP), BX; \
	XORL CX, AX; \
	MOVL BX, CX; \
	XORL DX, AX; \
	RORL $7, BX; \
	MOVL CX, DX; \
	SHRL $3, DX; \
	RORL $18, CX; \
	ADDL ((index-7)*4)(BP), AX; \
	XORL CX, BX; \
	XORL DX, BX; \
	ADDL ((index-16)*4)(BP), BX; \
	ADDL BX, AX; \
	MOVL AX, ((index)*4)(BP)

// Calculate T1 in AX - uses AX, CX and DX registers.
// h is also used as an accumulator. Wt is passed in AX.
//   T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
//     BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x)
//     Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
#define SHA256T1(const, e, f, g, h) \
	ADDL AX, h; \
	MOVL e, AX; \
	ADDL $const, h; \
	MOVL e, CX; \
	RORL $6, AX; \
	MOVL e, DX; \
	RORL $11, CX; \
	XORL CX, AX; \
	MOVL e, CX; \
	RORL $25, DX; \
	ANDL f, CX; \
	XORL AX, DX; \
	MOVL e, AX; \
	NOTL AX; \
	ADDL DX, h; \
	ANDL g, AX; \
	XORL CX, AX; \
	ADDL h, AX

// Calculate T2 in BX - uses BX, CX, DX and DI registers.
//   T2 = BIGSIGMA0(a) + Maj(a, b, c)
//     BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x)
//     Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
#define SHA256T2(a, b, c) \
	MOVL a, DI; \
	MOVL c, BX; \
	RORL $2, DI; \
	MOVL a, DX; \
	ANDL b, BX; \
	RORL $13, DX; \
	MOVL a, CX; \
	ANDL c, CX; \
	XORL DX, DI; \
	XORL CX, BX; \
	MOVL a, DX; \
	MOVL b, CX; \
	RORL $22, DX; \
	ANDL a, CX; \
	XORL CX, BX; \
	XORL DX, DI; \
	ADDL DI, BX

// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
// The values for e and a are stored in d and h, ready for rotation.
#define SHA256ROUND(index, const, a, b, c, d, e, f, g, h) \
	SHA256T1(const, e, f, g, h); \
	SHA256T2(a, b, c); \
	MOVL BX, h; \
	ADDL AX, d; \
	ADDL AX, h

#define SHA256ROUND0(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE0(index); \
	SHA256ROUND(index, const, a, b, c, d, e, f, g, h)

#define SHA256ROUND1(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE1(index); \
	SHA256ROUND(index, const, a, b, c, d, e, f, g, h)
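
// Note on the scalar macros above: the eight working variables are never physically
// rotated. Each SHA256ROUND0/1 invocation in the round sequence below is passed the
// register list already rotated by one position, so the b..h shuffle from the reference
// pseudo-code costs no MOVs. The 64-entry message schedule W lives on the stack and is
// addressed through BP (after "MOVQ SP, BP"), SI walks the input block, and
// AX/BX/CX/DX/DI are scratch for T1/T2.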

// Definitions for AVX2 version

// addm (mem), reg
// Add reg to mem using reg-mem add and store
#define addm(P1, P2) \
	ADDL P2, P1; \
	MOVL P1, P2

#define XDWORD0 Y4
#define XDWORD1 Y5
#define XDWORD2 Y6
#define XDWORD3 Y7

#define XWORD0 X4
#define XWORD1 X5
#define XWORD2 X6
#define XWORD3 X7

#define XTMP0 Y0
#define XTMP1 Y1
#define XTMP2 Y2
#define XTMP3 Y3
#define XTMP4 Y8
#define XTMP5 Y11

#define XFER Y9

#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
#define X_BYTE_FLIP_MASK X13

#define NUM_BYTES DX
#define INP DI

#define CTX SI // Beginning of digest in memory (a, b, c, ... , h)

#define a AX
#define b BX
#define c CX
#define d R8
#define e DX
#define f R9
#define g R10
#define h R11

#define old_h R11

#define TBL BP

#define SRND SI // SRND is same register as CTX

#define T1 R12

#define y0 R13
#define y1 R14
#define y2 R15
#define y3 DI

// Offsets
#define XFER_SIZE 2*64*4
#define INP_END_SIZE 8
#define INP_SIZE 8

#define _XFER 0
#define _INP_END _XFER + XFER_SIZE
#define _INP _INP_END + INP_END_SIZE
#define STACK_SIZE _INP + INP_SIZE
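
// Sketch of the stack frame used by the AVX2 code (offsets from SP; the macro
// definitions above are authoritative):
//
//	_XFER    (0)              512 bytes of pre-added K+W values, 32 bytes per group of
//	                          four rounds; the low 16 bytes of each group belong to the
//	                          block being hashed now, the high 16 bytes to the block that
//	                          was scheduled alongside it and is replayed in avx2_loop3.
//	_INP_END (_XFER + 512)    pointer to the last 64-byte block of the input
//	_INP     (_INP_END + 8)   current input pointer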

#define ROUND_AND_SCHED_N_0(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ############################# RND N + 0 ############################//
	MOVL a, y3; \ // y3 = a // MAJA
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	; \
	ADDL (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // disp = k + w
	ORL c, y3; \ // y3 = a|c // MAJA
	VPALIGNR $4, XDWORD2, XDWORD3, XTMP0; \ // XTMP0 = W[-7]
	MOVL f, y2; \ // y2 = f // CH
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	XORL g, y2; \ // y2 = f^g // CH
	VPADDD XDWORD0, XTMP0, XTMP0; \ // XTMP0 = W[-7] + W[-16] // y1 = (e >> 6) // S1
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	; \
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	ADDL h, d; \ // d = k + w + h + d // --
	; \
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15]
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	; \
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	VPSRLD $7, XTMP1, XTMP2; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL c, T1; \ // T1 = a&c // MAJB
	; \
	ADDL y0, y2; \ // y2 = S1 + CH // --
	VPSLLD $(32-7), XTMP1, XTMP3; \
	ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ
	ADDL y1, h; \ // h = k + w + h + S0 // --
	; \
	ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
	VPOR XTMP2, XTMP3, XTMP3; \ // XTMP3 = W[-15] ror 7
	; \
	VPSRLD $18, XTMP1, XTMP2; \
	ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
	ADDL y3, h // h = t1 + S0 + MAJ // --

#define ROUND_AND_SCHED_N_1(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ################################### RND N + 1 ############################
	; \
	MOVL a, y3; \ // y3 = a // MAJA
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	ADDL (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL c, y3; \ // y3 = a|c // MAJA
	; \
	VPSRLD $3, XTMP1, XTMP4; \ // XTMP4 = W[-15] >> 3
	MOVL f, y2; \ // y2 = f // CH
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	XORL g, y2; \ // y2 = f^g // CH
	; \
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	ADDL h, d; \ // d = k + w + h + d // --
	; \
	VPSLLD $(32-18), XTMP1, XTMP1; \
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	; \
	VPXOR XTMP1, XTMP3, XTMP3; \
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	; \
	VPXOR XTMP2, XTMP3, XTMP3; \ // XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL c, T1; \ // T1 = a&c // MAJB
	ADDL y0, y2; \ // y2 = S1 + CH // --
	; \
	VPXOR XTMP4, XTMP3, XTMP1; \ // XTMP1 = s0
	VPSHUFD $0xFA, XDWORD3, XTMP2; \ // XTMP2 = W[-2] {BBAA}
	ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ
	ADDL y1, h; \ // h = k + w + h + S0 // --
	; \
	VPADDD XTMP1, XTMP0, XTMP0; \ // XTMP0 = W[-16] + W[-7] + s0
	ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
	ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
	ADDL y3, h; \ // h = t1 + S0 + MAJ // --
	; \
	VPSRLD $10, XTMP2, XTMP4 // XTMP4 = W[-2] >> 10 {BBAA}

#define ROUND_AND_SCHED_N_2(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ################################### RND N + 2 ############################
	; \
	MOVL a, y3; \ // y3 = a // MAJA
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	; \
	VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xBxA}
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	ORL c, y3; \ // y3 = a|c // MAJA
	MOVL f, y2; \ // y2 = f // CH
	XORL g, y2; \ // y2 = f^g // CH
	; \
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 {xBxA}
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	; \
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	VPXOR XTMP3, XTMP2, XTMP2; \
	ADDL h, d; \ // d = k + w + h + d // --
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	VPXOR XTMP2, XTMP4, XTMP4; \ // XTMP4 = s1 {xBxA}
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	; \
	VPSHUFB shuff_00BA<>(SB), XTMP4, XTMP4;\ // XTMP4 = s1 {00BA}
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	VPADDD XTMP4, XTMP0, XTMP0; \ // XTMP0 = {..., ..., W[1], W[0]}
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL c, T1; \ // T1 = a&c // MAJB
	ADDL y0, y2; \ // y2 = S1 + CH // --
	VPSHUFD $80, XTMP0, XTMP2; \ // XTMP2 = W[-2] {DDCC}
	; \
	ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ
	ADDL y1, h; \ // h = k + w + h + S0 // --
	ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
	ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
	; \
	ADDL y3, h // h = t1 + S0 + MAJ // --

#define ROUND_AND_SCHED_N_3(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ################################### RND N + 3 ############################
	; \
	MOVL a, y3; \ // y3 = a // MAJA
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	ADDL (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL c, y3; \ // y3 = a|c // MAJA
	; \
	VPSRLD $10, XTMP2, XTMP5; \ // XTMP5 = W[-2] >> 10 {DDCC}
	MOVL f, y2; \ // y2 = f // CH
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	XORL g, y2; \ // y2 = f^g // CH
	; \
	VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xDxC}
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	ADDL h, d; \ // d = k + w + h + d // --
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	; \
	VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 {xDxC}
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	; \
	VPXOR XTMP3, XTMP2, XTMP2; \
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	ADDL y0, y2; \ // y2 = S1 + CH // --
	; \
	VPXOR XTMP2, XTMP5, XTMP5; \ // XTMP5 = s1 {xDxC}
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
	; \
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	; \
	VPSHUFB shuff_DC00<>(SB), XTMP5, XTMP5;\ // XTMP5 = s1 {DC00}
	; \
	VPADDD XTMP0, XTMP5, XDWORD0; \ // XDWORD0 = {W[3], W[2], W[1], W[0]}
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL c, T1; \ // T1 = a&c // MAJB
	ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ
	; \
	ADDL y1, h; \ // h = k + w + h + S0 // --
	ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
	ADDL y3, h // h = t1 + S0 + MAJ // --
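
// Each group of four ROUND_AND_SCHED_N_* macros above performs four rounds of the
// compression function while folding in one message-schedule step, overwriting XDWORD0
// with the next four words {W[3], W[2], W[1], W[0]} of the sliding window. In scalar Go
// terms the schedule step being vectorized is the following (a sketch only, using
// math/bits; sigma0/sigma1 match the small sigma functions documented at MSGSCHEDULE1):
//
//	func sigma0(x uint32) uint32 { return bits.RotateLeft32(x, -7) ^ bits.RotateLeft32(x, -18) ^ (x >> 3) }
//	func sigma1(x uint32) uint32 { return bits.RotateLeft32(x, -17) ^ bits.RotateLeft32(x, -19) ^ (x >> 10) }
//
//	w[i] = sigma1(w[i-2]) + w[i-7] + sigma0(w[i-15]) + w[i-16]
//
// The sigma1 results come out interleaved with junk dwords ({xBxA}/{xDxC} in the
// comments); the shuff_00BA and shuff_DC00 masks at the bottom of the file compact them
// back into packed word positions.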

#define DO_ROUND_N_0(disp, a, b, c, d, e, f, g, h, old_h) \
	; \ // ################################### RND N + 0 ###########################
	MOVL f, y2; \ // y2 = f // CH
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	XORL g, y2; \ // y2 = f^g // CH
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	MOVL a, y3; \ // y3 = a // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	ADDL (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL c, y3; \ // y3 = a|c // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	ANDL c, T1; \ // T1 = a&c // MAJB
	ADDL y0, y2; \ // y2 = S1 + CH // --
	; \
	ADDL h, d; \ // d = k + w + h + d // --
	ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ
	ADDL y1, h; \ // h = k + w + h + S0 // --
	ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // --

#define DO_ROUND_N_1(disp, a, b, c, d, e, f, g, h, old_h) \
	; \ // ################################### RND N + 1 ###########################
	ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	MOVL f, y2; \ // y2 = f // CH
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	XORL g, y2; \ // y2 = f^g // CH
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	ADDL y3, old_h; \ // h = t1 + S0 + MAJ // --
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	MOVL a, y3; \ // y3 = a // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	ADDL (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL c, y3; \ // y3 = a|c // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	ANDL c, T1; \ // T1 = a&c // MAJB
	ADDL y0, y2; \ // y2 = S1 + CH // --
	; \
	ADDL h, d; \ // d = k + w + h + d // --
	ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ
	ADDL y1, h; \ // h = k + w + h + S0 // --
	; \
	ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // --

#define DO_ROUND_N_2(disp, a, b, c, d, e, f, g, h, old_h) \
	; \ // ################################### RND N + 2 ##############################
	ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
	MOVL f, y2; \ // y2 = f // CH
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	XORL g, y2; \ // y2 = f^g // CH
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	ADDL y3, old_h; \ // h = t1 + S0 + MAJ // --
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	MOVL a, y3; \ // y3 = a // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL c, y3; \ // y3 = a|c // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	ANDL c, T1; \ // T1 = a&c // MAJB
	ADDL y0, y2; \ // y2 = S1 + CH // --
	; \
	ADDL h, d; \ // d = k + w + h + d // --
	ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ
	ADDL y1, h; \ // h = k + w + h + S0 // --
	; \
	ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // --

#define DO_ROUND_N_3(disp, a, b, c, d, e, f, g, h, old_h) \
	; \ // ################################### RND N + 3 ###########################
	ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
	MOVL f, y2; \ // y2 = f // CH
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	XORL g, y2; \ // y2 = f^g // CH
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	ADDL y3, old_h; \ // h = t1 + S0 + MAJ // --
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	MOVL a, y3; \ // y3 = a // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	ADDL (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL c, y3; \ // y3 = a|c // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	ANDL c, T1; \ // T1 = a&c // MAJB
	ADDL y0, y2; \ // y2 = S1 + CH // --
	; \
	ADDL h, d; \ // d = k + w + h + d // --
	ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ
	ADDL y1, h; \ // h = k + w + h + S0 // --
	; \
	ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
	; \
	ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
	; \
	ADDL y3, h // h = t1 + S0 + MAJ // --
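
// The DO_ROUND_N_* macros above are used for rounds 48-63 and for the replay of the
// second block, where no further message scheduling is needed. The final two additions
// into h of a round are deferred: DO_ROUND_N_1/2/3 fold the previous round's S1+CH (y2)
// and MAJ (y3) terms into old_h at the top of the next round, overlapping them with that
// round's independent work, and only DO_ROUND_N_3, the last of each group of four,
// completes its own h before the group ends.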

// Definitions for sha-ni version
//
// The sha-ni implementation uses Intel(R) SHA extensions SHA256RNDS2, SHA256MSG1, SHA256MSG2
// It also reuses portions of the flip_mask (half) and K256 table (stride 32) from the avx2 version
//
// Reference
// S. Gulley, et al, "New Instructions Supporting the Secure Hash
// Algorithm on Intel® Architecture Processors", July 2013
// https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
//

#define digestPtr DI // input/output, base pointer to digest hash vector H0, H1, ..., H7
#define dataPtr SI // input, base pointer to first input data block
#define numBytes DX // input, number of input bytes to be processed
#define sha256Constants AX // round contents from K256 table, indexed by round number x 32
#define msg X0 // input data
#define state0 X1 // round intermediates and outputs
#define state1 X2
#define m0 X3 // m0, m1,... m4 -- round message temps
#define m1 X4
#define m2 X5
#define m3 X6
#define m4 X7
#define shufMask X8 // input data endian conversion control mask
#define abefSave X9 // digest hash vector inter-block buffer abef
#define cdghSave X10 // digest hash vector inter-block buffer cdgh

#define nop(m,a) // nop instead of final SHA256MSG1 for first and last few rounds

#define sha256msg1(m,a) \ // final SHA256MSG1 for middle rounds that require it
	SHA256MSG1 m, a

#define vmov(a,b) \ // msg copy for all but rounds 12-15
	VMOVDQA a, b

#define vmovrev(a,b) \ // reverse copy for rounds 12-15
	VMOVDQA b, a

// sha rounds 0 to 11
// identical with the exception of the final msg op
// which is replaced with a nop for rounds where it is not needed
// refer to Gulley, et al for more information
#define rounds0to11(m,a,c,sha256Msg1) \
	VMOVDQU c*16(dataPtr), msg \
	PSHUFB shufMask, msg \
	VMOVDQA msg, m \
	PADDD (c*32)(sha256Constants), msg \
	SHA256RNDS2 msg, state0, state1 \
	PSHUFD $0x0e, msg, msg \
	SHA256RNDS2 msg, state1, state0 \
	sha256Msg1 (m,a)

// sha rounds 12 to 59
// identical with the exception of the final msg op
// and the reverse copy(m,msg) in round 12 which is required
// after the last data load
// refer to Gulley, et al for more information
#define rounds12to59(m,c,a,t,sha256Msg1,movop) \
	movop (m,msg) \
	PADDD (c*32)(sha256Constants), msg \
	SHA256RNDS2 msg, state0, state1 \
	VMOVDQA m, m4 \
	PALIGNR $4, a, m4 \
	PADDD m4, t \
	SHA256MSG2 m, t \
	PSHUFD $0x0e, msg, msg \
	SHA256RNDS2 msg, state1, state0 \
	sha256Msg1 (m,a)
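
// ·block dispatches between the three implementations in this file using two flags that
// are expected to be set by the Go side of the package (presumably from CPU feature
// detection): ·useSHA selects the SHA-extensions path (sha_ni) and ·useAVX2 the
// AVX2/BMI2 path (avx2, which relies on RORX); otherwise the plain x86-64 scalar path
// runs.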

TEXT ·block(SB), 0, $536-32
	CMPB ·useSHA(SB), $1
	JE sha_ni
	CMPB ·useAVX2(SB), $1
	JE avx2

	MOVQ p_base+8(FP), SI
	MOVQ p_len+16(FP), DX
	SHRQ $6, DX
	SHLQ $6, DX

	LEAQ (SI)(DX*1), DI
	MOVQ DI, 256(SP)
	CMPQ SI, DI
	JEQ end

	MOVQ dig+0(FP), BP
	MOVL (0*4)(BP), R8  // a = H0
	MOVL (1*4)(BP), R9  // b = H1
	MOVL (2*4)(BP), R10 // c = H2
	MOVL (3*4)(BP), R11 // d = H3
	MOVL (4*4)(BP), R12 // e = H4
	MOVL (5*4)(BP), R13 // f = H5
	MOVL (6*4)(BP), R14 // g = H6
	MOVL (7*4)(BP), R15 // h = H7

loop:
	MOVQ SP, BP

	SHA256ROUND0(0, 0x428a2f98, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND0(1, 0x71374491, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND0(2, 0xb5c0fbcf, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND0(3, 0xe9b5dba5, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND0(4, 0x3956c25b, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND0(5, 0x59f111f1, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND0(6, 0x923f82a4, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND0(7, 0xab1c5ed5, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND0(8, 0xd807aa98, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND0(9, 0x12835b01, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND0(10, 0x243185be, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND0(11, 0x550c7dc3, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND0(12, 0x72be5d74, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND0(13, 0x80deb1fe, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND0(14, 0x9bdc06a7, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND0(15, 0xc19bf174, R9, R10, R11, R12, R13, R14, R15, R8)

	SHA256ROUND1(16, 0xe49b69c1, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(17, 0xefbe4786, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(18, 0x0fc19dc6, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(19, 0x240ca1cc, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(20, 0x2de92c6f, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(21, 0x4a7484aa, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(22, 0x5cb0a9dc, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(23, 0x76f988da, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(24, 0x983e5152, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(25, 0xa831c66d, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(26, 0xb00327c8, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(27, 0xbf597fc7, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(28, 0xc6e00bf3, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(29, 0xd5a79147, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(30, 0x06ca6351, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(31, 0x14292967, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(32, 0x27b70a85, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(33, 0x2e1b2138, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(34, 0x4d2c6dfc, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(35, 0x53380d13, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(36, 0x650a7354, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(37, 0x766a0abb, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(38, 0x81c2c92e, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(39, 0x92722c85, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(40, 0xa2bfe8a1, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(41, 0xa81a664b, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(42, 0xc24b8b70, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(43, 0xc76c51a3, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(44, 0xd192e819, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(45, 0xd6990624, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(46, 0xf40e3585, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(47, 0x106aa070, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(48, 0x19a4c116, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(49, 0x1e376c08, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(50, 0x2748774c, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(51, 0x34b0bcb5, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(52, 0x391c0cb3, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(53, 0x4ed8aa4a, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(54, 0x5b9cca4f, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(55, 0x682e6ff3, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(56, 0x748f82ee, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(57, 0x78a5636f, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(58, 0x84c87814, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(59, 0x8cc70208, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(60, 0x90befffa, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(61, 0xa4506ceb, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(62, 0xbef9a3f7, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(63, 0xc67178f2, R9, R10, R11, R12, R13, R14, R15, R8)

	MOVQ dig+0(FP), BP
	ADDL (0*4)(BP), R8  // H0 = a + H0
	MOVL R8, (0*4)(BP)
	ADDL (1*4)(BP), R9  // H1 = b + H1
	MOVL R9, (1*4)(BP)
	ADDL (2*4)(BP), R10 // H2 = c + H2
	MOVL R10, (2*4)(BP)
	ADDL (3*4)(BP), R11 // H3 = d + H3
	MOVL R11, (3*4)(BP)
	ADDL (4*4)(BP), R12 // H4 = e + H4
	MOVL R12, (4*4)(BP)
	ADDL (5*4)(BP), R13 // H5 = f + H5
	MOVL R13, (5*4)(BP)
	ADDL (6*4)(BP), R14 // H6 = g + H6
	MOVL R14, (6*4)(BP)
	ADDL (7*4)(BP), R15 // H7 = h + H7
	MOVL R15, (7*4)(BP)

	ADDQ $64, SI
	CMPQ SI, 256(SP)
	JB loop

end:
	RET
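
// The AVX2 path processes the input two blocks at a time whenever it can: avx2_loop0
// loads 128 bytes, byte-swaps them and transposes them so that each Y register holds
// four words of the first block in its low half and the matching four words of the
// second block in its high half. avx2_loop1 then schedules the message and hashes the
// first block while saving the pre-added K+W values for both halves on the stack,
// avx2_loop2 runs the final 16 rounds of the first block, and avx2_loop3 replays the
// saved K+W values (at offset +16 within each 32-byte group) to hash the second block
// without rescheduling. A single remaining block goes through avx2_do_last_block /
// avx2_only_one_block using 128-bit loads instead.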

avx2:
	MOVQ dig+0(FP), CTX // d.h[8]
	MOVQ p_base+8(FP), INP
	MOVQ p_len+16(FP), NUM_BYTES

	LEAQ -64(INP)(NUM_BYTES*1), NUM_BYTES // Pointer to the last block
	MOVQ NUM_BYTES, _INP_END(SP)

	CMPQ NUM_BYTES, INP
	JE avx2_only_one_block

	// Load initial digest
	MOVL 0(CTX), a  // a = H0
	MOVL 4(CTX), b  // b = H1
	MOVL 8(CTX), c  // c = H2
	MOVL 12(CTX), d // d = H3
	MOVL 16(CTX), e // e = H4
	MOVL 20(CTX), f // f = H5
	MOVL 24(CTX), g // g = H6
	MOVL 28(CTX), h // h = H7

avx2_loop0: // at each iteration works with one block (512 bit)

	VMOVDQU (0*32)(INP), XTMP0
	VMOVDQU (1*32)(INP), XTMP1
	VMOVDQU (2*32)(INP), XTMP2
	VMOVDQU (3*32)(INP), XTMP3

	VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK

	// Apply Byte Flip Mask: LE -> BE
	VPSHUFB BYTE_FLIP_MASK, XTMP0, XTMP0
	VPSHUFB BYTE_FLIP_MASK, XTMP1, XTMP1
	VPSHUFB BYTE_FLIP_MASK, XTMP2, XTMP2
	VPSHUFB BYTE_FLIP_MASK, XTMP3, XTMP3

	// Transpose data into high/low parts
	VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w3, w2, w1, w0
	VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w7, w6, w5, w4
	VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w11, w10, w9, w8
	VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w15, w14, w13, w12

	MOVQ $K256<>(SB), TBL // Loading address of table with round-specific constants

avx2_last_block_enter:
	ADDQ $64, INP
	MOVQ INP, _INP(SP)
	XORQ SRND, SRND
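
// SRND is a byte offset rather than a round counter: each avx2_loop1 iteration covers 16
// rounds and advances it by 4*32 bytes, and the same value indexes both the K256 table
// (through TBL) and the _XFER save area on the stack, which use the same 32-byte-per-
// four-rounds stride. SRND aliases CTX (both are SI), which is why the digest pointer is
// reloaded from dig+0(FP) once the rounds are done.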

avx2_loop1: // for w0 - w47
	// Do 4 rounds and scheduling
	VPADDD 0*32(TBL)(SRND*1), XDWORD0, XFER
	VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

	// Do 4 rounds and scheduling
	VPADDD 1*32(TBL)(SRND*1), XDWORD1, XFER
	VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)

	// Do 4 rounds and scheduling
	VPADDD 2*32(TBL)(SRND*1), XDWORD2, XFER
	VMOVDQU XFER, (_XFER + 2*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_1(_XFER + 2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_2(_XFER + 2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_3(_XFER + 2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)

	// Do 4 rounds and scheduling
	VPADDD 3*32(TBL)(SRND*1), XDWORD3, XFER
	VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_1(_XFER + 3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_2(_XFER + 3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_3(_XFER + 3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)

	ADDQ $4*32, SRND
	CMPQ SRND, $3*4*32
	JB avx2_loop1

avx2_loop2:
	// w48 - w63 processed with no scheduling (last 16 rounds)
	VPADDD 0*32(TBL)(SRND*1), XDWORD0, XFER
	VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
	DO_ROUND_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, h)
	DO_ROUND_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, h)
	DO_ROUND_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, g)
	DO_ROUND_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, f)

	VPADDD 1*32(TBL)(SRND*1), XDWORD1, XFER
	VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
	DO_ROUND_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, e)
	DO_ROUND_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, d)
	DO_ROUND_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, c)
	DO_ROUND_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, b)

	ADDQ $2*32, SRND

	VMOVDQU XDWORD2, XDWORD0
	VMOVDQU XDWORD3, XDWORD1

	CMPQ SRND, $4*4*32
	JB avx2_loop2

	MOVQ dig+0(FP), CTX // d.h[8]
	MOVQ _INP(SP), INP

	addm( 0(CTX), a)
	addm( 4(CTX), b)
	addm( 8(CTX), c)
	addm( 12(CTX), d)
	addm( 16(CTX), e)
	addm( 20(CTX), f)
	addm( 24(CTX), g)
	addm( 28(CTX), h)

	CMPQ _INP_END(SP), INP
	JB done_hash

	XORQ SRND, SRND

avx2_loop3: // Do second block using previously scheduled results
	DO_ROUND_N_0(_XFER + 0*32 + 16, a, b, c, d, e, f, g, h, a)
	DO_ROUND_N_1(_XFER + 0*32 + 16, h, a, b, c, d, e, f, g, h)
	DO_ROUND_N_2(_XFER + 0*32 + 16, g, h, a, b, c, d, e, f, g)
	DO_ROUND_N_3(_XFER + 0*32 + 16, f, g, h, a, b, c, d, e, f)

	DO_ROUND_N_0(_XFER + 1*32 + 16, e, f, g, h, a, b, c, d, e)
	DO_ROUND_N_1(_XFER + 1*32 + 16, d, e, f, g, h, a, b, c, d)
	DO_ROUND_N_2(_XFER + 1*32 + 16, c, d, e, f, g, h, a, b, c)
	DO_ROUND_N_3(_XFER + 1*32 + 16, b, c, d, e, f, g, h, a, b)

	ADDQ $2*32, SRND
	CMPQ SRND, $4*4*32
	JB avx2_loop3

	MOVQ dig+0(FP), CTX // d.h[8]
	MOVQ _INP(SP), INP
	ADDQ $64, INP

	addm( 0(CTX), a)
	addm( 4(CTX), b)
	addm( 8(CTX), c)
	addm( 12(CTX), d)
	addm( 16(CTX), e)
	addm( 20(CTX), f)
	addm( 24(CTX), g)
	addm( 28(CTX), h)

	CMPQ _INP_END(SP), INP
	JA avx2_loop0
	JB done_hash
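
// Falling through both branches above means exactly one 64-byte block remains
// (INP == _INP_END), i.e. the input had an odd number of blocks; a single-block input
// reaches the same code through avx2_only_one_block. The last block is loaded through
// the 128-bit X registers and re-enters the round code at avx2_last_block_enter; the
// second-block halves of the K+W values it schedules are simply never replayed.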

avx2_do_last_block:

	VMOVDQU 0(INP), XWORD0
	VMOVDQU 16(INP), XWORD1
	VMOVDQU 32(INP), XWORD2
	VMOVDQU 48(INP), XWORD3

	VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK

	VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
	VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
	VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
	VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3

	MOVQ $K256<>(SB), TBL

	JMP avx2_last_block_enter

avx2_only_one_block:
	// Load initial digest
	MOVL 0(CTX), a  // a = H0
	MOVL 4(CTX), b  // b = H1
	MOVL 8(CTX), c  // c = H2
	MOVL 12(CTX), d // d = H3
	MOVL 16(CTX), e // e = H4
	MOVL 20(CTX), f // f = H5
	MOVL 24(CTX), g // g = H6
	MOVL 28(CTX), h // h = H7

	JMP avx2_do_last_block

done_hash:
	VZEROUPPER
	RET

sha_ni:
	MOVQ dig+0(FP), digestPtr         // init digest hash vector H0, H1,..., H7 pointer
	MOVQ p_base+8(FP), dataPtr        // init input data base pointer
	MOVQ p_len+16(FP), numBytes       // get number of input bytes to hash
	SHRQ $6, numBytes                 // force modulo 64 input buffer length
	SHLQ $6, numBytes
	CMPQ numBytes, $0                 // exit early for zero-length input buffer
	JEQ done
	ADDQ dataPtr, numBytes            // point numBytes to end of input buffer
	VMOVDQU (0*16)(digestPtr), state0 // load initial hash values and reorder
	VMOVDQU (1*16)(digestPtr), state1 // DCBA, HGFE -> ABEF, CDGH
	PSHUFD $0xb1, state0, state0      // CDAB
	PSHUFD $0x1b, state1, state1      // EFGH
	VMOVDQA state0, m4
	PALIGNR $8, state1, state0        // ABEF
	PBLENDW $0xf0, m4, state1         // CDGH
	VMOVDQA flip_mask<>(SB), shufMask
	LEAQ K256<>(SB), sha256Constants

roundLoop:
	// save hash values for addition after rounds
	VMOVDQA state0, abefSave
	VMOVDQA state1, cdghSave

	// do rounds 0-59
	rounds0to11 (m0,-,0,nop)                     // 0-3
	rounds0to11 (m1,m0,1,sha256msg1)             // 4-7
	rounds0to11 (m2,m1,2,sha256msg1)             // 8-11
	VMOVDQU (3*16)(dataPtr), msg
	PSHUFB shufMask, msg
	rounds12to59 (m3,3,m2,m0,sha256msg1,vmovrev) // 12-15
	rounds12to59 (m0,4,m3,m1,sha256msg1,vmov)    // 16-19
	rounds12to59 (m1,5,m0,m2,sha256msg1,vmov)    // 20-23
	rounds12to59 (m2,6,m1,m3,sha256msg1,vmov)    // 24-27
	rounds12to59 (m3,7,m2,m0,sha256msg1,vmov)    // 28-31
	rounds12to59 (m0,8,m3,m1,sha256msg1,vmov)    // 32-35
	rounds12to59 (m1,9,m0,m2,sha256msg1,vmov)    // 36-39
	rounds12to59 (m2,10,m1,m3,sha256msg1,vmov)   // 40-43
	rounds12to59 (m3,11,m2,m0,sha256msg1,vmov)   // 44-47
	rounds12to59 (m0,12,m3,m1,sha256msg1,vmov)   // 48-51
	rounds12to59 (m1,13,m0,m2,nop,vmov)          // 52-55
	rounds12to59 (m2,14,m1,m3,nop,vmov)          // 56-59

	// do rounds 60-63
	VMOVDQA m3, msg
	PADDD (15*32)(sha256Constants), msg
	SHA256RNDS2 msg, state0, state1
	PSHUFD $0x0e, msg, msg
	SHA256RNDS2 msg, state1, state0

	// add current hash values with previously saved
	PADDD abefSave, state0
	PADDD cdghSave, state1

	// advance data pointer; loop until buffer empty
	ADDQ $64, dataPtr
	CMPQ numBytes, dataPtr
	JNE roundLoop

	// write hash values back in the correct order
	PSHUFD $0x1b, state0, state0  // FEBA
	PSHUFD $0xb1, state1, state1  // DCHG
	VMOVDQA state0, m4
	PBLENDW $0xf0, state1, state0 // DCBA
	PALIGNR $8, m4, state1        // HGFE
	VMOVDQU state0, (0*16)(digestPtr)
	VMOVDQU state1, (1*16)(digestPtr)

done:
	RET
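
// Notes on the constant data below (descriptive only; the DATA/GLOBL directives are
// authoritative):
//   - flip_mask is a PSHUFB/VPSHUFB index vector that reverses the byte order within
//     each 32-bit word, turning the big-endian message bytes into host-order words.
//   - shuff_00BA and shuff_DC00 compact the interleaved sigma1 results ({xBxA}/{xDxC})
//     produced during AVX2 message scheduling into the low (00BA) or high (DC00) word
//     pair, zeroing the other half.
//   - K256 stores each group of four round constants twice in a row (32 bytes per
//     group) so that a single 256-bit load feeds both interleaved blocks in the AVX2
//     path; the sha-ni path indexes the same table with a 32-byte stride and uses only
//     the first 16 bytes of each group.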

// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
DATA flip_mask<>+0x10(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x18(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), 8, $32

// shuffle xBxA -> 00BA
DATA shuff_00BA<>+0x00(SB)/8, $0x0b0a090803020100
DATA shuff_00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_00BA<>+0x10(SB)/8, $0x0b0a090803020100
DATA shuff_00BA<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
GLOBL shuff_00BA<>(SB), 8, $32

// shuffle xDxC -> DC00
DATA shuff_DC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_DC00<>+0x08(SB)/8, $0x0b0a090803020100
DATA shuff_DC00<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_DC00<>+0x18(SB)/8, $0x0b0a090803020100
GLOBL shuff_DC00<>(SB), 8, $32

// Round specific constants
DATA K256<>+0x00(SB)/4, $0x428a2f98 // k1
DATA K256<>+0x04(SB)/4, $0x71374491 // k2
DATA K256<>+0x08(SB)/4, $0xb5c0fbcf // k3
DATA K256<>+0x0c(SB)/4, $0xe9b5dba5 // k4
DATA K256<>+0x10(SB)/4, $0x428a2f98 // k1
DATA K256<>+0x14(SB)/4, $0x71374491 // k2
DATA K256<>+0x18(SB)/4, $0xb5c0fbcf // k3
DATA K256<>+0x1c(SB)/4, $0xe9b5dba5 // k4

DATA K256<>+0x20(SB)/4, $0x3956c25b // k5 - k8
DATA K256<>+0x24(SB)/4, $0x59f111f1
DATA K256<>+0x28(SB)/4, $0x923f82a4
DATA K256<>+0x2c(SB)/4, $0xab1c5ed5
DATA K256<>+0x30(SB)/4, $0x3956c25b
DATA K256<>+0x34(SB)/4, $0x59f111f1
DATA K256<>+0x38(SB)/4, $0x923f82a4
DATA K256<>+0x3c(SB)/4, $0xab1c5ed5

DATA K256<>+0x40(SB)/4, $0xd807aa98 // k9 - k12
DATA K256<>+0x44(SB)/4, $0x12835b01
DATA K256<>+0x48(SB)/4, $0x243185be
DATA K256<>+0x4c(SB)/4, $0x550c7dc3
DATA K256<>+0x50(SB)/4, $0xd807aa98
DATA K256<>+0x54(SB)/4, $0x12835b01
DATA K256<>+0x58(SB)/4, $0x243185be
DATA K256<>+0x5c(SB)/4, $0x550c7dc3

DATA K256<>+0x60(SB)/4, $0x72be5d74 // k13 - k16
DATA K256<>+0x64(SB)/4, $0x80deb1fe
DATA K256<>+0x68(SB)/4, $0x9bdc06a7
DATA K256<>+0x6c(SB)/4, $0xc19bf174
DATA K256<>+0x70(SB)/4, $0x72be5d74
DATA K256<>+0x74(SB)/4, $0x80deb1fe
DATA K256<>+0x78(SB)/4, $0x9bdc06a7
DATA K256<>+0x7c(SB)/4, $0xc19bf174

DATA K256<>+0x80(SB)/4, $0xe49b69c1 // k17 - k20
DATA K256<>+0x84(SB)/4, $0xefbe4786
DATA K256<>+0x88(SB)/4, $0x0fc19dc6
DATA K256<>+0x8c(SB)/4, $0x240ca1cc
DATA K256<>+0x90(SB)/4, $0xe49b69c1
DATA K256<>+0x94(SB)/4, $0xefbe4786
DATA K256<>+0x98(SB)/4, $0x0fc19dc6
DATA K256<>+0x9c(SB)/4, $0x240ca1cc

DATA K256<>+0xa0(SB)/4, $0x2de92c6f // k21 - k24
DATA K256<>+0xa4(SB)/4, $0x4a7484aa
DATA K256<>+0xa8(SB)/4, $0x5cb0a9dc
DATA K256<>+0xac(SB)/4, $0x76f988da
DATA K256<>+0xb0(SB)/4, $0x2de92c6f
DATA K256<>+0xb4(SB)/4, $0x4a7484aa
DATA K256<>+0xb8(SB)/4, $0x5cb0a9dc
DATA K256<>+0xbc(SB)/4, $0x76f988da

DATA K256<>+0xc0(SB)/4, $0x983e5152 // k25 - k28
DATA K256<>+0xc4(SB)/4, $0xa831c66d
DATA K256<>+0xc8(SB)/4, $0xb00327c8
DATA K256<>+0xcc(SB)/4, $0xbf597fc7
DATA K256<>+0xd0(SB)/4, $0x983e5152
DATA K256<>+0xd4(SB)/4, $0xa831c66d
DATA K256<>+0xd8(SB)/4, $0xb00327c8
DATA K256<>+0xdc(SB)/4, $0xbf597fc7

DATA K256<>+0xe0(SB)/4, $0xc6e00bf3 // k29 - k32
DATA K256<>+0xe4(SB)/4, $0xd5a79147
DATA K256<>+0xe8(SB)/4, $0x06ca6351
DATA K256<>+0xec(SB)/4, $0x14292967
DATA K256<>+0xf0(SB)/4, $0xc6e00bf3
DATA K256<>+0xf4(SB)/4, $0xd5a79147
DATA K256<>+0xf8(SB)/4, $0x06ca6351
DATA K256<>+0xfc(SB)/4, $0x14292967

DATA K256<>+0x100(SB)/4, $0x27b70a85
DATA K256<>+0x104(SB)/4, $0x2e1b2138
DATA K256<>+0x108(SB)/4, $0x4d2c6dfc
DATA K256<>+0x10c(SB)/4, $0x53380d13
DATA K256<>+0x110(SB)/4, $0x27b70a85
DATA K256<>+0x114(SB)/4, $0x2e1b2138
DATA K256<>+0x118(SB)/4, $0x4d2c6dfc
DATA K256<>+0x11c(SB)/4, $0x53380d13

DATA K256<>+0x120(SB)/4, $0x650a7354
DATA K256<>+0x124(SB)/4, $0x766a0abb
DATA K256<>+0x128(SB)/4, $0x81c2c92e
DATA K256<>+0x12c(SB)/4, $0x92722c85
DATA K256<>+0x130(SB)/4, $0x650a7354
DATA K256<>+0x134(SB)/4, $0x766a0abb
DATA K256<>+0x138(SB)/4, $0x81c2c92e
DATA K256<>+0x13c(SB)/4, $0x92722c85

DATA K256<>+0x140(SB)/4, $0xa2bfe8a1
DATA K256<>+0x144(SB)/4, $0xa81a664b
DATA K256<>+0x148(SB)/4, $0xc24b8b70
DATA K256<>+0x14c(SB)/4, $0xc76c51a3
DATA K256<>+0x150(SB)/4, $0xa2bfe8a1
DATA K256<>+0x154(SB)/4, $0xa81a664b
DATA K256<>+0x158(SB)/4, $0xc24b8b70
DATA K256<>+0x15c(SB)/4, $0xc76c51a3

DATA K256<>+0x160(SB)/4, $0xd192e819
DATA K256<>+0x164(SB)/4, $0xd6990624
DATA K256<>+0x168(SB)/4, $0xf40e3585
DATA K256<>+0x16c(SB)/4, $0x106aa070
DATA K256<>+0x170(SB)/4, $0xd192e819
DATA K256<>+0x174(SB)/4, $0xd6990624
DATA K256<>+0x178(SB)/4, $0xf40e3585
DATA K256<>+0x17c(SB)/4, $0x106aa070

DATA K256<>+0x180(SB)/4, $0x19a4c116
DATA K256<>+0x184(SB)/4, $0x1e376c08
DATA K256<>+0x188(SB)/4, $0x2748774c
DATA K256<>+0x18c(SB)/4, $0x34b0bcb5
DATA K256<>+0x190(SB)/4, $0x19a4c116
DATA K256<>+0x194(SB)/4, $0x1e376c08
DATA K256<>+0x198(SB)/4, $0x2748774c
DATA K256<>+0x19c(SB)/4, $0x34b0bcb5

DATA K256<>+0x1a0(SB)/4, $0x391c0cb3
DATA K256<>+0x1a4(SB)/4, $0x4ed8aa4a
DATA K256<>+0x1a8(SB)/4, $0x5b9cca4f
DATA K256<>+0x1ac(SB)/4, $0x682e6ff3
DATA K256<>+0x1b0(SB)/4, $0x391c0cb3
DATA K256<>+0x1b4(SB)/4, $0x4ed8aa4a
DATA K256<>+0x1b8(SB)/4, $0x5b9cca4f
DATA K256<>+0x1bc(SB)/4, $0x682e6ff3

DATA K256<>+0x1c0(SB)/4, $0x748f82ee
DATA K256<>+0x1c4(SB)/4, $0x78a5636f
DATA K256<>+0x1c8(SB)/4, $0x84c87814
DATA K256<>+0x1cc(SB)/4, $0x8cc70208
DATA K256<>+0x1d0(SB)/4, $0x748f82ee
DATA K256<>+0x1d4(SB)/4, $0x78a5636f
DATA K256<>+0x1d8(SB)/4, $0x84c87814
DATA K256<>+0x1dc(SB)/4, $0x8cc70208

DATA K256<>+0x1e0(SB)/4, $0x90befffa
DATA K256<>+0x1e4(SB)/4, $0xa4506ceb
DATA K256<>+0x1e8(SB)/4, $0xbef9a3f7
DATA K256<>+0x1ec(SB)/4, $0xc67178f2
DATA K256<>+0x1f0(SB)/4, $0x90befffa
DATA K256<>+0x1f4(SB)/4, $0xa4506ceb
DATA K256<>+0x1f8(SB)/4, $0xbef9a3f7
DATA K256<>+0x1fc(SB)/4, $0xc67178f2

GLOBL K256<>(SB), (NOPTR + RODATA), $512