// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// SHA256 block routine. See sha256block.go for Go equivalent.
//
// The algorithm is detailed in FIPS 180-4:
//
//  https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf

// The avx2-version is described in an Intel White-Paper:
// "Fast SHA-256 Implementations on Intel Architecture Processors"
// To find it, surf to http://www.intel.com/p/en_US/embedded
// and search for that title.
// AVX2 version by Intel, same algorithm as code in Linux kernel:
// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S
// by
//     James Guilford <james.guilford@intel.com>
//     Kirk Yap <kirk.s.yap@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>

// Wt = Mt; for 0 <= t <= 15
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
//
// a = H0
// b = H1
// c = H2
// d = H3
// e = H4
// f = H5
// g = H6
// h = H7
//
// for t = 0 to 63 {
//    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
//    T2 = BIGSIGMA0(a) + Maj(a,b,c)
//    h = g
//    g = f
//    f = e
//    e = d + T1
//    d = c
//    c = b
//    b = a
//    a = T1 + T2
// }
//
// H0 = a + H0
// H1 = b + H1
// H2 = c + H2
// H3 = d + H3
// H4 = e + H4
// H5 = f + H5
// H6 = g + H6
// H7 = h + H7

// Wt = Mt; for 0 <= t <= 15
#define MSGSCHEDULE0(index) \
	MOVL (index*4)(SI), AX; \
	BSWAPL AX; \
	MOVL AX, (index*4)(BP)

// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
//   SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x)
//   SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x)
#define MSGSCHEDULE1(index) \
	MOVL ((index-2)*4)(BP), AX; \
	MOVL AX, CX; \
	RORL $17, AX; \
	MOVL CX, DX; \
	RORL $19, CX; \
	SHRL $10, DX; \
	MOVL ((index-15)*4)(BP), BX; \
	XORL CX, AX; \
	MOVL BX, CX; \
	XORL DX, AX; \
	RORL $7, BX; \
	MOVL CX, DX; \
	SHRL $3, DX; \
	RORL $18, CX; \
	ADDL ((index-7)*4)(BP), AX; \
	XORL CX, BX; \
	XORL DX, BX; \
	ADDL ((index-16)*4)(BP), BX; \
	ADDL BX, AX; \
	MOVL AX, ((index)*4)(BP)

// Calculate T1 in AX - uses AX, CX and DX registers.
// h is also used as an accumulator. Wt is passed in AX.
//   T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
//     BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x)
//     Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
#define SHA256T1(const, e, f, g, h) \
	ADDL AX, h; \
	MOVL e, AX; \
	ADDL $const, h; \
	MOVL e, CX; \
	RORL $6, AX; \
	MOVL e, DX; \
	RORL $11, CX; \
	XORL CX, AX; \
	MOVL e, CX; \
	RORL $25, DX; \
	ANDL f, CX; \
	XORL AX, DX; \
	MOVL e, AX; \
	NOTL AX; \
	ADDL DX, h; \
	ANDL g, AX; \
	XORL CX, AX; \
	ADDL h, AX

// Calculate T2 in BX - uses BX, CX, DX and DI registers.
//   T2 = BIGSIGMA0(a) + Maj(a, b, c)
//     BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x)
//     Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
#define SHA256T2(a, b, c) \
	MOVL a, DI; \
	MOVL c, BX; \
	RORL $2, DI; \
	MOVL a, DX; \
	ANDL b, BX; \
	RORL $13, DX; \
	MOVL a, CX; \
	ANDL c, CX; \
	XORL DX, DI; \
	XORL CX, BX; \
	MOVL a, DX; \
	MOVL b, CX; \
	RORL $22, DX; \
	ANDL a, CX; \
	XORL CX, BX; \
	XORL DX, DI; \
	ADDL DI, BX

// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
// The values for e and a are stored in d and h, ready for rotation.
#define SHA256ROUND(index, const, a, b, c, d, e, f, g, h) \
	SHA256T1(const, e, f, g, h); \
	SHA256T2(a, b, c); \
	MOVL BX, h; \
	ADDL AX, d; \
	ADDL AX, h

#define SHA256ROUND0(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE0(index); \
	SHA256ROUND(index, const, a, b, c, d, e, f, g, h)

#define SHA256ROUND1(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE1(index); \
	SHA256ROUND(index, const, a, b, c, d, e, f, g, h)

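// For reference, one scheduling step plus one round of the macros above,
// written as a rough Go sketch (illustrative only -- see sha256block.go for
// the real generic implementation; w, k and the working variables are local
// names in the sketch, and rotr(x, n) stands for bits.RotateLeft32(x, -n)):
//
//	// message schedule, t >= 16
//	s0 := rotr(w[t-15], 7) ^ rotr(w[t-15], 18) ^ (w[t-15] >> 3)
//	s1 := rotr(w[t-2], 17) ^ rotr(w[t-2], 19) ^ (w[t-2] >> 10)
//	w[t] = w[t-16] + s0 + w[t-7] + s1
//
//	// one round
//	t1 := h + (rotr(e, 6) ^ rotr(e, 11) ^ rotr(e, 25)) + ((e & f) ^ (^e & g)) + k[t] + w[t]
//	t2 := (rotr(a, 2) ^ rotr(a, 13) ^ rotr(a, 22)) + ((a & b) ^ (a & c) ^ (b & c))
//	h, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2
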
// Definitions for AVX2 version

// addm (mem), reg
// Add reg to mem using reg-mem add and store
#define addm(P1, P2) \
	ADDL P2, P1; \
	MOVL P1, P2

#define XDWORD0 Y4
#define XDWORD1 Y5
#define XDWORD2 Y6
#define XDWORD3 Y7

#define XWORD0 X4
#define XWORD1 X5
#define XWORD2 X6
#define XWORD3 X7

#define XTMP0 Y0
#define XTMP1 Y1
#define XTMP2 Y2
#define XTMP3 Y3
#define XTMP4 Y8
#define XTMP5 Y11

#define XFER Y9

#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
#define X_BYTE_FLIP_MASK X13

#define NUM_BYTES DX
#define INP DI

#define CTX SI // Beginning of digest in memory (a, b, c, ..., h)

#define a AX
#define b BX
#define c CX
#define d R8
#define e DX
#define f R9
#define g R10
#define h R11

#define old_h R11

#define TBL BP

#define SRND SI // SRND is same register as CTX

#define T1 R12

#define y0 R13
#define y1 R14
#define y2 R15
#define y3 DI

// Offsets
#define XFER_SIZE 2*64*4
#define INP_END_SIZE 8
#define INP_SIZE 8

#define _XFER 0
#define _INP_END _XFER + XFER_SIZE
#define _INP _INP_END + INP_END_SIZE
#define STACK_SIZE _INP + INP_SIZE

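// The offsets above describe the stack frame used by the AVX2 path; in Go
// terms the layout works out roughly as below (a sketch, on the assumption
// that the $536 frame declared later simply leaves a little slack beyond
// STACK_SIZE; the constant names are illustrative):
//
//	const (
//		xferSize   = 2 * 64 * 4            // W+K values for two interleaved blocks
//		inpEndSize = 8                     // saved pointer to the last input block
//		inpSize    = 8                     // saved current input pointer
//
//		xferOff   = 0                      // _XFER
//		inpEndOff = xferOff + xferSize     // _INP_END = 512
//		inpOff    = inpEndOff + inpEndSize // _INP     = 520
//		stackSize = inpOff + inpSize       // 528
//	)
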
#define ROUND_AND_SCHED_N_0(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ############################# RND N + 0 ############################//
	MOVL a, y3; \ // y3 = a // MAJA
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	; \
	ADDL (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // disp = k + w
	ORL c, y3; \ // y3 = a|c // MAJA
	VPALIGNR $4, XDWORD2, XDWORD3, XTMP0; \ // XTMP0 = W[-7]
	MOVL f, y2; \ // y2 = f // CH
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	XORL g, y2; \ // y2 = f^g // CH
	VPADDD XDWORD0, XTMP0, XTMP0; \ // XTMP0 = W[-7] + W[-16]
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	; \
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	ADDL h, d; \ // d = k + w + h + d // --
	; \
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15]
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	; \
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	VPSRLD $7, XTMP1, XTMP2; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL c, T1; \ // T1 = a&c // MAJB
	; \
	ADDL y0, y2; \ // y2 = S1 + CH // --
	VPSLLD $(32-7), XTMP1, XTMP3; \
	ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	ADDL y1, h; \ // h = k + w + h + S0 // --
	; \
	ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
	VPOR XTMP2, XTMP3, XTMP3; \ // XTMP3 = W[-15] ror 7
	; \
	VPSRLD $18, XTMP1, XTMP2; \
	ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	ADDL y3, h // h = t1 + S0 + MAJ // --

#define ROUND_AND_SCHED_N_1(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ################################### RND N + 1 ############################
	; \
	MOVL a, y3; \ // y3 = a // MAJA
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	ADDL (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL c, y3; \ // y3 = a|c // MAJA
	; \
	VPSRLD $3, XTMP1, XTMP4; \ // XTMP4 = W[-15] >> 3
	MOVL f, y2; \ // y2 = f // CH
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	XORL g, y2; \ // y2 = f^g // CH
	; \
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	ADDL h, d; \ // d = k + w + h + d // --
	; \
	VPSLLD $(32-18), XTMP1, XTMP1; \
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	; \
	VPXOR XTMP1, XTMP3, XTMP3; \
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	; \
	VPXOR XTMP2, XTMP3, XTMP3; \ // XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL c, T1; \ // T1 = a&c // MAJB
	ADDL y0, y2; \ // y2 = S1 + CH // --
	; \
	VPXOR XTMP4, XTMP3, XTMP1; \ // XTMP1 = s0
	VPSHUFD $0xFA, XDWORD3, XTMP2; \ // XTMP2 = W[-2] {BBAA}
	ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	ADDL y1, h; \ // h = k + w + h + S0 // --
	; \
	VPADDD XTMP1, XTMP0, XTMP0; \ // XTMP0 = W[-16] + W[-7] + s0
	ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
	ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	ADDL y3, h; \ // h = t1 + S0 + MAJ // --
	; \
	VPSRLD $10, XTMP2, XTMP4 // XTMP4 = W[-2] >> 10 {BBAA}

#define ROUND_AND_SCHED_N_2(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ################################### RND N + 2 ############################
	; \
	MOVL a, y3; \ // y3 = a // MAJA
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	; \
	VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xBxA}
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	ORL c, y3; \ // y3 = a|c // MAJA
	MOVL f, y2; \ // y2 = f // CH
	XORL g, y2; \ // y2 = f^g // CH
	; \
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 {xBxA}
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	; \
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	VPXOR XTMP3, XTMP2, XTMP2; \
	ADDL h, d; \ // d = k + w + h + d // --
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	VPXOR XTMP2, XTMP4, XTMP4; \ // XTMP4 = s1 {xBxA}
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	; \
	VPSHUFB shuff_00BA<>(SB), XTMP4, XTMP4; \ // XTMP4 = s1 {00BA}
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	VPADDD XTMP4, XTMP0, XTMP0; \ // XTMP0 = {..., ..., W[1], W[0]}
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL c, T1; \ // T1 = a&c // MAJB
	ADDL y0, y2; \ // y2 = S1 + CH // --
	VPSHUFD $80, XTMP0, XTMP2; \ // XTMP2 = W[-2] {DDCC}
	; \
	ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	ADDL y1, h; \ // h = k + w + h + S0 // --
	ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
	ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	; \
	ADDL y3, h // h = t1 + S0 + MAJ // --

#define ROUND_AND_SCHED_N_3(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ################################### RND N + 3 ############################
	; \
	MOVL a, y3; \ // y3 = a // MAJA
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	ADDL (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL c, y3; \ // y3 = a|c // MAJA
	; \
	VPSRLD $10, XTMP2, XTMP5; \ // XTMP5 = W[-2] >> 10 {DDCC}
	MOVL f, y2; \ // y2 = f // CH
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	XORL g, y2; \ // y2 = f^g // CH
	; \
	VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xDxC}
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	ADDL h, d; \ // d = k + w + h + d // --
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	; \
	VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 {xDxC}
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	; \
	VPXOR XTMP3, XTMP2, XTMP2; \
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	ADDL y0, y2; \ // y2 = S1 + CH // --
	; \
	VPXOR XTMP2, XTMP5, XTMP5; \ // XTMP5 = s1 {xDxC}
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
	; \
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	; \
	VPSHUFB shuff_DC00<>(SB), XTMP5, XTMP5; \ // XTMP5 = s1 {DC00}
	; \
	VPADDD XTMP0, XTMP5, XDWORD0; \ // XDWORD0 = {W[3], W[2], W[1], W[0]}
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL c, T1; \ // T1 = a&c // MAJB
	ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	; \
	ADDL y1, h; \ // h = k + w + h + S0 // --
	ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	ADDL y3, h // h = t1 + S0 + MAJ // --

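// Taken together, the four ROUND_AND_SCHED_N_* macros above advance the hash
// state by four rounds while producing four new message words per 128-bit
// lane. Per word, the vector math they accumulate is the schedule from the
// SIGMA0/SIGMA1 comments near MSGSCHEDULE1, roughly (Go-style sketch, with
// rotr(x, n) meaning rotate right by n):
//
//	s0 := rotr(w[t-15], 7) ^ rotr(w[t-15], 18) ^ (w[t-15] >> 3) // built in XTMP1
//	s1 := rotr(w[t-2], 17) ^ rotr(w[t-2], 19) ^ (w[t-2] >> 10)  // built in XTMP4/XTMP5
//	w[t] = w[t-16] + w[t-7] + s0 + s1                           // accumulated in XTMP0 -> XDWORD0
//
// The 32-bit rotates of W[-2] are emulated with 64-bit shifts (VPSRLQ $17/$19)
// on shuffled copies, so only the even lanes ({xBxA}/{xDxC}) carry valid data
// until the shuff_00BA/shuff_DC00 masks recombine them.
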
#define DO_ROUND_N_0(disp, a, b, c, d, e, f, g, h, old_h) \
	; \ // ################################### RND N + 0 ###########################
	MOVL f, y2; \ // y2 = f // CH
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	XORL g, y2; \ // y2 = f^g // CH
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	MOVL a, y3; \ // y3 = a // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	ADDL (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL c, y3; \ // y3 = a|c // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	ANDL c, T1; \ // T1 = a&c // MAJB
	ADDL y0, y2; \ // y2 = S1 + CH // --
	; \
	ADDL h, d; \ // d = k + w + h + d // --
	ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	ADDL y1, h; \ // h = k + w + h + S0 // --
	ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // --

#define DO_ROUND_N_1(disp, a, b, c, d, e, f, g, h, old_h) \
	; \ // ################################### RND N + 1 ###########################
	ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	MOVL f, y2; \ // y2 = f // CH
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	XORL g, y2; \ // y2 = f^g // CH
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	ADDL y3, old_h; \ // h = t1 + S0 + MAJ // --
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	MOVL a, y3; \ // y3 = a // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	ADDL (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL c, y3; \ // y3 = a|c // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	ANDL c, T1; \ // T1 = a&c // MAJB
	ADDL y0, y2; \ // y2 = S1 + CH // --
	; \
	ADDL h, d; \ // d = k + w + h + d // --
	ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	ADDL y1, h; \ // h = k + w + h + S0 // --
	; \
	ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // --

#define DO_ROUND_N_2(disp, a, b, c, d, e, f, g, h, old_h) \
	; \ // ################################### RND N + 2 ##############################
	ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	MOVL f, y2; \ // y2 = f // CH
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	XORL g, y2; \ // y2 = f^g // CH
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	ADDL y3, old_h; \ // h = t1 + S0 + MAJ // --
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	MOVL a, y3; \ // y3 = a // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL c, y3; \ // y3 = a|c // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	ANDL c, T1; \ // T1 = a&c // MAJB
	ADDL y0, y2; \ // y2 = S1 + CH // --
	; \
	ADDL h, d; \ // d = k + w + h + d // --
	ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	ADDL y1, h; \ // h = k + w + h + S0 // --
	; \
	ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // --

#define DO_ROUND_N_3(disp, a, b, c, d, e, f, g, h, old_h) \
	; \ // ################################### RND N + 3 ###########################
	ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	MOVL f, y2; \ // y2 = f // CH
	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
	XORL g, y2; \ // y2 = f^g // CH
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
	ANDL e, y2; \ // y2 = (f^g)&e // CH
	ADDL y3, old_h; \ // h = t1 + S0 + MAJ // --
	; \
	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
	XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
	MOVL a, y3; \ // y3 = a // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
	ADDL (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL c, y3; \ // y3 = a|c // MAJA
	; \
	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
	MOVL a, T1; \ // T1 = a // MAJB
	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
	ANDL c, T1; \ // T1 = a&c // MAJB
	ADDL y0, y2; \ // y2 = S1 + CH // --
	; \
	ADDL h, d; \ // d = k + w + h + d // --
	ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
	ADDL y1, h; \ // h = k + w + h + S0 // --
	; \
	ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
	; \
	ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	; \
	ADDL y3, h // h = t1 + S0 + MAJ // --

// Definitions for sha-ni version
//
// The sha-ni implementation uses Intel(R) SHA extensions SHA256RNDS2, SHA256MSG1, SHA256MSG2
// It also reuses portions of the flip_mask (half) and K256 table (stride 32) from the avx2 version
//
// Reference
// S. Gulley, et al, "New Instructions Supporting the Secure Hash
// Algorithm on Intel® Architecture Processors", July 2013
// https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
//

#define digestPtr DI // input/output, base pointer to digest hash vector H0, H1, ..., H7
#define dataPtr SI // input, base pointer to first input data block
#define numBytes DX // input, number of input bytes to be processed
#define sha256Constants AX // round constants from K256 table, indexed by round number x 32
#define msg X0 // input data
#define state0 X1 // round intermediates and outputs
#define state1 X2
#define m0 X3 // m0, m1,... m4 -- round message temps
#define m1 X4
#define m2 X5
#define m3 X6
#define m4 X7
#define shufMask X8 // input data endian conversion control mask
#define abefSave X9 // digest hash vector inter-block buffer abef
#define cdghSave X10 // digest hash vector inter-block buffer cdgh

#define nop(m,a) // nop instead of final SHA256MSG1 for first and last few rounds

#define sha256msg1(m,a) \ // final SHA256MSG1 for middle rounds that require it
	SHA256MSG1 m, a

#define vmov(a,b) \ // msg copy for all but rounds 12-15
	VMOVDQA a, b

#define vmovrev(a,b) \ // reverse copy for rounds 12-15
	VMOVDQA b, a

// sha rounds 0 to 11
// identical with the exception of the final msg op
// which is replaced with a nop for rounds where it is not needed
// refer to Gulley, et al for more information
#define rounds0to11(m,a,c,sha256Msg1) \
	VMOVDQU c*16(dataPtr), msg \
	PSHUFB shufMask, msg \
	VMOVDQA msg, m \
	PADDD (c*32)(sha256Constants), msg \
	SHA256RNDS2 msg, state0, state1 \
	PSHUFD $0x0e, msg, msg \
	SHA256RNDS2 msg, state1, state0 \
	sha256Msg1 (m,a)

// sha rounds 12 to 59
// identical with the exception of the final msg op
// and the reverse copy(m,msg) in round 12 which is required
// after the last data load
// refer to Gulley, et al for more information
#define rounds12to59(m,c,a,t,sha256Msg1,movop) \
	movop (m,msg) \
	PADDD (c*32)(sha256Constants), msg \
	SHA256RNDS2 msg, state0, state1 \
	VMOVDQA m, m4 \
	PALIGNR $4, a, m4 \
	PADDD m4, t \
	SHA256MSG2 m, t \
	PSHUFD $0x0e, msg, msg \
	SHA256RNDS2 msg, state1, state0 \
	sha256Msg1 (m,a)

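// The block entry point below selects one of three implementations based on
// the ·useSHA and ·useAVX2 feature flags; in Go terms the control flow is
// roughly the sketch below (illustrative only -- the helper names are
// hypothetical, the real code simply jumps to the sha_ni/avx2 labels within
// this one function):
//
//	func block(dig *digest, p []byte) {
//		switch {
//		case useSHA:
//			blockSHANI(dig, p) // SHA extensions path (sha_ni label)
//		case useAVX2:
//			blockAVX2(dig, p) // AVX2 path (avx2 label)
//		default:
//			blockAMD64(dig, p) // plain AMD64 path directly below
//		}
//	}
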
TEXT ·block(SB), 0, $536-32
	CMPB ·useSHA(SB), $1
	JE sha_ni
	CMPB ·useAVX2(SB), $1
	JE avx2

	MOVQ p_base+8(FP), SI
	MOVQ p_len+16(FP), DX
	SHRQ $6, DX
	SHLQ $6, DX

	LEAQ (SI)(DX*1), DI
	MOVQ DI, 256(SP)
	CMPQ SI, DI
	JEQ end

	MOVQ dig+0(FP), BP
	MOVL (0*4)(BP), R8  // a = H0
	MOVL (1*4)(BP), R9  // b = H1
	MOVL (2*4)(BP), R10 // c = H2
	MOVL (3*4)(BP), R11 // d = H3
	MOVL (4*4)(BP), R12 // e = H4
	MOVL (5*4)(BP), R13 // f = H5
	MOVL (6*4)(BP), R14 // g = H6
	MOVL (7*4)(BP), R15 // h = H7

loop:
	MOVQ SP, BP

	SHA256ROUND0(0, 0x428a2f98, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND0(1, 0x71374491, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND0(2, 0xb5c0fbcf, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND0(3, 0xe9b5dba5, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND0(4, 0x3956c25b, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND0(5, 0x59f111f1, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND0(6, 0x923f82a4, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND0(7, 0xab1c5ed5, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND0(8, 0xd807aa98, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND0(9, 0x12835b01, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND0(10, 0x243185be, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND0(11, 0x550c7dc3, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND0(12, 0x72be5d74, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND0(13, 0x80deb1fe, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND0(14, 0x9bdc06a7, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND0(15, 0xc19bf174, R9, R10, R11, R12, R13, R14, R15, R8)

	SHA256ROUND1(16, 0xe49b69c1, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(17, 0xefbe4786, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(18, 0x0fc19dc6, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(19, 0x240ca1cc, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(20, 0x2de92c6f, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(21, 0x4a7484aa, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(22, 0x5cb0a9dc, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(23, 0x76f988da, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(24, 0x983e5152, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(25, 0xa831c66d, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(26, 0xb00327c8, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(27, 0xbf597fc7, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(28, 0xc6e00bf3, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(29, 0xd5a79147, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(30, 0x06ca6351, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(31, 0x14292967, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(32, 0x27b70a85, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(33, 0x2e1b2138, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(34, 0x4d2c6dfc, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(35, 0x53380d13, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(36, 0x650a7354, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(37, 0x766a0abb, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(38, 0x81c2c92e, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(39, 0x92722c85, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(40, 0xa2bfe8a1, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(41, 0xa81a664b, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(42, 0xc24b8b70, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(43, 0xc76c51a3, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(44, 0xd192e819, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(45, 0xd6990624, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(46, 0xf40e3585, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(47, 0x106aa070, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(48, 0x19a4c116, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(49, 0x1e376c08, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(50, 0x2748774c, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(51, 0x34b0bcb5, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(52, 0x391c0cb3, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(53, 0x4ed8aa4a, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(54, 0x5b9cca4f, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(55, 0x682e6ff3, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(56, 0x748f82ee, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(57, 0x78a5636f, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(58, 0x84c87814, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(59, 0x8cc70208, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(60, 0x90befffa, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(61, 0xa4506ceb, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(62, 0xbef9a3f7, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(63, 0xc67178f2, R9, R10, R11, R12, R13, R14, R15, R8)

	MOVQ dig+0(FP), BP
	ADDL (0*4)(BP), R8 // H0 = a + H0
	MOVL R8, (0*4)(BP)
	ADDL (1*4)(BP), R9 // H1 = b + H1
	MOVL R9, (1*4)(BP)
	ADDL (2*4)(BP), R10 // H2 = c + H2
	MOVL R10, (2*4)(BP)
	ADDL (3*4)(BP), R11 // H3 = d + H3
	MOVL R11, (3*4)(BP)
	ADDL (4*4)(BP), R12 // H4 = e + H4
	MOVL R12, (4*4)(BP)
	ADDL (5*4)(BP), R13 // H5 = f + H5
	MOVL R13, (5*4)(BP)
	ADDL (6*4)(BP), R14 // H6 = g + H6
	MOVL R14, (6*4)(BP)
	ADDL (7*4)(BP), R15 // H7 = h + H7
	MOVL R15, (7*4)(BP)

	ADDQ $64, SI
	CMPQ SI, 256(SP)
	JB loop

end:
	RET

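// The AVX2 path below processes the input two blocks at a time: the
// byte-swapped words of the first block sit in the low 128-bit lanes of
// XDWORD0-XDWORD3 and the words of the following block in the high lanes.
// avx2_loop1/avx2_loop2 run the 64 rounds for the first block while saving
// the W+K values of both lanes to the _XFER area, and avx2_loop3 then replays
// the rounds for the second block from the saved high-lane values (offset
// +16). In rough Go-style pseudocode (a sketch, not literal code):
//
//	for len(p) > 64 { // at least two blocks remain
//		schedule W+K for p[:64] and p[64:128] together (saved to _XFER)
//		run rounds for p[:64]; add into dig
//		run rounds for p[64:128] from the saved W+K; add into dig
//		p = p[128:]
//	}
//	if len(p) == 64 { process the remaining single block (avx2_do_last_block) }
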
avx2:
	MOVQ dig+0(FP), CTX // d.h[8]
	MOVQ p_base+8(FP), INP
	MOVQ p_len+16(FP), NUM_BYTES

	LEAQ -64(INP)(NUM_BYTES*1), NUM_BYTES // Pointer to the last block
	MOVQ NUM_BYTES, _INP_END(SP)

	CMPQ NUM_BYTES, INP
	JE avx2_only_one_block

	// Load initial digest
	MOVL 0(CTX), a  // a = H0
	MOVL 4(CTX), b  // b = H1
	MOVL 8(CTX), c  // c = H2
	MOVL 12(CTX), d // d = H3
	MOVL 16(CTX), e // e = H4
	MOVL 20(CTX), f // f = H5
	MOVL 24(CTX), g // g = H6
	MOVL 28(CTX), h // h = H7

avx2_loop0: // at each iteration works with one block (512 bit)

	VMOVDQU (0*32)(INP), XTMP0
	VMOVDQU (1*32)(INP), XTMP1
	VMOVDQU (2*32)(INP), XTMP2
	VMOVDQU (3*32)(INP), XTMP3

	VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK

	// Apply Byte Flip Mask: LE -> BE
	VPSHUFB BYTE_FLIP_MASK, XTMP0, XTMP0
	VPSHUFB BYTE_FLIP_MASK, XTMP1, XTMP1
	VPSHUFB BYTE_FLIP_MASK, XTMP2, XTMP2
	VPSHUFB BYTE_FLIP_MASK, XTMP3, XTMP3

	// Transpose data into high/low parts
	VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w3, w2, w1, w0
	VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w7, w6, w5, w4
	VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w11, w10, w9, w8
	VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w15, w14, w13, w12

	MOVQ $K256<>(SB), TBL // Loading address of table with round-specific constants

avx2_last_block_enter:
	ADDQ $64, INP
	MOVQ INP, _INP(SP)
	XORQ SRND, SRND

avx2_loop1: // for w0 - w47
	// Do 4 rounds and scheduling
	VPADDD 0*32(TBL)(SRND*1), XDWORD0, XFER
	VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

	// Do 4 rounds and scheduling
	VPADDD 1*32(TBL)(SRND*1), XDWORD1, XFER
	VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)

	// Do 4 rounds and scheduling
	VPADDD 2*32(TBL)(SRND*1), XDWORD2, XFER
	VMOVDQU XFER, (_XFER + 2*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_1(_XFER + 2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_2(_XFER + 2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_3(_XFER + 2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)

	// Do 4 rounds and scheduling
	VPADDD 3*32(TBL)(SRND*1), XDWORD3, XFER
	VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_1(_XFER + 3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_2(_XFER + 3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_3(_XFER + 3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)

	ADDQ $4*32, SRND
	CMPQ SRND, $3*4*32
	JB avx2_loop1

avx2_loop2:
	// w48 - w63 processed with no scheduling (last 16 rounds)
	VPADDD 0*32(TBL)(SRND*1), XDWORD0, XFER
	VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
	DO_ROUND_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, h)
	DO_ROUND_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, h)
	DO_ROUND_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, g)
	DO_ROUND_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, f)

	VPADDD 1*32(TBL)(SRND*1), XDWORD1, XFER
	VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
	DO_ROUND_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, e)
	DO_ROUND_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, d)
	DO_ROUND_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, c)
	DO_ROUND_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, b)

	ADDQ $2*32, SRND

	VMOVDQU XDWORD2, XDWORD0
	VMOVDQU XDWORD3, XDWORD1

	CMPQ SRND, $4*4*32
	JB avx2_loop2

	MOVQ dig+0(FP), CTX // d.h[8]
	MOVQ _INP(SP), INP

	addm( 0(CTX), a)
	addm( 4(CTX), b)
	addm( 8(CTX), c)
	addm( 12(CTX), d)
	addm( 16(CTX), e)
	addm( 20(CTX), f)
	addm( 24(CTX), g)
	addm( 28(CTX), h)

	CMPQ _INP_END(SP), INP
	JB done_hash

	XORQ SRND, SRND

avx2_loop3: // Do second block using previously scheduled results
	DO_ROUND_N_0(_XFER + 0*32 + 16, a, b, c, d, e, f, g, h, a)
	DO_ROUND_N_1(_XFER + 0*32 + 16, h, a, b, c, d, e, f, g, h)
	DO_ROUND_N_2(_XFER + 0*32 + 16, g, h, a, b, c, d, e, f, g)
	DO_ROUND_N_3(_XFER + 0*32 + 16, f, g, h, a, b, c, d, e, f)

	DO_ROUND_N_0(_XFER + 1*32 + 16, e, f, g, h, a, b, c, d, e)
	DO_ROUND_N_1(_XFER + 1*32 + 16, d, e, f, g, h, a, b, c, d)
	DO_ROUND_N_2(_XFER + 1*32 + 16, c, d, e, f, g, h, a, b, c)
	DO_ROUND_N_3(_XFER + 1*32 + 16, b, c, d, e, f, g, h, a, b)

	ADDQ $2*32, SRND
	CMPQ SRND, $4*4*32
	JB avx2_loop3

	MOVQ dig+0(FP), CTX // d.h[8]
	MOVQ _INP(SP), INP
	ADDQ $64, INP

	addm( 0(CTX), a)
	addm( 4(CTX), b)
	addm( 8(CTX), c)
	addm( 12(CTX), d)
	addm( 16(CTX), e)
	addm( 20(CTX), f)
	addm( 24(CTX), g)
	addm( 28(CTX), h)

	CMPQ _INP_END(SP), INP
	JA avx2_loop0
	JB done_hash

avx2_do_last_block:

	VMOVDQU 0(INP), XWORD0
	VMOVDQU 16(INP), XWORD1
	VMOVDQU 32(INP), XWORD2
	VMOVDQU 48(INP), XWORD3

	VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK

	VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
	VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
	VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
	VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3

	MOVQ $K256<>(SB), TBL

	JMP avx2_last_block_enter

avx2_only_one_block:
	// Load initial digest
	MOVL 0(CTX), a  // a = H0
	MOVL 4(CTX), b  // b = H1
	MOVL 8(CTX), c  // c = H2
	MOVL 12(CTX), d // d = H3
	MOVL 16(CTX), e // e = H4
	MOVL 20(CTX), f // f = H5
	MOVL 24(CTX), g // g = H6
	MOVL 28(CTX), h // h = H7

	JMP avx2_do_last_block

done_hash:
	VZEROUPPER
	RET

sha_ni:
	MOVQ dig+0(FP), digestPtr // init digest hash vector H0, H1,..., H7 pointer
	MOVQ p_base+8(FP), dataPtr // init input data base pointer
	MOVQ p_len+16(FP), numBytes // get number of input bytes to hash
	SHRQ $6, numBytes // force modulo 64 input buffer length
	SHLQ $6, numBytes
	CMPQ numBytes, $0 // exit early for zero-length input buffer
	JEQ done
	ADDQ dataPtr, numBytes // point numBytes to end of input buffer
	VMOVDQU (0*16)(digestPtr), state0 // load initial hash values and reorder
	VMOVDQU (1*16)(digestPtr), state1 // DCBA, HGFE -> ABEF, CDGH
	PSHUFD $0xb1, state0, state0 // CDAB
	PSHUFD $0x1b, state1, state1 // EFGH
	VMOVDQA state0, m4
	PALIGNR $8, state1, state0 // ABEF
	PBLENDW $0xf0, m4, state1 // CDGH
	VMOVDQA flip_mask<>(SB), shufMask
	LEAQ K256<>(SB), sha256Constants

roundLoop:
	// save hash values for addition after rounds
	VMOVDQA state0, abefSave
	VMOVDQA state1, cdghSave

	// do rounds 0-59
	rounds0to11 (m0,-,0,nop) // 0-3
	rounds0to11 (m1,m0,1,sha256msg1) // 4-7
	rounds0to11 (m2,m1,2,sha256msg1) // 8-11
	VMOVDQU (3*16)(dataPtr), msg
	PSHUFB shufMask, msg
	rounds12to59 (m3,3,m2,m0,sha256msg1,vmovrev) // 12-15
	rounds12to59 (m0,4,m3,m1,sha256msg1,vmov) // 16-19
	rounds12to59 (m1,5,m0,m2,sha256msg1,vmov) // 20-23
	rounds12to59 (m2,6,m1,m3,sha256msg1,vmov) // 24-27
	rounds12to59 (m3,7,m2,m0,sha256msg1,vmov) // 28-31
	rounds12to59 (m0,8,m3,m1,sha256msg1,vmov) // 32-35
	rounds12to59 (m1,9,m0,m2,sha256msg1,vmov) // 36-39
	rounds12to59 (m2,10,m1,m3,sha256msg1,vmov) // 40-43
	rounds12to59 (m3,11,m2,m0,sha256msg1,vmov) // 44-47
	rounds12to59 (m0,12,m3,m1,sha256msg1,vmov) // 48-51
	rounds12to59 (m1,13,m0,m2,nop,vmov) // 52-55
	rounds12to59 (m2,14,m1,m3,nop,vmov) // 56-59

	// do rounds 60-63
	VMOVDQA m3, msg
	PADDD (15*32)(sha256Constants), msg
	SHA256RNDS2 msg, state0, state1
	PSHUFD $0x0e, msg, msg
	SHA256RNDS2 msg, state1, state0

	// add current hash values with previously saved
	PADDD abefSave, state0
	PADDD cdghSave, state1

	// advance data pointer; loop until buffer empty
	ADDQ $64, dataPtr
	CMPQ numBytes, dataPtr
	JNE roundLoop

	// write hash values back in the correct order
	PSHUFD $0x1b, state0, state0 // FEBA
	PSHUFD $0xb1, state1, state1 // DCHG
	VMOVDQA state0, m4
	PBLENDW $0xf0, state1, state0 // DCBA
	PALIGNR $8, m4, state1 // HGFE
	VMOVDQU state0, (0*16)(digestPtr)
	VMOVDQU state1, (1*16)(digestPtr)

done:
	RET

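// A note on the tables that follow: flip_mask and K256 are laid out for the
// AVX2 code, with each 16-byte pattern / group of four round constants
// duplicated so that both 128-bit lanes of a ymm register see the same
// values; the sha-ni code reuses the low half of flip_mask and walks K256
// with a 32-byte stride. A rough Go sketch of how such a duplicated table
// relates to the canonical 64 constants k[0..63] (illustrative only; nothing
// in the build generates the table this way):
//
//	var k256dup [128]uint32
//	for i := 0; i < 64; i += 4 {
//		copy(k256dup[2*i:2*i+4], k[i:i+4])   // low 128-bit lane
//		copy(k256dup[2*i+4:2*i+8], k[i:i+4]) // high 128-bit lane
//	}
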
// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
DATA flip_mask<>+0x10(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x18(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), 8, $32

// shuffle xBxA -> 00BA
DATA shuff_00BA<>+0x00(SB)/8, $0x0b0a090803020100
DATA shuff_00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_00BA<>+0x10(SB)/8, $0x0b0a090803020100
DATA shuff_00BA<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
GLOBL shuff_00BA<>(SB), 8, $32

// shuffle xDxC -> DC00
DATA shuff_DC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_DC00<>+0x08(SB)/8, $0x0b0a090803020100
DATA shuff_DC00<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_DC00<>+0x18(SB)/8, $0x0b0a090803020100
GLOBL shuff_DC00<>(SB), 8, $32

// Round specific constants
DATA K256<>+0x00(SB)/4, $0x428a2f98 // k1
DATA K256<>+0x04(SB)/4, $0x71374491 // k2
DATA K256<>+0x08(SB)/4, $0xb5c0fbcf // k3
DATA K256<>+0x0c(SB)/4, $0xe9b5dba5 // k4
DATA K256<>+0x10(SB)/4, $0x428a2f98 // k1
DATA K256<>+0x14(SB)/4, $0x71374491 // k2
DATA K256<>+0x18(SB)/4, $0xb5c0fbcf // k3
DATA K256<>+0x1c(SB)/4, $0xe9b5dba5 // k4

DATA K256<>+0x20(SB)/4, $0x3956c25b // k5 - k8
DATA K256<>+0x24(SB)/4, $0x59f111f1
DATA K256<>+0x28(SB)/4, $0x923f82a4
DATA K256<>+0x2c(SB)/4, $0xab1c5ed5
DATA K256<>+0x30(SB)/4, $0x3956c25b
DATA K256<>+0x34(SB)/4, $0x59f111f1
DATA K256<>+0x38(SB)/4, $0x923f82a4
DATA K256<>+0x3c(SB)/4, $0xab1c5ed5

DATA K256<>+0x40(SB)/4, $0xd807aa98 // k9 - k12
DATA K256<>+0x44(SB)/4, $0x12835b01
DATA K256<>+0x48(SB)/4, $0x243185be
DATA K256<>+0x4c(SB)/4, $0x550c7dc3
DATA K256<>+0x50(SB)/4, $0xd807aa98
DATA K256<>+0x54(SB)/4, $0x12835b01
DATA K256<>+0x58(SB)/4, $0x243185be
DATA K256<>+0x5c(SB)/4, $0x550c7dc3

DATA K256<>+0x60(SB)/4, $0x72be5d74 // k13 - k16
DATA K256<>+0x64(SB)/4, $0x80deb1fe
DATA K256<>+0x68(SB)/4, $0x9bdc06a7
DATA K256<>+0x6c(SB)/4, $0xc19bf174
DATA K256<>+0x70(SB)/4, $0x72be5d74
DATA K256<>+0x74(SB)/4, $0x80deb1fe
DATA K256<>+0x78(SB)/4, $0x9bdc06a7
DATA K256<>+0x7c(SB)/4, $0xc19bf174

DATA K256<>+0x80(SB)/4, $0xe49b69c1 // k17 - k20
DATA K256<>+0x84(SB)/4, $0xefbe4786
DATA K256<>+0x88(SB)/4, $0x0fc19dc6
DATA K256<>+0x8c(SB)/4, $0x240ca1cc
DATA K256<>+0x90(SB)/4, $0xe49b69c1
DATA K256<>+0x94(SB)/4, $0xefbe4786
DATA K256<>+0x98(SB)/4, $0x0fc19dc6
DATA K256<>+0x9c(SB)/4, $0x240ca1cc

DATA K256<>+0xa0(SB)/4, $0x2de92c6f // k21 - k24
DATA K256<>+0xa4(SB)/4, $0x4a7484aa
DATA K256<>+0xa8(SB)/4, $0x5cb0a9dc
DATA K256<>+0xac(SB)/4, $0x76f988da
DATA K256<>+0xb0(SB)/4, $0x2de92c6f
DATA K256<>+0xb4(SB)/4, $0x4a7484aa
DATA K256<>+0xb8(SB)/4, $0x5cb0a9dc
DATA K256<>+0xbc(SB)/4, $0x76f988da

DATA K256<>+0xc0(SB)/4, $0x983e5152 // k25 - k28
DATA K256<>+0xc4(SB)/4, $0xa831c66d
DATA K256<>+0xc8(SB)/4, $0xb00327c8
DATA K256<>+0xcc(SB)/4, $0xbf597fc7
DATA K256<>+0xd0(SB)/4, $0x983e5152
DATA K256<>+0xd4(SB)/4, $0xa831c66d
DATA K256<>+0xd8(SB)/4, $0xb00327c8
DATA K256<>+0xdc(SB)/4, $0xbf597fc7

DATA K256<>+0xe0(SB)/4, $0xc6e00bf3 // k29 - k32
DATA K256<>+0xe4(SB)/4, $0xd5a79147
DATA K256<>+0xe8(SB)/4, $0x06ca6351
DATA K256<>+0xec(SB)/4, $0x14292967
DATA K256<>+0xf0(SB)/4, $0xc6e00bf3
DATA K256<>+0xf4(SB)/4, $0xd5a79147
DATA K256<>+0xf8(SB)/4, $0x06ca6351
DATA K256<>+0xfc(SB)/4, $0x14292967

DATA K256<>+0x100(SB)/4, $0x27b70a85
DATA K256<>+0x104(SB)/4, $0x2e1b2138
DATA K256<>+0x108(SB)/4, $0x4d2c6dfc
DATA K256<>+0x10c(SB)/4, $0x53380d13
DATA K256<>+0x110(SB)/4, $0x27b70a85
DATA K256<>+0x114(SB)/4, $0x2e1b2138
DATA K256<>+0x118(SB)/4, $0x4d2c6dfc
DATA K256<>+0x11c(SB)/4, $0x53380d13

DATA K256<>+0x120(SB)/4, $0x650a7354
DATA K256<>+0x124(SB)/4, $0x766a0abb
DATA K256<>+0x128(SB)/4, $0x81c2c92e
DATA K256<>+0x12c(SB)/4, $0x92722c85
DATA K256<>+0x130(SB)/4, $0x650a7354
DATA K256<>+0x134(SB)/4, $0x766a0abb
DATA K256<>+0x138(SB)/4, $0x81c2c92e
DATA K256<>+0x13c(SB)/4, $0x92722c85

DATA K256<>+0x140(SB)/4, $0xa2bfe8a1
DATA K256<>+0x144(SB)/4, $0xa81a664b
DATA K256<>+0x148(SB)/4, $0xc24b8b70
DATA K256<>+0x14c(SB)/4, $0xc76c51a3
DATA K256<>+0x150(SB)/4, $0xa2bfe8a1
DATA K256<>+0x154(SB)/4, $0xa81a664b
DATA K256<>+0x158(SB)/4, $0xc24b8b70
DATA K256<>+0x15c(SB)/4, $0xc76c51a3

DATA K256<>+0x160(SB)/4, $0xd192e819
DATA K256<>+0x164(SB)/4, $0xd6990624
DATA K256<>+0x168(SB)/4, $0xf40e3585
DATA K256<>+0x16c(SB)/4, $0x106aa070
DATA K256<>+0x170(SB)/4, $0xd192e819
DATA K256<>+0x174(SB)/4, $0xd6990624
DATA K256<>+0x178(SB)/4, $0xf40e3585
DATA K256<>+0x17c(SB)/4, $0x106aa070

DATA K256<>+0x180(SB)/4, $0x19a4c116
DATA K256<>+0x184(SB)/4, $0x1e376c08
DATA K256<>+0x188(SB)/4, $0x2748774c
DATA K256<>+0x18c(SB)/4, $0x34b0bcb5
DATA K256<>+0x190(SB)/4, $0x19a4c116
DATA K256<>+0x194(SB)/4, $0x1e376c08
DATA K256<>+0x198(SB)/4, $0x2748774c
DATA K256<>+0x19c(SB)/4, $0x34b0bcb5

DATA K256<>+0x1a0(SB)/4, $0x391c0cb3
DATA K256<>+0x1a4(SB)/4, $0x4ed8aa4a
DATA K256<>+0x1a8(SB)/4, $0x5b9cca4f
DATA K256<>+0x1ac(SB)/4, $0x682e6ff3
DATA K256<>+0x1b0(SB)/4, $0x391c0cb3
DATA K256<>+0x1b4(SB)/4, $0x4ed8aa4a
DATA K256<>+0x1b8(SB)/4, $0x5b9cca4f
DATA K256<>+0x1bc(SB)/4, $0x682e6ff3

DATA K256<>+0x1c0(SB)/4, $0x748f82ee
DATA K256<>+0x1c4(SB)/4, $0x78a5636f
DATA K256<>+0x1c8(SB)/4, $0x84c87814
DATA K256<>+0x1cc(SB)/4, $0x8cc70208
DATA K256<>+0x1d0(SB)/4, $0x748f82ee
DATA K256<>+0x1d4(SB)/4, $0x78a5636f
DATA K256<>+0x1d8(SB)/4, $0x84c87814
DATA K256<>+0x1dc(SB)/4, $0x8cc70208

DATA K256<>+0x1e0(SB)/4, $0x90befffa
DATA K256<>+0x1e4(SB)/4, $0xa4506ceb
DATA K256<>+0x1e8(SB)/4, $0xbef9a3f7
DATA K256<>+0x1ec(SB)/4, $0xc67178f2
DATA K256<>+0x1f0(SB)/4, $0x90befffa
DATA K256<>+0x1f4(SB)/4, $0xa4506ceb
DATA K256<>+0x1f8(SB)/4, $0xbef9a3f7
DATA K256<>+0x1fc(SB)/4, $0xc67178f2

GLOBL K256<>(SB), (NOPTR + RODATA), $512