github.com/slayercat/go@v0.0.0-20170428012452-c51559813f61/src/crypto/sha256/sha256block_amd64.s

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "textflag.h"
     6  
     7  // SHA256 block routine. See sha256block.go for Go equivalent.
     8  //
     9  // The algorithm is detailed in FIPS 180-4:
    10  //
    11  //  http://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
    12  
    13  // The AVX2 version is described in an Intel white paper:
    14  // "Fast SHA-256 Implementations on Intel Architecture Processors"
    15  // To find it, go to http://www.intel.com/p/en_US/embedded
    16  // and search for that title.
    17  // AVX2 version by Intel, same algorithm as code in Linux kernel:
    18  // https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S
    19  // by
    20  //     James Guilford <james.guilford@intel.com>
    21  //     Kirk Yap <kirk.s.yap@intel.com>
    22  //     Tim Chen <tim.c.chen@linux.intel.com>
    23  
    24  // Wt = Mt; for 0 <= t <= 15
    25  // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
    26  //
    27  // a = H0
    28  // b = H1
    29  // c = H2
    30  // d = H3
    31  // e = H4
    32  // f = H5
    33  // g = H6
    34  // h = H7
    35  //
    36  // for t = 0 to 63 {
    37  //    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
    38  //    T2 = BIGSIGMA0(a) + Maj(a,b,c)
    39  //    h = g
    40  //    g = f
    41  //    f = e
    42  //    e = d + T1
    43  //    d = c
    44  //    c = b
    45  //    b = a
    46  //    a = T1 + T2
    47  // }
    48  //
    49  // H0 = a + H0
    50  // H1 = b + H1
    51  // H2 = c + H2
    52  // H3 = d + H3
    53  // H4 = e + H4
    54  // H5 = f + H5
    55  // H6 = g + H6
    56  // H7 = h + H7
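        //
        // For reference, the pseudocode above is roughly the following Go.
        // This is only a sketch: the function and parameter names and the use
        // of math/bits are assumptions made here, not the code in
        // sha256block.go.
        //
        //	// compressGeneric runs the 64 rounds over one expanded message
        //	// schedule w with constants k and folds the result into digest h.
        //	func compressGeneric(h *[8]uint32, k, w *[64]uint32) {
        //		a, b, c, d, e, f, g, hh := h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7]
        //		for t := 0; t < 64; t++ {
        //			s1 := bits.RotateLeft32(e, -6) ^ bits.RotateLeft32(e, -11) ^ bits.RotateLeft32(e, -25)
        //			ch := (e & f) ^ (^e & g)
        //			t1 := hh + s1 + ch + k[t] + w[t]
        //			s0 := bits.RotateLeft32(a, -2) ^ bits.RotateLeft32(a, -13) ^ bits.RotateLeft32(a, -22)
        //			maj := (a & b) ^ (a & c) ^ (b & c)
        //			t2 := s0 + maj
        //			hh, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2
        //		}
        //		h[0] += a; h[1] += b; h[2] += c; h[3] += d
        //		h[4] += e; h[5] += f; h[6] += g; h[7] += hh
        //	}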
    57  
    58  // Wt = Mt; for 0 <= t <= 15
    59  #define MSGSCHEDULE0(index) \
    60  	MOVL	(index*4)(SI), AX; \
    61  	BSWAPL	AX; \
    62  	MOVL	AX, (index*4)(BP)
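
        // In Go terms this macro is a big-endian load of message word "index"
        // (a sketch; the slice p and the use of encoding/binary are
        // assumptions for illustration):
        //
        //	w[index] = binary.BigEndian.Uint32(p[index*4:])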
    63  
    64  // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
    65  //   SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x)
    66  //   SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x)
    67  #define MSGSCHEDULE1(index) \
    68  	MOVL	((index-2)*4)(BP), AX; \
    69  	MOVL	AX, CX; \
    70  	RORL	$17, AX; \
    71  	MOVL	CX, DX; \
    72  	RORL	$19, CX; \
    73  	SHRL	$10, DX; \
    74  	MOVL	((index-15)*4)(BP), BX; \
    75  	XORL	CX, AX; \
    76  	MOVL	BX, CX; \
    77  	XORL	DX, AX; \
    78  	RORL	$7, BX; \
    79  	MOVL	CX, DX; \
    80  	SHRL	$3, DX; \
    81  	RORL	$18, CX; \
    82  	ADDL	((index-7)*4)(BP), AX; \
    83  	XORL	CX, BX; \
    84  	XORL	DX, BX; \
    85  	ADDL	((index-16)*4)(BP), BX; \
    86  	ADDL	BX, AX; \
    87  	MOVL	AX, ((index)*4)(BP)
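
        // Equivalent Go for one schedule step (a sketch; the helper name and
        // the use of math/bits are assumptions, not code from sha256block.go):
        //
        //	func schedule1(w []uint32, t int) {
        //		v2, v15 := w[t-2], w[t-15]
        //		sigma1 := bits.RotateLeft32(v2, -17) ^ bits.RotateLeft32(v2, -19) ^ (v2 >> 10)
        //		sigma0 := bits.RotateLeft32(v15, -7) ^ bits.RotateLeft32(v15, -18) ^ (v15 >> 3)
        //		w[t] = sigma1 + w[t-7] + sigma0 + w[t-16]
        //	}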
    88  
    89  // Calculate T1 in AX - uses AX, CX and DX registers.
    90  // h is also used as an accumulator. Wt is passed in AX.
    91  //   T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
    92  //     BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x)
    93  //     Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
    94  #define SHA256T1(const, e, f, g, h) \
    95  	ADDL	AX, h; \
    96  	MOVL	e, AX; \
    97  	ADDL	$const, h; \
    98  	MOVL	e, CX; \
    99  	RORL	$6, AX; \
   100  	MOVL	e, DX; \
   101  	RORL	$11, CX; \
   102  	XORL	CX, AX; \
   103  	MOVL	e, CX; \
   104  	RORL	$25, DX; \
   105  	ANDL	f, CX; \
   106  	XORL	AX, DX; \
   107  	MOVL	e, AX; \
   108  	NOTL	AX; \
   109  	ADDL	DX, h; \
   110  	ANDL	g, AX; \
   111  	XORL	CX, AX; \
   112  	ADDL	h, AX
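
        // The same computation as a Go sketch (the function name t1 and the
        // use of math/bits are assumptions for illustration):
        //
        //	func t1(h, e, f, g, k, w uint32) uint32 {
        //		bigSigma1 := bits.RotateLeft32(e, -6) ^ bits.RotateLeft32(e, -11) ^ bits.RotateLeft32(e, -25)
        //		ch := (e & f) ^ (^e & g)
        //		return h + bigSigma1 + ch + k + w
        //	}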
   113  
   114  // Calculate T2 in BX - uses BX, CX, DX and DI registers.
   115  //   T2 = BIGSIGMA0(a) + Maj(a, b, c)
   116  //     BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x)
   117  //     Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
   118  #define SHA256T2(a, b, c) \
   119  	MOVL	a, DI; \
   120  	MOVL	c, BX; \
   121  	RORL	$2, DI; \
   122  	MOVL	a, DX; \
   123  	ANDL	b, BX; \
   124  	RORL	$13, DX; \
   125  	MOVL	a, CX; \
   126  	ANDL	c, CX; \
   127  	XORL	DX, DI; \
   128  	XORL	CX, BX; \
   129  	MOVL	a, DX; \
   130  	MOVL	b, CX; \
   131  	RORL	$22, DX; \
   132  	ANDL	a, CX; \
   133  	XORL	CX, BX; \
   134  	XORL	DX, DI; \
   135  	ADDL	DI, BX
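
        // The same computation as a Go sketch (the function name t2 is an
        // assumption for illustration):
        //
        //	func t2(a, b, c uint32) uint32 {
        //		bigSigma0 := bits.RotateLeft32(a, -2) ^ bits.RotateLeft32(a, -13) ^ bits.RotateLeft32(a, -22)
        //		maj := (a & b) ^ (a & c) ^ (b & c)
        //		return bigSigma0 + maj
        //	}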
   136  
   137  // Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
   138  // The values for e and a are stored in d and h, ready for rotation.
   139  #define SHA256ROUND(index, const, a, b, c, d, e, f, g, h) \
   140  	SHA256T1(const, e, f, g, h); \
   141  	SHA256T2(a, b, c); \
   142  	MOVL	BX, h; \
   143  	ADDL	AX, d; \
   144  	ADDL	AX, h
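
        // Only d and h are written back: each SHA256ROUND* call below passes
        // the eight registers rotated by one position, so storing the new e
        // into d and the new a into h performs the whole variable rotation
        // without moving the other six registers. In effect (Go sketch):
        //
        //	d += t1      // read as e by the next round
        //	h = t1 + t2  // read as a by the next round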
   145  
   146  #define SHA256ROUND0(index, const, a, b, c, d, e, f, g, h) \
   147  	MSGSCHEDULE0(index); \
   148  	SHA256ROUND(index, const, a, b, c, d, e, f, g, h)
   149  
   150  #define SHA256ROUND1(index, const, a, b, c, d, e, f, g, h) \
   151  	MSGSCHEDULE1(index); \
   152  	SHA256ROUND(index, const, a, b, c, d, e, f, g, h)
   153  
   154  
   155  // Definitions for AVX2 version
   156  
   157  // addm(mem, reg)
   158  // Add reg to mem using a reg-mem add and store, then reload reg from mem.
   159  #define addm(P1, P2) \
   160  	ADDL P2, P1; \
   161  	MOVL P1, P2
   162  
   163  #define XDWORD0 Y4
   164  #define XDWORD1 Y5
   165  #define XDWORD2 Y6
   166  #define XDWORD3 Y7
   167  
   168  #define XWORD0 X4
   169  #define XWORD1 X5
   170  #define XWORD2 X6
   171  #define XWORD3 X7
   172  
   173  #define XTMP0 Y0
   174  #define XTMP1 Y1
   175  #define XTMP2 Y2
   176  #define XTMP3 Y3
   177  #define XTMP4 Y8
   178  #define XTMP5 Y11
   179  
   180  #define XFER  Y9
   181  
   182  #define BYTE_FLIP_MASK 	Y13 // mask to convert LE -> BE
   183  #define X_BYTE_FLIP_MASK X13
   184  
   185  #define NUM_BYTES DX
   186  #define INP	DI
   187  
   188  #define CTX SI // Beginning of digest in memory (a, b, c, ... , h)
   189  
   190  #define a AX
   191  #define b BX
   192  #define c CX
   193  #define d R8
   194  #define e DX
   195  #define f R9
   196  #define g R10
   197  #define h R11
   198  
   199  #define old_h R11
   200  
   201  #define TBL BP
   202  
   203  #define SRND SI // SRND is the same register as CTX
   204  
   205  #define T1 R12
   206  
   207  #define y0 R13
   208  #define y1 R14
   209  #define y2 R15
   210  #define y3 DI
   211  
   212  // Offsets
   213  #define XFER_SIZE 2*64*4
   214  #define INP_END_SIZE 8
   215  #define INP_SIZE 8
   216  #define TMP_SIZE 4
   217  
   218  #define _XFER 0
   219  #define _INP_END _XFER + XFER_SIZE
   220  #define _INP _INP_END + INP_END_SIZE
   221  #define _TMP _INP + INP_SIZE
   222  #define STACK_SIZE _TMP + TMP_SIZE
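
        // With the sizes above, the offsets work out to
        //
        //	_XFER      = 0
        //	_INP_END   = 0 + 2*64*4 = 512
        //	_INP       = 512 + 8    = 520
        //	_TMP       = 520 + 8    = 528
        //	STACK_SIZE = 528 + 4    = 532
        //
        // which fits in the $536 local frame declared for ·block below.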
   223  
   224  #define ROUND_AND_SCHED_N_0(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   225  	;                                     \ // #############################  RND N + 0 ############################//
   226  	MOVL     a, y3;                       \ // y3 = a					// MAJA
   227  	RORXL    $25, e, y0;                  \ // y0 = e >> 25				// S1A
   228  	RORXL    $11, e, y1;                  \ // y1 = e >> 11				// S1B
   229  	;                                     \
   230  	ADDL     (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h        // the stack slot at disp holds k + w
   231  	ORL      c, y3;                       \ // y3 = a|c				// MAJA
   232  	VPALIGNR $4, XDWORD2, XDWORD3, XTMP0; \ // XTMP0 = W[-7]
   233  	MOVL     f, y2;                       \ // y2 = f				// CH
   234  	RORXL    $13, a, T1;                  \ // T1 = a >> 13			// S0B
   235  	;                                     \
   236  	XORL     y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)					// S1
   237  	XORL     g, y2;                       \ // y2 = f^g                              	// CH
   238  	VPADDD   XDWORD0, XTMP0, XTMP0;       \ // XTMP0 = W[-7] + W[-16]
   239  	RORXL    $6, e, y1;                   \ // y1 = (e >> 6)						// S1
   240  	;                                     \
   241  	ANDL     e, y2;                       \ // y2 = (f^g)&e                         // CH
   242  	XORL     y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
   243  	RORXL    $22, a, y1;                  \ // y1 = a >> 22							// S0A
   244  	ADDL     h, d;                        \ // d = k + w + h + d                     	// --
   245  	;                                     \
   246  	ANDL     b, y3;                       \ // y3 = (a|c)&b							// MAJA
   247  	VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15]
   248  	XORL     T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
   249  	RORXL    $2, a, T1;                   \ // T1 = (a >> 2)						// S0
   250  	;                                     \
   251  	XORL     g, y2;                       \ // y2 = CH = ((f^g)&e)^g				// CH
   252  	VPSRLD   $7, XTMP1, XTMP2;            \
   253  	XORL     T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   254  	MOVL     a, T1;                       \ // T1 = a								// MAJB
   255  	ANDL     c, T1;                       \ // T1 = a&c								// MAJB
   256  	;                                     \
   257  	ADDL     y0, y2;                      \ // y2 = S1 + CH							// --
   258  	VPSLLD   $(32-7), XTMP1, XTMP3;       \
   259  	ORL      T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
   260  	ADDL     y1, h;                       \ // h = k + w + h + S0					// --
   261  	;                                     \
   262  	ADDL     y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
   263  	VPOR     XTMP2, XTMP3, XTMP3;         \ // XTMP3 = W[-15] ror 7
   264  	;                                     \
   265  	VPSRLD   $18, XTMP1, XTMP2;           \
   266  	ADDL     y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   267  	ADDL     y3, h                        // h = t1 + S0 + MAJ                     // --
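
        // AVX2 has no 32-bit vector rotate, so the VPSRLD $7 / VPSLLD $(32-7)
        // / VPOR sequence above builds one per lane. For each 32-bit lane x
        // this is, in Go terms (a sketch):
        //
        //	ror7 := (x >> 7) | (x << (32 - 7)) // == bits.RotateLeft32(x, -7)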
   268  
   269  #define ROUND_AND_SCHED_N_1(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   270  	;                                    \ // ################################### RND N + 1 ############################
   271  	;                                    \
   272  	MOVL    a, y3;                       \ // y3 = a                       // MAJA
   273  	RORXL   $25, e, y0;                  \ // y0 = e >> 25					// S1A
   274  	RORXL   $11, e, y1;                  \ // y1 = e >> 11					// S1B
   275  	ADDL    (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h         		// --
   276  	ORL     c, y3;                       \ // y3 = a|c						// MAJA
   277  	;                                    \
   278  	VPSRLD  $3, XTMP1, XTMP4;            \ // XTMP4 = W[-15] >> 3
   279  	MOVL    f, y2;                       \ // y2 = f						// CH
   280  	RORXL   $13, a, T1;                  \ // T1 = a >> 13					// S0B
   281  	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)		// S1
   282  	XORL    g, y2;                       \ // y2 = f^g						// CH
   283  	;                                    \
   284  	RORXL   $6, e, y1;                   \ // y1 = (e >> 6)				// S1
   285  	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
   286  	RORXL   $22, a, y1;                  \ // y1 = a >> 22						// S0A
   287  	ANDL    e, y2;                       \ // y2 = (f^g)&e						// CH
   288  	ADDL    h, d;                        \ // d = k + w + h + d				// --
   289  	;                                    \
   290  	VPSLLD  $(32-18), XTMP1, XTMP1;      \
   291  	ANDL    b, y3;                       \ // y3 = (a|c)&b					// MAJA
   292  	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)		// S0
   293  	;                                    \
   294  	VPXOR   XTMP1, XTMP3, XTMP3;         \
   295  	RORXL   $2, a, T1;                   \ // T1 = (a >> 2)				// S0
   296  	XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g		// CH
   297  	;                                    \
   298  	VPXOR   XTMP2, XTMP3, XTMP3;         \ // XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
   299  	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   300  	MOVL    a, T1;                       \ // T1 = a						// MAJB
   301  	ANDL    c, T1;                       \ // T1 = a&c						// MAJB
   302  	ADDL    y0, y2;                      \ // y2 = S1 + CH					// --
   303  	;                                    \
   304  	VPXOR   XTMP4, XTMP3, XTMP1;         \ // XTMP1 = s0
   305  	VPSHUFD $0xFA, XDWORD3, XTMP2;       \ // XTMP2 = W[-2] {BBAA}
   306  	ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)             // MAJ
   307  	ADDL    y1, h;                       \ // h = k + w + h + S0                    // --
   308  	;                                    \
   309  	VPADDD  XTMP1, XTMP0, XTMP0;         \ // XTMP0 = W[-16] + W[-7] + s0
   310  	ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
   311  	ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   312  	ADDL    y3, h;                       \ // h = t1 + S0 + MAJ                     // --
   313  	;                                    \
   314  	VPSRLD  $10, XTMP2, XTMP4            // XTMP4 = W[-2] >> 10 {BBAA}
   315  
   316  #define ROUND_AND_SCHED_N_2(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   317  	;                                    \ // ################################### RND N + 2 ############################
   318  	;                                    \
   319  	MOVL    a, y3;                       \ // y3 = a							// MAJA
   320  	RORXL   $25, e, y0;                  \ // y0 = e >> 25						// S1A
   321  	ADDL    (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h        			// --
   322  	;                                    \
   323  	VPSRLQ  $19, XTMP2, XTMP3;           \ // XTMP3 = W[-2] ror 19 {xBxA}
   324  	RORXL   $11, e, y1;                  \ // y1 = e >> 11						// S1B
   325  	ORL     c, y3;                       \ // y3 = a|c                         // MAJA
   326  	MOVL    f, y2;                       \ // y2 = f                           // CH
   327  	XORL    g, y2;                       \ // y2 = f^g                         // CH
   328  	;                                    \
   329  	RORXL   $13, a, T1;                  \ // T1 = a >> 13						// S0B
   330  	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)			// S1
   331  	VPSRLQ  $17, XTMP2, XTMP2;           \ // XTMP2 = W[-2] ror 17 {xBxA}
   332  	ANDL    e, y2;                       \ // y2 = (f^g)&e						// CH
   333  	;                                    \
   334  	RORXL   $6, e, y1;                   \ // y1 = (e >> 6)					// S1
   335  	VPXOR   XTMP3, XTMP2, XTMP2;         \
   336  	ADDL    h, d;                        \ // d = k + w + h + d				// --
   337  	ANDL    b, y3;                       \ // y3 = (a|c)&b						// MAJA
   338  	;                                    \
   339  	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
   340  	RORXL   $22, a, y1;                  \ // y1 = a >> 22						// S0A
   341  	VPXOR   XTMP2, XTMP4, XTMP4;         \ // XTMP4 = s1 {xBxA}
   342  	XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g			// CH
   343  	;                                    \
   344  	MOVL    f, _TMP(SP);                 \
   345  	MOVQ    $shuff_00BA<>(SB), f;        \ // f is used to keep SHUF_00BA
   346  	VPSHUFB (f), XTMP4, XTMP4;           \ // XTMP4 = s1 {00BA}
   347  	MOVL    _TMP(SP), f;                 \ // f is restored
   348  	;                                    \
   349  	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)		// S0
   350  	RORXL   $2, a, T1;                   \ // T1 = (a >> 2)				// S0
   351  	VPADDD  XTMP4, XTMP0, XTMP0;         \ // XTMP0 = {..., ..., W[1], W[0]}
   352  	;                                    \
   353  	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)	// S0
   354  	MOVL    a, T1;                       \ // T1 = a                                // MAJB
   355  	ANDL    c, T1;                       \ // T1 = a&c                              // MAJB
   356  	ADDL    y0, y2;                      \ // y2 = S1 + CH                          // --
   357  	VPSHUFD $80, XTMP0, XTMP2;           \ // XTMP2 = W[-2] {DDCC}
   358  	;                                    \
   359  	ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)             // MAJ
   360  	ADDL    y1, h;                       \ // h = k + w + h + S0                    // --
   361  	ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
   362  	ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   363  	;                                    \
   364  	ADDL    y3, h                        // h = t1 + S0 + MAJ                     // --
   365  
   366  #define ROUND_AND_SCHED_N_3(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   367  	;                                    \ // ################################### RND N + 3 ############################
   368  	;                                    \
   369  	MOVL    a, y3;                       \ // y3 = a						// MAJA
   370  	RORXL   $25, e, y0;                  \ // y0 = e >> 25					// S1A
   371  	RORXL   $11, e, y1;                  \ // y1 = e >> 11					// S1B
   372  	ADDL    (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h				// --
   373  	ORL     c, y3;                       \ // y3 = a|c                     // MAJA
   374  	;                                    \
   375  	VPSRLD  $10, XTMP2, XTMP5;           \ // XTMP5 = W[-2] >> 10 {DDCC}
   376  	MOVL    f, y2;                       \ // y2 = f						// CH
   377  	RORXL   $13, a, T1;                  \ // T1 = a >> 13					// S0B
   378  	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)		// S1
   379  	XORL    g, y2;                       \ // y2 = f^g						// CH
   380  	;                                    \
   381  	VPSRLQ  $19, XTMP2, XTMP3;           \ // XTMP3 = W[-2] ror 19 {xDxC}
   382  	RORXL   $6, e, y1;                   \ // y1 = (e >> 6)				// S1
   383  	ANDL    e, y2;                       \ // y2 = (f^g)&e					// CH
   384  	ADDL    h, d;                        \ // d = k + w + h + d			// --
   385  	ANDL    b, y3;                       \ // y3 = (a|c)&b					// MAJA
   386  	;                                    \
   387  	VPSRLQ  $17, XTMP2, XTMP2;           \ // XTMP2 = W[-2] ror 17 {xDxC}
   388  	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
   389  	XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g			// CH
   390  	;                                    \
   391  	VPXOR   XTMP3, XTMP2, XTMP2;         \
   392  	RORXL   $22, a, y1;                  \ // y1 = a >> 22					// S0A
   393  	ADDL    y0, y2;                      \ // y2 = S1 + CH					// --
   394  	;                                    \
   395  	VPXOR   XTMP2, XTMP5, XTMP5;         \ // XTMP5 = s1 {xDxC}
   396  	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)		// S0
   397  	ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
   398  	;                                    \
   399  	RORXL   $2, a, T1;                   \ // T1 = (a >> 2)				// S0
   400  	;                                    \
   401  	MOVL    f, _TMP(SP);                 \ // Save f
   402  	MOVQ    $shuff_DC00<>(SB), f;        \ // f is used to keep SHUF_DC00
   403  	VPSHUFB (f), XTMP5, XTMP5;           \ // XTMP5 = s1 {DC00}
   404  	MOVL    _TMP(SP), f;                 \ // Restore f
   405  	;                                    \
   406  	VPADDD  XTMP0, XTMP5, XDWORD0;       \ // XDWORD0 = {W[3], W[2], W[1], W[0]}
   407  	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)	// S0
   408  	MOVL    a, T1;                       \ // T1 = a							// MAJB
   409  	ANDL    c, T1;                       \ // T1 = a&c							// MAJB
   410  	ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)		// MAJ
   411  	;                                    \
   412  	ADDL    y1, h;                       \ // h = k + w + h + S0				// --
   413  	ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   414  	ADDL    y3, h                        // h = t1 + S0 + MAJ				// --
   415  
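        // In DO_ROUND_N_1 through DO_ROUND_N_3 the two final additions into h
        // from the previous round (S1 + CH and MAJ) are deferred and applied
        // to old_h at the start of the next macro, so they overlap with that
        // round's independent work. DO_ROUND_N_3 also completes its own h at
        // the end, because the DO_ROUND_N_0 that follows performs no such
        // catch-up.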
   416  #define DO_ROUND_N_0(disp, a, b, c, d, e, f, g, h, old_h) \
   417  	;                                  \ // ################################### RND N + 0 ###########################
   418  	MOVL  f, y2;                       \ // y2 = f					// CH
   419  	RORXL $25, e, y0;                  \ // y0 = e >> 25				// S1A
   420  	RORXL $11, e, y1;                  \ // y1 = e >> 11				// S1B
   421  	XORL  g, y2;                       \ // y2 = f^g					// CH
   422  	;                                  \
   423  	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)	// S1
   424  	RORXL $6, e, y1;                   \ // y1 = (e >> 6)			// S1
   425  	ANDL  e, y2;                       \ // y2 = (f^g)&e				// CH
   426  	;                                  \
   427  	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
   428  	RORXL $13, a, T1;                  \ // T1 = a >> 13						// S0B
   429  	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g			// CH
   430  	RORXL $22, a, y1;                  \ // y1 = a >> 22						// S0A
   431  	MOVL  a, y3;                       \ // y3 = a							// MAJA
   432  	;                                  \
   433  	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)			// S0
   434  	RORXL $2, a, T1;                   \ // T1 = (a >> 2)					// S0
   435  	ADDL  (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // --
   436  	ORL   c, y3;                       \ // y3 = a|c							// MAJA
   437  	;                                  \
   438  	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)	// S0
   439  	MOVL  a, T1;                       \ // T1 = a							// MAJB
   440  	ANDL  b, y3;                       \ // y3 = (a|c)&b						// MAJA
   441  	ANDL  c, T1;                       \ // T1 = a&c							// MAJB
   442  	ADDL  y0, y2;                      \ // y2 = S1 + CH						// --
   443  	;                                  \
   444  	ADDL  h, d;                        \ // d = k + w + h + d					// --
   445  	ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
   446  	ADDL  y1, h;                       \ // h = k + w + h + S0					// --
   447  	ADDL  y2, d                        // d = k + w + h + d + S1 + CH = d + t1	// --
   448  
   449  #define DO_ROUND_N_1(disp, a, b, c, d, e, f, g, h, old_h) \
   450  	;                                  \ // ################################### RND N + 1 ###########################
   451  	ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
   452  	MOVL  f, y2;                       \ // y2 = f                                // CH
   453  	RORXL $25, e, y0;                  \ // y0 = e >> 25				// S1A
   454  	RORXL $11, e, y1;                  \ // y1 = e >> 11				// S1B
   455  	XORL  g, y2;                       \ // y2 = f^g                             // CH
   456  	;                                  \
   457  	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)				// S1
   458  	RORXL $6, e, y1;                   \ // y1 = (e >> 6)						// S1
   459  	ANDL  e, y2;                       \ // y2 = (f^g)&e                         // CH
   460  	ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ                    // --
   461  	;                                  \
   462  	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
   463  	RORXL $13, a, T1;                  \ // T1 = a >> 13							// S0B
   464  	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g                // CH
   465  	RORXL $22, a, y1;                  \ // y1 = a >> 22							// S0A
   466  	MOVL  a, y3;                       \ // y3 = a                               // MAJA
   467  	;                                  \
   468  	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
   469  	RORXL $2, a, T1;                   \ // T1 = (a >> 2)						// S0
   470  	ADDL  (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
   471  	ORL   c, y3;                       \ // y3 = a|c                             // MAJA
   472  	;                                  \
   473  	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   474  	MOVL  a, T1;                       \ // T1 = a                               // MAJB
   475  	ANDL  b, y3;                       \ // y3 = (a|c)&b                         // MAJA
   476  	ANDL  c, T1;                       \ // T1 = a&c                             // MAJB
   477  	ADDL  y0, y2;                      \ // y2 = S1 + CH                         // --
   478  	;                                  \
   479  	ADDL  h, d;                        \ // d = k + w + h + d                    // --
   480  	ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)            // MAJ
   481  	ADDL  y1, h;                       \ // h = k + w + h + S0                   // --
   482  	;                                  \
   483  	ADDL  y2, d                        // d = k + w + h + d + S1 + CH = d + t1 // --
   484  
   485  #define DO_ROUND_N_2(disp, a, b, c, d, e, f, g, h, old_h) \
   486  	;                                  \ // ################################### RND N + 2 ##############################
   487  	ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   488  	MOVL  f, y2;                       \ // y2 = f								// CH
   489  	RORXL $25, e, y0;                  \ // y0 = e >> 25							// S1A
   490  	RORXL $11, e, y1;                  \ // y1 = e >> 11							// S1B
   491  	XORL  g, y2;                       \ // y2 = f^g								// CH
   492  	;                                  \
   493  	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)				// S1
   494  	RORXL $6, e, y1;                   \ // y1 = (e >> 6)						// S1
   495  	ANDL  e, y2;                       \ // y2 = (f^g)&e							// CH
   496  	ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ					// --
   497  	;                                  \
   498  	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
   499  	RORXL $13, a, T1;                  \ // T1 = a >> 13							// S0B
   500  	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g                // CH
   501  	RORXL $22, a, y1;                  \ // y1 = a >> 22							// S0A
   502  	MOVL  a, y3;                       \ // y3 = a								// MAJA
   503  	;                                  \
   504  	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
   505  	RORXL $2, a, T1;                   \ // T1 = (a >> 2)						// S0
   506  	ADDL  (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h 	// --
   507  	ORL   c, y3;                       \ // y3 = a|c								// MAJA
   508  	;                                  \
   509  	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   510  	MOVL  a, T1;                       \ // T1 = a								// MAJB
   511  	ANDL  b, y3;                       \ // y3 = (a|c)&b							// MAJA
   512  	ANDL  c, T1;                       \ // T1 = a&c								// MAJB
   513  	ADDL  y0, y2;                      \ // y2 = S1 + CH							// --
   514  	;                                  \
   515  	ADDL  h, d;                        \ // d = k + w + h + d					// --
   516  	ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
   517  	ADDL  y1, h;                       \ // h = k + w + h + S0					// --
   518  	;                                  \
   519  	ADDL  y2, d                        // d = k + w + h + d + S1 + CH = d + t1 // --
   520  
   521  #define DO_ROUND_N_3(disp, a, b, c, d, e, f, g, h, old_h) \
   522  	;                                  \ // ################################### RND N + 3 ###########################
   523  	ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   524  	MOVL  f, y2;                       \ // y2 = f								// CH
   525  	RORXL $25, e, y0;                  \ // y0 = e >> 25							// S1A
   526  	RORXL $11, e, y1;                  \ // y1 = e >> 11							// S1B
   527  	XORL  g, y2;                       \ // y2 = f^g								// CH
   528  	;                                  \
   529  	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)				// S1
   530  	RORXL $6, e, y1;                   \ // y1 = (e >> 6)						// S1
   531  	ANDL  e, y2;                       \ // y2 = (f^g)&e							// CH
   532  	ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ					// --
   533  	;                                  \
   534  	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
   535  	RORXL $13, a, T1;                  \ // T1 = a >> 13							// S0B
   536  	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g				// CH
   537  	RORXL $22, a, y1;                  \ // y1 = a >> 22							// S0A
   538  	MOVL  a, y3;                       \ // y3 = a								// MAJA
   539  	;                                  \
   540  	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
   541  	RORXL $2, a, T1;                   \ // T1 = (a >> 2)						// S0
   542  	ADDL  (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h 	// --
   543  	ORL   c, y3;                       \ // y3 = a|c								// MAJA
   544  	;                                  \
   545  	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   546  	MOVL  a, T1;                       \ // T1 = a								// MAJB
   547  	ANDL  b, y3;                       \ // y3 = (a|c)&b							// MAJA
   548  	ANDL  c, T1;                       \ // T1 = a&c								// MAJB
   549  	ADDL  y0, y2;                      \ // y2 = S1 + CH							// --
   550  	;                                  \
   551  	ADDL  h, d;                        \ // d = k + w + h + d					// --
   552  	ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
   553  	ADDL  y1, h;                       \ // h = k + w + h + S0					// --
   554  	;                                  \
   555  	ADDL  y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1	// --
   556  	;                                  \
   557  	ADDL  y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   558  	;                                  \
   559  	ADDL  y3, h                        // h = t1 + S0 + MAJ					// --
   560  
   561  TEXT ·block(SB), 0, $536-32
   562  	CMPB runtime·support_avx2(SB), $0
   563  	JE   noavx2bmi2
   564  	CMPB runtime·support_bmi2(SB), $1  // check for RORXL instruction
   565  	JE   avx2
   566  noavx2bmi2:
   567  
   568  	MOVQ p_base+8(FP), SI
   569  	MOVQ p_len+16(FP), DX
   570  	SHRQ $6, DX
   571  	SHLQ $6, DX
   572  
   573  	LEAQ (SI)(DX*1), DI
   574  	MOVQ DI, 256(SP)
   575  	CMPQ SI, DI
   576  	JEQ  end
   577  
   578  	MOVQ dig+0(FP), BP
   579  	MOVL (0*4)(BP), R8  // a = H0
   580  	MOVL (1*4)(BP), R9  // b = H1
   581  	MOVL (2*4)(BP), R10 // c = H2
   582  	MOVL (3*4)(BP), R11 // d = H3
   583  	MOVL (4*4)(BP), R12 // e = H4
   584  	MOVL (5*4)(BP), R13 // f = H5
   585  	MOVL (6*4)(BP), R14 // g = H6
   586  	MOVL (7*4)(BP), R15 // h = H7
   587  
   588  loop:
   589  	MOVQ SP, BP
   590  
   591  	SHA256ROUND0(0, 0x428a2f98, R8, R9, R10, R11, R12, R13, R14, R15)
   592  	SHA256ROUND0(1, 0x71374491, R15, R8, R9, R10, R11, R12, R13, R14)
   593  	SHA256ROUND0(2, 0xb5c0fbcf, R14, R15, R8, R9, R10, R11, R12, R13)
   594  	SHA256ROUND0(3, 0xe9b5dba5, R13, R14, R15, R8, R9, R10, R11, R12)
   595  	SHA256ROUND0(4, 0x3956c25b, R12, R13, R14, R15, R8, R9, R10, R11)
   596  	SHA256ROUND0(5, 0x59f111f1, R11, R12, R13, R14, R15, R8, R9, R10)
   597  	SHA256ROUND0(6, 0x923f82a4, R10, R11, R12, R13, R14, R15, R8, R9)
   598  	SHA256ROUND0(7, 0xab1c5ed5, R9, R10, R11, R12, R13, R14, R15, R8)
   599  	SHA256ROUND0(8, 0xd807aa98, R8, R9, R10, R11, R12, R13, R14, R15)
   600  	SHA256ROUND0(9, 0x12835b01, R15, R8, R9, R10, R11, R12, R13, R14)
   601  	SHA256ROUND0(10, 0x243185be, R14, R15, R8, R9, R10, R11, R12, R13)
   602  	SHA256ROUND0(11, 0x550c7dc3, R13, R14, R15, R8, R9, R10, R11, R12)
   603  	SHA256ROUND0(12, 0x72be5d74, R12, R13, R14, R15, R8, R9, R10, R11)
   604  	SHA256ROUND0(13, 0x80deb1fe, R11, R12, R13, R14, R15, R8, R9, R10)
   605  	SHA256ROUND0(14, 0x9bdc06a7, R10, R11, R12, R13, R14, R15, R8, R9)
   606  	SHA256ROUND0(15, 0xc19bf174, R9, R10, R11, R12, R13, R14, R15, R8)
   607  
   608  	SHA256ROUND1(16, 0xe49b69c1, R8, R9, R10, R11, R12, R13, R14, R15)
   609  	SHA256ROUND1(17, 0xefbe4786, R15, R8, R9, R10, R11, R12, R13, R14)
   610  	SHA256ROUND1(18, 0x0fc19dc6, R14, R15, R8, R9, R10, R11, R12, R13)
   611  	SHA256ROUND1(19, 0x240ca1cc, R13, R14, R15, R8, R9, R10, R11, R12)
   612  	SHA256ROUND1(20, 0x2de92c6f, R12, R13, R14, R15, R8, R9, R10, R11)
   613  	SHA256ROUND1(21, 0x4a7484aa, R11, R12, R13, R14, R15, R8, R9, R10)
   614  	SHA256ROUND1(22, 0x5cb0a9dc, R10, R11, R12, R13, R14, R15, R8, R9)
   615  	SHA256ROUND1(23, 0x76f988da, R9, R10, R11, R12, R13, R14, R15, R8)
   616  	SHA256ROUND1(24, 0x983e5152, R8, R9, R10, R11, R12, R13, R14, R15)
   617  	SHA256ROUND1(25, 0xa831c66d, R15, R8, R9, R10, R11, R12, R13, R14)
   618  	SHA256ROUND1(26, 0xb00327c8, R14, R15, R8, R9, R10, R11, R12, R13)
   619  	SHA256ROUND1(27, 0xbf597fc7, R13, R14, R15, R8, R9, R10, R11, R12)
   620  	SHA256ROUND1(28, 0xc6e00bf3, R12, R13, R14, R15, R8, R9, R10, R11)
   621  	SHA256ROUND1(29, 0xd5a79147, R11, R12, R13, R14, R15, R8, R9, R10)
   622  	SHA256ROUND1(30, 0x06ca6351, R10, R11, R12, R13, R14, R15, R8, R9)
   623  	SHA256ROUND1(31, 0x14292967, R9, R10, R11, R12, R13, R14, R15, R8)
   624  	SHA256ROUND1(32, 0x27b70a85, R8, R9, R10, R11, R12, R13, R14, R15)
   625  	SHA256ROUND1(33, 0x2e1b2138, R15, R8, R9, R10, R11, R12, R13, R14)
   626  	SHA256ROUND1(34, 0x4d2c6dfc, R14, R15, R8, R9, R10, R11, R12, R13)
   627  	SHA256ROUND1(35, 0x53380d13, R13, R14, R15, R8, R9, R10, R11, R12)
   628  	SHA256ROUND1(36, 0x650a7354, R12, R13, R14, R15, R8, R9, R10, R11)
   629  	SHA256ROUND1(37, 0x766a0abb, R11, R12, R13, R14, R15, R8, R9, R10)
   630  	SHA256ROUND1(38, 0x81c2c92e, R10, R11, R12, R13, R14, R15, R8, R9)
   631  	SHA256ROUND1(39, 0x92722c85, R9, R10, R11, R12, R13, R14, R15, R8)
   632  	SHA256ROUND1(40, 0xa2bfe8a1, R8, R9, R10, R11, R12, R13, R14, R15)
   633  	SHA256ROUND1(41, 0xa81a664b, R15, R8, R9, R10, R11, R12, R13, R14)
   634  	SHA256ROUND1(42, 0xc24b8b70, R14, R15, R8, R9, R10, R11, R12, R13)
   635  	SHA256ROUND1(43, 0xc76c51a3, R13, R14, R15, R8, R9, R10, R11, R12)
   636  	SHA256ROUND1(44, 0xd192e819, R12, R13, R14, R15, R8, R9, R10, R11)
   637  	SHA256ROUND1(45, 0xd6990624, R11, R12, R13, R14, R15, R8, R9, R10)
   638  	SHA256ROUND1(46, 0xf40e3585, R10, R11, R12, R13, R14, R15, R8, R9)
   639  	SHA256ROUND1(47, 0x106aa070, R9, R10, R11, R12, R13, R14, R15, R8)
   640  	SHA256ROUND1(48, 0x19a4c116, R8, R9, R10, R11, R12, R13, R14, R15)
   641  	SHA256ROUND1(49, 0x1e376c08, R15, R8, R9, R10, R11, R12, R13, R14)
   642  	SHA256ROUND1(50, 0x2748774c, R14, R15, R8, R9, R10, R11, R12, R13)
   643  	SHA256ROUND1(51, 0x34b0bcb5, R13, R14, R15, R8, R9, R10, R11, R12)
   644  	SHA256ROUND1(52, 0x391c0cb3, R12, R13, R14, R15, R8, R9, R10, R11)
   645  	SHA256ROUND1(53, 0x4ed8aa4a, R11, R12, R13, R14, R15, R8, R9, R10)
   646  	SHA256ROUND1(54, 0x5b9cca4f, R10, R11, R12, R13, R14, R15, R8, R9)
   647  	SHA256ROUND1(55, 0x682e6ff3, R9, R10, R11, R12, R13, R14, R15, R8)
   648  	SHA256ROUND1(56, 0x748f82ee, R8, R9, R10, R11, R12, R13, R14, R15)
   649  	SHA256ROUND1(57, 0x78a5636f, R15, R8, R9, R10, R11, R12, R13, R14)
   650  	SHA256ROUND1(58, 0x84c87814, R14, R15, R8, R9, R10, R11, R12, R13)
   651  	SHA256ROUND1(59, 0x8cc70208, R13, R14, R15, R8, R9, R10, R11, R12)
   652  	SHA256ROUND1(60, 0x90befffa, R12, R13, R14, R15, R8, R9, R10, R11)
   653  	SHA256ROUND1(61, 0xa4506ceb, R11, R12, R13, R14, R15, R8, R9, R10)
   654  	SHA256ROUND1(62, 0xbef9a3f7, R10, R11, R12, R13, R14, R15, R8, R9)
   655  	SHA256ROUND1(63, 0xc67178f2, R9, R10, R11, R12, R13, R14, R15, R8)
   656  
   657  	MOVQ dig+0(FP), BP
   658  	ADDL (0*4)(BP), R8  // H0 = a + H0
   659  	MOVL R8, (0*4)(BP)
   660  	ADDL (1*4)(BP), R9  // H1 = b + H1
   661  	MOVL R9, (1*4)(BP)
   662  	ADDL (2*4)(BP), R10 // H2 = c + H2
   663  	MOVL R10, (2*4)(BP)
   664  	ADDL (3*4)(BP), R11 // H3 = d + H3
   665  	MOVL R11, (3*4)(BP)
   666  	ADDL (4*4)(BP), R12 // H4 = e + H4
   667  	MOVL R12, (4*4)(BP)
   668  	ADDL (5*4)(BP), R13 // H5 = f + H5
   669  	MOVL R13, (5*4)(BP)
   670  	ADDL (6*4)(BP), R14 // H6 = g + H6
   671  	MOVL R14, (6*4)(BP)
   672  	ADDL (7*4)(BP), R15 // H7 = h + H7
   673  	MOVL R15, (7*4)(BP)
   674  
   675  	ADDQ $64, SI
   676  	CMPQ SI, 256(SP)
   677  	JB   loop
   678  
   679  end:
   680  	RET
   681  
   682  avx2:
   683  	MOVQ dig+0(FP), CTX          // d.h[8]
   684  	MOVQ p_base+8(FP), INP
   685  	MOVQ p_len+16(FP), NUM_BYTES
   686  
   687  	LEAQ -64(INP)(NUM_BYTES*1), NUM_BYTES // Pointer to the last block
   688  	MOVQ NUM_BYTES, _INP_END(SP)
   689  
   690  	CMPQ NUM_BYTES, INP
   691  	JE   avx2_only_one_block
   692  
   693  	// Load initial digest
   694  	MOVL 0(CTX), a  // a = H0
   695  	MOVL 4(CTX), b  // b = H1
   696  	MOVL 8(CTX), c  // c = H2
   697  	MOVL 12(CTX), d // d = H3
   698  	MOVL 16(CTX), e // e = H4
   699  	MOVL 20(CTX), f // f = H5
   700  	MOVL 24(CTX), g // g = H6
   701  	MOVL 28(CTX), h // h = H7
   702  
   703  avx2_loop0: // each iteration processes one 512-bit block
   704  
   705  	VMOVDQU (0*32)(INP), XTMP0
   706  	VMOVDQU (1*32)(INP), XTMP1
   707  	VMOVDQU (2*32)(INP), XTMP2
   708  	VMOVDQU (3*32)(INP), XTMP3
   709  
   710  	MOVQ    $flip_mask<>(SB), BP // BYTE_FLIP_MASK
   711  	VMOVDQU (BP), BYTE_FLIP_MASK
   712  
   713  	// Apply Byte Flip Mask: LE -> BE
   714  	VPSHUFB BYTE_FLIP_MASK, XTMP0, XTMP0
   715  	VPSHUFB BYTE_FLIP_MASK, XTMP1, XTMP1
   716  	VPSHUFB BYTE_FLIP_MASK, XTMP2, XTMP2
   717  	VPSHUFB BYTE_FLIP_MASK, XTMP3, XTMP3
   718  
   719  	// Transpose data into high/low parts
   720  	VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w3, w2, w1, w0
   721  	VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w7, w6, w5, w4
   722  	VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w11, w10, w9, w8
   723  	VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w15, w14, w13, w12
   724  
   725  	MOVQ $K256<>(SB), TBL // Load the address of the table of round-specific constants
   726  
   727  avx2_last_block_enter:
   728  	ADDQ $64, INP
   729  	MOVQ INP, _INP(SP)
   730  	XORQ SRND, SRND
   731  
   732  avx2_loop1: // for w0 - w47
   733  	// Do 4 rounds and scheduling
   734  	VPADDD  0*32(TBL)(SRND*1), XDWORD0, XFER
   735  	VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
   736  	ROUND_AND_SCHED_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   737  	ROUND_AND_SCHED_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   738  	ROUND_AND_SCHED_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   739  	ROUND_AND_SCHED_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   740  
   741  	// Do 4 rounds and scheduling
   742  	VPADDD  1*32(TBL)(SRND*1), XDWORD1, XFER
   743  	VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
   744  	ROUND_AND_SCHED_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   745  	ROUND_AND_SCHED_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   746  	ROUND_AND_SCHED_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   747  	ROUND_AND_SCHED_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   748  
   749  	// Do 4 rounds and scheduling
   750  	VPADDD  2*32(TBL)(SRND*1), XDWORD2, XFER
   751  	VMOVDQU XFER, (_XFER + 2*32)(SP)(SRND*1)
   752  	ROUND_AND_SCHED_N_0(_XFER + 2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   753  	ROUND_AND_SCHED_N_1(_XFER + 2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   754  	ROUND_AND_SCHED_N_2(_XFER + 2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   755  	ROUND_AND_SCHED_N_3(_XFER + 2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   756  
   757  	// Do 4 rounds and scheduling
   758  	VPADDD  3*32(TBL)(SRND*1), XDWORD3, XFER
   759  	VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1)
   760  	ROUND_AND_SCHED_N_0(_XFER + 3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   761  	ROUND_AND_SCHED_N_1(_XFER + 3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   762  	ROUND_AND_SCHED_N_2(_XFER + 3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   763  	ROUND_AND_SCHED_N_3(_XFER + 3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   764  
   765  	ADDQ $4*32, SRND
   766  	CMPQ SRND, $3*4*32
   767  	JB   avx2_loop1
   768  
   769  avx2_loop2:
   770  	// w48 - w63 processed with no scheduling (last 16 rounds)
   771  	VPADDD  0*32(TBL)(SRND*1), XDWORD0, XFER
   772  	VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
   773  	DO_ROUND_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, h)
   774  	DO_ROUND_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, h)
   775  	DO_ROUND_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, g)
   776  	DO_ROUND_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, f)
   777  
   778  	VPADDD  1*32(TBL)(SRND*1), XDWORD1, XFER
   779  	VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
   780  	DO_ROUND_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, e)
   781  	DO_ROUND_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, d)
   782  	DO_ROUND_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, c)
   783  	DO_ROUND_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, b)
   784  
   785  	ADDQ $2*32, SRND
   786  
   787  	VMOVDQU XDWORD2, XDWORD0
   788  	VMOVDQU XDWORD3, XDWORD1
   789  
   790  	CMPQ SRND, $4*4*32
   791  	JB   avx2_loop2
   792  
   793  	MOVQ dig+0(FP), CTX // d.h[8]
   794  	MOVQ _INP(SP), INP
   795  
   796  	addm(  0(CTX), a)
   797  	addm(  4(CTX), b)
   798  	addm(  8(CTX), c)
   799  	addm( 12(CTX), d)
   800  	addm( 16(CTX), e)
   801  	addm( 20(CTX), f)
   802  	addm( 24(CTX), g)
   803  	addm( 28(CTX), h)
   804  
   805  	CMPQ _INP_END(SP), INP
   806  	JB   done_hash
   807  
   808  	XORQ SRND, SRND
   809  
   810  avx2_loop3: // Do second block using previously scheduled results
   811  	DO_ROUND_N_0(_XFER + 0*32 + 16, a, b, c, d, e, f, g, h, a)
   812  	DO_ROUND_N_1(_XFER + 0*32 + 16, h, a, b, c, d, e, f, g, h)
   813  	DO_ROUND_N_2(_XFER + 0*32 + 16, g, h, a, b, c, d, e, f, g)
   814  	DO_ROUND_N_3(_XFER + 0*32 + 16, f, g, h, a, b, c, d, e, f)
   815  
   816  	DO_ROUND_N_0(_XFER + 1*32 + 16, e, f, g, h, a, b, c, d, e)
   817  	DO_ROUND_N_1(_XFER + 1*32 + 16, d, e, f, g, h, a, b, c, d)
   818  	DO_ROUND_N_2(_XFER + 1*32 + 16, c, d, e, f, g, h, a, b, c)
   819  	DO_ROUND_N_3(_XFER + 1*32 + 16, b, c, d, e, f, g, h, a, b)
   820  
   821  	ADDQ $2*32, SRND
   822  	CMPQ SRND, $4*4*32
   823  	JB   avx2_loop3
   824  
   825  	MOVQ dig+0(FP), CTX // d.h[8]
   826  	MOVQ _INP(SP), INP
   827  	ADDQ $64, INP
   828  
   829  	addm(  0(CTX), a)
   830  	addm(  4(CTX), b)
   831  	addm(  8(CTX), c)
   832  	addm( 12(CTX), d)
   833  	addm( 16(CTX), e)
   834  	addm( 20(CTX), f)
   835  	addm( 24(CTX), g)
   836  	addm( 28(CTX), h)
   837  
   838  	CMPQ _INP_END(SP), INP
   839  	JA   avx2_loop0
   840  	JB   done_hash
   841  
   842  avx2_do_last_block:
   843  
   844  	VMOVDQU 0(INP), XWORD0
   845  	VMOVDQU 16(INP), XWORD1
   846  	VMOVDQU 32(INP), XWORD2
   847  	VMOVDQU 48(INP), XWORD3
   848  
   849  	MOVQ    $flip_mask<>(SB), BP
   850  	VMOVDQU (BP), X_BYTE_FLIP_MASK
   851  
   852  	VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
   853  	VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
   854  	VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
   855  	VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
   856  
   857  	MOVQ $K256<>(SB), TBL
   858  
   859  	JMP avx2_last_block_enter
   860  
   861  avx2_only_one_block:
   862  	// Load initial digest
   863  	MOVL 0(CTX), a  // a = H0
   864  	MOVL 4(CTX), b  // b = H1
   865  	MOVL 8(CTX), c  // c = H2
   866  	MOVL 12(CTX), d // d = H3
   867  	MOVL 16(CTX), e // e = H4
   868  	MOVL 20(CTX), f // f = H5
   869  	MOVL 24(CTX), g // g = H6
   870  	MOVL 28(CTX), h // h = H7
   871  
   872  	JMP avx2_do_last_block
   873  
   874  done_hash:
   875  	VZEROUPPER
   876  	RET
   877  
   878  // shuffle byte order from LE to BE
   879  DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
   880  DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
   881  DATA flip_mask<>+0x10(SB)/8, $0x0405060700010203
   882  DATA flip_mask<>+0x18(SB)/8, $0x0c0d0e0f08090a0b
   883  GLOBL flip_mask<>(SB), 8, $32
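
        // Under VPSHUFB each mask byte selects one source byte, so the pattern
        // 03 02 01 00 07 06 05 04 ... reverses every 4-byte group: a word
        // stored as b0 b1 b2 b3 comes out as b3 b2 b1 b0, the same LE -> BE
        // conversion the scalar path gets from BSWAPL.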
   884  
   885  // shuffle xBxA -> 00BA
   886  DATA shuff_00BA<>+0x00(SB)/8, $0x0b0a090803020100
   887  DATA shuff_00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
   888  DATA shuff_00BA<>+0x10(SB)/8, $0x0b0a090803020100
   889  DATA shuff_00BA<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
   890  GLOBL shuff_00BA<>(SB), 8, $32
   891  
   892  // shuffle xDxC -> DC00
   893  DATA shuff_DC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
   894  DATA shuff_DC00<>+0x08(SB)/8, $0x0b0a090803020100
   895  DATA shuff_DC00<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
   896  DATA shuff_DC00<>+0x18(SB)/8, $0x0b0a090803020100
   897  GLOBL shuff_DC00<>(SB), 8, $32
   898  
   899  // Round specific constants
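        // Each 32-byte row repeats four constants in both 16-byte halves, so a
        // single VPADDD from this table adds the same K values to the low lane
        // (first message block) and the high lane (second message block) of a
        // 256-bit register.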
   900  DATA K256<>+0x00(SB)/4, $0x428a2f98 // k1
   901  DATA K256<>+0x04(SB)/4, $0x71374491 // k2
   902  DATA K256<>+0x08(SB)/4, $0xb5c0fbcf // k3
   903  DATA K256<>+0x0c(SB)/4, $0xe9b5dba5 // k4
   904  DATA K256<>+0x10(SB)/4, $0x428a2f98 // k1
   905  DATA K256<>+0x14(SB)/4, $0x71374491 // k2
   906  DATA K256<>+0x18(SB)/4, $0xb5c0fbcf // k3
   907  DATA K256<>+0x1c(SB)/4, $0xe9b5dba5 // k4
   908  
   909  DATA K256<>+0x20(SB)/4, $0x3956c25b // k5 - k8
   910  DATA K256<>+0x24(SB)/4, $0x59f111f1
   911  DATA K256<>+0x28(SB)/4, $0x923f82a4
   912  DATA K256<>+0x2c(SB)/4, $0xab1c5ed5
   913  DATA K256<>+0x30(SB)/4, $0x3956c25b
   914  DATA K256<>+0x34(SB)/4, $0x59f111f1
   915  DATA K256<>+0x38(SB)/4, $0x923f82a4
   916  DATA K256<>+0x3c(SB)/4, $0xab1c5ed5
   917  
   918  DATA K256<>+0x40(SB)/4, $0xd807aa98 // k9 - k12
   919  DATA K256<>+0x44(SB)/4, $0x12835b01
   920  DATA K256<>+0x48(SB)/4, $0x243185be
   921  DATA K256<>+0x4c(SB)/4, $0x550c7dc3
   922  DATA K256<>+0x50(SB)/4, $0xd807aa98
   923  DATA K256<>+0x54(SB)/4, $0x12835b01
   924  DATA K256<>+0x58(SB)/4, $0x243185be
   925  DATA K256<>+0x5c(SB)/4, $0x550c7dc3
   926  
   927  DATA K256<>+0x60(SB)/4, $0x72be5d74 // k13 - k16
   928  DATA K256<>+0x64(SB)/4, $0x80deb1fe
   929  DATA K256<>+0x68(SB)/4, $0x9bdc06a7
   930  DATA K256<>+0x6c(SB)/4, $0xc19bf174
   931  DATA K256<>+0x70(SB)/4, $0x72be5d74
   932  DATA K256<>+0x74(SB)/4, $0x80deb1fe
   933  DATA K256<>+0x78(SB)/4, $0x9bdc06a7
   934  DATA K256<>+0x7c(SB)/4, $0xc19bf174
   935  
   936  DATA K256<>+0x80(SB)/4, $0xe49b69c1 // k17 - k20
   937  DATA K256<>+0x84(SB)/4, $0xefbe4786
   938  DATA K256<>+0x88(SB)/4, $0x0fc19dc6
   939  DATA K256<>+0x8c(SB)/4, $0x240ca1cc
   940  DATA K256<>+0x90(SB)/4, $0xe49b69c1
   941  DATA K256<>+0x94(SB)/4, $0xefbe4786
   942  DATA K256<>+0x98(SB)/4, $0x0fc19dc6
   943  DATA K256<>+0x9c(SB)/4, $0x240ca1cc
   944  
   945  DATA K256<>+0xa0(SB)/4, $0x2de92c6f // k21 - k24
   946  DATA K256<>+0xa4(SB)/4, $0x4a7484aa
   947  DATA K256<>+0xa8(SB)/4, $0x5cb0a9dc
   948  DATA K256<>+0xac(SB)/4, $0x76f988da
   949  DATA K256<>+0xb0(SB)/4, $0x2de92c6f
   950  DATA K256<>+0xb4(SB)/4, $0x4a7484aa
   951  DATA K256<>+0xb8(SB)/4, $0x5cb0a9dc
   952  DATA K256<>+0xbc(SB)/4, $0x76f988da
   953  
   954  DATA K256<>+0xc0(SB)/4, $0x983e5152 // k25 - k28
   955  DATA K256<>+0xc4(SB)/4, $0xa831c66d
   956  DATA K256<>+0xc8(SB)/4, $0xb00327c8
   957  DATA K256<>+0xcc(SB)/4, $0xbf597fc7
   958  DATA K256<>+0xd0(SB)/4, $0x983e5152
   959  DATA K256<>+0xd4(SB)/4, $0xa831c66d
   960  DATA K256<>+0xd8(SB)/4, $0xb00327c8
   961  DATA K256<>+0xdc(SB)/4, $0xbf597fc7
   962  
   963  DATA K256<>+0xe0(SB)/4, $0xc6e00bf3 // k29 - k32
   964  DATA K256<>+0xe4(SB)/4, $0xd5a79147
   965  DATA K256<>+0xe8(SB)/4, $0x06ca6351
   966  DATA K256<>+0xec(SB)/4, $0x14292967
   967  DATA K256<>+0xf0(SB)/4, $0xc6e00bf3
   968  DATA K256<>+0xf4(SB)/4, $0xd5a79147
   969  DATA K256<>+0xf8(SB)/4, $0x06ca6351
   970  DATA K256<>+0xfc(SB)/4, $0x14292967
   971  
   972  DATA K256<>+0x100(SB)/4, $0x27b70a85
   973  DATA K256<>+0x104(SB)/4, $0x2e1b2138
   974  DATA K256<>+0x108(SB)/4, $0x4d2c6dfc
   975  DATA K256<>+0x10c(SB)/4, $0x53380d13
   976  DATA K256<>+0x110(SB)/4, $0x27b70a85
   977  DATA K256<>+0x114(SB)/4, $0x2e1b2138
   978  DATA K256<>+0x118(SB)/4, $0x4d2c6dfc
   979  DATA K256<>+0x11c(SB)/4, $0x53380d13
   980  
   981  DATA K256<>+0x120(SB)/4, $0x650a7354
   982  DATA K256<>+0x124(SB)/4, $0x766a0abb
   983  DATA K256<>+0x128(SB)/4, $0x81c2c92e
   984  DATA K256<>+0x12c(SB)/4, $0x92722c85
   985  DATA K256<>+0x130(SB)/4, $0x650a7354
   986  DATA K256<>+0x134(SB)/4, $0x766a0abb
   987  DATA K256<>+0x138(SB)/4, $0x81c2c92e
   988  DATA K256<>+0x13c(SB)/4, $0x92722c85
   989  
   990  DATA K256<>+0x140(SB)/4, $0xa2bfe8a1
   991  DATA K256<>+0x144(SB)/4, $0xa81a664b
   992  DATA K256<>+0x148(SB)/4, $0xc24b8b70
   993  DATA K256<>+0x14c(SB)/4, $0xc76c51a3
   994  DATA K256<>+0x150(SB)/4, $0xa2bfe8a1
   995  DATA K256<>+0x154(SB)/4, $0xa81a664b
   996  DATA K256<>+0x158(SB)/4, $0xc24b8b70
   997  DATA K256<>+0x15c(SB)/4, $0xc76c51a3
   998  
   999  DATA K256<>+0x160(SB)/4, $0xd192e819
  1000  DATA K256<>+0x164(SB)/4, $0xd6990624
  1001  DATA K256<>+0x168(SB)/4, $0xf40e3585
  1002  DATA K256<>+0x16c(SB)/4, $0x106aa070
  1003  DATA K256<>+0x170(SB)/4, $0xd192e819
  1004  DATA K256<>+0x174(SB)/4, $0xd6990624
  1005  DATA K256<>+0x178(SB)/4, $0xf40e3585
  1006  DATA K256<>+0x17c(SB)/4, $0x106aa070
  1007  
  1008  DATA K256<>+0x180(SB)/4, $0x19a4c116
  1009  DATA K256<>+0x184(SB)/4, $0x1e376c08
  1010  DATA K256<>+0x188(SB)/4, $0x2748774c
  1011  DATA K256<>+0x18c(SB)/4, $0x34b0bcb5
  1012  DATA K256<>+0x190(SB)/4, $0x19a4c116
  1013  DATA K256<>+0x194(SB)/4, $0x1e376c08
  1014  DATA K256<>+0x198(SB)/4, $0x2748774c
  1015  DATA K256<>+0x19c(SB)/4, $0x34b0bcb5
  1016  
  1017  DATA K256<>+0x1a0(SB)/4, $0x391c0cb3
  1018  DATA K256<>+0x1a4(SB)/4, $0x4ed8aa4a
  1019  DATA K256<>+0x1a8(SB)/4, $0x5b9cca4f
  1020  DATA K256<>+0x1ac(SB)/4, $0x682e6ff3
  1021  DATA K256<>+0x1b0(SB)/4, $0x391c0cb3
  1022  DATA K256<>+0x1b4(SB)/4, $0x4ed8aa4a
  1023  DATA K256<>+0x1b8(SB)/4, $0x5b9cca4f
  1024  DATA K256<>+0x1bc(SB)/4, $0x682e6ff3
  1025  
  1026  DATA K256<>+0x1c0(SB)/4, $0x748f82ee
  1027  DATA K256<>+0x1c4(SB)/4, $0x78a5636f
  1028  DATA K256<>+0x1c8(SB)/4, $0x84c87814
  1029  DATA K256<>+0x1cc(SB)/4, $0x8cc70208
  1030  DATA K256<>+0x1d0(SB)/4, $0x748f82ee
  1031  DATA K256<>+0x1d4(SB)/4, $0x78a5636f
  1032  DATA K256<>+0x1d8(SB)/4, $0x84c87814
  1033  DATA K256<>+0x1dc(SB)/4, $0x8cc70208
  1034  
  1035  DATA K256<>+0x1e0(SB)/4, $0x90befffa
  1036  DATA K256<>+0x1e4(SB)/4, $0xa4506ceb
  1037  DATA K256<>+0x1e8(SB)/4, $0xbef9a3f7
  1038  DATA K256<>+0x1ec(SB)/4, $0xc67178f2
  1039  DATA K256<>+0x1f0(SB)/4, $0x90befffa
  1040  DATA K256<>+0x1f4(SB)/4, $0xa4506ceb
  1041  DATA K256<>+0x1f8(SB)/4, $0xbef9a3f7
  1042  DATA K256<>+0x1fc(SB)/4, $0xc67178f2
  1043  
  1044  GLOBL K256<>(SB), (NOPTR + RODATA), $512