github.com/zebozhuang/go@v0.0.0-20200207033046-f8a98f6f5c5d/src/crypto/sha256/sha256block_amd64.s

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "textflag.h"
     6  
     7  // SHA256 block routine. See sha256block.go for Go equivalent.
     8  //
     9  // The algorithm is detailed in FIPS 180-4:
    10  //
    11  //  http://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
    12  
     13  // The AVX2 version is described in an Intel white paper:
    14  // "Fast SHA-256 Implementations on Intel Architecture Processors"
    15  // To find it, surf to http://www.intel.com/p/en_US/embedded
    16  // and search for that title.
    17  // AVX2 version by Intel, same algorithm as code in Linux kernel:
    18  // https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S
    19  // by
    20  //     James Guilford <james.guilford@intel.com>
    21  //     Kirk Yap <kirk.s.yap@intel.com>
    22  //     Tim Chen <tim.c.chen@linux.intel.com>
    23  
    24  // Wt = Mt; for 0 <= t <= 15
     25  // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
    26  //
    27  // a = H0
    28  // b = H1
    29  // c = H2
    30  // d = H3
    31  // e = H4
    32  // f = H5
    33  // g = H6
    34  // h = H7
    35  //
    36  // for t = 0 to 63 {
    37  //    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
    38  //    T2 = BIGSIGMA0(a) + Maj(a,b,c)
    39  //    h = g
    40  //    g = f
    41  //    f = e
    42  //    e = d + T1
    43  //    d = c
    44  //    c = b
    45  //    b = a
    46  //    a = T1 + T2
    47  // }
    48  //
    49  // H0 = a + H0
    50  // H1 = b + H1
    51  // H2 = c + H2
    52  // H3 = d + H3
    53  // H4 = e + H4
    54  // H5 = f + H5
    55  // H6 = g + H6
    56  // H7 = h + H7
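//
// For reference, a minimal Go sketch of the same computation (the actual Go
// fallback lives in sha256block.go; the names below are illustrative, not
// identifiers from this package, and assume import "math/bits"):
//
//	func ch(x, y, z uint32) uint32  { return (x & y) ^ (^x & z) }
//	func maj(x, y, z uint32) uint32 { return (x & y) ^ (x & z) ^ (y & z) }
//	func bigSigma0(x uint32) uint32 {
//		return bits.RotateLeft32(x, -2) ^ bits.RotateLeft32(x, -13) ^ bits.RotateLeft32(x, -22)
//	}
//	func bigSigma1(x uint32) uint32 {
//		return bits.RotateLeft32(x, -6) ^ bits.RotateLeft32(x, -11) ^ bits.RotateLeft32(x, -25)
//	}
//
//	// blockSketch compresses one block whose schedule w[0..63] has already
//	// been expanded; k holds the 64 round constants.
//	func blockSketch(dig *[8]uint32, w, k *[64]uint32) {
//		a, b, c, d, e, f, g, h := dig[0], dig[1], dig[2], dig[3], dig[4], dig[5], dig[6], dig[7]
//		for t := 0; t < 64; t++ {
//			t1 := h + bigSigma1(e) + ch(e, f, g) + k[t] + w[t]
//			t2 := bigSigma0(a) + maj(a, b, c)
//			h, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2
//		}
//		dig[0] += a; dig[1] += b; dig[2] += c; dig[3] += d
//		dig[4] += e; dig[5] += f; dig[6] += g; dig[7] += h
//	}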
    57  
    58  // Wt = Mt; for 0 <= t <= 15
    59  #define MSGSCHEDULE0(index) \
    60  	MOVL	(index*4)(SI), AX; \
    61  	BSWAPL	AX; \
    62  	MOVL	AX, (index*4)(BP)
    63  
    64  // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
    65  //   SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x)
    66  //   SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x)
    67  #define MSGSCHEDULE1(index) \
    68  	MOVL	((index-2)*4)(BP), AX; \
    69  	MOVL	AX, CX; \
    70  	RORL	$17, AX; \
    71  	MOVL	CX, DX; \
    72  	RORL	$19, CX; \
    73  	SHRL	$10, DX; \
    74  	MOVL	((index-15)*4)(BP), BX; \
    75  	XORL	CX, AX; \
    76  	MOVL	BX, CX; \
    77  	XORL	DX, AX; \
    78  	RORL	$7, BX; \
    79  	MOVL	CX, DX; \
    80  	SHRL	$3, DX; \
    81  	RORL	$18, CX; \
    82  	ADDL	((index-7)*4)(BP), AX; \
    83  	XORL	CX, BX; \
    84  	XORL	DX, BX; \
    85  	ADDL	((index-16)*4)(BP), BX; \
    86  	ADDL	BX, AX; \
    87  	MOVL	AX, ((index)*4)(BP)
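//
// Illustrative Go sketch of the schedule implemented by MSGSCHEDULE0/1
// (sigma0/sigma1 are our names, not identifiers from this file; assumes
// import "math/bits"):
//
//	func sigma0(x uint32) uint32 { return bits.RotateLeft32(x, -7) ^ bits.RotateLeft32(x, -18) ^ (x >> 3) }
//	func sigma1(x uint32) uint32 { return bits.RotateLeft32(x, -17) ^ bits.RotateLeft32(x, -19) ^ (x >> 10) }
//
//	// w[0..15] are the 16 big-endian message words (MSGSCHEDULE0)
//	for t := 16; t < 64; t++ {
//		w[t] = sigma1(w[t-2]) + w[t-7] + sigma0(w[t-15]) + w[t-16]
//	}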
    88  
    89  // Calculate T1 in AX - uses AX, CX and DX registers.
    90  // h is also used as an accumulator. Wt is passed in AX.
    91  //   T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
    92  //     BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x)
    93  //     Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
    94  #define SHA256T1(const, e, f, g, h) \
    95  	ADDL	AX, h; \
    96  	MOVL	e, AX; \
    97  	ADDL	$const, h; \
    98  	MOVL	e, CX; \
    99  	RORL	$6, AX; \
   100  	MOVL	e, DX; \
   101  	RORL	$11, CX; \
   102  	XORL	CX, AX; \
   103  	MOVL	e, CX; \
   104  	RORL	$25, DX; \
   105  	ANDL	f, CX; \
   106  	XORL	AX, DX; \
   107  	MOVL	e, AX; \
   108  	NOTL	AX; \
   109  	ADDL	DX, h; \
   110  	ANDL	g, AX; \
   111  	XORL	CX, AX; \
   112  	ADDL	h, AX
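//
// In Go terms (an illustrative sketch, reusing the bigSigma1 helper above),
// with Wt passed in AX on entry and T1 left in AX on exit:
//
//	t1 := h + bigSigma1(e) + ((e & f) ^ (^e & g)) + k + w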
   113  
   114  // Calculate T2 in BX - uses BX, CX, DX and DI registers.
   115  //   T2 = BIGSIGMA0(a) + Maj(a, b, c)
   116  //     BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x)
   117  //     Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
   118  #define SHA256T2(a, b, c) \
   119  	MOVL	a, DI; \
   120  	MOVL	c, BX; \
   121  	RORL	$2, DI; \
   122  	MOVL	a, DX; \
   123  	ANDL	b, BX; \
   124  	RORL	$13, DX; \
   125  	MOVL	a, CX; \
   126  	ANDL	c, CX; \
   127  	XORL	DX, DI; \
   128  	XORL	CX, BX; \
   129  	MOVL	a, DX; \
   130  	MOVL	b, CX; \
   131  	RORL	$22, DX; \
   132  	ANDL	a, CX; \
   133  	XORL	CX, BX; \
   134  	XORL	DX, DI; \
   135  	ADDL	DI, BX
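//
// In Go terms (an illustrative sketch, reusing the bigSigma0 helper above),
// with T2 left in BX:
//
//	t2 := bigSigma0(a) + ((a & b) ^ (a & c) ^ (b & c))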
   136  
   137  // Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
   138  // The values for e and a are stored in d and h, ready for rotation.
   139  #define SHA256ROUND(index, const, a, b, c, d, e, f, g, h) \
   140  	SHA256T1(const, e, f, g, h); \
   141  	SHA256T2(a, b, c); \
   142  	MOVL	BX, h; \
   143  	ADDL	AX, d; \
   144  	ADDL	AX, h
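//
// Only d and h are written back: d receives the new e and h receives the new
// a, and the caller rotates the register arguments by one position on every
// round instead of moving all eight registers. In Go terms (illustrative):
//
//	d += t1      // becomes e in the next round
//	h = t1 + t2  // becomes a in the next round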
   145  
   146  #define SHA256ROUND0(index, const, a, b, c, d, e, f, g, h) \
   147  	MSGSCHEDULE0(index); \
   148  	SHA256ROUND(index, const, a, b, c, d, e, f, g, h)
   149  
   150  #define SHA256ROUND1(index, const, a, b, c, d, e, f, g, h) \
   151  	MSGSCHEDULE1(index); \
   152  	SHA256ROUND(index, const, a, b, c, d, e, f, g, h)
   153  
   154  
   155  // Definitions for AVX2 version
   156  
   157  // addm (mem), reg
   158  // Add reg to mem using reg-mem add and store
   159  #define addm(P1, P2) \
   160  	ADDL P2, P1; \
   161  	MOVL P1, P2
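// In Go terms (illustrative): mem += reg; reg = mem, i.e. the digest word in
// memory is updated in place and the new value is reloaded into the register.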
   162  
   163  #define XDWORD0 Y4
   164  #define XDWORD1 Y5
   165  #define XDWORD2 Y6
   166  #define XDWORD3 Y7
   167  
   168  #define XWORD0 X4
   169  #define XWORD1 X5
   170  #define XWORD2 X6
   171  #define XWORD3 X7
   172  
   173  #define XTMP0 Y0
   174  #define XTMP1 Y1
   175  #define XTMP2 Y2
   176  #define XTMP3 Y3
   177  #define XTMP4 Y8
   178  #define XTMP5 Y11
   179  
   180  #define XFER  Y9
   181  
   182  #define BYTE_FLIP_MASK 	Y13 // mask to convert LE -> BE
   183  #define X_BYTE_FLIP_MASK X13
   184  
   185  #define NUM_BYTES DX
   186  #define INP	DI
   187  
   188  #define CTX SI // Beginning of digest in memory (a, b, c, ... , h)
   189  
   190  #define a AX
   191  #define b BX
   192  #define c CX
   193  #define d R8
   194  #define e DX
   195  #define f R9
   196  #define g R10
   197  #define h R11
   198  
   199  #define old_h R11
   200  
   201  #define TBL BP
   202  
   203  #define SRND SI // SRND is same register as CTX
   204  
   205  #define T1 R12
   206  
   207  #define y0 R13
   208  #define y1 R14
   209  #define y2 R15
   210  #define y3 DI
   211  
   212  // Offsets
   213  #define XFER_SIZE 2*64*4
   214  #define INP_END_SIZE 8
   215  #define INP_SIZE 8
   216  #define TMP_SIZE 4
   217  
   218  #define _XFER 0
   219  #define _INP_END _XFER + XFER_SIZE
   220  #define _INP _INP_END + INP_END_SIZE
   221  #define _TMP _INP + INP_SIZE
   222  #define STACK_SIZE _TMP + TMP_SIZE
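// Resulting frame layout (byte offsets from SP, following from the
// definitions above):
//
//	_XFER    [  0, 512)  W+K values for two interleaved blocks, stored by avx2_loop1/avx2_loop2
//	_INP_END [512, 520)  pointer to the start of the last input block
//	_INP     [520, 528)  current input pointer
//	_TMP     [528, 532)  scratch slot used to spill f around the VPSHUFB lookups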
   223  
   224  #define ROUND_AND_SCHED_N_0(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   225  	;                                     \ // #############################  RND N + 0 ############################//
   226  	MOVL     a, y3;                       \ // y3 = a					// MAJA
   227  	RORXL    $25, e, y0;                  \ // y0 = e >> 25				// S1A
   228  	RORXL    $11, e, y1;                  \ // y1 = e >> 11				// S1B
   229  	;                                     \
   230  	ADDL     (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h        // disp = k + w
   231  	ORL      c, y3;                       \ // y3 = a|c				// MAJA
   232  	VPALIGNR $4, XDWORD2, XDWORD3, XTMP0; \ // XTMP0 = W[-7]
   233  	MOVL     f, y2;                       \ // y2 = f				// CH
   234  	RORXL    $13, a, T1;                  \ // T1 = a >> 13			// S0B
   235  	;                                     \
   236  	XORL     y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)					// S1
   237  	XORL     g, y2;                       \ // y2 = f^g                              	// CH
    238  	VPADDD   XDWORD0, XTMP0, XTMP0;       \ // XTMP0 = W[-7] + W[-16]
   239  	RORXL    $6, e, y1;                   \ // y1 = (e >> 6)						// S1
   240  	;                                     \
   241  	ANDL     e, y2;                       \ // y2 = (f^g)&e                         // CH
   242  	XORL     y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
   243  	RORXL    $22, a, y1;                  \ // y1 = a >> 22							// S0A
   244  	ADDL     h, d;                        \ // d = k + w + h + d                     	// --
   245  	;                                     \
   246  	ANDL     b, y3;                       \ // y3 = (a|c)&b							// MAJA
   247  	VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15]
   248  	XORL     T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
   249  	RORXL    $2, a, T1;                   \ // T1 = (a >> 2)						// S0
   250  	;                                     \
   251  	XORL     g, y2;                       \ // y2 = CH = ((f^g)&e)^g				// CH
   252  	VPSRLD   $7, XTMP1, XTMP2;            \
   253  	XORL     T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   254  	MOVL     a, T1;                       \ // T1 = a								// MAJB
   255  	ANDL     c, T1;                       \ // T1 = a&c								// MAJB
   256  	;                                     \
   257  	ADDL     y0, y2;                      \ // y2 = S1 + CH							// --
   258  	VPSLLD   $(32-7), XTMP1, XTMP3;       \
    259  	ORL      T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
   260  	ADDL     y1, h;                       \ // h = k + w + h + S0					// --
   261  	;                                     \
   262  	ADDL     y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
   263  	VPOR     XTMP2, XTMP3, XTMP3;         \ // XTMP3 = W[-15] ror 7
   264  	;                                     \
   265  	VPSRLD   $18, XTMP1, XTMP2;           \
   266  	ADDL     y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   267  	ADDL     y3, h                        // h = t1 + S0 + MAJ                     // --
   268  
   269  #define ROUND_AND_SCHED_N_1(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   270  	;                                    \ // ################################### RND N + 1 ############################
   271  	;                                    \
   272  	MOVL    a, y3;                       \ // y3 = a                       // MAJA
   273  	RORXL   $25, e, y0;                  \ // y0 = e >> 25					// S1A
   274  	RORXL   $11, e, y1;                  \ // y1 = e >> 11					// S1B
   275  	ADDL    (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h         		// --
   276  	ORL     c, y3;                       \ // y3 = a|c						// MAJA
   277  	;                                    \
   278  	VPSRLD  $3, XTMP1, XTMP4;            \ // XTMP4 = W[-15] >> 3
   279  	MOVL    f, y2;                       \ // y2 = f						// CH
   280  	RORXL   $13, a, T1;                  \ // T1 = a >> 13					// S0B
   281  	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)		// S1
   282  	XORL    g, y2;                       \ // y2 = f^g						// CH
   283  	;                                    \
   284  	RORXL   $6, e, y1;                   \ // y1 = (e >> 6)				// S1
   285  	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
   286  	RORXL   $22, a, y1;                  \ // y1 = a >> 22						// S0A
   287  	ANDL    e, y2;                       \ // y2 = (f^g)&e						// CH
   288  	ADDL    h, d;                        \ // d = k + w + h + d				// --
   289  	;                                    \
   290  	VPSLLD  $(32-18), XTMP1, XTMP1;      \
   291  	ANDL    b, y3;                       \ // y3 = (a|c)&b					// MAJA
   292  	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)		// S0
   293  	;                                    \
   294  	VPXOR   XTMP1, XTMP3, XTMP3;         \
   295  	RORXL   $2, a, T1;                   \ // T1 = (a >> 2)				// S0
   296  	XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g		// CH
   297  	;                                    \
   298  	VPXOR   XTMP2, XTMP3, XTMP3;         \ // XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
   299  	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   300  	MOVL    a, T1;                       \ // T1 = a						// MAJB
   301  	ANDL    c, T1;                       \ // T1 = a&c						// MAJB
   302  	ADDL    y0, y2;                      \ // y2 = S1 + CH					// --
   303  	;                                    \
   304  	VPXOR   XTMP4, XTMP3, XTMP1;         \ // XTMP1 = s0
   305  	VPSHUFD $0xFA, XDWORD3, XTMP2;       \ // XTMP2 = W[-2] {BBAA}
    306  	ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)            // MAJ
   307  	ADDL    y1, h;                       \ // h = k + w + h + S0                    // --
   308  	;                                    \
   309  	VPADDD  XTMP1, XTMP0, XTMP0;         \ // XTMP0 = W[-16] + W[-7] + s0
   310  	ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
   311  	ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   312  	ADDL    y3, h;                       \ // h = t1 + S0 + MAJ                     // --
   313  	;                                    \
   314  	VPSRLD  $10, XTMP2, XTMP4            // XTMP4 = W[-2] >> 10 {BBAA}
   315  
   316  #define ROUND_AND_SCHED_N_2(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   317  	;                                    \ // ################################### RND N + 2 ############################
   318  	;                                    \
   319  	MOVL    a, y3;                       \ // y3 = a							// MAJA
   320  	RORXL   $25, e, y0;                  \ // y0 = e >> 25						// S1A
   321  	ADDL    (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h        			// --
   322  	;                                    \
   323  	VPSRLQ  $19, XTMP2, XTMP3;           \ // XTMP3 = W[-2] ror 19 {xBxA}
   324  	RORXL   $11, e, y1;                  \ // y1 = e >> 11						// S1B
   325  	ORL     c, y3;                       \ // y3 = a|c                         // MAJA
   326  	MOVL    f, y2;                       \ // y2 = f                           // CH
   327  	XORL    g, y2;                       \ // y2 = f^g                         // CH
   328  	;                                    \
   329  	RORXL   $13, a, T1;                  \ // T1 = a >> 13						// S0B
   330  	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)			// S1
   331  	VPSRLQ  $17, XTMP2, XTMP2;           \ // XTMP2 = W[-2] ror 17 {xBxA}
   332  	ANDL    e, y2;                       \ // y2 = (f^g)&e						// CH
   333  	;                                    \
   334  	RORXL   $6, e, y1;                   \ // y1 = (e >> 6)					// S1
   335  	VPXOR   XTMP3, XTMP2, XTMP2;         \
   336  	ADDL    h, d;                        \ // d = k + w + h + d				// --
   337  	ANDL    b, y3;                       \ // y3 = (a|c)&b						// MAJA
   338  	;                                    \
   339  	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
   340  	RORXL   $22, a, y1;                  \ // y1 = a >> 22						// S0A
   341  	VPXOR   XTMP2, XTMP4, XTMP4;         \ // XTMP4 = s1 {xBxA}
   342  	XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g			// CH
   343  	;                                    \
   344  	MOVL    f, _TMP(SP);                 \
   345  	MOVQ    $shuff_00BA<>(SB), f;        \ // f is used to keep SHUF_00BA
   346  	VPSHUFB (f), XTMP4, XTMP4;           \ // XTMP4 = s1 {00BA}
   347  	MOVL    _TMP(SP), f;                 \ // f is restored
   348  	;                                    \
   349  	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)		// S0
   350  	RORXL   $2, a, T1;                   \ // T1 = (a >> 2)				// S0
   351  	VPADDD  XTMP4, XTMP0, XTMP0;         \ // XTMP0 = {..., ..., W[1], W[0]}
   352  	;                                    \
   353  	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)	// S0
   354  	MOVL    a, T1;                       \ // T1 = a                                // MAJB
   355  	ANDL    c, T1;                       \ // T1 = a&c                              // MAJB
   356  	ADDL    y0, y2;                      \ // y2 = S1 + CH                          // --
   357  	VPSHUFD $80, XTMP0, XTMP2;           \ // XTMP2 = W[-2] {DDCC}
   358  	;                                    \
    359  	ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)            // MAJ
   360  	ADDL    y1, h;                       \ // h = k + w + h + S0                    // --
   361  	ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
   362  	ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   363  	;                                    \
   364  	ADDL    y3, h                        // h = t1 + S0 + MAJ                     // --
   365  
   366  #define ROUND_AND_SCHED_N_3(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   367  	;                                    \ // ################################### RND N + 3 ############################
   368  	;                                    \
   369  	MOVL    a, y3;                       \ // y3 = a						// MAJA
   370  	RORXL   $25, e, y0;                  \ // y0 = e >> 25					// S1A
   371  	RORXL   $11, e, y1;                  \ // y1 = e >> 11					// S1B
   372  	ADDL    (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h				// --
   373  	ORL     c, y3;                       \ // y3 = a|c                     // MAJA
   374  	;                                    \
   375  	VPSRLD  $10, XTMP2, XTMP5;           \ // XTMP5 = W[-2] >> 10 {DDCC}
   376  	MOVL    f, y2;                       \ // y2 = f						// CH
   377  	RORXL   $13, a, T1;                  \ // T1 = a >> 13					// S0B
   378  	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)		// S1
   379  	XORL    g, y2;                       \ // y2 = f^g						// CH
   380  	;                                    \
   381  	VPSRLQ  $19, XTMP2, XTMP3;           \ // XTMP3 = W[-2] ror 19 {xDxC}
   382  	RORXL   $6, e, y1;                   \ // y1 = (e >> 6)				// S1
   383  	ANDL    e, y2;                       \ // y2 = (f^g)&e					// CH
   384  	ADDL    h, d;                        \ // d = k + w + h + d			// --
   385  	ANDL    b, y3;                       \ // y3 = (a|c)&b					// MAJA
   386  	;                                    \
   387  	VPSRLQ  $17, XTMP2, XTMP2;           \ // XTMP2 = W[-2] ror 17 {xDxC}
   388  	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
   389  	XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g			// CH
   390  	;                                    \
   391  	VPXOR   XTMP3, XTMP2, XTMP2;         \
   392  	RORXL   $22, a, y1;                  \ // y1 = a >> 22					// S0A
   393  	ADDL    y0, y2;                      \ // y2 = S1 + CH					// --
   394  	;                                    \
   395  	VPXOR   XTMP2, XTMP5, XTMP5;         \ // XTMP5 = s1 {xDxC}
   396  	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)		// S0
   397  	ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
   398  	;                                    \
   399  	RORXL   $2, a, T1;                   \ // T1 = (a >> 2)				// S0
   400  	;                                    \
   401  	MOVL    f, _TMP(SP);                 \ // Save f
    402  	MOVQ    $shuff_DC00<>(SB), f;        \ // f is used to keep SHUF_DC00
   403  	VPSHUFB (f), XTMP5, XTMP5;           \ // XTMP5 = s1 {DC00}
   404  	MOVL    _TMP(SP), f;                 \ // Restore f
   405  	;                                    \
   406  	VPADDD  XTMP0, XTMP5, XDWORD0;       \ // XDWORD0 = {W[3], W[2], W[1], W[0]}
   407  	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)	// S0
   408  	MOVL    a, T1;                       \ // T1 = a							// MAJB
   409  	ANDL    c, T1;                       \ // T1 = a&c							// MAJB
    410  	ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)		// MAJ
   411  	;                                    \
   412  	ADDL    y1, h;                       \ // h = k + w + h + S0				// --
   413  	ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   414  	ADDL    y3, h                        // h = t1 + S0 + MAJ				// --
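//
// Taken together, ROUND_AND_SCHED_N_0..3 execute four rounds for the first
// block while producing the next four message words for both interleaved
// blocks; in scalar Go terms (illustrative, with XDWORD0..3 holding
// w[t..t+15] and the sigma0/sigma1 sketches from above):
//
//	for i := 0; i < 4; i++ {
//		w[t+16+i] = sigma1(w[t+14+i]) + w[t+9+i] + sigma0(w[t+1+i]) + w[t+i]
//	}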
   415  
   416  #define DO_ROUND_N_0(disp, a, b, c, d, e, f, g, h, old_h) \
   417  	;                                  \ // ################################### RND N + 0 ###########################
   418  	MOVL  f, y2;                       \ // y2 = f					// CH
   419  	RORXL $25, e, y0;                  \ // y0 = e >> 25				// S1A
   420  	RORXL $11, e, y1;                  \ // y1 = e >> 11				// S1B
   421  	XORL  g, y2;                       \ // y2 = f^g					// CH
   422  	;                                  \
   423  	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)	// S1
   424  	RORXL $6, e, y1;                   \ // y1 = (e >> 6)			// S1
   425  	ANDL  e, y2;                       \ // y2 = (f^g)&e				// CH
   426  	;                                  \
   427  	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
   428  	RORXL $13, a, T1;                  \ // T1 = a >> 13						// S0B
   429  	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g			// CH
   430  	RORXL $22, a, y1;                  \ // y1 = a >> 22						// S0A
   431  	MOVL  a, y3;                       \ // y3 = a							// MAJA
   432  	;                                  \
   433  	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)			// S0
   434  	RORXL $2, a, T1;                   \ // T1 = (a >> 2)					// S0
   435  	ADDL  (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // --
   436  	ORL   c, y3;                       \ // y3 = a|c							// MAJA
   437  	;                                  \
   438  	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)	// S0
   439  	MOVL  a, T1;                       \ // T1 = a							// MAJB
   440  	ANDL  b, y3;                       \ // y3 = (a|c)&b						// MAJA
   441  	ANDL  c, T1;                       \ // T1 = a&c							// MAJB
   442  	ADDL  y0, y2;                      \ // y2 = S1 + CH						// --
   443  	;                                  \
   444  	ADDL  h, d;                        \ // d = k + w + h + d					// --
    445  	ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
   446  	ADDL  y1, h;                       \ // h = k + w + h + S0					// --
   447  	ADDL  y2, d                        // d = k + w + h + d + S1 + CH = d + t1	// --
   448  
   449  #define DO_ROUND_N_1(disp, a, b, c, d, e, f, g, h, old_h) \
   450  	;                                  \ // ################################### RND N + 1 ###########################
   451  	ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
   452  	MOVL  f, y2;                       \ // y2 = f                                // CH
   453  	RORXL $25, e, y0;                  \ // y0 = e >> 25				// S1A
   454  	RORXL $11, e, y1;                  \ // y1 = e >> 11				// S1B
   455  	XORL  g, y2;                       \ // y2 = f^g                             // CH
   456  	;                                  \
   457  	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)				// S1
   458  	RORXL $6, e, y1;                   \ // y1 = (e >> 6)						// S1
   459  	ANDL  e, y2;                       \ // y2 = (f^g)&e                         // CH
   460  	ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ                    // --
   461  	;                                  \
   462  	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
   463  	RORXL $13, a, T1;                  \ // T1 = a >> 13							// S0B
   464  	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g                // CH
   465  	RORXL $22, a, y1;                  \ // y1 = a >> 22							// S0A
   466  	MOVL  a, y3;                       \ // y3 = a                               // MAJA
   467  	;                                  \
   468  	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
   469  	RORXL $2, a, T1;                   \ // T1 = (a >> 2)						// S0
   470  	ADDL  (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
   471  	ORL   c, y3;                       \ // y3 = a|c                             // MAJA
   472  	;                                  \
   473  	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   474  	MOVL  a, T1;                       \ // T1 = a                               // MAJB
   475  	ANDL  b, y3;                       \ // y3 = (a|c)&b                         // MAJA
   476  	ANDL  c, T1;                       \ // T1 = a&c                             // MAJB
   477  	ADDL  y0, y2;                      \ // y2 = S1 + CH                         // --
   478  	;                                  \
   479  	ADDL  h, d;                        \ // d = k + w + h + d                    // --
    480  	ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)           // MAJ
   481  	ADDL  y1, h;                       \ // h = k + w + h + S0                   // --
   482  	;                                  \
   483  	ADDL  y2, d                        // d = k + w + h + d + S1 + CH = d + t1 // --
   484  
   485  #define DO_ROUND_N_2(disp, a, b, c, d, e, f, g, h, old_h) \
   486  	;                                  \ // ################################### RND N + 2 ##############################
   487  	ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   488  	MOVL  f, y2;                       \ // y2 = f								// CH
   489  	RORXL $25, e, y0;                  \ // y0 = e >> 25							// S1A
   490  	RORXL $11, e, y1;                  \ // y1 = e >> 11							// S1B
   491  	XORL  g, y2;                       \ // y2 = f^g								// CH
   492  	;                                  \
   493  	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)				// S1
   494  	RORXL $6, e, y1;                   \ // y1 = (e >> 6)						// S1
   495  	ANDL  e, y2;                       \ // y2 = (f^g)&e							// CH
   496  	ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ					// --
   497  	;                                  \
   498  	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
   499  	RORXL $13, a, T1;                  \ // T1 = a >> 13							// S0B
   500  	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g                // CH
   501  	RORXL $22, a, y1;                  \ // y1 = a >> 22							// S0A
   502  	MOVL  a, y3;                       \ // y3 = a								// MAJA
   503  	;                                  \
   504  	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
   505  	RORXL $2, a, T1;                   \ // T1 = (a >> 2)						// S0
   506  	ADDL  (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h 	// --
   507  	ORL   c, y3;                       \ // y3 = a|c								// MAJA
   508  	;                                  \
   509  	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   510  	MOVL  a, T1;                       \ // T1 = a								// MAJB
   511  	ANDL  b, y3;                       \ // y3 = (a|c)&b							// MAJA
   512  	ANDL  c, T1;                       \ // T1 = a&c								// MAJB
   513  	ADDL  y0, y2;                      \ // y2 = S1 + CH							// --
   514  	;                                  \
   515  	ADDL  h, d;                        \ // d = k + w + h + d					// --
    516  	ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
   517  	ADDL  y1, h;                       \ // h = k + w + h + S0					// --
   518  	;                                  \
   519  	ADDL  y2, d                        // d = k + w + h + d + S1 + CH = d + t1 // --
   520  
   521  #define DO_ROUND_N_3(disp, a, b, c, d, e, f, g, h, old_h) \
   522  	;                                  \ // ################################### RND N + 3 ###########################
   523  	ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   524  	MOVL  f, y2;                       \ // y2 = f								// CH
   525  	RORXL $25, e, y0;                  \ // y0 = e >> 25							// S1A
   526  	RORXL $11, e, y1;                  \ // y1 = e >> 11							// S1B
   527  	XORL  g, y2;                       \ // y2 = f^g								// CH
   528  	;                                  \
   529  	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)				// S1
   530  	RORXL $6, e, y1;                   \ // y1 = (e >> 6)						// S1
   531  	ANDL  e, y2;                       \ // y2 = (f^g)&e							// CH
   532  	ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ					// --
   533  	;                                  \
   534  	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
   535  	RORXL $13, a, T1;                  \ // T1 = a >> 13							// S0B
   536  	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g				// CH
   537  	RORXL $22, a, y1;                  \ // y1 = a >> 22							// S0A
   538  	MOVL  a, y3;                       \ // y3 = a								// MAJA
   539  	;                                  \
   540  	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
   541  	RORXL $2, a, T1;                   \ // T1 = (a >> 2)						// S0
   542  	ADDL  (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h 	// --
   543  	ORL   c, y3;                       \ // y3 = a|c								// MAJA
   544  	;                                  \
   545  	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   546  	MOVL  a, T1;                       \ // T1 = a								// MAJB
   547  	ANDL  b, y3;                       \ // y3 = (a|c)&b							// MAJA
   548  	ANDL  c, T1;                       \ // T1 = a&c								// MAJB
   549  	ADDL  y0, y2;                      \ // y2 = S1 + CH							// --
   550  	;                                  \
   551  	ADDL  h, d;                        \ // d = k + w + h + d					// --
    552  	ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
   553  	ADDL  y1, h;                       \ // h = k + w + h + S0					// --
   554  	;                                  \
   555  	ADDL  y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1	// --
   556  	;                                  \
   557  	ADDL  y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   558  	;                                  \
   559  	ADDL  y3, h                        // h = t1 + S0 + MAJ					// --
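//
// Note that DO_ROUND_N_0..2 leave the final "h += S1 + CH" and "h += MAJ"
// additions to the start of the following round (applied there to old_h),
// which shortens the critical dependency chain; DO_ROUND_N_3 completes its
// own h so every group of four rounds ends with a fully updated state.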
   560  
   561  TEXT ·block(SB), 0, $536-32
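	// ·useAVX2 is set on the Go side of this package (see
	// sha256block_amd64.go); it should be true only on CPUs that support
	// the AVX2 and BMI2 (RORX) instructions used by the avx2 path below.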
   562  	CMPB ·useAVX2(SB), $1
   563  	JE   avx2
   564  
   565  	MOVQ p_base+8(FP), SI
   566  	MOVQ p_len+16(FP), DX
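	// Round the input length down to a whole number of 64-byte blocks;
	// in Go terms (illustrative): n := len(p) &^ 63.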
   567  	SHRQ $6, DX
   568  	SHLQ $6, DX
   569  
   570  	LEAQ (SI)(DX*1), DI
   571  	MOVQ DI, 256(SP)
   572  	CMPQ SI, DI
   573  	JEQ  end
   574  
   575  	MOVQ dig+0(FP), BP
   576  	MOVL (0*4)(BP), R8  // a = H0
   577  	MOVL (1*4)(BP), R9  // b = H1
   578  	MOVL (2*4)(BP), R10 // c = H2
   579  	MOVL (3*4)(BP), R11 // d = H3
   580  	MOVL (4*4)(BP), R12 // e = H4
   581  	MOVL (5*4)(BP), R13 // f = H5
   582  	MOVL (6*4)(BP), R14 // g = H6
   583  	MOVL (7*4)(BP), R15 // h = H7
   584  
   585  loop:
   586  	MOVQ SP, BP
   587  
   588  	SHA256ROUND0(0, 0x428a2f98, R8, R9, R10, R11, R12, R13, R14, R15)
   589  	SHA256ROUND0(1, 0x71374491, R15, R8, R9, R10, R11, R12, R13, R14)
   590  	SHA256ROUND0(2, 0xb5c0fbcf, R14, R15, R8, R9, R10, R11, R12, R13)
   591  	SHA256ROUND0(3, 0xe9b5dba5, R13, R14, R15, R8, R9, R10, R11, R12)
   592  	SHA256ROUND0(4, 0x3956c25b, R12, R13, R14, R15, R8, R9, R10, R11)
   593  	SHA256ROUND0(5, 0x59f111f1, R11, R12, R13, R14, R15, R8, R9, R10)
   594  	SHA256ROUND0(6, 0x923f82a4, R10, R11, R12, R13, R14, R15, R8, R9)
   595  	SHA256ROUND0(7, 0xab1c5ed5, R9, R10, R11, R12, R13, R14, R15, R8)
   596  	SHA256ROUND0(8, 0xd807aa98, R8, R9, R10, R11, R12, R13, R14, R15)
   597  	SHA256ROUND0(9, 0x12835b01, R15, R8, R9, R10, R11, R12, R13, R14)
   598  	SHA256ROUND0(10, 0x243185be, R14, R15, R8, R9, R10, R11, R12, R13)
   599  	SHA256ROUND0(11, 0x550c7dc3, R13, R14, R15, R8, R9, R10, R11, R12)
   600  	SHA256ROUND0(12, 0x72be5d74, R12, R13, R14, R15, R8, R9, R10, R11)
   601  	SHA256ROUND0(13, 0x80deb1fe, R11, R12, R13, R14, R15, R8, R9, R10)
   602  	SHA256ROUND0(14, 0x9bdc06a7, R10, R11, R12, R13, R14, R15, R8, R9)
   603  	SHA256ROUND0(15, 0xc19bf174, R9, R10, R11, R12, R13, R14, R15, R8)
   604  
   605  	SHA256ROUND1(16, 0xe49b69c1, R8, R9, R10, R11, R12, R13, R14, R15)
   606  	SHA256ROUND1(17, 0xefbe4786, R15, R8, R9, R10, R11, R12, R13, R14)
   607  	SHA256ROUND1(18, 0x0fc19dc6, R14, R15, R8, R9, R10, R11, R12, R13)
   608  	SHA256ROUND1(19, 0x240ca1cc, R13, R14, R15, R8, R9, R10, R11, R12)
   609  	SHA256ROUND1(20, 0x2de92c6f, R12, R13, R14, R15, R8, R9, R10, R11)
   610  	SHA256ROUND1(21, 0x4a7484aa, R11, R12, R13, R14, R15, R8, R9, R10)
   611  	SHA256ROUND1(22, 0x5cb0a9dc, R10, R11, R12, R13, R14, R15, R8, R9)
   612  	SHA256ROUND1(23, 0x76f988da, R9, R10, R11, R12, R13, R14, R15, R8)
   613  	SHA256ROUND1(24, 0x983e5152, R8, R9, R10, R11, R12, R13, R14, R15)
   614  	SHA256ROUND1(25, 0xa831c66d, R15, R8, R9, R10, R11, R12, R13, R14)
   615  	SHA256ROUND1(26, 0xb00327c8, R14, R15, R8, R9, R10, R11, R12, R13)
   616  	SHA256ROUND1(27, 0xbf597fc7, R13, R14, R15, R8, R9, R10, R11, R12)
   617  	SHA256ROUND1(28, 0xc6e00bf3, R12, R13, R14, R15, R8, R9, R10, R11)
   618  	SHA256ROUND1(29, 0xd5a79147, R11, R12, R13, R14, R15, R8, R9, R10)
   619  	SHA256ROUND1(30, 0x06ca6351, R10, R11, R12, R13, R14, R15, R8, R9)
   620  	SHA256ROUND1(31, 0x14292967, R9, R10, R11, R12, R13, R14, R15, R8)
   621  	SHA256ROUND1(32, 0x27b70a85, R8, R9, R10, R11, R12, R13, R14, R15)
   622  	SHA256ROUND1(33, 0x2e1b2138, R15, R8, R9, R10, R11, R12, R13, R14)
   623  	SHA256ROUND1(34, 0x4d2c6dfc, R14, R15, R8, R9, R10, R11, R12, R13)
   624  	SHA256ROUND1(35, 0x53380d13, R13, R14, R15, R8, R9, R10, R11, R12)
   625  	SHA256ROUND1(36, 0x650a7354, R12, R13, R14, R15, R8, R9, R10, R11)
   626  	SHA256ROUND1(37, 0x766a0abb, R11, R12, R13, R14, R15, R8, R9, R10)
   627  	SHA256ROUND1(38, 0x81c2c92e, R10, R11, R12, R13, R14, R15, R8, R9)
   628  	SHA256ROUND1(39, 0x92722c85, R9, R10, R11, R12, R13, R14, R15, R8)
   629  	SHA256ROUND1(40, 0xa2bfe8a1, R8, R9, R10, R11, R12, R13, R14, R15)
   630  	SHA256ROUND1(41, 0xa81a664b, R15, R8, R9, R10, R11, R12, R13, R14)
   631  	SHA256ROUND1(42, 0xc24b8b70, R14, R15, R8, R9, R10, R11, R12, R13)
   632  	SHA256ROUND1(43, 0xc76c51a3, R13, R14, R15, R8, R9, R10, R11, R12)
   633  	SHA256ROUND1(44, 0xd192e819, R12, R13, R14, R15, R8, R9, R10, R11)
   634  	SHA256ROUND1(45, 0xd6990624, R11, R12, R13, R14, R15, R8, R9, R10)
   635  	SHA256ROUND1(46, 0xf40e3585, R10, R11, R12, R13, R14, R15, R8, R9)
   636  	SHA256ROUND1(47, 0x106aa070, R9, R10, R11, R12, R13, R14, R15, R8)
   637  	SHA256ROUND1(48, 0x19a4c116, R8, R9, R10, R11, R12, R13, R14, R15)
   638  	SHA256ROUND1(49, 0x1e376c08, R15, R8, R9, R10, R11, R12, R13, R14)
   639  	SHA256ROUND1(50, 0x2748774c, R14, R15, R8, R9, R10, R11, R12, R13)
   640  	SHA256ROUND1(51, 0x34b0bcb5, R13, R14, R15, R8, R9, R10, R11, R12)
   641  	SHA256ROUND1(52, 0x391c0cb3, R12, R13, R14, R15, R8, R9, R10, R11)
   642  	SHA256ROUND1(53, 0x4ed8aa4a, R11, R12, R13, R14, R15, R8, R9, R10)
   643  	SHA256ROUND1(54, 0x5b9cca4f, R10, R11, R12, R13, R14, R15, R8, R9)
   644  	SHA256ROUND1(55, 0x682e6ff3, R9, R10, R11, R12, R13, R14, R15, R8)
   645  	SHA256ROUND1(56, 0x748f82ee, R8, R9, R10, R11, R12, R13, R14, R15)
   646  	SHA256ROUND1(57, 0x78a5636f, R15, R8, R9, R10, R11, R12, R13, R14)
   647  	SHA256ROUND1(58, 0x84c87814, R14, R15, R8, R9, R10, R11, R12, R13)
   648  	SHA256ROUND1(59, 0x8cc70208, R13, R14, R15, R8, R9, R10, R11, R12)
   649  	SHA256ROUND1(60, 0x90befffa, R12, R13, R14, R15, R8, R9, R10, R11)
   650  	SHA256ROUND1(61, 0xa4506ceb, R11, R12, R13, R14, R15, R8, R9, R10)
   651  	SHA256ROUND1(62, 0xbef9a3f7, R10, R11, R12, R13, R14, R15, R8, R9)
   652  	SHA256ROUND1(63, 0xc67178f2, R9, R10, R11, R12, R13, R14, R15, R8)
   653  
   654  	MOVQ dig+0(FP), BP
   655  	ADDL (0*4)(BP), R8  // H0 = a + H0
   656  	MOVL R8, (0*4)(BP)
   657  	ADDL (1*4)(BP), R9  // H1 = b + H1
   658  	MOVL R9, (1*4)(BP)
   659  	ADDL (2*4)(BP), R10 // H2 = c + H2
   660  	MOVL R10, (2*4)(BP)
   661  	ADDL (3*4)(BP), R11 // H3 = d + H3
   662  	MOVL R11, (3*4)(BP)
   663  	ADDL (4*4)(BP), R12 // H4 = e + H4
   664  	MOVL R12, (4*4)(BP)
   665  	ADDL (5*4)(BP), R13 // H5 = f + H5
   666  	MOVL R13, (5*4)(BP)
   667  	ADDL (6*4)(BP), R14 // H6 = g + H6
   668  	MOVL R14, (6*4)(BP)
   669  	ADDL (7*4)(BP), R15 // H7 = h + H7
   670  	MOVL R15, (7*4)(BP)
   671  
   672  	ADDQ $64, SI
   673  	CMPQ SI, 256(SP)
   674  	JB   loop
   675  
   676  end:
   677  	RET
   678  
   679  avx2:
   680  	MOVQ dig+0(FP), CTX          // d.h[8]
   681  	MOVQ p_base+8(FP), INP
   682  	MOVQ p_len+16(FP), NUM_BYTES
   683  
   684  	LEAQ -64(INP)(NUM_BYTES*1), NUM_BYTES // Pointer to the last block
   685  	MOVQ NUM_BYTES, _INP_END(SP)
   686  
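	// If the input is exactly one 64-byte block, INP already equals the
	// end pointer and only the single-block tail path is needed.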
   687  	CMPQ NUM_BYTES, INP
   688  	JE   avx2_only_one_block
   689  
   690  	// Load initial digest
   691  	MOVL 0(CTX), a  // a = H0
   692  	MOVL 4(CTX), b  // b = H1
   693  	MOVL 8(CTX), c  // c = H2
   694  	MOVL 12(CTX), d // d = H3
   695  	MOVL 16(CTX), e // e = H4
   696  	MOVL 20(CTX), f // f = H5
   697  	MOVL 24(CTX), g // g = H6
   698  	MOVL 28(CTX), h // h = H7
   699  
    700  avx2_loop0: // each iteration loads two 64-byte blocks; the second one is processed in avx2_loop3 below
   701  
   702  	VMOVDQU (0*32)(INP), XTMP0
   703  	VMOVDQU (1*32)(INP), XTMP1
   704  	VMOVDQU (2*32)(INP), XTMP2
   705  	VMOVDQU (3*32)(INP), XTMP3
   706  
   707  	MOVQ    $flip_mask<>(SB), BP // BYTE_FLIP_MASK
   708  	VMOVDQU (BP), BYTE_FLIP_MASK
   709  
   710  	// Apply Byte Flip Mask: LE -> BE
   711  	VPSHUFB BYTE_FLIP_MASK, XTMP0, XTMP0
   712  	VPSHUFB BYTE_FLIP_MASK, XTMP1, XTMP1
   713  	VPSHUFB BYTE_FLIP_MASK, XTMP2, XTMP2
   714  	VPSHUFB BYTE_FLIP_MASK, XTMP3, XTMP3
   715  
   716  	// Transpose data into high/low parts
   717  	VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w3, w2, w1, w0
   718  	VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w7, w6, w5, w4
   719  	VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w11, w10, w9, w8
   720  	VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w15, w14, w13, w12
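	// After the transpose, the low 128 bits of each XDWORD register hold
	// four words of the first block and the high 128 bits hold the same
	// four words of the second block, so the schedule below is computed
	// for both blocks at once while the rounds consume only the first one.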
   721  
   722  	MOVQ $K256<>(SB), TBL // Loading address of table with round-specific constants
   723  
   724  avx2_last_block_enter:
   725  	ADDQ $64, INP
   726  	MOVQ INP, _INP(SP)
   727  	XORQ SRND, SRND
   728  
   729  avx2_loop1: // for w0 - w47
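	// Each group below adds the round constants to four message words of
	// both interleaved blocks, parks the sums in the _XFER area on the
	// stack, then runs four rounds for the first block while scheduling
	// the next four words (ROUND_AND_SCHED_N_0..3).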
   730  	// Do 4 rounds and scheduling
   731  	VPADDD  0*32(TBL)(SRND*1), XDWORD0, XFER
   732  	VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
   733  	ROUND_AND_SCHED_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   734  	ROUND_AND_SCHED_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   735  	ROUND_AND_SCHED_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   736  	ROUND_AND_SCHED_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   737  
   738  	// Do 4 rounds and scheduling
   739  	VPADDD  1*32(TBL)(SRND*1), XDWORD1, XFER
   740  	VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
   741  	ROUND_AND_SCHED_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   742  	ROUND_AND_SCHED_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   743  	ROUND_AND_SCHED_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   744  	ROUND_AND_SCHED_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   745  
   746  	// Do 4 rounds and scheduling
   747  	VPADDD  2*32(TBL)(SRND*1), XDWORD2, XFER
   748  	VMOVDQU XFER, (_XFER + 2*32)(SP)(SRND*1)
   749  	ROUND_AND_SCHED_N_0(_XFER + 2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   750  	ROUND_AND_SCHED_N_1(_XFER + 2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   751  	ROUND_AND_SCHED_N_2(_XFER + 2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   752  	ROUND_AND_SCHED_N_3(_XFER + 2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   753  
   754  	// Do 4 rounds and scheduling
   755  	VPADDD  3*32(TBL)(SRND*1), XDWORD3, XFER
   756  	VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1)
   757  	ROUND_AND_SCHED_N_0(_XFER + 3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   758  	ROUND_AND_SCHED_N_1(_XFER + 3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   759  	ROUND_AND_SCHED_N_2(_XFER + 3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   760  	ROUND_AND_SCHED_N_3(_XFER + 3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   761  
   762  	ADDQ $4*32, SRND
   763  	CMPQ SRND, $3*4*32
   764  	JB   avx2_loop1
   765  
   766  avx2_loop2:
    767  	// w48 - w63 processed with no scheduling (last 16 rounds)
   768  	VPADDD  0*32(TBL)(SRND*1), XDWORD0, XFER
   769  	VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
   770  	DO_ROUND_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, h)
   771  	DO_ROUND_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, h)
   772  	DO_ROUND_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, g)
   773  	DO_ROUND_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, f)
   774  
   775  	VPADDD  1*32(TBL)(SRND*1), XDWORD1, XFER
   776  	VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
   777  	DO_ROUND_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, e)
   778  	DO_ROUND_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, d)
   779  	DO_ROUND_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, c)
   780  	DO_ROUND_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, b)
   781  
   782  	ADDQ $2*32, SRND
   783  
   784  	VMOVDQU XDWORD2, XDWORD0
   785  	VMOVDQU XDWORD3, XDWORD1
   786  
   787  	CMPQ SRND, $4*4*32
   788  	JB   avx2_loop2
   789  
   790  	MOVQ dig+0(FP), CTX // d.h[8]
   791  	MOVQ _INP(SP), INP
   792  
   793  	addm(  0(CTX), a)
   794  	addm(  4(CTX), b)
   795  	addm(  8(CTX), c)
   796  	addm( 12(CTX), d)
   797  	addm( 16(CTX), e)
   798  	addm( 20(CTX), f)
   799  	addm( 24(CTX), g)
   800  	addm( 28(CTX), h)
   801  
   802  	CMPQ _INP_END(SP), INP
   803  	JB   done_hash
   804  
   805  	XORQ SRND, SRND
   806  
   807  avx2_loop3: // Do second block using previously scheduled results
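	// The extra +16 byte displacement selects the high 128-bit lane of
	// each stored XFER entry, i.e. the W+K values that belong to the
	// second block; no message scheduling is needed here.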
   808  	DO_ROUND_N_0(_XFER + 0*32 + 16, a, b, c, d, e, f, g, h, a)
   809  	DO_ROUND_N_1(_XFER + 0*32 + 16, h, a, b, c, d, e, f, g, h)
   810  	DO_ROUND_N_2(_XFER + 0*32 + 16, g, h, a, b, c, d, e, f, g)
   811  	DO_ROUND_N_3(_XFER + 0*32 + 16, f, g, h, a, b, c, d, e, f)
   812  
   813  	DO_ROUND_N_0(_XFER + 1*32 + 16, e, f, g, h, a, b, c, d, e)
   814  	DO_ROUND_N_1(_XFER + 1*32 + 16, d, e, f, g, h, a, b, c, d)
   815  	DO_ROUND_N_2(_XFER + 1*32 + 16, c, d, e, f, g, h, a, b, c)
   816  	DO_ROUND_N_3(_XFER + 1*32 + 16, b, c, d, e, f, g, h, a, b)
   817  
   818  	ADDQ $2*32, SRND
   819  	CMPQ SRND, $4*4*32
   820  	JB   avx2_loop3
   821  
   822  	MOVQ dig+0(FP), CTX // d.h[8]
   823  	MOVQ _INP(SP), INP
   824  	ADDQ $64, INP
   825  
   826  	addm(  0(CTX), a)
   827  	addm(  4(CTX), b)
   828  	addm(  8(CTX), c)
   829  	addm( 12(CTX), d)
   830  	addm( 16(CTX), e)
   831  	addm( 20(CTX), f)
   832  	addm( 24(CTX), g)
   833  	addm( 28(CTX), h)
   834  
   835  	CMPQ _INP_END(SP), INP
   836  	JA   avx2_loop0
   837  	JB   done_hash
   838  
   839  avx2_do_last_block:
   840  
   841  	VMOVDQU 0(INP), XWORD0
   842  	VMOVDQU 16(INP), XWORD1
   843  	VMOVDQU 32(INP), XWORD2
   844  	VMOVDQU 48(INP), XWORD3
   845  
   846  	MOVQ    $flip_mask<>(SB), BP
   847  	VMOVDQU (BP), X_BYTE_FLIP_MASK
   848  
   849  	VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
   850  	VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
   851  	VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
   852  	VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
   853  
   854  	MOVQ $K256<>(SB), TBL
   855  
   856  	JMP avx2_last_block_enter
   857  
   858  avx2_only_one_block:
   859  	// Load initial digest
   860  	MOVL 0(CTX), a  // a = H0
   861  	MOVL 4(CTX), b  // b = H1
   862  	MOVL 8(CTX), c  // c = H2
   863  	MOVL 12(CTX), d // d = H3
   864  	MOVL 16(CTX), e // e = H4
   865  	MOVL 20(CTX), f // f = H5
   866  	MOVL 24(CTX), g // g = H6
   867  	MOVL 28(CTX), h // h = H7
   868  
   869  	JMP avx2_do_last_block
   870  
   871  done_hash:
   872  	VZEROUPPER
   873  	RET
   874  
   875  // shuffle byte order from LE to BE
   876  DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
   877  DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
   878  DATA flip_mask<>+0x10(SB)/8, $0x0405060700010203
   879  DATA flip_mask<>+0x18(SB)/8, $0x0c0d0e0f08090a0b
   880  GLOBL flip_mask<>(SB), 8, $32
   881  
   882  // shuffle xBxA -> 00BA
   883  DATA shuff_00BA<>+0x00(SB)/8, $0x0b0a090803020100
   884  DATA shuff_00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
   885  DATA shuff_00BA<>+0x10(SB)/8, $0x0b0a090803020100
   886  DATA shuff_00BA<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
   887  GLOBL shuff_00BA<>(SB), 8, $32
   888  
   889  // shuffle xDxC -> DC00
   890  DATA shuff_DC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
   891  DATA shuff_DC00<>+0x08(SB)/8, $0x0b0a090803020100
   892  DATA shuff_DC00<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
   893  DATA shuff_DC00<>+0x18(SB)/8, $0x0b0a090803020100
   894  GLOBL shuff_DC00<>(SB), 8, $32
   895  
   896  // Round specific constants
   897  DATA K256<>+0x00(SB)/4, $0x428a2f98 // k1
   898  DATA K256<>+0x04(SB)/4, $0x71374491 // k2
   899  DATA K256<>+0x08(SB)/4, $0xb5c0fbcf // k3
   900  DATA K256<>+0x0c(SB)/4, $0xe9b5dba5 // k4
   901  DATA K256<>+0x10(SB)/4, $0x428a2f98 // k1
   902  DATA K256<>+0x14(SB)/4, $0x71374491 // k2
   903  DATA K256<>+0x18(SB)/4, $0xb5c0fbcf // k3
   904  DATA K256<>+0x1c(SB)/4, $0xe9b5dba5 // k4
   905  
   906  DATA K256<>+0x20(SB)/4, $0x3956c25b // k5 - k8
   907  DATA K256<>+0x24(SB)/4, $0x59f111f1
   908  DATA K256<>+0x28(SB)/4, $0x923f82a4
   909  DATA K256<>+0x2c(SB)/4, $0xab1c5ed5
   910  DATA K256<>+0x30(SB)/4, $0x3956c25b
   911  DATA K256<>+0x34(SB)/4, $0x59f111f1
   912  DATA K256<>+0x38(SB)/4, $0x923f82a4
   913  DATA K256<>+0x3c(SB)/4, $0xab1c5ed5
   914  
   915  DATA K256<>+0x40(SB)/4, $0xd807aa98 // k9 - k12
   916  DATA K256<>+0x44(SB)/4, $0x12835b01
   917  DATA K256<>+0x48(SB)/4, $0x243185be
   918  DATA K256<>+0x4c(SB)/4, $0x550c7dc3
   919  DATA K256<>+0x50(SB)/4, $0xd807aa98
   920  DATA K256<>+0x54(SB)/4, $0x12835b01
   921  DATA K256<>+0x58(SB)/4, $0x243185be
   922  DATA K256<>+0x5c(SB)/4, $0x550c7dc3
   923  
   924  DATA K256<>+0x60(SB)/4, $0x72be5d74 // k13 - k16
   925  DATA K256<>+0x64(SB)/4, $0x80deb1fe
   926  DATA K256<>+0x68(SB)/4, $0x9bdc06a7
   927  DATA K256<>+0x6c(SB)/4, $0xc19bf174
   928  DATA K256<>+0x70(SB)/4, $0x72be5d74
   929  DATA K256<>+0x74(SB)/4, $0x80deb1fe
   930  DATA K256<>+0x78(SB)/4, $0x9bdc06a7
   931  DATA K256<>+0x7c(SB)/4, $0xc19bf174
   932  
   933  DATA K256<>+0x80(SB)/4, $0xe49b69c1 // k17 - k20
   934  DATA K256<>+0x84(SB)/4, $0xefbe4786
   935  DATA K256<>+0x88(SB)/4, $0x0fc19dc6
   936  DATA K256<>+0x8c(SB)/4, $0x240ca1cc
   937  DATA K256<>+0x90(SB)/4, $0xe49b69c1
   938  DATA K256<>+0x94(SB)/4, $0xefbe4786
   939  DATA K256<>+0x98(SB)/4, $0x0fc19dc6
   940  DATA K256<>+0x9c(SB)/4, $0x240ca1cc
   941  
   942  DATA K256<>+0xa0(SB)/4, $0x2de92c6f // k21 - k24
   943  DATA K256<>+0xa4(SB)/4, $0x4a7484aa
   944  DATA K256<>+0xa8(SB)/4, $0x5cb0a9dc
   945  DATA K256<>+0xac(SB)/4, $0x76f988da
   946  DATA K256<>+0xb0(SB)/4, $0x2de92c6f
   947  DATA K256<>+0xb4(SB)/4, $0x4a7484aa
   948  DATA K256<>+0xb8(SB)/4, $0x5cb0a9dc
   949  DATA K256<>+0xbc(SB)/4, $0x76f988da
   950  
   951  DATA K256<>+0xc0(SB)/4, $0x983e5152 // k25 - k28
   952  DATA K256<>+0xc4(SB)/4, $0xa831c66d
   953  DATA K256<>+0xc8(SB)/4, $0xb00327c8
   954  DATA K256<>+0xcc(SB)/4, $0xbf597fc7
   955  DATA K256<>+0xd0(SB)/4, $0x983e5152
   956  DATA K256<>+0xd4(SB)/4, $0xa831c66d
   957  DATA K256<>+0xd8(SB)/4, $0xb00327c8
   958  DATA K256<>+0xdc(SB)/4, $0xbf597fc7
   959  
   960  DATA K256<>+0xe0(SB)/4, $0xc6e00bf3 // k29 - k32
   961  DATA K256<>+0xe4(SB)/4, $0xd5a79147
   962  DATA K256<>+0xe8(SB)/4, $0x06ca6351
   963  DATA K256<>+0xec(SB)/4, $0x14292967
   964  DATA K256<>+0xf0(SB)/4, $0xc6e00bf3
   965  DATA K256<>+0xf4(SB)/4, $0xd5a79147
   966  DATA K256<>+0xf8(SB)/4, $0x06ca6351
   967  DATA K256<>+0xfc(SB)/4, $0x14292967
   968  
   969  DATA K256<>+0x100(SB)/4, $0x27b70a85
   970  DATA K256<>+0x104(SB)/4, $0x2e1b2138
   971  DATA K256<>+0x108(SB)/4, $0x4d2c6dfc
   972  DATA K256<>+0x10c(SB)/4, $0x53380d13
   973  DATA K256<>+0x110(SB)/4, $0x27b70a85
   974  DATA K256<>+0x114(SB)/4, $0x2e1b2138
   975  DATA K256<>+0x118(SB)/4, $0x4d2c6dfc
   976  DATA K256<>+0x11c(SB)/4, $0x53380d13
   977  
   978  DATA K256<>+0x120(SB)/4, $0x650a7354
   979  DATA K256<>+0x124(SB)/4, $0x766a0abb
   980  DATA K256<>+0x128(SB)/4, $0x81c2c92e
   981  DATA K256<>+0x12c(SB)/4, $0x92722c85
   982  DATA K256<>+0x130(SB)/4, $0x650a7354
   983  DATA K256<>+0x134(SB)/4, $0x766a0abb
   984  DATA K256<>+0x138(SB)/4, $0x81c2c92e
   985  DATA K256<>+0x13c(SB)/4, $0x92722c85
   986  
   987  DATA K256<>+0x140(SB)/4, $0xa2bfe8a1
   988  DATA K256<>+0x144(SB)/4, $0xa81a664b
   989  DATA K256<>+0x148(SB)/4, $0xc24b8b70
   990  DATA K256<>+0x14c(SB)/4, $0xc76c51a3
   991  DATA K256<>+0x150(SB)/4, $0xa2bfe8a1
   992  DATA K256<>+0x154(SB)/4, $0xa81a664b
   993  DATA K256<>+0x158(SB)/4, $0xc24b8b70
   994  DATA K256<>+0x15c(SB)/4, $0xc76c51a3
   995  
   996  DATA K256<>+0x160(SB)/4, $0xd192e819
   997  DATA K256<>+0x164(SB)/4, $0xd6990624
   998  DATA K256<>+0x168(SB)/4, $0xf40e3585
   999  DATA K256<>+0x16c(SB)/4, $0x106aa070
  1000  DATA K256<>+0x170(SB)/4, $0xd192e819
  1001  DATA K256<>+0x174(SB)/4, $0xd6990624
  1002  DATA K256<>+0x178(SB)/4, $0xf40e3585
  1003  DATA K256<>+0x17c(SB)/4, $0x106aa070
  1004  
  1005  DATA K256<>+0x180(SB)/4, $0x19a4c116
  1006  DATA K256<>+0x184(SB)/4, $0x1e376c08
  1007  DATA K256<>+0x188(SB)/4, $0x2748774c
  1008  DATA K256<>+0x18c(SB)/4, $0x34b0bcb5
  1009  DATA K256<>+0x190(SB)/4, $0x19a4c116
  1010  DATA K256<>+0x194(SB)/4, $0x1e376c08
  1011  DATA K256<>+0x198(SB)/4, $0x2748774c
  1012  DATA K256<>+0x19c(SB)/4, $0x34b0bcb5
  1013  
  1014  DATA K256<>+0x1a0(SB)/4, $0x391c0cb3
  1015  DATA K256<>+0x1a4(SB)/4, $0x4ed8aa4a
  1016  DATA K256<>+0x1a8(SB)/4, $0x5b9cca4f
  1017  DATA K256<>+0x1ac(SB)/4, $0x682e6ff3
  1018  DATA K256<>+0x1b0(SB)/4, $0x391c0cb3
  1019  DATA K256<>+0x1b4(SB)/4, $0x4ed8aa4a
  1020  DATA K256<>+0x1b8(SB)/4, $0x5b9cca4f
  1021  DATA K256<>+0x1bc(SB)/4, $0x682e6ff3
  1022  
  1023  DATA K256<>+0x1c0(SB)/4, $0x748f82ee
  1024  DATA K256<>+0x1c4(SB)/4, $0x78a5636f
  1025  DATA K256<>+0x1c8(SB)/4, $0x84c87814
  1026  DATA K256<>+0x1cc(SB)/4, $0x8cc70208
  1027  DATA K256<>+0x1d0(SB)/4, $0x748f82ee
  1028  DATA K256<>+0x1d4(SB)/4, $0x78a5636f
  1029  DATA K256<>+0x1d8(SB)/4, $0x84c87814
  1030  DATA K256<>+0x1dc(SB)/4, $0x8cc70208
  1031  
  1032  DATA K256<>+0x1e0(SB)/4, $0x90befffa
  1033  DATA K256<>+0x1e4(SB)/4, $0xa4506ceb
  1034  DATA K256<>+0x1e8(SB)/4, $0xbef9a3f7
  1035  DATA K256<>+0x1ec(SB)/4, $0xc67178f2
  1036  DATA K256<>+0x1f0(SB)/4, $0x90befffa
  1037  DATA K256<>+0x1f4(SB)/4, $0xa4506ceb
  1038  DATA K256<>+0x1f8(SB)/4, $0xbef9a3f7
  1039  DATA K256<>+0x1fc(SB)/4, $0xc67178f2
  1040  
  1041  GLOBL K256<>(SB), (NOPTR + RODATA), $512