github.com/mattn/go@v0.0.0-20171011075504-07f7db3ea99f/src/crypto/sha256/sha256block_amd64.s

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "textflag.h"
     6  
     7  // SHA256 block routine. See sha256block.go for Go equivalent.
     8  //
     9  // The algorithm is detailed in FIPS 180-4:
    10  //
    11  //  http://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
    12  
    13  // The AVX2 version is described in an Intel white paper:
    14  // "Fast SHA-256 Implementations on Intel Architecture Processors"
    15  // To find it, surf to http://www.intel.com/p/en_US/embedded
    16  // and search for that title.
    17  // AVX2 version by Intel, same algorithm as code in Linux kernel:
    18  // https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S
    19  // by
    20  //     James Guilford <james.guilford@intel.com>
    21  //     Kirk Yap <kirk.s.yap@intel.com>
    22  //     Tim Chen <tim.c.chen@linux.intel.com>
    23  
    24  // Wt = Mt; for 0 <= t <= 15
    25  // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
    26  //
    27  // a = H0
    28  // b = H1
    29  // c = H2
    30  // d = H3
    31  // e = H4
    32  // f = H5
    33  // g = H6
    34  // h = H7
    35  //
    36  // for t = 0 to 63 {
    37  //    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
    38  //    T2 = BIGSIGMA0(a) + Maj(a,b,c)
    39  //    h = g
    40  //    g = f
    41  //    f = e
    42  //    e = d + T1
    43  //    d = c
    44  //    c = b
    45  //    b = a
    46  //    a = T1 + T2
    47  // }
    48  //
    49  // H0 = a + H0
    50  // H1 = b + H1
    51  // H2 = c + H2
    52  // H3 = d + H3
    53  // H4 = e + H4
    54  // H5 = f + H5
    55  // H6 = g + H6
    56  // H7 = h + H7
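
        // For reference, a compact pure-Go sketch of the loop above (illustrative
        // only; see sha256block.go for the real generic implementation). It assumes
        // imports of "encoding/binary" and "math/bits" and a package-level
        // k [64]uint32 table holding the round constants:
        //
        //	func blockGeneric(dig *[8]uint32, p []byte) {
        //		var w [64]uint32
        //		for len(p) >= 64 {
        //			for t := 0; t < 16; t++ {
        //				w[t] = binary.BigEndian.Uint32(p[4*t:])
        //			}
        //			for t := 16; t < 64; t++ {
        //				s0 := bits.RotateLeft32(w[t-15], -7) ^ bits.RotateLeft32(w[t-15], -18) ^ (w[t-15] >> 3)
        //				s1 := bits.RotateLeft32(w[t-2], -17) ^ bits.RotateLeft32(w[t-2], -19) ^ (w[t-2] >> 10)
        //				w[t] = s1 + w[t-7] + s0 + w[t-16]
        //			}
        //			a, b, c, d, e, f, g, h := dig[0], dig[1], dig[2], dig[3], dig[4], dig[5], dig[6], dig[7]
        //			for t := 0; t < 64; t++ {
        //				e1 := bits.RotateLeft32(e, -6) ^ bits.RotateLeft32(e, -11) ^ bits.RotateLeft32(e, -25)
        //				ch := (e & f) ^ (^e & g)
        //				t1 := h + e1 + ch + k[t] + w[t]
        //				a0 := bits.RotateLeft32(a, -2) ^ bits.RotateLeft32(a, -13) ^ bits.RotateLeft32(a, -22)
        //				maj := (a & b) ^ (a & c) ^ (b & c)
        //				t2 := a0 + maj
        //				h, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2
        //			}
        //			dig[0] += a
        //			dig[1] += b
        //			dig[2] += c
        //			dig[3] += d
        //			dig[4] += e
        //			dig[5] += f
        //			dig[6] += g
        //			dig[7] += h
        //			p = p[64:]
        //		}
        //	}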
    57  
    58  // Wt = Mt; for 0 <= t <= 15
    59  #define MSGSCHEDULE0(index) \
    60  	MOVL	(index*4)(SI), AX; \
    61  	BSWAPL	AX; \
    62  	MOVL	AX, (index*4)(BP)
    63  
    64  // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
    65  //   SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x)
    66  //   SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x)
    67  #define MSGSCHEDULE1(index) \
    68  	MOVL	((index-2)*4)(BP), AX; \
    69  	MOVL	AX, CX; \
    70  	RORL	$17, AX; \
    71  	MOVL	CX, DX; \
    72  	RORL	$19, CX; \
    73  	SHRL	$10, DX; \
    74  	MOVL	((index-15)*4)(BP), BX; \
    75  	XORL	CX, AX; \
    76  	MOVL	BX, CX; \
    77  	XORL	DX, AX; \
    78  	RORL	$7, BX; \
    79  	MOVL	CX, DX; \
    80  	SHRL	$3, DX; \
    81  	RORL	$18, CX; \
    82  	ADDL	((index-7)*4)(BP), AX; \
    83  	XORL	CX, BX; \
    84  	XORL	DX, BX; \
    85  	ADDL	((index-16)*4)(BP), BX; \
    86  	ADDL	BX, AX; \
    87  	MOVL	AX, ((index)*4)(BP)
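
        // In Go terms (a sketch only, assuming "math/bits" and a w [64]uint32
        // message schedule), MSGSCHEDULE1 computes:
        //
        //	s0 := bits.RotateLeft32(w[t-15], -7) ^ bits.RotateLeft32(w[t-15], -18) ^ (w[t-15] >> 3)
        //	s1 := bits.RotateLeft32(w[t-2], -17) ^ bits.RotateLeft32(w[t-2], -19) ^ (w[t-2] >> 10)
        //	w[t] = s1 + w[t-7] + s0 + w[t-16]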
    88  
    89  // Calculate T1 in AX - uses AX, CX and DX registers.
    90  // h is also used as an accumulator. Wt is passed in AX.
    91  //   T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
    92  //     BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x)
    93  //     Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
    94  #define SHA256T1(const, e, f, g, h) \
    95  	ADDL	AX, h; \
    96  	MOVL	e, AX; \
    97  	ADDL	$const, h; \
    98  	MOVL	e, CX; \
    99  	RORL	$6, AX; \
   100  	MOVL	e, DX; \
   101  	RORL	$11, CX; \
   102  	XORL	CX, AX; \
   103  	MOVL	e, CX; \
   104  	RORL	$25, DX; \
   105  	ANDL	f, CX; \
   106  	XORL	AX, DX; \
   107  	MOVL	e, AX; \
   108  	NOTL	AX; \
   109  	ADDL	DX, h; \
   110  	ANDL	g, AX; \
   111  	XORL	CX, AX; \
   112  	ADDL	h, AX
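
        // Equivalent Go for one T1 (a sketch; e, f, g, h are the uint32 working
        // variables, kt and wt the round constant and scheduled word for this round):
        //
        //	sigma1 := bits.RotateLeft32(e, -6) ^ bits.RotateLeft32(e, -11) ^ bits.RotateLeft32(e, -25)
        //	ch := (e & f) ^ (^e & g)
        //	t1 := h + sigma1 + ch + kt + wt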
   113  
   114  // Calculate T2 in BX - uses BX, CX, DX and DI registers.
   115  //   T2 = BIGSIGMA0(a) + Maj(a, b, c)
   116  //     BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x)
   117  //     Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
   118  #define SHA256T2(a, b, c) \
   119  	MOVL	a, DI; \
   120  	MOVL	c, BX; \
   121  	RORL	$2, DI; \
   122  	MOVL	a, DX; \
   123  	ANDL	b, BX; \
   124  	RORL	$13, DX; \
   125  	MOVL	a, CX; \
   126  	ANDL	c, CX; \
   127  	XORL	DX, DI; \
   128  	XORL	CX, BX; \
   129  	MOVL	a, DX; \
   130  	MOVL	b, CX; \
   131  	RORL	$22, DX; \
   132  	ANDL	a, CX; \
   133  	XORL	CX, BX; \
   134  	XORL	DX, DI; \
   135  	ADDL	DI, BX
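
        // Equivalent Go for T2 (a sketch):
        //
        //	sigma0 := bits.RotateLeft32(a, -2) ^ bits.RotateLeft32(a, -13) ^ bits.RotateLeft32(a, -22)
        //	maj := (a & b) ^ (a & c) ^ (b & c)
        //	t2 := sigma0 + maj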
   136  
   137  // Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
   138  // The values for e and a are stored in d and h, ready for rotation.
   139  #define SHA256ROUND(index, const, a, b, c, d, e, f, g, h) \
   140  	SHA256T1(const, e, f, g, h); \
   141  	SHA256T2(a, b, c); \
   142  	MOVL	BX, h; \
   143  	ADDL	AX, d; \
   144  	ADDL	AX, h
   145  
   146  #define SHA256ROUND0(index, const, a, b, c, d, e, f, g, h) \
   147  	MSGSCHEDULE0(index); \
   148  	SHA256ROUND(index, const, a, b, c, d, e, f, g, h)
   149  
   150  #define SHA256ROUND1(index, const, a, b, c, d, e, f, g, h) \
   151  	MSGSCHEDULE1(index); \
   152  	SHA256ROUND(index, const, a, b, c, d, e, f, g, h)
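
        // After SHA256ROUND the register passed as d holds the new e (d + T1) and the
        // register passed as h holds the new a (T1 + T2); the variable "rotation" of
        // the round loop is achieved by permuting the macro arguments at each call
        // site rather than by moving values. In Go terms (a sketch):
        //
        //	h, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2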
   153  
   154  
   155  // Definitions for AVX2 version
   156  
   157  // addm(mem, reg):
   158  // add reg into mem and load the sum back into reg (mem += reg; reg = mem)
   159  #define addm(P1, P2) \
   160  	ADDL P2, P1; \
   161  	MOVL P1, P2
   162  
   163  #define XDWORD0 Y4
   164  #define XDWORD1 Y5
   165  #define XDWORD2 Y6
   166  #define XDWORD3 Y7
   167  
   168  #define XWORD0 X4
   169  #define XWORD1 X5
   170  #define XWORD2 X6
   171  #define XWORD3 X7
   172  
   173  #define XTMP0 Y0
   174  #define XTMP1 Y1
   175  #define XTMP2 Y2
   176  #define XTMP3 Y3
   177  #define XTMP4 Y8
   178  #define XTMP5 Y11
   179  
   180  #define XFER  Y9
   181  
   182  #define BYTE_FLIP_MASK 	Y13 // mask to convert LE -> BE
   183  #define X_BYTE_FLIP_MASK X13
   184  
   185  #define NUM_BYTES DX
   186  #define INP	DI
   187  
   188  #define CTX SI // Beginning of digest in memory (a, b, c, ... , h)
   189  
   190  #define a AX
   191  #define b BX
   192  #define c CX
   193  #define d R8
   194  #define e DX
   195  #define f R9
   196  #define g R10
   197  #define h R11
   198  
   199  #define old_h R11
   200  
   201  #define TBL BP
   202  
   203  #define SRND SI // SRND is same register as CTX
   204  
   205  #define T1 R12
   206  
   207  #define y0 R13
   208  #define y1 R14
   209  #define y2 R15
   210  #define y3 DI
   211  
   212  // Offsets
   213  #define XFER_SIZE 2*64*4
   214  #define INP_END_SIZE 8
   215  #define INP_SIZE 8
   216  
   217  #define _XFER 0
   218  #define _INP_END _XFER + XFER_SIZE
   219  #define _INP _INP_END + INP_END_SIZE
   220  #define STACK_SIZE _INP + INP_SIZE
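
        // With the sizes above, the stack area used by the AVX2 code is laid out as:
        //
        //	_XFER      =   0 ... 511  (2*64*4 bytes of K[t]+W[t] transfer slots)
        //	_INP_END   = 512 ... 519  (pointer to the start of the last block)
        //	_INP       = 520 ... 527  (current input pointer)
        //	STACK_SIZE = 528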
   221  
   222  #define ROUND_AND_SCHED_N_0(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   223  	;                                     \ // #############################  RND N + 0 ############################//
   224  	MOVL     a, y3;                       \ // y3 = a					// MAJA
   225  	RORXL    $25, e, y0;                  \ // y0 = e >> 25				// S1A
   226  	RORXL    $11, e, y1;                  \ // y1 = e >> 11				// S1B
   227  	;                                     \
   228  	ADDL     (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h   (the stack slot at disp holds K[t]+W[t])
   229  	ORL      c, y3;                       \ // y3 = a|c				// MAJA
   230  	VPALIGNR $4, XDWORD2, XDWORD3, XTMP0; \ // XTMP0 = W[-7]
   231  	MOVL     f, y2;                       \ // y2 = f				// CH
   232  	RORXL    $13, a, T1;                  \ // T1 = a >> 13			// S0B
   233  	;                                     \
   234  	XORL     y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)					// S1
   235  	XORL     g, y2;                       \ // y2 = f^g                              	// CH
   236  	VPADDD   XDWORD0, XTMP0, XTMP0;       \ // XTMP0 = W[-7] + W[-16]
   237  	RORXL    $6, e, y1;                   \ // y1 = (e >> 6)						// S1
   238  	;                                     \
   239  	ANDL     e, y2;                       \ // y2 = (f^g)&e                         // CH
   240  	XORL     y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
   241  	RORXL    $22, a, y1;                  \ // y1 = a >> 22							// S0A
   242  	ADDL     h, d;                        \ // d = k + w + h + d                     	// --
   243  	;                                     \
   244  	ANDL     b, y3;                       \ // y3 = (a|c)&b							// MAJA
   245  	VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15]
   246  	XORL     T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
   247  	RORXL    $2, a, T1;                   \ // T1 = (a >> 2)						// S0
   248  	;                                     \
   249  	XORL     g, y2;                       \ // y2 = CH = ((f^g)&e)^g				// CH
   250  	VPSRLD   $7, XTMP1, XTMP2;            \
   251  	XORL     T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   252  	MOVL     a, T1;                       \ // T1 = a								// MAJB
   253  	ANDL     c, T1;                       \ // T1 = a&c								// MAJB
   254  	;                                     \
   255  	ADDL     y0, y2;                      \ // y2 = S1 + CH							// --
   256  	VPSLLD   $(32-7), XTMP1, XTMP3;       \
   257  	ORL      T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
   258  	ADDL     y1, h;                       \ // h = k + w + h + S0					// --
   259  	;                                     \
   260  	ADDL     y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
   261  	VPOR     XTMP2, XTMP3, XTMP3;         \ // XTMP3 = W[-15] ror 7
   262  	;                                     \
   263  	VPSRLD   $18, XTMP1, XTMP2;           \
   264  	ADDL     y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   265  	ADDL     y3, h                        // h = t1 + S0 + MAJ                     // --
   266  
   267  #define ROUND_AND_SCHED_N_1(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   268  	;                                    \ // ################################### RND N + 1 ############################
   269  	;                                    \
   270  	MOVL    a, y3;                       \ // y3 = a                       // MAJA
   271  	RORXL   $25, e, y0;                  \ // y0 = e >> 25					// S1A
   272  	RORXL   $11, e, y1;                  \ // y1 = e >> 11					// S1B
   273  	ADDL    (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h         		// --
   274  	ORL     c, y3;                       \ // y3 = a|c						// MAJA
   275  	;                                    \
   276  	VPSRLD  $3, XTMP1, XTMP4;            \ // XTMP4 = W[-15] >> 3
   277  	MOVL    f, y2;                       \ // y2 = f						// CH
   278  	RORXL   $13, a, T1;                  \ // T1 = a >> 13					// S0B
   279  	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)		// S1
   280  	XORL    g, y2;                       \ // y2 = f^g						// CH
   281  	;                                    \
   282  	RORXL   $6, e, y1;                   \ // y1 = (e >> 6)				// S1
   283  	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
   284  	RORXL   $22, a, y1;                  \ // y1 = a >> 22						// S0A
   285  	ANDL    e, y2;                       \ // y2 = (f^g)&e						// CH
   286  	ADDL    h, d;                        \ // d = k + w + h + d				// --
   287  	;                                    \
   288  	VPSLLD  $(32-18), XTMP1, XTMP1;      \
   289  	ANDL    b, y3;                       \ // y3 = (a|c)&b					// MAJA
   290  	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)		// S0
   291  	;                                    \
   292  	VPXOR   XTMP1, XTMP3, XTMP3;         \
   293  	RORXL   $2, a, T1;                   \ // T1 = (a >> 2)				// S0
   294  	XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g		// CH
   295  	;                                    \
   296  	VPXOR   XTMP2, XTMP3, XTMP3;         \ // XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
   297  	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   298  	MOVL    a, T1;                       \ // T1 = a						// MAJB
   299  	ANDL    c, T1;                       \ // T1 = a&c						// MAJB
   300  	ADDL    y0, y2;                      \ // y2 = S1 + CH					// --
   301  	;                                    \
   302  	VPXOR   XTMP4, XTMP3, XTMP1;         \ // XTMP1 = s0
   303  	VPSHUFD $0xFA, XDWORD3, XTMP2;       \ // XTMP2 = W[-2] {BBAA}
   304  	ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)            // MAJ
   305  	ADDL    y1, h;                       \ // h = k + w + h + S0                    // --
   306  	;                                    \
   307  	VPADDD  XTMP1, XTMP0, XTMP0;         \ // XTMP0 = W[-16] + W[-7] + s0
   308  	ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
   309  	ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   310  	ADDL    y3, h;                       \ // h = t1 + S0 + MAJ                     // --
   311  	;                                    \
   312  	VPSRLD  $10, XTMP2, XTMP4            // XTMP4 = W[-2] >> 10 {BBAA}
   313  
   314  #define ROUND_AND_SCHED_N_2(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   315  	;                                    \ // ################################### RND N + 2 ############################
   316  	;                                    \
   317  	MOVL    a, y3;                       \ // y3 = a							// MAJA
   318  	RORXL   $25, e, y0;                  \ // y0 = e >> 25						// S1A
   319  	ADDL    (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h        			// --
   320  	;                                    \
   321  	VPSRLQ  $19, XTMP2, XTMP3;           \ // XTMP3 = W[-2] ror 19 {xBxA}
   322  	RORXL   $11, e, y1;                  \ // y1 = e >> 11						// S1B
   323  	ORL     c, y3;                       \ // y3 = a|c                         // MAJA
   324  	MOVL    f, y2;                       \ // y2 = f                           // CH
   325  	XORL    g, y2;                       \ // y2 = f^g                         // CH
   326  	;                                    \
   327  	RORXL   $13, a, T1;                  \ // T1 = a >> 13						// S0B
   328  	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)			// S1
   329  	VPSRLQ  $17, XTMP2, XTMP2;           \ // XTMP2 = W[-2] ror 17 {xBxA}
   330  	ANDL    e, y2;                       \ // y2 = (f^g)&e						// CH
   331  	;                                    \
   332  	RORXL   $6, e, y1;                   \ // y1 = (e >> 6)					// S1
   333  	VPXOR   XTMP3, XTMP2, XTMP2;         \
   334  	ADDL    h, d;                        \ // d = k + w + h + d				// --
   335  	ANDL    b, y3;                       \ // y3 = (a|c)&b						// MAJA
   336  	;                                    \
   337  	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
   338  	RORXL   $22, a, y1;                  \ // y1 = a >> 22						// S0A
   339  	VPXOR   XTMP2, XTMP4, XTMP4;         \ // XTMP4 = s1 {xBxA}
   340  	XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g			// CH
   341  	;                                    \
   342  	VPSHUFB shuff_00BA<>(SB), XTMP4, XTMP4;\ // XTMP4 = s1 {00BA}
   343  	;                                    \
   344  	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)		// S0
   345  	RORXL   $2, a, T1;                   \ // T1 = (a >> 2)				// S0
   346  	VPADDD  XTMP4, XTMP0, XTMP0;         \ // XTMP0 = {..., ..., W[1], W[0]}
   347  	;                                    \
   348  	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)	// S0
   349  	MOVL    a, T1;                       \ // T1 = a                                // MAJB
   350  	ANDL    c, T1;                       \ // T1 = a&c                              // MAJB
   351  	ADDL    y0, y2;                      \ // y2 = S1 + CH                          // --
   352  	VPSHUFD $80, XTMP0, XTMP2;           \ // XTMP2 = W[-2] {DDCC}
   353  	;                                    \
   354  	ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)            // MAJ
   355  	ADDL    y1, h;                       \ // h = k + w + h + S0                    // --
   356  	ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
   357  	ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   358  	;                                    \
   359  	ADDL    y3, h                        // h = t1 + S0 + MAJ                     // --
   360  
   361  #define ROUND_AND_SCHED_N_3(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   362  	;                                    \ // ################################### RND N + 3 ############################
   363  	;                                    \
   364  	MOVL    a, y3;                       \ // y3 = a						// MAJA
   365  	RORXL   $25, e, y0;                  \ // y0 = e >> 25					// S1A
   366  	RORXL   $11, e, y1;                  \ // y1 = e >> 11					// S1B
   367  	ADDL    (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h				// --
   368  	ORL     c, y3;                       \ // y3 = a|c                     // MAJA
   369  	;                                    \
   370  	VPSRLD  $10, XTMP2, XTMP5;           \ // XTMP5 = W[-2] >> 10 {DDCC}
   371  	MOVL    f, y2;                       \ // y2 = f						// CH
   372  	RORXL   $13, a, T1;                  \ // T1 = a >> 13					// S0B
   373  	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)		// S1
   374  	XORL    g, y2;                       \ // y2 = f^g						// CH
   375  	;                                    \
   376  	VPSRLQ  $19, XTMP2, XTMP3;           \ // XTMP3 = W[-2] ror 19 {xDxC}
   377  	RORXL   $6, e, y1;                   \ // y1 = (e >> 6)				// S1
   378  	ANDL    e, y2;                       \ // y2 = (f^g)&e					// CH
   379  	ADDL    h, d;                        \ // d = k + w + h + d			// --
   380  	ANDL    b, y3;                       \ // y3 = (a|c)&b					// MAJA
   381  	;                                    \
   382  	VPSRLQ  $17, XTMP2, XTMP2;           \ // XTMP2 = W[-2] ror 17 {xDxC}
   383  	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
   384  	XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g			// CH
   385  	;                                    \
   386  	VPXOR   XTMP3, XTMP2, XTMP2;         \
   387  	RORXL   $22, a, y1;                  \ // y1 = a >> 22					// S0A
   388  	ADDL    y0, y2;                      \ // y2 = S1 + CH					// --
   389  	;                                    \
   390  	VPXOR   XTMP2, XTMP5, XTMP5;         \ // XTMP5 = s1 {xDxC}
   391  	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)		// S0
   392  	ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
   393  	;                                    \
   394  	RORXL   $2, a, T1;                   \ // T1 = (a >> 2)				// S0
   395  	;                                    \
   396  	VPSHUFB shuff_DC00<>(SB), XTMP5, XTMP5;\ // XTMP5 = s1 {DC00}
   397  	;                                    \
   398  	VPADDD  XTMP0, XTMP5, XDWORD0;       \ // XDWORD0 = {W[3], W[2], W[1], W[0]}
   399  	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)	// S0
   400  	MOVL    a, T1;                       \ // T1 = a							// MAJB
   401  	ANDL    c, T1;                       \ // T1 = a&c							// MAJB
   402  	ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)		// MAJ
   403  	;                                    \
   404  	ADDL    y1, h;                       \ // h = k + w + h + S0				// --
   405  	ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   406  	ADDL    y3, h                        // h = t1 + S0 + MAJ				// --
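
        // Together, ROUND_AND_SCHED_N_0..3 execute four rounds while producing the
        // next four message-schedule words (the new XDWORD0), with the two 128-bit
        // lanes of each YMM register carrying the schedules of two separate blocks.
        // The schedule part in scalar Go terms (a sketch; sigma0/sigma1 stand for the
        // small sigma functions described above MSGSCHEDULE1):
        //
        //	for i := 0; i < 4; i++ {
        //		w[t+i] = sigma1(w[t+i-2]) + w[t+i-7] + sigma0(w[t+i-15]) + w[t+i-16]
        //	}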
   407  
   408  #define DO_ROUND_N_0(disp, a, b, c, d, e, f, g, h, old_h) \
   409  	;                                  \ // ################################### RND N + 0 ###########################
   410  	MOVL  f, y2;                       \ // y2 = f					// CH
   411  	RORXL $25, e, y0;                  \ // y0 = e >> 25				// S1A
   412  	RORXL $11, e, y1;                  \ // y1 = e >> 11				// S1B
   413  	XORL  g, y2;                       \ // y2 = f^g					// CH
   414  	;                                  \
   415  	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)	// S1
   416  	RORXL $6, e, y1;                   \ // y1 = (e >> 6)			// S1
   417  	ANDL  e, y2;                       \ // y2 = (f^g)&e				// CH
   418  	;                                  \
   419  	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
   420  	RORXL $13, a, T1;                  \ // T1 = a >> 13						// S0B
   421  	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g			// CH
   422  	RORXL $22, a, y1;                  \ // y1 = a >> 22						// S0A
   423  	MOVL  a, y3;                       \ // y3 = a							// MAJA
   424  	;                                  \
   425  	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)			// S0
   426  	RORXL $2, a, T1;                   \ // T1 = (a >> 2)					// S0
   427  	ADDL  (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // --
   428  	ORL   c, y3;                       \ // y3 = a|c							// MAJA
   429  	;                                  \
   430  	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)	// S0
   431  	MOVL  a, T1;                       \ // T1 = a							// MAJB
   432  	ANDL  b, y3;                       \ // y3 = (a|c)&b						// MAJA
   433  	ANDL  c, T1;                       \ // T1 = a&c							// MAJB
   434  	ADDL  y0, y2;                      \ // y2 = S1 + CH						// --
   435  	;                                  \
   436  	ADDL  h, d;                        \ // d = k + w + h + d					// --
   437  	ORL   T1, y3;                       \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
   438  	ADDL  y1, h;                       \ // h = k + w + h + S0					// --
   439  	ADDL  y2, d                        // d = k + w + h + d + S1 + CH = d + t1	// --
   440  
   441  #define DO_ROUND_N_1(disp, a, b, c, d, e, f, g, h, old_h) \
   442  	;                                  \ // ################################### RND N + 1 ###########################
   443  	ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
   444  	MOVL  f, y2;                       \ // y2 = f                                // CH
   445  	RORXL $25, e, y0;                  \ // y0 = e >> 25				// S1A
   446  	RORXL $11, e, y1;                  \ // y1 = e >> 11				// S1B
   447  	XORL  g, y2;                       \ // y2 = f^g                             // CH
   448  	;                                  \
   449  	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)				// S1
   450  	RORXL $6, e, y1;                   \ // y1 = (e >> 6)						// S1
   451  	ANDL  e, y2;                       \ // y2 = (f^g)&e                         // CH
   452  	ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ                    // --
   453  	;                                  \
   454  	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
   455  	RORXL $13, a, T1;                  \ // T1 = a >> 13							// S0B
   456  	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g                // CH
   457  	RORXL $22, a, y1;                  \ // y1 = a >> 22							// S0A
   458  	MOVL  a, y3;                       \ // y3 = a                               // MAJA
   459  	;                                  \
   460  	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
   461  	RORXL $2, a, T1;                   \ // T1 = (a >> 2)						// S0
   462  	ADDL  (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
   463  	ORL   c, y3;                       \ // y3 = a|c                             // MAJA
   464  	;                                  \
   465  	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   466  	MOVL  a, T1;                       \ // T1 = a                               // MAJB
   467  	ANDL  b, y3;                       \ // y3 = (a|c)&b                         // MAJA
   468  	ANDL  c, T1;                       \ // T1 = a&c                             // MAJB
   469  	ADDL  y0, y2;                      \ // y2 = S1 + CH                         // --
   470  	;                                  \
   471  	ADDL  h, d;                        \ // d = k + w + h + d                    // --
   472  	ORL   T1, y3;                       \ // y3 = MAJ = ((a|c)&b)|(a&c)            // MAJ
   473  	ADDL  y1, h;                       \ // h = k + w + h + S0                   // --
   474  	;                                  \
   475  	ADDL  y2, d                        // d = k + w + h + d + S1 + CH = d + t1 // --
   476  
   477  #define DO_ROUND_N_2(disp, a, b, c, d, e, f, g, h, old_h) \
   478  	;                                  \ // ################################### RND N + 2 ##############################
   479  	ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   480  	MOVL  f, y2;                       \ // y2 = f								// CH
   481  	RORXL $25, e, y0;                  \ // y0 = e >> 25							// S1A
   482  	RORXL $11, e, y1;                  \ // y1 = e >> 11							// S1B
   483  	XORL  g, y2;                       \ // y2 = f^g								// CH
   484  	;                                  \
   485  	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)				// S1
   486  	RORXL $6, e, y1;                   \ // y1 = (e >> 6)						// S1
   487  	ANDL  e, y2;                       \ // y2 = (f^g)&e							// CH
   488  	ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ					// --
   489  	;                                  \
   490  	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
   491  	RORXL $13, a, T1;                  \ // T1 = a >> 13							// S0B
   492  	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g                // CH
   493  	RORXL $22, a, y1;                  \ // y1 = a >> 22							// S0A
   494  	MOVL  a, y3;                       \ // y3 = a								// MAJA
   495  	;                                  \
   496  	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
   497  	RORXL $2, a, T1;                   \ // T1 = (a >> 2)						// S0
   498  	ADDL  (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h 	// --
   499  	ORL   c, y3;                       \ // y3 = a|c								// MAJA
   500  	;                                  \
   501  	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   502  	MOVL  a, T1;                       \ // T1 = a								// MAJB
   503  	ANDL  b, y3;                       \ // y3 = (a|c)&b							// MAJA
   504  	ANDL  c, T1;                       \ // T1 = a&c								// MAJB
   505  	ADDL  y0, y2;                      \ // y2 = S1 + CH							// --
   506  	;                                  \
   507  	ADDL  h, d;                        \ // d = k + w + h + d					// --
   508  	ORL   T1, y3;                       \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
   509  	ADDL  y1, h;                       \ // h = k + w + h + S0					// --
   510  	;                                  \
   511  	ADDL  y2, d                        // d = k + w + h + d + S1 + CH = d + t1 // --
   512  
   513  #define DO_ROUND_N_3(disp, a, b, c, d, e, f, g, h, old_h) \
   514  	;                                  \ // ################################### RND N + 3 ###########################
   515  	ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   516  	MOVL  f, y2;                       \ // y2 = f								// CH
   517  	RORXL $25, e, y0;                  \ // y0 = e >> 25							// S1A
   518  	RORXL $11, e, y1;                  \ // y1 = e >> 11							// S1B
   519  	XORL  g, y2;                       \ // y2 = f^g								// CH
   520  	;                                  \
   521  	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)				// S1
   522  	RORXL $6, e, y1;                   \ // y1 = (e >> 6)						// S1
   523  	ANDL  e, y2;                       \ // y2 = (f^g)&e							// CH
   524  	ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ					// --
   525  	;                                  \
   526  	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
   527  	RORXL $13, a, T1;                  \ // T1 = a >> 13							// S0B
   528  	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g				// CH
   529  	RORXL $22, a, y1;                  \ // y1 = a >> 22							// S0A
   530  	MOVL  a, y3;                       \ // y3 = a								// MAJA
   531  	;                                  \
   532  	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
   533  	RORXL $2, a, T1;                   \ // T1 = (a >> 2)						// S0
   534  	ADDL  (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h 	// --
   535  	ORL   c, y3;                       \ // y3 = a|c								// MAJA
   536  	;                                  \
   537  	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   538  	MOVL  a, T1;                       \ // T1 = a								// MAJB
   539  	ANDL  b, y3;                       \ // y3 = (a|c)&b							// MAJA
   540  	ANDL  c, T1;                       \ // T1 = a&c								// MAJB
   541  	ADDL  y0, y2;                      \ // y2 = S1 + CH							// --
   542  	;                                  \
   543  	ADDL  h, d;                        \ // d = k + w + h + d					// --
   544  	ORL   T1, y3;                       \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
   545  	ADDL  y1, h;                       \ // h = k + w + h + S0					// --
   546  	;                                  \
   547  	ADDL  y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1	// --
   548  	;                                  \
   549  	ADDL  y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   550  	;                                  \
   551  	ADDL  y3, h                        // h = t1 + S0 + MAJ					// --
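
        // DO_ROUND_N_0..3 are the same four rounds with the message scheduling
        // removed; they serve rounds 48-63 (avx2_loop2) and the full 64-round replay
        // of the second block from the stored K+W values (avx2_loop3).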
   552  
   553  TEXT ·block(SB), 0, $536-32
   554  	CMPB ·useAVX2(SB), $1
   555  	JE   avx2
   556  
   557  	MOVQ p_base+8(FP), SI
   558  	MOVQ p_len+16(FP), DX
   559  	SHRQ $6, DX
   560  	SHLQ $6, DX
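        	// DX = p_len &^ 63: the length rounded down to a whole number of
        	// 64-byte blocks (e.g. 100 bytes of input gives DX = 64).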
   561  
   562  	LEAQ (SI)(DX*1), DI
   563  	MOVQ DI, 256(SP)
   564  	CMPQ SI, DI
   565  	JEQ  end
   566  
   567  	MOVQ dig+0(FP), BP
   568  	MOVL (0*4)(BP), R8  // a = H0
   569  	MOVL (1*4)(BP), R9  // b = H1
   570  	MOVL (2*4)(BP), R10 // c = H2
   571  	MOVL (3*4)(BP), R11 // d = H3
   572  	MOVL (4*4)(BP), R12 // e = H4
   573  	MOVL (5*4)(BP), R13 // f = H5
   574  	MOVL (6*4)(BP), R14 // g = H6
   575  	MOVL (7*4)(BP), R15 // h = H7
   576  
   577  loop:
   578  	MOVQ SP, BP
   579  
   580  	SHA256ROUND0(0, 0x428a2f98, R8, R9, R10, R11, R12, R13, R14, R15)
   581  	SHA256ROUND0(1, 0x71374491, R15, R8, R9, R10, R11, R12, R13, R14)
   582  	SHA256ROUND0(2, 0xb5c0fbcf, R14, R15, R8, R9, R10, R11, R12, R13)
   583  	SHA256ROUND0(3, 0xe9b5dba5, R13, R14, R15, R8, R9, R10, R11, R12)
   584  	SHA256ROUND0(4, 0x3956c25b, R12, R13, R14, R15, R8, R9, R10, R11)
   585  	SHA256ROUND0(5, 0x59f111f1, R11, R12, R13, R14, R15, R8, R9, R10)
   586  	SHA256ROUND0(6, 0x923f82a4, R10, R11, R12, R13, R14, R15, R8, R9)
   587  	SHA256ROUND0(7, 0xab1c5ed5, R9, R10, R11, R12, R13, R14, R15, R8)
   588  	SHA256ROUND0(8, 0xd807aa98, R8, R9, R10, R11, R12, R13, R14, R15)
   589  	SHA256ROUND0(9, 0x12835b01, R15, R8, R9, R10, R11, R12, R13, R14)
   590  	SHA256ROUND0(10, 0x243185be, R14, R15, R8, R9, R10, R11, R12, R13)
   591  	SHA256ROUND0(11, 0x550c7dc3, R13, R14, R15, R8, R9, R10, R11, R12)
   592  	SHA256ROUND0(12, 0x72be5d74, R12, R13, R14, R15, R8, R9, R10, R11)
   593  	SHA256ROUND0(13, 0x80deb1fe, R11, R12, R13, R14, R15, R8, R9, R10)
   594  	SHA256ROUND0(14, 0x9bdc06a7, R10, R11, R12, R13, R14, R15, R8, R9)
   595  	SHA256ROUND0(15, 0xc19bf174, R9, R10, R11, R12, R13, R14, R15, R8)
   596  
   597  	SHA256ROUND1(16, 0xe49b69c1, R8, R9, R10, R11, R12, R13, R14, R15)
   598  	SHA256ROUND1(17, 0xefbe4786, R15, R8, R9, R10, R11, R12, R13, R14)
   599  	SHA256ROUND1(18, 0x0fc19dc6, R14, R15, R8, R9, R10, R11, R12, R13)
   600  	SHA256ROUND1(19, 0x240ca1cc, R13, R14, R15, R8, R9, R10, R11, R12)
   601  	SHA256ROUND1(20, 0x2de92c6f, R12, R13, R14, R15, R8, R9, R10, R11)
   602  	SHA256ROUND1(21, 0x4a7484aa, R11, R12, R13, R14, R15, R8, R9, R10)
   603  	SHA256ROUND1(22, 0x5cb0a9dc, R10, R11, R12, R13, R14, R15, R8, R9)
   604  	SHA256ROUND1(23, 0x76f988da, R9, R10, R11, R12, R13, R14, R15, R8)
   605  	SHA256ROUND1(24, 0x983e5152, R8, R9, R10, R11, R12, R13, R14, R15)
   606  	SHA256ROUND1(25, 0xa831c66d, R15, R8, R9, R10, R11, R12, R13, R14)
   607  	SHA256ROUND1(26, 0xb00327c8, R14, R15, R8, R9, R10, R11, R12, R13)
   608  	SHA256ROUND1(27, 0xbf597fc7, R13, R14, R15, R8, R9, R10, R11, R12)
   609  	SHA256ROUND1(28, 0xc6e00bf3, R12, R13, R14, R15, R8, R9, R10, R11)
   610  	SHA256ROUND1(29, 0xd5a79147, R11, R12, R13, R14, R15, R8, R9, R10)
   611  	SHA256ROUND1(30, 0x06ca6351, R10, R11, R12, R13, R14, R15, R8, R9)
   612  	SHA256ROUND1(31, 0x14292967, R9, R10, R11, R12, R13, R14, R15, R8)
   613  	SHA256ROUND1(32, 0x27b70a85, R8, R9, R10, R11, R12, R13, R14, R15)
   614  	SHA256ROUND1(33, 0x2e1b2138, R15, R8, R9, R10, R11, R12, R13, R14)
   615  	SHA256ROUND1(34, 0x4d2c6dfc, R14, R15, R8, R9, R10, R11, R12, R13)
   616  	SHA256ROUND1(35, 0x53380d13, R13, R14, R15, R8, R9, R10, R11, R12)
   617  	SHA256ROUND1(36, 0x650a7354, R12, R13, R14, R15, R8, R9, R10, R11)
   618  	SHA256ROUND1(37, 0x766a0abb, R11, R12, R13, R14, R15, R8, R9, R10)
   619  	SHA256ROUND1(38, 0x81c2c92e, R10, R11, R12, R13, R14, R15, R8, R9)
   620  	SHA256ROUND1(39, 0x92722c85, R9, R10, R11, R12, R13, R14, R15, R8)
   621  	SHA256ROUND1(40, 0xa2bfe8a1, R8, R9, R10, R11, R12, R13, R14, R15)
   622  	SHA256ROUND1(41, 0xa81a664b, R15, R8, R9, R10, R11, R12, R13, R14)
   623  	SHA256ROUND1(42, 0xc24b8b70, R14, R15, R8, R9, R10, R11, R12, R13)
   624  	SHA256ROUND1(43, 0xc76c51a3, R13, R14, R15, R8, R9, R10, R11, R12)
   625  	SHA256ROUND1(44, 0xd192e819, R12, R13, R14, R15, R8, R9, R10, R11)
   626  	SHA256ROUND1(45, 0xd6990624, R11, R12, R13, R14, R15, R8, R9, R10)
   627  	SHA256ROUND1(46, 0xf40e3585, R10, R11, R12, R13, R14, R15, R8, R9)
   628  	SHA256ROUND1(47, 0x106aa070, R9, R10, R11, R12, R13, R14, R15, R8)
   629  	SHA256ROUND1(48, 0x19a4c116, R8, R9, R10, R11, R12, R13, R14, R15)
   630  	SHA256ROUND1(49, 0x1e376c08, R15, R8, R9, R10, R11, R12, R13, R14)
   631  	SHA256ROUND1(50, 0x2748774c, R14, R15, R8, R9, R10, R11, R12, R13)
   632  	SHA256ROUND1(51, 0x34b0bcb5, R13, R14, R15, R8, R9, R10, R11, R12)
   633  	SHA256ROUND1(52, 0x391c0cb3, R12, R13, R14, R15, R8, R9, R10, R11)
   634  	SHA256ROUND1(53, 0x4ed8aa4a, R11, R12, R13, R14, R15, R8, R9, R10)
   635  	SHA256ROUND1(54, 0x5b9cca4f, R10, R11, R12, R13, R14, R15, R8, R9)
   636  	SHA256ROUND1(55, 0x682e6ff3, R9, R10, R11, R12, R13, R14, R15, R8)
   637  	SHA256ROUND1(56, 0x748f82ee, R8, R9, R10, R11, R12, R13, R14, R15)
   638  	SHA256ROUND1(57, 0x78a5636f, R15, R8, R9, R10, R11, R12, R13, R14)
   639  	SHA256ROUND1(58, 0x84c87814, R14, R15, R8, R9, R10, R11, R12, R13)
   640  	SHA256ROUND1(59, 0x8cc70208, R13, R14, R15, R8, R9, R10, R11, R12)
   641  	SHA256ROUND1(60, 0x90befffa, R12, R13, R14, R15, R8, R9, R10, R11)
   642  	SHA256ROUND1(61, 0xa4506ceb, R11, R12, R13, R14, R15, R8, R9, R10)
   643  	SHA256ROUND1(62, 0xbef9a3f7, R10, R11, R12, R13, R14, R15, R8, R9)
   644  	SHA256ROUND1(63, 0xc67178f2, R9, R10, R11, R12, R13, R14, R15, R8)
   645  
   646  	MOVQ dig+0(FP), BP
   647  	ADDL (0*4)(BP), R8  // H0 = a + H0
   648  	MOVL R8, (0*4)(BP)
   649  	ADDL (1*4)(BP), R9  // H1 = b + H1
   650  	MOVL R9, (1*4)(BP)
   651  	ADDL (2*4)(BP), R10 // H2 = c + H2
   652  	MOVL R10, (2*4)(BP)
   653  	ADDL (3*4)(BP), R11 // H3 = d + H3
   654  	MOVL R11, (3*4)(BP)
   655  	ADDL (4*4)(BP), R12 // H4 = e + H4
   656  	MOVL R12, (4*4)(BP)
   657  	ADDL (5*4)(BP), R13 // H5 = f + H5
   658  	MOVL R13, (5*4)(BP)
   659  	ADDL (6*4)(BP), R14 // H6 = g + H6
   660  	MOVL R14, (6*4)(BP)
   661  	ADDL (7*4)(BP), R15 // H7 = h + H7
   662  	MOVL R15, (7*4)(BP)
   663  
   664  	ADDQ $64, SI
   665  	CMPQ SI, 256(SP)
   666  	JB   loop
   667  
   668  end:
   669  	RET
   670  
   671  avx2:
   672  	MOVQ dig+0(FP), CTX          // d.h[8]
   673  	MOVQ p_base+8(FP), INP
   674  	MOVQ p_len+16(FP), NUM_BYTES
   675  
   676  	LEAQ -64(INP)(NUM_BYTES*1), NUM_BYTES // Pointer to the last block
   677  	MOVQ NUM_BYTES, _INP_END(SP)
   678  
   679  	CMPQ NUM_BYTES, INP
   680  	JE   avx2_only_one_block
   681  
   682  	// Load initial digest
   683  	MOVL 0(CTX), a  // a = H0
   684  	MOVL 4(CTX), b  // b = H1
   685  	MOVL 8(CTX), c  // c = H2
   686  	MOVL 12(CTX), d // d = H3
   687  	MOVL 16(CTX), e // e = H4
   688  	MOVL 20(CTX), f // f = H5
   689  	MOVL 24(CTX), g // g = H6
   690  	MOVL 28(CTX), h // h = H7
   691  
   692  avx2_loop0: // each iteration handles two 64-byte blocks: the first is scheduled and hashed below, the second is hashed in avx2_loop3 from the same stored K+W values
   693  
   694  	VMOVDQU (0*32)(INP), XTMP0
   695  	VMOVDQU (1*32)(INP), XTMP1
   696  	VMOVDQU (2*32)(INP), XTMP2
   697  	VMOVDQU (3*32)(INP), XTMP3
   698  
   699  	VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK
   700  
   701  	// Apply Byte Flip Mask: LE -> BE
   702  	VPSHUFB BYTE_FLIP_MASK, XTMP0, XTMP0
   703  	VPSHUFB BYTE_FLIP_MASK, XTMP1, XTMP1
   704  	VPSHUFB BYTE_FLIP_MASK, XTMP2, XTMP2
   705  	VPSHUFB BYTE_FLIP_MASK, XTMP3, XTMP3
   706  
   707  	// Transpose data into high/low parts
   708  	VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w3, w2, w1, w0
   709  	VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w7, w6, w5, w4
   710  	VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w11, w10, w9, w8
   711  	VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w15, w14, w13, w12
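        	// After the permutes, the low 128 bits of each XDWORD register hold four
        	// message words of the first block and the high 128 bits hold the matching
        	// words of the second block, so one scheduling pass covers both blocks.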
   712  
   713  	MOVQ $K256<>(SB), TBL // Loading address of table with round-specific constants
   714  
   715  avx2_last_block_enter:
   716  	ADDQ $64, INP
   717  	MOVQ INP, _INP(SP)
   718  	XORQ SRND, SRND
   719  
   720  avx2_loop1: // for w0 - w47
   721  	// Do 4 rounds and scheduling
   722  	VPADDD  0*32(TBL)(SRND*1), XDWORD0, XFER
   723  	VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
   724  	ROUND_AND_SCHED_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   725  	ROUND_AND_SCHED_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   726  	ROUND_AND_SCHED_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   727  	ROUND_AND_SCHED_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   728  
   729  	// Do 4 rounds and scheduling
   730  	VPADDD  1*32(TBL)(SRND*1), XDWORD1, XFER
   731  	VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
   732  	ROUND_AND_SCHED_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   733  	ROUND_AND_SCHED_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   734  	ROUND_AND_SCHED_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   735  	ROUND_AND_SCHED_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   736  
   737  	// Do 4 rounds and scheduling
   738  	VPADDD  2*32(TBL)(SRND*1), XDWORD2, XFER
   739  	VMOVDQU XFER, (_XFER + 2*32)(SP)(SRND*1)
   740  	ROUND_AND_SCHED_N_0(_XFER + 2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   741  	ROUND_AND_SCHED_N_1(_XFER + 2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   742  	ROUND_AND_SCHED_N_2(_XFER + 2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   743  	ROUND_AND_SCHED_N_3(_XFER + 2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   744  
   745  	// Do 4 rounds and scheduling
   746  	VPADDD  3*32(TBL)(SRND*1), XDWORD3, XFER
   747  	VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1)
   748  	ROUND_AND_SCHED_N_0(_XFER + 3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   749  	ROUND_AND_SCHED_N_1(_XFER + 3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   750  	ROUND_AND_SCHED_N_2(_XFER + 3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   751  	ROUND_AND_SCHED_N_3(_XFER + 3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   752  
   753  	ADDQ $4*32, SRND
   754  	CMPQ SRND, $3*4*32
   755  	JB   avx2_loop1
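        	// SRND advances by 4*32 bytes per pass, so the loop above runs three times
        	// (3*4*32 = 384), covering rounds 0-47; the last 16 rounds need no further
        	// scheduling.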
   756  
   757  avx2_loop2:
   758  	// w48 - w63 processed with no scheduling (last 16 rounds)
   759  	VPADDD  0*32(TBL)(SRND*1), XDWORD0, XFER
   760  	VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
   761  	DO_ROUND_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, h)
   762  	DO_ROUND_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, h)
   763  	DO_ROUND_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, g)
   764  	DO_ROUND_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, f)
   765  
   766  	VPADDD  1*32(TBL)(SRND*1), XDWORD1, XFER
   767  	VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
   768  	DO_ROUND_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, e)
   769  	DO_ROUND_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, d)
   770  	DO_ROUND_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, c)
   771  	DO_ROUND_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, b)
   772  
   773  	ADDQ $2*32, SRND
   774  
   775  	VMOVDQU XDWORD2, XDWORD0
   776  	VMOVDQU XDWORD3, XDWORD1
   777  
   778  	CMPQ SRND, $4*4*32
   779  	JB   avx2_loop2
   780  
   781  	MOVQ dig+0(FP), CTX // d.h[8]
   782  	MOVQ _INP(SP), INP
   783  
   784  	addm(  0(CTX), a)
   785  	addm(  4(CTX), b)
   786  	addm(  8(CTX), c)
   787  	addm( 12(CTX), d)
   788  	addm( 16(CTX), e)
   789  	addm( 20(CTX), f)
   790  	addm( 24(CTX), g)
   791  	addm( 28(CTX), h)
   792  
   793  	CMPQ _INP_END(SP), INP
   794  	JB   done_hash
   795  
   796  	XORQ SRND, SRND
   797  
   798  avx2_loop3: // Do second block using previously scheduled results
   799  	DO_ROUND_N_0(_XFER + 0*32 + 16, a, b, c, d, e, f, g, h, a)
   800  	DO_ROUND_N_1(_XFER + 0*32 + 16, h, a, b, c, d, e, f, g, h)
   801  	DO_ROUND_N_2(_XFER + 0*32 + 16, g, h, a, b, c, d, e, f, g)
   802  	DO_ROUND_N_3(_XFER + 0*32 + 16, f, g, h, a, b, c, d, e, f)
   803  
   804  	DO_ROUND_N_0(_XFER + 1*32 + 16, e, f, g, h, a, b, c, d, e)
   805  	DO_ROUND_N_1(_XFER + 1*32 + 16, d, e, f, g, h, a, b, c, d)
   806  	DO_ROUND_N_2(_XFER + 1*32 + 16, c, d, e, f, g, h, a, b, c)
   807  	DO_ROUND_N_3(_XFER + 1*32 + 16, b, c, d, e, f, g, h, a, b)
   808  
   809  	ADDQ $2*32, SRND
   810  	CMPQ SRND, $4*4*32
   811  	JB   avx2_loop3
   812  
   813  	MOVQ dig+0(FP), CTX // d.h[8]
   814  	MOVQ _INP(SP), INP
   815  	ADDQ $64, INP
   816  
   817  	addm(  0(CTX), a)
   818  	addm(  4(CTX), b)
   819  	addm(  8(CTX), c)
   820  	addm( 12(CTX), d)
   821  	addm( 16(CTX), e)
   822  	addm( 20(CTX), f)
   823  	addm( 24(CTX), g)
   824  	addm( 28(CTX), h)
   825  
   826  	CMPQ _INP_END(SP), INP
   827  	JA   avx2_loop0
   828  	JB   done_hash
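        	// Neither branch taken means INP == _INP_END: exactly one 64-byte block
        	// remains, so fall through and process it on its own.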
   829  
   830  avx2_do_last_block:
   831  
   832  	VMOVDQU 0(INP), XWORD0
   833  	VMOVDQU 16(INP), XWORD1
   834  	VMOVDQU 32(INP), XWORD2
   835  	VMOVDQU 48(INP), XWORD3
   836  
   837  	VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK
   838  
   839  	VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
   840  	VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
   841  	VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
   842  	VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
   843  
   844  	MOVQ $K256<>(SB), TBL
   845  
   846  	JMP avx2_last_block_enter
   847  
   848  avx2_only_one_block:
   849  	// Load initial digest
   850  	MOVL 0(CTX), a  // a = H0
   851  	MOVL 4(CTX), b  // b = H1
   852  	MOVL 8(CTX), c  // c = H2
   853  	MOVL 12(CTX), d // d = H3
   854  	MOVL 16(CTX), e // e = H4
   855  	MOVL 20(CTX), f // f = H5
   856  	MOVL 24(CTX), g // g = H6
   857  	MOVL 28(CTX), h // h = H7
   858  
   859  	JMP avx2_do_last_block
   860  
   861  done_hash:
   862  	VZEROUPPER
   863  	RET
   864  
   865  // shuffle byte order from LE to BE
   866  DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
   867  DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
   868  DATA flip_mask<>+0x10(SB)/8, $0x0405060700010203
   869  DATA flip_mask<>+0x18(SB)/8, $0x0c0d0e0f08090a0b
   870  GLOBL flip_mask<>(SB), 8, $32
   871  
   872  // shuffle xBxA -> 00BA
   873  DATA shuff_00BA<>+0x00(SB)/8, $0x0b0a090803020100
   874  DATA shuff_00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
   875  DATA shuff_00BA<>+0x10(SB)/8, $0x0b0a090803020100
   876  DATA shuff_00BA<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
   877  GLOBL shuff_00BA<>(SB), 8, $32
   878  
   879  // shuffle xDxC -> DC00
   880  DATA shuff_DC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
   881  DATA shuff_DC00<>+0x08(SB)/8, $0x0b0a090803020100
   882  DATA shuff_DC00<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
   883  DATA shuff_DC00<>+0x18(SB)/8, $0x0b0a090803020100
   884  GLOBL shuff_DC00<>(SB), 8, $32
   885  
   886  // Round specific constants
   887  DATA K256<>+0x00(SB)/4, $0x428a2f98 // k1
   888  DATA K256<>+0x04(SB)/4, $0x71374491 // k2
   889  DATA K256<>+0x08(SB)/4, $0xb5c0fbcf // k3
   890  DATA K256<>+0x0c(SB)/4, $0xe9b5dba5 // k4
   891  DATA K256<>+0x10(SB)/4, $0x428a2f98 // k1
   892  DATA K256<>+0x14(SB)/4, $0x71374491 // k2
   893  DATA K256<>+0x18(SB)/4, $0xb5c0fbcf // k3
   894  DATA K256<>+0x1c(SB)/4, $0xe9b5dba5 // k4
   895  
   896  DATA K256<>+0x20(SB)/4, $0x3956c25b // k5 - k8
   897  DATA K256<>+0x24(SB)/4, $0x59f111f1
   898  DATA K256<>+0x28(SB)/4, $0x923f82a4
   899  DATA K256<>+0x2c(SB)/4, $0xab1c5ed5
   900  DATA K256<>+0x30(SB)/4, $0x3956c25b
   901  DATA K256<>+0x34(SB)/4, $0x59f111f1
   902  DATA K256<>+0x38(SB)/4, $0x923f82a4
   903  DATA K256<>+0x3c(SB)/4, $0xab1c5ed5
   904  
   905  DATA K256<>+0x40(SB)/4, $0xd807aa98 // k9 - k12
   906  DATA K256<>+0x44(SB)/4, $0x12835b01
   907  DATA K256<>+0x48(SB)/4, $0x243185be
   908  DATA K256<>+0x4c(SB)/4, $0x550c7dc3
   909  DATA K256<>+0x50(SB)/4, $0xd807aa98
   910  DATA K256<>+0x54(SB)/4, $0x12835b01
   911  DATA K256<>+0x58(SB)/4, $0x243185be
   912  DATA K256<>+0x5c(SB)/4, $0x550c7dc3
   913  
   914  DATA K256<>+0x60(SB)/4, $0x72be5d74 // k13 - k16
   915  DATA K256<>+0x64(SB)/4, $0x80deb1fe
   916  DATA K256<>+0x68(SB)/4, $0x9bdc06a7
   917  DATA K256<>+0x6c(SB)/4, $0xc19bf174
   918  DATA K256<>+0x70(SB)/4, $0x72be5d74
   919  DATA K256<>+0x74(SB)/4, $0x80deb1fe
   920  DATA K256<>+0x78(SB)/4, $0x9bdc06a7
   921  DATA K256<>+0x7c(SB)/4, $0xc19bf174
   922  
   923  DATA K256<>+0x80(SB)/4, $0xe49b69c1 // k17 - k20
   924  DATA K256<>+0x84(SB)/4, $0xefbe4786
   925  DATA K256<>+0x88(SB)/4, $0x0fc19dc6
   926  DATA K256<>+0x8c(SB)/4, $0x240ca1cc
   927  DATA K256<>+0x90(SB)/4, $0xe49b69c1
   928  DATA K256<>+0x94(SB)/4, $0xefbe4786
   929  DATA K256<>+0x98(SB)/4, $0x0fc19dc6
   930  DATA K256<>+0x9c(SB)/4, $0x240ca1cc
   931  
   932  DATA K256<>+0xa0(SB)/4, $0x2de92c6f // k21 - k24
   933  DATA K256<>+0xa4(SB)/4, $0x4a7484aa
   934  DATA K256<>+0xa8(SB)/4, $0x5cb0a9dc
   935  DATA K256<>+0xac(SB)/4, $0x76f988da
   936  DATA K256<>+0xb0(SB)/4, $0x2de92c6f
   937  DATA K256<>+0xb4(SB)/4, $0x4a7484aa
   938  DATA K256<>+0xb8(SB)/4, $0x5cb0a9dc
   939  DATA K256<>+0xbc(SB)/4, $0x76f988da
   940  
   941  DATA K256<>+0xc0(SB)/4, $0x983e5152 // k25 - k28
   942  DATA K256<>+0xc4(SB)/4, $0xa831c66d
   943  DATA K256<>+0xc8(SB)/4, $0xb00327c8
   944  DATA K256<>+0xcc(SB)/4, $0xbf597fc7
   945  DATA K256<>+0xd0(SB)/4, $0x983e5152
   946  DATA K256<>+0xd4(SB)/4, $0xa831c66d
   947  DATA K256<>+0xd8(SB)/4, $0xb00327c8
   948  DATA K256<>+0xdc(SB)/4, $0xbf597fc7
   949  
   950  DATA K256<>+0xe0(SB)/4, $0xc6e00bf3 // k29 - k32
   951  DATA K256<>+0xe4(SB)/4, $0xd5a79147
   952  DATA K256<>+0xe8(SB)/4, $0x06ca6351
   953  DATA K256<>+0xec(SB)/4, $0x14292967
   954  DATA K256<>+0xf0(SB)/4, $0xc6e00bf3
   955  DATA K256<>+0xf4(SB)/4, $0xd5a79147
   956  DATA K256<>+0xf8(SB)/4, $0x06ca6351
   957  DATA K256<>+0xfc(SB)/4, $0x14292967
   958  
   959  DATA K256<>+0x100(SB)/4, $0x27b70a85
   960  DATA K256<>+0x104(SB)/4, $0x2e1b2138
   961  DATA K256<>+0x108(SB)/4, $0x4d2c6dfc
   962  DATA K256<>+0x10c(SB)/4, $0x53380d13
   963  DATA K256<>+0x110(SB)/4, $0x27b70a85
   964  DATA K256<>+0x114(SB)/4, $0x2e1b2138
   965  DATA K256<>+0x118(SB)/4, $0x4d2c6dfc
   966  DATA K256<>+0x11c(SB)/4, $0x53380d13
   967  
   968  DATA K256<>+0x120(SB)/4, $0x650a7354
   969  DATA K256<>+0x124(SB)/4, $0x766a0abb
   970  DATA K256<>+0x128(SB)/4, $0x81c2c92e
   971  DATA K256<>+0x12c(SB)/4, $0x92722c85
   972  DATA K256<>+0x130(SB)/4, $0x650a7354
   973  DATA K256<>+0x134(SB)/4, $0x766a0abb
   974  DATA K256<>+0x138(SB)/4, $0x81c2c92e
   975  DATA K256<>+0x13c(SB)/4, $0x92722c85
   976  
   977  DATA K256<>+0x140(SB)/4, $0xa2bfe8a1
   978  DATA K256<>+0x144(SB)/4, $0xa81a664b
   979  DATA K256<>+0x148(SB)/4, $0xc24b8b70
   980  DATA K256<>+0x14c(SB)/4, $0xc76c51a3
   981  DATA K256<>+0x150(SB)/4, $0xa2bfe8a1
   982  DATA K256<>+0x154(SB)/4, $0xa81a664b
   983  DATA K256<>+0x158(SB)/4, $0xc24b8b70
   984  DATA K256<>+0x15c(SB)/4, $0xc76c51a3
   985  
   986  DATA K256<>+0x160(SB)/4, $0xd192e819
   987  DATA K256<>+0x164(SB)/4, $0xd6990624
   988  DATA K256<>+0x168(SB)/4, $0xf40e3585
   989  DATA K256<>+0x16c(SB)/4, $0x106aa070
   990  DATA K256<>+0x170(SB)/4, $0xd192e819
   991  DATA K256<>+0x174(SB)/4, $0xd6990624
   992  DATA K256<>+0x178(SB)/4, $0xf40e3585
   993  DATA K256<>+0x17c(SB)/4, $0x106aa070
   994  
   995  DATA K256<>+0x180(SB)/4, $0x19a4c116
   996  DATA K256<>+0x184(SB)/4, $0x1e376c08
   997  DATA K256<>+0x188(SB)/4, $0x2748774c
   998  DATA K256<>+0x18c(SB)/4, $0x34b0bcb5
   999  DATA K256<>+0x190(SB)/4, $0x19a4c116
  1000  DATA K256<>+0x194(SB)/4, $0x1e376c08
  1001  DATA K256<>+0x198(SB)/4, $0x2748774c
  1002  DATA K256<>+0x19c(SB)/4, $0x34b0bcb5
  1003  
  1004  DATA K256<>+0x1a0(SB)/4, $0x391c0cb3
  1005  DATA K256<>+0x1a4(SB)/4, $0x4ed8aa4a
  1006  DATA K256<>+0x1a8(SB)/4, $0x5b9cca4f
  1007  DATA K256<>+0x1ac(SB)/4, $0x682e6ff3
  1008  DATA K256<>+0x1b0(SB)/4, $0x391c0cb3
  1009  DATA K256<>+0x1b4(SB)/4, $0x4ed8aa4a
  1010  DATA K256<>+0x1b8(SB)/4, $0x5b9cca4f
  1011  DATA K256<>+0x1bc(SB)/4, $0x682e6ff3
  1012  
  1013  DATA K256<>+0x1c0(SB)/4, $0x748f82ee
  1014  DATA K256<>+0x1c4(SB)/4, $0x78a5636f
  1015  DATA K256<>+0x1c8(SB)/4, $0x84c87814
  1016  DATA K256<>+0x1cc(SB)/4, $0x8cc70208
  1017  DATA K256<>+0x1d0(SB)/4, $0x748f82ee
  1018  DATA K256<>+0x1d4(SB)/4, $0x78a5636f
  1019  DATA K256<>+0x1d8(SB)/4, $0x84c87814
  1020  DATA K256<>+0x1dc(SB)/4, $0x8cc70208
  1021  
  1022  DATA K256<>+0x1e0(SB)/4, $0x90befffa
  1023  DATA K256<>+0x1e4(SB)/4, $0xa4506ceb
  1024  DATA K256<>+0x1e8(SB)/4, $0xbef9a3f7
  1025  DATA K256<>+0x1ec(SB)/4, $0xc67178f2
  1026  DATA K256<>+0x1f0(SB)/4, $0x90befffa
  1027  DATA K256<>+0x1f4(SB)/4, $0xa4506ceb
  1028  DATA K256<>+0x1f8(SB)/4, $0xbef9a3f7
  1029  DATA K256<>+0x1fc(SB)/4, $0xc67178f2
  1030  
  1031  GLOBL K256<>(SB), (NOPTR + RODATA), $512