github.com/emmansun/gmsm@v0.29.1/sm3/sm3block_avx2_amd64.s

     1  //go:build !purego
     2  
     3  #include "textflag.h"
     4  
     5  #include "sm3_const_asm.s"
     6  
     7  // Definitions for AVX2 version
     8  
     9  // xorm (mem), reg
    10  // XOR reg into mem, then reload the result into reg (the final V(i+1) = (a..h) XOR V(i) digest update)
    11  #define xorm(P1, P2) \
    12  	XORL P2, P1; \
    13  	MOVL P1, P2
    14  
    15  #define XDWORD0 Y4
    16  #define XDWORD1 Y5
    17  #define XDWORD2 Y6
    18  #define XDWORD3 Y7
    19  
    20  #define XWORD0 X4
    21  #define XWORD1 X5
    22  #define XWORD2 X6
    23  #define XWORD3 X7
    24  
    25  #define XTMP0 Y0
    26  #define XTMP1 Y1
    27  #define XTMP2 Y2
    28  #define XTMP3 Y3
    29  #define XTMP4 Y8
    30  
    31  #define XFER  Y9
    32  #define R08_SHUFFLE_MASK Y10
    33  
    34  #define BYTE_FLIP_MASK 	Y13 // mask to convert LE -> BE
    35  #define X_BYTE_FLIP_MASK X13
    36  
    37  #define NUM_BYTES DX
    38  #define INP	DI
    39  
    40  #define CTX SI // Beginning of digest in memory (a, b, c, ... , h)
    41  
    42  #define a AX
    43  #define b BX
    44  #define c CX
    45  #define d DX
    46  #define e R8
    47  #define f R9
    48  #define g R10
    49  #define h R11
    50  
    51  #define y0 R12
    52  #define y1 R13
    53  #define y2 R14
    54  
    55  // Offsets
    56  #define XFER_SIZE 4*64*4
    57  #define INP_END_SIZE 8
    58  
    59  #define _XFER 0
    60  #define _INP_END _XFER + XFER_SIZE
    61  #define STACK_SIZE _INP_END + INP_END_SIZE
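
// Stack layout: _XFER holds 32 slots of 32 bytes (XFER_SIZE = 1024). For each group
// of four rounds, one slot stores W[j..j+3] for both blocks being hashed (first block
// in the low 16 bytes, second block in the high 16 bytes) and the following slot
// stores W'[j..j+3] = W[j..j+3] XOR W[j+4..j+7]; the round macros read W[j+i] at
// (disp + i*4)(SP) and W'[j+i] 32 bytes higher. _INP_END holds a pointer to the
// last 64-byte block of the input.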
    62  
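// P0(X) = X XOR (X <<< 9) XOR (X <<< 17); RORX by 23 and 15 give the two left
// rotations without disturbing flags or the source register.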
    63  #define P0(tt2, tmp, out) \
    64  	RORXL    $23, tt2, tmp;                        \
    65  	RORXL    $15, tt2, out;                        \
    66  	XORL     tmp, out;                             \
    67  	XORL     tt2, out
    68  
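// SM3 round reference (GB/T 32905-2016); $const below is the per-round constant
// T_j <<< j from sm3_const_asm.s:
//   SS1 = ((a <<< 12) + e + const) <<< 7
//   SS2 = SS1 XOR (a <<< 12)
//   tt1 = FF(a, b, c) + d + SS2 + W'[j]
//   tt2 = GG(e, f, g) + h + SS1 + W[j]
//   d = c, c = b <<< 9, b = a, a = tt1, h = g, g = f <<< 19, f = e, e = P0(tt2)
// The macros keep the eight state words in fixed registers and rotate the register
// roles from round to round, so only b and f are rotated in place; tt1 lands in the
// register passed as h and P0(tt2) in the one passed as d.
// Interleaved with the scalar work, the vector instructions expand the message four
// words at a time:
//   W[j] = P1(W[j-16] XOR W[j-9] XOR (W[j-3] <<< 15)) XOR (W[j-13] <<< 7) XOR W[j-6]
// with P1(X) = X XOR (X <<< 15) XOR (X <<< 23).
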
    69  // For rounds [0 - 16)
    70  #define ROUND_AND_SCHED_N_0_0(disp, const, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
    71  	;                                          \ // #############################  RND N + 0 ############################//
    72  	RORXL    $20, a, y0;                       \ // y0 = a <<< 12, RORXL is BMI2 instr
    73  	MOVL     e, y2;                            \
    74  	ADDL     $const, y2;                       \
    75  	VPALIGNR $12, XDWORD0, XDWORD1, XTMP0;     \ // XTMP0 = W[-13] = {w6,w5,w4,w3}
    76  	ADDL     y0, y2;                           \ // y2 = a <<< 12 + e + T
    77  	ROLL     $7, y2;                           \ // y2 = SS1
    78  	XORL     y2, y0                            \ // y0 = SS2
    79  	VPSLLD   $7, XTMP0, XTMP1;                 \ // XTMP1 = W[-13] << 7 = {w6<<7,w5<<7,w4<<7,w3<<7}
    80  	ADDL     (disp + 0*4)(SP), y2;             \ // y2 = SS1 + W
    81  	ADDL     h, y2;                            \ // y2 = h + SS1 + W    
    82  	ADDL     (disp + 0*4 + 32)(SP), y0;        \ // y0 = SS2 + W'
    83  	VPSRLD   $(32-7), XTMP0, XTMP0;            \ // XTMP0 = W[-13] >> 25 = {w6>>25,w5>>25,w4>>25,w3>>25}
    84  	ADDL     d, y0;                            \ // y0 = d + SS2 + W'
    85  	MOVL     a, h;                             \
    86  	XORL     b, h;                             \
    87  	VPOR     XTMP0, XTMP1, XTMP1;              \ // XTMP1 = W[-13] rol 7
    88  	XORL     c, h;                             \
    89  	ADDL     y0, h;                            \ // h = FF(a, b, c) + d + SS2 + W' = tt1
    90  	MOVL     e, y1;                            \
    91  	VPALIGNR $8, XDWORD2, XDWORD3, XTMP0;      \ // XTMP0 = W[-6] = {w13,w12,w11,w10}
    92  	XORL     f, y1;                            \
    93  	XORL     g, y1;                            \
    94  	ADDL     y1, y2;                           \ // y2 = GG(e, f, g) + h + SS1 + W = tt2  
    95  	VPXOR   XTMP1, XTMP0, XTMP0;               \ // XTMP0 = W[-6] ^ (W[-13] rol 7)
    96  	ROLL     $9, b;                            \
    97  	ROLL     $19, f;                           \
    98  	VPALIGNR $12, XDWORD1, XDWORD2, XTMP1;     \ // XTMP1 = W[-9] = {w10,w9,w8,w7}
    99  	P0(y2, y0, d);                             \
   100  	VPXOR XDWORD0, XTMP1, XTMP1;               \ // XTMP1 = W[-9] ^ W[-16]
   101  
   102  #define ROUND_AND_SCHED_N_0_1(disp, const, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   103  	;                                          \ // #############################  RND N + 1 ############################//
   104  	RORXL    $20, a, y0;                       \ // y0 = a <<< 12
   105  	MOVL     e, y2;                            \
   106  	ADDL     $const, y2;                       \
   107  	VPSHUFD $0xA5, XDWORD3, XTMP2;             \ // XTMP2 = W[-3] {BBAA} {w14,w14,w13,w13}
   108  	ADDL     y0, y2;                           \ // y2 = a <<< 12 + e + T
   109  	ROLL     $7, y2;                           \ // y2 = SS1
   110  	XORL     y2, y0                            \ // y0 = SS2
   111  	VPSRLQ  $17, XTMP2, XTMP2;                 \ // XTMP2 = W[-3] rol 15 {xBxA}
   112  	ADDL     (disp + 1*4)(SP), y2;             \ // y2 = SS1 + W
   113  	ADDL     h, y2;                            \ // y2 = h + SS1 + W
   114  	ADDL     (disp + 1*4 + 32)(SP), y0;        \ // y0 = SS2 + W'
   115  	VPXOR   XTMP1, XTMP2, XTMP2;               \ // XTMP2 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {xxxA}
   116  	ADDL     d, y0;                            \ // y0 = d + SS2 + W'
   117  	MOVL     a, h;                             \
   118  	XORL     b, h;                             \
   119  	VPSHUFD $0x00, XTMP2, XTMP2;               \ // XTMP2 = {AAAA}
   120  	XORL     c, h;                             \
   121  	ADDL     y0, h;                            \ // h = FF(a, b, c) + d + SS2 + W' = tt1
   122  	MOVL     e, y1;                            \ 
   123  	XORL     f, y1;                            \
   124  	VPSRLQ  $17, XTMP2, XTMP3;                 \ // XTMP3 = XTMP2 rol 15 {xxxA}
   125  	XORL     g, y1;                            \
   126  	ADDL     y1, y2;                           \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
   127  	ROLL     $9, b;                            \
   128  	ROLL     $19, f;                           \
   129  	VPSRLQ  $9, XTMP2, XTMP4;                  \ // XTMP4 = XTMP2 rol 23 {xxxA}
   130  	P0(y2, y0, d);                             \
   131  	VPXOR    XTMP2, XTMP4, XTMP4;              \ // XTMP4 = XTMP2 ^ (XTMP2 rol 23 {xxxA})
   132  
   133  #define ROUND_AND_SCHED_N_0_2(disp, const, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   134  	;                                          \ // #############################  RND N + 2 ############################//
   135  	RORXL    $20, a, y0;                       \ // y0 = a <<< 12
   136  	MOVL     e, y2;                            \
   137  	ADDL     $const, y2;                       \
   138  	ADDL     y0, y2;                           \ // y2 = a <<< 12 + e + T
   139  	VPXOR    XTMP4, XTMP3, XTMP4;              \ // XTMP4 = XTMP2 ^ (XTMP2 rol 15 {xxxA}) ^ (XTMP2 rol 23 {xxxA})
   140  	ROLL     $7, y2;                           \ // y2 = SS1
   141  	XORL     y2, y0                            \ // y0 = SS2
   142  	ADDL     (disp + 2*4)(SP), y2;             \ // y2 = SS1 + W
   143  	VPXOR    XTMP4, XTMP0, XTMP2;              \ // XTMP2 = {..., ..., ..., W[0]}
   144  	ADDL     h, y2;                            \ // y2 = h + SS1 + W
   145  	ADDL     (disp + 2*4 + 32)(SP), y0;        \ // y0 = SS2 + W'
   146  	ADDL     d, y0;                            \ // y0 = d + SS2 + W'
   147  	VPALIGNR $4, XDWORD3, XTMP2, XTMP3;        \ // XTMP3 = {W[0], w15, w14, w13}
   148  	MOVL     a, h;                             \
   149  	XORL     b, h;                             \
   150  	XORL     c, h;                             \
   151  	VPSLLD   $15, XTMP3, XTMP4;                \
   152  	ADDL     y0, h;                            \ // h = FF(a, b, c) + d + SS2 + W' = tt1
   153  	MOVL     e, y1;                            \
   154  	XORL     f, y1;                            \
   155  	XORL     g, y1;                            \
   156  	VPSRLD   $(32-15), XTMP3, XTMP3;           \
   157  	ADDL     y1, y2;                           \ // y2 = GG(e, f, g) + h + SS1 + W = tt2  
   158  	ROLL     $9, b;                            \
   159  	ROLL     $19, f;                           \
   160  	VPOR     XTMP3, XTMP4, XTMP4;              \ // XTMP4 = (W[-3] rol 15) {DCBA}
   161  	P0(y2, y0, d);                             \
   162  	VPXOR   XTMP1, XTMP4, XTMP4;               \ // XTMP4 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {DCBA}
   163  
   164  #define ROUND_AND_SCHED_N_0_3(disp, const, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   165  	;                                          \ // #############################  RND N + 3 ############################//
   166  	RORXL    $20, a, y0;                       \ // y0 = a <<< 12
   167  	MOVL     e, y2;                            \
   168  	ADDL     $const, y2;                       \
   169  	VPSLLD   $15, XTMP4, XTMP2;                \
   170  	ADDL     y0, y2;                           \ // y2 = a <<< 12 + e + T
   171  	ROLL     $7, y2;                           \ // y2 = SS1
   172  	XORL     y2, y0                            \ // y0 = SS2
   173  	VPSRLD   $(32-15), XTMP4, XTMP3;           \
   174  	ADDL     (disp + 3*4)(SP), y2;             \ // y2 = SS1 + W
   175  	ADDL     h, y2;                            \ // y2 = h + SS1 + W    
   176  	ADDL     (disp + 3*4 + 32)(SP), y0;        \ // y0 = SS2 + W'
   177  	ADDL     d, y0;                            \ // y0 = d + SS2 + W'
   178  	VPOR     XTMP3, XTMP2, XTMP3;              \ // XTMP3 = XTMP4 rol 15 {DCBA}
   179  	MOVL     a, h;                             \
   180  	XORL     b, h;                             \
   181  	XORL     c, h;                             \
   182  	VPSHUFB  R08_SHUFFLE_MASK, XTMP3, XTMP1;   \ // XTMP1 = XTMP4 rol 23 {DCBA}
   183  	ADDL     y0, h;                            \ // h = FF(a, b, c) + d + SS2 + W' = tt1
   184  	MOVL     e, y1;                            \
   185  	XORL     f, y1;                            \
   186  	XORL     g, y1;                            \
   187  	VPXOR    XTMP3, XTMP4, XTMP3;              \ // XTMP3 = XTMP4 ^ (XTMP4 rol 15 {DCBA})
   188  	ADDL     y1, y2;                           \ // y2 = GG(e, f, g) + h + SS1 + W = tt2  
   189  	ROLL     $9, b;                            \
   190  	ROLL     $19, f;                           \
   191  	VPXOR    XTMP3, XTMP1, XTMP1;              \ // XTMP1 = XTMP4 ^ (XTMP4 rol 15 {DCBA}) ^ (XTMP4 rol 23 {DCBA})
   192  	P0(y2, y0, d);                             \
   193  	VPXOR    XTMP1, XTMP0, XDWORD0;            \ // XDWORD0 = {W[3], W[2], W[1], W[0]}
   194  
   195  // For rounds [16 - 64)
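// Same structure as the [0, 16) macros above; only the boolean functions change:
//   FF(x, y, z) = (x AND y) OR (x AND z) OR (y AND z)   (majority, was x XOR y XOR z)
//   GG(x, y, z) = (x AND y) OR ((NOT x) AND z)          (choose,   was x XOR y XOR z)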
   196  #define ROUND_AND_SCHED_N_1_0(disp, const, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   197  	;                                          \ // #############################  RND N + 0 ############################//
   198  	RORXL    $20, a, y0;                       \ // y0 = a <<< 12
   199  	MOVL     e, y2;                            \
   200  	ADDL     $const, y2;                       \
   201  	VPALIGNR $12, XDWORD0, XDWORD1, XTMP0;     \ // XTMP0 = W[-13] = {w6,w5,w4,w3}
   202  	ADDL     y0, y2;                           \ // y2 = a <<< 12 + e + T
   203  	ROLL     $7, y2;                           \ // y2 = SS1
   204  	XORL     y2, y0                            \ // y0 = SS2
   205  	VPSLLD   $7, XTMP0, XTMP1;                 \ // XTMP1 = W[-13] << 7 = {w6<<7,w5<<7,w4<<7,w3<<7}
   206  	ADDL     (disp + 0*4)(SP), y2;             \ // y2 = SS1 + W
   207  	ADDL     h, y2;                            \ // y2 = h + SS1 + W    
   208  	ADDL     (disp + 0*4 + 32)(SP), y0;        \ // y0 = SS2 + W'
   209  	VPSRLD   $(32-7), XTMP0, XTMP0;            \ // XTMP0 = W[-13] >> 25 = {w6>>25,w5>>25,w4>>25,w3>>25}
   210  	ADDL     d, y0;                            \ // y0 = d + SS2 + W'
   211  	MOVL     a, y1;                            \
   212  	ORL      b, y1;                            \
   213  	VPOR     XTMP0, XTMP1, XTMP1;              \ // XTMP1 = W[-13] rol 7 = {ROTL(7,w6),ROTL(7,w5),ROTL(7,w4),ROTL(7,w3)}
   214  	MOVL     a, h;                             \
   215  	ANDL     b, h;                             \
   216  	ANDL     c, y1;                            \
   217  	ORL      y1, h;                            \ // h =  (a AND b) OR (a AND c) OR (b AND c)  
   218  	VPALIGNR $8, XDWORD2, XDWORD3, XTMP0;      \ // XTMP0 = W[-6] = {w13,w12,w11,w10}
   219  	ADDL     y0, h;                            \ // h = FF(a, b, c) + d + SS2 + W' = tt1
   220  	MOVL     f, y1;                            \
   221  	XORL     g, y1;                            \
   222  	ANDL     e, y1;                            \
   223  	VPXOR   XTMP1, XTMP0, XTMP0;               \ // XTMP0 = W[-6] ^ (W[-13] rol 7) 
   224  	XORL     g, y1;                            \ // y1 = GG2(e, f, g) = (e AND f) OR (NOT(e) AND g)
   225  	ADDL     y1, y2;                           \ // y2 = GG2(e, f, g) + h + SS1 + W = tt2  	 
   226  	ROLL     $9, b;                            \
   227  	ROLL     $19, f;                           \
   228  	VPALIGNR $12, XDWORD1, XDWORD2, XTMP1;     \ // XTMP1 = W[-9] = {w10,w9,w8,w7}
   229  	P0(y2, y0, d);                             \
   230  	VPXOR XDWORD0, XTMP1, XTMP1;               \ // XTMP1 = W[-9] ^ W[-16]
   231  
   232  #define ROUND_AND_SCHED_N_1_1(disp, const, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   233  	;                                          \ // #############################  RND N + 1 ############################//
   234  	RORXL    $20, a, y0;                       \ // y0 = a <<< 12
   235  	MOVL     e, y2;                            \
   236  	ADDL     $const, y2;                       \
   237  	ADDL     y0, y2;                           \ // y2 = a <<< 12 + e + T
   238  	VPSHUFD $0xA5, XDWORD3, XTMP2;             \ // XTMP2 = W[-3] {BBAA} {w14,w14,w13,w13}
   239  	ROLL    $7, y2;                            \ // y2 = SS1
   240  	XORL     y2, y0                            \ // y0 = SS2
   241  	ADDL     (disp + 1*4)(SP), y2;             \ // y2 = SS1 + W
   242  	ADDL     h, y2;                            \ // y2 = h + SS1 + W    
   243  	VPSRLQ  $17, XTMP2, XTMP2;                 \ // XTMP2 = W[-3] rol 15 {xBxA}
   244  	ADDL     (disp + 1*4 + 32)(SP), y0;        \ // y0 = SS2 + W'
   245  	ADDL     d, y0;                            \ // y0 = d + SS2 + W'
   246  	MOVL     a, y1;                            \
   247  	ORL      b, y1;                            \
   248  	VPXOR   XTMP1, XTMP2, XTMP2;               \ // XTMP2 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {xxxA}
   249  	MOVL     a, h;                             \
   250  	ANDL     b, h;                             \
   251  	ANDL     c, y1;                            \
   252  	ORL      y1, h;                            \ // h =  (a AND b) OR (a AND c) OR (b AND c)     
   253  	VPSHUFD $0x00, XTMP2, XTMP2;               \ // XTMP2 = {AAAA}
   254  	ADDL     y0, h;                            \ // h = FF(a, b, c) + d + SS2 + W' = tt1
   255  	MOVL     f, y1;                            \
   256  	XORL     g, y1;                            \
   257  	ANDL     e, y1;                            \
   258  	VPSRLQ  $17, XTMP2, XTMP3;                 \ // XTMP3 = XTMP2 rol 15 {xxxA}
   259  	XORL     g, y1;                            \ // y1 = GG2(e, f, g) = (e AND f) OR (NOT(e) AND g)
   260  	ADDL     y1, y2;                           \ // y2 = GG2(e, f, g) + h + SS1 + W = tt2  	
   261  	ROLL     $9, b;                            \
   262  	ROLL     $19, f;                           \
   263  	VPSRLQ  $9, XTMP2, XTMP4;                  \ // XTMP4 = XTMP2 rol 23 {xxxA}
   264  	P0(y2, y0, d);                             \
   265  	VPXOR    XTMP2, XTMP4, XTMP4;              \ // XTMP4 = XTMP2 XOR (XTMP2 rol 23 {xxxA})
   266  
   267  #define ROUND_AND_SCHED_N_1_2(disp, const, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   268  	;                                          \ // #############################  RND N + 2 ############################//
   269  	RORXL    $20, a, y0;                       \ // y0 = a <<< 12
   270  	MOVL     e, y2;                            \
   271  	ADDL     $const, y2;                       \
   272  	ADDL     y0, y2;                           \ // y2 = a <<< 12 + e + T
   273  	VPXOR    XTMP4, XTMP3, XTMP4;              \ // XTMP4 = XTMP2 ^ (XTMP2 rol 15 {xxxA}) ^ (XTMP2 rol 23 {xxxA})
   274  	ROLL     $7, y2;                           \ // y2 = SS1
   275  	XORL     y2, y0                            \ // y0 = SS2
   276  	ADDL     (disp + 2*4)(SP), y2;             \ // y2 = SS1 + W
   277  	ADDL     h, y2;                            \ // y2 = h + SS1 + W    
   278  	VPXOR    XTMP4, XTMP0, XTMP2;              \ // XTMP2 = {..., ..., ..., W[0]}
   279  	ADDL     (disp + 2*4 + 32)(SP), y0;        \ // y0 = SS2 + W'
   280  	ADDL     d, y0;                            \ // y0 = d + SS2 + W'
   281  	MOVL     a, y1;                            \
   282  	ORL      b, y1;                            \
   283  	VPALIGNR $4, XDWORD3, XTMP2, XTMP3;        \ // XTMP3 = {W[0], w15, w14, w13}
   284  	MOVL     a, h;                             \
   285  	ANDL     b, h;                             \
   286  	ANDL     c, y1;                            \
   287  	ORL      y1, h;                            \ // h =  (a AND b) OR (a AND c) OR (b AND c)     
   288  	VPSLLD   $15, XTMP3, XTMP4;                \
   289  	ADDL     y0, h;                            \ // h = FF(a, b, c) + d + SS2 + W' = tt1
   290  	MOVL     f, y1;                            \
   291  	XORL     g, y1;                            \
   292  	ANDL     e, y1;                            \
   293  	VPSRLD   $(32-15), XTMP3, XTMP3;           \
   294  	XORL     g, y1;                            \ // y1 = GG2(e, f, g) = (e AND f) OR (NOT(e) AND g)
   295  	ADDL     y1, y2;                           \ // y2 = GG2(e, f, g) + h + SS1 + W = tt2  
   296  	ROLL     $9, b;                            \
   297  	ROLL     $19, f;                           \
   298  	VPOR    XTMP3, XTMP4, XTMP4;               \ // XTMP4 = (W[-3] rol 15) {DCBA}
   299  	P0(y2, y0, d);                             \
   300  	VPXOR   XTMP1, XTMP4, XTMP4;               \ // XTMP4 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {DCBA}
   301  
   302  #define ROUND_AND_SCHED_N_1_3(disp, const, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   303  	;                                          \ // #############################  RND N + 3 ############################//
   304  	RORXL    $20, a, y0;                       \ // y0 = a <<< 12
   305  	MOVL     e, y2;                            \
   306  	ADDL     $const, y2;                       \
   307  	ADDL     y0, y2;                           \ // y2 = a <<< 12 + e + T
   308  	VPSLLD   $15, XTMP4, XTMP2;                \ 
   309  	ROLL     $7, y2;                           \ // y2 = SS1
   310  	XORL     y2, y0                            \ // y0 = SS2
   311  	ADDL     (disp + 3*4)(SP), y2;             \ // y2 = SS1 + W
   312  	ADDL     h, y2;                            \ // y2 = h + SS1 + W    
   313  	VPSRLD   $(32-15), XTMP4, XTMP3;           \
   314  	ADDL     (disp + 3*4 + 32)(SP), y0;        \ // y0 = SS2 + W'
   315  	ADDL     d, y0;                            \ // y0 = d + SS2 + W'
   316  	MOVL     a, y1;                            \
   317  	ORL      b, y1;                            \
   318  	VPOR     XTMP3, XTMP2, XTMP3;              \ // XTMP3 = XTMP4 rol 15 {DCBA}
   319  	MOVL     a, h;                             \
   320  	ANDL     b, h;                             \
   321  	ANDL     c, y1;                            \
   322  	ORL      y1, h;                            \ // h =  (a AND b) OR (a AND c) OR (b AND c)
   323  	VPSHUFB  R08_SHUFFLE_MASK, XTMP3, XTMP1;   \ // XTMP1 = XTMP4 rol 23 {DCBA}
   324  	ADDL     y0, h;                            \ // h = FF(a, b, c) + d + SS2 + W' = tt1
   325  	MOVL     f, y1;                            \
   326  	XORL     g, y1;                            \
   327  	ANDL     e, y1;                            \
   328  	VPXOR    XTMP3, XTMP4, XTMP3;              \ // XTMP3 = XTMP4 ^ (XTMP4 rol 15 {DCBA})
   329  	XORL     g, y1;                            \ // y1 = GG2(e, f, g) = (e AND f) OR (NOT(e) AND g)
   330  	ADDL     y1, y2;                           \ // y2 = GG2(e, f, g) + h + SS1 + W = tt2  
   331  	ROLL     $9, b;                            \
   332  	ROLL     $19, f;                           \
   333  	VPXOR    XTMP3, XTMP1, XTMP1;              \ // XTMP1 = XTMP4 ^ (XTMP4 rol 15 {DCBA}) ^ (XTMP4 rol 23 {DCBA})
   334  	P0(y2, y0, d);                             \
   335  	VPXOR    XTMP1, XTMP0, XDWORD0;            \ // XDWORD0 = {W[3], W[2], W[1], W[0]}
   336  
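// SS12 computes only SS1 (into ss1) and SS2 (into ss2); it is shared by the
// DO_ROUND_N_* macros below, whose message words are already on the stack and
// need no further scheduling.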
   337  #define SS12(a, e, const, ss1, ss2) \
   338  	RORXL    $20, a, ss2;                         \
   339  	MOVL     e, ss1;                              \
   340  	ADDL     $const, ss1;                         \
   341  	ADDL     ss2, ss1;                            \ 
   342  	ROLL     $7, ss1;                             \ // ss1 = (a <<< 12 + e + T) <<< 7
   343  	XORL     ss1, ss2
   344  
   345  // For rounds [0 - 16)
   346  #define DO_ROUND_N_0(disp, idx, const, a, b, c, d, e, f, g, h) \
   347  	;                                            \ // #############################  RND N + 0 ############################//
   348  	SS12(a, e, const, y2, y0);                   \
   349  	ADDL     (disp + idx*4)(SP), y2;             \ // y2 = SS1 + W
   350  	ADDL     h, y2;                              \ // y2 = h + SS1 + W    
   351  	ADDL     (disp + idx*4 + 32)(SP), y0;        \ // y0 = SS2 + W'
   352  	ADDL     d, y0;                              \ // y0 = d + SS2 + W'
   353  	;                                            \
   354  	MOVL     a, h;                               \
   355  	XORL     b, h;                               \
   356  	XORL     c, h;                               \
   357  	ADDL     y0, h;                              \ // h = FF(a, b, c) + d + SS2 + W' = tt1
   358  	;                                            \
   359  	MOVL     e, y1;                              \
   360  	XORL     f, y1;                              \
   361  	XORL     g, y1;                              \
   362  	ADDL     y1, y2;                             \ // y2 = GG(e, f, g) + h + SS1 + W = tt2  
   363  	;                                            \
   364  	ROLL     $9, b;                              \
   365  	ROLL     $19, f;                             \
   366  	;                                            \
   367  	P0(y2, y0, d)
   368  
   369  // For rounds [16 - 64)
   370  #define DO_ROUND_N_1(disp, idx, const, a, b, c, d, e, f, g, h) \
   371  	;                                            \ // #############################  RND N + 0 ############################//
   372  	SS12(a, e, const, y2, y0);                   \
   373  	ADDL     (disp + idx*4)(SP), y2;             \ // y2 = SS1 + W
   374  	ADDL     h, y2;                              \ // y2 = h + SS1 + W    
   375  	ADDL     (disp + idx*4 + 32)(SP), y0;        \ // y0 = SS2 + W'
   376  	ADDL     d, y0;                              \ // y0 = d + SS2 + W'
   377  	;                                            \
   378  	MOVL     a, y1;                              \
   379  	ORL      b, y1;                              \
   380  	MOVL     a, h;                               \
   381  	ANDL     b, h;                               \
   382  	ANDL     c, y1;                              \
   383  	ORL      y1, h;                              \ // h =  (a AND b) OR (a AND c) OR (b AND c)     
   384  	ADDL     y0, h;                              \ // h = FF(a, b, c) + d + SS2 + W' = tt1
   385  	;                                            \
   386  	MOVL     f, y1;                              \
   387  	XORL     g, y1;                              \
   388  	ANDL     e, y1;                              \
   389  	XORL     g, y1;                              \ // y1 = GG2(e, f, g)
   390  	ADDL     y1, y2;                             \ // y2 = GG2(e, f, g) + h + SS1 + W = tt2  
   391  	;                                            \
   392  	ROLL     $9, b;                              \
   393  	ROLL     $19, f;                             \
   394  	;                                            \
   395  	P0(y2, y0, d)
   396  
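// Frame: 1040 bytes covers STACK_SIZE (1024 bytes of W/W' slots plus 8 for _INP_END)
// with a little slack; the 32 bytes of arguments match a Go declaration of the form
// func blockAVX2(dig *digest, p []byte) (a pointer plus a slice header), which is how
// the package's Go stub is presumed to declare this routine.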
   397  TEXT ·blockAVX2(SB), 0, $1040-32
   398  	MOVQ dig+0(FP), CTX          // d.h[8]
   399  	MOVQ p_base+8(FP), INP
   400  	MOVQ p_len+16(FP), NUM_BYTES
   401  
   402  	LEAQ -64(INP)(NUM_BYTES*1), NUM_BYTES // Pointer to the last block
   403  	MOVQ NUM_BYTES, _INP_END(SP)
   404  
   405  	VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK
   406  	VMOVDQU r08_mask<>(SB), R08_SHUFFLE_MASK
   407  
   408  	CMPQ NUM_BYTES, INP
   409  	JE   avx2_only_one_block
   410  
   411  	// Load initial digest
   412  	MOVL 0(CTX), a  // a = H0
   413  	MOVL 4(CTX), b  // b = H1
   414  	MOVL 8(CTX), c  // c = H2
   415  	MOVL 12(CTX), d // d = H3
   416  	MOVL 16(CTX), e // e = H4
   417  	MOVL 20(CTX), f // f = H5
   418  	MOVL 24(CTX), g // g = H6
   419  	MOVL 28(CTX), h // h = H7
   420  
   421  avx2_loop: // each iteration consumes two 512-bit blocks: the schedule for both is computed while compressing the first, then the second reuses it
   422  
   423  	VMOVDQU (0*32)(INP), XTMP0
   424  	VMOVDQU (1*32)(INP), XTMP1
   425  	VMOVDQU (2*32)(INP), XTMP2
   426  	VMOVDQU (3*32)(INP), XTMP3
   427  
   428  	// Apply Byte Flip Mask: LE -> BE
   429  	VPSHUFB BYTE_FLIP_MASK, XTMP0, XTMP0
   430  	VPSHUFB BYTE_FLIP_MASK, XTMP1, XTMP1
   431  	VPSHUFB BYTE_FLIP_MASK, XTMP2, XTMP2
   432  	VPSHUFB BYTE_FLIP_MASK, XTMP3, XTMP3
   433  
   434  	// Transpose data into high/low parts
   435  	VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w19, w18, w17, w16;  w3,  w2,  w1,  w0
   436  	VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w23, w22, w21, w20;  w7,  w6,  w5,  w4
   437  	VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w27, w26, w25, w24; w11, w10,  w9,  w8
   438  	VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w31, w30, w29, w28; w15, w14, w13, w12
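	// The low 128-bit lanes of XDWORD0..XDWORD3 now hold the 16 words of the first
	// block and the high lanes hold the second block, so the schedule below is
	// produced for both blocks in one pass; the second block's W/W' values are
	// consumed later in avx2_compress through the +16 displacements.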
   439  
   440  avx2_last_block_enter:
   441  	ADDQ $64, INP
   442  
   443  avx2_schedule_compress: // for w0 - w47
   444  	// Do 4 rounds and scheduling
   445  	VMOVDQU XDWORD0, (_XFER + 0*32)(SP)
   446  	VPXOR  XDWORD0, XDWORD1, XFER
   447  	VMOVDQU XFER, (_XFER + 1*32)(SP)
   448  	ROUND_AND_SCHED_N_0_0(_XFER + 0*32, T0, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   449  	ROUND_AND_SCHED_N_0_1(_XFER + 0*32, T1, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   450  	ROUND_AND_SCHED_N_0_2(_XFER + 0*32, T2, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   451  	ROUND_AND_SCHED_N_0_3(_XFER + 0*32, T3, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   452  
   453  	// Do 4 rounds and scheduling
   454  	VMOVDQU XDWORD1, (_XFER + 2*32)(SP)
   455  	VPXOR  XDWORD1, XDWORD2, XFER
   456  	VMOVDQU XFER, (_XFER + 3*32)(SP)
   457  	ROUND_AND_SCHED_N_0_0(_XFER + 2*32, T4, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   458  	ROUND_AND_SCHED_N_0_1(_XFER + 2*32, T5, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   459  	ROUND_AND_SCHED_N_0_2(_XFER + 2*32, T6, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   460  	ROUND_AND_SCHED_N_0_3(_XFER + 2*32, T7, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   461  
   462  	// Do 4 rounds and scheduling
   463  	VMOVDQU XDWORD2, (_XFER + 4*32)(SP)
   464  	VPXOR  XDWORD2, XDWORD3, XFER
   465  	VMOVDQU XFER, (_XFER + 5*32)(SP)
   466  	ROUND_AND_SCHED_N_0_0(_XFER + 4*32, T8, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   467  	ROUND_AND_SCHED_N_0_1(_XFER + 4*32, T9, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   468  	ROUND_AND_SCHED_N_0_2(_XFER + 4*32, T10, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   469  	ROUND_AND_SCHED_N_0_3(_XFER + 4*32, T11, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   470  
   471  	// Do 4 rounds and scheduling
   472  	VMOVDQU XDWORD3, (_XFER + 6*32)(SP)
   473  	VPXOR  XDWORD3, XDWORD0, XFER
   474  	VMOVDQU XFER, (_XFER + 7*32)(SP)
   475  	ROUND_AND_SCHED_N_0_0(_XFER + 6*32, T12, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   476  	ROUND_AND_SCHED_N_0_1(_XFER + 6*32, T13, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   477  	ROUND_AND_SCHED_N_0_2(_XFER + 6*32, T14, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   478  	ROUND_AND_SCHED_N_0_3(_XFER + 6*32, T15, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   479  
   480  	// Do 4 rounds and scheduling
   481  	VMOVDQU XDWORD0, (_XFER + 8*32)(SP)
   482  	VPXOR  XDWORD0, XDWORD1, XFER
   483  	VMOVDQU XFER, (_XFER + 9*32)(SP)
   484  	ROUND_AND_SCHED_N_1_0(_XFER + 8*32, T16, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   485  	ROUND_AND_SCHED_N_1_1(_XFER + 8*32, T17, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   486  	ROUND_AND_SCHED_N_1_2(_XFER + 8*32, T18, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   487  	ROUND_AND_SCHED_N_1_3(_XFER + 8*32, T19, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   488  
   489  	// Do 4 rounds and scheduling
   490  	VMOVDQU XDWORD1, (_XFER + 10*32)(SP)
   491  	VPXOR  XDWORD1, XDWORD2, XFER
   492  	VMOVDQU XFER, (_XFER + 11*32)(SP)
   493  	ROUND_AND_SCHED_N_1_0(_XFER + 10*32, T20, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   494  	ROUND_AND_SCHED_N_1_1(_XFER + 10*32, T21, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   495  	ROUND_AND_SCHED_N_1_2(_XFER + 10*32, T22, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   496  	ROUND_AND_SCHED_N_1_3(_XFER + 10*32, T23, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   497  
   498  	// Do 4 rounds and scheduling
   499  	VMOVDQU XDWORD2, (_XFER + 12*32)(SP)
   500  	VPXOR  XDWORD2, XDWORD3, XFER
   501  	VMOVDQU XFER, (_XFER + 13*32)(SP)
   502  	ROUND_AND_SCHED_N_1_0(_XFER + 12*32, T24, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   503  	ROUND_AND_SCHED_N_1_1(_XFER + 12*32, T25, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   504  	ROUND_AND_SCHED_N_1_2(_XFER + 12*32, T26, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   505  	ROUND_AND_SCHED_N_1_3(_XFER + 12*32, T27, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   506  
   507  	// Do 4 rounds and scheduling
   508  	VMOVDQU XDWORD3, (_XFER + 14*32)(SP)
   509  	VPXOR  XDWORD3, XDWORD0, XFER
   510  	VMOVDQU XFER, (_XFER + 15*32)(SP)
   511  	ROUND_AND_SCHED_N_1_0(_XFER + 14*32, T28, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   512  	ROUND_AND_SCHED_N_1_1(_XFER + 14*32, T29, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   513  	ROUND_AND_SCHED_N_1_2(_XFER + 14*32, T30, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   514  	ROUND_AND_SCHED_N_1_3(_XFER + 14*32, T31, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   515  
   516  	// Do 4 rounds and scheduling
   517  	VMOVDQU XDWORD0, (_XFER + 16*32)(SP)
   518  	VPXOR  XDWORD0, XDWORD1, XFER
   519  	VMOVDQU XFER, (_XFER + 17*32)(SP)
   520  	ROUND_AND_SCHED_N_1_0(_XFER + 16*32, T32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   521  	ROUND_AND_SCHED_N_1_1(_XFER + 16*32, T33, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   522  	ROUND_AND_SCHED_N_1_2(_XFER + 16*32, T34, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   523  	ROUND_AND_SCHED_N_1_3(_XFER + 16*32, T35, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   524  
   525  	// Do 4 rounds and scheduling
   526  	VMOVDQU XDWORD1, (_XFER + 18*32)(SP)
   527  	VPXOR  XDWORD1, XDWORD2, XFER
   528  	VMOVDQU XFER, (_XFER + 19*32)(SP)
   529  	ROUND_AND_SCHED_N_1_0(_XFER + 18*32, T36, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   530  	ROUND_AND_SCHED_N_1_1(_XFER + 18*32, T37, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   531  	ROUND_AND_SCHED_N_1_2(_XFER + 18*32, T38, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   532  	ROUND_AND_SCHED_N_1_3(_XFER + 18*32, T39, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   533  
   534  	// Do 4 rounds and scheduling
   535  	VMOVDQU XDWORD2, (_XFER + 20*32)(SP)
   536  	VPXOR  XDWORD2, XDWORD3, XFER
   537  	VMOVDQU XFER, (_XFER + 21*32)(SP)
   538  	ROUND_AND_SCHED_N_1_0(_XFER + 20*32, T40, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   539  	ROUND_AND_SCHED_N_1_1(_XFER + 20*32, T41, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   540  	ROUND_AND_SCHED_N_1_2(_XFER + 20*32, T42, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   541  	ROUND_AND_SCHED_N_1_3(_XFER + 20*32, T43, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   542  
   543  	// Do 4 rounds and scheduling
   544  	VMOVDQU XDWORD3, (_XFER + 22*32)(SP)
   545  	VPXOR  XDWORD3, XDWORD0, XFER
   546  	VMOVDQU XFER, (_XFER + 23*32)(SP)
   547  	ROUND_AND_SCHED_N_1_0(_XFER + 22*32, T44, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   548  	ROUND_AND_SCHED_N_1_1(_XFER + 22*32, T45, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   549  	ROUND_AND_SCHED_N_1_2(_XFER + 22*32, T46, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   550  	ROUND_AND_SCHED_N_1_3(_XFER + 22*32, T47, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   551  
   552  	// Last 16 rounds (w48 - w63): only the first 4 still schedule (they produce w64 - w67, needed for W')
   553  	// Do 4 rounds and scheduling
   554  	VMOVDQU XDWORD0, (_XFER + 24*32)(SP)
   555  	VPXOR  XDWORD0, XDWORD1, XFER
   556  	VMOVDQU XFER, (_XFER + 25*32)(SP)
   557  	ROUND_AND_SCHED_N_1_0(_XFER + 24*32, T48, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   558  	ROUND_AND_SCHED_N_1_1(_XFER + 24*32, T49, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   559  	ROUND_AND_SCHED_N_1_2(_XFER + 24*32, T50, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   560  	ROUND_AND_SCHED_N_1_3(_XFER + 24*32, T51, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)  
   561  
   562  	// w52 - w63 processed with no scheduling (last 12 rounds)
   563  	// Do 4 rounds
   564  	VMOVDQU XDWORD1, (_XFER + 26*32)(SP)
   565  	VPXOR  XDWORD1, XDWORD2, XFER
   566  	VMOVDQU XFER, (_XFER + 27*32)(SP)
   567  	DO_ROUND_N_1(_XFER + 26*32, 0, T52, e, f, g, h, a, b, c, d)
   568  	DO_ROUND_N_1(_XFER + 26*32, 1, T53, d, e, f, g, h, a, b, c)
   569  	DO_ROUND_N_1(_XFER + 26*32, 2, T54, c, d, e, f, g, h, a, b)
   570  	DO_ROUND_N_1(_XFER + 26*32, 3, T55, b, c, d, e, f, g, h, a)
   571  
   572  	// Do 4 rounds
   573  	VMOVDQU XDWORD2, (_XFER + 28*32)(SP)
   574  	VPXOR  XDWORD2, XDWORD3, XFER
   575  	VMOVDQU XFER, (_XFER + 29*32)(SP)
   576  	DO_ROUND_N_1(_XFER + 28*32, 0, T56, a, b, c, d, e, f, g, h)
   577  	DO_ROUND_N_1(_XFER + 28*32, 1, T57, h, a, b, c, d, e, f, g)
   578  	DO_ROUND_N_1(_XFER + 28*32, 2, T58, g, h, a, b, c, d, e, f)
   579  	DO_ROUND_N_1(_XFER + 28*32, 3, T59, f, g, h, a, b, c, d, e)
   580  
   581  	// Do 4 rounds
   582  	VMOVDQU XDWORD3, (_XFER + 30*32)(SP)
   583  	VPXOR  XDWORD3, XDWORD0, XFER
   584  	VMOVDQU XFER, (_XFER + 31*32)(SP)
   585  	DO_ROUND_N_1(_XFER + 30*32, 0, T60, e, f, g, h, a, b, c, d)
   586  	DO_ROUND_N_1(_XFER + 30*32, 1, T61, d, e, f, g, h, a, b, c)
   587  	DO_ROUND_N_1(_XFER + 30*32, 2, T62, c, d, e, f, g, h, a, b)
   588  	DO_ROUND_N_1(_XFER + 30*32, 3, T63, b, c, d, e, f, g, h, a)
   589  
   590  	xorm(  0(CTX), a)
   591  	xorm(  4(CTX), b)
   592  	xorm(  8(CTX), c)
   593  	xorm( 12(CTX), d)
   594  	xorm( 16(CTX), e)
   595  	xorm( 20(CTX), f)
   596  	xorm( 24(CTX), g)
   597  	xorm( 28(CTX), h)
   598  
   599  	CMPQ _INP_END(SP), INP
   600  	JB   done_hash
   601  
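// Second block of the pair: its W/W' values were produced above and sit in the
// high 16 bytes of every 32-byte _XFER slot, hence the +16 displacements; only
// the 64 scalar rounds remain.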
   602  avx2_compress: // Do second block using previously scheduled results
   603  	DO_ROUND_N_0(_XFER + 0*32 + 16, 0, T0, a, b, c, d, e, f, g, h)
   604  	DO_ROUND_N_0(_XFER + 0*32 + 16, 1, T1, h, a, b, c, d, e, f, g)
   605  	DO_ROUND_N_0(_XFER + 0*32 + 16, 2, T2, g, h, a, b, c, d, e, f)
   606  	DO_ROUND_N_0(_XFER + 0*32 + 16, 3, T3, f, g, h, a, b, c, d, e)
   607  
   608  	DO_ROUND_N_0(_XFER + 2*32 + 16, 0, T4, e, f, g, h, a, b, c, d)
   609  	DO_ROUND_N_0(_XFER + 2*32 + 16, 1, T5, d, e, f, g, h, a, b, c)
   610  	DO_ROUND_N_0(_XFER + 2*32 + 16, 2, T6, c, d, e, f, g, h, a, b)
   611  	DO_ROUND_N_0(_XFER + 2*32 + 16, 3, T7, b, c, d, e, f, g, h, a)
   612  
   613  	DO_ROUND_N_0(_XFER + 4*32 + 16, 0, T8, a, b, c, d, e, f, g, h)
   614  	DO_ROUND_N_0(_XFER + 4*32 + 16, 1, T9, h, a, b, c, d, e, f, g)
   615  	DO_ROUND_N_0(_XFER + 4*32 + 16, 2, T10, g, h, a, b, c, d, e, f)
   616  	DO_ROUND_N_0(_XFER + 4*32 + 16, 3, T11, f, g, h, a, b, c, d, e)
   617  
   618  	DO_ROUND_N_0(_XFER + 6*32 + 16, 0, T12, e, f, g, h, a, b, c, d)
   619  	DO_ROUND_N_0(_XFER + 6*32 + 16, 1, T13, d, e, f, g, h, a, b, c)
   620  	DO_ROUND_N_0(_XFER + 6*32 + 16, 2, T14, c, d, e, f, g, h, a, b)
   621  	DO_ROUND_N_0(_XFER + 6*32 + 16, 3, T15, b, c, d, e, f, g, h, a)
   622  
   623  	DO_ROUND_N_1(_XFER + 8*32 + 16, 0, T16, a, b, c, d, e, f, g, h)
   624  	DO_ROUND_N_1(_XFER + 8*32 + 16, 1, T17, h, a, b, c, d, e, f, g)
   625  	DO_ROUND_N_1(_XFER + 8*32 + 16, 2, T18, g, h, a, b, c, d, e, f)
   626  	DO_ROUND_N_1(_XFER + 8*32 + 16, 3, T19, f, g, h, a, b, c, d, e)
   627  
   628  	DO_ROUND_N_1(_XFER + 10*32 + 16, 0, T20, e, f, g, h, a, b, c, d)
   629  	DO_ROUND_N_1(_XFER + 10*32 + 16, 1, T21, d, e, f, g, h, a, b, c)
   630  	DO_ROUND_N_1(_XFER + 10*32 + 16, 2, T22, c, d, e, f, g, h, a, b)
   631  	DO_ROUND_N_1(_XFER + 10*32 + 16, 3, T23, b, c, d, e, f, g, h, a)
   632  
   633  	DO_ROUND_N_1(_XFER + 12*32 + 16, 0, T24, a, b, c, d, e, f, g, h)
   634  	DO_ROUND_N_1(_XFER + 12*32 + 16, 1, T25, h, a, b, c, d, e, f, g)
   635  	DO_ROUND_N_1(_XFER + 12*32 + 16, 2, T26, g, h, a, b, c, d, e, f)
   636  	DO_ROUND_N_1(_XFER + 12*32 + 16, 3, T27, f, g, h, a, b, c, d, e)
   637  
   638  	DO_ROUND_N_1(_XFER + 14*32 + 16, 0, T28, e, f, g, h, a, b, c, d)
   639  	DO_ROUND_N_1(_XFER + 14*32 + 16, 1, T29, d, e, f, g, h, a, b, c)
   640  	DO_ROUND_N_1(_XFER + 14*32 + 16, 2, T30, c, d, e, f, g, h, a, b)
   641  	DO_ROUND_N_1(_XFER + 14*32 + 16, 3, T31, b, c, d, e, f, g, h, a)
   642  
   643  	DO_ROUND_N_1(_XFER + 16*32 + 16, 0, T32, a, b, c, d, e, f, g, h)
   644  	DO_ROUND_N_1(_XFER + 16*32 + 16, 1, T33, h, a, b, c, d, e, f, g)
   645  	DO_ROUND_N_1(_XFER + 16*32 + 16, 2, T34, g, h, a, b, c, d, e, f)
   646  	DO_ROUND_N_1(_XFER + 16*32 + 16, 3, T35, f, g, h, a, b, c, d, e)
   647  
   648  	DO_ROUND_N_1(_XFER + 18*32 + 16, 0, T36, e, f, g, h, a, b, c, d)
   649  	DO_ROUND_N_1(_XFER + 18*32 + 16, 1, T37, d, e, f, g, h, a, b, c)
   650  	DO_ROUND_N_1(_XFER + 18*32 + 16, 2, T38, c, d, e, f, g, h, a, b)
   651  	DO_ROUND_N_1(_XFER + 18*32 + 16, 3, T39, b, c, d, e, f, g, h, a)
   652  
   653  	DO_ROUND_N_1(_XFER + 20*32 + 16, 0, T40, a, b, c, d, e, f, g, h)
   654  	DO_ROUND_N_1(_XFER + 20*32 + 16, 1, T41, h, a, b, c, d, e, f, g)
   655  	DO_ROUND_N_1(_XFER + 20*32 + 16, 2, T42, g, h, a, b, c, d, e, f)
   656  	DO_ROUND_N_1(_XFER + 20*32 + 16, 3, T43, f, g, h, a, b, c, d, e)
   657  
   658  	DO_ROUND_N_1(_XFER + 22*32 + 16, 0, T44, e, f, g, h, a, b, c, d)
   659  	DO_ROUND_N_1(_XFER + 22*32 + 16, 1, T45, d, e, f, g, h, a, b, c)
   660  	DO_ROUND_N_1(_XFER + 22*32 + 16, 2, T46, c, d, e, f, g, h, a, b)
   661  	DO_ROUND_N_1(_XFER + 22*32 + 16, 3, T47, b, c, d, e, f, g, h, a)
   662  
   663  	DO_ROUND_N_1(_XFER + 24*32 + 16, 0, T48, a, b, c, d, e, f, g, h)
   664  	DO_ROUND_N_1(_XFER + 24*32 + 16, 1, T49, h, a, b, c, d, e, f, g)
   665  	DO_ROUND_N_1(_XFER + 24*32 + 16, 2, T50, g, h, a, b, c, d, e, f)
   666  	DO_ROUND_N_1(_XFER + 24*32 + 16, 3, T51, f, g, h, a, b, c, d, e)
   667  
   668  	DO_ROUND_N_1(_XFER + 26*32 + 16, 0, T52, e, f, g, h, a, b, c, d)
   669  	DO_ROUND_N_1(_XFER + 26*32 + 16, 1, T53, d, e, f, g, h, a, b, c)
   670  	DO_ROUND_N_1(_XFER + 26*32 + 16, 2, T54, c, d, e, f, g, h, a, b)
   671  	DO_ROUND_N_1(_XFER + 26*32 + 16, 3, T55, b, c, d, e, f, g, h, a)
   672  
   673  	DO_ROUND_N_1(_XFER + 28*32 + 16, 0, T56, a, b, c, d, e, f, g, h)
   674  	DO_ROUND_N_1(_XFER + 28*32 + 16, 1, T57, h, a, b, c, d, e, f, g)
   675  	DO_ROUND_N_1(_XFER + 28*32 + 16, 2, T58, g, h, a, b, c, d, e, f)
   676  	DO_ROUND_N_1(_XFER + 28*32 + 16, 3, T59, f, g, h, a, b, c, d, e)
   677  
   678  	DO_ROUND_N_1(_XFER + 30*32 + 16, 0, T60, e, f, g, h, a, b, c, d)
   679  	DO_ROUND_N_1(_XFER + 30*32 + 16, 1, T61, d, e, f, g, h, a, b, c)
   680  	DO_ROUND_N_1(_XFER + 30*32 + 16, 2, T62, c, d, e, f, g, h, a, b)
   681  	DO_ROUND_N_1(_XFER + 30*32 + 16, 3, T63, b, c, d, e, f, g, h, a)
   682  
   683  	ADDQ $64, INP
   684  
   685  	xorm(  0(CTX), a)
   686  	xorm(  4(CTX), b)
   687  	xorm(  8(CTX), c)
   688  	xorm( 12(CTX), d)
   689  	xorm( 16(CTX), e)
   690  	xorm( 20(CTX), f)
   691  	xorm( 24(CTX), g)
   692  	xorm( 28(CTX), h)
   693  
   694  	CMPQ _INP_END(SP), INP
   695  	JA   avx2_loop
   696  	JB   done_hash
   697  
   698  avx2_do_last_block:
   699  
   700  	VMOVDQU 0(INP), XWORD0
   701  	VMOVDQU 16(INP), XWORD1
   702  	VMOVDQU 32(INP), XWORD2
   703  	VMOVDQU 48(INP), XWORD3
   704  
   705  	VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
   706  	VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
   707  	VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
   708  	VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
   709  
   710  	JMP avx2_last_block_enter
   711  
   712  avx2_only_one_block:
   713  	// Load initial digest
   714  	MOVL 0(CTX), a  // a = H0
   715  	MOVL 4(CTX), b  // b = H1
   716  	MOVL 8(CTX), c  // c = H2
   717  	MOVL 12(CTX), d // d = H3
   718  	MOVL 16(CTX), e // e = H4
   719  	MOVL 20(CTX), f // f = H5
   720  	MOVL 24(CTX), g // g = H6
   721  	MOVL 28(CTX), h // h = H7
   722  
   723  	JMP avx2_do_last_block
   724  
   725  done_hash:
   726  	VZEROUPPER
   727  	RET
   728  
   729  // shuffle byte order from LE to BE
   730  DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
   731  DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
   732  DATA flip_mask<>+0x10(SB)/8, $0x0405060700010203
   733  DATA flip_mask<>+0x18(SB)/8, $0x0c0d0e0f08090a0b
   734  GLOBL flip_mask<>(SB), 8, $32
   735  
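// rotate each 32-bit word left by 8 bits via byte shuffle
// (turns the rol-15 value produced in the schedule into rol 23 for P1)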
   736  DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
   737  DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
   738  DATA r08_mask<>+0x10(SB)/8, $0x0605040702010003
   739  DATA r08_mask<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
   740  GLOBL r08_mask<>(SB), 8, $32