github.com/emmansun/gmsm@v0.29.1/sm3/sm3block_simd_amd64.s

     1  //go:build !purego
     2  
     3  #include "textflag.h"
     4  
     5  #include "sm3_const_asm.s"
     6  // Definitions for AVX version
     7  
      8  // xorm (mem), reg
      9  // XOR reg into mem with a reg-mem xor, then reload the result into reg
    10  #define xorm(P1, P2) \
    11  	XORL P2, P1; \
    12  	MOVL P1, P2
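
// In Go terms, xorm(mem, reg) folds the working register into the stored
// digest word and reloads the result; an illustrative sketch only, not used
// by the assembler:
//
//	func xorm(mem, reg *uint32) {
//		*mem ^= *reg // digest word ^= final state register
//		*reg = *mem  // carry the updated word into the next block
//	}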
    13  
    14  #define XWORD0 X4
    15  #define XWORD1 X5
    16  #define XWORD2 X6
    17  #define XWORD3 X7
    18  
    19  #define XTMP0 X0
    20  #define XTMP1 X1
    21  #define XTMP2 X2
    22  #define XTMP3 X3
    23  #define XTMP4 X8
    24  
    25  #define XFER  X9
    26  #define R08_SHUFFLE_MASK X10
    27  #define X_BYTE_FLIP_MASK X13 // mask to convert LE -> BE
    28  
    29  #define NUM_BYTES DX
    30  #define INP	DI
    31  
    32  #define CTX SI // Beginning of digest in memory (a, b, c, ... , h)
    33  
    34  #define a AX
    35  #define b BX
    36  #define c CX
    37  #define d R8
    38  #define e DX
    39  #define f R9
    40  #define g R10
    41  #define h R11
    42  
    43  #define y0 R12
    44  #define y1 R13
    45  #define y2 R14
    46  
    47  // Offsets
    48  #define XFER_SIZE 2*16
    49  #define INP_END_SIZE 8
    50  
    51  #define _XFER 0
    52  #define _INP_END _XFER + XFER_SIZE
    53  #define STACK_SIZE _INP_END + INP_END_SIZE
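
// The frame laid out above can be pictured as the following Go struct
// (illustrative only):
//
//	type frame struct {
//		w      [4]uint32 // _XFER+0:  W[i..i+3] for the current 4-round group
//		wPrime [4]uint32 // _XFER+16: W[i..i+3] ^ W[i+4..i+7], the W' values
//		inpEnd uintptr   // _INP_END: pointer to the last 64-byte block
//	}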
    54  
    55  #define SS12(a, e, const, ss1, ss2) \
    56  	MOVL     a, ss2;                            \
     57  	ROLL     $12, ss2;                          \ // ss2 = a <<< 12
     58  	MOVL     e, ss1;                            \
     59  	ADDL     $const, ss1;                       \
     60  	ADDL     ss2, ss1;                          \ // ss1 = a <<< 12 + e + T
     61  	ROLL     $7, ss1;                           \ // ss1 = SS1
     62  	XORL     ss1, ss2                             // ss2 = SS2
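
// Equivalent Go for SS12 (illustrative sketch; uses math/bits):
//
//	func ss12(a, e, t uint32) (ss1, ss2 uint32) {
//		ss2 = bits.RotateLeft32(a, 12)
//		ss1 = bits.RotateLeft32(ss2+e+t, 7)
//		ss2 ^= ss1
//		return ss1, ss2
//	}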
    63  
    64  #define P0(tt2, tmp, out) \
     65  	MOVL     tt2, tmp;                             \
     66  	ROLL     $9, tmp;                              \ // tmp = tt2 <<< 9
     67  	MOVL     tt2, out;                             \
     68  	ROLL     $17, out;                             \ // out = tt2 <<< 17
     69  	XORL     tmp, out;                             \
     70  	XORL     tt2, out                                 // out = P0(tt2)
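
// P0 is the SM3 permutation P0(x) = x ^ (x <<< 9) ^ (x <<< 17); in Go
// (illustrative sketch):
//
//	func p0(x uint32) uint32 {
//		return x ^ bits.RotateLeft32(x, 9) ^ bits.RotateLeft32(x, 17)
//	}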
    71  
    72  // For rounds [0 - 16)
    73  #define ROUND_AND_SCHED_N_0_0(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3) \
    74  	;                                          \ // #############################  RND N + 0 ############################//
    75  	MOVL     a, y0;                            \
    76  	ROLL     $12, y0;                          \ // y0 = a <<< 12
    77  	MOVL     e, y2;                            \
    78  	ADDL     $const, y2;                       \
    79  	VPALIGNR $12, XWORD0, XWORD1, XTMP0;       \ // XTMP0 = W[-13] = {w6,w5,w4,w3}
    80  	ADDL     y0, y2;                           \ // y2 = a <<< 12 + e + T
    81  	ROLL     $7, y2;                           \ // y2 = SS1
    82  	XORL     y2, y0                            \ // y0 = SS2
    83  	VPSLLD   $7, XTMP0, XTMP1;                 \ // XTMP1 = W[-13] << 7 = {w6<<7,w5<<7,w4<<7,w3<<7}
    84  	ADDL     (disp + 0*4)(SP), y2;             \ // y2 = SS1 + W
    85  	ADDL     h, y2;                            \ // y2 = h + SS1 + W    
    86  	ADDL     (disp + 0*4 + 16)(SP), y0;        \ // y0 = SS2 + W'
    87  	VPSRLD   $(32-7), XTMP0, XTMP0;            \ // XTMP0 = W[-13] >> 25 = {w6>>25,w5>>25,w4>>25,w3>>25}
    88  	ADDL     d, y0;                            \ // y0 = d + SS2 + W'
    89  	MOVL     a, h;                             \
    90  	XORL     b, h;                             \
    91  	VPOR     XTMP0, XTMP1, XTMP1;              \ // XTMP1 = W[-13] rol 7
    92  	XORL     c, h;                             \
    93  	ADDL     y0, h;                            \ // h = FF(a, b, c) + d + SS2 + W' = tt1
    94  	MOVL     e, y1;                            \
    95  	VPALIGNR $8, XWORD2, XWORD3, XTMP0;        \ // XTMP0 = W[-6] = {w13,w12,w11,w10}
    96  	XORL     f, y1;                            \
    97  	XORL     g, y1;                            \
    98  	ADDL     y1, y2;                           \ // y2 = GG(e, f, g) + h + SS1 + W = tt2  
    99  	VPXOR   XTMP1, XTMP0, XTMP0;               \ // XTMP0 = W[-6] ^ (W[-13] rol 7)
   100  	ROLL     $9, b;                            \
   101  	ROLL     $19, f;                           \
   102  	MOVL     y2, y0;                           \
   103  	ROLL     $9, y0;                           \
   104  	VPALIGNR $12, XWORD1, XWORD2, XTMP1;       \ // XTMP1 = W[-9] = {w10,w9,w8,w7}
   105  	MOVL     y2, d;                            \
   106  	ROLL     $17, d;                           \ 
   107  	XORL     y0, d;                            \
   108  	XORL     y2, d;                            \ // d = P(tt2)
   109  	VPXOR XWORD0, XTMP1, XTMP1;                \ // XTMP1 = W[-9] ^ W[-16]
   110  
   111  #define ROUND_AND_SCHED_N_0_1(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3) \
   112  	;                                          \ // #############################  RND N + 1 ############################//
   113  	MOVL     a, y0;                            \
   114  	ROLL     $12, y0;                          \ // y0 = a <<< 12
   115  	MOVL     e, y2;                            \
   116  	ADDL     $const, y2;                       \
   117  	VPSHUFD $0xA5, XWORD3, XTMP2;              \ // XTMP2 = W[-3] {BBAA} {w14,w14,w13,w13}
   118  	ADDL     y0, y2;                           \ // y2 = a <<< 12 + e + T
   119  	ROLL     $7, y2;                           \ // y2 = SS1
   120  	XORL     y2, y0                            \ // y0 = SS2
   121  	ADDL     (disp + 1*4)(SP), y2;             \ // y2 = SS1 + W
   122  	VPSRLQ  $17, XTMP2, XTMP2;                 \ // XTMP2 = W[-3] rol 15 {xBxA}
   123  	ADDL     h, y2;                            \ // y2 = h + SS1 + W
   124  	ADDL     (disp + 1*4 + 16)(SP), y0;        \ // y0 = SS2 + W'
   125  	ADDL     d, y0;                            \ // y0 = d + SS2 + W'
   126  	VPXOR   XTMP1, XTMP2, XTMP2;               \ // XTMP2 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {xxxA}
   127  	MOVL     a, h;                             \
   128  	XORL     b, h;                             \
   129  	XORL     c, h;                             \
   130  	ADDL     y0, h;                            \ // h = FF(a, b, c) + d + SS2 + W' = tt1
   131  	VPSHUFD $0x00, XTMP2, XTMP2;               \ // XTMP2 = {AAAA}
   132  	MOVL     e, y1;                            \ 
   133  	XORL     f, y1;                            \
   134  	XORL     g, y1;                            \
   135  	ADDL     y1, y2;                           \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
   136  	VPSRLQ  $17, XTMP2, XTMP3;                 \ // XTMP3 = XTMP2 rol 15 {xxxA}
   137  	ROLL     $9, b;                            \
   138  	ROLL     $19, f;                           \
   139  	MOVL     y2, y0;                           \
   140  	ROLL     $9, y0;                           \
   141  	VPSRLQ  $9, XTMP2, XTMP4;                  \ // XTMP4 = XTMP2 rol 23 {xxxA}
   142  	MOVL     y2, d;                            \
   143  	ROLL     $17, d;                           \ 
   144  	XORL     y0, d;                            \
   145  	XORL     y2, d;                            \ // d = P(tt2)
   146  	VPXOR    XTMP2, XTMP4, XTMP4;              \ // XTMP4 = XTMP2 ^ (XTMP2 rol 23 {xxxA})
   147  
   148  #define ROUND_AND_SCHED_N_0_2(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3) \
   149  	;                                          \ // #############################  RND N + 2 ############################//
   150  	MOVL     a, y0;                            \
   151  	ROLL     $12, y0;                          \ // y0 = a <<< 12
   152  	MOVL     e, y2;                            \
   153  	ADDL     $const, y2;                       \
   154  	VPXOR    XTMP4, XTMP3, XTMP4;              \ // XTMP4 = XTMP2 ^ (XTMP2 rol 15 {xxxA}) ^ (XTMP2 rol 23 {xxxA})
   155  	ADDL     y0, y2;                           \ // y2 = a <<< 12 + e + T
   156  	ROLL     $7, y2;                           \ // y2 = SS1
   157  	XORL     y2, y0                            \ // y0 = SS2
   158  	ADDL     (disp + 2*4)(SP), y2;             \ // y2 = SS1 + W
   159  	VPXOR    XTMP4, XTMP0, XTMP2;              \ // XTMP2 = {..., ..., ..., W[0]}
   160  	ADDL     h, y2;                            \ // y2 = h + SS1 + W
   161  	ADDL     (disp + 2*4 + 16)(SP), y0;        \ // y0 = SS2 + W'
   162  	ADDL     d, y0;                            \ // y0 = d + SS2 + W'
   163  	VPALIGNR $4, XWORD3, XTMP2, XTMP3;         \ // XTMP3 = {W[0], w15, w14, w13}
   164  	MOVL     a, h;                             \
   165  	XORL     b, h;                             \
   166  	XORL     c, h;                             \
   167  	VPSLLD   $15, XTMP3, XTMP4;                \
   168  	ADDL     y0, h;                            \ // h = FF(a, b, c) + d + SS2 + W' = tt1
   169  	MOVL     e, y1;                            \
   170  	XORL     f, y1;                            \
   171  	XORL     g, y1;                            \
   172  	VPSRLD   $(32-15), XTMP3, XTMP3;           \
   173  	ADDL     y1, y2;                           \ // y2 = GG(e, f, g) + h + SS1 + W = tt2  
   174  	ROLL     $9, b;                            \
   175  	ROLL     $19, f;                           \
   176  	MOVL     y2, y0;                           \
   177  	ROLL     $9, y0;                           \
   178  	VPOR     XTMP3, XTMP4, XTMP4;              \ // XTMP4 = (W[-3] rol 15) {DCxx}
   179  	MOVL     y2, d;                            \
   180  	ROLL     $17, d;                           \ 
   181  	XORL     y0, d;                            \
   182  	XORL     y2, d;                            \ // d = P(tt2)
   183  	VPXOR   XTMP1, XTMP4, XTMP4;               \ // XTMP4 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {DCxx}
   184  
   185  #define ROUND_AND_SCHED_N_0_3(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3) \
   186  	;                                          \ // #############################  RND N + 3 ############################//
   187  	MOVL     a, y0;                            \
   188  	ROLL     $12, y0;                          \ // y0 = a <<< 12
   189  	MOVL     e, y2;                            \
   190  	ADDL     $const, y2;                       \
   191  	VPSLLD   $15, XTMP4, XTMP2;                \
   192  	ADDL     y0, y2;                           \ // y2 = a <<< 12 + e + T
   193  	ROLL     $7, y2;                           \ // y2 = SS1
   194  	XORL     y2, y0                            \ // y0 = SS2
   195  	ADDL     (disp + 3*4)(SP), y2;             \ // y2 = SS1 + W
   196  	VPSRLD   $(32-15), XTMP4, XTMP3;           \
   197  	ADDL     h, y2;                            \ // y2 = h + SS1 + W    
    198  	ADDL     (disp + 3*4 + 16)(SP), y0;        \ // y0 = SS2 + W'
   199  	ADDL     d, y0;                            \ // y0 = d + SS2 + W'
   200  	VPOR     XTMP3, XTMP2, XTMP3;              \ // XTMP3 = XTMP4 rol 15 {DCxx}
   201  	MOVL     a, h;                             \
   202  	XORL     b, h;                             \
   203  	XORL     c, h;                             \
   204  	VPSHUFB  R08_SHUFFLE_MASK, XTMP3, XTMP1;   \ // XTMP1 = XTMP4 rol 23 {DCxx}
   205  	ADDL     y0, h;                            \ // h = FF(a, b, c) + d + SS2 + W' = tt1
   206  	MOVL     e, y1;                            \
   207  	XORL     f, y1;                            \
   208  	XORL     g, y1;                            \
   209  	VPXOR    XTMP3, XTMP4, XTMP3;              \ // XTMP3 = XTMP4 ^ (XTMP4 rol 15 {DCxx})
   210  	ADDL     y1, y2;                           \ // y2 = GG(e, f, g) + h + SS1 + W = tt2  
   211  	ROLL     $9, b;                            \
   212  	ROLL     $19, f;                           \
   213  	MOVL     y2, y0;                           \
   214  	ROLL     $9, y0;                           \
   215  	VPXOR    XTMP3, XTMP1, XTMP1;              \ // XTMP1 = XTMP4 ^ (XTMP4 rol 15 {DCxx}) ^ (XTMP4 rol 23 {DCxx})
   216  	MOVL     y2, d;                            \
   217  	ROLL     $17, d;                           \ 
   218  	XORL     y0, d;                            \
   219  	XORL     y2, d;                            \ // d = P(tt2)
   220  	VPXOR    XTMP1, XTMP0, XWORD0;             \ // XWORD0 = {W[3], W[2], W[1], W[0]}
   221  
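// For reference, each of the four macros above performs one SM3 round with
// j < 16 interleaved with message scheduling; the scalar part is (Go sketch,
// where w = W[j] and wp = W[j] ^ W[j+4] are read from the stack, and t is the
// pre-rotated round constant passed as $const):
//
//	ss1 := bits.RotateLeft32(bits.RotateLeft32(a, 12)+e+t, 7)
//	ss2 := ss1 ^ bits.RotateLeft32(a, 12)
//	tt1 := (a ^ b ^ c) + d + ss2 + wp // FF(a,b,c) = a^b^c for j < 16
//	tt2 := (e ^ f ^ g) + h + ss1 + w  // GG(e,f,g) = e^f^g for j < 16
//	d, c, b, a = c, bits.RotateLeft32(b, 9), a, tt1
//	h, g, f, e = g, bits.RotateLeft32(f, 19), e, p0(tt2)
//
// The assembly avoids the final state rotation by rotating the macro
// arguments at each call site instead.
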
   222  // For rounds [16 - 64)
   223  #define ROUND_AND_SCHED_N_1_0(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3) \
   224  	;                                          \ // #############################  RND N + 0 ############################//
   225  	MOVL     a, y0;                            \
   226  	ROLL     $12, y0;                          \ // y0 = a <<< 12
   227  	MOVL     e, y2;                            \
   228  	ADDL     $const, y2;                       \
   229  	VPALIGNR $12, XWORD0, XWORD1, XTMP0;       \ // XTMP0 = W[-13] = {w6,w5,w4,w3}
   230  	ADDL     y0, y2;                           \ // y2 = a <<< 12 + e + T
   231  	ROLL     $7, y2;                           \ // y2 = SS1
   232  	XORL     y2, y0                            \ // y0 = SS2
   233  	VPSLLD   $7, XTMP0, XTMP1;                 \ // XTMP1 = W[-13] << 7 = {w6<<7,w5<<7,w4<<7,w3<<7}
   234  	ADDL     (disp + 0*4)(SP), y2;             \ // y2 = SS1 + W
   235  	ADDL     h, y2;                            \ // y2 = h + SS1 + W    
   236  	ADDL     (disp + 0*4 + 16)(SP), y0;        \ // y0 = SS2 + W'
   237  	VPSRLD   $(32-7), XTMP0, XTMP0;            \ // XTMP0 = W[-13] >> 25 = {w6>>25,w5>>25,w4>>25,w3>>25}
   238  	ADDL     d, y0;                            \ // y0 = d + SS2 + W'
   239  	MOVL     a, y1;                            \
   240  	ORL      b, y1;                            \
   241  	VPOR     XTMP0, XTMP1, XTMP1;              \ // XTMP1 = W[-13] rol 7 = {ROTL(7,w6),ROTL(7,w5),ROTL(7,w4),ROTL(7,w3)}
   242  	MOVL     a, h;                             \
   243  	ANDL     b, h;                             \
   244  	ANDL     c, y1;                            \
   245  	ORL      y1, h;                            \ // h =  (a AND b) OR (a AND c) OR (b AND c)  
   246  	VPALIGNR $8, XWORD2, XWORD3, XTMP0;        \ // XTMP0 = W[-6] = {w13,w12,w11,w10}
   247  	ADDL     y0, h;                            \ // h = FF(a, b, c) + d + SS2 + W' = tt1
   248  	MOVL     f, y1;                            \
   249  	XORL     g, y1;                            \
   250  	ANDL     e, y1;                            \
   251  	VPXOR   XTMP1, XTMP0, XTMP0;               \ // XTMP0 = W[-6] ^ (W[-13] rol 7) 
   252  	XORL     g, y1;                            \ // y1 = GG2(e, f, g) = (e AND f) OR (NOT(e) AND g)
   253  	ADDL     y1, y2;                           \ // y2 = GG2(e, f, g) + h + SS1 + W = tt2  	
   254  	ROLL     $9, b;                            \
   255  	ROLL     $19, f;                           \
   256  	VPALIGNR $12, XWORD1, XWORD2, XTMP1;       \ // XTMP1 = W[-9] = {w10,w9,w8,w7}
   257  	P0(y2, y0, d);                             \
   258  	VPXOR XWORD0, XTMP1, XTMP1;                \ // XTMP1 = W[-9] ^ W[-16]
   259  
   260  #define ROUND_AND_SCHED_N_1_1(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3) \
   261  	;                                          \ // #############################  RND N + 1 ############################//
   262  	MOVL     a, y0;                            \
   263  	ROLL     $12, y0;                          \ // y0 = a <<< 12
   264  	MOVL     e, y2;                            \
   265  	ADDL     $const, y2;                       \
   266  	ADDL     y0, y2;                           \ // y2 = a <<< 12 + e + T
   267  	VPSHUFD $0xA5, XWORD3, XTMP2;              \ // XTMP2 = W[-3] {BBAA} {w14,w14,w13,w13}
   268  	ROLL     $7, y2;                           \ // y2 = SS1
   269  	XORL     y2, y0                            \ // y0 = SS2
   270  	ADDL     (disp + 1*4)(SP), y2;             \ // y2 = SS1 + W
   271  	ADDL     h, y2;                            \ // y2 = h + SS1 + W    
   272  	VPSRLQ  $17, XTMP2, XTMP2;                 \ // XTMP2 = W[-3] rol 15 {xBxA}
   273  	ADDL     (disp + 1*4 + 16)(SP), y0;        \ // y0 = SS2 + W'
   274  	ADDL     d, y0;                            \ // y0 = d + SS2 + W'
   275  	MOVL     a, y1;                            \
   276  	ORL      b, y1;                            \
   277  	VPXOR   XTMP1, XTMP2, XTMP2;               \ // XTMP2 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {xxxA}
   278  	MOVL     a, h;                             \
   279  	ANDL     b, h;                             \
   280  	ANDL     c, y1;                            \
   281  	ORL      y1, h;                            \ // h =  (a AND b) OR (a AND c) OR (b AND c)     
   282  	VPSHUFD $0x00, XTMP2, XTMP2;               \ // XTMP2 = {AAAA}
   283  	ADDL     y0, h;                            \ // h = FF(a, b, c) + d + SS2 + W' = tt1
   284  	MOVL     f, y1;                            \
   285  	XORL     g, y1;                            \
   286  	ANDL     e, y1;                            \
   287  	VPSRLQ  $17, XTMP2, XTMP3;                 \ // XTMP3 = XTMP2 rol 15 {xxxA}
   288  	XORL     g, y1;                            \ // y1 = GG2(e, f, g) = (e AND f) OR (NOT(e) AND g)
   289  	ADDL     y1, y2;                           \ // y2 = GG2(e, f, g) + h + SS1 + W = tt2  	
   290  	ROLL     $9, b;                            \
   291  	ROLL     $19, f;                           \
   292  	VPSRLQ  $9, XTMP2, XTMP4;                  \ // XTMP4 = XTMP2 rol 23 {xxxA}
   293  	P0(y2, y0, d);                             \
   294  	VPXOR    XTMP2, XTMP4, XTMP4;              \ // XTMP4 = XTMP2 XOR (XTMP2 rol 23 {xxxA})
   295  
   296  #define ROUND_AND_SCHED_N_1_2(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3) \
   297  	;                                          \ // #############################  RND N + 2 ############################//
   298  	MOVL     a, y0;                            \
   299  	ROLL     $12, y0;                          \ // y0 = a <<< 12
   300  	MOVL     e, y2;                            \
   301  	ADDL     $const, y2;                       \
   302  	ADDL     y0, y2;                           \ // y2 = a <<< 12 + e + T
   303  	VPXOR    XTMP4, XTMP3, XTMP4;              \ // XTMP4 = XTMP2 ^ (XTMP2 rol 15 {xxxA}) ^ (XTMP2 rol 23 {xxxA})
   304  	ROLL     $7, y2;                           \ // y2 = SS1
   305  	XORL     y2, y0                            \ // y0 = SS2
   306  	ADDL     (disp + 2*4)(SP), y2;             \ // y2 = SS1 + W
   307  	ADDL     h, y2;                            \ // y2 = h + SS1 + W    
   308  	VPXOR    XTMP4, XTMP0, XTMP2;              \ // XTMP2 = {..., ..., ..., W[0]}
   309  	ADDL     (disp + 2*4 + 16)(SP), y0;        \ // y0 = SS2 + W'
   310  	ADDL     d, y0;                            \ // y0 = d + SS2 + W'
   311  	MOVL     a, y1;                            \
   312  	ORL      b, y1;                            \
   313  	VPALIGNR $4, XWORD3, XTMP2, XTMP3;         \ // XTMP3 = {W[0], w15, w14, w13}
   314  	MOVL     a, h;                             \
   315  	ANDL     b, h;                             \
   316  	ANDL     c, y1;                            \
   317  	ORL      y1, h;                            \ // h =  (a AND b) OR (a AND c) OR (b AND c)  
   318  	VPSLLD   $15, XTMP3, XTMP4;                \
   319  	ADDL     y0, h;                            \ // h = FF(a, b, c) + d + SS2 + W' = tt1
   320  	MOVL     f, y1;                            \
   321  	XORL     g, y1;                            \
   322  	ANDL     e, y1;                            \
   323  	VPSRLD   $(32-15), XTMP3, XTMP3;           \
   324  	XORL     g, y1;                            \ // y1 = GG2(e, f, g) = (e AND f) OR (NOT(e) AND g)
   325  	ADDL     y1, y2;                           \ // y2 = GG2(e, f, g) + h + SS1 + W = tt2 
   326  	ROLL     $9, b;                            \
   327  	ROLL     $19, f;                           \
   328  	VPOR    XTMP3, XTMP4, XTMP4;               \ // XTMP4 = (W[-3] rol 15) {DCBA}
   329  	P0(y2, y0, d);                             \
   330  	VPXOR   XTMP1, XTMP4, XTMP4;               \ // XTMP4 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {DCBA}
   331  
   332  #define ROUND_AND_SCHED_N_1_3(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3) \
   333  	;                                          \ // #############################  RND N + 3 ############################//
   334  	MOVL     a, y0;                            \
   335  	ROLL     $12, y0;                          \ // y0 = a <<< 12
   336  	MOVL     e, y2;                            \
   337  	ADDL     $const, y2;                       \
   338  	ADDL     y0, y2;                           \ // y2 = a <<< 12 + e + T
   339  	VPSLLD   $15, XTMP4, XTMP2;                \ 
   340  	ROLL     $7, y2;                           \ // y2 = SS1
   341  	XORL     y2, y0                            \ // y0 = SS2
   342  	ADDL     (disp + 3*4)(SP), y2;             \ // y2 = SS1 + W
   343  	ADDL     h, y2;                            \ // y2 = h + SS1 + W    
   344  	VPSRLD   $(32-15), XTMP4, XTMP3;           \
   345  	ADDL     (disp + 3*4 + 16)(SP), y0;        \ // y0 = SS2 + W'
   346  	ADDL     d, y0;                            \ // y0 = d + SS2 + W'
   347  	MOVL     a, y1;                            \
   348  	ORL      b, y1;                            \
   349  	VPOR     XTMP3, XTMP2, XTMP3;              \ // XTMP3 = XTMP4 rol 15 {DCBA}
   350  	MOVL     a, h;                             \
   351  	ANDL     b, h;                             \
   352  	ANDL     c, y1;                            \
   353  	ORL      y1, h;                            \ // h =  (a AND b) OR (a AND c) OR (b AND c)  
   354  	VPSHUFB  R08_SHUFFLE_MASK, XTMP3, XTMP1;   \ // XTMP1 = XTMP4 rol 23 {DCBA}
   355  	ADDL     y0, h;                            \ // h = FF(a, b, c) + d + SS2 + W' = tt1
   356  	MOVL     f, y1;                            \
   357  	XORL     g, y1;                            \
   358  	ANDL     e, y1;                            \
   359  	VPXOR    XTMP3, XTMP4, XTMP3;              \ // XTMP3 = XTMP4 ^ (XTMP4 rol 15 {DCBA})
   360  	XORL     g, y1;                            \ // y1 = GG2(e, f, g) = (e AND f) OR (NOT(e) AND g)
   361  	ADDL     y1, y2;                           \ // y2 = GG2(e, f, g) + h + SS1 + W = tt2 
   362  	ROLL     $9, b;                            \
   363  	ROLL     $19, f;                           \
   364  	VPXOR    XTMP3, XTMP1, XTMP1;              \ // XTMP1 = XTMP4 ^ (XTMP4 rol 15 {DCBA}) ^ (XTMP4 rol 23 {DCBA})
   365  	P0(y2, y0, d);                             \
   366  	VPXOR    XTMP1, XTMP0, XWORD0;             \ // XWORD0 = {W[3], W[2], W[1], W[0]}
   367  
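// The vector instructions interleaved above expand four new message words per
// 4-round group; for one word the SM3 expansion is (Go sketch):
//
//	func p1(x uint32) uint32 {
//		return x ^ bits.RotateLeft32(x, 15) ^ bits.RotateLeft32(x, 23)
//	}
//
//	// for 16 <= j < 68:
//	w[j] = p1(w[j-16]^w[j-9]^bits.RotateLeft32(w[j-3], 15)) ^
//		bits.RotateLeft32(w[j-13], 7) ^ w[j-6]
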
   368  // For rounds [0 - 16)
   369  #define DO_ROUND_N_0(disp, idx, const, a, b, c, d, e, f, g, h) \
    370  	;                                            \ // #############################  RND N + idx ############################//
   371  	SS12(a, e, const, y2, y0);                   \
   372  	ADDL     (disp + idx*4)(SP), y2;             \ // y2 = SS1 + W
   373  	ADDL     h, y2;                              \ // y2 = h + SS1 + W    
   374  	ADDL     (disp + idx*4 + 16)(SP), y0;        \ // y0 = SS2 + W'
   375  	ADDL     d, y0;                              \ // y0 = d + SS2 + W'
   376  	;                                            \
   377  	MOVL     a, h;                               \
   378  	XORL     b, h;                               \
   379  	XORL     c, h;                               \
   380  	ADDL     y0, h;                              \ // h = FF(a, b, c) + d + SS2 + W' = tt1
   381  	;                                            \
   382  	MOVL     e, y1;                              \
   383  	XORL     f, y1;                              \
   384  	XORL     g, y1;                              \
   385  	ADDL     y1, y2;                             \ // y2 = GG(e, f, g) + h + SS1 + W = tt2  
   386  	;                                            \
   387  	ROLL     $9, b;                              \
   388  	ROLL     $19, f;                             \
   389  	;                                            \
   390  	P0(y2, y0, d)
   391  
   392  // For rounds [16 - 64)
   393  #define DO_ROUND_N_1(disp, idx, const, a, b, c, d, e, f, g, h) \
    394  	;                                            \ // #############################  RND N + idx ############################//
   395  	SS12(a, e, const, y2, y0);                   \
   396  	ADDL     (disp + idx*4)(SP), y2;             \ // y2 = SS1 + W
   397  	ADDL     h, y2;                              \ // y2 = h + SS1 + W    
   398  	ADDL     (disp + idx*4 + 16)(SP), y0;        \ // y0 = SS2 + W'
   399  	ADDL     d, y0;                              \ // y0 = d + SS2 + W'
   400  	;                                            \
   401  	MOVL     a, y1;                              \
   402  	ORL      b, y1;                              \
   403  	MOVL     a, h;                               \
   404  	ANDL     b, h;                               \
   405  	ANDL     c, y1;                              \
   406  	ORL      y1, h;                              \ // h =  (a AND b) OR (a AND c) OR (b AND c)  
   407  	ADDL     y0, h;                              \ // h = FF(a, b, c) + d + SS2 + W' = tt1
   408  	;                                            \
   409  	MOVL     f, y1;                              \
   410  	XORL     g, y1;                              \
   411  	ANDL     e, y1;                              \
   412  	XORL     g, y1;                              \ // y1 = GG2(e, f, g)	
   413  	ADDL     y1, y2;                             \ // y2 = GG2(e, f, g) + h + SS1 + W = tt2  
   414  	;                                            \
   415  	ROLL     $9, b;                              \
   416  	ROLL     $19, f;                             \
   417  	;                                            \
   418  	P0(y2, y0, d)
   419  
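// DO_ROUND_N_1 differs from DO_ROUND_N_0 only in the boolean functions used
// for j >= 16 (Go sketch; gg is computed as ((f^g)&e)^g above):
//
//	ff := (a & b) | (a & c) | (b & c) // FF(a,b,c), j >= 16
//	gg := (e & f) | (^e & g)          // GG(e,f,g), j >= 16
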
   420  // Requires: SSE2, SSSE3
   421  #define MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3) \
   422  	MOVOU  XWORD1, XTMP0;                    \ 
   423  	PALIGNR $12, XWORD0, XTMP0;              \ // XTMP0 = W[-13] = {w6,w5,w4,w3}
   424  	MOVOU  XTMP0, XTMP1;                     \
   425  	PSLLL  $7, XTMP1;                        \
   426  	PSRLL  $(32-7), XTMP0;                   \
   427  	POR    XTMP0, XTMP1;                     \ // XTMP1 = W[-13] rol 7
   428  	MOVOU  XWORD3, XTMP0;                    \
   429  	PALIGNR $8, XWORD2, XTMP0;               \ // XTMP0 = W[-6] = {w13,w12,w11,w10}
   430  	PXOR   XTMP1, XTMP0;                     \ // XTMP0 = W[-6] XOR (W[-13] rol 7) 
   431  	; \ // Prepare P1 parameters 
   432  	MOVOU  XWORD2, XTMP1;                    \
   433  	PALIGNR $12, XWORD1, XTMP1;              \ // XTMP1 = W[-9] = {w10,w9,w8,w7}
   434  	PXOR  XWORD0, XTMP1;                     \ // XTMP1 = W[-9] XOR W[-16]
   435  	PSHUFD $0xA5, XWORD3, XTMP2;             \ // XTMP2 = W[-3] {BBAA} {w14,w14,w13,w13}
   436  	PSRLQ  $17, XTMP2;                       \ // XTMP2 = W[-3] rol 15 {xBxA}
   437  	PXOR  XTMP1, XTMP2;                      \ // XTMP2 = W[-9] XOR W[-16] XOR (W[-3] rol 15) {xxxA}
   438  	; \ // P1
   439  	PSHUFD $0x00, XTMP2, XTMP2;              \ // XTMP2 = {AAAA}
   440  	MOVOU XTMP2, XTMP3;                      \
   441  	PSRLQ  $17, XTMP3;                       \ // XTMP3 = XTMP2 rol 15 {xxxA}
   442  	MOVOU XTMP2, XTMP4;                      \
   443  	PSRLQ  $9, XTMP4;                        \ // XTMP4 = XTMP2 rol 23 {xxxA}
   444  	PXOR  XTMP2, XTMP4;                      \
   445  	PXOR  XTMP3, XTMP4;                      \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {xxxA}) XOR (XTMP2 rol 23 {xxxA})
    446  	; \ // First message schedule word ready (low lane of XTMP2 = W[0])
   447  	MOVOU XTMP0, XTMP2;                      \
   448  	PXOR  XTMP4, XTMP2;                      \ // XTMP2 = {..., ..., ..., W[0]}
   449  	; \ // Prepare P1 parameters
   450  	PALIGNR  $4, XWORD3, XTMP2;              \ // XTMP2 = {W[0], w15, w14, w13}
   451  	MOVOU XTMP2, XTMP4;                      \
   452  	PSLLL  $15, XTMP4;                       \
   453  	PSRLL  $(32-15), XTMP2;                  \
   454  	POR  XTMP2, XTMP4;                       \ // XTMP4 = W[-3] rol 15 {DCBA}
   455  	PXOR XTMP1, XTMP4;                       \ // XTMP4 = W[-9] XOR W[-16] XOR (W[-3] rol 15) {DCBA}
   456  	; \ // P1
   457  	MOVOU XTMP4, XTMP2;                      \
   458  	PSLLL  $15, XTMP2;                       \
   459  	MOVOU XTMP4, XTMP3;                      \
   460  	PSRLL  $(32-15), XTMP3;                  \
   461  	POR  XTMP2, XTMP3;                       \ // XTMP3 = XTMP4 rol 15 {DCBA}
   462  	MOVOU XTMP3, XTMP1;                      \
   463  	PSHUFB  r08_mask<>(SB), XTMP1;           \ // XTMP1 = XTMP4 rol 23 {DCBA}
   464  	PXOR XTMP4, XTMP3;                       \
   465  	PXOR XTMP3, XTMP1;                       \ // XTMP1 = XTMP4 XOR (XTMP4 rol 15 {DCBA}) XOR (XTMP4 rol 23 {DCBA})
    466  	; \ // All 4 message schedule words ready
   467  	MOVOU XTMP0, XWORD0;                     \
   468  	PXOR XTMP1, XWORD0
   469  
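// For reference, the per-block transform that both the SSE and AVX paths
// below implement can be written in Go roughly as follows (illustrative
// sketch reusing the p0/p1 helpers sketched above; uses encoding/binary and
// math/bits; the T0..T63 constants from sm3_const_asm.s are assumed to equal
// bits.RotateLeft32(t, j) for the appropriate t). The assembly expands the
// message words on the fly instead of precomputing all 68 of them:
//
//	func blockGeneric(dig *[8]uint32, p []byte) {
//		var w [68]uint32
//		for len(p) >= 64 {
//			for i := 0; i < 16; i++ {
//				w[i] = binary.BigEndian.Uint32(p[4*i:])
//			}
//			for j := 16; j < 68; j++ {
//				w[j] = p1(w[j-16]^w[j-9]^bits.RotateLeft32(w[j-3], 15)) ^
//					bits.RotateLeft32(w[j-13], 7) ^ w[j-6]
//			}
//			a, b, c, d := dig[0], dig[1], dig[2], dig[3]
//			e, f, g, h := dig[4], dig[5], dig[6], dig[7]
//			for j := 0; j < 64; j++ {
//				t, ff, gg := uint32(0x79cc4519), a^b^c, e^f^g
//				if j >= 16 {
//					t = 0x7a879d8a
//					ff = (a & b) | (a & c) | (b & c)
//					gg = (e & f) | (^e & g)
//				}
//				ss1 := bits.RotateLeft32(bits.RotateLeft32(a, 12)+e+bits.RotateLeft32(t, j), 7)
//				ss2 := ss1 ^ bits.RotateLeft32(a, 12)
//				tt1 := ff + d + ss2 + (w[j] ^ w[j+4])
//				tt2 := gg + h + ss1 + w[j]
//				d, c, b, a = c, bits.RotateLeft32(b, 9), a, tt1
//				h, g, f, e = g, bits.RotateLeft32(f, 19), e, p0(tt2)
//			}
//			dig[0] ^= a
//			dig[1] ^= b
//			dig[2] ^= c
//			dig[3] ^= d
//			dig[4] ^= e
//			dig[5] ^= f
//			dig[6] ^= g
//			dig[7] ^= h
//			p = p[64:]
//		}
//	}
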
   470  TEXT ·blockSIMD(SB), 0, $48-32
   471  	MOVQ dig+0(FP), CTX          // d.h[8]
   472  	MOVQ p_base+8(FP), INP
   473  	MOVQ p_len+16(FP), NUM_BYTES
   474  
   475  	LEAQ -64(INP)(NUM_BYTES*1), NUM_BYTES // Pointer to the last block
   476  	MOVQ NUM_BYTES, _INP_END(SP)
   477  
   478  	// Load initial digest
   479  	MOVL 0(CTX), a  // a = H0
   480  	MOVL 4(CTX), b  // b = H1
   481  	MOVL 8(CTX), c  // c = H2
   482  	MOVL 12(CTX), d // d = H3
   483  	MOVL 16(CTX), e // e = H4
   484  	MOVL 20(CTX), f // f = H5
   485  	MOVL 24(CTX), g // g = H6
   486  	MOVL 28(CTX), h // h = H7
   487  
   488  	CMPB ·useAVX(SB), $1
   489  	JE   avx
   490  
   491  	MOVOU flip_mask<>(SB), X_BYTE_FLIP_MASK
   492  	MOVOU r08_mask<>(SB), R08_SHUFFLE_MASK
   493  
    494  sse_loop: // each iteration processes one 512-bit block
   495  	MOVOU 0(INP), XWORD0
   496  	MOVOU 16(INP), XWORD1
   497  	MOVOU 32(INP), XWORD2
   498  	MOVOU 48(INP), XWORD3
   499  
   500  	PSHUFB X_BYTE_FLIP_MASK, XWORD0 // w3,  w2,  w1,  w0
   501  	PSHUFB X_BYTE_FLIP_MASK, XWORD1 // w7,  w6,  w5,  w4
   502  	PSHUFB X_BYTE_FLIP_MASK, XWORD2 // w11, w10,  w9,  w8
   503  	PSHUFB X_BYTE_FLIP_MASK, XWORD3 // w15, w14, w13, w12
   504  
   505  	ADDQ $64, INP
   506  
   507  sse_schedule_compress: // for w0 - w47
   508  	// Do 4 rounds and scheduling
   509  	MOVOU XWORD0, (_XFER + 0*16)(SP)
   510  	MOVOU XWORD1, XFER
   511  	PXOR  XWORD0, XFER
   512  	MOVOU XFER, (_XFER + 1*16)(SP)
   513  	DO_ROUND_N_0(_XFER, 0, T0, a, b, c, d, e, f, g, h)
   514  	DO_ROUND_N_0(_XFER, 1, T1, h, a, b, c, d, e, f, g)
   515  	MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3)
   516  	DO_ROUND_N_0(_XFER, 2, T2, g, h, a, b, c, d, e, f)
   517  	DO_ROUND_N_0(_XFER, 3, T3, f, g, h, a, b, c, d, e)
   518  
   519  	// Do 4 rounds and scheduling
   520  	MOVOU XWORD1, (_XFER + 0*16)(SP)
   521  	MOVOU XWORD2, XFER
   522  	PXOR  XWORD1, XFER
   523  	MOVOU XFER, (_XFER + 1*16)(SP)
   524  	DO_ROUND_N_0(_XFER, 0, T4, e, f, g, h, a, b, c, d)
   525  	DO_ROUND_N_0(_XFER, 1, T5, d, e, f, g, h, a, b, c)
   526  	MESSAGE_SCHEDULE(XWORD1, XWORD2, XWORD3, XWORD0)
   527  	DO_ROUND_N_0(_XFER, 2, T6, c, d, e, f, g, h, a, b)
   528  	DO_ROUND_N_0(_XFER, 3, T7, b, c, d, e, f, g, h, a)
   529  
   530  	// Do 4 rounds and scheduling
   531  	MOVOU XWORD2, (_XFER + 0*16)(SP)
   532  	MOVOU XWORD3, XFER
   533  	PXOR  XWORD2, XFER
   534  	MOVOU XFER, (_XFER + 1*16)(SP)
   535  	DO_ROUND_N_0(_XFER, 0, T8, a, b, c, d, e, f, g, h)
   536  	DO_ROUND_N_0(_XFER, 1, T9, h, a, b, c, d, e, f, g)
   537  	MESSAGE_SCHEDULE(XWORD2, XWORD3, XWORD0, XWORD1)
   538  	DO_ROUND_N_0(_XFER, 2, T10, g, h, a, b, c, d, e, f)
   539  	DO_ROUND_N_0(_XFER, 3, T11, f, g, h, a, b, c, d, e)
   540  
   541  	// Do 4 rounds and scheduling
   542  	MOVOU XWORD3, (_XFER + 0*16)(SP)
   543  	MOVOU XWORD0, XFER
   544  	PXOR  XWORD3, XFER
   545  	MOVOU XFER, (_XFER + 1*16)(SP)
   546  	DO_ROUND_N_0(_XFER, 0, T12, e, f, g, h, a, b, c, d)
   547  	DO_ROUND_N_0(_XFER, 1, T13, d, e, f, g, h, a, b, c)
   548  	MESSAGE_SCHEDULE(XWORD3, XWORD0, XWORD1, XWORD2)
   549  	DO_ROUND_N_0(_XFER, 2, T14, c, d, e, f, g, h, a, b)
   550  	DO_ROUND_N_0(_XFER, 3, T15, b, c, d, e, f, g, h, a)
   551  
   552  	// Do 4 rounds and scheduling
   553  	MOVOU XWORD0, (_XFER + 0*16)(SP)
   554  	MOVOU XWORD1, XFER
   555  	PXOR  XWORD0, XFER
   556  	MOVOU XFER, (_XFER + 1*16)(SP)
   557  	DO_ROUND_N_1(_XFER, 0, T16, a, b, c, d, e, f, g, h)
   558  	DO_ROUND_N_1(_XFER, 1, T17, h, a, b, c, d, e, f, g)
   559  	MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3)
   560  	DO_ROUND_N_1(_XFER, 2, T18, g, h, a, b, c, d, e, f)
   561  	DO_ROUND_N_1(_XFER, 3, T19, f, g, h, a, b, c, d, e)
   562  
   563  	// Do 4 rounds and scheduling
   564  	MOVOU XWORD1, (_XFER + 0*16)(SP)
   565  	MOVOU XWORD2, XFER
   566  	PXOR  XWORD1, XFER
   567  	MOVOU XFER, (_XFER + 1*16)(SP)
   568  	DO_ROUND_N_1(_XFER, 0, T20, e, f, g, h, a, b, c, d)
   569  	DO_ROUND_N_1(_XFER, 1, T21, d, e, f, g, h, a, b, c)
   570  	MESSAGE_SCHEDULE(XWORD1, XWORD2, XWORD3, XWORD0)
   571  	DO_ROUND_N_1(_XFER, 2, T22, c, d, e, f, g, h, a, b)
   572  	DO_ROUND_N_1(_XFER, 3, T23, b, c, d, e, f, g, h, a)
   573  
   574  	// Do 4 rounds and scheduling
   575  	MOVOU XWORD2, (_XFER + 0*16)(SP)
   576  	MOVOU XWORD3, XFER
   577  	PXOR  XWORD2, XFER
   578  	MOVOU XFER, (_XFER + 1*16)(SP)
   579  	DO_ROUND_N_1(_XFER, 0, T24, a, b, c, d, e, f, g, h)
   580  	DO_ROUND_N_1(_XFER, 1, T25, h, a, b, c, d, e, f, g)
   581  	MESSAGE_SCHEDULE(XWORD2, XWORD3, XWORD0, XWORD1)
   582  	DO_ROUND_N_1(_XFER, 2, T26, g, h, a, b, c, d, e, f)
   583  	DO_ROUND_N_1(_XFER, 3, T27, f, g, h, a, b, c, d, e)
   584  
   585  	// Do 4 rounds and scheduling
   586  	MOVOU XWORD3, (_XFER + 0*16)(SP)
   587  	MOVOU XWORD0, XFER
   588  	PXOR  XWORD3, XFER
   589  	MOVOU XFER, (_XFER + 1*16)(SP)
   590  	DO_ROUND_N_1(_XFER, 0, T28, e, f, g, h, a, b, c, d)
   591  	DO_ROUND_N_1(_XFER, 1, T29, d, e, f, g, h, a, b, c)
   592  	MESSAGE_SCHEDULE(XWORD3, XWORD0, XWORD1, XWORD2)
   593  	DO_ROUND_N_1(_XFER, 2, T30, c, d, e, f, g, h, a, b)
   594  	DO_ROUND_N_1(_XFER, 3, T31, b, c, d, e, f, g, h, a)
   595  
   596  	// Do 4 rounds and scheduling
   597  	MOVOU XWORD0, (_XFER + 0*16)(SP)
   598  	MOVOU XWORD1, XFER
   599  	PXOR  XWORD0, XFER
   600  	MOVOU XFER, (_XFER + 1*16)(SP)
   601  	DO_ROUND_N_1(_XFER, 0, T32, a, b, c, d, e, f, g, h)
   602  	DO_ROUND_N_1(_XFER, 1, T33, h, a, b, c, d, e, f, g)
   603  	MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3)
   604  	DO_ROUND_N_1(_XFER, 2, T34, g, h, a, b, c, d, e, f)
   605  	DO_ROUND_N_1(_XFER, 3, T35, f, g, h, a, b, c, d, e)
   606  
   607  	// Do 4 rounds and scheduling
   608  	MOVOU XWORD1, (_XFER + 0*16)(SP)
   609  	MOVOU XWORD2, XFER
   610  	PXOR  XWORD1, XFER
   611  	MOVOU XFER, (_XFER + 1*16)(SP)
   612  	DO_ROUND_N_1(_XFER, 0, T36, e, f, g, h, a, b, c, d)
   613  	DO_ROUND_N_1(_XFER, 1, T37, d, e, f, g, h, a, b, c)
   614  	MESSAGE_SCHEDULE(XWORD1, XWORD2, XWORD3, XWORD0)
   615  	DO_ROUND_N_1(_XFER, 2, T38, c, d, e, f, g, h, a, b)
   616  	DO_ROUND_N_1(_XFER, 3, T39, b, c, d, e, f, g, h, a)
   617  
   618  	// Do 4 rounds and scheduling
   619  	MOVOU XWORD2, (_XFER + 0*16)(SP)
   620  	MOVOU XWORD3, XFER
   621  	PXOR  XWORD2, XFER
   622  	MOVOU XFER, (_XFER + 1*16)(SP)
   623  	DO_ROUND_N_1(_XFER, 0, T40, a, b, c, d, e, f, g, h)
   624  	DO_ROUND_N_1(_XFER, 1, T41, h, a, b, c, d, e, f, g)
   625  	MESSAGE_SCHEDULE(XWORD2, XWORD3, XWORD0, XWORD1)
   626  	DO_ROUND_N_1(_XFER, 2, T42, g, h, a, b, c, d, e, f)
   627  	DO_ROUND_N_1(_XFER, 3, T43, f, g, h, a, b, c, d, e)
   628  
   629  	// Do 4 rounds and scheduling
   630  	MOVOU XWORD3, (_XFER + 0*16)(SP)
   631  	MOVOU XWORD0, XFER
   632  	PXOR  XWORD3, XFER
   633  	MOVOU XFER, (_XFER + 1*16)(SP)
   634  	DO_ROUND_N_1(_XFER, 0, T44, e, f, g, h, a, b, c, d)
   635  	DO_ROUND_N_1(_XFER, 1, T45, d, e, f, g, h, a, b, c)
   636  	MESSAGE_SCHEDULE(XWORD3, XWORD0, XWORD1, XWORD2)
   637  	DO_ROUND_N_1(_XFER, 2, T46, c, d, e, f, g, h, a, b)
   638  	DO_ROUND_N_1(_XFER, 3, T47, b, c, d, e, f, g, h, a)
   639  
    640  	// w48 - w63: the last 16 rounds need only one more 4-word schedule (W[64..67] for the final W' values)
   641  	// Do 4 rounds
   642  	MOVOU XWORD0, (_XFER + 0*16)(SP)
   643  	MOVOU XWORD1, XFER
   644  	PXOR  XWORD0, XFER
   645  	MOVOU XFER, (_XFER + 1*16)(SP)
   646  	DO_ROUND_N_1(_XFER, 0, T48, a, b, c, d, e, f, g, h)
   647  	DO_ROUND_N_1(_XFER, 1, T49, h, a, b, c, d, e, f, g)
   648  	DO_ROUND_N_1(_XFER, 2, T50, g, h, a, b, c, d, e, f)
   649  	DO_ROUND_N_1(_XFER, 3, T51, f, g, h, a, b, c, d, e)
   650  
   651  	// Do 4 rounds
   652  	MOVOU XWORD1, (_XFER + 0*16)(SP)
   653  	MOVOU XWORD2, XFER
   654  	PXOR  XWORD1, XFER
   655  	MOVOU XFER, (_XFER + 1*16)(SP)
   656  	DO_ROUND_N_1(_XFER, 0, T52, e, f, g, h, a, b, c, d)
   657  	DO_ROUND_N_1(_XFER, 1, T53, d, e, f, g, h, a, b, c)
   658  	DO_ROUND_N_1(_XFER, 2, T54, c, d, e, f, g, h, a, b)
   659  	DO_ROUND_N_1(_XFER, 3, T55, b, c, d, e, f, g, h, a)
   660  
   661  	// Do 4 rounds
   662  	MOVOU XWORD2, (_XFER + 0*16)(SP)
   663  	MOVOU XWORD3, XFER
   664  	PXOR  XWORD2, XFER
   665  	MOVOU XFER, (_XFER + 1*16)(SP)
   666  	MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3)
   667  	DO_ROUND_N_1(_XFER, 0, T56, a, b, c, d, e, f, g, h)
   668  	DO_ROUND_N_1(_XFER, 1, T57, h, a, b, c, d, e, f, g)
   669  	DO_ROUND_N_1(_XFER, 2, T58, g, h, a, b, c, d, e, f)
   670  	DO_ROUND_N_1(_XFER, 3, T59, f, g, h, a, b, c, d, e)
   671  
   672  	// Do 4 rounds
   673  	MOVOU XWORD3, (_XFER + 0*16)(SP)
   674  	MOVOU XWORD0, XFER
   675  	PXOR  XWORD3, XFER
   676  	MOVOU XFER, (_XFER + 1*16)(SP)
   677  	DO_ROUND_N_1(_XFER, 0, T60, e, f, g, h, a, b, c, d)
   678  	DO_ROUND_N_1(_XFER, 1, T61, d, e, f, g, h, a, b, c)
   679  	DO_ROUND_N_1(_XFER, 2, T62, c, d, e, f, g, h, a, b)
   680  	DO_ROUND_N_1(_XFER, 3, T63, b, c, d, e, f, g, h, a)
   681  
   682  	xorm(  0(CTX), a)
   683  	xorm(  4(CTX), b)
   684  	xorm(  8(CTX), c)
   685  	xorm( 12(CTX), d)
   686  	xorm( 16(CTX), e)
   687  	xorm( 20(CTX), f)
   688  	xorm( 24(CTX), g)
   689  	xorm( 28(CTX), h)
   690  
   691  	CMPQ _INP_END(SP), INP
   692  	JAE   sse_loop
   693  
   694  sse_done_hash:
   695  	RET
   696  
   697  avx:	
   698  	VMOVDQU flip_mask<>(SB), X_BYTE_FLIP_MASK
   699  	VMOVDQU r08_mask<>(SB), R08_SHUFFLE_MASK
   700  
    701  avx_loop: // each iteration processes one 512-bit block
   702  
   703  	VMOVDQU 0(INP), XWORD0
   704  	VMOVDQU 16(INP), XWORD1
   705  	VMOVDQU 32(INP), XWORD2
   706  	VMOVDQU 48(INP), XWORD3
   707  
   708  	VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0 // w3,  w2,  w1,  w0
   709  	VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1 // w7,  w6,  w5,  w4
   710  	VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2 // w11, w10,  w9,  w8
   711  	VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3 // w15, w14, w13, w12
   712  
   713  	ADDQ $64, INP
   714  
   715  avx_schedule_compress: // for w0 - w47
   716  	// Do 4 rounds and scheduling
   717  	VMOVDQU XWORD0, (_XFER + 0*16)(SP)
   718  	VPXOR  XWORD0, XWORD1, XFER
   719  	VMOVDQU XFER, (_XFER + 1*16)(SP)
   720  	ROUND_AND_SCHED_N_0_0(_XFER, T0, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3)
   721  	ROUND_AND_SCHED_N_0_1(_XFER, T1, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3)
   722  	ROUND_AND_SCHED_N_0_2(_XFER, T2, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3)
   723  	ROUND_AND_SCHED_N_0_3(_XFER, T3, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3)
   724  
   725  	// Do 4 rounds and scheduling
   726  	VMOVDQU XWORD1, (_XFER + 0*16)(SP)
   727  	VPXOR  XWORD1, XWORD2, XFER
   728  	VMOVDQU XFER, (_XFER + 1*16)(SP)
   729  	ROUND_AND_SCHED_N_0_0(_XFER, T4, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0)
   730  	ROUND_AND_SCHED_N_0_1(_XFER, T5, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0)
   731  	ROUND_AND_SCHED_N_0_2(_XFER, T6, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0)
   732  	ROUND_AND_SCHED_N_0_3(_XFER, T7, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0)
   733  
   734  	// Do 4 rounds and scheduling
   735  	VMOVDQU XWORD2, (_XFER + 0*16)(SP)
   736  	VPXOR  XWORD2, XWORD3, XFER
   737  	VMOVDQU XFER, (_XFER + 1*16)(SP)
   738  	ROUND_AND_SCHED_N_0_0(_XFER, T8, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1)
   739  	ROUND_AND_SCHED_N_0_1(_XFER, T9, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1)
   740  	ROUND_AND_SCHED_N_0_2(_XFER, T10, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1)
   741  	ROUND_AND_SCHED_N_0_3(_XFER, T11, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1)
   742  
   743  	// Do 4 rounds and scheduling
   744  	VMOVDQU XWORD3, (_XFER + 0*16)(SP)
   745  	VPXOR  XWORD3, XWORD0, XFER
   746  	VMOVDQU XFER, (_XFER + 1*16)(SP)
   747  	ROUND_AND_SCHED_N_0_0(_XFER, T12, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2)
   748  	ROUND_AND_SCHED_N_0_1(_XFER, T13, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2)
   749  	ROUND_AND_SCHED_N_0_2(_XFER, T14, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2)
   750  	ROUND_AND_SCHED_N_0_3(_XFER, T15, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2)
   751  
   752  	// Do 4 rounds and scheduling
   753  	VMOVDQU XWORD0, (_XFER + 0*16)(SP)
   754  	VPXOR  XWORD0, XWORD1, XFER
   755  	VMOVDQU XFER, (_XFER + 1*16)(SP)
   756  	ROUND_AND_SCHED_N_1_0(_XFER, T16, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3)
   757  	ROUND_AND_SCHED_N_1_1(_XFER, T17, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3)
   758  	ROUND_AND_SCHED_N_1_2(_XFER, T18, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3)
   759  	ROUND_AND_SCHED_N_1_3(_XFER, T19, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3)
   760  
   761  	// Do 4 rounds and scheduling
   762  	VMOVDQU XWORD1, (_XFER + 0*16)(SP)
   763  	VPXOR  XWORD1, XWORD2, XFER
   764  	VMOVDQU XFER, (_XFER + 1*16)(SP)
   765  	ROUND_AND_SCHED_N_1_0(_XFER, T20, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0)
   766  	ROUND_AND_SCHED_N_1_1(_XFER, T21, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0)
   767  	ROUND_AND_SCHED_N_1_2(_XFER, T22, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0)
   768  	ROUND_AND_SCHED_N_1_3(_XFER, T23, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0)
   769  
   770  	// Do 4 rounds and scheduling
   771  	VMOVDQU XWORD2, (_XFER + 0*16)(SP)
   772  	VPXOR  XWORD2, XWORD3, XFER
   773  	VMOVDQU XFER, (_XFER + 1*16)(SP)
   774  
   775  	ROUND_AND_SCHED_N_1_0(_XFER, T24, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1)
   776  	ROUND_AND_SCHED_N_1_1(_XFER, T25, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1)
   777  	ROUND_AND_SCHED_N_1_2(_XFER, T26, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1)
   778  	ROUND_AND_SCHED_N_1_3(_XFER, T27, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1)
   779  
   780  	// Do 4 rounds and scheduling
   781  	VMOVDQU XWORD3, (_XFER + 0*16)(SP)
   782  	VPXOR  XWORD3, XWORD0, XFER
   783  	VMOVDQU XFER, (_XFER + 1*16)(SP)
   784  	ROUND_AND_SCHED_N_1_0(_XFER, T28, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2)
   785  	ROUND_AND_SCHED_N_1_1(_XFER, T29, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2)
   786  	ROUND_AND_SCHED_N_1_2(_XFER, T30, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2)
   787  	ROUND_AND_SCHED_N_1_3(_XFER, T31, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2)
   788  
   789  	// Do 4 rounds and scheduling
   790  	VMOVDQU XWORD0, (_XFER + 0*16)(SP)
   791  	VPXOR  XWORD0, XWORD1, XFER
   792  	VMOVDQU XFER, (_XFER + 1*16)(SP)
   793  	ROUND_AND_SCHED_N_1_0(_XFER, T32, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3)
   794  	ROUND_AND_SCHED_N_1_1(_XFER, T33, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3)
   795  	ROUND_AND_SCHED_N_1_2(_XFER, T34, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3)
   796  	ROUND_AND_SCHED_N_1_3(_XFER, T35, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3)
   797  
   798  	// Do 4 rounds and scheduling
   799  	VMOVDQU XWORD1, (_XFER + 0*16)(SP)
   800  	VPXOR  XWORD1, XWORD2, XFER
   801  	VMOVDQU XFER, (_XFER + 1*16)(SP)
   802  	ROUND_AND_SCHED_N_1_0(_XFER, T36, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0)
   803  	ROUND_AND_SCHED_N_1_1(_XFER, T37, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0)
   804  	ROUND_AND_SCHED_N_1_2(_XFER, T38, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0)
   805  	ROUND_AND_SCHED_N_1_3(_XFER, T39, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0)
   806  
   807  	// Do 4 rounds and scheduling
   808  	VMOVDQU XWORD2, (_XFER + 0*16)(SP)
   809  	VPXOR  XWORD2, XWORD3, XFER
   810  	VMOVDQU XFER, (_XFER + 1*16)(SP)
   811  	ROUND_AND_SCHED_N_1_0(_XFER, T40, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1)
   812  	ROUND_AND_SCHED_N_1_1(_XFER, T41, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1)
   813  	ROUND_AND_SCHED_N_1_2(_XFER, T42, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1)
   814  	ROUND_AND_SCHED_N_1_3(_XFER, T43, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1)
   815  
   816  	// Do 4 rounds and scheduling
   817  	VMOVDQU XWORD3, (_XFER + 0*16)(SP)
   818  	VPXOR  XWORD3, XWORD0, XFER
   819  	VMOVDQU XFER, (_XFER + 1*16)(SP)
   820  	ROUND_AND_SCHED_N_1_0(_XFER, T44, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2)
   821  	ROUND_AND_SCHED_N_1_1(_XFER, T45, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2)
   822  	ROUND_AND_SCHED_N_1_2(_XFER, T46, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2)
   823  	ROUND_AND_SCHED_N_1_3(_XFER, T47, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2)
   824  
    825  	// w48 - w63: only one more 4-round schedule is needed in the last 16 rounds
   826  	// Do 4 rounds and scheduling
   827  	VMOVDQU XWORD0, (_XFER + 0*16)(SP)
   828  	VPXOR  XWORD0, XWORD1, XFER
   829  	VMOVDQU XFER, (_XFER + 1*16)(SP)
   830  	ROUND_AND_SCHED_N_1_0(_XFER, T48, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3)
   831  	ROUND_AND_SCHED_N_1_1(_XFER, T49, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3)
   832  	ROUND_AND_SCHED_N_1_2(_XFER, T50, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3)
   833  	ROUND_AND_SCHED_N_1_3(_XFER, T51, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3)  
   834  
    835  	// w52 - w63: no further scheduling needed (last 12 rounds)
   836  	// Do 4 rounds
   837  	VMOVDQU XWORD1, (_XFER + 0*16)(SP)
   838  	VPXOR  XWORD1, XWORD2, XFER
   839  	VMOVDQU XFER, (_XFER + 1*16)(SP)
   840  	DO_ROUND_N_1(_XFER, 0, T52, e, f, g, h, a, b, c, d)
   841  	DO_ROUND_N_1(_XFER, 1, T53, d, e, f, g, h, a, b, c)
   842  	DO_ROUND_N_1(_XFER, 2, T54, c, d, e, f, g, h, a, b)
   843  	DO_ROUND_N_1(_XFER, 3, T55, b, c, d, e, f, g, h, a)
   844  
   845  	// Do 4 rounds
   846  	VMOVDQU XWORD2, (_XFER + 0*16)(SP)
   847  	VPXOR  XWORD2, XWORD3, XFER
   848  	VMOVDQU XFER, (_XFER + 1*16)(SP)
   849  	DO_ROUND_N_1(_XFER, 0, T56, a, b, c, d, e, f, g, h)
   850  	DO_ROUND_N_1(_XFER, 1, T57, h, a, b, c, d, e, f, g)
   851  	DO_ROUND_N_1(_XFER, 2, T58, g, h, a, b, c, d, e, f)
   852  	DO_ROUND_N_1(_XFER, 3, T59, f, g, h, a, b, c, d, e)
   853  
   854  	// Do 4 rounds
   855  	VMOVDQU XWORD3, (_XFER + 0*16)(SP)
   856  	VPXOR  XWORD3, XWORD0, XFER
   857  	VMOVDQU XFER, (_XFER + 1*16)(SP)
   858  	DO_ROUND_N_1(_XFER, 0, T60, e, f, g, h, a, b, c, d)
   859  	DO_ROUND_N_1(_XFER, 1, T61, d, e, f, g, h, a, b, c)
   860  	DO_ROUND_N_1(_XFER, 2, T62, c, d, e, f, g, h, a, b)
   861  	DO_ROUND_N_1(_XFER, 3, T63, b, c, d, e, f, g, h, a)
   862  
   863  	xorm(  0(CTX), a)
   864  	xorm(  4(CTX), b)
   865  	xorm(  8(CTX), c)
   866  	xorm( 12(CTX), d)
   867  	xorm( 16(CTX), e)
   868  	xorm( 20(CTX), f)
   869  	xorm( 24(CTX), g)
   870  	xorm( 28(CTX), h)
   871  
   872  	CMPQ _INP_END(SP), INP
   873  	JAE   avx_loop
   874  
   875  done_hash:
   876  	RET
   877  
   878  // shuffle byte order from LE to BE
   879  DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
   880  DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
   881  GLOBL flip_mask<>(SB), 8, $16
   882  
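// per-lane byte rotation: applying this PSHUFB mask rotates every 32-bit lane
// left by 8 bits, turning an existing "rol 15" into the "rol 23" needed by P1;
// per lane the effect matches bits.RotateLeft32(x, 8) in Go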
   883  DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
   884  DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
   885  GLOBL r08_mask<>(SB), 8, $16