github.com/emmansun/gmsm@v0.29.1/sm3/sm3block_arm64.s

//go:build !purego

#include "textflag.h"

#include "sm3_const_asm.s"
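
// Reference formulas for the SM3 compression function implemented below:
//   SS1 = ((a <<< 12) + e + T_j) <<< 7
//   SS2 = SS1 XOR (a <<< 12)
//   TT1 = FF_j(a, b, c) + d + SS2 + W'_j
//   TT2 = GG_j(e, f, g) + h + SS1 + W_j
//   d = c, c = b <<< 9, b = a, a = TT1
//   h = g, g = f <<< 19, f = e, e = P0(TT2)
// Message schedule:
//   W_j  = P1(W_{j-16} XOR W_{j-9} XOR (W_{j-3} <<< 15)) XOR (W_{j-13} <<< 7) XOR W_{j-6}
//   W'_j = W_j XOR W_{j+4}
// where P0(x) = x XOR (x <<< 9) XOR (x <<< 17) and P1(x) = x XOR (x <<< 15) XOR (x <<< 23).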

#define XWORD0 V0
#define XWORD1 V1
#define XWORD2 V2
#define XWORD3 V3

#define XTMP0 V4
#define XTMP1 V5
#define XTMP2 V6
#define XTMP3 V7
#define XTMP4 V8

#define Wt V9

#define a R0
#define b R1
#define c R2
#define d R3
#define e R4
#define f R5
#define g R6
#define h R7

#define y0 R8
#define y1 R9
#define y2 R10

#define NUM_BYTES R11
#define INP	R12
#define CTX R13 // Beginning of digest in memory (a, b, c, ... , h)

#define a1 R15
#define b1 R16
#define c1 R19
#define d1 R20
#define e1 R21
#define f1 R22
#define g1 R23
#define h1 R24

// For rounds [0 - 16)
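// FF_j(x, y, z) = GG_j(x, y, z) = x XOR y XOR z for these rounds.
// Each ROUND_AND_SCHED_N_0_x macro interleaves one scalar round with a quarter
// of the NEON message-schedule work that produces the next four W words.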
#define ROUND_AND_SCHED_N_0_0(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt) \
	RORW  $20, a, y0;                             \ // y0 = a <<< 12
	ADDW  $const, e, y1;                          \
	ADDW  y0, y1;                                 \ // y1 = a <<< 12 + e + T
	VEXT $12, XWORD1.B16, XWORD0.B16, XTMP0.B16;  \ // XTMP0 = W[-13] = {w6,w5,w4,w3}, Vm = XWORD1, Vn = XWORD0
	RORW  $25, y1, y2;                            \ // y2 = SS1
	EORW  y2, y0;                                 \ // y0 = SS2
	VMOV  XWORD0.S[0], y1;                        \
	VSHL $7, XTMP0.S4, XTMP1.S4;                  \
	ADDW  y1, y2;                                 \ // y2 = SS1 + W
	ADDW  h, y2;                                  \ // y2 = h + SS1 + W
	VMOV  Wt.S[0], y1;                            \
	VSRI $25, XTMP0.S4, XTMP1.S4;                 \ // XTMP1 = W[-13] rol 7
	ADDW  y1, y0;                                 \ // y0 = SS2 + W'
	ADDW  d, y0;                                  \ // y0 = d + SS2 + W'
	; \
	EORW  a, b, h;                                \
	VEXT $8, XWORD3.B16, XWORD2.B16, XTMP0.B16;   \ // XTMP0 = W[-6] = {w13,w12,w11,w10}
	EORW  c, h;                                   \
	ADDW  y0, h;                                  \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	; \
	EORW  e, f, y1;                               \
	VEOR XTMP1.B16, XTMP0.B16, XTMP0.B16;         \ // XTMP0 = W[-6] ^ (W[-13] rol 7)
	EORW  g, y1;                                  \
	ADDW  y1, y2;                                 \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	; \
	RORW  $23, b;                                 \
	VEXT $12, XWORD2.B16, XWORD1.B16, XTMP1.B16;  \ // XTMP1 = W[-9] = {w10,w9,w8,w7}, Vm = XWORD2, Vn = XWORD1
	RORW  $13, f;                                 \
	; \
	RORW  $23, y2, y0;                            \
	RORW  $15, y2, d;                             \
	VEOR XWORD0.B16, XTMP1.B16, XTMP1.B16;        \ // XTMP1 = W[-9] ^ W[-16]
	EORW  y0, d;                                  \
	EORW  y2, d;                                  \ // d = P(tt2)
	VEXT $4, XWORD2.B16, XWORD3.B16, XTMP3.B16;   \ // XTMP3 = W[-3] {w8,w15,w14,w13}

#define ROUND_AND_SCHED_N_0_1(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt) \
	RORW  $20, a, y0;                             \ // y0 = a <<< 12
	ADDW  $const, e, y1;                          \
	ADDW  y0, y1;                                 \ // y1 = a <<< 12 + e + T
	VSHL $15, XTMP3.S4, XTMP2.S4;                 \
	RORW  $25, y1, y2;                            \ // y2 = SS1
	EORW  y2, y0;                                 \ // y0 = SS2
	VMOV  XWORD0.S[1], y1;                        \
	VSRI $17, XTMP3.S4, XTMP2.S4;                 \ // XTMP2 = W[-3] rol 15 {xxBA}
	ADDW  y1, y2;                                 \ // y2 = SS1 + W
	ADDW  h, y2;                                  \ // y2 = h + SS1 + W
	VMOV  Wt.S[1], y1;                            \
	VEOR XTMP1.B16, XTMP2.B16, XTMP2.B16;         \ // XTMP2 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {xxBA}
	ADDW  y1, y0;                                 \ // y0 = SS2 + W'
	ADDW  d, y0;                                  \ // y0 = d + SS2 + W'
	; \
	EORW  a, b, h;                                \
	VSHL $15, XTMP2.S4, XTMP4.S4;                 \
	EORW  c, h;                                   \
	ADDW  y0, h;                                  \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	; \
	EORW  e, f, y1;                               \
	VSRI $17, XTMP2.S4, XTMP4.S4;                 \ // XTMP4 = XTMP2 rol 15 {xxBA}
	EORW  g, y1;                                  \
	ADDW  y1, y2;                                 \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	; \
	RORW  $23, b;                                 \
	VSHL $8, XTMP4.S4, XTMP3.S4;                  \
	RORW  $13, f;                                 \
	; \
	RORW  $23, y2, y0;                            \
	RORW  $15, y2, d;                             \
	VSRI $24, XTMP4.S4, XTMP3.S4;                 \ // XTMP3 = XTMP2 rol 23 {xxBA}
	EORW  y0, d;                                  \
	EORW  y2, d;                                  \ // d = P(tt2)
	VEOR XTMP2.B16, XTMP4.B16, XTMP4.B16;         \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {xxBA})

#define ROUND_AND_SCHED_N_0_2(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt) \
	RORW  $20, a, y0;                             \ // y0 = a <<< 12
	ADDW  $const, e, y1;                          \
	ADDW  y0, y1;                                 \ // y1 = a <<< 12 + e + T
	VEOR XTMP4.B16, XTMP3.B16, XTMP4.B16;         \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {xxBA}) XOR (XTMP2 rol 23 {xxBA})
	RORW  $25, y1, y2;                            \ // y2 = SS1
	EORW  y2, y0;                                 \ // y0 = SS2
	VMOV  XWORD0.S[2], y1;                        \
	VEOR XTMP4.B16, XTMP0.B16, XTMP2.B16;         \ // XTMP2 = {..., ..., W[1], W[0]}
	ADDW  y1, y2;                                 \ // y2 = SS1 + W
	ADDW  h, y2;                                  \ // y2 = h + SS1 + W
	VMOV  Wt.S[2], y1;                            \
	VEXT $4, XTMP2.B16, XWORD3.B16, XTMP3.B16;    \ // XTMP3 = W[-3] {W[0],w15,w14,w13}, Vm = XTMP2, Vn = XWORD3
	ADDW  y1, y0;                                 \ // y0 = SS2 + W'
	ADDW  d, y0;                                  \ // y0 = d + SS2 + W'
	; \
	EORW  a, b, h;                                \
	VSHL $15, XTMP3.S4, XTMP4.S4;                 \
	EORW  c, h;                                   \
	ADDW  y0, h;                                  \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	; \
	EORW  e, f, y1;                               \
	VSRI $17, XTMP3.S4, XTMP4.S4;                 \ // XTMP4 = W[-3] rol 15 {DCBA}
	EORW  g, y1;                                  \
	ADDW  y1, y2;                                 \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	RORW  $23, b;                                 \
	RORW  $13, f;                                 \
	VEOR XTMP1.B16, XTMP4.B16, XTMP4.B16;         \ // XTMP4 = W[-9] XOR W[-16] XOR (W[-3] rol 15) {DCBA}
	RORW  $23, y2, y0;                            \
	RORW  $15, y2, d;                             \
	EORW  y0, d;                                  \
	EORW  y2, d;                                  \ // d = P(tt2)
	VSHL $15, XTMP4.S4, XTMP3.S4;                 \

#define ROUND_AND_SCHED_N_0_3(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt) \
	RORW  $20, a, y0;                             \ // y0 = a <<< 12
	ADDW  $const, e, y1;                          \
	ADDW  y0, y1;                                 \ // y1 = a <<< 12 + e + T
	RORW  $25, y1, y2;                            \ // y2 = SS1
	VSRI $17, XTMP4.S4, XTMP3.S4;                 \ // XTMP3 = XTMP4 rol 15 {DCBA}
	EORW  y2, y0;                                 \ // y0 = SS2
	VMOV  XWORD0.S[3], y1;                        \
	ADDW  y1, y2;                                 \ // y2 = SS1 + W
	VSHL $8, XTMP3.S4, XTMP1.S4;                  \
	ADDW  h, y2;                                  \ // y2 = h + SS1 + W
	VMOV  Wt.S[3], y1;                            \
	ADDW  y1, y0;                                 \ // y0 = SS2 + W'
	ADDW  d, y0;                                  \ // y0 = d + SS2 + W'
	VSRI $24, XTMP3.S4, XTMP1.S4;                 \ // XTMP1 = XTMP4 rol 23 {DCBA}
	EORW  a, b, h;                                \
	EORW  c, h;                                   \
	ADDW  y0, h;                                  \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	EORW  e, f, y1;                               \
	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16;         \ // XTMP3 = XTMP4 XOR (XTMP4 rol 15 {DCBA})
	EORW  g, y1;                                  \
	ADDW  y1, y2;                                 \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	RORW  $23, b;                                 \
	RORW  $13, f;                                 \
	VEOR XTMP3.B16, XTMP1.B16, XTMP1.B16;         \ // XTMP1 = XTMP4 XOR (XTMP4 rol 15 {DCBA}) XOR (XTMP4 rol 23 {DCBA})
	RORW  $23, y2, y0;                            \
	RORW  $15, y2, d;                             \
	EORW  y0, d;                                  \
	EORW  y2, d;                                  \ // d = P(tt2)
	VEOR XTMP1.B16, XTMP0.B16, XWORD0.B16;        \ // XWORD0 = {W[3], W[2], W[1], W[0]}

// For rounds [16 - 64)
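// FF_j(x, y, z) = (x AND y) OR (x AND z) OR (y AND z) and
// GG_j(x, y, z) = (x AND y) OR (NOT(x) AND z) for rounds 16 - 63.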
#define ROUND_AND_SCHED_N_1_0(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt) \
	RORW  $20, a, y0;                             \ // y0 = a <<< 12
	ADDW  $const, e, y1;                          \
	ADDW  y0, y1;                                 \ // y1 = a <<< 12 + e + T
	VEXT $12, XWORD1.B16, XWORD0.B16, XTMP0.B16;  \ // XTMP0 = W[-13] = {w6,w5,w4,w3}, Vm = XWORD1, Vn = XWORD0
	RORW  $25, y1, y2;                            \ // y2 = SS1
	EORW  y2, y0;                                 \ // y0 = SS2
	VMOV  XWORD0.S[0], y1;                        \
	VSHL $7, XTMP0.S4, XTMP1.S4;                  \
	ADDW  y1, y2;                                 \ // y2 = SS1 + W
	ADDW  h, y2;                                  \ // y2 = h + SS1 + W
	VMOV  Wt.S[0], y1;                            \
	VSRI $25, XTMP0.S4, XTMP1.S4;                 \ // XTMP1 = W[-13] rol 7
	ADDW  y1, y0;                                 \ // y0 = SS2 + W'
	ADDW  d, y0;                                  \ // y0 = d + SS2 + W'
	; \
	ORRW  a, b, y1;                               \
	VEXT $8, XWORD3.B16, XWORD2.B16, XTMP0.B16;   \ // XTMP0 = W[-6] = {w13,w12,w11,w10}
	ANDW  a, b, h;                                \
	ANDW  c, y1;                                  \
	ORRW  y1, h;                                  \ // h = (a AND b) OR (a AND c) OR (b AND c)
	VEOR XTMP1.B16, XTMP0.B16, XTMP0.B16;         \ // XTMP0 = W[-6] ^ (W[-13] rol 7)
	ADDW  y0, h;                                  \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	; \
	EORW  f, g, y1;                               \
	ANDW  e, y1;                                  \
	VEXT $12, XWORD2.B16, XWORD1.B16, XTMP1.B16;  \ // XTMP1 = W[-9] = {w10,w9,w8,w7}, Vm = XWORD2, Vn = XWORD1
	EORW  g, y1;                                  \ // y1 = GG2(e, f, g) = (e AND f) OR (NOT(e) AND g)
	ADDW  y1, y2;                                 \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	; \
	RORW  $23, b;                                 \
	RORW  $13, f;                                 \
	VEOR XWORD0.B16, XTMP1.B16, XTMP1.B16;        \ // XTMP1 = W[-9] ^ W[-16]
	; \
	RORW  $23, y2, y0;                            \
	RORW  $15, y2, d;                             \
	EORW  y0, d;                                  \
	EORW  y2, d;                                  \ // d = P(tt2)
	VEXT $4, XWORD2.B16, XWORD3.B16, XTMP3.B16;   \ // XTMP3 = W[-3] {w8,w15,w14,w13}

#define ROUND_AND_SCHED_N_1_1(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt) \
	RORW  $20, a, y0;                             \ // y0 = a <<< 12
	ADDW  $const, e, y1;                          \
	ADDW  y0, y1;                                 \ // y1 = a <<< 12 + e + T
	VSHL $15, XTMP3.S4, XTMP2.S4;                 \
	RORW  $25, y1, y2;                            \ // y2 = SS1
	EORW  y2, y0;                                 \ // y0 = SS2
	VMOV  XWORD0.S[1], y1;                        \
	VSRI $17, XTMP3.S4, XTMP2.S4;                 \ // XTMP2 = W[-3] rol 15 {xxBA}
	ADDW  y1, y2;                                 \ // y2 = SS1 + W
	ADDW  h, y2;                                  \ // y2 = h + SS1 + W
	VMOV  Wt.S[1], y1;                            \
	VEOR XTMP1.B16, XTMP2.B16, XTMP2.B16;         \ // XTMP2 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {xxBA}
	ADDW  y1, y0;                                 \ // y0 = SS2 + W'
	ADDW  d, y0;                                  \ // y0 = d + SS2 + W'
	; \
	ORRW  a, b, y1;                               \
	VSHL $15, XTMP2.S4, XTMP4.S4;                 \
	ANDW  a, b, h;                                \
	ANDW  c, y1;                                  \
	ORRW  y1, h;                                  \ // h = (a AND b) OR (a AND c) OR (b AND c)
	VSRI $17, XTMP2.S4, XTMP4.S4;                 \ // XTMP4 = XTMP2 rol 15 {xxBA}
	ADDW  y0, h;                                  \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	; \
	EORW  f, g, y1;                               \
	ANDW  e, y1;                                  \
	EORW  g, y1;                                  \ // y1 = GG2(e, f, g) = (e AND f) OR (NOT(e) AND g)
	VSHL $8, XTMP4.S4, XTMP3.S4;                  \
	ADDW  y1, y2;                                 \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	; \
	RORW  $23, b;                                 \
	RORW  $13, f;                                 \
	; \
	RORW  $23, y2, y0;                            \
	VSRI $24, XTMP4.S4, XTMP3.S4;                 \ // XTMP3 = XTMP2 rol 23 {xxBA}
	RORW  $15, y2, d;                             \
	EORW  y0, d;                                  \
	EORW  y2, d;                                  \ // d = P(tt2)
	VEOR XTMP2.B16, XTMP4.B16, XTMP4.B16;         \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {xxBA})

#define ROUND_AND_SCHED_N_1_2(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt) \
	RORW  $20, a, y0;                             \ // y0 = a <<< 12
	ADDW  $const, e, y1;                          \
	ADDW  y0, y1;                                 \ // y1 = a <<< 12 + e + T
	VEOR XTMP4.B16, XTMP3.B16, XTMP4.B16;         \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {xxBA}) XOR (XTMP2 rol 23 {xxBA})
	RORW  $25, y1, y2;                            \ // y2 = SS1
	EORW  y2, y0;                                 \ // y0 = SS2
	VMOV  XWORD0.S[2], y1;                        \
	VEOR XTMP4.B16, XTMP0.B16, XTMP2.B16;         \ // XTMP2 = {..., ..., W[1], W[0]}
	ADDW  y1, y2;                                 \ // y2 = SS1 + W
	ADDW  h, y2;                                  \ // y2 = h + SS1 + W
	VMOV  Wt.S[2], y1;                            \
	ADDW  y1, y0;                                 \ // y0 = SS2 + W'
	VEXT $4, XTMP2.B16, XWORD3.B16, XTMP3.B16;    \ // XTMP3 = W[-3] {W[0],w15,w14,w13}, Vm = XTMP2, Vn = XWORD3
	ADDW  d, y0;                                  \ // y0 = d + SS2 + W'
	ORRW  a, b, y1;                               \
	ANDW  a, b, h;                                \
	ANDW  c, y1;                                  \
	VSHL $15, XTMP3.S4, XTMP4.S4;                 \
	ORRW  y1, h;                                  \ // h = (a AND b) OR (a AND c) OR (b AND c)
	ADDW  y0, h;                                  \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	EORW  f, g, y1;                               \
	ANDW  e, y1;                                  \
	VSRI $17, XTMP3.S4, XTMP4.S4;                 \ // XTMP4 = W[-3] rol 15 {DCBA}
	EORW  g, y1;                                  \ // y1 = GG2(e, f, g) = (e AND f) OR (NOT(e) AND g)
	ADDW  y1, y2;                                 \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	RORW  $23, b;                                 \
	RORW  $13, f;                                 \
	VEOR XTMP1.B16, XTMP4.B16, XTMP4.B16;         \ // XTMP4 = W[-9] XOR W[-16] XOR (W[-3] rol 15) {DCBA}
	RORW  $23, y2, y0;                            \
	RORW  $15, y2, d;                             \
	EORW  y0, d;                                  \
	EORW  y2, d;                                  \ // d = P(tt2)
	VSHL $15, XTMP4.S4, XTMP3.S4;                 \

#define ROUND_AND_SCHED_N_1_3(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt) \
	RORW  $20, a, y0;                             \ // y0 = a <<< 12
	ADDW  $const, e, y1;                          \
	ADDW  y0, y1;                                 \ // y1 = a <<< 12 + e + T
	RORW  $25, y1, y2;                            \ // y2 = SS1
	VSRI $17, XTMP4.S4, XTMP3.S4;                 \ // XTMP3 = XTMP4 rol 15 {DCBA}
	EORW  y2, y0;                                 \ // y0 = SS2
	VMOV  XWORD0.S[3], y1;                        \
	ADDW  y1, y2;                                 \ // y2 = SS1 + W
	ADDW  h, y2;                                  \ // y2 = h + SS1 + W
	VMOV  Wt.S[3], y1;                            \
	VSHL $8, XTMP3.S4, XTMP1.S4;                  \
	ADDW  y1, y0;                                 \ // y0 = SS2 + W'
	ADDW  d, y0;                                  \ // y0 = d + SS2 + W'
	ORRW  a, b, y1;                               \
	ANDW  a, b, h;                                \
	ANDW  c, y1;                                  \
	VSRI $24, XTMP3.S4, XTMP1.S4;                 \ // XTMP1 = XTMP4 rol 23 {DCBA}
	ORRW  y1, h;                                  \ // h = (a AND b) OR (a AND c) OR (b AND c)
	ADDW  y0, h;                                  \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	EORW  f, g, y1;                               \
	ANDW  e, y1;                                  \
	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16;         \ // XTMP3 = XTMP4 XOR (XTMP4 rol 15 {DCBA})
	EORW  g, y1;                                  \ // y1 = GG2(e, f, g) = (e AND f) OR (NOT(e) AND g)
	ADDW  y1, y2;                                 \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	RORW  $23, b;                                 \
	RORW  $13, f;                                 \
	VEOR XTMP3.B16, XTMP1.B16, XTMP1.B16;         \ // XTMP1 = XTMP4 XOR (XTMP4 rol 15 {DCBA}) XOR (XTMP4 rol 23 {DCBA})
	RORW  $23, y2, y0;                            \
	RORW  $15, y2, d;                             \
	EORW  y0, d;                                  \
	EORW  y2, d;                                  \ // d = P(tt2)
	VEOR XTMP1.B16, XTMP0.B16, XWORD0.B16;        \ // XWORD0 = {W[3], W[2], W[1], W[0]}

// For rounds [16 - 64)
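// DO_ROUND_N_1 performs a compression-only round with no message scheduling;
// W and W' are read from lane idx of the W and Wt vectors.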
#define DO_ROUND_N_1(disp, idx, const, a, b, c, d, e, f, g, h, W, Wt) \
	RORW  $20, a, y0;                          \ // y0 = a <<< 12
	ADDW  $const, e, y1;                       \
	ADDW  y0, y1;                              \ // y1 = a <<< 12 + e + T
	RORW  $25, y1, y2;                         \ // y2 = SS1
	EORW  y2, y0;                              \ // y0 = SS2
	VMOV  W.S[idx], y1;                        \
	ADDW  y1, y2;                              \ // y2 = SS1 + W
	ADDW  h, y2;                               \ // y2 = h + SS1 + W
	VMOV  Wt.S[idx], y1;                       \
	ADDW  y1, y0;                              \ // y0 = SS2 + W'
	ADDW  d, y0;                               \ // y0 = d + SS2 + W'
	; \
	ORRW  a, b, y1;                            \
	ANDW  a, b, h;                             \
	ANDW  c, y1;                               \
	ORRW  y1, h;                               \ // h = (a AND b) OR (a AND c) OR (b AND c)
	ADDW  y0, h;                               \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	; \
	EORW  f, g, y1;                            \
	ANDW  e, y1;                               \
	EORW  g, y1;                               \ // y1 = GG2(e, f, g) = (e AND f) OR (NOT(e) AND g)
	ADDW  y1, y2;                              \ // y2 = GG2(e, f, g) + h + SS1 + W = tt2
	; \
	RORW  $23, b;                              \
	RORW  $13, f;                              \
	; \
	RORW  $23, y2, y0;                         \
	RORW  $15, y2, d;                          \
	EORW  y0, d;                               \
	EORW  y2, d;                               \ // d = P(tt2)

// func blockARM64(dig *digest, p []byte)
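// blockARM64 consumes p in whole 64-byte blocks: the eight-word state is kept
// in R0-R7, each block is expanded and compressed over 64 rounds, and the
// updated state is stored back when all blocks are done.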
TEXT ·blockARM64(SB), NOSPLIT, $0
	MOVD dig+0(FP), CTX
	MOVD p_base+8(FP), INP
	MOVD p_len+16(FP), NUM_BYTES

	AND	$~63, NUM_BYTES
	CBZ	NUM_BYTES, end

	LDPW	(0*8)(CTX), (a, b)
	LDPW	(1*8)(CTX), (c, d)
	LDPW	(2*8)(CTX), (e, f)
	LDPW	(3*8)(CTX), (g, h)

loop:
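	// Save the incoming state; it is XORed back in after the 64 rounds.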
	MOVW  a, a1
	MOVW  b, b1
	MOVW  c, c1
	MOVW  d, d1
	MOVW  e, e1
	MOVW  f, f1
	MOVW  g, g1
	MOVW  h, h1

	VLD1.P	64(INP), [XWORD0.B16, XWORD1.B16, XWORD2.B16, XWORD3.B16]
	VREV32	XWORD0.B16, XWORD0.B16
	VREV32	XWORD1.B16, XWORD1.B16
	VREV32	XWORD2.B16, XWORD2.B16
	VREV32	XWORD3.B16, XWORD3.B16
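	// VREV32 byte-swaps each 32-bit lane, turning the big-endian message
	// words into host-order lanes w0..w15 in XWORD0..XWORD3.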

schedule_compress: // for w0 - w47
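	// The VEOR before each 4-round group computes Wt = W[j..j+3] XOR W[j+4..j+7],
	// i.e. the four W' values used by that group.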
	// Do 4 rounds and scheduling
	VEOR XWORD0.B16, XWORD1.B16, Wt.B16
	ROUND_AND_SCHED_N_0_0(0*16, T0, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
	ROUND_AND_SCHED_N_0_1(0*16, T1, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
	ROUND_AND_SCHED_N_0_2(0*16, T2, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
	ROUND_AND_SCHED_N_0_3(0*16, T3, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3, Wt)

	// Do 4 rounds and scheduling
	VEOR XWORD1.B16, XWORD2.B16, Wt.B16
	ROUND_AND_SCHED_N_0_0(0*16, T4, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
	ROUND_AND_SCHED_N_0_1(0*16, T5, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
	ROUND_AND_SCHED_N_0_2(0*16, T6, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
	ROUND_AND_SCHED_N_0_3(0*16, T7, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0, Wt)

	// Do 4 rounds and scheduling
	VEOR XWORD2.B16, XWORD3.B16, Wt.B16
	ROUND_AND_SCHED_N_0_0(0*16, T8, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
	ROUND_AND_SCHED_N_0_1(0*16, T9, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
	ROUND_AND_SCHED_N_0_2(0*16, T10, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
	ROUND_AND_SCHED_N_0_3(0*16, T11, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1, Wt)

	// Do 4 rounds and scheduling
	VEOR XWORD3.B16, XWORD0.B16, Wt.B16
	ROUND_AND_SCHED_N_0_0(0*16, T12, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
	ROUND_AND_SCHED_N_0_1(0*16, T13, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
	ROUND_AND_SCHED_N_0_2(0*16, T14, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
	ROUND_AND_SCHED_N_0_3(0*16, T15, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2, Wt)

	// Do 4 rounds and scheduling
	VEOR XWORD0.B16, XWORD1.B16, Wt.B16
	ROUND_AND_SCHED_N_1_0(0*16, T16, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
	ROUND_AND_SCHED_N_1_1(0*16, T17, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
	ROUND_AND_SCHED_N_1_2(0*16, T18, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
	ROUND_AND_SCHED_N_1_3(0*16, T19, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3, Wt)

	// Do 4 rounds and scheduling
	VEOR XWORD1.B16, XWORD2.B16, Wt.B16
	ROUND_AND_SCHED_N_1_0(0*16, T20, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
	ROUND_AND_SCHED_N_1_1(0*16, T21, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
	ROUND_AND_SCHED_N_1_2(0*16, T22, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
	ROUND_AND_SCHED_N_1_3(0*16, T23, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0, Wt)

	// Do 4 rounds and scheduling
	VEOR XWORD2.B16, XWORD3.B16, Wt.B16
	ROUND_AND_SCHED_N_1_0(0*16, T24, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
	ROUND_AND_SCHED_N_1_1(0*16, T25, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
	ROUND_AND_SCHED_N_1_2(0*16, T26, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
	ROUND_AND_SCHED_N_1_3(0*16, T27, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1, Wt)

	// Do 4 rounds and scheduling
	VEOR XWORD3.B16, XWORD0.B16, Wt.B16
	ROUND_AND_SCHED_N_1_0(0*16, T28, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
	ROUND_AND_SCHED_N_1_1(0*16, T29, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
	ROUND_AND_SCHED_N_1_2(0*16, T30, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
	ROUND_AND_SCHED_N_1_3(0*16, T31, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2, Wt)

	// Do 4 rounds and scheduling
	VEOR XWORD0.B16, XWORD1.B16, Wt.B16
	ROUND_AND_SCHED_N_1_0(0*16, T32, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
	ROUND_AND_SCHED_N_1_1(0*16, T33, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
	ROUND_AND_SCHED_N_1_2(0*16, T34, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
	ROUND_AND_SCHED_N_1_3(0*16, T35, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3, Wt)

	// Do 4 rounds and scheduling
	VEOR XWORD1.B16, XWORD2.B16, Wt.B16
	ROUND_AND_SCHED_N_1_0(0*16, T36, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
	ROUND_AND_SCHED_N_1_1(0*16, T37, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
	ROUND_AND_SCHED_N_1_2(0*16, T38, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
	ROUND_AND_SCHED_N_1_3(0*16, T39, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0, Wt)

	// Do 4 rounds and scheduling
	VEOR XWORD2.B16, XWORD3.B16, Wt.B16
	ROUND_AND_SCHED_N_1_0(0*16, T40, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
	ROUND_AND_SCHED_N_1_1(0*16, T41, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
	ROUND_AND_SCHED_N_1_2(0*16, T42, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
	ROUND_AND_SCHED_N_1_3(0*16, T43, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1, Wt)

	// Do 4 rounds and scheduling
	VEOR XWORD3.B16, XWORD0.B16, Wt.B16
	ROUND_AND_SCHED_N_1_0(0*16, T44, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
	ROUND_AND_SCHED_N_1_1(0*16, T45, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
	ROUND_AND_SCHED_N_1_2(0*16, T46, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
	ROUND_AND_SCHED_N_1_3(0*16, T47, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2, Wt)

	// Last 16 rounds (w48 - w63): only one more 4-round group needs scheduling,
	// producing W[64] - W[67] for the W' values of rounds 60 - 63
	// Do 4 rounds and scheduling
	VEOR XWORD0.B16, XWORD1.B16, Wt.B16
	ROUND_AND_SCHED_N_1_0(0*16, T48, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
	ROUND_AND_SCHED_N_1_1(0*16, T49, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
	ROUND_AND_SCHED_N_1_2(0*16, T50, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
	ROUND_AND_SCHED_N_1_3(0*16, T51, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3, Wt)

	// w52 - w63 processed with no scheduling (last 12 rounds)
	// Do 4 rounds
	VEOR XWORD1.B16, XWORD2.B16, Wt.B16
	DO_ROUND_N_1(0*16, 0, T52, e, f, g, h, a, b, c, d, XWORD1, Wt)
	DO_ROUND_N_1(0*16, 1, T53, d, e, f, g, h, a, b, c, XWORD1, Wt)
	DO_ROUND_N_1(0*16, 2, T54, c, d, e, f, g, h, a, b, XWORD1, Wt)
	DO_ROUND_N_1(0*16, 3, T55, b, c, d, e, f, g, h, a, XWORD1, Wt)

	// Do 4 rounds
	VEOR XWORD2.B16, XWORD3.B16, Wt.B16
	DO_ROUND_N_1(0*16, 0, T56, a, b, c, d, e, f, g, h, XWORD2, Wt)
	DO_ROUND_N_1(0*16, 1, T57, h, a, b, c, d, e, f, g, XWORD2, Wt)
	DO_ROUND_N_1(0*16, 2, T58, g, h, a, b, c, d, e, f, XWORD2, Wt)
	DO_ROUND_N_1(0*16, 3, T59, f, g, h, a, b, c, d, e, XWORD2, Wt)

	// Do 4 rounds
	VEOR XWORD3.B16, XWORD0.B16, Wt.B16
	DO_ROUND_N_1(0*16, 0, T60, e, f, g, h, a, b, c, d, XWORD3, Wt)
	DO_ROUND_N_1(0*16, 1, T61, d, e, f, g, h, a, b, c, XWORD3, Wt)
	DO_ROUND_N_1(0*16, 2, T62, c, d, e, f, g, h, a, b, XWORD3, Wt)
	DO_ROUND_N_1(0*16, 3, T63, b, c, d, e, f, g, h, a, XWORD3, Wt)

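	// Chaining update: SM3 XORs the compressed state into the previous digest
	// value, V_{i+1} = (a, b, ..., h) XOR V_i.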
	EORW a1, a  // H0 = a XOR H0
	EORW b1, b  // H1 = b XOR H1
	EORW c1, c  // H2 = c XOR H2
	EORW d1, d  // H3 = d XOR H3
	EORW e1, e  // H4 = e XOR H4
	EORW f1, f  // H5 = f XOR H5
	EORW g1, g  // H6 = g XOR H6
	EORW h1, h  // H7 = h XOR H7

	SUB	$64, NUM_BYTES, NUM_BYTES
	CBNZ	NUM_BYTES, loop

	STPW	(a, b), (0*8)(CTX)
	STPW	(c, d), (1*8)(CTX)
	STPW	(e, f), (2*8)(CTX)
	STPW	(g, h), (3*8)(CTX)

end:
	RET