github.com/emmansun/gmsm@v0.29.1/sm3/sm3blockni_arm64.s (about)

     1  // Generated by gen_sm3block_ni.go. DO NOT EDIT.
     2  //go:build !purego
     3  
     4  #include "textflag.h"
     5  
     6  // func blockSM3NI(h []uint32, p []byte, t []uint32)
     7  TEXT ·blockSM3NI(SB), 0, $0
        // blockSM3NI updates the SM3 hash state h by running the compression
        // function over p, one 64-byte block per loop iteration. The SM3
        // instructions are emitted as raw WORD encodings; the intended mnemonic
        // and operands are given in the trailing comment of each WORD line.
        //
        // Arguments (Go slice headers read from the frame):
        //   h: R0 = h_base; eight 32-bit state words, loaded once, stored back at the end.
        //   p: R1 = p_base, R3 = p_len; R1 advances by 64 and R3 drops by 64 per block.
        //   t: R2 = t_base; only t[0] and t[1] are read (into R5 and R6).
        //
        // Register roles:
        //   V8, V9     working state (32-bit lanes held in reversed order)
        //   V15, V16   copy of V8/V9 taken at the top of each block
        //   V0-V4      rotating window of message-word vectors
        //   V6, V7     VEXT temporaries consumed by SM3PARTW2
        //   V10        XOR of two adjacent message-word vectors, consumed by SM3TT1A/B
        //   V5         SM3SS1 result, consumed by SM3TT1x/SM3TT2x
        //   V11, V12   round constant, alternating registers, rotated left by 1 every round
        //
        // NOTE(review): the first block is loaded before any length check and the
        // loop exits only when R3 becomes exactly zero, so this code assumes
        // len(p) is a non-zero multiple of 64 - confirm that callers guarantee it.
     8  	MOVD	h_base+0(FP), R0                           // R0 = &h[0], hash state
     9  	MOVD	p_base+24(FP), R1                          // R1 = &p[0], message bytes
    10  	MOVD	p_len+32(FP), R3                           // R3 = len(p), bytes still to process
    11  	MOVD	t_base+48(FP), R2                          // R2 = &t[0], round-constant table
    12  
    13  	VLD1 (R0), [V8.S4, V9.S4]                          // V8 = h[0..3], V9 = h[4..7]
    14  	VREV64  V8.S4, V8.S4                               // VREV64 then VEXT $8: reverse the order of the four 32-bit lanes of V8
    15  	VEXT $8, V8.B16, V8.B16, V8.B16
    16  	VREV64  V9.S4, V9.S4                               // same lane reversal for V9
    17  	VEXT $8, V9.B16, V9.B16, V9.B16
    18  	LDPW	(0*8)(R2), (R5, R6)                        // R5 = t[0], R6 = t[1] (pair of 32-bit loads)
    19      
    20  blockloop:                                         // one iteration per 64-byte message block
    21  	VLD1.P	64(R1), [V0.B16, V1.B16, V2.B16, V3.B16]    // load 64 message bytes into V0-V3, then R1 += 64
    22  	VMOV	V8.B16, V15.B16                             // V15 = copy of V8 h(dcba), XORed back in after the 64 rounds
    23  	VMOV	V9.B16, V16.B16                             // V16 = copy of V9 h(hgfe), XORed back in after the 64 rounds
    24  	VREV32	V0.B16, V0.B16                              // reverse the bytes inside every 32-bit message word
    25  	VREV32	V1.B16, V1.B16
    26  	VREV32	V2.B16, V2.B16
    27  	VREV32	V3.B16, V3.B16    
    28  	// Rounds 0-15: SM3TT1A/SM3TT2A variants, round constant seeded from t[0]
    29  	VMOV R5, V11.S[3]                                  // t[0] goes into lane 3 of V11
    30  	// Extension: compute the next four message words into V4
    31  	VEXT $12, V2.B16, V1.B16, V4.B16
    32  	VEXT $12, V1.B16, V0.B16, V6.B16
    33  	VEXT $8, V3.B16, V2.B16, V7.B16
    34  	WORD $0xce63c004          //SM3PARTW1 V4.4S, V0.4S, V3.4S
    35  	WORD $0xce66c4e4          //SM3PARTW2 V4.4S, V7.4S, V6.4S
    36  	VEOR V1.B16, V0.B16, V10.B16                       // V10 = V0 ^ V1, operand for SM3TT1A
    37  	// Compression: rounds 0-3, message words taken from V0 lanes 0-3
    38  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
    39  	VSHL $1, V11.S4, V12.S4                            // V12 = V11 << 1 in every 32-bit lane
    40  	VSRI $31, V11.S4, V12.S4                           // insert V11 >> 31 into bit 0: V12 = V11 rotated left by 1
    41  	WORD $0xce4a80a8           //SM3TT1A V8d.4S, V5.4S, V10.S, 0
    42  	WORD $0xce4088a9           //SM3TT2A V9d.4S, V5.4S, V0.S, 0
    43  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
    44  	VSHL $1, V12.S4, V11.S4                            // same rotate the other way round: V11 = V12 rotated left by 1
    45  	VSRI $31, V12.S4, V11.S4
    46  	WORD $0xce4a90a8           //SM3TT1A V8d.4S, V5.4S, V10.S, 1
    47  	WORD $0xce4098a9           //SM3TT2A V9d.4S, V5.4S, V0.S, 1
    48  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
    49  	VSHL $1, V11.S4, V12.S4
    50  	VSRI $31, V11.S4, V12.S4
    51  	WORD $0xce4aa0a8           //SM3TT1A V8d.4S, V5.4S, V10.S, 2
    52  	WORD $0xce40a8a9           //SM3TT2A V9d.4S, V5.4S, V0.S, 2
    53  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
    54  	VSHL $1, V12.S4, V11.S4
    55  	VSRI $31, V12.S4, V11.S4
    56  	WORD $0xce4ab0a8           //SM3TT1A V8d.4S, V5.4S, V10.S, 3
    57  	WORD $0xce40b8a9           //SM3TT2A V9d.4S, V5.4S, V0.S, 3
    58  
    59  	// Extension: compute the next four message words into V0
    60  	VEXT $12, V3.B16, V2.B16, V0.B16
    61  	VEXT $12, V2.B16, V1.B16, V6.B16
    62  	VEXT $8, V4.B16, V3.B16, V7.B16
    63  	WORD $0xce64c020          //SM3PARTW1 V0.4S, V1.4S, V4.4S
    64  	WORD $0xce66c4e0          //SM3PARTW2 V0.4S, V7.4S, V6.4S
    65  	VEOR V2.B16, V1.B16, V10.B16
    66  	// Compression: rounds 4-7, message words taken from V1 lanes 0-3
    67  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
    68  	VSHL $1, V11.S4, V12.S4
    69  	VSRI $31, V11.S4, V12.S4
    70  	WORD $0xce4a80a8           //SM3TT1A V8d.4S, V5.4S, V10.S, 0
    71  	WORD $0xce4188a9           //SM3TT2A V9d.4S, V5.4S, V1.S, 0
    72  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
    73  	VSHL $1, V12.S4, V11.S4
    74  	VSRI $31, V12.S4, V11.S4
    75  	WORD $0xce4a90a8           //SM3TT1A V8d.4S, V5.4S, V10.S, 1
    76  	WORD $0xce4198a9           //SM3TT2A V9d.4S, V5.4S, V1.S, 1
    77  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
    78  	VSHL $1, V11.S4, V12.S4
    79  	VSRI $31, V11.S4, V12.S4
    80  	WORD $0xce4aa0a8           //SM3TT1A V8d.4S, V5.4S, V10.S, 2
    81  	WORD $0xce41a8a9           //SM3TT2A V9d.4S, V5.4S, V1.S, 2
    82  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
    83  	VSHL $1, V12.S4, V11.S4
    84  	VSRI $31, V12.S4, V11.S4
    85  	WORD $0xce4ab0a8           //SM3TT1A V8d.4S, V5.4S, V10.S, 3
    86  	WORD $0xce41b8a9           //SM3TT2A V9d.4S, V5.4S, V1.S, 3
    87  
    88  	// Extension: compute the next four message words into V1
    89  	VEXT $12, V4.B16, V3.B16, V1.B16
    90  	VEXT $12, V3.B16, V2.B16, V6.B16
    91  	VEXT $8, V0.B16, V4.B16, V7.B16
    92  	WORD $0xce60c041          //SM3PARTW1 V1.4S, V2.4S, V0.4S
    93  	WORD $0xce66c4e1          //SM3PARTW2 V1.4S, V7.4S, V6.4S
    94  	VEOR V3.B16, V2.B16, V10.B16
    95  	// Compression: rounds 8-11, message words taken from V2 lanes 0-3
    96  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
    97  	VSHL $1, V11.S4, V12.S4
    98  	VSRI $31, V11.S4, V12.S4
    99  	WORD $0xce4a80a8           //SM3TT1A V8d.4S, V5.4S, V10.S, 0
   100  	WORD $0xce4288a9           //SM3TT2A V9d.4S, V5.4S, V2.S, 0
   101  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   102  	VSHL $1, V12.S4, V11.S4
   103  	VSRI $31, V12.S4, V11.S4
   104  	WORD $0xce4a90a8           //SM3TT1A V8d.4S, V5.4S, V10.S, 1
   105  	WORD $0xce4298a9           //SM3TT2A V9d.4S, V5.4S, V2.S, 1
   106  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   107  	VSHL $1, V11.S4, V12.S4
   108  	VSRI $31, V11.S4, V12.S4
   109  	WORD $0xce4aa0a8           //SM3TT1A V8d.4S, V5.4S, V10.S, 2
   110  	WORD $0xce42a8a9           //SM3TT2A V9d.4S, V5.4S, V2.S, 2
   111  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   112  	VSHL $1, V12.S4, V11.S4
   113  	VSRI $31, V12.S4, V11.S4
   114  	WORD $0xce4ab0a8           //SM3TT1A V8d.4S, V5.4S, V10.S, 3
   115  	WORD $0xce42b8a9           //SM3TT2A V9d.4S, V5.4S, V2.S, 3
   116  
   117  	// Extension: compute the next four message words into V2
   118  	VEXT $12, V0.B16, V4.B16, V2.B16
   119  	VEXT $12, V4.B16, V3.B16, V6.B16
   120  	VEXT $8, V1.B16, V0.B16, V7.B16
   121  	WORD $0xce61c062          //SM3PARTW1 V2.4S, V3.4S, V1.4S
   122  	WORD $0xce66c4e2          //SM3PARTW2 V2.4S, V7.4S, V6.4S
   123  	VEOR V4.B16, V3.B16, V10.B16
   124  	// Compression: rounds 12-15, message words taken from V3 lanes 0-3
   125  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   126  	VSHL $1, V11.S4, V12.S4
   127  	VSRI $31, V11.S4, V12.S4
   128  	WORD $0xce4a80a8           //SM3TT1A V8d.4S, V5.4S, V10.S, 0
   129  	WORD $0xce4388a9           //SM3TT2A V9d.4S, V5.4S, V3.S, 0
   130  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   131  	VSHL $1, V12.S4, V11.S4
   132  	VSRI $31, V12.S4, V11.S4
   133  	WORD $0xce4a90a8           //SM3TT1A V8d.4S, V5.4S, V10.S, 1
   134  	WORD $0xce4398a9           //SM3TT2A V9d.4S, V5.4S, V3.S, 1
   135  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   136  	VSHL $1, V11.S4, V12.S4
   137  	VSRI $31, V11.S4, V12.S4
   138  	WORD $0xce4aa0a8           //SM3TT1A V8d.4S, V5.4S, V10.S, 2
   139  	WORD $0xce43a8a9           //SM3TT2A V9d.4S, V5.4S, V3.S, 2
   140  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   141  	VSHL $1, V12.S4, V11.S4
   142  	VSRI $31, V12.S4, V11.S4
   143  	WORD $0xce4ab0a8           //SM3TT1A V8d.4S, V5.4S, V10.S, 3
   144  	WORD $0xce43b8a9           //SM3TT2A V9d.4S, V5.4S, V3.S, 3
   145  
   146  	// Rounds 16-63: SM3TT1B/SM3TT2B variants, round constant re-seeded from t[1]
   147  	VMOV R6, V11.S[3]                                  // NOTE(review): presumably t[1] is already rotated for round 16 - confirm against the caller's table
   148  	// Extension: compute the next four message words into V3
   149  	VEXT $12, V1.B16, V0.B16, V3.B16
   150  	VEXT $12, V0.B16, V4.B16, V6.B16
   151  	VEXT $8, V2.B16, V1.B16, V7.B16
   152  	WORD $0xce62c083          //SM3PARTW1 V3.4S, V4.4S, V2.4S
   153  	WORD $0xce66c4e3          //SM3PARTW2 V3.4S, V7.4S, V6.4S
   154  	VEOR V0.B16, V4.B16, V10.B16
   155  	// Compression: rounds 16-19, message words taken from V4 lanes 0-3
   156  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   157  	VSHL $1, V11.S4, V12.S4
   158  	VSRI $31, V11.S4, V12.S4
   159  	WORD $0xce4a84a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 0
   160  	WORD $0xce448ca9           //SM3TT2B V9d.4S, V5.4S, V4.S, 0
   161  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   162  	VSHL $1, V12.S4, V11.S4
   163  	VSRI $31, V12.S4, V11.S4
   164  	WORD $0xce4a94a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 1
   165  	WORD $0xce449ca9           //SM3TT2B V9d.4S, V5.4S, V4.S, 1
   166  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   167  	VSHL $1, V11.S4, V12.S4
   168  	VSRI $31, V11.S4, V12.S4
   169  	WORD $0xce4aa4a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 2
   170  	WORD $0xce44aca9           //SM3TT2B V9d.4S, V5.4S, V4.S, 2
   171  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   172  	VSHL $1, V12.S4, V11.S4
   173  	VSRI $31, V12.S4, V11.S4
   174  	WORD $0xce4ab4a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 3
   175  	WORD $0xce44bca9           //SM3TT2B V9d.4S, V5.4S, V4.S, 3
   176  
   177  	// Extension: compute the next four message words into V4
   178  	VEXT $12, V2.B16, V1.B16, V4.B16
   179  	VEXT $12, V1.B16, V0.B16, V6.B16
   180  	VEXT $8, V3.B16, V2.B16, V7.B16
   181  	WORD $0xce63c004          //SM3PARTW1 V4.4S, V0.4S, V3.4S
   182  	WORD $0xce66c4e4          //SM3PARTW2 V4.4S, V7.4S, V6.4S
   183  	VEOR V1.B16, V0.B16, V10.B16
   184  	// Compression: rounds 20-23, message words taken from V0 lanes 0-3
   185  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   186  	VSHL $1, V11.S4, V12.S4
   187  	VSRI $31, V11.S4, V12.S4
   188  	WORD $0xce4a84a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 0
   189  	WORD $0xce408ca9           //SM3TT2B V9d.4S, V5.4S, V0.S, 0
   190  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   191  	VSHL $1, V12.S4, V11.S4
   192  	VSRI $31, V12.S4, V11.S4
   193  	WORD $0xce4a94a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 1
   194  	WORD $0xce409ca9           //SM3TT2B V9d.4S, V5.4S, V0.S, 1
   195  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   196  	VSHL $1, V11.S4, V12.S4
   197  	VSRI $31, V11.S4, V12.S4
   198  	WORD $0xce4aa4a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 2
   199  	WORD $0xce40aca9           //SM3TT2B V9d.4S, V5.4S, V0.S, 2
   200  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   201  	VSHL $1, V12.S4, V11.S4
   202  	VSRI $31, V12.S4, V11.S4
   203  	WORD $0xce4ab4a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 3
   204  	WORD $0xce40bca9           //SM3TT2B V9d.4S, V5.4S, V0.S, 3
   205  
   206  	// Extension: compute the next four message words into V0
   207  	VEXT $12, V3.B16, V2.B16, V0.B16
   208  	VEXT $12, V2.B16, V1.B16, V6.B16
   209  	VEXT $8, V4.B16, V3.B16, V7.B16
   210  	WORD $0xce64c020          //SM3PARTW1 V0.4S, V1.4S, V4.4S
   211  	WORD $0xce66c4e0          //SM3PARTW2 V0.4S, V7.4S, V6.4S
   212  	VEOR V2.B16, V1.B16, V10.B16
   213  	// Compression: rounds 24-27, message words taken from V1 lanes 0-3
   214  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   215  	VSHL $1, V11.S4, V12.S4
   216  	VSRI $31, V11.S4, V12.S4
   217  	WORD $0xce4a84a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 0
   218  	WORD $0xce418ca9           //SM3TT2B V9d.4S, V5.4S, V1.S, 0
   219  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   220  	VSHL $1, V12.S4, V11.S4
   221  	VSRI $31, V12.S4, V11.S4
   222  	WORD $0xce4a94a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 1
   223  	WORD $0xce419ca9           //SM3TT2B V9d.4S, V5.4S, V1.S, 1
   224  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   225  	VSHL $1, V11.S4, V12.S4
   226  	VSRI $31, V11.S4, V12.S4
   227  	WORD $0xce4aa4a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 2
   228  	WORD $0xce41aca9           //SM3TT2B V9d.4S, V5.4S, V1.S, 2
   229  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   230  	VSHL $1, V12.S4, V11.S4
   231  	VSRI $31, V12.S4, V11.S4
   232  	WORD $0xce4ab4a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 3
   233  	WORD $0xce41bca9           //SM3TT2B V9d.4S, V5.4S, V1.S, 3
   234  
   235  	// Extension: compute the next four message words into V1
   236  	VEXT $12, V4.B16, V3.B16, V1.B16
   237  	VEXT $12, V3.B16, V2.B16, V6.B16
   238  	VEXT $8, V0.B16, V4.B16, V7.B16
   239  	WORD $0xce60c041          //SM3PARTW1 V1.4S, V2.4S, V0.4S
   240  	WORD $0xce66c4e1          //SM3PARTW2 V1.4S, V7.4S, V6.4S
   241  	VEOR V3.B16, V2.B16, V10.B16
   242  	// Compression: rounds 28-31, message words taken from V2 lanes 0-3
   243  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   244  	VSHL $1, V11.S4, V12.S4
   245  	VSRI $31, V11.S4, V12.S4
   246  	WORD $0xce4a84a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 0
   247  	WORD $0xce428ca9           //SM3TT2B V9d.4S, V5.4S, V2.S, 0
   248  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   249  	VSHL $1, V12.S4, V11.S4
   250  	VSRI $31, V12.S4, V11.S4
   251  	WORD $0xce4a94a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 1
   252  	WORD $0xce429ca9           //SM3TT2B V9d.4S, V5.4S, V2.S, 1
   253  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   254  	VSHL $1, V11.S4, V12.S4
   255  	VSRI $31, V11.S4, V12.S4
   256  	WORD $0xce4aa4a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 2
   257  	WORD $0xce42aca9           //SM3TT2B V9d.4S, V5.4S, V2.S, 2
   258  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   259  	VSHL $1, V12.S4, V11.S4
   260  	VSRI $31, V12.S4, V11.S4
   261  	WORD $0xce4ab4a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 3
   262  	WORD $0xce42bca9           //SM3TT2B V9d.4S, V5.4S, V2.S, 3
   263  
   264  	// Extension: compute the next four message words into V2
   265  	VEXT $12, V0.B16, V4.B16, V2.B16
   266  	VEXT $12, V4.B16, V3.B16, V6.B16
   267  	VEXT $8, V1.B16, V0.B16, V7.B16
   268  	WORD $0xce61c062          //SM3PARTW1 V2.4S, V3.4S, V1.4S
   269  	WORD $0xce66c4e2          //SM3PARTW2 V2.4S, V7.4S, V6.4S
   270  	VEOR V4.B16, V3.B16, V10.B16
   271  	// Compression: rounds 32-35, message words taken from V3 lanes 0-3
   272  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   273  	VSHL $1, V11.S4, V12.S4
   274  	VSRI $31, V11.S4, V12.S4
   275  	WORD $0xce4a84a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 0
   276  	WORD $0xce438ca9           //SM3TT2B V9d.4S, V5.4S, V3.S, 0
   277  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   278  	VSHL $1, V12.S4, V11.S4
   279  	VSRI $31, V12.S4, V11.S4
   280  	WORD $0xce4a94a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 1
   281  	WORD $0xce439ca9           //SM3TT2B V9d.4S, V5.4S, V3.S, 1
   282  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   283  	VSHL $1, V11.S4, V12.S4
   284  	VSRI $31, V11.S4, V12.S4
   285  	WORD $0xce4aa4a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 2
   286  	WORD $0xce43aca9           //SM3TT2B V9d.4S, V5.4S, V3.S, 2
   287  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   288  	VSHL $1, V12.S4, V11.S4
   289  	VSRI $31, V12.S4, V11.S4
   290  	WORD $0xce4ab4a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 3
   291  	WORD $0xce43bca9           //SM3TT2B V9d.4S, V5.4S, V3.S, 3
   292  
   293  	// Extension: compute the next four message words into V3
   294  	VEXT $12, V1.B16, V0.B16, V3.B16
   295  	VEXT $12, V0.B16, V4.B16, V6.B16
   296  	VEXT $8, V2.B16, V1.B16, V7.B16
   297  	WORD $0xce62c083          //SM3PARTW1 V3.4S, V4.4S, V2.4S
   298  	WORD $0xce66c4e3          //SM3PARTW2 V3.4S, V7.4S, V6.4S
   299  	VEOR V0.B16, V4.B16, V10.B16
   300  	// Compression: rounds 36-39, message words taken from V4 lanes 0-3
   301  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   302  	VSHL $1, V11.S4, V12.S4
   303  	VSRI $31, V11.S4, V12.S4
   304  	WORD $0xce4a84a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 0
   305  	WORD $0xce448ca9           //SM3TT2B V9d.4S, V5.4S, V4.S, 0
   306  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   307  	VSHL $1, V12.S4, V11.S4
   308  	VSRI $31, V12.S4, V11.S4
   309  	WORD $0xce4a94a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 1
   310  	WORD $0xce449ca9           //SM3TT2B V9d.4S, V5.4S, V4.S, 1
   311  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   312  	VSHL $1, V11.S4, V12.S4
   313  	VSRI $31, V11.S4, V12.S4
   314  	WORD $0xce4aa4a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 2
   315  	WORD $0xce44aca9           //SM3TT2B V9d.4S, V5.4S, V4.S, 2
   316  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   317  	VSHL $1, V12.S4, V11.S4
   318  	VSRI $31, V12.S4, V11.S4
   319  	WORD $0xce4ab4a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 3
   320  	WORD $0xce44bca9           //SM3TT2B V9d.4S, V5.4S, V4.S, 3
   321  
   322  	// Extension: compute the next four message words into V4
   323  	VEXT $12, V2.B16, V1.B16, V4.B16
   324  	VEXT $12, V1.B16, V0.B16, V6.B16
   325  	VEXT $8, V3.B16, V2.B16, V7.B16
   326  	WORD $0xce63c004          //SM3PARTW1 V4.4S, V0.4S, V3.4S
   327  	WORD $0xce66c4e4          //SM3PARTW2 V4.4S, V7.4S, V6.4S
   328  	VEOR V1.B16, V0.B16, V10.B16
   329  	// Compression: rounds 40-43, message words taken from V0 lanes 0-3
   330  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   331  	VSHL $1, V11.S4, V12.S4
   332  	VSRI $31, V11.S4, V12.S4
   333  	WORD $0xce4a84a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 0
   334  	WORD $0xce408ca9           //SM3TT2B V9d.4S, V5.4S, V0.S, 0
   335  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   336  	VSHL $1, V12.S4, V11.S4
   337  	VSRI $31, V12.S4, V11.S4
   338  	WORD $0xce4a94a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 1
   339  	WORD $0xce409ca9           //SM3TT2B V9d.4S, V5.4S, V0.S, 1
   340  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   341  	VSHL $1, V11.S4, V12.S4
   342  	VSRI $31, V11.S4, V12.S4
   343  	WORD $0xce4aa4a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 2
   344  	WORD $0xce40aca9           //SM3TT2B V9d.4S, V5.4S, V0.S, 2
   345  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   346  	VSHL $1, V12.S4, V11.S4
   347  	VSRI $31, V12.S4, V11.S4
   348  	WORD $0xce4ab4a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 3
   349  	WORD $0xce40bca9           //SM3TT2B V9d.4S, V5.4S, V0.S, 3
   350  
   351  	// Extension: compute the next four message words into V0
   352  	VEXT $12, V3.B16, V2.B16, V0.B16
   353  	VEXT $12, V2.B16, V1.B16, V6.B16
   354  	VEXT $8, V4.B16, V3.B16, V7.B16
   355  	WORD $0xce64c020          //SM3PARTW1 V0.4S, V1.4S, V4.4S
   356  	WORD $0xce66c4e0          //SM3PARTW2 V0.4S, V7.4S, V6.4S
   357  	VEOR V2.B16, V1.B16, V10.B16
   358  	// Compression: rounds 44-47, message words taken from V1 lanes 0-3
   359  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   360  	VSHL $1, V11.S4, V12.S4
   361  	VSRI $31, V11.S4, V12.S4
   362  	WORD $0xce4a84a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 0
   363  	WORD $0xce418ca9           //SM3TT2B V9d.4S, V5.4S, V1.S, 0
   364  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   365  	VSHL $1, V12.S4, V11.S4
   366  	VSRI $31, V12.S4, V11.S4
   367  	WORD $0xce4a94a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 1
   368  	WORD $0xce419ca9           //SM3TT2B V9d.4S, V5.4S, V1.S, 1
   369  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   370  	VSHL $1, V11.S4, V12.S4
   371  	VSRI $31, V11.S4, V12.S4
   372  	WORD $0xce4aa4a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 2
   373  	WORD $0xce41aca9           //SM3TT2B V9d.4S, V5.4S, V1.S, 2
   374  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   375  	VSHL $1, V12.S4, V11.S4
   376  	VSRI $31, V12.S4, V11.S4
   377  	WORD $0xce4ab4a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 3
   378  	WORD $0xce41bca9           //SM3TT2B V9d.4S, V5.4S, V1.S, 3
   379  
   380  	// Extension: compute the next four message words into V1 (last expansion step in the block)
   381  	VEXT $12, V4.B16, V3.B16, V1.B16
   382  	VEXT $12, V3.B16, V2.B16, V6.B16
   383  	VEXT $8, V0.B16, V4.B16, V7.B16
   384  	WORD $0xce60c041          //SM3PARTW1 V1.4S, V2.4S, V0.4S
   385  	WORD $0xce66c4e1          //SM3PARTW2 V1.4S, V7.4S, V6.4S
   386  	VEOR V3.B16, V2.B16, V10.B16
   387  	// Compression: rounds 48-51, message words taken from V2 lanes 0-3
   388  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   389  	VSHL $1, V11.S4, V12.S4
   390  	VSRI $31, V11.S4, V12.S4
   391  	WORD $0xce4a84a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 0
   392  	WORD $0xce428ca9           //SM3TT2B V9d.4S, V5.4S, V2.S, 0
   393  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   394  	VSHL $1, V12.S4, V11.S4
   395  	VSRI $31, V12.S4, V11.S4
   396  	WORD $0xce4a94a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 1
   397  	WORD $0xce429ca9           //SM3TT2B V9d.4S, V5.4S, V2.S, 1
   398  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   399  	VSHL $1, V11.S4, V12.S4
   400  	VSRI $31, V11.S4, V12.S4
   401  	WORD $0xce4aa4a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 2
   402  	WORD $0xce42aca9           //SM3TT2B V9d.4S, V5.4S, V2.S, 2
   403  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   404  	VSHL $1, V12.S4, V11.S4
   405  	VSRI $31, V12.S4, V11.S4
   406  	WORD $0xce4ab4a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 3
   407  	WORD $0xce42bca9           //SM3TT2B V9d.4S, V5.4S, V2.S, 3
   408  
   409  	VEOR V4.B16, V3.B16, V10.B16
   410  	// Compression: rounds 52-55, message words taken from V3 lanes 0-3 (no Extension step in this group)
   411  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   412  	VSHL $1, V11.S4, V12.S4
   413  	VSRI $31, V11.S4, V12.S4
   414  	WORD $0xce4a84a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 0
   415  	WORD $0xce438ca9           //SM3TT2B V9d.4S, V5.4S, V3.S, 0
   416  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   417  	VSHL $1, V12.S4, V11.S4
   418  	VSRI $31, V12.S4, V11.S4
   419  	WORD $0xce4a94a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 1
   420  	WORD $0xce439ca9           //SM3TT2B V9d.4S, V5.4S, V3.S, 1
   421  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   422  	VSHL $1, V11.S4, V12.S4
   423  	VSRI $31, V11.S4, V12.S4
   424  	WORD $0xce4aa4a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 2
   425  	WORD $0xce43aca9           //SM3TT2B V9d.4S, V5.4S, V3.S, 2
   426  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   427  	VSHL $1, V12.S4, V11.S4
   428  	VSRI $31, V12.S4, V11.S4
   429  	WORD $0xce4ab4a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 3
   430  	WORD $0xce43bca9           //SM3TT2B V9d.4S, V5.4S, V3.S, 3
   431  
   432  	VEOR V0.B16, V4.B16, V10.B16
   433  	// Compression: rounds 56-59, message words taken from V4 lanes 0-3 (no Extension step in this group)
   434  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   435  	VSHL $1, V11.S4, V12.S4
   436  	VSRI $31, V11.S4, V12.S4
   437  	WORD $0xce4a84a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 0
   438  	WORD $0xce448ca9           //SM3TT2B V9d.4S, V5.4S, V4.S, 0
   439  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   440  	VSHL $1, V12.S4, V11.S4
   441  	VSRI $31, V12.S4, V11.S4
   442  	WORD $0xce4a94a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 1
   443  	WORD $0xce449ca9           //SM3TT2B V9d.4S, V5.4S, V4.S, 1
   444  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   445  	VSHL $1, V11.S4, V12.S4
   446  	VSRI $31, V11.S4, V12.S4
   447  	WORD $0xce4aa4a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 2
   448  	WORD $0xce44aca9           //SM3TT2B V9d.4S, V5.4S, V4.S, 2
   449  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   450  	VSHL $1, V12.S4, V11.S4
   451  	VSRI $31, V12.S4, V11.S4
   452  	WORD $0xce4ab4a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 3
   453  	WORD $0xce44bca9           //SM3TT2B V9d.4S, V5.4S, V4.S, 3
   454  
   455  	VEOR V1.B16, V0.B16, V10.B16
   456  	// Compression: rounds 60-63, message words taken from V0 lanes 0-3 (no Extension step in this group)
   457  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   458  	VSHL $1, V11.S4, V12.S4
   459  	VSRI $31, V11.S4, V12.S4
   460  	WORD $0xce4a84a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 0
   461  	WORD $0xce408ca9           //SM3TT2B V9d.4S, V5.4S, V0.S, 0
   462  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   463  	VSHL $1, V12.S4, V11.S4
   464  	VSRI $31, V12.S4, V11.S4
   465  	WORD $0xce4a94a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 1
   466  	WORD $0xce409ca9           //SM3TT2B V9d.4S, V5.4S, V0.S, 1
   467  	WORD $0xce4b2505           //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   468  	VSHL $1, V11.S4, V12.S4
   469  	VSRI $31, V11.S4, V12.S4
   470  	WORD $0xce4aa4a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 2
   471  	WORD $0xce40aca9           //SM3TT2B V9d.4S, V5.4S, V0.S, 2
   472  	WORD $0xce4c2505           //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   473  	VSHL $1, V12.S4, V11.S4
   474  	VSRI $31, V12.S4, V11.S4
   475  	WORD $0xce4ab4a8           //SM3TT1B V8d.4S, V5.4S, V10.S, 3
   476  	WORD $0xce40bca9           //SM3TT2B V9d.4S, V5.4S, V0.S, 3
   477  
   478  	SUB	$64, R3, R3                                  // R3 -= 64: bytes remaining after this block
   479  	VEOR	V8.B16, V15.B16, V8.B16                      // feed-forward: V8 = V15 ^ V8 (state saved at loop top XOR new state)
   480  	VEOR	V9.B16, V16.B16, V9.B16                      // feed-forward: V9 = V16 ^ V9
   481  	CBNZ	R3, blockloop                                // process another block while R3 != 0
   482  
   483  sm3ret:                                            // reached by fall-through; no branch in this function targets this label
   484  	VREV64  V8.S4, V8.S4                               // VREV64 then VEXT $8: put the four 32-bit lanes of V8 back in memory order
   485  	VEXT $8, V8.B16, V8.B16, V8.B16
   486  	VREV64  V9.S4, V9.S4                               // same lane reversal for V9
   487  	VEXT $8, V9.B16, V9.B16, V9.B16
   488  	VST1	[V8.S4, V9.S4], (R0)                       // store the eight updated state words back to h
   489  	RET
   490