github.com/hxx258456/ccgo@v0.0.5-0.20230213014102-48b35f46f66f/sm3/sm3blockni_arm64.s (about)

     1  // Generated by gen_sm3block_ni.go. DO NOT EDIT.
     2  
     3  #include "textflag.h"
     4  
     5  // func blockSM3NI(h []uint32, p []byte, t []uint32)
     6  //
     7  // blockSM3NI runs the SM3 compression function over every whole 64-byte
     8  // block of p and folds the result into the state h, using the ARMv8.2 SM3
     9  // instructions. The assembler has no mnemonics for them, so each one is
    10  // emitted with WORD: the operand is the A64 instruction word itself (it is
    11  // stored little-endian) and the trailing comment names the instruction.
    12  // The caller must make sure the CPU implements the SM3 extension.
    13  //
    14  //   h  state words A..H in memory order (8 words are read); updated in place
    15  //   p  message; len(p)/64 whole blocks are consumed, a trailing partial
    16  //      block is ignored, and h is not touched when there is no whole block
    17  //   t  two words are read: t[0] seeds the constant for rounds 0-15 and
    18  //      t[1] for rounds 16-63; t[1] must already be rotated for round 16
    19  //      (0x79cc4519 and 0x9d8a7a87 for standard SM3)
    20  //
    21  // Register use:
    22  //   R0 = &h[0], R1 = message cursor, R2 = &t[0], R3 = blocks left,
    23  //   R5/R6 = t[0]/t[1]
    24  //   V0-V4    five-vector sliding window over the message schedule W
    25  //   V5       SS1 of the current round (lane 3)
    26  //   V6, V7   SM3PARTW2 operands
    27  //   V8, V9   working state, lanes 0..3 = D,C,B,A and H,G,F,E
    28  //   V10      W'[j..j+3] = W[j..j+3] ^ W[j+4..j+7]
    29  //   V11/V12  round constant in lane 3, ping-ponged while rotating left by 1
    30  //   V15/V16  state saved at the top of the block for the final XOR
    31  //
    32  // NOTE(review): this file is marked as generated by gen_sm3block_ni.go, so
    33  // the generator needs the same corrections. The routine could not be run on
    34  // SM3 hardware here - check it against the standard SM3 test vectors.
    35  TEXT ·blockSM3NI(SB), NOSPLIT, $0-72
    36  	MOVD	h_base+0(FP), R0                           // Hash value first address
    37  	MOVD	p_base+24(FP), R1                          // message first address
    38  	MOVD	p_len+32(FP), R3                           // message length
    39  	MOVD	t_base+48(FP), R2                          // t constants first address
    40  
    41  	LSR	$6, R3, R3                                 // R3 = number of whole 64-byte blocks
    42  	CBZ	R3, done                                   // nothing to hash: leave h untouched
    43  
    44  	VLD1	(R0), [V8.S4, V9.S4]                       // load h(a,b,c,d,e,f,g,h)
    45  	// SM3TT1x/SM3TT2x want A (resp. E) in lane 3: reverse the word order.
    46  	VREV64	V8.S4, V8.S4                               // (a,b,c,d) -> (b,a,d,c)
    47  	VEXT	$8, V8.B16, V8.B16, V8.B16                 // (b,a,d,c) -> (d,c,b,a)
    48  	VREV64	V9.S4, V9.S4                               // (e,f,g,h) -> (f,e,h,g)
    49  	VEXT	$8, V9.B16, V9.B16, V9.B16                 // (f,e,h,g) -> (h,g,f,e)
    50  	LDPW	(0*8)(R2), (R5, R6)                        // load t constants
    51  
    52  blockloop:
    53  	VLD1.P	64(R1), [V0.B16, V1.B16, V2.B16, V3.B16]   // load 64bytes message, advance cursor
    54  	VMOV	V8.B16, V15.B16                            // backup: V8 h(dcba)
    55  	VMOV	V9.B16, V16.B16                            // backup: V9 h(hgfe)
    56  	VREV32	V0.B16, V0.B16                             // big-endian message words -> native words
    57  	VREV32	V1.B16, V1.B16
    58  	VREV32	V2.B16, V2.B16
    59  	VREV32	V3.B16, V3.B16
    60  
    61  	// first 16 rounds (SM3TT1A/SM3TT2A)
    62  	VMOV	R5, V11.S[3]
    63  
    64  	// rounds 0-3: extend W[16..19] into V4, compress W[0..3] (V0)
    65  	VEXT	$12, V2.B16, V1.B16, V4.B16               // VEXT offsets are bytes: $12 = 3 words
    66  	VEXT	$12, V1.B16, V0.B16, V6.B16
    67  	VEXT	$8, V3.B16, V2.B16, V7.B16                // $8 = 2 words
    68  	WORD	$0xce63c004          //SM3PARTW1 V4.4S, V0.4S, V3.4S
    69  	WORD	$0xce66c4e4          //SM3PARTW2 V4.4S, V7.4S, V6.4S
    70  	VEOR	V1.B16, V0.B16, V10.B16
    71  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
    72  	VSHL	$1, V11.S4, V12.S4                        // V12 = V11 rotated left by 1
    73  	VSRI	$31, V11.S4, V12.S4
    74  	WORD	$0xce4a80a8          //SM3TT1A V8d.4S, V5.4S, V10.S, 0
    75  	WORD	$0xce4088a9          //SM3TT2A V9d.4S, V5.4S, V0.S, 0
    76  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
    77  	VSHL	$1, V12.S4, V11.S4                        // V11 = V12 rotated left by 1
    78  	VSRI	$31, V12.S4, V11.S4
    79  	WORD	$0xce4a90a8          //SM3TT1A V8d.4S, V5.4S, V10.S, 1
    80  	WORD	$0xce4098a9          //SM3TT2A V9d.4S, V5.4S, V0.S, 1
    81  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
    82  	VSHL	$1, V11.S4, V12.S4
    83  	VSRI	$31, V11.S4, V12.S4
    84  	WORD	$0xce4aa0a8          //SM3TT1A V8d.4S, V5.4S, V10.S, 2
    85  	WORD	$0xce40a8a9          //SM3TT2A V9d.4S, V5.4S, V0.S, 2
    86  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
    87  	VSHL	$1, V12.S4, V11.S4
    88  	VSRI	$31, V12.S4, V11.S4
    89  	WORD	$0xce4ab0a8          //SM3TT1A V8d.4S, V5.4S, V10.S, 3
    90  	WORD	$0xce40b8a9          //SM3TT2A V9d.4S, V5.4S, V0.S, 3
    91  
    92  	// rounds 4-7: extend into V0, compress V1
    93  	VEXT	$12, V3.B16, V2.B16, V0.B16
    94  	VEXT	$12, V2.B16, V1.B16, V6.B16
    95  	VEXT	$8, V4.B16, V3.B16, V7.B16
    96  	WORD	$0xce64c020          //SM3PARTW1 V0.4S, V1.4S, V4.4S
    97  	WORD	$0xce66c4e0          //SM3PARTW2 V0.4S, V7.4S, V6.4S
    98  	VEOR	V2.B16, V1.B16, V10.B16
    99  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   100  	VSHL	$1, V11.S4, V12.S4
   101  	VSRI	$31, V11.S4, V12.S4
   102  	WORD	$0xce4a80a8          //SM3TT1A V8d.4S, V5.4S, V10.S, 0
   103  	WORD	$0xce4188a9          //SM3TT2A V9d.4S, V5.4S, V1.S, 0
   104  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   105  	VSHL	$1, V12.S4, V11.S4
   106  	VSRI	$31, V12.S4, V11.S4
   107  	WORD	$0xce4a90a8          //SM3TT1A V8d.4S, V5.4S, V10.S, 1
   108  	WORD	$0xce4198a9          //SM3TT2A V9d.4S, V5.4S, V1.S, 1
   109  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   110  	VSHL	$1, V11.S4, V12.S4
   111  	VSRI	$31, V11.S4, V12.S4
   112  	WORD	$0xce4aa0a8          //SM3TT1A V8d.4S, V5.4S, V10.S, 2
   113  	WORD	$0xce41a8a9          //SM3TT2A V9d.4S, V5.4S, V1.S, 2
   114  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   115  	VSHL	$1, V12.S4, V11.S4
   116  	VSRI	$31, V12.S4, V11.S4
   117  	WORD	$0xce4ab0a8          //SM3TT1A V8d.4S, V5.4S, V10.S, 3
   118  	WORD	$0xce41b8a9          //SM3TT2A V9d.4S, V5.4S, V1.S, 3
   119  
   120  	// rounds 8-11: extend into V1, compress V2
   121  	VEXT	$12, V4.B16, V3.B16, V1.B16
   122  	VEXT	$12, V3.B16, V2.B16, V6.B16
   123  	VEXT	$8, V0.B16, V4.B16, V7.B16
   124  	WORD	$0xce60c041          //SM3PARTW1 V1.4S, V2.4S, V0.4S
   125  	WORD	$0xce66c4e1          //SM3PARTW2 V1.4S, V7.4S, V6.4S
   126  	VEOR	V3.B16, V2.B16, V10.B16
   127  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   128  	VSHL	$1, V11.S4, V12.S4
   129  	VSRI	$31, V11.S4, V12.S4
   130  	WORD	$0xce4a80a8          //SM3TT1A V8d.4S, V5.4S, V10.S, 0
   131  	WORD	$0xce4288a9          //SM3TT2A V9d.4S, V5.4S, V2.S, 0
   132  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   133  	VSHL	$1, V12.S4, V11.S4
   134  	VSRI	$31, V12.S4, V11.S4
   135  	WORD	$0xce4a90a8          //SM3TT1A V8d.4S, V5.4S, V10.S, 1
   136  	WORD	$0xce4298a9          //SM3TT2A V9d.4S, V5.4S, V2.S, 1
   137  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   138  	VSHL	$1, V11.S4, V12.S4
   139  	VSRI	$31, V11.S4, V12.S4
   140  	WORD	$0xce4aa0a8          //SM3TT1A V8d.4S, V5.4S, V10.S, 2
   141  	WORD	$0xce42a8a9          //SM3TT2A V9d.4S, V5.4S, V2.S, 2
   142  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   143  	VSHL	$1, V12.S4, V11.S4
   144  	VSRI	$31, V12.S4, V11.S4
   145  	WORD	$0xce4ab0a8          //SM3TT1A V8d.4S, V5.4S, V10.S, 3
   146  	WORD	$0xce42b8a9          //SM3TT2A V9d.4S, V5.4S, V2.S, 3
   147  
   148  	// rounds 12-15: extend into V2, compress V3
   149  	VEXT	$12, V0.B16, V4.B16, V2.B16
   150  	VEXT	$12, V4.B16, V3.B16, V6.B16
   151  	VEXT	$8, V1.B16, V0.B16, V7.B16
   152  	WORD	$0xce61c062          //SM3PARTW1 V2.4S, V3.4S, V1.4S
   153  	WORD	$0xce66c4e2          //SM3PARTW2 V2.4S, V7.4S, V6.4S
   154  	VEOR	V4.B16, V3.B16, V10.B16
   155  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   156  	VSHL	$1, V11.S4, V12.S4
   157  	VSRI	$31, V11.S4, V12.S4
   158  	WORD	$0xce4a80a8          //SM3TT1A V8d.4S, V5.4S, V10.S, 0
   159  	WORD	$0xce4388a9          //SM3TT2A V9d.4S, V5.4S, V3.S, 0
   160  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   161  	VSHL	$1, V12.S4, V11.S4
   162  	VSRI	$31, V12.S4, V11.S4
   163  	WORD	$0xce4a90a8          //SM3TT1A V8d.4S, V5.4S, V10.S, 1
   164  	WORD	$0xce4398a9          //SM3TT2A V9d.4S, V5.4S, V3.S, 1
   165  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   166  	VSHL	$1, V11.S4, V12.S4
   167  	VSRI	$31, V11.S4, V12.S4
   168  	WORD	$0xce4aa0a8          //SM3TT1A V8d.4S, V5.4S, V10.S, 2
   169  	WORD	$0xce43a8a9          //SM3TT2A V9d.4S, V5.4S, V3.S, 2
   170  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   171  	VSHL	$1, V12.S4, V11.S4
   172  	VSRI	$31, V12.S4, V11.S4
   173  	WORD	$0xce4ab0a8          //SM3TT1A V8d.4S, V5.4S, V10.S, 3
   174  	WORD	$0xce43b8a9          //SM3TT2A V9d.4S, V5.4S, V3.S, 3
   175  
   176  	// second 48 rounds (SM3TT1B/SM3TT2B), restart the constant from t[1]
   177  	VMOV	R6, V11.S[3]
   178  
   179  	// rounds 16-19: extend into V3, compress V4
   180  	VEXT	$12, V1.B16, V0.B16, V3.B16
   181  	VEXT	$12, V0.B16, V4.B16, V6.B16
   182  	VEXT	$8, V2.B16, V1.B16, V7.B16
   183  	WORD	$0xce62c083          //SM3PARTW1 V3.4S, V4.4S, V2.4S
   184  	WORD	$0xce66c4e3          //SM3PARTW2 V3.4S, V7.4S, V6.4S
   185  	VEOR	V0.B16, V4.B16, V10.B16
   186  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   187  	VSHL	$1, V11.S4, V12.S4
   188  	VSRI	$31, V11.S4, V12.S4
   189  	WORD	$0xce4a84a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 0
   190  	WORD	$0xce448ca9          //SM3TT2B V9d.4S, V5.4S, V4.S, 0
   191  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   192  	VSHL	$1, V12.S4, V11.S4
   193  	VSRI	$31, V12.S4, V11.S4
   194  	WORD	$0xce4a94a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 1
   195  	WORD	$0xce449ca9          //SM3TT2B V9d.4S, V5.4S, V4.S, 1
   196  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   197  	VSHL	$1, V11.S4, V12.S4
   198  	VSRI	$31, V11.S4, V12.S4
   199  	WORD	$0xce4aa4a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 2
   200  	WORD	$0xce44aca9          //SM3TT2B V9d.4S, V5.4S, V4.S, 2
   201  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   202  	VSHL	$1, V12.S4, V11.S4
   203  	VSRI	$31, V12.S4, V11.S4
   204  	WORD	$0xce4ab4a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 3
   205  	WORD	$0xce44bca9          //SM3TT2B V9d.4S, V5.4S, V4.S, 3
   206  
   207  	// rounds 20-23: extend into V4, compress V0
   208  	VEXT	$12, V2.B16, V1.B16, V4.B16
   209  	VEXT	$12, V1.B16, V0.B16, V6.B16
   210  	VEXT	$8, V3.B16, V2.B16, V7.B16
   211  	WORD	$0xce63c004          //SM3PARTW1 V4.4S, V0.4S, V3.4S
   212  	WORD	$0xce66c4e4          //SM3PARTW2 V4.4S, V7.4S, V6.4S
   213  	VEOR	V1.B16, V0.B16, V10.B16
   214  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   215  	VSHL	$1, V11.S4, V12.S4
   216  	VSRI	$31, V11.S4, V12.S4
   217  	WORD	$0xce4a84a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 0
   218  	WORD	$0xce408ca9          //SM3TT2B V9d.4S, V5.4S, V0.S, 0
   219  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   220  	VSHL	$1, V12.S4, V11.S4
   221  	VSRI	$31, V12.S4, V11.S4
   222  	WORD	$0xce4a94a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 1
   223  	WORD	$0xce409ca9          //SM3TT2B V9d.4S, V5.4S, V0.S, 1
   224  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   225  	VSHL	$1, V11.S4, V12.S4
   226  	VSRI	$31, V11.S4, V12.S4
   227  	WORD	$0xce4aa4a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 2
   228  	WORD	$0xce40aca9          //SM3TT2B V9d.4S, V5.4S, V0.S, 2
   229  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   230  	VSHL	$1, V12.S4, V11.S4
   231  	VSRI	$31, V12.S4, V11.S4
   232  	WORD	$0xce4ab4a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 3
   233  	WORD	$0xce40bca9          //SM3TT2B V9d.4S, V5.4S, V0.S, 3
   234  
   235  	// rounds 24-27: extend into V0, compress V1
   236  	VEXT	$12, V3.B16, V2.B16, V0.B16
   237  	VEXT	$12, V2.B16, V1.B16, V6.B16
   238  	VEXT	$8, V4.B16, V3.B16, V7.B16
   239  	WORD	$0xce64c020          //SM3PARTW1 V0.4S, V1.4S, V4.4S
   240  	WORD	$0xce66c4e0          //SM3PARTW2 V0.4S, V7.4S, V6.4S
   241  	VEOR	V2.B16, V1.B16, V10.B16
   242  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   243  	VSHL	$1, V11.S4, V12.S4
   244  	VSRI	$31, V11.S4, V12.S4
   245  	WORD	$0xce4a84a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 0
   246  	WORD	$0xce418ca9          //SM3TT2B V9d.4S, V5.4S, V1.S, 0
   247  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   248  	VSHL	$1, V12.S4, V11.S4
   249  	VSRI	$31, V12.S4, V11.S4
   250  	WORD	$0xce4a94a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 1
   251  	WORD	$0xce419ca9          //SM3TT2B V9d.4S, V5.4S, V1.S, 1
   252  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   253  	VSHL	$1, V11.S4, V12.S4
   254  	VSRI	$31, V11.S4, V12.S4
   255  	WORD	$0xce4aa4a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 2
   256  	WORD	$0xce41aca9          //SM3TT2B V9d.4S, V5.4S, V1.S, 2
   257  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   258  	VSHL	$1, V12.S4, V11.S4
   259  	VSRI	$31, V12.S4, V11.S4
   260  	WORD	$0xce4ab4a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 3
   261  	WORD	$0xce41bca9          //SM3TT2B V9d.4S, V5.4S, V1.S, 3
   262  
   263  	// rounds 28-31: extend into V1, compress V2
   264  	VEXT	$12, V4.B16, V3.B16, V1.B16
   265  	VEXT	$12, V3.B16, V2.B16, V6.B16
   266  	VEXT	$8, V0.B16, V4.B16, V7.B16
   267  	WORD	$0xce60c041          //SM3PARTW1 V1.4S, V2.4S, V0.4S
   268  	WORD	$0xce66c4e1          //SM3PARTW2 V1.4S, V7.4S, V6.4S
   269  	VEOR	V3.B16, V2.B16, V10.B16
   270  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   271  	VSHL	$1, V11.S4, V12.S4
   272  	VSRI	$31, V11.S4, V12.S4
   273  	WORD	$0xce4a84a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 0
   274  	WORD	$0xce428ca9          //SM3TT2B V9d.4S, V5.4S, V2.S, 0
   275  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   276  	VSHL	$1, V12.S4, V11.S4
   277  	VSRI	$31, V12.S4, V11.S4
   278  	WORD	$0xce4a94a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 1
   279  	WORD	$0xce429ca9          //SM3TT2B V9d.4S, V5.4S, V2.S, 1
   280  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   281  	VSHL	$1, V11.S4, V12.S4
   282  	VSRI	$31, V11.S4, V12.S4
   283  	WORD	$0xce4aa4a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 2
   284  	WORD	$0xce42aca9          //SM3TT2B V9d.4S, V5.4S, V2.S, 2
   285  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   286  	VSHL	$1, V12.S4, V11.S4
   287  	VSRI	$31, V12.S4, V11.S4
   288  	WORD	$0xce4ab4a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 3
   289  	WORD	$0xce42bca9          //SM3TT2B V9d.4S, V5.4S, V2.S, 3
   290  
   291  	// rounds 32-35: extend into V2, compress V3
   292  	VEXT	$12, V0.B16, V4.B16, V2.B16
   293  	VEXT	$12, V4.B16, V3.B16, V6.B16
   294  	VEXT	$8, V1.B16, V0.B16, V7.B16
   295  	WORD	$0xce61c062          //SM3PARTW1 V2.4S, V3.4S, V1.4S
   296  	WORD	$0xce66c4e2          //SM3PARTW2 V2.4S, V7.4S, V6.4S
   297  	VEOR	V4.B16, V3.B16, V10.B16
   298  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   299  	VSHL	$1, V11.S4, V12.S4
   300  	VSRI	$31, V11.S4, V12.S4
   301  	WORD	$0xce4a84a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 0
   302  	WORD	$0xce438ca9          //SM3TT2B V9d.4S, V5.4S, V3.S, 0
   303  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   304  	VSHL	$1, V12.S4, V11.S4
   305  	VSRI	$31, V12.S4, V11.S4
   306  	WORD	$0xce4a94a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 1
   307  	WORD	$0xce439ca9          //SM3TT2B V9d.4S, V5.4S, V3.S, 1
   308  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   309  	VSHL	$1, V11.S4, V12.S4
   310  	VSRI	$31, V11.S4, V12.S4
   311  	WORD	$0xce4aa4a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 2
   312  	WORD	$0xce43aca9          //SM3TT2B V9d.4S, V5.4S, V3.S, 2
   313  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   314  	VSHL	$1, V12.S4, V11.S4
   315  	VSRI	$31, V12.S4, V11.S4
   316  	WORD	$0xce4ab4a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 3
   317  	WORD	$0xce43bca9          //SM3TT2B V9d.4S, V5.4S, V3.S, 3
   318  
   319  	// rounds 36-39: extend into V3, compress V4
   320  	VEXT	$12, V1.B16, V0.B16, V3.B16
   321  	VEXT	$12, V0.B16, V4.B16, V6.B16
   322  	VEXT	$8, V2.B16, V1.B16, V7.B16
   323  	WORD	$0xce62c083          //SM3PARTW1 V3.4S, V4.4S, V2.4S
   324  	WORD	$0xce66c4e3          //SM3PARTW2 V3.4S, V7.4S, V6.4S
   325  	VEOR	V0.B16, V4.B16, V10.B16
   326  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   327  	VSHL	$1, V11.S4, V12.S4
   328  	VSRI	$31, V11.S4, V12.S4
   329  	WORD	$0xce4a84a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 0
   330  	WORD	$0xce448ca9          //SM3TT2B V9d.4S, V5.4S, V4.S, 0
   331  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   332  	VSHL	$1, V12.S4, V11.S4
   333  	VSRI	$31, V12.S4, V11.S4
   334  	WORD	$0xce4a94a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 1
   335  	WORD	$0xce449ca9          //SM3TT2B V9d.4S, V5.4S, V4.S, 1
   336  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   337  	VSHL	$1, V11.S4, V12.S4
   338  	VSRI	$31, V11.S4, V12.S4
   339  	WORD	$0xce4aa4a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 2
   340  	WORD	$0xce44aca9          //SM3TT2B V9d.4S, V5.4S, V4.S, 2
   341  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   342  	VSHL	$1, V12.S4, V11.S4
   343  	VSRI	$31, V12.S4, V11.S4
   344  	WORD	$0xce4ab4a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 3
   345  	WORD	$0xce44bca9          //SM3TT2B V9d.4S, V5.4S, V4.S, 3
   346  
   347  	// rounds 40-43: extend into V4, compress V0
   348  	VEXT	$12, V2.B16, V1.B16, V4.B16
   349  	VEXT	$12, V1.B16, V0.B16, V6.B16
   350  	VEXT	$8, V3.B16, V2.B16, V7.B16
   351  	WORD	$0xce63c004          //SM3PARTW1 V4.4S, V0.4S, V3.4S
   352  	WORD	$0xce66c4e4          //SM3PARTW2 V4.4S, V7.4S, V6.4S
   353  	VEOR	V1.B16, V0.B16, V10.B16
   354  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   355  	VSHL	$1, V11.S4, V12.S4
   356  	VSRI	$31, V11.S4, V12.S4
   357  	WORD	$0xce4a84a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 0
   358  	WORD	$0xce408ca9          //SM3TT2B V9d.4S, V5.4S, V0.S, 0
   359  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   360  	VSHL	$1, V12.S4, V11.S4
   361  	VSRI	$31, V12.S4, V11.S4
   362  	WORD	$0xce4a94a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 1
   363  	WORD	$0xce409ca9          //SM3TT2B V9d.4S, V5.4S, V0.S, 1
   364  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   365  	VSHL	$1, V11.S4, V12.S4
   366  	VSRI	$31, V11.S4, V12.S4
   367  	WORD	$0xce4aa4a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 2
   368  	WORD	$0xce40aca9          //SM3TT2B V9d.4S, V5.4S, V0.S, 2
   369  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   370  	VSHL	$1, V12.S4, V11.S4
   371  	VSRI	$31, V12.S4, V11.S4
   372  	WORD	$0xce4ab4a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 3
   373  	WORD	$0xce40bca9          //SM3TT2B V9d.4S, V5.4S, V0.S, 3
   374  
   375  	// rounds 44-47: extend into V0, compress V1
   376  	VEXT	$12, V3.B16, V2.B16, V0.B16
   377  	VEXT	$12, V2.B16, V1.B16, V6.B16
   378  	VEXT	$8, V4.B16, V3.B16, V7.B16
   379  	WORD	$0xce64c020          //SM3PARTW1 V0.4S, V1.4S, V4.4S
   380  	WORD	$0xce66c4e0          //SM3PARTW2 V0.4S, V7.4S, V6.4S
   381  	VEOR	V2.B16, V1.B16, V10.B16
   382  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   383  	VSHL	$1, V11.S4, V12.S4
   384  	VSRI	$31, V11.S4, V12.S4
   385  	WORD	$0xce4a84a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 0
   386  	WORD	$0xce418ca9          //SM3TT2B V9d.4S, V5.4S, V1.S, 0
   387  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   388  	VSHL	$1, V12.S4, V11.S4
   389  	VSRI	$31, V12.S4, V11.S4
   390  	WORD	$0xce4a94a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 1
   391  	WORD	$0xce419ca9          //SM3TT2B V9d.4S, V5.4S, V1.S, 1
   392  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   393  	VSHL	$1, V11.S4, V12.S4
   394  	VSRI	$31, V11.S4, V12.S4
   395  	WORD	$0xce4aa4a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 2
   396  	WORD	$0xce41aca9          //SM3TT2B V9d.4S, V5.4S, V1.S, 2
   397  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   398  	VSHL	$1, V12.S4, V11.S4
   399  	VSRI	$31, V12.S4, V11.S4
   400  	WORD	$0xce4ab4a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 3
   401  	WORD	$0xce41bca9          //SM3TT2B V9d.4S, V5.4S, V1.S, 3
   402  
   403  	// rounds 48-51: extend into V1 (last schedule words), compress V2
   404  	VEXT	$12, V4.B16, V3.B16, V1.B16
   405  	VEXT	$12, V3.B16, V2.B16, V6.B16
   406  	VEXT	$8, V0.B16, V4.B16, V7.B16
   407  	WORD	$0xce60c041          //SM3PARTW1 V1.4S, V2.4S, V0.4S
   408  	WORD	$0xce66c4e1          //SM3PARTW2 V1.4S, V7.4S, V6.4S
   409  	VEOR	V3.B16, V2.B16, V10.B16
   410  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   411  	VSHL	$1, V11.S4, V12.S4
   412  	VSRI	$31, V11.S4, V12.S4
   413  	WORD	$0xce4a84a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 0
   414  	WORD	$0xce428ca9          //SM3TT2B V9d.4S, V5.4S, V2.S, 0
   415  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   416  	VSHL	$1, V12.S4, V11.S4
   417  	VSRI	$31, V12.S4, V11.S4
   418  	WORD	$0xce4a94a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 1
   419  	WORD	$0xce429ca9          //SM3TT2B V9d.4S, V5.4S, V2.S, 1
   420  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   421  	VSHL	$1, V11.S4, V12.S4
   422  	VSRI	$31, V11.S4, V12.S4
   423  	WORD	$0xce4aa4a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 2
   424  	WORD	$0xce42aca9          //SM3TT2B V9d.4S, V5.4S, V2.S, 2
   425  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   426  	VSHL	$1, V12.S4, V11.S4
   427  	VSRI	$31, V12.S4, V11.S4
   428  	WORD	$0xce4ab4a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 3
   429  	WORD	$0xce42bca9          //SM3TT2B V9d.4S, V5.4S, V2.S, 3
   430  
   431  	// rounds 52-55: schedule is complete, compress V3
   432  	VEOR	V4.B16, V3.B16, V10.B16
   433  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   434  	VSHL	$1, V11.S4, V12.S4
   435  	VSRI	$31, V11.S4, V12.S4
   436  	WORD	$0xce4a84a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 0
   437  	WORD	$0xce438ca9          //SM3TT2B V9d.4S, V5.4S, V3.S, 0
   438  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   439  	VSHL	$1, V12.S4, V11.S4
   440  	VSRI	$31, V12.S4, V11.S4
   441  	WORD	$0xce4a94a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 1
   442  	WORD	$0xce439ca9          //SM3TT2B V9d.4S, V5.4S, V3.S, 1
   443  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   444  	VSHL	$1, V11.S4, V12.S4
   445  	VSRI	$31, V11.S4, V12.S4
   446  	WORD	$0xce4aa4a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 2
   447  	WORD	$0xce43aca9          //SM3TT2B V9d.4S, V5.4S, V3.S, 2
   448  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   449  	VSHL	$1, V12.S4, V11.S4
   450  	VSRI	$31, V12.S4, V11.S4
   451  	WORD	$0xce4ab4a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 3
   452  	WORD	$0xce43bca9          //SM3TT2B V9d.4S, V5.4S, V3.S, 3
   453  
   454  	// rounds 56-59: compress V4
   455  	VEOR	V0.B16, V4.B16, V10.B16
   456  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   457  	VSHL	$1, V11.S4, V12.S4
   458  	VSRI	$31, V11.S4, V12.S4
   459  	WORD	$0xce4a84a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 0
   460  	WORD	$0xce448ca9          //SM3TT2B V9d.4S, V5.4S, V4.S, 0
   461  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   462  	VSHL	$1, V12.S4, V11.S4
   463  	VSRI	$31, V12.S4, V11.S4
   464  	WORD	$0xce4a94a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 1
   465  	WORD	$0xce449ca9          //SM3TT2B V9d.4S, V5.4S, V4.S, 1
   466  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   467  	VSHL	$1, V11.S4, V12.S4
   468  	VSRI	$31, V11.S4, V12.S4
   469  	WORD	$0xce4aa4a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 2
   470  	WORD	$0xce44aca9          //SM3TT2B V9d.4S, V5.4S, V4.S, 2
   471  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   472  	VSHL	$1, V12.S4, V11.S4
   473  	VSRI	$31, V12.S4, V11.S4
   474  	WORD	$0xce4ab4a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 3
   475  	WORD	$0xce44bca9          //SM3TT2B V9d.4S, V5.4S, V4.S, 3
   476  
   477  	// rounds 60-63: compress V0
   478  	VEOR	V1.B16, V0.B16, V10.B16
   479  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   480  	VSHL	$1, V11.S4, V12.S4
   481  	VSRI	$31, V11.S4, V12.S4
   482  	WORD	$0xce4a84a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 0
   483  	WORD	$0xce408ca9          //SM3TT2B V9d.4S, V5.4S, V0.S, 0
   484  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   485  	VSHL	$1, V12.S4, V11.S4
   486  	VSRI	$31, V12.S4, V11.S4
   487  	WORD	$0xce4a94a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 1
   488  	WORD	$0xce409ca9          //SM3TT2B V9d.4S, V5.4S, V0.S, 1
   489  	WORD	$0xce4b2505          //SM3SS1 V5.4S, V8.4S, V11.4S, V9.4S
   490  	VSHL	$1, V11.S4, V12.S4
   491  	VSRI	$31, V11.S4, V12.S4
   492  	WORD	$0xce4aa4a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 2
   493  	WORD	$0xce40aca9          //SM3TT2B V9d.4S, V5.4S, V0.S, 2
   494  	WORD	$0xce4c2505          //SM3SS1 V5.4S, V8.4S, V12.4S, V9.4S
   495  	VSHL	$1, V12.S4, V11.S4
   496  	VSRI	$31, V12.S4, V11.S4
   497  	WORD	$0xce4ab4a8          //SM3TT1B V8d.4S, V5.4S, V10.S, 3
   498  	WORD	$0xce40bca9          //SM3TT2B V9d.4S, V5.4S, V0.S, 3
   499  
   500  	VEOR	V8.B16, V15.B16, V8.B16                    // fold the block into the previous state
   501  	VEOR	V9.B16, V16.B16, V9.B16
   502  	SUB	$1, R3, R3                                 // one block done
   503  	CBNZ	R3, blockloop
   504  
   505  sm3ret:
   506  	// back to memory order (a,b,c,d) / (e,f,g,h) before storing
   507  	VREV64	V8.S4, V8.S4
   508  	VEXT	$8, V8.B16, V8.B16, V8.B16
   509  	VREV64	V9.S4, V9.S4
   510  	VEXT	$8, V9.B16, V9.B16, V9.B16
   511  	VST1	[V8.S4, V9.S4], (R0)                       // store hash value H
   512  done:
   513  	RET