github.com/emmansun/gmsm@v0.29.1/zuc/eia256_asm_arm64.s

//go:build !purego

#include "textflag.h"

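// bit_reverse_table holds two 16-entry VTBL lookup tables: the first 16
// bytes map a nibble n to rev4(n) (its 4-bit reversal), the second 16
// bytes map n to rev4(n)<<4. Together they bit-reverse a byte via two
// nibble lookups.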
DATA bit_reverse_table<>+0x00(SB)/8, $0x0e060a020c040800
DATA bit_reverse_table<>+0x08(SB)/8, $0x0f070b030d050901
DATA bit_reverse_table<>+0x10(SB)/8, $0xe060a020c0408000
DATA bit_reverse_table<>+0x18(SB)/8, $0xf070b030d0509010
GLOBL bit_reverse_table<>(SB), RODATA, $32

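// shuf_mask_dw holds VTBL index vectors that spread four 32-bit words
// into the low halves of four 64-bit lanes; the 0xff indices yield zero
// bytes, so each word is zero-extended to 64 bits, ready for VPMULL.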
DATA shuf_mask_dw<>+0x00(SB)/8, $0xffffffff03020100
DATA shuf_mask_dw<>+0x08(SB)/8, $0xffffffff07060504
DATA shuf_mask_dw<>+0x10(SB)/8, $0xffffffff0b0a0908
DATA shuf_mask_dw<>+0x18(SB)/8, $0xffffffff0f0e0d0c
GLOBL shuf_mask_dw<>(SB), RODATA, $32

#define AX R2
#define BX R3
#define CX R4
#define DX R5

#define XTMP1 V1
#define XTMP2 V2
#define XTMP3 V3
#define XTMP4 V4
#define XTMP5 V5
#define XTMP6 V6
#define XDATA V7
#define XDIGEST V8
#define KS_L V9
#define KS_M1 V10
#define KS_M2 V11
#define KS_H V12
#define BIT_REV_TAB_L V20
#define BIT_REV_TAB_H V21
#define BIT_REV_AND_TAB V22
#define SHUF_MASK_DW0_DW1 V23
#define SHUF_MASK_DW2_DW3 V24

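// LOAD_GLOBAL_DATA loads the nibble bit-reversal tables and the
// zero-extending shuffle masks, and splats the 0x0F nibble mask into
// BIT_REV_AND_TAB, clobbering R0.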
#define LOAD_GLOBAL_DATA() \
	MOVD $bit_reverse_table<>(SB), R0                         \
	VLD1 (R0), [BIT_REV_TAB_L.B16, BIT_REV_TAB_H.B16]         \
	MOVW $0x0F0F0F0F, R0                                      \
	VDUP R0, BIT_REV_AND_TAB.S4                               \
	MOVD $shuf_mask_dw<>(SB), R0                              \
	VLD1 (R0), [SHUF_MASK_DW0_DW1.B16, SHUF_MASK_DW2_DW3.B16]

// func eia256RoundTag8(t *uint32, keyStream *uint32, p *byte)
TEXT ·eia256RoundTag8(SB),NOSPLIT,$0
	MOVD t+0(FP), AX
	MOVD ks+8(FP), BX
	MOVD p+16(FP), CX

	LOAD_GLOBAL_DATA()

	// Reverse data bytes
	VLD1 (CX), [XDATA.B16]
	VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
	VUSHR $4, XDATA.S4, XTMP1.S4
	VAND BIT_REV_AND_TAB.B16, XTMP1.B16, XTMP1.B16

	VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
	VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16
	VEOR XTMP1.B16, XTMP3.B16, XTMP3.B16 // XTMP3 - bit reverse data bytes
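
	// Bytes are bit-reversed with two nibble lookups: the low nibble
	// selects rev4(n)<<4 from BIT_REV_TAB_H, the high nibble selects
	// rev4(n) from BIT_REV_TAB_L, and the XOR of the two is the
	// reversed byte. In Go terms (a reference sketch, not part of this
	// file): rev := revTabH[b&0xf] ^ revTabL[b>>4]. The reversal lets
	// the carry-less multiplies below accumulate each keystream window
	// at a fixed bit position.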

	// ZUC authentication part, 4x32 data bits
	// setup KS
	VLD1 (BX), [XTMP1.B16, XTMP2.B16]
	VST1 [XTMP2.B16], (BX) // Copy last 16 bytes of KS to the front
	// TODO: Any better solution???
	VMOVQ $0x0302010007060504, $0x070605040b0a0908, XTMP4
	VTBL XTMP4.B16, [XTMP1.B16], KS_L.B16 // KS bits [63:32 31:0 95:64 63:32]
	VTBL XTMP4.B16, [XTMP2.B16], KS_M2.B16 // KS bits [191:160 159:128 223:192 191:160]
	VDUP XTMP1.S[3], KS_M1.S4
	VMOV XTMP1.S[2], KS_M1.S[1]
	VMOV XTMP2.S[0], KS_M1.S[2] // KS bits [127:96 95:64 159:128 127:96]
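
	// Each 64-bit lane of KS_L/KS_M1/KS_M2 now holds a pair of adjacent
	// 32-bit keystream words in swapped order (see the lane comments
	// above), forming the sliding 32-bit windows KS[i..i+31] that the
	// EIA3 universal hash pairs with data bit i.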

	// setup DATA
	VTBL SHUF_MASK_DW0_DW1.B16, [XTMP3.B16], XTMP1.B16 // XTMP1 - Data bits [31:0 0s 63:32 0s]
	VTBL SHUF_MASK_DW2_DW3.B16, [XTMP3.B16], XTMP2.B16 // XTMP2 - Data bits [95:64 0s 127:96 0s]
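
	// Each 32-bit data word now sits zero-extended in its own 64-bit
	// lane, so every VPMULL below multiplies exactly one data word by
	// one keystream pair with no cross-lane interference.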

	// clmul
	// xor the results from 4 32-bit words together

	// Calculate lower 32 bits of tag
	VPMULL KS_L.D1, XTMP1.D1, XTMP3.Q1
	VPMULL2 KS_L.D2, XTMP1.D2, XTMP4.Q1
	VPMULL KS_M1.D1, XTMP2.D1, XTMP5.Q1
	VPMULL2 KS_M1.D2, XTMP2.D2, XTMP6.Q1

	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16
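
	// In GF(2) multiplication is carry-less and addition is XOR, so
	// XORing the four 64-bit products accumulates the dot product of
	// the bit-reversed data words with their keystream windows; the
	// 32 bits of interest land in bits 63:32 of XTMP3.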

	// Bits 63:32 of the accumulated product are tag bits 31:0
	VMOV XTMP3.S[1], XDIGEST.S[0]

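	// VPMULL multiplies only the low 64-bit lanes, so the VEXT $8 below
	// swaps the halves of a register to bring the odd lane down; this
	// pairs each data word with the keystream window advanced by one
	// 32-bit word, producing the next 32 tag bits.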
	// Prepare data and calculate bits 63-32 of tag
	VEXT $8, KS_L.B16, KS_L.B16, XTMP5.B16
	VPMULL XTMP5.D1, XTMP1.D1, XTMP3.Q1
	VEXT $8, XTMP1.B16, XTMP1.B16, XTMP5.B16
	VPMULL KS_M1.D1, XTMP5.D1, XTMP4.Q1
	VEXT $8, KS_M1.B16, KS_M1.B16, XTMP1.B16
	VPMULL XTMP1.D1, XTMP2.D1, XTMP5.Q1
	VEXT $8, XTMP2.B16, XTMP2.B16, XTMP1.B16
	VPMULL KS_M2.D1, XTMP1.D1, XTMP6.Q1

	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16

	VMOV XTMP3.S[1], XDIGEST.S[1]

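	// Fold the 64-bit digest into the running tag at *t.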
	VMOV XDIGEST.D[0], R10
	MOVD (AX), R11
	EOR R10, R11
	MOVD R11, (AX)

	RET

// func eia256RoundTag16(t *uint32, keyStream *uint32, p *byte)
TEXT ·eia256RoundTag16(SB),NOSPLIT,$0
	MOVD t+0(FP), AX
	MOVD ks+8(FP), BX
	MOVD p+16(FP), CX

	LOAD_GLOBAL_DATA()

	// Reverse data bytes
	VLD1 (CX), [XDATA.B16]
	VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
	VUSHR $4, XDATA.S4, XTMP1.S4
	VAND BIT_REV_AND_TAB.B16, XTMP1.B16, XTMP1.B16

	VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
	VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16
	VEOR XTMP1.B16, XTMP3.B16, XTMP3.B16 // XTMP3 - bit reverse data bytes

	// ZUC authentication part, 4x32 data bits
	// setup KS
	VLD1 (BX), [XTMP1.B16, XTMP2.B16]
	VST1 [XTMP2.B16], (BX) // Copy last 16 bytes of KS to the front
	// TODO: Any better solution??? We could use VTBL, but there is no performance improvement if we can't reuse the MASK constant
	VMOVQ $0x0302010007060504, $0x070605040b0a0908, XTMP4
	VTBL XTMP4.B16, [XTMP1.B16], KS_L.B16  // KS bits [63:32 31:0 95:64 63:32]
	VTBL XTMP4.B16, [XTMP2.B16], KS_M2.B16 // KS bits [191:160 159:128 223:192 191:160]
	VMOVQ $0x0b0a09080f0e0d0c, $0x0b0a09080f0e0d0c, XTMP4
	VTBL XTMP4.B16, [XTMP2.B16], KS_H.B16  // KS bits [255:224 223:192 255:224 223:192]
	VDUP XTMP1.S[3], KS_M1.S4
	VMOV XTMP1.S[2], KS_M1.S[1]
	VMOV XTMP2.S[0], KS_M1.S[2] // KS bits [127:96 95:64 159:128 127:96]
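
	// A 128-bit tag consumes keystream windows reaching bit 255, so
	// unlike the Tag8 variant this one also builds KS_H from the top
	// two keystream words.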

	// setup DATA
	VTBL SHUF_MASK_DW0_DW1.B16, [XTMP3.B16], XTMP1.B16 // XTMP1 - Data bits [31:0 0s 63:32 0s]
	VTBL SHUF_MASK_DW2_DW3.B16, [XTMP3.B16], XTMP2.B16 // XTMP2 - Data bits [95:64 0s 127:96 0s]

	// clmul
	// xor the results from 4 32-bit words together

	// Calculate lower 32 bits of tag
	VPMULL KS_L.D1, XTMP1.D1, XTMP3.Q1
	VPMULL2 KS_L.D2, XTMP1.D2, XTMP4.Q1
	VPMULL KS_M1.D1, XTMP2.D1, XTMP5.Q1
	VPMULL2 KS_M1.D2, XTMP2.D2, XTMP6.Q1

	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16

	// Bits 63:32 of the accumulated product are tag bits 31:0
	VMOV XTMP3.S[1], XDIGEST.S[0]

	// Prepare data and calculate bits 63-32 of tag
	VEXT $8, KS_L.B16, KS_L.B16, XTMP5.B16
	VPMULL XTMP5.D1, XTMP1.D1, XTMP3.Q1
	VEXT $8, XTMP1.B16, XTMP1.B16, XTMP5.B16
	VPMULL KS_M1.D1, XTMP5.D1, XTMP4.Q1
	VEXT $8, KS_M1.B16, KS_M1.B16, XTMP6.B16
	VPMULL XTMP6.D1, XTMP2.D1, XTMP5.Q1
	VEXT $8, XTMP2.B16, XTMP2.B16, KS_L.B16
	VPMULL KS_M2.D1, KS_L.D1, XTMP6.Q1

	// XOR all the products and keep only bits 63-32
	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16

	VMOV XTMP3.S[1], XDIGEST.S[1]
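
	// Tag words 2 and 3 follow the same pattern with the keystream
	// windows advanced by one more 32-bit word each time: KS_M1/KS_M2
	// for bits 95-64, then their VEXT'ed halves and KS_H for bits
	// 127-96.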

	// Prepare data and calculate bits 95-64 of tag
	VPMULL KS_M1.D1, XTMP1.D1, XTMP3.Q1
	VPMULL2 KS_M1.D2, XTMP1.D2, XTMP4.Q1
	VPMULL KS_M2.D1, XTMP2.D1, XTMP5.Q1
	VPMULL2 KS_M2.D2, XTMP2.D2, XTMP6.Q1

	// XOR all the products and move bits 63-32 to bits 95-64
	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16

	VMOV XTMP3.S[1], XDIGEST.S[2]

	// Prepare data and calculate bits 127-96 of tag
	VEXT $8, KS_M1.B16, KS_M1.B16, XTMP5.B16
	VPMULL XTMP5.D1, XTMP1.D1, XTMP3.Q1
	VEXT $8, XTMP1.B16, XTMP1.B16, XTMP5.B16
	VPMULL KS_M2.D1, XTMP5.D1, XTMP4.Q1
	VEXT $8, KS_M2.B16, KS_M2.B16, XTMP6.B16
	VPMULL XTMP6.D1, XTMP2.D1, XTMP5.Q1
	VEXT $8, XTMP2.B16, XTMP2.B16, KS_L.B16
	VPMULL KS_H.D1, KS_L.D1, XTMP6.Q1

	// XOR all the products and move bits 63-32 to bits 127-96
	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16
	VEOR XTMP5.B16, XTMP6.B16, XTMP5.B16
	VEOR XTMP3.B16, XTMP5.B16, XTMP3.B16

	VMOV XTMP3.S[1], XDIGEST.S[3]

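	// Fold the 128-bit digest into the running tag at *t.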
	VLD1 (AX), [XTMP1.B16]
	VEOR XTMP1.B16, XDIGEST.B16, XDIGEST.B16
	VST1 [XDIGEST.B16], (AX)

	RET