github.com/emmansun/gmsm@v0.29.1/zuc/eia_asm_amd64.s

// Referenced Intel(R) Multi-Buffer Crypto for IPsec
// https://github.com/intel/intel-ipsec-mb/
//go:build !purego

#include "textflag.h"

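// Nibble lookup tables for byte-wise bit reversal (used with PSHUFB/VPSHUFB):
// bit_reverse_table_l<>[n] holds the bit-reversed 4-bit value n in the low nibble,
// bit_reverse_table_h<>[n] holds the bit-reversed 4-bit value n in the high nibble,
// and bit_reverse_and_table<> is the 0x0f mask used to split each byte into nibbles.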
DATA bit_reverse_table_l<>+0x00(SB)/8, $0x0e060a020c040800
DATA bit_reverse_table_l<>+0x08(SB)/8, $0x0f070b030d050901
GLOBL bit_reverse_table_l<>(SB), RODATA, $16

DATA bit_reverse_table_h<>+0x00(SB)/8, $0xe060a020c0408000
DATA bit_reverse_table_h<>+0x08(SB)/8, $0xf070b030d0509010
GLOBL bit_reverse_table_h<>(SB), RODATA, $16

DATA bit_reverse_and_table<>+0x00(SB)/8, $0x0f0f0f0f0f0f0f0f
DATA bit_reverse_and_table<>+0x08(SB)/8, $0x0f0f0f0f0f0f0f0f
GLOBL bit_reverse_and_table<>(SB), RODATA, $16

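// PSHUFB masks that spread the four 32-bit data words into separate 64-bit
// lanes (dword followed by 32 zero bits) so each word can be fed to
// PCLMULQDQ independently: shuf_mask_dw0_0_dw1_0<> yields [dword0, 0, dword1, 0]
// and shuf_mask_dw2_0_dw3_0<> yields [dword2, 0, dword3, 0].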
DATA shuf_mask_dw0_0_dw1_0<>+0x00(SB)/8, $0xffffffff03020100
DATA shuf_mask_dw0_0_dw1_0<>+0x08(SB)/8, $0xffffffff07060504
GLOBL shuf_mask_dw0_0_dw1_0<>(SB), RODATA, $16

DATA shuf_mask_dw2_0_dw3_0<>+0x00(SB)/8, $0xffffffff0b0a0908
DATA shuf_mask_dw2_0_dw3_0<>+0x08(SB)/8, $0xffffffff0f0e0d0c
GLOBL shuf_mask_dw2_0_dw3_0<>(SB), RODATA, $16

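// Register aliases: XDATA holds the 16-byte input block, XDIGEST accumulates
// the tag contribution, KS_* hold shuffled keystream words, and XTMP1..XTMP6
// are scratch registers.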
#define XTMP1 X1
#define XTMP2 X2
#define XTMP3 X3
#define XTMP4 X4
#define XTMP5 X5
#define XTMP6 X6
#define XDATA X7
#define XDIGEST X8
#define KS_L X9
#define KS_M1 X10
#define KS_M2 X11
#define KS_H X12

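// eia3Round16B folds one 16-byte block of message into the running EIA3 tag.
// t points to the 32-bit tag accumulator, keyStream points to a buffer of at
// least 8 keystream words (only words 0..4 are consumed here), and p points to
// the 16 message bytes. After updating *t, the routine slides the keystream
// window by copying words 4..7 down to words 0..3 for the next call.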
// func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int)
TEXT ·eia3Round16B(SB),NOSPLIT,$0
	MOVQ t+0(FP), AX
	MOVQ ks+8(FP), BX
	MOVQ p+16(FP), CX
	MOVQ tagSize+24(FP), DX

	CMPB ·useAVX(SB), $1
	JE   avx

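	// SSSE3 path: reverse the bits of every message byte with two nibble
	// lookups. Each byte is split into its low and high nibble via the 0x0f
	// mask, each nibble indexes its bit-reverse table, and the two halves
	// are XORed back together.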
	// Reverse data bytes
	MOVUPS (0)(CX), XDATA
	MOVOU bit_reverse_and_table<>(SB), XTMP4
	MOVOU XDATA, XTMP2
	PAND  XTMP4, XTMP2

	PANDN XDATA, XTMP4
	PSRLQ $4, XTMP4

	MOVOU bit_reverse_table_h<>(SB), XTMP3
	PSHUFB XTMP2, XTMP3

	MOVOU bit_reverse_table_l<>(SB), XTMP1
	PSHUFB XTMP4, XTMP1

	PXOR XTMP1, XTMP3  // XTMP3 - bit reverse data bytes

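	// With the message bits reversed, each 32-bit data word is folded into
	// the tag with one carry-less multiply: clmul of the reversed word against
	// the 64 keystream bits that overlap it leaves, in bits [63:32] of the
	// product, the XOR of the 32-bit keystream windows selected by the set
	// message bits - exactly the EIA3 contribution of those 32 bits.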
	// ZUC authentication part, 4x32 data bits
	// setup KS
	MOVUPS (0*4)(BX), XTMP1
	MOVUPS (2*4)(BX), XTMP2
	PSHUFD $0x61, XTMP1, KS_L  // KS bits [63:32 31:0 95:64 63:32]
	PSHUFD $0x61, XTMP2, KS_M1 // KS bits [127:96 95:64 159:128 127:96]

	// setup DATA
	MOVOU XTMP3, XTMP1
	PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1
	MOVOU XTMP1, XTMP2 // XTMP1/2 - Data bits [31:0 0s 63:32 0s]

	PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP3
	MOVOU XTMP3, XDIGEST // XDIGEST/XTMP3 - Data bits [95:64 0s 127:96 0s]

	// clmul
	// xor the results from 4 32-bit words together
	// Calculate lower 32 bits of tag
	PCLMULQDQ $0x00, KS_L, XTMP1
	PCLMULQDQ $0x11, KS_L, XTMP2
	PCLMULQDQ $0x00, KS_M1, XDIGEST
	PCLMULQDQ $0x11, KS_M1, XTMP3

	// XOR all products and shift bits [63:32] of the result into the low dword
	PXOR XTMP1, XTMP2
	PXOR XTMP3, XDIGEST
	PXOR XTMP2, XDIGEST
	PSRLDQ $4, XDIGEST

	// Update tag
	MOVL XDIGEST, R10
	XORL R10, (AX)

	// Copy last 16 bytes of KS to the front
	MOVUPS (4*4)(BX), XTMP1
	MOVUPS XTMP1, (0*4)(BX)

	RET

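	// AVX path: identical algorithm, using non-destructive three-operand
	// instructions and memory-operand shuffles to save register moves.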
avx:
	VMOVDQU (0)(CX), XDATA

	// Reverse data bytes
	VMOVDQU bit_reverse_and_table<>(SB), XTMP1
	VPAND XTMP1, XDATA, XTMP2
	VPANDN XDATA, XTMP1, XTMP3
	VPSRLD $4, XTMP3, XTMP3

	VMOVDQU bit_reverse_table_h<>(SB), XTMP1
	VPSHUFB XTMP2, XTMP1, XTMP4
	VMOVDQU bit_reverse_table_l<>(SB), XTMP1
	VPSHUFB XTMP3, XTMP1, XTMP1
	VPOR XTMP1, XTMP4, XTMP4

	// ZUC authentication part, 4x32 data bits
	// setup KS
	VPSHUFD $0x61, (0*4)(BX), KS_L  // KS bits [63:32 31:0 95:64 63:32]
	VPSHUFD $0x61, (2*4)(BX), KS_M1 // KS bits [127:96 95:64 159:128 127:96]

	// setup DATA
	// Data bits [31:0 0s 63:32 0s]
	VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP4, XTMP1
	// Data bits [95:64 0s 127:96 0s]
	VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP4, XTMP2

	// clmul
	// xor the results from 4 32-bit words together
	// Calculate lower 32 bits of tag
	VPCLMULQDQ $0x00, KS_L, XTMP1, XTMP3
	VPCLMULQDQ $0x11, KS_L, XTMP1, XTMP4
	VPCLMULQDQ $0x00, KS_M1, XTMP2, XTMP5
	VPCLMULQDQ $0x11, KS_M1, XTMP2, XTMP6

	VPXOR XTMP3, XTMP4, XTMP3
	VPXOR XTMP5, XTMP6, XTMP5
	VPXOR XTMP3, XTMP5, XDIGEST

	// Update tag
	VMOVQ XDIGEST, R10
	SHRQ $32, R10
	XORL R10, (AX)

	// Copy last 16 bytes of KS to the front
	VMOVDQU (4*4)(BX), XTMP1
	VMOVDQU XTMP1, (0*4)(BX)

	VZEROUPPER
	RET