// github.com/emmansun/gmsm@v0.29.1/zuc/eia256_asm_amd64.s

// Referenced Intel(R) Multi-Buffer Crypto for IPsec
// https://github.com/intel/intel-ipsec-mb/
//go:build !purego

#include "textflag.h"

DATA bit_reverse_table_l<>+0x00(SB)/8, $0x0e060a020c040800
DATA bit_reverse_table_l<>+0x08(SB)/8, $0x0f070b030d050901
GLOBL bit_reverse_table_l<>(SB), RODATA, $16

DATA bit_reverse_table_h<>+0x00(SB)/8, $0xe060a020c0408000
DATA bit_reverse_table_h<>+0x08(SB)/8, $0xf070b030d0509010
GLOBL bit_reverse_table_h<>(SB), RODATA, $16

DATA bit_reverse_and_table<>+0x00(SB)/8, $0x0f0f0f0f0f0f0f0f
DATA bit_reverse_and_table<>+0x08(SB)/8, $0x0f0f0f0f0f0f0f0f
GLOBL bit_reverse_and_table<>(SB), RODATA, $16

DATA shuf_mask_dw0_0_dw1_0<>+0x00(SB)/8, $0xffffffff03020100
DATA shuf_mask_dw0_0_dw1_0<>+0x08(SB)/8, $0xffffffff07060504
GLOBL shuf_mask_dw0_0_dw1_0<>(SB), RODATA, $16

DATA shuf_mask_0_0_dw1_0<>+0x00(SB)/8, $0xffffffffffffffff
DATA shuf_mask_0_0_dw1_0<>+0x08(SB)/8, $0xffffffff07060504
GLOBL shuf_mask_0_0_dw1_0<>(SB), RODATA, $16

DATA shuf_mask_0_0_0_dw1<>+0x00(SB)/8, $0xffffffffffffffff
DATA shuf_mask_0_0_0_dw1<>+0x08(SB)/8, $0x07060504ffffffff
GLOBL shuf_mask_0_0_0_dw1<>(SB), RODATA, $16

DATA shuf_mask_dw2_0_dw3_0<>+0x00(SB)/8, $0xffffffff0b0a0908
DATA shuf_mask_dw2_0_dw3_0<>+0x08(SB)/8, $0xffffffff0f0e0d0c
GLOBL shuf_mask_dw2_0_dw3_0<>(SB), RODATA, $16

DATA bits_32_63<>+0x00(SB)/8, $0xffffffff00000000
DATA bits_32_63<>+0x08(SB)/8, $0x0000000000000000
GLOBL bits_32_63<>(SB), RODATA, $16
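
// bit_reverse_table_l/h are PSHUFB lookup tables mapping a 4-bit index to
// its bit-reversed value, placed in the low and high nibble respectively;
// bit_reverse_and_table keeps only the low nibble of every byte. The
// shuf_mask_* constants relocate selected dwords of a register while
// zeroing the rest (an 0xff index makes PSHUFB write a zero byte), and
// bits_32_63 masks everything but bits 63:32 of the low quadword.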

#define XTMP1 X1
#define XTMP2 X2
#define XTMP3 X3
#define XTMP4 X4
#define XTMP5 X5
#define XTMP6 X6
#define XDATA X7
#define XDIGEST X8
#define KS_L X9
#define KS_M1 X10
#define KS_M2 X11
#define KS_H X12

// func eia256RoundTag8(t *uint32, keyStream *uint32, p *byte)
TEXT ·eia256RoundTag8(SB),NOSPLIT,$0
	MOVQ t+0(FP), AX
	MOVQ ks+8(FP), BX
	MOVQ p+16(FP), CX
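
	// Arguments arrive on the stack (assembly ABI0) and are read from
	// fixed frame-pointer offsets. NOSPLIT with a zero-size frame is
	// safe because the function makes no calls and needs no locals.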

	CMPB ·useAVX(SB), $1
	JE   avx

	// Reverse data bytes
	MOVUPS (0)(CX), XDATA
	MOVOU bit_reverse_and_table<>(SB), XTMP4
	MOVOU XDATA, XTMP2
	PAND  XTMP4, XTMP2

	PANDN XDATA, XTMP4
	PSRLQ $4, XTMP4

	MOVOU bit_reverse_table_h<>(SB), XTMP3
	PSHUFB XTMP2, XTMP3

	MOVOU bit_reverse_table_l<>(SB), XTMP1
	PSHUFB XTMP4, XTMP1

	PXOR XTMP1, XTMP3  // XTMP3 - bit-reversed data bytes
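
	// Byte-level bit reversal via the nibble split-and-lookup trick:
	//   bitrev8(b) = bitrev4(b & 0x0f) << 4 | bitrev4(b >> 4)
	// bit_reverse_table_h supplies bitrev4(i) << 4 for the low nibbles
	// (XTMP2) and bit_reverse_table_l supplies bitrev4(i) for the
	// shifted-down high nibbles (XTMP4). The two lookup results occupy
	// disjoint nibbles, so the PXOR above acts as an OR.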

	// ZUC authentication part, 4x32 data bits
	// setup KS
	MOVUPS (0*4)(BX), XTMP1
	MOVUPS (2*4)(BX), XTMP2
	MOVUPS (4*4)(BX), XTMP4
	PSHUFD $0x61, XTMP1, KS_L  // KS bits [63:32 31:0 95:64 63:32]
	PSHUFD $0x61, XTMP2, KS_M1 // KS bits [127:96 95:64 159:128 127:96]
	PSHUFD $0x61, XTMP4, KS_M2 // KS bits [191:160 159:128 223:192 191:160]
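
	// PSHUFD $0x61 = 0b01_10_00_01 selects source dwords 1,0,2,1, so each
	// KS register packs two overlapping 64-bit keystream windows, one per
	// quadword, each with its two 32-bit halves swapped into the order
	// the carryless multiplies below expect.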

	// setup DATA
	MOVOU XTMP3, XTMP1
	PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1
	MOVOU XTMP1, XTMP2 // XTMP1/2 - Data bits [31:0 0s 63:32 0s]

	PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP3
	MOVOU XTMP3, XDIGEST // XDIGEST/XTMP3 - Data bits [95:64 0s 127:96 0s]
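
	// XTMP1/XTMP2 now hold [dw0 0s dw1 0s] and XDIGEST/XTMP3 hold
	// [dw2 0s dw3 0s]: one zero-extended 32-bit chunk of bit-reversed
	// data per quadword, aligned with the keystream windows in
	// KS_L/KS_M1/KS_M2.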

	// clmul
	// xor the results from 4 32-bit words together
	// Save data for following products
	MOVOU XTMP2, XTMP5 //  Data bits [31:0 0s 63:32 0s]
	MOVOU XTMP3, XTMP6 //  Data bits [95:64 0s 127:96 0s]

	// Calculate lower 32 bits of tag
	PCLMULQDQ $0x00, KS_L, XTMP1
	PCLMULQDQ $0x11, KS_L, XTMP2
	PCLMULQDQ $0x00, KS_M1, XDIGEST
	PCLMULQDQ $0x11, KS_M1, XTMP3

	// XOR all products and move bits 63-32 to the lower 32 bits
	PXOR XTMP1, XTMP2
	PXOR XTMP3, XDIGEST
	PXOR XTMP2, XDIGEST
	MOVQ XDIGEST, XDIGEST // Clear top 64 bits
	PSRLDQ $4, XDIGEST
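
	// PCLMULQDQ imm8 decoding (Go operand order: $imm, src, dst): bit 0
	// selects the dst quadword, bit 4 the src quadword. Each product is
	// a carryless multiply of one 32-bit bit-reversed data chunk by one
	// 64-bit keystream window; bits 63:32 of the product are the XOR,
	// over every set data bit, of the 32-bit keystream window at that
	// bit offset - 32 tag bits per multiply. This is the CLMUL
	// formulation of the ZUC MAC used by intel-ipsec-mb.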

	// Prepare data and calculate bits 63-32 of tag
	MOVOU XTMP5, XTMP1
	MOVOU XTMP5, XTMP2
	MOVOU XTMP6, XTMP3
	MOVOU XTMP6, XTMP4

	PCLMULQDQ $0x10, KS_L, XTMP1
	PCLMULQDQ $0x01, KS_M1, XTMP2
	PCLMULQDQ $0x10, KS_M1, XTMP3
	PCLMULQDQ $0x01, KS_M2, XTMP4

	// XOR all the products and keep only bits 63-32
	PXOR XTMP2, XTMP1
	PXOR XTMP4, XTMP3
	PXOR XTMP3, XTMP1
	PAND bits_32_63<>(SB), XTMP1

	// OR with lower 32 bits, to construct 64 bits of tag
	POR XTMP1, XDIGEST

	// Update tag
	MOVQ XDIGEST, R10
	XORQ R10, (AX)

	// Copy last 16 bytes of KS to the front
	MOVUPS (4*4)(BX), XTMP1
	MOVUPS XTMP1, (0*4)(BX)

	RET

avx:
	VMOVDQU (0)(CX), XDATA

	// Reverse data bytes
	VMOVDQU bit_reverse_and_table<>(SB), XTMP1
	VPAND XTMP1, XDATA, XTMP2
	VPANDN XDATA, XTMP1, XTMP3
	VPSRLD $4, XTMP3, XTMP3

	VMOVDQU bit_reverse_table_h<>(SB), XTMP1
	VPSHUFB XTMP2, XTMP1, XTMP4
	VMOVDQU bit_reverse_table_l<>(SB), XTMP1
	VPSHUFB XTMP3, XTMP1, XTMP1
	VPOR XTMP1, XTMP4, XTMP4
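
	// Same nibble split-and-lookup byte reversal as the SSE path. The
	// dword shift (VPSRLD) is equivalent to the SSE path's qword shift
	// because the masked-out low nibbles leave no bits to cross byte
	// boundaries, and VPOR matches PXOR on the disjoint nibble results.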

	// ZUC authentication part, 4x32 data bits
	// setup KS
	VPSHUFD $0x61, (0*4)(BX), KS_L  // KS bits [63:32 31:0 95:64 63:32]
	VPSHUFD $0x61, (2*4)(BX), KS_M1 // KS bits [127:96 95:64 159:128 127:96]
	VPSHUFD $0x61, (4*4)(BX), KS_M2 // KS bits [191:160 159:128 223:192 191:160]

	// setup DATA
	// Data bytes [31:0 0s 63:32 0s]
	VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP4, XTMP1
	// Data bytes [95:64 0s 127:96 0s]
	VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP4, XTMP2

	// clmul
	// xor the results from 4 32-bit words together
	// Calculate lower 32 bits of tag
	VPCLMULQDQ $0x00, KS_L, XTMP1, XTMP3
	VPCLMULQDQ $0x11, KS_L, XTMP1, XTMP4
	VPCLMULQDQ $0x00, KS_M1, XTMP2, XTMP5
	VPCLMULQDQ $0x11, KS_M1, XTMP2, XTMP6

	VPXOR XTMP3, XTMP4, XTMP3
	VPXOR XTMP5, XTMP6, XTMP5
	VPXOR XTMP3, XTMP5, XTMP3

	// Move bits 63-32 of the XORed products to the low 32 bits of the digest
	VMOVQ XTMP3, XTMP3  // Clear top 64 bits
	VPSRLDQ $4, XTMP3, XDIGEST

	VPCLMULQDQ $0x10, KS_L, XTMP1, XTMP3
	VPCLMULQDQ $0x01, KS_M1, XTMP1, XTMP4
	VPCLMULQDQ $0x10, KS_M1, XTMP2, XTMP5
	VPCLMULQDQ $0x01, KS_M2, XTMP2, XTMP6

	// XOR all the products and keep only bits 63-32
	VPXOR XTMP4, XTMP3, XTMP3
	VPXOR XTMP6, XTMP5, XTMP5
	VPXOR XTMP5, XTMP3, XTMP3
	VPAND bits_32_63<>(SB), XTMP3, XTMP3

	// Merge bits 63-32 into the digest (disjoint bits, so XOR acts as OR)
	VPXOR XTMP3, XDIGEST, XDIGEST

	// Update tag
	VMOVQ XDIGEST, R10
	XORQ R10, (AX)

	// Copy last 16 bytes of KS to the front
	VMOVDQU (4*4)(BX), XTMP1
	VMOVDQU XTMP1, (0*4)(BX)

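	// VZEROUPPER avoids AVX/SSE transition penalties in subsequent code.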
	VZEROUPPER
	RET

// func eia256RoundTag16(t *uint32, keyStream *uint32, p *byte)
TEXT ·eia256RoundTag16(SB),NOSPLIT,$0
	MOVQ t+0(FP), AX
	MOVQ ks+8(FP), BX
	MOVQ p+16(FP), CX

	CMPB ·useAVX(SB), $1
	JE   avx

	// Reverse data bytes
	MOVUPS (0)(CX), XDATA
	MOVOU bit_reverse_and_table<>(SB), XTMP4
	MOVOU XDATA, XTMP2
	PAND  XTMP4, XTMP2

	PANDN XDATA, XTMP4
	PSRLQ $4, XTMP4

	MOVOU bit_reverse_table_h<>(SB), XTMP3
	PSHUFB XTMP2, XTMP3

	MOVOU bit_reverse_table_l<>(SB), XTMP1
	PSHUFB XTMP4, XTMP1

	PXOR XTMP1, XTMP3  // XTMP3 - bit-reversed data bytes

	// ZUC authentication part, 4x32 data bits
	// setup KS
	MOVUPS (0*4)(BX), XTMP1
	MOVUPS (2*4)(BX), XTMP2
	MOVUPS (4*4)(BX), XTMP4
	PSHUFD $0x61, XTMP1, KS_L  // KS bits [63:32 31:0 95:64 63:32]
	PSHUFD $0x61, XTMP2, KS_M1 // KS bits [127:96 95:64 159:128 127:96]
	PSHUFD $0x61, XTMP4, KS_M2 // KS bits [191:160 159:128 223:192 191:160]
	PSHUFD $0xBB, XTMP4, KS_H  // KS bits [255:224 223:192 255:224 223:192]
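
	// PSHUFD $0xBB = 0b10_11_10_11 replicates source dwords 3,2 into both
	// quadwords, duplicating the top keystream window for the bits 127-96
	// products below.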

	// setup DATA
	MOVOU XTMP3, XTMP1
	PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1
	MOVOU XTMP1, XTMP2 // XTMP1/2 - Data bits [31:0 0s 63:32 0s]

	PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP3
	MOVOU XTMP3, XDIGEST // XDIGEST/XTMP3 - Data bits [95:64 0s 127:96 0s]

	// clmul
	// xor the results from 4 32-bit words together
	// Save data for following products
	MOVOU XTMP2, XTMP5 //  Data bits [31:0 0s 63:32 0s]
	MOVOU XTMP3, XTMP6 //  Data bits [95:64 0s 127:96 0s]

	// Calculate lower 32 bits of tag
	PCLMULQDQ $0x00, KS_L, XTMP1
	PCLMULQDQ $0x11, KS_L, XTMP2
	PCLMULQDQ $0x00, KS_M1, XDIGEST
	PCLMULQDQ $0x11, KS_M1, XTMP3

	// XOR all products and move bits 63-32 to the lower 32 bits
	PXOR XTMP1, XTMP2
	PXOR XTMP3, XDIGEST
	PXOR XTMP2, XDIGEST
	MOVQ XDIGEST, XDIGEST // Clear top 64 bits
	PSRLDQ $4, XDIGEST

	// Prepare data and calculate bits 63-32 of tag
	MOVOU XTMP5, XTMP1
	MOVOU XTMP5, XTMP2
	MOVOU XTMP6, XTMP3
	MOVOU XTMP6, XTMP4

	PCLMULQDQ $0x10, KS_L, XTMP1
	PCLMULQDQ $0x01, KS_M1, XTMP2
	PCLMULQDQ $0x10, KS_M1, XTMP3
	PCLMULQDQ $0x01, KS_M2, XTMP4

	// XOR all the products and keep only bits 63-32
	PXOR XTMP2, XTMP1
	PXOR XTMP4, XTMP3
	PXOR XTMP3, XTMP1
	PAND bits_32_63<>(SB), XTMP1

	// OR with lower 32 bits, to construct 64 bits of tag
	POR XTMP1, XDIGEST

	// Prepare data and calculate bits 95-64 of tag
	MOVOU XTMP5, XTMP1
	MOVOU XTMP5, XTMP2
	MOVOU XTMP6, XTMP3
	MOVOU XTMP6, XTMP4

	PCLMULQDQ $0x00, KS_M1, XTMP1
	PCLMULQDQ $0x11, KS_M1, XTMP2
	PCLMULQDQ $0x00, KS_M2, XTMP3
	PCLMULQDQ $0x11, KS_M2, XTMP4

	// XOR all the products and move bits 63-32 to bits 95-64
	PXOR XTMP2, XTMP1
	PXOR XTMP4, XTMP3
	PXOR XTMP3, XTMP1
	PSHUFB shuf_mask_0_0_dw1_0<>(SB), XTMP1

	// OR with lower 64 bits, to construct 96 bits of tag
	POR XTMP1, XDIGEST
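
	// shuf_mask_0_0_dw1_0 moved bits 63-32 of the product sum into dword 2
	// and zeroed the other lanes, so the POR lands tag bits 95-64 in their
	// final position; shuf_mask_0_0_0_dw1 plays the same role for bits
	// 127-96 below.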

	// Prepare data and calculate bits 127-96 of tag
	MOVOU XTMP5, XTMP1
	MOVOU XTMP5, XTMP2
	MOVOU XTMP6, XTMP3
	MOVOU XTMP6, XTMP4

	PCLMULQDQ $0x10, KS_M1, XTMP1
	PCLMULQDQ $0x01, KS_M2, XTMP2
	PCLMULQDQ $0x10, KS_M2, XTMP3
	PCLMULQDQ $0x01, KS_H, XTMP4

	// XOR all the products and move bits 63-32 to bits 127-96
	PXOR XTMP2, XTMP1
	PXOR XTMP4, XTMP3
	PXOR XTMP3, XTMP1
	PSHUFB shuf_mask_0_0_0_dw1<>(SB), XTMP1

	// OR with lower 96 bits, to construct 128 bits of tag
	POR XTMP1, XDIGEST

	// Update tag
	MOVUPS (AX), XTMP1
	PXOR XTMP1, XDIGEST
	MOVUPS XDIGEST, (AX)

	// Copy last 16 bytes of KS to the front
	MOVUPS (4*4)(BX), XTMP1
	MOVUPS XTMP1, (0*4)(BX)

	RET

avx:
	VMOVDQU (0)(CX), XDATA

	// Reverse data bytes
	VMOVDQU bit_reverse_and_table<>(SB), XTMP1
	VPAND XTMP1, XDATA, XTMP2
	VPANDN XDATA, XTMP1, XTMP3
	VPSRLD $4, XTMP3, XTMP3

	VMOVDQU bit_reverse_table_h<>(SB), XTMP1
	VPSHUFB XTMP2, XTMP1, XTMP4
	VMOVDQU bit_reverse_table_l<>(SB), XTMP1
	VPSHUFB XTMP3, XTMP1, XTMP1
	VPOR XTMP1, XTMP4, XTMP4

	// ZUC authentication part, 4x32 data bits
	// setup KS
	VPSHUFD $0x61, (0*4)(BX), KS_L  // KS bits [63:32 31:0 95:64 63:32]
	VPSHUFD $0x61, (2*4)(BX), KS_M1 // KS bits [127:96 95:64 159:128 127:96]
	VPSHUFD $0x61, (4*4)(BX), KS_M2 // KS bits [191:160 159:128 223:192 191:160]
	VPSHUFD $0xBB, (4*4)(BX), KS_H  // KS bits [255:224 223:192 255:224 223:192]

	// setup DATA
	// Data bytes [31:0 0s 63:32 0s]
	VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP4, XTMP1
	// Data bytes [95:64 0s 127:96 0s]
	VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP4, XTMP2

	// clmul
	// xor the results from 4 32-bit words together
	// Calculate lower 32 bits of tag
	VPCLMULQDQ $0x00, KS_L, XTMP1, XTMP3
	VPCLMULQDQ $0x11, KS_L, XTMP1, XTMP4
	VPCLMULQDQ $0x00, KS_M1, XTMP2, XTMP5
	VPCLMULQDQ $0x11, KS_M1, XTMP2, XTMP6

	VPXOR XTMP3, XTMP4, XTMP3
	VPXOR XTMP5, XTMP6, XTMP5
	VPXOR XTMP3, XTMP5, XTMP3

	// Move bits 63-32 of the XORed products to the low 32 bits of the digest
	VMOVQ XTMP3, XTMP3  // Clear top 64 bits
	VPSRLDQ $4, XTMP3, XDIGEST

	VPCLMULQDQ $0x10, KS_L, XTMP1, XTMP3
	VPCLMULQDQ $0x01, KS_M1, XTMP1, XTMP4
	VPCLMULQDQ $0x10, KS_M1, XTMP2, XTMP5
	VPCLMULQDQ $0x01, KS_M2, XTMP2, XTMP6

	// XOR all the products and keep only bits 63-32
	VPXOR XTMP4, XTMP3, XTMP3
	VPXOR XTMP6, XTMP5, XTMP5
	VPXOR XTMP5, XTMP3, XTMP3
	VPAND bits_32_63<>(SB), XTMP3, XTMP3

	// Merge bits 63-32 into the digest (disjoint bits, so XOR acts as OR)
	VPXOR XTMP3, XDIGEST, XDIGEST

	// Prepare data and calculate bits 95-64 of tag
	VPCLMULQDQ $0x00, KS_M1, XTMP1, XTMP3
	VPCLMULQDQ $0x11, KS_M1, XTMP1, XTMP4
	VPCLMULQDQ $0x00, KS_M2, XTMP2, XTMP5
	VPCLMULQDQ $0x11, KS_M2, XTMP2, XTMP6

	// XOR all the products and move bits 63-32 to bits 95-64
	VPXOR XTMP4, XTMP3, XTMP3
	VPXOR XTMP6, XTMP5, XTMP5
	VPXOR XTMP5, XTMP3, XTMP3

	VPSHUFB shuf_mask_0_0_dw1_0<>(SB), XTMP3, XTMP3

	// Merge bits 95-64 into the digest
	VPXOR XTMP3, XDIGEST, XDIGEST

	// Prepare data and calculate bits 127-96 of tag
	VPCLMULQDQ $0x10, KS_M1, XTMP1, XTMP3
	VPCLMULQDQ $0x01, KS_M2, XTMP1, XTMP4
	VPCLMULQDQ $0x10, KS_M2, XTMP2, XTMP5
	VPCLMULQDQ $0x01, KS_H, XTMP2, XTMP6

	// XOR all the products and move bits 63-32 to bits 127-96
	VPXOR XTMP4, XTMP3, XTMP3
	VPXOR XTMP6, XTMP5, XTMP5
	VPXOR XTMP5, XTMP3, XTMP3

	VPSHUFB shuf_mask_0_0_0_dw1<>(SB), XTMP3, XTMP3

	// Merge bits 127-96 into the digest
	VPXOR XTMP3, XDIGEST, XDIGEST

	// Update tag (VMOVDQU: t is a *uint32 with no 16-byte alignment
	// guarantee, matching the unaligned MOVUPS store in the SSE path)
	VPXOR (AX), XDIGEST, XDIGEST
	VMOVDQU XDIGEST, (AX)

	// Copy last 16 bytes of KS to the front
	VMOVDQU (4*4)(BX), XTMP1
	VMOVDQU XTMP1, (0*4)(BX)

	VZEROUPPER
	RET