github.com/emmansun/gmsm@v0.29.1/zuc/asm_amd64.s

      1  // Based on Intel(R) Multi-Buffer Crypto for IPsec:
     2  // https://github.com/intel/intel-ipsec-mb/
     3  // https://gist.github.com/emmansun/15d2fce6659ab97ffaf7ab66e278caee
     4  //go:build !purego
     5  
     6  #include "textflag.h"
     7  
     8  DATA Top3_bits_of_the_byte<>+0x00(SB)/8, $0xe0e0e0e0e0e0e0e0
     9  DATA Top3_bits_of_the_byte<>+0x08(SB)/8, $0xe0e0e0e0e0e0e0e0
    10  GLOBL Top3_bits_of_the_byte<>(SB), RODATA, $16
    11  
    12  DATA Bottom5_bits_of_the_byte<>+0x00(SB)/8, $0x1f1f1f1f1f1f1f1f
    13  DATA Bottom5_bits_of_the_byte<>+0x08(SB)/8, $0x1f1f1f1f1f1f1f1f
    14  GLOBL Bottom5_bits_of_the_byte<>(SB), RODATA, $16
    15  
    16  DATA Low_nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
    17  DATA Low_nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
    18  GLOBL Low_nibble_mask<>(SB), RODATA, $16
    19  
    20  DATA High_nibble_mask<>+0x00(SB)/8, $0xF0F0F0F0F0F0F0F0
    21  DATA High_nibble_mask<>+0x08(SB)/8, $0xF0F0F0F0F0F0F0F0
    22  GLOBL High_nibble_mask<>(SB), RODATA, $16
    23  
    24  DATA P1<>+0x00(SB)/8, $0x0A020F0F0E000F09
    25  DATA P1<>+0x08(SB)/8, $0x090305070C000400
    26  GLOBL P1<>(SB), RODATA, $16
    27  
    28  DATA P2<>+0x00(SB)/8, $0x040C000705060D08
    29  DATA P2<>+0x08(SB)/8, $0x0209030F0A0E010B
    30  GLOBL P2<>(SB), RODATA, $16
    31  
    32  DATA P3<>+0x00(SB)/8, $0x0F0A0D00060A0602
    33  DATA P3<>+0x08(SB)/8, $0x0D0C0900050D0303
    34  GLOBL P3<>(SB), RODATA, $16
    35  
    36  DATA Aes_to_Zuc_mul_low_nibble<>+0x00(SB)/8, $0x1D1C9F9E83820100
    37  DATA Aes_to_Zuc_mul_low_nibble<>+0x08(SB)/8, $0x3938BBBAA7A62524
    38  GLOBL Aes_to_Zuc_mul_low_nibble<>(SB), RODATA, $16
    39  
    40  DATA Aes_to_Zuc_mul_high_nibble<>+0x00(SB)/8, $0xA174A97CDD08D500
    41  DATA Aes_to_Zuc_mul_high_nibble<>+0x08(SB)/8, $0x3DE835E04194499C
    42  GLOBL Aes_to_Zuc_mul_high_nibble<>(SB), RODATA, $16
    43  
    44  DATA Comb_matrix_mul_low_nibble<>+0x00(SB)/8, $0xCFDB6571BEAA1400
    45  DATA Comb_matrix_mul_low_nibble<>+0x08(SB)/8, $0x786CD2C6091DA3B7
    46  GLOBL Comb_matrix_mul_low_nibble<>(SB), RODATA, $16
    47  
    48  DATA Comb_matrix_mul_high_nibble<>+0x00(SB)/8, $0x638CFA1523CCBA55
    49  DATA Comb_matrix_mul_high_nibble<>+0x08(SB)/8, $0x3FD0A6497F90E609
    50  GLOBL Comb_matrix_mul_high_nibble<>(SB), RODATA, $16
    51  
    52  DATA Shuf_mask<>+0x00(SB)/8, $0x0B0E0104070A0D00
    53  DATA Shuf_mask<>+0x08(SB)/8, $0x0306090C0F020508
    54  GLOBL Shuf_mask<>(SB), RODATA, $16
    55  
    56  DATA Cancel_aes<>+0x00(SB)/8, $0x6363636363636363
    57  DATA Cancel_aes<>+0x08(SB)/8, $0x6363636363636363
    58  GLOBL Cancel_aes<>(SB), RODATA, $16
    59  
    60  DATA CombMatrix<>+0x00(SB)/8, $0x3C1A99B2AD1ED43A
    61  DATA CombMatrix<>+0x08(SB)/8, $0x3C1A99B2AD1ED43A
    62  GLOBL CombMatrix<>(SB), RODATA, $16
    63  
    64  DATA mask_S0<>+0x00(SB)/8, $0xff00ff00ff00ff00
    65  DATA mask_S0<>+0x08(SB)/8, $0xff00ff00ff00ff00
    66  GLOBL mask_S0<>(SB), RODATA, $16
    67  
    68  DATA mask_S1<>+0x00(SB)/8, $0x00ff00ff00ff00ff
    69  DATA mask_S1<>+0x08(SB)/8, $0x00ff00ff00ff00ff
    70  GLOBL mask_S1<>(SB), RODATA, $16
    71  
    72  // shuffle byte order from LE to BE
    73  DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
    74  DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
    75  GLOBL flip_mask<>(SB), RODATA, $16
    76  
    77  #define OFFSET_FR1      (16*4)
    78  #define OFFSET_FR2      (17*4)
    79  #define OFFSET_BRC_X0   (18*4)
    80  #define OFFSET_BRC_X1   (19*4)
    81  #define OFFSET_BRC_X2   (20*4)
    82  #define OFFSET_BRC_X3   (21*4)
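         // The zucState32 layout assumed here: sixteen 32-bit LFSR words at offset
         // 0, followed by F.R1, F.R2 and the bit-reorganization words X0..X3,
         // which gives the byte offsets above. SI holds the state pointer.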
    83  
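         // SHLDL(a, b, n) emulates the x86 SHLD instruction, which the Go
         // assembler does not provide. Note that (a << n) | (b >> n) matches the
         // real SHLD result (a << n) | (b >> (32-n)) only for n == 16, the only
         // count used in this file; the macro also clobbers b.
         // Illustrative Go equivalent (assuming n == 16):
         //   func shldl16(a, b uint32) uint32 { return a<<16 | b>>16 }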
     84  #define SHLDL(a, b, n) \  // the Go assembler has no SHLDL, so emulate it
    85  	SHLL n, a          \
    86  	SHRL n, b          \  
    87  	ORL  b, a
    88  
    89  // Rotate left 5 bits in each byte, within an XMM register, SSE version.
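         // SSE has no per-byte rotate, so it is built from two 32-bit shifts whose
         // cross-byte bits are masked off before recombining. Scalar reference
         // (illustrative Go): rotl5(b) = b<<5 | b>>3 for a byte b.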
    90  #define Rotl_5_SSE(XDATA, XTMP0)               \
    91  	MOVOU XDATA, XTMP0                         \
    92  	PSLLL $5, XTMP0                            \
    93  	PSRLL $3, XDATA                            \
    94  	PAND Top3_bits_of_the_byte<>(SB), XTMP0    \
    95  	PAND Bottom5_bits_of_the_byte<>(SB), XDATA \
    96  	POR XTMP0, XDATA
    97  
    98  // Compute 16 S0 box values from 16 bytes, SSE version.
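         // S0 is evaluated through its nibble decomposition, using P1/P2/P3 above
         // as 16-entry PSHUFB lookup tables. Per-byte reference sketch
         // (illustrative Go, mirroring the steps below):
         //   x1, x2 := x>>4, x&0x0F
         //   q := x1 ^ P1[x2]
         //   r := x2 ^ P2[q]
         //   s := q ^ P3[r]
         //   t := s<<4 | r
         //   return t<<5 | t>>3 // Rotl_5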
    99  #define S0_comput_SSE(IN_OUT, XTMP1, XTMP2)    \
   100  	MOVOU IN_OUT, XTMP1                        \
   101  	\
   102  	PAND Low_nibble_mask<>(SB), IN_OUT         \  // x2
   103  	\
   104  	PAND High_nibble_mask<>(SB), XTMP1         \ 
   105  	PSRLQ $4, XTMP1                            \  // x1
   106  	\
   107  	MOVOU P1<>(SB), XTMP2                      \
   108  	PSHUFB IN_OUT, XTMP2                       \ // P1[x2]
   109  	PXOR XTMP1, XTMP2                          \ // q = x1 ^ P1[x2], XTMP1 free
   110  	\
   111  	MOVOU P2<>(SB), XTMP1                      \
   112  	PSHUFB XTMP2, XTMP1                        \ // P2[q]
   113  	PXOR IN_OUT, XTMP1                         \ // r = x2 ^ P2[q]; IN_OUT free
   114  	\
   115  	MOVOU P3<>(SB), IN_OUT                     \
   116  	PSHUFB XTMP1, IN_OUT                       \ // P3[r]
   117  	PXOR XTMP2, IN_OUT                         \ // s = q ^ P3[r], XTMP2 free
   118  	\ // s << 4 (since high nibble of each byte is 0, no masking is required)
   119  	PSLLQ $4, IN_OUT                           \
   120  	POR XTMP1, IN_OUT                          \ // t = (s << 4) | r
   121  	Rotl_5_SSE(IN_OUT, XTMP1)
   122  
   123  // Perform 8x8 matrix multiplication using lookup tables with partial results
    124  // for high and low nibble of each input byte, SSE version.
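         // Each byte x becomes lo[x & 0x0F] ^ hi[x >> 4], where lo and hi are the
         // 16-entry tables passed in XLO and XHI_OUT; this evaluates a constant
         // 8x8 GF(2) matrix product one nibble at a time.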
   125  #define MUL_PSHUFB_SSE(XIN, XLO, XHI_OUT, XTMP)        \
   126  	\ // Get low nibble of input data
   127  	MOVOU Low_nibble_mask<>(SB), XTMP                  \
   128  	PAND XIN, XTMP                                     \
   129  	\ // Get low nibble of output
   130  	PSHUFB XTMP, XLO                                   \
   131  	\ // Get high nibble of input data
   132  	MOVOU High_nibble_mask<>(SB), XTMP                 \
   133  	PAND XIN, XTMP                                     \
   134  	PSRLQ $4, XTMP                                     \
   135  	\ // Get high nibble of output
   136  	PSHUFB XTMP, XHI_OUT                               \
   137  	\ // XOR high and low nibbles to get full bytes
   138  	PXOR XLO, XHI_OUT
   139  
   140  // Compute 16 S1 box values from 16 bytes, stored in XMM register
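         // ZUC's S1 and the AES S-box are both based on inversion in GF(2^8), so
         // S1 is computed by changing basis with the Aes_to_Zuc_mul_* tables,
         // using AESENCLAST for the inversion step (Shuf_mask pre-compensates
         // ShiftRows, Cancel_aes cancels the 0x63 constant), and folding the
         // remaining affine factors into the Comb_matrix_mul_* tables. The
         // commented gf2p8affineqb lines in the AVX version show the GFNI
         // equivalent of the two table-based multiplications.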
   141  #define S1_comput_SSE(XIN_OUT, XTMP1, XTMP2, XTMP3)    \
   142  	MOVOU Aes_to_Zuc_mul_low_nibble<>(SB), XTMP1       \
   143  	MOVOU Aes_to_Zuc_mul_high_nibble<>(SB), XTMP2      \
   144  	MUL_PSHUFB_SSE(XIN_OUT, XTMP1, XTMP2, XTMP3)       \
   145  	\
   146  	PSHUFB Shuf_mask<>(SB), XTMP2                      \
   147  	AESENCLAST Cancel_aes<>(SB), XTMP2                 \
   148  	\
   149  	MOVOU Comb_matrix_mul_low_nibble<>(SB), XTMP1       \
   150  	MOVOU Comb_matrix_mul_high_nibble<>(SB), XIN_OUT    \
   151  	MUL_PSHUFB_SSE(XTMP2, XTMP1, XIN_OUT, XTMP3)
   152  
   153  // Rotate left 5 bits in each byte, within an XMM register, AVX version.
   154  #define Rotl_5_AVX(XDATA, XTMP0)                       \
   155  	VPSLLD $5, XDATA, XTMP0                            \
   156  	VPSRLD $3, XDATA, XDATA                            \
   157  	VPAND Top3_bits_of_the_byte<>(SB), XTMP0, XTMP0    \
   158  	VPAND Bottom5_bits_of_the_byte<>(SB), XDATA, XDATA \
   159  	VPOR XTMP0, XDATA, XDATA
   160  
   161  // Compute 16 S0 box values from 16 bytes, AVX version.
   162  #define S0_comput_AVX(IN_OUT, XTMP1, XTMP2)      \
   163  	VPAND High_nibble_mask<>(SB), IN_OUT, XTMP1  \
   164  	VPSRLQ $4, XTMP1, XTMP1                      \ // x1
   165  	\
   166  	VPAND Low_nibble_mask<>(SB), IN_OUT, IN_OUT  \ // x2
   167  	\
   168  	VMOVDQU P1<>(SB), XTMP2                      \
   169  	VPSHUFB IN_OUT, XTMP2, XTMP2                 \ // P1[x2]
   170  	VPXOR XTMP1, XTMP2, XTMP2                    \ // q = x1 ^ P1[x2] ; XTMP1 free
   171  	\
   172  	VMOVDQU P2<>(SB), XTMP1                      \
   173  	VPSHUFB XTMP2, XTMP1, XTMP1                  \ // P2[q]
   174  	VPXOR IN_OUT, XTMP1, XTMP1                   \ // r = x2 ^ P2[q] ; IN_OUT free
   175  	\
   176  	VMOVDQU P3<>(SB), IN_OUT                     \
   177  	VPSHUFB XTMP1, IN_OUT, IN_OUT                \ // P3[r]
   178  	VPXOR XTMP2, IN_OUT, IN_OUT                  \ // s = q ^ P3[r] ; XTMP2 free
   179  	\ // s << 4 (since high nibble of each byte is 0, no masking is required)
   180  	VPSLLQ $4, IN_OUT, IN_OUT                    \
   181  	VPOR XTMP1, IN_OUT, IN_OUT                   \ // t = (s << 4) | r
   182  	Rotl_5_AVX(IN_OUT, XTMP1)
   183  
   184  // Perform 8x8 matrix multiplication using lookup tables with partial results
    185  // for high and low nibble of each input byte, AVX version.
   186  #define MUL_PSHUFB_AVX(XIN, XLO, XHI_OUT, XTMP)        \
   187  	\ // Get low nibble of input data
   188  	VPAND Low_nibble_mask<>(SB), XIN, XTMP             \
   189  	\ // Get low nibble of output
   190  	VPSHUFB XTMP, XLO, XLO                             \
   191  	\ // Get high nibble of input data
   192  	VPAND High_nibble_mask<>(SB), XIN, XTMP            \
   193  	VPSRLQ $4, XTMP, XTMP                              \
   194  	\ // Get high nibble of output
   195  	VPSHUFB XTMP, XHI_OUT, XHI_OUT                     \
   196  	\ // XOR high and low nibbles to get full bytes
   197  	VPXOR XLO, XHI_OUT, XHI_OUT
   198  
   199  // Compute 16 S1 box values from 16 bytes, stored in XMM register
   200  #define S1_comput_AVX(XIN_OUT, XTMP1, XTMP2, XTMP3)       \
   201  	\ // gf2p8affineqb  XIN_OUT, [rel Aes_to_Zuc], 0x00
   202  	VMOVDQU Aes_to_Zuc_mul_low_nibble<>(SB), XTMP1        \
   203  	VMOVDQU Aes_to_Zuc_mul_high_nibble<>(SB), XTMP2       \
   204  	MUL_PSHUFB_AVX(XIN_OUT, XTMP1, XTMP2, XTMP3)          \
   205  	\
   206  	VPSHUFB Shuf_mask<>(SB), XTMP2, XTMP2                 \
   207  	VAESENCLAST Cancel_aes<>(SB), XTMP2, XTMP2            \
   208  	\ // gf2p8affineqb  XIN_OUT, [rel CombMatrix], 0x55
   209  	VMOVDQU Comb_matrix_mul_low_nibble<>(SB), XTMP1       \
   210  	VMOVDQU Comb_matrix_mul_high_nibble<>(SB), XIN_OUT    \
   211  	MUL_PSHUFB_AVX(XTMP2, XTMP1, XIN_OUT, XTMP3)
   212  
   213  #define F_R1 R9
   214  #define F_R2 R10
   215  #define BRC_X0 R11
   216  #define BRC_X1 R12
   217  #define BRC_X2 R13
   218  #define BRC_X3 R14
   219  
   220  // BITS_REORG(idx)
   221  //
   222  // params
    223  //      idx - round number
   224  // uses
   225  //      AX, BX, CX, DX
    226  // returns
    227  //      updated BRC_X0, BRC_X1, BRC_X2, BRC_X3 (R11, R12, R13, R14)
   228  //
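         // Reference sketch (illustrative Go; the LFSR holds sixteen 31-bit words
         // in uint32 slots, indices shown for idx == 0, each offset by idx mod 16
         // otherwise):
         //   x0 := (lfsr[15]>>15)<<16 | lfsr[14]&0xFFFF
         //   x1 := (lfsr[11]&0xFFFF)<<16 | lfsr[9]>>15
         //   x2 := (lfsr[7]&0xFFFF)<<16 | lfsr[5]>>15
         //   x3 := (lfsr[2]&0xFFFF)<<16 | lfsr[0]>>15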
   229  #define BITS_REORG(idx)                      \
   230  	MOVL (((15 + idx) % 16)*4)(SI), BRC_X0   \
   231  	MOVL (((14 + idx) % 16)*4)(SI), AX       \
   232  	MOVL (((11 + idx) % 16)*4)(SI), BRC_X1   \
   233  	MOVL (((9 + idx) % 16)*4)(SI), BX        \
   234  	MOVL (((7 + idx) % 16)*4)(SI), BRC_X2    \ 
   235  	MOVL (((5 + idx) % 16)*4)(SI), CX        \
   236  	MOVL (((2 + idx) % 16)*4)(SI), BRC_X3    \
   237  	MOVL (((0 + idx) % 16)*4)(SI), DX        \
   238  	SHRL $15, BRC_X0                         \
   239  	SHLL $16, AX                             \
   240  	SHLL $1, BX                              \
   241  	SHLL $1, CX                              \
   242  	SHLL $1, DX                              \
   243  	SHLDL(BRC_X0, AX, $16)                   \
   244  	SHLDL(BRC_X1, BX, $16)                   \
   245  	SHLDL(BRC_X2, CX, $16)                   \
   246  	SHLDL(BRC_X3, DX, $16)                      
   247  
    248  // LFSR_UPDT calculates the next state word and stores it into lfsr[idx % 16], overwriting the old value.
   249  // 
   250  // params
    251  //      idx - round number
   252  // uses
   253  //      AX as input (ZERO or W), BX, CX, DX, R8
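         // Reference sketch of the feedback (illustrative Go; w is the value passed
         // in through AX, zero in work mode; indices shown for idx == 0):
         //   v := uint64(w) + uint64(s[0]) + uint64(s[0])<<8 + uint64(s[4])<<20 +
         //        uint64(s[10])<<21 + uint64(s[13])<<17 + uint64(s[15])<<15
         //   v = (v & 0x7FFFFFFF) + (v >> 31) // fold modulo 2^31 - 1
         //   if v >= 0x7FFFFFFF {
         //       v -= 0x7FFFFFFF
         //   }
         //   s[idx%16] = uint32(v)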
   254  #define LFSR_UPDT(idx)                       \
   255  	MOVL (((0 + idx) % 16)*4)(SI), BX        \
   256  	MOVL (((4 + idx) % 16)*4)(SI), CX        \
   257  	MOVL (((10 + idx) % 16)*4)(SI), DX       \
   258  	MOVL (((13 + idx) % 16)*4)(SI), R8       \
   259  	\ // Calculate 64-bit LFSR feedback
   260  	ADDQ BX, AX                              \
   261  	SHLQ $8, BX                              \
   262  	SHLQ $20, CX                             \
   263  	SHLQ $21, DX                             \
   264  	SHLQ $17, R8                             \
   265  	ADDQ BX, AX                              \
   266  	ADDQ CX, AX                              \
   267  	ADDQ DX, AX                              \
   268  	ADDQ R8, AX                              \
   269  	MOVL (((15 + idx) % 16)*4)(SI), R8       \
   270  	SHLQ $15, R8                             \
   271  	ADDQ R8, AX                              \
   272  	\ // Reduce it to 31-bit value
   273  	MOVQ AX, BX                              \
   274  	ANDQ $0x7FFFFFFF, AX                     \
   275  	SHRQ $31, BX                             \
   276  	ADDQ BX, AX                              \
   277  	\
   278  	MOVQ AX, BX                              \
   279  	SUBQ $0x7FFFFFFF, AX                     \
   280  	CMOVQCS BX, AX                           \
    281  	\ // the new word s16 replaces lfsr[(0 + idx) % 16]
   282  	MOVL AX, (((0 + idx) % 16)*4)(SI)
   283  
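         // NONLIN_FUN is the scalar half of the non-linear function F shared by
         // the SSE and AVX paths: it leaves W in AX and (V << 32) | U in DX, where
         // U = L1(P) and V = L2(Q); the wrappers below apply the S0/S1 byte
         // substitution to U and V and store the results in F_R1/F_R2.
         // Reference sketch (illustrative Go, rotl = math/bits.RotateLeft32):
         //   w := (x0 ^ r1) + r2
         //   w1, w2 := r1+x1, r2^x2
         //   l1 := func(x uint32) uint32 { return x ^ rotl(x, 2) ^ rotl(x, 10) ^ rotl(x, 18) ^ rotl(x, 24) }
         //   l2 := func(x uint32) uint32 { return x ^ rotl(x, 8) ^ rotl(x, 14) ^ rotl(x, 22) ^ rotl(x, 30) }
         //   u, v := l1(w1<<16|w2>>16), l2(w2<<16|w1>>16)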
   284  #define NONLIN_FUN                           \
   285  	MOVL BRC_X0, AX                          \
    286  	XORL F_R1, AX                            \ // F_R1 xor BRC_X0
    287  	ADDL F_R2, AX                            \ // W = (F_R1 xor BRC_X0) + F_R2
   288  	ADDL BRC_X1, F_R1                        \ // W1= F_R1 + BRC_X1
   289  	XORL BRC_X2, F_R2                        \ // W2= F_R2 ^ BRC_X2
   290  	\
   291  	MOVL F_R1, DX                            \
   292  	MOVL F_R2, CX                            \
   293  	SHLDL(DX, CX, $16)                       \ // P = (W1 << 16) | (W2 >> 16)
   294  	SHLDL(F_R2, F_R1, $16)                   \ // Q = (W2 << 16) | (W1 >> 16)
   295  	MOVL DX, BX                              \ // start L1 
   296  	MOVL DX, CX                              \
   297  	ROLL $2, BX                              \
   298  	ROLL $24, CX                             \
   299  	XORL CX, DX                              \
   300  	XORL BX, DX                              \
   301  	ROLL $8, BX                              \
   302  	XORL BX, DX                              \
   303  	ROLL $8, BX                              \
    304  	XORL BX, DX                              \ // U = L1(P) in DX, upper half of RDX is 0
   305  	MOVL F_R2, BX                            \  
   306  	MOVL F_R2, CX                            \
   307  	ROLL $8, BX                              \
   308  	XORL BX, F_R2                            \
   309  	ROLL $14, CX                             \
   310  	XORL CX, F_R2                            \
   311  	ROLL $8, CX                              \
   312  	XORL CX, F_R2                            \
   313  	ROLL $8, CX                              \
    314  	XORL CX, F_R2                            \ // V = L2(Q) in F_R2 (R10), upper half is 0
   315  	SHLQ $32, F_R2                           \ // DX = V || U
   316  	XORQ F_R2, DX                             
   317  
   318  // Non-Linear function F, SSE version.
   319  // uses
   320  //      AX, BX, CX, DX, R8
   321  //      X0, X1, X2, X3, X4
   322  // return 
   323  //      W in AX
   324  //      updated F_R1, F_R2  
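         // DX from NONLIN_FUN holds V||U. Both S-boxes are computed over the same
         // eight bytes, then mask_S0/mask_S1 interleave the two results so the
         // bytes of each 32-bit word alternate between S0 and S1, as the ZUC S-box
         // S = (S0, S1, S0, S1) requires; F_R1 and F_R2 are the low and second
         // 32-bit lanes of X0.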
   325  #define NONLIN_FUN_SSE                       \
   326  	NONLIN_FUN                               \
   327  	MOVQ DX, X0                              \
   328  	MOVOU X0, X1                             \ 
   329  	S0_comput_SSE(X1, X2, X3)                \
   330  	S1_comput_SSE(X0, X2, X3, X4)            \
   331  	\
   332  	PAND mask_S1<>(SB), X0                   \
   333  	PAND mask_S0<>(SB), X1                   \ 
   334  	PXOR X1, X0                              \ 
   335  	\
   336  	MOVL X0, F_R1                            \ // F_R1
   337  	PEXTRD $1, X0, F_R2
   338  
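         // LFSR_UPDT writes the new word into lfsr[idx % 16] instead of shifting
         // the whole array, so after n rounds the stored state is rotated by n
         // words. A full 16-round block realigns itself; the shorter tails use the
         // RESTORE_LFSR_{0,2,4,8} macros below to rotate the array back by 1, 2,
         // 4 or 8 words (4, 8, 16 or 32 bytes).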
    339  // RESTORE_LFSR_0 moves the first 4 bytes (one word) of the LFSR array to the end.
   340  #define RESTORE_LFSR_0                       \
   341  	MOVL (0*4)(SI), AX                       \ // first 4-bytes
   342  	MOVUPS (4)(SI), X0                       \ 
   343  	MOVUPS (20)(SI), X1                      \ 
   344  	MOVUPS (36)(SI), X2                      \
   345  	MOVQ (52)(SI), BX                        \
   346  	MOVL (60)(SI), CX                        \ // last 4-bytes
   347  	\
   348  	MOVUPS X0, (SI)                          \  
   349  	MOVUPS X1, (16)(SI)                      \  
   350  	MOVUPS X2, (32)(SI)                      \
   351  	MOVQ BX, (48)(SI)                        \
   352  	MOVL CX, (56)(SI)                        \
   353  	MOVL AX, (60)(SI) 
   354  
    355  // RESTORE_LFSR_2 moves the first 8 bytes (two words) of the LFSR array to the end.
   356  #define RESTORE_LFSR_2                       \
   357  	MOVQ (0)(SI), AX                         \ // first 8-bytes
   358  	MOVUPS (8)(SI), X0                       \ 
   359  	MOVUPS (24)(SI), X1                      \ 
   360  	MOVUPS (40)(SI), X2                      \
   361  	MOVQ (56)(SI), BX                        \ // last 8-bytes
   362  	\
   363  	MOVUPS X0, (SI)                          \  
   364  	MOVUPS X1, (16)(SI)                      \  
   365  	MOVUPS X2, (32)(SI)                      \
   366  	MOVQ BX, (48)(SI)                        \
   367  	MOVQ AX, (56)(SI)
   368  
    369  // RESTORE_LFSR_4 moves the first 16 bytes (four words) of the LFSR array to the end.
   370  #define RESTORE_LFSR_4                       \
   371  	MOVUPS (0)(SI), X0                       \ // first 16 bytes
   372  	MOVUPS (16)(SI), X1                      \
   373  	MOVUPS (32)(SI), X2                      \
   374  	MOVUPS (48)(SI), X3                      \ // last 16 bytes
   375  	\
   376  	MOVUPS X1, (0)(SI)                       \
   377  	MOVUPS X2, (16)(SI)                      \
   378  	MOVUPS X3, (32)(SI)                      \
   379  	MOVUPS X0, (48)(SI)
   380  
    381  // RESTORE_LFSR_8 moves the first 32 bytes (eight words) of the LFSR array to the end.
   382  #define RESTORE_LFSR_8                       \
   383  	MOVUPS (0)(SI), X0                       \
   384  	MOVUPS (16)(SI), X1                      \
   385  	MOVUPS (32)(SI), X2                      \
   386  	MOVUPS (48)(SI), X3                      \
   387  	\
   388  	MOVUPS X2, (0)(SI)                       \
   389  	MOVUPS X3, (16)(SI)                      \
   390  	MOVUPS X0, (32)(SI)                      \
   391  	MOVUPS X1, (48)(SI)
   392  
   393  // Non-Linear function F, AVX version.
   394  // uses
   395  //      AX, BX, CX, DX, R8
   396  //      X0, X1, X2, X3, X4
   397  // return 
   398  //      W in AX
   399  //      updated F_R1, F_R2
   400  #define NONLIN_FUN_AVX                       \
   401  	NONLIN_FUN                               \
   402  	VMOVQ DX, X0                             \
   403  	VMOVDQA X0, X1                           \ 
   404  	S0_comput_AVX(X1, X2, X3)                \
   405  	S1_comput_AVX(X0, X2, X3, X4)            \
   406  	\
   407  	VPAND mask_S1<>(SB), X0, X0              \
   408  	VPAND mask_S0<>(SB), X1, X1              \ 
   409  	VPXOR X1, X0, X0                         \ 
   410  	\
   411  	MOVL X0, F_R1                            \ // F_R1
   412  	VPEXTRD $1, X0, F_R2   
   413  
   414  #define LOAD_STATE                           \
   415  	MOVL OFFSET_FR1(SI), F_R1                \
   416  	MOVL OFFSET_FR2(SI), F_R2                \
   417  	MOVL OFFSET_BRC_X0(SI), BRC_X0           \
   418  	MOVL OFFSET_BRC_X1(SI), BRC_X1           \
   419  	MOVL OFFSET_BRC_X2(SI), BRC_X2           \
   420  	MOVL OFFSET_BRC_X3(SI), BRC_X3
   421  
   422  #define SAVE_STATE                           \
   423  	MOVL F_R1, OFFSET_FR1(SI)                \
   424  	MOVL F_R2, OFFSET_FR2(SI)                \
   425  	MOVL BRC_X0, OFFSET_BRC_X0(SI)           \
   426  	MOVL BRC_X1, OFFSET_BRC_X1(SI)           \
   427  	MOVL BRC_X2, OFFSET_BRC_X2(SI)           \
   428  	MOVL BRC_X3, OFFSET_BRC_X3(SI)
   429  
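         // genKeywordAsm produces one keystream word. Sketch of a single step
         // (illustrative Go; helper names are placeholders, not the package API):
         //   bitsReorg(s, 0)
         //   z := x3 ^ nonlinF(s) // Z = W xor BRC_X3
         //   lfsrUpdate(s, 0, 0)  // work mode: feedback input w = 0
         //   return z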
   430  // func genKeywordAsm(s *zucState32) uint32
   431  TEXT ·genKeywordAsm(SB),NOSPLIT,$0
   432  	MOVQ pState+0(FP), SI
   433  
   434  	LOAD_STATE
   435  
   436  	BITS_REORG(0)
   437  	CMPB ·useAVX(SB), $1
   438  	JE   avx
   439  
   440  sse:
   441  	NONLIN_FUN_SSE
   442  
   443  	// (BRC_X3 xor W) as result
   444  	XORL BRC_X3, AX
   445  	MOVL AX, ret+8(FP)
   446  
   447  	// LFSRWithWorkMode
   448  	XORQ AX, AX
   449  	LFSR_UPDT(0)
   450  
   451  	SAVE_STATE
   452  	RESTORE_LFSR_0
   453  
   454  	RET
   455  
   456  avx:
   457  	NONLIN_FUN_AVX
   458  
   459  	// (BRC_X3 xor W) as result
   460  	XORL BRC_X3, AX
   461  	MOVL AX, ret+8(FP)
   462  
   463  	// LFSRWithWorkMode
   464  	XORQ AX, AX
   465  	LFSR_UPDT(0)
   466  
   467  	SAVE_STATE
   468  	RESTORE_LFSR_0
   469  
   470  	RET
   471  
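         // One keystream round: bit-reorganization, the non-linear function F
         // (W left in AX), output Z = W xor BRC_X3 stored to keyStream[idx], then
         // a work-mode LFSR update with the feedback input zeroed.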
   472  #define ROUND_SSE(idx)            \
   473  	BITS_REORG(idx)               \
   474  	NONLIN_FUN_SSE                \
   475  	XORL BRC_X3, AX               \
   476  	MOVL AX, (idx*4)(DI)          \
   477  	XORQ AX, AX                   \
   478  	LFSR_UPDT(idx)
   479  
   480  #define ROUND_AVX(idx)            \
   481  	BITS_REORG(idx)               \
   482  	NONLIN_FUN_AVX                \
   483  	XORL BRC_X3, AX               \
   484  	MOVL AX, (idx*4)(DI)          \
   485  	XORQ AX, AX                   \
   486  	LFSR_UPDT(idx)
   487  
   488  #define ROUND_REV32_SSE(idx)      \
   489  	BITS_REORG(idx)               \
   490  	NONLIN_FUN_SSE                \
   491  	XORL BRC_X3, AX               \
   492  	BSWAPL AX                     \
   493  	MOVL AX, (idx*4)(DI)          \
   494  	XORQ AX, AX                   \
   495  	LFSR_UPDT(idx)
   496  
   497  #define ROUND_REV32_AVX(idx)      \
   498  	BITS_REORG(idx)               \
   499  	NONLIN_FUN_AVX                \
   500  	XORL BRC_X3, AX               \
   501  	BSWAPL AX                     \
   502  	MOVL AX, (idx*4)(DI)          \
   503  	XORQ AX, AX                   \
   504  	LFSR_UPDT(idx)
   505  
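         // genKeyStreamAsm fills the keyStream slice with ks_len words. The loop
         // is unrolled in blocks of 16, 8, 4, 2 and 1 rounds: a full 16-round
         // block leaves the LFSR array aligned, while the shorter tail blocks
         // rotate it back with RESTORE_LFSR_{8,4,2,0}.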
   506  // func genKeyStreamAsm(keyStream []uint32, pState *zucState32)
   507  TEXT ·genKeyStreamAsm(SB),NOSPLIT,$0
   508  	MOVQ ks+0(FP), DI
   509  	MOVQ ks_len+8(FP), BP
   510  	MOVQ pState+24(FP), SI
   511  
   512  	LOAD_STATE
   513  
   514  	CMPB ·useAVX(SB), $1
   515  	JE   avxZucSixteens
   516  
   517  sseZucSixteens:
   518  	CMPQ BP, $16
   519  	JB sseZucOctet
   520  	SUBQ $16, BP
   521  	ROUND_SSE(0)
   522  	ROUND_SSE(1)
   523  	ROUND_SSE(2)
   524  	ROUND_SSE(3)
   525  	ROUND_SSE(4)
   526  	ROUND_SSE(5)
   527  	ROUND_SSE(6)
   528  	ROUND_SSE(7)
   529  	ROUND_SSE(8)
   530  	ROUND_SSE(9)
   531  	ROUND_SSE(10)
   532  	ROUND_SSE(11)
   533  	ROUND_SSE(12)
   534  	ROUND_SSE(13)
   535  	ROUND_SSE(14)
   536  	ROUND_SSE(15)
   537  	LEAQ 64(DI), DI
   538  	JMP sseZucSixteens
   539  
   540  sseZucOctet:
   541  	CMPQ BP, $8
   542  	JB sseZucNibble
   543  	SUBQ $8, BP
   544  	ROUND_SSE(0)
   545  	ROUND_SSE(1)
   546  	ROUND_SSE(2)
   547  	ROUND_SSE(3)
   548  	ROUND_SSE(4)
   549  	ROUND_SSE(5)
   550  	ROUND_SSE(6)
   551  	ROUND_SSE(7)
   552  	LEAQ 32(DI), DI
   553  	RESTORE_LFSR_8
   554  
   555  sseZucNibble:
   556  	CMPQ BP, $4
   557  	JB sseZucDouble
   558  	SUBQ $4, BP
   559  	ROUND_SSE(0)
   560  	ROUND_SSE(1)
   561  	ROUND_SSE(2)
   562  	ROUND_SSE(3)
   563  	LEAQ 16(DI), DI
   564  	RESTORE_LFSR_4
   565  
   566  sseZucDouble:
   567  	CMPQ BP, $2
   568  	JB sseZucSingle
   569  	SUBQ $2, BP
   570  	ROUND_SSE(0)
   571  	ROUND_SSE(1)
   572  	LEAQ 8(DI), DI
   573  	RESTORE_LFSR_2
   574  
   575  sseZucSingle:
   576  	TESTQ BP, BP
   577  	JE sseZucRet
   578  	ROUND_SSE(0)
   579  	RESTORE_LFSR_0
   580  
   581  sseZucRet:
   582  	SAVE_STATE
   583  	RET
   584  
   585  avxZucSixteens:
   586  	CMPQ BP, $16
   587  	JB avxZucOctet
   588  	SUBQ $16, BP
   589  	ROUND_AVX(0)
   590  	ROUND_AVX(1)
   591  	ROUND_AVX(2)
   592  	ROUND_AVX(3)
   593  	ROUND_AVX(4)
   594  	ROUND_AVX(5)
   595  	ROUND_AVX(6)
   596  	ROUND_AVX(7)
   597  	ROUND_AVX(8)
   598  	ROUND_AVX(9)
   599  	ROUND_AVX(10)
   600  	ROUND_AVX(11)
   601  	ROUND_AVX(12)
   602  	ROUND_AVX(13)
   603  	ROUND_AVX(14)
   604  	ROUND_AVX(15)
   605  	LEAQ 64(DI), DI
   606  	JMP avxZucSixteens
   607  
   608  avxZucOctet:
   609  	CMPQ BP, $8
   610  	JB avxZucNibble
   611  	SUBQ $8, BP
   612  	ROUND_AVX(0)
   613  	ROUND_AVX(1)
   614  	ROUND_AVX(2)
   615  	ROUND_AVX(3)
   616  	ROUND_AVX(4)
   617  	ROUND_AVX(5)
   618  	ROUND_AVX(6)
   619  	ROUND_AVX(7)
   620  	LEAQ 32(DI), DI
   621  	RESTORE_LFSR_8
   622  
   623  avxZucNibble:
   624  	CMPQ BP, $4
   625  	JB avxZucDouble
   626  	SUBQ $4, BP
   627  	ROUND_AVX(0)
   628  	ROUND_AVX(1)
   629  	ROUND_AVX(2)
   630  	ROUND_AVX(3)
   631  	LEAQ 16(DI), DI
   632  	RESTORE_LFSR_4
   633  
   634  avxZucDouble:
   635  	CMPQ BP, $2
   636  	JB avxZucSingle
   637  	SUBQ $2, BP
   638  	ROUND_AVX(0)
   639  	ROUND_AVX(1)
   640  	LEAQ 8(DI), DI
   641  	RESTORE_LFSR_2
   642  
   643  avxZucSingle:
   644  	TESTQ BP, BP
   645  	JE avxZucRet
   646  	ROUND_AVX(0)
   647  	RESTORE_LFSR_0
   648  
   649  avxZucRet:
   650  	SAVE_STATE
   651  	RET
   652  
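         // Same flow as genKeyStreamAsm, but the destination is a byte slice: the
         // length is first converted to a word count (SHRQ $2) and each word is
         // byte-swapped (BSWAPL) so the keystream is written big-endian.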
   653  // func genKeyStreamRev32Asm(keyStream []byte, pState *zucState32)
   654  TEXT ·genKeyStreamRev32Asm(SB),NOSPLIT,$0
   655  	MOVQ ks+0(FP), DI
   656  	MOVQ ks_len+8(FP), BP
   657  	MOVQ pState+24(FP), SI
   658  
   659  	SHRQ $2, BP
   660  
   661  	LOAD_STATE
   662  
   663  	CMPB ·useAVX(SB), $1
   664  	JE   avxZucSixteens
   665  
   666  sseZucSixteens:
   667  	CMPQ BP, $16
   668  	JB sseZucOctet
   669  	SUBQ $16, BP
   670  	ROUND_REV32_SSE(0)
   671  	ROUND_REV32_SSE(1)
   672  	ROUND_REV32_SSE(2)
   673  	ROUND_REV32_SSE(3)
   674  	ROUND_REV32_SSE(4)
   675  	ROUND_REV32_SSE(5)
   676  	ROUND_REV32_SSE(6)
   677  	ROUND_REV32_SSE(7)
   678  	ROUND_REV32_SSE(8)
   679  	ROUND_REV32_SSE(9)
   680  	ROUND_REV32_SSE(10)
   681  	ROUND_REV32_SSE(11)
   682  	ROUND_REV32_SSE(12)
   683  	ROUND_REV32_SSE(13)
   684  	ROUND_REV32_SSE(14)
   685  	ROUND_REV32_SSE(15)
   686  	LEAQ 64(DI), DI
   687  	JMP sseZucSixteens
   688  
   689  sseZucOctet:
   690  	CMPQ BP, $8
   691  	JB sseZucNibble
   692  	SUBQ $8, BP
   693  	ROUND_REV32_SSE(0)
   694  	ROUND_REV32_SSE(1)
   695  	ROUND_REV32_SSE(2)
   696  	ROUND_REV32_SSE(3)
   697  	ROUND_REV32_SSE(4)
   698  	ROUND_REV32_SSE(5)
   699  	ROUND_REV32_SSE(6)
   700  	ROUND_REV32_SSE(7)
   701  	LEAQ 32(DI), DI
   702  	RESTORE_LFSR_8
   703  
   704  sseZucNibble:
   705  	CMPQ BP, $4
   706  	JB sseZucDouble
   707  	SUBQ $4, BP
   708  	ROUND_REV32_SSE(0)
   709  	ROUND_REV32_SSE(1)
   710  	ROUND_REV32_SSE(2)
   711  	ROUND_REV32_SSE(3)
   712  	LEAQ 16(DI), DI
   713  	RESTORE_LFSR_4
   714  
   715  sseZucDouble:
   716  	CMPQ BP, $2
   717  	JB sseZucSingle
   718  	SUBQ $2, BP
   719  	ROUND_REV32_SSE(0)
   720  	ROUND_REV32_SSE(1)
   721  	LEAQ 8(DI), DI
   722  	RESTORE_LFSR_2
   723  
   724  sseZucSingle:
   725  	TESTQ BP, BP
   726  	JE sseZucRet
   727  	ROUND_REV32_SSE(0)
   728  	RESTORE_LFSR_0
   729  
   730  sseZucRet:
   731  	SAVE_STATE
   732  	RET
   733  
   734  avxZucSixteens:
   735  	CMPQ BP, $16
   736  	JB avxZucOctet
   737  	SUBQ $16, BP
   738  	ROUND_REV32_AVX(0)
   739  	ROUND_REV32_AVX(1)
   740  	ROUND_REV32_AVX(2)
   741  	ROUND_REV32_AVX(3)
   742  	ROUND_REV32_AVX(4)
   743  	ROUND_REV32_AVX(5)
   744  	ROUND_REV32_AVX(6)
   745  	ROUND_REV32_AVX(7)
   746  	ROUND_REV32_AVX(8)
   747  	ROUND_REV32_AVX(9)
   748  	ROUND_REV32_AVX(10)
   749  	ROUND_REV32_AVX(11)
   750  	ROUND_REV32_AVX(12)
   751  	ROUND_REV32_AVX(13)
   752  	ROUND_REV32_AVX(14)
   753  	ROUND_REV32_AVX(15)
   754  	LEAQ 64(DI), DI
   755  	JMP avxZucSixteens
   756  
   757  avxZucOctet:
   758  	CMPQ BP, $8
   759  	JB avxZucNibble
   760  	SUBQ $8, BP
   761  	ROUND_REV32_AVX(0)
   762  	ROUND_REV32_AVX(1)
   763  	ROUND_REV32_AVX(2)
   764  	ROUND_REV32_AVX(3)
   765  	ROUND_REV32_AVX(4)
   766  	ROUND_REV32_AVX(5)
   767  	ROUND_REV32_AVX(6)
   768  	ROUND_REV32_AVX(7)
   769  	LEAQ 32(DI), DI
   770  	RESTORE_LFSR_8
   771  
   772  avxZucNibble:
   773  	CMPQ BP, $4
   774  	JB avxZucDouble
   775  	SUBQ $4, BP
   776  	ROUND_REV32_AVX(0)
   777  	ROUND_REV32_AVX(1)
   778  	ROUND_REV32_AVX(2)
   779  	ROUND_REV32_AVX(3)
   780  	LEAQ 16(DI), DI
   781  	RESTORE_LFSR_4
   782  
   783  avxZucDouble:
   784  	CMPQ BP, $2
   785  	JB avxZucSingle
   786  	SUBQ $2, BP
   787  	ROUND_REV32_AVX(0)
   788  	ROUND_REV32_AVX(1)
   789  	LEAQ 8(DI), DI
   790  	RESTORE_LFSR_2
   791  
   792  avxZucSingle:
   793  	TESTQ BP, BP
   794  	JE avxZucRet
   795  	ROUND_REV32_AVX(0)
   796  	RESTORE_LFSR_0
   797  
   798  avxZucRet:
   799  	SAVE_STATE
   800  	RET