github.com/emmansun/gmsm@v0.29.1/zuc/asm_arm64.s (about)

     1  //go:build !purego
     2  
     3  #include "textflag.h"
     4  
     5  DATA Top3_Bottom5_bits_of_the_byte<>+0x00(SB)/8, $0xe0e0e0e0e0e0e0e0 // 0xe0: top-3-bits-of-each-byte mask (used by Rotl_5)
     6  DATA Top3_Bottom5_bits_of_the_byte<>+0x08(SB)/8, $0xe0e0e0e0e0e0e0e0
     7  DATA Top3_Bottom5_bits_of_the_byte<>+0x10(SB)/8, $0x1f1f1f1f1f1f1f1f // 0x1f: bottom-5-bits-of-each-byte mask
     8  DATA Top3_Bottom5_bits_of_the_byte<>+0x18(SB)/8, $0x1f1f1f1f1f1f1f1f
     9  GLOBL Top3_Bottom5_bits_of_the_byte<>(SB), RODATA, $32
    10  
// P1/P2/P3: three 16-entry nibble lookup tables consumed via VTBL by
// S0_comput to evaluate the ZUC S0 sbox four bits at a time.
    11  DATA P123_data<>+0x00(SB)/8, $0x0A020F0F0E000F09
    12  DATA P123_data<>+0x08(SB)/8, $0x090305070C000400
    13  DATA P123_data<>+0x10(SB)/8, $0x040C000705060D08
    14  DATA P123_data<>+0x18(SB)/8, $0x0209030F0A0E010B
    15  DATA P123_data<>+0x20(SB)/8, $0x0F0A0D00060A0602
    16  DATA P123_data<>+0x28(SB)/8, $0x0D0C0900050D0303
    17  GLOBL P123_data<>(SB), RODATA, $48
    18  
    19  // Affine transform 1 & 2 (low and high nibbles)
// m1_2 holds the four tables M1L/M1H/M2L/M2H used by AFFINE_TRANSFORM
// inside S1_comput (one affine layer before and one after the AESE step).
    20  DATA m1_2<>+0x00(SB)/8, $0x1D1C9F9E83820100
    21  DATA m1_2<>+0x08(SB)/8, $0x3938BBBAA7A62524
    22  DATA m1_2<>+0x10(SB)/8, $0xA174A97CDD08D500
    23  DATA m1_2<>+0x18(SB)/8, $0x3DE835E04194499C
    24  DATA m1_2<>+0x20(SB)/8, $0xA8BC0216D9CD7367
    25  DATA m1_2<>+0x28(SB)/8, $0x1F0BB5A16E7AC4D0
    26  DATA m1_2<>+0x30(SB)/8, $0x638CFA1523CCBA55
    27  DATA m1_2<>+0x38(SB)/8, $0x3FD0A6497F90E609
    28  GLOBL m1_2<>(SB), RODATA, $64
    29  
// Byte permutation loaded into INVERSE_SHIFT_ROWS; applied before AESE
// so the ShiftRows step performed inside AESE is cancelled out.
    30  DATA Shuf_mask<>+0x00(SB)/8, $0x0B0E0104070A0D00
    31  DATA Shuf_mask<>+0x08(SB)/8, $0x0306090C0F020508
    32  GLOBL Shuf_mask<>(SB), RODATA, $16
    33  
// S0_MASK/S1_MASK: alternating byte masks used by NONLIN_FUN to
// interleave the S0 and S1 sbox outputs into the final words.
    34  DATA mask_S01<>+0x00(SB)/8, $0xff00ff00ff00ff00
    35  DATA mask_S01<>+0x08(SB)/8, $0xff00ff00ff00ff00
    36  DATA mask_S01<>+0x10(SB)/8, $0x00ff00ff00ff00ff
    37  DATA mask_S01<>+0x18(SB)/8, $0x00ff00ff00ff00ff
    38  GLOBL mask_S01<>(SB), RODATA, $32
    39  
// x86-style register aliases mapped onto ARM64 scratch registers — this
// file mirrors the amd64 implementation (see the SHLDL emulation below).
// SI = state pointer, DI = output pointer, BP = remaining count.
    40  #define SI R0
    41  #define DI R1
    42  #define BP R2
    43  #define AX R3
    44  #define BX R4
    45  #define CX R5
    46  #define DX R6
    47  
// Dedicated NEON registers holding constants; filled once by
// LOAD_GLOBAL_DATA and treated as read-only afterwards.
    48  #define ZERO V16
    49  #define TOP3_BITS V19
    50  #define BOTTOM5_BITS V20
    51  #define NIBBLE_MASK V21
    52  #define INVERSE_SHIFT_ROWS V22
    53  #define M1L V23
    54  #define M1H V24 
    55  #define M2L V25 
    56  #define M2H V26
    57  #define P1 V27
    58  #define P2 V28
    59  #define P3 V29
    60  #define S0_MASK V30
    61  #define S1_MASK V31
    62  
// Byte offsets of the non-LFSR fields of zucState32: the 16 LFSR words
// occupy offsets 0..60, followed by F_R1, F_R2 and BRC_X0..X3.
    63  #define OFFSET_FR1      (16*4)
    64  #define OFFSET_FR2      (17*4)
    65  #define OFFSET_BRC_X0   (18*4)
    66  #define OFFSET_BRC_X1   (19*4)
    67  #define OFFSET_BRC_X2   (20*4)
    68  #define OFFSET_BRC_X3   (21*4)
    69  
// LOAD_GLOBAL_DATA fills the dedicated constant registers: nibble mask,
// Rotl_5 masks, M1/M2 affine tables, P1-P3 sbox tables, S0/S1 merge
// masks and the inverse-ShiftRows shuffle. Clobbers R0 only.
    70  #define LOAD_GLOBAL_DATA() \
    71  	MOVW $0x0F0F0F0F, R0                              \
    72  	VDUP R0, NIBBLE_MASK.S4                           \
    73  	MOVD $Top3_Bottom5_bits_of_the_byte<>(SB), R0     \
    74  	VLD1 (R0), [TOP3_BITS.B16, BOTTOM5_BITS.B16]      \
    75  	MOVD $m1_2<>(SB), R0                              \
    76  	VLD1 (R0), [M1L.B16, M1H.B16, M2L.B16, M2H.B16]   \
    77  	MOVD $P123_data<>(SB), R0                         \
    78  	VLD1 (R0), [P1.B16, P2.B16, P3.B16]               \
    79  	MOVD $mask_S01<>(SB), R0                          \
    80  	VLD1 (R0), [S0_MASK.B16, S1_MASK.B16]             \
    81  	MOVD $Shuf_mask<>(SB), R0                         \
    82  	VLD1 (R0), [INVERSE_SHIFT_ROWS.B16]               \
    83  
// SHLDL(a, b, n): a = (a << n) | (b >> n), emulating the x86 SHLD
// double-precision shift. NOTE: clobbers b.
    84  #define SHLDL(a, b, n) \  // NO SHLDL in GOLANG now
    85  	LSLW n, a          \
    86  	LSRW n, b          \  
    87  	ORRW  b, a
    88  
// Rotl_5(XDATA, XTMP0): rotate every byte of XDATA left by 5 bits:
// ((x << 5) & 0xe0) | ((x >> 3) & 0x1f). XTMP0 is scratch.
    89  #define Rotl_5(XDATA, XTMP0)                           \
    90  	VSHL $5, XDATA.S4, XTMP0.S4                        \
    91  	VUSHR $3, XDATA.S4, XDATA.S4                       \
    92  	VAND TOP3_BITS.B16, XTMP0.B16, XTMP0.B16           \
    93  	VAND BOTTOM5_BITS.B16, XDATA.B16, XDATA.B16        \
    94  	VORR XTMP0.B16, XDATA.B16, XDATA.B16
    95  
// S0_comput(IN_OUT, XTMP1, XTMP2): ZUC S0 sbox applied to all 16 bytes
// of IN_OUT, in place. Splits each byte into nibbles, chains three VTBL
// lookups through P1/P2/P3, recombines the nibbles and finishes with a
// byte-wise rotate-left-by-5. XTMP1/XTMP2 are scratch.
    96  #define S0_comput(IN_OUT, XTMP1, XTMP2)    \
    97  	VUSHR $4, IN_OUT.S4, XTMP1.S4                \ // XTMP1 = high nibbles
    98  	VAND NIBBLE_MASK.B16, XTMP1.B16, XTMP1.B16   \
    99  	\
   100  	VAND NIBBLE_MASK.B16, IN_OUT.B16, IN_OUT.B16 \ // IN_OUT = low nibbles
   101  	\
   102  	VTBL IN_OUT.B16, [P1.B16], XTMP2.B16         \
   103  	VEOR XTMP1.B16, XTMP2.B16, XTMP2.B16         \ // t2 = P1[lo] ^ hi
   104  	\
   105  	VTBL XTMP2.B16, [P2.B16], XTMP1.B16          \
   106  	VEOR IN_OUT.B16, XTMP1.B16, XTMP1.B16        \ // t1 = P2[t2] ^ lo
   107  	\
   108  	VTBL XTMP1.B16, [P3.B16], IN_OUT.B16         \
   109  	VEOR XTMP2.B16, IN_OUT.B16, IN_OUT.B16       \ // t3 = P3[t1] ^ t2
   110  	\
   111  	VSHL $4, IN_OUT.S4, IN_OUT.S4                \
   112  	VEOR XTMP1.B16, IN_OUT.B16, IN_OUT.B16       \ // byte = t3<<4 | t1 (recombine nibbles)
   113  	Rotl_5(IN_OUT, XTMP1)    
   114  
   115  // Affine Transform
   116  // parameters:
   117  // -  L: table low nibbles
   118  // -  H: table high nibbles
   119  // -  x: 128 bits register as sbox input/output data
   120  // -  y: 128 bits temp register
   121  // -  z: 128 bits temp register
// Per byte: x = L[lo(x)] ^ H[hi(x)] via two VTBL lookups. Note x is
// clobbered mid-way (VUSHR shifts its low nibbles out) before the final
// VEOR writes the combined result back into x.
   122  #define AFFINE_TRANSFORM(L, H, x, y, z)            \
   123  	VAND x.B16, NIBBLE_MASK.B16, z.B16;            \
   124  	VTBL z.B16, [L.B16], y.B16;                    \
   125  	VUSHR $4, x.D2, x.D2;                          \
   126  	VAND x.B16, NIBBLE_MASK.B16, z.B16;            \
   127  	VTBL z.B16, [H.B16], z.B16;                    \
   128  	VEOR y.B16, z.B16, x.B16
   129  
// S1_comput(x, XTMP1, XTMP2): ZUC S1 sbox on all 16 bytes of x, in
// place, built from the AES sbox: affine transform (M1 tables), then an
// inverse-ShiftRows shuffle so the ShiftRows inside AESE cancels, AESE
// with a zero round key (leaving AddRoundKey(0)+SubBytes only), and a
// second affine transform (M2 tables). XTMP1/XTMP2 are scratch.
   130  #define S1_comput(x, XTMP1, XTMP2)          \    
   131  	AFFINE_TRANSFORM(M1L, M1H, x, XTMP1, XTMP2);   \
   132  	VTBL INVERSE_SHIFT_ROWS.B16, [x.B16], x.B16;   \
   133  	AESE ZERO.B16, x.B16;                          \
   134  	AFFINE_TRANSFORM(M2L, M2H, x, XTMP1, XTMP2)
   135  
// BITS_REORG(idx): ZUC bit reorganization. Reads LFSR cells
// s15,s14,s11,s9,s7,s5,s2,s0 (indices rotated by idx) and forms
//   X0 = s15H||s14L -> R12    X1 = s11L||s9H -> R13
//   X2 = s7L ||s5H  -> R14    X3 = s2L ||s0H -> R15
// where H/L denote the high/low 16 bits of each 31-bit cell.
// Clobbers AX, BX, CX, DX.
   136  #define BITS_REORG(idx)                      \
   137  	MOVW (((15 + idx) % 16)*4)(SI), R12      \
   138  	MOVW (((14 + idx) % 16)*4)(SI), AX       \
   139  	MOVW (((11 + idx) % 16)*4)(SI), R13      \
   140  	MOVW (((9 + idx) % 16)*4)(SI), BX        \
   141  	MOVW (((7 + idx) % 16)*4)(SI), R14       \ 
   142  	MOVW (((5 + idx) % 16)*4)(SI), CX        \
   143  	MOVW (((2 + idx) % 16)*4)(SI), R15       \
   144  	MOVW (((0 + idx) % 16)*4)(SI), DX        \
   145  	LSRW $15, R12                            \ // s15H: high 16 of the 31-bit cell
   146  	LSLW $16, AX                             \
   147  	LSLW $1, BX                              \ // <<1 so >>16 below yields bits 30..15
   148  	LSLW $1, CX                              \
   149  	LSLW $1, DX                              \
   150  	SHLDL(R12, AX, $16)                      \ // X0
   151  	SHLDL(R13, BX, $16)                      \ // X1
   152  	SHLDL(R14, CX, $16)                      \ // X2
   153  	SHLDL(R15, DX, $16)           
   154  
// LFSR_UPDT(idx): LFSR feedback step. On entry AX = W (nonlinear-
// function output in init mode, 0 in keystream mode). Computes
//   s16 = (2^15*s15 + 2^17*s13 + 2^21*s10 + 2^20*s4 + (1+2^8)*s0 + W)
//         mod (2^31 - 1)
// using 64-bit accumulation followed by two fold steps, and stores the
// result over s[idx] (in-place rotation). Clobbers AX,BX,CX,DX,R8,R9.
   155  #define LFSR_UPDT(idx)                       \
   156  	MOVW (((0 + idx) % 16)*4)(SI), BX        \
   157  	MOVW (((4 + idx) % 16)*4)(SI), CX        \
   158  	MOVW (((10 + idx) % 16)*4)(SI), DX       \
   159  	MOVW (((13 + idx) % 16)*4)(SI), R8       \
   160  	MOVW (((15 + idx) % 16)*4)(SI), R9       \
   161  	ADD BX, AX                              \ // + s0
   162  	LSL $8, BX                              \
   163  	LSL $20, CX                             \
   164  	LSL $21, DX                             \
   165  	LSL $17, R8                             \
   166  	LSL $15, R9                             \
   167  	ADD BX, AX                              \ // + 2^8 * s0
   168  	ADD CX, AX                              \ // + 2^20 * s4
   169  	ADD DX, AX                              \ // + 2^21 * s10
   170  	ADD R8, AX                              \ // + 2^17 * s13
   171  	ADD R9, AX                              \ // + 2^15 * s15
   172  	\
   173  	LSR $31, AX, BX                         \ // fold high bits: mod 2^31-1
   174  	AND $0x7FFFFFFF, AX                     \
   175  	ADD BX, AX                              \
   176  	\
   177  	LSR $31, AX, BX                         \ // second fold for the carry
   178  	AND $0x7FFFFFFF, AX                     \
   179  	ADD BX, AX                              \
   180  	\
   181  	MOVW AX, (((0 + idx) % 16)*4)(SI)
   182  
// NONLIN_FUN(): ZUC nonlinear function F. On entry R10 = F_R1,
// R11 = F_R2, R12..R15 = X0..X3 (from BITS_REORG). Computes
//   W  = (X0 ^ R1) + R2                -> AX (left for the caller)
//   W1 = R1 + X1,  W2 = R2 ^ X2
//   P  = W1L||W2H,  Q = W2L||W1H
//   U = L1(P), V = L2(Q)  (linear transforms via rotate-XOR chains)
// then pushes V||U through the S0/S1 sbox layer in NEON and leaves the
// new F_R1/F_R2 in R10/R11. Clobbers BX, CX, DX, R8, R9, V0-V3.
   183  #define NONLIN_FUN()                         \
   184  	EORW R10, R12, AX                        \ // W = (X0 ^ F_R1) + F_R2
   185  	ADDW R11, AX                             \
   186  	ADDW R13, R10                            \ // W1= F_R1 + BRC_X1
   187  	EORW R14, R11                            \ // W2= F_R2 ^ BRC_X2
   188  	\
   189  	LSLW $16, R10, DX                        \
   190  	LSRW $16, R11, CX                        \  
   191  	ORRW CX, DX                              \ // P = (W1 << 16) | (W2 >> 16)
   192  	SHLDL(R11, R10, $16)                     \ // Q = (W2 << 16) | (W1 >> 16)
   193  	RORW $30, DX, BX                         \ // L1(P) = P ^ rotl(P,2,10,18,24)
   194  	RORW $22, DX, CX                         \
   195  	RORW $14, DX, R8                         \
   196  	RORW $8, DX, R9                          \
   197  	EORW BX, DX                              \
   198  	EORW CX, DX                              \
   199  	EORW R8, DX                              \
   200  	EORW R9, DX                              \ // U = L1(P) = EDX, hi(RDX)=0
   201  	RORW $24, R11, BX                        \ // L2(Q) = Q ^ rotl(Q,8,14,22,30)
   202  	RORW $18, R11, CX                        \
   203  	RORW $10, R11, R8                        \
   204  	RORW $2, R11, R9                         \
   205  	EORW BX, R11                             \
   206  	EORW CX, R11                             \
   207  	EORW R8, R11                             \
   208  	EORW R9, R11                             \ // V = L2(Q) = R11D, hi(R11)=0
   209  	LSL $32, R11                             \
   210  	EOR R11, DX                              \ // DX = V||U
   211  	VDUP DX, V0.D2                           \ // duplicate to both halves for sbox input
   212  	VMOV V0.B16, V1.B16                      \ 
   213  	S0_comput(V1, V2, V3)                    \
   214  	S1_comput(V0, V2, V3)                    \
   215  	\
   216  	VAND S1_MASK.B16, V0.B16, V0.B16         \ // interleave S1/S0 byte lanes
   217  	VAND S0_MASK.B16, V1.B16, V1.B16         \ 
   218  	VEOR V1.B16, V0.B16, V0.B16              \ 
   219  	\
   220  	VMOV V0.S[0], R10                        \ // F_R1
   221  	VMOV V0.S[1], R11       // F_R2
   222  
// RESTORE_LFSR_n: the round macros write the new LFSR cell in place at
// rotated indices, so after 1/2/4/8 rounds the 16-word array is rotated;
// these macros rotate it back so s0 sits at offset 0 again. (Despite
// the name, RESTORE_LFSR_0 rotates by ONE word — it is used after a
// single round.) Clobber AX/BX/CX and V0-V3 as used; SI preserved.
   223  #define RESTORE_LFSR_0()                     \
   224  	MOVW.P 4(SI), AX                         \ // AX = word 0, SI += 4
   225  	VLD1 (SI), [V0.B16, V1.B16, V2.B16]      \ // words 1..12
   226  	SUB $4, SI                               \
   227  	MOVD (52)(SI), BX                        \ // words 13,14
   228  	MOVW (60)(SI), CX                        \ // word 15
   229  	\
   230  	VST1 [V0.B16, V1.B16, V2.B16], (SI)      \
   231  	MOVD BX, (48)(SI)                        \
   232  	MOVW CX, (56)(SI)                        \
   233  	MOVW AX, (60)(SI)     
   234  
// Rotate the 16-word LFSR left by two words.
   235  #define RESTORE_LFSR_2()                     \
   236  	MOVD.P 8(SI), AX                         \ // AX = words 0,1; SI += 8
   237  	VLD1 (SI), [V0.B16, V1.B16, V2.B16]      \ 
   238  	SUB $8, SI                               \
   239  	MOVD (56)(SI), BX                        \
   240  	\
   241  	VST1 [V0.B16, V1.B16, V2.B16], (SI)      \
   242  	MOVD BX, (48)(SI)                        \
   243  	MOVD AX, (56)(SI)    
   244  
// Rotate the 16-word LFSR left by four words.
   245  #define RESTORE_LFSR_4()                     \
   246  	VLD1 (SI), [V0.B16, V1.B16, V2.B16, V3.B16]   \ 
   247  	\
   248  	VST1.P [V1.B16, V2.B16, V3.B16], 48(SI)       \
   249  	VST1 [V0.B16], (SI)                           \
   250  	SUB $48, SI
   251  
// Rotate the 16-word LFSR left by eight words.
   252  #define RESTORE_LFSR_8()                     \
   253  	VLD1 (SI), [V0.B16, V1.B16, V2.B16, V3.B16]   \ 
   254  	\
   255  	VST1.P [V2.B16, V3.B16], 32(SI)               \
   256  	VST1 [V0.B16, V1.B16], (SI)                   \
   257  	SUB $32, SI
   258  
// LOAD_STATE(r): load the F/BRC fields of zucState32 (offsets 64..84)
// into R10..R15.
   259  #define LOAD_STATE(r)                         \
   260  	MOVW 64+r, R10                            \ // F_R1
   261  	MOVW 68+r, R11                            \ // F_R2
   262  	MOVW 72+r, R12                            \ // BRC_X0
   263  	MOVW 76+r, R13                            \ // BRC_X1
   264  	MOVW 80+r, R14                            \ // BRC_X2
   265  	MOVW 84+r, R15      // BRC_X3
   266  
// SAVE_STATE(r): store R10..R15 back to the same F/BRC fields.
   267  #define SAVE_STATE(r)                         \
   268  	MOVW R10, 64+r                            \
   269  	MOVW R11, 68+r                            \
   270  	MOVW R12, 72+r                            \
   271  	MOVW R13, 76+r                            \
   272  	MOVW R14, 80+r                            \
   273  	MOVW R15, 84+r
   274  
   275  // func genKeywordAsm(s *zucState32) uint32
// Runs one ZUC round and returns the keystream word Z = W ^ X3, then
// steps the LFSR in keystream mode (input W = 0) and rotates the state
// back so s0 remains at offset 0.
   276  TEXT ·genKeywordAsm(SB),NOSPLIT,$0
   277  	LOAD_GLOBAL_DATA()
   278  	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
   279  
   280  	MOVD pState+0(FP), SI
   281  	LOAD_STATE(0(SI))
   282  
   283  	BITS_REORG(0)
   284  	NONLIN_FUN()
   285  
   286  	EORW R15, AX          // Z = W ^ X3
   287  	MOVW AX, ret+8(FP)
   288  	EOR AX, AX            // keystream mode: LFSR input W = 0
   289  	LFSR_UPDT(0)
   290  	SAVE_STATE(0(SI))
   291  	RESTORE_LFSR_0()
   292  
   293  	RET
   294  
// ONEROUND(idx): generate one keystream word into ks[idx] (native byte
// order) and step the LFSR in keystream mode (W = 0).
   295  #define ONEROUND(idx)      \
   296  	BITS_REORG(idx)               \
   297  	NONLIN_FUN()                  \
   298  	EORW R15, AX                  \ // Z = W ^ X3
   299  	MOVW AX, (idx*4)(DI)          \
   300  	EOR AX, AX                    \ // LFSR input W = 0
   301  	LFSR_UPDT(idx)
   302  
// ROUND_REV32(idx): same as ONEROUND but byte-swaps the keystream word
// (REVW) before storing, yielding big-endian byte output.
   303  #define ROUND_REV32(idx)      \
   304  	BITS_REORG(idx)               \
   305  	NONLIN_FUN()                  \
   306  	EORW R15, AX                  \
   307  	REVW AX, AX                   \ // byte-swap for big-endian store
   308  	MOVW AX, (idx*4)(DI)          \
   309  	EOR AX, AX                    \
   310  	LFSR_UPDT(idx)    
   311  
   312  // func genKeyStreamAsm(keyStream []uint32, pState *zucState32)
// Fills keyStream with ks_len ZUC words, peeling the count into groups
// of 16/8/4/2/1 rounds. A full 16-round group rotates the LFSR all the
// way around (identity), so only partial groups need a RESTORE_LFSR_n
// to put the state back into canonical order.
   313  TEXT ·genKeyStreamAsm(SB),NOSPLIT,$0
   314  	LOAD_GLOBAL_DATA()
   315  	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
   316  
   317  	MOVD ks+0(FP), DI
   318  	MOVD ks_len+8(FP), BP         // BP = remaining word count
   319  	MOVD pState+24(FP), SI
   320  
   321  	LOAD_STATE(0(SI))
   322  
   323  zucSixteens:
   324  	CMP $16, BP
   325  	BLT zucOctet
   326  	SUB $16, BP
   327  	ONEROUND(0)
   328  	ONEROUND(1)
   329  	ONEROUND(2)
   330  	ONEROUND(3)
   331  	ONEROUND(4)
   332  	ONEROUND(5)
   333  	ONEROUND(6)
   334  	ONEROUND(7)
   335  	ONEROUND(8)
   336  	ONEROUND(9)
   337  	ONEROUND(10)
   338  	ONEROUND(11)
   339  	ONEROUND(12)
   340  	ONEROUND(13)
   341  	ONEROUND(14)
   342  	ONEROUND(15)
   343  	ADD	$4*16, DI             // advance output by 16 words
   344  	B zucSixteens
   345  
   346  zucOctet:
   347  	CMP $8, BP
   348  	BLT zucNibble
   349  	SUB $8, BP
   350  	ONEROUND(0)
   351  	ONEROUND(1)
   352  	ONEROUND(2)
   353  	ONEROUND(3)
   354  	ONEROUND(4)
   355  	ONEROUND(5)
   356  	ONEROUND(6)
   357  	ONEROUND(7)
   358  	ADD	$2*16, DI
   359  	RESTORE_LFSR_8()
   360  zucNibble:
   361  	CMP $4, BP
   362  	BLT zucDouble
   363  	SUB $4, BP
   364  	ONEROUND(0)
   365  	ONEROUND(1)
   366  	ONEROUND(2)
   367  	ONEROUND(3)
   368  	ADD	$1*16, DI
   369  	RESTORE_LFSR_4()
   370  zucDouble:
   371  	CMP $2, BP
   372  	BLT zucSingle
   373  	SUB $2, BP
   374  	ONEROUND(0)
   375  	ONEROUND(1)
   376  	ADD	$8, DI
   377  	RESTORE_LFSR_2()
   378  zucSingle:
   379  	TBZ	$0, BP, zucRet        // last word if count was odd
   380  	ONEROUND(0)
   381  	RESTORE_LFSR_0()
   382  zucRet:
   383  	SAVE_STATE(0(SI))
   384  	RET 
   385  
   386  // func genKeyStreamRev32Asm(keyStream []byte, pState *zucState32)
// Same driver as genKeyStreamAsm, but keyStream is a byte slice (length
// converted to a word count) and each word is byte-swapped by
// ROUND_REV32 before being stored (big-endian output).
   387  TEXT ·genKeyStreamRev32Asm(SB),NOSPLIT,$0
   388  	LOAD_GLOBAL_DATA()
   389  	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
   390  
   391  	MOVD ks+0(FP), DI
   392  	MOVD ks_len+8(FP), BP
   393  	MOVD pState+24(FP), SI
   394  
   395  	LSR $2, BP                    // byte length -> word count
   396  	LOAD_STATE(0(SI))
   397  
   398  zucSixteens:
   399  	CMP $16, BP
   400  	BLT zucOctet
   401  	SUB $16, BP
   402  	ROUND_REV32(0)
   403  	ROUND_REV32(1)
   404  	ROUND_REV32(2)
   405  	ROUND_REV32(3)
   406  	ROUND_REV32(4)
   407  	ROUND_REV32(5)
   408  	ROUND_REV32(6)
   409  	ROUND_REV32(7)
   410  	ROUND_REV32(8)
   411  	ROUND_REV32(9)
   412  	ROUND_REV32(10)
   413  	ROUND_REV32(11)
   414  	ROUND_REV32(12)
   415  	ROUND_REV32(13)
   416  	ROUND_REV32(14)
   417  	ROUND_REV32(15)
   418  	ADD	$4*16, DI             // advance output by 64 bytes
   419  	B zucSixteens
   420  
   421  zucOctet:
   422  	CMP $8, BP
   423  	BLT zucNibble
   424  	SUB $8, BP
   425  	ROUND_REV32(0)
   426  	ROUND_REV32(1)
   427  	ROUND_REV32(2)
   428  	ROUND_REV32(3)
   429  	ROUND_REV32(4)
   430  	ROUND_REV32(5)
   431  	ROUND_REV32(6)
   432  	ROUND_REV32(7)
   433  	ADD	$2*16, DI
   434  	RESTORE_LFSR_8()
   435  zucNibble:
   436  	CMP $4, BP
   437  	BLT zucDouble
   438  	SUB $4, BP
   439  	ROUND_REV32(0)
   440  	ROUND_REV32(1)
   441  	ROUND_REV32(2)
   442  	ROUND_REV32(3)
   443  	ADD	$16, DI
   444  	RESTORE_LFSR_4()
   445  zucDouble:
   446  	CMP $2, BP
   447  	BLT zucSingle
   448  	SUB $2, BP
   449  	ROUND_REV32(0)
   450  	ROUND_REV32(1)
   451  	ADD	$8, DI
   452  	RESTORE_LFSR_2()
   453  zucSingle:
   454  	TBZ	$0, BP, zucRet        // last word if count was odd
   455  	ROUND_REV32(0)
   456  	RESTORE_LFSR_0()
   457  zucRet:
   458  	SAVE_STATE(0(SI))
   459  	RET