github.com/emmansun/gmsm@v0.29.1/sm4/aesni_macros_arm64.s

// inverse shift rows
DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
GLOBL inverse_shift_rows<>(SB), (16+8), $16

// Affine transform 1 & 2 lookup tables (low and high nibbles), loaded into M1L, M1H, M2L, M2H
DATA m1_2<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
DATA m1_2<>+0x08(SB)/8, $0x3045F98CEF9A2653
DATA m1_2<>+0x10(SB)/8, $0xC35BF46CAF379800
DATA m1_2<>+0x18(SB)/8, $0x68F05FC7049C33AB
DATA m1_2<>+0x20(SB)/8, $0x9A950A05FEF16E61
DATA m1_2<>+0x28(SB)/8, $0x0E019E916A65FAF5
DATA m1_2<>+0x30(SB)/8, $0x892D69CD44E0A400
DATA m1_2<>+0x38(SB)/8, $0x2C88CC68E14501A5
GLOBL m1_2<>(SB), (16+8), $64

// left rotations of 32-bit words by 8-bit increments
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), (16+8), $16
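// Used with VTBL to rotate every 32-bit lane left by 8 bits via a byte
// shuffle. Viewing a lane's lowest-indexed byte as its least-significant
// byte, the per-word effect is (a minimal Go sketch; rotl8 is illustrative):
//
//	func rotl8(w uint32) uint32 { return w<<8 | w>>24 }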

// SM4 FK constants for key expansion, stored as four little-endian 32-bit words
DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
GLOBL fk_mask<>(SB), (16+8), $16

#define LOAD_SM4_AESNI_CONSTS() \
	MOVW $0x0F0F0F0F, R20                             \
	VDUP R20, NIBBLE_MASK.S4                          \
	MOVD $m1_2<>(SB), R20                             \
	VLD1 (R20), [M1L.B16, M1H.B16, M2L.B16, M2H.B16]  \
	MOVD $inverse_shift_rows<>(SB), R20               \
	VLD1 (R20), [INVERSE_SHIFT_ROWS.B16]              \
	MOVD $r08_mask<>(SB), R20                         \
	VLD1 (R20), [R08_MASK.B16]
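
// The macros below expect NIBBLE_MASK, M1L/M1H, M2L/M2H, INVERSE_SHIFT_ROWS
// and R08_MASK to be register aliases provided by the including file; this
// loader only fills them with the constants defined above.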

// input: from high to low
// t0 = t0.S3, t0.S2, t0.S1, t0.S0
// t1 = t1.S3, t1.S2, t1.S1, t1.S0
// t2 = t2.S3, t2.S2, t2.S1, t2.S0
// t3 = t3.S3, t3.S2, t3.S1, t3.S0
// output: from high to low
// t0 = t3.S0, t2.S0, t1.S0, t0.S0
// t1 = t3.S1, t2.S1, t1.S1, t0.S1
// t2 = t3.S2, t2.S2, t1.S2, t0.S2
// t3 = t3.S3, t2.S3, t1.S3, t0.S3
#define PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, RTMP0, RTMP1, RTMP2, RTMP3) \
	VZIP1 t1.S4, t0.S4, RTMP0.S4               \
	VZIP1 t3.S4, t2.S4, RTMP1.S4               \
	VZIP2 t1.S4, t0.S4, RTMP2.S4               \
	VZIP2 t3.S4, t2.S4, RTMP3.S4               \
	VZIP1 RTMP1.D2, RTMP0.D2, t0.D2            \
	VZIP2 RTMP1.D2, RTMP0.D2, t1.D2            \
	VZIP1 RTMP3.D2, RTMP2.D2, t2.D2            \
	VZIP2 RTMP3.D2, RTMP2.D2, t3.D2
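
// A minimal Go sketch of the effect above, treating each register as one row
// of four 32-bit words with lane 0 as the lowest word (function and variable
// names are illustrative only): the result is a plain 4x4 word transpose.
//
//	func preTranspose(in [4][4]uint32) (out [4][4]uint32) {
//		for i := 0; i < 4; i++ {
//			for j := 0; j < 4; j++ {
//				out[i][j] = in[j][i]
//			}
//		}
//		return out
//	}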

// input: from high to low
// t0 = t0.S3, t0.S2, t0.S1, t0.S0
// t1 = t1.S3, t1.S2, t1.S1, t1.S0
// t2 = t2.S3, t2.S2, t2.S1, t2.S0
// t3 = t3.S3, t3.S2, t3.S1, t3.S0
// output: from high to low
// t0 = t0.S0, t1.S0, t2.S0, t3.S0
// t1 = t0.S1, t1.S1, t2.S1, t3.S1
// t2 = t0.S2, t1.S2, t2.S2, t3.S2
// t3 = t0.S3, t1.S3, t2.S3, t3.S3
#define TRANSPOSE_MATRIX(t0, t1, t2, t3, RTMP0, RTMP1, RTMP2, RTMP3) \
	VZIP1 t0.S4, t1.S4, RTMP0.S4               \
	VZIP2 t0.S4, t1.S4, RTMP1.S4               \
	VZIP1 t2.S4, t3.S4, RTMP2.S4               \
	VZIP2 t2.S4, t3.S4, RTMP3.S4               \
	VZIP1 RTMP0.D2, RTMP2.D2, t0.D2            \
	VZIP2 RTMP0.D2, RTMP2.D2, t1.D2            \
	VZIP1 RTMP1.D2, RTMP3.D2, t2.D2            \
	VZIP2 RTMP1.D2, RTMP3.D2, t3.D2
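
// Same idea as PRE_TRANSPOSE_MATRIX, but the word order inside each output
// register is reversed: lane j of the result ti holds t(3-j).Si. A minimal
// Go sketch of the effect (illustrative names, lane 0 = lowest word):
//
//	func transpose(in [4][4]uint32) (out [4][4]uint32) {
//		for i := 0; i < 4; i++ {
//			for j := 0; j < 4; j++ {
//				out[i][j] = in[3-j][i]
//			}
//		}
//		return out
//	}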

// Affine Transform
// parameters:
// -  L: 16-byte lookup table for the low nibbles
// -  H: 16-byte lookup table for the high nibbles
// -  x: 128-bit register, sbox input/output data
// -  y: 128-bit temp register
// -  z: 128-bit temp register
#define AFFINE_TRANSFORM(L, H, x, y, z)            \
	VAND x.B16, NIBBLE_MASK.B16, z.B16;            \
	VTBL z.B16, [L.B16], y.B16;                    \
	VUSHR $4, x.D2, x.D2;                          \
	VAND x.B16, NIBBLE_MASK.B16, z.B16;            \
	VTBL z.B16, [H.B16], z.B16;                    \
	VEOR y.B16, z.B16, x.B16
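
// A minimal Go sketch of the per-byte effect of the macro above (names are
// illustrative only): each byte is split into its low and high nibble, each
// nibble indexes a 16-byte table, and the two lookups are XORed.
//
//	func affine(ltab, htab *[16]byte, b byte) byte {
//		return ltab[b&0x0F] ^ htab[b>>4]
//	}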

// SM4 sbox function
// parameters:
// -  x: 128-bit register, sbox input/output data
// -  y: 128-bit temp register
// -  z: 128-bit temp register
#define SM4_SBOX(x, y, z) \
	;                                              \
	AFFINE_TRANSFORM(M1L, M1H, x, y, z);           \
	VTBL INVERSE_SHIFT_ROWS.B16, [x.B16], x.B16;   \
	AESE ZERO.B16, x.B16;                          \
	AFFINE_TRANSFORM(M2L, M2H, x, y, z)
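
// Note: with an all-zero round key, AESE reduces to the AES SubBytes and
// ShiftRows steps. The preceding VTBL with INVERSE_SHIFT_ROWS pre-permutes
// the bytes so that ShiftRows is undone, leaving a pure byte-wise AES S-box;
// the two affine transforms (M1*, M2*) map between the SM4 and AES S-box
// representations. ZERO is assumed to be an all-zero vector register
// provided by the including file.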

// SM4 TAO L1 function
// parameters:
// -  x: 128-bit register, TAO_L1 input/output data
// -  y: 128-bit temp register
// -  z: 128-bit temp register
#define SM4_TAO_L1(x, y, z)         \
	SM4_SBOX(x, y, z);                                   \
	VTBL R08_MASK.B16, [x.B16], y.B16;                   \ // y = x <<< 8
	VTBL R08_MASK.B16, [y.B16], z.B16;                   \ // z = x <<< 16
	VEOR x.B16, y.B16, y.B16;                            \ // y = x ^ (x <<< 8)
	VEOR z.B16, y.B16, y.B16;                            \ // y = x ^ (x <<< 8) ^ (x <<< 16)
	VTBL R08_MASK.B16, [z.B16], z.B16;                   \ // z = x <<< 24
	VEOR z.B16, x.B16, x.B16;                            \ // x = x ^ (x <<< 24)
	VSHL $2, y.S4, z.S4;                                 \ // z = y << 2 (per 32-bit word)
	VSRI $30, y.S4, z.S4;                                \ // z = (x <<< 2) ^ (x <<< 10) ^ (x <<< 18)
	VEOR z.B16, x.B16, x.B16
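
// For reference, a minimal Go sketch (assuming import "math/bits") of the
// linear part applied above after the S-box; the SM4 L transform on one
// 32-bit word is
//
//	func l1(b uint32) uint32 {
//		return b ^ bits.RotateLeft32(b, 2) ^ bits.RotateLeft32(b, 10) ^
//			bits.RotateLeft32(b, 18) ^ bits.RotateLeft32(b, 24)
//	}
//
// and the macro evaluates it lane-wise on four words at once.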

// SM4 round function
// t0 ^= tao_l1(t1^t2^t3^rk)
// parameters:
// - RK: round key register
// - tmp32: 32/64-bit temp register
// -  x: 128-bit temp register
// -  y: 128-bit temp register
// -  z: 128-bit temp register
// - t0: 128-bit data register, receives the result
// - t1: 128-bit data register
// - t2: 128-bit data register
// - t3: 128-bit data register
#define SM4_ROUND(RK, tmp32, x, y, z, t0, t1, t2, t3) \
	MOVW.P 4(RK), tmp32;                              \
	VDUP tmp32, x.S4;                                 \
	VEOR t1.B16, x.B16, x.B16;                        \
	VEOR t2.B16, x.B16, x.B16;                        \
	VEOR t3.B16, x.B16, x.B16;                        \
	SM4_TAO_L1(x, y, z);                              \
	VEOR x.B16, t0.B16, t0.B16
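
// A minimal Go sketch of the same round applied to one block of four words
// (taoL1 stands for the illustrative S-box-then-l1 composition sketched
// above; it is not a function of this package):
//
//	func round(x *[4]uint32, rk uint32) {
//		x[0] ^= taoL1(x[1] ^ x[2] ^ x[3] ^ rk)
//	}
//
// The macro does the same for four blocks in parallel, one word lane per
// block, and MOVW.P post-increments RK to the next round key.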

// SM4 round function for 8 blocks, processed as two independent groups of four
// t0 ^= tao_l1(t1^t2^t3^rk)
// t4 ^= tao_l1(t5^t6^t7^rk)
// parameters:
// - RK: round key register
// - tmp32: 32/64-bit temp register
// -  x: 128-bit temp register
// -  y: 128-bit temp register
// -  z: 128-bit temp register
// - tmp: 128-bit temp register, holds the broadcast round key
// - t0: 128-bit data register, receives the result for the first group
// - t1: 128-bit data register
// - t2: 128-bit data register
// - t3: 128-bit data register
// - t4: 128-bit data register, receives the result for the second group
// - t5: 128-bit data register
// - t6: 128-bit data register
// - t7: 128-bit data register
#define SM4_8BLOCKS_ROUND(RK, tmp32, x, y, z, tmp, t0, t1, t2, t3, t4, t5, t6, t7) \
	MOVW.P 4(RK), tmp32;                              \
	VDUP tmp32, tmp.S4;                               \
	VEOR t1.B16, tmp.B16, x.B16;                      \
	VEOR t2.B16, x.B16, x.B16;                        \
	VEOR t3.B16, x.B16, x.B16;                        \
	SM4_TAO_L1(x, y, z);                              \
	VEOR x.B16, t0.B16, t0.B16;                       \
	; \
	VEOR t5.B16, tmp.B16, x.B16;                      \
	VEOR t6.B16, x.B16, x.B16;                        \
	VEOR t7.B16, x.B16, x.B16;                        \
	SM4_TAO_L1(x, y, z);                              \
	VEOR x.B16, t4.B16, t4.B16
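
// The round key is loaded and broadcast into tmp once, then reused for both
// four-block groups (t0..t3 and t4..t7), so eight blocks advance one round
// per invocation.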