github.com/emmansun/gmsm@v0.29.1/sm4/asm_amd64.s

// This SM4 implementation references https://github.com/mjosaarinen/sm4ni/blob/master/sm4ni.c
//go:build !purego

#include "textflag.h"

#define t0 X0
#define t1 X1
#define t2 X2
#define t3 X3

#define x X8
#define y X9
#define XTMP6 X10
#define XTMP7 X11

#include "aesni_macros_amd64.s"

// SM4 TAO L2 function, used for key expansion
// parameters:
// - x: 128-bit register as TAO_L2 input/output data
// - y: 128-bit temp register
// - tmp1: 128-bit temp register
// - tmp2: 128-bit temp register
#define SM4_TAO_L2(x, y, tmp1, tmp2) \
	SM4_SBOX(x, y, tmp1); \
	; \ //####################  4 parallel L2 linear transforms ##################//
	MOVOU x, y; \
	MOVOU x, tmp1; \
	PSLLL $13, tmp1; \
	PSRLL $19, y; \
	POR tmp1, y; \ // y = x <<< 13
	PSLLL $10, tmp1; \
	MOVOU x, tmp2; \
	PSRLL $9, tmp2; \
	POR tmp1, tmp2; \ // tmp2 = x <<< 23
	PXOR tmp2, y; \
	PXOR y, x

// SM4 key expansion round function
// t0 ^= tao_l2(t1^t2^t3^ck) and store t0.S[0] to enc/dec
// parameters:
// - index: round key index (immediate)
// - x: 128-bit temp register
// - y: 128-bit temp register
// - t0: 128-bit register for data
// - t1: 128-bit register for data
// - t2: 128-bit register for data
// - t3: 128-bit register for data
#define SM4_EXPANDKEY_ROUND(index, x, y, t0, t1, t2, t3) \
	MOVL (index * 4)(BX)(CX*1), x; \
	PXOR t1, x; \
	PXOR t2, x; \
	PXOR t3, x; \
	SM4_TAO_L2(x, y, XTMP6, XTMP7); \
	PXOR x, t0; \
	MOVL t0, R8; \ // _mm_cvtsi128_si32
	MOVL R8, (index * 4)(DX)(CX*1); \
	MOVL R8, (12 - index * 4)(DI)(SI*1)
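
// For reference, one SM4_EXPANDKEY_ROUND above corresponds roughly to this
// sketch of the standard SM4 key schedule (illustrative only, not code from
// this package; tao_l2 is the S-box followed by L'(b) = b ^ (b <<< 13) ^ (b <<< 23)):
//
//	k := t1 ^ t2 ^ t3 ^ ck[i]
//	t0 ^= tao_l2(k)
//	enc[i] = t0      // round keys in encryption order
//	dec[31-i] = t0   // the same keys, reversed, for decryption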

#define XDWORD0 Y4
#define XDWORD1 Y5
#define XDWORD2 Y6
#define XDWORD3 Y7

#define XWORD0 X4
#define XWORD1 X5
#define XWORD2 X6
#define XWORD3 X7

#define XDWORD4 Y10
#define XDWORD5 Y11
#define XDWORD6 Y12
#define XDWORD7 Y14

#define XWORD4 X10
#define XWORD5 X11
#define XWORD6 X12
#define XWORD7 X14

#define XDWTMP0 Y0
#define XDWTMP1 Y1
#define XDWTMP2 Y2

#define XWTMP0 X0
#define XWTMP1 X1
#define XWTMP2 X2

#define NIBBLE_MASK Y3
#define X_NIBBLE_MASK X3

#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
#define X_BYTE_FLIP_MASK X13 // mask to convert LE -> BE

#define XDWORD Y8
#define YDWORD Y9

#define XWORD X8
#define YWORD X9

// func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
TEXT ·expandKeyAsm(SB),NOSPLIT,$0
	MOVQ key+0(FP), AX
	MOVQ ck+8(FP), BX
	MOVQ enc+16(FP), DX
	MOVQ dec+24(FP), DI

	MOVUPS 0(AX), t0
	PSHUFB flip_mask<>(SB), t0
	PXOR fk_mask<>(SB), t0
	PSHUFD $1, t0, t1
	PSHUFD $2, t0, t2
	PSHUFD $3, t0, t3

	XORL CX, CX
	MOVL $112, SI

loop:
	SM4_EXPANDKEY_ROUND(0, x, y, t0, t1, t2, t3)
	SM4_EXPANDKEY_ROUND(1, x, y, t1, t2, t3, t0)
	SM4_EXPANDKEY_ROUND(2, x, y, t2, t3, t0, t1)
	SM4_EXPANDKEY_ROUND(3, x, y, t3, t0, t1, t2)

	ADDL $16, CX
	SUBL $16, SI
	CMPL CX, $4*32
	JB loop

expand_end:
	RET

// func encryptBlocksAsm(xk *uint32, dst, src []byte, inst int)
TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
	MOVQ xk+0(FP), AX
	MOVQ dst+8(FP), BX
	MOVQ src+32(FP), DX
	MOVQ src_len+40(FP), DI

	CMPB ·useAVX2(SB), $1
	JE avx2

	CMPB ·useAVX(SB), $1
	JE avx

non_avx2_start:
	CMPQ DI, $128
	JEQ sse_8blocks

	MOVOU 0(DX), XWORD0
	MOVOU 16(DX), XWORD1
	MOVOU 32(DX), XWORD2
	MOVOU 48(DX), XWORD3

	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

	MOVOU XWORD0, 0(BX)
	MOVOU XWORD1, 16(BX)
	MOVOU XWORD2, 32(BX)
	MOVOU XWORD3, 48(BX)

	RET

sse_8blocks:
	MOVOU 0(DX), XWORD0
	MOVOU 16(DX), XWORD1
	MOVOU 32(DX), XWORD2
	MOVOU 48(DX), XWORD3
	MOVOU 64(DX), XWORD4
	MOVOU 80(DX), XWORD5
	MOVOU 96(DX), XWORD6
	MOVOU 112(DX), XWORD7

	SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)

	MOVOU XWORD0, 0(BX)
	MOVOU XWORD1, 16(BX)
	MOVOU XWORD2, 32(BX)
	MOVOU XWORD3, 48(BX)
	MOVOU XWORD4, 64(BX)
	MOVOU XWORD5, 80(BX)
	MOVOU XWORD6, 96(BX)
	MOVOU XWORD7, 112(BX)
done_sm4:
	RET

avx:
	CMPQ DI, $128
	JEQ avx_8blocks

	VMOVDQU 0(DX), XWORD0
	VMOVDQU 16(DX), XWORD1
	VMOVDQU 32(DX), XWORD2
	VMOVDQU 48(DX), XWORD3

	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

	VMOVDQU XWORD0, 0(BX)
	VMOVDQU XWORD1, 16(BX)
	VMOVDQU XWORD2, 32(BX)
	VMOVDQU XWORD3, 48(BX)

	RET

avx_8blocks:
	VMOVDQU 0(DX), XWORD0
	VMOVDQU 16(DX), XWORD1
	VMOVDQU 32(DX), XWORD2
	VMOVDQU 48(DX), XWORD3
	VMOVDQU 64(DX), XWORD4
	VMOVDQU 80(DX), XWORD5
	VMOVDQU 96(DX), XWORD6
	VMOVDQU 112(DX), XWORD7

	AVX_SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)

	VMOVDQU XWORD0, 0(BX)
	VMOVDQU XWORD1, 16(BX)
	VMOVDQU XWORD2, 32(BX)
	VMOVDQU XWORD3, 48(BX)
	VMOVDQU XWORD4, 64(BX)
	VMOVDQU XWORD5, 80(BX)
	VMOVDQU XWORD6, 96(BX)
	VMOVDQU XWORD7, 112(BX)

avx_done_sm4:
	RET
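
// The AVX2 paths below process 8 or 16 blocks at a time. The input is first
// byte-swapped with flip_mask (LE -> BE) and then transposed as a 4x4 matrix
// of 32-bit words, so that each vector register holds the same word position
// from several blocks and one SM4 round can be applied to all of them in
// parallel. After the 32 rounds the transpose is undone and bswap_mask
// restores the byte order before the results are stored.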

avx2:
	VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK

	CMPQ DI, $256
	JEQ avx2_16blocks

avx2_8blocks:
	VMOVDQU 0(DX), XDWORD0
	VMOVDQU 32(DX), XDWORD1
	VMOVDQU 64(DX), XDWORD2
	VMOVDQU 96(DX), XDWORD3
	VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK

	// Apply Byte Flip Mask: LE -> BE
	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3

	// Transpose the 4x4 matrix of 32-bit words
	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)

	AVX2_SM4_8BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

	// Transpose the 4x4 matrix of 32-bit words
	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)

	VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3

	VMOVDQU XDWORD0, 0(BX)
	VMOVDQU XDWORD1, 32(BX)
	VMOVDQU XDWORD2, 64(BX)
	VMOVDQU XDWORD3, 96(BX)

	VZEROUPPER
	RET

avx2_16blocks:
	VMOVDQU 0(DX), XDWORD0
	VMOVDQU 32(DX), XDWORD1
	VMOVDQU 64(DX), XDWORD2
	VMOVDQU 96(DX), XDWORD3
	VMOVDQU 128(DX), XDWORD4
	VMOVDQU 160(DX), XDWORD5
	VMOVDQU 192(DX), XDWORD6
	VMOVDQU 224(DX), XDWORD7

	VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK

	// Apply Byte Flip Mask: LE -> BE
	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
	VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4
	VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5
	VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6
	VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7

	// Transpose the 4x4 matrices of 32-bit words
	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
	TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2)

	AVX2_SM4_16BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWTMP1, XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWORD4, XDWORD5, XDWORD6, XDWORD7)

	// Transpose the 4x4 matrices of 32-bit words
	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
	TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2)

	VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
	VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4
	VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5
	VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6
	VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7

	VMOVDQU XDWORD0, 0(BX)
	VMOVDQU XDWORD1, 32(BX)
	VMOVDQU XDWORD2, 64(BX)
	VMOVDQU XDWORD3, 96(BX)
	VMOVDQU XDWORD4, 128(BX)
	VMOVDQU XDWORD5, 160(BX)
	VMOVDQU XDWORD6, 192(BX)
	VMOVDQU XDWORD7, 224(BX)

avx2_sm4_done:
	VZEROUPPER
	RET

// func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
// Requires: SSSE3
TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
	MOVQ xk+0(FP), AX
	MOVQ dst+8(FP), BX
	MOVQ src+16(FP), DX

	MOVUPS (DX), t0
	PSHUFB flip_mask<>(SB), t0
	PSHUFD $1, t0, t1
	PSHUFD $2, t0, t2
	PSHUFD $3, t0, t3

	XORL CX, CX

loop:
	MOVUPS (AX)(CX*1), XTMP7
	MOVOU XTMP7, x
	SM4_SINGLE_ROUND(x, y, XTMP6, t0, t1, t2, t3)
	PSHUFD $1, XTMP7, x
	SM4_SINGLE_ROUND(x, y, XTMP6, t1, t2, t3, t0)
	PSHUFD $2, XTMP7, x
	SM4_SINGLE_ROUND(x, y, XTMP6, t2, t3, t0, t1)
	PSHUFD $3, XTMP7, x
	SM4_SINGLE_ROUND(x, y, XTMP6, t3, t0, t1, t2)

	ADDL $16, CX
	CMPL CX, $4*32
	JB loop

	PUNPCKLLQ t2, t3
	PUNPCKLLQ t0, t1
	PUNPCKLQDQ t1, t3
	PSHUFB flip_mask<>(SB), t3
	MOVUPS t3, (BX)

done_sm4:
	RET
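
// For reference, each SM4_SINGLE_ROUND call above is expected to compute one
// standard SM4 round on 32-bit words, roughly (a sketch, not code from this
// package; rk is one of the four round keys loaded into XTMP7):
//
//	x := t1 ^ t2 ^ t3 ^ rk
//	x = sbox(x)                                                  // byte-wise S-box
//	t0 ^= x ^ (x <<< 2) ^ (x <<< 10) ^ (x <<< 18) ^ (x <<< 24)   // linear transform L
//
// After 32 rounds the PUNPCK*/PSHUFB sequence assembles the output words in
// reverse order (t3, t2, t1, t0) and converts them back to big-endian byte order.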