gitee.com/ks-custle/core-gm@v0.0.0-20230922171213-b83bdd97b62c/sm4/asm_amd64.s

// This SM4 implementation references https://github.com/mjosaarinen/sm4ni/blob/master/sm4ni.c
#include "textflag.h"

#define x X0
#define y X1
#define t0 X2
#define t1 X3
#define t2 X4
#define t3 X5

#define XTMP6 X6
#define XTMP7 X7

// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), RODATA, $16

// shuffle byte and word order
DATA bswap_mask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
DATA bswap_mask<>+0x08(SB)/8, $0x0001020304050607
GLOBL bswap_mask<>(SB), RODATA, $16

// nibble mask
DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL nibble_mask<>(SB), RODATA, $16

// inverse shift rows
DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
GLOBL inverse_shift_rows<>(SB), RODATA, $16

// Affine transform 1 (low and high nibbles)
DATA m1_low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653
GLOBL m1_low<>(SB), RODATA, $16

DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800
DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB
GLOBL m1_high<>(SB), RODATA, $16

// Affine transform 2 (low and high nibbles)
DATA m2_low<>+0x00(SB)/8, $0x9A950A05FEF16E61
DATA m2_low<>+0x08(SB)/8, $0x0E019E916A65FAF5
GLOBL m2_low<>(SB), RODATA, $16

DATA m2_high<>+0x00(SB)/8, $0x892D69CD44E0A400
DATA m2_high<>+0x08(SB)/8, $0x2C88CC68E14501A5
GLOBL m2_high<>(SB), RODATA, $16

// left rotations of 32-bit words by 8-bit increments
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), RODATA, $16

DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302
DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
GLOBL r16_mask<>(SB), RODATA, $16

DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201
DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
GLOBL r24_mask<>(SB), RODATA, $16

DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
GLOBL fk_mask<>(SB), RODATA, $16

#define SM4_SBOX(x, y) \
	; \ //############################# inner affine ############################//
	MOVOU x, XTMP6; \
	PAND nibble_mask<>(SB), XTMP6; \ //y = _mm_and_si128(x, c0f);
	MOVOU m1_low<>(SB), y; \
	PSHUFB XTMP6, y; \ //y = _mm_shuffle_epi8(m1l, y);
	PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
	PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
	MOVOU m1_high<>(SB), XTMP6; \
	PSHUFB x, XTMP6; \ //x = _mm_shuffle_epi8(m1h, x);
	MOVOU XTMP6, x; \ //x = _mm_shuffle_epi8(m1h, x);
	PXOR y, x; \ //x = _mm_shuffle_epi8(m1h, x) ^ y;
	; \ // inverse ShiftRows
	PSHUFB inverse_shift_rows<>(SB), x; \ //x = _mm_shuffle_epi8(x, shr);
	AESENCLAST nibble_mask<>(SB), x; \ // AESNI instruction
	; \ //############################# outer affine ############################//
	MOVOU x, XTMP6; \
	PANDN nibble_mask<>(SB), XTMP6; \ //XTMP6 = _mm_andnot_si128(x, c0f);
	MOVOU m2_low<>(SB), y; \
	PSHUFB XTMP6, y; \ //y = _mm_shuffle_epi8(m2l, XTMP6)
	PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4);
	PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f);
	MOVOU m2_high<>(SB), XTMP6; \
	PSHUFB x, XTMP6; \
	MOVOU XTMP6, x; \ //x = _mm_shuffle_epi8(m2h, x)
	PXOR y, x; \ //x = _mm_shuffle_epi8(m2h, x) ^ y;

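// Note on the macros above and below (following the sm4ni technique referenced
// at the top of this file): SM4_SBOX evaluates the SM4 S-box on every byte of x
// by combining two nibble-wise affine transforms (m1_low/m1_high and
// m2_low/m2_high, applied as PSHUFB table lookups on the low and high nibbles)
// with one AESENCLAST, whose ShiftRows step is cancelled in advance by the
// inverse_shift_rows shuffle and whose round-key operand is nibble_mask.
//
// On top of the S-box, the two tao macros implement the SM4 linear transforms
// on each 32-bit word B:
//   SM4_TAO_L1: L(B)  = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24),
//               computed as y = (B ^ (B <<< 8) ^ (B <<< 16)) <<< 2,
//               then B ^ y ^ (B <<< 24)
//   SM4_TAO_L2: L'(B) = B ^ (B <<< 13) ^ (B <<< 23), the key-schedule variant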
#define SM4_TAO_L1(x, y) \
	SM4_SBOX(x, y); \
	; \ //#################### 4 parallel L1 linear transforms ##################//
	MOVOU x, y; \
	PSHUFB r08_mask<>(SB), y; \ //y = _mm_shuffle_epi8(x, r08)
	PXOR x, y; \ //y = x xor _mm_shuffle_epi8(x, r08)
	MOVOU x, XTMP6; \
	PSHUFB r16_mask<>(SB), XTMP6; \
	PXOR XTMP6, y; \ //y = x xor _mm_shuffle_epi8(x, r08) xor _mm_shuffle_epi8(x, r16)
	MOVOU y, XTMP6; \
	PSLLL $2, XTMP6; \
	PSRLL $30, y; \
	POR XTMP6, y; \ //y = _mm_slli_epi32(y, 2) ^ _mm_srli_epi32(y, 30);
	MOVOU x, XTMP7; \
	PSHUFB r24_mask<>(SB), XTMP7; \
	PXOR y, x; \ //x = x xor y
	PXOR XTMP7, x //x = x xor y xor _mm_shuffle_epi8(x, r24);

#define SM4_TAO_L2(x, y) \
	SM4_SBOX(x, y); \
	; \ //#################### 4 parallel L2 linear transforms ##################//
	MOVOU x, y; \
	MOVOU x, XTMP6; \
	PSLLL $13, XTMP6; \
	PSRLL $19, y; \
	POR XTMP6, y; \ //y = x rol 13
	PSLLL $10, XTMP6; \
	MOVOU x, XTMP7; \
	PSRLL $9, XTMP7; \
	POR XTMP6, XTMP7; \ //XTMP7 = x rol 23
	PXOR XTMP7, y; \
	PXOR y, x

#define SM4_ROUND(index, x, y, t0, t1, t2, t3) \
	PINSRD $0, (index * 4)(AX)(CX*1), x; \
	PSHUFD $0, x, x; \
	PXOR t1, x; \
	PXOR t2, x; \
	PXOR t3, x; \
	SM4_TAO_L1(x, y); \
	PXOR x, t0

#define SM4_SINGLE_ROUND(index, x, y, t0, t1, t2, t3) \
	PINSRD $0, (index * 4)(AX)(CX*1), x; \
	PXOR t1, x; \
	PXOR t2, x; \
	PXOR t3, x; \
	SM4_TAO_L1(x, y); \
	PXOR x, t0

#define SM4_EXPANDKEY_ROUND(index, x, y, t0, t1, t2, t3) \
	PINSRD $0, (index * 4)(BX)(CX*1), x; \
	PXOR t1, x; \
	PXOR t2, x; \
	PXOR t3, x; \
	SM4_TAO_L2(x, y); \
	PXOR x, t0; \
	PEXTRD $0, t0, R8; \
	MOVL R8, (index * 4)(DX)(CX*1); \
	MOVL R8, (12 - index * 4)(DI)(SI*1)

#define XDWORD0 Y4
#define XDWORD1 Y5
#define XDWORD2 Y6
#define XDWORD3 Y7

#define XWORD0 X4
#define XWORD1 X5
#define XWORD2 X6
#define XWORD3 X7

#define XDWTMP0 Y0
#define XDWTMP1 Y1
#define XDWTMP2 Y2

#define XWTMP0 X0
#define XWTMP1 X1
#define XWTMP2 X2

#define NIBBLE_MASK Y3
#define X_NIBBLE_MASK X3

#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
#define X_BYTE_FLIP_MASK X13 // mask to convert LE -> BE

#define XDWORD Y8
#define YDWORD Y9

#define XWORD X8
#define YWORD X9

#define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
	VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w15, w7, w14, w6, w11, w3, w10, w2] tmp2 = [w7, w3, w6, w2]
	VPUNPCKLDQ r1, r0, r0; \ // r0 = [w13, w5, w12, w4, w9, w1, w8, w0] r0 = [w5, w1, w4, w0]
	VPUNPCKLDQ r3, r2, tmp1; \ // tmp1 = [w29, w21, w28, w20, w25, w17, w24, w16] tmp1 = [w13, w9, w12, w8]
	VPUNPCKHDQ r3, r2, r2; \ // r2 = [w31, w23, w30, w22, w27, w19, w26, w18] r2 = [w15, w11, w14, w10]
	VPUNPCKHQDQ tmp1, r0, r1; \ // r1 = [w29, w21, w13, w5, w25, w17, w9, w1] r1 = [w13, w9, w5, w1]
	VPUNPCKLQDQ tmp1, r0, r0; \ // r0 = [w28, w20, w12, w4, w24, w16, w8, w0] r0 = [w12, w8, w4, w0]
	VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w23, w15, w7, w27, w19, w11, w3] r3 = [w15, w11, w7, w3]
	VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w30, w22, w14, w6, w26, w18, w10, w2] r2 = [w14, w10, w6, w2]

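// Note on the wide (multi-block) code paths below: TRANSPOSE_MATRIX regroups
// four blocks held in r0..r3 so that each register ends up holding the same
// word index of every block (for 256-bit registers the unpacks work per
// 128-bit lane, giving two independent 4x4 transposes). encryptBlocksAsm runs
// the AVX2 macros on YMM registers to process eight blocks per round and the
// AVX macros on XMM registers for the four-block case, selected by the input
// length. Because this file only uses the 128-bit AESENCLAST, AVX2_SM4_SBOX
// temporarily splits the 256-bit state with VEXTRACTI128/VINSERTI128, and the
// round macros broadcast each round-key word to all lanes with VPBROADCASTD.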
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
#define AVX2_SM4_SBOX(x, y) \
	VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK; \
	VPAND NIBBLE_MASK, x, XDWTMP1; \
	VBROADCASTI128 m1_low<>(SB), y; \
	VPSHUFB XDWTMP1, y, y; \
	VPSRLQ $4, x, x; \
	VPAND NIBBLE_MASK, x, x; \
	VBROADCASTI128 m1_high<>(SB), XDWTMP1; \
	VPSHUFB x, XDWTMP1, x; \
	VPXOR y, x, x; \
	VBROADCASTI128 inverse_shift_rows<>(SB), XDWTMP1; \
	VPSHUFB XDWTMP1, x, x; \
	VEXTRACTI128 $1, x, YWORD; \
	VAESENCLAST X_NIBBLE_MASK, XWORD, XWORD; \
	VAESENCLAST X_NIBBLE_MASK, YWORD, YWORD; \
	VINSERTI128 $1, YWORD, x, x; \
	VPANDN NIBBLE_MASK, x, XDWTMP1; \
	VBROADCASTI128 m2_low<>(SB), y; \
	VPSHUFB XDWTMP1, y, y; \
	VPSRLQ $4, x, x; \
	VPAND NIBBLE_MASK, x, x; \
	VBROADCASTI128 m2_high<>(SB), XDWTMP1; \
	VPSHUFB x, XDWTMP1, x; \
	VPXOR y, x, x

#define AVX2_SM4_TAO_L1(x, y) \
	AVX2_SM4_SBOX(x, y); \
	VBROADCASTI128 r08_mask<>(SB), XDWTMP0; \
	VPSHUFB XDWTMP0, x, y; \
	VPXOR x, y, y; \
	VBROADCASTI128 r16_mask<>(SB), XDWTMP0; \
	VPSHUFB XDWTMP0, x, XDWTMP0; \
	VPXOR XDWTMP0, y, y; \
	VPSLLD $2, y, XDWTMP1; \
	VPSRLD $30, y, y; \
	VPXOR XDWTMP1, y, y; \
	VBROADCASTI128 r24_mask<>(SB), XDWTMP0; \
	VPSHUFB XDWTMP0, x, XDWTMP0; \
	VPXOR y, x, x; \
	VPXOR x, XDWTMP0, x

#define AVX2_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
	VPBROADCASTD (index * 4)(AX)(CX*1), x; \
	VPXOR t1, x, x; \
	VPXOR t2, x, x; \
	VPXOR t3, x, x; \
	AVX2_SM4_TAO_L1(x, y); \
	VPXOR x, t0, t0

#define AVX_SM4_SBOX(x, y) \
	VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK; \
	VPAND X_NIBBLE_MASK, x, XWTMP1; \
	VMOVDQU m1_low<>(SB), y; \
	VPSHUFB XWTMP1, y, y; \
	VPSRLQ $4, x, x; \
	VPAND X_NIBBLE_MASK, x, x; \
	VMOVDQU m1_high<>(SB), XWTMP1; \
	VPSHUFB x, XWTMP1, x; \
	VPXOR y, x, x; \
	VMOVDQU inverse_shift_rows<>(SB), XWTMP1; \
	VPSHUFB XWTMP1, x, x; \
	VAESENCLAST X_NIBBLE_MASK, x, x; \
	VPANDN X_NIBBLE_MASK, x, XWTMP1; \
	VMOVDQU m2_low<>(SB), y; \
	VPSHUFB XWTMP1, y, y; \
	VPSRLQ $4, x, x; \
	VPAND X_NIBBLE_MASK, x, x; \
	VMOVDQU m2_high<>(SB), XWTMP1; \
	VPSHUFB x, XWTMP1, x; \
	VPXOR y, x, x

#define AVX_SM4_TAO_L1(x, y) \
	AVX_SM4_SBOX(x, y); \
	VMOVDQU r08_mask<>(SB), XWTMP0; \
	VPSHUFB XWTMP0, x, y; \
	VPXOR x, y, y; \
	VMOVDQU r16_mask<>(SB), XWTMP0; \
	VPSHUFB XWTMP0, x, XWTMP0; \
	VPXOR XWTMP0, y, y; \
	VPSLLD $2, y, XWTMP1; \
	VPSRLD $30, y, y; \
	VPXOR XWTMP1, y, y; \
	VMOVDQU r24_mask<>(SB), XWTMP0; \
	VPSHUFB XWTMP0, x, XWTMP0; \
	VPXOR y, x, x; \
	VPXOR x, XWTMP0, x

#define AVX_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
	VPBROADCASTD (index * 4)(AX)(CX*1), x; \
	VPXOR t1, x, x; \
	VPXOR t2, x, x; \
	VPXOR t3, x, x; \
	AVX_SM4_TAO_L1(x, y); \
	VPXOR x, t0, t0

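// Note on expandKeyAsm below: the 128-bit key is byte-swapped into big-endian
// words and XORed with the SM4 system parameter FK (stored in fk_mask), then
// the 32 key-schedule rounds run four at a time. Each SM4_EXPANDKEY_ROUND
// reads one CK constant from the ck table supplied by the Go caller, applies
// the key-schedule transform T' (S-box followed by L'), and stores the
// resulting round key both forward into enc and, via the SI offset counting
// down from 112, in reverse order into dec for use during decryption.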
// func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
TEXT ·expandKeyAsm(SB),NOSPLIT,$0
	MOVQ key+0(FP), AX
	MOVQ ck+8(FP), BX
	MOVQ enc+16(FP), DX
	MOVQ dec+24(FP), DI

	MOVUPS 0(AX), t0
	PSHUFB flip_mask<>(SB), t0
	PXOR fk_mask<>(SB), t0
	PSHUFD $1, t0, t1
	PSHUFD $2, t0, t2
	PSHUFD $3, t0, t3

	XORL CX, CX
	MOVL $112, SI

loop:
	SM4_EXPANDKEY_ROUND(0, x, y, t0, t1, t2, t3)
	SM4_EXPANDKEY_ROUND(1, x, y, t1, t2, t3, t0)
	SM4_EXPANDKEY_ROUND(2, x, y, t2, t3, t0, t1)
	SM4_EXPANDKEY_ROUND(3, x, y, t3, t0, t1, t2)

	ADDL $16, CX
	SUBL $16, SI
	CMPL CX, $4*32
	JB loop

expand_end:
	RET

// func encryptBlocksAsm(xk *uint32, dst, src []byte, inst int)
TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
	MOVQ xk+0(FP), AX
	MOVQ dst+8(FP), BX
	MOVQ src+32(FP), DX
	MOVQ src_len+40(FP), DI

	CMPB ·useAVX2(SB), $1
	JE avx2

non_avx2_start:
	PINSRD $0, 0(DX), t0
	PINSRD $1, 16(DX), t0
	PINSRD $2, 32(DX), t0
	PINSRD $3, 48(DX), t0
	PSHUFB flip_mask<>(SB), t0

	PINSRD $0, 4(DX), t1
	PINSRD $1, 20(DX), t1
	PINSRD $2, 36(DX), t1
	PINSRD $3, 52(DX), t1
	PSHUFB flip_mask<>(SB), t1

	PINSRD $0, 8(DX), t2
	PINSRD $1, 24(DX), t2
	PINSRD $2, 40(DX), t2
	PINSRD $3, 56(DX), t2
	PSHUFB flip_mask<>(SB), t2

	PINSRD $0, 12(DX), t3
	PINSRD $1, 28(DX), t3
	PINSRD $2, 44(DX), t3
	PINSRD $3, 60(DX), t3
	PSHUFB flip_mask<>(SB), t3

	XORL CX, CX

loop:
	SM4_ROUND(0, x, y, t0, t1, t2, t3)
	SM4_ROUND(1, x, y, t1, t2, t3, t0)
	SM4_ROUND(2, x, y, t2, t3, t0, t1)
	SM4_ROUND(3, x, y, t3, t0, t1, t2)

	ADDL $16, CX
	CMPL CX, $4*32
	JB loop

	PSHUFB flip_mask<>(SB), t3
	PSHUFB flip_mask<>(SB), t2
	PSHUFB flip_mask<>(SB), t1
	PSHUFB flip_mask<>(SB), t0
	MOVUPS t3, 0(BX)
	MOVUPS t2, 16(BX)
	MOVUPS t1, 32(BX)
	MOVUPS t0, 48(BX)
	MOVL 4(BX), R8
	MOVL 8(BX), R9
	MOVL 12(BX), R10
	MOVL 16(BX), R11
	MOVL 32(BX), R12
	MOVL 48(BX), R13
	MOVL R11, 4(BX)
	MOVL R12, 8(BX)
	MOVL R13, 12(BX)
	MOVL R8, 16(BX)
	MOVL R9, 32(BX)
	MOVL R10, 48(BX)
	MOVL 24(BX), R8
	MOVL 28(BX), R9
	MOVL 36(BX), R10
	MOVL 52(BX), R11
	MOVL R10, 24(BX)
	MOVL R11, 28(BX)
	MOVL R8, 36(BX)
	MOVL R9, 52(BX)
	MOVL 44(BX), R8
	MOVL 56(BX), R9
	MOVL R9, 44(BX)
	MOVL R8, 56(BX)

done_sm4:
	RET

avx2:
	CMPQ DI, $64
	JBE avx2_4blocks

avx2_8blocks:
	VMOVDQU 0(DX), XDWORD0
	VMOVDQU 32(DX), XDWORD1
	VMOVDQU 64(DX), XDWORD2
	VMOVDQU 96(DX), XDWORD3
	VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK

	// Apply Byte Flip Mask: LE -> BE
	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3

	// Transpose the 4 x 4 matrix of 32-bit words
	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)

	XORL CX, CX

avx2_loop:
	AVX2_SM4_ROUND(0, XDWORD, YDWORD, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	AVX2_SM4_ROUND(1, XDWORD, YDWORD, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	AVX2_SM4_ROUND(2, XDWORD, YDWORD, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	AVX2_SM4_ROUND(3, XDWORD, YDWORD, XDWORD3, XDWORD0, XDWORD1, XDWORD2)

	ADDL $16, CX
	CMPL CX, $4*32
	JB avx2_loop

	// Transpose the 4 x 4 matrix of 32-bit words
	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)

	VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3

	VMOVDQU XDWORD0, 0(BX)
	VMOVDQU XDWORD1, 32(BX)
	VMOVDQU XDWORD2, 64(BX)
	VMOVDQU XDWORD3, 96(BX)
	JMP avx2_sm4_done

avx2_4blocks:
	VMOVDQU 0(DX), XWORD0
	VMOVDQU 16(DX), XWORD1
	VMOVDQU 32(DX), XWORD2
	VMOVDQU 48(DX), XWORD3

	VMOVDQU flip_mask<>(SB), X_BYTE_FLIP_MASK

	VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
	VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
	VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
	VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3

	// Transpose the 4 x 4 matrix of 32-bit words
	TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)

	XORL CX, CX

avx_loop:
	AVX_SM4_ROUND(0, XWORD, YWORD, XWORD0, XWORD1, XWORD2, XWORD3)
	AVX_SM4_ROUND(1, XWORD, YWORD, XWORD1, XWORD2, XWORD3, XWORD0)
	AVX_SM4_ROUND(2, XWORD, YWORD, XWORD2, XWORD3, XWORD0, XWORD1)
	AVX_SM4_ROUND(3, XWORD, YWORD, XWORD3, XWORD0, XWORD1, XWORD2)

	ADDL $16, CX
	CMPL CX, $4*32
	JB avx_loop

	// Transpose the 4 x 4 matrix of 32-bit words
	TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2)

	VMOVDQU bswap_mask<>(SB), X_BYTE_FLIP_MASK
	VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
	VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
	VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
	VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3

	VMOVDQU XWORD0, 0(BX)
	VMOVDQU XWORD1, 16(BX)
	VMOVDQU XWORD2, 32(BX)
	VMOVDQU XWORD3, 48(BX)

avx2_sm4_done:
	VZEROUPPER
	RET

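// Note on encryptBlockAsm below: it is the single-block variant of the SSE
// path in encryptBlocksAsm. Each state word is kept in the low 32-bit lane of
// t0..t3, so SM4_SINGLE_ROUND is SM4_ROUND without the PSHUFD $0 broadcast of
// the round-key word, and only the low lane of each register carries state.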
// func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
	MOVQ xk+0(FP), AX
	MOVQ dst+8(FP), BX
	MOVQ src+16(FP), DX

	PINSRD $0, 0(DX), t0
	PSHUFB flip_mask<>(SB), t0

	PINSRD $0, 4(DX), t1
	PSHUFB flip_mask<>(SB), t1

	PINSRD $0, 8(DX), t2
	PSHUFB flip_mask<>(SB), t2

	PINSRD $0, 12(DX), t3
	PSHUFB flip_mask<>(SB), t3

	XORL CX, CX

loop:
	SM4_SINGLE_ROUND(0, x, y, t0, t1, t2, t3)
	SM4_SINGLE_ROUND(1, x, y, t1, t2, t3, t0)
	SM4_SINGLE_ROUND(2, x, y, t2, t3, t0, t1)
	SM4_SINGLE_ROUND(3, x, y, t3, t0, t1, t2)

	ADDL $16, CX
	CMPL CX, $4*32
	JB loop

	PSHUFB flip_mask<>(SB), t3
	PSHUFB flip_mask<>(SB), t2
	PSHUFB flip_mask<>(SB), t1
	PSHUFB flip_mask<>(SB), t0
	MOVUPS t3, 0(BX)
	PEXTRD $0, t2, R8
	MOVL R8, 4(BX)
	PEXTRD $0, t1, R8
	MOVL R8, 8(BX)
	PEXTRD $0, t0, R8
	MOVL R8, 12(BX)
done_sm4:
	RET
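// Note on the store sequence at the end of encryptBlockAsm: after the 32
// rounds the output words sit in t3, t2, t1, t0 in that order, and only the
// low 32-bit lane of each register holds a defined value, so t3 is written as
// a full 16-byte block and words 1-3 of the output are then patched from the
// low words of t2, t1 and t0.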