github.com/emmansun/gmsm@v0.29.1/sm4/ecb_amd64.s

//go:build !purego

#include "textflag.h"

#include "aesni_macros_amd64.s"

#define XDWTMP0 Y0
#define XDWTMP1 Y1

#define XDWORD0 Y4
#define XDWORD1 Y5
#define XDWORD2 Y6
#define XDWORD3 Y7

#define XDWORD4 Y10
#define XDWORD5 Y11
#define XDWORD6 Y12
#define XDWORD7 Y14

#define XWTMP0 X0
#define XWTMP1 X1
#define XWTMP2 X2

#define XWORD0 X4
#define XWORD1 X5
#define XWORD2 X6
#define XWORD3 X7

#define XWORD4 X10
#define XWORD5 X11
#define XWORD6 X12
#define XWORD7 X14

#define NIBBLE_MASK Y3
#define X_NIBBLE_MASK X3

#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
#define X_BYTE_FLIP_MASK X13 // mask to convert LE -> BE

#define BSWAP_MASK Y2

#define XDWORD Y8
#define YDWORD Y9

#define XWORD X8
#define YWORD X9

// func encryptSm4Ecb(xk *uint32, dst, src []byte)
TEXT ·encryptSm4Ecb(SB),NOSPLIT,$0
	MOVQ xk+0(FP), AX      // AX: expanded round keys
	MOVQ dst+8(FP), BX     // BX: destination pointer
	MOVQ src+32(FP), DX    // DX: source pointer
	MOVQ src_len+40(FP), DI // DI: remaining length in bytes

	CMPB ·useAVX2(SB), $1
	JE avx2_start

	CMPB ·useAVX(SB), $1
	JE avxEcbSm4Octets

ecbSm4Octets:
	// Baseline path: encrypt 8 blocks (128 bytes) per iteration.
	CMPQ DI, $128
	JB ecbSm4Nibbles
	SUBQ $128, DI

	MOVOU 0(DX), XWORD0
	MOVOU 16(DX), XWORD1
	MOVOU 32(DX), XWORD2
	MOVOU 48(DX), XWORD3
	MOVOU 64(DX), XWORD4
	MOVOU 80(DX), XWORD5
	MOVOU 96(DX), XWORD6
	MOVOU 112(DX), XWORD7

	SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)

	MOVOU XWORD0, 0(BX)
	MOVOU XWORD1, 16(BX)
	MOVOU XWORD2, 32(BX)
	MOVOU XWORD3, 48(BX)
	MOVOU XWORD4, 64(BX)
	MOVOU XWORD5, 80(BX)
	MOVOU XWORD6, 96(BX)
	MOVOU XWORD7, 112(BX)

	LEAQ 128(BX), BX
	LEAQ 128(DX), DX
	JMP ecbSm4Octets

ecbSm4Nibbles:
	// Encrypt 4 blocks (64 bytes) if at least 64 bytes remain.
	CMPQ DI, $64
	JB ecbSm4Single
	SUBQ $64, DI

	MOVOU 0(DX), XWORD0
	MOVOU 16(DX), XWORD1
	MOVOU 32(DX), XWORD2
	MOVOU 48(DX), XWORD3

	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

	MOVUPS XWORD0, 0(BX)
	MOVUPS XWORD1, 16(BX)
	MOVUPS XWORD2, 32(BX)
	MOVUPS XWORD3, 48(BX)

	LEAQ 64(BX), BX
	LEAQ 64(DX), DX

ecbSm4Single:
	// At most 3 blocks (48 bytes) remain here.
	TESTQ DI, DI
	JE ecbSm4Done

	MOVOU 0(DX), XWORD0
	CMPQ DI, $32
	JEQ ecbSm4Single32
	CMPQ DI, $48
	JEQ ecbSm4Single48
	SM4_SINGLE_BLOCK(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
	MOVUPS XWORD0, 0(BX)
	JMP ecbSm4Done

ecbSm4Single32:
	MOVOU 16(DX), XWORD1
	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
	MOVUPS XWORD0, 0(BX)
	MOVUPS XWORD1, 16(BX)
	JMP ecbSm4Done

ecbSm4Single48:
	MOVOU 16(DX), XWORD1
	MOVOU 32(DX), XWORD2
	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
	MOVUPS XWORD0, 0(BX)
	MOVUPS XWORD1, 16(BX)
	MOVUPS XWORD2, 32(BX)

ecbSm4Done:
	RET

avxEcbSm4Octets:
	// AVX path: encrypt 8 blocks (128 bytes) per iteration.
	CMPQ DI, $128
	JB avxEcbSm4Nibbles
	SUBQ $128, DI

	VMOVDQU 0(DX), XWORD0
	VMOVDQU 16(DX), XWORD1
	VMOVDQU 32(DX), XWORD2
	VMOVDQU 48(DX), XWORD3
	VMOVDQU 64(DX), XWORD4
	VMOVDQU 80(DX), XWORD5
	VMOVDQU 96(DX), XWORD6
	VMOVDQU 112(DX), XWORD7

	AVX_SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)

	VMOVDQU XWORD0, 0(BX)
	VMOVDQU XWORD1, 16(BX)
	VMOVDQU XWORD2, 32(BX)
	VMOVDQU XWORD3, 48(BX)
	VMOVDQU XWORD4, 64(BX)
	VMOVDQU XWORD5, 80(BX)
	VMOVDQU XWORD6, 96(BX)
	VMOVDQU XWORD7, 112(BX)

	LEAQ 128(BX), BX
	LEAQ 128(DX), DX
	JMP avxEcbSm4Octets

avxEcbSm4Nibbles:
	CMPQ DI, $64
	JB avxEcbSm4Single
	SUBQ $64, DI

	VMOVDQU 0(DX), XWORD0
	VMOVDQU 16(DX), XWORD1
	VMOVDQU 32(DX), XWORD2
	VMOVDQU 48(DX), XWORD3

	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

	VMOVDQU XWORD0, 0(BX)
	VMOVDQU XWORD1, 16(BX)
	VMOVDQU XWORD2, 32(BX)
	VMOVDQU XWORD3, 48(BX)

	LEAQ 64(BX), BX
	LEAQ 64(DX), DX

avxEcbSm4Single:
	TESTQ DI, DI
	JE avxEcbSm4Done

	VMOVDQU 0(DX), XWORD0
	CMPQ DI, $32
	JEQ avxEcbSm4Single32
	CMPQ DI, $48
	JEQ avxEcbSm4Single48
	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
	VMOVDQU XWORD0, 0(BX)
	JMP avxEcbSm4Done

avxEcbSm4Single32:
	VMOVDQU 16(DX), XWORD1
	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
	VMOVDQU XWORD0, 0(BX)
	VMOVDQU XWORD1, 16(BX)
	JMP avxEcbSm4Done

avxEcbSm4Single48:
	VMOVDQU 16(DX), XWORD1
	VMOVDQU 32(DX), XWORD2
	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
	VMOVDQU XWORD0, 0(BX)
	VMOVDQU XWORD1, 16(BX)
	VMOVDQU XWORD2, 32(BX)

avxEcbSm4Done:
	RET

avx2_start:
	// AVX2 path: broadcast the shuffle masks used below.
	VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
	VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
	VBROADCASTI128 bswap_mask<>(SB), BSWAP_MASK

avx2_16blocks:
	// Encrypt 16 blocks (256 bytes) per iteration.
	CMPQ DI, $256
	JB avx2EcbSm4Octets
	SUBQ $256, DI

	VMOVDQU 0(DX), XDWORD0
	VMOVDQU 32(DX), XDWORD1
	VMOVDQU 64(DX), XDWORD2
	VMOVDQU 96(DX), XDWORD3
	VMOVDQU 128(DX), XDWORD4
	VMOVDQU 160(DX), XDWORD5
	VMOVDQU 192(DX), XDWORD6
	VMOVDQU 224(DX), XDWORD7

	// Apply byte flip mask: LE -> BE
	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
	VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4
	VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5
	VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6
	VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7

	// Transpose the 4x4 matrices of 32-bit words
	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
	TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP0, XDWTMP1)

	AVX2_SM4_16BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWTMP1, XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWORD4, XDWORD5, XDWORD6, XDWORD7)

	// Transpose the 4x4 matrices of 32-bit words
	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
	TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP0, XDWTMP1)

	VPSHUFB BSWAP_MASK, XDWORD0, XDWORD0
	VPSHUFB BSWAP_MASK, XDWORD1, XDWORD1
	VPSHUFB BSWAP_MASK, XDWORD2, XDWORD2
	VPSHUFB BSWAP_MASK, XDWORD3, XDWORD3
	VPSHUFB BSWAP_MASK, XDWORD4, XDWORD4
	VPSHUFB BSWAP_MASK, XDWORD5, XDWORD5
	VPSHUFB BSWAP_MASK, XDWORD6, XDWORD6
	VPSHUFB BSWAP_MASK, XDWORD7, XDWORD7

	VMOVDQU XDWORD0, 0(BX)
	VMOVDQU XDWORD1, 32(BX)
	VMOVDQU XDWORD2, 64(BX)
	VMOVDQU XDWORD3, 96(BX)
	VMOVDQU XDWORD4, 128(BX)
	VMOVDQU XDWORD5, 160(BX)
	VMOVDQU XDWORD6, 192(BX)
	VMOVDQU XDWORD7, 224(BX)

	LEAQ 256(BX), BX
	LEAQ 256(DX), DX
	JMP avx2_16blocks

avx2EcbSm4Octets:
	// Fewer than 256 bytes remain: encrypt 8 blocks (128 bytes) with AVX2.
	CMPQ DI, $128
	JB avx2EcbSm4Nibbles
	SUBQ $128, DI
	VMOVDQU 0(DX), XDWORD0
	VMOVDQU 32(DX), XDWORD1
	VMOVDQU 64(DX), XDWORD2
	VMOVDQU 96(DX), XDWORD3

	// Apply byte flip mask: LE -> BE
	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3

	// Transpose the 4x4 matrix of 32-bit words
	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)

	AVX2_SM4_8BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

	// Transpose the 4x4 matrix of 32-bit words
	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)

	VPSHUFB BSWAP_MASK, XDWORD0, XDWORD0
	VPSHUFB BSWAP_MASK, XDWORD1, XDWORD1
	VPSHUFB BSWAP_MASK, XDWORD2, XDWORD2
	VPSHUFB BSWAP_MASK, XDWORD3, XDWORD3

	VMOVDQU XDWORD0, 0(BX)
	VMOVDQU XDWORD1, 32(BX)
	VMOVDQU XDWORD2, 64(BX)
	VMOVDQU XDWORD3, 96(BX)

	LEAQ 128(BX), BX
	LEAQ 128(DX), DX
	JMP avx2EcbSm4Octets

avx2EcbSm4Nibbles:
	// Fewer than 128 bytes remain: fall back to the 4-block AVX code path.
	CMPQ DI, $64
	JB avx2EcbSm4Single
	SUBQ $64, DI

	VMOVDQU 0(DX), XWORD0
	VMOVDQU 16(DX), XWORD1
	VMOVDQU 32(DX), XWORD2
	VMOVDQU 48(DX), XWORD3

	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

	VMOVDQU XWORD0, 0(BX)
	VMOVDQU XWORD1, 16(BX)
	VMOVDQU XWORD2, 32(BX)
	VMOVDQU XWORD3, 48(BX)

	LEAQ 64(BX), BX
	LEAQ 64(DX), DX

avx2EcbSm4Single:
	TESTQ DI, DI
	JE avx2EcbSm4Done

	VMOVDQU 0(DX), XWORD0
	CMPQ DI, $32
	JEQ avx2EcbSm4Single32
	CMPQ DI, $48
	JEQ avx2EcbSm4Single48
	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
	VMOVDQU XWORD0, 0(BX)
	JMP avx2EcbSm4Done

avx2EcbSm4Single32:
	VMOVDQU 16(DX), XWORD1
	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
	VMOVDQU XWORD0, 0(BX)
	VMOVDQU XWORD1, 16(BX)
	JMP avx2EcbSm4Done

avx2EcbSm4Single48:
	VMOVDQU 16(DX), XWORD1
	VMOVDQU 32(DX), XWORD2
	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
	VMOVDQU XWORD0, 0(BX)
	VMOVDQU XWORD1, 16(BX)
	VMOVDQU XWORD2, 32(BX)

avx2EcbSm4Done:
	VZEROUPPER
	RET
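
The Go side of this routine is just a bodyless declaration plus a thin wrapper that lives in a .go file of the sm4 package. Below is a minimal sketch of what that side looks like: the encryptSm4Ecb signature is taken from the comment above the TEXT symbol, while the wrapper name ecbEncryptAsm, the [32]uint32 round-key layout, and the //go:noescape directive are illustrative assumptions, not the library's actual identifiers.

package sm4

// encryptSm4Ecb is implemented in ecb_amd64.s; the signature mirrors the
// comment above the TEXT symbol. //go:noescape is a typical annotation for
// such routines and is assumed here.
//
//go:noescape
func encryptSm4Ecb(xk *uint32, dst, src []byte)

// ecbEncryptAsm is a hypothetical wrapper showing the calling contract the
// assembly relies on: src holds whole 16-byte SM4 blocks and dst is at least
// as long, so the 256/128/64/48/32/16-byte tails above cover every case.
func ecbEncryptAsm(enc *[32]uint32, dst, src []byte) {
	if len(src)%16 != 0 || len(dst) < len(src) {
		panic("sm4: invalid ECB buffer lengths")
	}
	encryptSm4Ecb(&enc[0], dst, src)
}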