gitee.com/ks-custle/core-gm@v0.0.0-20230922171213-b83bdd97b62c/sm4/gcm_amd64.s

// This is an optimized implementation of SM4-GCM using AES-NI and CLMUL-NI
// The implementation uses some optimizations as described in:
// [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
//     Instruction and its Usage for Computing the GCM Mode rev. 2.02
// [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
//     Hardware

#include "textflag.h"

#define B0 X0
#define B1 X1
#define B2 X2
#define B3 X3
#define B4 X4
#define B5 X5
#define B6 X6
#define B7 X7

#define DWB0 Y0
#define DWB1 Y2
#define DWB2 Y4
#define DWB3 Y6

#define XDWORD Y1
#define YDWORD Y3
#define XDWTMP0 Y5
#define XDWTMP1 Y7

#define ACC0 X8
#define ACC1 X9
#define ACCM X10

#define T0 X11
#define T1 X12
#define T2 X13
#define POLY X14
#define BSWAP X15
#define DWBSWAP Y15
#define NIBBLE_MASK Y11
#define X_NIBBLE_MASK X11

// shuffle byte order from LE to BE
DATA flipMask<>+0x00(SB)/8, $0x0405060700010203
DATA flipMask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b

// nibble mask
DATA nibbleMask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA nibbleMask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F

// inverse shift rows
DATA inverseShiftRows<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA inverseShiftRows<>+0x08(SB)/8, $0x0306090C0F020508

// Affine transform 1 (low and high nibbles)
DATA m1Low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
DATA m1Low<>+0x08(SB)/8, $0x3045F98CEF9A2653

DATA m1High<>+0x00(SB)/8, $0xC35BF46CAF379800
DATA m1High<>+0x08(SB)/8, $0x68F05FC7049C33AB

// Affine transform 2 (low and high nibbles)
DATA m2Low<>+0x00(SB)/8, $0x9A950A05FEF16E61
DATA m2Low<>+0x08(SB)/8, $0x0E019E916A65FAF5

DATA m2High<>+0x00(SB)/8, $0x892D69CD44E0A400
DATA m2High<>+0x08(SB)/8, $0x2C88CC68E14501A5

// left rotations of 32-bit words by 8-bit increments
DATA r08Mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08Mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B

DATA r16Mask<>+0x00(SB)/8, $0x0504070601000302
DATA r16Mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A

DATA r24Mask<>+0x00(SB)/8, $0x0407060500030201
DATA r24Mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09

DATA fkMask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
DATA fkMask<>+0x08(SB)/8, $0xb27022dc677d9197

DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607

DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000

DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
DATA andMask<>+0x08(SB)/8, $0x0000000000000000
DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
DATA andMask<>+0x18(SB)/8, $0x0000000000000000
DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
DATA andMask<>+0x28(SB)/8, $0x0000000000000000
DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
DATA andMask<>+0x38(SB)/8, $0x0000000000000000
DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
DATA andMask<>+0x48(SB)/8, $0x0000000000000000
DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0x58(SB)/8, $0x0000000000000000
DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
DATA andMask<>+0x68(SB)/8, $0x0000000000000000
DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x78(SB)/8, $0x0000000000000000
DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
DATA andMask<>+0x90(SB)/8,
$0xffffffffffffffff 106 DATA andMask<>+0x98(SB)/8, $0x000000000000ffff 107 DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff 108 DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff 109 DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff 110 DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff 111 DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff 112 DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff 113 DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff 114 DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff 115 DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff 116 DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff 117 118 GLOBL flipMask<>(SB), (NOPTR+RODATA), $16 119 GLOBL nibbleMask<>(SB), (NOPTR+RODATA), $16 120 GLOBL inverseShiftRows<>(SB), (NOPTR+RODATA), $16 121 GLOBL m1Low<>(SB), (NOPTR+RODATA), $16 122 GLOBL m1High<>(SB), (NOPTR+RODATA), $16 123 GLOBL m2Low<>(SB), (NOPTR+RODATA), $16 124 GLOBL m2High<>(SB), (NOPTR+RODATA), $16 125 GLOBL r08Mask<>(SB), (NOPTR+RODATA), $16 126 GLOBL r16Mask<>(SB), (NOPTR+RODATA), $16 127 GLOBL r24Mask<>(SB), (NOPTR+RODATA), $16 128 GLOBL fkMask<>(SB), (NOPTR+RODATA), $16 129 GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16 130 GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16 131 GLOBL andMask<>(SB), (NOPTR+RODATA), $240 132 133 // func gcmSm4Finish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64) 134 TEXT ·gcmSm4Finish(SB),NOSPLIT,$0 135 #define pTbl DI 136 #define tMsk SI 137 #define tPtr DX 138 #define plen AX 139 #define dlen CX 140 141 MOVQ productTable+0(FP), pTbl 142 MOVQ tagMask+8(FP), tMsk 143 MOVQ T+16(FP), tPtr 144 MOVQ pLen+24(FP), plen 145 MOVQ dLen+32(FP), dlen 146 147 MOVOU (tPtr), ACC0 148 MOVOU (tMsk), T2 149 150 MOVOU bswapMask<>(SB), BSWAP 151 MOVOU gcmPoly<>(SB), POLY 152 153 SHLQ $3, plen 154 SHLQ $3, dlen 155 156 MOVQ plen, B0 157 PINSRQ $1, dlen, B0 158 159 PXOR ACC0, B0 160 161 MOVOU (16*14)(pTbl), ACC0 162 MOVOU (16*15)(pTbl), ACCM 163 MOVOU ACC0, ACC1 164 165 PCLMULQDQ $0x00, B0, ACC0 166 PCLMULQDQ $0x11, B0, ACC1 167 PSHUFD $78, B0, T0 168 PXOR B0, T0 169 PCLMULQDQ $0x00, T0, ACCM 170 171 PXOR ACC0, ACCM 172 PXOR ACC1, ACCM 173 MOVOU ACCM, T0 174 PSRLDQ $8, ACCM 175 PSLLDQ $8, T0 176 PXOR ACCM, ACC1 177 PXOR T0, ACC0 178 179 MOVOU POLY, T0 180 PCLMULQDQ $0x01, ACC0, T0 181 PSHUFD $78, ACC0, ACC0 182 PXOR T0, ACC0 183 184 MOVOU POLY, T0 185 PCLMULQDQ $0x01, ACC0, T0 186 PSHUFD $78, ACC0, ACC0 187 PXOR T0, ACC0 188 189 PXOR ACC1, ACC0 190 191 PSHUFB BSWAP, ACC0 192 PXOR T2, ACC0 193 MOVOU ACC0, (tPtr) 194 195 RET 196 197 #undef pTbl 198 #undef tMsk 199 #undef tPtr 200 #undef plen 201 #undef dlen 202 203 #define SM4_SBOX(x, y, z) \ 204 ; \ //############################# inner affine ############################// 205 MOVOU x, z; \ 206 PAND nibbleMask<>(SB), z; \ //y = _mm_and_si128(x, c0f); 207 MOVOU m1Low<>(SB), y; \ 208 PSHUFB z, y; \ //y = _mm_shuffle_epi8(m1l, y); 209 PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4); 210 PAND nibbleMask<>(SB), x; \ //x = _mm_and_si128(x, c0f); 211 MOVOU m1High<>(SB), z; \ 212 PSHUFB x, z; \ //x = _mm_shuffle_epi8(m1h, x); 213 MOVOU z, x; \ //x = _mm_shuffle_epi8(m1h, x); 214 PXOR y, x; \ //x = _mm_shuffle_epi8(m1h, x) ^ y; 215 ; \ // inverse ShiftRows 216 PSHUFB inverseShiftRows<>(SB), x; \ //x = _mm_shuffle_epi8(x, shr); 217 AESENCLAST nibbleMask<>(SB), x; \ // AESNI instruction 218 ; \ //############################# outer affine ############################// 219 MOVOU x, z; \ 220 PANDN nibbleMask<>(SB), z; \ //z = _mm_andnot_si128(x, c0f); 221 MOVOU m2Low<>(SB), y; \ 222 PSHUFB z, y; \ //y = _mm_shuffle_epi8(m2l, z) 223 PSRLQ $4, 
x; \ //x = _mm_srli_epi64(x, 4); 224 PAND nibbleMask<>(SB), x; \ //x = _mm_and_si128(x, c0f); 225 MOVOU m2High<>(SB), z; \ 226 PSHUFB x, z; \ 227 MOVOU z, x; \ //x = _mm_shuffle_epi8(m2h, x) 228 PXOR y, x //x = _mm_shuffle_epi8(m2h, x) ^ y; 229 230 #define SM4_TAO_L1(x, y, z) \ 231 SM4_SBOX(x, y, z); \ 232 ; \ //#################### 4 parallel L1 linear transforms ##################// 233 MOVOU x, y; \ 234 PSHUFB r08Mask<>(SB), y; \ //y = _mm_shuffle_epi8(x, r08) 235 PXOR x, y; \ //y = x xor _mm_shuffle_epi8(x, r08) 236 MOVOU x, z; \ 237 PSHUFB r16Mask<>(SB), z; \ 238 PXOR z, y; \ //y = x xor _mm_shuffle_epi8(x, r08) xor _mm_shuffle_epi8(x, r16) 239 MOVOU y, z; \ 240 PSLLL $2, z; \ 241 PSRLL $30, y; \ 242 POR z, y; \ //y = _mm_slli_epi32(y, 2) ^ _mm_srli_epi32(y, 30); 243 MOVOU x, z; \ 244 PSHUFB r24Mask<>(SB), z; \ 245 PXOR y, x; \ //x = x xor y 246 PXOR z, x //x = x xor y xor _mm_shuffle_epi8(x, r24); 247 248 #define SM4_SINGLE_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \ 249 PINSRD $0, (index * 4)(RK)(IND*1), x; \ 250 PXOR t1, x; \ 251 PXOR t2, x; \ 252 PXOR t3, x; \ 253 SM4_TAO_L1(x, y, z); \ 254 PXOR x, t0 255 256 #define SM4_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \ 257 PINSRD $0, (index * 4)(RK)(IND*1), x; \ 258 PSHUFD $0, x, x; \ 259 PXOR t1, x; \ 260 PXOR t2, x; \ 261 PXOR t3, x; \ 262 SM4_TAO_L1(x, y, z); \ 263 PXOR x, t0 264 265 // MOVOU r0, tmp2; 266 // PUNPCKHDQ r1, tmp2; 267 // PUNPCKLDQ r1, r0; 268 // MOVOU r2, tmp1; 269 // PUNPCKLDQ r3, tmp1; 270 // PUNPCKHDQ r3, r2; 271 // MOVOU r0, r1; 272 // PUNPCKHQDQ tmp1, r1; 273 // PUNPCKLQDQ tmp1, r0; 274 // MOVOU tmp2, r3; 275 // PUNPCKHQDQ r2, r3; 276 // PUNPCKLQDQ r2, tmp2; 277 // MOVOU tmp2, r2 278 #define SSE_TRANSPOSE_MATRIX(r, r0, r1, r2, r3, tmp1, tmp2) \ 279 PEXTRD $2, r0, r; \ 280 PINSRD $0, r, tmp2; \ 281 PEXTRD $2, r1, r; \ 282 PINSRD $1, r, tmp2; \ 283 ; \ 284 PEXTRD $3, r0, r; \ 285 PINSRD $2, r, tmp2; \ 286 PEXTRD $3, r1, r; \ 287 PINSRD $3, r, tmp2; \ // tmp2 = [w7, w3, w6, w2] 288 ; \ 289 PEXTRD $1, r0, r; \ 290 PINSRD $2, r, r0; \ 291 PEXTRD $0, r1, r; \ 292 PINSRD $1, r, r0; \ 293 PEXTRD $1, r1, r; \ 294 PINSRD $3, r, r0; \ // r0 = [w5, w1, w4, w0] 295 ; \ 296 PEXTRD $0, r2, r; \ 297 PINSRD $0, r, tmp1; \ 298 PEXTRD $0, r3, r; \ 299 PINSRD $1, r, tmp1; \ 300 PEXTRD $1, r2, r; \ 301 PINSRD $2, r, tmp1; \ 302 PEXTRD $1, r3, r; \ 303 PINSRD $3, r, tmp1; \ // tmp1 = [w13, w9, w12, w8] 304 ; \ 305 PEXTRD $2, r2, r; \ 306 PINSRD $0, r, r2; \ 307 PEXTRD $2, r3, r; \ 308 PINSRD $1, r, r2; \ 309 PEXTRD $3, r2, r; \ 310 PINSRD $2, r, r2; \ 311 PEXTRD $3, r3, r; \ 312 PINSRD $3, r, r2; \ // r2 = [w15, w11, w14, w10] 313 ; \ 314 MOVOU r0, r1; \ 315 PEXTRQ $1, r1, r; \ 316 PINSRQ $0, r, r1; \ 317 PEXTRQ $1, tmp1, r; \ 318 PINSRQ $1, r, r1; \ // r1 = [w13, w9, w5, w1] 319 ; \ 320 PEXTRQ $0, tmp1, r; \ 321 PINSRQ $1, r, r0; \ // r0 = [w12, w8, w4, w0] 322 ; \ 323 MOVOU tmp2, r3; \ 324 PEXTRQ $1, r3, r; \ 325 PINSRQ $0, r, r3; \ 326 PEXTRQ $1, r2, r; \ 327 PINSRQ $1, r, r3; \ // r3 = [w15, w11, w7, w3] 328 ; \ 329 PEXTRQ $0, r2, r; \ 330 PINSRQ $1, r, r2; \ 331 PEXTRQ $0, tmp2, r; \ 332 PINSRQ $0, r, r2 333 334 #define SM4_4BLOCKS(RK, IND, x, y, z, t0, t1, t2, t3) \ 335 PSHUFB flipMask<>(SB), t0; \ 336 PSHUFB flipMask<>(SB), t1; \ 337 PSHUFB flipMask<>(SB), t2; \ 338 PSHUFB flipMask<>(SB), t3; \ 339 SSE_TRANSPOSE_MATRIX(R12, t0, t1, t2, t3, x, y); \ 340 XORL IND, IND; \ 341 SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \ 342 SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \ 343 SM4_ROUND(2, RK, IND, x, y, z, t2, t3, 
t0, t1); \ 344 SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \ 345 ADDL $16, IND; \ 346 SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \ 347 SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \ 348 SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \ 349 SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \ 350 ADDL $16, IND; \ 351 SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \ 352 SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \ 353 SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \ 354 SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \ 355 ADDL $16, IND; \ 356 SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \ 357 SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \ 358 SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \ 359 SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \ 360 ADDL $16, IND; \ 361 SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \ 362 SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \ 363 SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \ 364 SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \ 365 ADDL $16, IND; \ 366 SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \ 367 SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \ 368 SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \ 369 SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \ 370 ADDL $16, IND; \ 371 SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \ 372 SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \ 373 SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \ 374 SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \ 375 ADDL $16, IND; \ 376 SM4_ROUND(0, RK, IND, x, y, z, t0, t1, t2, t3); \ 377 SM4_ROUND(1, RK, IND, x, y, z, t1, t2, t3, t0); \ 378 SM4_ROUND(2, RK, IND, x, y, z, t2, t3, t0, t1); \ 379 SM4_ROUND(3, RK, IND, x, y, z, t3, t0, t1, t2); \ 380 SSE_TRANSPOSE_MATRIX(R12, t0, t1, t2, t3, x, y); \ 381 PSHUFB BSWAP, t3; \ 382 PSHUFB BSWAP, t2; \ 383 PSHUFB BSWAP, t1; \ 384 PSHUFB BSWAP, t0 385 386 #define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \ 387 VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w15, w7, w14, w6, w11, w3, w10, w2] tmp2 = [w7, w3, w6, w2] 388 VPUNPCKLDQ r1, r0, r0; \ // r0 = [w13, w5, w12, w4, w9, w1, w8, w0] r0 = [w5, w1, w4, w0] 389 VPUNPCKLDQ r3, r2, tmp1; \ // tmp1 = [w29, w21, w28, w20, w25, w17, w24, w16] tmp1 = [w13, w9, w12, w8] 390 VPUNPCKHDQ r3, r2, r2; \ // r2 = [w31, w27, w30, w22, w27, w19, w26, w18] r2 = [w15, w11, w14, w10] 391 VPUNPCKHQDQ tmp1, r0, r1; \ // r1 = [w29, w21, w13, w5, w25, w17, w9, w1] r1 = [w13, w9, w5, w1] 392 VPUNPCKLQDQ tmp1, r0, r0; \ // r0 = [w28, w20, w12, w4, w24, w16, w8, w0] r0 = [w12, w8, w4, w0] 393 VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w27, w15, w7, w27, w19, w11, w3] r3 = [w15, w11, w7, w3] 394 VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w30, w22, w14, w6, w26, w18, w10, w2] r2 = [w14, w10, w6, w2] 395 396 #define AVX2_SM4_SBOX(x, y, xw, yw, tmp) \ 397 VPAND NIBBLE_MASK, x, tmp; \ 398 VBROADCASTI128 m1Low<>(SB), y; \ 399 VPSHUFB tmp, y, y; \ 400 VPSRLQ $4, x, x; \ 401 VPAND NIBBLE_MASK, x, x; \ 402 VBROADCASTI128 m1High<>(SB), tmp; \ 403 VPSHUFB x, tmp, x; \ 404 VPXOR y, x, x; \ 405 VBROADCASTI128 inverseShiftRows<>(SB), tmp; \ 406 VPSHUFB tmp, x, x; \ 407 VEXTRACTI128 $1, x, yw \ 408 VAESENCLAST X_NIBBLE_MASK, xw, xw; \ 409 VAESENCLAST X_NIBBLE_MASK, yw, yw; \ 410 VINSERTI128 $1, yw, x, x; \ 411 VPANDN NIBBLE_MASK, x, tmp; \ 412 VBROADCASTI128 m2Low<>(SB), y; \ 413 VPSHUFB tmp, y, y; \ 414 VPSRLQ $4, x, x; \ 415 VPAND NIBBLE_MASK, x, x; \ 416 VBROADCASTI128 m2High<>(SB), tmp; \ 417 VPSHUFB x, tmp, x; \ 418 VPXOR y, x, x 419 420 #define AVX2_SM4_TAO_L1(x, y, xw, yw, tmp) \ 421 
AVX2_SM4_SBOX(x, y, xw, yw, tmp); \ 422 VBROADCASTI128 r08Mask<>(SB), tmp; \ 423 VPSHUFB tmp, x, y; \ 424 VPXOR x, y, y; \ 425 VBROADCASTI128 r16Mask<>(SB), tmp; \ 426 VPSHUFB tmp, x, tmp; \ 427 VPXOR tmp, y, y; \ 428 VPSLLD $2, y, tmp; \ 429 VPSRLD $30, y, y; \ 430 VPXOR tmp, y, y; \ 431 VBROADCASTI128 r24Mask<>(SB), tmp; \ 432 VPSHUFB tmp, x, tmp; \ 433 VPXOR y, x, x; \ 434 VPXOR x, tmp, x 435 436 #define AVX2_SM4_ROUND(index, RK, IND, x, y, xw, yw, tmp, t0, t1, t2, t3) \ 437 VPBROADCASTD (index * 4)(RK)(IND*1), x; \ 438 VPXOR t1, x, x; \ 439 VPXOR t2, x, x; \ 440 VPXOR t3, x, x; \ 441 AVX2_SM4_TAO_L1(x, y, xw, yw, tmp); \ 442 VPXOR x, t0, t0 443 444 #define AVX_SM4_SBOX(x, y, tmp) \ 445 VPAND X_NIBBLE_MASK, x, tmp; \ 446 VMOVDQU m1Low<>(SB), y; \ 447 VPSHUFB tmp, y, y; \ 448 VPSRLQ $4, x, x; \ 449 VPAND X_NIBBLE_MASK, x, x; \ 450 VMOVDQU m1High<>(SB), tmp; \ 451 VPSHUFB x, tmp, x; \ 452 VPXOR y, x, x; \ 453 VMOVDQU inverseShiftRows<>(SB), tmp; \ 454 VPSHUFB tmp, x, x; \ 455 VAESENCLAST X_NIBBLE_MASK, x, x; \ 456 VPANDN X_NIBBLE_MASK, x, tmp; \ 457 VMOVDQU m2Low<>(SB), y; \ 458 VPSHUFB tmp, y, y; \ 459 VPSRLQ $4, x, x; \ 460 VPAND X_NIBBLE_MASK, x, x; \ 461 VMOVDQU m2High<>(SB), tmp; \ 462 VPSHUFB x, tmp, x; \ 463 VPXOR y, x, x 464 465 #define AVX_SM4_TAO_L1(x, y, tmp) \ 466 AVX_SM4_SBOX(x, y, tmp); \ 467 VMOVDQU r08Mask<>(SB), tmp; \ 468 VPSHUFB tmp, x, y; \ 469 VPXOR x, y, y; \ 470 VMOVDQU r16Mask<>(SB), tmp; \ 471 VPSHUFB tmp, x, tmp; \ 472 VPXOR tmp, y, y; \ 473 VPSLLD $2, y, tmp; \ 474 VPSRLD $30, y, y; \ 475 VPXOR tmp, y, y; \ 476 VMOVDQU r24Mask<>(SB), tmp; \ 477 VPSHUFB tmp, x, tmp; \ 478 VPXOR y, x, x; \ 479 VPXOR x, tmp, x 480 481 #define AVX_SM4_ROUND(index, RK, IND, x, y, tmp, t0, t1, t2, t3) \ 482 VPBROADCASTD (index * 4)(RK)(IND*1), x; \ 483 VPXOR t1, x, x; \ 484 VPXOR t2, x, x; \ 485 VPXOR t3, x, x; \ 486 AVX_SM4_TAO_L1(x, y, tmp); \ 487 VPXOR x, t0, t0 488 489 // func gcmSm4Init(productTable *[256]byte, rk []uint32) 490 TEXT ·gcmSm4Init(SB),NOSPLIT,$0 491 #define dst DI 492 #define RK SI 493 494 MOVQ productTable+0(FP), dst 495 MOVQ rk+8(FP), RK 496 497 MOVOU gcmPoly<>(SB), POLY 498 499 // Encrypt block 0, with the sm4 round keys to generate the hash key H 500 PXOR B0, B0 501 PXOR B1, B1 502 PXOR B2, B2 503 PXOR B3, B3 504 XORL CX, CX 505 506 sm4InitEncLoop: 507 SM4_SINGLE_ROUND(0, RK, CX, T0, T1, T2, B0, B1, B2, B3) 508 SM4_SINGLE_ROUND(1, RK, CX, T0, T1, T2, B1, B2, B3, B0) 509 SM4_SINGLE_ROUND(2, RK, CX, T0, T1, T2, B2, B3, B0, B1) 510 SM4_SINGLE_ROUND(3, RK, CX, T0, T1, T2, B3, B0, B1, B2) 511 512 ADDL $16, CX 513 CMPL CX, $4*32 514 JB sm4InitEncLoop 515 516 PEXTRD $0, B1, R8 517 PINSRD $1, R8, B0 518 PEXTRD $0, B2, R8 519 PINSRD $2, R8, B0 520 PEXTRD $0, B3, R8 521 PINSRD $3, R8, B0 522 523 // H * 2 524 PSHUFD $0xff, B0, T0 525 MOVOU B0, T1 526 PSRAL $31, T0 527 PAND POLY, T0 528 PSRLL $31, T1 529 PSLLDQ $4, T1 530 PSLLL $1, B0 531 PXOR T0, B0 532 PXOR T1, B0 533 // Karatsuba pre-computations 534 MOVOU B0, (16*14)(dst) 535 PSHUFD $78, B0, B1 536 PXOR B0, B1 537 MOVOU B1, (16*15)(dst) 538 539 MOVOU B0, B2 540 MOVOU B1, B3 541 // Now prepare powers of H and pre-computations for them 542 MOVQ $7, AX 543 544 initLoop: 545 MOVOU B2, T0 546 MOVOU B2, T1 547 MOVOU B3, T2 548 PCLMULQDQ $0x00, B0, T0 549 PCLMULQDQ $0x11, B0, T1 550 PCLMULQDQ $0x00, B1, T2 551 552 PXOR T0, T2 553 PXOR T1, T2 554 MOVOU T2, B4 555 PSLLDQ $8, B4 556 PSRLDQ $8, T2 557 PXOR B4, T0 558 PXOR T2, T1 559 560 MOVOU POLY, B2 561 PCLMULQDQ $0x01, T0, B2 562 PSHUFD $78, T0, T0 563 PXOR B2, T0 564 MOVOU 
POLY, B2 565 PCLMULQDQ $0x01, T0, B2 566 PSHUFD $78, T0, T0 567 PXOR T0, B2 568 PXOR T1, B2 569 570 MOVOU B2, (16*12)(dst) 571 PSHUFD $78, B2, B3 572 PXOR B2, B3 573 MOVOU B3, (16*13)(dst) 574 575 DECQ AX 576 LEAQ (-16*2)(dst), dst 577 JNE initLoop 578 579 RET 580 581 #undef RK 582 #undef dst 583 584 // func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte) 585 TEXT ·gcmSm4Data(SB),NOSPLIT,$0 586 #define pTbl DI 587 #define aut SI 588 #define tPtr CX 589 #define autLen DX 590 591 #define reduceRound(a) MOVOU POLY, T0; PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a 592 #define mulRoundAAD(X ,i) \ 593 MOVOU (16*(i*2))(pTbl), T1;\ 594 MOVOU T1, T2;\ 595 PCLMULQDQ $0x00, X, T1;\ 596 PXOR T1, ACC0;\ 597 PCLMULQDQ $0x11, X, T2;\ 598 PXOR T2, ACC1;\ 599 PSHUFD $78, X, T1;\ 600 PXOR T1, X;\ 601 MOVOU (16*(i*2+1))(pTbl), T1;\ 602 PCLMULQDQ $0x00, X, T1;\ 603 PXOR T1, ACCM 604 605 MOVQ productTable+0(FP), pTbl 606 MOVQ data_base+8(FP), aut 607 MOVQ data_len+16(FP), autLen 608 MOVQ T+32(FP), tPtr 609 610 //PXOR ACC0, ACC0 611 MOVOU (tPtr), ACC0 612 MOVOU bswapMask<>(SB), BSWAP 613 MOVOU gcmPoly<>(SB), POLY 614 615 TESTQ autLen, autLen 616 JEQ dataBail 617 618 CMPQ autLen, $13 // optimize the TLS case 619 JE dataTLS 620 CMPQ autLen, $128 621 JB startSinglesLoop 622 JMP dataOctaLoop 623 624 dataTLS: 625 MOVOU (16*14)(pTbl), T1 626 MOVOU (16*15)(pTbl), T2 627 PXOR B0, B0 628 MOVQ (aut), B0 629 PINSRD $2, 8(aut), B0 630 PINSRB $12, 12(aut), B0 631 XORQ autLen, autLen 632 JMP dataMul 633 634 dataOctaLoop: 635 CMPQ autLen, $128 636 JB startSinglesLoop 637 SUBQ $128, autLen 638 639 MOVOU (16*0)(aut), X0 640 MOVOU (16*1)(aut), X1 641 MOVOU (16*2)(aut), X2 642 MOVOU (16*3)(aut), X3 643 MOVOU (16*4)(aut), X4 644 MOVOU (16*5)(aut), X5 645 MOVOU (16*6)(aut), X6 646 MOVOU (16*7)(aut), X7 647 LEAQ (16*8)(aut), aut 648 PSHUFB BSWAP, X0 649 PSHUFB BSWAP, X1 650 PSHUFB BSWAP, X2 651 PSHUFB BSWAP, X3 652 PSHUFB BSWAP, X4 653 PSHUFB BSWAP, X5 654 PSHUFB BSWAP, X6 655 PSHUFB BSWAP, X7 656 PXOR ACC0, X0 657 658 MOVOU (16*0)(pTbl), ACC0 659 MOVOU (16*1)(pTbl), ACCM 660 MOVOU ACC0, ACC1 661 PSHUFD $78, X0, T1 662 PXOR X0, T1 663 PCLMULQDQ $0x00, X0, ACC0 664 PCLMULQDQ $0x11, X0, ACC1 665 PCLMULQDQ $0x00, T1, ACCM 666 667 mulRoundAAD(X1, 1) 668 mulRoundAAD(X2, 2) 669 mulRoundAAD(X3, 3) 670 mulRoundAAD(X4, 4) 671 mulRoundAAD(X5, 5) 672 mulRoundAAD(X6, 6) 673 mulRoundAAD(X7, 7) 674 675 PXOR ACC0, ACCM 676 PXOR ACC1, ACCM 677 MOVOU ACCM, T0 678 PSRLDQ $8, ACCM 679 PSLLDQ $8, T0 680 PXOR ACCM, ACC1 681 PXOR T0, ACC0 682 reduceRound(ACC0) 683 reduceRound(ACC0) 684 PXOR ACC1, ACC0 685 JMP dataOctaLoop 686 687 startSinglesLoop: 688 MOVOU (16*14)(pTbl), T1 689 MOVOU (16*15)(pTbl), T2 690 691 dataSinglesLoop: 692 693 CMPQ autLen, $16 694 JB dataEnd 695 SUBQ $16, autLen 696 697 MOVOU (aut), B0 698 dataMul: 699 PSHUFB BSWAP, B0 700 PXOR ACC0, B0 701 702 MOVOU T1, ACC0 703 MOVOU T2, ACCM 704 MOVOU T1, ACC1 705 706 PSHUFD $78, B0, T0 707 PXOR B0, T0 708 PCLMULQDQ $0x00, B0, ACC0 709 PCLMULQDQ $0x11, B0, ACC1 710 PCLMULQDQ $0x00, T0, ACCM 711 712 PXOR ACC0, ACCM 713 PXOR ACC1, ACCM 714 MOVOU ACCM, T0 715 PSRLDQ $8, ACCM 716 PSLLDQ $8, T0 717 PXOR ACCM, ACC1 718 PXOR T0, ACC0 719 720 MOVOU POLY, T0 721 PCLMULQDQ $0x01, ACC0, T0 722 PSHUFD $78, ACC0, ACC0 723 PXOR T0, ACC0 724 725 MOVOU POLY, T0 726 PCLMULQDQ $0x01, ACC0, T0 727 PSHUFD $78, ACC0, ACC0 728 PXOR T0, ACC0 729 PXOR ACC1, ACC0 730 731 LEAQ 16(aut), aut 732 733 JMP dataSinglesLoop 734 735 dataEnd: 736 737 TESTQ autLen, autLen 738 JEQ dataBail 739 740 PXOR B0, 
B0 741 LEAQ -1(aut)(autLen*1), aut 742 743 dataLoadLoop: 744 745 PSLLDQ $1, B0 746 PINSRB $0, (aut), B0 747 748 LEAQ -1(aut), aut 749 DECQ autLen 750 JNE dataLoadLoop 751 752 JMP dataMul 753 754 dataBail: 755 MOVOU ACC0, (tPtr) 756 RET 757 758 #undef pTbl 759 #undef aut 760 #undef tPtr 761 #undef autLen 762 763 764 // func gcmSm4Enc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) 765 TEXT ·gcmSm4Enc(SB),0,$256-96 766 #define pTbl DI 767 #define ctx DX 768 #define ctrPtr CX 769 #define ptx SI 770 #define rk AX 771 #define tPtr R8 772 #define ptxLen R9 773 #define aluCTR R10 774 #define aluTMP R11 775 776 #define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP) 777 778 #define mulRound(i) \ 779 MOVOU (16*i)(SP), T0;\ 780 MOVOU (16*(i*2))(pTbl), T1;\ 781 MOVOU T1, T2;\ 782 PCLMULQDQ $0x00, T0, T1;\ 783 PXOR T1, ACC0;\ 784 PCLMULQDQ $0x11, T0, T2;\ 785 PXOR T2, ACC1;\ 786 PSHUFD $78, T0, T1;\ 787 PXOR T1, T0;\ 788 MOVOU (16*(i*2+1))(pTbl), T1;\ 789 PCLMULQDQ $0x00, T0, T1;\ 790 PXOR T1, ACCM 791 792 #define gcmEncDataStep(B) \ 793 PSHUFB BSWAP, B; \ 794 PXOR ACC0, B; \ 795 MOVOU T2, ACC0; \ 796 MOVOU T2, ACC1; \ 797 MOVOU (16*15)(pTbl), ACCM; \ 798 PSHUFD $78, B, T0; \ 799 PXOR B, T0; \ 800 PCLMULQDQ $0x00, B, ACC0; \ 801 PCLMULQDQ $0x11, B, ACC1; \ 802 PCLMULQDQ $0x00, T0, ACCM; \ 803 PXOR ACC0, ACCM; \ 804 PXOR ACC1, ACCM; \ 805 MOVOU ACCM, T0; \ 806 PSRLDQ $8, ACCM; \ 807 PSLLDQ $8, T0; \ 808 PXOR ACCM, ACC1; \ 809 PXOR T0, ACC0; \ 810 reduceRound(ACC0); \ 811 reduceRound(ACC0); \ 812 PXOR ACC1, ACC0 813 814 MOVQ productTable+0(FP), pTbl 815 MOVQ dst+8(FP), ctx 816 MOVQ src_base+32(FP), ptx 817 MOVQ src_len+40(FP), ptxLen 818 MOVQ ctr+56(FP), ctrPtr 819 MOVQ T+64(FP), tPtr 820 MOVQ rk_base+72(FP), rk 821 822 CMPB ·useAVX2(SB), $1 823 JE avx2GcmSm4Enc 824 825 MOVOU bswapMask<>(SB), BSWAP 826 MOVOU gcmPoly<>(SB), POLY 827 828 MOVOU (tPtr), ACC0 829 PXOR ACC1, ACC1 830 PXOR ACCM, ACCM 831 MOVOU (ctrPtr), T0 832 MOVL (3*4)(ctrPtr), aluCTR 833 834 BSWAPL aluCTR 835 MOVOU T0, (8*16 + 0*16)(SP) 836 increment(0) 837 MOVOU T0, (8*16 + 1*16)(SP) 838 increment(1) 839 MOVOU T0, (8*16 + 2*16)(SP) 840 increment(2) 841 MOVOU T0, (8*16 + 3*16)(SP) 842 increment(3) 843 844 CMPQ ptxLen, $128 845 JB gcmSm4EncNibbles 846 SUBQ $128, ptxLen 847 848 // We have at least 8 blocks to encrypt, prepare the rest of the counters 849 MOVOU T0, (8*16 + 4*16)(SP) 850 increment(4) 851 MOVOU T0, (8*16 + 5*16)(SP) 852 increment(5) 853 MOVOU T0, (8*16 + 6*16)(SP) 854 increment(6) 855 MOVOU T0, (8*16 + 7*16)(SP) 856 increment(7) 857 858 // load 8 ctrs for encryption 859 MOVOU (8*16 + 0*16)(SP), B0 860 MOVOU (8*16 + 1*16)(SP), B1 861 MOVOU (8*16 + 2*16)(SP), B2 862 MOVOU (8*16 + 3*16)(SP), B3 863 MOVOU (8*16 + 4*16)(SP), B4 864 MOVOU (8*16 + 5*16)(SP), B5 865 MOVOU (8*16 + 6*16)(SP), B6 866 MOVOU (8*16 + 7*16)(SP), B7 867 868 SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3) 869 increment(0) 870 increment(1) 871 increment(2) 872 increment(3) 873 SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7) 874 increment(4) 875 increment(5) 876 increment(6) 877 increment(7) 878 879 // XOR plaintext 880 MOVOU (16*0)(ptx), T0 881 PXOR T0, B0 882 MOVOU (16*1)(ptx), T0 883 PXOR T0, B1 884 MOVOU (16*2)(ptx), T0 885 PXOR T0, B2 886 MOVOU (16*3)(ptx), T0 887 PXOR T0, B3 888 MOVOU (16*4)(ptx), T0 889 PXOR T0, B4 890 MOVOU (16*5)(ptx), T0 891 PXOR T0, B5 892 MOVOU (16*6)(ptx), T0 893 PXOR T0, B6 894 MOVOU (16*7)(ptx), T0 895 PXOR T0, B7 896 897 // Store ciphertext 898 MOVOU 
B0, (16*0)(ctx) 899 PSHUFB BSWAP, B0 900 PXOR ACC0, B0 901 MOVOU B1, (16*1)(ctx) 902 PSHUFB BSWAP, B1 903 MOVOU B2, (16*2)(ctx) 904 PSHUFB BSWAP, B2 905 MOVOU B3, (16*3)(ctx) 906 PSHUFB BSWAP, B3 907 MOVOU B4, (16*4)(ctx) 908 PSHUFB BSWAP, B4 909 MOVOU B5, (16*5)(ctx) 910 PSHUFB BSWAP, B5 911 MOVOU B6, (16*6)(ctx) 912 PSHUFB BSWAP, B6 913 MOVOU B7, (16*7)(ctx) 914 PSHUFB BSWAP, B7 915 916 MOVOU B0, (16*0)(SP) 917 MOVOU B1, (16*1)(SP) 918 MOVOU B2, (16*2)(SP) 919 MOVOU B3, (16*3)(SP) 920 MOVOU B4, (16*4)(SP) 921 MOVOU B5, (16*5)(SP) 922 MOVOU B6, (16*6)(SP) 923 MOVOU B7, (16*7)(SP) 924 925 LEAQ 128(ptx), ptx 926 LEAQ 128(ctx), ctx 927 928 gcmSm4EncOctetsLoop: 929 CMPQ ptxLen, $128 930 JB gcmSm4EncOctetsEnd 931 SUBQ $128, ptxLen 932 933 MOVOU (8*16 + 0*16)(SP), B0 934 MOVOU (8*16 + 1*16)(SP), B1 935 MOVOU (8*16 + 2*16)(SP), B2 936 MOVOU (8*16 + 3*16)(SP), B3 937 MOVOU (8*16 + 4*16)(SP), B4 938 MOVOU (8*16 + 5*16)(SP), B5 939 MOVOU (8*16 + 6*16)(SP), B6 940 MOVOU (8*16 + 7*16)(SP), B7 941 942 MOVOU (16*0)(SP), T0 943 PSHUFD $78, T0, T1 944 PXOR T0, T1 945 946 MOVOU (16*0)(pTbl), ACC0 947 MOVOU (16*1)(pTbl), ACCM 948 MOVOU ACC0, ACC1 949 950 PCLMULQDQ $0x00, T1, ACCM 951 PCLMULQDQ $0x00, T0, ACC0 952 PCLMULQDQ $0x11, T0, ACC1 953 954 SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3) 955 mulRound(1) 956 increment(0) 957 mulRound(2) 958 increment(1) 959 mulRound(3) 960 increment(2) 961 mulRound(4) 962 increment(3) 963 SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7) 964 mulRound(5) 965 increment(4) 966 mulRound(6) 967 increment(5) 968 mulRound(7) 969 increment(6) 970 increment(7) 971 PXOR ACC0, ACCM 972 PXOR ACC1, ACCM 973 MOVOU ACCM, T0 974 PSRLDQ $8, ACCM 975 PSLLDQ $8, T0 976 PXOR ACCM, ACC1 977 PXOR T0, ACC0 978 979 reduceRound(ACC0) 980 reduceRound(ACC0) 981 PXOR ACC1, ACC0 982 983 MOVOU (16*0)(ptx), T0 984 PXOR T0, B0 985 MOVOU (16*1)(ptx), T0 986 PXOR T0, B1 987 MOVOU (16*2)(ptx), T0 988 PXOR T0, B2 989 MOVOU (16*3)(ptx), T0 990 PXOR T0, B3 991 MOVOU (16*4)(ptx), T0 992 PXOR T0, B4 993 MOVOU (16*5)(ptx), T0 994 PXOR T0, B5 995 MOVOU (16*6)(ptx), T0 996 PXOR T0, B6 997 MOVOU (16*7)(ptx), T0 998 PXOR T0, B7 999 1000 MOVOU B0, (16*0)(ctx) 1001 PSHUFB BSWAP, B0 1002 PXOR ACC0, B0 1003 MOVOU B1, (16*1)(ctx) 1004 PSHUFB BSWAP, B1 1005 MOVOU B2, (16*2)(ctx) 1006 PSHUFB BSWAP, B2 1007 MOVOU B3, (16*3)(ctx) 1008 PSHUFB BSWAP, B3 1009 MOVOU B4, (16*4)(ctx) 1010 PSHUFB BSWAP, B4 1011 MOVOU B5, (16*5)(ctx) 1012 PSHUFB BSWAP, B5 1013 MOVOU B6, (16*6)(ctx) 1014 PSHUFB BSWAP, B6 1015 MOVOU B7, (16*7)(ctx) 1016 PSHUFB BSWAP, B7 1017 1018 MOVOU B0, (16*0)(SP) 1019 MOVOU B1, (16*1)(SP) 1020 MOVOU B2, (16*2)(SP) 1021 MOVOU B3, (16*3)(SP) 1022 MOVOU B4, (16*4)(SP) 1023 MOVOU B5, (16*5)(SP) 1024 MOVOU B6, (16*6)(SP) 1025 MOVOU B7, (16*7)(SP) 1026 1027 LEAQ 128(ptx), ptx 1028 LEAQ 128(ctx), ctx 1029 1030 JMP gcmSm4EncOctetsLoop 1031 1032 gcmSm4EncOctetsEnd: 1033 MOVOU (16*0)(SP), T0 1034 MOVOU (16*0)(pTbl), ACC0 1035 MOVOU (16*1)(pTbl), ACCM 1036 MOVOU ACC0, ACC1 1037 PSHUFD $78, T0, T1 1038 PXOR T0, T1 1039 PCLMULQDQ $0x00, T0, ACC0 1040 PCLMULQDQ $0x11, T0, ACC1 1041 PCLMULQDQ $0x00, T1, ACCM 1042 1043 mulRound(1) 1044 mulRound(2) 1045 mulRound(3) 1046 mulRound(4) 1047 mulRound(5) 1048 mulRound(6) 1049 mulRound(7) 1050 1051 PXOR ACC0, ACCM 1052 PXOR ACC1, ACCM 1053 MOVOU ACCM, T0 1054 PSRLDQ $8, ACCM 1055 PSLLDQ $8, T0 1056 PXOR ACCM, ACC1 1057 PXOR T0, ACC0 1058 1059 reduceRound(ACC0) 1060 reduceRound(ACC0) 1061 PXOR ACC1, ACC0 1062 1063 TESTQ ptxLen, ptxLen 1064 JE gcmSm4EncDone 1065 1066 SUBQ $4, aluCTR 1067 
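	// Fewer than 128 bytes of plaintext remain. The code below consumes the
	// pre-generated counter blocks stored at (8*16 + i*16)(SP); only the last
	// 32-bit word of each block is the counter, kept big-endian by the
	// increment macro. A rough Go sketch of increment(i), assuming a
	// hypothetical ctrBlocks [][16]byte view of that stack region:
	//
	//	aluCTR++
	//	binary.BigEndian.PutUint32(ctrBlocks[i][12:], aluCTR)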
1068 gcmSm4EncNibbles: 1069 CMPQ ptxLen, $64 1070 JBE gcmSm4EncSingles 1071 SUBQ $64, ptxLen 1072 1073 MOVOU (8*16 + 0*16)(SP), B0 1074 MOVOU (8*16 + 1*16)(SP), B1 1075 MOVOU (8*16 + 2*16)(SP), B2 1076 MOVOU (8*16 + 3*16)(SP), B3 1077 1078 SM4_4BLOCKS(AX, BX, T0, T1, T2, B0, B1, B2, B3) 1079 MOVOU (16*0)(ptx), T0 1080 PXOR T0, B0 1081 MOVOU (16*1)(ptx), T0 1082 PXOR T0, B1 1083 MOVOU (16*2)(ptx), T0 1084 PXOR T0, B2 1085 MOVOU (16*3)(ptx), T0 1086 PXOR T0, B3 1087 1088 MOVOU B0, (16*0)(ctx) 1089 MOVOU B1, (16*1)(ctx) 1090 MOVOU B2, (16*2)(ctx) 1091 MOVOU B3, (16*3)(ctx) 1092 1093 MOVOU (16*14)(pTbl), T2 1094 gcmEncDataStep(B0) 1095 gcmEncDataStep(B1) 1096 gcmEncDataStep(B2) 1097 gcmEncDataStep(B3) 1098 increment(0) 1099 increment(1) 1100 increment(2) 1101 increment(3) 1102 1103 LEAQ 64(ptx), ptx 1104 LEAQ 64(ctx), ctx 1105 1106 gcmSm4EncSingles: 1107 TESTQ ptxLen, ptxLen 1108 JE gcmSm4EncDone 1109 MOVOU (8*16 + 0*16)(SP), B0 1110 MOVOU (8*16 + 1*16)(SP), B1 1111 MOVOU (8*16 + 2*16)(SP), B2 1112 MOVOU (8*16 + 3*16)(SP), B3 1113 1114 SM4_4BLOCKS(AX, BX, T0, T1, T2, B0, B1, B2, B3) 1115 MOVOU B0, (16*0)(SP) 1116 MOVOU B1, (16*1)(SP) 1117 MOVOU B2, (16*2)(SP) 1118 MOVOU B3, (16*3)(SP) 1119 1120 MOVOU (16*14)(pTbl), T2 1121 MOVQ SP, BP 1122 1123 gcmSm4EncSinglesLoop: 1124 CMPQ ptxLen, $16 1125 JB gcmSm4EncTail 1126 SUBQ $16, ptxLen 1127 MOVOU (16*0)(BP), B0 1128 MOVOU (ptx), T0 1129 PXOR T0, B0 1130 MOVOU B0, (ctx) 1131 gcmEncDataStep(B0) 1132 LEAQ (16*1)(ptx), ptx 1133 LEAQ (16*1)(ctx), ctx 1134 ADDQ $16, BP 1135 JMP gcmSm4EncSinglesLoop 1136 1137 gcmSm4EncTail: 1138 TESTQ ptxLen, ptxLen 1139 JE gcmSm4EncDone 1140 MOVOU (16*0)(BP), B0 1141 MOVOU B0, T0 1142 1143 LEAQ -1(ptx)(ptxLen*1), ptx 1144 1145 MOVQ ptxLen, aluTMP 1146 SHLQ $4, aluTMP 1147 1148 LEAQ andMask<>(SB), aluCTR 1149 MOVOU -16(aluCTR)(aluTMP*1), T1 1150 PXOR B0, B0 1151 ptxLoadLoop: 1152 PSLLDQ $1, B0 1153 PINSRB $0, (ptx), B0 1154 LEAQ -1(ptx), ptx 1155 DECQ ptxLen 1156 JNE ptxLoadLoop 1157 1158 PXOR T0, B0 1159 PAND T1, B0 1160 MOVOU B0, (ctx) // I assume there is always space, due to TAG in the end of the CT 1161 gcmEncDataStep(B0) 1162 1163 gcmSm4EncDone: 1164 MOVOU ACC0, (tPtr) 1165 RET 1166 1167 avx2GcmSm4Enc: 1168 VMOVDQU bswapMask<>(SB), BSWAP 1169 VMOVDQU gcmPoly<>(SB), POLY 1170 1171 VMOVDQU (tPtr), ACC0 1172 VPXOR ACC1, ACC1, ACC1 1173 VPXOR ACCM, ACCM, ACCM 1174 VMOVDQU (ctrPtr), T0 1175 MOVL (3*4)(ctrPtr), aluCTR 1176 1177 BSWAPL aluCTR 1178 VMOVDQU T0, (8*16 + 0*16)(SP) 1179 increment(0) 1180 VMOVDQU T0, (8*16 + 1*16)(SP) 1181 increment(1) 1182 VMOVDQU T0, (8*16 + 2*16)(SP) 1183 increment(2) 1184 VMOVDQU T0, (8*16 + 3*16)(SP) 1185 increment(3) 1186 1187 CMPQ ptxLen, $128 1188 JB avx2GcmSm4EncNibbles 1189 SUBQ $128, ptxLen 1190 1191 // We have at least 8 blocks to encrypt, prepare the rest of the counters 1192 VMOVDQU T0, (8*16 + 4*16)(SP) 1193 increment(4) 1194 VMOVDQU T0, (8*16 + 5*16)(SP) 1195 increment(5) 1196 VMOVDQU T0, (8*16 + 6*16)(SP) 1197 increment(6) 1198 VMOVDQU T0, (8*16 + 7*16)(SP) 1199 increment(7) 1200 1201 // load 8 ctrs for encryption 1202 VMOVDQU (4*32 + 0*32)(SP), DWB0 1203 VMOVDQU (4*32 + 1*32)(SP), DWB1 1204 VMOVDQU (4*32 + 2*32)(SP), DWB2 1205 VMOVDQU (4*32 + 3*32)(SP), DWB3 1206 1207 VBROADCASTI128 flipMask<>(SB), XDWTMP0 1208 // Apply Byte Flip Mask: LE -> BE 1209 VPSHUFB XDWTMP0, DWB0, DWB0 1210 VPSHUFB XDWTMP0, DWB1, DWB1 1211 VPSHUFB XDWTMP0, DWB2, DWB2 1212 VPSHUFB XDWTMP0, DWB3, DWB3 1213 1214 // Transpose matrix 4 x 4 32bits word 1215 TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, 
XDWTMP1) 1216 XORL BX, BX 1217 VBROADCASTI128 nibbleMask<>(SB), NIBBLE_MASK 1218 1219 avx2GcmSm4Enc8Loop1: 1220 AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3) 1221 AVX2_SM4_ROUND(1, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB1, DWB2, DWB3, DWB0) 1222 AVX2_SM4_ROUND(2, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB2, DWB3, DWB0, DWB1) 1223 AVX2_SM4_ROUND(3, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB3, DWB0, DWB1, DWB2) 1224 1225 ADDL $16, BX 1226 CMPL BX, $4*32 1227 JB avx2GcmSm4Enc8Loop1 1228 1229 // Transpose matrix 4 x 4 32bits word 1230 TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1) 1231 1232 VBROADCASTI128 bswapMask<>(SB), DWBSWAP 1233 VPSHUFB DWBSWAP, DWB0, DWB0 1234 VPSHUFB DWBSWAP, DWB1, DWB1 1235 VPSHUFB DWBSWAP, DWB2, DWB2 1236 VPSHUFB DWBSWAP, DWB3, DWB3 1237 1238 increment(0) 1239 increment(1) 1240 increment(2) 1241 increment(3) 1242 increment(4) 1243 increment(5) 1244 increment(6) 1245 increment(7) 1246 1247 // XOR plaintext 1248 VMOVDQU (32*0)(ptx), XDWTMP0 1249 VPXOR XDWTMP0, DWB0, DWB0 1250 VMOVDQU (32*1)(ptx), XDWTMP0 1251 VPXOR XDWTMP0, DWB1, DWB1 1252 VMOVDQU (32*2)(ptx), XDWTMP0 1253 VPXOR XDWTMP0, DWB2, DWB2 1254 VMOVDQU (32*3)(ptx), XDWTMP0 1255 VPXOR XDWTMP0, DWB3, DWB3 1256 1257 // Store ciphertext 1258 VMOVDQU DWB0, (32*0)(ctx) 1259 VPSHUFB DWBSWAP, DWB0, DWB0 1260 VMOVDQU DWB1, (32*1)(ctx) 1261 VPSHUFB DWBSWAP, DWB1, DWB1 1262 VMOVDQU DWB2, (32*2)(ctx) 1263 VPSHUFB DWBSWAP, DWB2, DWB2 1264 VMOVDQU DWB3, (32*3)(ctx) 1265 VPSHUFB DWBSWAP, DWB3, DWB3 1266 1267 //VPXOR XDWTMP0, XDWTMP0, XDWTMP0 1268 //VINSERTI128 $0, ACC0, XDWTMP0, XDWTMP0 1269 //VPXOR XDWTMP0, DWB0, DWB0 1270 PXOR ACC0, B0 // Can't call VPXOR here 1271 VMOVDQU DWB0, (32*0)(SP) 1272 VMOVDQU DWB1, (32*1)(SP) 1273 VMOVDQU DWB2, (32*2)(SP) 1274 VMOVDQU DWB3, (32*3)(SP) 1275 1276 LEAQ 128(ptx), ptx 1277 LEAQ 128(ctx), ctx 1278 1279 avx2GcmSm4EncOctetsLoop: 1280 CMPQ ptxLen, $128 1281 JB avx2GcmSm4EncOctetsEnd 1282 SUBQ $128, ptxLen 1283 1284 // load 8 ctrs for encryption 1285 VMOVDQU (4*32 + 0*32)(SP), DWB0 1286 VMOVDQU (4*32 + 1*32)(SP), DWB1 1287 VMOVDQU (4*32 + 2*32)(SP), DWB2 1288 VMOVDQU (4*32 + 3*32)(SP), DWB3 1289 1290 VBROADCASTI128 flipMask<>(SB), XDWTMP0 1291 // Apply Byte Flip Mask: LE -> BE 1292 VPSHUFB XDWTMP0, DWB0, DWB0 1293 VPSHUFB XDWTMP0, DWB1, DWB1 1294 VPSHUFB XDWTMP0, DWB2, DWB2 1295 VPSHUFB XDWTMP0, DWB3, DWB3 1296 1297 VMOVDQU (16*0)(SP), T0 1298 VPSHUFD $78, T0, T1 1299 VPXOR T0, T1, T1 1300 1301 VMOVDQU (16*0)(pTbl), ACC0 1302 VMOVDQU (16*1)(pTbl), ACCM 1303 VMOVDQU ACC0, ACC1 1304 1305 PCLMULQDQ $0x00, T1, ACCM 1306 PCLMULQDQ $0x00, T0, ACC0 1307 PCLMULQDQ $0x11, T0, ACC1 1308 1309 // Transpose matrix 4 x 4 32bits word 1310 TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1) 1311 XORL BX, BX 1312 VBROADCASTI128 nibbleMask<>(SB), NIBBLE_MASK 1313 1314 avx2GcmSm4Enc8Loop2: 1315 AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3) 1316 AVX2_SM4_ROUND(1, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB1, DWB2, DWB3, DWB0) 1317 AVX2_SM4_ROUND(2, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB2, DWB3, DWB0, DWB1) 1318 AVX2_SM4_ROUND(3, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB3, DWB0, DWB1, DWB2) 1319 1320 ADDL $16, BX 1321 CMPL BX, $4*32 1322 JB avx2GcmSm4Enc8Loop2 1323 1324 // Transpose matrix 4 x 4 32bits word 1325 TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1) 1326 1327 VBROADCASTI128 bswapMask<>(SB), DWBSWAP 1328 VPSHUFB DWBSWAP, DWB0, DWB0 1329 VPSHUFB DWBSWAP, DWB1, DWB1 1330 VPSHUFB 
DWBSWAP, DWB2, DWB2 1331 VPSHUFB DWBSWAP, DWB3, DWB3 1332 1333 mulRound(1) 1334 increment(0) 1335 mulRound(2) 1336 increment(1) 1337 mulRound(3) 1338 increment(2) 1339 mulRound(4) 1340 increment(3) 1341 mulRound(5) 1342 increment(4) 1343 mulRound(6) 1344 increment(5) 1345 mulRound(7) 1346 increment(6) 1347 increment(7) 1348 VPXOR ACC0, ACCM, ACCM 1349 VPXOR ACC1, ACCM, ACCM 1350 VPSLLDQ $8, ACCM, T0 1351 VPSRLDQ $8, ACCM, ACCM 1352 1353 VPXOR ACCM, ACC1, ACC1 1354 VPXOR T0, ACC0, ACC0 1355 1356 reduceRound(ACC0) 1357 reduceRound(ACC0) 1358 VPXOR ACC1, ACC0, ACC0 1359 1360 // XOR plaintext 1361 VMOVDQU (32*0)(ptx), XDWTMP0 1362 VPXOR XDWTMP0, DWB0, DWB0 1363 VMOVDQU (32*1)(ptx), XDWTMP0 1364 VPXOR XDWTMP0, DWB1, DWB1 1365 VMOVDQU (32*2)(ptx), XDWTMP0 1366 VPXOR XDWTMP0, DWB2, DWB2 1367 VMOVDQU (32*3)(ptx), XDWTMP0 1368 VPXOR XDWTMP0, DWB3, DWB3 1369 1370 // Store ciphertext 1371 VMOVDQU DWB0, (32*0)(ctx) 1372 VPSHUFB DWBSWAP, DWB0, DWB0 1373 VMOVDQU DWB1, (32*1)(ctx) 1374 VPSHUFB DWBSWAP, DWB1, DWB1 1375 VMOVDQU DWB2, (32*2)(ctx) 1376 VPSHUFB DWBSWAP, DWB2, DWB2 1377 VMOVDQU DWB3, (32*3)(ctx) 1378 VPSHUFB DWBSWAP, DWB3, DWB3 1379 1380 //VPXOR XDWTMP0, XDWTMP0, XDWTMP0 1381 //VINSERTI128 $0, ACC0, XDWTMP0, XDWTMP0 1382 //VPXOR XDWTMP0, DWB0, DWB0 1383 PXOR ACC0, B0 // Can't call VPXOR here 1384 VMOVDQU DWB0, (32*0)(SP) 1385 VMOVDQU DWB1, (32*1)(SP) 1386 VMOVDQU DWB2, (32*2)(SP) 1387 VMOVDQU DWB3, (32*3)(SP) 1388 1389 LEAQ 128(ptx), ptx 1390 LEAQ 128(ctx), ctx 1391 1392 JMP avx2GcmSm4EncOctetsLoop 1393 1394 avx2GcmSm4EncOctetsEnd: 1395 VMOVDQU (16*0)(SP), T0 1396 VMOVDQU (16*0)(pTbl), ACC0 1397 VMOVDQU (16*1)(pTbl), ACCM 1398 VMOVDQU ACC0, ACC1 1399 VPSHUFD $78, T0, T1 1400 VPXOR T0, T1, T1 1401 PCLMULQDQ $0x00, T0, ACC0 1402 PCLMULQDQ $0x11, T0, ACC1 1403 PCLMULQDQ $0x00, T1, ACCM 1404 1405 mulRound(1) 1406 mulRound(2) 1407 mulRound(3) 1408 mulRound(4) 1409 mulRound(5) 1410 mulRound(6) 1411 mulRound(7) 1412 1413 VPXOR ACC0, ACCM, ACCM 1414 VPXOR ACC1, ACCM, ACCM 1415 VPSLLDQ $8, ACCM, T0 1416 VPSRLDQ $8, ACCM, ACCM 1417 1418 VPXOR ACCM, ACC1, ACC1 1419 VPXOR T0, ACC0, ACC0 1420 1421 reduceRound(ACC0) 1422 reduceRound(ACC0) 1423 VPXOR ACC1, ACC0, ACC0 1424 1425 TESTQ ptxLen, ptxLen 1426 JE avx2GcmSm4EncDone 1427 1428 SUBQ $4, aluCTR 1429 1430 avx2GcmSm4EncNibbles: 1431 VMOVDQU flipMask<>(SB), B7 1432 CMPQ ptxLen, $64 1433 JBE avx2GcmSm4EncSingles 1434 SUBQ $64, ptxLen 1435 1436 VMOVDQU (8*16 + 0*16)(SP), B0 1437 VMOVDQU (8*16 + 1*16)(SP), B1 1438 VMOVDQU (8*16 + 2*16)(SP), B2 1439 VMOVDQU (8*16 + 3*16)(SP), B3 1440 1441 VPSHUFB B7, B0, B0 1442 VPSHUFB B7, B1, B1 1443 VPSHUFB B7, B2, B2 1444 VPSHUFB B7, B3, B3 1445 1446 TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1) 1447 XORL BX, BX 1448 VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK 1449 1450 avx2GcmSm4Enc4Loop2: 1451 AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3) 1452 AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0) 1453 AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1) 1454 AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2) 1455 1456 ADDL $16, BX 1457 CMPL BX, $4*32 1458 JB avx2GcmSm4Enc4Loop2 1459 1460 // Transpose matrix 4 x 4 32bits word 1461 TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5) 1462 VPSHUFB BSWAP, B0, B0 1463 VPSHUFB BSWAP, B1, B1 1464 VPSHUFB BSWAP, B2, B2 1465 VPSHUFB BSWAP, B3, B3 1466 1467 VMOVDQU (16*0)(ptx), T0 1468 VPXOR T0, B0, B0 1469 VMOVDQU (16*1)(ptx), T0 1470 VPXOR T0, B1, B1 1471 VMOVDQU (16*2)(ptx), T0 1472 VPXOR T0, B2, B2 1473 VMOVDQU (16*3)(ptx), T0 1474 VPXOR T0, B3, B3 1475 1476 VMOVDQU B0, (16*0)(ctx) 1477 
VMOVDQU B1, (16*1)(ctx) 1478 VMOVDQU B2, (16*2)(ctx) 1479 VMOVDQU B3, (16*3)(ctx) 1480 1481 VMOVDQU (16*14)(pTbl), T2 1482 gcmEncDataStep(B0) 1483 gcmEncDataStep(B1) 1484 gcmEncDataStep(B2) 1485 gcmEncDataStep(B3) 1486 increment(0) 1487 increment(1) 1488 increment(2) 1489 increment(3) 1490 1491 LEAQ 64(ptx), ptx 1492 LEAQ 64(ctx), ctx 1493 1494 avx2GcmSm4EncSingles: 1495 TESTQ ptxLen, ptxLen 1496 JE avx2GcmSm4EncDone 1497 1498 VMOVDQU (8*16 + 0*16)(SP), B0 1499 VMOVDQU (8*16 + 1*16)(SP), B1 1500 VMOVDQU (8*16 + 2*16)(SP), B2 1501 VMOVDQU (8*16 + 3*16)(SP), B3 1502 1503 VPSHUFB B7, B0, B0 1504 VPSHUFB B7, B1, B1 1505 VPSHUFB B7, B2, B2 1506 VPSHUFB B7, B3, B3 1507 1508 TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1) 1509 XORL BX, BX 1510 VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK 1511 1512 avx2GcmSm4Enc4Loop1: 1513 AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3) 1514 AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0) 1515 AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1) 1516 AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2) 1517 1518 ADDL $16, BX 1519 CMPL BX, $4*32 1520 JB avx2GcmSm4Enc4Loop1 1521 1522 // Transpose matrix 4 x 4 32bits word 1523 TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5) 1524 VPSHUFB BSWAP, B0, B0 1525 VPSHUFB BSWAP, B1, B1 1526 VPSHUFB BSWAP, B2, B2 1527 VPSHUFB BSWAP, B3, B3 1528 1529 VMOVDQU B0, (16*0)(SP) 1530 VMOVDQU B1, (16*1)(SP) 1531 VMOVDQU B2, (16*2)(SP) 1532 VMOVDQU B3, (16*3)(SP) 1533 1534 VMOVDQU (16*14)(pTbl), T2 1535 MOVQ SP, BP 1536 1537 avx2GcmSm4EncSinglesLoop: 1538 CMPQ ptxLen, $16 1539 JB avx2GcmSm4EncTail 1540 SUBQ $16, ptxLen 1541 VMOVDQU (16*0)(BP), B0 1542 VMOVDQU (ptx), T0 1543 VPXOR T0, B0, B0 1544 VMOVDQU B0, (ctx) 1545 gcmEncDataStep(B0) 1546 LEAQ (16*1)(ptx), ptx 1547 LEAQ (16*1)(ctx), ctx 1548 ADDQ $16, BP 1549 JMP avx2GcmSm4EncSinglesLoop 1550 1551 avx2GcmSm4EncTail: 1552 TESTQ ptxLen, ptxLen 1553 JE avx2GcmSm4EncDone 1554 VMOVDQU (16*0)(BP), B0 1555 VMOVDQU B0, T0 1556 1557 LEAQ -1(ptx)(ptxLen*1), ptx 1558 1559 MOVQ ptxLen, aluTMP 1560 SHLQ $4, aluTMP 1561 1562 LEAQ andMask<>(SB), aluCTR 1563 VMOVDQU -16(aluCTR)(aluTMP*1), T1 1564 VPXOR B0, B0, B0 1565 1566 avx2PtxLoadLoop: 1567 PSLLDQ $1, B0 1568 PINSRB $0, (ptx), B0 1569 LEAQ -1(ptx), ptx 1570 DECQ ptxLen 1571 JNE avx2PtxLoadLoop 1572 1573 VPXOR T0, B0, B0 1574 VPAND T1, B0, B0 1575 VMOVDQU B0, (ctx) // I assume there is always space, due to TAG in the end of the CT 1576 gcmEncDataStep(B0) 1577 1578 avx2GcmSm4EncDone: 1579 VMOVDQU ACC0, (tPtr) 1580 VZEROUPPER 1581 RET 1582 1583 #undef increment 1584 1585 // func gcmSm4Dec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) 1586 TEXT ·gcmSm4Dec(SB),0,$128-96 1587 #define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP) 1588 1589 #define decMulRound(i) \ 1590 MOVOU (16*i)(ctx), T0;\ 1591 PSHUFB BSWAP, T0;\ 1592 internalDecMulRound(i) 1593 1594 #define internalDecMulRound(i) \ 1595 MOVOU (16*(i*2))(pTbl), T1;\ 1596 MOVOU T1, T2;\ 1597 PCLMULQDQ $0x00, T0, T1;\ 1598 PXOR T1, ACC0;\ 1599 PSHUFD $78, T0, T1;\ 1600 PCLMULQDQ $0x11, T0, T2;\ 1601 PXOR T1, T0;\ 1602 PXOR T2, ACC1;\ 1603 MOVOU (16*(i*2+1))(pTbl), T2;\ 1604 PCLMULQDQ $0x00, T2, T0;\ 1605 PXOR T0, ACCM 1606 1607 #define decGhashRound(i) \ 1608 MOVOU (16*i)(ctx), B0; \ 1609 internalDecGhashRound() 1610 1611 #define internalDecGhashRound() \ 1612 PSHUFB BSWAP, B0; \ 1613 PXOR ACC0, B0; \ 1614 MOVOU T2, ACC0; \ 1615 MOVOU T2, ACC1; \ 1616 MOVOU (16*15)(pTbl), ACCM; \ 1617 PCLMULQDQ $0x00, B0, ACC0; \ 1618 PCLMULQDQ 
$0x11, B0, ACC1; \ 1619 PSHUFD $78, B0, T0; \ 1620 PXOR B0, T0; \ 1621 PCLMULQDQ $0x00, T0, ACCM; \ 1622 PXOR ACC0, ACCM; \ 1623 PXOR ACC1, ACCM; \ 1624 MOVOU ACCM, T0; \ 1625 PSRLDQ $8, ACCM; \ 1626 PSLLDQ $8, T0; \ 1627 PXOR ACCM, ACC1; \ 1628 PXOR T0, ACC0; \ 1629 reduceRound(ACC0); \ 1630 reduceRound(ACC0); \ 1631 PXOR ACC1, ACC0 1632 1633 MOVQ productTable+0(FP), pTbl 1634 MOVQ dst+8(FP), ptx 1635 MOVQ src_base+32(FP), ctx 1636 MOVQ src_len+40(FP), ptxLen 1637 MOVQ ctr+56(FP), ctrPtr 1638 MOVQ T+64(FP), tPtr 1639 MOVQ rk_base+72(FP), rk 1640 1641 CMPB ·useAVX2(SB), $1 1642 JE avx2GcmSm4Dec 1643 1644 MOVOU bswapMask<>(SB), BSWAP 1645 MOVOU gcmPoly<>(SB), POLY 1646 1647 MOVOU (tPtr), ACC0 1648 PXOR ACC1, ACC1 1649 PXOR ACCM, ACCM 1650 MOVOU (ctrPtr), T0 1651 MOVL (3*4)(ctrPtr), aluCTR 1652 BSWAPL aluCTR 1653 1654 MOVOU T0, (0*16)(SP) 1655 increment(0) 1656 MOVOU T0, (1*16)(SP) 1657 increment(1) 1658 MOVOU T0, (2*16)(SP) 1659 increment(2) 1660 MOVOU T0, (3*16)(SP) 1661 increment(3) 1662 1663 CMPQ ptxLen, $128 1664 JB gcmSm4DecNibbles 1665 1666 // We have at least 8 blocks to dencrypt, prepare the rest of the counters 1667 MOVOU T0, (4*16)(SP) 1668 increment(4) 1669 MOVOU T0, (5*16)(SP) 1670 increment(5) 1671 MOVOU T0, (6*16)(SP) 1672 increment(6) 1673 MOVOU T0, (7*16)(SP) 1674 increment(7) 1675 1676 gcmSm4DecOctetsLoop: 1677 CMPQ ptxLen, $128 1678 JB gcmSm4DecEndOctets 1679 SUBQ $128, ptxLen 1680 1681 MOVOU (0*16)(SP), B0 1682 MOVOU (1*16)(SP), B1 1683 MOVOU (2*16)(SP), B2 1684 MOVOU (3*16)(SP), B3 1685 MOVOU (4*16)(SP), B4 1686 MOVOU (5*16)(SP), B5 1687 MOVOU (6*16)(SP), B6 1688 MOVOU (7*16)(SP), B7 1689 1690 MOVOU (16*0)(ctx), T0 1691 PSHUFB BSWAP, T0 1692 PXOR ACC0, T0 1693 PSHUFD $78, T0, T1 1694 PXOR T0, T1 1695 1696 MOVOU (16*0)(pTbl), ACC0 1697 MOVOU (16*1)(pTbl), ACCM 1698 MOVOU ACC0, ACC1 1699 1700 PCLMULQDQ $0x00, T1, ACCM 1701 PCLMULQDQ $0x00, T0, ACC0 1702 PCLMULQDQ $0x11, T0, ACC1 1703 1704 SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3) 1705 decMulRound(1) 1706 increment(0) 1707 decMulRound(2) 1708 increment(1) 1709 decMulRound(3) 1710 increment(2) 1711 decMulRound(4) 1712 increment(3) 1713 SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7) 1714 decMulRound(5) 1715 increment(4) 1716 decMulRound(6) 1717 increment(5) 1718 decMulRound(7) 1719 increment(6) 1720 increment(7) 1721 1722 PXOR ACC0, ACCM 1723 PXOR ACC1, ACCM 1724 MOVOU ACCM, T0 1725 PSRLDQ $8, ACCM 1726 PSLLDQ $8, T0 1727 PXOR ACCM, ACC1 1728 PXOR T0, ACC0 1729 1730 reduceRound(ACC0) 1731 reduceRound(ACC0) 1732 PXOR ACC1, ACC0 1733 1734 MOVOU (16*0)(ctx), T0 1735 PXOR T0, B0 1736 MOVOU (16*1)(ctx), T0 1737 PXOR T0, B1 1738 MOVOU (16*2)(ctx), T0 1739 PXOR T0, B2 1740 MOVOU (16*3)(ctx), T0 1741 PXOR T0, B3 1742 MOVOU (16*4)(ctx), T0 1743 PXOR T0, B4 1744 MOVOU (16*5)(ctx), T0 1745 PXOR T0, B5 1746 MOVOU (16*6)(ctx), T0 1747 PXOR T0, B6 1748 MOVOU (16*7)(ctx), T0 1749 PXOR T0, B7 1750 1751 MOVOU B0, (16*0)(ptx) 1752 MOVOU B1, (16*1)(ptx) 1753 MOVOU B2, (16*2)(ptx) 1754 MOVOU B3, (16*3)(ptx) 1755 MOVOU B4, (16*4)(ptx) 1756 MOVOU B5, (16*5)(ptx) 1757 MOVOU B6, (16*6)(ptx) 1758 MOVOU B7, (16*7)(ptx) 1759 1760 LEAQ 128(ptx), ptx 1761 LEAQ 128(ctx), ctx 1762 1763 JMP gcmSm4DecOctetsLoop 1764 1765 gcmSm4DecEndOctets: 1766 SUBQ $4, aluCTR 1767 1768 gcmSm4DecNibbles: 1769 CMPQ ptxLen, $64 1770 JBE gcmSm4DecSingles 1771 SUBQ $64, ptxLen 1772 1773 MOVOU (0*16)(SP), B4 1774 MOVOU (1*16)(SP), B5 1775 MOVOU (2*16)(SP), B6 1776 MOVOU (3*16)(SP), B7 1777 1778 SM4_4BLOCKS(rk, BX, T0, T1, T2, B4, B5, B6, B7) 1779 MOVOU (16*14)(pTbl), T2 
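	// B4..B7 now hold the SM4-CTR keystream and T2 holds the hash key H loaded
	// from (16*14)(pTbl). The steps below XOR the ciphertext into the keystream
	// to recover the plaintext and fold each ciphertext block into the GHASH
	// accumulator via decGhashRound. Per block, in rough Go terms (ghashMul is
	// a stand-in for the GF(2^128) multiplication by H performed here):
	//
	//	plaintext[i] = ciphertext[i] ^ keystream[i]
	//	acc = ghashMul(acc ^ byteReverse(ciphertext[i]), H)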
1780 MOVOU (16*0)(ctx), T0 1781 PXOR T0, B4 1782 MOVOU (16*1)(ctx), T0 1783 PXOR T0, B5 1784 MOVOU (16*2)(ctx), T0 1785 PXOR T0, B6 1786 MOVOU (16*3)(ctx), T0 1787 PXOR T0, B7 1788 1789 decGhashRound(0) 1790 increment(0) 1791 decGhashRound(1) 1792 increment(1) 1793 decGhashRound(2) 1794 increment(2) 1795 decGhashRound(3) 1796 increment(3) 1797 1798 MOVOU B4, (16*0)(ptx) 1799 MOVOU B5, (16*1)(ptx) 1800 MOVOU B6, (16*2)(ptx) 1801 MOVOU B7, (16*3)(ptx) 1802 1803 LEAQ 64(ptx), ptx 1804 LEAQ 64(ctx), ctx 1805 1806 gcmSm4DecSingles: 1807 TESTQ ptxLen, ptxLen 1808 JE gcmSm4DecDone 1809 MOVOU (0*16)(SP), B0 1810 MOVOU (1*16)(SP), B1 1811 MOVOU (2*16)(SP), B2 1812 MOVOU (3*16)(SP), B3 1813 1814 SM4_4BLOCKS(rk, BX, T0, T1, T2, B0, B1, B2, B3) 1815 MOVOU B0, (16*4)(SP) 1816 MOVOU B1, (16*5)(SP) 1817 MOVOU B2, (16*6)(SP) 1818 MOVOU B3, (16*7)(SP) 1819 1820 MOVOU (16*14)(pTbl), T2 1821 MOVQ SP, BP 1822 ADDQ $64, BP 1823 1824 gcmSm4DecSinglesLoop: 1825 CMPQ ptxLen, $16 1826 JB gcmSm4DecTail 1827 SUBQ $16, ptxLen 1828 1829 MOVOU (16*0)(BP), B1 1830 MOVOU (ctx), T0 1831 PXOR T0, B1 1832 1833 decGhashRound(0) 1834 MOVOU B1, (ptx) 1835 1836 LEAQ (16*1)(ptx), ptx 1837 LEAQ (16*1)(ctx), ctx 1838 ADDQ $16, BP 1839 JMP gcmSm4DecSinglesLoop 1840 1841 gcmSm4DecTail: 1842 TESTQ ptxLen, ptxLen 1843 JE gcmSm4DecDone 1844 1845 MOVQ ptxLen, aluTMP 1846 SHLQ $4, aluTMP 1847 LEAQ andMask<>(SB), aluCTR 1848 MOVOU -16(aluCTR)(aluTMP*1), T1 1849 1850 MOVOU (ctx), B0 // I assume there is TAG attached to the ctx, and there is no read overflow 1851 PAND T1, B0 1852 1853 MOVOU B0, T1 1854 PSHUFB BSWAP, B0 1855 PXOR ACC0, B0 1856 1857 MOVOU (16*14)(pTbl), ACC0 1858 MOVOU (16*15)(pTbl), ACCM 1859 MOVOU ACC0, ACC1 1860 1861 PCLMULQDQ $0x00, B0, ACC0 1862 PCLMULQDQ $0x11, B0, ACC1 1863 PSHUFD $78, B0, T0 1864 PXOR B0, T0 1865 PCLMULQDQ $0x00, T0, ACCM 1866 1867 PXOR ACC0, ACCM 1868 PXOR ACC1, ACCM 1869 MOVOU ACCM, T0 1870 PSRLDQ $8, ACCM 1871 PSLLDQ $8, T0 1872 PXOR ACCM, ACC1 1873 PXOR T0, ACC0 1874 1875 reduceRound(ACC0) 1876 reduceRound(ACC0) 1877 PXOR ACC1, ACC0 1878 1879 MOVOU (16*0)(BP), B0 1880 PXOR T1, B0 1881 1882 ptxStoreLoop: 1883 PEXTRB $0, B0, (ptx) 1884 PSRLDQ $1, B0 1885 LEAQ 1(ptx), ptx 1886 DECQ ptxLen 1887 1888 JNE ptxStoreLoop 1889 1890 gcmSm4DecDone: 1891 MOVOU ACC0, (tPtr) 1892 RET 1893 1894 avx2GcmSm4Dec: 1895 VMOVDQU bswapMask<>(SB), BSWAP 1896 VMOVDQU gcmPoly<>(SB), POLY 1897 1898 VMOVDQU (tPtr), ACC0 1899 VPXOR ACC1, ACC1, ACC1 1900 VPXOR ACCM, ACCM, ACCM 1901 VMOVDQU (ctrPtr), T0 1902 MOVL (3*4)(ctrPtr), aluCTR 1903 BSWAPL aluCTR 1904 1905 VMOVDQU T0, (0*16)(SP) 1906 increment(0) 1907 VMOVDQU T0, (1*16)(SP) 1908 increment(1) 1909 VMOVDQU T0, (2*16)(SP) 1910 increment(2) 1911 VMOVDQU T0, (3*16)(SP) 1912 increment(3) 1913 1914 CMPQ ptxLen, $128 1915 JB avx2GcmSm4DecNibbles 1916 1917 // We have at least 8 blocks to dencrypt, prepare the rest of the counters 1918 VMOVDQU T0, (4*16)(SP) 1919 increment(4) 1920 VMOVDQU T0, (5*16)(SP) 1921 increment(5) 1922 VMOVDQU T0, (6*16)(SP) 1923 increment(6) 1924 VMOVDQU T0, (7*16)(SP) 1925 increment(7) 1926 1927 avx2GcmSm4DecOctetsLoop: 1928 CMPQ ptxLen, $128 1929 JB avx2GcmSm4DecEndOctets 1930 SUBQ $128, ptxLen 1931 1932 // load 8 ctrs for encryption 1933 VMOVDQU (0*32)(SP), DWB0 1934 VMOVDQU (1*32)(SP), DWB1 1935 VMOVDQU (2*32)(SP), DWB2 1936 VMOVDQU (3*32)(SP), DWB3 1937 1938 VBROADCASTI128 flipMask<>(SB), XDWTMP0 1939 // Apply Byte Flip Mask: LE -> BE 1940 VPSHUFB XDWTMP0, DWB0, DWB0 1941 VPSHUFB XDWTMP0, DWB1, DWB1 1942 VPSHUFB XDWTMP0, DWB2, DWB2 1943 VPSHUFB 
XDWTMP0, DWB3, DWB3 1944 1945 VMOVDQU (16*0)(ctx), T0 1946 VPSHUFB BSWAP, T0, T0 1947 VPXOR ACC0, T0, T0 1948 VPSHUFD $78, T0, T1 1949 VPXOR T0, T1, T1 1950 1951 VMOVDQU (16*0)(pTbl), ACC0 1952 VMOVDQU (16*1)(pTbl), ACCM 1953 VMOVDQU ACC0, ACC1 1954 1955 PCLMULQDQ $0x00, T1, ACCM 1956 PCLMULQDQ $0x00, T0, ACC0 1957 PCLMULQDQ $0x11, T0, ACC1 1958 1959 1960 // Transpose matrix 4 x 4 32bits word 1961 TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1) 1962 XORL BX, BX 1963 VBROADCASTI128 nibbleMask<>(SB), NIBBLE_MASK 1964 1965 avx2GcmSm4Dec8Loop2: 1966 AVX2_SM4_ROUND(0, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB0, DWB1, DWB2, DWB3) 1967 AVX2_SM4_ROUND(1, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB1, DWB2, DWB3, DWB0) 1968 AVX2_SM4_ROUND(2, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB2, DWB3, DWB0, DWB1) 1969 AVX2_SM4_ROUND(3, rk, BX, XDWORD, YDWORD, X1, X3, XDWTMP1, DWB3, DWB0, DWB1, DWB2) 1970 1971 ADDL $16, BX 1972 CMPL BX, $4*32 1973 JB avx2GcmSm4Dec8Loop2 1974 1975 // Transpose matrix 4 x 4 32bits word 1976 TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWTMP0, XDWTMP1) 1977 1978 VBROADCASTI128 bswapMask<>(SB), DWBSWAP 1979 VPSHUFB DWBSWAP, DWB0, DWB0 1980 VPSHUFB DWBSWAP, DWB1, DWB1 1981 VPSHUFB DWBSWAP, DWB2, DWB2 1982 VPSHUFB DWBSWAP, DWB3, DWB3 1983 1984 VMOVDQU (32*0)(ctx), XDWTMP0 1985 VPXOR XDWTMP0, DWB0, DWB0 1986 VPSHUFB DWBSWAP, XDWTMP0, XDWTMP0 1987 VEXTRACTI128 $1, XDWTMP0, T0 1988 internalDecMulRound(1) 1989 increment(0) 1990 1991 VMOVDQU (32*1)(ctx), XDWTMP0 1992 VPXOR XDWTMP0, DWB1, DWB1 1993 VPSHUFB DWBSWAP, XDWTMP0, XDWTMP0 1994 VEXTRACTI128 $0, XDWTMP0, T0 1995 internalDecMulRound(2) 1996 increment(1) 1997 VEXTRACTI128 $1, XDWTMP0, T0 1998 internalDecMulRound(3) 1999 increment(2) 2000 2001 VMOVDQU (32*2)(ctx), XDWTMP0 2002 VPXOR XDWTMP0, DWB2, DWB2 2003 VPSHUFB DWBSWAP, XDWTMP0, XDWTMP0 2004 VEXTRACTI128 $0, XDWTMP0, T0 2005 internalDecMulRound(4) 2006 increment(3) 2007 VEXTRACTI128 $1, XDWTMP0, T0 2008 internalDecMulRound(5) 2009 increment(4) 2010 2011 VMOVDQU (32*3)(ctx), XDWTMP0 2012 VPXOR XDWTMP0, DWB3, DWB3 2013 VPSHUFB DWBSWAP, XDWTMP0, XDWTMP0 2014 VEXTRACTI128 $0, XDWTMP0, T0 2015 internalDecMulRound(6) 2016 increment(5) 2017 VEXTRACTI128 $1, XDWTMP0, T0 2018 internalDecMulRound(7) 2019 increment(6) 2020 increment(7) 2021 2022 VMOVDQU DWB0, (32*0)(ptx) 2023 VMOVDQU DWB1, (32*1)(ptx) 2024 VMOVDQU DWB2, (32*2)(ptx) 2025 VMOVDQU DWB3, (32*3)(ptx) 2026 2027 VPXOR ACC0, ACCM, ACCM 2028 VPXOR ACC1, ACCM, ACCM 2029 VPSLLDQ $8, ACCM, T0 2030 VPSRLDQ $8, ACCM, ACCM 2031 2032 VPXOR ACCM, ACC1, ACC1 2033 VPXOR T0, ACC0, ACC0 2034 2035 reduceRound(ACC0) 2036 reduceRound(ACC0) 2037 VPXOR ACC1, ACC0, ACC0 2038 2039 LEAQ 128(ptx), ptx 2040 LEAQ 128(ctx), ctx 2041 2042 JMP avx2GcmSm4DecOctetsLoop 2043 2044 avx2GcmSm4DecEndOctets: 2045 SUBQ $4, aluCTR 2046 2047 avx2GcmSm4DecNibbles: 2048 VMOVDQU flipMask<>(SB), B7 // DO NOT CHANGE B7 2049 CMPQ ptxLen, $64 2050 JBE avx2GcmSm4DecSingles 2051 SUBQ $64, ptxLen 2052 2053 VMOVDQU (0*16)(SP), B0 2054 VMOVDQU (1*16)(SP), B1 2055 VMOVDQU (2*16)(SP), B2 2056 VMOVDQU (3*16)(SP), B3 2057 2058 VPSHUFB B7, B0, B0 2059 VPSHUFB B7, B1, B1 2060 VPSHUFB B7, B2, B2 2061 VPSHUFB B7, B3, B3 2062 2063 TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1) 2064 XORL BX, BX 2065 VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK 2066 2067 avx2GcmSm4Dec4Loop2: 2068 AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3) 2069 AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0) 2070 AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1) 2071 AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, 
B3, B0, B1, B2) 2072 2073 ADDL $16, BX 2074 CMPL BX, $4*32 2075 JB avx2GcmSm4Dec4Loop2 2076 2077 // Transpose matrix 4 x 4 32bits word 2078 TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5) 2079 VPSHUFB BSWAP, B0, B4 2080 VPSHUFB BSWAP, B1, B1 2081 VPSHUFB BSWAP, B2, B2 2082 VPSHUFB BSWAP, B3, B3 2083 2084 VMOVDQU (16*14)(pTbl), T2 2085 VMOVDQU (16*0)(ctx), B0 2086 VPXOR B0, B4, B4 2087 internalDecGhashRound() 2088 2089 VMOVDQU (16*1)(ctx), B0 2090 VPXOR B0, B1, B1 2091 internalDecGhashRound() 2092 2093 VMOVDQU (16*2)(ctx), B0 2094 VPXOR B0, B2, B2 2095 internalDecGhashRound() 2096 2097 VMOVDQU (16*3)(ctx), B0 2098 VPXOR B0, B3, B3 2099 internalDecGhashRound() 2100 2101 VMOVDQU B4, (16*0)(ptx) 2102 VMOVDQU B1, (16*1)(ptx) 2103 VMOVDQU B2, (16*2)(ptx) 2104 VMOVDQU B3, (16*3)(ptx) 2105 2106 increment(0) 2107 increment(1) 2108 increment(2) 2109 increment(3) 2110 2111 LEAQ 64(ptx), ptx 2112 LEAQ 64(ctx), ctx 2113 2114 avx2GcmSm4DecSingles: 2115 TESTQ ptxLen, ptxLen 2116 JE avx2GcmSm4DecDone 2117 2118 VMOVDQU (0*16)(SP), B0 2119 VMOVDQU (1*16)(SP), B1 2120 VMOVDQU (2*16)(SP), B2 2121 VMOVDQU (3*16)(SP), B3 2122 2123 VPSHUFB B7, B0, B0 2124 VPSHUFB B7, B1, B1 2125 VPSHUFB B7, B2, B2 2126 VPSHUFB B7, B3, B3 2127 2128 TRANSPOSE_MATRIX(B0, B1, B2, B3, T0, T1) 2129 2130 XORL BX, BX 2131 VMOVDQU nibbleMask<>(SB), X_NIBBLE_MASK 2132 2133 avx2GcmSm4Dec4Loop1: 2134 AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3) 2135 AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0) 2136 AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1) 2137 AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2) 2138 2139 ADDL $16, BX 2140 CMPL BX, $4*32 2141 JB avx2GcmSm4Dec4Loop1 2142 2143 // Transpose matrix 4 x 4 32bits word 2144 TRANSPOSE_MATRIX(B0, B1, B2, B3, B4, B5) 2145 VPSHUFB BSWAP, B0, B0 2146 VPSHUFB BSWAP, B1, B1 2147 VPSHUFB BSWAP, B2, B2 2148 VPSHUFB BSWAP, B3, B3 2149 2150 VMOVDQU B0, (16*4)(SP) 2151 VMOVDQU B1, (16*5)(SP) 2152 VMOVDQU B2, (16*6)(SP) 2153 VMOVDQU B3, (16*7)(SP) 2154 2155 VMOVDQU (16*14)(pTbl), T2 2156 MOVQ SP, BP 2157 ADDQ $64, BP 2158 2159 avx2GcmSm4DecSinglesLoop: 2160 CMPQ ptxLen, $16 2161 JB avx2GcmSm4DecTail 2162 SUBQ $16, ptxLen 2163 2164 VMOVDQU (16*0)(BP), T0 2165 VMOVDQU (ctx), B0 2166 VPXOR T0, B0, T0 2167 VMOVDQU T0, (ptx) 2168 2169 internalDecGhashRound() 2170 LEAQ (16*1)(ptx), ptx 2171 LEAQ (16*1)(ctx), ctx 2172 ADDQ $16, BP 2173 JMP avx2GcmSm4DecSinglesLoop 2174 2175 avx2GcmSm4DecTail: 2176 TESTQ ptxLen, ptxLen 2177 JE avx2GcmSm4DecDone 2178 2179 MOVQ ptxLen, aluTMP 2180 SHLQ $4, aluTMP 2181 LEAQ andMask<>(SB), aluCTR 2182 VMOVDQU -16(aluCTR)(aluTMP*1), T1 // Fetch and-mask according ptxLen 2183 2184 VMOVDQU (ctx), B0 // I assume there is TAG attached to the ctx, and there is no read overflow 2185 VPAND T1, B0, B0 // Just keep ptxLen bytes, others will be zero 2186 2187 VMOVDQU B0, T1 2188 internalDecGhashRound() 2189 VMOVDQU (16*0)(BP), B0 2190 VPXOR T1, B0, B0 2191 2192 avx2PtxStoreLoop: 2193 PEXTRB $0, B0, (ptx) 2194 PSRLDQ $1, B0 2195 LEAQ 1(ptx), ptx 2196 DECQ ptxLen 2197 2198 JNE avx2PtxStoreLoop 2199 2200 avx2GcmSm4DecDone: 2201 VMOVDQU ACC0, (tPtr) 2202 VZEROUPPER 2203 RET 2204 2205 // func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) 2206 TEXT ·gcmSm4niEnc(SB),NOSPLIT,$0 2207 RET 2208 2209 // func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) 2210 TEXT ·gcmSm4niDec(SB),NOSPLIT,$0 2211 RET
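// Rough Go-level sketch of how a caller is expected to drive the routines in
// this file for one seal operation (argument order follows the func prototypes
// above; the surrounding Go wrapper, its buffers and variable names are
// assumptions):
//
//	gcmSm4Init(&productTable, rk)                // derive H and its powers
//	gcmSm4Data(&productTable, aad, &tagAcc)      // absorb additional data
//	gcmSm4Enc(&productTable, dst, plaintext, &ctr, &tagAcc, rk)
//	gcmSm4Finish(&productTable, &tagMask, &tagAcc, uint64(len(plaintext)), uint64(len(aad)))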