github.com/emmansun/gmsm@v0.29.1/zuc/asm_amd64.s

// Referenced Intel(R) Multi-Buffer Crypto for IPsec
// https://github.com/intel/intel-ipsec-mb/
// https://gist.github.com/emmansun/15d2fce6659ab97ffaf7ab66e278caee
//go:build !purego

#include "textflag.h"

DATA Top3_bits_of_the_byte<>+0x00(SB)/8, $0xe0e0e0e0e0e0e0e0
DATA Top3_bits_of_the_byte<>+0x08(SB)/8, $0xe0e0e0e0e0e0e0e0
GLOBL Top3_bits_of_the_byte<>(SB), RODATA, $16

DATA Bottom5_bits_of_the_byte<>+0x00(SB)/8, $0x1f1f1f1f1f1f1f1f
DATA Bottom5_bits_of_the_byte<>+0x08(SB)/8, $0x1f1f1f1f1f1f1f1f
GLOBL Bottom5_bits_of_the_byte<>(SB), RODATA, $16

DATA Low_nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA Low_nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL Low_nibble_mask<>(SB), RODATA, $16

DATA High_nibble_mask<>+0x00(SB)/8, $0xF0F0F0F0F0F0F0F0
DATA High_nibble_mask<>+0x08(SB)/8, $0xF0F0F0F0F0F0F0F0
GLOBL High_nibble_mask<>(SB), RODATA, $16

DATA P1<>+0x00(SB)/8, $0x0A020F0F0E000F09
DATA P1<>+0x08(SB)/8, $0x090305070C000400
GLOBL P1<>(SB), RODATA, $16

DATA P2<>+0x00(SB)/8, $0x040C000705060D08
DATA P2<>+0x08(SB)/8, $0x0209030F0A0E010B
GLOBL P2<>(SB), RODATA, $16

DATA P3<>+0x00(SB)/8, $0x0F0A0D00060A0602
DATA P3<>+0x08(SB)/8, $0x0D0C0900050D0303
GLOBL P3<>(SB), RODATA, $16

DATA Aes_to_Zuc_mul_low_nibble<>+0x00(SB)/8, $0x1D1C9F9E83820100
DATA Aes_to_Zuc_mul_low_nibble<>+0x08(SB)/8, $0x3938BBBAA7A62524
GLOBL Aes_to_Zuc_mul_low_nibble<>(SB), RODATA, $16

DATA Aes_to_Zuc_mul_high_nibble<>+0x00(SB)/8, $0xA174A97CDD08D500
DATA Aes_to_Zuc_mul_high_nibble<>+0x08(SB)/8, $0x3DE835E04194499C
GLOBL Aes_to_Zuc_mul_high_nibble<>(SB), RODATA, $16

DATA Comb_matrix_mul_low_nibble<>+0x00(SB)/8, $0xCFDB6571BEAA1400
DATA Comb_matrix_mul_low_nibble<>+0x08(SB)/8, $0x786CD2C6091DA3B7
GLOBL Comb_matrix_mul_low_nibble<>(SB), RODATA, $16

DATA Comb_matrix_mul_high_nibble<>+0x00(SB)/8, $0x638CFA1523CCBA55
DATA Comb_matrix_mul_high_nibble<>+0x08(SB)/8, $0x3FD0A6497F90E609
GLOBL Comb_matrix_mul_high_nibble<>(SB), RODATA, $16

DATA Shuf_mask<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA Shuf_mask<>+0x08(SB)/8, $0x0306090C0F020508
GLOBL Shuf_mask<>(SB), RODATA, $16

DATA Cancel_aes<>+0x00(SB)/8, $0x6363636363636363
DATA Cancel_aes<>+0x08(SB)/8, $0x6363636363636363
GLOBL Cancel_aes<>(SB), RODATA, $16

DATA CombMatrix<>+0x00(SB)/8, $0x3C1A99B2AD1ED43A
DATA CombMatrix<>+0x08(SB)/8, $0x3C1A99B2AD1ED43A
GLOBL CombMatrix<>(SB), RODATA, $16

DATA mask_S0<>+0x00(SB)/8, $0xff00ff00ff00ff00
DATA mask_S0<>+0x08(SB)/8, $0xff00ff00ff00ff00
GLOBL mask_S0<>(SB), RODATA, $16

DATA mask_S1<>+0x00(SB)/8, $0x00ff00ff00ff00ff
DATA mask_S1<>+0x08(SB)/8, $0x00ff00ff00ff00ff
GLOBL mask_S1<>(SB), RODATA, $16

// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), RODATA, $16

#define OFFSET_FR1 (16*4)
#define OFFSET_FR2 (17*4)
#define OFFSET_BRC_X0 (18*4)
#define OFFSET_BRC_X1 (19*4)
#define OFFSET_BRC_X2 (20*4)
#define OFFSET_BRC_X3 (21*4)

#define SHLDL(a, b, n) \ // no SHLDL in the Go assembler for now, emulate it
    SHLL n, a  \
    SHRL n, b  \
    ORL  b, a
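// For reference, a minimal Go sketch (illustration only, not used by this file)
// of what the SHLDL(a, b, $16) emulation above computes on 32-bit values:
//
//	func shld16(a, b uint32) uint32 { return (a << 16) | (b >> 16) }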
// Rotate left 5 bits in each byte, within an XMM register, SSE version.
#define Rotl_5_SSE(XDATA, XTMP0) \
    MOVOU XDATA, XTMP0  \
    PSLLL $5, XTMP0     \
    PSRLL $3, XDATA     \
    PAND  Top3_bits_of_the_byte<>(SB), XTMP0     \
    PAND  Bottom5_bits_of_the_byte<>(SB), XDATA  \
    POR   XTMP0, XDATA

// Compute 16 S0 box values from 16 bytes, SSE version.
#define S0_comput_SSE(IN_OUT, XTMP1, XTMP2) \
    MOVOU IN_OUT, XTMP1                 \
    \
    PAND Low_nibble_mask<>(SB), IN_OUT  \ // x2
    \
    PAND High_nibble_mask<>(SB), XTMP1  \
    PSRLQ $4, XTMP1                     \ // x1
    \
    MOVOU P1<>(SB), XTMP2               \
    PSHUFB IN_OUT, XTMP2                \ // P1[x2]
    PXOR XTMP1, XTMP2                   \ // q = x1 ^ P1[x2], XTMP1 free
    \
    MOVOU P2<>(SB), XTMP1               \
    PSHUFB XTMP2, XTMP1                 \ // P2[q]
    PXOR IN_OUT, XTMP1                  \ // r = x2 ^ P2[q]; IN_OUT free
    \
    MOVOU P3<>(SB), IN_OUT              \
    PSHUFB XTMP1, IN_OUT                \ // P3[r]
    PXOR XTMP2, IN_OUT                  \ // s = q ^ P3[r], XTMP2 free
    \ // s << 4 (since high nibble of each byte is 0, no masking is required)
    PSLLQ $4, IN_OUT                    \
    POR XTMP1, IN_OUT                   \ // t = (s << 4) | r
    Rotl_5_SSE(IN_OUT, XTMP1)

// Perform 8x8 matrix multiplication using lookup tables with partial results
// for high and low nibble of each input byte, SSE version.
#define MUL_PSHUFB_SSE(XIN, XLO, XHI_OUT, XTMP) \
    \ // Get low nibble of input data
    MOVOU Low_nibble_mask<>(SB), XTMP   \
    PAND XIN, XTMP                      \
    \ // Get low nibble of output
    PSHUFB XTMP, XLO                    \
    \ // Get high nibble of input data
    MOVOU High_nibble_mask<>(SB), XTMP  \
    PAND XIN, XTMP                      \
    PSRLQ $4, XTMP                      \
    \ // Get high nibble of output
    PSHUFB XTMP, XHI_OUT                \
    \ // XOR high and low nibbles to get full bytes
    PXOR XLO, XHI_OUT

// Compute 16 S1 box values from 16 bytes, stored in XMM register
#define S1_comput_SSE(XIN_OUT, XTMP1, XTMP2, XTMP3) \
    MOVOU Aes_to_Zuc_mul_low_nibble<>(SB), XTMP1     \
    MOVOU Aes_to_Zuc_mul_high_nibble<>(SB), XTMP2    \
    MUL_PSHUFB_SSE(XIN_OUT, XTMP1, XTMP2, XTMP3)     \
    \
    PSHUFB Shuf_mask<>(SB), XTMP2                    \
    AESENCLAST Cancel_aes<>(SB), XTMP2               \
    \
    MOVOU Comb_matrix_mul_low_nibble<>(SB), XTMP1    \
    MOVOU Comb_matrix_mul_high_nibble<>(SB), XIN_OUT \
    MUL_PSHUFB_SSE(XTMP2, XTMP1, XIN_OUT, XTMP3)

// Rotate left 5 bits in each byte, within an XMM register, AVX version.
#define Rotl_5_AVX(XDATA, XTMP0) \
    VPSLLD $5, XDATA, XTMP0  \
    VPSRLD $3, XDATA, XDATA  \
    VPAND Top3_bits_of_the_byte<>(SB), XTMP0, XTMP0     \
    VPAND Bottom5_bits_of_the_byte<>(SB), XDATA, XDATA  \
    VPOR XTMP0, XDATA, XDATA

// Compute 16 S0 box values from 16 bytes, AVX version.
#define S0_comput_AVX(IN_OUT, XTMP1, XTMP2) \
    VPAND High_nibble_mask<>(SB), IN_OUT, XTMP1  \
    VPSRLQ $4, XTMP1, XTMP1                      \ // x1
    \
    VPAND Low_nibble_mask<>(SB), IN_OUT, IN_OUT  \ // x2
    \
    VMOVDQU P1<>(SB), XTMP2                      \
    VPSHUFB IN_OUT, XTMP2, XTMP2                 \ // P1[x2]
    VPXOR XTMP1, XTMP2, XTMP2                    \ // q = x1 ^ P1[x2] ; XTMP1 free
    \
    VMOVDQU P2<>(SB), XTMP1                      \
    VPSHUFB XTMP2, XTMP1, XTMP1                  \ // P2[q]
    VPXOR IN_OUT, XTMP1, XTMP1                   \ // r = x2 ^ P2[q] ; IN_OUT free
    \
    VMOVDQU P3<>(SB), IN_OUT                     \
    VPSHUFB XTMP1, IN_OUT, IN_OUT                \ // P3[r]
    VPXOR XTMP2, IN_OUT, IN_OUT                  \ // s = q ^ P3[r] ; XTMP2 free
    \ // s << 4 (since high nibble of each byte is 0, no masking is required)
    VPSLLQ $4, IN_OUT, IN_OUT                    \
    VPOR XTMP1, IN_OUT, IN_OUT                   \ // t = (s << 4) | r
    Rotl_5_AVX(IN_OUT, XTMP1)
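// For reference, a minimal Go sketch of the per-byte S0 computation performed
// by S0_comput_SSE/S0_comput_AVX above (illustration only; p1, p2, p3 stand for
// the 16-entry nibble tables stored in P1<>, P2<> and P3<>):
//
//	func s0(b byte, p1, p2, p3 [16]byte) byte {
//		x1, x2 := b>>4, b&0x0F
//		q := x1 ^ p1[x2]
//		r := x2 ^ p2[q]
//		s := q ^ p3[r]
//		t := s<<4 | r
//		return t<<5 | t>>3 // rotate left by 5 within the byte
//	}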
// Perform 8x8 matrix multiplication using lookup tables with partial results
// for high and low nibble of each input byte, AVX version.
#define MUL_PSHUFB_AVX(XIN, XLO, XHI_OUT, XTMP) \
    \ // Get low nibble of input data
    VPAND Low_nibble_mask<>(SB), XIN, XTMP   \
    \ // Get low nibble of output
    VPSHUFB XTMP, XLO, XLO                   \
    \ // Get high nibble of input data
    VPAND High_nibble_mask<>(SB), XIN, XTMP  \
    VPSRLQ $4, XTMP, XTMP                    \
    \ // Get high nibble of output
    VPSHUFB XTMP, XHI_OUT, XHI_OUT           \
    \ // XOR high and low nibbles to get full bytes
    VPXOR XLO, XHI_OUT, XHI_OUT

// Compute 16 S1 box values from 16 bytes, stored in XMM register
#define S1_comput_AVX(XIN_OUT, XTMP1, XTMP2, XTMP3) \
    \ // gf2p8affineqb XIN_OUT, [rel Aes_to_Zuc], 0x00
    VMOVDQU Aes_to_Zuc_mul_low_nibble<>(SB), XTMP1     \
    VMOVDQU Aes_to_Zuc_mul_high_nibble<>(SB), XTMP2    \
    MUL_PSHUFB_AVX(XIN_OUT, XTMP1, XTMP2, XTMP3)       \
    \
    VPSHUFB Shuf_mask<>(SB), XTMP2, XTMP2              \
    VAESENCLAST Cancel_aes<>(SB), XTMP2, XTMP2         \
    \ // gf2p8affineqb XIN_OUT, [rel CombMatrix], 0x55
    VMOVDQU Comb_matrix_mul_low_nibble<>(SB), XTMP1    \
    VMOVDQU Comb_matrix_mul_high_nibble<>(SB), XIN_OUT \
    MUL_PSHUFB_AVX(XTMP2, XTMP1, XIN_OUT, XTMP3)

#define F_R1 R9
#define F_R2 R10
#define BRC_X0 R11
#define BRC_X1 R12
#define BRC_X2 R13
#define BRC_X3 R14

// BITS_REORG(idx)
//
// params
//   idx - round number
// uses
//   AX, BX, CX, DX
// return
//   updates R11, R12, R13, R14
//
#define BITS_REORG(idx) \
    MOVL (((15 + idx) % 16)*4)(SI), BRC_X0  \
    MOVL (((14 + idx) % 16)*4)(SI), AX      \
    MOVL (((11 + idx) % 16)*4)(SI), BRC_X1  \
    MOVL (((9 + idx) % 16)*4)(SI), BX       \
    MOVL (((7 + idx) % 16)*4)(SI), BRC_X2   \
    MOVL (((5 + idx) % 16)*4)(SI), CX       \
    MOVL (((2 + idx) % 16)*4)(SI), BRC_X3   \
    MOVL (((0 + idx) % 16)*4)(SI), DX       \
    SHRL $15, BRC_X0                        \
    SHLL $16, AX                            \
    SHLL $1, BX                             \
    SHLL $1, CX                             \
    SHLL $1, DX                             \
    SHLDL(BRC_X0, AX, $16)                  \
    SHLDL(BRC_X1, BX, $16)                  \
    SHLDL(BRC_X2, CX, $16)                  \
    SHLDL(BRC_X3, DX, $16)

// LFSR_UPDT calculates the next state word and places/overwrites it to lfsr[idx % 16]
//
// params
//   idx - round number
// uses
//   AX as input (ZERO or W), BX, CX, DX, R8
#define LFSR_UPDT(idx) \
    MOVL (((0 + idx) % 16)*4)(SI), BX   \
    MOVL (((4 + idx) % 16)*4)(SI), CX   \
    MOVL (((10 + idx) % 16)*4)(SI), DX  \
    MOVL (((13 + idx) % 16)*4)(SI), R8  \
    \ // Calculate 64-bit LFSR feedback
    ADDQ BX, AX                         \
    SHLQ $8, BX                         \
    SHLQ $20, CX                        \
    SHLQ $21, DX                        \
    SHLQ $17, R8                        \
    ADDQ BX, AX                         \
    ADDQ CX, AX                         \
    ADDQ DX, AX                         \
    ADDQ R8, AX                         \
    MOVL (((15 + idx) % 16)*4)(SI), R8  \
    SHLQ $15, R8                        \
    ADDQ R8, AX                         \
    \ // Reduce it to 31-bit value
    MOVQ AX, BX                         \
    ANDQ $0x7FFFFFFF, AX                \
    SHRQ $31, BX                        \
    ADDQ BX, AX                         \
    \
    MOVQ AX, BX                         \
    SUBQ $0x7FFFFFFF, AX                \
    CMOVQCS BX, AX                      \
    \ // LFSR_S16 = (LFSR_S15++) = AX
    MOVL AX, (((0 + idx) % 16)*4)(SI)
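// For reference (illustration only), the ZUC key-stream steps implemented by
// BITS_REORG, LFSR_UPDT and NONLIN_FUN are, with s0..s15 the 31-bit LFSR cells
// and R1/R2 the FSM registers:
//
//	X0 = s15H || s14L    X1 = s11L || s9H    X2 = s7L || s5H    X3 = s2L || s0H
//	s16 = (2^15*s15 + 2^17*s13 + 2^21*s10 + 2^20*s4 + (1+2^8)*s0 + u) mod (2^31 - 1)
//	      (u, passed in AX, is 0 in keystream mode)
//	W  = (X0 ^ R1) + R2
//	W1 = R1 + X1,  W2 = R2 ^ X2
//	R1 = S(L1(W1L || W2H)),  R2 = S(L2(W2L || W1H))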
#define NONLIN_FUN \
    MOVL BRC_X0, AX         \
    XORL F_R1, AX           \ // F_R1 xor BRC_X0
    ADDL F_R2, AX           \ // W = (F_R1 xor BRC_X0) + F_R2
    ADDL BRC_X1, F_R1       \ // W1 = F_R1 + BRC_X1
    XORL BRC_X2, F_R2       \ // W2 = F_R2 ^ BRC_X2
    \
    MOVL F_R1, DX           \
    MOVL F_R2, CX           \
    SHLDL(DX, CX, $16)      \ // P = (W1 << 16) | (W2 >> 16)
    SHLDL(F_R2, F_R1, $16)  \ // Q = (W2 << 16) | (W1 >> 16)
    MOVL DX, BX             \ // start L1
    MOVL DX, CX             \
    ROLL $2, BX             \
    ROLL $24, CX            \
    XORL CX, DX             \
    XORL BX, DX             \
    ROLL $8, BX             \
    XORL BX, DX             \
    ROLL $8, BX             \
    XORL BX, DX             \ // U = L1(P) = EDX, hi(RDX)=0
    MOVL F_R2, BX           \
    MOVL F_R2, CX           \
    ROLL $8, BX             \
    XORL BX, F_R2           \
    ROLL $14, CX            \
    XORL CX, F_R2           \
    ROLL $8, CX             \
    XORL CX, F_R2           \
    ROLL $8, CX             \
    XORL CX, F_R2           \ // V = L2(Q) = F_R2 (R10D), hi(R10)=0
    SHLQ $32, F_R2          \ // DX = V || U
    XORQ F_R2, DX

// Non-Linear function F, SSE version.
// uses
//   AX, BX, CX, DX, R8
//   X0, X1, X2, X3, X4
// return
//   W in AX
//   updated F_R1, F_R2
#define NONLIN_FUN_SSE \
    NONLIN_FUN                    \
    MOVQ DX, X0                   \
    MOVOU X0, X1                  \
    S0_comput_SSE(X1, X2, X3)     \
    S1_comput_SSE(X0, X2, X3, X4) \
    \
    PAND mask_S1<>(SB), X0        \
    PAND mask_S0<>(SB), X1        \
    PXOR X1, X0                   \
    \
    MOVL X0, F_R1                 \ // F_R1
    PEXTRD $1, X0, F_R2

// RESTORE_LFSR_0, rotate the LFSR left by one word: move the first 4 bytes to the end.
#define RESTORE_LFSR_0 \
    MOVL (0*4)(SI), AX     \ // first 4 bytes
    MOVUPS (4)(SI), X0     \
    MOVUPS (20)(SI), X1    \
    MOVUPS (36)(SI), X2    \
    MOVQ (52)(SI), BX      \
    MOVL (60)(SI), CX      \ // last 4 bytes
    \
    MOVUPS X0, (SI)        \
    MOVUPS X1, (16)(SI)    \
    MOVUPS X2, (32)(SI)    \
    MOVQ BX, (48)(SI)      \
    MOVL CX, (56)(SI)      \
    MOVL AX, (60)(SI)

// RESTORE_LFSR_2, rotate the LFSR left by two words: move the first 8 bytes to the end.
#define RESTORE_LFSR_2 \
    MOVQ (0)(SI), AX       \ // first 8 bytes
    MOVUPS (8)(SI), X0     \
    MOVUPS (24)(SI), X1    \
    MOVUPS (40)(SI), X2    \
    MOVQ (56)(SI), BX      \ // last 8 bytes
    \
    MOVUPS X0, (SI)        \
    MOVUPS X1, (16)(SI)    \
    MOVUPS X2, (32)(SI)    \
    MOVQ BX, (48)(SI)      \
    MOVQ AX, (56)(SI)

// RESTORE_LFSR_4, rotate the LFSR left by four words: move the first 16 bytes to the end.
#define RESTORE_LFSR_4 \
    MOVUPS (0)(SI), X0     \ // first 16 bytes
    MOVUPS (16)(SI), X1    \
    MOVUPS (32)(SI), X2    \
    MOVUPS (48)(SI), X3    \ // last 16 bytes
    \
    MOVUPS X1, (0)(SI)     \
    MOVUPS X2, (16)(SI)    \
    MOVUPS X3, (32)(SI)    \
    MOVUPS X0, (48)(SI)

// RESTORE_LFSR_8, rotate the LFSR left by eight words: move the first 32 bytes to the end.
#define RESTORE_LFSR_8 \
    MOVUPS (0)(SI), X0     \
    MOVUPS (16)(SI), X1    \
    MOVUPS (32)(SI), X2    \
    MOVUPS (48)(SI), X3    \
    \
    MOVUPS X2, (0)(SI)     \
    MOVUPS X3, (16)(SI)    \
    MOVUPS X0, (32)(SI)    \
    MOVUPS X1, (48)(SI)
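// The ROUND_* macros further below address the LFSR with fixed indices, so
// after producing n < 16 words the state array has to be rotated left by n
// words for index 0 to point at the oldest cell again; the RESTORE_LFSR_*
// macros above perform these rotations (by 1, 2, 4 and 8 words respectively).
// A minimal Go sketch of the same rotation (illustration only, assuming the
// LFSR is a [16]uint32):
//
//	func rotateLFSR(lfsr *[16]uint32, n int) {
//		var tmp [16]uint32
//		for i := range tmp {
//			tmp[i] = lfsr[(i+n)%16]
//		}
//		*lfsr = tmp
//	}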
// Non-Linear function F, AVX version.
// uses
//   AX, BX, CX, DX, R8
//   X0, X1, X2, X3, X4
// return
//   W in AX
//   updated F_R1, F_R2
#define NONLIN_FUN_AVX \
    NONLIN_FUN                    \
    VMOVQ DX, X0                  \
    VMOVDQA X0, X1                \
    S0_comput_AVX(X1, X2, X3)     \
    S1_comput_AVX(X0, X2, X3, X4) \
    \
    VPAND mask_S1<>(SB), X0, X0   \
    VPAND mask_S0<>(SB), X1, X1   \
    VPXOR X1, X0, X0              \
    \
    MOVL X0, F_R1                 \ // F_R1
    VPEXTRD $1, X0, F_R2

#define LOAD_STATE \
    MOVL OFFSET_FR1(SI), F_R1      \
    MOVL OFFSET_FR2(SI), F_R2      \
    MOVL OFFSET_BRC_X0(SI), BRC_X0 \
    MOVL OFFSET_BRC_X1(SI), BRC_X1 \
    MOVL OFFSET_BRC_X2(SI), BRC_X2 \
    MOVL OFFSET_BRC_X3(SI), BRC_X3

#define SAVE_STATE \
    MOVL F_R1, OFFSET_FR1(SI)      \
    MOVL F_R2, OFFSET_FR2(SI)      \
    MOVL BRC_X0, OFFSET_BRC_X0(SI) \
    MOVL BRC_X1, OFFSET_BRC_X1(SI) \
    MOVL BRC_X2, OFFSET_BRC_X2(SI) \
    MOVL BRC_X3, OFFSET_BRC_X3(SI)

// func genKeywordAsm(s *zucState32) uint32
TEXT ·genKeywordAsm(SB),NOSPLIT,$0
    MOVQ pState+0(FP), SI

    LOAD_STATE

    BITS_REORG(0)
    CMPB ·useAVX(SB), $1
    JE   avx

sse:
    NONLIN_FUN_SSE

    // (BRC_X3 xor W) as result
    XORL BRC_X3, AX
    MOVL AX, ret+8(FP)

    // LFSRWithWorkMode
    XORQ AX, AX
    LFSR_UPDT(0)

    SAVE_STATE
    RESTORE_LFSR_0

    RET

avx:
    NONLIN_FUN_AVX

    // (BRC_X3 xor W) as result
    XORL BRC_X3, AX
    MOVL AX, ret+8(FP)

    // LFSRWithWorkMode
    XORQ AX, AX
    LFSR_UPDT(0)

    SAVE_STATE
    RESTORE_LFSR_0

    RET

#define ROUND_SSE(idx) \
    BITS_REORG(idx)      \
    NONLIN_FUN_SSE       \
    XORL BRC_X3, AX      \
    MOVL AX, (idx*4)(DI) \
    XORQ AX, AX          \
    LFSR_UPDT(idx)

#define ROUND_AVX(idx) \
    BITS_REORG(idx)      \
    NONLIN_FUN_AVX       \
    XORL BRC_X3, AX      \
    MOVL AX, (idx*4)(DI) \
    XORQ AX, AX          \
    LFSR_UPDT(idx)

#define ROUND_REV32_SSE(idx) \
    BITS_REORG(idx)      \
    NONLIN_FUN_SSE       \
    XORL BRC_X3, AX      \
    BSWAPL AX            \
    MOVL AX, (idx*4)(DI) \
    XORQ AX, AX          \
    LFSR_UPDT(idx)

#define ROUND_REV32_AVX(idx) \
    BITS_REORG(idx)      \
    NONLIN_FUN_AVX       \
    XORL BRC_X3, AX      \
    BSWAPL AX            \
    MOVL AX, (idx*4)(DI) \
    XORQ AX, AX          \
    LFSR_UPDT(idx)
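// The key-stream functions below unroll the generation loop in batches of
// 16, 8, 4, 2 and 1 word, using the fixed-index ROUND_* macros above and, for
// partial batches, rotating the LFSR afterwards with the matching
// RESTORE_LFSR_* macro. ROUND_REV32_* additionally byte-swaps each word
// (BSWAPL) so the key stream is stored big-endian. Per word, one round is
// equivalent to this Go sketch (illustration only; the helper names
// bitsReorg, nonlinF and lfsrWithWorkMode are hypothetical):
//
//	func genKeyword(s *zucState32) uint32 {
//		x0, x1, x2, x3 := bitsReorg(s) // BITS_REORG
//		w := nonlinF(s, x0, x1, x2)    // NONLIN_FUN: returns W, updates R1/R2
//		lfsrWithWorkMode(s, 0)         // LFSR_UPDT with u = 0
//		return w ^ x3                  // key-stream word
//	}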
// func genKeyStreamAsm(keyStream []uint32, pState *zucState32)
TEXT ·genKeyStreamAsm(SB),NOSPLIT,$0
    MOVQ ks+0(FP), DI
    MOVQ ks_len+8(FP), BP
    MOVQ pState+24(FP), SI

    LOAD_STATE

    CMPB ·useAVX(SB), $1
    JE   avxZucSixteens

sseZucSixteens:
    CMPQ BP, $16
    JB   sseZucOctet
    SUBQ $16, BP
    ROUND_SSE(0)
    ROUND_SSE(1)
    ROUND_SSE(2)
    ROUND_SSE(3)
    ROUND_SSE(4)
    ROUND_SSE(5)
    ROUND_SSE(6)
    ROUND_SSE(7)
    ROUND_SSE(8)
    ROUND_SSE(9)
    ROUND_SSE(10)
    ROUND_SSE(11)
    ROUND_SSE(12)
    ROUND_SSE(13)
    ROUND_SSE(14)
    ROUND_SSE(15)
    LEAQ 64(DI), DI
    JMP  sseZucSixteens

sseZucOctet:
    CMPQ BP, $8
    JB   sseZucNibble
    SUBQ $8, BP
    ROUND_SSE(0)
    ROUND_SSE(1)
    ROUND_SSE(2)
    ROUND_SSE(3)
    ROUND_SSE(4)
    ROUND_SSE(5)
    ROUND_SSE(6)
    ROUND_SSE(7)
    LEAQ 32(DI), DI
    RESTORE_LFSR_8

sseZucNibble:
    CMPQ BP, $4
    JB   sseZucDouble
    SUBQ $4, BP
    ROUND_SSE(0)
    ROUND_SSE(1)
    ROUND_SSE(2)
    ROUND_SSE(3)
    LEAQ 16(DI), DI
    RESTORE_LFSR_4

sseZucDouble:
    CMPQ BP, $2
    JB   sseZucSingle
    SUBQ $2, BP
    ROUND_SSE(0)
    ROUND_SSE(1)
    LEAQ 8(DI), DI
    RESTORE_LFSR_2

sseZucSingle:
    TESTQ BP, BP
    JE    sseZucRet
    ROUND_SSE(0)
    RESTORE_LFSR_0

sseZucRet:
    SAVE_STATE
    RET

avxZucSixteens:
    CMPQ BP, $16
    JB   avxZucOctet
    SUBQ $16, BP
    ROUND_AVX(0)
    ROUND_AVX(1)
    ROUND_AVX(2)
    ROUND_AVX(3)
    ROUND_AVX(4)
    ROUND_AVX(5)
    ROUND_AVX(6)
    ROUND_AVX(7)
    ROUND_AVX(8)
    ROUND_AVX(9)
    ROUND_AVX(10)
    ROUND_AVX(11)
    ROUND_AVX(12)
    ROUND_AVX(13)
    ROUND_AVX(14)
    ROUND_AVX(15)
    LEAQ 64(DI), DI
    JMP  avxZucSixteens

avxZucOctet:
    CMPQ BP, $8
    JB   avxZucNibble
    SUBQ $8, BP
    ROUND_AVX(0)
    ROUND_AVX(1)
    ROUND_AVX(2)
    ROUND_AVX(3)
    ROUND_AVX(4)
    ROUND_AVX(5)
    ROUND_AVX(6)
    ROUND_AVX(7)
    LEAQ 32(DI), DI
    RESTORE_LFSR_8

avxZucNibble:
    CMPQ BP, $4
    JB   avxZucDouble
    SUBQ $4, BP
    ROUND_AVX(0)
    ROUND_AVX(1)
    ROUND_AVX(2)
    ROUND_AVX(3)
    LEAQ 16(DI), DI
    RESTORE_LFSR_4

avxZucDouble:
    CMPQ BP, $2
    JB   avxZucSingle
    SUBQ $2, BP
    ROUND_AVX(0)
    ROUND_AVX(1)
    LEAQ 8(DI), DI
    RESTORE_LFSR_2

avxZucSingle:
    TESTQ BP, BP
    JE    avxZucRet
    ROUND_AVX(0)
    RESTORE_LFSR_0

avxZucRet:
    SAVE_STATE
    RET

// func genKeyStreamRev32Asm(keyStream []byte, pState *zucState32)
TEXT ·genKeyStreamRev32Asm(SB),NOSPLIT,$0
    MOVQ ks+0(FP), DI
    MOVQ ks_len+8(FP), BP
    MOVQ pState+24(FP), SI

    SHRQ $2, BP  // byte length / 4 = number of 32-bit words

    LOAD_STATE

    CMPB ·useAVX(SB), $1
    JE   avxZucSixteens

sseZucSixteens:
    CMPQ BP, $16
    JB   sseZucOctet
    SUBQ $16, BP
    ROUND_REV32_SSE(0)
    ROUND_REV32_SSE(1)
    ROUND_REV32_SSE(2)
    ROUND_REV32_SSE(3)
    ROUND_REV32_SSE(4)
    ROUND_REV32_SSE(5)
    ROUND_REV32_SSE(6)
    ROUND_REV32_SSE(7)
    ROUND_REV32_SSE(8)
    ROUND_REV32_SSE(9)
    ROUND_REV32_SSE(10)
    ROUND_REV32_SSE(11)
    ROUND_REV32_SSE(12)
    ROUND_REV32_SSE(13)
    ROUND_REV32_SSE(14)
    ROUND_REV32_SSE(15)
    LEAQ 64(DI), DI
    JMP  sseZucSixteens

sseZucOctet:
    CMPQ BP, $8
    JB   sseZucNibble
    SUBQ $8, BP
    ROUND_REV32_SSE(0)
    ROUND_REV32_SSE(1)
    ROUND_REV32_SSE(2)
    ROUND_REV32_SSE(3)
    ROUND_REV32_SSE(4)
    ROUND_REV32_SSE(5)
    ROUND_REV32_SSE(6)
    ROUND_REV32_SSE(7)
    LEAQ 32(DI), DI
    RESTORE_LFSR_8

sseZucNibble:
    CMPQ BP, $4
    JB   sseZucDouble
    SUBQ $4, BP
    ROUND_REV32_SSE(0)
    ROUND_REV32_SSE(1)
    ROUND_REV32_SSE(2)
    ROUND_REV32_SSE(3)
    LEAQ 16(DI), DI
    RESTORE_LFSR_4

sseZucDouble:
    CMPQ BP, $2
    JB   sseZucSingle
    SUBQ $2, BP
    ROUND_REV32_SSE(0)
    ROUND_REV32_SSE(1)
    LEAQ 8(DI), DI
    RESTORE_LFSR_2

sseZucSingle:
    TESTQ BP, BP
    JE    sseZucRet
    ROUND_REV32_SSE(0)
    RESTORE_LFSR_0

sseZucRet:
    SAVE_STATE
    RET

avxZucSixteens:
    CMPQ BP, $16
    JB   avxZucOctet
    SUBQ $16, BP
    ROUND_REV32_AVX(0)
    ROUND_REV32_AVX(1)
    ROUND_REV32_AVX(2)
    ROUND_REV32_AVX(3)
    ROUND_REV32_AVX(4)
    ROUND_REV32_AVX(5)
    ROUND_REV32_AVX(6)
    ROUND_REV32_AVX(7)
    ROUND_REV32_AVX(8)
    ROUND_REV32_AVX(9)
    ROUND_REV32_AVX(10)
    ROUND_REV32_AVX(11)
    ROUND_REV32_AVX(12)
    ROUND_REV32_AVX(13)
    ROUND_REV32_AVX(14)
    ROUND_REV32_AVX(15)
    LEAQ 64(DI), DI
    JMP  avxZucSixteens

avxZucOctet:
    CMPQ BP, $8
    JB   avxZucNibble
    SUBQ $8, BP
    ROUND_REV32_AVX(0)
    ROUND_REV32_AVX(1)
    ROUND_REV32_AVX(2)
    ROUND_REV32_AVX(3)
    ROUND_REV32_AVX(4)
    ROUND_REV32_AVX(5)
    ROUND_REV32_AVX(6)
    ROUND_REV32_AVX(7)
    LEAQ 32(DI), DI
    RESTORE_LFSR_8

avxZucNibble:
    CMPQ BP, $4
    JB   avxZucDouble
    SUBQ $4, BP
    ROUND_REV32_AVX(0)
    ROUND_REV32_AVX(1)
    ROUND_REV32_AVX(2)
    ROUND_REV32_AVX(3)
    LEAQ 16(DI), DI
    RESTORE_LFSR_4

avxZucDouble:
    CMPQ BP, $2
    JB   avxZucSingle
    SUBQ $2, BP
    ROUND_REV32_AVX(0)
    ROUND_REV32_AVX(1)
    LEAQ 8(DI), DI
    RESTORE_LFSR_2

avxZucSingle:
    TESTQ BP, BP
    JE    avxZucRet
    ROUND_REV32_AVX(0)
    RESTORE_LFSR_0

avxZucRet:
    SAVE_STATE
    RET