// Referenced Intel(R) Multi-Buffer Crypto for IPsec
// https://github.com/intel/intel-ipsec-mb/
//go:build !purego

#include "textflag.h"

DATA bit_reverse_table_l<>+0x00(SB)/8, $0x0e060a020c040800
DATA bit_reverse_table_l<>+0x08(SB)/8, $0x0f070b030d050901
GLOBL bit_reverse_table_l<>(SB), RODATA, $16

DATA bit_reverse_table_h<>+0x00(SB)/8, $0xe060a020c0408000
DATA bit_reverse_table_h<>+0x08(SB)/8, $0xf070b030d0509010
GLOBL bit_reverse_table_h<>(SB), RODATA, $16

DATA bit_reverse_and_table<>+0x00(SB)/8, $0x0f0f0f0f0f0f0f0f
DATA bit_reverse_and_table<>+0x08(SB)/8, $0x0f0f0f0f0f0f0f0f
GLOBL bit_reverse_and_table<>(SB), RODATA, $16

DATA shuf_mask_dw0_0_dw1_0<>+0x00(SB)/8, $0xffffffff03020100
DATA shuf_mask_dw0_0_dw1_0<>+0x08(SB)/8, $0xffffffff07060504
GLOBL shuf_mask_dw0_0_dw1_0<>(SB), RODATA, $16

DATA shuf_mask_0_0_dw1_0<>+0x00(SB)/8, $0xffffffffffffffff
DATA shuf_mask_0_0_dw1_0<>+0x08(SB)/8, $0xffffffff07060504
GLOBL shuf_mask_0_0_dw1_0<>(SB), RODATA, $16

DATA shuf_mask_0_0_0_dw1<>+0x00(SB)/8, $0xffffffffffffffff
DATA shuf_mask_0_0_0_dw1<>+0x08(SB)/8, $0x07060504ffffffff
GLOBL shuf_mask_0_0_0_dw1<>(SB), RODATA, $16

DATA shuf_mask_dw2_0_dw3_0<>+0x00(SB)/8, $0xffffffff0b0a0908
DATA shuf_mask_dw2_0_dw3_0<>+0x08(SB)/8, $0xffffffff0f0e0d0c
GLOBL shuf_mask_dw2_0_dw3_0<>(SB), RODATA, $16

DATA bits_32_63<>+0x00(SB)/8, $0xffffffff00000000
DATA bits_32_63<>+0x08(SB)/8, $0x0000000000000000
GLOBL bits_32_63<>(SB), RODATA, $16

#define XTMP1 X1
#define XTMP2 X2
#define XTMP3 X3
#define XTMP4 X4
#define XTMP5 X5
#define XTMP6 X6
#define XDATA X7
#define XDIGEST X8
#define KS_L X9
#define KS_M1 X10
#define KS_M2 X11
#define KS_H X12
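
// The routines below implement one authentication round each of the ZUC-256
// EIA MAC, for 64-bit and 128-bit tags, using the byte bit-reversal plus
// carry-less multiplication (PCLMULQDQ) approach taken from intel-ipsec-mb
// (referenced above).
//
// As a rough reference sketch (illustrative pseudocode only, not the Go code
// in this package), one round folds 16 message bytes into the running tag:
//
//	for i := 0; i < 128; i++ {
//		if message bit i is set {            // bits taken MSB-first per byte
//			tag ^= keystream bits [i, i+tagBits)
//		}
//	}
//	// the keystream window then advances by 128 bits
//
// Instead of looping over bits, the data bytes are bit-reversed and every
// 32-bit data word is carry-less multiplied against 64-bit keystream
// windows; bits 63-32 of the XORed products give one 32-bit word of the tag.
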
// func eia256RoundTag8(t *uint32, keyStream *uint32, p *byte)
TEXT ·eia256RoundTag8(SB),NOSPLIT,$0
	MOVQ t+0(FP), AX
	MOVQ ks+8(FP), BX
	MOVQ p+16(FP), CX

	CMPB ·useAVX(SB), $1
	JE   avx

	// Reverse data bytes
	MOVUPS (0)(CX), XDATA
	MOVOU bit_reverse_and_table<>(SB), XTMP4
	MOVOU XDATA, XTMP2
	PAND XTMP4, XTMP2

	PANDN XDATA, XTMP4
	PSRLQ $4, XTMP4

	MOVOU bit_reverse_table_h<>(SB), XTMP3
	PSHUFB XTMP2, XTMP3

	MOVOU bit_reverse_table_l<>(SB), XTMP1
	PSHUFB XTMP4, XTMP1

	PXOR XTMP1, XTMP3 // XTMP3 - bit reverse data bytes

	// ZUC authentication part, 4x32 data bits
	// setup KS
	MOVUPS (0*4)(BX), XTMP1
	MOVUPS (2*4)(BX), XTMP2
	MOVUPS (4*4)(BX), XTMP4
	PSHUFD $0x61, XTMP1, KS_L  // KS bits [63:32 31:0 95:64 63:32]
	PSHUFD $0x61, XTMP2, KS_M1 // KS bits [127:96 95:64 159:128 127:96]
	PSHUFD $0x61, XTMP4, KS_M2 // KS bits [191:160 159:128 223:192 191:160]

	// setup DATA
	MOVOU XTMP3, XTMP1
	PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1
	MOVOU XTMP1, XTMP2 // XTMP1/2 - Data bits [31:0 0s 63:32 0s]

	PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP3
	MOVOU XTMP3, XDIGEST // XDIGEST/XTMP3 - Data bits [95:64 0s 127:96 0s]

	// clmul
	// xor the results from 4 32-bit words together
	// Save data for following products
	MOVOU XTMP2, XTMP5 // Data bits [31:0 0s 63:32 0s]
	MOVOU XTMP3, XTMP6 // Data bits [95:64 0s 127:96 0s]

	// Calculate lower 32 bits of tag
	PCLMULQDQ $0x00, KS_L, XTMP1
	PCLMULQDQ $0x11, KS_L, XTMP2
	PCLMULQDQ $0x00, KS_M1, XDIGEST
	PCLMULQDQ $0x11, KS_M1, XTMP3

	// XOR all the products and move bits 63-32 to the lower 32 bits
	PXOR XTMP1, XTMP2
	PXOR XTMP3, XDIGEST
	PXOR XTMP2, XDIGEST
	MOVQ XDIGEST, XDIGEST // Clear top 64 bits
	PSRLDQ $4, XDIGEST

	// Prepare data and calculate bits 63-32 of tag
	MOVOU XTMP5, XTMP1
	MOVOU XTMP5, XTMP2
	MOVOU XTMP6, XTMP3
	MOVOU XTMP6, XTMP4

	PCLMULQDQ $0x10, KS_L, XTMP1
	PCLMULQDQ $0x01, KS_M1, XTMP2
	PCLMULQDQ $0x10, KS_M1, XTMP3
	PCLMULQDQ $0x01, KS_M2, XTMP4

	// XOR all the products and keep only bits 63-32
	PXOR XTMP2, XTMP1
	PXOR XTMP4, XTMP3
	PXOR XTMP3, XTMP1
	PAND bits_32_63<>(SB), XTMP1

	// OR with lower 32 bits, to construct 64 bits of tag
	POR XTMP1, XDIGEST

	// Update tag
	MOVQ XDIGEST, R10
	XORQ R10, (AX)

	// Copy last 16 bytes of KS to the front
	MOVUPS (4*4)(BX), XTMP1
	MOVUPS XTMP1, (0*4)(BX)

	RET

avx:
	VMOVDQU (0)(CX), XDATA

	// Reverse data bytes
	VMOVDQU bit_reverse_and_table<>(SB), XTMP1
	VPAND   XTMP1, XDATA, XTMP2
	VPANDN  XDATA, XTMP1, XTMP3
	VPSRLD  $4, XTMP3, XTMP3

	VMOVDQU bit_reverse_table_h<>(SB), XTMP1
	VPSHUFB XTMP2, XTMP1, XTMP4
	VMOVDQU bit_reverse_table_l<>(SB), XTMP1
	VPSHUFB XTMP3, XTMP1, XTMP1
	VPOR    XTMP1, XTMP4, XTMP4

	// ZUC authentication part, 4x32 data bits
	// setup KS
	VPSHUFD $0x61, (0*4)(BX), KS_L  // KS bits [63:32 31:0 95:64 63:32]
	VPSHUFD $0x61, (2*4)(BX), KS_M1 // KS bits [127:96 95:64 159:128 127:96]
	VPSHUFD $0x61, (4*4)(BX), KS_M2 // KS bits [191:160 159:128 223:192 191:160]

	// setup DATA
	// Data bytes [31:0 0s 63:32 0s]
	VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP4, XTMP1
	// Data bytes [95:64 0s 127:96 0s]
	VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP4, XTMP2

	// clmul
	// xor the results from 4 32-bit words together
	// Calculate lower 32 bits of tag
	VPCLMULQDQ $0x00, KS_L, XTMP1, XTMP3
	VPCLMULQDQ $0x11, KS_L, XTMP1, XTMP4
	VPCLMULQDQ $0x00, KS_M1, XTMP2, XTMP5
	VPCLMULQDQ $0x11, KS_M1, XTMP2, XTMP6

	VPXOR XTMP3, XTMP4, XTMP3
	VPXOR XTMP5, XTMP6, XTMP5
	VPXOR XTMP3, XTMP5, XTMP3

	// Move bits 63-32 of the combined products down to the low 32 bits of the digest
	VMOVQ   XTMP3, XTMP3 // Clear top 64 bits
	VPSRLDQ $4, XTMP3, XDIGEST

	VPCLMULQDQ $0x10, KS_L, XTMP1, XTMP3
	VPCLMULQDQ $0x01, KS_M1, XTMP1, XTMP4
	VPCLMULQDQ $0x10, KS_M1, XTMP2, XTMP5
	VPCLMULQDQ $0x01, KS_M2, XTMP2, XTMP6

	// XOR all the products and keep only bits 63-32
	VPXOR XTMP4, XTMP3, XTMP3
	VPXOR XTMP6, XTMP5, XTMP5
	VPXOR XTMP5, XTMP3, XTMP3
	VPAND bits_32_63<>(SB), XTMP3, XTMP3

	// Merge bits 63-32 into the digest
	VPXOR XTMP3, XDIGEST, XDIGEST

	// Update tag
	VMOVQ XDIGEST, R10
	XORQ  R10, (AX)

	// Copy last 16 bytes of KS to the front
	VMOVDQU (4*4)(BX), XTMP1
	VMOVDQU XTMP1, (0*4)(BX)

	VZEROUPPER
	RET
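
// eia256RoundTag16 is the 128-bit-tag variant. The running tag at t is four
// 32-bit words; tag word j is, in effect, accumulated from keystream windows
// that start 32*j bits further into the keystream, which is why four window
// registers (KS_L, KS_M1, KS_M2, KS_H) are built from the first eight
// keystream words and the whole 16-byte digest is XORed into (AX) at the end.
// Otherwise the structure mirrors eia256RoundTag8 above.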
// func eia256RoundTag16(t *uint32, keyStream *uint32, p *byte)
TEXT ·eia256RoundTag16(SB),NOSPLIT,$0
	MOVQ t+0(FP), AX
	MOVQ ks+8(FP), BX
	MOVQ p+16(FP), CX

	CMPB ·useAVX(SB), $1
	JE   avx

	// Reverse data bytes
	MOVUPS (0)(CX), XDATA
	MOVOU bit_reverse_and_table<>(SB), XTMP4
	MOVOU XDATA, XTMP2
	PAND XTMP4, XTMP2

	PANDN XDATA, XTMP4
	PSRLQ $4, XTMP4

	MOVOU bit_reverse_table_h<>(SB), XTMP3
	PSHUFB XTMP2, XTMP3

	MOVOU bit_reverse_table_l<>(SB), XTMP1
	PSHUFB XTMP4, XTMP1

	PXOR XTMP1, XTMP3 // XTMP3 - bit reverse data bytes

	// ZUC authentication part, 4x32 data bits
	// setup KS
	MOVUPS (0*4)(BX), XTMP1
	MOVUPS (2*4)(BX), XTMP2
	MOVUPS (4*4)(BX), XTMP4
	PSHUFD $0x61, XTMP1, KS_L  // KS bits [63:32 31:0 95:64 63:32]
	PSHUFD $0x61, XTMP2, KS_M1 // KS bits [127:96 95:64 159:128 127:96]
	PSHUFD $0x61, XTMP4, KS_M2 // KS bits [191:160 159:128 223:192 191:160]
	PSHUFD $0xBB, XTMP4, KS_H  // KS bits [255:224 223:192 255:224 223:192]

	// setup DATA
	MOVOU XTMP3, XTMP1
	PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1
	MOVOU XTMP1, XTMP2 // XTMP1/2 - Data bits [31:0 0s 63:32 0s]

	PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP3
	MOVOU XTMP3, XDIGEST // XDIGEST/XTMP3 - Data bits [95:64 0s 127:96 0s]

	// clmul
	// xor the results from 4 32-bit words together
	// Save data for following products
	MOVOU XTMP2, XTMP5 // Data bits [31:0 0s 63:32 0s]
	MOVOU XTMP3, XTMP6 // Data bits [95:64 0s 127:96 0s]

	// Calculate lower 32 bits of tag
	PCLMULQDQ $0x00, KS_L, XTMP1
	PCLMULQDQ $0x11, KS_L, XTMP2
	PCLMULQDQ $0x00, KS_M1, XDIGEST
	PCLMULQDQ $0x11, KS_M1, XTMP3

	// XOR all the products and move bits 63-32 to the lower 32 bits
	PXOR XTMP1, XTMP2
	PXOR XTMP3, XDIGEST
	PXOR XTMP2, XDIGEST
	MOVQ XDIGEST, XDIGEST // Clear top 64 bits
	PSRLDQ $4, XDIGEST

	// Prepare data and calculate bits 63-32 of tag
	MOVOU XTMP5, XTMP1
	MOVOU XTMP5, XTMP2
	MOVOU XTMP6, XTMP3
	MOVOU XTMP6, XTMP4

	PCLMULQDQ $0x10, KS_L, XTMP1
	PCLMULQDQ $0x01, KS_M1, XTMP2
	PCLMULQDQ $0x10, KS_M1, XTMP3
	PCLMULQDQ $0x01, KS_M2, XTMP4

	// XOR all the products and keep only bits 63-32
	PXOR XTMP2, XTMP1
	PXOR XTMP4, XTMP3
	PXOR XTMP3, XTMP1
	PAND bits_32_63<>(SB), XTMP1

	// OR with lower 32 bits, to construct 64 bits of tag
	POR XTMP1, XDIGEST
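
	// The next two blocks repeat the same product/XOR pattern with the
	// keystream windows advanced by a further 64 and 96 bits; the byte
	// shuffles shuf_mask_0_0_dw1_0 and shuf_mask_0_0_0_dw1 then place each
	// 32-bit result into dword 2 and dword 3 of XDIGEST before it is ORed
	// in, so that all four tag words end up in a single 128-bit register.
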
	// Prepare data and calculate bits 95-64 of tag
	MOVOU XTMP5, XTMP1
	MOVOU XTMP5, XTMP2
	MOVOU XTMP6, XTMP3
	MOVOU XTMP6, XTMP4

	PCLMULQDQ $0x00, KS_M1, XTMP1
	PCLMULQDQ $0x11, KS_M1, XTMP2
	PCLMULQDQ $0x00, KS_M2, XTMP3
	PCLMULQDQ $0x11, KS_M2, XTMP4

	// XOR all the products and move bits 63-32 to bits 95-64
	PXOR XTMP2, XTMP1
	PXOR XTMP4, XTMP3
	PXOR XTMP3, XTMP1
	PSHUFB shuf_mask_0_0_dw1_0<>(SB), XTMP1

	// OR with lower 64 bits, to construct 96 bits of tag
	POR XTMP1, XDIGEST

	// Prepare data and calculate bits 127-96 of tag
	MOVOU XTMP5, XTMP1
	MOVOU XTMP5, XTMP2
	MOVOU XTMP6, XTMP3
	MOVOU XTMP6, XTMP4

	PCLMULQDQ $0x10, KS_M1, XTMP1
	PCLMULQDQ $0x01, KS_M2, XTMP2
	PCLMULQDQ $0x10, KS_M2, XTMP3
	PCLMULQDQ $0x01, KS_H, XTMP4

	// XOR all the products and move bits 63-32 to bits 127-96
	PXOR XTMP2, XTMP1
	PXOR XTMP4, XTMP3
	PXOR XTMP3, XTMP1
	PSHUFB shuf_mask_0_0_0_dw1<>(SB), XTMP1

	// OR with lower 96 bits, to construct 128 bits of tag
	POR XTMP1, XDIGEST

	// Update tag
	MOVUPS (AX), XTMP1
	PXOR XTMP1, XDIGEST
	MOVUPS XDIGEST, (AX)

	// Copy last 16 bytes of KS to the front
	MOVUPS (4*4)(BX), XTMP1
	MOVUPS XTMP1, (0*4)(BX)

	RET

avx:
	VMOVDQU (0)(CX), XDATA

	// Reverse data bytes
	VMOVDQU bit_reverse_and_table<>(SB), XTMP1
	VPAND   XTMP1, XDATA, XTMP2
	VPANDN  XDATA, XTMP1, XTMP3
	VPSRLD  $4, XTMP3, XTMP3

	VMOVDQU bit_reverse_table_h<>(SB), XTMP1
	VPSHUFB XTMP2, XTMP1, XTMP4
	VMOVDQU bit_reverse_table_l<>(SB), XTMP1
	VPSHUFB XTMP3, XTMP1, XTMP1
	VPOR    XTMP1, XTMP4, XTMP4

	// ZUC authentication part, 4x32 data bits
	// setup KS
	VPSHUFD $0x61, (0*4)(BX), KS_L  // KS bits [63:32 31:0 95:64 63:32]
	VPSHUFD $0x61, (2*4)(BX), KS_M1 // KS bits [127:96 95:64 159:128 127:96]
	VPSHUFD $0x61, (4*4)(BX), KS_M2 // KS bits [191:160 159:128 223:192 191:160]
	VPSHUFD $0xBB, (4*4)(BX), KS_H  // KS bits [255:224 223:192 255:224 223:192]

	// setup DATA
	// Data bytes [31:0 0s 63:32 0s]
	VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP4, XTMP1
	// Data bytes [95:64 0s 127:96 0s]
	VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP4, XTMP2

	// clmul
	// xor the results from 4 32-bit words together
	// Calculate lower 32 bits of tag
	VPCLMULQDQ $0x00, KS_L, XTMP1, XTMP3
	VPCLMULQDQ $0x11, KS_L, XTMP1, XTMP4
	VPCLMULQDQ $0x00, KS_M1, XTMP2, XTMP5
	VPCLMULQDQ $0x11, KS_M1, XTMP2, XTMP6

	VPXOR XTMP3, XTMP4, XTMP3
	VPXOR XTMP5, XTMP6, XTMP5
	VPXOR XTMP3, XTMP5, XTMP3

	// Move bits 63-32 of the combined products down to the low 32 bits of the digest
	VMOVQ   XTMP3, XTMP3 // Clear top 64 bits
	VPSRLDQ $4, XTMP3, XDIGEST

	VPCLMULQDQ $0x10, KS_L, XTMP1, XTMP3
	VPCLMULQDQ $0x01, KS_M1, XTMP1, XTMP4
	VPCLMULQDQ $0x10, KS_M1, XTMP2, XTMP5
	VPCLMULQDQ $0x01, KS_M2, XTMP2, XTMP6

	// XOR all the products and keep only bits 63-32
	VPXOR XTMP4, XTMP3, XTMP3
	VPXOR XTMP6, XTMP5, XTMP5
	VPXOR XTMP5, XTMP3, XTMP3
	VPAND bits_32_63<>(SB), XTMP3, XTMP3

	// Merge bits 63-32 into the digest
	VPXOR XTMP3, XDIGEST, XDIGEST

	// Prepare data and calculate bits 95-64 of tag
	VPCLMULQDQ $0x00, KS_M1, XTMP1, XTMP3
	VPCLMULQDQ $0x11, KS_M1, XTMP1, XTMP4
	VPCLMULQDQ $0x00, KS_M2, XTMP2, XTMP5
	VPCLMULQDQ $0x11, KS_M2, XTMP2, XTMP6

	// XOR all the products and move bits 63-32 to bits 95-64
	VPXOR XTMP4, XTMP3, XTMP3
	VPXOR XTMP6, XTMP5, XTMP5
	VPXOR XTMP5, XTMP3, XTMP3

	VPSHUFB shuf_mask_0_0_dw1_0<>(SB), XTMP3, XTMP3

	// Merge bits 95-64 into the digest
	VPXOR XTMP3, XDIGEST, XDIGEST

	// Prepare data and calculate bits 127-96 of tag
	VPCLMULQDQ $0x10, KS_M1, XTMP1, XTMP3
	VPCLMULQDQ $0x01, KS_M2, XTMP1, XTMP4
	VPCLMULQDQ $0x10, KS_M2, XTMP2, XTMP5
	VPCLMULQDQ $0x01, KS_H, XTMP2, XTMP6

	// XOR all the products and move bits 63-32 to bits 127-96
	VPXOR XTMP4, XTMP3, XTMP3
	VPXOR XTMP6, XTMP5, XTMP5
	VPXOR XTMP5, XTMP3, XTMP3

	VPSHUFB shuf_mask_0_0_0_dw1<>(SB), XTMP3, XTMP3

	// Merge bits 127-96 into the digest
	VPXOR XTMP3, XDIGEST, XDIGEST

	// Update tag
	VPXOR   (AX), XDIGEST, XDIGEST
	VMOVDQU XDIGEST, (AX)

	// Copy last 16 bytes of KS to the front
	VMOVDQU (4*4)(BX), XTMP1
	VMOVDQU XTMP1, (0*4)(BX)

	VZEROUPPER
	RET
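
// Note: each round consumes 16 message bytes and 16 bytes (4 words) of
// keystream, then slides keystream words 4-7 down to positions 0-3. The Go
// caller is expected to refill the upper words with freshly generated
// keystream before invoking the next round.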