// Referenced Intel(R) Multi-Buffer Crypto for IPsec
// https://github.com/intel/intel-ipsec-mb/
//go:build !purego

#include "textflag.h"

// Nibble bit-reversal lookup tables, used as PSHUFB/VPSHUFB sources:
//   bit_reverse_table_l[n] = bitreverse4(n)        (reversed nibble in the low half of the byte)
//   bit_reverse_table_h[n] = bitreverse4(n) << 4   (reversed nibble in the high half of the byte)
DATA bit_reverse_table_l<>+0x00(SB)/8, $0x0e060a020c040800
DATA bit_reverse_table_l<>+0x08(SB)/8, $0x0f070b030d050901
GLOBL bit_reverse_table_l<>(SB), RODATA, $16

DATA bit_reverse_table_h<>+0x00(SB)/8, $0xe060a020c0408000
DATA bit_reverse_table_h<>+0x08(SB)/8, $0xf070b030d0509010
GLOBL bit_reverse_table_h<>(SB), RODATA, $16

// Per-byte low-nibble mask (0x0f repeated); also used via PANDN to isolate
// the high nibbles.
DATA bit_reverse_and_table<>+0x00(SB)/8, $0x0f0f0f0f0f0f0f0f
DATA bit_reverse_and_table<>+0x08(SB)/8, $0x0f0f0f0f0f0f0f0f
GLOBL bit_reverse_and_table<>(SB), RODATA, $16

// Shuffle masks that spread two dwords into the low halves of the two qword
// lanes, zeroing the upper halves (0xff index => zero byte):
//   dw0_0_dw1_0: [dw0, 0, dw1, 0]
//   dw2_0_dw3_0: [dw2, 0, dw3, 0]
// This leaves each 32-bit data word alone in a 64-bit lane, ready for a
// 64x64 carry-less multiply against a key-stream window.
DATA shuf_mask_dw0_0_dw1_0<>+0x00(SB)/8, $0xffffffff03020100
DATA shuf_mask_dw0_0_dw1_0<>+0x08(SB)/8, $0xffffffff07060504
GLOBL shuf_mask_dw0_0_dw1_0<>(SB), RODATA, $16

DATA shuf_mask_dw2_0_dw3_0<>+0x00(SB)/8, $0xffffffff0b0a0908
DATA shuf_mask_dw2_0_dw3_0<>+0x08(SB)/8, $0xffffffff0f0e0d0c
GLOBL shuf_mask_dw2_0_dw3_0<>(SB), RODATA, $16

// Register aliases: XTMP1-XTMP6 are scratch; XDATA holds the 16 message
// bytes; XDIGEST accumulates the tag contribution; KS_L/KS_M1/KS_M2/KS_H
// hold shuffled key-stream windows (KS_M2/KS_H are reserved for the wider
// variants of this routine).
#define XTMP1 X1
#define XTMP2 X2
#define XTMP3 X3
#define XTMP4 X4
#define XTMP5 X5
#define XTMP6 X6
#define XDATA X7
#define XDIGEST X8
#define KS_L X9
#define KS_M1 X10
#define KS_M2 X11
#define KS_H X12

// func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int)
//
// One 16-byte round of the ZUC EIA3 universal-hash update: bit-reverses the
// 16 data bytes, carry-less multiplies each 32-bit data word against the
// matching 64-bit sliding window of key stream, XORs the four products
// together, and folds bits [63:32] of the result into *t. Finally it slides
// the key-stream buffer: ks[4..7] is copied down to ks[0..3] so the caller
// can refill the tail with fresh key stream.
//
// In:  AX = t, BX = keyStream (at least 8 dwords), CX = p (16 bytes),
//      DX = tagSize.
// NOTE(review): tagSize is loaded into DX but not referenced afterwards in
// this routine — presumably kept for signature symmetry with the wider
// rounds; confirm against the callers.
//
// Dispatches on ·useAVX: SSE path first, AVX path at label avx.
TEXT ·eia3Round16B(SB),NOSPLIT,$0
	MOVQ t+0(FP), AX
	MOVQ ks+8(FP), BX
	MOVQ p+16(FP), CX
	MOVQ tagSize+24(FP), DX

	CMPB ·useAVX(SB), $1
	JE   avx

	// --- SSE path ---

	// Reverse data bytes: split each byte into nibbles, bit-reverse each
	// nibble via table lookup, and recombine with the nibbles swapped.
	MOVUPS (0)(CX), XDATA
	MOVOU bit_reverse_and_table<>(SB), XTMP4
	MOVOU XDATA, XTMP2
	PAND XTMP4, XTMP2                  // XTMP2 = low nibbles of data

	PANDN XDATA, XTMP4                 // XTMP4 = high nibbles (low nibbles zeroed)
	PSRLQ $4, XTMP4                    // shift high nibbles down; safe across bytes
	                                   // because the masked low nibbles are zero

	MOVOU bit_reverse_table_h<>(SB), XTMP3
	PSHUFB XTMP2, XTMP3                // reversed low nibbles -> high positions

	MOVOU bit_reverse_table_l<>(SB), XTMP1
	PSHUFB XTMP4, XTMP1                // reversed high nibbles -> low positions

	PXOR XTMP1, XTMP3                  // XTMP3 - bit reverse data bytes

	// ZUC authentication part, 4x32 data bits
	// setup KS: build overlapping 64-bit key-stream windows with PSHUFD
	// (imm $0x61 selects dwords [1,0,2,1] of the source).
	MOVUPS (0*4)(BX), XTMP1
	MOVUPS (2*4)(BX), XTMP2
	PSHUFD $0x61, XTMP1, KS_L          // KS bits [63:32 31:0 95:64 63:32]
	PSHUFD $0x61, XTMP2, KS_M1         // KS bits [127:96 95:64 159:128 127:96]

	// setup DATA: isolate each 32-bit data word in its own 64-bit lane
	MOVOU XTMP3, XTMP1
	PSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP1
	MOVOU XTMP1, XTMP2                 // XTMP1/2 - Data bits [31:0 0s 63:32 0s]

	PSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP3
	MOVOU XTMP3, XDIGEST               // XDIGEST/XTMP3 - Data bits [95:64 0s 127:96 0s]

	// clmul: four 64x64 carry-less multiplies, one per 32-bit data word
	// against its key-stream window ($0x00 = low qwords, $0x11 = high qwords);
	// xor the results from 4 32-bit words together.
	// Calculate lower 32 bits of tag.
	PCLMULQDQ $0x00, KS_L, XTMP1
	PCLMULQDQ $0x11, KS_L, XTMP2
	PCLMULQDQ $0x00, KS_M1, XDIGEST
	PCLMULQDQ $0x11, KS_M1, XTMP3

	// XOR all products and move bits [63:32] down to the low 32 bits
	PXOR XTMP1, XTMP2
	PXOR XTMP3, XDIGEST
	PXOR XTMP2, XDIGEST
	PSRLDQ $4, XDIGEST

	// Update tag: *t ^= folded 32-bit contribution
	MOVL XDIGEST, R10
	XORL R10, (AX)

	// Copy last 16 bytes of KS to the front (slide window by 16 bytes)
	MOVUPS (4*4)(BX), XTMP1
	MOVUPS XTMP1, (0*4)(BX)

	RET

avx:
	// --- AVX path: same algorithm using 3-operand forms ---
	VMOVDQU (0)(CX), XDATA

	// Reverse data bytes (nibble-table lookups as in the SSE path; VPSRLD
	// is safe here because the masked low nibbles are already zero)
	VMOVDQU bit_reverse_and_table<>(SB), XTMP1
	VPAND XTMP1, XDATA, XTMP2          // low nibbles
	VPANDN XDATA, XTMP1, XTMP3         // high nibbles
	VPSRLD $4, XTMP3, XTMP3

	VMOVDQU bit_reverse_table_h<>(SB), XTMP1
	VPSHUFB XTMP2, XTMP1, XTMP4        // reversed low nibbles -> high positions
	VMOVDQU bit_reverse_table_l<>(SB), XTMP1
	VPSHUFB XTMP3, XTMP1, XTMP1        // reversed high nibbles -> low positions
	VPOR XTMP1, XTMP4, XTMP4           // XTMP4 - bit-reversed data bytes

	// ZUC authentication part, 4x32 data bits
	// setup KS (memory-operand shuffles straight from the KS buffer)
	VPSHUFD $0x61, (0*4)(BX), KS_L     // KS bits [63:32 31:0 95:64 63:32]
	VPSHUFD $0x61, (2*4)(BX), KS_M1    // KS bits [127:96 95:64 159:128 127:96]

	// setup DATA
	// Data bytes [31:0 0s 63:32 0s]
	VPSHUFB shuf_mask_dw0_0_dw1_0<>(SB), XTMP4, XTMP1
	// Data bytes [95:64 0s 127:96 0s]
	VPSHUFB shuf_mask_dw2_0_dw3_0<>(SB), XTMP4, XTMP2

	// clmul
	// xor the results from 4 32-bit words together
	// Calculate lower 32 bits of tag
	VPCLMULQDQ $0x00, KS_L, XTMP1, XTMP3
	VPCLMULQDQ $0x11, KS_L, XTMP1, XTMP4
	VPCLMULQDQ $0x00, KS_M1, XTMP2, XTMP5
	VPCLMULQDQ $0x11, KS_M1, XTMP2, XTMP6

	VPXOR XTMP3, XTMP4, XTMP3
	VPXOR XTMP5, XTMP6, XTMP5
	VPXOR XTMP3, XTMP5, XDIGEST

	// Fold bits [63:32] of the combined product into the tag: *t ^= result
	VMOVQ XDIGEST, R10
	SHRQ $32, R10
	XORL R10, (AX)

	// Copy last 16 bytes of KS to the front (slide window by 16 bytes)
	VMOVDQU (4*4)(BX), XTMP1
	VMOVDQU XTMP1, (0*4)(BX)

	VZEROUPPER
	RET