github.com/emmansun/gmsm@v0.29.1/zuc/eia256_asm_arm64.s

//go:build !purego

#include "textflag.h"

DATA bit_reverse_table<>+0x00(SB)/8, $0x0e060a020c040800
DATA bit_reverse_table<>+0x08(SB)/8, $0x0f070b030d050901
DATA bit_reverse_table<>+0x10(SB)/8, $0xe060a020c0408000
DATA bit_reverse_table<>+0x18(SB)/8, $0xf070b030d0509010
GLOBL bit_reverse_table<>(SB), RODATA, $32

DATA shuf_mask_dw<>+0x00(SB)/8, $0xffffffff03020100
DATA shuf_mask_dw<>+0x08(SB)/8, $0xffffffff07060504
DATA shuf_mask_dw<>+0x10(SB)/8, $0xffffffff0b0a0908
DATA shuf_mask_dw<>+0x18(SB)/8, $0xffffffff0f0e0d0c
GLOBL shuf_mask_dw<>(SB), RODATA, $32

#define AX R2
#define BX R3
#define CX R4
#define DX R5

#define XTMP1 V1
#define XTMP2 V2
#define XTMP3 V3
#define XTMP4 V4
#define XTMP5 V5
#define XTMP6 V6
#define XDATA V7
#define XDIGEST V8
#define KS_L V9
#define KS_M1 V10
#define KS_M2 V11
#define KS_H V12
#define BIT_REV_TAB_L V20
#define BIT_REV_TAB_H V21
#define BIT_REV_AND_TAB V22
#define SHUF_MASK_DW0_DW1 V23
#define SHUF_MASK_DW2_DW3 V24

#define LOAD_GLOBAL_DATA() \
	MOVD $bit_reverse_table<>(SB), R0                          \
	VLD1 (R0), [BIT_REV_TAB_L.B16, BIT_REV_TAB_H.B16]          \
	MOVW $0x0F0F0F0F, R0                                       \
	VDUP R0, BIT_REV_AND_TAB.S4                                \
	MOVD $shuf_mask_dw<>(SB), R0                               \
	VLD1 (R0), [SHUF_MASK_DW0_DW1.B16, SHUF_MASK_DW2_DW3.B16]

// func eia256RoundTag8(t *uint32, keyStream *uint32, p *byte)
TEXT ·eia256RoundTag8(SB),NOSPLIT,$0
	MOVD	t+0(FP), AX
	MOVD	ks+8(FP), BX
	MOVD	p+16(FP), CX

	LOAD_GLOBAL_DATA()

	// Reverse data bytes
	VLD1	(CX), [XDATA.B16]
	VAND	BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
	VUSHR	$4, XDATA.S4, XTMP1.S4
	VAND	BIT_REV_AND_TAB.B16, XTMP1.B16, XTMP1.B16

	VTBL	XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
	VTBL	XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16
	VEOR	XTMP1.B16, XTMP3.B16, XTMP3.B16 // XTMP3 - bit reverse data bytes

	// ZUC authentication part, 4x32 data bits
	// setup KS
	VLD1	(BX), [XTMP1.B16, XTMP2.B16]
	VST1	[XTMP2.B16], (BX) // Copy last 16 bytes of KS to the front
	// TODO: Any better solution???
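	// Note: the immediate mask built below gathers overlapping 32-bit keystream
	// words so that each 64-bit lane of KS_L/KS_M1/KS_M2 holds the keystream
	// window aligned with one zero-extended, bit-reversed 32-bit data word;
	// bits [63:32] of each PMULL product are that data word's contribution to
	// the current tag word, and the four products are XORed before the lane is
	// extracted.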
	VMOVQ	$0x0302010007060504, $0x070605040b0a0908, XTMP4
	VTBL	XTMP4.B16, [XTMP1.B16], KS_L.B16   // KS bits [63:32 31:0 95:64 63:32]
	VTBL	XTMP4.B16, [XTMP2.B16], KS_M2.B16  // KS bits [191:160 159:128 223:192 191:160]
	VDUP	XTMP1.S[3], KS_M1.S4
	VMOV	XTMP1.S[2], KS_M1.S[1]
	VMOV	XTMP2.S[0], KS_M1.S[2]             // KS bits [127:96 95:64 159:128 127:96]

	// setup DATA
	VTBL	SHUF_MASK_DW0_DW1.B16, [XTMP3.B16], XTMP1.B16 // XTMP1 - Data bits [31:0 0s 63:32 0s]
	VTBL	SHUF_MASK_DW2_DW3.B16, [XTMP3.B16], XTMP2.B16 // XTMP2 - Data bits [95:64 0s 127:96 0s]

	// clmul
	// xor the results from 4 32-bit words together

	// Calculate lower 32 bits of tag
	VPMULL	KS_L.D1, XTMP1.D1, XTMP3.Q1
	VPMULL2	KS_L.D2, XTMP1.D2, XTMP4.Q1
	VPMULL	KS_M1.D1, XTMP2.D1, XTMP5.Q1
	VPMULL2	KS_M1.D2, XTMP2.D2, XTMP6.Q1

	VEOR	XTMP3.B16, XTMP4.B16, XTMP3.B16
	VEOR	XTMP5.B16, XTMP6.B16, XTMP5.B16
	VEOR	XTMP3.B16, XTMP5.B16, XTMP3.B16

	// Move previous result to low 32 bits and XOR with previous digest
	VMOV	XTMP3.S[1], XDIGEST.S[0]

	// Prepare data and calculate bits 63-32 of tag
	VEXT	$8, KS_L.B16, KS_L.B16, XTMP5.B16
	VPMULL	XTMP5.D1, XTMP1.D1, XTMP3.Q1
	VEXT	$8, XTMP1.B16, XTMP1.B16, XTMP5.B16
	VPMULL	KS_M1.D1, XTMP5.D1, XTMP4.Q1
	VEXT	$8, KS_M1.B16, KS_M1.B16, XTMP1.B16
	VPMULL	XTMP1.D1, XTMP2.D1, XTMP5.Q1
	VEXT	$8, XTMP2.B16, XTMP2.B16, XTMP1.B16
	VPMULL	KS_M2.D1, XTMP1.D1, XTMP6.Q1

	VEOR	XTMP3.B16, XTMP4.B16, XTMP3.B16
	VEOR	XTMP5.B16, XTMP6.B16, XTMP5.B16
	VEOR	XTMP3.B16, XTMP5.B16, XTMP3.B16

	VMOV	XTMP3.S[1], XDIGEST.S[1]

	VMOV	XDIGEST.D[0], R10
	MOVD	(AX), R11
	EOR	R10, R11
	MOVD	R11, (AX)

	RET

// func eia256RoundTag16(t *uint32, keyStream *uint32, p *byte)
TEXT ·eia256RoundTag16(SB),NOSPLIT,$0
	MOVD	t+0(FP), AX
	MOVD	ks+8(FP), BX
	MOVD	p+16(FP), CX

	LOAD_GLOBAL_DATA()

	// Reverse data bytes
	VLD1	(CX), [XDATA.B16]
	VAND	BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
	VUSHR	$4, XDATA.S4, XTMP1.S4
	VAND	BIT_REV_AND_TAB.B16, XTMP1.B16, XTMP1.B16

	VTBL	XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
	VTBL	XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16
	VEOR	XTMP1.B16, XTMP3.B16, XTMP3.B16 // XTMP3 - bit reverse data bytes

	// ZUC authentication part, 4x32 data bits
	// setup KS
	VLD1	(BX), [XTMP1.B16, XTMP2.B16]
	VST1	[XTMP2.B16], (BX) // Copy last 16 bytes of KS to the front
	// TODO: Any better solution??? We can use VTBL, but there is no performance improvement if we can't reuse the MASK constant
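	// Compared to eia256RoundTag8, the 128-bit tag also needs keystream bits
	// [255:192] (KS_H below): the windows feeding tag words 2 and 3 extend
	// past bit 191 of the keystream.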
	VMOVQ	$0x0302010007060504, $0x070605040b0a0908, XTMP4
	VTBL	XTMP4.B16, [XTMP1.B16], KS_L.B16   // KS bits [63:32 31:0 95:64 63:32]
	VTBL	XTMP4.B16, [XTMP2.B16], KS_M2.B16  // KS bits [191:160 159:128 223:192 191:160]
	VMOVQ	$0x0b0a09080f0e0d0c, $0x0b0a09080f0e0d0c, XTMP4
	VTBL	XTMP4.B16, [XTMP2.B16], KS_H.B16   // KS bits [255:224 223:192 255:224 223:192]
	VDUP	XTMP1.S[3], KS_M1.S4
	VMOV	XTMP1.S[2], KS_M1.S[1]
	VMOV	XTMP2.S[0], KS_M1.S[2]             // KS bits [127:96 95:64 159:128 127:96]

	// setup DATA
	VTBL	SHUF_MASK_DW0_DW1.B16, [XTMP3.B16], XTMP1.B16 // XTMP1 - Data bits [31:0 0s 63:32 0s]
	VTBL	SHUF_MASK_DW2_DW3.B16, [XTMP3.B16], XTMP2.B16 // XTMP2 - Data bits [95:64 0s 127:96 0s]

	// clmul
	// xor the results from 4 32-bit words together

	// Calculate lower 32 bits of tag
	VPMULL	KS_L.D1, XTMP1.D1, XTMP3.Q1
	VPMULL2	KS_L.D2, XTMP1.D2, XTMP4.Q1
	VPMULL	KS_M1.D1, XTMP2.D1, XTMP5.Q1
	VPMULL2	KS_M1.D2, XTMP2.D2, XTMP6.Q1

	VEOR	XTMP3.B16, XTMP4.B16, XTMP3.B16
	VEOR	XTMP5.B16, XTMP6.B16, XTMP5.B16
	VEOR	XTMP3.B16, XTMP5.B16, XTMP3.B16

	// Move previous result to low 32 bits and XOR with previous digest
	VMOV	XTMP3.S[1], XDIGEST.S[0]

	// Prepare data and calculate bits 63-32 of tag
	VEXT	$8, KS_L.B16, KS_L.B16, XTMP5.B16
	VPMULL	XTMP5.D1, XTMP1.D1, XTMP3.Q1
	VEXT	$8, XTMP1.B16, XTMP1.B16, XTMP5.B16
	VPMULL	KS_M1.D1, XTMP5.D1, XTMP4.Q1
	VEXT	$8, KS_M1.B16, KS_M1.B16, XTMP6.B16
	VPMULL	XTMP6.D1, XTMP2.D1, XTMP5.Q1
	VEXT	$8, XTMP2.B16, XTMP2.B16, KS_L.B16
	VPMULL	KS_M2.D1, KS_L.D1, XTMP6.Q1

	// XOR all the products and keep only 32-63 bits
	VEOR	XTMP3.B16, XTMP4.B16, XTMP3.B16
	VEOR	XTMP5.B16, XTMP6.B16, XTMP5.B16
	VEOR	XTMP3.B16, XTMP5.B16, XTMP3.B16

	VMOV	XTMP3.S[1], XDIGEST.S[1]

	// Prepare data and calculate bits 95-64 of tag
	VPMULL	KS_M1.D1, XTMP1.D1, XTMP3.Q1
	VPMULL2	KS_M1.D2, XTMP1.D2, XTMP4.Q1
	VPMULL	KS_M2.D1, XTMP2.D1, XTMP5.Q1
	VPMULL2	KS_M2.D2, XTMP2.D2, XTMP6.Q1

	// XOR all the products and move bits 63-32 to bits 95-64
	VEOR	XTMP3.B16, XTMP4.B16, XTMP3.B16
	VEOR	XTMP5.B16, XTMP6.B16, XTMP5.B16
	VEOR	XTMP3.B16, XTMP5.B16, XTMP3.B16

	VMOV	XTMP3.S[1], XDIGEST.S[2]

	// Prepare data and calculate bits 127-96 of tag
	VEXT	$8, KS_M1.B16, KS_M1.B16, XTMP5.B16
	VPMULL	XTMP5.D1, XTMP1.D1, XTMP3.Q1
	VEXT	$8, XTMP1.B16, XTMP1.B16, XTMP5.B16
	VPMULL	KS_M2.D1, XTMP5.D1, XTMP4.Q1
	VEXT	$8, KS_M2.B16, KS_M2.B16, XTMP6.B16
	VPMULL	XTMP6.D1, XTMP2.D1, XTMP5.Q1
	VEXT	$8, XTMP2.B16, XTMP2.B16, KS_L.B16
	VPMULL	KS_H.D1, KS_L.D1, XTMP6.Q1

	// XOR all the products and move bits 63-32 to bits 127-96
	VEOR	XTMP3.B16, XTMP4.B16, XTMP3.B16
	VEOR	XTMP5.B16, XTMP6.B16, XTMP5.B16
	VEOR	XTMP3.B16, XTMP5.B16, XTMP3.B16

	VMOV	XTMP3.S[1], XDIGEST.S[3]

	VLD1	(AX), [XTMP1.B16]
	VEOR	XTMP1.B16, XDIGEST.B16, XDIGEST.B16
	VST1	[XDIGEST.B16], (AX)

	RET
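
// For reference, the per-block accumulation both routines above implement can
// be modeled in plain Go roughly as follows. This is a rough sketch, not the
// package's actual generic fallback: the helper name, the []uint32/[]byte
// parameter shapes and the tag word ordering are assumptions. For every set
// message bit i (bits taken MSB-first within bytes), the k-th 32-bit tag word
// absorbs the 32-bit keystream window starting at keystream bit 32*k+i, so ks
// needs at least 4+len(tag) words (the assembly loads eight). The sketch also
// ignores the in-place keystream shift both routines perform (storing the last
// 16 keystream bytes back at the front of the buffer).
//
//	func eiaRoundRef(tag []uint32, ks []uint32, p []byte) {
//		// keystream bit j, MSB-first within the 32-bit keystream words
//		ksBit := func(j int) uint32 {
//			return (ks[j/32] >> (31 - j%32)) & 1
//		}
//		// 32-bit keystream window z[j..j+31]
//		ksWin := func(j int) uint32 {
//			var w uint32
//			for b := 0; b < 32; b++ {
//				w = w<<1 | ksBit(j+b)
//			}
//			return w
//		}
//		for i := 0; i < 128; i++ { // 128 message bits (16 bytes) per round
//			if (p[i/8]>>(7-i%8))&1 == 1 {
//				for k := range tag { // 2 words for Tag8, 4 words for Tag16
//					tag[k] ^= ksWin(32*k + i)
//				}
//			}
//		}
//	}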