github.com/emmansun/gmsm@v0.29.1/zuc/asm_arm64.s

//go:build !purego

#include "textflag.h"

// Byte masks for the per-byte rotate-left-by-5 in Rotl_5.
DATA Top3_Bottom5_bits_of_the_byte<>+0x00(SB)/8, $0xe0e0e0e0e0e0e0e0
DATA Top3_Bottom5_bits_of_the_byte<>+0x08(SB)/8, $0xe0e0e0e0e0e0e0e0
DATA Top3_Bottom5_bits_of_the_byte<>+0x10(SB)/8, $0x1f1f1f1f1f1f1f1f
DATA Top3_Bottom5_bits_of_the_byte<>+0x18(SB)/8, $0x1f1f1f1f1f1f1f1f
GLOBL Top3_Bottom5_bits_of_the_byte<>(SB), RODATA, $32

// Nibble lookup tables P1, P2 and P3 used by S0_comput.
DATA P123_data<>+0x00(SB)/8, $0x0A020F0F0E000F09
DATA P123_data<>+0x08(SB)/8, $0x090305070C000400
DATA P123_data<>+0x10(SB)/8, $0x040C000705060D08
DATA P123_data<>+0x18(SB)/8, $0x0209030F0A0E010B
DATA P123_data<>+0x20(SB)/8, $0x0F0A0D00060A0602
DATA P123_data<>+0x28(SB)/8, $0x0D0C0900050D0303
GLOBL P123_data<>(SB), RODATA, $48

// Affine transform 1 & 2 (low and high nibbles) used by S1_comput.
DATA m1_2<>+0x00(SB)/8, $0x1D1C9F9E83820100
DATA m1_2<>+0x08(SB)/8, $0x3938BBBAA7A62524
DATA m1_2<>+0x10(SB)/8, $0xA174A97CDD08D500
DATA m1_2<>+0x18(SB)/8, $0x3DE835E04194499C
DATA m1_2<>+0x20(SB)/8, $0xA8BC0216D9CD7367
DATA m1_2<>+0x28(SB)/8, $0x1F0BB5A16E7AC4D0
DATA m1_2<>+0x30(SB)/8, $0x638CFA1523CCBA55
DATA m1_2<>+0x38(SB)/8, $0x3FD0A6497F90E609
GLOBL m1_2<>(SB), RODATA, $64

// Inverse-ShiftRows permutation, cancelling the ShiftRows step of AESE.
DATA Shuf_mask<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA Shuf_mask<>+0x08(SB)/8, $0x0306090C0F020508
GLOBL Shuf_mask<>(SB), RODATA, $16

// Byte masks used to interleave the S0 and S1 sbox results.
DATA mask_S01<>+0x00(SB)/8, $0xff00ff00ff00ff00
DATA mask_S01<>+0x08(SB)/8, $0xff00ff00ff00ff00
DATA mask_S01<>+0x10(SB)/8, $0x00ff00ff00ff00ff
DATA mask_S01<>+0x18(SB)/8, $0x00ff00ff00ff00ff
GLOBL mask_S01<>(SB), RODATA, $32

// x86-style aliases for the general-purpose registers.
#define SI R0
#define DI R1
#define BP R2
#define AX R3
#define BX R4
#define CX R5
#define DX R6

#define ZERO V16
#define TOP3_BITS V19
#define BOTTOM5_BITS V20
#define NIBBLE_MASK V21
#define INVERSE_SHIFT_ROWS V22
#define M1L V23
#define M1H V24
#define M2L V25
#define M2H V26
#define P1 V27
#define P2 V28
#define P3 V29
#define S0_MASK V30
#define S1_MASK V31

// Offsets of F_R1, F_R2 and BRC_X0..BRC_X3 in zucState32, after the
// sixteen 32-bit LFSR words.
#define OFFSET_FR1 (16*4)
#define OFFSET_FR2 (17*4)
#define OFFSET_BRC_X0 (18*4)
#define OFFSET_BRC_X1 (19*4)
#define OFFSET_BRC_X2 (20*4)
#define OFFSET_BRC_X3 (21*4)

#define LOAD_GLOBAL_DATA() \
    MOVW $0x0F0F0F0F, R0                            \
    VDUP R0, NIBBLE_MASK.S4                         \
    MOVD $Top3_Bottom5_bits_of_the_byte<>(SB), R0   \
    VLD1 (R0), [TOP3_BITS.B16, BOTTOM5_BITS.B16]    \
    MOVD $m1_2<>(SB), R0                            \
    VLD1 (R0), [M1L.B16, M1H.B16, M2L.B16, M2H.B16] \
    MOVD $P123_data<>(SB), R0                       \
    VLD1 (R0), [P1.B16, P2.B16, P3.B16]             \
    MOVD $mask_S01<>(SB), R0                        \
    VLD1 (R0), [S0_MASK.B16, S1_MASK.B16]           \
    MOVD $Shuf_mask<>(SB), R0                       \
    VLD1 (R0), [INVERSE_SHIFT_ROWS.B16]

#define SHLDL(a, b, n) \ // a = (a << n) | (b >> n), clobbering b; arm64 has no x86-style SHLDL, so compose it
    LSLW n, a \
    LSRW n, b \
    ORRW b, a

// Rotl_5 rotates every byte of XDATA left by 5 bits.
#define Rotl_5(XDATA, XTMP0) \
    VSHL $5, XDATA.S4, XTMP0.S4                 \
    VUSHR $3, XDATA.S4, XDATA.S4                \
    VAND TOP3_BITS.B16, XTMP0.B16, XTMP0.B16    \
    VAND BOTTOM5_BITS.B16, XDATA.B16, XDATA.B16 \
    VORR XTMP0.B16, XDATA.B16, XDATA.B16

// S0_comput applies the ZUC S0 sbox to all 16 bytes of IN_OUT, using the
// nibble tables P1/P2/P3 and a final per-byte rotate (see the sketch below).
#define S0_comput(IN_OUT, XTMP1, XTMP2) \
    VUSHR $4, IN_OUT.S4, XTMP1.S4                \ // XTMP1 = high nibbles
    VAND NIBBLE_MASK.B16, XTMP1.B16, XTMP1.B16   \
    \
    VAND NIBBLE_MASK.B16, IN_OUT.B16, IN_OUT.B16 \ // IN_OUT = low nibbles
    \
    VTBL IN_OUT.B16, [P1.B16], XTMP2.B16         \
    VEOR XTMP1.B16, XTMP2.B16, XTMP2.B16         \ // q = P1[l] ^ h
    \
    VTBL XTMP2.B16, [P2.B16], XTMP1.B16          \
    VEOR IN_OUT.B16, XTMP1.B16, XTMP1.B16        \ // r = P2[q] ^ l
    \
    VTBL XTMP1.B16, [P3.B16], IN_OUT.B16         \
    VEOR XTMP2.B16, IN_OUT.B16, IN_OUT.B16       \ // t = P3[r] ^ q
    \
    VSHL $4, IN_OUT.S4, IN_OUT.S4                \
    VEOR XTMP1.B16, IN_OUT.B16, IN_OUT.B16       \ // (t << 4) | r
    Rotl_5(IN_OUT, XTMP1)
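
// For reference, the per-byte computation above is, in pseudocode (P1, P2
// and P3 are the 16-entry nibble tables; b is the input byte):
//
//    h, l := b>>4, b&0x0f
//    q := P1[l] ^ h
//    r := P2[q] ^ l
//    t := P3[r] ^ q
//    S0(b) = rotl8((t<<4)|r, 5)
//
// Using VSHL/VUSHR on 32-bit lanes as per-byte shifts is safe here because
// the bits that would cross byte boundaries are either zero or masked off.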
// Affine Transform
// parameters:
// - L: table of low nibbles
// - H: table of high nibbles
// - x: 128-bit register, sbox input/output data
// - y: 128-bit temp register
// - z: 128-bit temp register
#define AFFINE_TRANSFORM(L, H, x, y, z) \
    VAND x.B16, NIBBLE_MASK.B16, z.B16; \
    VTBL z.B16, [L.B16], y.B16;         \
    VUSHR $4, x.D2, x.D2;               \
    VAND x.B16, NIBBLE_MASK.B16, z.B16; \
    VTBL z.B16, [H.B16], z.B16;         \
    VEOR y.B16, z.B16, x.B16

// S1_comput applies the ZUC S1 sbox to all 16 bytes of x: an affine
// transform, then AESE with a zero round key for the AES SubBytes step
// (the VTBL with INVERSE_SHIFT_ROWS pre-permutes the bytes so the
// ShiftRows inside AESE cancels out), then a second affine transform.
#define S1_comput(x, XTMP1, XTMP2) \
    AFFINE_TRANSFORM(M1L, M1H, x, XTMP1, XTMP2); \
    VTBL INVERSE_SHIFT_ROWS.B16, [x.B16], x.B16; \
    AESE ZERO.B16, x.B16;                        \
    AFFINE_TRANSFORM(M2L, M2H, x, XTMP1, XTMP2)

// BITS_REORG builds X0..X3 (left in R12..R15) from halves of the 31-bit
// LFSR words s15, s14, s11, s9, s7, s5, s2 and s0, taken relative to the
// logical LFSR start index idx.
#define BITS_REORG(idx) \
    MOVW (((15 + idx) % 16)*4)(SI), R12 \
    MOVW (((14 + idx) % 16)*4)(SI), AX  \
    MOVW (((11 + idx) % 16)*4)(SI), R13 \
    MOVW (((9 + idx) % 16)*4)(SI), BX   \
    MOVW (((7 + idx) % 16)*4)(SI), R14  \
    MOVW (((5 + idx) % 16)*4)(SI), CX   \
    MOVW (((2 + idx) % 16)*4)(SI), R15  \
    MOVW (((0 + idx) % 16)*4)(SI), DX   \
    LSRW $15, R12        \
    LSLW $16, AX         \
    LSLW $1, BX          \
    LSLW $1, CX          \
    LSLW $1, DX          \
    SHLDL(R12, AX, $16)  \ // X0 = s15H || s14L
    SHLDL(R13, BX, $16)  \ // X1 = s11L || s9H
    SHLDL(R14, CX, $16)  \ // X2 = s7L || s5H
    SHLDL(R15, DX, $16)    // X3 = s2L || s0H

// LFSR_UPDT computes
//   s16 = (2^15*s15 + 2^17*s13 + 2^21*s10 + 2^20*s4 + (1+2^8)*s0 + W) mod (2^31 - 1)
// with W supplied in AX (zero in keystream work mode) and stores s16 at
// logical index idx.
#define LFSR_UPDT(idx) \
    MOVW (((0 + idx) % 16)*4)(SI), BX  \
    MOVW (((4 + idx) % 16)*4)(SI), CX  \
    MOVW (((10 + idx) % 16)*4)(SI), DX \
    MOVW (((13 + idx) % 16)*4)(SI), R8 \
    MOVW (((15 + idx) % 16)*4)(SI), R9 \
    ADD BX, AX  \
    LSL $8, BX  \
    LSL $20, CX \
    LSL $21, DX \
    LSL $17, R8 \
    LSL $15, R9 \
    ADD BX, AX  \
    ADD CX, AX  \
    ADD DX, AX  \
    ADD R8, AX  \
    ADD R9, AX  \
    \ // fold the 64-bit sum mod (2^31 - 1); twice, since the first fold can overflow 31 bits again
    LSR $31, AX, BX     \
    AND $0x7FFFFFFF, AX \
    ADD BX, AX          \
    \
    LSR $31, AX, BX     \
    AND $0x7FFFFFFF, AX \
    ADD BX, AX          \
    \
    MOVW AX, (((0 + idx) % 16)*4)(SI)

// NONLIN_FUN is the ZUC nonlinear function F. On entry R10/R11 hold
// F_R1/F_R2 and R12..R15 hold BRC_X0..BRC_X3; on exit AX = W and R10/R11
// hold the updated F_R1/F_R2.
#define NONLIN_FUN() \
    EORW R10, R12, AX    \
    ADDW R11, AX         \ // W = (BRC_X0 ^ F_R1) + F_R2
    ADDW R13, R10        \ // W1 = F_R1 + BRC_X1
    EORW R14, R11        \ // W2 = F_R2 ^ BRC_X2
    \
    LSLW $16, R10, DX    \
    LSRW $16, R11, CX    \
    ORRW CX, DX          \ // P = (W1 << 16) | (W2 >> 16)
    SHLDL(R11, R10, $16) \ // Q = (W2 << 16) | (W1 >> 16)
    RORW $30, DX, BX     \
    RORW $22, DX, CX     \
    RORW $14, DX, R8     \
    RORW $8, DX, R9      \
    EORW BX, DX          \
    EORW CX, DX          \
    EORW R8, DX          \
    EORW R9, DX          \ // U = L1(P); the W-form ops keep the upper 32 bits of DX zero
    RORW $24, R11, BX    \
    RORW $18, R11, CX    \
    RORW $10, R11, R8    \
    RORW $2, R11, R9     \
    EORW BX, R11         \
    EORW CX, R11         \
    EORW R8, R11         \
    EORW R9, R11         \ // V = L2(Q); upper 32 bits of R11 zero
    LSL $32, R11         \
    EOR R11, DX          \ // DX = V || U
    VDUP DX, V0.D2       \
    VMOV V0.B16, V1.B16  \
    S0_comput(V1, V2, V3) \
    S1_comput(V0, V2, V3) \
    \
    VAND S1_MASK.B16, V0.B16, V0.B16 \ // interleave the S0 and S1 result bytes
    VAND S0_MASK.B16, V1.B16, V1.B16 \
    VEOR V1.B16, V0.B16, V0.B16      \
    \
    VMOV V0.S[0], R10 \ // F_R1
    VMOV V0.S[1], R11   // F_R2

// RESTORE_LFSR_0 rotates the 16-word LFSR array left by one word, moving
// the old s0 to the end.
#define RESTORE_LFSR_0() \
    MOVW.P 4(SI), AX  \ // AX = s0, SI temporarily advanced past it
    VLD1 (SI), [V0.B16, V1.B16, V2.B16] \ // s1..s12
    SUB $4, SI        \
    MOVD (52)(SI), BX \ // s13, s14
    MOVW (60)(SI), CX \ // s15
    \
    VST1 [V0.B16, V1.B16, V2.B16], (SI) \
    MOVD BX, (48)(SI) \
    MOVW CX, (56)(SI) \
    MOVW AX, (60)(SI)

// RESTORE_LFSR_2 rotates the LFSR array left by two words.
#define RESTORE_LFSR_2() \
    MOVD.P 8(SI), AX  \ // s0, s1
    VLD1 (SI), [V0.B16, V1.B16, V2.B16] \ // s2..s13
    SUB $8, SI        \
    MOVD (56)(SI), BX \ // s14, s15
    \
    VST1 [V0.B16, V1.B16, V2.B16], (SI) \
    MOVD BX, (48)(SI) \
    MOVD AX, (56)(SI)
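
// The round macros address the LFSR as ((i + idx) % 16) instead of shifting
// memory every round, so after a batch of n rounds (n < 16) the logical s0
// sits n words into the array. The RESTORE_LFSR_n macros above and below
// rotate the 16 words left by n so the next batch can again start at idx 0;
// a full batch of 16 wraps around completely and needs no fix-up.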
// RESTORE_LFSR_4 rotates the LFSR array left by four words.
#define RESTORE_LFSR_4() \
    VLD1 (SI), [V0.B16, V1.B16, V2.B16, V3.B16] \ // s0..s15
    \
    VST1.P [V1.B16, V2.B16, V3.B16], 48(SI) \ // s4..s15 to offset 0
    VST1 [V0.B16], (SI)                     \ // s0..s3 to offset 48
    SUB $48, SI

// RESTORE_LFSR_8 rotates the LFSR array left by eight words.
#define RESTORE_LFSR_8() \
    VLD1 (SI), [V0.B16, V1.B16, V2.B16, V3.B16] \ // s0..s15
    \
    VST1.P [V2.B16, V3.B16], 32(SI) \ // s8..s15 to offset 0
    VST1 [V0.B16, V1.B16], (SI)     \ // s0..s7 to offset 32
    SUB $32, SI

// LOAD_STATE copies F_R1, F_R2 and BRC_X0..BRC_X3 into R10..R15.
#define LOAD_STATE(r) \
    MOVW 64+r, R10 \
    MOVW 68+r, R11 \
    MOVW 72+r, R12 \
    MOVW 76+r, R13 \
    MOVW 80+r, R14 \
    MOVW 84+r, R15

// SAVE_STATE writes F_R1, F_R2 and BRC_X0..BRC_X3 back to the state.
#define SAVE_STATE(r) \
    MOVW R10, 64+r \
    MOVW R11, 68+r \
    MOVW R12, 72+r \
    MOVW R13, 76+r \
    MOVW R14, 80+r \
    MOVW R15, 84+r

// func genKeywordAsm(s *zucState32) uint32
TEXT ·genKeywordAsm(SB),NOSPLIT,$0
    LOAD_GLOBAL_DATA()
    VEOR ZERO.B16, ZERO.B16, ZERO.B16

    MOVD pState+0(FP), SI
    LOAD_STATE(0(SI))

    BITS_REORG(0)
    NONLIN_FUN()

    EORW R15, AX          // z = W ^ BRC_X3
    MOVW AX, ret+8(FP)
    EOR AX, AX            // work mode: clear AX so LFSR_UPDT adds no W term
    LFSR_UPDT(0)
    SAVE_STATE(0(SI))
    RESTORE_LFSR_0()

    RET

// ONEROUND generates the keystream word for logical round idx and stores
// it at (idx*4)(DI).
#define ONEROUND(idx) \
    BITS_REORG(idx) \
    NONLIN_FUN()    \
    EORW R15, AX    \ // z = W ^ BRC_X3
    MOVW AX, (idx*4)(DI) \
    EOR AX, AX      \ // work mode: clear AX so LFSR_UPDT adds no W term
    LFSR_UPDT(idx)

// ROUND_REV32 is ONEROUND with the keyword byte-swapped before the store,
// so the keystream is written big-endian.
#define ROUND_REV32(idx) \
    BITS_REORG(idx) \
    NONLIN_FUN()    \
    EORW R15, AX    \
    REVW AX, AX     \
    MOVW AX, (idx*4)(DI) \
    EOR AX, AX      \
    LFSR_UPDT(idx)

// func genKeyStreamAsm(keyStream []uint32, pState *zucState32)
TEXT ·genKeyStreamAsm(SB),NOSPLIT,$0
    LOAD_GLOBAL_DATA()
    VEOR ZERO.B16, ZERO.B16, ZERO.B16

    MOVD ks+0(FP), DI
    MOVD ks_len+8(FP), BP
    MOVD pState+24(FP), SI

    LOAD_STATE(0(SI))

zucSixteens:
    // Generate 16 words per pass; the LFSR indices wrap mod 16, so no
    // restore is needed here.
    CMP $16, BP
    BLT zucOctet
    SUB $16, BP
    ONEROUND(0)
    ONEROUND(1)
    ONEROUND(2)
    ONEROUND(3)
    ONEROUND(4)
    ONEROUND(5)
    ONEROUND(6)
    ONEROUND(7)
    ONEROUND(8)
    ONEROUND(9)
    ONEROUND(10)
    ONEROUND(11)
    ONEROUND(12)
    ONEROUND(13)
    ONEROUND(14)
    ONEROUND(15)
    ADD $4*16, DI
    B zucSixteens

zucOctet:
    CMP $8, BP
    BLT zucNibble
    SUB $8, BP
    ONEROUND(0)
    ONEROUND(1)
    ONEROUND(2)
    ONEROUND(3)
    ONEROUND(4)
    ONEROUND(5)
    ONEROUND(6)
    ONEROUND(7)
    ADD $2*16, DI
    RESTORE_LFSR_8()

zucNibble:
    CMP $4, BP
    BLT zucDouble
    SUB $4, BP
    ONEROUND(0)
    ONEROUND(1)
    ONEROUND(2)
    ONEROUND(3)
    ADD $1*16, DI
    RESTORE_LFSR_4()

zucDouble:
    CMP $2, BP
    BLT zucSingle
    SUB $2, BP
    ONEROUND(0)
    ONEROUND(1)
    ADD $8, DI
    RESTORE_LFSR_2()

zucSingle:
    TBZ $0, BP, zucRet
    ONEROUND(0)
    RESTORE_LFSR_0()

zucRet:
    SAVE_STATE(0(SI))
    RET
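
// The Rev32 variant below differs from genKeyStreamAsm in two ways: the
// destination is a byte slice, so ks_len is divided by 4 up front to count
// whole 32-bit words (the caller is expected to pass a multiple of 4), and
// every keyword is byte-swapped with REVW so the keystream is emitted in
// big-endian order.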
// func genKeyStreamRev32Asm(keyStream []byte, pState *zucState32)
TEXT ·genKeyStreamRev32Asm(SB),NOSPLIT,$0
    LOAD_GLOBAL_DATA()
    VEOR ZERO.B16, ZERO.B16, ZERO.B16

    MOVD ks+0(FP), DI
    MOVD ks_len+8(FP), BP
    MOVD pState+24(FP), SI

    LSR $2, BP            // byte count -> word count
    LOAD_STATE(0(SI))

zucSixteens:
    CMP $16, BP
    BLT zucOctet
    SUB $16, BP
    ROUND_REV32(0)
    ROUND_REV32(1)
    ROUND_REV32(2)
    ROUND_REV32(3)
    ROUND_REV32(4)
    ROUND_REV32(5)
    ROUND_REV32(6)
    ROUND_REV32(7)
    ROUND_REV32(8)
    ROUND_REV32(9)
    ROUND_REV32(10)
    ROUND_REV32(11)
    ROUND_REV32(12)
    ROUND_REV32(13)
    ROUND_REV32(14)
    ROUND_REV32(15)
    ADD $4*16, DI
    B zucSixteens

zucOctet:
    CMP $8, BP
    BLT zucNibble
    SUB $8, BP
    ROUND_REV32(0)
    ROUND_REV32(1)
    ROUND_REV32(2)
    ROUND_REV32(3)
    ROUND_REV32(4)
    ROUND_REV32(5)
    ROUND_REV32(6)
    ROUND_REV32(7)
    ADD $2*16, DI
    RESTORE_LFSR_8()

zucNibble:
    CMP $4, BP
    BLT zucDouble
    SUB $4, BP
    ROUND_REV32(0)
    ROUND_REV32(1)
    ROUND_REV32(2)
    ROUND_REV32(3)
    ADD $16, DI
    RESTORE_LFSR_4()

zucDouble:
    CMP $2, BP
    BLT zucSingle
    SUB $2, BP
    ROUND_REV32(0)
    ROUND_REV32(1)
    ADD $8, DI
    RESTORE_LFSR_2()

zucSingle:
    TBZ $0, BP, zucRet
    ROUND_REV32(0)
    RESTORE_LFSR_0()

zucRet:
    SAVE_STATE(0(SI))
    RET
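
// For reference, one ONEROUND/ROUND_REV32 iteration corresponds to the
// spec-level ZUC work-mode step (s is the LFSR, R1/R2 the F registers):
//
//    X0..X3 = BitReorganization(s)            // BITS_REORG
//    z  = ((X0 ^ R1) + R2) ^ X3               // NONLIN_FUN; EORW R15, AX
//    W1 = R1 + X1; W2 = R2 ^ X2
//    R1 = Sbox(L1((W1 << 16) | (W2 >> 16)))
//    R2 = Sbox(L2((W2 << 16) | (W1 >> 16)))
//    s16 = (2^15*s15 + 2^17*s13 + 2^21*s10    // LFSR_UPDT with W = 0
//           + 2^20*s4 + (1 + 2^8)*s0) mod (2^31 - 1)
//
// where Sbox applies S0/S1 byte-wise and L1/L2 are the linear transforms
// computed by the RORW chains in NONLIN_FUN.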