github.com/emmansun/gmsm@v0.29.1/zuc/asm_ppc64x.s

// Copyright 2024 Sun Yimin. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

//go:build (ppc64 || ppc64le) && !purego

#include "textflag.h"

DATA rcon<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F // nibble mask
DATA rcon<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA rcon<>+0x10(SB)/8, $0x000182839E9F1C1D // affine transform matrix m1 low
DATA rcon<>+0x18(SB)/8, $0x2425A6A7BABB3839
DATA rcon<>+0x20(SB)/8, $0x00D508DD7CA974A1 // affine transform matrix m1 high
DATA rcon<>+0x28(SB)/8, $0x9C499441E035E83D
DATA rcon<>+0x30(SB)/8, $0x6773CDD91602BCA8 // affine transform matrix m2 low
DATA rcon<>+0x38(SB)/8, $0xD0C47A6EA1B50B1F
DATA rcon<>+0x40(SB)/8, $0x55BACC2315FA8C63 // affine transform matrix m2 high
DATA rcon<>+0x48(SB)/8, $0x09E6907F49A6D03F
DATA rcon<>+0x50(SB)/8, $0x090F000E0F0F020A // P1
DATA rcon<>+0x58(SB)/8, $0x0004000C07050309 // P1
DATA rcon<>+0x60(SB)/8, $0x080D060507000C04 // P2
DATA rcon<>+0x68(SB)/8, $0x0B010E0A0F030902 // P2
DATA rcon<>+0x70(SB)/8, $0x02060A06000D0A0F // P3
DATA rcon<>+0x78(SB)/8, $0x03030D0500090C0D // P3
DATA rcon<>+0x80(SB)/8, $0xff00ff00ff00ff00 // S0 mask
DATA rcon<>+0x88(SB)/8, $0xff00ff00ff00ff00
DATA rcon<>+0x90(SB)/8, $0x00ff00ff00ff00ff // S1 mask
DATA rcon<>+0x98(SB)/8, $0x00ff00ff00ff00ff
GLOBL rcon<>(SB), RODATA, $160

#define M1L V20
#define M1H V21
#define M2L V22
#define M2H V23
#define V_FOUR V24
#define NIBBLE_MASK V25
#define S1_MASK V26
#define S0_MASK V27
#define P1 V28
#define P2 V29
#define P3 V30

#define LOAD_CONSTS \
	VSPLTISW $4, V_FOUR \
	MOVD $rcon<>+0x00(SB), R4 \
	LXVD2X (R4)(R0), NIBBLE_MASK \
	MOVD $0x10, R5 \
	LXVD2X (R4)(R5), M1L \
	MOVD $0x20, R5 \
	LXVD2X (R4)(R5), M1H \
	MOVD $0x30, R5 \
	LXVD2X (R4)(R5), M2L \
	MOVD $0x40, R5 \
	LXVD2X (R4)(R5), M2H \
	MOVD $0x50, R5 \
	LXVD2X (R4)(R5), P1 \
	MOVD $0x60, R5 \
	LXVD2X (R4)(R5), P2 \
	MOVD $0x70, R5 \
	LXVD2X (R4)(R5), P3 \
	MOVD $0x80, R5 \
	LXVD2X (R4)(R5), S0_MASK \
	MOVD $0x90, R5 \
	LXVD2X (R4)(R5), S1_MASK

#define S0_comput(IN_OUT, V_FOUR, XTMP1, XTMP2) \
	VSRW IN_OUT, V_FOUR, XTMP1; \
	VAND XTMP1, NIBBLE_MASK, XTMP1; \
	VAND IN_OUT, NIBBLE_MASK, IN_OUT; \
	VPERM P1, P1, IN_OUT, XTMP2; \
	VXOR XTMP1, XTMP2, XTMP2; \
	VPERM P2, P2, XTMP2, XTMP1; \
	VXOR IN_OUT, XTMP1, XTMP1; \
	VPERM P3, P3, XTMP1, IN_OUT; \
	VXOR XTMP2, IN_OUT, IN_OUT; \
	VSLW IN_OUT, V_FOUR, IN_OUT; \
	VXOR IN_OUT, XTMP1, IN_OUT; \
	VSPLTISB $5, XTMP1; \
	VRLB IN_OUT, XTMP1, IN_OUT

// Affine Transform
// parameters:
// - L: table of low nibbles
// - H: table of high nibbles
// - x: 128-bit register as sbox input/output data
// - y: 128-bit temp register
// - z: 128-bit temp register
#define AFFINE_TRANSFORM(L, H, V_FOUR, x, y, z) \
	VAND NIBBLE_MASK, x, z; \
	VPERM L, L, z, y; \
	VSRD x, V_FOUR, x; \
	VAND NIBBLE_MASK, x, z; \
	VPERM H, H, z, x; \
	VXOR y, x, x

#define SHLDL(a, b, n) \ // there is no SHLDL instruction in the Go assembler
	SLW n, a, a \
	SRW n, b, b \
	OR b, a, a

// ZUC S1 sbox function
// parameters:
// - x: 128-bit register as sbox input/output data
// - y: 128-bit temp register
// - z: 128-bit temp register
#define S1_comput(x, y, z) \
	AFFINE_TRANSFORM(M1L, M1H, V_FOUR, x, y, z); \
	VSBOX x, x; \
	AFFINE_TRANSFORM(M2L, M2H, V_FOUR, x, y, z)
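
// Reading note: the two macros above evaluate the ZUC S-box pair S = (S0, S1)
// on a whole vector register at once. S0_comput computes S0 with nibble-wise
// table lookups: each byte is split into high/low nibbles with NIBBLE_MASK,
// passed through the P1/P2/P3 tables via VPERM, recombined, and finally every
// byte is rotated left by 5 bits (VRLB). S1_comput computes S1 roughly as
//
//   S1(x) = B(AES_SBOX(A(x)))
//
// i.e. the hardware AES S-box (VSBOX) sandwiched between two per-byte affine
// transforms, A encoded by (M1L, M1H) and B by (M2L, M2H); this works because
// ZUC's S1 is affine-equivalent to the AES S-box. NONLIN_FUN further below runs
// both macros on the same packed value and merges the results with
// S0_MASK/S1_MASK, since ZUC applies S0 and S1 to alternating bytes of each
// 32-bit word.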

#define OFFSET_FR1 (16*4)
#define OFFSET_FR2 (17*4)
#define OFFSET_BRC_X0 (18*4)
#define OFFSET_BRC_X1 (19*4)
#define OFFSET_BRC_X2 (20*4)
#define OFFSET_BRC_X3 (21*4)

#define F_R1 R7
#define F_R2 R8
#define BRC_X0 R9
#define BRC_X1 R10
#define BRC_X2 R11
#define BRC_X3 R12

#define BITS_REORG(idx, addr, tmpR1, tmpR2, tmpR3, tmpR4) \
	MOVWZ (((15 + idx) % 16)*4)(addr), BRC_X0 \
	MOVWZ (((14 + idx) % 16)*4)(addr), tmpR1 \
	MOVWZ (((11 + idx) % 16)*4)(addr), BRC_X1 \
	MOVWZ (((9 + idx) % 16)*4)(addr), tmpR2 \
	MOVWZ (((7 + idx) % 16)*4)(addr), BRC_X2 \
	MOVWZ (((5 + idx) % 16)*4)(addr), tmpR3 \
	MOVWZ (((2 + idx) % 16)*4)(addr), BRC_X3 \
	MOVWZ (((0 + idx) % 16)*4)(addr), tmpR4 \
	SRW $15, BRC_X0, BRC_X0 \
	SLW $16, tmpR1, tmpR1 \
	SLW $1, tmpR2, tmpR2 \
	SLW $1, tmpR3, tmpR3 \
	SLW $1, tmpR4, tmpR4 \
	SHLDL(BRC_X0, tmpR1, $16) \
	SHLDL(BRC_X1, tmpR2, $16) \
	SHLDL(BRC_X2, tmpR3, $16) \
	SHLDL(BRC_X3, tmpR4, $16)

#define LOAD_STATE(addr) \
	MOVWZ OFFSET_FR1(addr), F_R1 \
	MOVWZ OFFSET_FR2(addr), F_R2 \
	MOVWZ OFFSET_BRC_X0(addr), BRC_X0 \
	MOVWZ OFFSET_BRC_X1(addr), BRC_X1 \
	MOVWZ OFFSET_BRC_X2(addr), BRC_X2 \
	MOVWZ OFFSET_BRC_X3(addr), BRC_X3

#define SAVE_STATE(addr) \
	MOVW F_R1, OFFSET_FR1(addr) \
	MOVW F_R2, OFFSET_FR2(addr) \
	MOVW BRC_X0, OFFSET_BRC_X0(addr) \
	MOVW BRC_X1, OFFSET_BRC_X1(addr) \
	MOVW BRC_X2, OFFSET_BRC_X2(addr) \
	MOVW BRC_X3, OFFSET_BRC_X3(addr)

#define NONLIN_FUN(AX, BX, CX, DX) \
	XOR F_R1, BRC_X0, AX \ // AX = F_R1 xor BRC_X0
	ADD F_R2, AX \ // W = (F_R1 xor BRC_X0) + F_R2
	ADD BRC_X1, F_R1 \ // W1 = F_R1 + BRC_X1
	XOR BRC_X2, F_R2 \ // W2 = F_R2 ^ BRC_X2
	\
	SLW $16, F_R1, DX \
	SRW $16, F_R2, CX \
	OR CX, DX \ // P = (W1 << 16) | (W2 >> 16)
	SHLDL(F_R2, F_R1, $16) \ // Q = (W2 << 16) | (W1 >> 16)
	ROTLW $2, DX, BX \ // start L1
	ROTLW $24, DX, CX \
	XOR CX, DX \
	XOR BX, DX \
	ROTLW $8, BX \
	XOR BX, DX \
	ROTLW $8, BX \
	XOR BX, DX, BX \ // U = L1(P), in the low 32 bits of BX
	RLDICL $0, BX, $32, DX \ // DX = U with the high 32 bits cleared
	ROTLW $8, F_R2, BX \
	ROTLW $14, F_R2, CX \
	XOR BX, F_R2 \
	XOR CX, F_R2 \
	ROTLW $8, CX \
	XOR CX, F_R2 \
	ROTLW $8, CX \
	XOR CX, F_R2 \ // V = L2(Q), in the low 32 bits of F_R2
	SLD $32, F_R2 \
	XOR F_R2, DX \ // DX = V || U
	MTVSRD DX, V0 \ // save V || U to V0
	VOR V0, V0, V1 \
	S0_comput(V0, V_FOUR, V2, V3) \
	S1_comput(V1, V2, V3) \
	VAND S0_MASK, V0, V0 \
	VAND S1_MASK, V1, V1 \
	VXOR V0, V1, V0 \
	MFVSRD V0, DX \
	SRD $32, DX, F_R2 \
	MOVWZ DX, F_R1

#define LFSR_UPDT(idx, addr, W, tmpR1, tmpR2, tmpR3, tmpR4) \
	MOVWZ (((0 + idx) % 16)*4)(addr), tmpR1 \
	MOVWZ (((4 + idx) % 16)*4)(addr), tmpR2 \
	MOVWZ (((10 + idx) % 16)*4)(addr), tmpR3 \
	MOVWZ (((13 + idx) % 16)*4)(addr), tmpR4 \
	\ // Calculate 64-bit LFSR feedback
	ADD tmpR1, W \
	SLD $8, tmpR1 \
	SLD $20, tmpR2 \
	SLD $21, tmpR3 \
	SLD $17, tmpR4 \
	ADD tmpR1, W \
	ADD tmpR2, W \
	ADD tmpR3, W \
	ADD tmpR4, W \
	MOVWZ (((15 + idx) % 16)*4)(addr), tmpR4 \
	SLD $15, tmpR4 \
	ADD tmpR4, W \
	\ // Reduce it to a 31-bit value
	MOVD $0x7FFFFFFF, tmpR2 \
	SRD $31, W, tmpR1 \
	AND tmpR2, W \
	ADD tmpR1, W \
	\
	SRD $31, W, tmpR1 \
	AND tmpR2, W \
	ADD tmpR1, W \
	\ // LFSR_S16 = W, stored into the slot of the old LFSR_S0
	MOVW W, (((0 + idx) % 16)*4)(addr)
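
// Reading note: the macros above are the per-round ZUC primitives, operating on
// the sixteen 31-bit LFSR words s0..s15 stored at addr (idx offsets the indices
// because callers avoid physically rotating the array between rounds):
//
//   BITS_REORG:  X0 = s15H||s14L, X1 = s11L||s9H, X2 = s7L||s5H, X3 = s2L||s0H
//   NONLIN_FUN:  W  = (X0 ^ R1) + R2,  W1 = R1 + X1,  W2 = R2 ^ X2
//                R1 = S(L1(W1L||W2H)), R2 = S(L2(W2L||W1H))
//   LFSR_UPDT:   s16 = (2^15*s15 + 2^17*s13 + 2^21*s10 + 2^20*s4 + (1+2^8)*s0
//                + W) mod (2^31 - 1), written into the slot of the old s0.
//                Callers in this file clear W first, i.e. LFSRWithWorkMode.
//
// The RESTORE_LFSR_n macros below rotate the 16-word array left by 1, 2, 4 or 8
// positions after that many rounds, so the next call can again address the
// oldest word at offset 0; a block of 16 rounds needs no rotation at all.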

#define RESTORE_LFSR_0(addr, tmpR1, tmpR2, tmpR3, tmpR4) \
	MOVWZ (addr), tmpR1 \
	MOVD $4, tmpR4 \
	LXVD2X (tmpR4)(addr), V0 \
	MOVD $20, tmpR4 \
	LXVD2X (tmpR4)(addr), V1 \
	MOVD $36, tmpR4 \
	LXVD2X (tmpR4)(addr), V2 \
	MOVD 52(addr), tmpR2 \
	MOVWZ 60(addr), tmpR3 \
	STXVD2X V0, (addr) \
	MOVD $16, tmpR4 \
	STXVD2X V1, (tmpR4)(addr) \
	MOVD $32, tmpR4 \
	STXVD2X V2, (tmpR4)(addr) \
	MOVD tmpR2, 48(addr) \
	MOVW tmpR3, 56(addr) \
	MOVW tmpR1, 60(addr)

#define RESTORE_LFSR_2(addr, tmpR1, tmpR2, tmpR3) \
	MOVD (addr), tmpR1 \
	MOVD $8, tmpR2 \
	LXVD2X (tmpR2)(addr), V0 \
	MOVD $24, tmpR2 \
	LXVD2X (tmpR2)(addr), V1 \
	MOVD $40, tmpR2 \
	LXVD2X (tmpR2)(addr), V2 \
	MOVD 56(addr), tmpR3 \
	\
	STXVD2X V0, (addr) \
	MOVD $16, tmpR2 \
	STXVD2X V1, (tmpR2)(addr) \
	MOVD $32, tmpR2 \
	STXVD2X V2, (tmpR2)(addr) \
	MOVD tmpR3, 48(addr) \
	MOVD tmpR1, 56(addr)

#define RESTORE_LFSR_4(addr, tmpR1, tmpR2, tmpR3) \
	LXVD2X (addr), V0 \
	MOVD $16, tmpR1 \
	LXVD2X (tmpR1)(addr), V1 \
	MOVD $32, tmpR2 \
	LXVD2X (tmpR2)(addr), V2 \
	MOVD $48, tmpR3 \
	LXVD2X (tmpR3)(addr), V3 \
	\
	STXVD2X V1, (addr) \
	STXVD2X V2, (tmpR1)(addr) \
	STXVD2X V3, (tmpR2)(addr) \
	STXVD2X V0, (tmpR3)(addr)

#define RESTORE_LFSR_8(addr, tmpR1, tmpR2, tmpR3) \
	LXVD2X (addr), V0 \
	MOVD $16, tmpR1 \
	LXVD2X (tmpR1)(addr), V1 \
	MOVD $32, tmpR2 \
	LXVD2X (tmpR2)(addr), V2 \
	MOVD $48, tmpR3 \
	LXVD2X (tmpR3)(addr), V3 \
	\
	STXVD2X V2, (addr) \
	STXVD2X V3, (tmpR1)(addr) \
	STXVD2X V0, (tmpR2)(addr) \
	STXVD2X V1, (tmpR3)(addr)

// func genKeywordAsm(s *zucState32) uint32
TEXT ·genKeywordAsm(SB),NOSPLIT,$0
	LOAD_CONSTS

	MOVD pState+0(FP), R4
	LOAD_STATE(R4)
	BITS_REORG(0, R4, R14, R15, R16, R17)
	NONLIN_FUN(R14, R15, R16, R17)
	// (BRC_X3 xor W) as result
	XOR BRC_X3, R14
	MOVW R14, ret+8(FP)

	// LFSRWithWorkMode
	XOR R14, R14
	LFSR_UPDT(0, R4, R14, R15, R16, R17, R18)
	SAVE_STATE(R4)
	RESTORE_LFSR_0(R4, R15, R16, R17, R18)

	RET

#define ONEROUND(idx, addr, dst, W, tmpR1, tmpR2, tmpR3, tmpR4) \
	BITS_REORG(idx, addr, W, tmpR1, tmpR2, tmpR3) \
	NONLIN_FUN(W, tmpR1, tmpR2, tmpR3) \
	XOR BRC_X3, W \
	MOVW W, (idx*4)(dst) \
	XOR W, W \
	LFSR_UPDT(idx, addr, W, tmpR1, tmpR2, tmpR3, tmpR4)

#ifdef GOARCH_ppc64le
#define PPC64X_MOVWBR(src, dst, idx, tmp) \
	MOVD $(idx), tmp \
	MOVWBR src, (tmp)(dst)
#else
#define PPC64X_MOVWBR(src, dst, idx, tmp) MOVW src, (idx)(dst)
#endif

#define ONEROUND_REV32(idx, addr, dst, W, tmpR1, tmpR2, tmpR3, tmpR4) \
	BITS_REORG(idx, addr, W, tmpR1, tmpR2, tmpR3) \
	NONLIN_FUN(W, tmpR1, tmpR2, tmpR3) \
	XOR BRC_X3, W \
	PPC64X_MOVWBR(W, dst, idx*4, tmpR1) \
	XOR W, W \
	LFSR_UPDT(idx, addr, W, tmpR1, tmpR2, tmpR3, tmpR4)
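
// Reading note: one ONEROUND expansion produces a single keystream word
// Z = X3 ^ W and then clocks the LFSR once in work mode. ONEROUND_REV32 is the
// same except that the word is stored through PPC64X_MOVWBR, which byte-swaps
// on ppc64le so the bytes written to the destination are always in big-endian
// order. The keystream functions below consume the requested length in blocks
// of 16 rounds (after which the LFSR array is back in its original layout),
// then finish any remainder with 8/4/2/1-round tails followed by the matching
// RESTORE_LFSR_n rotation.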

// func genKeyStreamAsm(keyStream []uint32, pState *zucState32)
TEXT ·genKeyStreamAsm(SB),NOSPLIT,$0
	LOAD_CONSTS

	MOVD pState+24(FP), R4
	MOVD ks+0(FP), R3
	MOVD ks_len+8(FP), R5

	LOAD_STATE(R4)

	CMP R5, $16
	BLT zucOctet

preloop16:
	SRD $4, R5, R6 // Set up loop counter
	MOVD R6, CTR
	ANDCC $15, R5, R6 // Check for trailing words for later
	PCALIGN $16

zucSixteens:
	ONEROUND(0, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND(1, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND(2, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND(3, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND(4, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND(5, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND(6, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND(7, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND(8, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND(9, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND(10, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND(11, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND(12, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND(13, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND(14, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND(15, R4, R3, R14, R15, R16, R17, R18)
	ADD $64, R3
	BDNZ zucSixteens
	BC 12,2,zucRet // fast return
	MOVD R6, R5

zucOctet:
	CMP R5, $8
	BLT zucNibble
	ONEROUND(0, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND(1, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND(2, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND(3, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND(4, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND(5, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND(6, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND(7, R4, R3, R14, R15, R16, R17, R18)
	RESTORE_LFSR_8(R4, R14, R15, R16)
	ADD $32, R3
	ADD $-8, R5

zucNibble:
	CMP R5, $4
	BLT zucDouble
	ONEROUND(0, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND(1, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND(2, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND(3, R4, R3, R14, R15, R16, R17, R18)
	RESTORE_LFSR_4(R4, R14, R15, R16)
	ADD $16, R3
	ADD $-4, R5

zucDouble:
	CMP R5, $2
	BLT zucSingle
	ONEROUND(0, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND(1, R4, R3, R14, R15, R16, R17, R18)
	RESTORE_LFSR_2(R4, R14, R15, R16)
	ADD $8, R3
	ADD $-2, R5

zucSingle:
	CMP R5, $1
	BLT zucRet
	ONEROUND(0, R4, R3, R14, R15, R16, R17, R18)
	RESTORE_LFSR_0(R4, R14, R15, R16, R17)

zucRet:
	SAVE_STATE(R4)
	RET
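
// Reading note: genKeyStreamRev32Asm below is the byte-oriented variant of
// genKeyStreamAsm. Its keyStream argument is a []byte, so ks_len is a byte
// count and SRD $2 converts it to the number of whole 32-bit words to emit;
// each word is written in big-endian byte order via PPC64X_MOVWBR. Otherwise
// the control flow mirrors genKeyStreamAsm above.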

// func genKeyStreamRev32Asm(keyStream []byte, pState *zucState32)
TEXT ·genKeyStreamRev32Asm(SB),NOSPLIT,$0
	LOAD_CONSTS

	MOVD pState+24(FP), R4
	MOVD ks+0(FP), R3
	MOVD ks_len+8(FP), R5

	SRD $2, R5, R5 // convert the byte length to a word count
	LOAD_STATE(R4)

	CMP R5, $16
	BLT zucOctet

preloop16:
	SRD $4, R5, R6 // Set up loop counter
	MOVD R6, CTR
	ANDCC $15, R5, R6 // Check for trailing words for later
	PCALIGN $16

zucSixteens:
	ONEROUND_REV32(0, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND_REV32(1, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND_REV32(2, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND_REV32(3, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND_REV32(4, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND_REV32(5, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND_REV32(6, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND_REV32(7, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND_REV32(8, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND_REV32(9, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND_REV32(10, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND_REV32(11, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND_REV32(12, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND_REV32(13, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND_REV32(14, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND_REV32(15, R4, R3, R14, R15, R16, R17, R18)
	ADD $64, R3
	BDNZ zucSixteens
	BC 12,2,zucRet // fast return
	MOVD R6, R5

zucOctet:
	CMP R5, $8
	BLT zucNibble
	ONEROUND_REV32(0, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND_REV32(1, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND_REV32(2, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND_REV32(3, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND_REV32(4, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND_REV32(5, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND_REV32(6, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND_REV32(7, R4, R3, R14, R15, R16, R17, R18)
	RESTORE_LFSR_8(R4, R14, R15, R16)
	ADD $32, R3
	ADD $-8, R5

zucNibble:
	CMP R5, $4
	BLT zucDouble
	ONEROUND_REV32(0, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND_REV32(1, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND_REV32(2, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND_REV32(3, R4, R3, R14, R15, R16, R17, R18)
	RESTORE_LFSR_4(R4, R14, R15, R16)
	ADD $16, R3
	ADD $-4, R5

zucDouble:
	CMP R5, $2
	BLT zucSingle
	ONEROUND_REV32(0, R4, R3, R14, R15, R16, R17, R18)
	ONEROUND_REV32(1, R4, R3, R14, R15, R16, R17, R18)
	RESTORE_LFSR_2(R4, R14, R15, R16)
	ADD $8, R3
	ADD $-2, R5

zucSingle:
	CMP R5, $1
	BLT zucRet
	ONEROUND_REV32(0, R4, R3, R14, R15, R16, R17, R18)
	RESTORE_LFSR_0(R4, R14, R15, R16, R17)

zucRet:
	SAVE_STATE(R4)
	RET