#include "textflag.h"

// nibble mask
DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL nibble_mask<>(SB), (NOPTR+RODATA), $16

// inverse shift rows
DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
GLOBL inverse_shift_rows<>(SB), (NOPTR+RODATA), $16

// Affine transform 1 (low and high nibbles)
DATA m1_low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653
GLOBL m1_low<>(SB), (NOPTR+RODATA), $16

DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800
DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB
GLOBL m1_high<>(SB), (NOPTR+RODATA), $16

// Affine transform 2 (low and high nibbles)
DATA m2_low<>+0x00(SB)/8, $0x9A950A05FEF16E61
DATA m2_low<>+0x08(SB)/8, $0x0E019E916A65FAF5
GLOBL m2_low<>(SB), (NOPTR+RODATA), $16

DATA m2_high<>+0x00(SB)/8, $0x892D69CD44E0A400
DATA m2_high<>+0x08(SB)/8, $0x2C88CC68E14501A5
GLOBL m2_high<>(SB), (NOPTR+RODATA), $16

// left rotations of 32-bit words by 8-bit increments
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), (NOPTR+RODATA), $16

DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302
DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
GLOBL r16_mask<>(SB), (NOPTR+RODATA), $16

DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201
DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
GLOBL r24_mask<>(SB), (NOPTR+RODATA), $16

#define B0 V0
#define B1 V1
#define B2 V2
#define B3 V3
#define B4 V4
#define B5 V5
#define B6 V6
#define B7 V7

#define ACC0 V8
#define ACC1 V9
#define ACCM V10

#define T0 V11
#define T1 V12
#define T2 V13
#define T3 V14

#define POLY V15
#define ZERO V16
#define INC V17
#define CTR V18

#define K0 V19
#define K1 V20
#define K2 V21
#define K3 V22
#define NIBBLE_MASK V23
#define INVERSE_SHIFT_ROWS V24
#define M1L V25
#define M1H V26
#define M2L V27
#define M2H V28
#define R08_MASK V29
#define R16_MASK V30
#define R24_MASK V31

#define reduce() \
    VEOR ACC0.B16, ACCM.B16, ACCM.B16     \
    VEOR ACC1.B16, ACCM.B16, ACCM.B16     \
    VEXT $8, ZERO.B16, ACCM.B16, T0.B16   \
    VEXT $8, ACCM.B16, ZERO.B16, ACCM.B16 \
    VEOR ACCM.B16, ACC0.B16, ACC0.B16     \
    VEOR T0.B16, ACC1.B16, ACC1.B16       \
    VPMULL POLY.D1, ACC0.D1, T0.Q1        \
    VEXT $8, ACC0.B16, ACC0.B16, ACC0.B16 \
    VEOR T0.B16, ACC0.B16, ACC0.B16       \
    VPMULL POLY.D1, ACC0.D1, T0.Q1        \
    VEOR T0.B16, ACC1.B16, ACC1.B16       \
    VEXT $8, ACC1.B16, ACC1.B16, ACC1.B16 \
    VEOR ACC1.B16, ACC0.B16, ACC0.B16     \

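// reduce folds the three Karatsuba partial products kept in ACC0 (low),
// ACC1 (high) and ACCM (middle), then performs the two VPMULL-by-POLY folding
// steps that reduce the 256-bit product modulo the GHASH polynomial, leaving
// the 128-bit result in ACC0.
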
// func gcmSm4Finish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
#define pTbl R0
#define tMsk R1
#define tPtr R2
#define plen R3
#define dlen R4

    MOVD $0xC2, R1
    LSL $56, R1
    MOVD $1, R0
    VMOV R1, POLY.D[0]
    VMOV R0, POLY.D[1]
    VEOR ZERO.B16, ZERO.B16, ZERO.B16

    MOVD productTable+0(FP), pTbl
    MOVD tagMask+8(FP), tMsk
    MOVD T+16(FP), tPtr
    MOVD pLen+24(FP), plen
    MOVD dLen+32(FP), dlen

    VLD1 (tPtr), [ACC0.B16]
    VLD1 (tMsk), [B1.B16]

    LSL $3, plen
    LSL $3, dlen

    VMOV dlen, B0.D[0]
    VMOV plen, B0.D[1]

    ADD $14*16, pTbl
    VLD1.P (pTbl), [T1.B16, T2.B16]

    VEOR ACC0.B16, B0.B16, B0.B16

    VEXT $8, B0.B16, B0.B16, T0.B16
    VEOR B0.B16, T0.B16, T0.B16
    VPMULL B0.D1, T1.D1, ACC1.Q1
    VPMULL2 B0.D2, T1.D2, ACC0.Q1
    VPMULL T0.D1, T2.D1, ACCM.Q1

    reduce()

    VREV64 ACC0.B16, ACC0.B16
    VEOR B1.B16, ACC0.B16, ACC0.B16

    VST1 [ACC0.B16], (tPtr)
    RET
#undef pTbl
#undef tMsk
#undef tPtr
#undef plen
#undef dlen

#define PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, K) \
    VMOV t0.B16, K.B16    \
    VMOV t1.S[0], t0.S[1] \
    VMOV t2.S[0], t0.S[2] \
    VMOV t3.S[0], t0.S[3] \
    VMOV K.S[1], t1.S[0]  \
    VMOV K.S[2], t2.S[0]  \
    VMOV K.S[3], t3.S[0]  \
    VMOV t1.D[1], K.D[1]  \
    VMOV t2.S[1], t1.S[2] \
    VMOV t3.S[1], t1.S[3] \
    VMOV K.S[2], t2.S[1]  \
    VMOV K.S[3], t3.S[1]  \
    VMOV t2.S[3], K.S[3]  \
    VMOV t3.S[2], t2.S[3] \
    VMOV K.S[3], t3.S[2]

#define TRANSPOSE_MATRIX(t0, t1, t2, t3, K) \
    VMOV t0.B16, K.B16    \
    VMOV t3.S[0], t0.S[0] \
    VMOV t2.S[0], t0.S[1] \
    VMOV t1.S[0], t0.S[2] \
    VMOV K0.S[0], t0.S[3] \
    VMOV t3.S[1], t1.S[0] \
    VMOV t3.S[2], t2.S[0] \
    VMOV t3.S[3], t3.S[0] \
    VMOV t2.S[3], t3.S[1] \
    VMOV t1.S[3], t3.S[2] \
    VMOV K.S[3], t3.S[3]  \
    VMOV K.S[2], t2.S[3]  \
    VMOV K.S[1], t1.S[3]  \
    VMOV t1.B16, K.B16    \
    VMOV t2.S[1], t1.S[1] \
    VMOV K.S[1], t1.S[2]  \
    VMOV t2.S[2], t2.S[1] \
    VMOV K.S[2], t2.S[2]

#define LOAD_SM4_AESNI_CONSTS() \
    LDP nibble_mask<>(SB), (R20, R21)        \
    VMOV R20, NIBBLE_MASK.D[0]               \
    VMOV R21, NIBBLE_MASK.D[1]               \
    LDP m1_low<>(SB), (R20, R21)             \
    VMOV R20, M1L.D[0]                       \
    VMOV R21, M1L.D[1]                       \
    LDP m1_high<>(SB), (R20, R21)            \
    VMOV R20, M1H.D[0]                       \
    VMOV R21, M1H.D[1]                       \
    LDP m2_low<>(SB), (R20, R21)             \
    VMOV R20, M2L.D[0]                       \
    VMOV R21, M2L.D[1]                       \
    LDP m2_high<>(SB), (R20, R21)            \
    VMOV R20, M2H.D[0]                       \
    VMOV R21, M2H.D[1]                       \
    LDP inverse_shift_rows<>(SB), (R20, R21) \
    VMOV R20, INVERSE_SHIFT_ROWS.D[0]        \
    VMOV R21, INVERSE_SHIFT_ROWS.D[1]        \
    LDP r08_mask<>(SB), (R20, R21)           \
    VMOV R20, R08_MASK.D[0]                  \
    VMOV R21, R08_MASK.D[1]                  \
    LDP r16_mask<>(SB), (R20, R21)           \
    VMOV R20, R16_MASK.D[0]                  \
    VMOV R21, R16_MASK.D[1]                  \
    LDP r24_mask<>(SB), (R20, R21)           \
    VMOV R20, R24_MASK.D[0]                  \
    VMOV R21, R24_MASK.D[1]

#define SM4_SBOX(x, y, z) \
    ;                                            \
    VAND x.B16, NIBBLE_MASK.B16, z.B16;          \
    VTBL z.B16, [M1L.B16], y.B16;                \
    VUSHR $4, x.D2, x.D2;                        \
    VAND x.B16, NIBBLE_MASK.B16, z.B16;          \
    VTBL z.B16, [M1H.B16], z.B16;                \
    VEOR y.B16, z.B16, x.B16;                    \
    VTBL INVERSE_SHIFT_ROWS.B16, [x.B16], x.B16; \
    AESE ZERO.B16, x.B16;                        \
    VAND x.B16, NIBBLE_MASK.B16, z.B16;          \
    VTBL z.B16, [M2L.B16], y.B16;                \
    VUSHR $4, x.D2, x.D2;                        \
    VAND x.B16, NIBBLE_MASK.B16, z.B16;          \
    VTBL z.B16, [M2H.B16], z.B16;                \
    VEOR y.B16, z.B16, x.B16

#define SM4_TAO_L1(x, y, z) \
    SM4_SBOX(x, y, z);                 \
    VTBL R08_MASK.B16, [x.B16], y.B16; \
    VEOR y.B16, x.B16, y.B16;          \
    VTBL R16_MASK.B16, [x.B16], z.B16; \
    VEOR z.B16, y.B16, y.B16;          \
    VSHL $2, y.S4, z.S4;               \
    VUSHR $30, y.S4, y.S4;             \
    VORR y.B16, z.B16, y.B16;          \
    VTBL R24_MASK.B16, [x.B16], z.B16; \
    VEOR z.B16, x.B16, x.B16;          \
    VEOR y.B16, x.B16, x.B16

#define SM4_ROUND(RK, x, y, z, t0, t1, t2, t3) \
    MOVW.P 4(RK), R19;         \
    VMOV R19, x.S4;            \
    VEOR t1.B16, x.B16, x.B16; \
    VEOR t2.B16, x.B16, x.B16; \
    VEOR t3.B16, x.B16, x.B16; \
    SM4_TAO_L1(x, y, z);       \
    VEOR x.B16, t0.B16, t0.B16
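
// SM4_SBOX computes the SM4 S-box on all 16 lanes at once: the m1 affine
// transform (low/high nibble VTBL lookups) maps the input into the AES S-box
// domain, AESE with a zero round key supplies the AES SubBytes step (the
// preceding VTBL by inverse_shift_rows cancels the ShiftRows that AESE also
// applies), and the m2 transform maps the result back. SM4_TAO_L1 then adds
// the word rotations by 2, 10, 18 and 24 bits of SM4's linear transform L,
// so one SM4_ROUND updates t0 as t0 ^= L(Sbox(t1 ^ t2 ^ t3 ^ rk)) for four
// blocks held word-sliced across t0..t3.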

// func gcmSm4Init(productTable *[256]byte, rk []uint32, inst int)
TEXT ·gcmSm4Init(SB),NOSPLIT,$0
#define pTbl R0
#define RK R1
#define I R2

    MOVD productTable+0(FP), pTbl
    MOVD rk+8(FP), RK
    MOVD inst+16(FP), R5

    MOVD $0xC2, I
    LSL $56, I
    VMOV I, POLY.D[0]
    MOVD $1, I
    VMOV I, POLY.D[1]
    VEOR ZERO.B16, ZERO.B16, ZERO.B16

    // Encrypt block 0 with the SM4 keys to generate the hash key H
    CMP $1, R5
    BEQ sm4InitSM4E

    LOAD_SM4_AESNI_CONSTS()
    VEOR B0.B16, B0.B16, B0.B16
    VEOR B1.B16, B1.B16, B1.B16
    VEOR B2.B16, B2.B16, B2.B16
    VEOR B3.B16, B3.B16, B3.B16
    EOR R3, R3

sm4InitEncLoop:
    SM4_ROUND(RK, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(RK, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(RK, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(RK, K0, K1, K2, B3, B0, B1, B2)

    ADD $1, R3
    CMP $8, R3
    BNE sm4InitEncLoop

    VMOV B0.S[0], B0.S[2]
    VMOV B1.S[0], B0.S[3]
    VMOV B2.S[0], B0.S[0]
    VMOV B3.S[0], B0.S[1]
    B sm4InitEncDone
sm4InitSM4E:
    VEOR B0.B16, B0.B16, B0.B16
    VLD1.P 64(RK), [T0.S4, T1.S4, T2.S4, T3.S4]
    WORD $0x6085c0ce // SM4E V0.4S, V11.4S
    WORD $0x8085c0ce // SM4E V0.4S, V12.4S
    WORD $0xa085c0ce // SM4E V0.4S, V13.4S
    WORD $0xc085c0ce // SM4E V0.4S, V14.4S
    VLD1.P 64(RK), [T0.S4, T1.S4, T2.S4, T3.S4]
    WORD $0x6085c0ce // SM4E V0.4S, V11.4S
    WORD $0x8085c0ce // SM4E V0.4S, V12.4S
    WORD $0xa085c0ce // SM4E V0.4S, V13.4S
    WORD $0xc085c0ce // SM4E V0.4S, V14.4S
    VREV32 B0.B16, B0.B16
    VREV64 B0.B16, B0.B16
sm4InitEncDone:
    // Multiply by 2 modulo P
    VMOV B0.D[0], I
    ASR $63, I
    VMOV I, T1.D[0]
    VMOV I, T1.D[1]
    VAND POLY.B16, T1.B16, T1.B16
    VUSHR $63, B0.D2, T2.D2
    VEXT $8, ZERO.B16, T2.B16, T2.B16
    VSHL $1, B0.D2, B0.D2
    VEOR T1.B16, B0.B16, B0.B16
    VEOR T2.B16, B0.B16, B0.B16 // Can avoid this when VSLI is available

    // Karatsuba pre-computation
    VEXT $8, B0.B16, B0.B16, B1.B16
    VEOR B0.B16, B1.B16, B1.B16

    ADD $14*16, pTbl

    VST1 [B0.B16, B1.B16], (pTbl)
    SUB $2*16, pTbl

    VMOV B0.B16, B2.B16
    VMOV B1.B16, B3.B16

    MOVD $7, I

initLoop:
    // Compute powers of H
    SUBS $1, I

    VPMULL B0.D1, B2.D1, T1.Q1
    VPMULL2 B0.D2, B2.D2, T0.Q1
    VPMULL B1.D1, B3.D1, T2.Q1
    VEOR T0.B16, T2.B16, T2.B16
    VEOR T1.B16, T2.B16, T2.B16
    VEXT $8, ZERO.B16, T2.B16, T3.B16
    VEXT $8, T2.B16, ZERO.B16, T2.B16
    VEOR T2.B16, T0.B16, T0.B16
    VEOR T3.B16, T1.B16, T1.B16
    VPMULL POLY.D1, T0.D1, T2.Q1
    VEXT $8, T0.B16, T0.B16, T0.B16
    VEOR T2.B16, T0.B16, T0.B16
    VPMULL POLY.D1, T0.D1, T2.Q1
    VEXT $8, T0.B16, T0.B16, T0.B16
    VEOR T2.B16, T0.B16, T0.B16
    VEOR T1.B16, T0.B16, B2.B16
    VMOV B2.B16, B3.B16
    VEXT $8, B2.B16, B2.B16, B2.B16
    VEOR B2.B16, B3.B16, B3.B16

    VST1 [B2.B16, B3.B16], (pTbl)
    SUB $2*16, pTbl

    BNE initLoop
    RET
#undef I
#undef RK
#undef pTbl

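// After gcmSm4Init, productTable holds the powers H^8 .. H^1 of the hash key,
// highest power first; each power is stored as the pair {H^i, hi(H^i) ^ lo(H^i)}
// (its Karatsuba pre-computation), so the eight-block loops below can walk the
// table forward while H^1 sits at offset 14*16 for the single-block paths.
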
// func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte)
TEXT ·gcmSm4Data(SB),NOSPLIT,$0
#define pTbl R0
#define aut R1
#define tPtr R2
#define autLen R3
#define H0 R4
#define pTblSave R5

#define mulRound(X) \
    VLD1.P 32(pTbl), [T1.B16, T2.B16] \
    VREV64 X.B16, X.B16               \
    VEXT $8, X.B16, X.B16, T0.B16     \
    VEOR X.B16, T0.B16, T0.B16        \
    VPMULL X.D1, T1.D1, T3.Q1         \
    VEOR T3.B16, ACC1.B16, ACC1.B16   \
    VPMULL2 X.D2, T1.D2, T3.Q1        \
    VEOR T3.B16, ACC0.B16, ACC0.B16   \
    VPMULL T0.D1, T2.D1, T3.Q1        \
    VEOR T3.B16, ACCM.B16, ACCM.B16

    MOVD productTable+0(FP), pTbl
    MOVD data_base+8(FP), aut
    MOVD data_len+16(FP), autLen
    MOVD T+32(FP), tPtr

    //VEOR ACC0.B16, ACC0.B16, ACC0.B16
    VLD1 (tPtr), [ACC0.B16]
    CBZ autLen, dataBail

    MOVD $0xC2, H0
    LSL $56, H0
    VMOV H0, POLY.D[0]
    MOVD $1, H0
    VMOV H0, POLY.D[1]
    VEOR ZERO.B16, ZERO.B16, ZERO.B16
    MOVD pTbl, pTblSave

    CMP $13, autLen
    BEQ dataTLS
    CMP $128, autLen
    BLT startSinglesLoop
    B octetsLoop

dataTLS:
    ADD $14*16, pTbl
    VLD1.P (pTbl), [T1.B16, T2.B16]
    VEOR B0.B16, B0.B16, B0.B16

    MOVD (aut), H0
    VMOV H0, B0.D[0]
    MOVW 8(aut), H0
    VMOV H0, B0.S[2]
    MOVB 12(aut), H0
    VMOV H0, B0.B[12]

    MOVD $0, autLen
    B dataMul

octetsLoop:
    CMP $128, autLen
    BLT startSinglesLoop
    SUB $128, autLen

    VLD1.P 32(aut), [B0.B16, B1.B16]

    VLD1.P 32(pTbl), [T1.B16, T2.B16]
    VREV64 B0.B16, B0.B16
    VEOR ACC0.B16, B0.B16, B0.B16
    VEXT $8, B0.B16, B0.B16, T0.B16
    VEOR B0.B16, T0.B16, T0.B16
    VPMULL B0.D1, T1.D1, ACC1.Q1
    VPMULL2 B0.D2, T1.D2, ACC0.Q1
    VPMULL T0.D1, T2.D1, ACCM.Q1

    mulRound(B1)
    VLD1.P 32(aut), [B2.B16, B3.B16]
    mulRound(B2)
    mulRound(B3)
    VLD1.P 32(aut), [B4.B16, B5.B16]
    mulRound(B4)
    mulRound(B5)
    VLD1.P 32(aut), [B6.B16, B7.B16]
    mulRound(B6)
    mulRound(B7)

    MOVD pTblSave, pTbl
    reduce()
    B octetsLoop

startSinglesLoop:

    ADD $14*16, pTbl
    VLD1.P (pTbl), [T1.B16, T2.B16]

singlesLoop:

    CMP $16, autLen
    BLT dataEnd
    SUB $16, autLen

    VLD1.P 16(aut), [B0.B16]
dataMul:
    VREV64 B0.B16, B0.B16
    VEOR ACC0.B16, B0.B16, B0.B16

    VEXT $8, B0.B16, B0.B16, T0.B16
    VEOR B0.B16, T0.B16, T0.B16
    VPMULL B0.D1, T1.D1, ACC1.Q1
    VPMULL2 B0.D2, T1.D2, ACC0.Q1
    VPMULL T0.D1, T2.D1, ACCM.Q1

    reduce()

    B singlesLoop

dataEnd:

    CBZ autLen, dataBail
    VEOR B0.B16, B0.B16, B0.B16
    ADD autLen, aut

dataLoadLoop:
    MOVB.W -1(aut), H0
    VEXT $15, B0.B16, ZERO.B16, B0.B16
    VMOV H0, B0.B[0]
    SUBS $1, autLen
    BNE dataLoadLoop
    B dataMul

dataBail:
    VST1 [ACC0.B16], (tPtr)
    RET

#undef pTbl
#undef aut
#undef tPtr
#undef autLen
#undef H0
#undef pTblSave

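// gcmSm4Data folds additional authenticated data into the running tag at *T:
// eight blocks per pass while at least 128 bytes remain, a dedicated path for
// a 13-byte TLS record header, then one block at a time, finishing with a
// zero-padded partial block when the length is not a multiple of 16.
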
// func gcmSm4Enc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4Enc(SB),NOSPLIT,$0
#define pTbl R0
#define dstPtr R1
#define ctrPtr R2
#define srcPtr R3
#define rk R4
#define tPtr R5
#define srcPtrLen R6
#define aluCTR R7
#define aluTMP R8
#define H0 R9
#define H1 R10
#define pTblSave R11
#define rkSave R12
#define mulRoundSingleWithoutRev(X) \
    VEOR ACC0.B16, X.B16, X.B16   \
    VEXT $8, X.B16, X.B16, T0.B16 \
    VEOR X.B16, T0.B16, T0.B16    \
    VPMULL X.D1, T1.D1, ACC1.Q1   \
    VPMULL2 X.D2, T1.D2, ACC0.Q1  \
    VPMULL T0.D1, T2.D1, ACCM.Q1  \
    reduce()                      \

#define mulRoundSingle(X) \
    VREV64 X.B16, X.B16         \
    mulRoundSingleWithoutRev(X) \

    MOVD productTable+0(FP), pTbl
    MOVD dst+8(FP), dstPtr
    MOVD src_base+32(FP), srcPtr
    MOVD src_len+40(FP), srcPtrLen
    MOVD ctr+56(FP), ctrPtr
    MOVD T+64(FP), tPtr
    MOVD rk_base+72(FP), rk

    MOVD $0xC2, H1
    LSL $56, H1
    MOVD $1, H0
    VMOV H1, POLY.D[0]
    VMOV H0, POLY.D[1]
    VEOR ZERO.B16, ZERO.B16, ZERO.B16

    MOVD pTbl, pTblSave
    MOVD rk, rkSave
    // Current tag, after AAD
    VLD1 (tPtr), [ACC0.B16]
    VEOR ACC1.B16, ACC1.B16, ACC1.B16
    VEOR ACCM.B16, ACCM.B16, ACCM.B16
    // Prepare initial counter, and the increment vector
    VLD1 (ctrPtr), [CTR.B16]
    VEOR INC.B16, INC.B16, INC.B16
    MOVD $1, H0
    VMOV H0, INC.S[3]
    VREV32 CTR.B16, CTR.B16
    VADD CTR.S4, INC.S4, CTR.S4

    // Skip to <8 blocks loop
    CMP $128, srcPtrLen

    LOAD_SM4_AESNI_CONSTS()

    BLT encNibblesLoop
    // There are at least 8 blocks to encrypt

encOctetsLoop:
    SUB $128, srcPtrLen
    // Prepare 8 counters
    VMOV CTR.B16, B0.B16
    VADD B0.S4, INC.S4, B1.S4
    VADD B1.S4, INC.S4, B2.S4
    VADD B2.S4, INC.S4, B3.S4
    VADD B3.S4, INC.S4, B4.S4
    VADD B4.S4, INC.S4, B5.S4
    VADD B5.S4, INC.S4, B6.S4
    VADD B6.S4, INC.S4, B7.S4
    VADD B7.S4, INC.S4, CTR.S4

    // encrypt first 4 blocks
    PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
    EOR R13, R13
    MOVD rkSave, rk

encOctetsEnc4Blocks1:
    SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)

    ADD $1, R13
    CMP $8, R13
    BNE encOctetsEnc4Blocks1
    VREV32 B0.B16, B0.B16
    VREV32 B1.B16, B1.B16
    VREV32 B2.B16, B2.B16
    VREV32 B3.B16, B3.B16
    TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
    // encrypt second 4 blocks
    PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0)
    MOVD rkSave, rk

encOctetsEnc4Blocks2:
    SM4_ROUND(rk, K0, K1, K2, B4, B5, B6, B7)
    SM4_ROUND(rk, K0, K1, K2, B5, B6, B7, B4)
    SM4_ROUND(rk, K0, K1, K2, B6, B7, B4, B5)
    SM4_ROUND(rk, K0, K1, K2, B7, B4, B5, B6)

    ADD $1, R13
    CMP $16, R13
    BNE encOctetsEnc4Blocks2
    VREV32 B4.B16, B4.B16
    VREV32 B5.B16, B5.B16
    VREV32 B6.B16, B6.B16
    VREV32 B7.B16, B7.B16
    TRANSPOSE_MATRIX(B4, B5, B6, B7, K0)

    // XOR plaintext and store ciphertext
    VLD1.P 32(srcPtr), [T1.B16, T2.B16]
    VEOR B0.B16, T1.B16, B0.B16
    VEOR B1.B16, T2.B16, B1.B16
    VST1.P [B0.B16, B1.B16], 32(dstPtr)
    VLD1.P 32(srcPtr), [T1.B16, T2.B16]
    VEOR B2.B16, T1.B16, B2.B16
    VEOR B3.B16, T2.B16, B3.B16
    VST1.P [B2.B16, B3.B16], 32(dstPtr)
    VLD1.P 32(srcPtr), [T1.B16, T2.B16]
    VEOR B4.B16, T1.B16, B4.B16
    VEOR B5.B16, T2.B16, B5.B16
    VST1.P [B4.B16, B5.B16], 32(dstPtr)
    VLD1.P 32(srcPtr), [T1.B16, T2.B16]
    VEOR B6.B16, T1.B16, B6.B16
    VEOR B7.B16, T2.B16, B7.B16
    VST1.P [B6.B16, B7.B16], 32(dstPtr)

    VLD1.P 32(pTbl), [T1.B16, T2.B16]
    VREV64 B0.B16, B0.B16
    VEOR ACC0.B16, B0.B16, B0.B16
    VEXT $8, B0.B16, B0.B16, T0.B16
    VEOR B0.B16, T0.B16, T0.B16
    VPMULL B0.D1, T1.D1, ACC1.Q1
    VPMULL2 B0.D2, T1.D2, ACC0.Q1
    VPMULL T0.D1, T2.D1, ACCM.Q1

    mulRound(B1)
    mulRound(B2)
    mulRound(B3)
    mulRound(B4)
    mulRound(B5)
    mulRound(B6)
    mulRound(B7)
    MOVD pTblSave, pTbl
    reduce()

    CMP $128, srcPtrLen
    BGE encOctetsLoop

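    // Fewer than 8 blocks remain: encNibblesLoop handles one group of four
    // full blocks, encStartSingles consumes up to four more keystream blocks
    // one at a time, and encTail encrypts the final partial block.
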
encNibblesLoop:
    CBZ srcPtrLen, encDone
    ADD $14*16, pTbl
    // Preload H and its Karatsuba precomp
    VLD1.P (pTbl), [T1.B16, T2.B16]

    CMP $64, srcPtrLen
    BLT encStartSingles
    SUB $64, srcPtrLen

    // Prepare 4 counters
    VMOV CTR.B16, B0.B16
    VADD B0.S4, INC.S4, B1.S4
    VADD B1.S4, INC.S4, B2.S4
    VADD B2.S4, INC.S4, B3.S4
    VADD B3.S4, INC.S4, CTR.S4

    // encrypt 4 blocks
    PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
    EOR R13, R13
    MOVD rkSave, rk

encNibblesEnc4Blocks:
    SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)

    ADD $1, R13
    CMP $8, R13
    BNE encNibblesEnc4Blocks
    VREV32 B0.B16, B0.B16
    VREV32 B1.B16, B1.B16
    VREV32 B2.B16, B2.B16
    VREV32 B3.B16, B3.B16
    TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)

    // XOR plaintext and store ciphertext
    VLD1.P 32(srcPtr), [K1.B16, K2.B16]
    VEOR B0.B16, K1.B16, B0.B16
    VEOR B1.B16, K2.B16, B1.B16
    VST1.P [B0.B16, B1.B16], 32(dstPtr)
    VLD1.P 32(srcPtr), [K1.B16, K2.B16]
    VEOR B2.B16, K1.B16, B2.B16
    VEOR B3.B16, K2.B16, B3.B16
    VST1.P [B2.B16, B3.B16], 32(dstPtr)

    mulRoundSingle(B0)
    mulRoundSingle(B1)
    mulRoundSingle(B2)
    mulRoundSingle(B3)

encStartSingles:
    CBZ srcPtrLen, encDone

    // Prepare 4 counters
    VMOV CTR.B16, B0.B16
    VADD B0.S4, INC.S4, B1.S4
    VADD B1.S4, INC.S4, B2.S4
    VADD B2.S4, INC.S4, B3.S4
    VADD B3.S4, INC.S4, CTR.S4

    // encrypt 4 blocks
    PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
    EOR R13, R13
    MOVD rkSave, rk

encSinglesEnc4Blocks:
    SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)

    ADD $1, R13
    CMP $8, R13
    BNE encSinglesEnc4Blocks
    VREV32 B0.B16, B0.B16
    VREV32 B1.B16, B1.B16
    VREV32 B2.B16, B2.B16
    VREV32 B3.B16, B3.B16
    TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)

    VMOV B0.B16, K0.B16
    CMP $16, srcPtrLen
    BLT encTail
    SUB $16, srcPtrLen
    VLD1.P 16(srcPtr), [K1.B16]
    VEOR K0.B16, K1.B16, K0.B16
    VST1.P [K0.B16], 16(dstPtr)
    mulRoundSingle(K0)

    VMOV B1.B16, K0.B16
    CMP $16, srcPtrLen
    BLT encTail
    SUB $16, srcPtrLen
    VLD1.P 16(srcPtr), [K1.B16]
    VEOR K0.B16, K1.B16, K0.B16
    VST1.P [K0.B16], 16(dstPtr)
    mulRoundSingle(K0)

    VMOV B2.B16, K0.B16
    CMP $16, srcPtrLen
    BLT encTail
    SUB $16, srcPtrLen
    VLD1.P 16(srcPtr), [K1.B16]
    VEOR K0.B16, K1.B16, K0.B16
    VST1.P [K0.B16], 16(dstPtr)
    mulRoundSingle(K0)

    VMOV B3.B16, K0.B16
    CMP $16, srcPtrLen
    BLT encTail
    SUB $16, srcPtrLen
    VLD1.P 16(srcPtr), [K1.B16]
    VEOR K0.B16, K1.B16, K0.B16
    VST1.P [K0.B16], 16(dstPtr)
    mulRoundSingle(K0)

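    // encTail encrypts the final partial block: the remaining source bytes
    // are gathered into T0 while T3 collects an all-ones mask of the same
    // width, so the keystream XOR in K0 can be masked down to the bytes that
    // were actually loaded before it is stored and folded into the tag.
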
encTail:
    CBZ srcPtrLen, encDone
    VEOR T0.B16, T0.B16, T0.B16
    VEOR T3.B16, T3.B16, T3.B16
    MOVD $0, H1
    SUB $1, H1
    ADD srcPtrLen, srcPtr

    TBZ $3, srcPtrLen, ld4
    MOVD.W -8(srcPtr), H0
    VMOV H0, T0.D[0]
    VMOV H1, T3.D[0]
ld4:
    TBZ $2, srcPtrLen, ld2
    MOVW.W -4(srcPtr), H0
    VEXT $12, T0.B16, ZERO.B16, T0.B16
    VEXT $12, T3.B16, ZERO.B16, T3.B16
    VMOV H0, T0.S[0]
    VMOV H1, T3.S[0]
ld2:
    TBZ $1, srcPtrLen, ld1
    MOVH.W -2(srcPtr), H0
    VEXT $14, T0.B16, ZERO.B16, T0.B16
    VEXT $14, T3.B16, ZERO.B16, T3.B16
    VMOV H0, T0.H[0]
    VMOV H1, T3.H[0]
ld1:
    TBZ $0, srcPtrLen, ld0
    MOVB.W -1(srcPtr), H0
    VEXT $15, T0.B16, ZERO.B16, T0.B16
    VEXT $15, T3.B16, ZERO.B16, T3.B16
    VMOV H0, T0.B[0]
    VMOV H1, T3.B[0]
ld0:
    MOVD ZR, srcPtrLen
    VEOR T0.B16, K0.B16, K0.B16
    VAND T3.B16, K0.B16, K0.B16
    VST1.P [K0.B16], 16(dstPtr)
    mulRoundSingle(K0)

encDone:
    VST1 [ACC0.B16], (tPtr)
    RET

// func gcmSm4Dec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4Dec(SB),NOSPLIT,$0
    MOVD productTable+0(FP), pTbl
    MOVD dst+8(FP), dstPtr
    MOVD src_base+32(FP), srcPtr
    MOVD src_len+40(FP), srcPtrLen
    MOVD ctr+56(FP), ctrPtr
    MOVD T+64(FP), tPtr
    MOVD rk_base+72(FP), rk

    MOVD $0xC2, H1
    LSL $56, H1
    MOVD $1, H0
    VMOV H1, POLY.D[0]
    VMOV H0, POLY.D[1]
    VEOR ZERO.B16, ZERO.B16, ZERO.B16

    MOVD pTbl, pTblSave
    MOVD rk, rkSave
    // Current tag, after AAD
    VLD1 (tPtr), [ACC0.B16]
    VEOR ACC1.B16, ACC1.B16, ACC1.B16
    VEOR ACCM.B16, ACCM.B16, ACCM.B16
    // Prepare initial counter, and the increment vector
    VLD1 (ctrPtr), [CTR.B16]
    VEOR INC.B16, INC.B16, INC.B16
    MOVD $1, H0
    VMOV H0, INC.S[3]
    VREV32 CTR.B16, CTR.B16
    VADD CTR.S4, INC.S4, CTR.S4

    // Skip to <8 blocks loop
    CMP $128, srcPtrLen

    LOAD_SM4_AESNI_CONSTS()

    BLT decNibblesLoop
    // There are at least 8 blocks to decrypt

decOctetsLoop:
    SUB $128, srcPtrLen

    VMOV CTR.B16, B0.B16
    VADD B0.S4, INC.S4, B1.S4
    VADD B1.S4, INC.S4, B2.S4
    VADD B2.S4, INC.S4, B3.S4
    VADD B3.S4, INC.S4, B4.S4
    VADD B4.S4, INC.S4, B5.S4
    VADD B5.S4, INC.S4, B6.S4
    VADD B6.S4, INC.S4, B7.S4
    VADD B7.S4, INC.S4, CTR.S4

    // encrypt first 4 counter blocks
    PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
    EOR R13, R13
    MOVD rkSave, rk

decOctetsEnc4Blocks1:
    SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)

    ADD $1, R13
    CMP $8, R13
    BNE decOctetsEnc4Blocks1
    VREV32 B0.B16, T1.B16
    VREV32 B1.B16, T2.B16
    VREV32 B2.B16, B2.B16
    VREV32 B3.B16, B3.B16
    TRANSPOSE_MATRIX(T1, T2, B2, B3, K0)

    // encrypt second 4 counter blocks
    PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0)
    MOVD rkSave, rk

decOctetsEnc4Blocks2:
    SM4_ROUND(rk, K0, K1, K2, B4, B5, B6, B7)
    SM4_ROUND(rk, K0, K1, K2, B5, B6, B7, B4)
    SM4_ROUND(rk, K0, K1, K2, B6, B7, B4, B5)
    SM4_ROUND(rk, K0, K1, K2, B7, B4, B5, B6)

    ADD $1, R13
    CMP $16, R13
    BNE decOctetsEnc4Blocks2
    VREV32 B4.B16, B4.B16
    VREV32 B5.B16, B5.B16
    VREV32 B6.B16, B6.B16
    VREV32 B7.B16, B7.B16
    TRANSPOSE_MATRIX(B4, B5, B6, B7, K0)

    VLD1.P 32(srcPtr), [B0.B16, B1.B16]
    VEOR B0.B16, T1.B16, T1.B16
    VEOR B1.B16, T2.B16, T2.B16
    VST1.P [T1.B16, T2.B16], 32(dstPtr)

    VLD1.P 32(pTbl), [T1.B16, T2.B16]
    VREV64 B0.B16, B0.B16
    VEOR ACC0.B16, B0.B16, B0.B16
    VEXT $8, B0.B16, B0.B16, T0.B16
    VEOR B0.B16, T0.B16, T0.B16
    VPMULL B0.D1, T1.D1, ACC1.Q1
    VPMULL2 B0.D2, T1.D2, ACC0.Q1
    VPMULL T0.D1, T2.D1, ACCM.Q1
    mulRound(B1)

    VLD1.P 32(srcPtr), [B0.B16, B1.B16]
    VEOR B2.B16, B0.B16, T1.B16
    VEOR B3.B16, B1.B16, T2.B16
    VST1.P [T1.B16, T2.B16], 32(dstPtr)
    mulRound(B0)
    mulRound(B1)

    VLD1.P 32(srcPtr), [B0.B16, B1.B16]
    VEOR B4.B16, B0.B16, T1.B16
    VEOR B5.B16, B1.B16, T2.B16
    VST1.P [T1.B16, T2.B16], 32(dstPtr)
    mulRound(B0)
    mulRound(B1)

    VLD1.P 32(srcPtr), [B0.B16, B1.B16]
    VEOR B6.B16, B0.B16, T1.B16
    VEOR B7.B16, B1.B16, T2.B16
    VST1.P [T1.B16, T2.B16], 32(dstPtr)
    mulRound(B0)
    mulRound(B1)

    MOVD pTblSave, pTbl
    reduce()

    CMP $128, srcPtrLen
    BGE decOctetsLoop

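    // Fewer than 8 blocks remain. Keystream blocks are still produced by
    // encrypting counter blocks; GHASH, however, is taken over the incoming
    // ciphertext, which is byte-reversed and saved before the XOR with the
    // keystream overwrites it.
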
decNibblesLoop:
    CBZ srcPtrLen, decDone
    ADD $14*16, pTbl
    // Preload H and its Karatsuba precomp
    VLD1.P (pTbl), [T1.B16, T2.B16]
    CMP $64, srcPtrLen
    BLT decStartSingles
    SUB $64, srcPtrLen

    // Prepare 4 counters
    VMOV CTR.B16, B0.B16
    VADD B0.S4, INC.S4, B1.S4
    VADD B1.S4, INC.S4, B2.S4
    VADD B2.S4, INC.S4, B3.S4
    VADD B3.S4, INC.S4, CTR.S4

    // encrypt 4 counter blocks
    PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
    EOR R13, R13
    MOVD rkSave, rk

decNibblesEnc4Blocks:
    SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)

    ADD $1, R13
    CMP $8, R13
    BNE decNibblesEnc4Blocks
    VREV32 B0.B16, B0.B16
    VREV32 B1.B16, B1.B16
    VREV32 B2.B16, B2.B16
    VREV32 B3.B16, B3.B16
    TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)

    // XOR ciphertext and store plaintext
    VLD1.P 32(srcPtr), [K1.B16, K2.B16]
    VREV64 K1.B16, B4.B16
    VREV64 K2.B16, B5.B16
    VEOR B0.B16, K1.B16, B0.B16
    VEOR B1.B16, K2.B16, B1.B16
    VST1.P [B0.B16, B1.B16], 32(dstPtr)
    VLD1.P 32(srcPtr), [K1.B16, K2.B16]
    VREV64 K1.B16, B6.B16
    VREV64 K2.B16, B7.B16
    VEOR B2.B16, K1.B16, B2.B16
    VEOR B3.B16, K2.B16, B3.B16
    VST1.P [B2.B16, B3.B16], 32(dstPtr)
    mulRoundSingleWithoutRev(B4)
    mulRoundSingleWithoutRev(B5)
    mulRoundSingleWithoutRev(B6)
    mulRoundSingleWithoutRev(B7)

decStartSingles:
    CBZ srcPtrLen, decDone

    // Prepare 4 counters
    VMOV CTR.B16, B0.B16
    VADD B0.S4, INC.S4, B1.S4
    VADD B1.S4, INC.S4, B2.S4
    VADD B2.S4, INC.S4, B3.S4
    VADD B3.S4, INC.S4, CTR.S4

    // encrypt 4 counter blocks
    PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
    EOR R13, R13
    MOVD rkSave, rk

decSinglesEnc4Blocks:
    SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)

    ADD $1, R13
    CMP $8, R13
    BNE decSinglesEnc4Blocks
    VREV32 B0.B16, B0.B16
    VREV32 B1.B16, B1.B16
    VREV32 B2.B16, B2.B16
    VREV32 B3.B16, B3.B16
    TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)

    VMOV B0.B16, K0.B16
    CMP $16, srcPtrLen
    BLT decTail
    SUB $16, srcPtrLen
    VLD1.P 16(srcPtr), [K1.B16]
    VREV64 K1.B16, B5.B16
    VEOR K0.B16, K1.B16, K0.B16
    VST1.P [K0.B16], 16(dstPtr)
    mulRoundSingleWithoutRev(B5)

    VMOV B1.B16, K0.B16
    CMP $16, srcPtrLen
    BLT decTail
    SUB $16, srcPtrLen
    VLD1.P 16(srcPtr), [K1.B16]
    VREV64 K1.B16, B5.B16
    VEOR K0.B16, K1.B16, K0.B16
    VST1.P [K0.B16], 16(dstPtr)
    mulRoundSingleWithoutRev(B5)

    VMOV B2.B16, K0.B16
    CMP $16, srcPtrLen
    BLT decTail
    SUB $16, srcPtrLen
    VLD1.P 16(srcPtr), [K1.B16]
    VREV64 K1.B16, B5.B16
    VEOR K0.B16, K1.B16, K0.B16
    VST1.P [K0.B16], 16(dstPtr)
    mulRoundSingleWithoutRev(B5)

    VMOV B3.B16, K0.B16
    CMP $16, srcPtrLen
    BLT decTail
    SUB $16, srcPtrLen
    VLD1.P 16(srcPtr), [K1.B16]
    VREV64 K1.B16, B5.B16
    VEOR K0.B16, K1.B16, K0.B16
    VST1.P [K0.B16], 16(dstPtr)
    mulRoundSingleWithoutRev(B5)

decTail:
    CBZ srcPtrLen, decDone
    // Assuming it is safe to load past srcPtr due to the presence of the tag
    VLD1 (srcPtr), [B5.B16]

    VEOR B5.B16, K0.B16, B0.B16

    VEOR T3.B16, T3.B16, T3.B16
    MOVD $0, H1
    SUB $1, H1
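
    // Store the decrypted tail in power-of-two chunks while building an
    // all-ones mask in T3 over the bytes that belong to the message, so that
    // only those ciphertext bytes (still in B5) are folded into the tag.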

    TBZ $3, srcPtrLen, decLd4
    VMOV B0.D[0], H0
    MOVD.P H0, 8(dstPtr)
    VMOV H1, T3.D[0]
    VEXT $8, ZERO.B16, B0.B16, B0.B16

decLd4:
    TBZ $2, srcPtrLen, decLd2
    VMOV B0.S[0], H0
    MOVW.P H0, 4(dstPtr)
    VEXT $12, T3.B16, ZERO.B16, T3.B16
    VMOV H1, T3.S[0]
    VEXT $4, ZERO.B16, B0.B16, B0.B16
decLd2:
    TBZ $1, srcPtrLen, decLd1
    VMOV B0.H[0], H0
    MOVH.P H0, 2(dstPtr)
    VEXT $14, T3.B16, ZERO.B16, T3.B16
    VMOV H1, T3.H[0]
    VEXT $2, ZERO.B16, B0.B16, B0.B16
decLd1:
    TBZ $0, srcPtrLen, decLd0
    VMOV B0.B[0], H0
    MOVB.P H0, 1(dstPtr)
    VEXT $15, T3.B16, ZERO.B16, T3.B16
    VMOV H1, T3.B[0]
decLd0:

    VAND T3.B16, B5.B16, B5.B16
    VREV64 B5.B16, B5.B16

    VEOR ACC0.B16, B5.B16, B5.B16
    VEXT $8, B5.B16, B5.B16, T0.B16
    VEOR B5.B16, T0.B16, T0.B16
    VPMULL B5.D1, T1.D1, ACC1.Q1
    VPMULL2 B5.D2, T1.D2, ACC0.Q1
    VPMULL T0.D1, T2.D1, ACCM.Q1
    reduce()

decDone:
    VST1 [ACC0.B16], (tPtr)
    RET