github.com/emmansun/gmsm@v0.29.1/sm4/gcm_ppc64x.s

// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build (ppc64 || ppc64le) && !purego

// Portions based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

// The implementations for gcmHash, gcmInit and gcmMul are based on the generated asm
// from the script https://github.com/dot-asm/cryptogams/blob/master/ppc/ghashp8-ppc.pl
// from commit d47afb3c.

// Changes were made due to differences in the ABI and some register usage.
// Some arguments were changed due to the way the Go code passes them.

#include "textflag.h"

#define XIP R3
#define HTBL R4
#define INP R5
#define LEN R6

#define XL V0
#define XM V1
#define XH V2
#define IN V3
#define ZERO V4
#define T0 V5
#define T1 V6
#define T2 V7
#define XC2 V8
#define H V9
#define HH V10
#define HL V11
#define LEMASK V12
#define XL1 V13
#define XM1 V14
#define XH1 V15
#define IN1 V16
#define H2 V17
#define H2H V18
#define H2L V19
#define XL3 V20
#define XM2 V21
#define IN2 V22
#define H3L V23
#define H3 V24
#define H3H V25
#define XH3 V26
#define XM3 V27
#define IN3 V28
#define H4L V29
#define H4 V30
#define H4H V31

#define IN0 IN
#define H21L HL
#define H21H HH
#define LOPERM H2L
#define HIPERM H2H

#define VXL VS32
#define VIN VS35
#define VXC2 VS40
#define VH VS41
#define VHH VS42
#define VHL VS43
#define VIN1 VS48
#define VH2 VS49
#define VH2H VS50
#define VH2L VS51

#define VIN2 VS54
#define VH3L VS55
#define VH3 VS56
#define VH3H VS57
#define VIN3 VS60
#define VH4L VS61
#define VH4 VS62
#define VH4H VS63

#define VIN0 VIN

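// GHASH notes: multiplication is in GF(2^128) modulo the polynomial
// x^128 + x^7 + x^2 + x + 1. The 0xc2...01 constant built in gcmInit is the
// bit-reflected, left-shifted encoding of that polynomial consumed by the
// two-phase VPMSUMD reduction used throughout this file. gcmInit stores a
// left-shifted ("twisted") H and its powers, each split into lo/mid/hi
// doublewords for VPMSUMD, at fixed productTable offsets:
//
//	0x00:             reduction constant 0xc2...01
//	0x10, 0x20, 0x30: H.lo,   H,   H.hi
//	0x40, 0x50, 0x60: H^2.lo, H^2, H^2.hi
//	0x70, 0x80, 0x90: H^3.lo, H^3, H^3.hi
//	0xa0, 0xb0, 0xc0: H^4.lo, H^4, H^4.hi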
// func gcmInit(productTable *[256]byte, h []byte)
TEXT ·gcmInit(SB), NOSPLIT, $0-32
	MOVD productTable+0(FP), XIP
	MOVD h+8(FP), HTBL

	MOVD $0x10, R8
	MOVD $0x20, R9
	MOVD $0x30, R10
	LXVD2X (HTBL)(R0), VH // Load H

	VSPLTISB $-16, XC2         // 0xf0
	VSPLTISB $1, T0            // one
	VADDUBM XC2, XC2, XC2      // 0xe0
	VXOR ZERO, ZERO, ZERO
	VOR XC2, T0, XC2           // 0xe1
	VSLDOI $15, XC2, ZERO, XC2 // 0xe1...
	VSLDOI $1, ZERO, T0, T1    // ...1
	VADDUBM XC2, XC2, XC2      // 0xc2...
	VSPLTISB $7, T2
	VOR XC2, T1, XC2           // 0xc2....01
	VSPLTB $0, H, T1           // most significant byte
	VSL H, T0, H               // H<<=1
	VSRAB T1, T2, T1           // broadcast carry bit
	VAND T1, XC2, T1
	VXOR H, T1, IN             // twisted H

	VSLDOI $8, IN, IN, H      // twist even more ...
	VSLDOI $8, ZERO, XC2, XC2 // 0xc2.0
	VSLDOI $8, ZERO, H, HL    // ... and split
	VSLDOI $8, H, ZERO, HH

	STXVD2X VXC2, (XIP+R0) // save pre-computed table
	STXVD2X VHL, (XIP+R8)
	MOVD $0x40, R8
	STXVD2X VH, (XIP+R9)
	MOVD $0x50, R9
	STXVD2X VHH, (XIP+R10)
	MOVD $0x60, R10

	VPMSUMD IN, HL, XL // H.lo·H.lo
	VPMSUMD IN, H, XM  // H.hi·H.lo+H.lo·H.hi
	VPMSUMD IN, HH, XH // H.hi·H.hi

	VPMSUMD XL, XC2, T2 // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VXOR T1, XH, T1
	VXOR XL, T1, IN1

	VSLDOI $8, IN1, IN1, H2
	VSLDOI $8, ZERO, H2, H2L
	VSLDOI $8, H2, ZERO, H2H

	STXVD2X VH2L, (XIP+R8) // save H^2
	MOVD $0x70, R8
	STXVD2X VH2, (XIP+R9)
	MOVD $0x80, R9
	STXVD2X VH2H, (XIP+R10)
	MOVD $0x90, R10

	VPMSUMD IN, H2L, XL   // H.lo·H^2.lo
	VPMSUMD IN1, H2L, XL1 // H^2.lo·H^2.lo
	VPMSUMD IN, H2, XM    // H.hi·H^2.lo+H.lo·H^2.hi
	VPMSUMD IN1, H2, XM1  // H^2.hi·H^2.lo+H^2.lo·H^2.hi
	VPMSUMD IN, H2H, XH   // H.hi·H^2.hi
	VPMSUMD IN1, H2H, XH1 // H^2.hi·H^2.hi

	VPMSUMD XL, XC2, T2  // 1st reduction phase
	VPMSUMD XL1, XC2, HH // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VSLDOI $8, XM1, ZERO, HL
	VSLDOI $8, ZERO, XM1, H
	VXOR XL, T0, XL
	VXOR XH, T1, XH
	VXOR XL1, HL, XL1
	VXOR XH1, H, XH1

	VSLDOI $8, XL, XL, XL
	VSLDOI $8, XL1, XL1, XL1
	VXOR XL, T2, XL
	VXOR XL1, HH, XL1

	VSLDOI $8, XL, XL, T1   // 2nd reduction phase
	VSLDOI $8, XL1, XL1, H  // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VPMSUMD XL1, XC2, XL1
	VXOR T1, XH, T1
	VXOR H, XH1, H
	VXOR XL, T1, XL
	VXOR XL1, H, XL1

	VSLDOI $8, XL, XL, H
	VSLDOI $8, XL1, XL1, H2
	VSLDOI $8, ZERO, H, HL
	VSLDOI $8, H, ZERO, HH
	VSLDOI $8, ZERO, H2, H2L
	VSLDOI $8, H2, ZERO, H2H

	STXVD2X VHL, (XIP+R8) // save H^3
	MOVD $0xa0, R8
	STXVD2X VH, (XIP+R9)
	MOVD $0xb0, R9
	STXVD2X VHH, (XIP+R10)
	MOVD $0xc0, R10
	STXVD2X VH2L, (XIP+R8) // save H^4
	STXVD2X VH2, (XIP+R9)
	STXVD2X VH2H, (XIP+R10)

	RET

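// gcmHash folds len bytes of input into the running digest Xi (output).
// Blocks are 16 bytes. For len >= 64 control jumps to the four-block
// aggregated path at gcm_ghash_p8_4x; otherwise loop_2x consumes two blocks
// per iteration (Xi+0 weighted by H^2, Xi+1 by H) so only one reduction is
// needed per pair, and a final odd block is handled at short. On
// little-endian targets each block is byte-reversed with VPERM/LEMASK,
// since GHASH is defined on big-endian data.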
// func gcmHash(output []byte, productTable *[256]byte, inp []byte, len int)
TEXT ·gcmHash(SB), NOSPLIT, $0-64
	MOVD output+0(FP), XIP
	MOVD productTable+24(FP), HTBL
	MOVD inp+32(FP), INP
	MOVD len+56(FP), LEN

	MOVD $0x10, R8
	MOVD $0x20, R9
	MOVD $0x30, R10
	LXVD2X (XIP)(R0), VXL // load Xi

	LXVD2X (HTBL)(R8), VHL // load pre-computed table
	MOVD $0x40, R8
	LXVD2X (HTBL)(R9), VH
	MOVD $0x50, R9
	LXVD2X (HTBL)(R10), VHH
	MOVD $0x60, R10
	LXVD2X (HTBL)(R0), VXC2
#ifdef GOARCH_ppc64le
	LVSL (R0)(R0), LEMASK
	VSPLTISB $0x07, T0
	VXOR LEMASK, T0, LEMASK
	VPERM XL, XL, LEMASK, XL
#endif
	VXOR ZERO, ZERO, ZERO

	CMPU LEN, $64
	BGE gcm_ghash_p8_4x

	LXVD2X (INP)(R0), VIN
	ADD $16, INP, INP
	SUBCCC $16, LEN, LEN
#ifdef GOARCH_ppc64le
	VPERM IN, IN, LEMASK, IN
#endif
	VXOR IN, XL, IN
	BEQ short

	LXVD2X (HTBL)(R8), VH2L // load H^2
	MOVD $16, R8
	LXVD2X (HTBL)(R9), VH2
	ADD LEN, INP, R9 // end of input
	LXVD2X (HTBL)(R10), VH2H

loop_2x:
	LXVD2X (INP)(R0), VIN1
#ifdef GOARCH_ppc64le
	VPERM IN1, IN1, LEMASK, IN1
#endif

	SUBC $32, LEN, LEN
	VPMSUMD IN, H2L, XL   // H^2.lo·Xi.lo
	VPMSUMD IN1, HL, XL1  // H.lo·Xi+1.lo
	SUBE R11, R11, R11    // borrow?-1:0
	VPMSUMD IN, H2, XM    // H^2.hi·Xi.lo+H^2.lo·Xi.hi
	VPMSUMD IN1, H, XM1   // H.hi·Xi+1.lo+H.lo·Xi+1.hi
	AND LEN, R11, R11
	VPMSUMD IN, H2H, XH   // H^2.hi·Xi.hi
	VPMSUMD IN1, HH, XH1  // H.hi·Xi+1.hi
	ADD R11, INP, INP

	VXOR XL, XL1, XL
	VXOR XM, XM1, XM

	VPMSUMD XL, XC2, T2 // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XH, XH1, XH
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL
	LXVD2X (INP)(R8), VIN
	ADD $32, INP, INP

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
#ifdef GOARCH_ppc64le
	VPERM IN, IN, LEMASK, IN
#endif
	VXOR T1, XH, T1
	VXOR IN, T1, IN
	VXOR IN, XL, IN
	CMP R9, INP
	BGT loop_2x // done yet?

	CMPWU LEN, $0
	BNE even

short:
	VPMSUMD IN, HL, XL // H.lo·Xi.lo
	VPMSUMD IN, H, XM  // H.hi·Xi.lo+H.lo·Xi.hi
	VPMSUMD IN, HH, XH // H.hi·Xi.hi

	VPMSUMD XL, XC2, T2 // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VXOR T1, XH, T1

even:
	VXOR XL, T1, XL
#ifdef GOARCH_ppc64le
	VPERM XL, XL, LEMASK, XL
#endif
	STXVD2X VXL, (XIP+R0)

	OR R12, R12, R12 // write out Xi
	RET

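// Four-block aggregation: with one two-phase reduction per iteration the
// loop below computes
//
//	Xi = ((Xi+in0)·H^4 + in1·H^3 + in2·H^2 + in3·H) mod p(x)
//
// where "+" is XOR. HIPERM/LOPERM interleave the halves of H and H^2 into
// H21H/H21L so a single VPMSUMD yields two partial products (the in2·H^2
// and in3·H terms) at once.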
gcm_ghash_p8_4x:
	LVSL (R8)(R0), T0 // 0x0001..0e0f
	MOVD $0x70, R8
	LXVD2X (HTBL)(R9), VH2
	MOVD $0x80, R9
	VSPLTISB $8, T1 // 0x0808..0808
	MOVD $0x90, R10
	LXVD2X (HTBL)(R8), VH3L // load H^3
	MOVD $0xa0, R8
	LXVD2X (HTBL)(R9), VH3
	MOVD $0xb0, R9
	LXVD2X (HTBL)(R10), VH3H
	MOVD $0xc0, R10
	LXVD2X (HTBL)(R8), VH4L // load H^4
	MOVD $0x10, R8
	LXVD2X (HTBL)(R9), VH4
	MOVD $0x20, R9
	LXVD2X (HTBL)(R10), VH4H
	MOVD $0x30, R10

	VSLDOI $8, ZERO, T1, T2    // 0x0000..0808
	VADDUBM T0, T2, HIPERM     // 0x0001..1617
	VADDUBM T1, HIPERM, LOPERM // 0x0809..1e1f

	SRD $4, LEN, LEN // this allows to use sign bit as carry

	LXVD2X (INP)(R0), VIN0 // load input
	LXVD2X (INP)(R8), VIN1
	SUBCCC $8, LEN, LEN
	LXVD2X (INP)(R9), VIN2
	LXVD2X (INP)(R10), VIN3
	ADD $0x40, INP, INP
#ifdef GOARCH_ppc64le
	VPERM IN0, IN0, LEMASK, IN0
	VPERM IN1, IN1, LEMASK, IN1
	VPERM IN2, IN2, LEMASK, IN2
	VPERM IN3, IN3, LEMASK, IN3
#endif

	VXOR IN0, XL, XH

	VPMSUMD IN1, H3L, XL1
	VPMSUMD IN1, H3, XM1
	VPMSUMD IN1, H3H, XH1

	VPERM H2, H, HIPERM, H21L
	VPERM IN2, IN3, LOPERM, T0
	VPERM H2, H, LOPERM, H21H
	VPERM IN2, IN3, HIPERM, T1
	VPMSUMD IN2, H2, XM2  // H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
	VPMSUMD T0, H21L, XL3 // H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
	VPMSUMD IN3, H, XM3   // H.hi·Xi+3.lo +H.lo·Xi+3.hi
	VPMSUMD T1, H21H, XH3 // H^2.hi·Xi+2.hi+H.hi·Xi+3.hi

	VXOR XM2, XM1, XM2
	VXOR XL3, XL1, XL3
	VXOR XM3, XM2, XM3
	VXOR XH3, XH1, XH3

	BLT tail_4x

loop_4x:
	LXVD2X (INP)(R0), VIN0
	LXVD2X (INP)(R8), VIN1
	SUBCCC $4, LEN, LEN
	LXVD2X (INP)(R9), VIN2
	LXVD2X (INP)(R10), VIN3
	ADD $0x40, INP, INP
#ifdef GOARCH_ppc64le
	VPERM IN1, IN1, LEMASK, IN1
	VPERM IN2, IN2, LEMASK, IN2
	VPERM IN3, IN3, LEMASK, IN3
	VPERM IN0, IN0, LEMASK, IN0
#endif

	VPMSUMD XH, H4L, XL // H^4.lo·Xi.lo
	VPMSUMD XH, H4, XM  // H^4.hi·Xi.lo+H^4.lo·Xi.hi
	VPMSUMD XH, H4H, XH // H^4.hi·Xi.hi
	VPMSUMD IN1, H3L, XL1
	VPMSUMD IN1, H3, XM1
	VPMSUMD IN1, H3H, XH1

	VXOR XL, XL3, XL
	VXOR XM, XM3, XM
	VXOR XH, XH3, XH
	VPERM IN2, IN3, LOPERM, T0
	VPERM IN2, IN3, HIPERM, T1

	VPMSUMD XL, XC2, T2   // 1st reduction phase
	VPMSUMD T0, H21L, XL3 // H.lo·Xi+3.lo +H^2.lo·Xi+2.lo
	VPMSUMD T1, H21H, XH3 // H.hi·Xi+3.hi +H^2.hi·Xi+2.hi

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD IN2, H2, XM2  // H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
	VPMSUMD IN3, H, XM3   // H.hi·Xi+3.lo +H.lo·Xi+3.hi
	VPMSUMD XL, XC2, XL

	VXOR XL3, XL1, XL3
	VXOR XH3, XH1, XH3
	VXOR XH, IN0, XH
	VXOR XM2, XM1, XM2
	VXOR XH, T1, XH
	VXOR XM3, XM2, XM3
	VXOR XH, XL, XH
	BGE loop_4x

tail_4x:
	VPMSUMD XH, H4L, XL // H^4.lo·Xi.lo
	VPMSUMD XH, H4, XM  // H^4.hi·Xi.lo+H^4.lo·Xi.hi
	VPMSUMD XH, H4H, XH // H^4.hi·Xi.hi

	VXOR XL, XL3, XL
	VXOR XM, XM3, XM

	VPMSUMD XL, XC2, T2 // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XH, XH3, XH
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VXOR T1, XH, T1
	VXOR XL, T1, XL

	ADDCCC $4, LEN, LEN
	BEQ done_4x

	LXVD2X (INP)(R0), VIN0
	CMPU LEN, $2
	MOVD $-4, LEN
	BLT one
	LXVD2X (INP)(R8), VIN1
	BEQ two

three:
	LXVD2X (INP)(R9), VIN2
#ifdef GOARCH_ppc64le
	VPERM IN0, IN0, LEMASK, IN0
	VPERM IN1, IN1, LEMASK, IN1
	VPERM IN2, IN2, LEMASK, IN2
#endif

	VXOR IN0, XL, XH
	VOR H3L, H3L, H4L
	VOR H3, H3, H4
	VOR H3H, H3H, H4H

	VPERM IN1, IN2, LOPERM, T0
	VPERM IN1, IN2, HIPERM, T1
	VPMSUMD IN1, H2, XM2  // H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
	VPMSUMD IN2, H, XM3   // H.hi·Xi+2.lo +H.lo·Xi+2.hi
	VPMSUMD T0, H21L, XL3 // H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
	VPMSUMD T1, H21H, XH3 // H^2.hi·Xi+1.hi+H.hi·Xi+2.hi

	VXOR XM3, XM2, XM3
	JMP tail_4x

two:
#ifdef GOARCH_ppc64le
	VPERM IN0, IN0, LEMASK, IN0
	VPERM IN1, IN1, LEMASK, IN1
#endif

	VXOR IN, XL, XH
	VPERM ZERO, IN1, LOPERM, T0
	VPERM ZERO, IN1, HIPERM, T1

	VSLDOI $8, ZERO, H2, H4L
	VOR H2, H2, H4
	VSLDOI $8, H2, ZERO, H4H

	VPMSUMD T0, H21L, XL3 // H.lo·Xi+1.lo
	VPMSUMD IN1, H, XM3   // H.hi·Xi+1.lo+H.lo·Xi+1.hi
	VPMSUMD T1, H21H, XH3 // H.hi·Xi+1.hi

	JMP tail_4x

one:
#ifdef GOARCH_ppc64le
	VPERM IN0, IN0, LEMASK, IN0
#endif

	VSLDOI $8, ZERO, H, H4L
	VOR H, H, H4
	VSLDOI $8, H, ZERO, H4H

	VXOR IN0, XL, XH
	VXOR XL3, XL3, XL3
	VXOR XM3, XM3, XM3
	VXOR XH3, XH3, XH3

	JMP tail_4x

done_4x:
#ifdef GOARCH_ppc64le
	VPERM XL, XL, LEMASK, XL
#endif
	STXVD2X VXL, (XIP+R0) // write out Xi
	RET

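// gcmMul performs a single GHASH multiplication, Xi = Xi·H. Splitting
// X = X.hi·2^64 + X.lo (and H likewise), the 256-bit carryless product is
// assembled from three VPMSUMD results:
//
//	XL = X.lo·H.lo
//	XM = X.hi·H.lo + X.lo·H.hi (middle terms)
//	XH = X.hi·H.hi
//
// XM is then folded into XL and XH with VSLDOI, and the result is reduced
// to 128 bits by the same two VPMSUMD-by-XC2 phases used above.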
// func gcmMul(output []byte, productTable *[256]byte)
TEXT ·gcmMul(SB), NOSPLIT, $0-32
	MOVD output+0(FP), XIP
	MOVD productTable+24(FP), HTBL

	MOVD $0x10, R8
	MOVD $0x20, R9
	MOVD $0x30, R10
	LXVD2X (XIP)(R0), VIN // load Xi

	LXVD2X (HTBL)(R8), VHL // Load pre-computed table
	LXVD2X (HTBL)(R9), VH
	LXVD2X (HTBL)(R10), VHH
	LXVD2X (HTBL)(R0), VXC2
#ifdef GOARCH_ppc64le
	LVSL (R0)(R0), LEMASK // set up the byte-swap mask, as in gcmHash
	VSPLTISB $0x07, T0
	VXOR LEMASK, T0, LEMASK
	VPERM IN, IN, LEMASK, IN
#endif
	VXOR ZERO, ZERO, ZERO

	VPMSUMD IN, HL, XL // H.lo·Xi.lo
	VPMSUMD IN, H, XM  // H.hi·Xi.lo+H.lo·Xi.hi
	VPMSUMD IN, HH, XH // H.hi·Xi.hi

	VPMSUMD XL, XC2, T2 // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VXOR T1, XH, T1
	VXOR XL, T1, XL

#ifdef GOARCH_ppc64le
	VPERM XL, XL, LEMASK, XL
#endif
	STXVD2X VXL, (XIP+R0) // write out Xi
	RET