// Source path: github.com/code-reading/golang@v0.0.0-20220303082512-ba5bc0e589a3/go/src/crypto/aes/gcm_ppc64le.s

// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

// This implementation is based on the ppc64 asm generated by the
// script https://github.com/dot-asm/cryptogams/blob/master/ppc/ghashp8-ppc.pl
// from commit d47afb3c.

// Changes were made due to differences in the ABI and some register usage.
// Some arguments were changed due to the way the Go code passes them.

#include "textflag.h"

// General-purpose register aliases.
#define XIP R3
#define HTBL R4
#define INP R5
#define LEN R6

// Vector register aliases (V0-V31).
#define XL V0
#define XM V1
#define XH V2
#define IN V3
#define ZERO V4
#define T0 V5
#define T1 V6
#define T2 V7
#define XC2 V8
#define H V9
#define HH V10
#define HL V11
#define LEMASK V12
#define XL1 V13
#define XM1 V14
#define XH1 V15
#define IN1 V16
#define H2 V17
#define H2H V18
#define H2L V19
#define XL3 V20
#define XM2 V21
#define IN2 V22
#define H3L V23
#define H3 V24
#define H3H V25
#define XH3 V26
#define XM3 V27
#define IN3 V28
#define H4L V29
#define H4 V30
#define H4H V31

// Second names for registers already named above. H21L/H21H share HL/HH
// and LOPERM/HIPERM share H2L/H2H; the 4x path of gcmHash writes to these
// names, so HL/HH/H2L/H2H no longer hold their table values there.
#define IN0 IN
#define H21L HL
#define H21H HH
#define LOPERM H2L
#define HIPERM H2H

// VSX-numbered names. Each one is VS(32+n) for the Vn alias with the same
// base name above (for example XL is V0 and VXL is VS32). These are the
// names passed to LXVD2X/STXVD2X below.
#define VXL VS32
#define VIN VS35
#define VXC2 VS40
#define VH VS41
#define VHH VS42
#define VHL VS43
#define VIN1 VS48
#define VH2 VS49
#define VH2H VS50
#define VH2L VS51

#define VIN2 VS54
#define VH3L VS55
#define VH3 VS56
#define VH3H VS57
#define VIN3 VS60
#define VH4L VS61
#define VH4 VS62
#define VH4H VS63

#define VIN0 VIN

// func gcmInit(productTable *[256]byte, h []byte)
//
// gcmInit loads one 16-byte vector from the data pointer of h and stores
// thirteen 16-byte table entries at productTable:
//
//	0x00              XC2 (the 0xc2.0 constant built below)
//	0x10, 0x20, 0x30  HL, H, HH    (twisted H, split)
//	0x40, 0x50, 0x60  H2L, H2, H2H (H^2)
//	0x70, 0x80, 0x90  H^3 (held in HL, H, HH when stored)
//	0xa0, 0xb0, 0xc0  H^4 (held in H2L, H2, H2H when stored)
//
// R8, R9 and R10 carry the store offsets and are re-loaded between stores.
TEXT ·gcmInit(SB), NOSPLIT, $0-32
	MOVD productTable+0(FP), XIP
	MOVD h+8(FP), HTBL // data pointer of the h slice

	MOVD $0x10, R8
	MOVD $0x20, R9
	MOVD $0x30, R10
	LXVD2X (HTBL)(R0), VH // Load H

	// Build the XC2 constant and the twisted form of H.
	VSPLTISB $-16, XC2 // 0xf0
	VSPLTISB $1, T0 // one
	VADDUBM XC2, XC2, XC2 // 0xe0
	VXOR ZERO, ZERO, ZERO
	VOR XC2, T0, XC2 // 0xe1
	VSLDOI $15, XC2, ZERO, XC2 // 0xe1...
	VSLDOI $1, ZERO, T0, T1 // ...1
	VADDUBM XC2, XC2, XC2 // 0xc2...
	VSPLTISB $7, T2
	VOR XC2, T1, XC2 // 0xc2....01
	VSPLTB $0, H, T1 // most significant byte
	VSL H, T0, H // H<<=1
	VSRAB T1, T2, T1 // broadcast carry bit
	VAND T1, XC2, T1
	VXOR H, T1, IN // twisted H

	VSLDOI $8, IN, IN, H // twist even more ...
	VSLDOI $8, ZERO, XC2, XC2 // 0xc2.0
	VSLDOI $8, ZERO, H, HL // ... and split
	VSLDOI $8, H, ZERO, HH

	// Table entries 0x00-0x30; offsets are advanced to 0x40-0x60 for H^2.
	STXVD2X VXC2, (XIP+R0) // save pre-computed table
	STXVD2X VHL, (XIP+R8)
	MOVD $0x40, R8
	STXVD2X VH, (XIP+R9)
	MOVD $0x50, R9
	STXVD2X VHH, (XIP+R10)
	MOVD $0x60, R10

	// Square H: multiply IN (twisted H) by the split H, then reduce.
	VPMSUMD IN, HL, XL // H.lo·H.lo
	VPMSUMD IN, H, XM // H.hi·H.lo+H.lo·H.hi
	VPMSUMD IN, HH, XH // H.hi·H.hi

	VPMSUMD XL, XC2, T2 // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VXOR T1, XH, T1
	VXOR XL, T1, IN1 // IN1 = reduced product

	VSLDOI $8, IN1, IN1, H2
	VSLDOI $8, ZERO, H2, H2L
	VSLDOI $8, H2, ZERO, H2H

	// Table entries 0x40-0x60; offsets are advanced to 0x70-0x90 for H^3.
	STXVD2X VH2L, (XIP+R8) // save H^2
	MOVD $0x70, R8
	STXVD2X VH2, (XIP+R9)
	MOVD $0x80, R9
	STXVD2X VH2H, (XIP+R10)
	MOVD $0x90, R10

	// Two products computed side by side: IN·H^2 and IN1·H^2.
	VPMSUMD IN, H2L, XL // H.lo·H^2.lo
	VPMSUMD IN1, H2L, XL1 // H^2.lo·H^2.lo
	VPMSUMD IN, H2, XM // H.hi·H^2.lo+H.lo·H^2.hi
	VPMSUMD IN1, H2, XM1 // H^2.hi·H^2.lo+H^2.lo·H^2.hi
	VPMSUMD IN, H2H, XH // H.hi·H^2.hi
	VPMSUMD IN1, H2H, XH1 // H^2.hi·H^2.hi

	// From here HH, HL and H are reused as temporaries for the second product.
	VPMSUMD XL, XC2, T2 // 1st reduction phase
	VPMSUMD XL1, XC2, HH // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VSLDOI $8, XM1, ZERO, HL
	VSLDOI $8, ZERO, XM1, H
	VXOR XL, T0, XL
	VXOR XH, T1, XH
	VXOR XL1, HL, XL1
	VXOR XH1, H, XH1

	VSLDOI $8, XL, XL, XL
	VSLDOI $8, XL1, XL1, XL1
	VXOR XL, T2, XL
	VXOR XL1, HH, XL1

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VSLDOI $8, XL1, XL1, H // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VPMSUMD XL1, XC2, XL1
	VXOR T1, XH, T1
	VXOR H, XH1, H
	VXOR XL, T1, XL
	VXOR XL1, H, XL1

	// Split both results: the first into H/HL/HH, the second into H2/H2L/H2H.
	VSLDOI $8, XL, XL, H
	VSLDOI $8, XL1, XL1, H2
	VSLDOI $8, ZERO, H, HL
	VSLDOI $8, H, ZERO, HH
	VSLDOI $8, ZERO, H2, H2L
	VSLDOI $8, H2, ZERO, H2H

	// Table entries 0x70-0x90 (H^3) and 0xa0-0xc0 (H^4).
	STXVD2X VHL, (XIP+R8) // save H^3
	MOVD $0xa0, R8
	STXVD2X VH, (XIP+R9)
	MOVD $0xb0, R9
	STXVD2X VHH, (XIP+R10)
	MOVD $0xc0, R10
	STXVD2X VH2L, (XIP+R8) // save H^4
	STXVD2X VH2, (XIP+R9)
	STXVD2X VH2H, (XIP+R10)

	RET

// func gcmHash(output []byte, productTable *[256]byte, inp []byte, len int)
//
// gcmHash loads the 16-byte Xi from output, folds the data at inp into it
// using the table entries read from productTable (same offsets as written
// by gcmInit), and stores the updated Xi back to output.
//
// len is handled as a byte count: it is compared against 64 to select the
// 4x path, decremented by 16 and 32 on the short path, and shifted right
// by 4 (a count of 16-byte units) on the 4x path.
// NOTE(review): every input load is a full 16 bytes and nothing here deals
// with a partial block or with len == 0, so len is presumably a non-zero
// multiple of 16 — confirm against the Go caller.
TEXT ·gcmHash(SB), NOSPLIT, $0-64
	MOVD output+0(FP), XIP
	MOVD productTable+24(FP), HTBL
	MOVD inp+32(FP), INP
	MOVD len+56(FP), LEN

	MOVD $0x10, R8
	MOVD $0x20, R9
	MOVD $0x30, R10
	LXVD2X (XIP)(R0), VXL // load Xi

	// Table loads are interleaved with building LEMASK, which is the
	// LVSL result XORed with 0x07 in every byte. LEMASK is the VPERM
	// control applied to Xi and to every input block.
	LXVD2X (HTBL)(R8), VHL // load pre-computed table
	MOVD $0x40, R8
	LVSL (R0)(R0), LEMASK
	LXVD2X (HTBL)(R9), VH
	MOVD $0x50, R9
	VSPLTISB $0x07, T0
	LXVD2X (HTBL)(R10), VHH
	MOVD $0x60, R10
	VXOR LEMASK, T0, LEMASK
	LXVD2X (HTBL)(R0), VXC2
	VPERM XL, XL, LEMASK, XL
	VXOR ZERO, ZERO, ZERO

	CMPU LEN, $64
	BGE gcm_ghash_p8_4x // 64 bytes or more: 4-blocks-at-a-time path

	// Short path: take the first block and XOR it with Xi.
	LXVD2X (INP)(R0), VIN
	ADD $16, INP, INP
	SUBCCC $16, LEN, LEN
	VPERM IN, IN, LEMASK, IN
	VXOR IN, XL, IN
	BEQ short // LEN reached zero: exactly one block

	LXVD2X (HTBL)(R8), VH2L // load H^2
	MOVD $16, R8
	LXVD2X (HTBL)(R9), VH2
	ADD LEN, INP, R9 // end of input
	LXVD2X (HTBL)(R10), VH2H

loop_2x:
	LXVD2X (INP)(R0), VIN1
	VPERM IN1, IN1, LEMASK, IN1

	// The scalar instructions SUBC/SUBE/AND/ADD form one sequence spread
	// between the vector multiplies: R11 becomes LEN (after the
	// subtraction) when the subtraction borrowed and 0 otherwise, and is
	// then added to INP.
	SUBC $32, LEN, LEN
	VPMSUMD IN, H2L, XL // H^2.lo·Xi.lo
	VPMSUMD IN1, HL, XL1 // H.lo·Xi+1.lo
	SUBE R11, R11, R11 // borrow?-1:0
	VPMSUMD IN, H2, XM // H^2.hi·Xi.lo+H^2.lo·Xi.hi
	VPMSUMD IN1, H, XM1 // H.hi·Xi+1.lo+H.lo·Xi+1.hi
	AND LEN, R11, R11
	VPMSUMD IN, H2H, XH // H^2.hi·Xi.hi
	VPMSUMD IN1, HH, XH1 // H.hi·Xi+1.hi
	ADD R11, INP, INP

	VXOR XL, XL1, XL
	VXOR XM, XM1, XM

	VPMSUMD XL, XC2, T2 // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XH, XH1, XH
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL
	LXVD2X (INP)(R8), VIN // next block, read from INP+16 (R8 is 16 here)
	ADD $32, INP, INP

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VPERM IN, IN, LEMASK, IN
	VXOR T1, XH, T1
	VXOR IN, T1, IN
	VXOR IN, XL, IN
	CMP R9, INP
	BGT loop_2x // done yet?

	CMPWU LEN, $0
	BNE even // LEN non-zero: skip the single-block multiply below

short:
	// Single multiply of IN by H followed by the two-phase reduction.
	VPMSUMD IN, HL, XL // H.lo·Xi.lo
	VPMSUMD IN, H, XM // H.hi·Xi.lo+H.lo·Xi.hi
	VPMSUMD IN, HH, XH // H.hi·Xi.hi

	VPMSUMD XL, XC2, T2 // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VXOR T1, XH, T1

even:
	// Final XOR of the reduction; uses XL and T1 left by whichever path
	// arrived here (loop_2x or short).
	VXOR XL, T1, XL
	VPERM XL, XL, LEMASK, XL
	STXVD2X VXL, (XIP+R0) // write out Xi

	// NOTE(review): ORs R12 with itself into R12, so the register value is
	// unchanged; the purpose is not evident from this file.
	OR R12, R12, R12
	RET

gcm_ghash_p8_4x:
	// On entry R8/R9/R10 are 0x40/0x50/0x60 (set above). Only H2 is read
	// from the H^2 group; the H2L/H2H registers are used as LOPERM/HIPERM
	// in this path.
	LVSL (R8)(R0), T0 // 0x0001..0e0f
	MOVD $0x70, R8
	LXVD2X (HTBL)(R9), VH2
	MOVD $0x80, R9
	VSPLTISB $8, T1 // 0x0808..0808
	MOVD $0x90, R10
	LXVD2X (HTBL)(R8), VH3L // load H^3
	MOVD $0xa0, R8
	LXVD2X (HTBL)(R9), VH3
	MOVD $0xb0, R9
	LXVD2X (HTBL)(R10), VH3H
	MOVD $0xc0, R10
	LXVD2X (HTBL)(R8), VH4L // load H^4
	MOVD $0x10, R8
	LXVD2X (HTBL)(R9), VH4
	MOVD $0x20, R9
	LXVD2X (HTBL)(R10), VH4H
	MOVD $0x30, R10 // R8/R9/R10 are now the input offsets 0x10/0x20/0x30

	VSLDOI $8, ZERO, T1, T2 // 0x0000..0808
	VADDUBM T0, T2, HIPERM // 0x0001..1617
	VADDUBM T1, HIPERM, LOPERM // 0x0809..1e1f

	SRD $4, LEN, LEN // this allows to use sign bit as carry

	LXVD2X (INP)(R0), VIN0 // load input
	LXVD2X (INP)(R8), VIN1
	SUBCCC $8, LEN, LEN // condition is consumed by BLT tail_4x below
	LXVD2X (INP)(R9), VIN2
	LXVD2X (INP)(R10), VIN3
	ADD $0x40, INP, INP
	VPERM IN0, IN0, LEMASK, IN0
	VPERM IN1, IN1, LEMASK, IN1
	VPERM IN2, IN2, LEMASK, IN2
	VPERM IN3, IN3, LEMASK, IN3

	VXOR IN0, XL, XH // XH = first block XOR Xi

	VPMSUMD IN1, H3L, XL1
	VPMSUMD IN1, H3, XM1
	VPMSUMD IN1, H3H, XH1

	// H21L/H21H (the HL/HH registers) are overwritten here with
	// permutations of H2 and H; T0/T1 get the matching permutations of
	// IN2 and IN3.
	VPERM H2, H, HIPERM, H21L
	VPERM IN2, IN3, LOPERM, T0
	VPERM H2, H, LOPERM, H21H
	VPERM IN2, IN3, HIPERM, T1
	VPMSUMD IN2, H2, XM2 // H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
	VPMSUMD T0, H21L, XL3 // H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
	VPMSUMD IN3, H, XM3 // H.hi·Xi+3.lo +H.lo·Xi+3.hi
	VPMSUMD T1, H21H, XH3 // H^2.hi·Xi+2.hi+H.hi·Xi+3.hi

	VXOR XM2, XM1, XM2
	VXOR XL3, XL1, XL3
	VXOR XM3, XM2, XM3
	VXOR XH3, XH1, XH3

	BLT tail_4x // condition from SUBCCC $8 above; nothing in between has a CC suffix

loop_4x:
	LXVD2X (INP)(R0), VIN0
	LXVD2X (INP)(R8), VIN1
	SUBCCC $4, LEN, LEN // condition is consumed by BGE loop_4x at the bottom
	LXVD2X (INP)(R9), VIN2
	LXVD2X (INP)(R10), VIN3
	ADD $0x40, INP, INP
	VPERM IN1, IN1, LEMASK, IN1
	VPERM IN2, IN2, LEMASK, IN2
	VPERM IN3, IN3, LEMASK, IN3
	VPERM IN0, IN0, LEMASK, IN0

	VPMSUMD XH, H4L, XL // H^4.lo·Xi.lo
	VPMSUMD XH, H4, XM // H^4.hi·Xi.lo+H^4.lo·Xi.hi
	VPMSUMD XH, H4H, XH // H^4.hi·Xi.hi
	VPMSUMD IN1, H3L, XL1
	VPMSUMD IN1, H3, XM1
	VPMSUMD IN1, H3H, XH1

	// Fold in the XL3/XM3/XH3 partial products from the previous group.
	VXOR XL, XL3, XL
	VXOR XM, XM3, XM
	VXOR XH, XH3, XH
	VPERM IN2, IN3, LOPERM, T0
	VPERM IN2, IN3, HIPERM, T1

	VPMSUMD XL, XC2, T2 // 1st reduction phase
	VPMSUMD T0, H21L, XL3 // H.lo·Xi+3.lo +H^2.lo·Xi+2.lo
	VPMSUMD T1, H21H, XH3 // H.hi·Xi+3.hi +H^2.hi·Xi+2.hi

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD IN2, H2, XM2 // H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
	VPMSUMD IN3, H, XM3 // H.hi·Xi+3.lo +H.lo·Xi+3.hi
	VPMSUMD XL, XC2, XL

	VXOR XL3, XL1, XL3
	VXOR XH3, XH1, XH3
	VXOR XH, IN0, XH
	VXOR XM2, XM1, XM2
	VXOR XH, T1, XH
	VXOR XM3, XM2, XM3
	VXOR XH, XL, XH
	BGE loop_4x

tail_4x:
	// Multiply XH by whatever the H4L/H4/H4H registers hold (H^4 when
	// coming from the main path; the one/two/three paths below overwrite
	// them first), add the pending XL3/XM3/XH3 terms and reduce into XL.
	VPMSUMD XH, H4L, XL // H^4.lo·Xi.lo
	VPMSUMD XH, H4, XM // H^4.hi·Xi.lo+H^4.lo·Xi.hi
	VPMSUMD XH, H4H, XH // H^4.hi·Xi.hi

	VXOR XL, XL3, XL
	VXOR XM, XM3, XM

	VPMSUMD XL, XC2, T2 // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XH, XH3, XH
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VXOR T1, XH, T1
	VXOR XL, T1, XL

	ADDCCC $4, LEN, LEN
	BEQ done_4x // LEN+4 == 0: nothing left

	LXVD2X (INP)(R0), VIN0
	CMPU LEN, $2
	MOVD $-4, LEN // the next pass through tail_4x then gets LEN+4 == 0 and exits via done_4x
	BLT one // branches on CMPU LEN, $2 above
	LXVD2X (INP)(R8), VIN1
	BEQ two // branches on the same CMPU

three:
	LXVD2X (INP)(R9), VIN2
	VPERM IN0, IN0, LEMASK, IN0
	VPERM IN1, IN1, LEMASK, IN1
	VPERM IN2, IN2, LEMASK, IN2

	VXOR IN0, XL, XH
	// Copy H^3 into the H4 registers so tail_4x multiplies XH by H^3.
	VOR H3L, H3L, H4L
	VOR H3, H3, H4
	VOR H3H, H3H, H4H

	VPERM IN1, IN2, LOPERM, T0
	VPERM IN1, IN2, HIPERM, T1
	VPMSUMD IN1, H2, XM2 // H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
	VPMSUMD IN2, H, XM3 // H.hi·Xi+2.lo +H.lo·Xi+2.hi
	VPMSUMD T0, H21L, XL3 // H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
	VPMSUMD T1, H21H, XH3 // H^2.hi·Xi+1.hi+H.hi·Xi+2.hi

	VXOR XM3, XM2, XM3
	JMP tail_4x

two:
	VPERM IN0, IN0, LEMASK, IN0
	VPERM IN1, IN1, LEMASK, IN1

	VXOR IN, XL, XH // IN is the same register as IN0
	VPERM ZERO, IN1, LOPERM, T0
	VPERM ZERO, IN1, HIPERM, T1

	// Put H^2 into the H4 registers, re-deriving the split halves from H2
	// (the H2L/H2H registers hold LOPERM/HIPERM in this path).
	VSLDOI $8, ZERO, H2, H4L
	VOR H2, H2, H4
	VSLDOI $8, H2, ZERO, H4H

	VPMSUMD T0, H21L, XL3 // H.lo·Xi+1.lo
	VPMSUMD IN1, H, XM3 // H.hi·Xi+1.lo+H.lo·Xi+1.hi
	VPMSUMD T1, H21H, XH3 // H.hi·Xi+1.hi

	JMP tail_4x

one:
	VPERM IN0, IN0, LEMASK, IN0

	// Put H into the H4 registers, re-deriving the split halves from H
	// (the HL/HH registers hold H21L/H21H in this path).
	VSLDOI $8, ZERO, H, H4L
	VOR H, H, H4
	VSLDOI $8, H, ZERO, H4H

	VXOR IN0, XL, XH
	// No further blocks: zero the pending partial products.
	VXOR XL3, XL3, XL3
	VXOR XM3, XM3, XM3
	VXOR XH3, XH3, XH3

	JMP tail_4x

done_4x:
	VPERM XL, XL, LEMASK, XL
	STXVD2X VXL, (XIP+R0) // write out Xi
	RET

// func gcmMul(output []byte, productTable *[256]byte)
//
// gcmMul loads the 16-byte Xi from output, multiplies it by H using table
// entries 0x00-0x30 of productTable (XC2, HL, H, HH), applies the same
// two-phase reduction as gcmHash, and stores the result back to output.
TEXT ·gcmMul(SB), NOSPLIT, $0-32
	MOVD output+0(FP), XIP
	MOVD productTable+24(FP), HTBL

	MOVD $0x10, R8
	MOVD $0x20, R9
	MOVD $0x30, R10
	LXVD2X (XIP)(R0), VIN // load Xi

	// Table loads interleaved with building LEMASK (LVSL result XOR 0x07
	// in every byte), the VPERM control used on Xi before and after.
	LXVD2X (HTBL)(R8), VHL // Load pre-computed table
	LVSL (R0)(R0), LEMASK
	LXVD2X (HTBL)(R9), VH
	VSPLTISB $0x07, T0
	LXVD2X (HTBL)(R10), VHH
	VXOR LEMASK, T0, LEMASK
	LXVD2X (HTBL)(R0), VXC2
	VPERM IN, IN, LEMASK, IN
	VXOR ZERO, ZERO, ZERO

	VPMSUMD IN, HL, XL // H.lo·Xi.lo
	VPMSUMD IN, H, XM // H.hi·Xi.lo+H.lo·Xi.hi
	VPMSUMD IN, HH, XH // H.hi·Xi.hi

	VPMSUMD XL, XC2, T2 // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VXOR T1, XH, T1
	VXOR XL, T1, XL

	VPERM XL, XL, LEMASK, XL
	STXVD2X VXL, (XIP+R0) // write out Xi
	RET