// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// This is a port of the s390x asm implementation.
// to ppc64le.

// Some changes were needed due to differences in
// the Go opcodes and/or available instructions
// between s390x and ppc64le.

// 1. There were operand order differences in the
// VSUBUQM, VSUBCUQ, and VSEL instructions.

// 2. ppc64 does not have a multiply high and low
// like s390x, so those were implemented using
// macros to compute the equivalent values.

// 3. The LVX, STVX instructions on ppc64 require
// 16 byte alignment of the data. To avoid that
// requirement, data is loaded using LXVD2X and
// STXVD2X with VPERM to reorder bytes correctly.

// I have identified some areas where I believe
// changes would be needed to make this work for big
// endian; however additional changes beyond what I
// have noted are most likely needed to make it work.
// - The string used with VPERM to swap the byte order
//   for loads and stores.
// - The constants that are loaded from CPOOL.
//

// The following constants are defined in an order
// that is correct for use with LXVD2X/STXVD2X
// on little endian.
DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256
DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256
DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256
DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0  d1 d0  0
DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0  d1 d0  0
DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x00(SB)/8, $0x00000000ffffffff // P256 original
DATA p256mul<>+0x08(SB)/8, $0xffffffffffffffff // P256
DATA p256mul<>+0x10(SB)/8, $0xffffffff00000001 // P256 original
DATA p256mul<>+0x18(SB)/8, $0x0000000000000000 // P256
DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0  0  0 d0
DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0  0  0 d0
DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0  0 d1 d0
DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0  0 d1 d0
DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL  0 d1 d0 d1
DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL  0 d1 d0 d1
DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL  0  0 d1 d0
DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL  0  0 d1 d0
DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL  0 d1 d0  0
DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL  0 d1 d0  0
DATA p256mul<>+0x80(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
DATA p256mul<>+0x88(SB)/8, $0x0000000000000001 // (1*2^256)%P256
DATA p256mul<>+0x90(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
DATA p256mul<>+0x98(SB)/8, $0xffffffffffffffff // (1*2^256)%P256

// External declarations for constants
// NOTE(review): p256ord<> is declared here but its DATA initializers are not
// in this chunk — presumably defined elsewhere; confirm before relying on it.
GLOBL p256ord<>(SB), 8, $32
GLOBL p256<>(SB), 8, $80
GLOBL p256mul<>(SB), 8, $160

// The following macros are used to implement the ppc64le
// equivalent function from the corresponding s390x
// instruction for vector multiply high, low, and add,
// since there aren't exact equivalent instructions.
// The corresponding s390x instructions appear in the
// comments.
// Implementation for big endian would have to be
// investigated, I think it would be different.
//
//
// Vector multiply word
//
//	VMLF  x0, x1, out_low
//	VMLHF x0, x1, out_hi
#define VMULT(x1, x2, out_low, out_hi) \
	VMULEUW x1, x2, TMP1; \
	VMULOUW x1, x2, TMP2; \
	VMRGEW TMP1, TMP2, out_hi; \
	VMRGOW TMP1, TMP2, out_low

//
// Vector multiply add word
//
//	VMALF  x0, x1, y, out_low
//	VMALHF x0, x1, y, out_hi
#define VMULT_ADD(x1, x2, y, one, out_low, out_hi) \
	VMULEUW y, one, TMP2; \
	VMULOUW y, one, TMP1; \
	VMULEUW x1, x2, out_low; \
	VMULOUW x1, x2, out_hi; \
	VADDUDM TMP2, out_low, TMP2; \
	VADDUDM TMP1, out_hi, TMP1; \
	VMRGOW TMP2, TMP1, out_low; \
	VMRGEW TMP2, TMP1, out_hi

#define res_ptr R3
#define a_ptr R4

#undef res_ptr
#undef a_ptr

#define P1ptr R3
#define CPOOL R7

#define Y1L V0
#define Y1H V1
#define T1L V2
#define T1H V3

#define PL V30
#define PH V31

#define CAR1 V6

// p256NegCond conditionally negates val mod P256 in place:
// if cond != 0, val = P256 - val; if cond == 0 the function returns early
// and val is untouched.
//
// func p256NegCond(val *p256Point, cond int)
TEXT ·p256NegCond(SB), NOSPLIT, $0-16
	MOVD val+0(FP), P1ptr
	MOVD $16, R16

	MOVD cond+8(FP), R6
	CMP  $0, R6
	BC   12, 2, LR // just return if cond == 0

	MOVD $p256mul<>+0x00(SB), CPOOL

	LXVD2X (P1ptr)(R0), Y1L
	LXVD2X (P1ptr)(R16), Y1H

	// Swap doublewords into true little-endian limb order for arithmetic.
	XXPERMDI Y1H, Y1H, $2, Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L

	LXVD2X (CPOOL)(R0), PL
	LXVD2X (CPOOL)(R16), PH

	VSUBCUQ  PL, Y1L, CAR1      // subtract part2 giving carry
	VSUBUQM  PL, Y1L, T1L       // subtract part2 giving result
	VSUBEUQM PH, Y1H, CAR1, T1H // subtract part1 using carry from part2

	// Swap back to storage order before writing out.
	XXPERMDI T1H, T1H, $2, T1H
	XXPERMDI T1L, T1L, $2, T1L

	STXVD2X T1L, (R0+P1ptr)
	STXVD2X T1H, (R16+P1ptr)
	RET

#undef P1ptr
#undef CPOOL
#undef Y1L
#undef Y1H
#undef T1L
#undef T1H
#undef PL
#undef PH
#undef CAR1

#define P3ptr R3
#define P1ptr R4
#define P2ptr R5

#define X1L V0
#define X1H V1
#define Y1L V2
#define Y1H V3
#define Z1L V4
#define Z1H V5
#define X2L V6
#define X2H V7
#define Y2L V8
#define Y2H V9
#define Z2L V10
#define Z2H V11
#define SEL V12
#define ZER V13

// This function uses LXVD2X and STXVD2X to avoid the
// data alignment requirement for LVX, STVX. Since
// this code is just moving bytes and not doing arithmetic,
// order of the bytes doesn't matter.
//
// p256MovCond sets res = a if cond == 0, res = b otherwise
// (constant time: both inputs are always read, VSEL picks one).
//
// func p256MovCond(res, a, b *p256Point, cond int)
TEXT ·p256MovCond(SB), NOSPLIT, $0-32
	MOVD res+0(FP), P3ptr
	MOVD a+8(FP), P1ptr
	MOVD b+16(FP), P2ptr
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $56, R21
	MOVD $64, R19
	MOVD $80, R20

	// cond is R1 + 24 (cond offset) + 32
	LXVDSX   (R1)(R21), SEL
	VSPLTISB $0, ZER

	// SEL controls whether to store a or b
	VCMPEQUD SEL, ZER, SEL

	LXVD2X (P1ptr+R0), X1H
	LXVD2X (P1ptr+R16), X1L
	LXVD2X (P1ptr+R17), Y1H
	LXVD2X (P1ptr+R18), Y1L
	LXVD2X (P1ptr+R19), Z1H
	LXVD2X (P1ptr+R20), Z1L

	LXVD2X (P2ptr+R0), X2H
	LXVD2X (P2ptr+R16), X2L
	LXVD2X (P2ptr+R17), Y2H
	LXVD2X (P2ptr+R18), Y2L
	LXVD2X (P2ptr+R19), Z2H
	LXVD2X (P2ptr+R20), Z2L

	VSEL X1H, X2H, SEL, X1H
	VSEL X1L, X2L, SEL, X1L
	VSEL Y1H, Y2H, SEL, Y1H
	VSEL Y1L, Y2L, SEL, Y1L
	VSEL Z1H, Z2H, SEL, Z1H
	VSEL Z1L, Z2L, SEL, Z1L

	STXVD2X X1H, (P3ptr+R0)
	STXVD2X X1L, (P3ptr+R16)
	STXVD2X Y1H, (P3ptr+R17)
	STXVD2X Y1L, (P3ptr+R18)
	STXVD2X Z1H, (P3ptr+R19)
	STXVD2X Z1L, (P3ptr+R20)

	RET

#undef P3ptr
#undef P1ptr
#undef P2ptr
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef SEL
#undef ZER

#define P3ptr R3
#define P1ptr R4
#define COUNT R5

#define X1L V0
#define X1H V1
#define Y1L V2
#define Y1H V3
#define Z1L V4
#define Z1H V5
#define X2L V6
#define X2H V7
#define Y2L V8
#define Y2H V9
#define Z2L V10
#define Z2H V11

#define ONE  V18
#define IDX  V19
#define SEL1 V20
#define SEL2 V21

// p256Select copies the table entry at index idx into point, scanning every
// entry so the access pattern is independent of idx (constant time).
//
// NOTE(review): the declared parameter is named "point" here but the asm
// refers to res+0(FP); confirm the Go stub's parameter name matches.
//
// func p256Select(point *p256Point, table *p256Table, idx int)
TEXT ·p256Select(SB), NOSPLIT, $0-24
	MOVD res+0(FP), P3ptr
	MOVD table+8(FP), P1ptr
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20

	LXVDSX   (R1)(R18), SEL1 // VLREPG idx+32(FP), SEL1
	VSPLTB   $7, SEL1, IDX   // splat byte
	VSPLTISB $1, ONE         // VREPIB $1, ONE
	VSPLTISB $1, SEL2        // VREPIB $1, SEL2
	MOVD     $17, COUNT
	MOVD     COUNT, CTR      // set up ctr

	VSPLTISB $0, X1H // VZERO X1H
	VSPLTISB $0, X1L // VZERO X1L
	VSPLTISB $0, Y1H // VZERO Y1H
	VSPLTISB $0, Y1L // VZERO Y1L
	VSPLTISB $0, Z1H // VZERO Z1H
	VSPLTISB $0, Z1L // VZERO Z1L

loop_select:

	// LVXD2X is used here since data alignment doesn't
	// matter.

	LXVD2X (P1ptr+R0), X2H
	LXVD2X (P1ptr+R16), X2L
	LXVD2X (P1ptr+R17), Y2H
	LXVD2X (P1ptr+R18), Y2L
	LXVD2X (P1ptr+R19), Z2H
	LXVD2X (P1ptr+R20), Z2L

	VCMPEQUD SEL2, IDX, SEL1 // VCEQG SEL2, IDX, SEL1 OK

	// This will result in SEL1 being all 0s or 1s, meaning
	// the result is either X1L or X2L, no individual byte
	// selection.

	VSEL X1L, X2L, SEL1, X1L
	VSEL X1H, X2H, SEL1, X1H
	VSEL Y1L, Y2L, SEL1, Y1L
	VSEL Y1H, Y2H, SEL1, Y1H
	VSEL Z1L, Z2L, SEL1, Z1L
	VSEL Z1H, Z2H, SEL1, Z1H

	// Add 1 to all bytes in SEL2
	VADDUBM SEL2, ONE, SEL2 // VAB SEL2, ONE, SEL2 OK
	ADD     $96, P1ptr
	BDNZ    loop_select

	// STXVD2X is used here so that alignment doesn't
	// need to be verified. Since values were loaded
	// using LXVD2X this is OK.
	STXVD2X X1H, (P3ptr+R0)
	STXVD2X X1L, (P3ptr+R16)
	STXVD2X Y1H, (P3ptr+R17)
	STXVD2X Y1L, (P3ptr+R18)
	STXVD2X Z1H, (P3ptr+R19)
	STXVD2X Z1L, (P3ptr+R20)
	RET

#undef P3ptr
#undef P1ptr
#undef COUNT
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef ONE
#undef IDX
#undef SEL1
#undef SEL2

// The following functions all reverse the byte order.

//func p256BigToLittle(res *p256Element, in *[32]byte)
TEXT ·p256BigToLittle(SB), NOSPLIT, $0-16
	MOVD res+0(FP), R3
	MOVD in+8(FP), R4
	BR   p256InternalEndianSwap<>(SB)

//func p256LittleToBig(res *[32]byte, in *p256Element)
TEXT ·p256LittleToBig(SB), NOSPLIT, $0-16
	MOVD res+0(FP), R3
	MOVD in+8(FP), R4
	BR   p256InternalEndianSwap<>(SB)

//func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
TEXT ·p256OrdBigToLittle(SB), NOSPLIT, $0-16
	MOVD res+0(FP), R3
	MOVD in+8(FP), R4
	BR   p256InternalEndianSwap<>(SB)

//func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
TEXT ·p256OrdLittleToBig(SB), NOSPLIT, $0-16
	MOVD res+0(FP), R3
	MOVD in+8(FP), R4
	BR   p256InternalEndianSwap<>(SB)

// p256InternalEndianSwap reverses the 32 bytes at R4 into R3:
// each 8-byte doubleword is byte-reversed by MOVDBR and the four
// doublewords are stored in opposite order.
TEXT p256InternalEndianSwap<>(SB), NOSPLIT, $0-0
	// Index registers needed for BR movs
	MOVD $8, R9
	MOVD $16, R10
	MOVD $24, R14

	MOVDBR (R0)(R4), R5
	MOVDBR (R9)(R4), R6
	MOVDBR (R10)(R4), R7
	MOVDBR (R14)(R4), R8

	MOVD R8, 0(R3)
	MOVD R7, 8(R3)
	MOVD R6, 16(R3)
	MOVD R5, 24(R3)

	RET

#define P3ptr R3
#define P1ptr R4
#define COUNT R5

#define X1L V0
#define X1H V1
#define Y1L V2
#define Y1H V3
#define Z1L V4
#define Z1H V5
#define X2L V6
#define X2H V7
#define Y2L V8
#define Y2H V9
#define Z2L V10
#define Z2H V11

#define ONE  V18
#define IDX  V19
#define SEL1 V20
#define SEL2 V21

// p256SelectAffine is the affine-table variant of p256Select: 64-byte
// entries (x,y only, no z), scanning all 64 entries in constant time.
//
// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
TEXT ·p256SelectAffine(SB), NOSPLIT, $0-24
	MOVD res+0(FP), P3ptr
	MOVD table+8(FP), P1ptr
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18

	LXVDSX (R1)(R18), SEL1
	VSPLTB $7, SEL1, IDX // splat byte

	VSPLTISB $1, ONE  // Vector with byte 1s
	VSPLTISB $1, SEL2 // Vector with byte 1s
	MOVD     $64, COUNT
	MOVD     COUNT, CTR // loop count

	VSPLTISB $0, X1H // VZERO X1H
	VSPLTISB $0, X1L // VZERO X1L
	VSPLTISB $0, Y1H // VZERO Y1H
	VSPLTISB $0, Y1L // VZERO Y1L

loop_select:
	LXVD2X (P1ptr+R0), X2H
	LXVD2X (P1ptr+R16), X2L
	LXVD2X (P1ptr+R17), Y2H
	LXVD2X (P1ptr+R18), Y2L

	VCMPEQUD SEL2, IDX, SEL1 // Compare against idx

	VSEL X1L, X2L, SEL1, X1L // Select if idx matched
	VSEL X1H, X2H, SEL1, X1H
	VSEL Y1L, Y2L, SEL1, Y1L
	VSEL Y1H, Y2H, SEL1, Y1H

	VADDUBM SEL2, ONE, SEL2 // Increment SEL2 bytes by 1
	ADD     $64, P1ptr      // Next chunk
	BDNZ    loop_select

	STXVD2X X1H, (P3ptr+R0)
	STXVD2X X1L, (P3ptr+R16)
	STXVD2X Y1H, (P3ptr+R17)
	STXVD2X Y1L, (P3ptr+R18)
	RET

#undef P3ptr
#undef P1ptr
#undef COUNT
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef ONE
#undef IDX
#undef SEL1
#undef SEL2

#define res_ptr R3
#define x_ptr R4
#define CPOOL R7

#define T0 V0
#define T1 V1
#define T2 V2
#define TT0 V3
#define TT1 V4

#define ZER V6
#define SEL1 V7
#define SEL2 V8
#define CAR1 V9
#define CAR2 V10
#define RED1 V11
#define RED2 V12
#define PL V13
#define PH V14

// p256FromMont converts in out of Montgomery form: four identical
// word-shift-and-reduce rounds (one per 32-bit digit of the implicit
// multiplication by 1) followed by a final conditional subtraction of P256.
//
// func p256FromMont(res, in *p256Element)
TEXT ·p256FromMont(SB), NOSPLIT, $0-16
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr

	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $p256<>+0x00(SB), CPOOL

	VSPLTISB $0, T2  // VZERO T2
	VSPLTISB $0, ZER // VZERO ZER

	// Constants are defined so that the LXVD2X is correct
	LXVD2X (CPOOL+R0), PH
	LXVD2X (CPOOL+R16), PL

	// VPERM byte selections
	LXVD2X (CPOOL+R18), SEL2
	LXVD2X (CPOOL+R19), SEL1

	LXVD2X (R16)(x_ptr), T1
	LXVD2X (R0)(x_ptr), T0

	// Put in true little endian order
	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1

	// First round
	VPERM   T1, T0, SEL1, RED2  // d1 d0 d1 d0
	VPERM   ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSUBUQM RED2, RED1, RED2    // VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2

	// Second round
	VPERM   T1, T0, SEL1, RED2  // d1 d0 d1 d0
	VPERM   ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSUBUQM RED2, RED1, RED2    // VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2

	// Third round
	VPERM   T1, T0, SEL1, RED2  // d1 d0 d1 d0
	VPERM   ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSUBUQM RED2, RED1, RED2    // VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2

	// Last round
	VPERM   T1, T0, SEL1, RED2  // d1 d0 d1 d0
	VPERM   ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSUBUQM RED2, RED1, RED2    // VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2

	// ---------------------------------------------------

	// Final conditional subtraction of P256; T2 (sign of the borrow chain)
	// selects between the reduced and unreduced value.
	VSUBCUQ  T0, PL, CAR1        // VSCBIQ  PL, T0, CAR1
	VSUBUQM  T0, PL, TT0         // VSQ     PL, T0, TT0
	VSUBECUQ T1, PH, CAR1, CAR2  // VSBCBIQ T1, PH, CAR1, CAR2
	VSUBEUQM T1, PH, CAR1, TT1   // VSBIQ   T1, PH, CAR1, TT1
	VSUBEUQM T2, ZER, CAR2, T2   // VSBIQ   T2, ZER, CAR2, T2

	VSEL TT0, T0, T2, T0
	VSEL TT1, T1, T2, T1

	// Reorder the bytes so STXVD2X can be used.
	// TT0, TT1 used for VPERM result in case
	// the caller expects T0, T1 to be good.
	XXPERMDI T0, T0, $2, TT0
	XXPERMDI T1, T1, $2, TT1

	STXVD2X TT0, (R0)(res_ptr)
	STXVD2X TT1, (R16)(res_ptr)
	RET

#undef res_ptr
#undef x_ptr
#undef CPOOL
#undef T0
#undef T1
#undef T2
#undef TT0
#undef TT1
#undef ZER
#undef SEL1
#undef SEL2
#undef CAR1
#undef CAR2
#undef RED1
#undef RED2
#undef PL
#undef PH

// ---------------------------------------
// p256MulInternal
// V0-V3 V30,V31 - Not Modified
// V4-V15 V27-V29 - Volatile

#define CPOOL R7

// Parameters
#define X0 V0 // Not modified
#define X1 V1 // Not modified
#define Y0 V2 // Not modified
#define Y1 V3 // Not modified
#define T0 V4 // Result
#define T1 V5 // Result
#define P0 V30 // Not modified
#define P1 V31 // Not modified

// Temporaries: lots of reused vector regs
#define YDIG  V6  // Overloaded with CAR2
#define ADD1H V7  // Overloaded with ADD3H
#define ADD2H V8  // Overloaded with ADD4H
#define ADD3  V9  // Overloaded with SEL2,SEL5
#define ADD4  V10 // Overloaded with SEL3,SEL6
#define RED1  V11 // Overloaded with CAR2
#define RED2  V12
#define RED3  V13 // Overloaded with SEL1
#define T2    V14
// Overloaded temporaries
#define ADD1  V4  // Overloaded with T0
#define ADD2  V5  // Overloaded with T1
#define ADD3H V7  // Overloaded with ADD1H
#define ADD4H V8  // Overloaded with ADD2H
#define ZER   V28 // Overloaded with TMP1
#define CAR1  V6  // Overloaded with YDIG
#define CAR2  V11 // Overloaded with RED1
// Constant Selects
#define SEL1 V13 // Overloaded with RED3
#define SEL2 V9  // Overloaded with ADD3,SEL5
#define SEL3 V10 // Overloaded with ADD4,SEL6
#define SEL4 V6  // Overloaded with YDIG,CAR1
#define SEL5 V9  // Overloaded with ADD3,SEL2
#define SEL6 V10 // Overloaded with ADD4,SEL3

// TMP1, TMP2 used in
// VMULT macros
#define TMP1 V13 // Overloaded with RED3
#define TMP2 V27
#define ONE  V29 // 1s splatted by word

/* *
 * To follow the flow of bits, for your own sanity a stiff drink, need you shall.
 * Of a single round, a 'helpful' picture, here is. Meaning, column position has.
 * With you, SIMD be...
 *
 *                                           +--------+--------+
 *                                  +--------|  RED2  |  RED1  |
 *                                  |        +--------+--------+
 *                                  |       ---+--------+--------+
 *                                  |  +---- T2|   T1   |   T0   |--+
 *                                  |  |    ---+--------+--------+  |
 *                                  |  |                            |
 *                                  |  |    =======================  |
 *                                  |  |                            |
 *                                  |  |       +--------+--------+<-+
 *                                  |  +-------|  ADD2  |  ADD1  |--|-----+
 *                                  |          +--------+--------+  |     |
 *                                  |          +--------+--------+<---+   |
 *                                  |          | ADD2H  | ADD1H  |--+     |
 *                                  |          +--------+--------+  |     |
 *                                  |          +--------+--------+<-+     |
 *                                  |          |  ADD4  |  ADD3  |--|-+   |
 *                                  |          +--------+--------+  | |   |
 *                                  |          +--------+--------+<---+   |  |
 *                                  |          | ADD4H  | ADD3H  |------|-+  |(+vzero)
 *                                  |          +--------+--------+  |  |     V
 *                                  |          ------------------------  |  | +--------+
 *                                  |                               |  |  | |  RED3  |  [d0 0 0 d0]
 *                                  |                               |  |  | +--------+
 *                                  +---->+--------+--------+       |  |  |   |
 *   (T2[1w]||ADD2[4w]||ADD1[3w])  +--------|   T1   |   T0   |     |  |  |   |
 *                                 |      +--------+--------+       |  |  |   |
 *                                 +---->---+--------+--------+     |  |  |   |
 *                                       T2|   T1   |   T0   |----+ |  |  |
 *                                       ---+--------+--------+   | |  |  |
 *                                       ---+--------+--------+<---+ |  |  |
 *                                   +--- T2|   T1   |   T0   |----------+
 *                                   |   ---+--------+--------+   |  |  |
 *                                   |      +--------+--------+<-------------+
 *                                   |      |  RED2  |  RED1  |-----+ |  |   [0 d1 d0 d1] [d0 0 d1 d0]
 *                                   |      +--------+--------+     | |  |
 *                                   |      +--------+<----------------------+
 *                                   |      |  RED3  |--------------+ |      [0 0 d1 d0]
 *                                   |      +--------+              | |
 *                                   +--->+--------+--------+       | |
 *                                        |   T1   |   T0   |--------+
 *                                        +--------+--------+   |   |
 *                                        ---------------------------   |  |
 *                                                                  |  |
 *                                        +--------+--------+<----+ |
 *                                        |  RED2  |  RED1  |       |
 *                                        +--------+--------+       |
 *                                     ---+--------+--------+<-------+
 *                                     T2|   T1   |   T0   |  (H1P-H1P-H00RRAY!)
 *                                     ---+--------+--------+
 *
 * *Mi obra de arte de siglo XXI @vpaprots
 *
 *
 * First group is special, doesn't get the two inputs:
 *                                          +--------+--------+<-+
 *                                 +-------|  ADD2  |  ADD1  |--|-----+
 *                                 |        +--------+--------+  |     |
 *                                 |        +--------+--------+<---+   |
 *                                 |        | ADD2H  | ADD1H  |--+     |
 *                                 |        +--------+--------+  |     |
 *                                 |        +--------+--------+<-+     |
 *                                 |        |  ADD4  |  ADD3  |--|-+   |
 *                                 |        +--------+--------+  | |   |
 *                                 |        +--------+--------+<---+   |  |
 *                                 |        | ADD4H  | ADD3H  |------|-+  |(+vzero)
 *                                 |        +--------+--------+  |  |     V
 *                                 |        ------------------------  |  | +--------+
 *                                 |                             |  |  | |  RED3  |  [d0 0 0 d0]
 *                                 |                             |  |  | +--------+
 *                                 +---->+--------+--------+     |  |  |   |
 *   (T2[1w]||ADD2[4w]||ADD1[3w])        |   T1   |   T0   |----+|  |  |   |
 *                                       +--------+--------+    ||  |  |   |
 *                                    ---+--------+--------+<---+|  |  |   |
 *                                +--- T2|   T1   |   T0   |----------+
 *                                |   ---+--------+--------+    |  |  |
 *                                |      +--------+--------+<-------------+
 *                                |      |  RED2  |  RED1  |-----+ |  |   [0 d1 d0 d1] [d0 0 d1 d0]
 *                                |      +--------+--------+     | |  |
 *                                |      +--------+<----------------------+
 *                                |      |  RED3  |--------------+ |      [0 0 d1 d0]
 *                                |      +--------+              | |
 *                                +--->+--------+--------+       | |
 *                                     |   T1   |   T0   |--------+
 *                                     +--------+--------+   |   |
 *                                     ---------------------------   |  |
 *                                                               |  |
 *                                     +--------+--------+<----+ |
 *                                     |  RED2  |  RED1  |       |
 *                                     +--------+--------+       |
 *                                  ---+--------+--------+<-------+
 *                                  T2|   T1   |   T0   |  (H1P-H1P-H00RRAY!)
 *                                  ---+--------+--------+
 *
 * Last 'group' needs to RED2||RED1 shifted less
 */
// p256MulInternal multiplies X1:X0 by Y1:Y0 with interleaved reduction by
// P1:P0, leaving the result in T1:T0. One "round" per 32-bit digit of Y,
// processed as four digit-pairs (Y0 words 3,2 then 1,0; Y1 words 3,2 then
// 1,0). CPOOL must already point at p256mul<> (loaded by the caller).
TEXT p256MulInternal<>(SB), NOSPLIT, $0-16
	// CPOOL loaded from caller
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20
	MOVD $96, R21
	MOVD $112, R22

	// ---------------------------------------------------

	VSPLTW $3, Y0, YDIG // VREPF Y0 is input

	//	VMLHF X0, YDIG, ADD1H
	//	VMLHF X1, YDIG, ADD2H
	//	VMLF  X0, YDIG, ADD1
	//	VMLF  X1, YDIG, ADD2
	//
	VMULT(X0, YDIG, ADD1, ADD1H)
	VMULT(X1, YDIG, ADD2, ADD2H)

	VSPLTISW $1, ONE
	VSPLTW   $2, Y0, YDIG // VREPF

	//	VMALF  X0, YDIG, ADD1H, ADD3
	//	VMALF  X1, YDIG, ADD2H, ADD4
	//	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
	//	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	LXVD2X   (R17)(CPOOL), SEL1
	VSPLTISB $0, ZER            // VZERO ZER
	VPERM    ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free // VSLDB
	VSLDOI $12, ZER, ADD2, T1  // ADD2 Free // VSLDB

	VADDCUQ  T0, ADD3, CAR1     // VACCQ
	VADDUQM  T0, ADD3, T0       // ADD3 Free // VAQ
	VADDECUQ T1, ADD4, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1 // ADD4 Free // VACQ

	LXVD2X  (R18)(CPOOL), SEL2
	LXVD2X  (R19)(CPOOL), SEL3
	LXVD2X  (R20)(CPOOL), SEL4
	VPERM   RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
	VPERM   RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM   RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
	VSUBUQM RED2, RED3, RED2     // Guaranteed not to underflow -->? // VSQ

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ  T0, ADD3H, CAR1     // VACCQ
	VADDUQM  T0, ADD3H, T0       // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	// ---------------------------------------------------

	VSPLTW $1, Y0, YDIG // VREPF

	//	VMALHF X0, YDIG, T0, ADD1H
	//	VMALHF X1, YDIG, T1, ADD2H
	//	VMALF  X0, YDIG, T0, ADD1  // T0 Free->ADD1
	//	VMALF  X1, YDIG, T1, ADD2  // T1 Free->ADD2
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	VSPLTW $0, Y0, YDIG // VREPF

	//	VMALF  X0, YDIG, ADD1H, ADD3
	//	VMALF  X1, YDIG, ADD2H, ADD4
	//	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
	//	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	VSPLTISB $0, ZER            // VZERO ZER
	LXVD2X   (R17)(CPOOL), SEL1
	VPERM    ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free->T0 // VSLDB
	VSLDOI $12, T2, ADD2, T1   // ADD2 Free->T1, T2 Free // VSLDB

	VADDCUQ  T0, RED1, CAR1     // VACCQ
	VADDUQM  T0, RED1, T0       // VAQ
	VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1 // VACQ

	VADDCUQ  T0, ADD3, CAR1       // VACCQ
	VADDUQM  T0, ADD3, T0         // VAQ
	VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1   // VACQ
	VADDUQM  T2, CAR2, T2         // VAQ

	LXVD2X  (R18)(CPOOL), SEL2
	LXVD2X  (R19)(CPOOL), SEL3
	LXVD2X  (R20)(CPOOL), SEL4
	VPERM   RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
	VPERM   RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM   RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
	VSUBUQM RED2, RED3, RED2     // Guaranteed not to underflow // VSQ

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ  T0, ADD3H, CAR1     // VACCQ
	VADDUQM  T0, ADD3H, T0       // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	// ---------------------------------------------------

	VSPLTW $3, Y1, YDIG // VREPF

	//	VMALHF X0, YDIG, T0, ADD1H
	//	VMALHF X1, YDIG, T1, ADD2H
	//	VMALF  X0, YDIG, T0, ADD1
	//	VMALF  X1, YDIG, T1, ADD2
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	VSPLTW $2, Y1, YDIG // VREPF

	//	VMALF  X0, YDIG, ADD1H, ADD3
	//	VMALF  X1, YDIG, ADD2H, ADD4
	//	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
	//	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	LXVD2X   (R17)(CPOOL), SEL1
	VSPLTISB $0, ZER            // VZERO ZER
	// NOTE(review): SEL1 was already loaded two instructions above;
	// this reload is redundant (harmless, but could be dropped).
	LXVD2X   (R17)(CPOOL), SEL1
	VPERM    ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free // VSLDB
	VSLDOI $12, T2, ADD2, T1   // ADD2 Free // VSLDB

	VADDCUQ  T0, RED1, CAR1     // VACCQ
	VADDUQM  T0, RED1, T0       // VAQ
	VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1 // VACQ

	VADDCUQ  T0, ADD3, CAR1       // VACCQ
	VADDUQM  T0, ADD3, T0         // VAQ
	VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1   // VACQ
	VADDUQM  T2, CAR2, T2         // VAQ

	LXVD2X  (R18)(CPOOL), SEL2
	LXVD2X  (R19)(CPOOL), SEL3
	LXVD2X  (R20)(CPOOL), SEL4
	VPERM   RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
	VPERM   RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM   RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
	VSUBUQM RED2, RED3, RED2     // Guaranteed not to underflow // VSQ

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ  T0, ADD3H, CAR1     // VACCQ
	VADDUQM  T0, ADD3H, T0       // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	// ---------------------------------------------------

	VSPLTW $1, Y1, YDIG // VREPF

	//	VMALHF X0, YDIG, T0, ADD1H
	//	VMALHF X1, YDIG, T1, ADD2H
	//	VMALF  X0, YDIG, T0, ADD1
	//	VMALF  X1, YDIG, T1, ADD2
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	VSPLTW $0, Y1, YDIG // VREPF

	//	VMALF  X0, YDIG, ADD1H, ADD3
	//	VMALF  X1, YDIG, ADD2H, ADD4
	//	VMALHF X0, YDIG, ADD1H, ADD3H
	//	VMALHF X1, YDIG, ADD2H, ADD4H
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	VSPLTISB $0, ZER            // VZERO ZER
	LXVD2X   (R17)(CPOOL), SEL1
	VPERM    ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // VSLDB
	VSLDOI $12, T2, ADD2, T1   // VSLDB

	VADDCUQ  T0, RED1, CAR1     // VACCQ
	VADDUQM  T0, RED1, T0       // VAQ
	VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1 // VACQ

	VADDCUQ  T0, ADD3, CAR1       // VACCQ
	VADDUQM  T0, ADD3, T0         // VAQ
	VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1   // VACQ
	VADDUQM  T2, CAR2, T2         // VAQ

	// Last group: RED2||RED1 shifted less (see comment above).
	LXVD2X  (R21)(CPOOL), SEL5
	LXVD2X  (R22)(CPOOL), SEL6
	VPERM   T0, RED3, SEL5, RED2 // [d1 d0 d1 d0]
	VPERM   T0, RED3, SEL6, RED1 // [ 0 d1 d0  0]
	VSUBUQM RED2, RED1, RED2     // Guaranteed not to underflow // VSQ

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ  T0, ADD3H, CAR1     // VACCQ
	VADDUQM  T0, ADD3H, T0       // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	VADDCUQ  T0, RED1, CAR1       // VACCQ
	VADDUQM  T0, RED1, T0         // VAQ
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1   // VACQ
	VADDUQM  T2, CAR2, T2         // VAQ

	// ---------------------------------------------------

	// Final conditional subtraction of P (P1:P0); the borrow out in T2
	// selects whether the subtracted or original value is the result.
	VSPLTISB $0, RED3            // VZERO   RED3
	VSUBCUQ  T0, P0, CAR1        // VSCBIQ
	VSUBUQM  T0, P0, ADD1H       // VSQ
	VSUBECUQ T1, P1, CAR1, CAR2  // VSBCBIQ
	VSUBEUQM T1, P1, CAR1, ADD2H // VSBIQ
	VSUBEUQM T2, RED3, CAR2, T2  // VSBIQ

	// what output to use, ADD2H||ADD1H or T1||T0?
	VSEL ADD1H, T0, T2, T0
	VSEL ADD2H, T1, T2, T1
	RET

#undef CPOOL

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef P0
#undef P1

#undef SEL1
#undef SEL2
#undef SEL3
#undef SEL4
#undef SEL5
#undef SEL6

#undef YDIG
#undef ADD1H
#undef ADD2H
#undef ADD3
#undef ADD4
#undef RED1
#undef RED2
#undef RED3
#undef T2
#undef ADD1
#undef ADD2
#undef ADD3H
#undef ADD4H
#undef ZER
#undef CAR1
#undef CAR2

#undef TMP1
#undef TMP2

// p256SubInternal: T1:T0 = X1:X0 - Y1:Y0 mod P (PL/PH), constant time.
// SEL1 captures the borrow and selects whether P must be added back.
#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
	VSPLTISB $0, ZER            \ // VZERO
	VSUBCUQ  X0, Y0, CAR1       \
	VSUBUQM  X0, Y0, T0         \
	VSUBECUQ X1, Y1, CAR1, SEL1 \
	VSUBEUQM X1, Y1, CAR1, T1   \
	VSUBUQM  ZER, SEL1, SEL1    \ // VSQ
	                            \
	VADDCUQ  T0, PL, CAR1       \ // VACCQ
	VADDUQM  T0, PL, TT0        \ // VAQ
	VADDEUQM T1, PH, CAR1, TT1  \ // VACQ
	                            \
	VSEL     TT0, T0, SEL1, T0  \
	VSEL     TT1, T1, SEL1, T1  \

// p256AddInternal: T1:T0 = X1:X0 + Y1:Y0 mod P (PL/PH), constant time,
// with a trial subtraction of P selected by the carry/borrow chain in SEL1.
#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
	VADDCUQ  X0, Y0, CAR1        \
	VADDUQM  X0, Y0, T0          \
	VADDECUQ X1, Y1, CAR1, T2    \ // VACCCQ
	VADDEUQM X1, Y1, CAR1, T1    \
	                             \
	VSPLTISB $0, ZER             \
	VSUBCUQ  T0, PL, CAR1        \ // VSCBIQ
	VSUBUQM  T0, PL, TT0         \
	VSUBECUQ T1, PH, CAR1, CAR2  \ // VSBCBIQ
	VSUBEUQM T1, PH, CAR1, TT1   \ // VSBIQ
	VSUBEUQM T2, ZER, CAR2, SEL1 \
	                             \
	VSEL     TT0, T0, SEL1, T0   \
	VSEL     TT1, T1, SEL1, T1

// p256HalfInternal: T1:T0 = X1:X0 / 2 mod P. If X is odd, P is added first
// (selected by SEL1 = mask of X0's low bit), then the 256-bit value is
// shifted right one bit across the two 128-bit halves.
#define p256HalfInternal(T1, T0, X1, X0) \
	VSPLTISB $0, ZER            \
	VSUBEUQM ZER, ZER, X0, SEL1 \
	                            \
	VADDCUQ  X0, PL, CAR1       \
	VADDUQM  X0, PL, T0         \
	VADDECUQ X1, PH, CAR1, T2   \
	VADDEUQM X1, PH, CAR1, T1   \
	                            \
	VSEL     T0, X0, SEL1, T0   \
	VSEL     T1, X1, SEL1, T1   \
	VSEL     T2, ZER, SEL1, T2  \
	                            \
	VSLDOI   $15, T2, ZER, TT1  \
	VSLDOI   $15, T1, ZER, TT0  \
	VSPLTISB $1, SEL1           \
	VSR      T0, SEL1, T0       \ // VSRL
	VSR      T1, SEL1, T1       \
	VSPLTISB $7, SEL1           \ // VREPIB
	VSL      TT0, SEL1, TT0     \
	VSL      TT1, SEL1, TT1     \
	VOR      T0, TT0, T0        \
	VOR      T1, TT1, T1

#define res_ptr R3
#define x_ptr R4
#define y_ptr R5
#define CPOOL R7
#define TEMP R8
#define N R9

// Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

// Constants
#define P0 V30
#define P1 V31

// p256Mul: res = in1 * in2 via p256MulInternal (Montgomery domain —
// presumably; confirm against the Go-side p256Mul contract).
//
// func p256MulAsm(res, in1, in2 *p256Element)
TEXT ·p256Mul(SB), NOSPLIT, $0-24
	MOVD res+0(FP), res_ptr
	MOVD in1+8(FP), x_ptr
	MOVD in2+16(FP), y_ptr
	MOVD $16, R16
	MOVD $32, R17

	MOVD $p256mul<>+0x00(SB), CPOOL

	LXVD2X (R0)(x_ptr), X0
	LXVD2X (R16)(x_ptr), X1

	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

	LXVD2X (R0)(y_ptr), Y0
	LXVD2X (R16)(y_ptr), Y1

	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1

	LXVD2X (R16)(CPOOL), P1
	LXVD2X (R0)(CPOOL), P0

	CALL p256MulInternal<>(SB)

	MOVD $p256mul<>+0x00(SB), CPOOL

	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1
	STXVD2X  T0, (R0)(res_ptr)
	STXVD2X  T1, (R16)(res_ptr)
	RET

// p256Sqr: res = in^(2^n) — squares in n times by looping p256MulInternal
// with both operands equal.
//
// func p256Sqr(res, in *p256Element, n int)
TEXT ·p256Sqr(SB), NOSPLIT, $0-24
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr
	MOVD $16, R16
	MOVD $32, R17

	MOVD $p256mul<>+0x00(SB), CPOOL

	LXVD2X (R0)(x_ptr), X0
	LXVD2X (R16)(x_ptr), X1

	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

sqrLoop:
	// Sqr uses same value for both

	VOR X0, X0, Y0
	VOR X1, X1, Y1

	LXVD2X (R16)(CPOOL), P1
	LXVD2X (R0)(CPOOL), P0

	CALL p256MulInternal<>(SB)

	MOVD n+16(FP), N
	ADD  $-1, N
	CMP  $0, N
	BEQ  done
	MOVD N, n+16(FP) // Save counter to avoid clobber
	VOR  T0, T0, X0
	VOR  T1, T1, X1
	BR   sqrLoop

done:
	MOVD $p256mul<>+0x00(SB), CPOOL

	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1
	STXVD2X  T0, (R0)(res_ptr)
	STXVD2X  T1, (R16)(res_ptr)
	RET

#undef res_ptr
#undef x_ptr
#undef y_ptr
#undef CPOOL

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef P0
#undef P1

#define P3ptr R3
#define P1ptr R4
#define P2ptr R5
#define CPOOL R7

// Temporaries in REGs
#define Y2L V15
#define Y2H V16
#define T1L V17
#define T1H V18
#define T2L V19
#define T2H V20
#define T3L V21
#define T3H V22
#define T4L V23
#define T4H V24

// Temps for Sub and Add
#define TT0 V11
#define TT1 V12
#define T2 V13

// p256MulAsm Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

#define PL V30
#define PH V31

// Names for zero/sel selects
#define X1L V0
#define X1H V1
#define Y1L V2 // p256MulAsmParmY
#define Y1H V3 // p256MulAsmParmY
#define Z1L V4
#define Z1H V5
#define X2L V0
#define X2H V1
#define Z2L V4
#define Z2H V5
#define X3L V17 // T1L
#define X3H V18 // T1H
#define Y3L V21 // T3L
#define Y3H V22 // T3H
#define Z3L V25
#define Z3H V26

#define ZER V6
#define SEL1 V7
#define CAR1 V8
#define CAR2 V9
/* *
 * Three operand formula:
 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
 * T1 = Z1²
 * T2 = T1*Z1
 * T1 = T1*X2
 * T2 = T2*Y2
 * T1 = T1-X1
 * T2 = T2-Y1
 * Z3 = Z1*T1
 * T3 = T1²
 * T4 = T3*T1
 * T3 = T3*X1
 * T1 = 2*T3
 * X3 = T2²
 * X3 = X3-T1
 * X3 = X3-T4
 * T3 = T3-X3
 * T3 = T3*T2
 * T4 = T4*Y1
 * Y3 = T3-T4

 * Three operand formulas, but with MulInternal X,Y used to store temps
 X=Z1; Y=Z1; MUL;T-   // T1 = Z1²      T1
 X=T ; Y-  ; MUL;T2=T // T2 = T1*Z1    T1   T2
 X-  ; Y=X2; MUL;T1=T // T1 = T1*X2    T1   T2
 X=T2; Y=Y2; MUL;T-   // T2 = T2*Y2    T1   T2
 SUB(T2<T-Y1)         // T2 = T2-Y1    T1   T2
 SUB(Y<T1-X1)         // T1 = T1-X1    T1   T2
 X=Z1; Y- ;  MUL;Z3:=T// Z3 = Z1*T1         T2
 X=Y;  Y- ;  MUL;X=T  // T3 = T1*T1         T2
 X- ;  Y- ;  MUL;T4=T // T4 = T3*T1         T2        T4
 X- ;  Y=X1; MUL;T3=T // T3 = T3*X1         T2   T3   T4
 ADD(T1<T+T)          // T1 = T3+T3    T1   T2   T3   T4
 X=T2; Y=T2; MUL;T-   // X3 = T2*T2    T1   T2   T3   T4
 SUB(T<T-T1)          // X3 = X3-T1    T1   T2   T3   T4
 SUB(T<T-T4) X3:=T    // X3 = X3-T4         T2   T3   T4
 SUB(X<T3-T)          // T3 = T3-X3         T2   T3   T4
 X- ;  Y- ;  MUL;T3=T // T3 = T3*T2         T2   T3   T4
 X=T4; Y=Y1; MUL;T-   // T4 = T4*Y1         T3   T4
 SUB(T<T3-T) Y3:=T    // Y3 = T3-T4         T3   T4

 */
//
// V27 is clobbered by p256MulInternal so must be
// saved in a temp.
//
// Mixed point addition: res = in1 (Jacobian) + in2 (affine).
// 'sign' conditionally negates in2.y (P - Y2); the trailing VSEL
// blocks use 'sel' and 'zero' to constant-time select among in1,
// in2, and the computed sum for the stored result.
//
// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $16-48
	MOVD res+0(FP), P3ptr
	MOVD in1+8(FP), P1ptr
	MOVD in2+16(FP), P2ptr

	MOVD $p256mul<>+0x00(SB), CPOOL

	// Offsets of the 16-byte halves of each 32-byte coordinate.
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20
	MOVD $96, R21
	MOVD $112, R22
	MOVD $128, R23
	MOVD $144, R24
	MOVD $160, R25
	MOVD $104, R26 // offset of sign+24(FP)

	LXVD2X (R16)(CPOOL), PH
	LXVD2X (R0)(CPOOL), PL

	LXVD2X (R17)(P2ptr), Y2L
	LXVD2X (R18)(P2ptr), Y2H
	XXPERMDI Y2H, Y2H, $2, Y2H
	XXPERMDI Y2L, Y2L, $2, Y2L

	// Equivalent of VLREPG sign+24(FP), SEL1
	LXVDSX (R1)(R26), SEL1
	VSPLTISB $0, ZER
	VCMPEQUD SEL1, ZER, SEL1 // mask: all-ones where sign == 0

	// Conditionally negate Y2 (compute P - Y2, select by sign mask).
	VSUBCUQ  PL, Y2L, CAR1
	VSUBUQM  PL, Y2L, T1L
	VSUBEUQM PH, Y2H, CAR1, T1H

	VSEL T1L, Y2L, SEL1, Y2L
	VSEL T1H, Y2H, SEL1, Y2H

/* *
 * Three operand formula:
 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
 */
	// X=Z1; Y=Z1; MUL; T- // T1 = Z1²      T1
	LXVD2X (R19)(P1ptr), X0 // Z1H
	LXVD2X (R20)(P1ptr), X1 // Z1L
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// X=T ; Y- ; MUL; T2=T // T2 = T1*Z1    T1   T2
	VOR T0, T0, X0
	VOR T1, T1, X1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T2L
	VOR T1, T1, T2H

	// X- ; Y=X2; MUL; T1=T // T1 = T1*X2    T1   T2
	MOVD in2+16(FP), P2ptr
	LXVD2X (R0)(P2ptr), Y0  // X2H
	LXVD2X (R16)(P2ptr), Y1 // X2L
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T1L
	VOR T1, T1, T1H

	// X=T2; Y=Y2; MUL; T- // T2 = T2*Y2    T1   T2
	VOR T2L, T2L, X0
	VOR T2H, T2H, X1
	VOR Y2L, Y2L, Y0
	VOR Y2H, Y2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T2<T-Y1) // T2 = T2-Y1    T1   T2
	MOVD in1+8(FP), P1ptr
	LXVD2X (R17)(P1ptr), Y1L
	LXVD2X (R18)(P1ptr), Y1H
	XXPERMDI Y1H, Y1H, $2, Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L
	p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)

	// SUB(Y<T1-X1) // T1 = T1-X1    T1   T2
	LXVD2X (R0)(P1ptr), X1L
	LXVD2X (R16)(P1ptr), X1H
	XXPERMDI X1H, X1H, $2, X1H
	XXPERMDI X1L, X1L, $2, X1L
	p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)

	// X=Z1; Y- ; MUL; Z3:=T // Z3 = Z1*T1         T2
	LXVD2X (R19)(P1ptr), X0 // Z1H
	LXVD2X (R20)(P1ptr), X1 // Z1L
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	CALL p256MulInternal<>(SB)

	VOR T0, T0, Z3L
	VOR T1, T1, Z3H

	// X=Y; Y- ; MUL; X=T // T3 = T1*T1         T2
	VOR Y0, Y0, X0
	VOR Y1, Y1, X1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, X0
	VOR T1, T1, X1

	// X- ; Y- ; MUL; T4=T // T4 = T3*T1         T2        T4
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T4L
	VOR T1, T1, T4H

	// X- ; Y=X1; MUL; T3=T // T3 = T3*X1         T2   T3   T4
	MOVD in1+8(FP), P1ptr
	LXVD2X (R0)(P1ptr), Y0  // X1H
	LXVD2X (R16)(P1ptr), Y1 // X1L
	XXPERMDI Y1, Y1, $2, Y1
	XXPERMDI Y0, Y0, $2, Y0
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T3L
	VOR T1, T1, T3H

	// ADD(T1<T+T) // T1 = T3+T3    T1   T2   T3   T4
	p256AddInternal(T1H,T1L, T1,T0,T1,T0)

	// X=T2; Y=T2; MUL; T- // X3 = T2*T2    T1   T2   T3   T4
	VOR T2L, T2L, X0
	VOR T2H, T2H, X1
	VOR T2L, T2L, Y0
	VOR T2H, T2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<T-T1) // X3 = X3-T1    T1   T2   T3   T4  (T1 = X3)
	p256SubInternal(T1,T0,T1,T0,T1H,T1L)

	// SUB(T<T-T4) X3:=T // X3 = X3-T4         T2   T3   T4
	p256SubInternal(T1,T0,T1,T0,T4H,T4L)
	VOR T0, T0, X3L
	VOR T1, T1, X3H

	// SUB(X<T3-T) // T3 = T3-X3         T2   T3   T4
	p256SubInternal(X1,X0,T3H,T3L,T1,T0)

	// X- ; Y- ; MUL; T3=T // T3 = T3*T2         T2   T3   T4
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T3L
	VOR T1, T1, T3H

	// X=T4; Y=Y1; MUL; T- // T4 = T4*Y1         T3   T4
	VOR T4L, T4L, X0
	VOR T4H, T4H, X1
	MOVD in1+8(FP), P1ptr
	LXVD2X (R17)(P1ptr), Y0 // Y1H
	LXVD2X (R18)(P1ptr), Y1 // Y1L
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<T3-T) Y3:=T // Y3 = T3-T4         T3   T4  (T3 = Y3)
	p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)

	// if (sel == 0) {
	//     copy(P3.x[:], X1)
	//     copy(P3.y[:], Y1)
	//     copy(P3.z[:], Z1)
	// }

	LXVD2X (R0)(P1ptr), X1L
	LXVD2X (R16)(P1ptr), X1H
	XXPERMDI X1H, X1H, $2, X1H
	XXPERMDI X1L, X1L, $2, X1L

	// Y1 already loaded, left over from addition
	LXVD2X (R19)(P1ptr), Z1L
	LXVD2X (R20)(P1ptr), Z1H
	XXPERMDI Z1H, Z1H, $2, Z1H
	XXPERMDI Z1L, Z1L, $2, Z1L

	MOVD $112, R26 // Get offset to sel+32
	LXVDSX (R1)(R26), SEL1
	VSPLTISB $0, ZER
	VCMPEQUD SEL1, ZER, SEL1 // mask: all-ones where sel == 0

	VSEL X3L, X1L, SEL1, X3L
	VSEL X3H, X1H, SEL1, X3H
	VSEL Y3L, Y1L, SEL1, Y3L
	VSEL Y3H, Y1H, SEL1, Y3H
	VSEL Z3L, Z1L, SEL1, Z3L
	VSEL Z3H, Z1H, SEL1, Z3H

	MOVD in2+16(FP), P2ptr
	LXVD2X (R0)(P2ptr), X2L
	LXVD2X (R16)(P2ptr), X2H
	XXPERMDI X2H, X2H, $2, X2H
	XXPERMDI X2L, X2L, $2, X2L

	// Y2 already loaded
	LXVD2X (R23)(CPOOL), Z2L
	LXVD2X (R24)(CPOOL), Z2H

	MOVD $120, R26 // Get the value from zero+40(FP)
	LXVDSX (R1)(R26), SEL1
	VSPLTISB $0, ZER
	VCMPEQUD SEL1, ZER, SEL1 // mask: all-ones where zero == 0

	VSEL X3L, X2L, SEL1, X3L
	VSEL X3H, X2H, SEL1, X3H
	VSEL Y3L, Y2L, SEL1, Y3L
	VSEL Y3H, Y2H, SEL1, Y3H
	VSEL Z3L, Z2L, SEL1, Z3L
	VSEL Z3H, Z2H, SEL1, Z3H

	// Reorder the bytes so they can be stored using STXVD2X.
	MOVD res+0(FP), P3ptr
	XXPERMDI X3H, X3H, $2, X3H
	XXPERMDI X3L, X3L, $2, X3L
	XXPERMDI Y3H, Y3H, $2, Y3H
	XXPERMDI Y3L, Y3L, $2, Y3L
	XXPERMDI Z3H, Z3H, $2, Z3H
	XXPERMDI Z3L, Z3L, $2, Z3L
	STXVD2X X3L, (R0)(P3ptr)
	STXVD2X X3H, (R16)(P3ptr)
	STXVD2X Y3L, (R17)(P3ptr)
	STXVD2X Y3H, (R18)(P3ptr)
	STXVD2X Z3L, (R19)(P3ptr)
	STXVD2X Z3H, (R20)(P3ptr)

	RET

#undef P3ptr
#undef P1ptr
#undef P2ptr
#undef CPOOL

#undef Y2L
#undef Y2H
#undef T1L
#undef T1H
#undef T2L
#undef T2H
#undef T3L
#undef T3H
#undef T4L
#undef T4H

#undef TT0
#undef TT1
#undef T2

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1

#undef PL
#undef PH

#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Z2L
#undef Z2H
#undef X3L
#undef X3H
#undef Y3L
#undef Y3H
#undef Z3L
#undef Z3H

#undef ZER
#undef SEL1
#undef CAR1
#undef CAR2

// http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
//
// http://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
// http://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
#define P3ptr R3
#define P1ptr R4
#define CPOOL R7

// Temporaries in REGs
#define X3L V15
#define X3H V16
#define Y3L V17
#define Y3H V18
#define T1L V19
#define T1H V20
#define T2L V21
#define T2H V22
#define T3L V23
#define T3H V24

#define X1L V6
#define X1H V7
#define Y1L V8
#define Y1H V9
#define Z1L V10
#define Z1H V11

// Temps for Sub and Add
#define TT0 V11
#define TT1 V12
#define T2  V13

// p256MulAsm Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

#define PL V30
#define PH V31

#define Z3L V23
#define Z3H V24

#define ZER  V26
#define SEL1 V27
#define CAR1 V28
#define CAR2 V29
/*
 * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
 * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
 * A  = 3(X₁-Z₁²)×(X₁+Z₁²)
 * B  = 2Y₁
 * Z₃ = B×Z₁
 * C  = B²
 * D  = C×X₁
 * X₃ = A²-2D
 * Y₃ = (D-X₃)×A-C²/2
 *
 * Three-operand formula:
 * T1 = Z1²
 * T2 = X1-T1
 * T1 = X1+T1
 * T2 = T2*T1
 * T2 = 3*T2
 * Y3 = 2*Y1
 * Z3 = Y3*Z1
 * Y3 = Y3²
 * T3 = Y3*X1
 * Y3 = Y3²
 * Y3 = half*Y3
 * X3 = T2²
 * T1 = 2*T3
 * X3 = X3-T1
 * T1 = T3-X3
 * T1 = T1*T2
 * Y3 = T1-Y3
 */
// p256PointDoubleAsm(res, in1 *p256Point)
// Jacobian point doubling: res = 2*in, following the dbl-2004-hmv
// three-operand schedule above.
TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0-16
	MOVD res+0(FP), P3ptr
	MOVD in+8(FP), P1ptr

	MOVD $p256mul<>+0x00(SB), CPOOL

	// Offsets of the 16-byte halves of the X, Y, Z coordinates.
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20

	LXVD2X (R16)(CPOOL), PH
	LXVD2X (R0)(CPOOL), PL

	// X=Z1; Y=Z1; MUL; T- // T1 = Z1²
	LXVD2X (R19)(P1ptr), X0 // Z1H
	LXVD2X (R20)(P1ptr), X1 // Z1L

	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// SUB(X<X1-T) // T2 = X1-T1
	LXVD2X (R0)(P1ptr), X1L
	LXVD2X (R16)(P1ptr), X1H
	XXPERMDI X1L, X1L, $2, X1L
	XXPERMDI X1H, X1H, $2, X1H

	p256SubInternal(X1,X0,X1H,X1L,T1,T0)

	// ADD(Y<X1+T) // T1 = X1+T1
	p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)

	// X- ; Y- ; MUL; T- // T2 = T2*T1
	CALL p256MulInternal<>(SB)

	// ADD(T2<T+T); ADD(T2<T2+T) // T2 = 3*T2
	p256AddInternal(T2H,T2L,T1,T0,T1,T0)
	p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)

	// ADD(X<Y1+Y1) // Y3 = 2*Y1
	LXVD2X (R17)(P1ptr), Y1L
	LXVD2X (R18)(P1ptr), Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L
	XXPERMDI Y1H, Y1H, $2, Y1H

	p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)

	// X- ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
	LXVD2X (R19)(P1ptr), Y0
	LXVD2X (R20)(P1ptr), Y1
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1

	CALL p256MulInternal<>(SB)

	// Leave T0, T1 as is.
	XXPERMDI T0, T0, $2, TT0
	XXPERMDI T1, T1, $2, TT1
	STXVD2X TT0, (R19)(P3ptr)
	STXVD2X TT1, (R20)(P3ptr)

	// X- ; Y=X ; MUL; T- // Y3 = Y3²
	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// X=T ; Y=X1; MUL; T3=T // T3 = Y3*X1
	VOR T0, T0, X0
	VOR T1, T1, X1
	LXVD2X (R0)(P1ptr), Y0
	LXVD2X (R16)(P1ptr), Y1
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T3L
	VOR T1, T1, T3H

	// X- ; Y=X ; MUL; T- // Y3 = Y3²
	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// HAL(Y3<T) // Y3 = half*Y3
	p256HalfInternal(Y3H,Y3L, T1,T0)

	// X=T2; Y=T2; MUL; T- // X3 = T2²
	VOR T2L, T2L, X0
	VOR T2H, T2H, X1
	VOR T2L, T2L, Y0
	VOR T2H, T2H, Y1
	CALL p256MulInternal<>(SB)

	// ADD(T1<T3+T3) // T1 = 2*T3
	p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)

	// SUB(X3<T-T1) X3:=X3 // X3 = X3-T1
	p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)

	XXPERMDI X3L, X3L, $2, TT0
	XXPERMDI X3H, X3H, $2, TT1
	STXVD2X TT0, (R0)(P3ptr)
	STXVD2X TT1, (R16)(P3ptr)

	// SUB(X<T3-X3) // T1 = T3-X3
	p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)

	// X- ; Y- ; MUL; T- // T1 = T1*T2
	CALL p256MulInternal<>(SB)

	// SUB(Y3<T-Y3) // Y3 = T1-Y3
	p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)

	XXPERMDI Y3L, Y3L, $2, Y3L
	XXPERMDI Y3H, Y3H, $2, Y3H
	STXVD2X Y3L, (R17)(P3ptr)
	STXVD2X Y3H, (R18)(P3ptr)
	RET

#undef P3ptr
#undef P1ptr
#undef CPOOL
#undef X3L
#undef X3H
#undef Y3L
#undef Y3H
#undef T1L
#undef T1H
#undef T2L
#undef T2H
#undef T3L
#undef T3H
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef TT0
#undef TT1
#undef T2
#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef PL
#undef PH
#undef Z3L
#undef Z3H
#undef ZER
#undef SEL1
#undef CAR1
#undef CAR2

#define P3ptr R3
#define P1ptr R4
#define P2ptr R5
#define CPOOL R7
#define TRUE  R14
#define RES1  R9
#define RES2  R10

// Temporaries in REGs
#define T1L V16
#define T1H V17
#define T2L V18
#define T2H V19
#define U1L V20
#define U1H V21
#define S1L V22
#define S1H V23
#define HL  V24
#define HH  V25
#define RL  V26
#define RH  V27

// Temps for Sub and Add
#define ZER  V6
#define SEL1 V7
#define CAR1 V8
#define CAR2 V9
#define TT0  V11
#define TT1  V12
#define T2   V13

// p256MulAsm Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

#define PL V30
#define PH V31
/*
 * https://choucroutage.com/Papers/SideChannelAttacks/ctrsa-2011-brown.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
 *
 * A  = X₁×Z₂²
 * B  = Y₁×Z₂³
 * C  = X₂×Z₁²-A
 * D  = Y₂×Z₁³-B
 * X₃ = D² - 2A×C² - C³
 * Y₃ = D×(A×C² - X₃) - B×C³
 * Z₃ = Z₁×Z₂×C
 *
 * Three-operand formula (adopted): http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
 * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
 *
 * T1 = Z1*Z1
 * T2 = Z2*Z2
 * U1 = X1*T2
 * H  = X2*T1
 * H  = H-U1
 * Z3 = Z1*Z2
 * Z3 = Z3*H << store-out Z3 result reg..
 * could override Z1, if slices have same backing array
 *
 * S1 = Z2*T2
 * S1 = Y1*S1
 * R  = Z1*T1
 * R  = Y2*R
 * R  = R-S1
 *
 * T1 = H*H
 * T2 = H*T1
 * U1 = U1*T1
 *
 * X3 = R*R
 * X3 = X3-T2
 * T1 = 2*U1
 * X3 = X3-T1 << store-out X3 result reg
 *
 * T2 = S1*T2
 * Y3 = U1-X3
 * Y3 = R*Y3
 * Y3 = Y3-T2 << store-out Y3 result reg

 // X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
 // X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
 // X=X2; Y-  ; MUL; H=T  // H  = X2*T1
 // X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
 // X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
 // X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
 // SUB(H<H-T)            // H  = H-U1
 // X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
 // X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
 // X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
 // X=Y2; Y=R ; MUL; T-   // R  = Y2*R
 // SUB(R<T-S1)           // R  = R-S1
 // X=H ; Y=H ; MUL; T-   // T1 = H*H
 // X-  ; Y=T ; MUL; T2=T // T2 = H*T1
 // X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
 // X=R ; Y=R ; MUL; T-   // X3 = R*R
 // SUB(T<T-T2)           // X3 = X3-T2
 // ADD(X<U1+U1)          // T1 = 2*U1
 // SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
 // SUB(Y<U1-T)           // Y3 = U1-X3
 // X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
 // X=S1; Y=T2; MUL; T-   // T2 = S1*T2
 // SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
 */
// p256PointAddAsm(res, in1, in2 *p256Point)
// Full Jacobian point addition res = in1 + in2 (add-1998-cmo-2).
// Returns 1 in ret+24(FP) when the points are equal or negatives of
// each other (H and R degenerate), so the caller can fall back to
// doubling; otherwise returns 0. RH lives in V27, which
// p256MulInternal clobbers, so it is spilled to the stack frame.
TEXT ·p256PointAddAsm(SB), NOSPLIT, $16-32
	MOVD res+0(FP), P3ptr
	MOVD in1+8(FP), P1ptr
	MOVD $p256mul<>+0x00(SB), CPOOL
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20

	LXVD2X (R16)(CPOOL), PH
	LXVD2X (R0)(CPOOL), PL

	// X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1
	LXVD2X (R19)(P1ptr), X0 // Z1L
	LXVD2X (R20)(P1ptr), X1 // Z1H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// X- ; Y=T ; MUL; R=T // R = Z1*T1
	VOR T0, T0, Y0
	VOR T1, T1, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, RL // SAVE: RL
	VOR T1, T1, RH // SAVE: RH

	STXVD2X RH, (R1)(R17) // V27 has to be saved

	// X=X2; Y- ; MUL; H=T // H = X2*T1
	MOVD in2+16(FP), P2ptr
	LXVD2X (R0)(P2ptr), X0  // X2L
	LXVD2X (R16)(P2ptr), X1 // X2H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, HL // SAVE: HL
	VOR T1, T1, HH // SAVE: HH

	// X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2
	MOVD in2+16(FP), P2ptr
	LXVD2X (R19)(P2ptr), X0 // Z2L
	LXVD2X (R20)(P2ptr), X1 // Z2H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// X- ; Y=T ; MUL; S1=T // S1 = Z2*T2
	VOR T0, T0, Y0
	VOR T1, T1, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, S1L // SAVE: S1L
	VOR T1, T1, S1H // SAVE: S1H

	// X=X1; Y- ; MUL; U1=T // U1 = X1*T2
	MOVD in1+8(FP), P1ptr
	LXVD2X (R0)(P1ptr), X0  // X1L
	LXVD2X (R16)(P1ptr), X1 // X1H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, U1L // SAVE: U1L
	VOR T1, T1, U1H // SAVE: U1H

	// SUB(H<H-T) // H = H-U1
	p256SubInternal(HH,HL,HH,HL,T1,T0)

	// if H == 0 or H^P == 0 then ret=1 else ret=0
	// clobbers T1H and T1L
	MOVD $1, TRUE
	VSPLTISB $0, ZER
	VOR HL, HH, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES1
	VXOR HL, PL, T1L // SAVE: T1L
	VXOR HH, PH, T1H // SAVE: T1H
	VOR T1L, T1H, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES2
	OR RES2, RES1, RES1
	MOVD RES1, ret+24(FP)

	// X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2
	MOVD in1+8(FP), P1ptr
	MOVD in2+16(FP), P2ptr
	LXVD2X (R19)(P1ptr), X0 // Z1L
	LXVD2X (R20)(P1ptr), X1 // Z1H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	LXVD2X (R19)(P2ptr), Y0 // Z2L
	LXVD2X (R20)(P2ptr), Y1 // Z2H
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1
	CALL p256MulInternal<>(SB)

	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
	VOR T0, T0, X0
	VOR T1, T1, X1
	VOR HL, HL, Y0
	VOR HH, HH, Y1
	CALL p256MulInternal<>(SB)
	MOVD res+0(FP), P3ptr
	XXPERMDI T1, T1, $2, TT1
	XXPERMDI T0, T0, $2, TT0
	STXVD2X TT0, (R19)(P3ptr)
	STXVD2X TT1, (R20)(P3ptr)

	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
	MOVD in1+8(FP), P1ptr
	LXVD2X (R17)(P1ptr), X0
	LXVD2X (R18)(P1ptr), X1
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR S1L, S1L, Y0
	VOR S1H, S1H, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, S1L
	VOR T1, T1, S1H

	// X=Y2; Y=R ; MUL; T- // R = Y2*R
	MOVD in2+16(FP), P2ptr
	LXVD2X (R17)(P2ptr), X0
	LXVD2X (R18)(P2ptr), X1
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR RL, RL, Y0

	// VOR RH, RH, Y1   RH was saved above in D2X format
	LXVD2X (R1)(R17), Y1
	CALL p256MulInternal<>(SB)

	// SUB(R<T-S1) // R = T-S1
	p256SubInternal(RH,RL,T1,T0,S1H,S1L)

	STXVD2X RH, (R1)(R17) // Save RH

	// if R == 0 or R^P == 0 then ret=ret else ret=0
	// clobbers T1H and T1L
	// Redo this using ISEL??
	MOVD $1, TRUE
	VSPLTISB $0, ZER
	VOR RL, RH, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES1
	VXOR RL, PL, T1L
	VXOR RH, PH, T1H // SAVE: T1L
	VOR T1L, T1H, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES2
	OR RES2, RES1, RES1
	MOVD ret+24(FP), RES2
	AND RES2, RES1, RES1
	MOVD RES1, ret+24(FP)

	// X=H ; Y=H ; MUL; T- // T1 = H*H
	VOR HL, HL, X0
	VOR HH, HH, X1
	VOR HL, HL, Y0
	VOR HH, HH, Y1
	CALL p256MulInternal<>(SB)

	// X- ; Y=T ; MUL; T2=T // T2 = H*T1
	VOR T0, T0, Y0
	VOR T1, T1, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T2L
	VOR T1, T1, T2H

	// X=U1; Y- ; MUL; U1=T // U1 = U1*T1
	VOR U1L, U1L, X0
	VOR U1H, U1H, X1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, U1L
	VOR T1, T1, U1H

	// X=R ; Y=R ; MUL; T- // X3 = R*R
	VOR RL, RL, X0

	// VOR RH, RH, X1
	VOR RL, RL, Y0

	// RH was saved above using STXVD2X
	LXVD2X (R1)(R17), X1
	VOR X1, X1, Y1

	// VOR RH, RH, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<T-T2) // X3 = X3-T2
	p256SubInternal(T1,T0,T1,T0,T2H,T2L)

	// ADD(X<U1+U1) // T1 = 2*U1
	p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)

	// SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg
	p256SubInternal(T1,T0,T1,T0,X1,X0)
	MOVD res+0(FP), P3ptr
	XXPERMDI T1, T1, $2, TT1
	XXPERMDI T0, T0, $2, TT0
	STXVD2X TT0, (R0)(P3ptr)
	STXVD2X TT1, (R16)(P3ptr)

	// SUB(Y<U1-T) // Y3 = U1-X3
	p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)

	// X=R ; Y- ; MUL; U1=T // Y3 = R*Y3
	VOR RL, RL, X0

	// VOR RH, RH, X1
	LXVD2X (R1)(R17), X1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, U1L
	VOR T1, T1, U1H

	// X=S1; Y=T2; MUL; T- // T2 = S1*T2
	VOR S1L, S1L, X0
	VOR S1H, S1H, X1
	VOR T2L, T2L, Y0
	VOR T2H, T2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg
	p256SubInternal(T1,T0,U1H,U1L,T1,T0)
	MOVD res+0(FP), P3ptr
	XXPERMDI T1, T1, $2, TT1
	XXPERMDI T0, T0, $2, TT0
	STXVD2X TT0, (R17)(P3ptr)
	STXVD2X TT1, (R18)(P3ptr)

	RET