// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "textflag.h"

// NOTE(review): this file was recovered from a flattened listing in which the
// original line numbers had been fused into the text; the code below is the
// de-mangled reconstruction. No instruction, operand, or directive was
// changed — only line structure and indentation were restored.

// This is a port of the s390x asm implementation.
// to ppc64le.

// Some changes were needed due to differences in
// the Go opcodes and/or available instructions
// between s390x and ppc64le.

// 1. There were operand order differences in the
// VSUBUQM, VSUBCUQ, and VSEL instructions.

// 2. ppc64 does not have a multiply high and low
// like s390x, so those were implemented using
// macros to compute the equivalent values.

// 3. The LVX, STVX instructions on ppc64 require
// 16 byte alignment of the data. To avoid that
// requirement, data is loaded using LXVD2X and
// STXVD2X with VPERM to reorder bytes correctly.

// I have identified some areas where I believe
// changes would be needed to make this work for big
// endian; however additional changes beyond what I
// have noted are most likely needed to make it work.
// - The string used with VPERM to swap the byte order
//   for loads and stores.
// - The constants that are loaded from CPOOL.
//

// The following constants are defined in an order
// that is correct for use with LXVD2X/STXVD2X
// on little endian.
DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256
DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256
DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256
DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0  d1 d0  0
DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0  d1 d0  0
DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x00(SB)/8, $0x00000000ffffffff // P256 original
DATA p256mul<>+0x08(SB)/8, $0xffffffffffffffff // P256
DATA p256mul<>+0x10(SB)/8, $0xffffffff00000001 // P256 original
DATA p256mul<>+0x18(SB)/8, $0x0000000000000000 // P256
DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0  0  0 d0
DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0  0  0 d0
DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0  0 d1 d0
DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0  0 d1 d0
DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL  0 d1 d0 d1
DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL  0 d1 d0 d1
DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL  0  0 d1 d0
DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL  0  0 d1 d0
DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL  0 d1 d0  0
DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL  0 d1 d0  0
DATA p256mul<>+0x80(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
DATA p256mul<>+0x88(SB)/8, $0x0000000000000001 // (1*2^256)%P256
DATA p256mul<>+0x90(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
DATA p256mul<>+0x98(SB)/8, $0xffffffffffffffff // (1*2^256)%P256

// External declarations for constants
// NOTE(review): p256ord<> has no DATA in this chunk — presumably initialized
// elsewhere or unused padding; verify against the rest of the package.
GLOBL p256ord<>(SB), 8, $32
GLOBL p256<>(SB), 8, $80
GLOBL p256mul<>(SB), 8, $160

// The following macros are used to implement the ppc64le
// equivalent function from the corresponding s390x
// instruction for vector multiply high, low, and add,
// since there aren't exact equivalent instructions.
// The corresponding s390x instructions appear in the
// comments.
// Implementation for big endian would have to be
// investigated, I think it would be different.
//
//
// Vector multiply word
//
//	VMLF  x0, x1, out_low
//	VMLHF x0, x1, out_hi
#define VMULT(x1, x2, out_low, out_hi) \
	VMULEUW x1, x2, TMP1; \
	VMULOUW x1, x2, TMP2; \
	VMRGEW TMP1, TMP2, out_hi; \
	VMRGOW TMP1, TMP2, out_low

//
// Vector multiply add word
//
//	VMALF  x0, x1, y, out_low
//	VMALHF x0, x1, y, out_hi
#define VMULT_ADD(x1, x2, y, one, out_low, out_hi) \
	VMULEUW y, one, TMP2; \
	VMULOUW y, one, TMP1; \
	VMULEUW x1, x2, out_low; \
	VMULOUW x1, x2, out_hi; \
	VADDUDM TMP2, out_low, TMP2; \
	VADDUDM TMP1, out_hi, TMP1; \
	VMRGOW TMP2, TMP1, out_low; \
	VMRGEW TMP2, TMP1, out_hi

#define res_ptr R3
#define a_ptr R4

#undef res_ptr
#undef a_ptr

#define P1ptr R3
#define CPOOL R7

#define Y1L V0
#define Y1H V1
#define T1L V2
#define T1H V3

#define PL V30
#define PH V31

#define CAR1 V6
// func p256NegCond(val *p256Point, cond int)
TEXT ·p256NegCond(SB), NOSPLIT, $0-16
	MOVD val+0(FP), P1ptr
	MOVD $16, R16

	MOVD cond+8(FP), R6
	CMP  $0, R6
	BC   12, 2, LR // just return if cond == 0

	MOVD $p256mul<>+0x00(SB), CPOOL

	LXVD2X (P1ptr)(R0), Y1L
	LXVD2X (P1ptr)(R16), Y1H

	XXPERMDI Y1H, Y1H, $2, Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L

	LXVD2X (CPOOL)(R0), PL
	LXVD2X (CPOOL)(R16), PH

	VSUBCUQ  PL, Y1L, CAR1      // subtract part2 giving carry
	VSUBUQM  PL, Y1L, T1L       // subtract part2 giving result
	VSUBEUQM PH, Y1H, CAR1, T1H // subtract part1 using carry from part2

	XXPERMDI T1H, T1H, $2, T1H
	XXPERMDI T1L, T1L, $2, T1L

	STXVD2X T1L, (R0+P1ptr)
	STXVD2X T1H, (R16+P1ptr)
	RET

#undef P1ptr
#undef CPOOL
#undef Y1L
#undef Y1H
#undef T1L
#undef T1H
#undef PL
#undef PH
#undef CAR1

#define P3ptr R3
#define P1ptr R4
#define P2ptr R5

#define X1L V0
#define X1H V1
#define Y1L V2
#define Y1H V3
#define Z1L V4
#define Z1H V5
#define X2L V6
#define X2H V7
#define Y2L V8
#define Y2H V9
#define Z2L V10
#define Z2H V11
#define SEL V12
#define ZER V13

// This function uses LXVD2X and STXVD2X to avoid the
// data alignment requirement for LVX, STVX. Since
// this code is just moving bytes and not doing arithmetic,
// order of the bytes doesn't matter.
//
// func p256MovCond(res, a, b *p256Point, cond int)
TEXT ·p256MovCond(SB), NOSPLIT, $0-32
	MOVD res+0(FP), P3ptr
	MOVD a+8(FP), P1ptr
	MOVD b+16(FP), P2ptr
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $56, R21
	MOVD $64, R19
	MOVD $80, R20

	// cond is R1 + 24 (cond offset) + 32
	LXVDSX   (R1)(R21), SEL
	VSPLTISB $0, ZER

	// SEL controls whether to store a or b
	VCMPEQUD SEL, ZER, SEL

	LXVD2X (P1ptr+R0), X1H
	LXVD2X (P1ptr+R16), X1L
	LXVD2X (P1ptr+R17), Y1H
	LXVD2X (P1ptr+R18), Y1L
	LXVD2X (P1ptr+R19), Z1H
	LXVD2X (P1ptr+R20), Z1L

	LXVD2X (P2ptr+R0), X2H
	LXVD2X (P2ptr+R16), X2L
	LXVD2X (P2ptr+R17), Y2H
	LXVD2X (P2ptr+R18), Y2L
	LXVD2X (P2ptr+R19), Z2H
	LXVD2X (P2ptr+R20), Z2L

	VSEL X1H, X2H, SEL, X1H
	VSEL X1L, X2L, SEL, X1L
	VSEL Y1H, Y2H, SEL, Y1H
	VSEL Y1L, Y2L, SEL, Y1L
	VSEL Z1H, Z2H, SEL, Z1H
	VSEL Z1L, Z2L, SEL, Z1L

	STXVD2X X1H, (P3ptr+R0)
	STXVD2X X1L, (P3ptr+R16)
	STXVD2X Y1H, (P3ptr+R17)
	STXVD2X Y1L, (P3ptr+R18)
	STXVD2X Z1H, (P3ptr+R19)
	STXVD2X Z1L, (P3ptr+R20)

	RET

#undef P3ptr
#undef P1ptr
#undef P2ptr
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef SEL
#undef ZER

#define P3ptr R3
#define P1ptr R4
#define COUNT R5

#define X1L V0
#define X1H V1
#define Y1L V2
#define Y1H V3
#define Z1L V4
#define Z1H V5
#define X2L V6
#define X2H V7
#define Y2L V8
#define Y2H V9
#define Z2L V10
#define Z2H V11

#define ONE  V18
#define IDX  V19
#define SEL1 V20
#define SEL2 V21

// func p256Select(point *p256Point, table *p256Table, idx int)
TEXT ·p256Select(SB), NOSPLIT, $0-24
	MOVD res+0(FP), P3ptr
	MOVD table+8(FP), P1ptr
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20

	LXVDSX   (R1)(R18), SEL1 // VLREPG idx+32(FP), SEL1
	VSPLTB   $7, SEL1, IDX   // splat byte
	VSPLTISB $1, ONE         // VREPIB $1, ONE
	VSPLTISB $1, SEL2        // VREPIB $1, SEL2
	MOVD     $17, COUNT
	MOVD     COUNT, CTR      // set up ctr

	VSPLTISB $0, X1H // VZERO X1H
	VSPLTISB $0, X1L // VZERO X1L
	VSPLTISB $0, Y1H // VZERO Y1H
	VSPLTISB $0, Y1L // VZERO Y1L
	VSPLTISB $0, Z1H // VZERO Z1H
	VSPLTISB $0, Z1L // VZERO Z1L

loop_select:

	// LVXD2X is used here since data alignment doesn't
	// matter.

	LXVD2X (P1ptr+R0), X2H
	LXVD2X (P1ptr+R16), X2L
	LXVD2X (P1ptr+R17), Y2H
	LXVD2X (P1ptr+R18), Y2L
	LXVD2X (P1ptr+R19), Z2H
	LXVD2X (P1ptr+R20), Z2L

	VCMPEQUD SEL2, IDX, SEL1 // VCEQG SEL2, IDX, SEL1 OK

	// This will result in SEL1 being all 0s or 1s, meaning
	// the result is either X1L or X2L, no individual byte
	// selection.

	VSEL X1L, X2L, SEL1, X1L
	VSEL X1H, X2H, SEL1, X1H
	VSEL Y1L, Y2L, SEL1, Y1L
	VSEL Y1H, Y2H, SEL1, Y1H
	VSEL Z1L, Z2L, SEL1, Z1L
	VSEL Z1H, Z2H, SEL1, Z1H

	// Add 1 to all bytes in SEL2
	VADDUBM SEL2, ONE, SEL2 // VAB SEL2, ONE, SEL2 OK
	ADD     $96, P1ptr
	BDNZ    loop_select

	// STXVD2X is used here so that alignment doesn't
	// need to be verified. Since values were loaded
	// using LXVD2X this is OK.
	STXVD2X X1H, (P3ptr+R0)
	STXVD2X X1L, (P3ptr+R16)
	STXVD2X Y1H, (P3ptr+R17)
	STXVD2X Y1L, (P3ptr+R18)
	STXVD2X Z1H, (P3ptr+R19)
	STXVD2X Z1L, (P3ptr+R20)
	RET

#undef P3ptr
#undef P1ptr
#undef COUNT
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef ONE
#undef IDX
#undef SEL1
#undef SEL2

// The following functions all reverse the byte order.

//func p256BigToLittle(res *p256Element, in *[32]byte)
TEXT ·p256BigToLittle(SB), NOSPLIT, $0-16
	MOVD res+0(FP), R3
	MOVD in+8(FP), R4
	BR   p256InternalEndianSwap<>(SB)

//func p256LittleToBig(res *[32]byte, in *p256Element)
TEXT ·p256LittleToBig(SB), NOSPLIT, $0-16
	MOVD res+0(FP), R3
	MOVD in+8(FP), R4
	BR   p256InternalEndianSwap<>(SB)

//func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
TEXT ·p256OrdBigToLittle(SB), NOSPLIT, $0-16
	MOVD res+0(FP), R3
	MOVD in+8(FP), R4
	BR   p256InternalEndianSwap<>(SB)

//func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
TEXT ·p256OrdLittleToBig(SB), NOSPLIT, $0-16
	MOVD res+0(FP), R3
	MOVD in+8(FP), R4
	BR   p256InternalEndianSwap<>(SB)

TEXT p256InternalEndianSwap<>(SB), NOSPLIT, $0-0
	// Index registers needed for BR movs
	MOVD $8, R9
	MOVD $16, R10
	MOVD $24, R14

	MOVDBR (R0)(R4), R5
	MOVDBR (R9)(R4), R6
	MOVDBR (R10)(R4), R7
	MOVDBR (R14)(R4), R8

	MOVD R8, 0(R3)
	MOVD R7, 8(R3)
	MOVD R6, 16(R3)
	MOVD R5, 24(R3)

	RET

#define P3ptr R3
#define P1ptr R4
#define COUNT R5

#define X1L V0
#define X1H V1
#define Y1L V2
#define Y1H V3
#define Z1L V4
#define Z1H V5
#define X2L V6
#define X2H V7
#define Y2L V8
#define Y2H V9
#define Z2L V10
#define Z2H V11

#define ONE  V18
#define IDX  V19
#define SEL1 V20
#define SEL2 V21

// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
TEXT ·p256SelectAffine(SB), NOSPLIT, $0-24
	MOVD res+0(FP), P3ptr
	MOVD table+8(FP), P1ptr
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18

	LXVDSX (R1)(R18), SEL1
	VSPLTB $7, SEL1, IDX // splat byte

	VSPLTISB $1, ONE  // Vector with byte 1s
	VSPLTISB $1, SEL2 // Vector with byte 1s
	MOVD     $64, COUNT
	MOVD     COUNT, CTR // loop count

	VSPLTISB $0, X1H // VZERO X1H
	VSPLTISB $0, X1L // VZERO X1L
	VSPLTISB $0, Y1H // VZERO Y1H
	VSPLTISB $0, Y1L // VZERO Y1L

loop_select:
	LXVD2X (P1ptr+R0), X2H
	LXVD2X (P1ptr+R16), X2L
	LXVD2X (P1ptr+R17), Y2H
	LXVD2X (P1ptr+R18), Y2L

	VCMPEQUD SEL2, IDX, SEL1 // Compare against idx

	VSEL X1L, X2L, SEL1, X1L // Select if idx matched
	VSEL X1H, X2H, SEL1, X1H
	VSEL Y1L, Y2L, SEL1, Y1L
	VSEL Y1H, Y2H, SEL1, Y1H

	VADDUBM SEL2, ONE, SEL2 // Increment SEL2 bytes by 1
	ADD     $64, P1ptr      // Next chunk
	BDNZ    loop_select

	STXVD2X X1H, (P3ptr+R0)
	STXVD2X X1L, (P3ptr+R16)
	STXVD2X Y1H, (P3ptr+R17)
	STXVD2X Y1L, (P3ptr+R18)
	RET

#undef P3ptr
#undef P1ptr
#undef COUNT
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef ONE
#undef IDX
#undef SEL1
#undef SEL2

#define res_ptr R3
#define x_ptr R4
#define CPOOL R7

#define T0  V0
#define T1  V1
#define T2  V2
#define TT0 V3
#define TT1 V4

#define ZER  V6
#define SEL1 V7
#define SEL2 V8
#define CAR1 V9
#define CAR2 V10
#define RED1 V11
#define RED2 V12
#define PL   V13
#define PH   V14

// func p256FromMont(res, in *p256Element)
TEXT ·p256FromMont(SB), NOSPLIT, $0-16
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr

	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $p256<>+0x00(SB), CPOOL

	VSPLTISB $0, T2  // VZERO T2
	VSPLTISB $0, ZER // VZERO ZER

	// Constants are defined so that the LXVD2X is correct
	LXVD2X (CPOOL+R0), PH
	LXVD2X (CPOOL+R16), PL

	// VPERM byte selections
	LXVD2X (CPOOL+R18), SEL2
	LXVD2X (CPOOL+R19), SEL1

	LXVD2X (R16)(x_ptr), T1
	LXVD2X (R0)(x_ptr), T0

	// Put in true little endian order
	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1

	// First round
	VPERM   T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM   ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSUBUQM RED2, RED1, RED2      // VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2

	// Second round
	VPERM   T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM   ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSUBUQM RED2, RED1, RED2      // VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2

	// Third round
	VPERM   T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM   ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSUBUQM RED2, RED1, RED2      // VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2

	// Last round
	VPERM   T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM   ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSUBUQM RED2, RED1, RED2      // VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2

	// ---------------------------------------------------

	VSUBCUQ  T0, PL, CAR1       // VSCBIQ  PL, T0, CAR1
	VSUBUQM  T0, PL, TT0        // VSQ     PL, T0, TT0
	VSUBECUQ T1, PH, CAR1, CAR2 // VSBCBIQ T1, PH, CAR1, CAR2
	VSUBEUQM T1, PH, CAR1, TT1  // VSBIQ   T1, PH, CAR1, TT1
	VSUBEUQM T2, ZER, CAR2, T2  // VSBIQ   T2, ZER, CAR2, T2

	VSEL TT0, T0, T2, T0
	VSEL TT1, T1, T2, T1

	// Reorder the bytes so STXVD2X can be used.
	// TT0, TT1 used for VPERM result in case
	// the caller expects T0, T1 to be good.
	XXPERMDI T0, T0, $2, TT0
	XXPERMDI T1, T1, $2, TT1

	STXVD2X TT0, (R0)(res_ptr)
	STXVD2X TT1, (R16)(res_ptr)
	RET

#undef res_ptr
#undef x_ptr
#undef CPOOL
#undef T0
#undef T1
#undef T2
#undef TT0
#undef TT1
#undef ZER
#undef SEL1
#undef SEL2
#undef CAR1
#undef CAR2
#undef RED1
#undef RED2
#undef PL
#undef PH

// ---------------------------------------
// p256MulInternal
// V0-V3 V30,V31 - Not Modified
// V4-V15 V27-V29 - Volatile

#define CPOOL R7

// Parameters
#define X0 V0 // Not modified
#define X1 V1 // Not modified
#define Y0 V2 // Not modified
#define Y1 V3 // Not modified
#define T0 V4 // Result
#define T1 V5 // Result
#define P0 V30 // Not modified
#define P1 V31 // Not modified

// Temporaries: lots of reused vector regs
#define YDIG  V6 // Overloaded with CAR2
#define ADD1H V7 // Overloaded with ADD3H
#define ADD2H V8 // Overloaded with ADD4H
#define ADD3  V9 // Overloaded with SEL2,SEL5
#define ADD4  V10 // Overloaded with SEL3,SEL6
#define RED1  V11 // Overloaded with CAR2
#define RED2  V12
#define RED3  V13 // Overloaded with SEL1
#define T2    V14
// Overloaded temporaries
#define ADD1  V4 // Overloaded with T0
#define ADD2  V5 // Overloaded with T1
#define ADD3H V7 // Overloaded with ADD1H
#define ADD4H V8 // Overloaded with ADD2H
#define ZER   V28 // Overloaded with TMP1
#define CAR1  V6 // Overloaded with YDIG
#define CAR2  V11 // Overloaded with RED1
// Constant Selects
#define SEL1 V13 // Overloaded with RED3
#define SEL2 V9 // Overloaded with ADD3,SEL5
#define SEL3 V10 // Overloaded with ADD4,SEL6
#define SEL4 V6 // Overloaded with YDIG,CAR1
#define SEL5 V9 // Overloaded with ADD3,SEL2
#define SEL6 V10 // Overloaded with ADD4,SEL3

// TMP1, TMP2 used in
// VMULT macros
#define TMP1 V13 // Overloaded with RED3
#define TMP2 V27
#define ONE  V29 // 1s splatted by word

/* *
 * To follow the flow of bits, for your own sanity a stiff drink, need you shall.
 * Of a single round, a 'helpful' picture, here is. Meaning, column position has.
 * With you, SIMD be...
 *
 *                                           +--------+--------+
 *                                  +--------|  RED2  |  RED1  |
 *                                  |        +--------+--------+
 *                                  |       ---+--------+--------+
 *                                  |  +---- T2|   T1   |   T0   |--+
 *                                  |  |    ---+--------+--------+  |
 *                                  |  |                            |
 *                                  |  |    =======================  |
 *                                  |  |                            |
 *                                  |  |       +--------+--------+<-+
 *                                  |  +-------|  ADD2  |  ADD1  |--|-----+
 *                                  |  |       +--------+--------+  |     |
 *                                  |  |     +--------+--------+<---+     |
 *                                  |  |     | ADD2H  | ADD1H  |--+       |
 *                                  |  |     +--------+--------+  |       |
 *                                  |  |     +--------+--------+<-+       |
 *                                  |  |     |  ADD4  |  ADD3  |--|-+     |
 *                                  |  |     +--------+--------+  | |     |
 *                                  |  |     +--------+--------+<---+     |
 *                                  |  |     | ADD4H  | ADD3H  |------|-+ |(+vzero)
 *                                  |  |     +--------+--------+      | | V
 *                                  |  | ------------------------     | | +--------+
 *                                  |  |                              | | |  RED3  |  [d0 0 0 d0]
 *                                  |  |                              | | +--------+
 *                                  |  +---->+--------+--------+      | |   |
 *   (T2[1w]||ADD2[4w]||ADD1[3w])   +--------|   T1   |   T0   |      | |   |
 *                                           +--------+--------+      | |   |
 *                                  +---->---+--------+--------+      | |   |
 *                                         T2|   T1   |   T0   |----+ | |   |
 *                                        ---+--------+--------+    | | |   |
 *                                        ---+--------+--------+<---+ | |   |
 *                                    +--- T2|   T1   |   T0   |----------+
 *                                    |   ---+--------+--------+    | |   |
 *                                    |  +--------+--------+<-------------+
 *                                    |  |  RED2  |  RED1  |-----+  | |   | [0 d1 d0 d1] [d0 0 d1 d0]
 *                                    |  +--------+--------+     |  | |   |
 *                                    |  +--------+<----------------------+
 *                                    |  |  RED3  |--------------+  | |     [0 0 d1 d0]
 *                                    |  +--------+              |  | |
 *                                    +--->+--------+--------+   |  | |
 *                                         |   T1   |   T0   |--------+
 *                                         +--------+--------+   |  |
 *                                   --------------------------- |  |
 *                                                               |  |
 *                                         +--------+--------+<----+
 *                                         |  RED2  |  RED1  |   |
 *                                         +--------+--------+   |
 *                                        ---+--------+--------+<-------+
 *                                         T2|   T1   |   T0   |  (H1P-H1P-H00RRAY!)
 *                                        ---+--------+--------+
 *
 * *Mi obra de arte de siglo XXI @vpaprots
 *
 *
 * First group is special, doesn't get the two inputs:
 *                                         +--------+--------+<-+
 *                                +--------|  ADD2  |  ADD1  |--|-----+
 *                                |        +--------+--------+  |     |
 *                                |      +--------+--------+<---+     |
 *                                |      | ADD2H  | ADD1H  |--+       |
 *                                |      +--------+--------+  |       |
 *                                |      +--------+--------+<-+       |
 *                                |      |  ADD4  |  ADD3  |--|-+     |
 *                                |      +--------+--------+  | |     |
 *                                |      +--------+--------+<---+     |
 *                                |      | ADD4H  | ADD3H  |------|-+ |(+vzero)
 *                                |      +--------+--------+      | | V
 *                                |  ------------------------     | | +--------+
 *                                |                               | | |  RED3  |  [d0 0 0 d0]
 *                                |                               | | +--------+
 *                                +---->+--------+--------+       | |   |
 *   (T2[1w]||ADD2[4w]||ADD1[3w])       |   T1   |   T0   |----+  | |   |
 *                                      +--------+--------+    |  | |   |
 *                                   ---+--------+--------+<---+  | |   |
 *                               +--- T2|   T1   |   T0   |----------+
 *                               |   ---+--------+--------+    |  |   |
 *                               |  +--------+--------+<-------------+
 *                               |  |  RED2  |  RED1  |-----+  |  |   | [0 d1 d0 d1] [d0 0 d1 d0]
 *                               |  +--------+--------+     |  |  |   |
 *                               |  +--------+<----------------------+
 *                               |  |  RED3  |--------------+  |  |     [0 0 d1 d0]
 *                               |  +--------+              |  |  |
 *                               +--->+--------+--------+   |  |  |
 *                                    |   T1   |   T0   |--------+
 *                                    +--------+--------+   |  |
 *                              --------------------------- |  |
 *                                                          |  |
 *                                    +--------+--------+<----+
 *                                    |  RED2  |  RED1  |   |
 *                                    +--------+--------+   |
 *                                   ---+--------+--------+<-------+
 *                                    T2|   T1   |   T0   |  (H1P-H1P-H00RRAY!)
 *                                   ---+--------+--------+
 *
 * Last 'group' needs to RED2||RED1 shifted less
 */
TEXT p256MulInternal<>(SB), NOSPLIT, $0-16
	// CPOOL loaded from caller
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20
	MOVD $96, R21
	MOVD $112, R22

	// ---------------------------------------------------

	VSPLTW $3, Y0, YDIG // VREPF Y0 is input

	//	VMLHF X0, YDIG, ADD1H
	//	VMLHF X1, YDIG, ADD2H
	//	VMLF  X0, YDIG, ADD1
	//	VMLF  X1, YDIG, ADD2
	//
	VMULT(X0, YDIG, ADD1, ADD1H)
	VMULT(X1, YDIG, ADD2, ADD2H)

	VSPLTISW $1, ONE
	VSPLTW   $2, Y0, YDIG // VREPF

	//	VMALF  X0, YDIG, ADD1H, ADD3
	//	VMALF  X1, YDIG, ADD2H, ADD4
	//	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
	//	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	LXVD2X   (R17)(CPOOL), SEL1
	VSPLTISB $0, ZER               // VZERO ZER
	VPERM    ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free // VSLDB
	VSLDOI $12, ZER, ADD2, T1  // ADD2 Free // VSLDB

	VADDCUQ  T0, ADD3, CAR1     // VACCQ
	VADDUQM  T0, ADD3, T0       // ADD3 Free // VAQ
	VADDECUQ T1, ADD4, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1 // ADD4 Free // VACQ

	LXVD2X  (R18)(CPOOL), SEL2
	LXVD2X  (R19)(CPOOL), SEL3
	LXVD2X  (R20)(CPOOL), SEL4
	VPERM   RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
	VPERM   RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM   RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
	VSUBUQM RED2, RED3, RED2     // Guaranteed not to underflow -->? // VSQ

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ  T0, ADD3H, CAR1     // VACCQ
	VADDUQM  T0, ADD3H, T0       // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	// ---------------------------------------------------

	VSPLTW $1, Y0, YDIG // VREPF

	//	VMALHF X0, YDIG, T0, ADD1H
	//	VMALHF X1, YDIG, T1, ADD2H
	//	VMALF  X0, YDIG, T0, ADD1  // T0 Free->ADD1
	//	VMALF  X1, YDIG, T1, ADD2  // T1 Free->ADD2
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	VSPLTW $0, Y0, YDIG // VREPF

	//	VMALF  X0, YDIG, ADD1H, ADD3
	//	VMALF  X1, YDIG, ADD2H, ADD4
	//	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
	//	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	VSPLTISB $0, ZER               // VZERO ZER
	LXVD2X   (R17)(CPOOL), SEL1
	VPERM    ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free->T0 // VSLDB
	VSLDOI $12, T2, ADD2, T1   // ADD2 Free->T1, T2 Free // VSLDB

	VADDCUQ  T0, RED1, CAR1     // VACCQ
	VADDUQM  T0, RED1, T0       // VAQ
	VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1 // VACQ

	VADDCUQ  T0, ADD3, CAR1       // VACCQ
	VADDUQM  T0, ADD3, T0         // VAQ
	VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1   // VACQ
	VADDUQM  T2, CAR2, T2         // VAQ

	LXVD2X  (R18)(CPOOL), SEL2
	LXVD2X  (R19)(CPOOL), SEL3
	LXVD2X  (R20)(CPOOL), SEL4
	VPERM   RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
	VPERM   RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM   RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
	VSUBUQM RED2, RED3, RED2     // Guaranteed not to underflow // VSQ

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ  T0, ADD3H, CAR1     // VACCQ
	VADDUQM  T0, ADD3H, T0       // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	// ---------------------------------------------------

	VSPLTW $3, Y1, YDIG // VREPF

	//	VMALHF X0, YDIG, T0, ADD1H
	//	VMALHF X1, YDIG, T1, ADD2H
	//	VMALF  X0, YDIG, T0, ADD1
	//	VMALF  X1, YDIG, T1, ADD2
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	VSPLTW $2, Y1, YDIG // VREPF

	//	VMALF  X0, YDIG, ADD1H, ADD3
	//	VMALF  X1, YDIG, ADD2H, ADD4
	//	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
	//	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	// NOTE(review): SEL1 is loaded twice here in the original; the first load
	// is redundant but harmless. Kept for fidelity.
	LXVD2X   (R17)(CPOOL), SEL1
	VSPLTISB $0, ZER               // VZERO ZER
	LXVD2X   (R17)(CPOOL), SEL1
	VPERM    ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free // VSLDB
	VSLDOI $12, T2, ADD2, T1   // ADD2 Free // VSLDB

	VADDCUQ  T0, RED1, CAR1     // VACCQ
	VADDUQM  T0, RED1, T0       // VAQ
	VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1 // VACQ

	VADDCUQ  T0, ADD3, CAR1       // VACCQ
	VADDUQM  T0, ADD3, T0         // VAQ
	VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1   // VACQ
	VADDUQM  T2, CAR2, T2         // VAQ

	LXVD2X  (R18)(CPOOL), SEL2
	LXVD2X  (R19)(CPOOL), SEL3
	LXVD2X  (R20)(CPOOL), SEL4
	VPERM   RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
	VPERM   RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM   RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
	VSUBUQM RED2, RED3, RED2     // Guaranteed not to underflow // VSQ

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ  T0, ADD3H, CAR1     // VACCQ
	VADDUQM  T0, ADD3H, T0       // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	// ---------------------------------------------------

	VSPLTW $1, Y1, YDIG // VREPF

	//	VMALHF X0, YDIG, T0, ADD1H
	//	VMALHF X1, YDIG, T1, ADD2H
	//	VMALF  X0, YDIG, T0, ADD1
	//	VMALF  X1, YDIG, T1, ADD2
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	VSPLTW $0, Y1, YDIG // VREPF

	//	VMALF  X0, YDIG, ADD1H, ADD3
	//	VMALF  X1, YDIG, ADD2H, ADD4
	//	VMALHF X0, YDIG, ADD1H, ADD3H
	//	VMALHF X1, YDIG, ADD2H, ADD4H
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	VSPLTISB $0, ZER               // VZERO ZER
	LXVD2X   (R17)(CPOOL), SEL1
	VPERM    ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // VSLDB
	VSLDOI $12, T2, ADD2, T1   // VSLDB

	VADDCUQ  T0, RED1, CAR1     // VACCQ
	VADDUQM  T0, RED1, T0       // VAQ
	VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1 // VACQ

	VADDCUQ  T0, ADD3, CAR1       // VACCQ
	VADDUQM  T0, ADD3, T0         // VAQ
	VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1   // VACQ
	VADDUQM  T2, CAR2, T2         // VAQ

	LXVD2X  (R21)(CPOOL), SEL5
	LXVD2X  (R22)(CPOOL), SEL6
	VPERM   T0, RED3, SEL5, RED2 // [d1 d0 d1 d0]
	VPERM   T0, RED3, SEL6, RED1 // [ 0 d1 d0  0]
	VSUBUQM RED2, RED1, RED2     // Guaranteed not to underflow // VSQ

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ  T0, ADD3H, CAR1     // VACCQ
	VADDUQM  T0, ADD3H, T0       // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	VADDCUQ  T0, RED1, CAR1       // VACCQ
	VADDUQM  T0, RED1, T0         // VAQ
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1   // VACQ
	VADDUQM  T2, CAR2, T2         // VAQ

	// ---------------------------------------------------

	VSPLTISB $0, RED3            // VZERO   RED3
	VSUBCUQ  T0, P0, CAR1        // VSCBIQ
	VSUBUQM  T0, P0, ADD1H       // VSQ
	VSUBECUQ T1, P1, CAR1, CAR2  // VSBCBIQ
	VSUBEUQM T1, P1, CAR1, ADD2H // VSBIQ
	VSUBEUQM T2, RED3, CAR2, T2  // VSBIQ

	// what output to use, ADD2H||ADD1H or T1||T0?
	VSEL ADD1H, T0, T2, T0
	VSEL ADD2H, T1, T2, T1
	RET

#undef CPOOL

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef P0
#undef P1

#undef SEL1
#undef SEL2
#undef SEL3
#undef SEL4
#undef SEL5
#undef SEL6

#undef YDIG
#undef ADD1H
#undef ADD2H
#undef ADD3
#undef ADD4
#undef RED1
#undef RED2
#undef RED3
#undef T2
#undef ADD1
#undef ADD2
#undef ADD3H
#undef ADD4H
#undef ZER
#undef CAR1
#undef CAR2

#undef TMP1
#undef TMP2

#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
	VSPLTISB $0, ZER            \ // VZERO
	VSUBCUQ  X0, Y0, CAR1       \
	VSUBUQM  X0, Y0, T0         \
	VSUBECUQ X1, Y1, CAR1, SEL1 \
	VSUBEUQM X1, Y1, CAR1, T1   \
	VSUBUQM  ZER, SEL1, SEL1    \ // VSQ
	                            \
	VADDCUQ  T0, PL, CAR1       \ // VACCQ
	VADDUQM  T0, PL, TT0        \ // VAQ
	VADDEUQM T1, PH, CAR1, TT1  \ // VACQ
	                            \
	VSEL TT0, T0, SEL1, T0      \
	VSEL TT1, T1, SEL1, T1      \

#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
	VADDCUQ  X0, Y0, CAR1        \
	VADDUQM  X0, Y0, T0          \
	VADDECUQ X1, Y1, CAR1, T2    \ // VACCCQ
	VADDEUQM X1, Y1, CAR1, T1    \
	                             \
	VSPLTISB $0, ZER             \
	VSUBCUQ  T0, PL, CAR1        \ // VSCBIQ
	VSUBUQM  T0, PL, TT0         \
	VSUBECUQ T1, PH, CAR1, CAR2  \ // VSBCBIQ
	VSUBEUQM T1, PH, CAR1, TT1   \ // VSBIQ
	VSUBEUQM T2, ZER, CAR2, SEL1 \
	                             \
	VSEL TT0, T0, SEL1, T0       \
	VSEL TT1, T1, SEL1, T1

#define p256HalfInternal(T1, T0, X1, X0) \
	VSPLTISB $0, ZER            \
	VSUBEUQM ZER, ZER, X0, SEL1 \
	                            \
	VADDCUQ  X0, PL, CAR1       \
	VADDUQM  X0, PL, T0         \
	VADDECUQ X1, PH, CAR1, T2   \
	VADDEUQM X1, PH, CAR1, T1   \
	                            \
	VSEL T0, X0, SEL1, T0       \
	VSEL T1, X1, SEL1, T1       \
	VSEL T2, ZER, SEL1, T2      \
	                            \
	VSLDOI   $15, T2, ZER, TT1  \
	VSLDOI   $15, T1, ZER, TT0  \
	VSPLTISB $1, SEL1           \
	VSR      T0, SEL1, T0       \ // VSRL
	VSR      T1, SEL1, T1       \
	VSPLTISB $7, SEL1           \ // VREPIB
	VSL      TT0, SEL1, TT0     \
	VSL      TT1, SEL1, TT1     \
	VOR T0, TT0, T0             \
	VOR T1, TT1, T1

#define res_ptr R3
#define x_ptr R4
#define y_ptr R5
#define CPOOL R7
#define TEMP R8
#define N R9

// Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

// Constants
#define P0 V30
#define P1 V31

// func p256MulAsm(res, in1, in2 *p256Element)
TEXT ·p256Mul(SB), NOSPLIT, $0-24
	MOVD res+0(FP), res_ptr
	MOVD in1+8(FP), x_ptr
	MOVD in2+16(FP), y_ptr
	MOVD $16, R16
	MOVD $32, R17

	MOVD $p256mul<>+0x00(SB), CPOOL

	LXVD2X (R0)(x_ptr), X0
	LXVD2X (R16)(x_ptr), X1

	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

	LXVD2X (R0)(y_ptr), Y0
	LXVD2X (R16)(y_ptr), Y1

	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1

	LXVD2X (R16)(CPOOL), P1
	LXVD2X (R0)(CPOOL), P0

	CALL p256MulInternal<>(SB)

	MOVD $p256mul<>+0x00(SB), CPOOL

	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1
	STXVD2X  T0, (R0)(res_ptr)
	STXVD2X  T1, (R16)(res_ptr)
	RET

// func p256Sqr(res, in *p256Element, n int)
TEXT ·p256Sqr(SB), NOSPLIT, $0-24
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr
	MOVD $16, R16
	MOVD $32, R17

	MOVD $p256mul<>+0x00(SB), CPOOL

	LXVD2X (R0)(x_ptr), X0
	LXVD2X (R16)(x_ptr), X1

	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

sqrLoop:
	// Sqr uses same value for both

	VOR X0, X0, Y0
	VOR X1, X1, Y1

	LXVD2X (R16)(CPOOL), P1
	LXVD2X (R0)(CPOOL), P0

	CALL p256MulInternal<>(SB)

	MOVD n+16(FP), N
	ADD  $-1, N
	CMP  $0, N
	BEQ  done
	MOVD N, n+16(FP) // Save counter to avoid clobber
	VOR  T0, T0, X0
	VOR  T1, T1, X1
	BR   sqrLoop

done:
	MOVD $p256mul<>+0x00(SB), CPOOL

	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1
	STXVD2X  T0, (R0)(res_ptr)
	STXVD2X  T1, (R16)(res_ptr)
	RET

#undef res_ptr
#undef x_ptr
#undef y_ptr
#undef CPOOL

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef P0
#undef P1

#define P3ptr R3
#define P1ptr R4
#define P2ptr R5
#define CPOOL R7

// Temporaries in REGs
#define Y2L V15
#define Y2H V16
#define T1L V17
#define T1H V18
#define T2L V19
#define T2H V20
#define T3L V21
#define T3H V22
#define T4L V23
#define T4H V24

// Temps for Sub and Add
#define TT0 V11
#define TT1 V12
#define T2  V13

// p256MulAsm Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

#define PL V30
#define PH V31

// Names for zero/sel selects
#define X1L V0
#define X1H V1
#define Y1L V2 // p256MulAsmParmY
#define Y1H V3 // p256MulAsmParmY
#define Z1L V4
#define Z1H V5
#define X2L V0
#define X2H V1
#define Z2L V4
#define Z2H V5
#define X3L V17 // T1L
#define X3H V18 // T1H
#define Y3L V21 // T3L
#define Y3H V22 // T3H
#define Z3L V25
#define Z3H V26

#define ZER  V6
#define SEL1 V7
#define CAR1 V8
#define CAR2 V9
// NOTE(review): the file is truncated here in this view; the comment below
// (and the point-addition routine it documents) continues past this chunk.
/* *
 * Three operand formula:
 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
1293 * T1 = Z1² 1294 * T2 = T1*Z1 1295 * T1 = T1*X2 1296 * T2 = T2*Y2 1297 * T1 = T1-X1 1298 * T2 = T2-Y1 1299 * Z3 = Z1*T1 1300 * T3 = T1² 1301 * T4 = T3*T1 1302 * T3 = T3*X1 1303 * T1 = 2*T3 1304 * X3 = T2² 1305 * X3 = X3-T1 1306 * X3 = X3-T4 1307 * T3 = T3-X3 1308 * T3 = T3*T2 1309 * T4 = T4*Y1 1310 * Y3 = T3-T4 1311 1312 * Three operand formulas, but with MulInternal X,Y used to store temps 1313 X=Z1; Y=Z1; MUL;T- // T1 = Z1² T1 1314 X=T ; Y- ; MUL;T2=T // T2 = T1*Z1 T1 T2 1315 X- ; Y=X2; MUL;T1=T // T1 = T1*X2 T1 T2 1316 X=T2; Y=Y2; MUL;T- // T2 = T2*Y2 T1 T2 1317 SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2 1318 SUB(Y<T1-X1) // T1 = T1-X1 T1 T2 1319 X=Z1; Y- ; MUL;Z3:=T// Z3 = Z1*T1 T2 1320 X=Y; Y- ; MUL;X=T // T3 = T1*T1 T2 1321 X- ; Y- ; MUL;T4=T // T4 = T3*T1 T2 T4 1322 X- ; Y=X1; MUL;T3=T // T3 = T3*X1 T2 T3 T4 1323 ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4 1324 X=T2; Y=T2; MUL;T- // X3 = T2*T2 T1 T2 T3 T4 1325 SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4 1326 SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4 1327 SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4 1328 X- ; Y- ; MUL;T3=T // T3 = T3*T2 T2 T3 T4 1329 X=T4; Y=Y1; MUL;T- // T4 = T4*Y1 T3 T4 1330 SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4 1331 1332 */ 1333 // 1334 // V27 is clobbered by p256MulInternal so must be 1335 // saved in a temp. 
1336 // 1337 // func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int) 1338 TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $16-48 1339 MOVD res+0(FP), P3ptr 1340 MOVD in1+8(FP), P1ptr 1341 MOVD in2+16(FP), P2ptr 1342 1343 MOVD $p256mul<>+0x00(SB), CPOOL 1344 1345 MOVD $16, R16 1346 MOVD $32, R17 1347 MOVD $48, R18 1348 MOVD $64, R19 1349 MOVD $80, R20 1350 MOVD $96, R21 1351 MOVD $112, R22 1352 MOVD $128, R23 1353 MOVD $144, R24 1354 MOVD $160, R25 1355 MOVD $104, R26 // offset of sign+24(FP) 1356 1357 LXVD2X (R16)(CPOOL), PH 1358 LXVD2X (R0)(CPOOL), PL 1359 1360 LXVD2X (R17)(P2ptr), Y2L 1361 LXVD2X (R18)(P2ptr), Y2H 1362 XXPERMDI Y2H, Y2H, $2, Y2H 1363 XXPERMDI Y2L, Y2L, $2, Y2L 1364 1365 // Equivalent of VLREPG sign+24(FP), SEL1 1366 LXVDSX (R1)(R26), SEL1 1367 VSPLTISB $0, ZER 1368 VCMPEQUD SEL1, ZER, SEL1 1369 1370 VSUBCUQ PL, Y2L, CAR1 1371 VSUBUQM PL, Y2L, T1L 1372 VSUBEUQM PH, Y2H, CAR1, T1H 1373 1374 VSEL T1L, Y2L, SEL1, Y2L 1375 VSEL T1H, Y2H, SEL1, Y2H 1376 1377 /* * 1378 * Three operand formula: 1379 * Source: 2004 Hankerson–Menezes–Vanstone, page 91. 
1380 */ 1381 // X=Z1; Y=Z1; MUL; T- // T1 = Z1² T1 1382 LXVD2X (R19)(P1ptr), X0 // Z1H 1383 LXVD2X (R20)(P1ptr), X1 // Z1L 1384 XXPERMDI X0, X0, $2, X0 1385 XXPERMDI X1, X1, $2, X1 1386 VOR X0, X0, Y0 1387 VOR X1, X1, Y1 1388 CALL p256MulInternal<>(SB) 1389 1390 // X=T ; Y- ; MUL; T2=T // T2 = T1*Z1 T1 T2 1391 VOR T0, T0, X0 1392 VOR T1, T1, X1 1393 CALL p256MulInternal<>(SB) 1394 VOR T0, T0, T2L 1395 VOR T1, T1, T2H 1396 1397 // X- ; Y=X2; MUL; T1=T // T1 = T1*X2 T1 T2 1398 MOVD in2+16(FP), P2ptr 1399 LXVD2X (R0)(P2ptr), Y0 // X2H 1400 LXVD2X (R16)(P2ptr), Y1 // X2L 1401 XXPERMDI Y0, Y0, $2, Y0 1402 XXPERMDI Y1, Y1, $2, Y1 1403 CALL p256MulInternal<>(SB) 1404 VOR T0, T0, T1L 1405 VOR T1, T1, T1H 1406 1407 // X=T2; Y=Y2; MUL; T- // T2 = T2*Y2 T1 T2 1408 VOR T2L, T2L, X0 1409 VOR T2H, T2H, X1 1410 VOR Y2L, Y2L, Y0 1411 VOR Y2H, Y2H, Y1 1412 CALL p256MulInternal<>(SB) 1413 1414 // SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2 1415 MOVD in1+8(FP), P1ptr 1416 LXVD2X (R17)(P1ptr), Y1L 1417 LXVD2X (R18)(P1ptr), Y1H 1418 XXPERMDI Y1H, Y1H, $2, Y1H 1419 XXPERMDI Y1L, Y1L, $2, Y1L 1420 p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L) 1421 1422 // SUB(Y<T1-X1) // T1 = T1-X1 T1 T2 1423 LXVD2X (R0)(P1ptr), X1L 1424 LXVD2X (R16)(P1ptr), X1H 1425 XXPERMDI X1H, X1H, $2, X1H 1426 XXPERMDI X1L, X1L, $2, X1L 1427 p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L) 1428 1429 // X=Z1; Y- ; MUL; Z3:=T// Z3 = Z1*T1 T2 1430 LXVD2X (R19)(P1ptr), X0 // Z1H 1431 LXVD2X (R20)(P1ptr), X1 // Z1L 1432 XXPERMDI X0, X0, $2, X0 1433 XXPERMDI X1, X1, $2, X1 1434 CALL p256MulInternal<>(SB) 1435 1436 VOR T0, T0, Z3L 1437 VOR T1, T1, Z3H 1438 1439 // X=Y; Y- ; MUL; X=T // T3 = T1*T1 T2 1440 VOR Y0, Y0, X0 1441 VOR Y1, Y1, X1 1442 CALL p256MulInternal<>(SB) 1443 VOR T0, T0, X0 1444 VOR T1, T1, X1 1445 1446 // X- ; Y- ; MUL; T4=T // T4 = T3*T1 T2 T4 1447 CALL p256MulInternal<>(SB) 1448 VOR T0, T0, T4L 1449 VOR T1, T1, T4H 1450 1451 // X- ; Y=X1; MUL; T3=T // T3 = T3*X1 T2 T3 T4 1452 MOVD in1+8(FP), P1ptr 1453 LXVD2X (R0)(P1ptr), Y0 // X1H 
1454 LXVD2X (R16)(P1ptr), Y1 // X1L 1455 XXPERMDI Y1, Y1, $2, Y1 1456 XXPERMDI Y0, Y0, $2, Y0 1457 CALL p256MulInternal<>(SB) 1458 VOR T0, T0, T3L 1459 VOR T1, T1, T3H 1460 1461 // ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4 1462 p256AddInternal(T1H,T1L, T1,T0,T1,T0) 1463 1464 // X=T2; Y=T2; MUL; T- // X3 = T2*T2 T1 T2 T3 T4 1465 VOR T2L, T2L, X0 1466 VOR T2H, T2H, X1 1467 VOR T2L, T2L, Y0 1468 VOR T2H, T2H, Y1 1469 CALL p256MulInternal<>(SB) 1470 1471 // SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4 (T1 = X3) 1472 p256SubInternal(T1,T0,T1,T0,T1H,T1L) 1473 1474 // SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4 1475 p256SubInternal(T1,T0,T1,T0,T4H,T4L) 1476 VOR T0, T0, X3L 1477 VOR T1, T1, X3H 1478 1479 // SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4 1480 p256SubInternal(X1,X0,T3H,T3L,T1,T0) 1481 1482 // X- ; Y- ; MUL; T3=T // T3 = T3*T2 T2 T3 T4 1483 CALL p256MulInternal<>(SB) 1484 VOR T0, T0, T3L 1485 VOR T1, T1, T3H 1486 1487 // X=T4; Y=Y1; MUL; T- // T4 = T4*Y1 T3 T4 1488 VOR T4L, T4L, X0 1489 VOR T4H, T4H, X1 1490 MOVD in1+8(FP), P1ptr 1491 LXVD2X (R17)(P1ptr), Y0 // Y1H 1492 LXVD2X (R18)(P1ptr), Y1 // Y1L 1493 XXPERMDI Y0, Y0, $2, Y0 1494 XXPERMDI Y1, Y1, $2, Y1 1495 CALL p256MulInternal<>(SB) 1496 1497 // SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4 (T3 = Y3) 1498 p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0) 1499 1500 // if (sel == 0) { 1501 // copy(P3.x[:], X1) 1502 // copy(P3.y[:], Y1) 1503 // copy(P3.z[:], Z1) 1504 // } 1505 1506 LXVD2X (R0)(P1ptr), X1L 1507 LXVD2X (R16)(P1ptr), X1H 1508 XXPERMDI X1H, X1H, $2, X1H 1509 XXPERMDI X1L, X1L, $2, X1L 1510 1511 // Y1 already loaded, left over from addition 1512 LXVD2X (R19)(P1ptr), Z1L 1513 LXVD2X (R20)(P1ptr), Z1H 1514 XXPERMDI Z1H, Z1H, $2, Z1H 1515 XXPERMDI Z1L, Z1L, $2, Z1L 1516 1517 MOVD $112, R26 // Get offset to sel+32 1518 LXVDSX (R1)(R26), SEL1 1519 VSPLTISB $0, ZER 1520 VCMPEQUD SEL1, ZER, SEL1 1521 1522 VSEL X3L, X1L, SEL1, X3L 1523 VSEL X3H, X1H, SEL1, X3H 1524 VSEL Y3L, Y1L, SEL1, Y3L 1525 VSEL Y3H, Y1H, SEL1, Y3H 1526 VSEL Z3L, Z1L, SEL1, 
Z3L 1527 VSEL Z3H, Z1H, SEL1, Z3H 1528 1529 MOVD in2+16(FP), P2ptr 1530 LXVD2X (R0)(P2ptr), X2L 1531 LXVD2X (R16)(P2ptr), X2H 1532 XXPERMDI X2H, X2H, $2, X2H 1533 XXPERMDI X2L, X2L, $2, X2L 1534 1535 // Y2 already loaded 1536 LXVD2X (R23)(CPOOL), Z2L 1537 LXVD2X (R24)(CPOOL), Z2H 1538 1539 MOVD $120, R26 // Get the value from zero+40(FP) 1540 LXVDSX (R1)(R26), SEL1 1541 VSPLTISB $0, ZER 1542 VCMPEQUD SEL1, ZER, SEL1 1543 1544 VSEL X3L, X2L, SEL1, X3L 1545 VSEL X3H, X2H, SEL1, X3H 1546 VSEL Y3L, Y2L, SEL1, Y3L 1547 VSEL Y3H, Y2H, SEL1, Y3H 1548 VSEL Z3L, Z2L, SEL1, Z3L 1549 VSEL Z3H, Z2H, SEL1, Z3H 1550 1551 // Reorder the bytes so they can be stored using STXVD2X. 1552 MOVD res+0(FP), P3ptr 1553 XXPERMDI X3H, X3H, $2, X3H 1554 XXPERMDI X3L, X3L, $2, X3L 1555 XXPERMDI Y3H, Y3H, $2, Y3H 1556 XXPERMDI Y3L, Y3L, $2, Y3L 1557 XXPERMDI Z3H, Z3H, $2, Z3H 1558 XXPERMDI Z3L, Z3L, $2, Z3L 1559 STXVD2X X3L, (R0)(P3ptr) 1560 STXVD2X X3H, (R16)(P3ptr) 1561 STXVD2X Y3L, (R17)(P3ptr) 1562 STXVD2X Y3H, (R18)(P3ptr) 1563 STXVD2X Z3L, (R19)(P3ptr) 1564 STXVD2X Z3H, (R20)(P3ptr) 1565 1566 RET 1567 1568 #undef P3ptr 1569 #undef P1ptr 1570 #undef P2ptr 1571 #undef CPOOL 1572 1573 #undef Y2L 1574 #undef Y2H 1575 #undef T1L 1576 #undef T1H 1577 #undef T2L 1578 #undef T2H 1579 #undef T3L 1580 #undef T3H 1581 #undef T4L 1582 #undef T4H 1583 1584 #undef TT0 1585 #undef TT1 1586 #undef T2 1587 1588 #undef X0 1589 #undef X1 1590 #undef Y0 1591 #undef Y1 1592 #undef T0 1593 #undef T1 1594 1595 #undef PL 1596 #undef PH 1597 1598 #undef X1L 1599 #undef X1H 1600 #undef Y1L 1601 #undef Y1H 1602 #undef Z1L 1603 #undef Z1H 1604 #undef X2L 1605 #undef X2H 1606 #undef Z2L 1607 #undef Z2H 1608 #undef X3L 1609 #undef X3H 1610 #undef Y3L 1611 #undef Y3H 1612 #undef Z3L 1613 #undef Z3H 1614 1615 #undef ZER 1616 #undef SEL1 1617 #undef CAR1 1618 #undef CAR2 1619 1620 // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl 1621 // 
http://www.hyperelliptic.org/EFD/g1p/auto-shortw.html 1622 // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html 1623 #define P3ptr R3 1624 #define P1ptr R4 1625 #define CPOOL R7 1626 1627 // Temporaries in REGs 1628 #define X3L V15 1629 #define X3H V16 1630 #define Y3L V17 1631 #define Y3H V18 1632 #define T1L V19 1633 #define T1H V20 1634 #define T2L V21 1635 #define T2H V22 1636 #define T3L V23 1637 #define T3H V24 1638 1639 #define X1L V6 1640 #define X1H V7 1641 #define Y1L V8 1642 #define Y1H V9 1643 #define Z1L V10 1644 #define Z1H V11 1645 1646 // Temps for Sub and Add 1647 #define TT0 V11 1648 #define TT1 V12 1649 #define T2 V13 1650 1651 // p256MulAsm Parameters 1652 #define X0 V0 1653 #define X1 V1 1654 #define Y0 V2 1655 #define Y1 V3 1656 #define T0 V4 1657 #define T1 V5 1658 1659 #define PL V30 1660 #define PH V31 1661 1662 #define Z3L V23 1663 #define Z3H V24 1664 1665 #define ZER V26 1666 #define SEL1 V27 1667 #define CAR1 V28 1668 #define CAR2 V29 1669 /* 1670 * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv 1671 * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3. 1672 * Source: 2004 Hankerson–Menezes–Vanstone, page 91. 
1673 * A = 3(X₁-Z₁²)×(X₁+Z₁²) 1674 * B = 2Y₁ 1675 * Z₃ = B×Z₁ 1676 * C = B² 1677 * D = C×X₁ 1678 * X₃ = A²-2D 1679 * Y₃ = (D-X₃)×A-C²/2 1680 * 1681 * Three-operand formula: 1682 * T1 = Z1² 1683 * T2 = X1-T1 1684 * T1 = X1+T1 1685 * T2 = T2*T1 1686 * T2 = 3*T2 1687 * Y3 = 2*Y1 1688 * Z3 = Y3*Z1 1689 * Y3 = Y3² 1690 * T3 = Y3*X1 1691 * Y3 = Y3² 1692 * Y3 = half*Y3 1693 * X3 = T2² 1694 * T1 = 2*T3 1695 * X3 = X3-T1 1696 * T1 = T3-X3 1697 * T1 = T1*T2 1698 * Y3 = T1-Y3 1699 */ 1700 // p256PointDoubleAsm(res, in1 *p256Point) 1701 TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0-16 1702 MOVD res+0(FP), P3ptr 1703 MOVD in+8(FP), P1ptr 1704 1705 MOVD $p256mul<>+0x00(SB), CPOOL 1706 1707 MOVD $16, R16 1708 MOVD $32, R17 1709 MOVD $48, R18 1710 MOVD $64, R19 1711 MOVD $80, R20 1712 1713 LXVD2X (R16)(CPOOL), PH 1714 LXVD2X (R0)(CPOOL), PL 1715 1716 // X=Z1; Y=Z1; MUL; T- // T1 = Z1² 1717 LXVD2X (R19)(P1ptr), X0 // Z1H 1718 LXVD2X (R20)(P1ptr), X1 // Z1L 1719 1720 XXPERMDI X0, X0, $2, X0 1721 XXPERMDI X1, X1, $2, X1 1722 1723 VOR X0, X0, Y0 1724 VOR X1, X1, Y1 1725 CALL p256MulInternal<>(SB) 1726 1727 // SUB(X<X1-T) // T2 = X1-T1 1728 LXVD2X (R0)(P1ptr), X1L 1729 LXVD2X (R16)(P1ptr), X1H 1730 XXPERMDI X1L, X1L, $2, X1L 1731 XXPERMDI X1H, X1H, $2, X1H 1732 1733 p256SubInternal(X1,X0,X1H,X1L,T1,T0) 1734 1735 // ADD(Y<X1+T) // T1 = X1+T1 1736 p256AddInternal(Y1,Y0,X1H,X1L,T1,T0) 1737 1738 // X- ; Y- ; MUL; T- // T2 = T2*T1 1739 CALL p256MulInternal<>(SB) 1740 1741 // ADD(T2<T+T); ADD(T2<T2+T) // T2 = 3*T2 1742 p256AddInternal(T2H,T2L,T1,T0,T1,T0) 1743 p256AddInternal(T2H,T2L,T2H,T2L,T1,T0) 1744 1745 // ADD(X<Y1+Y1) // Y3 = 2*Y1 1746 LXVD2X (R17)(P1ptr), Y1L 1747 LXVD2X (R18)(P1ptr), Y1H 1748 XXPERMDI Y1L, Y1L, $2, Y1L 1749 XXPERMDI Y1H, Y1H, $2, Y1H 1750 1751 p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L) 1752 1753 // X- ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1 1754 LXVD2X (R19)(P1ptr), Y0 1755 LXVD2X (R20)(P1ptr), Y1 1756 XXPERMDI Y0, Y0, $2, Y0 1757 XXPERMDI Y1, Y1, $2, Y1 1758 1759 CALL 
p256MulInternal<>(SB) 1760 1761 // Leave T0, T1 as is. 1762 XXPERMDI T0, T0, $2, TT0 1763 XXPERMDI T1, T1, $2, TT1 1764 STXVD2X TT0, (R19)(P3ptr) 1765 STXVD2X TT1, (R20)(P3ptr) 1766 1767 // X- ; Y=X ; MUL; T- // Y3 = Y3² 1768 VOR X0, X0, Y0 1769 VOR X1, X1, Y1 1770 CALL p256MulInternal<>(SB) 1771 1772 // X=T ; Y=X1; MUL; T3=T // T3 = Y3*X1 1773 VOR T0, T0, X0 1774 VOR T1, T1, X1 1775 LXVD2X (R0)(P1ptr), Y0 1776 LXVD2X (R16)(P1ptr), Y1 1777 XXPERMDI Y0, Y0, $2, Y0 1778 XXPERMDI Y1, Y1, $2, Y1 1779 CALL p256MulInternal<>(SB) 1780 VOR T0, T0, T3L 1781 VOR T1, T1, T3H 1782 1783 // X- ; Y=X ; MUL; T- // Y3 = Y3² 1784 VOR X0, X0, Y0 1785 VOR X1, X1, Y1 1786 CALL p256MulInternal<>(SB) 1787 1788 // HAL(Y3<T) // Y3 = half*Y3 1789 p256HalfInternal(Y3H,Y3L, T1,T0) 1790 1791 // X=T2; Y=T2; MUL; T- // X3 = T2² 1792 VOR T2L, T2L, X0 1793 VOR T2H, T2H, X1 1794 VOR T2L, T2L, Y0 1795 VOR T2H, T2H, Y1 1796 CALL p256MulInternal<>(SB) 1797 1798 // ADD(T1<T3+T3) // T1 = 2*T3 1799 p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L) 1800 1801 // SUB(X3<T-T1) X3:=X3 // X3 = X3-T1 1802 p256SubInternal(X3H,X3L,T1,T0,T1H,T1L) 1803 1804 XXPERMDI X3L, X3L, $2, TT0 1805 XXPERMDI X3H, X3H, $2, TT1 1806 STXVD2X TT0, (R0)(P3ptr) 1807 STXVD2X TT1, (R16)(P3ptr) 1808 1809 // SUB(X<T3-X3) // T1 = T3-X3 1810 p256SubInternal(X1,X0,T3H,T3L,X3H,X3L) 1811 1812 // X- ; Y- ; MUL; T- // T1 = T1*T2 1813 CALL p256MulInternal<>(SB) 1814 1815 // SUB(Y3<T-Y3) // Y3 = T1-Y3 1816 p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L) 1817 1818 XXPERMDI Y3L, Y3L, $2, Y3L 1819 XXPERMDI Y3H, Y3H, $2, Y3H 1820 STXVD2X Y3L, (R17)(P3ptr) 1821 STXVD2X Y3H, (R18)(P3ptr) 1822 RET 1823 1824 #undef P3ptr 1825 #undef P1ptr 1826 #undef CPOOL 1827 #undef X3L 1828 #undef X3H 1829 #undef Y3L 1830 #undef Y3H 1831 #undef T1L 1832 #undef T1H 1833 #undef T2L 1834 #undef T2H 1835 #undef T3L 1836 #undef T3H 1837 #undef X1L 1838 #undef X1H 1839 #undef Y1L 1840 #undef Y1H 1841 #undef Z1L 1842 #undef Z1H 1843 #undef TT0 1844 #undef TT1 1845 #undef T2 1846 #undef X0 
1847 #undef X1 1848 #undef Y0 1849 #undef Y1 1850 #undef T0 1851 #undef T1 1852 #undef PL 1853 #undef PH 1854 #undef Z3L 1855 #undef Z3H 1856 #undef ZER 1857 #undef SEL1 1858 #undef CAR1 1859 #undef CAR2 1860 1861 #define P3ptr R3 1862 #define P1ptr R4 1863 #define P2ptr R5 1864 #define CPOOL R7 1865 #define TRUE R14 1866 #define RES1 R9 1867 #define RES2 R10 1868 1869 // Temporaries in REGs 1870 #define T1L V16 1871 #define T1H V17 1872 #define T2L V18 1873 #define T2H V19 1874 #define U1L V20 1875 #define U1H V21 1876 #define S1L V22 1877 #define S1H V23 1878 #define HL V24 1879 #define HH V25 1880 #define RL V26 1881 #define RH V27 1882 1883 // Temps for Sub and Add 1884 #define ZER V6 1885 #define SEL1 V7 1886 #define CAR1 V8 1887 #define CAR2 V9 1888 #define TT0 V11 1889 #define TT1 V12 1890 #define T2 V13 1891 1892 // p256MulAsm Parameters 1893 #define X0 V0 1894 #define X1 V1 1895 #define Y0 V2 1896 #define Y1 V3 1897 #define T0 V4 1898 #define T1 V5 1899 1900 #define PL V30 1901 #define PH V31 1902 /* 1903 * https://choucroutage.com/Papers/SideChannelAttacks/ctrsa-2011-brown.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields" 1904 * 1905 * A = X₁×Z₂² 1906 * B = Y₁×Z₂³ 1907 * C = X₂×Z₁²-A 1908 * D = Y₂×Z₁³-B 1909 * X₃ = D² - 2A×C² - C³ 1910 * Y₃ = D×(A×C² - X₃) - B×C³ 1911 * Z₃ = Z₁×Z₂×C 1912 * 1913 * Three-operand formula (adopted): http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2 1914 * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R 1915 * 1916 * T1 = Z1*Z1 1917 * T2 = Z2*Z2 1918 * U1 = X1*T2 1919 * H = X2*T1 1920 * H = H-U1 1921 * Z3 = Z1*Z2 1922 * Z3 = Z3*H << store-out Z3 result reg.. 
could override Z1, if slices have same backing array 1923 * 1924 * S1 = Z2*T2 1925 * S1 = Y1*S1 1926 * R = Z1*T1 1927 * R = Y2*R 1928 * R = R-S1 1929 * 1930 * T1 = H*H 1931 * T2 = H*T1 1932 * U1 = U1*T1 1933 * 1934 * X3 = R*R 1935 * X3 = X3-T2 1936 * T1 = 2*U1 1937 * X3 = X3-T1 << store-out X3 result reg 1938 * 1939 * T2 = S1*T2 1940 * Y3 = U1-X3 1941 * Y3 = R*Y3 1942 * Y3 = Y3-T2 << store-out Y3 result reg 1943 1944 // X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1 1945 // X- ; Y=T ; MUL; R=T // R = Z1*T1 1946 // X=X2; Y- ; MUL; H=T // H = X2*T1 1947 // X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2 1948 // X- ; Y=T ; MUL; S1=T // S1 = Z2*T2 1949 // X=X1; Y- ; MUL; U1=T // U1 = X1*T2 1950 // SUB(H<H-T) // H = H-U1 1951 // X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2 1952 // X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array 1953 // X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1 1954 // X=Y2; Y=R ; MUL; T- // R = Y2*R 1955 // SUB(R<T-S1) // R = R-S1 1956 // X=H ; Y=H ; MUL; T- // T1 = H*H 1957 // X- ; Y=T ; MUL; T2=T // T2 = H*T1 1958 // X=U1; Y- ; MUL; U1=T // U1 = U1*T1 1959 // X=R ; Y=R ; MUL; T- // X3 = R*R 1960 // SUB(T<T-T2) // X3 = X3-T2 1961 // ADD(X<U1+U1) // T1 = 2*U1 1962 // SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg 1963 // SUB(Y<U1-T) // Y3 = U1-X3 1964 // X=R ; Y- ; MUL; U1=T // Y3 = R*Y3 1965 // X=S1; Y=T2; MUL; T- // T2 = S1*T2 1966 // SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg 1967 */ 1968 // p256PointAddAsm(res, in1, in2 *p256Point) 1969 TEXT ·p256PointAddAsm(SB), NOSPLIT, $16-32 1970 MOVD res+0(FP), P3ptr 1971 MOVD in1+8(FP), P1ptr 1972 MOVD $p256mul<>+0x00(SB), CPOOL 1973 MOVD $16, R16 1974 MOVD $32, R17 1975 MOVD $48, R18 1976 MOVD $64, R19 1977 MOVD $80, R20 1978 1979 LXVD2X (R16)(CPOOL), PH 1980 LXVD2X (R0)(CPOOL), PL 1981 1982 // X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1 1983 LXVD2X (R19)(P1ptr), X0 // Z1L 1984 LXVD2X (R20)(P1ptr), X1 // Z1H 1985 XXPERMDI X0, X0, $2, X0 1986 XXPERMDI X1, X1, $2, X1 
1987 VOR X0, X0, Y0 1988 VOR X1, X1, Y1 1989 CALL p256MulInternal<>(SB) 1990 1991 // X- ; Y=T ; MUL; R=T // R = Z1*T1 1992 VOR T0, T0, Y0 1993 VOR T1, T1, Y1 1994 CALL p256MulInternal<>(SB) 1995 VOR T0, T0, RL // SAVE: RL 1996 VOR T1, T1, RH // SAVE: RH 1997 1998 STXVD2X RH, (R1)(R17) // V27 has to be saved 1999 2000 // X=X2; Y- ; MUL; H=T // H = X2*T1 2001 MOVD in2+16(FP), P2ptr 2002 LXVD2X (R0)(P2ptr), X0 // X2L 2003 LXVD2X (R16)(P2ptr), X1 // X2H 2004 XXPERMDI X0, X0, $2, X0 2005 XXPERMDI X1, X1, $2, X1 2006 CALL p256MulInternal<>(SB) 2007 VOR T0, T0, HL // SAVE: HL 2008 VOR T1, T1, HH // SAVE: HH 2009 2010 // X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2 2011 MOVD in2+16(FP), P2ptr 2012 LXVD2X (R19)(P2ptr), X0 // Z2L 2013 LXVD2X (R20)(P2ptr), X1 // Z2H 2014 XXPERMDI X0, X0, $2, X0 2015 XXPERMDI X1, X1, $2, X1 2016 VOR X0, X0, Y0 2017 VOR X1, X1, Y1 2018 CALL p256MulInternal<>(SB) 2019 2020 // X- ; Y=T ; MUL; S1=T // S1 = Z2*T2 2021 VOR T0, T0, Y0 2022 VOR T1, T1, Y1 2023 CALL p256MulInternal<>(SB) 2024 VOR T0, T0, S1L // SAVE: S1L 2025 VOR T1, T1, S1H // SAVE: S1H 2026 2027 // X=X1; Y- ; MUL; U1=T // U1 = X1*T2 2028 MOVD in1+8(FP), P1ptr 2029 LXVD2X (R0)(P1ptr), X0 // X1L 2030 LXVD2X (R16)(P1ptr), X1 // X1H 2031 XXPERMDI X0, X0, $2, X0 2032 XXPERMDI X1, X1, $2, X1 2033 CALL p256MulInternal<>(SB) 2034 VOR T0, T0, U1L // SAVE: U1L 2035 VOR T1, T1, U1H // SAVE: U1H 2036 2037 // SUB(H<H-T) // H = H-U1 2038 p256SubInternal(HH,HL,HH,HL,T1,T0) 2039 2040 // if H == 0 or H^P == 0 then ret=1 else ret=0 2041 // clobbers T1H and T1L 2042 MOVD $1, TRUE 2043 VSPLTISB $0, ZER 2044 VOR HL, HH, T1H 2045 VCMPEQUDCC ZER, T1H, T1H 2046 2047 // 26 = CR6 NE 2048 ISEL $26, R0, TRUE, RES1 2049 VXOR HL, PL, T1L // SAVE: T1L 2050 VXOR HH, PH, T1H // SAVE: T1H 2051 VOR T1L, T1H, T1H 2052 VCMPEQUDCC ZER, T1H, T1H 2053 2054 // 26 = CR6 NE 2055 ISEL $26, R0, TRUE, RES2 2056 OR RES2, RES1, RES1 2057 MOVD RES1, ret+24(FP) 2058 2059 // X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2 2060 MOVD in1+8(FP), P1ptr 2061 MOVD 
in2+16(FP), P2ptr 2062 LXVD2X (R19)(P1ptr), X0 // Z1L 2063 LXVD2X (R20)(P1ptr), X1 // Z1H 2064 XXPERMDI X0, X0, $2, X0 2065 XXPERMDI X1, X1, $2, X1 2066 LXVD2X (R19)(P2ptr), Y0 // Z2L 2067 LXVD2X (R20)(P2ptr), Y1 // Z2H 2068 XXPERMDI Y0, Y0, $2, Y0 2069 XXPERMDI Y1, Y1, $2, Y1 2070 CALL p256MulInternal<>(SB) 2071 2072 // X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H 2073 VOR T0, T0, X0 2074 VOR T1, T1, X1 2075 VOR HL, HL, Y0 2076 VOR HH, HH, Y1 2077 CALL p256MulInternal<>(SB) 2078 MOVD res+0(FP), P3ptr 2079 XXPERMDI T1, T1, $2, TT1 2080 XXPERMDI T0, T0, $2, TT0 2081 STXVD2X TT0, (R19)(P3ptr) 2082 STXVD2X TT1, (R20)(P3ptr) 2083 2084 // X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1 2085 MOVD in1+8(FP), P1ptr 2086 LXVD2X (R17)(P1ptr), X0 2087 LXVD2X (R18)(P1ptr), X1 2088 XXPERMDI X0, X0, $2, X0 2089 XXPERMDI X1, X1, $2, X1 2090 VOR S1L, S1L, Y0 2091 VOR S1H, S1H, Y1 2092 CALL p256MulInternal<>(SB) 2093 VOR T0, T0, S1L 2094 VOR T1, T1, S1H 2095 2096 // X=Y2; Y=R ; MUL; T- // R = Y2*R 2097 MOVD in2+16(FP), P2ptr 2098 LXVD2X (R17)(P2ptr), X0 2099 LXVD2X (R18)(P2ptr), X1 2100 XXPERMDI X0, X0, $2, X0 2101 XXPERMDI X1, X1, $2, X1 2102 VOR RL, RL, Y0 2103 2104 // VOR RH, RH, Y1 RH was saved above in D2X format 2105 LXVD2X (R1)(R17), Y1 2106 CALL p256MulInternal<>(SB) 2107 2108 // SUB(R<T-S1) // R = T-S1 2109 p256SubInternal(RH,RL,T1,T0,S1H,S1L) 2110 2111 STXVD2X RH, (R1)(R17) // Save RH 2112 2113 // if R == 0 or R^P == 0 then ret=ret else ret=0 2114 // clobbers T1H and T1L 2115 // Redo this using ISEL?? 
2116 MOVD $1, TRUE 2117 VSPLTISB $0, ZER 2118 VOR RL, RH, T1H 2119 VCMPEQUDCC ZER, T1H, T1H 2120 2121 // 24 = CR6 NE 2122 ISEL $26, R0, TRUE, RES1 2123 VXOR RL, PL, T1L 2124 VXOR RH, PH, T1H // SAVE: T1L 2125 VOR T1L, T1H, T1H 2126 VCMPEQUDCC ZER, T1H, T1H 2127 2128 // 26 = CR6 NE 2129 ISEL $26, R0, TRUE, RES2 2130 OR RES2, RES1, RES1 2131 MOVD ret+24(FP), RES2 2132 AND RES2, RES1, RES1 2133 MOVD RES1, ret+24(FP) 2134 2135 // X=H ; Y=H ; MUL; T- // T1 = H*H 2136 VOR HL, HL, X0 2137 VOR HH, HH, X1 2138 VOR HL, HL, Y0 2139 VOR HH, HH, Y1 2140 CALL p256MulInternal<>(SB) 2141 2142 // X- ; Y=T ; MUL; T2=T // T2 = H*T1 2143 VOR T0, T0, Y0 2144 VOR T1, T1, Y1 2145 CALL p256MulInternal<>(SB) 2146 VOR T0, T0, T2L 2147 VOR T1, T1, T2H 2148 2149 // X=U1; Y- ; MUL; U1=T // U1 = U1*T1 2150 VOR U1L, U1L, X0 2151 VOR U1H, U1H, X1 2152 CALL p256MulInternal<>(SB) 2153 VOR T0, T0, U1L 2154 VOR T1, T1, U1H 2155 2156 // X=R ; Y=R ; MUL; T- // X3 = R*R 2157 VOR RL, RL, X0 2158 2159 // VOR RH, RH, X1 2160 VOR RL, RL, Y0 2161 2162 // RH was saved above using STXVD2X 2163 LXVD2X (R1)(R17), X1 2164 VOR X1, X1, Y1 2165 2166 // VOR RH, RH, Y1 2167 CALL p256MulInternal<>(SB) 2168 2169 // SUB(T<T-T2) // X3 = X3-T2 2170 p256SubInternal(T1,T0,T1,T0,T2H,T2L) 2171 2172 // ADD(X<U1+U1) // T1 = 2*U1 2173 p256AddInternal(X1,X0,U1H,U1L,U1H,U1L) 2174 2175 // SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg 2176 p256SubInternal(T1,T0,T1,T0,X1,X0) 2177 MOVD res+0(FP), P3ptr 2178 XXPERMDI T1, T1, $2, TT1 2179 XXPERMDI T0, T0, $2, TT0 2180 STXVD2X TT0, (R0)(P3ptr) 2181 STXVD2X TT1, (R16)(P3ptr) 2182 2183 // SUB(Y<U1-T) // Y3 = U1-X3 2184 p256SubInternal(Y1,Y0,U1H,U1L,T1,T0) 2185 2186 // X=R ; Y- ; MUL; U1=T // Y3 = R*Y3 2187 VOR RL, RL, X0 2188 2189 // VOR RH, RH, X1 2190 LXVD2X (R1)(R17), X1 2191 CALL p256MulInternal<>(SB) 2192 VOR T0, T0, U1L 2193 VOR T1, T1, U1H 2194 2195 // X=S1; Y=T2; MUL; T- // T2 = S1*T2 2196 VOR S1L, S1L, X0 2197 VOR S1H, S1H, X1 2198 VOR T2L, T2L, Y0 2199 VOR T2H, T2H, Y1 
2200 CALL p256MulInternal<>(SB) 2201 2202 // SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg 2203 p256SubInternal(T1,T0,U1H,U1L,T1,T0) 2204 MOVD res+0(FP), P3ptr 2205 XXPERMDI T1, T1, $2, TT1 2206 XXPERMDI T0, T0, $2, TT0 2207 STXVD2X TT0, (R17)(P3ptr) 2208 STXVD2X TT1, (R18)(P3ptr) 2209 2210 RET