github.com/twelsh-aw/go/src@v0.0.0-20230516233729-a56fe86a7c81/crypto/internal/nistec/p256_asm.go (about) 1 // Copyright 2015 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // This file contains the Go wrapper for the constant-time, 64-bit assembly 6 // implementation of P256. The optimizations performed here are described in 7 // detail in: 8 // S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with 9 // 256-bit primes" 10 // https://link.springer.com/article/10.1007%2Fs13389-014-0090-x 11 // https://eprint.iacr.org/2013/816.pdf 12 13 //go:build amd64 || arm64 || ppc64le || s390x 14 15 package nistec 16 17 import ( 18 _ "embed" 19 "encoding/binary" 20 "errors" 21 "math/bits" 22 "runtime" 23 "unsafe" 24 ) 25 26 // p256Element is a P-256 base field element in [0, P-1] in the Montgomery 27 // domain (with R 2²⁵⁶) as four limbs in little-endian order value. 28 type p256Element [4]uint64 29 30 // p256One is one in the Montgomery domain. 31 var p256One = p256Element{0x0000000000000001, 0xffffffff00000000, 32 0xffffffffffffffff, 0x00000000fffffffe} 33 34 var p256Zero = p256Element{} 35 36 // p256P is 2²⁵⁶ - 2²²⁴ + 2¹⁹² + 2⁹⁶ - 1 in the Montgomery domain. 37 var p256P = p256Element{0xffffffffffffffff, 0x00000000ffffffff, 38 0x0000000000000000, 0xffffffff00000001} 39 40 // P256Point is a P-256 point. The zero value should not be assumed to be valid 41 // (although it is in this implementation). 42 type P256Point struct { 43 // (X:Y:Z) are Jacobian coordinates where x = X/Z² and y = Y/Z³. The point 44 // at infinity can be represented by any set of coordinates with Z = 0. 45 x, y, z p256Element 46 } 47 48 // NewP256Point returns a new P256Point representing the point at infinity. 49 func NewP256Point() *P256Point { 50 return &P256Point{ 51 x: p256One, y: p256One, z: p256Zero, 52 } 53 } 54 55 // SetGenerator sets p to the canonical generator and returns p. 56 func (p *P256Point) SetGenerator() *P256Point { 57 p.x = p256Element{0x79e730d418a9143c, 0x75ba95fc5fedb601, 58 0x79fb732b77622510, 0x18905f76a53755c6} 59 p.y = p256Element{0xddf25357ce95560a, 0x8b4ab8e4ba19e45c, 60 0xd2e88688dd21f325, 0x8571ff1825885d85} 61 p.z = p256One 62 return p 63 } 64 65 // Set sets p = q and returns p. 66 func (p *P256Point) Set(q *P256Point) *P256Point { 67 p.x, p.y, p.z = q.x, q.y, q.z 68 return p 69 } 70 71 const p256ElementLength = 32 72 const p256UncompressedLength = 1 + 2*p256ElementLength 73 const p256CompressedLength = 1 + p256ElementLength 74 75 // SetBytes sets p to the compressed, uncompressed, or infinity value encoded in 76 // b, as specified in SEC 1, Version 2.0, Section 2.3.4. If the point is not on 77 // the curve, it returns nil and an error, and the receiver is unchanged. 78 // Otherwise, it returns p. 79 func (p *P256Point) SetBytes(b []byte) (*P256Point, error) { 80 // p256Mul operates in the Montgomery domain with R = 2²⁵⁶ mod p. Thus rr 81 // here is R in the Montgomery domain, or R×R mod p. See comment in 82 // P256OrdInverse about how this is used. 83 rr := p256Element{0x0000000000000003, 0xfffffffbffffffff, 84 0xfffffffffffffffe, 0x00000004fffffffd} 85 86 switch { 87 // Point at infinity. 88 case len(b) == 1 && b[0] == 0: 89 return p.Set(NewP256Point()), nil 90 91 // Uncompressed form. 92 case len(b) == p256UncompressedLength && b[0] == 4: 93 var r P256Point 94 p256BigToLittle(&r.x, (*[32]byte)(b[1:33])) 95 p256BigToLittle(&r.y, (*[32]byte)(b[33:65])) 96 if p256LessThanP(&r.x) == 0 || p256LessThanP(&r.y) == 0 { 97 return nil, errors.New("invalid P256 element encoding") 98 } 99 p256Mul(&r.x, &r.x, &rr) 100 p256Mul(&r.y, &r.y, &rr) 101 if err := p256CheckOnCurve(&r.x, &r.y); err != nil { 102 return nil, err 103 } 104 r.z = p256One 105 return p.Set(&r), nil 106 107 // Compressed form. 108 case len(b) == p256CompressedLength && (b[0] == 2 || b[0] == 3): 109 var r P256Point 110 p256BigToLittle(&r.x, (*[32]byte)(b[1:33])) 111 if p256LessThanP(&r.x) == 0 { 112 return nil, errors.New("invalid P256 element encoding") 113 } 114 p256Mul(&r.x, &r.x, &rr) 115 116 // y² = x³ - 3x + b 117 p256Polynomial(&r.y, &r.x) 118 if !p256Sqrt(&r.y, &r.y) { 119 return nil, errors.New("invalid P256 compressed point encoding") 120 } 121 122 // Select the positive or negative root, as indicated by the least 123 // significant bit, based on the encoding type byte. 124 yy := new(p256Element) 125 p256FromMont(yy, &r.y) 126 cond := int(yy[0]&1) ^ int(b[0]&1) 127 p256NegCond(&r.y, cond) 128 129 r.z = p256One 130 return p.Set(&r), nil 131 132 default: 133 return nil, errors.New("invalid P256 point encoding") 134 } 135 } 136 137 // p256Polynomial sets y2 to x³ - 3x + b, and returns y2. 138 func p256Polynomial(y2, x *p256Element) *p256Element { 139 x3 := new(p256Element) 140 p256Sqr(x3, x, 1) 141 p256Mul(x3, x3, x) 142 143 threeX := new(p256Element) 144 p256Add(threeX, x, x) 145 p256Add(threeX, threeX, x) 146 p256NegCond(threeX, 1) 147 148 p256B := &p256Element{0xd89cdf6229c4bddf, 0xacf005cd78843090, 149 0xe5a220abf7212ed6, 0xdc30061d04874834} 150 151 p256Add(x3, x3, threeX) 152 p256Add(x3, x3, p256B) 153 154 *y2 = *x3 155 return y2 156 } 157 158 func p256CheckOnCurve(x, y *p256Element) error { 159 // y² = x³ - 3x + b 160 rhs := p256Polynomial(new(p256Element), x) 161 lhs := new(p256Element) 162 p256Sqr(lhs, y, 1) 163 if p256Equal(lhs, rhs) != 1 { 164 return errors.New("P256 point not on curve") 165 } 166 return nil 167 } 168 169 // p256LessThanP returns 1 if x < p, and 0 otherwise. Note that a p256Element is 170 // not allowed to be equal to or greater than p, so if this function returns 0 171 // then x is invalid. 172 func p256LessThanP(x *p256Element) int { 173 var b uint64 174 _, b = bits.Sub64(x[0], p256P[0], b) 175 _, b = bits.Sub64(x[1], p256P[1], b) 176 _, b = bits.Sub64(x[2], p256P[2], b) 177 _, b = bits.Sub64(x[3], p256P[3], b) 178 return int(b) 179 } 180 181 // p256Add sets res = x + y. 182 func p256Add(res, x, y *p256Element) { 183 var c, b uint64 184 t1 := make([]uint64, 4) 185 t1[0], c = bits.Add64(x[0], y[0], 0) 186 t1[1], c = bits.Add64(x[1], y[1], c) 187 t1[2], c = bits.Add64(x[2], y[2], c) 188 t1[3], c = bits.Add64(x[3], y[3], c) 189 t2 := make([]uint64, 4) 190 t2[0], b = bits.Sub64(t1[0], p256P[0], 0) 191 t2[1], b = bits.Sub64(t1[1], p256P[1], b) 192 t2[2], b = bits.Sub64(t1[2], p256P[2], b) 193 t2[3], b = bits.Sub64(t1[3], p256P[3], b) 194 // Three options: 195 // - a+b < p 196 // then c is 0, b is 1, and t1 is correct 197 // - p <= a+b < 2^256 198 // then c is 0, b is 0, and t2 is correct 199 // - 2^256 <= a+b 200 // then c is 1, b is 1, and t2 is correct 201 t2Mask := (c ^ b) - 1 202 res[0] = (t1[0] & ^t2Mask) | (t2[0] & t2Mask) 203 res[1] = (t1[1] & ^t2Mask) | (t2[1] & t2Mask) 204 res[2] = (t1[2] & ^t2Mask) | (t2[2] & t2Mask) 205 res[3] = (t1[3] & ^t2Mask) | (t2[3] & t2Mask) 206 } 207 208 // p256Sqrt sets e to a square root of x. If x is not a square, p256Sqrt returns 209 // false and e is unchanged. e and x can overlap. 210 func p256Sqrt(e, x *p256Element) (isSquare bool) { 211 t0, t1 := new(p256Element), new(p256Element) 212 213 // Since p = 3 mod 4, exponentiation by (p + 1) / 4 yields a square root candidate. 214 // 215 // The sequence of 7 multiplications and 253 squarings is derived from the 216 // following addition chain generated with github.com/mmcloughlin/addchain v0.4.0. 217 // 218 // _10 = 2*1 219 // _11 = 1 + _10 220 // _1100 = _11 << 2 221 // _1111 = _11 + _1100 222 // _11110000 = _1111 << 4 223 // _11111111 = _1111 + _11110000 224 // x16 = _11111111 << 8 + _11111111 225 // x32 = x16 << 16 + x16 226 // return ((x32 << 32 + 1) << 96 + 1) << 94 227 // 228 p256Sqr(t0, x, 1) 229 p256Mul(t0, x, t0) 230 p256Sqr(t1, t0, 2) 231 p256Mul(t0, t0, t1) 232 p256Sqr(t1, t0, 4) 233 p256Mul(t0, t0, t1) 234 p256Sqr(t1, t0, 8) 235 p256Mul(t0, t0, t1) 236 p256Sqr(t1, t0, 16) 237 p256Mul(t0, t0, t1) 238 p256Sqr(t0, t0, 32) 239 p256Mul(t0, x, t0) 240 p256Sqr(t0, t0, 96) 241 p256Mul(t0, x, t0) 242 p256Sqr(t0, t0, 94) 243 244 p256Sqr(t1, t0, 1) 245 if p256Equal(t1, x) != 1 { 246 return false 247 } 248 *e = *t0 249 return true 250 } 251 252 // The following assembly functions are implemented in p256_asm_*.s 253 254 // Montgomery multiplication. Sets res = in1 * in2 * R⁻¹ mod p. 255 // 256 //go:noescape 257 func p256Mul(res, in1, in2 *p256Element) 258 259 // Montgomery square, repeated n times (n >= 1). 260 // 261 //go:noescape 262 func p256Sqr(res, in *p256Element, n int) 263 264 // Montgomery multiplication by R⁻¹, or 1 outside the domain. 265 // Sets res = in * R⁻¹, bringing res out of the Montgomery domain. 266 // 267 //go:noescape 268 func p256FromMont(res, in *p256Element) 269 270 // If cond is not 0, sets val = -val mod p. 271 // 272 //go:noescape 273 func p256NegCond(val *p256Element, cond int) 274 275 // If cond is 0, sets res = b, otherwise sets res = a. 276 // 277 //go:noescape 278 func p256MovCond(res, a, b *P256Point, cond int) 279 280 //go:noescape 281 func p256BigToLittle(res *p256Element, in *[32]byte) 282 283 //go:noescape 284 func p256LittleToBig(res *[32]byte, in *p256Element) 285 286 //go:noescape 287 func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte) 288 289 //go:noescape 290 func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement) 291 292 // p256Table is a table of the first 16 multiples of a point. Points are stored 293 // at an index offset of -1 so [8]P is at index 7, P is at 0, and [16]P is at 15. 294 // [0]P is the point at infinity and it's not stored. 295 type p256Table [16]P256Point 296 297 // p256Select sets res to the point at index idx - 1 in the table. 298 // idx must be in [1, 16] or res will be set to an undefined value. 299 // It executes in constant time. 300 // 301 //go:noescape 302 func p256Select(res *P256Point, table *p256Table, idx int) 303 304 // p256AffinePoint is a point in affine coordinates (x, y). x and y are still 305 // Montgomery domain elements. The point can't be the point at infinity. 306 type p256AffinePoint struct { 307 x, y p256Element 308 } 309 310 // p256AffineTable is a table of the first 32 multiples of a point. Points are 311 // stored at an index offset of -1 like in p256Table, and [0]P is not stored. 312 type p256AffineTable [32]p256AffinePoint 313 314 // p256Precomputed is a series of precomputed multiples of G, the canonical 315 // generator. The first p256AffineTable contains multiples of G. The second one 316 // multiples of [2⁶]G, the third one of [2¹²]G, and so on, where each successive 317 // table is the previous table doubled six times. Six is the width of the 318 // sliding window used in p256ScalarMult, and having each table already 319 // pre-doubled lets us avoid the doublings between windows entirely. This table 320 // MUST NOT be modified, as it aliases into p256PrecomputedEmbed below. 321 var p256Precomputed *[43]p256AffineTable 322 323 //go:embed p256_asm_table.bin 324 var p256PrecomputedEmbed string 325 326 func init() { 327 p256PrecomputedPtr := (*unsafe.Pointer)(unsafe.Pointer(&p256PrecomputedEmbed)) 328 if runtime.GOARCH == "s390x" { 329 var newTable [43 * 32 * 2 * 4]uint64 330 for i, x := range (*[43 * 32 * 2 * 4][8]byte)(*p256PrecomputedPtr) { 331 newTable[i] = binary.LittleEndian.Uint64(x[:]) 332 } 333 newTablePtr := unsafe.Pointer(&newTable) 334 p256PrecomputedPtr = &newTablePtr 335 } 336 p256Precomputed = (*[43]p256AffineTable)(*p256PrecomputedPtr) 337 } 338 339 // p256SelectAffine sets res to the point at index idx - 1 in the table. 340 // idx must be in [1, 32] or res will be set to an undefined value. 341 // It executes in constant time. 342 // 343 //go:noescape 344 func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int) 345 346 // Point addition with an affine point and constant time conditions. 347 // If zero is 0, sets res = in2. If sel is 0, sets res = in1. 348 // If sign is not 0, sets res = in1 + -in2. Otherwise, sets res = in1 + in2. 349 // If neither sel nor zero are 0 and in1 = in2, or both zero and sel are 0, 350 // or in1 is the infinity, res is undefined. 351 // 352 //go:noescape 353 func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int) 354 355 // Point addition. Sets res = in1 + in2 and returns zero if in1 and in2 are not 356 // equal. Otherwise, returns one and res is undefined. If in1 or in2 are the 357 // point at infinity, res and the return value are undefined. 358 // 359 //go:noescape 360 func p256PointAddAsm(res, in1, in2 *P256Point) int 361 362 // Point doubling. Sets res = in + in. in can be the point at infinity. 363 // 364 //go:noescape 365 func p256PointDoubleAsm(res, in *P256Point) 366 367 // p256OrdElement is a P-256 scalar field element in [0, ord(G)-1] in the 368 // Montgomery domain (with R 2²⁵⁶) as four uint64 limbs in little-endian order. 369 type p256OrdElement [4]uint64 370 371 // p256OrdReduce ensures s is in the range [0, ord(G)-1]. 372 func p256OrdReduce(s *p256OrdElement) { 373 // Since 2 * ord(G) > 2²⁵⁶, we can just conditionally subtract ord(G), 374 // keeping the result if it doesn't underflow. 375 t0, b := bits.Sub64(s[0], 0xf3b9cac2fc632551, 0) 376 t1, b := bits.Sub64(s[1], 0xbce6faada7179e84, b) 377 t2, b := bits.Sub64(s[2], 0xffffffffffffffff, b) 378 t3, b := bits.Sub64(s[3], 0xffffffff00000000, b) 379 tMask := b - 1 // zero if subtraction underflowed 380 s[0] ^= (t0 ^ s[0]) & tMask 381 s[1] ^= (t1 ^ s[1]) & tMask 382 s[2] ^= (t2 ^ s[2]) & tMask 383 s[3] ^= (t3 ^ s[3]) & tMask 384 } 385 386 // Add sets q = p1 + p2, and returns q. The points may overlap. 387 func (q *P256Point) Add(r1, r2 *P256Point) *P256Point { 388 var sum, double P256Point 389 r1IsInfinity := r1.isInfinity() 390 r2IsInfinity := r2.isInfinity() 391 pointsEqual := p256PointAddAsm(&sum, r1, r2) 392 p256PointDoubleAsm(&double, r1) 393 p256MovCond(&sum, &double, &sum, pointsEqual) 394 p256MovCond(&sum, r1, &sum, r2IsInfinity) 395 p256MovCond(&sum, r2, &sum, r1IsInfinity) 396 return q.Set(&sum) 397 } 398 399 // Double sets q = p + p, and returns q. The points may overlap. 400 func (q *P256Point) Double(p *P256Point) *P256Point { 401 var double P256Point 402 p256PointDoubleAsm(&double, p) 403 return q.Set(&double) 404 } 405 406 // ScalarBaseMult sets r = scalar * generator, where scalar is a 32-byte big 407 // endian value, and returns r. If scalar is not 32 bytes long, ScalarBaseMult 408 // returns an error and the receiver is unchanged. 409 func (r *P256Point) ScalarBaseMult(scalar []byte) (*P256Point, error) { 410 if len(scalar) != 32 { 411 return nil, errors.New("invalid scalar length") 412 } 413 scalarReversed := new(p256OrdElement) 414 p256OrdBigToLittle(scalarReversed, (*[32]byte)(scalar)) 415 p256OrdReduce(scalarReversed) 416 417 r.p256BaseMult(scalarReversed) 418 return r, nil 419 } 420 421 // ScalarMult sets r = scalar * q, where scalar is a 32-byte big endian value, 422 // and returns r. If scalar is not 32 bytes long, ScalarBaseMult returns an 423 // error and the receiver is unchanged. 424 func (r *P256Point) ScalarMult(q *P256Point, scalar []byte) (*P256Point, error) { 425 if len(scalar) != 32 { 426 return nil, errors.New("invalid scalar length") 427 } 428 scalarReversed := new(p256OrdElement) 429 p256OrdBigToLittle(scalarReversed, (*[32]byte)(scalar)) 430 p256OrdReduce(scalarReversed) 431 432 r.Set(q).p256ScalarMult(scalarReversed) 433 return r, nil 434 } 435 436 // uint64IsZero returns 1 if x is zero and zero otherwise. 437 func uint64IsZero(x uint64) int { 438 x = ^x 439 x &= x >> 32 440 x &= x >> 16 441 x &= x >> 8 442 x &= x >> 4 443 x &= x >> 2 444 x &= x >> 1 445 return int(x & 1) 446 } 447 448 // p256Equal returns 1 if a and b are equal and 0 otherwise. 449 func p256Equal(a, b *p256Element) int { 450 var acc uint64 451 for i := range a { 452 acc |= a[i] ^ b[i] 453 } 454 return uint64IsZero(acc) 455 } 456 457 // isInfinity returns 1 if p is the point at infinity and 0 otherwise. 458 func (p *P256Point) isInfinity() int { 459 return p256Equal(&p.z, &p256Zero) 460 } 461 462 // Bytes returns the uncompressed or infinity encoding of p, as specified in 463 // SEC 1, Version 2.0, Section 2.3.3. Note that the encoding of the point at 464 // infinity is shorter than all other encodings. 465 func (p *P256Point) Bytes() []byte { 466 // This function is outlined to make the allocations inline in the caller 467 // rather than happen on the heap. 468 var out [p256UncompressedLength]byte 469 return p.bytes(&out) 470 } 471 472 func (p *P256Point) bytes(out *[p256UncompressedLength]byte) []byte { 473 // The proper representation of the point at infinity is a single zero byte. 474 if p.isInfinity() == 1 { 475 return append(out[:0], 0) 476 } 477 478 x, y := new(p256Element), new(p256Element) 479 p.affineFromMont(x, y) 480 481 out[0] = 4 // Uncompressed form. 482 p256LittleToBig((*[32]byte)(out[1:33]), x) 483 p256LittleToBig((*[32]byte)(out[33:65]), y) 484 485 return out[:] 486 } 487 488 // affineFromMont sets (x, y) to the affine coordinates of p, converted out of the 489 // Montgomery domain. 490 func (p *P256Point) affineFromMont(x, y *p256Element) { 491 p256Inverse(y, &p.z) 492 p256Sqr(x, y, 1) 493 p256Mul(y, y, x) 494 495 p256Mul(x, &p.x, x) 496 p256Mul(y, &p.y, y) 497 498 p256FromMont(x, x) 499 p256FromMont(y, y) 500 } 501 502 // BytesX returns the encoding of the x-coordinate of p, as specified in SEC 1, 503 // Version 2.0, Section 2.3.5, or an error if p is the point at infinity. 504 func (p *P256Point) BytesX() ([]byte, error) { 505 // This function is outlined to make the allocations inline in the caller 506 // rather than happen on the heap. 507 var out [p256ElementLength]byte 508 return p.bytesX(&out) 509 } 510 511 func (p *P256Point) bytesX(out *[p256ElementLength]byte) ([]byte, error) { 512 if p.isInfinity() == 1 { 513 return nil, errors.New("P256 point is the point at infinity") 514 } 515 516 x := new(p256Element) 517 p256Inverse(x, &p.z) 518 p256Sqr(x, x, 1) 519 p256Mul(x, &p.x, x) 520 p256FromMont(x, x) 521 p256LittleToBig((*[32]byte)(out[:]), x) 522 523 return out[:], nil 524 } 525 526 // BytesCompressed returns the compressed or infinity encoding of p, as 527 // specified in SEC 1, Version 2.0, Section 2.3.3. Note that the encoding of the 528 // point at infinity is shorter than all other encodings. 529 func (p *P256Point) BytesCompressed() []byte { 530 // This function is outlined to make the allocations inline in the caller 531 // rather than happen on the heap. 532 var out [p256CompressedLength]byte 533 return p.bytesCompressed(&out) 534 } 535 536 func (p *P256Point) bytesCompressed(out *[p256CompressedLength]byte) []byte { 537 if p.isInfinity() == 1 { 538 return append(out[:0], 0) 539 } 540 541 x, y := new(p256Element), new(p256Element) 542 p.affineFromMont(x, y) 543 544 out[0] = 2 | byte(y[0]&1) 545 p256LittleToBig((*[32]byte)(out[1:33]), x) 546 547 return out[:] 548 } 549 550 // Select sets q to p1 if cond == 1, and to p2 if cond == 0. 551 func (q *P256Point) Select(p1, p2 *P256Point, cond int) *P256Point { 552 p256MovCond(q, p1, p2, cond) 553 return q 554 } 555 556 // p256Inverse sets out to in⁻¹ mod p. If in is zero, out will be zero. 557 func p256Inverse(out, in *p256Element) { 558 // Inversion is calculated through exponentiation by p - 2, per Fermat's 559 // little theorem. 560 // 561 // The sequence of 12 multiplications and 255 squarings is derived from the 562 // following addition chain generated with github.com/mmcloughlin/addchain 563 // v0.4.0. 564 // 565 // _10 = 2*1 566 // _11 = 1 + _10 567 // _110 = 2*_11 568 // _111 = 1 + _110 569 // _111000 = _111 << 3 570 // _111111 = _111 + _111000 571 // x12 = _111111 << 6 + _111111 572 // x15 = x12 << 3 + _111 573 // x16 = 2*x15 + 1 574 // x32 = x16 << 16 + x16 575 // i53 = x32 << 15 576 // x47 = x15 + i53 577 // i263 = ((i53 << 17 + 1) << 143 + x47) << 47 578 // return (x47 + i263) << 2 + 1 579 // 580 var z = new(p256Element) 581 var t0 = new(p256Element) 582 var t1 = new(p256Element) 583 584 p256Sqr(z, in, 1) 585 p256Mul(z, in, z) 586 p256Sqr(z, z, 1) 587 p256Mul(z, in, z) 588 p256Sqr(t0, z, 3) 589 p256Mul(t0, z, t0) 590 p256Sqr(t1, t0, 6) 591 p256Mul(t0, t0, t1) 592 p256Sqr(t0, t0, 3) 593 p256Mul(z, z, t0) 594 p256Sqr(t0, z, 1) 595 p256Mul(t0, in, t0) 596 p256Sqr(t1, t0, 16) 597 p256Mul(t0, t0, t1) 598 p256Sqr(t0, t0, 15) 599 p256Mul(z, z, t0) 600 p256Sqr(t0, t0, 17) 601 p256Mul(t0, in, t0) 602 p256Sqr(t0, t0, 143) 603 p256Mul(t0, z, t0) 604 p256Sqr(t0, t0, 47) 605 p256Mul(z, z, t0) 606 p256Sqr(z, z, 2) 607 p256Mul(out, in, z) 608 } 609 610 // p256OrdRsh returns the 64 least significant bits of x >> n. n must be lower 611 // than 256. The value of n leaks through timing side-channels. 612 func p256OrdRsh(x *p256OrdElement, n int) uint64 { 613 i := n / 64 614 n = n % 64 615 res := x[i] >> n 616 // Shift in the more significant limb, if present. 617 if i := i + 1; i < len(x) { 618 res |= x[i] << (64 - n) 619 } 620 return res 621 } 622 623 func boothW5(in uint64) (int, int) { 624 s := ^((in >> 5) - 1) 625 d := (1 << 6) - in - 1 626 d = (d & s) | (in & (^s)) 627 d = (d >> 1) + (d & 1) 628 return int(d), int(s & 1) 629 } 630 631 func boothW6(in uint64) (int, int) { 632 s := ^((in >> 6) - 1) 633 d := (1 << 7) - in - 1 634 d = (d & s) | (in & (^s)) 635 d = (d >> 1) + (d & 1) 636 return int(d), int(s & 1) 637 } 638 639 func (p *P256Point) p256BaseMult(scalar *p256OrdElement) { 640 // This function works like p256ScalarMult below, but the table is fixed and 641 // "pre-doubled" for each iteration, so instead of doubling we move to the 642 // next table at each iteration. 643 644 // Start scanning the window from the most significant bits. We move by 645 // 6 bits at a time and need to finish at -1, so -1 + 6 * 42 = 251. 646 index := 251 647 648 sel, sign := boothW6(p256OrdRsh(scalar, index)) 649 // sign is always zero because the boothW6 input here is at 650 // most five bits long, so the top bit is never set. 651 _ = sign 652 653 var t0 p256AffinePoint 654 p256SelectAffine(&t0, &p256Precomputed[(index+1)/6], sel) 655 p.x, p.y, p.z = t0.x, t0.y, p256One 656 zero := sel 657 658 for index >= 5 { 659 index -= 6 660 661 if index >= 0 { 662 sel, sign = boothW6(p256OrdRsh(scalar, index) & 0b1111111) 663 } else { 664 // Booth encoding considers a virtual zero bit at index -1, 665 // so we shift left the least significant limb. 666 wvalue := (scalar[0] << 1) & 0b1111111 667 sel, sign = boothW6(wvalue) 668 } 669 670 table := &p256Precomputed[(index+1)/6] 671 p256SelectAffine(&t0, table, sel) 672 673 // See p256ScalarMult for the behavior of sign, sel, and zero, that here 674 // is all rolled into the p256PointAddAffineAsm function. We also know 675 // that (if sel and zero are not 0) p != t0 for a similar reason. 676 p256PointAddAffineAsm(p, p, &t0, sign, sel, zero) 677 zero |= sel 678 } 679 680 // If zero is 0, the whole scalar was zero, p is undefined, 681 // and the correct result is the infinity. 682 infinity := NewP256Point() 683 p256MovCond(p, p, infinity, zero) 684 } 685 686 func (p *P256Point) p256ScalarMult(scalar *p256OrdElement) { 687 // If p is the point at infinity, p256PointAddAsm's behavior below is 688 // undefined. We'll just return the infinity at the end. 689 isInfinity := p.isInfinity() 690 691 // precomp is a table of precomputed points that stores 692 // powers of p from p^1 to p^16. 693 var precomp p256Table 694 var t0, t1, t2, t3 P256Point 695 696 // Prepare the table by double and adding. 697 precomp[0] = *p // 1 698 699 p256PointDoubleAsm(&t0, p) 700 p256PointDoubleAsm(&t1, &t0) 701 p256PointDoubleAsm(&t2, &t1) 702 p256PointDoubleAsm(&t3, &t2) 703 precomp[1] = t0 // 2 704 precomp[3] = t1 // 4 705 precomp[7] = t2 // 8 706 precomp[15] = t3 // 16 707 708 p256PointAddAsm(&t0, &t0, p) 709 p256PointAddAsm(&t1, &t1, p) 710 p256PointAddAsm(&t2, &t2, p) 711 precomp[2] = t0 // 3 712 precomp[4] = t1 // 5 713 precomp[8] = t2 // 9 714 715 p256PointDoubleAsm(&t0, &t0) 716 p256PointDoubleAsm(&t1, &t1) 717 precomp[5] = t0 // 6 718 precomp[9] = t1 // 10 719 720 p256PointAddAsm(&t2, &t0, p) 721 p256PointAddAsm(&t1, &t1, p) 722 precomp[6] = t2 // 7 723 precomp[10] = t1 // 11 724 725 p256PointDoubleAsm(&t0, &t0) 726 p256PointDoubleAsm(&t2, &t2) 727 precomp[11] = t0 // 12 728 precomp[13] = t2 // 14 729 730 p256PointAddAsm(&t0, &t0, p) 731 p256PointAddAsm(&t2, &t2, p) 732 precomp[12] = t0 // 13 733 precomp[14] = t2 // 15 734 735 // Start scanning the window from the most significant bits. We move by 736 // 5 bits at a time and need to finish at -1, so -1 + 5 * 51 = 254. 737 index := 254 738 739 sel, sign := boothW5(p256OrdRsh(scalar, index)) 740 // sign is always zero because the boothW5 input here is at 741 // most two bits long, so the top bit is never set. 742 _ = sign 743 744 p256Select(p, &precomp, sel) 745 zero := sel 746 747 for index >= 4 { 748 index -= 5 749 750 p256PointDoubleAsm(p, p) 751 p256PointDoubleAsm(p, p) 752 p256PointDoubleAsm(p, p) 753 p256PointDoubleAsm(p, p) 754 p256PointDoubleAsm(p, p) 755 756 if index >= 0 { 757 sel, sign = boothW5(p256OrdRsh(scalar, index) & 0b111111) 758 } else { 759 // Booth encoding considers a virtual zero bit at index -1, 760 // so we shift left the least significant limb. 761 wvalue := (scalar[0] << 1) & 0b111111 762 sel, sign = boothW5(wvalue) 763 } 764 765 p256Select(&t0, &precomp, sel) 766 p256NegCond(&t0.y, sign) 767 768 // We don't check the return value of p256PointAddAsm because t0 is 769 // [±1-16]P, while p was just doubled five times and can't have wrapped 770 // around because scalar is less than the group order. 771 p256PointAddAsm(&t1, p, &t0) 772 773 // If sel is 0, t0 was undefined and the correct result is p unmodified. 774 // If zero is 0, all previous sel were 0 and the correct result is t0. 775 // If both are 0, the result doesn't matter as it will be thrown out. 776 p256MovCond(&t1, &t1, p, sel) 777 p256MovCond(p, &t1, &t0, zero) 778 zero |= sel 779 } 780 781 // If zero is 0, the whole scalar was zero. 782 // If isInfinity is 1, the input point was the infinity. 783 // In both cases, p is undefined and the correct result is the infinity. 784 infinity := NewP256Point() 785 wantInfinity := zero & (isInfinity - 1) 786 p256MovCond(p, p, infinity, wantInfinity) 787 }