github.com/twelsh-aw/go/src@v0.0.0-20230516233729-a56fe86a7c81/crypto/internal/nistec/p256_asm.go

github.com/twelsh-aw/go/src@v0.0.0-20230516233729-a56fe86a7c81/crypto/internal/nistec/p256_asm.go (about)

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This file contains the Go wrapper for the constant-time, 64-bit assembly
     6  // implementation of P256. The optimizations performed here are described in
     7  // detail in:
     8  // S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
     9  //                          256-bit primes"
    10  // https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
    11  // https://eprint.iacr.org/2013/816.pdf
    12  
    13  //go:build amd64 || arm64 || ppc64le || s390x
    14  
    15  package nistec
    16  
    17  import (
    18  	_ "embed"
    19  	"encoding/binary"
    20  	"errors"
    21  	"math/bits"
    22  	"runtime"
    23  	"unsafe"
    24  )
    25  
// p256Element is a P-256 base field element in [0, P-1] in the Montgomery
// domain (with R = 2²⁵⁶), stored as four 64-bit limbs in little-endian order:
// index 0 holds the least significant limb.
type p256Element [4]uint64
    29  
// p256One is one in the Montgomery domain, that is R mod p = 2²⁵⁶ - p.
var p256One = p256Element{0x0000000000000001, 0xffffffff00000000,
	0xffffffffffffffff, 0x00000000fffffffe}

// p256Zero is zero. Its limbs are the same inside and outside the Montgomery
// domain, and it is the Z coordinate used for the point at infinity.
var p256Zero = p256Element{}

// p256P is the field prime p = 2²⁵⁶ - 2²²⁴ + 2¹⁹² + 2⁹⁶ - 1, stored as plain
// little-endian limbs. Unlike other p256Element values it is NOT in the
// Montgomery domain (there it would be zero); p256LessThanP and p256Add use it
// as the raw modulus.
var p256P = p256Element{0xffffffffffffffff, 0x00000000ffffffff,
	0x0000000000000000, 0xffffffff00000001}
    39  
// P256Point is a P-256 point. The zero value should not be assumed to be valid
// (although it is in this implementation, where it has Z = 0 and therefore
// represents the point at infinity).
type P256Point struct {
	// (X:Y:Z) are Jacobian coordinates where x = X/Z² and y = Y/Z³. The point
	// at infinity can be represented by any set of coordinates with Z = 0.
	// All three coordinates are Montgomery-domain field elements.
	x, y, z p256Element
}
    47  
    48  // NewP256Point returns a new P256Point representing the point at infinity.
    49  func NewP256Point() *P256Point {
    50  	return &P256Point{
    51  		x: p256One, y: p256One, z: p256Zero,
    52  	}
    53  }
    54  
    55  // SetGenerator sets p to the canonical generator and returns p.
    56  func (p *P256Point) SetGenerator() *P256Point {
    57  	p.x = p256Element{0x79e730d418a9143c, 0x75ba95fc5fedb601,
    58  		0x79fb732b77622510, 0x18905f76a53755c6}
    59  	p.y = p256Element{0xddf25357ce95560a, 0x8b4ab8e4ba19e45c,
    60  		0xd2e88688dd21f325, 0x8571ff1825885d85}
    61  	p.z = p256One
    62  	return p
    63  }
    64  
    65  // Set sets p = q and returns p.
    66  func (p *P256Point) Set(q *P256Point) *P256Point {
    67  	p.x, p.y, p.z = q.x, q.y, q.z
    68  	return p
    69  }
    70  
    71  const p256ElementLength = 32
    72  const p256UncompressedLength = 1 + 2*p256ElementLength
    73  const p256CompressedLength = 1 + p256ElementLength
    74  
// SetBytes sets p to the compressed, uncompressed, or infinity value encoded in
// b, as specified in SEC 1, Version 2.0, Section 2.3.4. If the point is not on
// the curve, it returns nil and an error, and the receiver is unchanged.
// Otherwise, it returns p.
//
// Accepted encodings are selected by length and leading byte:
//   - 1 byte, 0x00: the point at infinity;
//   - 65 bytes, leading 0x04: uncompressed X || Y;
//   - 33 bytes, leading 0x02 or 0x03: compressed X with the parity of Y.
//
// Any other input, a coordinate that is not less than p, or coordinates that
// do not satisfy the curve equation produce an error.
func (p *P256Point) SetBytes(b []byte) (*P256Point, error) {
	// p256Mul operates in the Montgomery domain with R = 2²⁵⁶ mod p. Thus rr
	// here is R in the Montgomery domain, or R×R mod p. Multiplying a plain
	// value by rr with p256Mul therefore converts it into the Montgomery
	// domain. See comment in P256OrdInverse (defined elsewhere in the package)
	// about how this is used.
	rr := p256Element{0x0000000000000003, 0xfffffffbffffffff,
		0xfffffffffffffffe, 0x00000004fffffffd}

	switch {
	// Point at infinity.
	case len(b) == 1 && b[0] == 0:
		return p.Set(NewP256Point()), nil

	// Uncompressed form.
	case len(b) == p256UncompressedLength && b[0] == 4:
		// Work on a scratch point so that p is untouched on failure.
		var r P256Point
		p256BigToLittle(&r.x, (*[32]byte)(b[1:33]))
		p256BigToLittle(&r.y, (*[32]byte)(b[33:65]))
		// Reject non-canonical coordinates (values >= p) before converting.
		if p256LessThanP(&r.x) == 0 || p256LessThanP(&r.y) == 0 {
			return nil, errors.New("invalid P256 element encoding")
		}
		// Convert both coordinates into the Montgomery domain.
		p256Mul(&r.x, &r.x, &rr)
		p256Mul(&r.y, &r.y, &rr)
		if err := p256CheckOnCurve(&r.x, &r.y); err != nil {
			return nil, err
		}
		r.z = p256One
		return p.Set(&r), nil

	// Compressed form.
	case len(b) == p256CompressedLength && (b[0] == 2 || b[0] == 3):
		// Work on a scratch point so that p is untouched on failure.
		var r P256Point
		p256BigToLittle(&r.x, (*[32]byte)(b[1:33]))
		if p256LessThanP(&r.x) == 0 {
			return nil, errors.New("invalid P256 element encoding")
		}
		p256Mul(&r.x, &r.x, &rr)

		// y² = x³ - 3x + b
		p256Polynomial(&r.y, &r.x)
		// If the right-hand side has no square root, x is not the
		// x-coordinate of any curve point.
		if !p256Sqrt(&r.y, &r.y) {
			return nil, errors.New("invalid P256 compressed point encoding")
		}

		// Select the positive or negative root, as indicated by the least
		// significant bit, based on the encoding type byte.
		// The parity must be read outside the Montgomery domain, hence the
		// conversion into yy; cond is 1 when the computed root has the wrong
		// parity and must be negated.
		yy := new(p256Element)
		p256FromMont(yy, &r.y)
		cond := int(yy[0]&1) ^ int(b[0]&1)
		p256NegCond(&r.y, cond)

		r.z = p256One
		return p.Set(&r), nil

	default:
		return nil, errors.New("invalid P256 point encoding")
	}
}
   136  
   137  // p256Polynomial sets y2 to x³ - 3x + b, and returns y2.
   138  func p256Polynomial(y2, x *p256Element) *p256Element {
   139  	x3 := new(p256Element)
   140  	p256Sqr(x3, x, 1)
   141  	p256Mul(x3, x3, x)
   142  
   143  	threeX := new(p256Element)
   144  	p256Add(threeX, x, x)
   145  	p256Add(threeX, threeX, x)
   146  	p256NegCond(threeX, 1)
   147  
   148  	p256B := &p256Element{0xd89cdf6229c4bddf, 0xacf005cd78843090,
   149  		0xe5a220abf7212ed6, 0xdc30061d04874834}
   150  
   151  	p256Add(x3, x3, threeX)
   152  	p256Add(x3, x3, p256B)
   153  
   154  	*y2 = *x3
   155  	return y2
   156  }
   157  
   158  func p256CheckOnCurve(x, y *p256Element) error {
   159  	// y² = x³ - 3x + b
   160  	rhs := p256Polynomial(new(p256Element), x)
   161  	lhs := new(p256Element)
   162  	p256Sqr(lhs, y, 1)
   163  	if p256Equal(lhs, rhs) != 1 {
   164  		return errors.New("P256 point not on curve")
   165  	}
   166  	return nil
   167  }
   168  
   169  // p256LessThanP returns 1 if x < p, and 0 otherwise. Note that a p256Element is
   170  // not allowed to be equal to or greater than p, so if this function returns 0
   171  // then x is invalid.
   172  func p256LessThanP(x *p256Element) int {
   173  	var b uint64
   174  	_, b = bits.Sub64(x[0], p256P[0], b)
   175  	_, b = bits.Sub64(x[1], p256P[1], b)
   176  	_, b = bits.Sub64(x[2], p256P[2], b)
   177  	_, b = bits.Sub64(x[3], p256P[3], b)
   178  	return int(b)
   179  }
   180  
   181  // p256Add sets res = x + y.
   182  func p256Add(res, x, y *p256Element) {
   183  	var c, b uint64
   184  	t1 := make([]uint64, 4)
   185  	t1[0], c = bits.Add64(x[0], y[0], 0)
   186  	t1[1], c = bits.Add64(x[1], y[1], c)
   187  	t1[2], c = bits.Add64(x[2], y[2], c)
   188  	t1[3], c = bits.Add64(x[3], y[3], c)
   189  	t2 := make([]uint64, 4)
   190  	t2[0], b = bits.Sub64(t1[0], p256P[0], 0)
   191  	t2[1], b = bits.Sub64(t1[1], p256P[1], b)
   192  	t2[2], b = bits.Sub64(t1[2], p256P[2], b)
   193  	t2[3], b = bits.Sub64(t1[3], p256P[3], b)
   194  	// Three options:
   195  	//   - a+b < p
   196  	//     then c is 0, b is 1, and t1 is correct
   197  	//   - p <= a+b < 2^256
   198  	//     then c is 0, b is 0, and t2 is correct
   199  	//   - 2^256 <= a+b
   200  	//     then c is 1, b is 1, and t2 is correct
   201  	t2Mask := (c ^ b) - 1
   202  	res[0] = (t1[0] & ^t2Mask) | (t2[0] & t2Mask)
   203  	res[1] = (t1[1] & ^t2Mask) | (t2[1] & t2Mask)
   204  	res[2] = (t1[2] & ^t2Mask) | (t2[2] & t2Mask)
   205  	res[3] = (t1[3] & ^t2Mask) | (t2[3] & t2Mask)
   206  }
   207  
// p256Sqrt sets e to a square root of x. If x is not a square, p256Sqrt returns
// false and e is unchanged. e and x can overlap.
//
// Both x and e are Montgomery-domain elements. Which of the two roots is
// produced is not chosen here; callers that need a specific parity (see
// SetBytes) negate the result afterwards.
func p256Sqrt(e, x *p256Element) (isSquare bool) {
	t0, t1 := new(p256Element), new(p256Element)

	// Since p = 3 mod 4, exponentiation by (p + 1) / 4 yields a square root candidate.
	//
	// The sequence of 7 multiplications and 253 squarings is derived from the
	// following addition chain generated with github.com/mmcloughlin/addchain v0.4.0.
	//
	//	_10       = 2*1
	//	_11       = 1 + _10
	//	_1100     = _11 << 2
	//	_1111     = _11 + _1100
	//	_11110000 = _1111 << 4
	//	_11111111 = _1111 + _11110000
	//	x16       = _11111111 << 8 + _11111111
	//	x32       = x16 << 16 + x16
	//	return      ((x32 << 32 + 1) << 96 + 1) << 94
	//
	p256Sqr(t0, x, 1)
	p256Mul(t0, x, t0)
	p256Sqr(t1, t0, 2)
	p256Mul(t0, t0, t1)
	p256Sqr(t1, t0, 4)
	p256Mul(t0, t0, t1)
	p256Sqr(t1, t0, 8)
	p256Mul(t0, t0, t1)
	p256Sqr(t1, t0, 16)
	p256Mul(t0, t0, t1)
	p256Sqr(t0, t0, 32)
	p256Mul(t0, x, t0)
	p256Sqr(t0, t0, 96)
	p256Mul(t0, x, t0)
	p256Sqr(t0, t0, 94)

	// t0 is only a candidate: verify it by squaring and comparing with x.
	// e is written only after the check succeeds, which is why e and x may
	// alias and why e is untouched on failure.
	p256Sqr(t1, t0, 1)
	if p256Equal(t1, x) != 1 {
		return false
	}
	*e = *t0
	return true
}
   251  
// The following assembly functions are implemented in p256_asm_*.s

// p256Mul is Montgomery multiplication. It sets res = in1 * in2 * R⁻¹ mod p.
// Callers in this file pass res aliasing in1 and/or in2.
//
//go:noescape
func p256Mul(res, in1, in2 *p256Element)

// p256Sqr is Montgomery squaring, repeated n times (n >= 1). Callers in this
// file pass res aliasing in.
//
//go:noescape
func p256Sqr(res, in *p256Element, n int)

// p256FromMont is Montgomery multiplication by R⁻¹, or 1 outside the domain.
// It sets res = in * R⁻¹, bringing res out of the Montgomery domain.
//
//go:noescape
func p256FromMont(res, in *p256Element)

// p256NegCond sets val = -val mod p if cond is not 0, and leaves val
// unchanged otherwise.
//
//go:noescape
func p256NegCond(val *p256Element, cond int)

// p256MovCond sets res = b if cond is 0, and res = a otherwise. Callers in
// this file pass res aliasing a or b.
//
//go:noescape
func p256MovCond(res, a, b *P256Point, cond int)

// p256BigToLittle sets res to the value of the 32-byte big-endian encoding in,
// as little-endian limbs.
// NOTE(review): SetBytes range-checks the result with p256LessThanP, so this
// presumably performs no reduction mod p — confirm against the assembly.
//
//go:noescape
func p256BigToLittle(res *p256Element, in *[32]byte)

// p256LittleToBig writes the limbs of in to res as a 32-byte big-endian
// encoding. It is the counterpart of p256BigToLittle.
//
//go:noescape
func p256LittleToBig(res *[32]byte, in *p256Element)

// p256OrdBigToLittle is the scalar (p256OrdElement) counterpart of
// p256BigToLittle: it loads a 32-byte big-endian scalar into limbs.
// NOTE(review): callers follow it with p256OrdReduce, so it presumably does
// not reduce mod the group order — confirm against the assembly.
//
//go:noescape
func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)

// p256OrdLittleToBig is the scalar (p256OrdElement) counterpart of
// p256LittleToBig. It is not called in this file.
//
//go:noescape
func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
   291  
// p256Table is a table of the first 16 multiples of a point. Points are stored
// at an index offset of -1 so [8]P is at index 7, P is at 0, and [16]P is at 15.
// [0]P is the point at infinity and it's not stored.
// It is filled in and consumed by p256ScalarMult.
type p256Table [16]P256Point

// p256Select sets res to the point at index idx - 1 in the table.
// idx must be in [1, 16] or res will be set to an undefined value.
// It executes in constant time.
// Callers must therefore handle idx == 0 (the point at infinity) themselves.
//
//go:noescape
func p256Select(res *P256Point, table *p256Table, idx int)
   303  
// p256AffinePoint is a point in affine coordinates (x, y). x and y are still
// Montgomery domain elements. The point can't be the point at infinity, since
// there is no Z coordinate to mark it.
type p256AffinePoint struct {
	x, y p256Element
}

// p256AffineTable is a table of the first 32 multiples of a point. Points are
// stored at an index offset of -1 like in p256Table, and [0]P is not stored.
type p256AffineTable [32]p256AffinePoint
   313  
// p256Precomputed is a series of precomputed multiples of G, the canonical
// generator. The first p256AffineTable contains multiples of G. The second one
// multiples of [2⁶]G, the third one of [2¹²]G, and so on, where each successive
// table is the previous table doubled six times. Six is the width of the
// window used in p256BaseMult (p256ScalarMult uses a five-bit window and its
// own table), and having each table already pre-doubled lets us avoid the
// doublings between windows entirely. This table MUST NOT be modified, as it
// aliases into p256PrecomputedEmbed below.
var p256Precomputed *[43]p256AffineTable

// p256PrecomputedEmbed holds the raw bytes of the precomputed table. It is a
// string so that the embedded data is immutable; init reinterprets it in place.
//
//go:embed p256_asm_table.bin
var p256PrecomputedEmbed string
   325  
// init points p256Precomputed at the embedded table without copying it, except
// on s390x where the limbs are first converted from the little-endian byte
// order of the embedded file.
func init() {
	// Reinterpret the string variable as a pointer to its first word, which is
	// treated here as the pointer to the string's data.
	p256PrecomputedPtr := (*unsafe.Pointer)(unsafe.Pointer(&p256PrecomputedEmbed))
	if runtime.GOARCH == "s390x" {
		// 43 tables × 32 points × 2 coordinates × 4 limbs. Each 8-byte group
		// of the embedded data is decoded as a little-endian uint64 into a
		// fresh table, and the pointer is redirected to that copy.
		var newTable [43 * 32 * 2 * 4]uint64
		for i, x := range (*[43 * 32 * 2 * 4][8]byte)(*p256PrecomputedPtr) {
			newTable[i] = binary.LittleEndian.Uint64(x[:])
		}
		newTablePtr := unsafe.Pointer(&newTable)
		p256PrecomputedPtr = &newTablePtr
	}
	p256Precomputed = (*[43]p256AffineTable)(*p256PrecomputedPtr)
}
   338  
// p256SelectAffine sets res to the point at index idx - 1 in the table.
// idx must be in [1, 32] or res will be set to an undefined value.
// It executes in constant time.
//
//go:noescape
func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)

// p256PointAddAffineAsm is point addition with an affine point and constant
// time conditions.
// If zero is 0, sets res = in2. If sel is 0, sets res = in1.
// If sign is not 0, sets res = in1 + -in2. Otherwise, sets res = in1 + in2.
// If neither sel nor zero are 0 and in1 = in2, or both zero and sel are 0,
// or in1 is the infinity, res is undefined.
// p256BaseMult calls it with res aliasing in1.
//
//go:noescape
func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)

// p256PointAddAsm is point addition. It sets res = in1 + in2 and returns zero
// if in1 and in2 are not equal. Otherwise, it returns one and res is undefined.
// If in1 or in2 are the point at infinity, res and the return value are
// undefined. Callers in this file pass res aliasing in1.
//
//go:noescape
func p256PointAddAsm(res, in1, in2 *P256Point) int

// p256PointDoubleAsm is point doubling. It sets res = in + in. in can be the
// point at infinity. Callers in this file pass res aliasing in.
//
//go:noescape
func p256PointDoubleAsm(res, in *P256Point)
   366  
// p256OrdElement is a P-256 scalar field element in [0, ord(G)-1] in the
// Montgomery domain (with R = 2²⁵⁶) as four uint64 limbs in little-endian order.
// NOTE(review): within this file, ScalarBaseMult and ScalarMult build values of
// this type with p256OrdBigToLittle and p256OrdReduce only, with no visible
// Montgomery conversion — confirm whether "Montgomery domain" applies to those
// uses.
type p256OrdElement [4]uint64
   370  
   371  // p256OrdReduce ensures s is in the range [0, ord(G)-1].
   372  func p256OrdReduce(s *p256OrdElement) {
   373  	// Since 2 * ord(G) > 2²⁵⁶, we can just conditionally subtract ord(G),
   374  	// keeping the result if it doesn't underflow.
   375  	t0, b := bits.Sub64(s[0], 0xf3b9cac2fc632551, 0)
   376  	t1, b := bits.Sub64(s[1], 0xbce6faada7179e84, b)
   377  	t2, b := bits.Sub64(s[2], 0xffffffffffffffff, b)
   378  	t3, b := bits.Sub64(s[3], 0xffffffff00000000, b)
   379  	tMask := b - 1 // zero if subtraction underflowed
   380  	s[0] ^= (t0 ^ s[0]) & tMask
   381  	s[1] ^= (t1 ^ s[1]) & tMask
   382  	s[2] ^= (t2 ^ s[2]) & tMask
   383  	s[3] ^= (t3 ^ s[3]) & tMask
   384  }
   385  
   386  // Add sets q = p1 + p2, and returns q. The points may overlap.
   387  func (q *P256Point) Add(r1, r2 *P256Point) *P256Point {
   388  	var sum, double P256Point
   389  	r1IsInfinity := r1.isInfinity()
   390  	r2IsInfinity := r2.isInfinity()
   391  	pointsEqual := p256PointAddAsm(&sum, r1, r2)
   392  	p256PointDoubleAsm(&double, r1)
   393  	p256MovCond(&sum, &double, &sum, pointsEqual)
   394  	p256MovCond(&sum, r1, &sum, r2IsInfinity)
   395  	p256MovCond(&sum, r2, &sum, r1IsInfinity)
   396  	return q.Set(&sum)
   397  }
   398  
   399  // Double sets q = p + p, and returns q. The points may overlap.
   400  func (q *P256Point) Double(p *P256Point) *P256Point {
   401  	var double P256Point
   402  	p256PointDoubleAsm(&double, p)
   403  	return q.Set(&double)
   404  }
   405  
   406  // ScalarBaseMult sets r = scalar * generator, where scalar is a 32-byte big
   407  // endian value, and returns r. If scalar is not 32 bytes long, ScalarBaseMult
   408  // returns an error and the receiver is unchanged.
   409  func (r *P256Point) ScalarBaseMult(scalar []byte) (*P256Point, error) {
   410  	if len(scalar) != 32 {
   411  		return nil, errors.New("invalid scalar length")
   412  	}
   413  	scalarReversed := new(p256OrdElement)
   414  	p256OrdBigToLittle(scalarReversed, (*[32]byte)(scalar))
   415  	p256OrdReduce(scalarReversed)
   416  
   417  	r.p256BaseMult(scalarReversed)
   418  	return r, nil
   419  }
   420  
   421  // ScalarMult sets r = scalar * q, where scalar is a 32-byte big endian value,
   422  // and returns r. If scalar is not 32 bytes long, ScalarBaseMult returns an
   423  // error and the receiver is unchanged.
   424  func (r *P256Point) ScalarMult(q *P256Point, scalar []byte) (*P256Point, error) {
   425  	if len(scalar) != 32 {
   426  		return nil, errors.New("invalid scalar length")
   427  	}
   428  	scalarReversed := new(p256OrdElement)
   429  	p256OrdBigToLittle(scalarReversed, (*[32]byte)(scalar))
   430  	p256OrdReduce(scalarReversed)
   431  
   432  	r.Set(q).p256ScalarMult(scalarReversed)
   433  	return r, nil
   434  }
   435  
   436  // uint64IsZero returns 1 if x is zero and zero otherwise.
   437  func uint64IsZero(x uint64) int {
   438  	x = ^x
   439  	x &= x >> 32
   440  	x &= x >> 16
   441  	x &= x >> 8
   442  	x &= x >> 4
   443  	x &= x >> 2
   444  	x &= x >> 1
   445  	return int(x & 1)
   446  }
   447  
   448  // p256Equal returns 1 if a and b are equal and 0 otherwise.
   449  func p256Equal(a, b *p256Element) int {
   450  	var acc uint64
   451  	for i := range a {
   452  		acc |= a[i] ^ b[i]
   453  	}
   454  	return uint64IsZero(acc)
   455  }
   456  
   457  // isInfinity returns 1 if p is the point at infinity and 0 otherwise.
   458  func (p *P256Point) isInfinity() int {
   459  	return p256Equal(&p.z, &p256Zero)
   460  }
   461  
// Bytes returns the uncompressed or infinity encoding of p, as specified in
// SEC 1, Version 2.0, Section 2.3.3. Note that the encoding of the point at
// infinity is shorter than all other encodings: it is a single zero byte,
// while every other point encodes to p256UncompressedLength bytes.
func (p *P256Point) Bytes() []byte {
	// This function is outlined to make the allocations inline in the caller
	// rather than happen on the heap.
	// Keep this wrapper trivial: the actual work lives in bytes.
	var out [p256UncompressedLength]byte
	return p.bytes(&out)
}
   471  
   472  func (p *P256Point) bytes(out *[p256UncompressedLength]byte) []byte {
   473  	// The proper representation of the point at infinity is a single zero byte.
   474  	if p.isInfinity() == 1 {
   475  		return append(out[:0], 0)
   476  	}
   477  
   478  	x, y := new(p256Element), new(p256Element)
   479  	p.affineFromMont(x, y)
   480  
   481  	out[0] = 4 // Uncompressed form.
   482  	p256LittleToBig((*[32]byte)(out[1:33]), x)
   483  	p256LittleToBig((*[32]byte)(out[33:65]), y)
   484  
   485  	return out[:]
   486  }
   487  
   488  // affineFromMont sets (x, y) to the affine coordinates of p, converted out of the
   489  // Montgomery domain.
   490  func (p *P256Point) affineFromMont(x, y *p256Element) {
   491  	p256Inverse(y, &p.z)
   492  	p256Sqr(x, y, 1)
   493  	p256Mul(y, y, x)
   494  
   495  	p256Mul(x, &p.x, x)
   496  	p256Mul(y, &p.y, y)
   497  
   498  	p256FromMont(x, x)
   499  	p256FromMont(y, y)
   500  }
   501  
// BytesX returns the encoding of the x-coordinate of p, as specified in SEC 1,
// Version 2.0, Section 2.3.5, or an error if p is the point at infinity.
// On success the result is p256ElementLength bytes long.
func (p *P256Point) BytesX() ([]byte, error) {
	// This function is outlined to make the allocations inline in the caller
	// rather than happen on the heap.
	// Keep this wrapper trivial: the actual work lives in bytesX.
	var out [p256ElementLength]byte
	return p.bytesX(&out)
}
   510  
   511  func (p *P256Point) bytesX(out *[p256ElementLength]byte) ([]byte, error) {
   512  	if p.isInfinity() == 1 {
   513  		return nil, errors.New("P256 point is the point at infinity")
   514  	}
   515  
   516  	x := new(p256Element)
   517  	p256Inverse(x, &p.z)
   518  	p256Sqr(x, x, 1)
   519  	p256Mul(x, &p.x, x)
   520  	p256FromMont(x, x)
   521  	p256LittleToBig((*[32]byte)(out[:]), x)
   522  
   523  	return out[:], nil
   524  }
   525  
// BytesCompressed returns the compressed or infinity encoding of p, as
// specified in SEC 1, Version 2.0, Section 2.3.3. Note that the encoding of the
// point at infinity is shorter than all other encodings: it is a single zero
// byte, while every other point encodes to p256CompressedLength bytes.
func (p *P256Point) BytesCompressed() []byte {
	// This function is outlined to make the allocations inline in the caller
	// rather than happen on the heap.
	// Keep this wrapper trivial: the actual work lives in bytesCompressed.
	var out [p256CompressedLength]byte
	return p.bytesCompressed(&out)
}
   535  
   536  func (p *P256Point) bytesCompressed(out *[p256CompressedLength]byte) []byte {
   537  	if p.isInfinity() == 1 {
   538  		return append(out[:0], 0)
   539  	}
   540  
   541  	x, y := new(p256Element), new(p256Element)
   542  	p.affineFromMont(x, y)
   543  
   544  	out[0] = 2 | byte(y[0]&1)
   545  	p256LittleToBig((*[32]byte)(out[1:33]), x)
   546  
   547  	return out[:]
   548  }
   549  
// Select sets q to p1 if cond == 1, and to p2 if cond == 0, and returns q.
//
// The choice is delegated to p256MovCond, which picks p2 only when cond is 0
// and p1 for every other value; callers should still pass only 0 or 1.
func (q *P256Point) Select(p1, p2 *P256Point, cond int) *P256Point {
	p256MovCond(q, p1, p2, cond)
	return q
}
   555  
// p256Inverse sets out to in⁻¹ mod p. If in is zero, out will be zero.
//
// in and out are Montgomery-domain elements. out is written only by the final
// multiplication, and all intermediate values live in local temporaries.
func p256Inverse(out, in *p256Element) {
	// Inversion is calculated through exponentiation by p - 2, per Fermat's
	// little theorem.
	//
	// The sequence of 12 multiplications and 255 squarings is derived from the
	// following addition chain generated with github.com/mmcloughlin/addchain
	// v0.4.0.
	//
	//  _10     = 2*1
	//  _11     = 1 + _10
	//  _110    = 2*_11
	//  _111    = 1 + _110
	//  _111000 = _111 << 3
	//  _111111 = _111 + _111000
	//  x12     = _111111 << 6 + _111111
	//  x15     = x12 << 3 + _111
	//  x16     = 2*x15 + 1
	//  x32     = x16 << 16 + x16
	//  i53     = x32 << 15
	//  x47     = x15 + i53
	//  i263    = ((i53 << 17 + 1) << 143 + x47) << 47
	//  return    (x47 + i263) << 2 + 1
	//
	// In the chain, "+" is a multiplication (p256Mul) and "<< k" is k
	// squarings (p256Sqr with n = k). The statement order below follows the
	// chain exactly and must not be rearranged.
	var z = new(p256Element)
	var t0 = new(p256Element)
	var t1 = new(p256Element)

	p256Sqr(z, in, 1)
	p256Mul(z, in, z)
	p256Sqr(z, z, 1)
	p256Mul(z, in, z)
	p256Sqr(t0, z, 3)
	p256Mul(t0, z, t0)
	p256Sqr(t1, t0, 6)
	p256Mul(t0, t0, t1)
	p256Sqr(t0, t0, 3)
	p256Mul(z, z, t0)
	p256Sqr(t0, z, 1)
	p256Mul(t0, in, t0)
	p256Sqr(t1, t0, 16)
	p256Mul(t0, t0, t1)
	p256Sqr(t0, t0, 15)
	p256Mul(z, z, t0)
	p256Sqr(t0, t0, 17)
	p256Mul(t0, in, t0)
	p256Sqr(t0, t0, 143)
	p256Mul(t0, z, t0)
	p256Sqr(t0, t0, 47)
	p256Mul(z, z, t0)
	p256Sqr(z, z, 2)
	p256Mul(out, in, z)
}
   609  
   610  // p256OrdRsh returns the 64 least significant bits of x >> n. n must be lower
   611  // than 256. The value of n leaks through timing side-channels.
   612  func p256OrdRsh(x *p256OrdElement, n int) uint64 {
   613  	i := n / 64
   614  	n = n % 64
   615  	res := x[i] >> n
   616  	// Shift in the more significant limb, if present.
   617  	if i := i + 1; i < len(x) {
   618  		res |= x[i] << (64 - n)
   619  	}
   620  	return res
   621  }
   622  
   623  func boothW5(in uint64) (int, int) {
   624  	s := ^((in >> 5) - 1)
   625  	d := (1 << 6) - in - 1
   626  	d = (d & s) | (in & (^s))
   627  	d = (d >> 1) + (d & 1)
   628  	return int(d), int(s & 1)
   629  }
   630  
   631  func boothW6(in uint64) (int, int) {
   632  	s := ^((in >> 6) - 1)
   633  	d := (1 << 7) - in - 1
   634  	d = (d & s) | (in & (^s))
   635  	d = (d >> 1) + (d & 1)
   636  	return int(d), int(s & 1)
   637  }
   638  
// p256BaseMult sets p = scalar * G, where G is the canonical generator, using
// the precomputed tables in p256Precomputed. The previous value of p is
// ignored. The public entry point ScalarBaseMult reduces the scalar with
// p256OrdReduce before calling this.
func (p *P256Point) p256BaseMult(scalar *p256OrdElement) {
	// This function works like p256ScalarMult below, but the table is fixed and
	// "pre-doubled" for each iteration, so instead of doubling we move to the
	// next table at each iteration.

	// Start scanning the window from the most significant bits. We move by
	// 6 bits at a time and need to finish at -1, so -1 + 6 * 42 = 251.
	index := 251

	sel, sign := boothW6(p256OrdRsh(scalar, index))
	// sign is always zero because the boothW6 input here is at
	// most five bits long, so the top bit is never set.
	_ = sign

	// Load the multiple for the top window. If sel is 0 the selected value is
	// undefined; that case is tracked through zero below.
	var t0 p256AffinePoint
	p256SelectAffine(&t0, &p256Precomputed[(index+1)/6], sel)
	p.x, p.y, p.z = t0.x, t0.y, p256One
	// zero stays 0 for as long as every window processed so far was 0.
	zero := sel

	for index >= 5 {
		index -= 6

		if index >= 0 {
			// Seven bits: the six bits of this window plus the bit below.
			sel, sign = boothW6(p256OrdRsh(scalar, index) & 0b1111111)
		} else {
			// Booth encoding considers a virtual zero bit at index -1,
			// so we shift left the least significant limb.
			wvalue := (scalar[0] << 1) & 0b1111111
			sel, sign = boothW6(wvalue)
		}

		// (index+1)/6 is the table pre-doubled for this window position.
		table := &p256Precomputed[(index+1)/6]
		p256SelectAffine(&t0, table, sel)

		// See p256ScalarMult for the behavior of sign, sel, and zero, that here
		// is all rolled into the p256PointAddAffineAsm function. We also know
		// that (if sel and zero are not 0) p != t0 for a similar reason.
		p256PointAddAffineAsm(p, p, &t0, sign, sel, zero)
		zero |= sel
	}

	// If zero is 0, the whole scalar was zero, p is undefined,
	// and the correct result is the infinity.
	infinity := NewP256Point()
	p256MovCond(p, p, infinity, zero)
}
   685  
// p256ScalarMult sets p = scalar * p in place, using a 5-bit Booth-encoded
// window over a table of the first 16 multiples of the input point. The public
// entry point ScalarMult reduces the scalar with p256OrdReduce before calling
// this; the comment on the addition in the loop relies on that.
func (p *P256Point) p256ScalarMult(scalar *p256OrdElement) {
	// If p is the point at infinity, p256PointAddAsm's behavior below is
	// undefined. We'll just return the infinity at the end.
	isInfinity := p.isInfinity()

	// precomp is a table of precomputed points that stores
	// the multiples of p from [1]p to [16]p, with [k]p at index k-1.
	var precomp p256Table
	var t0, t1, t2, t3 P256Point

	// Prepare the table by double and adding. The trailing comment on each
	// assignment is the multiple k being stored.
	precomp[0] = *p // 1

	p256PointDoubleAsm(&t0, p)
	p256PointDoubleAsm(&t1, &t0)
	p256PointDoubleAsm(&t2, &t1)
	p256PointDoubleAsm(&t3, &t2)
	precomp[1] = t0  // 2
	precomp[3] = t1  // 4
	precomp[7] = t2  // 8
	precomp[15] = t3 // 16

	p256PointAddAsm(&t0, &t0, p)
	p256PointAddAsm(&t1, &t1, p)
	p256PointAddAsm(&t2, &t2, p)
	precomp[2] = t0 // 3
	precomp[4] = t1 // 5
	precomp[8] = t2 // 9

	p256PointDoubleAsm(&t0, &t0)
	p256PointDoubleAsm(&t1, &t1)
	precomp[5] = t0 // 6
	precomp[9] = t1 // 10

	p256PointAddAsm(&t2, &t0, p)
	p256PointAddAsm(&t1, &t1, p)
	precomp[6] = t2  // 7
	precomp[10] = t1 // 11

	p256PointDoubleAsm(&t0, &t0)
	p256PointDoubleAsm(&t2, &t2)
	precomp[11] = t0 // 12
	precomp[13] = t2 // 14

	p256PointAddAsm(&t0, &t0, p)
	p256PointAddAsm(&t2, &t2, p)
	precomp[12] = t0 // 13
	precomp[14] = t2 // 15

	// Start scanning the window from the most significant bits. We move by
	// 5 bits at a time and need to finish at -1, so -1 + 5 * 51 = 254.
	index := 254

	sel, sign := boothW5(p256OrdRsh(scalar, index))
	// sign is always zero because the boothW5 input here is at
	// most two bits long, so the top bit is never set.
	_ = sign

	// From here on p is the accumulator; the input point lives on in precomp.
	p256Select(p, &precomp, sel)
	// zero stays 0 for as long as every window processed so far was 0.
	zero := sel

	for index >= 4 {
		index -= 5

		// Shift the accumulator up by one 5-bit window.
		p256PointDoubleAsm(p, p)
		p256PointDoubleAsm(p, p)
		p256PointDoubleAsm(p, p)
		p256PointDoubleAsm(p, p)
		p256PointDoubleAsm(p, p)

		if index >= 0 {
			// Six bits: the five bits of this window plus the bit below.
			sel, sign = boothW5(p256OrdRsh(scalar, index) & 0b111111)
		} else {
			// Booth encoding considers a virtual zero bit at index -1,
			// so we shift left the least significant limb.
			wvalue := (scalar[0] << 1) & 0b111111
			sel, sign = boothW5(wvalue)
		}

		// Fetch [sel]P and negate it when the Booth digit is negative.
		p256Select(&t0, &precomp, sel)
		p256NegCond(&t0.y, sign)

		// We don't check the return value of p256PointAddAsm because t0 is
		// [±1-16]P, while p was just doubled five times and can't have wrapped
		// around because scalar is less than the group order.
		p256PointAddAsm(&t1, p, &t0)

		// If sel is 0, t0 was undefined and the correct result is p unmodified.
		// If zero is 0, all previous sel were 0 and the correct result is t0.
		// If both are 0, the result doesn't matter as it will be thrown out.
		p256MovCond(&t1, &t1, p, sel)
		p256MovCond(p, &t1, &t0, zero)
		zero |= sel
	}

	// If zero is 0, the whole scalar was zero.
	// If isInfinity is 1, the input point was the infinity.
	// In both cases, p is undefined and the correct result is the infinity.
	//
	// Despite its name, wantInfinity is the p256MovCond selector: it is 0 when
	// the infinity must be returned and non-zero when p must be kept.
	// (isInfinity - 1) is 0 for an infinite input and all ones otherwise.
	infinity := NewP256Point()
	wantInfinity := zero & (isInfinity - 1)
	p256MovCond(p, p, infinity, wantInfinity)
}
   784  	infinity := NewP256Point()
   785  	wantInfinity := zero & (isInfinity - 1)
   786  	p256MovCond(p, p, infinity, wantInfinity)
   787  }