github.com/mtsmfm/go/src@v0.0.0-20221020090648-44bdcb9f8fde/crypto/internal/nistec/p256_asm.go (about)

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This file contains the Go wrapper for the constant-time, 64-bit assembly
     6  // implementation of P256. The optimizations performed here are described in
     7  // detail in:
     8  // S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
     9  //                          256-bit primes"
    10  // https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
    11  // https://eprint.iacr.org/2013/816.pdf
    12  
    13  //go:build amd64 || arm64 || ppc64le || s390x
    14  
    15  package nistec
    16  
    17  import (
    18  	_ "embed"
    19  	"encoding/binary"
    20  	"errors"
    21  	"math/bits"
    22  	"runtime"
    23  	"unsafe"
    24  )
    25  
// p256Element is a P-256 base field element in [0, P-1] in the Montgomery
// domain (with R = 2²⁵⁶), stored as four 64-bit limbs in little-endian limb
// order: index 0 holds the least significant 64 bits.
type p256Element [4]uint64

// p256One is one in the Montgomery domain, that is R mod P.
var p256One = p256Element{0x0000000000000001, 0xffffffff00000000,
	0xffffffffffffffff, 0x00000000fffffffe}

// p256Zero is the all-zero element. isInfinity compares a point's Z coordinate
// against it to detect the point at infinity.
var p256Zero = p256Element{}

// p256P holds the limbs of the field prime
// P = 2²⁵⁶ - 2²²⁴ + 2¹⁹² + 2⁹⁶ - 1. Unlike the other constants here, the limbs
// are the plain value of P rather than a Montgomery-encoded one; p256LessThanP
// and p256Add compare and subtract against them directly.
var p256P = p256Element{0xffffffffffffffff, 0x00000000ffffffff,
	0x0000000000000000, 0xffffffff00000001}
    39  
// P256Point is a P-256 point. The zero value should not be assumed to be valid
// (although it is in this implementation).
type P256Point struct {
	// (X:Y:Z) are Jacobian coordinates where x = X/Z² and y = Y/Z³. The point
	// at infinity can be represented by any set of coordinates with Z = 0;
	// isInfinity inspects only z. Each coordinate is a p256Element, i.e. a
	// field element kept in the Montgomery domain.
	x, y, z p256Element
}
    47  
    48  // NewP256Point returns a new P256Point representing the point at infinity.
    49  func NewP256Point() *P256Point {
    50  	return &P256Point{
    51  		x: p256One, y: p256One, z: p256Zero,
    52  	}
    53  }
    54  
    55  // SetGenerator sets p to the canonical generator and returns p.
    56  func (p *P256Point) SetGenerator() *P256Point {
    57  	p.x = p256Element{0x79e730d418a9143c, 0x75ba95fc5fedb601,
    58  		0x79fb732b77622510, 0x18905f76a53755c6}
    59  	p.y = p256Element{0xddf25357ce95560a, 0x8b4ab8e4ba19e45c,
    60  		0xd2e88688dd21f325, 0x8571ff1825885d85}
    61  	p.z = p256One
    62  	return p
    63  }
    64  
    65  // Set sets p = q and returns p.
    66  func (p *P256Point) Set(q *P256Point) *P256Point {
    67  	p.x, p.y, p.z = q.x, q.y, q.z
    68  	return p
    69  }
    70  
    71  const p256ElementLength = 32
    72  const p256UncompressedLength = 1 + 2*p256ElementLength
    73  const p256CompressedLength = 1 + p256ElementLength
    74  
    75  // SetBytes sets p to the compressed, uncompressed, or infinity value encoded in
    76  // b, as specified in SEC 1, Version 2.0, Section 2.3.4. If the point is not on
    77  // the curve, it returns nil and an error, and the receiver is unchanged.
    78  // Otherwise, it returns p.
    79  func (p *P256Point) SetBytes(b []byte) (*P256Point, error) {
    80  	// p256Mul operates in the Montgomery domain with R = 2²⁵⁶ mod p. Thus rr
    81  	// here is R in the Montgomery domain, or R×R mod p. See comment in
    82  	// P256OrdInverse about how this is used.
    83  	rr := p256Element{0x0000000000000003, 0xfffffffbffffffff,
    84  		0xfffffffffffffffe, 0x00000004fffffffd}
    85  
    86  	switch {
    87  	// Point at infinity.
    88  	case len(b) == 1 && b[0] == 0:
    89  		return p.Set(NewP256Point()), nil
    90  
    91  	// Uncompressed form.
    92  	case len(b) == p256UncompressedLength && b[0] == 4:
    93  		var r P256Point
    94  		p256BigToLittle(&r.x, (*[32]byte)(b[1:33]))
    95  		p256BigToLittle(&r.y, (*[32]byte)(b[33:65]))
    96  		if p256LessThanP(&r.x) == 0 || p256LessThanP(&r.y) == 0 {
    97  			return nil, errors.New("invalid P256 element encoding")
    98  		}
    99  		p256Mul(&r.x, &r.x, &rr)
   100  		p256Mul(&r.y, &r.y, &rr)
   101  		if err := p256CheckOnCurve(&r.x, &r.y); err != nil {
   102  			return nil, err
   103  		}
   104  		r.z = p256One
   105  		return p.Set(&r), nil
   106  
   107  	// Compressed form.
   108  	case len(b) == p256CompressedLength && (b[0] == 2 || b[0] == 3):
   109  		var r P256Point
   110  		p256BigToLittle(&r.x, (*[32]byte)(b[1:33]))
   111  		if p256LessThanP(&r.x) == 0 {
   112  			return nil, errors.New("invalid P256 element encoding")
   113  		}
   114  		p256Mul(&r.x, &r.x, &rr)
   115  
   116  		// y² = x³ - 3x + b
   117  		p256Polynomial(&r.y, &r.x)
   118  		if !p256Sqrt(&r.y, &r.y) {
   119  			return nil, errors.New("invalid P256 compressed point encoding")
   120  		}
   121  
   122  		// Select the positive or negative root, as indicated by the least
   123  		// significant bit, based on the encoding type byte.
   124  		yy := new(p256Element)
   125  		p256FromMont(yy, &r.y)
   126  		cond := int(yy[0]&1) ^ int(b[0]&1)
   127  		p256NegCond(&r.y, cond)
   128  
   129  		r.z = p256One
   130  		return p.Set(&r), nil
   131  
   132  	default:
   133  		return nil, errors.New("invalid P256 point encoding")
   134  	}
   135  }
   136  
   137  // p256Polynomial sets y2 to x³ - 3x + b, and returns y2.
   138  func p256Polynomial(y2, x *p256Element) *p256Element {
   139  	x3 := new(p256Element)
   140  	p256Sqr(x3, x, 1)
   141  	p256Mul(x3, x3, x)
   142  
   143  	threeX := new(p256Element)
   144  	p256Add(threeX, x, x)
   145  	p256Add(threeX, threeX, x)
   146  	p256NegCond(threeX, 1)
   147  
   148  	p256B := &p256Element{0xd89cdf6229c4bddf, 0xacf005cd78843090,
   149  		0xe5a220abf7212ed6, 0xdc30061d04874834}
   150  
   151  	p256Add(x3, x3, threeX)
   152  	p256Add(x3, x3, p256B)
   153  
   154  	*y2 = *x3
   155  	return y2
   156  }
   157  
   158  func p256CheckOnCurve(x, y *p256Element) error {
   159  	// y² = x³ - 3x + b
   160  	rhs := p256Polynomial(new(p256Element), x)
   161  	lhs := new(p256Element)
   162  	p256Sqr(lhs, y, 1)
   163  	if p256Equal(lhs, rhs) != 1 {
   164  		return errors.New("P256 point not on curve")
   165  	}
   166  	return nil
   167  }
   168  
   169  // p256LessThanP returns 1 if x < p, and 0 otherwise. Note that a p256Element is
   170  // not allowed to be equal to or greater than p, so if this function returns 0
   171  // then x is invalid.
   172  func p256LessThanP(x *p256Element) int {
   173  	var b uint64
   174  	_, b = bits.Sub64(x[0], p256P[0], b)
   175  	_, b = bits.Sub64(x[1], p256P[1], b)
   176  	_, b = bits.Sub64(x[2], p256P[2], b)
   177  	_, b = bits.Sub64(x[3], p256P[3], b)
   178  	return int(b)
   179  }
   180  
   181  // p256Add sets res = x + y.
   182  func p256Add(res, x, y *p256Element) {
   183  	var c, b uint64
   184  	t1 := make([]uint64, 4)
   185  	t1[0], c = bits.Add64(x[0], y[0], 0)
   186  	t1[1], c = bits.Add64(x[1], y[1], c)
   187  	t1[2], c = bits.Add64(x[2], y[2], c)
   188  	t1[3], c = bits.Add64(x[3], y[3], c)
   189  	t2 := make([]uint64, 4)
   190  	t2[0], b = bits.Sub64(t1[0], p256P[0], 0)
   191  	t2[1], b = bits.Sub64(t1[1], p256P[1], b)
   192  	t2[2], b = bits.Sub64(t1[2], p256P[2], b)
   193  	t2[3], b = bits.Sub64(t1[3], p256P[3], b)
   194  	// Three options:
   195  	//   - a+b < p
   196  	//     then c is 0, b is 1, and t1 is correct
   197  	//   - p <= a+b < 2^256
   198  	//     then c is 0, b is 0, and t2 is correct
   199  	//   - 2^256 <= a+b
   200  	//     then c is 1, b is 1, and t2 is correct
   201  	t2Mask := (c ^ b) - 1
   202  	res[0] = (t1[0] & ^t2Mask) | (t2[0] & t2Mask)
   203  	res[1] = (t1[1] & ^t2Mask) | (t2[1] & t2Mask)
   204  	res[2] = (t1[2] & ^t2Mask) | (t2[2] & t2Mask)
   205  	res[3] = (t1[3] & ^t2Mask) | (t2[3] & t2Mask)
   206  }
   207  
// p256Sqrt sets e to a square root of x. If x is not a square, p256Sqrt returns
// false and e is unchanged. e and x can overlap.
//
// The candidate root is computed into local temporaries; e is written only
// after the candidate has been squared and found equal to x.
func p256Sqrt(e, x *p256Element) (isSquare bool) {
	t0, t1 := new(p256Element), new(p256Element)

	// Since p = 3 mod 4, exponentiation by (p + 1) / 4 yields a square root candidate.
	//
	// The sequence of 7 multiplications and 253 squarings is derived from the
	// following addition chain generated with github.com/mmcloughlin/addchain v0.4.0.
	//
	//	_10       = 2*1
	//	_11       = 1 + _10
	//	_1100     = _11 << 2
	//	_1111     = _11 + _1100
	//	_11110000 = _1111 << 4
	//	_11111111 = _1111 + _11110000
	//	x16       = _11111111 << 8 + _11111111
	//	x32       = x16 << 16 + x16
	//	return      ((x32 << 32 + 1) << 96 + 1) << 94
	//
	// In the step comments below, a chain name stands for x raised to that
	// exponent; a shift by k is k squarings and an addition is a multiplication.
	p256Sqr(t0, x, 1)    // t0 = _10
	p256Mul(t0, x, t0)   // t0 = _11
	p256Sqr(t1, t0, 2)   // t1 = _1100
	p256Mul(t0, t0, t1)  // t0 = _1111
	p256Sqr(t1, t0, 4)   // t1 = _11110000
	p256Mul(t0, t0, t1)  // t0 = _11111111
	p256Sqr(t1, t0, 8)   // t1 = _11111111 << 8
	p256Mul(t0, t0, t1)  // t0 = x16
	p256Sqr(t1, t0, 16)  // t1 = x16 << 16
	p256Mul(t0, t0, t1)  // t0 = x32
	p256Sqr(t0, t0, 32)  // t0 = x32 << 32
	p256Mul(t0, x, t0)   // t0 = x32 << 32 + 1
	p256Sqr(t0, t0, 96)  // t0 = (x32 << 32 + 1) << 96
	p256Mul(t0, x, t0)   // t0 = (x32 << 32 + 1) << 96 + 1
	p256Sqr(t0, t0, 94)  // t0 = candidate root

	// Verify the candidate: it is a root only if its square equals x.
	p256Sqr(t1, t0, 1)
	if p256Equal(t1, x) != 1 {
		return false
	}
	*e = *t0
	return true
}
   251  
// The following assembly functions are implemented in p256_asm_*.s

// p256Mul is Montgomery multiplication. Sets res = in1 * in2 * R⁻¹ mod p.
// Callers in this file pass a res that aliases in1 or in2.
//
//go:noescape
func p256Mul(res, in1, in2 *p256Element)

// p256Sqr is the Montgomery square, repeated n times (n >= 1). The addition
// chains in this file use it as "shift the exponent left by n". Callers in
// this file pass a res that aliases in.
//
//go:noescape
func p256Sqr(res, in *p256Element, n int)

// p256FromMont is Montgomery multiplication by R⁻¹, or 1 outside the domain.
// Sets res = in * R⁻¹, bringing res out of the Montgomery domain. Callers in
// this file pass a res that aliases in.
//
//go:noescape
func p256FromMont(res, in *p256Element)

// p256NegCond conditionally negates val in place.
// If cond is not 0, sets val = -val mod p.
//
//go:noescape
func p256NegCond(val *p256Element, cond int)

// p256MovCond selects between two points.
// If cond is 0, sets res = b, otherwise sets res = a. Callers in this file
// pass a res that aliases a or b.
//
//go:noescape
func p256MovCond(res, a, b *P256Point, cond int)
   279  
// p256BigToLittle loads the 32-byte big-endian value in into the limbs of res.
// It does not convert into the Montgomery domain; SetBytes does that
// separately by multiplying with R×R mod p.
//
//go:noescape
func p256BigToLittle(res *p256Element, in *[32]byte)

// p256LittleToBig writes the limbs of in to res as 32 bytes; it is the
// counterpart of p256BigToLittle and is used to produce the big-endian
// coordinate encodings returned by Bytes, BytesX and BytesCompressed.
//
//go:noescape
func p256LittleToBig(res *[32]byte, in *p256Element)

// p256OrdBigToLittle loads the 32-byte big-endian scalar in into the limbs of
// res. ScalarBaseMult and ScalarMult use it to prepare their scalar argument.
//
//go:noescape
func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)

// p256OrdLittleToBig writes the limbs of in to res as 32 bytes.
// NOTE(review): presumably the exact inverse of p256OrdBigToLittle; it is not
// called in this file, so confirm against the assembly.
//
//go:noescape
func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
   291  
// p256Table is a table of the first 16 multiples of a point. Points are stored
// at an index offset of -1 so [8]P is at index 7, P is at 0, and [16]P is at 15.
// [0]P is the point at infinity and it's not stored.
type p256Table [16]P256Point

// p256Select sets res to the point at index idx in the table.
// idx must be in [0, 15]. It executes in constant time.
//
// NOTE(review): p256ScalarMult passes the digit returned by boothW5 straight
// through as idx. That digit ranges over [0, 16], and the caller treats 0 as
// "nothing selected". This suggests idx is really a 1-based multiple with 0
// reserved, which disagrees with the range documented above; confirm against
// the assembly.
//
//go:noescape
func p256Select(res *P256Point, table *p256Table, idx int)
   302  
// p256AffinePoint is a point in affine coordinates (x, y). x and y are still
// Montgomery domain elements. The point can't be the point at infinity, since
// there is no z coordinate to mark it.
type p256AffinePoint struct {
	x, y p256Element
}

// p256AffineTable is a table of the first 32 multiples of a point. Points are
// stored at an index offset of -1 like in p256Table, and [0]P is not stored.
// Each entry is 2 elements of 4 limbs, i.e. 8 uint64 values.
type p256AffineTable [32]p256AffinePoint
   312  
// p256Precomputed is a series of precomputed multiples of G, the canonical
// generator. The first p256AffineTable contains multiples of G. The second one
// multiples of [2⁶]G, the third one of [2¹²]G, and so on, where each successive
// table is the previous table doubled six times. Six is the width of the
// window used in p256BaseMult (which recodes with boothW6 and consumes one
// table per window), and having each table already pre-doubled lets us avoid
// the doublings between windows entirely. This table MUST NOT be modified, as
// it aliases into p256PrecomputedEmbed below (except on s390x, where init
// builds a converted copy).
var p256Precomputed *[43]p256AffineTable

// p256PrecomputedEmbed holds the raw bytes of the precomputed table file.
// init reinterprets these bytes in place as p256Precomputed.
//
//go:embed p256_asm_table.bin
var p256PrecomputedEmbed string
   324  
   325  func init() {
   326  	p256PrecomputedPtr := (*unsafe.Pointer)(unsafe.Pointer(&p256PrecomputedEmbed))
   327  	if runtime.GOARCH == "s390x" {
   328  		var newTable [43 * 32 * 2 * 4]uint64
   329  		for i, x := range (*[43 * 32 * 2 * 4][8]byte)(*p256PrecomputedPtr) {
   330  			newTable[i] = binary.LittleEndian.Uint64(x[:])
   331  		}
   332  		newTablePtr := unsafe.Pointer(&newTable)
   333  		p256PrecomputedPtr = &newTablePtr
   334  	}
   335  	p256Precomputed = (*[43]p256AffineTable)(*p256PrecomputedPtr)
   336  }
   337  
// p256SelectAffine sets res to the point at index idx in the table.
// idx must be in [0, 31]. It executes in constant time.
//
// NOTE(review): p256BaseMult passes the digit returned by boothW6 straight
// through as idx. That digit ranges over [0, 32], with 0 meaning "no point";
// confirm the documented range against the assembly.
//
//go:noescape
func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)

// p256PointAddAffineAsm is point addition with an affine point and constant
// time conditions.
// If zero is 0, sets res = in2. If sel is 0, sets res = in1.
// If sign is not 0, sets res = in1 + -in2. Otherwise, sets res = in1 + in2.
// p256BaseMult calls it with res aliasing in1.
//
//go:noescape
func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)

// p256PointAddAsm is point addition. Sets res = in1 + in2. Returns one if the
// two input points were equal and zero otherwise. If in1 or in2 are the point
// at infinity, res and the return value are undefined; callers such as Add
// handle those cases themselves. Callers in this file pass a res that aliases
// in1.
//
//go:noescape
func p256PointAddAsm(res, in1, in2 *P256Point) int

// p256PointDoubleAsm is point doubling. Sets res = in + in. in can be the
// point at infinity. Callers in this file pass a res that aliases in.
//
//go:noescape
func p256PointDoubleAsm(res, in *P256Point)
   362  
// p256OrdElement is a P-256 scalar field element in [0, ord(G)-1] in the
// Montgomery domain (with R = 2²⁵⁶) as four uint64 limbs in little-endian order.
//
// NOTE(review): in this file the type is only filled by p256OrdBigToLittle
// from a caller-supplied 32-byte scalar and then scanned bit by bit by
// p256BaseMult and p256ScalarMult. No Montgomery conversion or range
// reduction is visible on that path; confirm where the stated invariants are
// meant to apply.
type p256OrdElement [4]uint64
   366  
   367  // Add sets q = p1 + p2, and returns q. The points may overlap.
   368  func (q *P256Point) Add(r1, r2 *P256Point) *P256Point {
   369  	var sum, double P256Point
   370  	r1IsInfinity := r1.isInfinity()
   371  	r2IsInfinity := r2.isInfinity()
   372  	pointsEqual := p256PointAddAsm(&sum, r1, r2)
   373  	p256PointDoubleAsm(&double, r1)
   374  	p256MovCond(&sum, &double, &sum, pointsEqual)
   375  	p256MovCond(&sum, r1, &sum, r2IsInfinity)
   376  	p256MovCond(&sum, r2, &sum, r1IsInfinity)
   377  	return q.Set(&sum)
   378  }
   379  
   380  // Double sets q = p + p, and returns q. The points may overlap.
   381  func (q *P256Point) Double(p *P256Point) *P256Point {
   382  	var double P256Point
   383  	p256PointDoubleAsm(&double, p)
   384  	return q.Set(&double)
   385  }
   386  
   387  // ScalarBaseMult sets r = scalar * generator, where scalar is a 32-byte big
   388  // endian value, and returns r. If scalar is not 32 bytes long, ScalarBaseMult
   389  // returns an error and the receiver is unchanged.
   390  func (r *P256Point) ScalarBaseMult(scalar []byte) (*P256Point, error) {
   391  	if len(scalar) != 32 {
   392  		return nil, errors.New("invalid scalar length")
   393  	}
   394  	scalarReversed := new(p256OrdElement)
   395  	p256OrdBigToLittle(scalarReversed, (*[32]byte)(scalar))
   396  
   397  	r.p256BaseMult(scalarReversed)
   398  	return r, nil
   399  }
   400  
   401  // ScalarMult sets r = scalar * q, where scalar is a 32-byte big endian value,
   402  // and returns r. If scalar is not 32 bytes long, ScalarBaseMult returns an
   403  // error and the receiver is unchanged.
   404  func (r *P256Point) ScalarMult(q *P256Point, scalar []byte) (*P256Point, error) {
   405  	if len(scalar) != 32 {
   406  		return nil, errors.New("invalid scalar length")
   407  	}
   408  	scalarReversed := new(p256OrdElement)
   409  	p256OrdBigToLittle(scalarReversed, (*[32]byte)(scalar))
   410  
   411  	r.Set(q).p256ScalarMult(scalarReversed)
   412  	return r, nil
   413  }
   414  
   415  // uint64IsZero returns 1 if x is zero and zero otherwise.
   416  func uint64IsZero(x uint64) int {
   417  	x = ^x
   418  	x &= x >> 32
   419  	x &= x >> 16
   420  	x &= x >> 8
   421  	x &= x >> 4
   422  	x &= x >> 2
   423  	x &= x >> 1
   424  	return int(x & 1)
   425  }
   426  
   427  // p256Equal returns 1 if a and b are equal and 0 otherwise.
   428  func p256Equal(a, b *p256Element) int {
   429  	var acc uint64
   430  	for i := range a {
   431  		acc |= a[i] ^ b[i]
   432  	}
   433  	return uint64IsZero(acc)
   434  }
   435  
   436  // isInfinity returns 1 if p is the point at infinity and 0 otherwise.
   437  func (p *P256Point) isInfinity() int {
   438  	return p256Equal(&p.z, &p256Zero)
   439  }
   440  
   441  // Bytes returns the uncompressed or infinity encoding of p, as specified in
   442  // SEC 1, Version 2.0, Section 2.3.3. Note that the encoding of the point at
   443  // infinity is shorter than all other encodings.
   444  func (p *P256Point) Bytes() []byte {
   445  	// This function is outlined to make the allocations inline in the caller
   446  	// rather than happen on the heap.
   447  	var out [p256UncompressedLength]byte
   448  	return p.bytes(&out)
   449  }
   450  
   451  func (p *P256Point) bytes(out *[p256UncompressedLength]byte) []byte {
   452  	// The proper representation of the point at infinity is a single zero byte.
   453  	if p.isInfinity() == 1 {
   454  		return append(out[:0], 0)
   455  	}
   456  
   457  	x, y := new(p256Element), new(p256Element)
   458  	p.affineFromMont(x, y)
   459  
   460  	out[0] = 4 // Uncompressed form.
   461  	p256LittleToBig((*[32]byte)(out[1:33]), x)
   462  	p256LittleToBig((*[32]byte)(out[33:65]), y)
   463  
   464  	return out[:]
   465  }
   466  
// affineFromMont sets (x, y) to the affine coordinates of p, converted out of the
// Montgomery domain.
//
// x and y double as scratch space and are overwritten before p's coordinates
// are read, so they must not point into p. Callers check for the point at
// infinity before calling.
func (p *P256Point) affineFromMont(x, y *p256Element) {
	p256Inverse(y, &p.z) // y = Z⁻¹
	p256Sqr(x, y, 1)     // x = Z⁻²
	p256Mul(y, y, x)     // y = Z⁻³

	p256Mul(x, &p.x, x) // x = X·Z⁻²
	p256Mul(y, &p.y, y) // y = Y·Z⁻³

	// Leave the Montgomery domain.
	p256FromMont(x, x)
	p256FromMont(y, y)
}
   480  
   481  // BytesX returns the encoding of the x-coordinate of p, as specified in SEC 1,
   482  // Version 2.0, Section 2.3.5, or an error if p is the point at infinity.
   483  func (p *P256Point) BytesX() ([]byte, error) {
   484  	// This function is outlined to make the allocations inline in the caller
   485  	// rather than happen on the heap.
   486  	var out [p256ElementLength]byte
   487  	return p.bytesX(&out)
   488  }
   489  
   490  func (p *P256Point) bytesX(out *[p256ElementLength]byte) ([]byte, error) {
   491  	if p.isInfinity() == 1 {
   492  		return nil, errors.New("P256 point is the point at infinity")
   493  	}
   494  
   495  	x := new(p256Element)
   496  	p256Inverse(x, &p.z)
   497  	p256Sqr(x, x, 1)
   498  	p256Mul(x, &p.x, x)
   499  	p256FromMont(x, x)
   500  	p256LittleToBig((*[32]byte)(out[:]), x)
   501  
   502  	return out[:], nil
   503  }
   504  
   505  // BytesCompressed returns the compressed or infinity encoding of p, as
   506  // specified in SEC 1, Version 2.0, Section 2.3.3. Note that the encoding of the
   507  // point at infinity is shorter than all other encodings.
   508  func (p *P256Point) BytesCompressed() []byte {
   509  	// This function is outlined to make the allocations inline in the caller
   510  	// rather than happen on the heap.
   511  	var out [p256CompressedLength]byte
   512  	return p.bytesCompressed(&out)
   513  }
   514  
   515  func (p *P256Point) bytesCompressed(out *[p256CompressedLength]byte) []byte {
   516  	if p.isInfinity() == 1 {
   517  		return append(out[:0], 0)
   518  	}
   519  
   520  	x, y := new(p256Element), new(p256Element)
   521  	p.affineFromMont(x, y)
   522  
   523  	out[0] = 2 | byte(y[0]&1)
   524  	p256LittleToBig((*[32]byte)(out[1:33]), x)
   525  
   526  	return out[:]
   527  }
   528  
// Select sets q to p1 if cond == 1, and to p2 if cond == 0, and returns q.
//
// The selection is delegated to p256MovCond, which is documented as choosing
// p2 when cond is 0 and p1 for any other value.
func (q *P256Point) Select(p1, p2 *P256Point, cond int) *P256Point {
	p256MovCond(q, p1, p2, cond)
	return q
}
   534  
// p256Inverse sets out to in⁻¹ mod p. If in is zero, out will be zero.
//
// All intermediate values live in local temporaries; out is written only by
// the final multiplication.
func p256Inverse(out, in *p256Element) {
	// Inversion is calculated through exponentiation by p - 2, per Fermat's
	// little theorem.
	//
	// The sequence of 12 multiplications and 255 squarings is derived from the
	// following addition chain generated with github.com/mmcloughlin/addchain
	// v0.4.0.
	//
	//  _10     = 2*1
	//  _11     = 1 + _10
	//  _110    = 2*_11
	//  _111    = 1 + _110
	//  _111000 = _111 << 3
	//  _111111 = _111 + _111000
	//  x12     = _111111 << 6 + _111111
	//  x15     = x12 << 3 + _111
	//  x16     = 2*x15 + 1
	//  x32     = x16 << 16 + x16
	//  i53     = x32 << 15
	//  x47     = x15 + i53
	//  i263    = ((i53 << 17 + 1) << 143 + x47) << 47
	//  return    (x47 + i263) << 2 + 1
	//
	// In the step comments below, a chain name stands for in raised to that
	// exponent; a shift by k is k squarings and an addition is a multiplication.
	var z = new(p256Element)
	var t0 = new(p256Element)
	var t1 = new(p256Element)

	p256Sqr(z, in, 1)     // z  = _10
	p256Mul(z, in, z)     // z  = _11
	p256Sqr(z, z, 1)      // z  = _110
	p256Mul(z, in, z)     // z  = _111
	p256Sqr(t0, z, 3)     // t0 = _111000
	p256Mul(t0, z, t0)    // t0 = _111111
	p256Sqr(t1, t0, 6)    // t1 = _111111 << 6
	p256Mul(t0, t0, t1)   // t0 = x12
	p256Sqr(t0, t0, 3)    // t0 = x12 << 3
	p256Mul(z, z, t0)     // z  = x15
	p256Sqr(t0, z, 1)     // t0 = 2*x15
	p256Mul(t0, in, t0)   // t0 = x16
	p256Sqr(t1, t0, 16)   // t1 = x16 << 16
	p256Mul(t0, t0, t1)   // t0 = x32
	p256Sqr(t0, t0, 15)   // t0 = i53
	p256Mul(z, z, t0)     // z  = x47
	p256Sqr(t0, t0, 17)   // t0 = i53 << 17
	p256Mul(t0, in, t0)   // t0 = i53 << 17 + 1
	p256Sqr(t0, t0, 143)  // t0 = (i53 << 17 + 1) << 143
	p256Mul(t0, z, t0)    // t0 = (i53 << 17 + 1) << 143 + x47
	p256Sqr(t0, t0, 47)   // t0 = i263
	p256Mul(z, z, t0)     // z  = x47 + i263
	p256Sqr(z, z, 2)      // z  = (x47 + i263) << 2
	p256Mul(out, in, z)   // out = (x47 + i263) << 2 + 1
}
   588  
   589  func boothW5(in uint) (int, int) {
   590  	var s uint = ^((in >> 5) - 1)
   591  	var d uint = (1 << 6) - in - 1
   592  	d = (d & s) | (in & (^s))
   593  	d = (d >> 1) + (d & 1)
   594  	return int(d), int(s & 1)
   595  }
   596  
   597  func boothW6(in uint) (int, int) {
   598  	var s uint = ^((in >> 6) - 1)
   599  	var d uint = (1 << 7) - in - 1
   600  	d = (d & s) | (in & (^s))
   601  	d = (d >> 1) + (d & 1)
   602  	return int(d), int(s & 1)
   603  }
   604  
// p256BaseMult sets p to scalar times the generator, using the precomputed
// tables in p256Precomputed. scalar holds little-endian limbs, as produced by
// p256OrdBigToLittle.
//
// The scalar is consumed in 43 windows. Each window value is 7 bits wide (six
// new bits plus one bit shared with the previous window) and is recoded by
// boothW6 into a magnitude, used as the table selector, and a sign. Window i
// uses table i, which is already scaled for that window's position, so no
// doublings are performed here.
func (p *P256Point) p256BaseMult(scalar *p256OrdElement) {
	var t0 p256AffinePoint

	// First window: scalar bits 0..5, with an implicit zero bit below bit 0.
	wvalue := (scalar[0] << 1) & 0x7f
	sel, sign := boothW6(uint(wvalue))
	p256SelectAffine(&t0, &p256Precomputed[0], sel)
	p.x, p.y, p.z = t0.x, t0.y, p256One
	p256NegCond(&p.y, sign)

	// index is the position of the lowest bit of the next window.
	index := uint(5)
	// zero stays 0 for as long as every selector seen so far was 0.
	zero := sel

	for i := 1; i < 43; i++ {
		if index < 192 {
			// The window may straddle two limbs: merge in the low bits of the
			// next limb.
			wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x7f
		} else {
			// The window lies entirely within the top limb.
			wvalue = (scalar[index/64] >> (index % 64)) & 0x7f
		}
		index += 6
		sel, sign = boothW6(uint(wvalue))
		p256SelectAffine(&t0, &p256Precomputed[i], sel)
		// Per its documentation, this takes t0 when zero is 0, keeps p when
		// sel is 0, and otherwise adds or subtracts t0 according to sign.
		p256PointAddAffineAsm(p, p, &t0, sign, sel, zero)
		zero |= sel
	}

	// If the whole scalar was zero, set to the point at infinity.
	p256MovCond(p, p, NewP256Point(), zero)
}
   633  
// p256ScalarMult sets p to scalar times p, in place. scalar holds
// little-endian limbs, as produced by p256OrdBigToLittle.
//
// It first builds a table of the multiples [1]p through [16]p. It then scans
// the scalar from the top in windows of five new bits plus one overlap bit,
// recoded by boothW5 into a table selector and a sign. The accumulator is
// doubled five times before each window is added. Selection and the
// final choice of result use p256Select and p256MovCond rather than branches
// on scalar bits.
func (p *P256Point) p256ScalarMult(scalar *p256OrdElement) {
	// precomp is a table of precomputed points that stores the multiples of p
	// from [1]p to [16]p, with [k]p at index k-1.
	var precomp p256Table
	var t0, t1, t2, t3 P256Point

	// Prepare the table. The trailing comment on each assignment is the
	// multiple of p being stored.
	precomp[0] = *p // 1

	p256PointDoubleAsm(&t0, p)
	p256PointDoubleAsm(&t1, &t0)
	p256PointDoubleAsm(&t2, &t1)
	p256PointDoubleAsm(&t3, &t2)
	precomp[1] = t0  // 2
	precomp[3] = t1  // 4
	precomp[7] = t2  // 8
	precomp[15] = t3 // 16

	p256PointAddAsm(&t0, &t0, p)
	p256PointAddAsm(&t1, &t1, p)
	p256PointAddAsm(&t2, &t2, p)
	precomp[2] = t0 // 3
	precomp[4] = t1 // 5
	precomp[8] = t2 // 9

	p256PointDoubleAsm(&t0, &t0)
	p256PointDoubleAsm(&t1, &t1)
	precomp[5] = t0 // 6
	precomp[9] = t1 // 10

	p256PointAddAsm(&t2, &t0, p)
	p256PointAddAsm(&t1, &t1, p)
	precomp[6] = t2  // 7
	precomp[10] = t1 // 11

	p256PointDoubleAsm(&t0, &t0)
	p256PointDoubleAsm(&t2, &t2)
	precomp[11] = t0 // 12
	precomp[13] = t2 // 14

	p256PointAddAsm(&t0, &t0, p)
	p256PointAddAsm(&t2, &t2, p)
	precomp[12] = t0 // 13
	precomp[14] = t2 // 15

	// Start scanning the window from top bit
	index := uint(254)
	var sel, sign int

	// The top window holds only the two highest scalar bits (254 and 255), so
	// its recoded sign is not needed.
	wvalue := (scalar[index/64] >> (index % 64)) & 0x3f
	sel, _ = boothW5(uint(wvalue))

	p256Select(p, &precomp, sel)
	// zero stays 0 for as long as every selector seen so far was 0.
	zero := sel

	for index > 4 {
		index -= 5
		p256PointDoubleAsm(p, p)
		p256PointDoubleAsm(p, p)
		p256PointDoubleAsm(p, p)
		p256PointDoubleAsm(p, p)
		p256PointDoubleAsm(p, p)

		if index < 192 {
			// The window may straddle two limbs: merge in the low bits of the
			// next limb.
			wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x3f
		} else {
			// The window lies entirely within the top limb.
			wvalue = (scalar[index/64] >> (index % 64)) & 0x3f
		}

		sel, sign = boothW5(uint(wvalue))

		p256Select(&t0, &precomp, sel)
		p256NegCond(&t0.y, sign)
		p256PointAddAsm(&t1, p, &t0)
		// sel == 0: nothing to add, keep the accumulator.
		p256MovCond(&t1, &t1, p, sel)
		// zero == 0: nothing has been accumulated yet, so the result is just
		// the selected point.
		p256MovCond(p, &t1, &t0, zero)
		zero |= sel
	}

	p256PointDoubleAsm(p, p)
	p256PointDoubleAsm(p, p)
	p256PointDoubleAsm(p, p)
	p256PointDoubleAsm(p, p)
	p256PointDoubleAsm(p, p)

	// Last window: scalar bits 0..4, with an implicit zero bit below bit 0.
	wvalue = (scalar[0] << 1) & 0x3f
	sel, sign = boothW5(uint(wvalue))

	p256Select(&t0, &precomp, sel)
	p256NegCond(&t0.y, sign)
	p256PointAddAsm(&t1, p, &t0)
	p256MovCond(&t1, &t1, p, sel)
	p256MovCond(p, &t1, &t0, zero)
}