github.com/emmansun/gmsm@v0.29.1/internal/sm2ec/sm2p256_asm.go

github.com/emmansun/gmsm@v0.29.1/internal/sm2ec/sm2p256_asm.go (about)

     1  // It is by standing on the shoulders of giants.
     2  
     3  // This file contains the Go wrapper for the constant-time, 64-bit assembly
     4  // implementation of P256. The optimizations performed here are described in
     5  // detail in:
     6  // S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
     7  //                          256-bit primes"
     8  // https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
     9  // https://eprint.iacr.org/2013/816.pdf
    10  //go:build (amd64 || arm64 || s390x || ppc64le) && !purego
    11  
    12  package sm2ec
    13  
    14  import (
    15  	_ "embed"
    16  	"errors"
    17  	"math/bits"
    18  	"runtime"
    19  	"unsafe"
    20  
    21  	"golang.org/x/sys/cpu"
    22  )
    23  
    24  // p256Element is an SM2 P-256 base field element in [0, p-1] in the Montgomery
    25  // domain (with R = 2²⁵⁶), stored as four 64-bit limbs in little-endian order.
    26  type p256Element [4]uint64
    27  
    28  // p256One is one in the Montgomery domain, i.e. R mod p = 2²⁵⁶ - p.
    29  var p256One = p256Element{0x0000000000000001, 0x00000000ffffffff, 0x0000000000000000, 0x0000000100000000}
    30  
    // p256Zero is the all-zero field element. isInfinity compares a point's Z
    // coordinate against it.
    31  var p256Zero = p256Element{}
    32  
    33  // p256P is the field prime p = 2^256 - 2^224 - 2^96 + 2^64 - 1, least significant limb first.
    34  var p256P = p256Element{0xffffffffffffffff, 0xffffffff00000000,
    35  	0xffffffffffffffff, 0xfffffffeffffffff}
    36  
    37  // SM2P256Point is a point on the SM2 P-256 curve. The zero value should not be
    38  // assumed to be valid (although it is in this implementation).
    39  type SM2P256Point struct {
    40  	// (X:Y:Z) are Jacobian coordinates where x = X/Z² and y = Y/Z³. The point
    41  	// at infinity can be represented by any set of coordinates with Z = 0.
    42  	x, y, z p256Element
    43  }
    44  
    45  // NewSM2P256Point returns a new SM2P256Point representing the point at infinity.
    46  func NewSM2P256Point() *SM2P256Point {
    47  	return &SM2P256Point{
    48  		x: p256One, y: p256One, z: p256Zero,
    49  	}
    50  }
    51  
    52  // SetGenerator sets p to the canonical generator and returns p.
    53  func (p *SM2P256Point) SetGenerator() *SM2P256Point {
    54  	p.x = p256Element{0x61328990f418029e, 0x3e7981eddca6c050,
    55  		0xd6a1ed99ac24c3c3, 0x91167a5ee1c13b05}
    56  	p.y = p256Element{0xc1354e593c2d0ddd, 0xc1f5e5788d3295fa,
    57  		0x8d4cfb066e2a48f8, 0x63cd65d481d735bd}
    58  	p.z = p256One
    59  	return p
    60  }
    61  
    62  // Set sets p = q and returns p.
    63  func (p *SM2P256Point) Set(q *SM2P256Point) *SM2P256Point {
    64  	p.x, p.y, p.z = q.x, q.y, q.z
    65  	return p
    66  }
    67  
    // p256ElementLength is the size in bytes of one encoded field element.
    68  const p256ElementLength = 32
    // p256UncompressedLength is the size of an uncompressed point encoding: one
    // type byte followed by the x and y coordinates.
    69  const p256UncompressedLength = 1 + 2*p256ElementLength
    // p256CompressedLength is the size of a compressed point encoding: one type
    // byte followed by the x coordinate.
    70  const p256CompressedLength = 1 + p256ElementLength
    71  
    72  // SetBytes sets p to the compressed, uncompressed, or infinity value encoded in
    73  // b, as specified in SEC 1, Version 2.0, Section 2.3.4. If the point is not on
    74  // the curve, it returns nil and an error, and the receiver is unchanged.
    75  // Otherwise, it returns p.
    76  func (p *SM2P256Point) SetBytes(b []byte) (*SM2P256Point, error) {
    77  	// p256Mul operates in the Montgomery domain with R = 2²⁵⁶ mod p. Thus rr
    78  	// here is R in the Montgomery domain, or R×R mod p. See comment in
    79  	// P256OrdInverse about how this is used.
    80  	rr := p256Element{0x0000000200000003, 0x00000002ffffffff,
    81  		0x0000000100000001, 0x0000000400000002}
    82  
    83  	switch {
    84  	// Point at infinity.
    85  	case len(b) == 1 && b[0] == 0:
    86  		return p.Set(NewSM2P256Point()), nil
    87  
    88  	// Uncompressed form.
    89  	case len(b) == p256UncompressedLength && b[0] == 4:
    90  		var r SM2P256Point
    91  		p256BigToLittle(&r.x, (*[32]byte)(b[1:33]))
    92  		p256BigToLittle(&r.y, (*[32]byte)(b[33:65]))
    93  		if p256LessThanP(&r.x) == 0 || p256LessThanP(&r.y) == 0 {
    94  			return nil, errors.New("invalid P256 element encoding")
    95  		}
    96  		p256Mul(&r.x, &r.x, &rr)
    97  		p256Mul(&r.y, &r.y, &rr)
    98  		if err := p256CheckOnCurve(&r.x, &r.y); err != nil {
    99  			return nil, err
   100  		}
   101  		r.z = p256One
   102  		return p.Set(&r), nil
   103  
   104  	// Compressed form.
   105  	case len(b) == p256CompressedLength && (b[0] == 2 || b[0] == 3):
   106  		var r SM2P256Point
   107  		p256BigToLittle(&r.x, (*[32]byte)(b[1:33]))
   108  		if p256LessThanP(&r.x) == 0 {
   109  			return nil, errors.New("invalid P256 element encoding")
   110  		}
   111  		p256Mul(&r.x, &r.x, &rr)
   112  
   113  		// y² = x³ - 3x + b
   114  		p256Polynomial(&r.y, &r.x)
   115  		if !p256Sqrt(&r.y, &r.y) {
   116  			return nil, errors.New("invalid P256 compressed point encoding")
   117  		}
   118  
   119  		// Select the positive or negative root, as indicated by the least
   120  		// significant bit, based on the encoding type byte.
   121  		yy := new(p256Element)
   122  		p256FromMont(yy, &r.y)
   123  		cond := int(yy[0]&1) ^ int(b[0]&1)
   124  		p256NegCond(&r.y, cond)
   125  
   126  		r.z = p256One
   127  		return p.Set(&r), nil
   128  
   129  	default:
   130  		return nil, errors.New("invalid P256 point encoding")
   131  	}
   132  }
   133  
   134  // p256Polynomial sets y2 to x³ - 3x + b, and returns y2.
   135  func p256Polynomial(y2, x *p256Element) *p256Element {
   136  	x3 := new(p256Element)
   137  	p256Sqr(x3, x, 1)
   138  	p256Mul(x3, x3, x)
   139  
   140  	threeX := new(p256Element)
   141  	p256Add(threeX, x, x)
   142  	p256Add(threeX, threeX, x)
   143  	p256NegCond(threeX, 1)
   144  
   145  	p256B := &p256Element{0x90d230632bc0dd42, 0x71cf379ae9b537ab,
   146  		0x527981505ea51c3c, 0x240fe188ba20e2c8}
   147  
   148  	p256Add(x3, x3, threeX)
   149  	p256Add(x3, x3, p256B)
   150  
   151  	*y2 = *x3
   152  	return y2
   153  }
   154  
   155  func p256CheckOnCurve(x, y *p256Element) error {
   156  	// y² = x³ - 3x + b
   157  	rhs := p256Polynomial(new(p256Element), x)
   158  	lhs := new(p256Element)
   159  	p256Sqr(lhs, y, 1)
   160  	if p256Equal(lhs, rhs) != 1 {
   161  		return errors.New("point not on SM2 P256 curve")
   162  	}
   163  	return nil
   164  }
   165  
   166  // p256LessThanP returns 1 if x < p, and 0 otherwise. Note that a p256Element is
   167  // not allowed to be equal to or greater than p, so if this function returns 0
   168  // then x is invalid.
   169  func p256LessThanP(x *p256Element) int {
   170  	var b uint64
   171  	_, b = bits.Sub64(x[0], p256P[0], b)
   172  	_, b = bits.Sub64(x[1], p256P[1], b)
   173  	_, b = bits.Sub64(x[2], p256P[2], b)
   174  	_, b = bits.Sub64(x[3], p256P[3], b)
   175  	return int(b)
   176  }
   177  
   178  // p256Add sets res = x + y.
   179  func p256Add(res, x, y *p256Element) {
   180  	var c, b uint64
   181  	t1 := make([]uint64, 4)
   182  	t1[0], c = bits.Add64(x[0], y[0], 0)
   183  	t1[1], c = bits.Add64(x[1], y[1], c)
   184  	t1[2], c = bits.Add64(x[2], y[2], c)
   185  	t1[3], c = bits.Add64(x[3], y[3], c)
   186  	t2 := make([]uint64, 4)
   187  	t2[0], b = bits.Sub64(t1[0], p256P[0], 0)
   188  	t2[1], b = bits.Sub64(t1[1], p256P[1], b)
   189  	t2[2], b = bits.Sub64(t1[2], p256P[2], b)
   190  	t2[3], b = bits.Sub64(t1[3], p256P[3], b)
   191  	// Three options:
   192  	//   - a+b < p
   193  	//     then c is 0, b is 1, and t1 is correct
   194  	//   - p <= a+b < 2^256
   195  	//     then c is 0, b is 0, and t2 is correct
   196  	//   - 2^256 <= a+b
   197  	//     then c is 1, b is 1, and t2 is correct
   198  	t2Mask := (c ^ b) - 1
   199  	res[0] = (t1[0] & ^t2Mask) | (t2[0] & t2Mask)
   200  	res[1] = (t1[1] & ^t2Mask) | (t2[1] & t2Mask)
   201  	res[2] = (t1[2] & ^t2Mask) | (t2[2] & t2Mask)
   202  	res[3] = (t1[3] & ^t2Mask) | (t2[3] & t2Mask)
   203  }
   204  
   205  // p256Sqrt sets e to a square root of x. If x is not a square, p256Sqrt returns
   206  // false and e is unchanged. e and x can overlap.
   207  func p256Sqrt(e, x *p256Element) (isSquare bool) {
   208  	z, t0, t1, t2, t3, t4 := new(p256Element), new(p256Element), new(p256Element), new(p256Element), new(p256Element), new(p256Element)
   209  
   210  	// Since p = 3 mod 4, exponentiation by (p + 1) / 4 yields a square root candidate.
   211  	//
   212  	// The sequence of 13 multiplications and 253 squarings is derived from the
   213  	// following addition chain generated with github.com/mmcloughlin/addchain v0.4.0.
   214  	//
   215  	//	_10      = 2*1
   216  	//	_11      = 1 + _10
   217  	//	_110     = 2*_11
   218  	//	_111     = 1 + _110
   219  	//	_1110    = 2*_111
   220  	//	_1111    = 1 + _1110
   221  	//	_11110   = 2*_1111
   222  	//	_111100  = 2*_11110
   223  	//	_1111000 = 2*_111100
   224  	//	i19      = (_1111000 << 3 + _111100) << 5 + _1111000
   225  	//	x31      = (i19 << 2 + _11110) << 14 + i19 + _111
   226  	//	i42      = x31 << 4
   227  	//	i73      = i42 << 31
   228  	//	i74      = i42 + i73
   229  	//	i171     = (i73 << 32 + i74) << 62 + i74 + _1111
   230  	//	return     (i171 << 32 + 1) << 62
   231  	//
   232  	p256Sqr(z, x, 1)   // z.Square(x)
   233  	p256Mul(z, x, z)   // z.Mul(x, z)
   234  	p256Sqr(z, z, 1)   // z.Square(z)
   235  	p256Mul(t0, x, z)  // t0.Mul(x, z)
   236  	p256Sqr(z, t0, 1)  // z.Square(t0)
   237  	p256Mul(z, x, z)   // z.Mul(x, z)
   238  	p256Sqr(t2, z, 1)  // t2.Square(z)
   239  	p256Sqr(t3, t2, 1) // t3.Square(t2)
   240  	p256Sqr(t1, t3, 1) // t1.Square(t3)
   241  	// t4.Square(t1)
   242  	//for s := 1; s < 3; s++ {
   243  	//	t4.Square(t4)
   244  	//}
   245  	p256Sqr(t4, t1, 3)
   246  	p256Mul(t3, t3, t4) // t3.Mul(t3, t4)
   247  	//for s := 0; s < 5; s++ {
   248  	//	t3.Square(t3)
   249  	//}
   250  	p256Sqr(t3, t3, 5)
   251  	p256Mul(t1, t1, t3) // t1.Mul(t1, t3)
   252  	//t3.Square(t1)
   253  	//for s := 1; s < 2; s++ {
   254  	//	t3.Square(t3)
   255  	//}
   256  	p256Sqr(t3, t1, 2)
   257  	p256Mul(t2, t2, t3) // t2.Mul(t2, t3)
   258  	//for s := 0; s < 14; s++ {
   259  	//	t2.Square(t2)
   260  	//}
   261  	p256Sqr(t2, t2, 14)
   262  	p256Mul(t1, t1, t2) // t1.Mul(t1, t2)
   263  
   264  	p256Mul(t0, t0, t1) // t0.Mul(t0, t1)
   265  	//for s := 0; s < 4; s++ {
   266  	//	t0.Square(t0)
   267  	//}
   268  	p256Sqr(t0, t0, 4)
   269  	//t1.Square(t0)
   270  	//for s := 1; s < 31; s++ {
   271  	//	t1.Square(t1)
   272  	//}
   273  	p256Sqr(t1, t0, 31)
   274  	p256Mul(t0, t0, t1) //t0.Mul(t0, t1)
   275  	//for s := 0; s < 32; s++ {
   276  	//	t1.Square(t1)
   277  	//}
   278  	p256Sqr(t1, t1, 32)
   279  
   280  	p256Mul(t1, t0, t1) //t1.Mul(t0, t1)
   281  	//for s := 0; s < 62; s++ {
   282  	//	t1.Square(t1)
   283  	//}
   284  	p256Sqr(t1, t1, 62)
   285  	p256Mul(t0, t0, t1) //t0.Mul(t0, t1)
   286  	p256Mul(z, z, t0)   //z.Mul(z, t0)
   287  	//for s := 0; s < 32; s++ {
   288  	//	e.Square(e)
   289  	//}
   290  	p256Sqr(z, z, 32)
   291  	p256Mul(z, z, x) // z.Mul(x, z)
   292  	//for s := 0; s < 62; s++ {
   293  	//	z.Square(z)
   294  	//}
   295  	p256Sqr(z, z, 62)
   296  
   297  	p256Sqr(t1, z, 1)
   298  	if p256Equal(t1, x) != 1 {
   299  		return false
   300  	}
   301  	*e = *z
   302  	return true
   303  }
   304  
   305  // The following assembly functions are implemented in p256_asm_*.s
   306  
   307  // supportBMI2 is true when the CPU reports both ADX and BMI2. amd64 assembly uses ADCX/ADOX/MULX.
   308  var supportBMI2 = cpu.X86.HasADX && cpu.X86.HasBMI2
   309  
   // supportAVX2 is true when the CPU reports AVX2.
   310  var supportAVX2 = cpu.X86.HasAVX2
   311  
   312  // p256Mul is the Montgomery multiplication. Sets res = in1 * in2 * R⁻¹ mod p.
   313  //
   314  //go:noescape
   315  func p256Mul(res, in1, in2 *p256Element)
   316  
   317  // p256Sqr is the Montgomery square, repeated n times (n >= 1).
   318  //
   319  //go:noescape
   320  func p256Sqr(res, in *p256Element, n int)
   321  
   322  // p256FromMont is the Montgomery multiplication by R⁻¹, or 1 outside the domain.
   323  // Sets res = in * R⁻¹, bringing res out of the Montgomery domain.
   324  //
   325  //go:noescape
   326  func p256FromMont(res, in *p256Element)
   327  
   328  // p256NegCond sets val = -val mod p if cond is not 0, and leaves val alone otherwise.
   329  //
   330  //go:noescape
   331  func p256NegCond(val *p256Element, cond int)
   332  
   333  // p256MovCond sets res = b if cond is 0, and res = a otherwise.
   334  //
   335  //go:noescape
   336  func p256MovCond(res, a, b *SM2P256Point, cond int)
   337  
   // p256BigToLittle fills res from the 32-byte value in. Callers in this file
   // pass big-endian coordinate bytes and afterwards multiply by R² to enter the
   // Montgomery domain.
   // NOTE(review): presumably a pure byte-order conversion into little-endian
   // limbs; the assembly is not shown here — confirm.
   //
   338  //go:noescape
   339  func p256BigToLittle(res *p256Element, in *[32]byte)
   340  
   // p256LittleToBig fills the 32-byte array res from the element in. Callers in
   // this file use it to produce encoded coordinate bytes.
   // NOTE(review): presumably the inverse of p256BigToLittle — confirm.
   //
   341  //go:noescape
   342  func p256LittleToBig(res *[32]byte, in *p256Element)
   343  
   // p256OrdBigToLittle is the p256OrdElement counterpart of p256BigToLittle.
   // Callers in this file apply it to 32-byte big-endian scalars.
   //
   344  //go:noescape
   345  func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
   346  
   // p256OrdLittleToBig is the p256OrdElement counterpart of p256LittleToBig.
   //
   347  //go:noescape
   348  func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
   349  
   350  // p256OrdReduce ensures s is in the range [0, ord(G)-1].
   351  //
   352  //go:noescape
   353  func p256OrdReduce(s *p256OrdElement)
   354  
   355  // p256Table is a table of the first 32 multiples of a point. Points are stored
   356  // at an index offset of -1 so [8]P is at index 7, P is at 0, and [32]P is at 31.
   357  // [0]P is the point at infinity and it's not stored.
   358  type p256Table [32]SM2P256Point
   359  
   360  // p256Select sets res to the point at index idx in the table.
   361  // idx must be in [0, limit-1]. It executes in constant time.
   // p256ScalarMult in this file always passes limit = 32, the table size.
   362  //
   363  //go:noescape
   364  func p256Select(res *SM2P256Point, table *p256Table, idx, limit int)
   365  
   366  // p256AffinePoint is a point in affine coordinates (x, y). x and y are still
   367  // Montgomery domain elements. Having no Z coordinate, it cannot represent the
   // point at infinity.
   368  type p256AffinePoint struct {
   369  	x, y p256Element
   370  }
   371  
   372  // p256AffineTable is a table of the first 32 multiples of a point, in affine
   373  // form. Points are stored at an index offset of -1 like in p256Table, and [0]P is not stored.
   374  type p256AffineTable [32]p256AffinePoint
   375  
   376  // p256Precomputed is a series of precomputed multiples of G, the canonical
   377  // generator. The first p256AffineTable contains multiples of G. The second one
   378  // multiples of [2⁶]G, the third one of [2¹²]G, and so on, where each successive
   379  // table is the previous table doubled six times. Six is the width of the
   380  // sliding window used in p256ScalarMult, and having each table already
   381  // pre-doubled lets us avoid the doublings between windows entirely. This table
   382  // MUST NOT be modified, as it aliases into p256PrecomputedEmbed below.
   // It is assigned once, in init.
   383  var p256Precomputed *[43]p256AffineTable
   384  
   // p256PrecomputedEmbed holds the contents of p256_asm_table.bin, embedded at
   // build time. init reinterprets its bytes as the p256Precomputed tables.
   //
   385  //go:embed p256_asm_table.bin
   386  var p256PrecomputedEmbed string
   387  
   388  func leUint64(b []byte) uint64 {
   389  	_ = b[7] // bounds check hint to compiler; see golang.org/issue/14808
   390  	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
   391  		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
   392  }
   393  
   394  func init() {
   395  	p256PrecomputedPtr := (*unsafe.Pointer)(unsafe.Pointer(&p256PrecomputedEmbed))
   396  	if runtime.GOARCH == "s390x" {
   397  		var newTable [43 * 32 * 2 * 4]uint64
   398  		for i, x := range (*[43 * 32 * 2 * 4][8]byte)(*p256PrecomputedPtr) {
   399  			newTable[i] = leUint64(x[:])
   400  		}
   401  		newTablePtr := unsafe.Pointer(&newTable)
   402  		p256PrecomputedPtr = &newTablePtr
   403  	}
   404  	p256Precomputed = (*[43]p256AffineTable)(*p256PrecomputedPtr)
   405  }
   406  
   407  // p256SelectAffine sets res to the point at index idx in the table.
   408  // idx must be in [0, 31]. It executes in constant time.
   // NOTE(review): p256BaseMult passes the Booth digit (0..32) directly, so the
   // index convention looks offset by one like p256Table — confirm against the assembly.
   409  //
   410  //go:noescape
   411  func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
   412  
   413  // p256PointAddAffineAsm is point addition with an affine point and constant time conditions.
   414  // If zero is 0, sets res = in2. If sel is 0, sets res = in1.
   415  // If sign is not 0, sets res = in1 + -in2. Otherwise, sets res = in1 + in2
   416  //
   417  //go:noescape
   418  func p256PointAddAffineAsm(res, in1 *SM2P256Point, in2 *p256AffinePoint, sign, sel, zero int)
   419  
   420  // p256PointAddAsm is point addition. Sets res = in1 + in2. Returns one if the two input points
   421  // were equal and zero otherwise. If in1 or in2 are the point at infinity, res
   422  // and the return value are undefined.
   423  //
   424  //go:noescape
   425  func p256PointAddAsm(res, in1, in2 *SM2P256Point) int
   426  
   427  // p256PointDoubleAsm is point doubling. Sets res = in + in. in can be the point at infinity.
   428  //
   429  //go:noescape
   430  func p256PointDoubleAsm(res, in *SM2P256Point)
   431  
   432  // p256PointDouble6TimesAsm is point doubling 6 times. in can be the point at infinity.
   433  //
   434  //go:noescape
   435  func p256PointDouble6TimesAsm(res, in *SM2P256Point)
   436  
   437  // p256OrdElement is a P-256 scalar field element in [0, ord(G)-1] in the
   438  // Montgomery domain (with R = 2²⁵⁶) as four uint64 limbs in little-endian order.
   // NOTE(review): ScalarBaseMult and ScalarMult in this file fill values of this
   // type straight from big-endian bytes with no visible Montgomery conversion —
   // confirm whether the Montgomery-domain statement applies to those uses.
   439  type p256OrdElement [4]uint64
   440  
   441  // Add sets q = p1 + p2, and returns q. The points may overlap.
   442  func (q *SM2P256Point) Add(r1, r2 *SM2P256Point) *SM2P256Point {
   443  	var sum, double SM2P256Point
   444  	r1IsInfinity := r1.isInfinity()
   445  	r2IsInfinity := r2.isInfinity()
   446  	pointsEqual := p256PointAddAsm(&sum, r1, r2)
   447  	p256PointDoubleAsm(&double, r1)
   448  	p256MovCond(&sum, &double, &sum, pointsEqual)
   449  	p256MovCond(&sum, r1, &sum, r2IsInfinity)
   450  	p256MovCond(&sum, r2, &sum, r1IsInfinity)
   451  	return q.Set(&sum)
   452  }
   453  
   454  // Double sets q = p + p, and returns q. The points may overlap.
   455  func (q *SM2P256Point) Double(p *SM2P256Point) *SM2P256Point {
   456  	var double SM2P256Point
   457  	p256PointDoubleAsm(&double, p)
   458  	return q.Set(&double)
   459  }
   460  
   461  // ScalarBaseMult sets r = scalar * generator, where scalar is a 32-byte big
   462  // endian value, and returns r. If scalar is not 32 bytes long, ScalarBaseMult
   463  // returns an error and the receiver is unchanged.
   464  func (r *SM2P256Point) ScalarBaseMult(scalar []byte) (*SM2P256Point, error) {
   465  	if len(scalar) != 32 {
   466  		return nil, errors.New("invalid scalar length")
   467  	}
   468  	scalarReversed := new(p256OrdElement)
   469  	p256OrdBigToLittle(scalarReversed, (*[32]byte)(scalar))
   470  	p256OrdReduce(scalarReversed)
   471  	r.p256BaseMult(scalarReversed)
   472  	return r, nil
   473  }
   474  
   475  // ScalarMult sets r = scalar * q, where scalar is a 32-byte big endian value,
   476  // and returns r. If scalar is not 32 bytes long, ScalarBaseMult returns an
   477  // error and the receiver is unchanged.
   478  func (r *SM2P256Point) ScalarMult(q *SM2P256Point, scalar []byte) (*SM2P256Point, error) {
   479  	if len(scalar) != 32 {
   480  		return nil, errors.New("invalid scalar length")
   481  	}
   482  	scalarReversed := new(p256OrdElement)
   483  	p256OrdBigToLittle(scalarReversed, (*[32]byte)(scalar))
   484  	p256OrdReduce(scalarReversed)
   485  	r.Set(q).p256ScalarMult(scalarReversed)
   486  	return r, nil
   487  }
   488  
   489  // uint64IsZero returns 1 if x is zero and zero otherwise.
   490  func uint64IsZero(x uint64) int {
   491  	x = ^x
   492  	x &= x >> 32
   493  	x &= x >> 16
   494  	x &= x >> 8
   495  	x &= x >> 4
   496  	x &= x >> 2
   497  	x &= x >> 1
   498  	return int(x & 1)
   499  }
   500  
   501  // p256Equal returns 1 if a and b are equal and 0 otherwise.
   502  func p256Equal(a, b *p256Element) int {
   503  	var acc uint64
   504  	for i := range a {
   505  		acc |= a[i] ^ b[i]
   506  	}
   507  	return uint64IsZero(acc)
   508  }
   509  
   510  // isInfinity returns 1 if p is the point at infinity and 0 otherwise.
   511  func (p *SM2P256Point) isInfinity() int {
   512  	return p256Equal(&p.z, &p256Zero)
   513  }
   514  
   515  // Bytes returns the uncompressed or infinity encoding of p, as specified in
   516  // SEC 1, Version 2.0, Section 2.3.3. Note that the encoding of the point at
   517  // infinity is shorter than all other encodings.
   518  func (p *SM2P256Point) Bytes() []byte {
   519  	// This function is outlined to make the allocations inline in the caller
   520  	// rather than happen on the heap.
   521  	var out [p256UncompressedLength]byte
   522  	return p.bytes(&out)
   523  }
   524  
   525  func (p *SM2P256Point) bytes(out *[p256UncompressedLength]byte) []byte {
   526  	// The proper representation of the point at infinity is a single zero byte.
   527  	if p.isInfinity() == 1 {
   528  		return append(out[:0], 0)
   529  	}
   530  
   531  	x, y := new(p256Element), new(p256Element)
   532  	p.affineFromMont(x, y)
   533  
   534  	out[0] = 4 // Uncompressed form.
   535  	p256LittleToBig((*[32]byte)(out[1:33]), x)
   536  	p256LittleToBig((*[32]byte)(out[33:65]), y)
   537  
   538  	return out[:]
   539  }
   540  
   541  // affineFromMont sets (x, y) to the affine coordinates of p, converted out of the
   542  // Montgomery domain.
   543  func (p *SM2P256Point) affineFromMont(x, y *p256Element) {
   544  	p256Inverse(y, &p.z)
   545  	p256Sqr(x, y, 1)
   546  	p256Mul(y, y, x)
   547  
   548  	p256Mul(x, &p.x, x)
   549  	p256Mul(y, &p.y, y)
   550  
   551  	p256FromMont(x, x)
   552  	p256FromMont(y, y)
   553  }
   554  
   555  // BytesX returns the encoding of the x-coordinate of p, as specified in SEC 1,
   556  // Version 2.0, Section 2.3.5, or an error if p is the point at infinity.
   557  func (p *SM2P256Point) BytesX() ([]byte, error) {
   558  	// This function is outlined to make the allocations inline in the caller
   559  	// rather than happen on the heap.
   560  	var out [p256ElementLength]byte
   561  	return p.bytesX(&out)
   562  }
   563  
   564  func (p *SM2P256Point) bytesX(out *[p256ElementLength]byte) ([]byte, error) {
   565  	if p.isInfinity() == 1 {
   566  		return nil, errors.New("SM2 point is the point at infinity")
   567  	}
   568  
   569  	x := new(p256Element)
   570  	p256Inverse(x, &p.z)
   571  	p256Sqr(x, x, 1)
   572  	p256Mul(x, &p.x, x)
   573  	p256FromMont(x, x)
   574  	p256LittleToBig((*[32]byte)(out[:]), x)
   575  
   576  	return out[:], nil
   577  }
   578  
   579  // BytesCompressed returns the compressed or infinity encoding of p, as
   580  // specified in SEC 1, Version 2.0, Section 2.3.3. Note that the encoding of the
   581  // point at infinity is shorter than all other encodings.
   582  func (p *SM2P256Point) BytesCompressed() []byte {
   583  	// This function is outlined to make the allocations inline in the caller
   584  	// rather than happen on the heap.
   585  	var out [p256CompressedLength]byte
   586  	return p.bytesCompressed(&out)
   587  }
   588  
   589  func (p *SM2P256Point) bytesCompressed(out *[p256CompressedLength]byte) []byte {
   590  	if p.isInfinity() == 1 {
   591  		return append(out[:0], 0)
   592  	}
   593  
   594  	x, y := new(p256Element), new(p256Element)
   595  	p.affineFromMont(x, y)
   596  
   597  	out[0] = 2 | byte(y[0]&1)
   598  	p256LittleToBig((*[32]byte)(out[1:33]), x)
   599  
   600  	return out[:]
   601  }
   602  
   603  // Select sets q to p1 if cond == 1, and to p2 if cond == 0.
   604  func (q *SM2P256Point) Select(p1, p2 *SM2P256Point, cond int) *SM2P256Point {
   605  	p256MovCond(q, p1, p2, cond)
   606  	return q
   607  }
   608  
   609  // p256Inverse sets out to in⁻¹ mod p. If in is zero, out will be zero.
   610  func p256Inverse(out, in *p256Element) {
   611  	// Inversion is calculated through exponentiation by p - 2, per Fermat's
   612  	// little theorem.
   613  	//
   614  	// The sequence of 14 multiplications and 255 squarings is derived from the
   615  	// following addition chain generated with github.com/mmcloughlin/addchain
   616  	// v0.4.0.
   617  	//
   618  	//      _10      = 2*1
   619  	//      _11      = 1 + _10
   620  	//      _110     = 2*_11
   621  	//      _111     = 1 + _110
   622  	//      _111000  = _111 << 3
   623  	//      _111111  = _111 + _111000
   624  	//      _1111110 = 2*_111111
   625  	//      _1111111 = 1 + _1111110
   626  	//      x12      = _1111110 << 5 + _111111
   627  	//      x24      = x12 << 12 + x12
   628  	//      x31      = x24 << 7 + _1111111
   629  	//      i39      = x31 << 2
   630  	//      i68      = i39 << 29
   631  	//      x62      = x31 + i68
   632  	//      i71      = i68 << 2
   633  	//      x64      = i39 + i71 + _11
   634  	//      i265     = ((i71 << 32 + x64) << 64 + x64) << 94
   635  	//      return     (x62 + i265) << 2 + 1
   636  	// Allocate Temporaries.
   637  	var (
   638  		t0 = new(p256Element)
   639  		t1 = new(p256Element)
   640  		t2 = new(p256Element)
   641  	)
   642  	// Step 1: z = x^0x2
   643  	//z.Sqr(x)
   644  	p256Sqr(out, in, 1)
   645  
   646  	// Step 2: t0 = x^0x3
   647  	// t0.Mul(x, z)
   648  	p256Mul(t0, in, out)
   649  
   650  	// Step 3: z = x^0x6
   651  	// z.Sqr(t0)
   652  	p256Sqr(out, t0, 1)
   653  
   654  	// Step 4: z = x^0x7
   655  	// z.Mul(x, z)
   656  	p256Mul(out, in, out)
   657  
   658  	// Step 7: t1 = x^0x38
   659  	//t1.Sqr(z)
   660  	//for s := 1; s < 3; s++ {
   661  	//	t1.Sqr(t1)
   662  	//}
   663  	p256Sqr(t1, out, 3)
   664  
   665  	// Step 8: t1 = x^0x3f
   666  	//t1.Mul(z, t1)
   667  	p256Mul(t1, out, t1)
   668  
   669  	// Step 9: t2 = x^0x7e
   670  	//t2.Sqr(t1)
   671  	p256Sqr(t2, t1, 1)
   672  
   673  	// Step 10: z = x^0x7f
   674  	//z.Mul(x, t2)
   675  	p256Mul(out, in, t2)
   676  
   677  	// Step 15: t2 = x^0xfc0
   678  	//for s := 0; s < 5; s++ {
   679  	//	t2.Sqr(t2)
   680  	//}
   681  	p256Sqr(t2, t2, 5)
   682  
   683  	// Step 16: t1 = x^0xfff
   684  	//t1.Mul(t1, t2)
   685  	p256Mul(t1, t1, t2)
   686  
   687  	// Step 28: t2 = x^0xfff000
   688  	//t2.Sqr(t1)
   689  	//for s := 1; s < 12; s++ {
   690  	//	t2.Sqr(t2)
   691  	//}
   692  	p256Sqr(t2, t1, 12)
   693  
   694  	// Step 29: t1 = x^0xffffff
   695  	//t1.Mul(t1, t2)
   696  	p256Mul(t1, t1, t2)
   697  
   698  	// Step 36: t1 = x^0x7fffff80
   699  	//for s := 0; s < 7; s++ {
   700  	//	t1.Sqr(t1)
   701  	//}
   702  	p256Sqr(t1, t1, 7)
   703  
   704  	// Step 37: z = x^0x7fffffff
   705  	//z.Mul(z, t1)
   706  	p256Mul(out, out, t1)
   707  
   708  	// Step 39: t2 = x^0x1fffffffc
   709  	//t2.Sqr(z)
   710  	//for s := 1; s < 2; s++ {
   711  	//	t2.Sqr(t2)
   712  	//}
   713  	p256Sqr(t2, out, 2)
   714  
   715  	// Step 68: t1 = x^0x3fffffff80000000
   716  	//t1.Sqr(t2)
   717  	//for s := 1; s < 29; s++ {
   718  	//	t1.Sqr(t1)
   719  	//}
   720  	p256Sqr(t1, t2, 29)
   721  
   722  	// Step 69: z = x^0x3fffffffffffffff
   723  	//z.Mul(z, t1)
   724  	p256Mul(out, out, t1)
   725  
   726  	// Step 71: t1 = x^0xfffffffe00000000
   727  	//for s := 0; s < 2; s++ {
   728  	//	t1.Sqr(t1)
   729  	//}
   730  	p256Sqr(t1, t1, 2)
   731  
   732  	// Step 72: t2 = x^0xfffffffffffffffc
   733  	//t2.Mul(t2, t1)
   734  	p256Mul(t2, t2, t1)
   735  
   736  	// Step 73: t0 = x^0xffffffffffffffff
   737  	//t0.Mul(t0, t2)
   738  	p256Mul(t0, t0, t2)
   739  
   740  	// Step 105: t1 = x^0xfffffffe0000000000000000
   741  	//for s := 0; s < 32; s++ {
   742  	//	t1.Sqr(t1)
   743  	//}
   744  	p256Sqr(t1, t1, 32)
   745  
   746  	// Step 106: t1 = x^0xfffffffeffffffffffffffff
   747  	//t1.Mul(t0, t1)
   748  	p256Mul(t1, t0, t1)
   749  
   750  	// Step 170: t1 = x^0xfffffffeffffffffffffffff0000000000000000
   751  	//for s := 0; s < 64; s++ {
   752  	//	t1.Sqr(t1)
   753  	//}
   754  	p256Sqr(t1, t1, 64)
   755  
   756  	// Step 171: t0 = x^0xfffffffeffffffffffffffffffffffffffffffff
   757  	//t0.Mul(t0, t1)
   758  	p256Mul(t0, t0, t1)
   759  
   760  	// Step 265: t0 = x^0x3fffffffbfffffffffffffffffffffffffffffffc00000000000000000000000
   761  	//for s := 0; s < 94; s++ {
   762  	//	t0.Sqr(t0)
   763  	//}
   764  	p256Sqr(t0, t0, 94)
   765  
   766  	// Step 266: z = x^0x3fffffffbfffffffffffffffffffffffffffffffc00000003fffffffffffffff
   767  	//z.Mul(z, t0)
   768  	p256Mul(out, out, t0)
   769  
   770  	// Step 268: z = x^0xfffffffeffffffffffffffffffffffffffffffff00000000fffffffffffffffc
   771  	//for s := 0; s < 2; s++ {
   772  	//	z.Sqr(z)
   773  	//}
   774  	p256Sqr(out, out, 2)
   775  
   776  	// Step 269: z = x^0xfffffffeffffffffffffffffffffffffffffffff00000000fffffffffffffffd
   777  	//z.Mul(x, z)
   778  	p256Mul(out, in, out)
   779  }
   780  
   781  // This function takes those six bits as an integer (0 .. 63), writing the
   782  // recoded digit to *sign (0 for positive, 1 for negative) and *digit (absolute
   783  // value, in the range 0 .. 16).  Note that this integer essentially provides
   784  // the input bits "shifted to the left" by one position: for example, the input
   785  // to compute the least significant recoded digit, given that there's no bit
   786  // b_-1, has to be b_4 b_3 b_2 b_1 b_0 0.
   787  //
   788  // Reference:
   789  // https://github.com/openssl/openssl/blob/master/crypto/ec/ecp_nistputil.c
   790  // https://github.com/google/boringssl/blob/master/crypto/fipsmodule/ec/util.c
   791  func boothW5(in uint) (int, int) {
   792  	var s uint = ^((in >> 5) - 1)  // sets all bits to MSB(in), 'in' seen as 6-bit value
   793  	var d uint = (1 << 6) - in - 1 // d = 63 - in, or d = ^in & 0x3f
   794  	d = (d & s) | (in & (^s))      // d = in if in < 2^5; otherwise, d = 63 - in
   795  	d = (d >> 1) + (d & 1)         // d = (d + 1) / 2
   796  	return int(d), int(s & 1)
   797  }
   798  
   799  func boothW6(in uint) (int, int) {
   800  	var s uint = ^((in >> 6) - 1)
   801  	var d uint = (1 << 7) - in - 1
   802  	d = (d & s) | (in & (^s))
   803  	d = (d >> 1) + (d & 1)
   804  	return int(d), int(s & 1)
   805  }
   806  
   807  func (p *SM2P256Point) p256BaseMult(scalar *p256OrdElement) {
   808  	var t0 p256AffinePoint
   809  
   810  	wvalue := (scalar[0] << 1) & 0x7f
   811  	sel, sign := boothW6(uint(wvalue))
   812  	p256SelectAffine(&t0, &p256Precomputed[0], sel)
   813  	p.x, p.y, p.z = t0.x, t0.y, p256One
   814  	p256NegCond(&p.y, sign)
   815  
   816  	index := uint(5)
   817  	zero := sel
   818  
   819  	for i := 1; i < 43; i++ {
   820  		if index >= 192 {
   821  			wvalue = (scalar[3] >> (index & 63)) & 0x7f
   822  		} else if index >= 128 {
   823  			wvalue = ((scalar[2] >> (index & 63)) + (scalar[3] << (64 - (index & 63)))) & 0x7f
   824  		} else if index >= 64 {
   825  			wvalue = ((scalar[1] >> (index & 63)) + (scalar[2] << (64 - (index & 63)))) & 0x7f
   826  		} else {
   827  			wvalue = ((scalar[0] >> (index & 63)) + (scalar[1] << (64 - (index & 63)))) & 0x7f
   828  		}
   829  		index += 6
   830  		sel, sign = boothW6(uint(wvalue))
   831  		p256SelectAffine(&t0, &p256Precomputed[i], sel)
   832  		p256PointAddAffineAsm(p, p, &t0, sign, sel, zero)
   833  		zero |= sel
   834  	}
   835  
   836  	// If the whole scalar was zero, set to the point at infinity.
   837  	p256MovCond(p, p, NewSM2P256Point(), zero)
   838  }
   839  
   840  func (p *SM2P256Point) p256ScalarMult(scalar *p256OrdElement) {
   841  	// precomp is a table of precomputed points that stores powers of p
   842  	// from p^1 to p^32.
   843  	var precomp p256Table
   844  	var t0, t1 SM2P256Point
   845  
   846  	// Prepare the table
   847  	precomp[0] = *p // 1
   848  
   849  	p256PointDoubleAsm(&precomp[1], p)             //2
   850  	p256PointAddAsm(&precomp[2], &precomp[1], p)   //3
   851  	p256PointDoubleAsm(&precomp[3], &precomp[1])   //4
   852  	p256PointAddAsm(&precomp[4], &precomp[3], p)   //5
   853  	p256PointDoubleAsm(&precomp[5], &precomp[2])   //6
   854  	p256PointAddAsm(&precomp[6], &precomp[5], p)   //7
   855  	p256PointDoubleAsm(&precomp[7], &precomp[3])   //8
   856  	p256PointAddAsm(&precomp[8], &precomp[7], p)   //9
   857  	p256PointDoubleAsm(&precomp[9], &precomp[4])   //10
   858  	p256PointAddAsm(&precomp[10], &precomp[9], p)  //11
   859  	p256PointDoubleAsm(&precomp[11], &precomp[5])  //12
   860  	p256PointAddAsm(&precomp[12], &precomp[11], p) //13
   861  	p256PointDoubleAsm(&precomp[13], &precomp[6])  //14
   862  	p256PointAddAsm(&precomp[14], &precomp[13], p) //15
   863  	p256PointDoubleAsm(&precomp[15], &precomp[7])  //16
   864  
   865  	p256PointAddAsm(&precomp[16], &precomp[15], p) //17
   866  	p256PointDoubleAsm(&precomp[17], &precomp[8])  //18
   867  	p256PointAddAsm(&precomp[18], &precomp[17], p) //19
   868  	p256PointDoubleAsm(&precomp[19], &precomp[9])  //20
   869  	p256PointAddAsm(&precomp[20], &precomp[19], p) //21
   870  	p256PointDoubleAsm(&precomp[21], &precomp[10]) //22
   871  	p256PointAddAsm(&precomp[22], &precomp[21], p) //23
   872  	p256PointDoubleAsm(&precomp[23], &precomp[11]) //24
   873  	p256PointAddAsm(&precomp[24], &precomp[23], p) //25
   874  	p256PointDoubleAsm(&precomp[25], &precomp[12]) //26
   875  	p256PointAddAsm(&precomp[26], &precomp[25], p) //27
   876  	p256PointDoubleAsm(&precomp[27], &precomp[13]) //28
   877  	p256PointAddAsm(&precomp[28], &precomp[27], p) //29
   878  	p256PointDoubleAsm(&precomp[29], &precomp[14]) //30
   879  	p256PointAddAsm(&precomp[30], &precomp[29], p) //31
   880  	p256PointDoubleAsm(&precomp[31], &precomp[15]) //32
   881  
   882  	// Start scanning the window from top bit
   883  	index := uint(251)
   884  	var sel, sign int
   885  
   886  	wvalue := (scalar[index/64] >> (index % 64)) & 0x7f
   887  	sel, _ = boothW6(uint(wvalue))
   888  
   889  	p256Select(p, &precomp, sel, 32)
   890  	zero := sel
   891  
   892  	for index > 5 {
   893  		index -= 6
   894  
   895  		p256PointDouble6TimesAsm(p, p)
   896  
   897  		if index >= 192 {
   898  			wvalue = (scalar[3] >> (index & 63)) & 0x7f
   899  		} else if index >= 128 {
   900  			wvalue = ((scalar[2] >> (index & 63)) + (scalar[3] << (64 - (index & 63)))) & 0x7f
   901  		} else if index >= 64 {
   902  			wvalue = ((scalar[1] >> (index & 63)) + (scalar[2] << (64 - (index & 63)))) & 0x7f
   903  		} else {
   904  			wvalue = ((scalar[0] >> (index & 63)) + (scalar[1] << (64 - (index & 63)))) & 0x7f
   905  		}
   906  
   907  		sel, sign = boothW6(uint(wvalue))
   908  
   909  		p256Select(&t0, &precomp, sel, 32)
   910  		p256NegCond(&t0.y, sign)
   911  		p256PointAddAsm(&t1, p, &t0)
   912  		p256MovCond(&t1, &t1, p, sel)
   913  		p256MovCond(p, &t1, &t0, zero)
   914  		zero |= sel
   915  	}
   916  	p256PointDouble6TimesAsm(p, p)
   917  
   918  	wvalue = (scalar[0] << 1) & 0x7f
   919  	sel, sign = boothW6(uint(wvalue))
   920  
   921  	p256Select(&t0, &precomp, sel, 32)
   922  	p256NegCond(&t0.y, sign)
   923  	p256PointAddAsm(&t1, p, &t0)
   924  	p256MovCond(&t1, &t1, p, sel)
   925  	p256MovCond(p, &t1, &t0, zero)
   926  }