github.com/hxx258456/ccgo@v0.0.5-0.20230213014102-48b35f46f66f/sm2/p256_asm.go (about)

     1  // It is by standing on the shoulders of giants.
     2  
     3  // This file contains the Go wrapper for the constant-time, 64-bit assembly
     4  // implementation of P256. The optimizations performed here are described in
     5  // detail in:
     6  // S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
     7  //                          256-bit primes"
     8  // https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
     9  // https://eprint.iacr.org/2013/816.pdf
    10  //go:build amd64 || arm64
    11  // +build amd64 arm64
    12  
    13  package sm2
    14  
    15  /*
    16  sm2/p256_asm.go sm2p256在amd64或arm64架构下的实现
    17  */
    18  
    19  import (
    20  	"crypto/elliptic"
    21  	"math/big"
    22  )
    23  
    24  type (
    25  	// sm2的P256椭圆曲线类型,内嵌 *elliptic.CurveParams
    26  	p256Curve struct {
    27  		*elliptic.CurveParams
    28  	}
    29  	// sm2p256曲线上的座标类型
    30  	p256Point struct {
    31  		xyz [12]uint64
    32  	}
    33  )
    34  
    35  var (
    36  	// 定义sm2椭圆曲线单例 : p256
    37  	p256 p256Curve
    38  )
    39  
    40  // 初始化sm2的 p256 曲线单例
    41  func initP256() {
    42  	p256.CurveParams = &elliptic.CurveParams{Name: "SM2-P-256"}
    43  	// SM2椭圆曲线公钥密码算法推荐曲线参数
    44  	// 2**256 - 2**224 - 2**96 + 2**64 - 1
    45  	p256.P, _ = new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)
    46  	p256.N, _ = new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16)
    47  	p256.B, _ = new(big.Int).SetString("28E9FA9E9D9F5E344D5A9E4BCF6509A7F39789F515AB8F92DDBCBD414D940E93", 16)
    48  	p256.Gx, _ = new(big.Int).SetString("32C4AE2C1F1981195F9904466A39C9948FE30BBFF2660BE1715A4589334C74C7", 16)
    49  	p256.Gy, _ = new(big.Int).SetString("BC3736A2F4F6779C59BDCEE36B692153D0A9877CC62A474002DF32E52139F0A0", 16)
    50  	p256.BitSize = 256
    51  }
    52  
    53  // Params 获取sm2p256曲线参数
    54  func (curve p256Curve) Params() *elliptic.CurveParams {
    55  	return curve.CurveParams
    56  }
    57  
    58  // p256曲线的蒙哥马利模乘运算。
    59  // 具体实现在对应平台的 p256_asm_*64.s , *匹配amd64或arm64。
    60  // Functions implemented in p256_asm_*64.s
    61  // Montgomery multiplication modulo P256
    62  //
    63  //go:noescape
    64  //goland:noinspection GoUnusedParameter
    65  func p256Mul(res, in1, in2 []uint64)
    66  
    67  // p256曲线的蒙哥马利幂方运算。
    68  // 具体实现在对应平台的 p256_asm_*64.s , *匹配amd64或arm64。
    69  // Montgomery square modulo P256, repeated n times (n >= 1)
    70  //
    71  //go:noescape
    72  //goland:noinspection GoUnusedParameter
    73  func p256Sqr(res, in []uint64, n int)
    74  
    75  // p256曲线的蒙哥马利乘1运算。
    76  // 具体实现在对应平台的 p256_asm_*64.s , *匹配amd64或arm64。
    77  // Montgomery multiplication by 1
    78  //
    79  //go:noescape
    80  //goland:noinspection GoUnusedParameter
    81  func p256FromMont(res, in []uint64)
    82  
    83  // p256曲线的按条件求补(取反)指令。
    84  // 具体实现在对应平台的 p256_asm_*64.s , *匹配amd64或arm64。
    85  // iff cond == 1  val <- -val
    86  //
    87  //go:noescape
    88  //goland:noinspection GoUnusedParameter
    89  func p256NegCond(val []uint64, cond int)
    90  
    91  // p256曲线的按条件传送指令。
    92  // 具体实现在对应平台的 p256_asm_*64.s , *匹配amd64或arm64。
    93  // if cond == 0 res <- b; else res <- a
    94  //
    95  //go:noescape
    96  //goland:noinspection GoUnusedParameter
    97  func p256MovCond(res, a, b []uint64, cond int)
    98  
    99  // p256曲线的字节序交换运算(大端序转小端序)。
   100  // 具体实现在对应平台的 p256_asm_*64.s , *匹配amd64或arm64。
   101  // Endianness swap
   102  //
   103  //go:noescape
   104  //goland:noinspection GoUnusedParameter
   105  func p256BigToLittle(res []uint64, in []byte)
   106  
   107  // p256曲线的字节序交换运算(小端序转大端序)。
   108  // 具体实现在对应平台的 p256_asm_*64.s , *匹配amd64或arm64。
   109  //
   110  //go:noescape
   111  //goland:noinspection GoUnusedParameter
   112  func p256LittleToBig(res []byte, in []uint64)
   113  
   114  // Constant time table access
   115  //
   116  //go:noescape
   117  //goland:noinspection GoUnusedParameter
   118  func p256Select(point, table []uint64, idx int)
   119  
   120  //go:noescape
   121  //goland:noinspection GoUnusedParameter
   122  func p256SelectBase(point *[12]uint64, table string, idx int)
   123  
   124  // p256曲线的蒙哥马利Ord(G)模乘运算。
   125  // 具体实现在对应平台的 p256_asm_*64.s , *匹配amd64或arm64。
   126  // Montgomery multiplication modulo Ord(G)
   127  //
   128  //go:noescape
   129  func p256OrdMul(res, in1, in2 []uint64)
   130  
   131  // p256曲线的蒙哥马利 Ord(G)幂方运算。
   132  // 具体实现在对应平台的 p256_asm_*64.s , *匹配amd64或arm64。
   133  // Montgomery square modulo Ord(G), repeated n times
   134  //
   135  //go:noescape
   136  //goland:noinspection GoUnusedParameter
   137  func p256OrdSqr(res, in []uint64, n int)
   138  
   139  // Point add with in2 being affine point
   140  // If sign == 1 -> in2 = -in2
   141  // If sel == 0 -> res = in1
   142  // if zero == 0 -> res = in2
   143  //
   144  //go:noescape
   145  //goland:noinspection GoUnusedParameter
   146  func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)
   147  
   148  // Point add. Returns one if the two input points were equal and zero
   149  // otherwise. (Note that, due to the way that the equations work out, some
   150  // representations of ∞ are considered equal to everything by this function.)
   151  //
   152  //go:noescape
   153  //goland:noinspection GoUnusedParameter
   154  func p256PointAddAsm(res, in1, in2 []uint64) int
   155  
   156  // Point double
   157  //
   158  //go:noescape
   159  //goland:noinspection GoUnusedParameter
   160  func p256PointDoubleAsm(res, in []uint64)
   161  
   162  var p256one = []uint64{0x0000000000000001, 0x00000000ffffffff, 0x0000000000000000, 0x0000000100000000}
   163  
   164  // Inverse 利用amd64或arm64架构的CPU实现快速的 mod Params().N 的倒数运算
   165  // Inverse, implements invertible interface, used by Sign()
   166  // n-2 =
   167  // 1111111111111111111111111111111011111111111111111111111111111111
   168  // 1111111111111111111111111111111111111111111111111111111111111111
   169  // 0111001000000011110111110110101100100001110001100000010100101011
   170  // 0101001110111011111101000000100100111001110101010100000100100001
   171  func (curve p256Curve) Inverse(k *big.Int) *big.Int {
   172  	if k.Sign() < 0 {
   173  		// This should never happen.
   174  		k = new(big.Int).Neg(k)
   175  	}
   176  
   177  	if k.Cmp(p256.N) >= 0 {
   178  		// This should never happen.
   179  		k = new(big.Int).Mod(k, p256.N)
   180  	}
   181  
   182  	// table will store precomputed powers of x.
   183  	var table [4 * 10]uint64
   184  	var (
   185  		_1      = table[4*0 : 4*1]
   186  		_11     = table[4*1 : 4*2]
   187  		_101    = table[4*2 : 4*3]
   188  		_111    = table[4*3 : 4*4]
   189  		_1111   = table[4*4 : 4*5]
   190  		_10101  = table[4*5 : 4*6]
   191  		_101111 = table[4*6 : 4*7]
   192  		x       = table[4*7 : 4*8]
   193  		t       = table[4*8 : 4*9]
   194  		m       = table[4*9 : 4*10]
   195  	)
   196  
   197  	fromBig(x[:], k)
   198  	// This code operates in the Montgomery domain where R = 2^256 mod n
   199  	// and n is the order of the scalar field. (See initP256 for the
   200  	// value.) Elements in the Montgomery domain take the form a×R and
   201  	// multiplication of x and y in the calculates (x × y × R^-1) mod n. RR
   202  	// is R×R mod n thus the Montgomery multiplication x and RR gives x×R,
   203  	// i.e. converts x into the Montgomery domain.
   204  	// Window values borrowed from https://briansmith.org/ecc-inversion-addition-chains-01#p256_scalar_inversion
   205  	RR := []uint64{0x901192af7c114f20, 0x3464504ade6fa2fa, 0x620fc84c3affe0d4, 0x1eb5e412a22b3d3b}
   206  
   207  	p256OrdMul(_1, x, RR)      // _1 , 2^0
   208  	p256OrdSqr(m, _1, 1)       // _10, 2^1
   209  	p256OrdMul(_11, m, _1)     // _11, 2^1 + 2^0
   210  	p256OrdMul(_101, m, _11)   // _101, 2^2 + 2^0
   211  	p256OrdMul(_111, m, _101)  // _111, 2^2 + 2^1 + 2^0
   212  	p256OrdSqr(x, _101, 1)     // _1010, 2^3 + 2^1
   213  	p256OrdMul(_1111, _101, x) // _1111, 2^3 + 2^2 + 2^1 + 2^0
   214  
   215  	p256OrdSqr(t, x, 1)          // _10100, 2^4 + 2^2
   216  	p256OrdMul(_10101, t, _1)    // _10101, 2^4 + 2^2 + 2^0
   217  	p256OrdSqr(x, _10101, 1)     // _101010, 2^5 + 2^3 + 2^1
   218  	p256OrdMul(_101111, _101, x) // _101111, 2^5 + 2^3 + 2^2 + 2^1 + 2^0
   219  	p256OrdMul(x, _10101, x)     // _111111 = x6, 2^5 + 2^4 + 2^3 + 2^2 + 2^1 + 2^0
   220  	p256OrdSqr(t, x, 2)          // _11111100, 2^7 + 2^6 + 2^5 + 2^4 + 2^3 + 2^2
   221  
   222  	p256OrdMul(m, t, m)   // _11111110 = x8, , 2^7 + 2^6 + 2^5 + 2^4 + 2^3 + 2^2 + 2^1
   223  	p256OrdMul(t, t, _11) // _11111111 = x8, , 2^7 + 2^6 + 2^5 + 2^4 + 2^3 + 2^2 + 2^1 + 2^0
   224  	p256OrdSqr(x, t, 8)   // _ff00, 2^15 + 2^14 + 2^13 + 2^12 + 2^11 + 2^10 + 2^9 + 2^8
   225  	p256OrdMul(m, x, m)   //  _fffe
   226  	p256OrdMul(x, x, t)   // _ffff = x16, 2^15 + 2^14 + 2^13 + 2^12 + 2^11 + 2^10 + 2^9 + 2^8 + 2^7 + 2^6 + 2^5 + 2^4 + 2^3 + 2^2 + 2^1 + 2^0
   227  
   228  	p256OrdSqr(t, x, 16) // _ffff0000, 2^31 + 2^30 + 2^29 + 2^28 + 2^27 + 2^26 + 2^25 + 2^24 + 2^23 + 2^22 + 2^21 + 2^20 + 2^19 + 2^18 + 2^17 + 2^16
   229  	p256OrdMul(m, t, m)  // _fffffffe
   230  	p256OrdMul(t, t, x)  // _ffffffff = x32
   231  
   232  	p256OrdSqr(x, m, 32) // _fffffffe00000000
   233  	p256OrdMul(x, x, t)  // _fffffffeffffffff
   234  	p256OrdSqr(x, x, 32) // _fffffffeffffffff00000000
   235  	p256OrdMul(x, x, t)  // _fffffffeffffffffffffffff
   236  	p256OrdSqr(x, x, 32) // _fffffffeffffffffffffffff00000000
   237  	p256OrdMul(x, x, t)  // _fffffffeffffffffffffffffffffffff
   238  
   239  	sqrs := []uint8{
   240  		4, 3, 11, 5, 3, 5, 1,
   241  		3, 7, 5, 9, 7, 5, 5,
   242  		4, 5, 2, 2, 7, 3, 5,
   243  		5, 6, 2, 6, 3, 5,
   244  	}
   245  	muls := [][]uint64{
   246  		_111, _1, _1111, _1111, _101, _10101, _1,
   247  		_1, _111, _11, _101, _10101, _10101, _111,
   248  		_111, _1111, _11, _1, _1, _1, _111,
   249  		_111, _10101, _1, _1, _1, _1}
   250  
   251  	for i, s := range sqrs {
   252  		p256OrdSqr(x, x, int(s))
   253  		p256OrdMul(x, x, muls[i])
   254  	}
   255  
   256  	// Multiplying by one in the Montgomery domain converts a Montgomery
   257  	// value out of the domain.
   258  	one := []uint64{1, 0, 0, 0}
   259  	p256OrdMul(x, x, one)
   260  
   261  	xOut := make([]byte, 32)
   262  	p256LittleToBig(xOut, x)
   263  	return new(big.Int).SetBytes(xOut)
   264  }
   265  
   266  // fromBig converts a *big.Int into a format used by this code.
   267  func fromBig(out []uint64, big *big.Int) {
   268  	for i := range out {
   269  		out[i] = 0
   270  	}
   271  
   272  	for i, v := range big.Bits() {
   273  		out[i] = uint64(v)
   274  	}
   275  }
   276  
   277  // p256GetScalar endian-swaps the big-endian scalar value from in and writes it
   278  // to out. If the scalar is equal or greater than the order of the group, it's
   279  // reduced modulo that order.
   280  func p256GetScalar(out []uint64, in []byte) {
   281  	n := new(big.Int).SetBytes(in)
   282  
   283  	if n.Cmp(p256.N) >= 0 {
   284  		n.Mod(n, p256.N)
   285  	}
   286  	fromBig(out, n)
   287  }
   288  
   289  // p256Mul operates in a Montgomery domain with R = 2^256 mod p, where p is the
   290  // underlying field of the curve. (See initP256 for the value.) Thus rr here is
   291  // R×R mod p. See comment in Inverse about how this is used.
   292  var rr = []uint64{0x200000003, 0x2ffffffff, 0x100000001, 0x400000002}
   293  
   294  func maybeReduceModP(in *big.Int) *big.Int {
   295  	if in.Cmp(p256.P) < 0 {
   296  		return in
   297  	}
   298  	return new(big.Int).Mod(in, p256.P)
   299  }
   300  
   301  // CombinedMult 利用amd64或arm64架构的cpu指令集功能加速曲线上的乘法运算
   302  func (curve p256Curve) CombinedMult(bigX, bigY *big.Int, baseScalar, scalar []byte) (x, y *big.Int) {
   303  	scalarReversed := make([]uint64, 4)
   304  	var r1, r2 p256Point
   305  	p256GetScalar(scalarReversed, baseScalar)
   306  	r1IsInfinity := scalarIsZero(scalarReversed)
   307  	r1.p256BaseMult(scalarReversed)
   308  
   309  	p256GetScalar(scalarReversed, scalar)
   310  	r2IsInfinity := scalarIsZero(scalarReversed)
   311  	fromBig(r2.xyz[0:4], maybeReduceModP(bigX))
   312  	fromBig(r2.xyz[4:8], maybeReduceModP(bigY))
   313  	p256Mul(r2.xyz[0:4], r2.xyz[0:4], rr[:])
   314  	p256Mul(r2.xyz[4:8], r2.xyz[4:8], rr[:])
   315  
   316  	// This sets r2's Z value to 1, in the Montgomery domain.
   317  	r2.xyz[8] = p256one[0]
   318  	r2.xyz[9] = p256one[1]
   319  	r2.xyz[10] = p256one[2]
   320  	r2.xyz[11] = p256one[3]
   321  
   322  	r2.p256ScalarMult(scalarReversed)
   323  
   324  	var sum, double p256Point
   325  	pointsEqual := p256PointAddAsm(sum.xyz[:], r1.xyz[:], r2.xyz[:])
   326  	p256PointDoubleAsm(double.xyz[:], r1.xyz[:])
   327  	sum.CopyConditional(&double, pointsEqual)
   328  	sum.CopyConditional(&r1, r2IsInfinity)
   329  	sum.CopyConditional(&r2, r1IsInfinity)
   330  
   331  	return sum.p256PointToAffine()
   332  }
   333  
   334  // ScalarBaseMult sm2p256曲线基点乘法: k*G , k是随机数,G是曲线基点座标。
   335  // 实现的是 elliptic.Curve 接口。
   336  // ScalarBaseMult returns k*G, where G is the base point of the group
   337  // and k is an integer in big-endian form.
   338  func (curve p256Curve) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
   339  	scalarReversed := make([]uint64, 4)
   340  	p256GetScalar(scalarReversed, scalar)
   341  
   342  	var r p256Point
   343  	r.p256BaseMult(scalarReversed)
   344  	return r.p256PointToAffine()
   345  }
   346  
   347  // ScalarMult sm2p256曲线乘法: k*G , k是随机数,(Bx,By)是曲线上某点座标。
   348  // 实现的是 elliptic.Curve 接口。
   349  // ScalarMult returns k*(Bx,By) where k is a number in big-endian form.
   350  func (curve p256Curve) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
   351  	scalarReversed := make([]uint64, 4)
   352  	p256GetScalar(scalarReversed, scalar)
   353  
   354  	var r p256Point
   355  	fromBig(r.xyz[0:4], maybeReduceModP(bigX))
   356  	fromBig(r.xyz[4:8], maybeReduceModP(bigY))
   357  	p256Mul(r.xyz[0:4], r.xyz[0:4], rr[:])
   358  	p256Mul(r.xyz[4:8], r.xyz[4:8], rr[:])
   359  	// This sets r2's Z value to 1, in the Montgomery domain.
   360  	r.xyz[8] = p256one[0]
   361  	r.xyz[9] = p256one[1]
   362  	r.xyz[10] = p256one[2]
   363  	r.xyz[11] = p256one[3]
   364  
   365  	r.p256ScalarMult(scalarReversed)
   366  	return r.p256PointToAffine()
   367  }
   368  
   369  // uint64IsZero returns 1 if x is zero and zero otherwise.
   370  func uint64IsZero(x uint64) int {
   371  	x = ^x
   372  	x &= x >> 32
   373  	x &= x >> 16
   374  	x &= x >> 8
   375  	x &= x >> 4
   376  	x &= x >> 2
   377  	x &= x >> 1
   378  	return int(x & 1)
   379  }
   380  
   381  // scalarIsZero returns 1 if scalar represents the zero value, and zero
   382  // otherwise.
   383  func scalarIsZero(scalar []uint64) int {
   384  	return uint64IsZero(scalar[0] | scalar[1] | scalar[2] | scalar[3])
   385  }
   386  
   387  func (p *p256Point) p256PointToAffine() (x, y *big.Int) {
   388  	zInv := make([]uint64, 4)
   389  	zInvSq := make([]uint64, 4)
   390  	p256Inverse(zInv, p.xyz[8:12])
   391  	p256Sqr(zInvSq, zInv, 1)
   392  	p256Mul(zInv, zInv, zInvSq)
   393  
   394  	p256Mul(zInvSq, p.xyz[0:4], zInvSq)
   395  	p256Mul(zInv, p.xyz[4:8], zInv)
   396  
   397  	p256FromMont(zInvSq, zInvSq)
   398  	p256FromMont(zInv, zInv)
   399  
   400  	xOut := make([]byte, 32)
   401  	yOut := make([]byte, 32)
   402  	p256LittleToBig(xOut, zInvSq)
   403  	p256LittleToBig(yOut, zInv)
   404  
   405  	return new(big.Int).SetBytes(xOut), new(big.Int).SetBytes(yOut)
   406  }
   407  
   408  // CopyConditional copies overwrites p with src if v == 1, and leaves p
   409  // unchanged if v == 0.
   410  func (p *p256Point) CopyConditional(src *p256Point, v int) {
   411  	pMask := uint64(v) - 1
   412  	srcMask := ^pMask
   413  
   414  	for i, n := range p.xyz {
   415  		p.xyz[i] = (n & pMask) | (src.xyz[i] & srcMask)
   416  	}
   417  }
   418  
   419  // p256Inverse sets out to in^-1 mod p.
   420  func p256Inverse(out, in []uint64) {
   421  	// Inversion is calculated through exponentiation by p - 2, per Fermat's
   422  	// little theorem.
   423  	//
   424  	// The sequence of 14 multiplications and 255 squarings is derived from the
   425  	// following addition chain generated with github.com/mmcloughlin/addchain
   426  	// v0.4.0.
   427  	//
   428  	//      _10      = 2*1
   429  	//      _11      = 1 + _10
   430  	//      _110     = 2*_11
   431  	//      _111     = 1 + _110
   432  	//      _111000  = _111 << 3
   433  	//      _111111  = _111 + _111000
   434  	//      _1111110 = 2*_111111
   435  	//      _1111111 = 1 + _1111110
   436  	//      x12      = _1111110 << 5 + _111111
   437  	//      x24      = x12 << 12 + x12
   438  	//      x31      = x24 << 7 + _1111111
   439  	//      i39      = x31 << 2
   440  	//      i68      = i39 << 29
   441  	//      x62      = x31 + i68
   442  	//      i71      = i68 << 2
   443  	//      x64      = i39 + i71 + _11
   444  	//      i265     = ((i71 << 32 + x64) << 64 + x64) << 94
   445  	//      return     (x62 + i265) << 2 + 1
   446  	var stack [3 * 4]uint64
   447  	t0 := stack[4*0 : 4*0+4]
   448  	t1 := stack[4*1 : 4*1+4]
   449  	t2 := stack[4*2 : 4*2+4]
   450  
   451  	p256Sqr(out, in, 1)
   452  	p256Mul(t0, in, out)
   453  	p256Sqr(out, t0, 1)
   454  	p256Mul(out, in, out)
   455  	p256Sqr(t1, out, 3)
   456  	p256Mul(t1, out, t1)
   457  	p256Sqr(t2, t1, 1)
   458  	p256Mul(out, in, t2)
   459  	p256Sqr(t2, t2, 5)
   460  	p256Mul(t1, t1, t2)
   461  	p256Sqr(t2, t1, 12)
   462  	p256Mul(t1, t1, t2)
   463  	p256Sqr(t1, t1, 7)
   464  	p256Mul(out, out, t1)
   465  	p256Sqr(t2, out, 2)
   466  	p256Sqr(t1, t2, 29)
   467  	p256Mul(out, out, t1)
   468  	p256Sqr(t1, t1, 2)
   469  	p256Mul(t2, t2, t1)
   470  	p256Mul(t0, t0, t2)
   471  	p256Sqr(t1, t1, 32)
   472  	p256Mul(t1, t0, t1)
   473  	p256Sqr(t1, t1, 64)
   474  	p256Mul(t0, t0, t1)
   475  	p256Sqr(t0, t0, 94)
   476  	p256Mul(out, out, t0)
   477  	p256Sqr(out, out, 2)
   478  	p256Mul(out, in, out)
   479  }
   480  
   481  func (p *p256Point) p256StorePoint(r *[16 * 4 * 3]uint64, index int) {
   482  	copy(r[index*12:], p.xyz[:])
   483  }
   484  
   485  // This function takes those six bits as an integer (0 .. 63), writing the
   486  // recoded digit to *sign (0 for positive, 1 for negative) and *digit (absolute
   487  // value, in the range 0 .. 16).  Note that this integer essentially provides
   488  // the input bits "shifted to the left" by one position: for example, the input
   489  // to compute the least significant recoded digit, given that there's no bit
   490  // b_-1, has to be b_4 b_3 b_2 b_1 b_0 0.
   491  //
   492  // Reference:
   493  // https://github.com/openssl/openssl/blob/master/crypto/ec/ecp_nistputil.c
   494  // https://github.com/google/boringssl/blob/master/crypto/fipsmodule/ec/util.c
   495  func boothW5(in uint) (int, int) {
   496  	var s = ^((in >> 5) - 1)  // sets all bits to MSB(in), 'in' seen as 6-bit value
   497  	var d = (1 << 6) - in - 1 // d = 63 - in, or d = ^in & 0x3f
   498  	d = (d & s) | (in & (^s)) // d = in if in < 2^5; otherwise, d = 63 - in
   499  	d = (d >> 1) + (d & 1)    // d = (d + 1) / 2
   500  	return int(d), int(s & 1)
   501  }
   502  
   503  func boothW6(in uint) (int, int) {
   504  	var s = ^((in >> 6) - 1)
   505  	var d = (1 << 7) - in - 1
   506  	d = (d & s) | (in & (^s))
   507  	d = (d >> 1) + (d & 1)
   508  	return int(d), int(s & 1)
   509  }
   510  
   511  func (p *p256Point) p256BaseMult(scalar []uint64) {
   512  	wvalue := (scalar[0] << 1) & 0x7f
   513  	sel, sign := boothW6(uint(wvalue))
   514  	p256SelectBase(&p.xyz, p256Precomputed, sel)
   515  	p256NegCond(p.xyz[4:8], sign)
   516  
   517  	// (This is one, in the Montgomery domain.)
   518  	p.xyz[8] = p256one[0]
   519  	p.xyz[9] = p256one[1]
   520  	p.xyz[10] = p256one[2]
   521  	p.xyz[11] = p256one[3]
   522  
   523  	var t0 p256Point
   524  	// (This is one, in the Montgomery domain.)
   525  	t0.xyz[8] = p256one[0]
   526  	t0.xyz[9] = p256one[1]
   527  	t0.xyz[10] = p256one[2]
   528  	t0.xyz[11] = p256one[3]
   529  
   530  	index := uint(5)
   531  	zero := sel
   532  
   533  	for i := 1; i < 43; i++ {
   534  		if index < 192 {
   535  			wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x7f
   536  		} else {
   537  			wvalue = (scalar[index/64] >> (index % 64)) & 0x7f
   538  		}
   539  		index += 6
   540  		sel, sign = boothW6(uint(wvalue))
   541  		p256SelectBase(&t0.xyz, p256Precomputed[i*32*8*8:], sel)
   542  		p256PointAddAffineAsm(p.xyz[0:12], p.xyz[0:12], t0.xyz[0:8], sign, sel, zero)
   543  		zero |= sel
   544  	}
   545  }
   546  
   547  func (p *p256Point) p256ScalarMult(scalar []uint64) {
   548  	// precomp is a table of precomputed points that stores powers of p
   549  	// from p^1 to p^16.
   550  	var precomp [16 * 4 * 3]uint64
   551  	var t0, t1, t2, t3 p256Point
   552  
   553  	// Prepare the table
   554  	p.p256StorePoint(&precomp, 0) // 1
   555  
   556  	p256PointDoubleAsm(t0.xyz[:], p.xyz[:])
   557  	p256PointDoubleAsm(t1.xyz[:], t0.xyz[:])
   558  	p256PointDoubleAsm(t2.xyz[:], t1.xyz[:])
   559  	p256PointDoubleAsm(t3.xyz[:], t2.xyz[:])
   560  	t0.p256StorePoint(&precomp, 1)  // 2
   561  	t1.p256StorePoint(&precomp, 3)  // 4
   562  	t2.p256StorePoint(&precomp, 7)  // 8
   563  	t3.p256StorePoint(&precomp, 15) // 16
   564  
   565  	p256PointAddAsm(t0.xyz[:], t0.xyz[:], p.xyz[:])
   566  	p256PointAddAsm(t1.xyz[:], t1.xyz[:], p.xyz[:])
   567  	p256PointAddAsm(t2.xyz[:], t2.xyz[:], p.xyz[:])
   568  	t0.p256StorePoint(&precomp, 2) // 3
   569  	t1.p256StorePoint(&precomp, 4) // 5
   570  	t2.p256StorePoint(&precomp, 8) // 9
   571  
   572  	p256PointDoubleAsm(t0.xyz[:], t0.xyz[:])
   573  	p256PointDoubleAsm(t1.xyz[:], t1.xyz[:])
   574  	t0.p256StorePoint(&precomp, 5) // 6
   575  	t1.p256StorePoint(&precomp, 9) // 10
   576  
   577  	p256PointAddAsm(t2.xyz[:], t0.xyz[:], p.xyz[:])
   578  	p256PointAddAsm(t1.xyz[:], t1.xyz[:], p.xyz[:])
   579  	t2.p256StorePoint(&precomp, 6)  // 7
   580  	t1.p256StorePoint(&precomp, 10) // 11
   581  
   582  	p256PointDoubleAsm(t0.xyz[:], t0.xyz[:])
   583  	p256PointDoubleAsm(t2.xyz[:], t2.xyz[:])
   584  	t0.p256StorePoint(&precomp, 11) // 12
   585  	t2.p256StorePoint(&precomp, 13) // 14
   586  
   587  	p256PointAddAsm(t0.xyz[:], t0.xyz[:], p.xyz[:])
   588  	p256PointAddAsm(t2.xyz[:], t2.xyz[:], p.xyz[:])
   589  	t0.p256StorePoint(&precomp, 12) // 13
   590  	t2.p256StorePoint(&precomp, 14) // 15
   591  
   592  	// Start scanning the window from top bit
   593  	index := uint(254)
   594  	var sel, sign int
   595  
   596  	wvalue := (scalar[index/64] >> (index % 64)) & 0x3f
   597  	sel, _ = boothW5(uint(wvalue))
   598  
   599  	p256Select(p.xyz[0:12], precomp[0:], sel)
   600  	zero := sel
   601  
   602  	for index > 4 {
   603  		index -= 5
   604  		p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   605  		p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   606  		p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   607  		p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   608  		p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   609  
   610  		if index < 192 {
   611  			wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x3f
   612  		} else {
   613  			wvalue = (scalar[index/64] >> (index % 64)) & 0x3f
   614  		}
   615  
   616  		sel, sign = boothW5(uint(wvalue))
   617  
   618  		p256Select(t0.xyz[0:], precomp[0:], sel)
   619  		p256NegCond(t0.xyz[4:8], sign)
   620  		p256PointAddAsm(t1.xyz[:], p.xyz[:], t0.xyz[:])
   621  		p256MovCond(t1.xyz[0:12], t1.xyz[0:12], p.xyz[0:12], sel)
   622  		p256MovCond(p.xyz[0:12], t1.xyz[0:12], t0.xyz[0:12], zero)
   623  		zero |= sel
   624  	}
   625  
   626  	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   627  	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   628  	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   629  	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   630  	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   631  
   632  	wvalue = (scalar[0] << 1) & 0x3f
   633  	sel, sign = boothW5(uint(wvalue))
   634  
   635  	p256Select(t0.xyz[0:], precomp[0:], sel)
   636  	p256NegCond(t0.xyz[4:8], sign)
   637  	p256PointAddAsm(t1.xyz[:], p.xyz[:], t0.xyz[:])
   638  	p256MovCond(t1.xyz[0:12], t1.xyz[0:12], p.xyz[0:12], sel)
   639  	p256MovCond(p.xyz[0:12], t1.xyz[0:12], t0.xyz[0:12], zero)
   640  }