gitee.com/ks-custle/core-gm@v0.0.0-20230922171213-b83bdd97b62c/sm2/p256_asm.go

gitee.com/ks-custle/core-gm@v0.0.0-20230922171213-b83bdd97b62c/sm2/p256_asm.go (about)

     1  // It is by standing on the shoulders of giants.
     2  
     3  // This file contains the Go wrapper for the constant-time, 64-bit assembly
     4  // implementation of P256. The optimizations performed here are described in
     5  // detail in:
     6  // S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
     7  //                          256-bit primes"
     8  // https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
     9  // https://eprint.iacr.org/2013/816.pdf
    10  //go:build amd64 || arm64
    11  // +build amd64 arm64
    12  
    13  package sm2
    14  
    15  /*
    16  sm2/p256_asm.go sm2p256在amd64或arm64架构下的实现
    17  */
    18  
    19  import (
    20  	"crypto/elliptic"
    21  	"math/big"
    22  )
    23  
    24  type (
    25  	// sm2的P256椭圆曲线类型，内嵌 *elliptic.CurveParams
    26  	p256Curve struct {
    27  		*elliptic.CurveParams
    28  	}
    29  	// sm2p256曲线上的座标类型
    30  	p256Point struct {
    31  		xyz [12]uint64
    32  	}
    33  )
    34  
    35  var (
    36  	// 定义sm2椭圆曲线单例 : p256
    37  	p256 p256Curve
    38  )
    39  
    40  // 初始化sm2的 p256 曲线单例
    41  func initP256() {
    42  	p256.CurveParams = &elliptic.CurveParams{Name: "SM2-P-256"}
    43  	// SM2椭圆曲线公钥密码算法推荐曲线参数
    44  	// 2**256 - 2**224 - 2**96 + 2**64 - 1
    45  	p256.P, _ = new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)
    46  	p256.N, _ = new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16)
    47  	p256.B, _ = new(big.Int).SetString("28E9FA9E9D9F5E344D5A9E4BCF6509A7F39789F515AB8F92DDBCBD414D940E93", 16)
    48  	p256.Gx, _ = new(big.Int).SetString("32C4AE2C1F1981195F9904466A39C9948FE30BBFF2660BE1715A4589334C74C7", 16)
    49  	p256.Gy, _ = new(big.Int).SetString("BC3736A2F4F6779C59BDCEE36B692153D0A9877CC62A474002DF32E52139F0A0", 16)
    50  	p256.BitSize = 256
    51  }
    52  
    53  // Params 获取sm2p256曲线参数
    54  func (curve p256Curve) Params() *elliptic.CurveParams {
    55  	return curve.CurveParams
    56  }
    57  
// p256Mul performs Montgomery multiplication modulo P256: res receives the
// Montgomery product of in1 and in2.
// The implementation lives in the platform assembly file p256_asm_*64.s,
// where * is amd64 or arm64.
// Functions implemented in p256_asm_*64.s
// Montgomery multiplication modulo P256
//go:noescape
//goland:noinspection GoUnusedParameter
func p256Mul(res, in1, in2 []uint64)
    65  
// p256Sqr performs Montgomery squaring modulo P256, repeated n times
// (n >= 1), writing the result to res.
// The implementation lives in the platform assembly file p256_asm_*64.s,
// where * is amd64 or arm64.
// Montgomery square modulo P256, repeated n times (n >= 1)
//go:noescape
//goland:noinspection GoUnusedParameter
func p256Sqr(res, in []uint64, n int)
    72  
// p256FromMont performs a Montgomery multiplication of in by 1, writing the
// result to res. Callers in this file use it to take a value out of the
// Montgomery domain.
// The implementation lives in the platform assembly file p256_asm_*64.s,
// where * is amd64 or arm64.
// Montgomery multiplication by 1
//go:noescape
//goland:noinspection GoUnusedParameter
func p256FromMont(res, in []uint64)
    79  
// p256NegCond conditionally negates val in place.
// The implementation lives in the platform assembly file p256_asm_*64.s,
// where * is amd64 or arm64.
// iff cond == 1  val <- -val
//go:noescape
//goland:noinspection GoUnusedParameter
func p256NegCond(val []uint64, cond int)
    86  
// p256MovCond is a conditional move: it stores either a or b into res
// depending on cond.
// The implementation lives in the platform assembly file p256_asm_*64.s,
// where * is amd64 or arm64.
// if cond == 0 res <- b; else res <- a
//go:noescape
//goland:noinspection GoUnusedParameter
func p256MovCond(res, a, b []uint64, cond int)
    93  
// p256BigToLittle performs an endianness swap, converting the big-endian
// bytes in in to the little-endian word form written to res.
// The implementation lives in the platform assembly file p256_asm_*64.s,
// where * is amd64 or arm64.
// Endianness swap
//go:noescape
//goland:noinspection GoUnusedParameter
func p256BigToLittle(res []uint64, in []byte)
   100  
// p256LittleToBig performs an endianness swap, converting the little-endian
// words in in to the big-endian bytes written to res. Callers in this file
// pass a 32-byte res and a 4-word in.
// The implementation lives in the platform assembly file p256_asm_*64.s,
// where * is amd64 or arm64.
//go:noescape
//goland:noinspection GoUnusedParameter
func p256LittleToBig(res []byte, in []uint64)
   106  
// p256Select loads an entry of table into point using idx.
// NOTE(review): callers pass a Booth digit (0..16) as idx while the table
// stores 1*P at position 0, so idx looks 1-based with 0 meaning "no entry";
// confirm against the assembly.
// Constant time table access
//go:noescape
//goland:noinspection GoUnusedParameter
func p256Select(point, table []uint64, idx int)
   111  
// p256SelectBase loads an entry of the base-point table (passed as a string)
// into point using idx. p256BaseMult calls it with p256Precomputed, or a
// suffix of it, and a Booth digit as idx.
// NOTE(review): presumably a constant-time lookup like p256Select; confirm
// against the assembly.
//go:noescape
//goland:noinspection GoUnusedParameter
func p256SelectBase(point *[12]uint64, table string, idx int)
   115  
// p256OrdMul performs Montgomery multiplication modulo Ord(G), the order of
// the base point: res receives the Montgomery product of in1 and in2.
// The implementation lives in the platform assembly file p256_asm_*64.s,
// where * is amd64 or arm64.
// Montgomery multiplication modulo Ord(G)
//go:noescape
//goland:noinspection GoUnusedParameter
func p256OrdMul(res, in1, in2 []uint64)
   122  
// p256OrdSqr performs Montgomery squaring modulo Ord(G), repeated n times,
// writing the result to res.
// The implementation lives in the platform assembly file p256_asm_*64.s,
// where * is amd64 or arm64.
// Montgomery square modulo Ord(G), repeated n times
//go:noescape
//goland:noinspection GoUnusedParameter
func p256OrdSqr(res, in []uint64, n int)
   129  
// p256PointAddAffineAsm adds in2 to in1 and stores the result in res.
// Callers in this file pass a 12-word in1 (X, Y, Z) and an 8-word in2
// (X, Y only).
// Point add with in2 being affine point
// If sign == 1 -> in2 = -in2
// If sel == 0 -> res = in1
// if zero == 0 -> res = in2
//go:noescape
//goland:noinspection GoUnusedParameter
func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)
   137  
// p256PointAddAsm adds the points in1 and in2 and stores the result in res.
// Point add. Returns one if the two input points were equal and zero
// otherwise. (Note that, due to the way that the equations work out, some
// representations of ∞ are considered equal to everything by this function.)
//go:noescape
//goland:noinspection GoUnusedParameter
func p256PointAddAsm(res, in1, in2 []uint64) int
   144  
// p256PointDoubleAsm doubles the point in and stores the result in res.
// Callers in this file also use it in place, with res and in being the same
// slice.
// Point double
//go:noescape
//goland:noinspection GoUnusedParameter
func p256PointDoubleAsm(res, in []uint64)
   149  
   150  var p256one = []uint64{0x0000000000000001, 0x00000000ffffffff, 0x0000000000000000, 0x0000000100000000}
   151  
   152  // Inverse 利用amd64或arm64架构的CPU实现快速的 mod Params().N 的倒数运算
   153  // Inverse, implements invertible interface, used by Sign()
   154  // n-2 =
   155  // 1111111111111111111111111111111011111111111111111111111111111111
   156  // 1111111111111111111111111111111111111111111111111111111111111111
   157  // 0111001000000011110111110110101100100001110001100000010100101011
   158  // 0101001110111011111101000000100100111001110101010100000100100001
   159  //
   160  func (curve p256Curve) Inverse(k *big.Int) *big.Int {
   161  	if k.Sign() < 0 {
   162  		// This should never happen.
   163  		k = new(big.Int).Neg(k)
   164  	}
   165  
   166  	if k.Cmp(p256.N) >= 0 {
   167  		// This should never happen.
   168  		k = new(big.Int).Mod(k, p256.N)
   169  	}
   170  
   171  	// table will store precomputed powers of x.
   172  	var table [4 * 10]uint64
   173  	var (
   174  		_1      = table[4*0 : 4*1]
   175  		_11     = table[4*1 : 4*2]
   176  		_101    = table[4*2 : 4*3]
   177  		_111    = table[4*3 : 4*4]
   178  		_1111   = table[4*4 : 4*5]
   179  		_10101  = table[4*5 : 4*6]
   180  		_101111 = table[4*6 : 4*7]
   181  		x       = table[4*7 : 4*8]
   182  		t       = table[4*8 : 4*9]
   183  		m       = table[4*9 : 4*10]
   184  	)
   185  
   186  	fromBig(x[:], k)
   187  	// This code operates in the Montgomery domain where R = 2^256 mod n
   188  	// and n is the order of the scalar field. (See initP256 for the
   189  	// value.) Elements in the Montgomery domain take the form a×R and
   190  	// multiplication of x and y in the calculates (x × y × R^-1) mod n. RR
   191  	// is R×R mod n thus the Montgomery multiplication x and RR gives x×R,
   192  	// i.e. converts x into the Montgomery domain.
   193  	// Window values borrowed from https://briansmith.org/ecc-inversion-addition-chains-01#p256_scalar_inversion
   194  	RR := []uint64{0x901192af7c114f20, 0x3464504ade6fa2fa, 0x620fc84c3affe0d4, 0x1eb5e412a22b3d3b}
   195  
   196  	p256OrdMul(_1, x, RR)      // _1 , 2^0
   197  	p256OrdSqr(m, _1, 1)       // _10, 2^1
   198  	p256OrdMul(_11, m, _1)     // _11, 2^1 + 2^0
   199  	p256OrdMul(_101, m, _11)   // _101, 2^2 + 2^0
   200  	p256OrdMul(_111, m, _101)  // _111, 2^2 + 2^1 + 2^0
   201  	p256OrdSqr(x, _101, 1)     // _1010, 2^3 + 2^1
   202  	p256OrdMul(_1111, _101, x) // _1111, 2^3 + 2^2 + 2^1 + 2^0
   203  
   204  	p256OrdSqr(t, x, 1)          // _10100, 2^4 + 2^2
   205  	p256OrdMul(_10101, t, _1)    // _10101, 2^4 + 2^2 + 2^0
   206  	p256OrdSqr(x, _10101, 1)     // _101010, 2^5 + 2^3 + 2^1
   207  	p256OrdMul(_101111, _101, x) // _101111, 2^5 + 2^3 + 2^2 + 2^1 + 2^0
   208  	p256OrdMul(x, _10101, x)     // _111111 = x6, 2^5 + 2^4 + 2^3 + 2^2 + 2^1 + 2^0
   209  	p256OrdSqr(t, x, 2)          // _11111100, 2^7 + 2^6 + 2^5 + 2^4 + 2^3 + 2^2
   210  
   211  	p256OrdMul(m, t, m)   // _11111110 = x8, , 2^7 + 2^6 + 2^5 + 2^4 + 2^3 + 2^2 + 2^1
   212  	p256OrdMul(t, t, _11) // _11111111 = x8, , 2^7 + 2^6 + 2^5 + 2^4 + 2^3 + 2^2 + 2^1 + 2^0
   213  	p256OrdSqr(x, t, 8)   // _ff00, 2^15 + 2^14 + 2^13 + 2^12 + 2^11 + 2^10 + 2^9 + 2^8
   214  	p256OrdMul(m, x, m)   //  _fffe
   215  	p256OrdMul(x, x, t)   // _ffff = x16, 2^15 + 2^14 + 2^13 + 2^12 + 2^11 + 2^10 + 2^9 + 2^8 + 2^7 + 2^6 + 2^5 + 2^4 + 2^3 + 2^2 + 2^1 + 2^0
   216  
   217  	p256OrdSqr(t, x, 16) // _ffff0000, 2^31 + 2^30 + 2^29 + 2^28 + 2^27 + 2^26 + 2^25 + 2^24 + 2^23 + 2^22 + 2^21 + 2^20 + 2^19 + 2^18 + 2^17 + 2^16
   218  	p256OrdMul(m, t, m)  // _fffffffe
   219  	p256OrdMul(t, t, x)  // _ffffffff = x32
   220  
   221  	p256OrdSqr(x, m, 32) // _fffffffe00000000
   222  	p256OrdMul(x, x, t)  // _fffffffeffffffff
   223  	p256OrdSqr(x, x, 32) // _fffffffeffffffff00000000
   224  	p256OrdMul(x, x, t)  // _fffffffeffffffffffffffff
   225  	p256OrdSqr(x, x, 32) // _fffffffeffffffffffffffff00000000
   226  	p256OrdMul(x, x, t)  // _fffffffeffffffffffffffffffffffff
   227  
   228  	sqrs := []uint8{
   229  		4, 3, 11, 5, 3, 5, 1,
   230  		3, 7, 5, 9, 7, 5, 5,
   231  		4, 5, 2, 2, 7, 3, 5,
   232  		5, 6, 2, 6, 3, 5,
   233  	}
   234  	muls := [][]uint64{
   235  		_111, _1, _1111, _1111, _101, _10101, _1,
   236  		_1, _111, _11, _101, _10101, _10101, _111,
   237  		_111, _1111, _11, _1, _1, _1, _111,
   238  		_111, _10101, _1, _1, _1, _1}
   239  
   240  	for i, s := range sqrs {
   241  		p256OrdSqr(x, x, int(s))
   242  		p256OrdMul(x, x, muls[i])
   243  	}
   244  
   245  	// Multiplying by one in the Montgomery domain converts a Montgomery
   246  	// value out of the domain.
   247  	one := []uint64{1, 0, 0, 0}
   248  	p256OrdMul(x, x, one)
   249  
   250  	xOut := make([]byte, 32)
   251  	p256LittleToBig(xOut, x)
   252  	return new(big.Int).SetBytes(xOut)
   253  }
   254  
   255  // fromBig converts a *big.Int into a format used by this code.
   256  func fromBig(out []uint64, big *big.Int) {
   257  	for i := range out {
   258  		out[i] = 0
   259  	}
   260  
   261  	for i, v := range big.Bits() {
   262  		out[i] = uint64(v)
   263  	}
   264  }
   265  
   266  // p256GetScalar endian-swaps the big-endian scalar value from in and writes it
   267  // to out. If the scalar is equal or greater than the order of the group, it's
   268  // reduced modulo that order.
   269  func p256GetScalar(out []uint64, in []byte) {
   270  	n := new(big.Int).SetBytes(in)
   271  
   272  	if n.Cmp(p256.N) >= 0 {
   273  		n.Mod(n, p256.N)
   274  	}
   275  	fromBig(out, n)
   276  }
   277  
   278  // p256Mul operates in a Montgomery domain with R = 2^256 mod p, where p is the
   279  // underlying field of the curve. (See initP256 for the value.) Thus rr here is
   280  // R×R mod p. See comment in Inverse about how this is used.
   281  var rr = []uint64{0x200000003, 0x2ffffffff, 0x100000001, 0x400000002}
   282  
   283  func maybeReduceModP(in *big.Int) *big.Int {
   284  	if in.Cmp(p256.P) < 0 {
   285  		return in
   286  	}
   287  	return new(big.Int).Mod(in, p256.P)
   288  }
   289  
   290  // CombinedMult 利用amd64或arm64架构的cpu指令集功能加速曲线上的乘法运算
   291  func (curve p256Curve) CombinedMult(bigX, bigY *big.Int, baseScalar, scalar []byte) (x, y *big.Int) {
   292  	scalarReversed := make([]uint64, 4)
   293  	var r1, r2 p256Point
   294  	p256GetScalar(scalarReversed, baseScalar)
   295  	r1IsInfinity := scalarIsZero(scalarReversed)
   296  	r1.p256BaseMult(scalarReversed)
   297  
   298  	p256GetScalar(scalarReversed, scalar)
   299  	r2IsInfinity := scalarIsZero(scalarReversed)
   300  	fromBig(r2.xyz[0:4], maybeReduceModP(bigX))
   301  	fromBig(r2.xyz[4:8], maybeReduceModP(bigY))
   302  	p256Mul(r2.xyz[0:4], r2.xyz[0:4], rr[:])
   303  	p256Mul(r2.xyz[4:8], r2.xyz[4:8], rr[:])
   304  
   305  	// This sets r2's Z value to 1, in the Montgomery domain.
   306  	r2.xyz[8] = p256one[0]
   307  	r2.xyz[9] = p256one[1]
   308  	r2.xyz[10] = p256one[2]
   309  	r2.xyz[11] = p256one[3]
   310  
   311  	r2.p256ScalarMult(scalarReversed)
   312  
   313  	var sum, double p256Point
   314  	pointsEqual := p256PointAddAsm(sum.xyz[:], r1.xyz[:], r2.xyz[:])
   315  	p256PointDoubleAsm(double.xyz[:], r1.xyz[:])
   316  	sum.CopyConditional(&double, pointsEqual)
   317  	sum.CopyConditional(&r1, r2IsInfinity)
   318  	sum.CopyConditional(&r2, r1IsInfinity)
   319  
   320  	return sum.p256PointToAffine()
   321  }
   322  
   323  // ScalarBaseMult sm2p256曲线基点乘法: k*G , k是随机数，G是曲线基点座标。
   324  // 实现的是 elliptic.Curve 接口。
   325  // ScalarBaseMult returns k*G, where G is the base point of the group
   326  // and k is an integer in big-endian form.
   327  func (curve p256Curve) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
   328  	scalarReversed := make([]uint64, 4)
   329  	p256GetScalar(scalarReversed, scalar)
   330  
   331  	var r p256Point
   332  	r.p256BaseMult(scalarReversed)
   333  	return r.p256PointToAffine()
   334  }
   335  
   336  // ScalarMult sm2p256曲线乘法: k*G , k是随机数，(Bx,By)是曲线上某点座标。
   337  // 实现的是 elliptic.Curve 接口。
   338  // ScalarMult returns k*(Bx,By) where k is a number in big-endian form.
   339  func (curve p256Curve) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
   340  	scalarReversed := make([]uint64, 4)
   341  	p256GetScalar(scalarReversed, scalar)
   342  
   343  	var r p256Point
   344  	fromBig(r.xyz[0:4], maybeReduceModP(bigX))
   345  	fromBig(r.xyz[4:8], maybeReduceModP(bigY))
   346  	p256Mul(r.xyz[0:4], r.xyz[0:4], rr[:])
   347  	p256Mul(r.xyz[4:8], r.xyz[4:8], rr[:])
   348  	// This sets r2's Z value to 1, in the Montgomery domain.
   349  	r.xyz[8] = p256one[0]
   350  	r.xyz[9] = p256one[1]
   351  	r.xyz[10] = p256one[2]
   352  	r.xyz[11] = p256one[3]
   353  
   354  	r.p256ScalarMult(scalarReversed)
   355  	return r.p256PointToAffine()
   356  }
   357  
   358  // uint64IsZero returns 1 if x is zero and zero otherwise.
   359  func uint64IsZero(x uint64) int {
   360  	x = ^x
   361  	x &= x >> 32
   362  	x &= x >> 16
   363  	x &= x >> 8
   364  	x &= x >> 4
   365  	x &= x >> 2
   366  	x &= x >> 1
   367  	return int(x & 1)
   368  }
   369  
   370  // scalarIsZero returns 1 if scalar represents the zero value, and zero
   371  // otherwise.
   372  func scalarIsZero(scalar []uint64) int {
   373  	return uint64IsZero(scalar[0] | scalar[1] | scalar[2] | scalar[3])
   374  }
   375  
   376  func (p *p256Point) p256PointToAffine() (x, y *big.Int) {
   377  	zInv := make([]uint64, 4)
   378  	zInvSq := make([]uint64, 4)
   379  	p256Inverse(zInv, p.xyz[8:12])
   380  	p256Sqr(zInvSq, zInv, 1)
   381  	p256Mul(zInv, zInv, zInvSq)
   382  
   383  	p256Mul(zInvSq, p.xyz[0:4], zInvSq)
   384  	p256Mul(zInv, p.xyz[4:8], zInv)
   385  
   386  	p256FromMont(zInvSq, zInvSq)
   387  	p256FromMont(zInv, zInv)
   388  
   389  	xOut := make([]byte, 32)
   390  	yOut := make([]byte, 32)
   391  	p256LittleToBig(xOut, zInvSq)
   392  	p256LittleToBig(yOut, zInv)
   393  
   394  	return new(big.Int).SetBytes(xOut), new(big.Int).SetBytes(yOut)
   395  }
   396  
   397  // CopyConditional copies overwrites p with src if v == 1, and leaves p
   398  // unchanged if v == 0.
   399  func (p *p256Point) CopyConditional(src *p256Point, v int) {
   400  	pMask := uint64(v) - 1
   401  	srcMask := ^pMask
   402  
   403  	for i, n := range p.xyz {
   404  		p.xyz[i] = (n & pMask) | (src.xyz[i] & srcMask)
   405  	}
   406  }
   407  
// p256Inverse sets out to in^-1 mod p.
//
// Both slices hold 4-limb field elements as used by p256Sqr and p256Mul.
// out is overwritten by the very first step while in is still read by later
// steps, so out must not alias in.
func p256Inverse(out, in []uint64) {
	// Inversion is calculated through exponentiation by p - 2, per Fermat's
	// little theorem.
	//
	// The sequence of 14 multiplications and 255 squarings is derived from the
	// following addition chain generated with github.com/mmcloughlin/addchain
	// v0.4.0.
	//
	//      _10      = 2*1
	//      _11      = 1 + _10
	//      _110     = 2*_11
	//      _111     = 1 + _110
	//      _111000  = _111 << 3
	//      _111111  = _111 + _111000
	//      _1111110 = 2*_111111
	//      _1111111 = 1 + _1111110
	//      x12      = _1111110 << 5 + _111111
	//      x24      = x12 << 12 + x12
	//      x31      = x24 << 7 + _1111111
	//      i39      = x31 << 2
	//      i68      = i39 << 29
	//      x62      = x31 + i68
	//      i71      = i68 << 2
	//      x64      = i39 + i71 + _11
	//      i265     = ((i71 << 32 + x64) << 64 + x64) << 94
	//      return     (x62 + i265) << 2 + 1
	//
	// In the chain, "<< n" corresponds to n squarings and "+" to one
	// multiplication. Three scratch elements share one backing array.
	var stack [3 * 4]uint64
	t0 := stack[4*0 : 4*0+4]
	t1 := stack[4*1 : 4*1+4]
	t2 := stack[4*2 : 4*2+4]

	// The trailing comments name the chain value held by the destination
	// after each step.
	p256Sqr(out, in, 1)   // out = _10
	p256Mul(t0, in, out)  // t0  = _11
	p256Sqr(out, t0, 1)   // out = _110
	p256Mul(out, in, out) // out = _111
	p256Sqr(t1, out, 3)   // t1  = _111000
	p256Mul(t1, out, t1)  // t1  = _111111
	p256Sqr(t2, t1, 1)    // t2  = _1111110
	p256Mul(out, in, t2)  // out = _1111111
	p256Sqr(t2, t2, 5)    // t2  = _1111110 << 5
	p256Mul(t1, t1, t2)   // t1  = x12
	p256Sqr(t2, t1, 12)   // t2  = x12 << 12
	p256Mul(t1, t1, t2)   // t1  = x24
	p256Sqr(t1, t1, 7)    // t1  = x24 << 7
	p256Mul(out, out, t1) // out = x31
	p256Sqr(t2, out, 2)   // t2  = i39
	p256Sqr(t1, t2, 29)   // t1  = i68
	p256Mul(out, out, t1) // out = x62
	p256Sqr(t1, t1, 2)    // t1  = i71
	p256Mul(t2, t2, t1)   // t2  = i39 + i71
	p256Mul(t0, t0, t2)   // t0  = x64
	p256Sqr(t1, t1, 32)   // t1  = i71 << 32
	p256Mul(t1, t0, t1)   // t1  = i71 << 32 + x64
	p256Sqr(t1, t1, 64)   // t1  = (i71 << 32 + x64) << 64
	p256Mul(t0, t0, t1)   // t0  = (i71 << 32 + x64) << 64 + x64
	p256Sqr(t0, t0, 94)   // t0  = i265
	p256Mul(out, out, t0) // out = x62 + i265
	p256Sqr(out, out, 2)  // out = (x62 + i265) << 2
	p256Mul(out, in, out) // out = (x62 + i265) << 2 + 1
}
   469  
   470  func (p *p256Point) p256StorePoint(r *[16 * 4 * 3]uint64, index int) {
   471  	copy(r[index*12:], p.xyz[:])
   472  }
   473  
   474  // This function takes those six bits as an integer (0 .. 63), writing the
   475  // recoded digit to *sign (0 for positive, 1 for negative) and *digit (absolute
   476  // value, in the range 0 .. 16).  Note that this integer essentially provides
   477  // the input bits "shifted to the left" by one position: for example, the input
   478  // to compute the least significant recoded digit, given that there's no bit
   479  // b_-1, has to be b_4 b_3 b_2 b_1 b_0 0.
   480  //
   481  // Reference:
   482  // https://github.com/openssl/openssl/blob/master/crypto/ec/ecp_nistputil.c
   483  // https://github.com/google/boringssl/blob/master/crypto/fipsmodule/ec/util.c
   484  func boothW5(in uint) (int, int) {
   485  	var s = ^((in >> 5) - 1)  // sets all bits to MSB(in), 'in' seen as 6-bit value
   486  	var d = (1 << 6) - in - 1 // d = 63 - in, or d = ^in & 0x3f
   487  	d = (d & s) | (in & (^s)) // d = in if in < 2^5; otherwise, d = 63 - in
   488  	d = (d >> 1) + (d & 1)    // d = (d + 1) / 2
   489  	return int(d), int(s & 1)
   490  }
   491  
   492  func boothW6(in uint) (int, int) {
   493  	var s = ^((in >> 6) - 1)
   494  	var d = (1 << 7) - in - 1
   495  	d = (d & s) | (in & (^s))
   496  	d = (d >> 1) + (d & 1)
   497  	return int(d), int(s & 1)
   498  }
   499  
   500  func (p *p256Point) p256BaseMult(scalar []uint64) {
   501  	wvalue := (scalar[0] << 1) & 0x7f
   502  	sel, sign := boothW6(uint(wvalue))
   503  	p256SelectBase(&p.xyz, p256Precomputed, sel)
   504  	p256NegCond(p.xyz[4:8], sign)
   505  
   506  	// (This is one, in the Montgomery domain.)
   507  	p.xyz[8] = p256one[0]
   508  	p.xyz[9] = p256one[1]
   509  	p.xyz[10] = p256one[2]
   510  	p.xyz[11] = p256one[3]
   511  
   512  	var t0 p256Point
   513  	// (This is one, in the Montgomery domain.)
   514  	t0.xyz[8] = p256one[0]
   515  	t0.xyz[9] = p256one[1]
   516  	t0.xyz[10] = p256one[2]
   517  	t0.xyz[11] = p256one[3]
   518  
   519  	index := uint(5)
   520  	zero := sel
   521  
   522  	for i := 1; i < 43; i++ {
   523  		if index < 192 {
   524  			wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x7f
   525  		} else {
   526  			wvalue = (scalar[index/64] >> (index % 64)) & 0x7f
   527  		}
   528  		index += 6
   529  		sel, sign = boothW6(uint(wvalue))
   530  		p256SelectBase(&t0.xyz, p256Precomputed[i*32*8*8:], sel)
   531  		p256PointAddAffineAsm(p.xyz[0:12], p.xyz[0:12], t0.xyz[0:8], sign, sel, zero)
   532  		zero |= sel
   533  	}
   534  }
   535  
// p256ScalarMult replaces p with scalar times p. scalar is a 4-limb value,
// least significant limb first.
//
// It first builds a table of the multiples 1*p .. 16*p, then scans the
// scalar from the top bit downwards in Booth-recoded windows of 5 bits:
// for each window the accumulator is doubled five times and the selected,
// conditionally negated table entry is added.
func (p *p256Point) p256ScalarMult(scalar []uint64) {
	// precomp is a table of precomputed points that stores the multiples of
	// p from 1*p to 16*p; slot i holds (i+1)*p.
	var precomp [16 * 4 * 3]uint64
	var t0, t1, t2, t3 p256Point

	// Prepare the table. The trailing comment on each store is the multiple
	// of p being stored.
	p.p256StorePoint(&precomp, 0) // 1

	// Repeated doubling: t0 = 2p, t1 = 4p, t2 = 8p, t3 = 16p.
	p256PointDoubleAsm(t0.xyz[:], p.xyz[:])
	p256PointDoubleAsm(t1.xyz[:], t0.xyz[:])
	p256PointDoubleAsm(t2.xyz[:], t1.xyz[:])
	p256PointDoubleAsm(t3.xyz[:], t2.xyz[:])
	t0.p256StorePoint(&precomp, 1)  // 2
	t1.p256StorePoint(&precomp, 3)  // 4
	t2.p256StorePoint(&precomp, 7)  // 8
	t3.p256StorePoint(&precomp, 15) // 16

	// Add p to each: t0 = 3p, t1 = 5p, t2 = 9p.
	p256PointAddAsm(t0.xyz[:], t0.xyz[:], p.xyz[:])
	p256PointAddAsm(t1.xyz[:], t1.xyz[:], p.xyz[:])
	p256PointAddAsm(t2.xyz[:], t2.xyz[:], p.xyz[:])
	t0.p256StorePoint(&precomp, 2) // 3
	t1.p256StorePoint(&precomp, 4) // 5
	t2.p256StorePoint(&precomp, 8) // 9

	// Double: t0 = 6p, t1 = 10p.
	p256PointDoubleAsm(t0.xyz[:], t0.xyz[:])
	p256PointDoubleAsm(t1.xyz[:], t1.xyz[:])
	t0.p256StorePoint(&precomp, 5) // 6
	t1.p256StorePoint(&precomp, 9) // 10

	// Add p: t2 = 7p (from t0 = 6p), t1 = 11p.
	p256PointAddAsm(t2.xyz[:], t0.xyz[:], p.xyz[:])
	p256PointAddAsm(t1.xyz[:], t1.xyz[:], p.xyz[:])
	t2.p256StorePoint(&precomp, 6)  // 7
	t1.p256StorePoint(&precomp, 10) // 11

	// Double: t0 = 12p, t2 = 14p.
	p256PointDoubleAsm(t0.xyz[:], t0.xyz[:])
	p256PointDoubleAsm(t2.xyz[:], t2.xyz[:])
	t0.p256StorePoint(&precomp, 11) // 12
	t2.p256StorePoint(&precomp, 13) // 14

	// Add p: t0 = 13p, t2 = 15p.
	p256PointAddAsm(t0.xyz[:], t0.xyz[:], p.xyz[:])
	p256PointAddAsm(t2.xyz[:], t2.xyz[:], p.xyz[:])
	t0.p256StorePoint(&precomp, 12) // 13
	t2.p256StorePoint(&precomp, 14) // 15

	// Start scanning the window from top bit
	index := uint(254)
	var sel, sign int

	// The top window only contains bits 254 and 255 of the scalar; its sign
	// is discarded.
	wvalue := (scalar[index/64] >> (index % 64)) & 0x3f
	sel, _ = boothW5(uint(wvalue))

	p256Select(p.xyz[0:12], precomp[0:], sel)
	// zero stays 0 until a non-zero digit has been accumulated into p.
	zero := sel

	for index > 4 {
		index -= 5
		// Multiply the accumulator by 2^5.
		p256PointDoubleAsm(p.xyz[:], p.xyz[:])
		p256PointDoubleAsm(p.xyz[:], p.xyz[:])
		p256PointDoubleAsm(p.xyz[:], p.xyz[:])
		p256PointDoubleAsm(p.xyz[:], p.xyz[:])
		p256PointDoubleAsm(p.xyz[:], p.xyz[:])

		// Extract six bits starting at bit index; below bit 192 the window
		// may straddle two limbs, so the next limb is pulled in as well.
		if index < 192 {
			wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x3f
		} else {
			wvalue = (scalar[index/64] >> (index % 64)) & 0x3f
		}

		sel, sign = boothW5(uint(wvalue))

		p256Select(t0.xyz[0:], precomp[0:], sel)
		p256NegCond(t0.xyz[4:8], sign)
		p256PointAddAsm(t1.xyz[:], p.xyz[:], t0.xyz[:])
		// If the digit is zero keep p, otherwise take the sum p + t0.
		p256MovCond(t1.xyz[0:12], t1.xyz[0:12], p.xyz[0:12], sel)
		// If nothing has been accumulated so far take the selected point t0
		// itself, otherwise take the value chosen above.
		p256MovCond(p.xyz[0:12], t1.xyz[0:12], t0.xyz[0:12], zero)
		zero |= sel
	}

	// Final window: five more doublings, then the lowest five bits shifted
	// left by one because there is no bit below bit 0.
	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
	p256PointDoubleAsm(p.xyz[:], p.xyz[:])

	wvalue = (scalar[0] << 1) & 0x3f
	sel, sign = boothW5(uint(wvalue))

	p256Select(t0.xyz[0:], precomp[0:], sel)
	p256NegCond(t0.xyz[4:8], sign)
	p256PointAddAsm(t1.xyz[:], p.xyz[:], t0.xyz[:])
	// Same two-step conditional selection as inside the loop.
	p256MovCond(t1.xyz[0:12], t1.xyz[0:12], p.xyz[0:12], sel)
	p256MovCond(p.xyz[0:12], t1.xyz[0:12], t0.xyz[0:12], zero)
}