github.com/AESNooper/go/src@v0.0.0-20220218095104-b56a4ab1bbbb/crypto/elliptic/p256_asm.go

github.com/AESNooper/go/src@v0.0.0-20220218095104-b56a4ab1bbbb/crypto/elliptic/p256_asm.go (about)

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This file contains the Go wrapper for the constant-time, 64-bit assembly
     6  // implementation of P256. The optimizations performed here are described in
     7  // detail in:
     8  // S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
     9  //                          256-bit primes"
    10  // https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
    11  // https://eprint.iacr.org/2013/816.pdf
    12  
    13  //go:build amd64 || arm64
    14  
    15  package elliptic
    16  
    17  import (
    18  	"math/big"
    19  )
    20  
    21  //go:generate go run -tags=tablegen gen_p256_table.go
    22  
    23  type (
    24  	p256Curve struct {
    25  		*CurveParams
    26  	}
    27  
    28  	p256Point struct {
    29  		xyz [12]uint64
    30  	}
    31  )
    32  
    33  var p256 p256Curve
    34  
    35  func initP256() {
    36  	// See FIPS 186-3, section D.2.3
    37  	p256.CurveParams = &CurveParams{Name: "P-256"}
    38  	p256.P, _ = new(big.Int).SetString("115792089210356248762697446949407573530086143415290314195533631308867097853951", 10)
    39  	p256.N, _ = new(big.Int).SetString("115792089210356248762697446949407573529996955224135760342422259061068512044369", 10)
    40  	p256.B, _ = new(big.Int).SetString("5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b", 16)
    41  	p256.Gx, _ = new(big.Int).SetString("6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296", 16)
    42  	p256.Gy, _ = new(big.Int).SetString("4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5", 16)
    43  	p256.BitSize = 256
    44  }
    45  
    46  func (curve p256Curve) Params() *CurveParams {
    47  	return curve.CurveParams
    48  }
    49  
    50  // Functions implemented in p256_asm_*64.s
    51  // Montgomery multiplication modulo P256
    52  //go:noescape
    53  func p256Mul(res, in1, in2 []uint64)
    54  
    55  // Montgomery square modulo P256, repeated n times (n >= 1)
    56  //go:noescape
    57  func p256Sqr(res, in []uint64, n int)
    58  
    59  // Montgomery multiplication by 1
    60  //go:noescape
    61  func p256FromMont(res, in []uint64)
    62  
    63  // iff cond == 1  val <- -val
    64  //go:noescape
    65  func p256NegCond(val []uint64, cond int)
    66  
    67  // if cond == 0 res <- b; else res <- a
    68  //go:noescape
    69  func p256MovCond(res, a, b []uint64, cond int)
    70  
    71  // Endianness swap
    72  //go:noescape
    73  func p256BigToLittle(res []uint64, in []byte)
    74  
    75  //go:noescape
    76  func p256LittleToBig(res []byte, in []uint64)
    77  
    78  // Constant time table access
    79  //go:noescape
    80  func p256Select(point, table []uint64, idx int)
    81  
    82  //go:noescape
    83  func p256SelectBase(point *[12]uint64, table string, idx int)
    84  
    85  // Montgomery multiplication modulo Ord(G)
    86  //go:noescape
    87  func p256OrdMul(res, in1, in2 []uint64)
    88  
    89  // Montgomery square modulo Ord(G), repeated n times
    90  //go:noescape
    91  func p256OrdSqr(res, in []uint64, n int)
    92  
    93  // Point add with in2 being affine point
    94  // If sign == 1 -> in2 = -in2
    95  // If sel == 0 -> res = in1
    96  // if zero == 0 -> res = in2
    97  //go:noescape
    98  func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)
    99  
   100  // Point add. Returns one if the two input points were equal and zero
   101  // otherwise. (Note that, due to the way that the equations work out, some
   102  // representations of ∞ are considered equal to everything by this function.)
   103  //go:noescape
   104  func p256PointAddAsm(res, in1, in2 []uint64) int
   105  
   106  // Point double
   107  //go:noescape
   108  func p256PointDoubleAsm(res, in []uint64)
   109  
   110  func (curve p256Curve) Inverse(k *big.Int) *big.Int {
   111  	if k.Sign() < 0 {
   112  		// This should never happen.
   113  		k = new(big.Int).Neg(k)
   114  	}
   115  
   116  	if k.Cmp(p256.N) >= 0 {
   117  		// This should never happen.
   118  		k = new(big.Int).Mod(k, p256.N)
   119  	}
   120  
   121  	// table will store precomputed powers of x.
   122  	var table [4 * 9]uint64
   123  	var (
   124  		_1      = table[4*0 : 4*1]
   125  		_11     = table[4*1 : 4*2]
   126  		_101    = table[4*2 : 4*3]
   127  		_111    = table[4*3 : 4*4]
   128  		_1111   = table[4*4 : 4*5]
   129  		_10101  = table[4*5 : 4*6]
   130  		_101111 = table[4*6 : 4*7]
   131  		x       = table[4*7 : 4*8]
   132  		t       = table[4*8 : 4*9]
   133  	)
   134  
   135  	fromBig(x[:], k)
   136  	// This code operates in the Montgomery domain where R = 2^256 mod n
   137  	// and n is the order of the scalar field. (See initP256 for the
   138  	// value.) Elements in the Montgomery domain take the form a×R and
   139  	// multiplication of x and y in the calculates (x × y × R^-1) mod n. RR
   140  	// is R×R mod n thus the Montgomery multiplication x and RR gives x×R,
   141  	// i.e. converts x into the Montgomery domain.
   142  	// Window values borrowed from https://briansmith.org/ecc-inversion-addition-chains-01#p256_scalar_inversion
   143  	RR := []uint64{0x83244c95be79eea2, 0x4699799c49bd6fa6, 0x2845b2392b6bec59, 0x66e12d94f3d95620}
   144  	p256OrdMul(_1, x, RR)      // _1
   145  	p256OrdSqr(x, _1, 1)       // _10
   146  	p256OrdMul(_11, x, _1)     // _11
   147  	p256OrdMul(_101, x, _11)   // _101
   148  	p256OrdMul(_111, x, _101)  // _111
   149  	p256OrdSqr(x, _101, 1)     // _1010
   150  	p256OrdMul(_1111, _101, x) // _1111
   151  
   152  	p256OrdSqr(t, x, 1)          // _10100
   153  	p256OrdMul(_10101, t, _1)    // _10101
   154  	p256OrdSqr(x, _10101, 1)     // _101010
   155  	p256OrdMul(_101111, _101, x) // _101111
   156  	p256OrdMul(x, _10101, x)     // _111111 = x6
   157  	p256OrdSqr(t, x, 2)          // _11111100
   158  	p256OrdMul(t, t, _11)        // _11111111 = x8
   159  	p256OrdSqr(x, t, 8)          // _ff00
   160  	p256OrdMul(x, x, t)          // _ffff = x16
   161  	p256OrdSqr(t, x, 16)         // _ffff0000
   162  	p256OrdMul(t, t, x)          // _ffffffff = x32
   163  
   164  	p256OrdSqr(x, t, 64)
   165  	p256OrdMul(x, x, t)
   166  	p256OrdSqr(x, x, 32)
   167  	p256OrdMul(x, x, t)
   168  
   169  	sqrs := []uint8{
   170  		6, 5, 4, 5, 5,
   171  		4, 3, 3, 5, 9,
   172  		6, 2, 5, 6, 5,
   173  		4, 5, 5, 3, 10,
   174  		2, 5, 5, 3, 7, 6}
   175  	muls := [][]uint64{
   176  		_101111, _111, _11, _1111, _10101,
   177  		_101, _101, _101, _111, _101111,
   178  		_1111, _1, _1, _1111, _111,
   179  		_111, _111, _101, _11, _101111,
   180  		_11, _11, _11, _1, _10101, _1111}
   181  
   182  	for i, s := range sqrs {
   183  		p256OrdSqr(x, x, int(s))
   184  		p256OrdMul(x, x, muls[i])
   185  	}
   186  
   187  	// Multiplying by one in the Montgomery domain converts a Montgomery
   188  	// value out of the domain.
   189  	one := []uint64{1, 0, 0, 0}
   190  	p256OrdMul(x, x, one)
   191  
   192  	xOut := make([]byte, 32)
   193  	p256LittleToBig(xOut, x)
   194  	return new(big.Int).SetBytes(xOut)
   195  }
   196  
   197  // fromBig converts a *big.Int into a format used by this code.
   198  func fromBig(out []uint64, big *big.Int) {
   199  	for i := range out {
   200  		out[i] = 0
   201  	}
   202  
   203  	for i, v := range big.Bits() {
   204  		out[i] = uint64(v)
   205  	}
   206  }
   207  
   208  // p256GetScalar endian-swaps the big-endian scalar value from in and writes it
   209  // to out. If the scalar is equal or greater than the order of the group, it's
   210  // reduced modulo that order.
   211  func p256GetScalar(out []uint64, in []byte) {
   212  	n := new(big.Int).SetBytes(in)
   213  
   214  	if n.Cmp(p256.N) >= 0 {
   215  		n.Mod(n, p256.N)
   216  	}
   217  	fromBig(out, n)
   218  }
   219  
   220  // p256Mul operates in a Montgomery domain with R = 2^256 mod p, where p is the
   221  // underlying field of the curve. (See initP256 for the value.) Thus rr here is
   222  // R×R mod p. See comment in Inverse about how this is used.
   223  var rr = []uint64{0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd}
   224  
   225  func maybeReduceModP(in *big.Int) *big.Int {
   226  	if in.Cmp(p256.P) < 0 {
   227  		return in
   228  	}
   229  	return new(big.Int).Mod(in, p256.P)
   230  }
   231  
   232  func (curve p256Curve) CombinedMult(bigX, bigY *big.Int, baseScalar, scalar []byte) (x, y *big.Int) {
   233  	scalarReversed := make([]uint64, 4)
   234  	var r1, r2 p256Point
   235  	p256GetScalar(scalarReversed, baseScalar)
   236  	r1IsInfinity := scalarIsZero(scalarReversed)
   237  	r1.p256BaseMult(scalarReversed)
   238  
   239  	p256GetScalar(scalarReversed, scalar)
   240  	r2IsInfinity := scalarIsZero(scalarReversed)
   241  	fromBig(r2.xyz[0:4], maybeReduceModP(bigX))
   242  	fromBig(r2.xyz[4:8], maybeReduceModP(bigY))
   243  	p256Mul(r2.xyz[0:4], r2.xyz[0:4], rr[:])
   244  	p256Mul(r2.xyz[4:8], r2.xyz[4:8], rr[:])
   245  
   246  	// This sets r2's Z value to 1, in the Montgomery domain.
   247  	r2.xyz[8] = 0x0000000000000001
   248  	r2.xyz[9] = 0xffffffff00000000
   249  	r2.xyz[10] = 0xffffffffffffffff
   250  	r2.xyz[11] = 0x00000000fffffffe
   251  
   252  	r2.p256ScalarMult(scalarReversed)
   253  
   254  	var sum, double p256Point
   255  	pointsEqual := p256PointAddAsm(sum.xyz[:], r1.xyz[:], r2.xyz[:])
   256  	p256PointDoubleAsm(double.xyz[:], r1.xyz[:])
   257  	sum.CopyConditional(&double, pointsEqual)
   258  	sum.CopyConditional(&r1, r2IsInfinity)
   259  	sum.CopyConditional(&r2, r1IsInfinity)
   260  
   261  	return sum.p256PointToAffine()
   262  }
   263  
   264  func (curve p256Curve) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
   265  	scalarReversed := make([]uint64, 4)
   266  	p256GetScalar(scalarReversed, scalar)
   267  
   268  	var r p256Point
   269  	r.p256BaseMult(scalarReversed)
   270  	return r.p256PointToAffine()
   271  }
   272  
   273  func (curve p256Curve) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
   274  	scalarReversed := make([]uint64, 4)
   275  	p256GetScalar(scalarReversed, scalar)
   276  
   277  	var r p256Point
   278  	fromBig(r.xyz[0:4], maybeReduceModP(bigX))
   279  	fromBig(r.xyz[4:8], maybeReduceModP(bigY))
   280  	p256Mul(r.xyz[0:4], r.xyz[0:4], rr[:])
   281  	p256Mul(r.xyz[4:8], r.xyz[4:8], rr[:])
   282  	// This sets r2's Z value to 1, in the Montgomery domain.
   283  	r.xyz[8] = 0x0000000000000001
   284  	r.xyz[9] = 0xffffffff00000000
   285  	r.xyz[10] = 0xffffffffffffffff
   286  	r.xyz[11] = 0x00000000fffffffe
   287  
   288  	r.p256ScalarMult(scalarReversed)
   289  	return r.p256PointToAffine()
   290  }
   291  
   292  // uint64IsZero returns 1 if x is zero and zero otherwise.
   293  func uint64IsZero(x uint64) int {
   294  	x = ^x
   295  	x &= x >> 32
   296  	x &= x >> 16
   297  	x &= x >> 8
   298  	x &= x >> 4
   299  	x &= x >> 2
   300  	x &= x >> 1
   301  	return int(x & 1)
   302  }
   303  
   304  // scalarIsZero returns 1 if scalar represents the zero value, and zero
   305  // otherwise.
   306  func scalarIsZero(scalar []uint64) int {
   307  	return uint64IsZero(scalar[0] | scalar[1] | scalar[2] | scalar[3])
   308  }
   309  
   310  func (p *p256Point) p256PointToAffine() (x, y *big.Int) {
   311  	zInv := make([]uint64, 4)
   312  	zInvSq := make([]uint64, 4)
   313  	p256Inverse(zInv, p.xyz[8:12])
   314  	p256Sqr(zInvSq, zInv, 1)
   315  	p256Mul(zInv, zInv, zInvSq)
   316  
   317  	p256Mul(zInvSq, p.xyz[0:4], zInvSq)
   318  	p256Mul(zInv, p.xyz[4:8], zInv)
   319  
   320  	p256FromMont(zInvSq, zInvSq)
   321  	p256FromMont(zInv, zInv)
   322  
   323  	xOut := make([]byte, 32)
   324  	yOut := make([]byte, 32)
   325  	p256LittleToBig(xOut, zInvSq)
   326  	p256LittleToBig(yOut, zInv)
   327  
   328  	return new(big.Int).SetBytes(xOut), new(big.Int).SetBytes(yOut)
   329  }
   330  
   331  // CopyConditional copies overwrites p with src if v == 1, and leaves p
   332  // unchanged if v == 0.
   333  func (p *p256Point) CopyConditional(src *p256Point, v int) {
   334  	pMask := uint64(v) - 1
   335  	srcMask := ^pMask
   336  
   337  	for i, n := range p.xyz {
   338  		p.xyz[i] = (n & pMask) | (src.xyz[i] & srcMask)
   339  	}
   340  }
   341  
   342  // p256Inverse sets out to in^-1 mod p.
   343  func p256Inverse(out, in []uint64) {
   344  	var stack [6 * 4]uint64
   345  	p2 := stack[4*0 : 4*0+4]
   346  	p4 := stack[4*1 : 4*1+4]
   347  	p8 := stack[4*2 : 4*2+4]
   348  	p16 := stack[4*3 : 4*3+4]
   349  	p32 := stack[4*4 : 4*4+4]
   350  
   351  	p256Sqr(out, in, 1)
   352  	p256Mul(p2, out, in) // 3*p
   353  
   354  	p256Sqr(out, p2, 2)
   355  	p256Mul(p4, out, p2) // f*p
   356  
   357  	p256Sqr(out, p4, 4)
   358  	p256Mul(p8, out, p4) // ff*p
   359  
   360  	p256Sqr(out, p8, 8)
   361  	p256Mul(p16, out, p8) // ffff*p
   362  
   363  	p256Sqr(out, p16, 16)
   364  	p256Mul(p32, out, p16) // ffffffff*p
   365  
   366  	p256Sqr(out, p32, 32)
   367  	p256Mul(out, out, in)
   368  
   369  	p256Sqr(out, out, 128)
   370  	p256Mul(out, out, p32)
   371  
   372  	p256Sqr(out, out, 32)
   373  	p256Mul(out, out, p32)
   374  
   375  	p256Sqr(out, out, 16)
   376  	p256Mul(out, out, p16)
   377  
   378  	p256Sqr(out, out, 8)
   379  	p256Mul(out, out, p8)
   380  
   381  	p256Sqr(out, out, 4)
   382  	p256Mul(out, out, p4)
   383  
   384  	p256Sqr(out, out, 2)
   385  	p256Mul(out, out, p2)
   386  
   387  	p256Sqr(out, out, 2)
   388  	p256Mul(out, out, in)
   389  }
   390  
   391  func (p *p256Point) p256StorePoint(r *[16 * 4 * 3]uint64, index int) {
   392  	copy(r[index*12:], p.xyz[:])
   393  }
   394  
   395  func boothW5(in uint) (int, int) {
   396  	var s uint = ^((in >> 5) - 1)
   397  	var d uint = (1 << 6) - in - 1
   398  	d = (d & s) | (in & (^s))
   399  	d = (d >> 1) + (d & 1)
   400  	return int(d), int(s & 1)
   401  }
   402  
   403  func boothW6(in uint) (int, int) {
   404  	var s uint = ^((in >> 6) - 1)
   405  	var d uint = (1 << 7) - in - 1
   406  	d = (d & s) | (in & (^s))
   407  	d = (d >> 1) + (d & 1)
   408  	return int(d), int(s & 1)
   409  }
   410  
   411  func (p *p256Point) p256BaseMult(scalar []uint64) {
   412  	wvalue := (scalar[0] << 1) & 0x7f
   413  	sel, sign := boothW6(uint(wvalue))
   414  	p256SelectBase(&p.xyz, p256Precomputed, sel)
   415  	p256NegCond(p.xyz[4:8], sign)
   416  
   417  	// (This is one, in the Montgomery domain.)
   418  	p.xyz[8] = 0x0000000000000001
   419  	p.xyz[9] = 0xffffffff00000000
   420  	p.xyz[10] = 0xffffffffffffffff
   421  	p.xyz[11] = 0x00000000fffffffe
   422  
   423  	var t0 p256Point
   424  	// (This is one, in the Montgomery domain.)
   425  	t0.xyz[8] = 0x0000000000000001
   426  	t0.xyz[9] = 0xffffffff00000000
   427  	t0.xyz[10] = 0xffffffffffffffff
   428  	t0.xyz[11] = 0x00000000fffffffe
   429  
   430  	index := uint(5)
   431  	zero := sel
   432  
   433  	for i := 1; i < 43; i++ {
   434  		if index < 192 {
   435  			wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x7f
   436  		} else {
   437  			wvalue = (scalar[index/64] >> (index % 64)) & 0x7f
   438  		}
   439  		index += 6
   440  		sel, sign = boothW6(uint(wvalue))
   441  		p256SelectBase(&t0.xyz, p256Precomputed[i*32*8*8:], sel)
   442  		p256PointAddAffineAsm(p.xyz[0:12], p.xyz[0:12], t0.xyz[0:8], sign, sel, zero)
   443  		zero |= sel
   444  	}
   445  }
   446  
   447  func (p *p256Point) p256ScalarMult(scalar []uint64) {
   448  	// precomp is a table of precomputed points that stores powers of p
   449  	// from p^1 to p^16.
   450  	var precomp [16 * 4 * 3]uint64
   451  	var t0, t1, t2, t3 p256Point
   452  
   453  	// Prepare the table
   454  	p.p256StorePoint(&precomp, 0) // 1
   455  
   456  	p256PointDoubleAsm(t0.xyz[:], p.xyz[:])
   457  	p256PointDoubleAsm(t1.xyz[:], t0.xyz[:])
   458  	p256PointDoubleAsm(t2.xyz[:], t1.xyz[:])
   459  	p256PointDoubleAsm(t3.xyz[:], t2.xyz[:])
   460  	t0.p256StorePoint(&precomp, 1)  // 2
   461  	t1.p256StorePoint(&precomp, 3)  // 4
   462  	t2.p256StorePoint(&precomp, 7)  // 8
   463  	t3.p256StorePoint(&precomp, 15) // 16
   464  
   465  	p256PointAddAsm(t0.xyz[:], t0.xyz[:], p.xyz[:])
   466  	p256PointAddAsm(t1.xyz[:], t1.xyz[:], p.xyz[:])
   467  	p256PointAddAsm(t2.xyz[:], t2.xyz[:], p.xyz[:])
   468  	t0.p256StorePoint(&precomp, 2) // 3
   469  	t1.p256StorePoint(&precomp, 4) // 5
   470  	t2.p256StorePoint(&precomp, 8) // 9
   471  
   472  	p256PointDoubleAsm(t0.xyz[:], t0.xyz[:])
   473  	p256PointDoubleAsm(t1.xyz[:], t1.xyz[:])
   474  	t0.p256StorePoint(&precomp, 5) // 6
   475  	t1.p256StorePoint(&precomp, 9) // 10
   476  
   477  	p256PointAddAsm(t2.xyz[:], t0.xyz[:], p.xyz[:])
   478  	p256PointAddAsm(t1.xyz[:], t1.xyz[:], p.xyz[:])
   479  	t2.p256StorePoint(&precomp, 6)  // 7
   480  	t1.p256StorePoint(&precomp, 10) // 11
   481  
   482  	p256PointDoubleAsm(t0.xyz[:], t0.xyz[:])
   483  	p256PointDoubleAsm(t2.xyz[:], t2.xyz[:])
   484  	t0.p256StorePoint(&precomp, 11) // 12
   485  	t2.p256StorePoint(&precomp, 13) // 14
   486  
   487  	p256PointAddAsm(t0.xyz[:], t0.xyz[:], p.xyz[:])
   488  	p256PointAddAsm(t2.xyz[:], t2.xyz[:], p.xyz[:])
   489  	t0.p256StorePoint(&precomp, 12) // 13
   490  	t2.p256StorePoint(&precomp, 14) // 15
   491  
   492  	// Start scanning the window from top bit
   493  	index := uint(254)
   494  	var sel, sign int
   495  
   496  	wvalue := (scalar[index/64] >> (index % 64)) & 0x3f
   497  	sel, _ = boothW5(uint(wvalue))
   498  
   499  	p256Select(p.xyz[0:12], precomp[0:], sel)
   500  	zero := sel
   501  
   502  	for index > 4 {
   503  		index -= 5
   504  		p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   505  		p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   506  		p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   507  		p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   508  		p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   509  
   510  		if index < 192 {
   511  			wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x3f
   512  		} else {
   513  			wvalue = (scalar[index/64] >> (index % 64)) & 0x3f
   514  		}
   515  
   516  		sel, sign = boothW5(uint(wvalue))
   517  
   518  		p256Select(t0.xyz[0:], precomp[0:], sel)
   519  		p256NegCond(t0.xyz[4:8], sign)
   520  		p256PointAddAsm(t1.xyz[:], p.xyz[:], t0.xyz[:])
   521  		p256MovCond(t1.xyz[0:12], t1.xyz[0:12], p.xyz[0:12], sel)
   522  		p256MovCond(p.xyz[0:12], t1.xyz[0:12], t0.xyz[0:12], zero)
   523  		zero |= sel
   524  	}
   525  
   526  	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   527  	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   528  	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   529  	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   530  	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   531  
   532  	wvalue = (scalar[0] << 1) & 0x3f
   533  	sel, sign = boothW5(uint(wvalue))
   534  
   535  	p256Select(t0.xyz[0:], precomp[0:], sel)
   536  	p256NegCond(t0.xyz[4:8], sign)
   537  	p256PointAddAsm(t1.xyz[:], p.xyz[:], t0.xyz[:])
   538  	p256MovCond(t1.xyz[0:12], t1.xyz[0:12], p.xyz[0:12], sel)
   539  	p256MovCond(p.xyz[0:12], t1.xyz[0:12], t0.xyz[0:12], zero)
   540  }