github.com/kdevb0x/go@v0.0.0-20180115030120-39687051e9e7/src/crypto/elliptic/p256_amd64.go

github.com/kdevb0x/go@v0.0.0-20180115030120-39687051e9e7/src/crypto/elliptic/p256_amd64.go (about)

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This file contains the Go wrapper for the constant-time, 64-bit assembly
     6  // implementation of P256. The optimizations performed here are described in
     7  // detail in:
     8  // S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
     9  //                          256-bit primes"
    10  // http://link.springer.com/article/10.1007%2Fs13389-014-0090-x
    11  // https://eprint.iacr.org/2013/816.pdf
    12  
    13  // +build amd64
    14  
    15  package elliptic
    16  
    17  import (
    18  	"math/big"
    19  	"sync"
    20  )
    21  
    22  type (
    23  	p256Curve struct {
    24  		*CurveParams
    25  	}
    26  
    27  	p256Point struct {
    28  		xyz [12]uint64
    29  	}
    30  )
    31  
    32  var (
    33  	p256            p256Curve
    34  	p256Precomputed *[37][64 * 8]uint64
    35  	precomputeOnce  sync.Once
    36  )
    37  
    38  func initP256() {
    39  	// See FIPS 186-3, section D.2.3
    40  	p256.CurveParams = &CurveParams{Name: "P-256"}
    41  	p256.P, _ = new(big.Int).SetString("115792089210356248762697446949407573530086143415290314195533631308867097853951", 10)
    42  	p256.N, _ = new(big.Int).SetString("115792089210356248762697446949407573529996955224135760342422259061068512044369", 10)
    43  	p256.B, _ = new(big.Int).SetString("5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b", 16)
    44  	p256.Gx, _ = new(big.Int).SetString("6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296", 16)
    45  	p256.Gy, _ = new(big.Int).SetString("4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5", 16)
    46  	p256.BitSize = 256
    47  }
    48  
    49  func (curve p256Curve) Params() *CurveParams {
    50  	return curve.CurveParams
    51  }
    52  
// Functions implemented in p256_asm_amd64.s
//
// The declarations below have no Go bodies. Callers in this file pass field
// elements and scalars as four-word slices and Jacobian points as twelve-word
// slices; NOTE(review): the assembly presumably reads only those leading
// words of each slice — confirm against the .s file.

// p256Mul sets res to the Montgomery multiplication of in1 and in2 modulo
// P256.
//go:noescape
func p256Mul(res, in1, in2 []uint64)

// p256Sqr sets res to the Montgomery square of in modulo P256.
//go:noescape
func p256Sqr(res, in []uint64)

// p256FromMont sets res to the Montgomery multiplication of in by 1, which
// takes a value out of the Montgomery domain.
//go:noescape
func p256FromMont(res, in []uint64)

// p256NegCond negates val in place iff cond == 1 (val <- -val).
//go:noescape
func p256NegCond(val []uint64, cond int)

// p256MovCond sets res to b if cond == 0, and to a otherwise.
//go:noescape
func p256MovCond(res, a, b []uint64, cond int)

// p256BigToLittle performs an endianness swap from the bytes of in into the
// words of res.
//go:noescape
func p256BigToLittle(res []uint64, in []byte)

// p256LittleToBig performs an endianness swap from the words of in into the
// bytes of res. Callers in this file pass a 32-byte res and a four-word in.
//go:noescape
func p256LittleToBig(res []byte, in []uint64)

// p256Select performs a constant time table access, loading the entry chosen
// by idx from table into point. It is used here with the twelve-word entries
// built by p256ScalarMult.
//go:noescape
func p256Select(point, table []uint64, idx int)

// p256SelectBase performs a constant time table access, loading the entry
// chosen by idx from table into point. It is used here with the eight-word
// affine entries of p256Precomputed.
//go:noescape
func p256SelectBase(point, table []uint64, idx int)

// p256OrdMul sets res to the Montgomery multiplication of in1 and in2 modulo
// Ord(G).
//go:noescape
func p256OrdMul(res, in1, in2 []uint64)

// p256OrdSqr sets res to the Montgomery square of in modulo Ord(G), repeated
// n times.
//go:noescape
func p256OrdSqr(res, in []uint64, n int)

// p256PointAddAffineAsm adds in1 and in2, where in2 is an affine point.
// If sign == 1 -> in2 = -in2
// If sel == 0 -> res = in1
// if zero == 0 -> res = in2
//go:noescape
func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)

// p256PointAddAsm sets res to in1 + in2. It returns one if the two input
// points were equal and zero otherwise. (Note that, due to the way that the
// equations work out, some representations of ∞ are considered equal to
// everything by this function.)
//go:noescape
func p256PointAddAsm(res, in1, in2 []uint64) int

// p256PointDoubleAsm sets res to the double of the point in.
//go:noescape
func p256PointDoubleAsm(res, in []uint64)
   112  
   113  func (curve p256Curve) Inverse(k *big.Int) *big.Int {
   114  	if k.Sign() < 0 {
   115  		// This should never happen.
   116  		k = new(big.Int).Neg(k)
   117  	}
   118  
   119  	if k.Cmp(p256.N) >= 0 {
   120  		// This should never happen.
   121  		k = new(big.Int).Mod(k, p256.N)
   122  	}
   123  
   124  	// table will store precomputed powers of x. The four words at index
   125  	// 4×i store x^(i+1).
   126  	var table [4 * 15]uint64
   127  
   128  	x := make([]uint64, 4)
   129  	fromBig(x[:], k)
   130  	// This code operates in the Montgomery domain where R = 2^256 mod n
   131  	// and n is the order of the scalar field. (See initP256 for the
   132  	// value.) Elements in the Montgomery domain take the form a×R and
   133  	// multiplication of x and y in the calculates (x × y × R^-1) mod n. RR
   134  	// is R×R mod n thus the Montgomery multiplication x and RR gives x×R,
   135  	// i.e. converts x into the Montgomery domain.
   136  	RR := []uint64{0x83244c95be79eea2, 0x4699799c49bd6fa6, 0x2845b2392b6bec59, 0x66e12d94f3d95620}
   137  	p256OrdMul(table[:4], x, RR)
   138  
   139  	// Prepare the table, no need in constant time access, because the
   140  	// power is not a secret. (Entry 0 is never used.)
   141  	for i := 2; i < 16; i += 2 {
   142  		p256OrdSqr(table[4*(i-1):], table[4*((i/2)-1):], 1)
   143  		p256OrdMul(table[4*i:], table[4*(i-1):], table[:4])
   144  	}
   145  
   146  	x[0] = table[4*14+0] // f
   147  	x[1] = table[4*14+1]
   148  	x[2] = table[4*14+2]
   149  	x[3] = table[4*14+3]
   150  
   151  	p256OrdSqr(x, x, 4)
   152  	p256OrdMul(x, x, table[4*14:4*14+4]) // ff
   153  	t := make([]uint64, 4, 4)
   154  	t[0] = x[0]
   155  	t[1] = x[1]
   156  	t[2] = x[2]
   157  	t[3] = x[3]
   158  
   159  	p256OrdSqr(x, x, 8)
   160  	p256OrdMul(x, x, t) // ffff
   161  	t[0] = x[0]
   162  	t[1] = x[1]
   163  	t[2] = x[2]
   164  	t[3] = x[3]
   165  
   166  	p256OrdSqr(x, x, 16)
   167  	p256OrdMul(x, x, t) // ffffffff
   168  	t[0] = x[0]
   169  	t[1] = x[1]
   170  	t[2] = x[2]
   171  	t[3] = x[3]
   172  
   173  	p256OrdSqr(x, x, 64) // ffffffff0000000000000000
   174  	p256OrdMul(x, x, t)  // ffffffff00000000ffffffff
   175  	p256OrdSqr(x, x, 32) // ffffffff00000000ffffffff00000000
   176  	p256OrdMul(x, x, t)  // ffffffff00000000ffffffffffffffff
   177  
   178  	// Remaining 32 windows
   179  	expLo := [32]byte{0xb, 0xc, 0xe, 0x6, 0xf, 0xa, 0xa, 0xd, 0xa, 0x7, 0x1, 0x7, 0x9, 0xe, 0x8, 0x4, 0xf, 0x3, 0xb, 0x9, 0xc, 0xa, 0xc, 0x2, 0xf, 0xc, 0x6, 0x3, 0x2, 0x5, 0x4, 0xf}
   180  	for i := 0; i < 32; i++ {
   181  		p256OrdSqr(x, x, 4)
   182  		p256OrdMul(x, x, table[4*(expLo[i]-1):])
   183  	}
   184  
   185  	// Multiplying by one in the Montgomery domain converts a Montgomery
   186  	// value out of the domain.
   187  	one := []uint64{1, 0, 0, 0}
   188  	p256OrdMul(x, x, one)
   189  
   190  	xOut := make([]byte, 32)
   191  	p256LittleToBig(xOut, x)
   192  	return new(big.Int).SetBytes(xOut)
   193  }
   194  
   195  // fromBig converts a *big.Int into a format used by this code.
   196  func fromBig(out []uint64, big *big.Int) {
   197  	for i := range out {
   198  		out[i] = 0
   199  	}
   200  
   201  	for i, v := range big.Bits() {
   202  		out[i] = uint64(v)
   203  	}
   204  }
   205  
   206  // p256GetScalar endian-swaps the big-endian scalar value from in and writes it
   207  // to out. If the scalar is equal or greater than the order of the group, it's
   208  // reduced modulo that order.
   209  func p256GetScalar(out []uint64, in []byte) {
   210  	n := new(big.Int).SetBytes(in)
   211  
   212  	if n.Cmp(p256.N) >= 0 {
   213  		n.Mod(n, p256.N)
   214  	}
   215  	fromBig(out, n)
   216  }
   217  
   218  // p256Mul operates in a Montgomery domain with R = 2^256 mod p, where p is the
   219  // underlying field of the curve. (See initP256 for the value.) Thus rr here is
   220  // R×R mod p. See comment in Inverse about how this is used.
   221  var rr = []uint64{0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd}
   222  
   223  func maybeReduceModP(in *big.Int) *big.Int {
   224  	if in.Cmp(p256.P) < 0 {
   225  		return in
   226  	}
   227  	return new(big.Int).Mod(in, p256.P)
   228  }
   229  
   230  func (curve p256Curve) CombinedMult(bigX, bigY *big.Int, baseScalar, scalar []byte) (x, y *big.Int) {
   231  	scalarReversed := make([]uint64, 4)
   232  	var r1, r2 p256Point
   233  	p256GetScalar(scalarReversed, baseScalar)
   234  	r1IsInfinity := scalarIsZero(scalarReversed)
   235  	r1.p256BaseMult(scalarReversed)
   236  
   237  	p256GetScalar(scalarReversed, scalar)
   238  	r2IsInfinity := scalarIsZero(scalarReversed)
   239  	fromBig(r2.xyz[0:4], maybeReduceModP(bigX))
   240  	fromBig(r2.xyz[4:8], maybeReduceModP(bigY))
   241  	p256Mul(r2.xyz[0:4], r2.xyz[0:4], rr[:])
   242  	p256Mul(r2.xyz[4:8], r2.xyz[4:8], rr[:])
   243  
   244  	// This sets r2's Z value to 1, in the Montgomery domain.
   245  	r2.xyz[8] = 0x0000000000000001
   246  	r2.xyz[9] = 0xffffffff00000000
   247  	r2.xyz[10] = 0xffffffffffffffff
   248  	r2.xyz[11] = 0x00000000fffffffe
   249  
   250  	r2.p256ScalarMult(scalarReversed)
   251  
   252  	var sum, double p256Point
   253  	pointsEqual := p256PointAddAsm(sum.xyz[:], r1.xyz[:], r2.xyz[:])
   254  	p256PointDoubleAsm(double.xyz[:], r1.xyz[:])
   255  	sum.CopyConditional(&double, pointsEqual)
   256  	sum.CopyConditional(&r1, r2IsInfinity)
   257  	sum.CopyConditional(&r2, r1IsInfinity)
   258  
   259  	return sum.p256PointToAffine()
   260  }
   261  
   262  func (curve p256Curve) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
   263  	scalarReversed := make([]uint64, 4)
   264  	p256GetScalar(scalarReversed, scalar)
   265  
   266  	var r p256Point
   267  	r.p256BaseMult(scalarReversed)
   268  	return r.p256PointToAffine()
   269  }
   270  
   271  func (curve p256Curve) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
   272  	scalarReversed := make([]uint64, 4)
   273  	p256GetScalar(scalarReversed, scalar)
   274  
   275  	var r p256Point
   276  	fromBig(r.xyz[0:4], maybeReduceModP(bigX))
   277  	fromBig(r.xyz[4:8], maybeReduceModP(bigY))
   278  	p256Mul(r.xyz[0:4], r.xyz[0:4], rr[:])
   279  	p256Mul(r.xyz[4:8], r.xyz[4:8], rr[:])
   280  	// This sets r2's Z value to 1, in the Montgomery domain.
   281  	r.xyz[8] = 0x0000000000000001
   282  	r.xyz[9] = 0xffffffff00000000
   283  	r.xyz[10] = 0xffffffffffffffff
   284  	r.xyz[11] = 0x00000000fffffffe
   285  
   286  	r.p256ScalarMult(scalarReversed)
   287  	return r.p256PointToAffine()
   288  }
   289  
   290  // uint64IsZero returns 1 if x is zero and zero otherwise.
   291  func uint64IsZero(x uint64) int {
   292  	x = ^x
   293  	x &= x >> 32
   294  	x &= x >> 16
   295  	x &= x >> 8
   296  	x &= x >> 4
   297  	x &= x >> 2
   298  	x &= x >> 1
   299  	return int(x & 1)
   300  }
   301  
   302  // scalarIsZero returns 1 if scalar represents the zero value, and zero
   303  // otherwise.
   304  func scalarIsZero(scalar []uint64) int {
   305  	return uint64IsZero(scalar[0] | scalar[1] | scalar[2] | scalar[3])
   306  }
   307  
   308  func (p *p256Point) p256PointToAffine() (x, y *big.Int) {
   309  	zInv := make([]uint64, 4)
   310  	zInvSq := make([]uint64, 4)
   311  	p256Inverse(zInv, p.xyz[8:12])
   312  	p256Sqr(zInvSq, zInv)
   313  	p256Mul(zInv, zInv, zInvSq)
   314  
   315  	p256Mul(zInvSq, p.xyz[0:4], zInvSq)
   316  	p256Mul(zInv, p.xyz[4:8], zInv)
   317  
   318  	p256FromMont(zInvSq, zInvSq)
   319  	p256FromMont(zInv, zInv)
   320  
   321  	xOut := make([]byte, 32)
   322  	yOut := make([]byte, 32)
   323  	p256LittleToBig(xOut, zInvSq)
   324  	p256LittleToBig(yOut, zInv)
   325  
   326  	return new(big.Int).SetBytes(xOut), new(big.Int).SetBytes(yOut)
   327  }
   328  
   329  // CopyConditional copies overwrites p with src if v == 1, and leaves p
   330  // unchanged if v == 0.
   331  func (p *p256Point) CopyConditional(src *p256Point, v int) {
   332  	pMask := uint64(v) - 1
   333  	srcMask := ^pMask
   334  
   335  	for i, n := range p.xyz {
   336  		p.xyz[i] = (n & pMask) | (src.xyz[i] & srcMask)
   337  	}
   338  }
   339  
   340  // p256Inverse sets out to in^-1 mod p.
   341  func p256Inverse(out, in []uint64) {
   342  	var stack [6 * 4]uint64
   343  	p2 := stack[4*0 : 4*0+4]
   344  	p4 := stack[4*1 : 4*1+4]
   345  	p8 := stack[4*2 : 4*2+4]
   346  	p16 := stack[4*3 : 4*3+4]
   347  	p32 := stack[4*4 : 4*4+4]
   348  
   349  	p256Sqr(out, in)
   350  	p256Mul(p2, out, in) // 3*p
   351  
   352  	p256Sqr(out, p2)
   353  	p256Sqr(out, out)
   354  	p256Mul(p4, out, p2) // f*p
   355  
   356  	p256Sqr(out, p4)
   357  	p256Sqr(out, out)
   358  	p256Sqr(out, out)
   359  	p256Sqr(out, out)
   360  	p256Mul(p8, out, p4) // ff*p
   361  
   362  	p256Sqr(out, p8)
   363  
   364  	for i := 0; i < 7; i++ {
   365  		p256Sqr(out, out)
   366  	}
   367  	p256Mul(p16, out, p8) // ffff*p
   368  
   369  	p256Sqr(out, p16)
   370  	for i := 0; i < 15; i++ {
   371  		p256Sqr(out, out)
   372  	}
   373  	p256Mul(p32, out, p16) // ffffffff*p
   374  
   375  	p256Sqr(out, p32)
   376  
   377  	for i := 0; i < 31; i++ {
   378  		p256Sqr(out, out)
   379  	}
   380  	p256Mul(out, out, in)
   381  
   382  	for i := 0; i < 32*4; i++ {
   383  		p256Sqr(out, out)
   384  	}
   385  	p256Mul(out, out, p32)
   386  
   387  	for i := 0; i < 32; i++ {
   388  		p256Sqr(out, out)
   389  	}
   390  	p256Mul(out, out, p32)
   391  
   392  	for i := 0; i < 16; i++ {
   393  		p256Sqr(out, out)
   394  	}
   395  	p256Mul(out, out, p16)
   396  
   397  	for i := 0; i < 8; i++ {
   398  		p256Sqr(out, out)
   399  	}
   400  	p256Mul(out, out, p8)
   401  
   402  	p256Sqr(out, out)
   403  	p256Sqr(out, out)
   404  	p256Sqr(out, out)
   405  	p256Sqr(out, out)
   406  	p256Mul(out, out, p4)
   407  
   408  	p256Sqr(out, out)
   409  	p256Sqr(out, out)
   410  	p256Mul(out, out, p2)
   411  
   412  	p256Sqr(out, out)
   413  	p256Sqr(out, out)
   414  	p256Mul(out, out, in)
   415  }
   416  
   417  func (p *p256Point) p256StorePoint(r *[16 * 4 * 3]uint64, index int) {
   418  	copy(r[index*12:], p.xyz[:])
   419  }
   420  
   421  func boothW5(in uint) (int, int) {
   422  	var s uint = ^((in >> 5) - 1)
   423  	var d uint = (1 << 6) - in - 1
   424  	d = (d & s) | (in & (^s))
   425  	d = (d >> 1) + (d & 1)
   426  	return int(d), int(s & 1)
   427  }
   428  
   429  func boothW7(in uint) (int, int) {
   430  	var s uint = ^((in >> 7) - 1)
   431  	var d uint = (1 << 8) - in - 1
   432  	d = (d & s) | (in & (^s))
   433  	d = (d >> 1) + (d & 1)
   434  	return int(d), int(s & 1)
   435  }
   436  
   437  func initTable() {
   438  	p256Precomputed = new([37][64 * 8]uint64)
   439  
   440  	basePoint := []uint64{
   441  		0x79e730d418a9143c, 0x75ba95fc5fedb601, 0x79fb732b77622510, 0x18905f76a53755c6,
   442  		0xddf25357ce95560a, 0x8b4ab8e4ba19e45c, 0xd2e88688dd21f325, 0x8571ff1825885d85,
   443  		0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe,
   444  	}
   445  	t1 := make([]uint64, 12)
   446  	t2 := make([]uint64, 12)
   447  	copy(t2, basePoint)
   448  
   449  	zInv := make([]uint64, 4)
   450  	zInvSq := make([]uint64, 4)
   451  	for j := 0; j < 64; j++ {
   452  		copy(t1, t2)
   453  		for i := 0; i < 37; i++ {
   454  			// The window size is 7 so we need to double 7 times.
   455  			if i != 0 {
   456  				for k := 0; k < 7; k++ {
   457  					p256PointDoubleAsm(t1, t1)
   458  				}
   459  			}
   460  			// Convert the point to affine form. (Its values are
   461  			// still in Montgomery form however.)
   462  			p256Inverse(zInv, t1[8:12])
   463  			p256Sqr(zInvSq, zInv)
   464  			p256Mul(zInv, zInv, zInvSq)
   465  
   466  			p256Mul(t1[:4], t1[:4], zInvSq)
   467  			p256Mul(t1[4:8], t1[4:8], zInv)
   468  
   469  			copy(t1[8:12], basePoint[8:12])
   470  			// Update the table entry
   471  			copy(p256Precomputed[i][j*8:], t1[:8])
   472  		}
   473  		if j == 0 {
   474  			p256PointDoubleAsm(t2, basePoint)
   475  		} else {
   476  			p256PointAddAsm(t2, t2, basePoint)
   477  		}
   478  	}
   479  }
   480  
   481  func (p *p256Point) p256BaseMult(scalar []uint64) {
   482  	precomputeOnce.Do(initTable)
   483  
   484  	wvalue := (scalar[0] << 1) & 0xff
   485  	sel, sign := boothW7(uint(wvalue))
   486  	p256SelectBase(p.xyz[0:8], p256Precomputed[0][0:], sel)
   487  	p256NegCond(p.xyz[4:8], sign)
   488  
   489  	// (This is one, in the Montgomery domain.)
   490  	p.xyz[8] = 0x0000000000000001
   491  	p.xyz[9] = 0xffffffff00000000
   492  	p.xyz[10] = 0xffffffffffffffff
   493  	p.xyz[11] = 0x00000000fffffffe
   494  
   495  	var t0 p256Point
   496  	// (This is one, in the Montgomery domain.)
   497  	t0.xyz[8] = 0x0000000000000001
   498  	t0.xyz[9] = 0xffffffff00000000
   499  	t0.xyz[10] = 0xffffffffffffffff
   500  	t0.xyz[11] = 0x00000000fffffffe
   501  
   502  	index := uint(6)
   503  	zero := sel
   504  
   505  	for i := 1; i < 37; i++ {
   506  		if index < 192 {
   507  			wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0xff
   508  		} else {
   509  			wvalue = (scalar[index/64] >> (index % 64)) & 0xff
   510  		}
   511  		index += 7
   512  		sel, sign = boothW7(uint(wvalue))
   513  		p256SelectBase(t0.xyz[0:8], p256Precomputed[i][0:], sel)
   514  		p256PointAddAffineAsm(p.xyz[0:12], p.xyz[0:12], t0.xyz[0:8], sign, sel, zero)
   515  		zero |= sel
   516  	}
   517  }
   518  
   519  func (p *p256Point) p256ScalarMult(scalar []uint64) {
   520  	// precomp is a table of precomputed points that stores powers of p
   521  	// from p^1 to p^16.
   522  	var precomp [16 * 4 * 3]uint64
   523  	var t0, t1, t2, t3 p256Point
   524  
   525  	// Prepare the table
   526  	p.p256StorePoint(&precomp, 0) // 1
   527  
   528  	p256PointDoubleAsm(t0.xyz[:], p.xyz[:])
   529  	p256PointDoubleAsm(t1.xyz[:], t0.xyz[:])
   530  	p256PointDoubleAsm(t2.xyz[:], t1.xyz[:])
   531  	p256PointDoubleAsm(t3.xyz[:], t2.xyz[:])
   532  	t0.p256StorePoint(&precomp, 1)  // 2
   533  	t1.p256StorePoint(&precomp, 3)  // 4
   534  	t2.p256StorePoint(&precomp, 7)  // 8
   535  	t3.p256StorePoint(&precomp, 15) // 16
   536  
   537  	p256PointAddAsm(t0.xyz[:], t0.xyz[:], p.xyz[:])
   538  	p256PointAddAsm(t1.xyz[:], t1.xyz[:], p.xyz[:])
   539  	p256PointAddAsm(t2.xyz[:], t2.xyz[:], p.xyz[:])
   540  	t0.p256StorePoint(&precomp, 2) // 3
   541  	t1.p256StorePoint(&precomp, 4) // 5
   542  	t2.p256StorePoint(&precomp, 8) // 9
   543  
   544  	p256PointDoubleAsm(t0.xyz[:], t0.xyz[:])
   545  	p256PointDoubleAsm(t1.xyz[:], t1.xyz[:])
   546  	t0.p256StorePoint(&precomp, 5) // 6
   547  	t1.p256StorePoint(&precomp, 9) // 10
   548  
   549  	p256PointAddAsm(t2.xyz[:], t0.xyz[:], p.xyz[:])
   550  	p256PointAddAsm(t1.xyz[:], t1.xyz[:], p.xyz[:])
   551  	t2.p256StorePoint(&precomp, 6)  // 7
   552  	t1.p256StorePoint(&precomp, 10) // 11
   553  
   554  	p256PointDoubleAsm(t0.xyz[:], t0.xyz[:])
   555  	p256PointDoubleAsm(t2.xyz[:], t2.xyz[:])
   556  	t0.p256StorePoint(&precomp, 11) // 12
   557  	t2.p256StorePoint(&precomp, 13) // 14
   558  
   559  	p256PointAddAsm(t0.xyz[:], t0.xyz[:], p.xyz[:])
   560  	p256PointAddAsm(t2.xyz[:], t2.xyz[:], p.xyz[:])
   561  	t0.p256StorePoint(&precomp, 12) // 13
   562  	t2.p256StorePoint(&precomp, 14) // 15
   563  
   564  	// Start scanning the window from top bit
   565  	index := uint(254)
   566  	var sel, sign int
   567  
   568  	wvalue := (scalar[index/64] >> (index % 64)) & 0x3f
   569  	sel, _ = boothW5(uint(wvalue))
   570  
   571  	p256Select(p.xyz[0:12], precomp[0:], sel)
   572  	zero := sel
   573  
   574  	for index > 4 {
   575  		index -= 5
   576  		p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   577  		p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   578  		p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   579  		p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   580  		p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   581  
   582  		if index < 192 {
   583  			wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x3f
   584  		} else {
   585  			wvalue = (scalar[index/64] >> (index % 64)) & 0x3f
   586  		}
   587  
   588  		sel, sign = boothW5(uint(wvalue))
   589  
   590  		p256Select(t0.xyz[0:], precomp[0:], sel)
   591  		p256NegCond(t0.xyz[4:8], sign)
   592  		p256PointAddAsm(t1.xyz[:], p.xyz[:], t0.xyz[:])
   593  		p256MovCond(t1.xyz[0:12], t1.xyz[0:12], p.xyz[0:12], sel)
   594  		p256MovCond(p.xyz[0:12], t1.xyz[0:12], t0.xyz[0:12], zero)
   595  		zero |= sel
   596  	}
   597  
   598  	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   599  	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   600  	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   601  	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   602  	p256PointDoubleAsm(p.xyz[:], p.xyz[:])
   603  
   604  	wvalue = (scalar[0] << 1) & 0x3f
   605  	sel, sign = boothW5(uint(wvalue))
   606  
   607  	p256Select(t0.xyz[0:], precomp[0:], sel)
   608  	p256NegCond(t0.xyz[4:8], sign)
   609  	p256PointAddAsm(t1.xyz[:], p.xyz[:], t0.xyz[:])
   610  	p256MovCond(t1.xyz[0:12], t1.xyz[0:12], p.xyz[0:12], sel)
   611  	p256MovCond(p.xyz[0:12], t1.xyz[0:12], t0.xyz[0:12], zero)
   612  }