github.com/d4l3k/go@v0.0.0-20151015000803-65fc379daeda/src/crypto/elliptic/p256.go (about)

     1  // Copyright 2013 The Go Authors.  All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package elliptic
     6  
     7  // This file contains a constant-time, 32-bit implementation of P256.
     8  
     9  import (
    10  	"math/big"
    11  )
    12  
    13  type p256Curve struct {
    14  	*CurveParams
    15  }
    16  
    17  var (
    18  	p256 p256Curve
    19  	// RInverse contains 1/R mod p - the inverse of the Montgomery constant
    20  	// (2**257).
    21  	p256RInverse *big.Int
    22  )
    23  
    24  func initP256() {
    25  	// See FIPS 186-3, section D.2.3
    26  	p256.CurveParams = &CurveParams{Name: "P-256"}
    27  	p256.P, _ = new(big.Int).SetString("115792089210356248762697446949407573530086143415290314195533631308867097853951", 10)
    28  	p256.N, _ = new(big.Int).SetString("115792089210356248762697446949407573529996955224135760342422259061068512044369", 10)
    29  	p256.B, _ = new(big.Int).SetString("5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b", 16)
    30  	p256.Gx, _ = new(big.Int).SetString("6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296", 16)
    31  	p256.Gy, _ = new(big.Int).SetString("4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5", 16)
    32  	p256.BitSize = 256
    33  
    34  	p256RInverse, _ = new(big.Int).SetString("7fffffff00000001fffffffe8000000100000000ffffffff0000000180000000", 16)
    35  }
    36  
    37  func (curve p256Curve) Params() *CurveParams {
    38  	return curve.CurveParams
    39  }
    40  
    41  // p256GetScalar endian-swaps the big-endian scalar value from in and writes it
    42  // to out. If the scalar is equal or greater than the order of the group, it's
    43  // reduced modulo that order.
    44  func p256GetScalar(out *[32]byte, in []byte) {
    45  	n := new(big.Int).SetBytes(in)
    46  	var scalarBytes []byte
    47  
    48  	if n.Cmp(p256.N) >= 0 {
    49  		n.Mod(n, p256.N)
    50  		scalarBytes = n.Bytes()
    51  	} else {
    52  		scalarBytes = in
    53  	}
    54  
    55  	for i, v := range scalarBytes {
    56  		out[len(scalarBytes)-(1+i)] = v
    57  	}
    58  }
    59  
    60  func (p256Curve) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
    61  	var scalarReversed [32]byte
    62  	p256GetScalar(&scalarReversed, scalar)
    63  
    64  	var x1, y1, z1 [p256Limbs]uint32
    65  	p256ScalarBaseMult(&x1, &y1, &z1, &scalarReversed)
    66  	return p256ToAffine(&x1, &y1, &z1)
    67  }
    68  
    69  func (p256Curve) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
    70  	var scalarReversed [32]byte
    71  	p256GetScalar(&scalarReversed, scalar)
    72  
    73  	var px, py, x1, y1, z1 [p256Limbs]uint32
    74  	p256FromBig(&px, bigX)
    75  	p256FromBig(&py, bigY)
    76  	p256ScalarMult(&x1, &y1, &z1, &px, &py, &scalarReversed)
    77  	return p256ToAffine(&x1, &y1, &z1)
    78  }
    79  
    80  // Field elements are represented as nine, unsigned 32-bit words.
    81  //
    82  // The value of an field element is:
    83  //   x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228)
    84  //
    85  // That is, each limb is alternately 29 or 28-bits wide in little-endian
    86  // order.
    87  //
    88  // This means that a field element hits 2**257, rather than 2**256 as we would
    89  // like. A 28, 29, ... pattern would cause us to hit 2**256, but that causes
    90  // problems when multiplying as terms end up one bit short of a limb which
    91  // would require much bit-shifting to correct.
    92  //
    93  // Finally, the values stored in a field element are in Montgomery form. So the
    94  // value |y| is stored as (y*R) mod p, where p is the P-256 prime and R is
    95  // 2**257.
    96  
    97  const (
    98  	p256Limbs    = 9
    99  	bottom29Bits = 0x1fffffff
   100  )
   101  
   102  var (
   103  	// p256One is the number 1 as a field element.
   104  	p256One  = [p256Limbs]uint32{2, 0, 0, 0xffff800, 0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff, 0}
   105  	p256Zero = [p256Limbs]uint32{0, 0, 0, 0, 0, 0, 0, 0, 0}
   106  	// p256P is the prime modulus as a field element.
   107  	p256P = [p256Limbs]uint32{0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff, 0, 0, 0x200000, 0xf000000, 0xfffffff}
   108  	// p2562P is the twice prime modulus as a field element.
   109  	p2562P = [p256Limbs]uint32{0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff, 0, 0, 0x400000, 0xe000000, 0x1fffffff}
   110  )
   111  
   112  // p256Precomputed contains precomputed values to aid the calculation of scalar
   113  // multiples of the base point, G. It's actually two, equal length, tables
   114  // concatenated.
   115  //
   116  // The first table contains (x,y) field element pairs for 16 multiples of the
   117  // base point, G.
   118  //
   119  //   Index  |  Index (binary) | Value
   120  //       0  |           0000  | 0G (all zeros, omitted)
   121  //       1  |           0001  | G
   122  //       2  |           0010  | 2**64G
   123  //       3  |           0011  | 2**64G + G
   124  //       4  |           0100  | 2**128G
   125  //       5  |           0101  | 2**128G + G
   126  //       6  |           0110  | 2**128G + 2**64G
   127  //       7  |           0111  | 2**128G + 2**64G + G
   128  //       8  |           1000  | 2**192G
   129  //       9  |           1001  | 2**192G + G
   130  //      10  |           1010  | 2**192G + 2**64G
   131  //      11  |           1011  | 2**192G + 2**64G + G
   132  //      12  |           1100  | 2**192G + 2**128G
   133  //      13  |           1101  | 2**192G + 2**128G + G
   134  //      14  |           1110  | 2**192G + 2**128G + 2**64G
   135  //      15  |           1111  | 2**192G + 2**128G + 2**64G + G
   136  //
   137  // The second table follows the same style, but the terms are 2**32G,
   138  // 2**96G, 2**160G, 2**224G.
   139  //
   140  // This is ~2KB of data.
   141  var p256Precomputed = [p256Limbs * 2 * 15 * 2]uint32{
   142  	0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7edc, 0xd4a6eab, 0x3120bee,
   143  	0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba21, 0x14b10bb, 0xae3fe3,
   144  	0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe49073, 0x3fa36cc, 0x5ebcd2c,
   145  	0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea12446, 0xe1ade1e, 0xec91f22,
   146  	0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109, 0xa267a00, 0xb57c050,
   147  	0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0x7d6dee7, 0x2976e4b,
   148  	0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a5a9, 0x843a649, 0xc3ab0fa,
   149  	0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11, 0x58c43df, 0xf423fc2,
   150  	0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db40f, 0x83e277d, 0xb0dd609,
   151  	0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5, 0xe10c9e, 0x33ab581,
   152  	0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f, 0x48764cd, 0x76dbcca,
   153  	0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b20, 0x4ba3173, 0xc168c33,
   154  	0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0, 0x65dd7ff, 0x3a1e4f6,
   155  	0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f077, 0xa6add89, 0x4894acd,
   156  	0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0,
   157  	0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c, 0xda0cf5b, 0x812e881,
   158  	0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51, 0xc22be3e, 0xe35e65a,
   159  	0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9, 0x1c5a839, 0x47a1e26,
   160  	0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c502, 0x2f32042, 0xa17769b,
   161  	0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a02, 0x3fc93, 0x5620023,
   162  	0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c, 0x407f75c, 0xbaab133,
   163  	0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea7, 0x3293ac0, 0xcdc98aa,
   164  	0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29,
   165  	0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72, 0x73e1c35, 0xee70fbc,
   166  	0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85, 0x27de188, 0x66f70b8,
   167  	0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae914, 0x2f3ec51, 0x3826b59,
   168  	0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x823d9d2, 0x8213f39,
   169  	0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4a, 0xf5ddc3d, 0x3786689,
   170  	0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a729, 0x4be3499, 0x52b23aa,
   171  	0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb048035, 0xe31de66, 0xc6ecaa3,
   172  	0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a7529, 0xcb7beb1, 0xb2a78a1,
   173  	0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff658, 0xe3d6511, 0xc7d76f,
   174  	0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72,
   175  	0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d32411, 0xb04a838, 0xd760d2d,
   176  	0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11e, 0x20bca9a, 0x66f496b,
   177  	0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a,
   178  	0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56ff, 0x65ef930, 0x21dc4a,
   179  	0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15f, 0x624e62e, 0xa90ae2f,
   180  	0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522b, 0xdc78583, 0x40eeabb,
   181  	0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef34, 0xae2a960, 0x91b8bdc,
   182  	0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0x2413c8e, 0x5425bf9,
   183  	0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633, 0x7c91952, 0xd806dce,
   184  	0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef73, 0x8956f34, 0xe4b5cf2,
   185  	0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7, 0x627b614, 0x7371cca,
   186  	0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc9, 0x9c19bf2, 0x5882229,
   187  	0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b3, 0xe85ff25, 0x408ef57,
   188  	0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113, 0xa4a1769, 0x11fbc6c,
   189  	0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b7, 0x4acbad9, 0x5efc5fa,
   190  	0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc, 0x7bf0fa9, 0x957651,
   191  	0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec,
   192  	0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c12d, 0xf20bd46, 0x1951fa7,
   193  	0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74, 0x99bb618, 0x2db944c,
   194  	0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e74779, 0x576138, 0x9587927,
   195  	0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782d, 0xfc72e0b, 0x701b298,
   196  	0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5d8, 0xf858d3a, 0x942eea8,
   197  	0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a1, 0x8395659, 0x52ed4e2,
   198  	0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c0, 0x6bdf55a, 0x4e4457d,
   199  	0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747b, 0x878558d, 0x7d29aa4,
   200  	0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d7, 0xa5bef68, 0xb7b30d8,
   201  	0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f51951, 0x9d0c177, 0x1c49a78,
   202  }
   203  
   204  // Field element operations:
   205  
   206  // nonZeroToAllOnes returns:
   207  //   0xffffffff for 0 < x <= 2**31
   208  //   0 for x == 0 or x > 2**31.
   209  func nonZeroToAllOnes(x uint32) uint32 {
   210  	return ((x - 1) >> 31) - 1
   211  }
   212  
   213  // p256ReduceCarry adds a multiple of p in order to cancel |carry|,
   214  // which is a term at 2**257.
   215  //
   216  // On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28.
   217  // On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29.
   218  func p256ReduceCarry(inout *[p256Limbs]uint32, carry uint32) {
   219  	carry_mask := nonZeroToAllOnes(carry)
   220  
   221  	inout[0] += carry << 1
   222  	inout[3] += 0x10000000 & carry_mask
   223  	// carry < 2**3 thus (carry << 11) < 2**14 and we added 2**28 in the
   224  	// previous line therefore this doesn't underflow.
   225  	inout[3] -= carry << 11
   226  	inout[4] += (0x20000000 - 1) & carry_mask
   227  	inout[5] += (0x10000000 - 1) & carry_mask
   228  	inout[6] += (0x20000000 - 1) & carry_mask
   229  	inout[6] -= carry << 22
   230  	// This may underflow if carry is non-zero but, if so, we'll fix it in the
   231  	// next line.
   232  	inout[7] -= 1 & carry_mask
   233  	inout[7] += carry << 25
   234  }
   235  
   236  // p256Sum sets out = in+in2.
   237  //
   238  // On entry, in[i]+in2[i] must not overflow a 32-bit word.
   239  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
   240  func p256Sum(out, in, in2 *[p256Limbs]uint32) {
   241  	carry := uint32(0)
   242  	for i := 0; ; i++ {
   243  		out[i] = in[i] + in2[i]
   244  		out[i] += carry
   245  		carry = out[i] >> 29
   246  		out[i] &= bottom29Bits
   247  
   248  		i++
   249  		if i == p256Limbs {
   250  			break
   251  		}
   252  
   253  		out[i] = in[i] + in2[i]
   254  		out[i] += carry
   255  		carry = out[i] >> 28
   256  		out[i] &= bottom28Bits
   257  	}
   258  
   259  	p256ReduceCarry(out, carry)
   260  }
   261  
   262  const (
   263  	two30m2    = 1<<30 - 1<<2
   264  	two30p13m2 = 1<<30 + 1<<13 - 1<<2
   265  	two31m2    = 1<<31 - 1<<2
   266  	two31p24m2 = 1<<31 + 1<<24 - 1<<2
   267  	two30m27m2 = 1<<30 - 1<<27 - 1<<2
   268  )
   269  
   270  // p256Zero31 is 0 mod p.
   271  var p256Zero31 = [p256Limbs]uint32{two31m3, two30m2, two31m2, two30p13m2, two31m2, two30m2, two31p24m2, two30m27m2, two31m2}
   272  
   273  // p256Diff sets out = in-in2.
   274  //
   275  // On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
   276  //           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
   277  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   278  func p256Diff(out, in, in2 *[p256Limbs]uint32) {
   279  	var carry uint32
   280  
   281  	for i := 0; ; i++ {
   282  		out[i] = in[i] - in2[i]
   283  		out[i] += p256Zero31[i]
   284  		out[i] += carry
   285  		carry = out[i] >> 29
   286  		out[i] &= bottom29Bits
   287  
   288  		i++
   289  		if i == p256Limbs {
   290  			break
   291  		}
   292  
   293  		out[i] = in[i] - in2[i]
   294  		out[i] += p256Zero31[i]
   295  		out[i] += carry
   296  		carry = out[i] >> 28
   297  		out[i] &= bottom28Bits
   298  	}
   299  
   300  	p256ReduceCarry(out, carry)
   301  }
   302  
   303  // p256ReduceDegree sets out = tmp/R mod p where tmp contains 64-bit words with
   304  // the same 29,28,... bit positions as an field element.
   305  //
   306  // The values in field elements are in Montgomery form: x*R mod p where R =
   307  // 2**257. Since we just multiplied two Montgomery values together, the result
   308  // is x*y*R*R mod p. We wish to divide by R in order for the result also to be
   309  // in Montgomery form.
   310  //
   311  // On entry: tmp[i] < 2**64
   312  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
   313  func p256ReduceDegree(out *[p256Limbs]uint32, tmp [17]uint64) {
   314  	// The following table may be helpful when reading this code:
   315  	//
   316  	// Limb number:   0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10...
   317  	// Width (bits):  29| 28| 29| 28| 29| 28| 29| 28| 29| 28| 29
   318  	// Start bit:     0 | 29| 57| 86|114|143|171|200|228|257|285
   319  	//   (odd phase): 0 | 28| 57| 85|114|142|171|199|228|256|285
   320  	var tmp2 [18]uint32
   321  	var carry, x, xMask uint32
   322  
   323  	// tmp contains 64-bit words with the same 29,28,29-bit positions as an
   324  	// field element. So the top of an element of tmp might overlap with
   325  	// another element two positions down. The following loop eliminates
   326  	// this overlap.
   327  	tmp2[0] = uint32(tmp[0]) & bottom29Bits
   328  
   329  	tmp2[1] = uint32(tmp[0]) >> 29
   330  	tmp2[1] |= (uint32(tmp[0]>>32) << 3) & bottom28Bits
   331  	tmp2[1] += uint32(tmp[1]) & bottom28Bits
   332  	carry = tmp2[1] >> 28
   333  	tmp2[1] &= bottom28Bits
   334  
   335  	for i := 2; i < 17; i++ {
   336  		tmp2[i] = (uint32(tmp[i-2] >> 32)) >> 25
   337  		tmp2[i] += (uint32(tmp[i-1])) >> 28
   338  		tmp2[i] += (uint32(tmp[i-1]>>32) << 4) & bottom29Bits
   339  		tmp2[i] += uint32(tmp[i]) & bottom29Bits
   340  		tmp2[i] += carry
   341  		carry = tmp2[i] >> 29
   342  		tmp2[i] &= bottom29Bits
   343  
   344  		i++
   345  		if i == 17 {
   346  			break
   347  		}
   348  		tmp2[i] = uint32(tmp[i-2]>>32) >> 25
   349  		tmp2[i] += uint32(tmp[i-1]) >> 29
   350  		tmp2[i] += ((uint32(tmp[i-1] >> 32)) << 3) & bottom28Bits
   351  		tmp2[i] += uint32(tmp[i]) & bottom28Bits
   352  		tmp2[i] += carry
   353  		carry = tmp2[i] >> 28
   354  		tmp2[i] &= bottom28Bits
   355  	}
   356  
   357  	tmp2[17] = uint32(tmp[15]>>32) >> 25
   358  	tmp2[17] += uint32(tmp[16]) >> 29
   359  	tmp2[17] += uint32(tmp[16]>>32) << 3
   360  	tmp2[17] += carry
   361  
   362  	// Montgomery elimination of terms:
   363  	//
   364  	// Since R is 2**257, we can divide by R with a bitwise shift if we can
   365  	// ensure that the right-most 257 bits are all zero. We can make that true
   366  	// by adding multiplies of p without affecting the value.
   367  	//
   368  	// So we eliminate limbs from right to left. Since the bottom 29 bits of p
   369  	// are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
   370  	// We can do that for 8 further limbs and then right shift to eliminate the
   371  	// extra factor of R.
   372  	for i := 0; ; i += 2 {
   373  		tmp2[i+1] += tmp2[i] >> 29
   374  		x = tmp2[i] & bottom29Bits
   375  		xMask = nonZeroToAllOnes(x)
   376  		tmp2[i] = 0
   377  
   378  		// The bounds calculations for this loop are tricky. Each iteration of
   379  		// the loop eliminates two words by adding values to words to their
   380  		// right.
   381  		//
   382  		// The following table contains the amounts added to each word (as an
   383  		// offset from the value of i at the top of the loop). The amounts are
   384  		// accounted for from the first and second half of the loop separately
   385  		// and are written as, for example, 28 to mean a value <2**28.
   386  		//
   387  		// Word:                   3   4   5   6   7   8   9   10
   388  		// Added in top half:     28  11      29  21  29  28
   389  		//                                        28  29
   390  		//                                            29
   391  		// Added in bottom half:      29  10      28  21  28   28
   392  		//                                            29
   393  		//
   394  		// The value that is currently offset 7 will be offset 5 for the next
   395  		// iteration and then offset 3 for the iteration after that. Therefore
   396  		// the total value added will be the values added at 7, 5 and 3.
   397  		//
   398  		// The following table accumulates these values. The sums at the bottom
   399  		// are written as, for example, 29+28, to mean a value < 2**29+2**28.
   400  		//
   401  		// Word:                   3   4   5   6   7   8   9  10  11  12  13
   402  		//                        28  11  10  29  21  29  28  28  28  28  28
   403  		//                            29  28  11  28  29  28  29  28  29  28
   404  		//                                    29  28  21  21  29  21  29  21
   405  		//                                        10  29  28  21  28  21  28
   406  		//                                        28  29  28  29  28  29  28
   407  		//                                            11  10  29  10  29  10
   408  		//                                            29  28  11  28  11
   409  		//                                                    29      29
   410  		//                        --------------------------------------------
   411  		//                                                30+ 31+ 30+ 31+ 30+
   412  		//                                                28+ 29+ 28+ 29+ 21+
   413  		//                                                21+ 28+ 21+ 28+ 10
   414  		//                                                10  21+ 10  21+
   415  		//                                                    11      11
   416  		//
   417  		// So the greatest amount is added to tmp2[10] and tmp2[12]. If
   418  		// tmp2[10/12] has an initial value of <2**29, then the maximum value
   419  		// will be < 2**31 + 2**30 + 2**28 + 2**21 + 2**11, which is < 2**32,
   420  		// as required.
   421  		tmp2[i+3] += (x << 10) & bottom28Bits
   422  		tmp2[i+4] += (x >> 18)
   423  
   424  		tmp2[i+6] += (x << 21) & bottom29Bits
   425  		tmp2[i+7] += x >> 8
   426  
   427  		// At position 200, which is the starting bit position for word 7, we
   428  		// have a factor of 0xf000000 = 2**28 - 2**24.
   429  		tmp2[i+7] += 0x10000000 & xMask
   430  		tmp2[i+8] += (x - 1) & xMask
   431  		tmp2[i+7] -= (x << 24) & bottom28Bits
   432  		tmp2[i+8] -= x >> 4
   433  
   434  		tmp2[i+8] += 0x20000000 & xMask
   435  		tmp2[i+8] -= x
   436  		tmp2[i+8] += (x << 28) & bottom29Bits
   437  		tmp2[i+9] += ((x >> 1) - 1) & xMask
   438  
   439  		if i+1 == p256Limbs {
   440  			break
   441  		}
   442  		tmp2[i+2] += tmp2[i+1] >> 28
   443  		x = tmp2[i+1] & bottom28Bits
   444  		xMask = nonZeroToAllOnes(x)
   445  		tmp2[i+1] = 0
   446  
   447  		tmp2[i+4] += (x << 11) & bottom29Bits
   448  		tmp2[i+5] += (x >> 18)
   449  
   450  		tmp2[i+7] += (x << 21) & bottom28Bits
   451  		tmp2[i+8] += x >> 7
   452  
   453  		// At position 199, which is the starting bit of the 8th word when
   454  		// dealing with a context starting on an odd word, we have a factor of
   455  		// 0x1e000000 = 2**29 - 2**25. Since we have not updated i, the 8th
   456  		// word from i+1 is i+8.
   457  		tmp2[i+8] += 0x20000000 & xMask
   458  		tmp2[i+9] += (x - 1) & xMask
   459  		tmp2[i+8] -= (x << 25) & bottom29Bits
   460  		tmp2[i+9] -= x >> 4
   461  
   462  		tmp2[i+9] += 0x10000000 & xMask
   463  		tmp2[i+9] -= x
   464  		tmp2[i+10] += (x - 1) & xMask
   465  	}
   466  
   467  	// We merge the right shift with a carry chain. The words above 2**257 have
   468  	// widths of 28,29,... which we need to correct when copying them down.
   469  	carry = 0
   470  	for i := 0; i < 8; i++ {
   471  		// The maximum value of tmp2[i + 9] occurs on the first iteration and
   472  		// is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is
   473  		// therefore safe.
   474  		out[i] = tmp2[i+9]
   475  		out[i] += carry
   476  		out[i] += (tmp2[i+10] << 28) & bottom29Bits
   477  		carry = out[i] >> 29
   478  		out[i] &= bottom29Bits
   479  
   480  		i++
   481  		out[i] = tmp2[i+9] >> 1
   482  		out[i] += carry
   483  		carry = out[i] >> 28
   484  		out[i] &= bottom28Bits
   485  	}
   486  
   487  	out[8] = tmp2[17]
   488  	out[8] += carry
   489  	carry = out[8] >> 29
   490  	out[8] &= bottom29Bits
   491  
   492  	p256ReduceCarry(out, carry)
   493  }
   494  
   495  // p256Square sets out=in*in.
   496  //
   497  // On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29.
   498  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   499  func p256Square(out, in *[p256Limbs]uint32) {
   500  	var tmp [17]uint64
   501  
   502  	tmp[0] = uint64(in[0]) * uint64(in[0])
   503  	tmp[1] = uint64(in[0]) * (uint64(in[1]) << 1)
   504  	tmp[2] = uint64(in[0])*(uint64(in[2])<<1) +
   505  		uint64(in[1])*(uint64(in[1])<<1)
   506  	tmp[3] = uint64(in[0])*(uint64(in[3])<<1) +
   507  		uint64(in[1])*(uint64(in[2])<<1)
   508  	tmp[4] = uint64(in[0])*(uint64(in[4])<<1) +
   509  		uint64(in[1])*(uint64(in[3])<<2) +
   510  		uint64(in[2])*uint64(in[2])
   511  	tmp[5] = uint64(in[0])*(uint64(in[5])<<1) +
   512  		uint64(in[1])*(uint64(in[4])<<1) +
   513  		uint64(in[2])*(uint64(in[3])<<1)
   514  	tmp[6] = uint64(in[0])*(uint64(in[6])<<1) +
   515  		uint64(in[1])*(uint64(in[5])<<2) +
   516  		uint64(in[2])*(uint64(in[4])<<1) +
   517  		uint64(in[3])*(uint64(in[3])<<1)
   518  	tmp[7] = uint64(in[0])*(uint64(in[7])<<1) +
   519  		uint64(in[1])*(uint64(in[6])<<1) +
   520  		uint64(in[2])*(uint64(in[5])<<1) +
   521  		uint64(in[3])*(uint64(in[4])<<1)
   522  	// tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60,
   523  	// which is < 2**64 as required.
   524  	tmp[8] = uint64(in[0])*(uint64(in[8])<<1) +
   525  		uint64(in[1])*(uint64(in[7])<<2) +
   526  		uint64(in[2])*(uint64(in[6])<<1) +
   527  		uint64(in[3])*(uint64(in[5])<<2) +
   528  		uint64(in[4])*uint64(in[4])
   529  	tmp[9] = uint64(in[1])*(uint64(in[8])<<1) +
   530  		uint64(in[2])*(uint64(in[7])<<1) +
   531  		uint64(in[3])*(uint64(in[6])<<1) +
   532  		uint64(in[4])*(uint64(in[5])<<1)
   533  	tmp[10] = uint64(in[2])*(uint64(in[8])<<1) +
   534  		uint64(in[3])*(uint64(in[7])<<2) +
   535  		uint64(in[4])*(uint64(in[6])<<1) +
   536  		uint64(in[5])*(uint64(in[5])<<1)
   537  	tmp[11] = uint64(in[3])*(uint64(in[8])<<1) +
   538  		uint64(in[4])*(uint64(in[7])<<1) +
   539  		uint64(in[5])*(uint64(in[6])<<1)
   540  	tmp[12] = uint64(in[4])*(uint64(in[8])<<1) +
   541  		uint64(in[5])*(uint64(in[7])<<2) +
   542  		uint64(in[6])*uint64(in[6])
   543  	tmp[13] = uint64(in[5])*(uint64(in[8])<<1) +
   544  		uint64(in[6])*(uint64(in[7])<<1)
   545  	tmp[14] = uint64(in[6])*(uint64(in[8])<<1) +
   546  		uint64(in[7])*(uint64(in[7])<<1)
   547  	tmp[15] = uint64(in[7]) * (uint64(in[8]) << 1)
   548  	tmp[16] = uint64(in[8]) * uint64(in[8])
   549  
   550  	p256ReduceDegree(out, tmp)
   551  }
   552  
   553  // p256Mul sets out=in*in2.
   554  //
   555  // On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
   556  //           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
   557  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   558  func p256Mul(out, in, in2 *[p256Limbs]uint32) {
   559  	var tmp [17]uint64
   560  
   561  	tmp[0] = uint64(in[0]) * uint64(in2[0])
   562  	tmp[1] = uint64(in[0])*(uint64(in2[1])<<0) +
   563  		uint64(in[1])*(uint64(in2[0])<<0)
   564  	tmp[2] = uint64(in[0])*(uint64(in2[2])<<0) +
   565  		uint64(in[1])*(uint64(in2[1])<<1) +
   566  		uint64(in[2])*(uint64(in2[0])<<0)
   567  	tmp[3] = uint64(in[0])*(uint64(in2[3])<<0) +
   568  		uint64(in[1])*(uint64(in2[2])<<0) +
   569  		uint64(in[2])*(uint64(in2[1])<<0) +
   570  		uint64(in[3])*(uint64(in2[0])<<0)
   571  	tmp[4] = uint64(in[0])*(uint64(in2[4])<<0) +
   572  		uint64(in[1])*(uint64(in2[3])<<1) +
   573  		uint64(in[2])*(uint64(in2[2])<<0) +
   574  		uint64(in[3])*(uint64(in2[1])<<1) +
   575  		uint64(in[4])*(uint64(in2[0])<<0)
   576  	tmp[5] = uint64(in[0])*(uint64(in2[5])<<0) +
   577  		uint64(in[1])*(uint64(in2[4])<<0) +
   578  		uint64(in[2])*(uint64(in2[3])<<0) +
   579  		uint64(in[3])*(uint64(in2[2])<<0) +
   580  		uint64(in[4])*(uint64(in2[1])<<0) +
   581  		uint64(in[5])*(uint64(in2[0])<<0)
   582  	tmp[6] = uint64(in[0])*(uint64(in2[6])<<0) +
   583  		uint64(in[1])*(uint64(in2[5])<<1) +
   584  		uint64(in[2])*(uint64(in2[4])<<0) +
   585  		uint64(in[3])*(uint64(in2[3])<<1) +
   586  		uint64(in[4])*(uint64(in2[2])<<0) +
   587  		uint64(in[5])*(uint64(in2[1])<<1) +
   588  		uint64(in[6])*(uint64(in2[0])<<0)
   589  	tmp[7] = uint64(in[0])*(uint64(in2[7])<<0) +
   590  		uint64(in[1])*(uint64(in2[6])<<0) +
   591  		uint64(in[2])*(uint64(in2[5])<<0) +
   592  		uint64(in[3])*(uint64(in2[4])<<0) +
   593  		uint64(in[4])*(uint64(in2[3])<<0) +
   594  		uint64(in[5])*(uint64(in2[2])<<0) +
   595  		uint64(in[6])*(uint64(in2[1])<<0) +
   596  		uint64(in[7])*(uint64(in2[0])<<0)
   597  	// tmp[8] has the greatest value but doesn't overflow. See logic in
   598  	// p256Square.
   599  	tmp[8] = uint64(in[0])*(uint64(in2[8])<<0) +
   600  		uint64(in[1])*(uint64(in2[7])<<1) +
   601  		uint64(in[2])*(uint64(in2[6])<<0) +
   602  		uint64(in[3])*(uint64(in2[5])<<1) +
   603  		uint64(in[4])*(uint64(in2[4])<<0) +
   604  		uint64(in[5])*(uint64(in2[3])<<1) +
   605  		uint64(in[6])*(uint64(in2[2])<<0) +
   606  		uint64(in[7])*(uint64(in2[1])<<1) +
   607  		uint64(in[8])*(uint64(in2[0])<<0)
   608  	tmp[9] = uint64(in[1])*(uint64(in2[8])<<0) +
   609  		uint64(in[2])*(uint64(in2[7])<<0) +
   610  		uint64(in[3])*(uint64(in2[6])<<0) +
   611  		uint64(in[4])*(uint64(in2[5])<<0) +
   612  		uint64(in[5])*(uint64(in2[4])<<0) +
   613  		uint64(in[6])*(uint64(in2[3])<<0) +
   614  		uint64(in[7])*(uint64(in2[2])<<0) +
   615  		uint64(in[8])*(uint64(in2[1])<<0)
   616  	tmp[10] = uint64(in[2])*(uint64(in2[8])<<0) +
   617  		uint64(in[3])*(uint64(in2[7])<<1) +
   618  		uint64(in[4])*(uint64(in2[6])<<0) +
   619  		uint64(in[5])*(uint64(in2[5])<<1) +
   620  		uint64(in[6])*(uint64(in2[4])<<0) +
   621  		uint64(in[7])*(uint64(in2[3])<<1) +
   622  		uint64(in[8])*(uint64(in2[2])<<0)
   623  	tmp[11] = uint64(in[3])*(uint64(in2[8])<<0) +
   624  		uint64(in[4])*(uint64(in2[7])<<0) +
   625  		uint64(in[5])*(uint64(in2[6])<<0) +
   626  		uint64(in[6])*(uint64(in2[5])<<0) +
   627  		uint64(in[7])*(uint64(in2[4])<<0) +
   628  		uint64(in[8])*(uint64(in2[3])<<0)
   629  	tmp[12] = uint64(in[4])*(uint64(in2[8])<<0) +
   630  		uint64(in[5])*(uint64(in2[7])<<1) +
   631  		uint64(in[6])*(uint64(in2[6])<<0) +
   632  		uint64(in[7])*(uint64(in2[5])<<1) +
   633  		uint64(in[8])*(uint64(in2[4])<<0)
   634  	tmp[13] = uint64(in[5])*(uint64(in2[8])<<0) +
   635  		uint64(in[6])*(uint64(in2[7])<<0) +
   636  		uint64(in[7])*(uint64(in2[6])<<0) +
   637  		uint64(in[8])*(uint64(in2[5])<<0)
   638  	tmp[14] = uint64(in[6])*(uint64(in2[8])<<0) +
   639  		uint64(in[7])*(uint64(in2[7])<<1) +
   640  		uint64(in[8])*(uint64(in2[6])<<0)
   641  	tmp[15] = uint64(in[7])*(uint64(in2[8])<<0) +
   642  		uint64(in[8])*(uint64(in2[7])<<0)
   643  	tmp[16] = uint64(in[8]) * (uint64(in2[8]) << 0)
   644  
   645  	p256ReduceDegree(out, tmp)
   646  }
   647  
   648  func p256Assign(out, in *[p256Limbs]uint32) {
   649  	*out = *in
   650  }
   651  
   652  // p256Invert calculates |out| = |in|^{-1}
   653  //
   654  // Based on Fermat's Little Theorem:
   655  //   a^p = a (mod p)
   656  //   a^{p-1} = 1 (mod p)
   657  //   a^{p-2} = a^{-1} (mod p)
   658  func p256Invert(out, in *[p256Limbs]uint32) {
   659  	var ftmp, ftmp2 [p256Limbs]uint32
   660  
   661  	// each e_I will hold |in|^{2^I - 1}
   662  	var e2, e4, e8, e16, e32, e64 [p256Limbs]uint32
   663  
   664  	p256Square(&ftmp, in)     // 2^1
   665  	p256Mul(&ftmp, in, &ftmp) // 2^2 - 2^0
   666  	p256Assign(&e2, &ftmp)
   667  	p256Square(&ftmp, &ftmp)   // 2^3 - 2^1
   668  	p256Square(&ftmp, &ftmp)   // 2^4 - 2^2
   669  	p256Mul(&ftmp, &ftmp, &e2) // 2^4 - 2^0
   670  	p256Assign(&e4, &ftmp)
   671  	p256Square(&ftmp, &ftmp)   // 2^5 - 2^1
   672  	p256Square(&ftmp, &ftmp)   // 2^6 - 2^2
   673  	p256Square(&ftmp, &ftmp)   // 2^7 - 2^3
   674  	p256Square(&ftmp, &ftmp)   // 2^8 - 2^4
   675  	p256Mul(&ftmp, &ftmp, &e4) // 2^8 - 2^0
   676  	p256Assign(&e8, &ftmp)
   677  	for i := 0; i < 8; i++ {
   678  		p256Square(&ftmp, &ftmp)
   679  	} // 2^16 - 2^8
   680  	p256Mul(&ftmp, &ftmp, &e8) // 2^16 - 2^0
   681  	p256Assign(&e16, &ftmp)
   682  	for i := 0; i < 16; i++ {
   683  		p256Square(&ftmp, &ftmp)
   684  	} // 2^32 - 2^16
   685  	p256Mul(&ftmp, &ftmp, &e16) // 2^32 - 2^0
   686  	p256Assign(&e32, &ftmp)
   687  	for i := 0; i < 32; i++ {
   688  		p256Square(&ftmp, &ftmp)
   689  	} // 2^64 - 2^32
   690  	p256Assign(&e64, &ftmp)
   691  	p256Mul(&ftmp, &ftmp, in) // 2^64 - 2^32 + 2^0
   692  	for i := 0; i < 192; i++ {
   693  		p256Square(&ftmp, &ftmp)
   694  	} // 2^256 - 2^224 + 2^192
   695  
   696  	p256Mul(&ftmp2, &e64, &e32) // 2^64 - 2^0
   697  	for i := 0; i < 16; i++ {
   698  		p256Square(&ftmp2, &ftmp2)
   699  	} // 2^80 - 2^16
   700  	p256Mul(&ftmp2, &ftmp2, &e16) // 2^80 - 2^0
   701  	for i := 0; i < 8; i++ {
   702  		p256Square(&ftmp2, &ftmp2)
   703  	} // 2^88 - 2^8
   704  	p256Mul(&ftmp2, &ftmp2, &e8) // 2^88 - 2^0
   705  	for i := 0; i < 4; i++ {
   706  		p256Square(&ftmp2, &ftmp2)
   707  	} // 2^92 - 2^4
   708  	p256Mul(&ftmp2, &ftmp2, &e4) // 2^92 - 2^0
   709  	p256Square(&ftmp2, &ftmp2)   // 2^93 - 2^1
   710  	p256Square(&ftmp2, &ftmp2)   // 2^94 - 2^2
   711  	p256Mul(&ftmp2, &ftmp2, &e2) // 2^94 - 2^0
   712  	p256Square(&ftmp2, &ftmp2)   // 2^95 - 2^1
   713  	p256Square(&ftmp2, &ftmp2)   // 2^96 - 2^2
   714  	p256Mul(&ftmp2, &ftmp2, in)  // 2^96 - 3
   715  
   716  	p256Mul(out, &ftmp2, &ftmp) // 2^256 - 2^224 + 2^192 + 2^96 - 3
   717  }
   718  
   719  // p256Scalar3 sets out=3*out.
   720  //
   721  // On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   722  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   723  func p256Scalar3(out *[p256Limbs]uint32) {
   724  	var carry uint32
   725  
   726  	for i := 0; ; i++ {
   727  		out[i] *= 3
   728  		out[i] += carry
   729  		carry = out[i] >> 29
   730  		out[i] &= bottom29Bits
   731  
   732  		i++
   733  		if i == p256Limbs {
   734  			break
   735  		}
   736  
   737  		out[i] *= 3
   738  		out[i] += carry
   739  		carry = out[i] >> 28
   740  		out[i] &= bottom28Bits
   741  	}
   742  
   743  	p256ReduceCarry(out, carry)
   744  }
   745  
   746  // p256Scalar4 sets out=4*out.
   747  //
   748  // On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   749  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   750  func p256Scalar4(out *[p256Limbs]uint32) {
   751  	var carry, nextCarry uint32
   752  
   753  	for i := 0; ; i++ {
   754  		nextCarry = out[i] >> 27
   755  		out[i] <<= 2
   756  		out[i] &= bottom29Bits
   757  		out[i] += carry
   758  		carry = nextCarry + (out[i] >> 29)
   759  		out[i] &= bottom29Bits
   760  
   761  		i++
   762  		if i == p256Limbs {
   763  			break
   764  		}
   765  		nextCarry = out[i] >> 26
   766  		out[i] <<= 2
   767  		out[i] &= bottom28Bits
   768  		out[i] += carry
   769  		carry = nextCarry + (out[i] >> 28)
   770  		out[i] &= bottom28Bits
   771  	}
   772  
   773  	p256ReduceCarry(out, carry)
   774  }
   775  
   776  // p256Scalar8 sets out=8*out.
   777  //
   778  // On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   779  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   780  func p256Scalar8(out *[p256Limbs]uint32) {
   781  	var carry, nextCarry uint32
   782  
   783  	for i := 0; ; i++ {
   784  		nextCarry = out[i] >> 26
   785  		out[i] <<= 3
   786  		out[i] &= bottom29Bits
   787  		out[i] += carry
   788  		carry = nextCarry + (out[i] >> 29)
   789  		out[i] &= bottom29Bits
   790  
   791  		i++
   792  		if i == p256Limbs {
   793  			break
   794  		}
   795  		nextCarry = out[i] >> 25
   796  		out[i] <<= 3
   797  		out[i] &= bottom28Bits
   798  		out[i] += carry
   799  		carry = nextCarry + (out[i] >> 28)
   800  		out[i] &= bottom28Bits
   801  	}
   802  
   803  	p256ReduceCarry(out, carry)
   804  }
   805  
   806  // Group operations:
   807  //
   808  // Elements of the elliptic curve group are represented in Jacobian
   809  // coordinates: (x, y, z). An affine point (x', y') is x'=x/z**2, y'=y/z**3 in
   810  // Jacobian form.
   811  
   812  // p256PointDouble sets {xOut,yOut,zOut} = 2*{x,y,z}.
   813  //
   814  // See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
   815  func p256PointDouble(xOut, yOut, zOut, x, y, z *[p256Limbs]uint32) {
   816  	var delta, gamma, alpha, beta, tmp, tmp2 [p256Limbs]uint32
   817  
   818  	p256Square(&delta, z)
   819  	p256Square(&gamma, y)
   820  	p256Mul(&beta, x, &gamma)
   821  
   822  	p256Sum(&tmp, x, &delta)
   823  	p256Diff(&tmp2, x, &delta)
   824  	p256Mul(&alpha, &tmp, &tmp2)
   825  	p256Scalar3(&alpha)
   826  
   827  	p256Sum(&tmp, y, z)
   828  	p256Square(&tmp, &tmp)
   829  	p256Diff(&tmp, &tmp, &gamma)
   830  	p256Diff(zOut, &tmp, &delta)
   831  
   832  	p256Scalar4(&beta)
   833  	p256Square(xOut, &alpha)
   834  	p256Diff(xOut, xOut, &beta)
   835  	p256Diff(xOut, xOut, &beta)
   836  
   837  	p256Diff(&tmp, &beta, xOut)
   838  	p256Mul(&tmp, &alpha, &tmp)
   839  	p256Square(&tmp2, &gamma)
   840  	p256Scalar8(&tmp2)
   841  	p256Diff(yOut, &tmp, &tmp2)
   842  }
   843  
   844  // p256PointAddMixed sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,1}.
   845  // (i.e. the second point is affine.)
   846  //
   847  // See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
   848  //
   849  // Note that this function does not handle P+P, infinity+P nor P+infinity
   850  // correctly.
   851  func p256PointAddMixed(xOut, yOut, zOut, x1, y1, z1, x2, y2 *[p256Limbs]uint32) {
   852  	var z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32
   853  
   854  	p256Square(&z1z1, z1)
   855  	p256Sum(&tmp, z1, z1)
   856  
   857  	p256Mul(&u2, x2, &z1z1)
   858  	p256Mul(&z1z1z1, z1, &z1z1)
   859  	p256Mul(&s2, y2, &z1z1z1)
   860  	p256Diff(&h, &u2, x1)
   861  	p256Sum(&i, &h, &h)
   862  	p256Square(&i, &i)
   863  	p256Mul(&j, &h, &i)
   864  	p256Diff(&r, &s2, y1)
   865  	p256Sum(&r, &r, &r)
   866  	p256Mul(&v, x1, &i)
   867  
   868  	p256Mul(zOut, &tmp, &h)
   869  	p256Square(&rr, &r)
   870  	p256Diff(xOut, &rr, &j)
   871  	p256Diff(xOut, xOut, &v)
   872  	p256Diff(xOut, xOut, &v)
   873  
   874  	p256Diff(&tmp, &v, xOut)
   875  	p256Mul(yOut, &tmp, &r)
   876  	p256Mul(&tmp, y1, &j)
   877  	p256Diff(yOut, yOut, &tmp)
   878  	p256Diff(yOut, yOut, &tmp)
   879  }
   880  
   881  // p256PointAdd sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,z2}.
   882  //
   883  // See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
   884  //
   885  // Note that this function does not handle P+P, infinity+P nor P+infinity
   886  // correctly.
   887  func p256PointAdd(xOut, yOut, zOut, x1, y1, z1, x2, y2, z2 *[p256Limbs]uint32) {
   888  	var z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32
   889  
   890  	p256Square(&z1z1, z1)
   891  	p256Square(&z2z2, z2)
   892  	p256Mul(&u1, x1, &z2z2)
   893  
   894  	p256Sum(&tmp, z1, z2)
   895  	p256Square(&tmp, &tmp)
   896  	p256Diff(&tmp, &tmp, &z1z1)
   897  	p256Diff(&tmp, &tmp, &z2z2)
   898  
   899  	p256Mul(&z2z2z2, z2, &z2z2)
   900  	p256Mul(&s1, y1, &z2z2z2)
   901  
   902  	p256Mul(&u2, x2, &z1z1)
   903  	p256Mul(&z1z1z1, z1, &z1z1)
   904  	p256Mul(&s2, y2, &z1z1z1)
   905  	p256Diff(&h, &u2, &u1)
   906  	p256Sum(&i, &h, &h)
   907  	p256Square(&i, &i)
   908  	p256Mul(&j, &h, &i)
   909  	p256Diff(&r, &s2, &s1)
   910  	p256Sum(&r, &r, &r)
   911  	p256Mul(&v, &u1, &i)
   912  
   913  	p256Mul(zOut, &tmp, &h)
   914  	p256Square(&rr, &r)
   915  	p256Diff(xOut, &rr, &j)
   916  	p256Diff(xOut, xOut, &v)
   917  	p256Diff(xOut, xOut, &v)
   918  
   919  	p256Diff(&tmp, &v, xOut)
   920  	p256Mul(yOut, &tmp, &r)
   921  	p256Mul(&tmp, &s1, &j)
   922  	p256Diff(yOut, yOut, &tmp)
   923  	p256Diff(yOut, yOut, &tmp)
   924  }
   925  
   926  // p256CopyConditional sets out=in if mask = 0xffffffff in constant time.
   927  //
   928  // On entry: mask is either 0 or 0xffffffff.
   929  func p256CopyConditional(out, in *[p256Limbs]uint32, mask uint32) {
   930  	for i := 0; i < p256Limbs; i++ {
   931  		tmp := mask & (in[i] ^ out[i])
   932  		out[i] ^= tmp
   933  	}
   934  }
   935  
   936  // p256SelectAffinePoint sets {out_x,out_y} to the index'th entry of table.
   937  // On entry: index < 16, table[0] must be zero.
   938  func p256SelectAffinePoint(xOut, yOut *[p256Limbs]uint32, table []uint32, index uint32) {
   939  	for i := range xOut {
   940  		xOut[i] = 0
   941  	}
   942  	for i := range yOut {
   943  		yOut[i] = 0
   944  	}
   945  
   946  	for i := uint32(1); i < 16; i++ {
   947  		mask := i ^ index
   948  		mask |= mask >> 2
   949  		mask |= mask >> 1
   950  		mask &= 1
   951  		mask--
   952  		for j := range xOut {
   953  			xOut[j] |= table[0] & mask
   954  			table = table[1:]
   955  		}
   956  		for j := range yOut {
   957  			yOut[j] |= table[0] & mask
   958  			table = table[1:]
   959  		}
   960  	}
   961  }
   962  
   963  // p256SelectJacobianPoint sets {out_x,out_y,out_z} to the index'th entry of
   964  // table.
   965  // On entry: index < 16, table[0] must be zero.
   966  func p256SelectJacobianPoint(xOut, yOut, zOut *[p256Limbs]uint32, table *[16][3][p256Limbs]uint32, index uint32) {
   967  	for i := range xOut {
   968  		xOut[i] = 0
   969  	}
   970  	for i := range yOut {
   971  		yOut[i] = 0
   972  	}
   973  	for i := range zOut {
   974  		zOut[i] = 0
   975  	}
   976  
   977  	// The implicit value at index 0 is all zero. We don't need to perform that
   978  	// iteration of the loop because we already set out_* to zero.
   979  	for i := uint32(1); i < 16; i++ {
   980  		mask := i ^ index
   981  		mask |= mask >> 2
   982  		mask |= mask >> 1
   983  		mask &= 1
   984  		mask--
   985  		for j := range xOut {
   986  			xOut[j] |= table[i][0][j] & mask
   987  		}
   988  		for j := range yOut {
   989  			yOut[j] |= table[i][1][j] & mask
   990  		}
   991  		for j := range zOut {
   992  			zOut[j] |= table[i][2][j] & mask
   993  		}
   994  	}
   995  }
   996  
   997  // p256GetBit returns the bit'th bit of scalar.
   998  func p256GetBit(scalar *[32]uint8, bit uint) uint32 {
   999  	return uint32(((scalar[bit>>3]) >> (bit & 7)) & 1)
  1000  }
  1001  
  1002  // p256ScalarBaseMult sets {xOut,yOut,zOut} = scalar*G where scalar is a
  1003  // little-endian number. Note that the value of scalar must be less than the
  1004  // order of the group.
  1005  func p256ScalarBaseMult(xOut, yOut, zOut *[p256Limbs]uint32, scalar *[32]uint8) {
  1006  	nIsInfinityMask := ^uint32(0)
  1007  	var pIsNoninfiniteMask, mask, tableOffset uint32
  1008  	var px, py, tx, ty, tz [p256Limbs]uint32
  1009  
  1010  	for i := range xOut {
  1011  		xOut[i] = 0
  1012  	}
  1013  	for i := range yOut {
  1014  		yOut[i] = 0
  1015  	}
  1016  	for i := range zOut {
  1017  		zOut[i] = 0
  1018  	}
  1019  
  1020  	// The loop adds bits at positions 0, 64, 128 and 192, followed by
  1021  	// positions 32,96,160 and 224 and does this 32 times.
  1022  	for i := uint(0); i < 32; i++ {
  1023  		if i != 0 {
  1024  			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1025  		}
  1026  		tableOffset = 0
  1027  		for j := uint(0); j <= 32; j += 32 {
  1028  			bit0 := p256GetBit(scalar, 31-i+j)
  1029  			bit1 := p256GetBit(scalar, 95-i+j)
  1030  			bit2 := p256GetBit(scalar, 159-i+j)
  1031  			bit3 := p256GetBit(scalar, 223-i+j)
  1032  			index := bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3)
  1033  
  1034  			p256SelectAffinePoint(&px, &py, p256Precomputed[tableOffset:], index)
  1035  			tableOffset += 30 * p256Limbs
  1036  
  1037  			// Since scalar is less than the order of the group, we know that
  1038  			// {xOut,yOut,zOut} != {px,py,1}, unless both are zero, which we handle
  1039  			// below.
  1040  			p256PointAddMixed(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py)
  1041  			// The result of pointAddMixed is incorrect if {xOut,yOut,zOut} is zero
  1042  			// (a.k.a.  the point at infinity). We handle that situation by
  1043  			// copying the point from the table.
  1044  			p256CopyConditional(xOut, &px, nIsInfinityMask)
  1045  			p256CopyConditional(yOut, &py, nIsInfinityMask)
  1046  			p256CopyConditional(zOut, &p256One, nIsInfinityMask)
  1047  
  1048  			// Equally, the result is also wrong if the point from the table is
  1049  			// zero, which happens when the index is zero. We handle that by
  1050  			// only copying from {tx,ty,tz} to {xOut,yOut,zOut} if index != 0.
  1051  			pIsNoninfiniteMask = nonZeroToAllOnes(index)
  1052  			mask = pIsNoninfiniteMask & ^nIsInfinityMask
  1053  			p256CopyConditional(xOut, &tx, mask)
  1054  			p256CopyConditional(yOut, &ty, mask)
  1055  			p256CopyConditional(zOut, &tz, mask)
  1056  			// If p was not zero, then n is now non-zero.
  1057  			nIsInfinityMask &= ^pIsNoninfiniteMask
  1058  		}
  1059  	}
  1060  }
  1061  
  1062  // p256PointToAffine converts a Jacobian point to an affine point. If the input
  1063  // is the point at infinity then it returns (0, 0) in constant time.
  1064  func p256PointToAffine(xOut, yOut, x, y, z *[p256Limbs]uint32) {
  1065  	var zInv, zInvSq [p256Limbs]uint32
  1066  
  1067  	p256Invert(&zInv, z)
  1068  	p256Square(&zInvSq, &zInv)
  1069  	p256Mul(xOut, x, &zInvSq)
  1070  	p256Mul(&zInv, &zInv, &zInvSq)
  1071  	p256Mul(yOut, y, &zInv)
  1072  }
  1073  
  1074  // p256ToAffine returns a pair of *big.Int containing the affine representation
  1075  // of {x,y,z}.
  1076  func p256ToAffine(x, y, z *[p256Limbs]uint32) (xOut, yOut *big.Int) {
  1077  	var xx, yy [p256Limbs]uint32
  1078  	p256PointToAffine(&xx, &yy, x, y, z)
  1079  	return p256ToBig(&xx), p256ToBig(&yy)
  1080  }
  1081  
  1082  // p256ScalarMult sets {xOut,yOut,zOut} = scalar*{x,y}.
  1083  func p256ScalarMult(xOut, yOut, zOut, x, y *[p256Limbs]uint32, scalar *[32]uint8) {
  1084  	var px, py, pz, tx, ty, tz [p256Limbs]uint32
  1085  	var precomp [16][3][p256Limbs]uint32
  1086  	var nIsInfinityMask, index, pIsNoninfiniteMask, mask uint32
  1087  
  1088  	// We precompute 0,1,2,... times {x,y}.
  1089  	precomp[1][0] = *x
  1090  	precomp[1][1] = *y
  1091  	precomp[1][2] = p256One
  1092  
  1093  	for i := 2; i < 16; i += 2 {
  1094  		p256PointDouble(&precomp[i][0], &precomp[i][1], &precomp[i][2], &precomp[i/2][0], &precomp[i/2][1], &precomp[i/2][2])
  1095  		p256PointAddMixed(&precomp[i+1][0], &precomp[i+1][1], &precomp[i+1][2], &precomp[i][0], &precomp[i][1], &precomp[i][2], x, y)
  1096  	}
  1097  
  1098  	for i := range xOut {
  1099  		xOut[i] = 0
  1100  	}
  1101  	for i := range yOut {
  1102  		yOut[i] = 0
  1103  	}
  1104  	for i := range zOut {
  1105  		zOut[i] = 0
  1106  	}
  1107  	nIsInfinityMask = ^uint32(0)
  1108  
  1109  	// We add in a window of four bits each iteration and do this 64 times.
  1110  	for i := 0; i < 64; i++ {
  1111  		if i != 0 {
  1112  			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1113  			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1114  			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1115  			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1116  		}
  1117  
  1118  		index = uint32(scalar[31-i/2])
  1119  		if (i & 1) == 1 {
  1120  			index &= 15
  1121  		} else {
  1122  			index >>= 4
  1123  		}
  1124  
  1125  		// See the comments in scalarBaseMult about handling infinities.
  1126  		p256SelectJacobianPoint(&px, &py, &pz, &precomp, index)
  1127  		p256PointAdd(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py, &pz)
  1128  		p256CopyConditional(xOut, &px, nIsInfinityMask)
  1129  		p256CopyConditional(yOut, &py, nIsInfinityMask)
  1130  		p256CopyConditional(zOut, &pz, nIsInfinityMask)
  1131  
  1132  		pIsNoninfiniteMask = nonZeroToAllOnes(index)
  1133  		mask = pIsNoninfiniteMask & ^nIsInfinityMask
  1134  		p256CopyConditional(xOut, &tx, mask)
  1135  		p256CopyConditional(yOut, &ty, mask)
  1136  		p256CopyConditional(zOut, &tz, mask)
  1137  		nIsInfinityMask &= ^pIsNoninfiniteMask
  1138  	}
  1139  }
  1140  
  1141  // p256FromBig sets out = R*in.
  1142  func p256FromBig(out *[p256Limbs]uint32, in *big.Int) {
  1143  	tmp := new(big.Int).Lsh(in, 257)
  1144  	tmp.Mod(tmp, p256.P)
  1145  
  1146  	for i := 0; i < p256Limbs; i++ {
  1147  		if bits := tmp.Bits(); len(bits) > 0 {
  1148  			out[i] = uint32(bits[0]) & bottom29Bits
  1149  		} else {
  1150  			out[i] = 0
  1151  		}
  1152  		tmp.Rsh(tmp, 29)
  1153  
  1154  		i++
  1155  		if i == p256Limbs {
  1156  			break
  1157  		}
  1158  
  1159  		if bits := tmp.Bits(); len(bits) > 0 {
  1160  			out[i] = uint32(bits[0]) & bottom28Bits
  1161  		} else {
  1162  			out[i] = 0
  1163  		}
  1164  		tmp.Rsh(tmp, 28)
  1165  	}
  1166  }
  1167  
  1168  // p256ToBig returns a *big.Int containing the value of in.
  1169  func p256ToBig(in *[p256Limbs]uint32) *big.Int {
  1170  	result, tmp := new(big.Int), new(big.Int)
  1171  
  1172  	result.SetInt64(int64(in[p256Limbs-1]))
  1173  	for i := p256Limbs - 2; i >= 0; i-- {
  1174  		if (i & 1) == 0 {
  1175  			result.Lsh(result, 29)
  1176  		} else {
  1177  			result.Lsh(result, 28)
  1178  		}
  1179  		tmp.SetInt64(int64(in[i]))
  1180  		result.Add(result, tmp)
  1181  	}
  1182  
  1183  	result.Mul(result, p256RInverse)
  1184  	result.Mod(result, p256.P)
  1185  	return result
  1186  }