// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !amd64

package elliptic

// This file contains a constant-time, 32-bit implementation of P256.

import (
	"math/big"
)

// p256Curve embeds *CurveParams so that the generic CurveParams methods
// (IsOnCurve, Add, Double, ...) are available, while ScalarMult and
// ScalarBaseMult are overridden below with constant-time versions.
type p256Curve struct {
	*CurveParams
}

var (
	// p256 is the package-level P-256 curve instance, populated by initP256.
	p256 p256Curve
	// RInverse contains 1/R mod p - the inverse of the Montgomery constant
	// (2**257).
	p256RInverse *big.Int
)

// initP256 fills in the P-256 curve parameters and the Montgomery constant
// inverse. It is expected to be called once (lazily, from the package's
// curve-initialization machinery) before p256 is used.
func initP256() {
	// See FIPS 186-3, section D.2.3
	p256.CurveParams = &CurveParams{Name: "P-256"}
	p256.P, _ = new(big.Int).SetString("115792089210356248762697446949407573530086143415290314195533631308867097853951", 10)
	p256.N, _ = new(big.Int).SetString("115792089210356248762697446949407573529996955224135760342422259061068512044369", 10)
	p256.B, _ = new(big.Int).SetString("5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b", 16)
	p256.Gx, _ = new(big.Int).SetString("6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296", 16)
	p256.Gy, _ = new(big.Int).SetString("4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5", 16)
	p256.BitSize = 256

	p256RInverse, _ = new(big.Int).SetString("7fffffff00000001fffffffe8000000100000000ffffffff0000000180000000", 16)
}

// Params returns the curve's domain parameters, satisfying the Curve
// interface.
func (curve p256Curve) Params() *CurveParams {
	return curve.CurveParams
}

// p256GetScalar endian-swaps the big-endian scalar value from in and writes it
// to out. If the scalar is equal or greater than the order of the group, it's
// reduced modulo that order.
46 func p256GetScalar(out *[32]byte, in []byte) { 47 n := new(big.Int).SetBytes(in) 48 var scalarBytes []byte 49 50 if n.Cmp(p256.N) >= 0 { 51 n.Mod(n, p256.N) 52 scalarBytes = n.Bytes() 53 } else { 54 scalarBytes = in 55 } 56 57 for i, v := range scalarBytes { 58 out[len(scalarBytes)-(1+i)] = v 59 } 60 } 61 62 func (p256Curve) ScalarBaseMult(scalar []byte) (x, y *big.Int) { 63 var scalarReversed [32]byte 64 p256GetScalar(&scalarReversed, scalar) 65 66 var x1, y1, z1 [p256Limbs]uint32 67 p256ScalarBaseMult(&x1, &y1, &z1, &scalarReversed) 68 return p256ToAffine(&x1, &y1, &z1) 69 } 70 71 func (p256Curve) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) { 72 var scalarReversed [32]byte 73 p256GetScalar(&scalarReversed, scalar) 74 75 var px, py, x1, y1, z1 [p256Limbs]uint32 76 p256FromBig(&px, bigX) 77 p256FromBig(&py, bigY) 78 p256ScalarMult(&x1, &y1, &z1, &px, &py, &scalarReversed) 79 return p256ToAffine(&x1, &y1, &z1) 80 } 81 82 // Field elements are represented as nine, unsigned 32-bit words. 83 // 84 // The value of an field element is: 85 // x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228) 86 // 87 // That is, each limb is alternately 29 or 28-bits wide in little-endian 88 // order. 89 // 90 // This means that a field element hits 2**257, rather than 2**256 as we would 91 // like. A 28, 29, ... pattern would cause us to hit 2**256, but that causes 92 // problems when multiplying as terms end up one bit short of a limb which 93 // would require much bit-shifting to correct. 94 // 95 // Finally, the values stored in a field element are in Montgomery form. So the 96 // value |y| is stored as (y*R) mod p, where p is the P-256 prime and R is 97 // 2**257. 98 99 const ( 100 p256Limbs = 9 101 bottom29Bits = 0x1fffffff 102 ) 103 104 var ( 105 // p256One is the number 1 as a field element. 
106 p256One = [p256Limbs]uint32{2, 0, 0, 0xffff800, 0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff, 0} 107 p256Zero = [p256Limbs]uint32{0, 0, 0, 0, 0, 0, 0, 0, 0} 108 // p256P is the prime modulus as a field element. 109 p256P = [p256Limbs]uint32{0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff, 0, 0, 0x200000, 0xf000000, 0xfffffff} 110 // p2562P is the twice prime modulus as a field element. 111 p2562P = [p256Limbs]uint32{0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff, 0, 0, 0x400000, 0xe000000, 0x1fffffff} 112 ) 113 114 // p256Precomputed contains precomputed values to aid the calculation of scalar 115 // multiples of the base point, G. It's actually two, equal length, tables 116 // concatenated. 117 // 118 // The first table contains (x,y) field element pairs for 16 multiples of the 119 // base point, G. 120 // 121 // Index | Index (binary) | Value 122 // 0 | 0000 | 0G (all zeros, omitted) 123 // 1 | 0001 | G 124 // 2 | 0010 | 2**64G 125 // 3 | 0011 | 2**64G + G 126 // 4 | 0100 | 2**128G 127 // 5 | 0101 | 2**128G + G 128 // 6 | 0110 | 2**128G + 2**64G 129 // 7 | 0111 | 2**128G + 2**64G + G 130 // 8 | 1000 | 2**192G 131 // 9 | 1001 | 2**192G + G 132 // 10 | 1010 | 2**192G + 2**64G 133 // 11 | 1011 | 2**192G + 2**64G + G 134 // 12 | 1100 | 2**192G + 2**128G 135 // 13 | 1101 | 2**192G + 2**128G + G 136 // 14 | 1110 | 2**192G + 2**128G + 2**64G 137 // 15 | 1111 | 2**192G + 2**128G + 2**64G + G 138 // 139 // The second table follows the same style, but the terms are 2**32G, 140 // 2**96G, 2**160G, 2**224G. 141 // 142 // This is ~2KB of data. 
var p256Precomputed = [p256Limbs * 2 * 15 * 2]uint32{
	// First table: combinations of G, 2**64G, 2**128G, 2**192G (15 entries,
	// each an (x, y) pair of 9 limbs).
	0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7edc, 0xd4a6eab, 0x3120bee,
	0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba21, 0x14b10bb, 0xae3fe3,
	0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe49073, 0x3fa36cc, 0x5ebcd2c,
	0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea12446, 0xe1ade1e, 0xec91f22,
	0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109, 0xa267a00, 0xb57c050,
	0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0x7d6dee7, 0x2976e4b,
	0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a5a9, 0x843a649, 0xc3ab0fa,
	0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11, 0x58c43df, 0xf423fc2,
	0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db40f, 0x83e277d, 0xb0dd609,
	0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5, 0xe10c9e, 0x33ab581,
	0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f, 0x48764cd, 0x76dbcca,
	0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b20, 0x4ba3173, 0xc168c33,
	0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0, 0x65dd7ff, 0x3a1e4f6,
	0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f077, 0xa6add89, 0x4894acd,
	0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0,
	0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c, 0xda0cf5b, 0x812e881,
	0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51, 0xc22be3e, 0xe35e65a,
	0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9, 0x1c5a839, 0x47a1e26,
	0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c502, 0x2f32042, 0xa17769b,
	0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a02, 0x3fc93, 0x5620023,
	0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c, 0x407f75c, 0xbaab133,
	0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea7, 0x3293ac0, 0xcdc98aa,
	0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29,
	0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72, 0x73e1c35, 0xee70fbc,
	0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85, 0x27de188, 0x66f70b8,
	0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae914, 0x2f3ec51, 0x3826b59,
	0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x823d9d2, 0x8213f39,
	0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4a, 0xf5ddc3d, 0x3786689,
	0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a729, 0x4be3499, 0x52b23aa,
	// Second table: combinations of 2**32G, 2**96G, 2**160G, 2**224G.
	0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb048035, 0xe31de66, 0xc6ecaa3,
	0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a7529, 0xcb7beb1, 0xb2a78a1,
	0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff658, 0xe3d6511, 0xc7d76f,
	0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72,
	0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d32411, 0xb04a838, 0xd760d2d,
	0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11e, 0x20bca9a, 0x66f496b,
	0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a,
	0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56ff, 0x65ef930, 0x21dc4a,
	0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15f, 0x624e62e, 0xa90ae2f,
	0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522b, 0xdc78583, 0x40eeabb,
	0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef34, 0xae2a960, 0x91b8bdc,
	0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0x2413c8e, 0x5425bf9,
	0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633, 0x7c91952, 0xd806dce,
	0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef73, 0x8956f34, 0xe4b5cf2,
	0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7, 0x627b614, 0x7371cca,
	0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc9, 0x9c19bf2, 0x5882229,
	0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b3, 0xe85ff25, 0x408ef57,
	0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113, 0xa4a1769, 0x11fbc6c,
	0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b7, 0x4acbad9, 0x5efc5fa,
	0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc, 0x7bf0fa9, 0x957651,
	0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec,
	0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c12d, 0xf20bd46, 0x1951fa7,
	0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74, 0x99bb618, 0x2db944c,
	0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e74779, 0x576138, 0x9587927,
	0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782d, 0xfc72e0b, 0x701b298,
	0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5d8, 0xf858d3a, 0x942eea8,
	0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a1, 0x8395659, 0x52ed4e2,
	0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c0, 0x6bdf55a, 0x4e4457d,
	0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747b, 0x878558d, 0x7d29aa4,
	0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d7, 0xa5bef68, 0xb7b30d8,
	0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f51951, 0x9d0c177, 0x1c49a78,
}

// Field element operations:

// nonZeroToAllOnes returns:
//   0xffffffff for 0 < x <= 2**31
//   0 for x == 0 or x > 2**31.
//
// It is used to build branch-free (constant-time) conditional masks: the
// subtraction (x - 1) underflows exactly when x == 0, which is what flips
// the top bit that the shift then broadcasts.
func nonZeroToAllOnes(x uint32) uint32 {
	return ((x - 1) >> 31) - 1
}

// p256ReduceCarry adds a multiple of p in order to cancel |carry|,
// which is a term at 2**257.
//
// On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28.
// On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29.
func p256ReduceCarry(inout *[p256Limbs]uint32, carry uint32) {
	// carry_mask is all-ones when carry is non-zero, so the constant terms
	// below are added only in that case, without branching on carry.
	carry_mask := nonZeroToAllOnes(carry)

	inout[0] += carry << 1
	inout[3] += 0x10000000 & carry_mask
	// carry < 2**3 thus (carry << 11) < 2**14 and we added 2**28 in the
	// previous line therefore this doesn't underflow.
	inout[3] -= carry << 11
	inout[4] += (0x20000000 - 1) & carry_mask
	inout[5] += (0x10000000 - 1) & carry_mask
	inout[6] += (0x20000000 - 1) & carry_mask
	inout[6] -= carry << 22
	// This may underflow if carry is non-zero but, if so, we'll fix it in the
	// next line.
	inout[7] -= 1 & carry_mask
	inout[7] += carry << 25
}

// p256Sum sets out = in+in2.
//
// On entry, in[i]+in2[i] must not overflow a 32-bit word.
// On exit: out[0,2,...] < 2**30, out[1,3,...]
// < 2**29
func p256Sum(out, in, in2 *[p256Limbs]uint32) {
	// Limb-wise addition with carry propagation. The loop body is unrolled
	// two limbs at a time because even limbs are 29 bits wide and odd limbs
	// are 28 bits wide; p256Limbs is odd, so the break sits between the two
	// halves. bottom28Bits is declared elsewhere in this file (not shown
	// here).
	carry := uint32(0)
	for i := 0; ; i++ {
		out[i] = in[i] + in2[i]
		out[i] += carry
		carry = out[i] >> 29
		out[i] &= bottom29Bits

		i++
		if i == p256Limbs {
			break
		}

		out[i] = in[i] + in2[i]
		out[i] += carry
		carry = out[i] >> 28
		out[i] &= bottom28Bits
	}

	// Fold the final carry (a term at 2**257) back into the element.
	p256ReduceCarry(out, carry)
}

// Constants used to build p256Zero31: each is of the form
// 2**a (+/- smaller powers of two) - 2**2, sized to match its limb's width.
const (
	two30m2    = 1<<30 - 1<<2
	two30p13m2 = 1<<30 + 1<<13 - 1<<2
	two31m2    = 1<<31 - 1<<2
	two31p24m2 = 1<<31 + 1<<24 - 1<<2
	two30m27m2 = 1<<30 - 1<<27 - 1<<2
)

// p256Zero31 is 0 mod p.
// It is added in p256Diff before subtracting so that no limb underflows.
// NOTE(review): two31m3 is not declared in this portion of the file —
// presumably it is declared elsewhere in the file; confirm.
var p256Zero31 = [p256Limbs]uint32{two31m3, two30m2, two31m2, two30p13m2, two31m2, two30m2, two31p24m2, two30m27m2, two31m2}

// p256Diff sets out = in-in2.
//
// On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
// in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
func p256Diff(out, in, in2 *[p256Limbs]uint32) {
	var carry uint32

	// Same two-limb unrolled carry chain as p256Sum; adding p256Zero31
	// (which is 0 mod p) keeps every per-limb difference non-negative.
	for i := 0; ; i++ {
		out[i] = in[i] - in2[i]
		out[i] += p256Zero31[i]
		out[i] += carry
		carry = out[i] >> 29
		out[i] &= bottom29Bits

		i++
		if i == p256Limbs {
			break
		}

		out[i] = in[i] - in2[i]
		out[i] += p256Zero31[i]
		out[i] += carry
		carry = out[i] >> 28
		out[i] &= bottom28Bits
	}

	p256ReduceCarry(out, carry)
}

// p256ReduceDegree sets out = tmp/R mod p where tmp contains 64-bit words with
// the same 29,28,... bit positions as a field element.
//
// The values in field elements are in Montgomery form: x*R mod p where R =
// 2**257. Since we just multiplied two Montgomery values together, the result
// is x*y*R*R mod p. We wish to divide by R in order for the result also to be
// in Montgomery form.
//
// On entry: tmp[i] < 2**64
// On exit: out[0,2,...] < 2**30, out[1,3,...]
// < 2**29
func p256ReduceDegree(out *[p256Limbs]uint32, tmp [17]uint64) {
	// The following table may be helpful when reading this code:
	//
	// Limb number:   0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10...
	// Width (bits):  29| 28| 29| 28| 29| 28| 29| 28| 29| 28| 29
	// Start bit:     0 | 29| 57| 86|114|143|171|200|228|257|285
	//   (odd phase): 0 | 28| 57| 85|114|142|171|199|228|256|285
	var tmp2 [18]uint32
	var carry, x, xMask uint32

	// tmp contains 64-bit words with the same 29,28,29-bit positions as a
	// field element. So the top of an element of tmp might overlap with
	// another element two positions down. The following loop eliminates
	// this overlap.
	tmp2[0] = uint32(tmp[0]) & bottom29Bits

	tmp2[1] = uint32(tmp[0]) >> 29
	tmp2[1] |= (uint32(tmp[0]>>32) << 3) & bottom28Bits
	tmp2[1] += uint32(tmp[1]) & bottom28Bits
	carry = tmp2[1] >> 28
	tmp2[1] &= bottom28Bits

	// Like the addition loops above, this is unrolled two limbs at a time:
	// the first half handles a 29-bit limb, the second a 28-bit limb.
	for i := 2; i < 17; i++ {
		tmp2[i] = (uint32(tmp[i-2] >> 32)) >> 25
		tmp2[i] += (uint32(tmp[i-1])) >> 28
		tmp2[i] += (uint32(tmp[i-1]>>32) << 4) & bottom29Bits
		tmp2[i] += uint32(tmp[i]) & bottom29Bits
		tmp2[i] += carry
		carry = tmp2[i] >> 29
		tmp2[i] &= bottom29Bits

		i++
		if i == 17 {
			break
		}
		tmp2[i] = uint32(tmp[i-2]>>32) >> 25
		tmp2[i] += uint32(tmp[i-1]) >> 29
		tmp2[i] += ((uint32(tmp[i-1] >> 32)) << 3) & bottom28Bits
		tmp2[i] += uint32(tmp[i]) & bottom28Bits
		tmp2[i] += carry
		carry = tmp2[i] >> 28
		tmp2[i] &= bottom28Bits
	}

	tmp2[17] = uint32(tmp[15]>>32) >> 25
	tmp2[17] += uint32(tmp[16]) >> 29
	tmp2[17] += uint32(tmp[16]>>32) << 3
	tmp2[17] += carry

	// Montgomery elimination of terms:
	//
	// Since R is 2**257, we can divide by R with a bitwise shift if we can
	// ensure that the right-most 257 bits are all zero. We can make that true
	// by adding multiplies of p without affecting the value.
	//
	// So we eliminate limbs from right to left. Since the bottom 29 bits of p
	// are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
	// We can do that for 8 further limbs and then right shift to eliminate the
	// extra factor of R.
	for i := 0; ; i += 2 {
		tmp2[i+1] += tmp2[i] >> 29
		x = tmp2[i] & bottom29Bits
		xMask = nonZeroToAllOnes(x)
		tmp2[i] = 0

		// The bounds calculations for this loop are tricky. Each iteration of
		// the loop eliminates two words by adding values to words to their
		// right.
		//
		// The following table contains the amounts added to each word (as an
		// offset from the value of i at the top of the loop). The amounts are
		// accounted for from the first and second half of the loop separately
		// and are written as, for example, 28 to mean a value <2**28.
		//
		// Word:                   3   4   5   6   7   8   9   10
		// Added in top half:     28  11      29  21  29  28
		//                                        28  29
		//                                            29
		// Added in bottom half:      29  10      28  21  28   28
		//                                            29
		//
		// The value that is currently offset 7 will be offset 5 for the next
		// iteration and then offset 3 for the iteration after that. Therefore
		// the total value added will be the values added at 7, 5 and 3.
		//
		// The following table accumulates these values. The sums at the bottom
		// are written as, for example, 29+28, to mean a value < 2**29+2**28.
		//
		// Word:  3   4   5   6   7   8   9  10  11  12  13
		//       28  11  10  29  21  29  28  28  28  28  28
		//           29  28  11  28  29  28  29  28  29  28
		//           29  28  21  21  29  21  29  21
		//           10  29  28  21  28  21  28
		//           28  29  28  29  28  29  28
		//           11  10  29  10  29  10
		//           29  28  11  28  11
		//           29          29
		//       --------------------------------------------
		//          30+         31+         30+         31+     30+
		//          28+         29+         28+         29+     21+
		//          21+         28+         21+         28+     10
		//          10          21+         10          21+
		//                      11                      11
		//
		// So the greatest amount is added to tmp2[10] and tmp2[12]. If
		// tmp2[10/12] has an initial value of <2**29, then the maximum value
		// will be < 2**31 + 2**30 + 2**28 + 2**21 + 2**11, which is < 2**32,
		// as required.
		tmp2[i+3] += (x << 10) & bottom28Bits
		tmp2[i+4] += (x >> 18)

		tmp2[i+6] += (x << 21) & bottom29Bits
		tmp2[i+7] += x >> 8

		// At position 200, which is the starting bit position for word 7, we
		// have a factor of 0xf000000 = 2**28 - 2**24.
		tmp2[i+7] += 0x10000000 & xMask
		tmp2[i+8] += (x - 1) & xMask
		tmp2[i+7] -= (x << 24) & bottom28Bits
		tmp2[i+8] -= x >> 4

		tmp2[i+8] += 0x20000000 & xMask
		tmp2[i+8] -= x
		tmp2[i+8] += (x << 28) & bottom29Bits
		tmp2[i+9] += ((x >> 1) - 1) & xMask

		if i+1 == p256Limbs {
			break
		}
		tmp2[i+2] += tmp2[i+1] >> 28
		x = tmp2[i+1] & bottom28Bits
		xMask = nonZeroToAllOnes(x)
		tmp2[i+1] = 0

		tmp2[i+4] += (x << 11) & bottom29Bits
		tmp2[i+5] += (x >> 18)

		tmp2[i+7] += (x << 21) & bottom28Bits
		tmp2[i+8] += x >> 7

		// At position 199, which is the starting bit of the 8th word when
		// dealing with a context starting on an odd word, we have a factor of
		// 0x1e000000 = 2**29 - 2**25. Since we have not updated i, the 8th
		// word from i+1 is i+8.
		tmp2[i+8] += 0x20000000 & xMask
		tmp2[i+9] += (x - 1) & xMask
		tmp2[i+8] -= (x << 25) & bottom29Bits
		tmp2[i+9] -= x >> 4

		tmp2[i+9] += 0x10000000 & xMask
		tmp2[i+9] -= x
		tmp2[i+10] += (x - 1) & xMask
	}

	// We merge the right shift with a carry chain. The words above 2**257 have
	// widths of 28,29,... which we need to correct when copying them down.
	carry = 0
	for i := 0; i < 8; i++ {
		// The maximum value of tmp2[i + 9] occurs on the first iteration and
		// is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is
		// therefore safe.
		out[i] = tmp2[i+9]
		out[i] += carry
		out[i] += (tmp2[i+10] << 28) & bottom29Bits
		carry = out[i] >> 29
		out[i] &= bottom29Bits

		i++
		out[i] = tmp2[i+9] >> 1
		out[i] += carry
		carry = out[i] >> 28
		out[i] &= bottom28Bits
	}

	out[8] = tmp2[17]
	out[8] += carry
	carry = out[8] >> 29
	out[8] &= bottom29Bits

	p256ReduceCarry(out, carry)
}

// p256Square sets out=in*in.
//
// On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29.
// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
func p256Square(out, in *[p256Limbs]uint32) {
	// Schoolbook squaring into 64-bit accumulators. Cross terms are doubled
	// (<<1); when both factors are odd (28-bit) limbs an extra doubling
	// (<<2) accounts for their shifted bit positions.
	var tmp [17]uint64

	tmp[0] = uint64(in[0]) * uint64(in[0])
	tmp[1] = uint64(in[0]) * (uint64(in[1]) << 1)
	tmp[2] = uint64(in[0])*(uint64(in[2])<<1) +
		uint64(in[1])*(uint64(in[1])<<1)
	tmp[3] = uint64(in[0])*(uint64(in[3])<<1) +
		uint64(in[1])*(uint64(in[2])<<1)
	tmp[4] = uint64(in[0])*(uint64(in[4])<<1) +
		uint64(in[1])*(uint64(in[3])<<2) +
		uint64(in[2])*uint64(in[2])
	tmp[5] = uint64(in[0])*(uint64(in[5])<<1) +
		uint64(in[1])*(uint64(in[4])<<1) +
		uint64(in[2])*(uint64(in[3])<<1)
	tmp[6] = uint64(in[0])*(uint64(in[6])<<1) +
		uint64(in[1])*(uint64(in[5])<<2) +
		uint64(in[2])*(uint64(in[4])<<1) +
		uint64(in[3])*(uint64(in[3])<<1)
	tmp[7] = uint64(in[0])*(uint64(in[7])<<1) +
		uint64(in[1])*(uint64(in[6])<<1) +
		uint64(in[2])*(uint64(in[5])<<1) +
		uint64(in[3])*(uint64(in[4])<<1)
	// tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60,
	// which is < 2**64 as required.
	tmp[8] = uint64(in[0])*(uint64(in[8])<<1) +
		uint64(in[1])*(uint64(in[7])<<2) +
		uint64(in[2])*(uint64(in[6])<<1) +
		uint64(in[3])*(uint64(in[5])<<2) +
		uint64(in[4])*uint64(in[4])
	tmp[9] = uint64(in[1])*(uint64(in[8])<<1) +
		uint64(in[2])*(uint64(in[7])<<1) +
		uint64(in[3])*(uint64(in[6])<<1) +
		uint64(in[4])*(uint64(in[5])<<1)
	tmp[10] = uint64(in[2])*(uint64(in[8])<<1) +
		uint64(in[3])*(uint64(in[7])<<2) +
		uint64(in[4])*(uint64(in[6])<<1) +
		uint64(in[5])*(uint64(in[5])<<1)
	tmp[11] = uint64(in[3])*(uint64(in[8])<<1) +
		uint64(in[4])*(uint64(in[7])<<1) +
		uint64(in[5])*(uint64(in[6])<<1)
	tmp[12] = uint64(in[4])*(uint64(in[8])<<1) +
		uint64(in[5])*(uint64(in[7])<<2) +
		uint64(in[6])*uint64(in[6])
	tmp[13] = uint64(in[5])*(uint64(in[8])<<1) +
		uint64(in[6])*(uint64(in[7])<<1)
	tmp[14] = uint64(in[6])*(uint64(in[8])<<1) +
		uint64(in[7])*(uint64(in[7])<<1)
	tmp[15] = uint64(in[7]) * (uint64(in[8]) << 1)
	tmp[16] = uint64(in[8]) * uint64(in[8])

	// Divide by R (2**257) to return to Montgomery form.
	p256ReduceDegree(out, tmp)
}

// p256Mul sets out=in*in2.
//
// On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
// in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
func p256Mul(out, in, in2 *[p256Limbs]uint32) {
	// Schoolbook multiplication into 64-bit accumulators. A term is doubled
	// (<<1) when both of its factors are odd-indexed (28-bit) limbs, to
	// account for their shifted bit positions; <<0 terms are written
	// explicitly to keep the pattern visually uniform.
	var tmp [17]uint64

	tmp[0] = uint64(in[0]) * uint64(in2[0])
	tmp[1] = uint64(in[0])*(uint64(in2[1])<<0) +
		uint64(in[1])*(uint64(in2[0])<<0)
	tmp[2] = uint64(in[0])*(uint64(in2[2])<<0) +
		uint64(in[1])*(uint64(in2[1])<<1) +
		uint64(in[2])*(uint64(in2[0])<<0)
	tmp[3] = uint64(in[0])*(uint64(in2[3])<<0) +
		uint64(in[1])*(uint64(in2[2])<<0) +
		uint64(in[2])*(uint64(in2[1])<<0) +
		uint64(in[3])*(uint64(in2[0])<<0)
	tmp[4] = uint64(in[0])*(uint64(in2[4])<<0) +
		uint64(in[1])*(uint64(in2[3])<<1) +
		uint64(in[2])*(uint64(in2[2])<<0) +
		uint64(in[3])*(uint64(in2[1])<<1) +
		uint64(in[4])*(uint64(in2[0])<<0)
	tmp[5] = uint64(in[0])*(uint64(in2[5])<<0) +
		uint64(in[1])*(uint64(in2[4])<<0) +
		uint64(in[2])*(uint64(in2[3])<<0) +
		uint64(in[3])*(uint64(in2[2])<<0) +
		uint64(in[4])*(uint64(in2[1])<<0) +
		uint64(in[5])*(uint64(in2[0])<<0)
	tmp[6] = uint64(in[0])*(uint64(in2[6])<<0) +
		uint64(in[1])*(uint64(in2[5])<<1) +
		uint64(in[2])*(uint64(in2[4])<<0) +
		uint64(in[3])*(uint64(in2[3])<<1) +
		uint64(in[4])*(uint64(in2[2])<<0) +
		uint64(in[5])*(uint64(in2[1])<<1) +
		uint64(in[6])*(uint64(in2[0])<<0)
	tmp[7] = uint64(in[0])*(uint64(in2[7])<<0) +
		uint64(in[1])*(uint64(in2[6])<<0) +
		uint64(in[2])*(uint64(in2[5])<<0) +
		uint64(in[3])*(uint64(in2[4])<<0) +
		uint64(in[4])*(uint64(in2[3])<<0) +
		uint64(in[5])*(uint64(in2[2])<<0) +
		uint64(in[6])*(uint64(in2[1])<<0) +
		uint64(in[7])*(uint64(in2[0])<<0)
	// tmp[8] has the greatest value but doesn't overflow. See logic in
	// p256Square.
	tmp[8] = uint64(in[0])*(uint64(in2[8])<<0) +
		uint64(in[1])*(uint64(in2[7])<<1) +
		uint64(in[2])*(uint64(in2[6])<<0) +
		uint64(in[3])*(uint64(in2[5])<<1) +
		uint64(in[4])*(uint64(in2[4])<<0) +
		uint64(in[5])*(uint64(in2[3])<<1) +
		uint64(in[6])*(uint64(in2[2])<<0) +
		uint64(in[7])*(uint64(in2[1])<<1) +
		uint64(in[8])*(uint64(in2[0])<<0)
	tmp[9] = uint64(in[1])*(uint64(in2[8])<<0) +
		uint64(in[2])*(uint64(in2[7])<<0) +
		uint64(in[3])*(uint64(in2[6])<<0) +
		uint64(in[4])*(uint64(in2[5])<<0) +
		uint64(in[5])*(uint64(in2[4])<<0) +
		uint64(in[6])*(uint64(in2[3])<<0) +
		uint64(in[7])*(uint64(in2[2])<<0) +
		uint64(in[8])*(uint64(in2[1])<<0)
	tmp[10] = uint64(in[2])*(uint64(in2[8])<<0) +
		uint64(in[3])*(uint64(in2[7])<<1) +
		uint64(in[4])*(uint64(in2[6])<<0) +
		uint64(in[5])*(uint64(in2[5])<<1) +
		uint64(in[6])*(uint64(in2[4])<<0) +
		uint64(in[7])*(uint64(in2[3])<<1) +
		uint64(in[8])*(uint64(in2[2])<<0)
	tmp[11] = uint64(in[3])*(uint64(in2[8])<<0) +
		uint64(in[4])*(uint64(in2[7])<<0) +
		uint64(in[5])*(uint64(in2[6])<<0) +
		uint64(in[6])*(uint64(in2[5])<<0) +
		uint64(in[7])*(uint64(in2[4])<<0) +
		uint64(in[8])*(uint64(in2[3])<<0)
	tmp[12] = uint64(in[4])*(uint64(in2[8])<<0) +
		uint64(in[5])*(uint64(in2[7])<<1) +
		uint64(in[6])*(uint64(in2[6])<<0) +
		uint64(in[7])*(uint64(in2[5])<<1) +
		uint64(in[8])*(uint64(in2[4])<<0)
	tmp[13] = uint64(in[5])*(uint64(in2[8])<<0) +
		uint64(in[6])*(uint64(in2[7])<<0) +
		uint64(in[7])*(uint64(in2[6])<<0) +
		uint64(in[8])*(uint64(in2[5])<<0)
	tmp[14] = uint64(in[6])*(uint64(in2[8])<<0) +
		uint64(in[7])*(uint64(in2[7])<<1) +
		uint64(in[8])*(uint64(in2[6])<<0)
	tmp[15] = uint64(in[7])*(uint64(in2[8])<<0) +
		uint64(in[8])*(uint64(in2[7])<<0)
	tmp[16] = uint64(in[8]) * (uint64(in2[8]) << 0)

	// Divide by R (2**257) to return to Montgomery form.
	p256ReduceDegree(out, tmp)
}

// p256Assign sets out = in (a plain field-element copy).
func p256Assign(out, in *[p256Limbs]uint32) {
	*out = *in
}

// p256Invert calculates |out| = |in|^{-1}
//
// Based on Fermat's Little Theorem:
//   a^p = a (mod p)
//   a^{p-1} = 1 (mod p)
//   a^{p-2} = a^{-1} (mod p)
//
// The addition chain below raises in to the power
// p-2 = 2^256 - 2^224 + 2^192 + 2^96 - 3 (see the final comment).
func p256Invert(out, in *[p256Limbs]uint32) {
	var ftmp, ftmp2 [p256Limbs]uint32

	// each e_I will hold |in|^{2^I - 1}
	var e2, e4, e8, e16, e32, e64 [p256Limbs]uint32

	// Each trailing comment gives the exponent held so far.
	p256Square(&ftmp, in)      // 2^1
	p256Mul(&ftmp, in, &ftmp)  // 2^2 - 2^0
	p256Assign(&e2, &ftmp)
	p256Square(&ftmp, &ftmp)   // 2^3 - 2^1
	p256Square(&ftmp, &ftmp)   // 2^4 - 2^2
	p256Mul(&ftmp, &ftmp, &e2) // 2^4 - 2^0
	p256Assign(&e4, &ftmp)
	p256Square(&ftmp, &ftmp)   // 2^5 - 2^1
	p256Square(&ftmp, &ftmp)   // 2^6 - 2^2
	p256Square(&ftmp, &ftmp)   // 2^7 - 2^3
	p256Square(&ftmp, &ftmp)   // 2^8 - 2^4
	p256Mul(&ftmp, &ftmp, &e4) // 2^8 - 2^0
	p256Assign(&e8, &ftmp)
	for i := 0; i < 8; i++ {
		p256Square(&ftmp, &ftmp)
	} // 2^16 - 2^8
	p256Mul(&ftmp, &ftmp, &e8) // 2^16 - 2^0
	p256Assign(&e16, &ftmp)
	for i := 0; i < 16; i++ {
		p256Square(&ftmp, &ftmp)
	} // 2^32 - 2^16
	p256Mul(&ftmp, &ftmp, &e16) // 2^32 - 2^0
	p256Assign(&e32, &ftmp)
	for i := 0; i < 32; i++ {
		p256Square(&ftmp, &ftmp)
	} // 2^64 - 2^32
	p256Assign(&e64, &ftmp)
	p256Mul(&ftmp, &ftmp, in) // 2^64 - 2^32 + 2^0
	for i := 0; i < 192; i++ {
		p256Square(&ftmp, &ftmp)
	} // 2^256 - 2^224 + 2^192

	// ftmp2 assembles the low 2^96 - 3 part of the exponent from the
	// saved e_I values.
	p256Mul(&ftmp2, &e64, &e32) // 2^64 - 2^0
	for i := 0; i < 16; i++ {
		p256Square(&ftmp2, &ftmp2)
	} // 2^80 - 2^16
	p256Mul(&ftmp2, &ftmp2, &e16) // 2^80 - 2^0
	for i := 0; i < 8; i++ {
		p256Square(&ftmp2, &ftmp2)
	} // 2^88 - 2^8
	p256Mul(&ftmp2, &ftmp2, &e8) // 2^88 - 2^0
	for i := 0; i < 4; i++ {
		p256Square(&ftmp2, &ftmp2)
	} // 2^92 - 2^4
	p256Mul(&ftmp2, &ftmp2, &e4) // 2^92 - 2^0
	p256Square(&ftmp2, &ftmp2)   // 2^93 - 2^1
	p256Square(&ftmp2, &ftmp2)   // 2^94 - 2^2
	p256Mul(&ftmp2, &ftmp2, &e2) // 2^94 - 2^0
	p256Square(&ftmp2, &ftmp2)   // 2^95 - 2^1
	p256Square(&ftmp2, &ftmp2)   // 2^96 - 2^2
	p256Mul(&ftmp2, &ftmp2, in)  // 2^96 - 3

	p256Mul(out, &ftmp2, &ftmp) // 2^256 - 2^224 + 2^192 + 2^96 - 3
}

// p256Scalar3 sets out=3*out.
//
// On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
func p256Scalar3(out *[p256Limbs]uint32) {
	var carry uint32

	// Limbs alternate between 29 bits (even indices) and 28 bits (odd
	// indices); each loop pass multiplies one limb of each width and
	// chains the carry into the next limb.
	for i := 0; ; i++ {
		out[i] *= 3
		out[i] += carry
		carry = out[i] >> 29
		out[i] &= bottom29Bits

		i++
		if i == p256Limbs {
			break
		}

		out[i] *= 3
		out[i] += carry
		carry = out[i] >> 28
		out[i] &= bottom28Bits
	}

	// The carry out of the top limb is folded back in by p256ReduceCarry.
	p256ReduceCarry(out, carry)
}

// p256Scalar4 sets out=4*out.
//
// On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
func p256Scalar4(out *[p256Limbs]uint32) {
	var carry, nextCarry uint32

	// The bits that the <<2 would push out of each limb are captured in
	// nextCarry before shifting; adding the incoming carry may then
	// overflow the limb by at most one more bit, which is also collected.
	for i := 0; ; i++ {
		nextCarry = out[i] >> 27
		out[i] <<= 2
		out[i] &= bottom29Bits
		out[i] += carry
		carry = nextCarry + (out[i] >> 29)
		out[i] &= bottom29Bits

		i++
		if i == p256Limbs {
			break
		}
		nextCarry = out[i] >> 26
		out[i] <<= 2
		out[i] &= bottom28Bits
		out[i] += carry
		carry = nextCarry + (out[i] >> 28)
		out[i] &= bottom28Bits
	}

	p256ReduceCarry(out, carry)
}

// p256Scalar8 sets out=8*out.
//
// On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
func p256Scalar8(out *[p256Limbs]uint32) {
	var carry, nextCarry uint32

	// Same carry discipline as p256Scalar4, but shifting by 3: the top
	// bits of each limb are saved in nextCarry before the shift, then the
	// incoming carry is added and any resulting overflow bit collected.
	for i := 0; ; i++ {
		nextCarry = out[i] >> 26
		out[i] <<= 3
		out[i] &= bottom29Bits
		out[i] += carry
		carry = nextCarry + (out[i] >> 29)
		out[i] &= bottom29Bits

		i++
		if i == p256Limbs {
			break
		}
		nextCarry = out[i] >> 25
		out[i] <<= 3
		out[i] &= bottom28Bits
		out[i] += carry
		carry = nextCarry + (out[i] >> 28)
		out[i] &= bottom28Bits
	}

	p256ReduceCarry(out, carry)
}

// Group operations:
//
// Elements of the elliptic curve group are represented in Jacobian
// coordinates: (x, y, z). An affine point (x', y') is x'=x/z**2, y'=y/z**3 in
// Jacobian form.

// p256PointDouble sets {xOut,yOut,zOut} = 2*{x,y,z}.
//
// See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
func p256PointDouble(xOut, yOut, zOut, x, y, z *[p256Limbs]uint32) {
	var delta, gamma, alpha, beta, tmp, tmp2 [p256Limbs]uint32

	p256Square(&delta, z)     // delta = z²
	p256Square(&gamma, y)     // gamma = y²
	p256Mul(&beta, x, &gamma) // beta = x*gamma

	// alpha = 3*(x - delta)*(x + delta)
	p256Sum(&tmp, x, &delta)
	p256Diff(&tmp2, x, &delta)
	p256Mul(&alpha, &tmp, &tmp2)
	p256Scalar3(&alpha)

	// zOut = (y + z)² - gamma - delta
	p256Sum(&tmp, y, z)
	p256Square(&tmp, &tmp)
	p256Diff(&tmp, &tmp, &gamma)
	p256Diff(zOut, &tmp, &delta)

	// xOut = alpha² - 8*beta (beta holds 4*beta after p256Scalar4)
	p256Scalar4(&beta)
	p256Square(xOut, &alpha)
	p256Diff(xOut, xOut, &beta)
	p256Diff(xOut, xOut, &beta)

	// yOut = alpha*(4*beta - xOut) - 8*gamma²
	p256Diff(&tmp, &beta, xOut)
	p256Mul(&tmp, &alpha, &tmp)
	p256Square(&tmp2, &gamma)
	p256Scalar8(&tmp2)
	p256Diff(yOut, &tmp, &tmp2)
}

// p256PointAddMixed sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,1}.
// (i.e. the second point is affine.)
//
// See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
//
// Note that this function does not handle P+P, infinity+P nor P+infinity
// correctly.
func p256PointAddMixed(xOut, yOut, zOut, x1, y1, z1, x2, y2 *[p256Limbs]uint32) {
	var z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32

	p256Square(&z1z1, z1)  // z1z1 = z1²
	p256Sum(&tmp, z1, z1)  // tmp = 2*z1

	p256Mul(&u2, x2, &z1z1)      // u2 = x2*z1²
	p256Mul(&z1z1z1, z1, &z1z1)  // z1z1z1 = z1³
	p256Mul(&s2, y2, &z1z1z1)    // s2 = y2*z1³
	p256Diff(&h, &u2, x1)        // h = u2 - x1
	p256Sum(&i, &h, &h)
	p256Square(&i, &i)           // i = (2h)²
	p256Mul(&j, &h, &i)          // j = h*i
	p256Diff(&r, &s2, y1)
	p256Sum(&r, &r, &r)          // r = 2*(s2 - y1)
	p256Mul(&v, x1, &i)          // v = x1*i

	p256Mul(zOut, &tmp, &h) // zOut = 2*z1*h
	p256Square(&rr, &r)
	// xOut = r² - j - 2v
	p256Diff(xOut, &rr, &j)
	p256Diff(xOut, xOut, &v)
	p256Diff(xOut, xOut, &v)

	// yOut = r*(v - xOut) - 2*y1*j
	p256Diff(&tmp, &v, xOut)
	p256Mul(yOut, &tmp, &r)
	p256Mul(&tmp, y1, &j)
	p256Diff(yOut, yOut, &tmp)
	p256Diff(yOut, yOut, &tmp)
}

// p256PointAdd sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,z2}.
//
// See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
//
// Note that this function does not handle P+P, infinity+P nor P+infinity
// correctly.
func p256PointAdd(xOut, yOut, zOut, x1, y1, z1, x2, y2, z2 *[p256Limbs]uint32) {
	var z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32

	p256Square(&z1z1, z1)   // z1z1 = z1²
	p256Square(&z2z2, z2)   // z2z2 = z2²
	p256Mul(&u1, x1, &z2z2) // u1 = x1*z2²

	// tmp = (z1 + z2)² - z1² - z2² = 2*z1*z2
	p256Sum(&tmp, z1, z2)
	p256Square(&tmp, &tmp)
	p256Diff(&tmp, &tmp, &z1z1)
	p256Diff(&tmp, &tmp, &z2z2)

	p256Mul(&z2z2z2, z2, &z2z2) // z2z2z2 = z2³
	p256Mul(&s1, y1, &z2z2z2)   // s1 = y1*z2³

	p256Mul(&u2, x2, &z1z1)     // u2 = x2*z1²
	p256Mul(&z1z1z1, z1, &z1z1) // z1z1z1 = z1³
	p256Mul(&s2, y2, &z1z1z1)   // s2 = y2*z1³
	p256Diff(&h, &u2, &u1)      // h = u2 - u1
	p256Sum(&i, &h, &h)
	p256Square(&i, &i)          // i = (2h)²
	p256Mul(&j, &h, &i)         // j = h*i
	p256Diff(&r, &s2, &s1)
	p256Sum(&r, &r, &r)         // r = 2*(s2 - s1)
	p256Mul(&v, &u1, &i)        // v = u1*i

	p256Mul(zOut, &tmp, &h) // zOut = 2*z1*z2*h
	p256Square(&rr, &r)
	// xOut = r² - j - 2v
	p256Diff(xOut, &rr, &j)
	p256Diff(xOut, xOut, &v)
	p256Diff(xOut, xOut, &v)

	// yOut = r*(v - xOut) - 2*s1*j
	p256Diff(&tmp, &v, xOut)
	p256Mul(yOut, &tmp, &r)
	p256Mul(&tmp, &s1, &j)
	p256Diff(yOut, yOut, &tmp)
	p256Diff(yOut, yOut, &tmp)
}

// p256CopyConditional sets out=in if mask = 0xffffffff in constant time.
//
// On entry: mask is either 0 or 0xffffffff.
func p256CopyConditional(out, in *[p256Limbs]uint32, mask uint32) {
	// Branch-free, per-limb select: out ^ (mask & (in ^ out)) equals in
	// when mask is all ones and out when mask is zero.
	for i := 0; i < p256Limbs; i++ {
		tmp := mask & (in[i] ^ out[i])
		out[i] ^= tmp
	}
}

// p256SelectAffinePoint sets {out_x,out_y} to the index'th entry of table.
//
// On entry: index < 16. table holds the entries for indices 1..15
// consecutively (2*p256Limbs words each); index 0 selects the implicit
// all-zero entry.
func p256SelectAffinePoint(xOut, yOut *[p256Limbs]uint32, table []uint32, index uint32) {
	for i := range xOut {
		xOut[i] = 0
	}
	for i := range yOut {
		yOut[i] = 0
	}

	// Scan every entry so the access pattern is independent of index.
	for i := uint32(1); i < 16; i++ {
		// Branch-free equality test: i^index is zero only when i == index.
		// OR-folding its four bits into bit 0 and decrementing yields
		// mask = 0xffffffff when i == index and 0 otherwise.
		mask := i ^ index
		mask |= mask >> 2
		mask |= mask >> 1
		mask &= 1
		mask--
		// The table is consumed sequentially: p256Limbs words of x, then
		// p256Limbs words of y, per entry.
		for j := range xOut {
			xOut[j] |= table[0] & mask
			table = table[1:]
		}
		for j := range yOut {
			yOut[j] |= table[0] & mask
			table = table[1:]
		}
	}
}

// p256SelectJacobianPoint sets {out_x,out_y,out_z} to the index'th entry of
// table.
// On entry: index < 16, table[0] must be zero.
func p256SelectJacobianPoint(xOut, yOut, zOut *[p256Limbs]uint32, table *[16][3][p256Limbs]uint32, index uint32) {
	for i := range xOut {
		xOut[i] = 0
	}
	for i := range yOut {
		yOut[i] = 0
	}
	for i := range zOut {
		zOut[i] = 0
	}

	// The implicit value at index 0 is all zero. We don't need to perform that
	// iteration of the loop because we already set out_* to zero.
	for i := uint32(1); i < 16; i++ {
		// Same branch-free equality mask as in p256SelectAffinePoint.
		mask := i ^ index
		mask |= mask >> 2
		mask |= mask >> 1
		mask &= 1
		mask--
		for j := range xOut {
			xOut[j] |= table[i][0][j] & mask
		}
		for j := range yOut {
			yOut[j] |= table[i][1][j] & mask
		}
		for j := range zOut {
			zOut[j] |= table[i][2][j] & mask
		}
	}
}

// p256GetBit returns the bit'th bit of scalar.
func p256GetBit(scalar *[32]uint8, bit uint) uint32 {
	return uint32(((scalar[bit>>3]) >> (bit & 7)) & 1)
}

// p256ScalarBaseMult sets {xOut,yOut,zOut} = scalar*G where scalar is a
// little-endian number. Note that the value of scalar must be less than the
// order of the group.
func p256ScalarBaseMult(xOut, yOut, zOut *[p256Limbs]uint32, scalar *[32]uint8) {
	// nIsInfinityMask is all ones while the accumulator {xOut,yOut,zOut}
	// is still the point at infinity; it is cleared (branch-free) once a
	// non-zero table entry has been added.
	nIsInfinityMask := ^uint32(0)
	var pIsNoninfiniteMask, mask, tableOffset uint32
	var px, py, tx, ty, tz [p256Limbs]uint32

	for i := range xOut {
		xOut[i] = 0
	}
	for i := range yOut {
		yOut[i] = 0
	}
	for i := range zOut {
		zOut[i] = 0
	}

	// The loop adds bits at positions 0, 64, 128 and 192, followed by
	// positions 32,96,160 and 224 and does this 32 times.
	for i := uint(0); i < 32; i++ {
		if i != 0 {
			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
		}
		tableOffset = 0
		for j := uint(0); j <= 32; j += 32 {
			// Four scalar bits, 64 positions apart, form the index into
			// the precomputed table.
			bit0 := p256GetBit(scalar, 31-i+j)
			bit1 := p256GetBit(scalar, 95-i+j)
			bit2 := p256GetBit(scalar, 159-i+j)
			bit3 := p256GetBit(scalar, 223-i+j)
			index := bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3)

			// Each table block holds 15 affine entries of 2*p256Limbs
			// words, hence the 30*p256Limbs stride.
			p256SelectAffinePoint(&px, &py, p256Precomputed[tableOffset:], index)
			tableOffset += 30 * p256Limbs

			// Since scalar is less than the order of the group, we know that
			// {xOut,yOut,zOut} != {px,py,1}, unless both are zero, which we handle
			// below.
			p256PointAddMixed(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py)
			// The result of pointAddMixed is incorrect if {xOut,yOut,zOut} is zero
			// (a.k.a. the point at infinity). We handle that situation by
			// copying the point from the table.
			p256CopyConditional(xOut, &px, nIsInfinityMask)
			p256CopyConditional(yOut, &py, nIsInfinityMask)
			p256CopyConditional(zOut, &p256One, nIsInfinityMask)

			// Equally, the result is also wrong if the point from the table is
			// zero, which happens when the index is zero. We handle that by
			// only copying from {tx,ty,tz} to {xOut,yOut,zOut} if index != 0.
			pIsNoninfiniteMask = nonZeroToAllOnes(index)
			mask = pIsNoninfiniteMask & ^nIsInfinityMask
			p256CopyConditional(xOut, &tx, mask)
			p256CopyConditional(yOut, &ty, mask)
			p256CopyConditional(zOut, &tz, mask)
			// If p was not zero, then n is now non-zero.
			nIsInfinityMask &^= pIsNoninfiniteMask
		}
	}
}

// p256PointToAffine converts a Jacobian point to an affine point. If the input
// is the point at infinity then it returns (0, 0) in constant time.
func p256PointToAffine(xOut, yOut, x, y, z *[p256Limbs]uint32) {
	var zInv, zInvSq [p256Limbs]uint32

	p256Invert(&zInv, z)           // zInv = z^{-1} (zero maps to zero)
	p256Square(&zInvSq, &zInv)     // zInvSq = z^{-2}
	p256Mul(xOut, x, &zInvSq)      // xOut = x/z²
	p256Mul(&zInv, &zInv, &zInvSq) // zInv = z^{-3}
	p256Mul(yOut, y, &zInv)        // yOut = y/z³
}

// p256ToAffine returns a pair of *big.Int containing the affine representation
// of {x,y,z}.
func p256ToAffine(x, y, z *[p256Limbs]uint32) (xOut, yOut *big.Int) {
	var xx, yy [p256Limbs]uint32
	p256PointToAffine(&xx, &yy, x, y, z)
	return p256ToBig(&xx), p256ToBig(&yy)
}

// p256ScalarMult sets {xOut,yOut,zOut} = scalar*{x,y}.
func p256ScalarMult(xOut, yOut, zOut, x, y *[p256Limbs]uint32, scalar *[32]uint8) {
	var px, py, pz, tx, ty, tz [p256Limbs]uint32
	var precomp [16][3][p256Limbs]uint32
	var nIsInfinityMask, index, pIsNoninfiniteMask, mask uint32

	// We precompute 0,1,2,... times {x,y}. precomp[0] is left as the
	// all-zero (infinity) entry; 2k by doubling k, 2k+1 by adding {x,y}.
	precomp[1][0] = *x
	precomp[1][1] = *y
	precomp[1][2] = p256One

	for i := 2; i < 16; i += 2 {
		p256PointDouble(&precomp[i][0], &precomp[i][1], &precomp[i][2], &precomp[i/2][0], &precomp[i/2][1], &precomp[i/2][2])
		p256PointAddMixed(&precomp[i+1][0], &precomp[i+1][1], &precomp[i+1][2], &precomp[i][0], &precomp[i][1], &precomp[i][2], x, y)
	}

	for i := range xOut {
		xOut[i] = 0
	}
	for i := range yOut {
		yOut[i] = 0
	}
	for i := range zOut {
		zOut[i] = 0
	}
	nIsInfinityMask = ^uint32(0)

	// We add in a window of four bits each iteration and do this 64 times.
	for i := 0; i < 64; i++ {
		if i != 0 {
			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
		}

		// Take the next 4-bit window, most significant first: the high
		// nibble of a byte on even i, the low nibble on odd i.
		index = uint32(scalar[31-i/2])
		if (i & 1) == 1 {
			index &= 15
		} else {
			index >>= 4
		}

		// See the comments in scalarBaseMult about handling infinities.
		p256SelectJacobianPoint(&px, &py, &pz, &precomp, index)
		p256PointAdd(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py, &pz)
		p256CopyConditional(xOut, &px, nIsInfinityMask)
		p256CopyConditional(yOut, &py, nIsInfinityMask)
		p256CopyConditional(zOut, &pz, nIsInfinityMask)

		pIsNoninfiniteMask = nonZeroToAllOnes(index)
		mask = pIsNoninfiniteMask & ^nIsInfinityMask
		p256CopyConditional(xOut, &tx, mask)
		p256CopyConditional(yOut, &ty, mask)
		p256CopyConditional(zOut, &tz, mask)
		nIsInfinityMask &^= pIsNoninfiniteMask
	}
}

// p256FromBig sets out = R*in where R = 2**257, i.e. it converts in to
// Montgomery form.
1144 func p256FromBig(out *[p256Limbs]uint32, in *big.Int) { 1145 tmp := new(big.Int).Lsh(in, 257) 1146 tmp.Mod(tmp, p256.P) 1147 1148 for i := 0; i < p256Limbs; i++ { 1149 if bits := tmp.Bits(); len(bits) > 0 { 1150 out[i] = uint32(bits[0]) & bottom29Bits 1151 } else { 1152 out[i] = 0 1153 } 1154 tmp.Rsh(tmp, 29) 1155 1156 i++ 1157 if i == p256Limbs { 1158 break 1159 } 1160 1161 if bits := tmp.Bits(); len(bits) > 0 { 1162 out[i] = uint32(bits[0]) & bottom28Bits 1163 } else { 1164 out[i] = 0 1165 } 1166 tmp.Rsh(tmp, 28) 1167 } 1168 } 1169 1170 // p256ToBig returns a *big.Int containing the value of in. 1171 func p256ToBig(in *[p256Limbs]uint32) *big.Int { 1172 result, tmp := new(big.Int), new(big.Int) 1173 1174 result.SetInt64(int64(in[p256Limbs-1])) 1175 for i := p256Limbs - 2; i >= 0; i-- { 1176 if (i & 1) == 0 { 1177 result.Lsh(result, 29) 1178 } else { 1179 result.Lsh(result, 28) 1180 } 1181 tmp.SetInt64(int64(in[i])) 1182 result.Add(result, tmp) 1183 } 1184 1185 result.Mul(result, p256RInverse) 1186 result.Mod(result, p256.P) 1187 return result 1188 }