github.com/mangodowner/go-gm@v0.0.0-20180818020936-8baa2bd4408c/src/crypto/elliptic/p256.go (about)

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !amd64
     6  
     7  package elliptic
     8  
     9  // This file contains a constant-time, 32-bit implementation of P256.
    10  
    11  import (
    12  	"math/big"
    13  )
    14  
// p256Curve embeds *CurveParams and is the receiver type for the P-256
// specific Params, ScalarBaseMult and ScalarMult methods in this file.
type p256Curve struct {
	*CurveParams
}
    18  
var (
	// p256Params holds the P-256 curve parameters. It is nil until initP256
	// has run.
	p256Params *CurveParams

	// p256RInverse contains 1/R mod p - the inverse of the Montgomery
	// constant (2**257). It is nil until initP256 has run.
	p256RInverse *big.Int
)
    26  
    27  func initP256() {
    28  	// See FIPS 186-3, section D.2.3
    29  	p256Params = &CurveParams{Name: "P-256"}
    30  	p256Params.P, _ = new(big.Int).SetString("115792089210356248762697446949407573530086143415290314195533631308867097853951", 10)
    31  	p256Params.N, _ = new(big.Int).SetString("115792089210356248762697446949407573529996955224135760342422259061068512044369", 10)
    32  	p256Params.B, _ = new(big.Int).SetString("5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b", 16)
    33  	p256Params.Gx, _ = new(big.Int).SetString("6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296", 16)
    34  	p256Params.Gy, _ = new(big.Int).SetString("4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5", 16)
    35  	p256Params.BitSize = 256
    36  
    37  	p256RInverse, _ = new(big.Int).SetString("7fffffff00000001fffffffe8000000100000000ffffffff0000000180000000", 16)
    38  
    39  	// Arch-specific initialization, i.e. let a platform dynamically pick a P256 implementation
    40  	initP256Arch()
    41  }
    42  
// Params returns the *CurveParams embedded in curve.
func (curve p256Curve) Params() *CurveParams {
	return curve.CurveParams
}
    46  
    47  // p256GetScalar endian-swaps the big-endian scalar value from in and writes it
    48  // to out. If the scalar is equal or greater than the order of the group, it's
    49  // reduced modulo that order.
    50  func p256GetScalar(out *[32]byte, in []byte) {
    51  	n := new(big.Int).SetBytes(in)
    52  	var scalarBytes []byte
    53  
    54  	if n.Cmp(p256Params.N) >= 0 {
    55  		n.Mod(n, p256Params.N)
    56  		scalarBytes = n.Bytes()
    57  	} else {
    58  		scalarBytes = in
    59  	}
    60  
    61  	for i, v := range scalarBytes {
    62  		out[len(scalarBytes)-(1+i)] = v
    63  	}
    64  }
    65  
    66  func (p256Curve) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
    67  	var scalarReversed [32]byte
    68  	p256GetScalar(&scalarReversed, scalar)
    69  
    70  	var x1, y1, z1 [p256Limbs]uint32
    71  	p256ScalarBaseMult(&x1, &y1, &z1, &scalarReversed)
    72  	return p256ToAffine(&x1, &y1, &z1)
    73  }
    74  
    75  func (p256Curve) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
    76  	var scalarReversed [32]byte
    77  	p256GetScalar(&scalarReversed, scalar)
    78  
    79  	var px, py, x1, y1, z1 [p256Limbs]uint32
    80  	p256FromBig(&px, bigX)
    81  	p256FromBig(&py, bigY)
    82  	p256ScalarMult(&x1, &y1, &z1, &px, &py, &scalarReversed)
    83  	return p256ToAffine(&x1, &y1, &z1)
    84  }
    85  
    86  // Field elements are represented as nine, unsigned 32-bit words.
    87  //
// The value of a field element is:
    89  //   x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228)
    90  //
    91  // That is, each limb is alternately 29 or 28-bits wide in little-endian
    92  // order.
    93  //
    94  // This means that a field element hits 2**257, rather than 2**256 as we would
    95  // like. A 28, 29, ... pattern would cause us to hit 2**256, but that causes
    96  // problems when multiplying as terms end up one bit short of a limb which
    97  // would require much bit-shifting to correct.
    98  //
    99  // Finally, the values stored in a field element are in Montgomery form. So the
   100  // value |y| is stored as (y*R) mod p, where p is the P-256 prime and R is
   101  // 2**257.
   102  
const (
	p256Limbs    = 9          // number of 32-bit words in a field element
	bottom29Bits = 0x1fffffff // mask selecting the low 29 bits of a word
)
   107  
var (
	// p256One is the number 1 as a field element and p256Zero is the number
	// 0 as a field element.
	p256One  = [p256Limbs]uint32{2, 0, 0, 0xffff800, 0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff, 0}
	p256Zero = [p256Limbs]uint32{0, 0, 0, 0, 0, 0, 0, 0, 0}
	// p256P is the prime modulus as a field element.
	p256P = [p256Limbs]uint32{0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff, 0, 0, 0x200000, 0xf000000, 0xfffffff}
	// p2562P is twice the prime modulus as a field element.
	p2562P = [p256Limbs]uint32{0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff, 0, 0, 0x400000, 0xe000000, 0x1fffffff}
)
   117  
   118  // p256Precomputed contains precomputed values to aid the calculation of scalar
   119  // multiples of the base point, G. It's actually two, equal length, tables
   120  // concatenated.
   121  //
   122  // The first table contains (x,y) field element pairs for 16 multiples of the
   123  // base point, G.
   124  //
   125  //   Index  |  Index (binary) | Value
   126  //       0  |           0000  | 0G (all zeros, omitted)
   127  //       1  |           0001  | G
   128  //       2  |           0010  | 2**64G
   129  //       3  |           0011  | 2**64G + G
   130  //       4  |           0100  | 2**128G
   131  //       5  |           0101  | 2**128G + G
   132  //       6  |           0110  | 2**128G + 2**64G
   133  //       7  |           0111  | 2**128G + 2**64G + G
   134  //       8  |           1000  | 2**192G
   135  //       9  |           1001  | 2**192G + G
   136  //      10  |           1010  | 2**192G + 2**64G
   137  //      11  |           1011  | 2**192G + 2**64G + G
   138  //      12  |           1100  | 2**192G + 2**128G
   139  //      13  |           1101  | 2**192G + 2**128G + G
   140  //      14  |           1110  | 2**192G + 2**128G + 2**64G
   141  //      15  |           1111  | 2**192G + 2**128G + 2**64G + G
   142  //
   143  // The second table follows the same style, but the terms are 2**32G,
   144  // 2**96G, 2**160G, 2**224G.
   145  //
// This is ~2KB of data: 2 tables * 15 points * 2 coordinates * 9 limbs *
// 4 bytes = 2160 bytes.
   147  var p256Precomputed = [p256Limbs * 2 * 15 * 2]uint32{
   148  	0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7edc, 0xd4a6eab, 0x3120bee,
   149  	0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba21, 0x14b10bb, 0xae3fe3,
   150  	0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe49073, 0x3fa36cc, 0x5ebcd2c,
   151  	0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea12446, 0xe1ade1e, 0xec91f22,
   152  	0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109, 0xa267a00, 0xb57c050,
   153  	0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0x7d6dee7, 0x2976e4b,
   154  	0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a5a9, 0x843a649, 0xc3ab0fa,
   155  	0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11, 0x58c43df, 0xf423fc2,
   156  	0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db40f, 0x83e277d, 0xb0dd609,
   157  	0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5, 0xe10c9e, 0x33ab581,
   158  	0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f, 0x48764cd, 0x76dbcca,
   159  	0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b20, 0x4ba3173, 0xc168c33,
   160  	0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0, 0x65dd7ff, 0x3a1e4f6,
   161  	0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f077, 0xa6add89, 0x4894acd,
   162  	0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0,
   163  	0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c, 0xda0cf5b, 0x812e881,
   164  	0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51, 0xc22be3e, 0xe35e65a,
   165  	0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9, 0x1c5a839, 0x47a1e26,
   166  	0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c502, 0x2f32042, 0xa17769b,
   167  	0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a02, 0x3fc93, 0x5620023,
   168  	0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c, 0x407f75c, 0xbaab133,
   169  	0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea7, 0x3293ac0, 0xcdc98aa,
   170  	0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29,
   171  	0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72, 0x73e1c35, 0xee70fbc,
   172  	0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85, 0x27de188, 0x66f70b8,
   173  	0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae914, 0x2f3ec51, 0x3826b59,
   174  	0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x823d9d2, 0x8213f39,
   175  	0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4a, 0xf5ddc3d, 0x3786689,
   176  	0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a729, 0x4be3499, 0x52b23aa,
   177  	0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb048035, 0xe31de66, 0xc6ecaa3,
   178  	0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a7529, 0xcb7beb1, 0xb2a78a1,
   179  	0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff658, 0xe3d6511, 0xc7d76f,
   180  	0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72,
   181  	0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d32411, 0xb04a838, 0xd760d2d,
   182  	0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11e, 0x20bca9a, 0x66f496b,
   183  	0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a,
   184  	0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56ff, 0x65ef930, 0x21dc4a,
   185  	0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15f, 0x624e62e, 0xa90ae2f,
   186  	0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522b, 0xdc78583, 0x40eeabb,
   187  	0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef34, 0xae2a960, 0x91b8bdc,
   188  	0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0x2413c8e, 0x5425bf9,
   189  	0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633, 0x7c91952, 0xd806dce,
   190  	0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef73, 0x8956f34, 0xe4b5cf2,
   191  	0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7, 0x627b614, 0x7371cca,
   192  	0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc9, 0x9c19bf2, 0x5882229,
   193  	0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b3, 0xe85ff25, 0x408ef57,
   194  	0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113, 0xa4a1769, 0x11fbc6c,
   195  	0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b7, 0x4acbad9, 0x5efc5fa,
   196  	0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc, 0x7bf0fa9, 0x957651,
   197  	0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec,
   198  	0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c12d, 0xf20bd46, 0x1951fa7,
   199  	0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74, 0x99bb618, 0x2db944c,
   200  	0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e74779, 0x576138, 0x9587927,
   201  	0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782d, 0xfc72e0b, 0x701b298,
   202  	0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5d8, 0xf858d3a, 0x942eea8,
   203  	0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a1, 0x8395659, 0x52ed4e2,
   204  	0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c0, 0x6bdf55a, 0x4e4457d,
   205  	0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747b, 0x878558d, 0x7d29aa4,
   206  	0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d7, 0xa5bef68, 0xb7b30d8,
   207  	0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f51951, 0x9d0c177, 0x1c49a78,
   208  }
   209  
   210  // Field element operations:
   211  
   212  // nonZeroToAllOnes returns:
   213  //   0xffffffff for 0 < x <= 2**31
   214  //   0 for x == 0 or x > 2**31.
   215  func nonZeroToAllOnes(x uint32) uint32 {
   216  	return ((x - 1) >> 31) - 1
   217  }
   218  
   219  // p256ReduceCarry adds a multiple of p in order to cancel |carry|,
   220  // which is a term at 2**257.
   221  //
   222  // On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28.
   223  // On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29.
   224  func p256ReduceCarry(inout *[p256Limbs]uint32, carry uint32) {
   225  	carry_mask := nonZeroToAllOnes(carry)
   226  
   227  	inout[0] += carry << 1
   228  	inout[3] += 0x10000000 & carry_mask
   229  	// carry < 2**3 thus (carry << 11) < 2**14 and we added 2**28 in the
   230  	// previous line therefore this doesn't underflow.
   231  	inout[3] -= carry << 11
   232  	inout[4] += (0x20000000 - 1) & carry_mask
   233  	inout[5] += (0x10000000 - 1) & carry_mask
   234  	inout[6] += (0x20000000 - 1) & carry_mask
   235  	inout[6] -= carry << 22
   236  	// This may underflow if carry is non-zero but, if so, we'll fix it in the
   237  	// next line.
   238  	inout[7] -= 1 & carry_mask
   239  	inout[7] += carry << 25
   240  }
   241  
   242  // p256Sum sets out = in+in2.
   243  //
   244  // On entry, in[i]+in2[i] must not overflow a 32-bit word.
   245  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
   246  func p256Sum(out, in, in2 *[p256Limbs]uint32) {
   247  	carry := uint32(0)
   248  	for i := 0; ; i++ {
   249  		out[i] = in[i] + in2[i]
   250  		out[i] += carry
   251  		carry = out[i] >> 29
   252  		out[i] &= bottom29Bits
   253  
   254  		i++
   255  		if i == p256Limbs {
   256  			break
   257  		}
   258  
   259  		out[i] = in[i] + in2[i]
   260  		out[i] += carry
   261  		carry = out[i] >> 28
   262  		out[i] &= bottom28Bits
   263  	}
   264  
   265  	p256ReduceCarry(out, carry)
   266  }
   267  
// Limb values used to build p256Zero31. Each name spells out its value, e.g.
// two30p13m2 is 2**30 + 2**13 - 2**2.
const (
	two30m2    = 1<<30 - 1<<2
	two30p13m2 = 1<<30 + 1<<13 - 1<<2
	two31m2    = 1<<31 - 1<<2
	two31p24m2 = 1<<31 + 1<<24 - 1<<2
	two30m27m2 = 1<<30 - 1<<27 - 1<<2
)
   275  
   276  // p256Zero31 is 0 mod p.
// It is added limb by limb in p256Diff before the carry chain runs.
// NOTE(review): two31m3 is not declared in the visible part of this file;
// confirm where it is defined.
var p256Zero31 = [p256Limbs]uint32{two31m3, two30m2, two31m2, two30p13m2, two31m2, two30m2, two31p24m2, two30m27m2, two31m2}
   278  
   279  // p256Diff sets out = in-in2.
   280  //
   281  // On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
   282  //           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
   283  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   284  func p256Diff(out, in, in2 *[p256Limbs]uint32) {
   285  	var carry uint32
   286  
   287  	for i := 0; ; i++ {
   288  		out[i] = in[i] - in2[i]
   289  		out[i] += p256Zero31[i]
   290  		out[i] += carry
   291  		carry = out[i] >> 29
   292  		out[i] &= bottom29Bits
   293  
   294  		i++
   295  		if i == p256Limbs {
   296  			break
   297  		}
   298  
   299  		out[i] = in[i] - in2[i]
   300  		out[i] += p256Zero31[i]
   301  		out[i] += carry
   302  		carry = out[i] >> 28
   303  		out[i] &= bottom28Bits
   304  	}
   305  
   306  	p256ReduceCarry(out, carry)
   307  }
   308  
// p256ReduceDegree sets out = tmp/R mod p where tmp contains 64-bit words with
// the same 29,28,... bit positions as a field element.
//
// The values in field elements are in Montgomery form: x*R mod p where R =
// 2**257. Since we just multiplied two Montgomery values together, the result
// is x*y*R*R mod p. We wish to divide by R in order for the result also to be
// in Montgomery form.
//
// The function works in three phases:
//   1. tmp is split into 18 non-overlapping 29/28-bit words (tmp2).
//   2. The low nine words of tmp2 are cleared by adding multiples of p.
//   3. The remaining words are shifted down into out with a carry chain and
//      the final carry is handed to p256ReduceCarry.
//
// tmp is passed by value, so the caller's array is not modified.
//
// On entry: tmp[i] < 2**64
// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
func p256ReduceDegree(out *[p256Limbs]uint32, tmp [17]uint64) {
	// The following table may be helpful when reading this code:
	//
	// Limb number:   0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10...
	// Width (bits):  29| 28| 29| 28| 29| 28| 29| 28| 29| 28| 29
	// Start bit:     0 | 29| 57| 86|114|143|171|200|228|257|285
	//   (odd phase): 0 | 28| 57| 85|114|142|171|199|228|256|285
	var tmp2 [18]uint32
	var carry, x, xMask uint32

	// tmp contains 64-bit words with the same 29,28,29-bit positions as a
	// field element. So the top of an element of tmp might overlap with
	// another element two positions down. The following loop eliminates
	// this overlap.
	tmp2[0] = uint32(tmp[0]) & bottom29Bits

	tmp2[1] = uint32(tmp[0]) >> 29
	tmp2[1] |= (uint32(tmp[0]>>32) << 3) & bottom28Bits
	tmp2[1] += uint32(tmp[1]) & bottom28Bits
	carry = tmp2[1] >> 28
	tmp2[1] &= bottom28Bits

	// Each pass handles one even (29-bit) word and then, after the extra
	// i++ in the body, one odd (28-bit) word.
	for i := 2; i < 17; i++ {
		tmp2[i] = (uint32(tmp[i-2] >> 32)) >> 25
		tmp2[i] += (uint32(tmp[i-1])) >> 28
		tmp2[i] += (uint32(tmp[i-1]>>32) << 4) & bottom29Bits
		tmp2[i] += uint32(tmp[i]) & bottom29Bits
		tmp2[i] += carry
		carry = tmp2[i] >> 29
		tmp2[i] &= bottom29Bits

		i++
		if i == 17 {
			break
		}
		tmp2[i] = uint32(tmp[i-2]>>32) >> 25
		tmp2[i] += uint32(tmp[i-1]) >> 29
		tmp2[i] += ((uint32(tmp[i-1] >> 32)) << 3) & bottom28Bits
		tmp2[i] += uint32(tmp[i]) & bottom28Bits
		tmp2[i] += carry
		carry = tmp2[i] >> 28
		tmp2[i] &= bottom28Bits
	}

	// The top word collects the remaining high bits of tmp[15] and tmp[16]
	// plus the final carry; it is not masked here.
	tmp2[17] = uint32(tmp[15]>>32) >> 25
	tmp2[17] += uint32(tmp[16]) >> 29
	tmp2[17] += uint32(tmp[16]>>32) << 3
	tmp2[17] += carry

	// Montgomery elimination of terms:
	//
	// Since R is 2**257, we can divide by R with a bitwise shift if we can
	// ensure that the right-most 257 bits are all zero. We can make that true
	// by adding multiples of p without affecting the value.
	//
	// So we eliminate limbs from right to left. Since the bottom 29 bits of p
	// are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
	// We can do that for 8 further limbs and then right shift to eliminate the
	// extra factor of R.
	for i := 0; ; i += 2 {
		tmp2[i+1] += tmp2[i] >> 29
		x = tmp2[i] & bottom29Bits
		xMask = nonZeroToAllOnes(x)
		tmp2[i] = 0

		// The bounds calculations for this loop are tricky. Each iteration of
		// the loop eliminates two words by adding values to words to their
		// right.
		//
		// The following table contains the amounts added to each word (as an
		// offset from the value of i at the top of the loop). The amounts are
		// accounted for from the first and second half of the loop separately
		// and are written as, for example, 28 to mean a value <2**28.
		//
		// Word:                   3   4   5   6   7   8   9   10
		// Added in top half:     28  11      29  21  29  28
		//                                        28  29
		//                                            29
		// Added in bottom half:      29  10      28  21  28   28
		//                                            29
		//
		// The value that is currently offset 7 will be offset 5 for the next
		// iteration and then offset 3 for the iteration after that. Therefore
		// the total value added will be the values added at 7, 5 and 3.
		//
		// The following table accumulates these values. The sums at the bottom
		// are written as, for example, 29+28, to mean a value < 2**29+2**28.
		//
		// Word:                   3   4   5   6   7   8   9  10  11  12  13
		//                        28  11  10  29  21  29  28  28  28  28  28
		//                            29  28  11  28  29  28  29  28  29  28
		//                                    29  28  21  21  29  21  29  21
		//                                        10  29  28  21  28  21  28
		//                                        28  29  28  29  28  29  28
		//                                            11  10  29  10  29  10
		//                                            29  28  11  28  11
		//                                                    29      29
		//                        --------------------------------------------
		//                                                30+ 31+ 30+ 31+ 30+
		//                                                28+ 29+ 28+ 29+ 21+
		//                                                21+ 28+ 21+ 28+ 10
		//                                                10  21+ 10  21+
		//                                                    11      11
		//
		// So the greatest amount is added to tmp2[10] and tmp2[12]. If
		// tmp2[10/12] has an initial value of <2**29, then the maximum value
		// will be < 2**31 + 2**30 + 2**28 + 2**21 + 2**11, which is < 2**32,
		// as required.
		tmp2[i+3] += (x << 10) & bottom28Bits
		tmp2[i+4] += (x >> 18)

		tmp2[i+6] += (x << 21) & bottom29Bits
		tmp2[i+7] += x >> 8

		// At position 200, which is the starting bit position for word 7, we
		// have a factor of 0xf000000 = 2**28 - 2**24.
		tmp2[i+7] += 0x10000000 & xMask
		tmp2[i+8] += (x - 1) & xMask
		tmp2[i+7] -= (x << 24) & bottom28Bits
		tmp2[i+8] -= x >> 4

		tmp2[i+8] += 0x20000000 & xMask
		tmp2[i+8] -= x
		tmp2[i+8] += (x << 28) & bottom29Bits
		tmp2[i+9] += ((x >> 1) - 1) & xMask

		// The ninth word (i == 8) is the last one to eliminate; it has no
		// odd partner, so the loop stops here.
		if i+1 == p256Limbs {
			break
		}
		tmp2[i+2] += tmp2[i+1] >> 28
		x = tmp2[i+1] & bottom28Bits
		xMask = nonZeroToAllOnes(x)
		tmp2[i+1] = 0

		tmp2[i+4] += (x << 11) & bottom29Bits
		tmp2[i+5] += (x >> 18)

		tmp2[i+7] += (x << 21) & bottom28Bits
		tmp2[i+8] += x >> 7

		// At position 199, which is the starting bit of the 8th word when
		// dealing with a context starting on an odd word, we have a factor of
		// 0x1e000000 = 2**29 - 2**25. Since we have not updated i, the 8th
		// word from i+1 is i+8.
		tmp2[i+8] += 0x20000000 & xMask
		tmp2[i+9] += (x - 1) & xMask
		tmp2[i+8] -= (x << 25) & bottom29Bits
		tmp2[i+9] -= x >> 4

		tmp2[i+9] += 0x10000000 & xMask
		tmp2[i+9] -= x
		tmp2[i+10] += (x - 1) & xMask
	}

	// We merge the right shift with a carry chain. The words above 2**257 have
	// widths of 28,29,... which we need to correct when copying them down.
	// As above, the extra i++ in the body means each pass produces one
	// 29-bit and one 28-bit output word.
	carry = 0
	for i := 0; i < 8; i++ {
		// The maximum value of tmp2[i + 9] occurs on the first iteration and
		// is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is
		// therefore safe.
		out[i] = tmp2[i+9]
		out[i] += carry
		out[i] += (tmp2[i+10] << 28) & bottom29Bits
		carry = out[i] >> 29
		out[i] &= bottom29Bits

		i++
		out[i] = tmp2[i+9] >> 1
		out[i] += carry
		carry = out[i] >> 28
		out[i] &= bottom28Bits
	}

	out[8] = tmp2[17]
	out[8] += carry
	carry = out[8] >> 29
	out[8] &= bottom29Bits

	// Cancel the carry out of the top limb.
	p256ReduceCarry(out, carry)
}
   500  
   501  // p256Square sets out=in*in.
   502  //
   503  // On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29.
   504  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   505  func p256Square(out, in *[p256Limbs]uint32) {
   506  	var tmp [17]uint64
   507  
   508  	tmp[0] = uint64(in[0]) * uint64(in[0])
   509  	tmp[1] = uint64(in[0]) * (uint64(in[1]) << 1)
   510  	tmp[2] = uint64(in[0])*(uint64(in[2])<<1) +
   511  		uint64(in[1])*(uint64(in[1])<<1)
   512  	tmp[3] = uint64(in[0])*(uint64(in[3])<<1) +
   513  		uint64(in[1])*(uint64(in[2])<<1)
   514  	tmp[4] = uint64(in[0])*(uint64(in[4])<<1) +
   515  		uint64(in[1])*(uint64(in[3])<<2) +
   516  		uint64(in[2])*uint64(in[2])
   517  	tmp[5] = uint64(in[0])*(uint64(in[5])<<1) +
   518  		uint64(in[1])*(uint64(in[4])<<1) +
   519  		uint64(in[2])*(uint64(in[3])<<1)
   520  	tmp[6] = uint64(in[0])*(uint64(in[6])<<1) +
   521  		uint64(in[1])*(uint64(in[5])<<2) +
   522  		uint64(in[2])*(uint64(in[4])<<1) +
   523  		uint64(in[3])*(uint64(in[3])<<1)
   524  	tmp[7] = uint64(in[0])*(uint64(in[7])<<1) +
   525  		uint64(in[1])*(uint64(in[6])<<1) +
   526  		uint64(in[2])*(uint64(in[5])<<1) +
   527  		uint64(in[3])*(uint64(in[4])<<1)
   528  	// tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60,
   529  	// which is < 2**64 as required.
   530  	tmp[8] = uint64(in[0])*(uint64(in[8])<<1) +
   531  		uint64(in[1])*(uint64(in[7])<<2) +
   532  		uint64(in[2])*(uint64(in[6])<<1) +
   533  		uint64(in[3])*(uint64(in[5])<<2) +
   534  		uint64(in[4])*uint64(in[4])
   535  	tmp[9] = uint64(in[1])*(uint64(in[8])<<1) +
   536  		uint64(in[2])*(uint64(in[7])<<1) +
   537  		uint64(in[3])*(uint64(in[6])<<1) +
   538  		uint64(in[4])*(uint64(in[5])<<1)
   539  	tmp[10] = uint64(in[2])*(uint64(in[8])<<1) +
   540  		uint64(in[3])*(uint64(in[7])<<2) +
   541  		uint64(in[4])*(uint64(in[6])<<1) +
   542  		uint64(in[5])*(uint64(in[5])<<1)
   543  	tmp[11] = uint64(in[3])*(uint64(in[8])<<1) +
   544  		uint64(in[4])*(uint64(in[7])<<1) +
   545  		uint64(in[5])*(uint64(in[6])<<1)
   546  	tmp[12] = uint64(in[4])*(uint64(in[8])<<1) +
   547  		uint64(in[5])*(uint64(in[7])<<2) +
   548  		uint64(in[6])*uint64(in[6])
   549  	tmp[13] = uint64(in[5])*(uint64(in[8])<<1) +
   550  		uint64(in[6])*(uint64(in[7])<<1)
   551  	tmp[14] = uint64(in[6])*(uint64(in[8])<<1) +
   552  		uint64(in[7])*(uint64(in[7])<<1)
   553  	tmp[15] = uint64(in[7]) * (uint64(in[8]) << 1)
   554  	tmp[16] = uint64(in[8]) * uint64(in[8])
   555  
   556  	p256ReduceDegree(out, tmp)
   557  }
   558  
   559  // p256Mul sets out=in*in2.
   560  //
   561  // On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
   562  //           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
   563  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   564  func p256Mul(out, in, in2 *[p256Limbs]uint32) {
   565  	var tmp [17]uint64
   566  
   567  	tmp[0] = uint64(in[0]) * uint64(in2[0])
   568  	tmp[1] = uint64(in[0])*(uint64(in2[1])<<0) +
   569  		uint64(in[1])*(uint64(in2[0])<<0)
   570  	tmp[2] = uint64(in[0])*(uint64(in2[2])<<0) +
   571  		uint64(in[1])*(uint64(in2[1])<<1) +
   572  		uint64(in[2])*(uint64(in2[0])<<0)
   573  	tmp[3] = uint64(in[0])*(uint64(in2[3])<<0) +
   574  		uint64(in[1])*(uint64(in2[2])<<0) +
   575  		uint64(in[2])*(uint64(in2[1])<<0) +
   576  		uint64(in[3])*(uint64(in2[0])<<0)
   577  	tmp[4] = uint64(in[0])*(uint64(in2[4])<<0) +
   578  		uint64(in[1])*(uint64(in2[3])<<1) +
   579  		uint64(in[2])*(uint64(in2[2])<<0) +
   580  		uint64(in[3])*(uint64(in2[1])<<1) +
   581  		uint64(in[4])*(uint64(in2[0])<<0)
   582  	tmp[5] = uint64(in[0])*(uint64(in2[5])<<0) +
   583  		uint64(in[1])*(uint64(in2[4])<<0) +
   584  		uint64(in[2])*(uint64(in2[3])<<0) +
   585  		uint64(in[3])*(uint64(in2[2])<<0) +
   586  		uint64(in[4])*(uint64(in2[1])<<0) +
   587  		uint64(in[5])*(uint64(in2[0])<<0)
   588  	tmp[6] = uint64(in[0])*(uint64(in2[6])<<0) +
   589  		uint64(in[1])*(uint64(in2[5])<<1) +
   590  		uint64(in[2])*(uint64(in2[4])<<0) +
   591  		uint64(in[3])*(uint64(in2[3])<<1) +
   592  		uint64(in[4])*(uint64(in2[2])<<0) +
   593  		uint64(in[5])*(uint64(in2[1])<<1) +
   594  		uint64(in[6])*(uint64(in2[0])<<0)
   595  	tmp[7] = uint64(in[0])*(uint64(in2[7])<<0) +
   596  		uint64(in[1])*(uint64(in2[6])<<0) +
   597  		uint64(in[2])*(uint64(in2[5])<<0) +
   598  		uint64(in[3])*(uint64(in2[4])<<0) +
   599  		uint64(in[4])*(uint64(in2[3])<<0) +
   600  		uint64(in[5])*(uint64(in2[2])<<0) +
   601  		uint64(in[6])*(uint64(in2[1])<<0) +
   602  		uint64(in[7])*(uint64(in2[0])<<0)
   603  	// tmp[8] has the greatest value but doesn't overflow. See logic in
   604  	// p256Square.
   605  	tmp[8] = uint64(in[0])*(uint64(in2[8])<<0) +
   606  		uint64(in[1])*(uint64(in2[7])<<1) +
   607  		uint64(in[2])*(uint64(in2[6])<<0) +
   608  		uint64(in[3])*(uint64(in2[5])<<1) +
   609  		uint64(in[4])*(uint64(in2[4])<<0) +
   610  		uint64(in[5])*(uint64(in2[3])<<1) +
   611  		uint64(in[6])*(uint64(in2[2])<<0) +
   612  		uint64(in[7])*(uint64(in2[1])<<1) +
   613  		uint64(in[8])*(uint64(in2[0])<<0)
   614  	tmp[9] = uint64(in[1])*(uint64(in2[8])<<0) +
   615  		uint64(in[2])*(uint64(in2[7])<<0) +
   616  		uint64(in[3])*(uint64(in2[6])<<0) +
   617  		uint64(in[4])*(uint64(in2[5])<<0) +
   618  		uint64(in[5])*(uint64(in2[4])<<0) +
   619  		uint64(in[6])*(uint64(in2[3])<<0) +
   620  		uint64(in[7])*(uint64(in2[2])<<0) +
   621  		uint64(in[8])*(uint64(in2[1])<<0)
   622  	tmp[10] = uint64(in[2])*(uint64(in2[8])<<0) +
   623  		uint64(in[3])*(uint64(in2[7])<<1) +
   624  		uint64(in[4])*(uint64(in2[6])<<0) +
   625  		uint64(in[5])*(uint64(in2[5])<<1) +
   626  		uint64(in[6])*(uint64(in2[4])<<0) +
   627  		uint64(in[7])*(uint64(in2[3])<<1) +
   628  		uint64(in[8])*(uint64(in2[2])<<0)
   629  	tmp[11] = uint64(in[3])*(uint64(in2[8])<<0) +
   630  		uint64(in[4])*(uint64(in2[7])<<0) +
   631  		uint64(in[5])*(uint64(in2[6])<<0) +
   632  		uint64(in[6])*(uint64(in2[5])<<0) +
   633  		uint64(in[7])*(uint64(in2[4])<<0) +
   634  		uint64(in[8])*(uint64(in2[3])<<0)
   635  	tmp[12] = uint64(in[4])*(uint64(in2[8])<<0) +
   636  		uint64(in[5])*(uint64(in2[7])<<1) +
   637  		uint64(in[6])*(uint64(in2[6])<<0) +
   638  		uint64(in[7])*(uint64(in2[5])<<1) +
   639  		uint64(in[8])*(uint64(in2[4])<<0)
   640  	tmp[13] = uint64(in[5])*(uint64(in2[8])<<0) +
   641  		uint64(in[6])*(uint64(in2[7])<<0) +
   642  		uint64(in[7])*(uint64(in2[6])<<0) +
   643  		uint64(in[8])*(uint64(in2[5])<<0)
   644  	tmp[14] = uint64(in[6])*(uint64(in2[8])<<0) +
   645  		uint64(in[7])*(uint64(in2[7])<<1) +
   646  		uint64(in[8])*(uint64(in2[6])<<0)
   647  	tmp[15] = uint64(in[7])*(uint64(in2[8])<<0) +
   648  		uint64(in[8])*(uint64(in2[7])<<0)
   649  	tmp[16] = uint64(in[8]) * (uint64(in2[8]) << 0)
   650  
   651  	p256ReduceDegree(out, tmp)
   652  }
   653  
   654  func p256Assign(out, in *[p256Limbs]uint32) {
   655  	*out = *in
   656  }
   657  
   658  // p256Invert calculates |out| = |in|^{-1}
   659  //
   660  // Based on Fermat's Little Theorem:
   661  //   a^p = a (mod p)
   662  //   a^{p-1} = 1 (mod p)
   663  //   a^{p-2} = a^{-1} (mod p)
   664  func p256Invert(out, in *[p256Limbs]uint32) {
   665  	var ftmp, ftmp2 [p256Limbs]uint32
   666  
   667  	// each e_I will hold |in|^{2^I - 1}
   668  	var e2, e4, e8, e16, e32, e64 [p256Limbs]uint32
   669  
   670  	p256Square(&ftmp, in)     // 2^1
   671  	p256Mul(&ftmp, in, &ftmp) // 2^2 - 2^0
   672  	p256Assign(&e2, &ftmp)
   673  	p256Square(&ftmp, &ftmp)   // 2^3 - 2^1
   674  	p256Square(&ftmp, &ftmp)   // 2^4 - 2^2
   675  	p256Mul(&ftmp, &ftmp, &e2) // 2^4 - 2^0
   676  	p256Assign(&e4, &ftmp)
   677  	p256Square(&ftmp, &ftmp)   // 2^5 - 2^1
   678  	p256Square(&ftmp, &ftmp)   // 2^6 - 2^2
   679  	p256Square(&ftmp, &ftmp)   // 2^7 - 2^3
   680  	p256Square(&ftmp, &ftmp)   // 2^8 - 2^4
   681  	p256Mul(&ftmp, &ftmp, &e4) // 2^8 - 2^0
   682  	p256Assign(&e8, &ftmp)
   683  	for i := 0; i < 8; i++ {
   684  		p256Square(&ftmp, &ftmp)
   685  	} // 2^16 - 2^8
   686  	p256Mul(&ftmp, &ftmp, &e8) // 2^16 - 2^0
   687  	p256Assign(&e16, &ftmp)
   688  	for i := 0; i < 16; i++ {
   689  		p256Square(&ftmp, &ftmp)
   690  	} // 2^32 - 2^16
   691  	p256Mul(&ftmp, &ftmp, &e16) // 2^32 - 2^0
   692  	p256Assign(&e32, &ftmp)
   693  	for i := 0; i < 32; i++ {
   694  		p256Square(&ftmp, &ftmp)
   695  	} // 2^64 - 2^32
   696  	p256Assign(&e64, &ftmp)
   697  	p256Mul(&ftmp, &ftmp, in) // 2^64 - 2^32 + 2^0
   698  	for i := 0; i < 192; i++ {
   699  		p256Square(&ftmp, &ftmp)
   700  	} // 2^256 - 2^224 + 2^192
   701  
   702  	p256Mul(&ftmp2, &e64, &e32) // 2^64 - 2^0
   703  	for i := 0; i < 16; i++ {
   704  		p256Square(&ftmp2, &ftmp2)
   705  	} // 2^80 - 2^16
   706  	p256Mul(&ftmp2, &ftmp2, &e16) // 2^80 - 2^0
   707  	for i := 0; i < 8; i++ {
   708  		p256Square(&ftmp2, &ftmp2)
   709  	} // 2^88 - 2^8
   710  	p256Mul(&ftmp2, &ftmp2, &e8) // 2^88 - 2^0
   711  	for i := 0; i < 4; i++ {
   712  		p256Square(&ftmp2, &ftmp2)
   713  	} // 2^92 - 2^4
   714  	p256Mul(&ftmp2, &ftmp2, &e4) // 2^92 - 2^0
   715  	p256Square(&ftmp2, &ftmp2)   // 2^93 - 2^1
   716  	p256Square(&ftmp2, &ftmp2)   // 2^94 - 2^2
   717  	p256Mul(&ftmp2, &ftmp2, &e2) // 2^94 - 2^0
   718  	p256Square(&ftmp2, &ftmp2)   // 2^95 - 2^1
   719  	p256Square(&ftmp2, &ftmp2)   // 2^96 - 2^2
   720  	p256Mul(&ftmp2, &ftmp2, in)  // 2^96 - 3
   721  
   722  	p256Mul(out, &ftmp2, &ftmp) // 2^256 - 2^224 + 2^192 + 2^96 - 3
   723  }
   724  
   725  // p256Scalar3 sets out=3*out.
   726  //
   727  // On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   728  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   729  func p256Scalar3(out *[p256Limbs]uint32) {
   730  	var carry uint32
   731  
   732  	for i := 0; ; i++ {
   733  		out[i] *= 3
   734  		out[i] += carry
   735  		carry = out[i] >> 29
   736  		out[i] &= bottom29Bits
   737  
   738  		i++
   739  		if i == p256Limbs {
   740  			break
   741  		}
   742  
   743  		out[i] *= 3
   744  		out[i] += carry
   745  		carry = out[i] >> 28
   746  		out[i] &= bottom28Bits
   747  	}
   748  
   749  	p256ReduceCarry(out, carry)
   750  }
   751  
   752  // p256Scalar4 sets out=4*out.
   753  //
   754  // On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   755  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   756  func p256Scalar4(out *[p256Limbs]uint32) {
   757  	var carry, nextCarry uint32
   758  
   759  	for i := 0; ; i++ {
   760  		nextCarry = out[i] >> 27
   761  		out[i] <<= 2
   762  		out[i] &= bottom29Bits
   763  		out[i] += carry
   764  		carry = nextCarry + (out[i] >> 29)
   765  		out[i] &= bottom29Bits
   766  
   767  		i++
   768  		if i == p256Limbs {
   769  			break
   770  		}
   771  		nextCarry = out[i] >> 26
   772  		out[i] <<= 2
   773  		out[i] &= bottom28Bits
   774  		out[i] += carry
   775  		carry = nextCarry + (out[i] >> 28)
   776  		out[i] &= bottom28Bits
   777  	}
   778  
   779  	p256ReduceCarry(out, carry)
   780  }
   781  
   782  // p256Scalar8 sets out=8*out.
   783  //
   784  // On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   785  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   786  func p256Scalar8(out *[p256Limbs]uint32) {
   787  	var carry, nextCarry uint32
   788  
   789  	for i := 0; ; i++ {
   790  		nextCarry = out[i] >> 26
   791  		out[i] <<= 3
   792  		out[i] &= bottom29Bits
   793  		out[i] += carry
   794  		carry = nextCarry + (out[i] >> 29)
   795  		out[i] &= bottom29Bits
   796  
   797  		i++
   798  		if i == p256Limbs {
   799  			break
   800  		}
   801  		nextCarry = out[i] >> 25
   802  		out[i] <<= 3
   803  		out[i] &= bottom28Bits
   804  		out[i] += carry
   805  		carry = nextCarry + (out[i] >> 28)
   806  		out[i] &= bottom28Bits
   807  	}
   808  
   809  	p256ReduceCarry(out, carry)
   810  }
   811  
   812  // Group operations:
   813  //
// Elements of the elliptic curve group are represented in Jacobian
// coordinates: (x, y, z). The Jacobian point (x, y, z) corresponds to the
// affine point (x', y') with x' = x/z**2 and y' = y/z**3.
   817  
   818  // p256PointDouble sets {xOut,yOut,zOut} = 2*{x,y,z}.
   819  //
   820  // See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
   821  func p256PointDouble(xOut, yOut, zOut, x, y, z *[p256Limbs]uint32) {
   822  	var delta, gamma, alpha, beta, tmp, tmp2 [p256Limbs]uint32
   823  
   824  	p256Square(&delta, z)
   825  	p256Square(&gamma, y)
   826  	p256Mul(&beta, x, &gamma)
   827  
   828  	p256Sum(&tmp, x, &delta)
   829  	p256Diff(&tmp2, x, &delta)
   830  	p256Mul(&alpha, &tmp, &tmp2)
   831  	p256Scalar3(&alpha)
   832  
   833  	p256Sum(&tmp, y, z)
   834  	p256Square(&tmp, &tmp)
   835  	p256Diff(&tmp, &tmp, &gamma)
   836  	p256Diff(zOut, &tmp, &delta)
   837  
   838  	p256Scalar4(&beta)
   839  	p256Square(xOut, &alpha)
   840  	p256Diff(xOut, xOut, &beta)
   841  	p256Diff(xOut, xOut, &beta)
   842  
   843  	p256Diff(&tmp, &beta, xOut)
   844  	p256Mul(&tmp, &alpha, &tmp)
   845  	p256Square(&tmp2, &gamma)
   846  	p256Scalar8(&tmp2)
   847  	p256Diff(yOut, &tmp, &tmp2)
   848  }
   849  
   850  // p256PointAddMixed sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,1}.
   851  // (i.e. the second point is affine.)
   852  //
   853  // See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
   854  //
   855  // Note that this function does not handle P+P, infinity+P nor P+infinity
   856  // correctly.
   857  func p256PointAddMixed(xOut, yOut, zOut, x1, y1, z1, x2, y2 *[p256Limbs]uint32) {
   858  	var z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32
   859  
   860  	p256Square(&z1z1, z1)
   861  	p256Sum(&tmp, z1, z1)
   862  
   863  	p256Mul(&u2, x2, &z1z1)
   864  	p256Mul(&z1z1z1, z1, &z1z1)
   865  	p256Mul(&s2, y2, &z1z1z1)
   866  	p256Diff(&h, &u2, x1)
   867  	p256Sum(&i, &h, &h)
   868  	p256Square(&i, &i)
   869  	p256Mul(&j, &h, &i)
   870  	p256Diff(&r, &s2, y1)
   871  	p256Sum(&r, &r, &r)
   872  	p256Mul(&v, x1, &i)
   873  
   874  	p256Mul(zOut, &tmp, &h)
   875  	p256Square(&rr, &r)
   876  	p256Diff(xOut, &rr, &j)
   877  	p256Diff(xOut, xOut, &v)
   878  	p256Diff(xOut, xOut, &v)
   879  
   880  	p256Diff(&tmp, &v, xOut)
   881  	p256Mul(yOut, &tmp, &r)
   882  	p256Mul(&tmp, y1, &j)
   883  	p256Diff(yOut, yOut, &tmp)
   884  	p256Diff(yOut, yOut, &tmp)
   885  }
   886  
   887  // p256PointAdd sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,z2}.
   888  //
   889  // See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
   890  //
   891  // Note that this function does not handle P+P, infinity+P nor P+infinity
   892  // correctly.
   893  func p256PointAdd(xOut, yOut, zOut, x1, y1, z1, x2, y2, z2 *[p256Limbs]uint32) {
   894  	var z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32
   895  
   896  	p256Square(&z1z1, z1)
   897  	p256Square(&z2z2, z2)
   898  	p256Mul(&u1, x1, &z2z2)
   899  
   900  	p256Sum(&tmp, z1, z2)
   901  	p256Square(&tmp, &tmp)
   902  	p256Diff(&tmp, &tmp, &z1z1)
   903  	p256Diff(&tmp, &tmp, &z2z2)
   904  
   905  	p256Mul(&z2z2z2, z2, &z2z2)
   906  	p256Mul(&s1, y1, &z2z2z2)
   907  
   908  	p256Mul(&u2, x2, &z1z1)
   909  	p256Mul(&z1z1z1, z1, &z1z1)
   910  	p256Mul(&s2, y2, &z1z1z1)
   911  	p256Diff(&h, &u2, &u1)
   912  	p256Sum(&i, &h, &h)
   913  	p256Square(&i, &i)
   914  	p256Mul(&j, &h, &i)
   915  	p256Diff(&r, &s2, &s1)
   916  	p256Sum(&r, &r, &r)
   917  	p256Mul(&v, &u1, &i)
   918  
   919  	p256Mul(zOut, &tmp, &h)
   920  	p256Square(&rr, &r)
   921  	p256Diff(xOut, &rr, &j)
   922  	p256Diff(xOut, xOut, &v)
   923  	p256Diff(xOut, xOut, &v)
   924  
   925  	p256Diff(&tmp, &v, xOut)
   926  	p256Mul(yOut, &tmp, &r)
   927  	p256Mul(&tmp, &s1, &j)
   928  	p256Diff(yOut, yOut, &tmp)
   929  	p256Diff(yOut, yOut, &tmp)
   930  }
   931  
   932  // p256CopyConditional sets out=in if mask = 0xffffffff in constant time.
   933  //
   934  // On entry: mask is either 0 or 0xffffffff.
   935  func p256CopyConditional(out, in *[p256Limbs]uint32, mask uint32) {
   936  	for i := 0; i < p256Limbs; i++ {
   937  		tmp := mask & (in[i] ^ out[i])
   938  		out[i] ^= tmp
   939  	}
   940  }
   941  
   942  // p256SelectAffinePoint sets {out_x,out_y} to the index'th entry of table.
   943  // On entry: index < 16, table[0] must be zero.
   944  func p256SelectAffinePoint(xOut, yOut *[p256Limbs]uint32, table []uint32, index uint32) {
   945  	for i := range xOut {
   946  		xOut[i] = 0
   947  	}
   948  	for i := range yOut {
   949  		yOut[i] = 0
   950  	}
   951  
   952  	for i := uint32(1); i < 16; i++ {
   953  		mask := i ^ index
   954  		mask |= mask >> 2
   955  		mask |= mask >> 1
   956  		mask &= 1
   957  		mask--
   958  		for j := range xOut {
   959  			xOut[j] |= table[0] & mask
   960  			table = table[1:]
   961  		}
   962  		for j := range yOut {
   963  			yOut[j] |= table[0] & mask
   964  			table = table[1:]
   965  		}
   966  	}
   967  }
   968  
   969  // p256SelectJacobianPoint sets {out_x,out_y,out_z} to the index'th entry of
   970  // table.
   971  // On entry: index < 16, table[0] must be zero.
   972  func p256SelectJacobianPoint(xOut, yOut, zOut *[p256Limbs]uint32, table *[16][3][p256Limbs]uint32, index uint32) {
   973  	for i := range xOut {
   974  		xOut[i] = 0
   975  	}
   976  	for i := range yOut {
   977  		yOut[i] = 0
   978  	}
   979  	for i := range zOut {
   980  		zOut[i] = 0
   981  	}
   982  
   983  	// The implicit value at index 0 is all zero. We don't need to perform that
   984  	// iteration of the loop because we already set out_* to zero.
   985  	for i := uint32(1); i < 16; i++ {
   986  		mask := i ^ index
   987  		mask |= mask >> 2
   988  		mask |= mask >> 1
   989  		mask &= 1
   990  		mask--
   991  		for j := range xOut {
   992  			xOut[j] |= table[i][0][j] & mask
   993  		}
   994  		for j := range yOut {
   995  			yOut[j] |= table[i][1][j] & mask
   996  		}
   997  		for j := range zOut {
   998  			zOut[j] |= table[i][2][j] & mask
   999  		}
  1000  	}
  1001  }
  1002  
  1003  // p256GetBit returns the bit'th bit of scalar.
  1004  func p256GetBit(scalar *[32]uint8, bit uint) uint32 {
  1005  	return uint32(((scalar[bit>>3]) >> (bit & 7)) & 1)
  1006  }
  1007  
  1008  // p256ScalarBaseMult sets {xOut,yOut,zOut} = scalar*G where scalar is a
  1009  // little-endian number. Note that the value of scalar must be less than the
  1010  // order of the group.
  1011  func p256ScalarBaseMult(xOut, yOut, zOut *[p256Limbs]uint32, scalar *[32]uint8) {
  1012  	nIsInfinityMask := ^uint32(0)
  1013  	var pIsNoninfiniteMask, mask, tableOffset uint32
  1014  	var px, py, tx, ty, tz [p256Limbs]uint32
  1015  
  1016  	for i := range xOut {
  1017  		xOut[i] = 0
  1018  	}
  1019  	for i := range yOut {
  1020  		yOut[i] = 0
  1021  	}
  1022  	for i := range zOut {
  1023  		zOut[i] = 0
  1024  	}
  1025  
  1026  	// The loop adds bits at positions 0, 64, 128 and 192, followed by
  1027  	// positions 32,96,160 and 224 and does this 32 times.
  1028  	for i := uint(0); i < 32; i++ {
  1029  		if i != 0 {
  1030  			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1031  		}
  1032  		tableOffset = 0
  1033  		for j := uint(0); j <= 32; j += 32 {
  1034  			bit0 := p256GetBit(scalar, 31-i+j)
  1035  			bit1 := p256GetBit(scalar, 95-i+j)
  1036  			bit2 := p256GetBit(scalar, 159-i+j)
  1037  			bit3 := p256GetBit(scalar, 223-i+j)
  1038  			index := bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3)
  1039  
  1040  			p256SelectAffinePoint(&px, &py, p256Precomputed[tableOffset:], index)
  1041  			tableOffset += 30 * p256Limbs
  1042  
  1043  			// Since scalar is less than the order of the group, we know that
  1044  			// {xOut,yOut,zOut} != {px,py,1}, unless both are zero, which we handle
  1045  			// below.
  1046  			p256PointAddMixed(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py)
  1047  			// The result of pointAddMixed is incorrect if {xOut,yOut,zOut} is zero
  1048  			// (a.k.a.  the point at infinity). We handle that situation by
  1049  			// copying the point from the table.
  1050  			p256CopyConditional(xOut, &px, nIsInfinityMask)
  1051  			p256CopyConditional(yOut, &py, nIsInfinityMask)
  1052  			p256CopyConditional(zOut, &p256One, nIsInfinityMask)
  1053  
  1054  			// Equally, the result is also wrong if the point from the table is
  1055  			// zero, which happens when the index is zero. We handle that by
  1056  			// only copying from {tx,ty,tz} to {xOut,yOut,zOut} if index != 0.
  1057  			pIsNoninfiniteMask = nonZeroToAllOnes(index)
  1058  			mask = pIsNoninfiniteMask & ^nIsInfinityMask
  1059  			p256CopyConditional(xOut, &tx, mask)
  1060  			p256CopyConditional(yOut, &ty, mask)
  1061  			p256CopyConditional(zOut, &tz, mask)
  1062  			// If p was not zero, then n is now non-zero.
  1063  			nIsInfinityMask &^= pIsNoninfiniteMask
  1064  		}
  1065  	}
  1066  }
  1067  
  1068  // p256PointToAffine converts a Jacobian point to an affine point. If the input
  1069  // is the point at infinity then it returns (0, 0) in constant time.
  1070  func p256PointToAffine(xOut, yOut, x, y, z *[p256Limbs]uint32) {
  1071  	var zInv, zInvSq [p256Limbs]uint32
  1072  
  1073  	p256Invert(&zInv, z)
  1074  	p256Square(&zInvSq, &zInv)
  1075  	p256Mul(xOut, x, &zInvSq)
  1076  	p256Mul(&zInv, &zInv, &zInvSq)
  1077  	p256Mul(yOut, y, &zInv)
  1078  }
  1079  
  1080  // p256ToAffine returns a pair of *big.Int containing the affine representation
  1081  // of {x,y,z}.
  1082  func p256ToAffine(x, y, z *[p256Limbs]uint32) (xOut, yOut *big.Int) {
  1083  	var xx, yy [p256Limbs]uint32
  1084  	p256PointToAffine(&xx, &yy, x, y, z)
  1085  	return p256ToBig(&xx), p256ToBig(&yy)
  1086  }
  1087  
  1088  // p256ScalarMult sets {xOut,yOut,zOut} = scalar*{x,y}.
  1089  func p256ScalarMult(xOut, yOut, zOut, x, y *[p256Limbs]uint32, scalar *[32]uint8) {
  1090  	var px, py, pz, tx, ty, tz [p256Limbs]uint32
  1091  	var precomp [16][3][p256Limbs]uint32
  1092  	var nIsInfinityMask, index, pIsNoninfiniteMask, mask uint32
  1093  
  1094  	// We precompute 0,1,2,... times {x,y}.
  1095  	precomp[1][0] = *x
  1096  	precomp[1][1] = *y
  1097  	precomp[1][2] = p256One
  1098  
  1099  	for i := 2; i < 16; i += 2 {
  1100  		p256PointDouble(&precomp[i][0], &precomp[i][1], &precomp[i][2], &precomp[i/2][0], &precomp[i/2][1], &precomp[i/2][2])
  1101  		p256PointAddMixed(&precomp[i+1][0], &precomp[i+1][1], &precomp[i+1][2], &precomp[i][0], &precomp[i][1], &precomp[i][2], x, y)
  1102  	}
  1103  
  1104  	for i := range xOut {
  1105  		xOut[i] = 0
  1106  	}
  1107  	for i := range yOut {
  1108  		yOut[i] = 0
  1109  	}
  1110  	for i := range zOut {
  1111  		zOut[i] = 0
  1112  	}
  1113  	nIsInfinityMask = ^uint32(0)
  1114  
  1115  	// We add in a window of four bits each iteration and do this 64 times.
  1116  	for i := 0; i < 64; i++ {
  1117  		if i != 0 {
  1118  			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1119  			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1120  			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1121  			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1122  		}
  1123  
  1124  		index = uint32(scalar[31-i/2])
  1125  		if (i & 1) == 1 {
  1126  			index &= 15
  1127  		} else {
  1128  			index >>= 4
  1129  		}
  1130  
  1131  		// See the comments in scalarBaseMult about handling infinities.
  1132  		p256SelectJacobianPoint(&px, &py, &pz, &precomp, index)
  1133  		p256PointAdd(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py, &pz)
  1134  		p256CopyConditional(xOut, &px, nIsInfinityMask)
  1135  		p256CopyConditional(yOut, &py, nIsInfinityMask)
  1136  		p256CopyConditional(zOut, &pz, nIsInfinityMask)
  1137  
  1138  		pIsNoninfiniteMask = nonZeroToAllOnes(index)
  1139  		mask = pIsNoninfiniteMask & ^nIsInfinityMask
  1140  		p256CopyConditional(xOut, &tx, mask)
  1141  		p256CopyConditional(yOut, &ty, mask)
  1142  		p256CopyConditional(zOut, &tz, mask)
  1143  		nIsInfinityMask &^= pIsNoninfiniteMask
  1144  	}
  1145  }
  1146  
  1147  // p256FromBig sets out = R*in.
  1148  func p256FromBig(out *[p256Limbs]uint32, in *big.Int) {
  1149  	tmp := new(big.Int).Lsh(in, 257)
  1150  	tmp.Mod(tmp, p256Params.P)
  1151  
  1152  	for i := 0; i < p256Limbs; i++ {
  1153  		if bits := tmp.Bits(); len(bits) > 0 {
  1154  			out[i] = uint32(bits[0]) & bottom29Bits
  1155  		} else {
  1156  			out[i] = 0
  1157  		}
  1158  		tmp.Rsh(tmp, 29)
  1159  
  1160  		i++
  1161  		if i == p256Limbs {
  1162  			break
  1163  		}
  1164  
  1165  		if bits := tmp.Bits(); len(bits) > 0 {
  1166  			out[i] = uint32(bits[0]) & bottom28Bits
  1167  		} else {
  1168  			out[i] = 0
  1169  		}
  1170  		tmp.Rsh(tmp, 28)
  1171  	}
  1172  }
  1173  
  1174  // p256ToBig returns a *big.Int containing the value of in.
  1175  func p256ToBig(in *[p256Limbs]uint32) *big.Int {
  1176  	result, tmp := new(big.Int), new(big.Int)
  1177  
  1178  	result.SetInt64(int64(in[p256Limbs-1]))
  1179  	for i := p256Limbs - 2; i >= 0; i-- {
  1180  		if (i & 1) == 0 {
  1181  			result.Lsh(result, 29)
  1182  		} else {
  1183  			result.Lsh(result, 28)
  1184  		}
  1185  		tmp.SetInt64(int64(in[i]))
  1186  		result.Add(result, tmp)
  1187  	}
  1188  
  1189  	result.Mul(result, p256RInverse)
  1190  	result.Mod(result, p256Params.P)
  1191  	return result
  1192  }