github.com/AESNooper/go/src@v0.0.0-20220218095104-b56a4ab1bbbb/crypto/elliptic/p256.go

// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !amd64 && !arm64

package elliptic

// This file contains a constant-time, 32-bit implementation of P256.

import (
	"math/big"
)

type p256Curve struct {
	*CurveParams
}

var (
	p256Params *CurveParams

	// p256RInverse contains 1/R mod p - the inverse of the Montgomery constant
	// (2**257).
	p256RInverse *big.Int
)

func initP256() {
	// See FIPS 186-3, section D.2.3
	p256Params = &CurveParams{Name: "P-256"}
	p256Params.P, _ = new(big.Int).SetString("115792089210356248762697446949407573530086143415290314195533631308867097853951", 10)
	p256Params.N, _ = new(big.Int).SetString("115792089210356248762697446949407573529996955224135760342422259061068512044369", 10)
	p256Params.B, _ = new(big.Int).SetString("5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b", 16)
	p256Params.Gx, _ = new(big.Int).SetString("6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296", 16)
	p256Params.Gy, _ = new(big.Int).SetString("4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5", 16)
	p256Params.BitSize = 256

	p256RInverse, _ = new(big.Int).SetString("7fffffff00000001fffffffe8000000100000000ffffffff0000000180000000", 16)

	// Arch-specific initialization, i.e. let a platform dynamically pick a
	// P256 implementation.
	initP256Arch()
}

func (curve p256Curve) Params() *CurveParams {
	return curve.CurveParams
}

// p256GetScalar endian-swaps the big-endian scalar value from in and writes it
// to out. If the scalar is equal to or greater than the order of the group,
// it's reduced modulo that order.
func p256GetScalar(out *[32]byte, in []byte) {
	n := new(big.Int).SetBytes(in)
	var scalarBytes []byte

	if n.Cmp(p256Params.N) >= 0 {
		n.Mod(n, p256Params.N)
		scalarBytes = n.Bytes()
	} else {
		scalarBytes = in
	}

	for i, v := range scalarBytes {
		out[len(scalarBytes)-(1+i)] = v
	}
}

func (p256Curve) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
	var scalarReversed [32]byte
	p256GetScalar(&scalarReversed, scalar)

	var x1, y1, z1 [p256Limbs]uint32
	p256ScalarBaseMult(&x1, &y1, &z1, &scalarReversed)
	return p256ToAffine(&x1, &y1, &z1)
}

func (p256Curve) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
	var scalarReversed [32]byte
	p256GetScalar(&scalarReversed, scalar)

	var px, py, x1, y1, z1 [p256Limbs]uint32
	p256FromBig(&px, bigX)
	p256FromBig(&py, bigY)
	p256ScalarMult(&x1, &y1, &z1, &px, &py, &scalarReversed)
	return p256ToAffine(&x1, &y1, &z1)
}

// Field elements are represented as nine unsigned 32-bit words.
//
// The value of a field element is:
//   x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228)
//
// That is, each limb is alternately 29 or 28 bits wide in little-endian
// order.
//
// This means that a field element hits 2**257, rather than 2**256 as we would
// like. A 28, 29, ... pattern would cause us to hit 2**256, but that causes
// problems when multiplying as terms end up one bit short of a limb which
// would require much bit-shifting to correct.
//
// Finally, the values stored in a field element are in Montgomery form. So the
// value |y| is stored as (y*R) mod p, where p is the P-256 prime and R is
// 2**257.
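//
// As a worked check of the layout: the limb widths alternate
// 29, 28, 29, 28, 29, 28, 29, 28, 29 bits, which sum to 257, so limb i starts
// at bit 0, 29, 57, 86, 114, 143, 171, 200 or 228 (the same start positions
// tabulated in p256ReduceDegree below). A minimal sketch, not part of the
// original code, of decoding a field element x back into a *big.Int under
// this layout; the real conversion is p256ToBig at the end of this file:
//
//	v := new(big.Int)
//	for i, shift := 0, uint(0); i < p256Limbs; i++ {
//		limb := new(big.Int).SetUint64(uint64(x[i]))
//		v.Add(v, limb.Lsh(limb, shift))
//		if i%2 == 0 {
//			shift += 29
//		} else {
//			shift += 28
//		}
//	}
//	// v now holds the Montgomery form (value*R) mod p, possibly not fully
//	// reduced; multiplying by p256RInverse and reducing mod p recovers the
//	// plain value.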

const (
	p256Limbs    = 9
	bottom29Bits = 0x1fffffff
)

var (
	// p256One is the number 1 as a field element.
	p256One  = [p256Limbs]uint32{2, 0, 0, 0xffff800, 0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff, 0}
	p256Zero = [p256Limbs]uint32{0, 0, 0, 0, 0, 0, 0, 0, 0}
	// p256P is the prime modulus as a field element.
	p256P = [p256Limbs]uint32{0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff, 0, 0, 0x200000, 0xf000000, 0xfffffff}
	// p2562P is twice the prime modulus as a field element.
	p2562P = [p256Limbs]uint32{0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff, 0, 0, 0x400000, 0xe000000, 0x1fffffff}
)

// p256Precomputed contains precomputed values to aid the calculation of scalar
// multiples of the base point, G. It's actually two equal-length tables
// concatenated.
//
// The first table contains (x,y) field element pairs for 16 multiples of the
// base point, G.
//
//   Index | Index (binary) | Value
//       0 | 0000           | 0G (all zeros, omitted)
//       1 | 0001           | G
//       2 | 0010           | 2**64G
//       3 | 0011           | 2**64G + G
//       4 | 0100           | 2**128G
//       5 | 0101           | 2**128G + G
//       6 | 0110           | 2**128G + 2**64G
//       7 | 0111           | 2**128G + 2**64G + G
//       8 | 1000           | 2**192G
//       9 | 1001           | 2**192G + G
//      10 | 1010           | 2**192G + 2**64G
//      11 | 1011           | 2**192G + 2**64G + G
//      12 | 1100           | 2**192G + 2**128G
//      13 | 1101           | 2**192G + 2**128G + G
//      14 | 1110           | 2**192G + 2**128G + 2**64G
//      15 | 1111           | 2**192G + 2**128G + 2**64G + G
//
// The second table follows the same style, but the terms are 2**32G,
// 2**96G, 2**160G, 2**224G.
//
// This is ~2KB of data.
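//
// As a size check: each table entry is an (x, y) pair of field elements, i.e.
// 2*p256Limbs = 18 uint32 words, and each table stores 15 entries because the
// all-zero entry for index 0 is omitted. Two tables therefore hold
// 2*15*18 = 540 words, or 2160 bytes, matching both the array length
// p256Limbs*2*15*2 below and the "~2KB" figure above.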
var p256Precomputed = [p256Limbs * 2 * 15 * 2]uint32{
	0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7edc, 0xd4a6eab, 0x3120bee,
	0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba21, 0x14b10bb, 0xae3fe3,
	0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe49073, 0x3fa36cc, 0x5ebcd2c,
	0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea12446, 0xe1ade1e, 0xec91f22,
	0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109, 0xa267a00, 0xb57c050,
	0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0x7d6dee7, 0x2976e4b,
	0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a5a9, 0x843a649, 0xc3ab0fa,
	0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11, 0x58c43df, 0xf423fc2,
	0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db40f, 0x83e277d, 0xb0dd609,
	0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5, 0xe10c9e, 0x33ab581,
	0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f, 0x48764cd, 0x76dbcca,
	0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b20, 0x4ba3173, 0xc168c33,
	0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0, 0x65dd7ff, 0x3a1e4f6,
	0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f077, 0xa6add89, 0x4894acd,
	0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0,
	0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c, 0xda0cf5b, 0x812e881,
	0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51, 0xc22be3e, 0xe35e65a,
	0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9, 0x1c5a839, 0x47a1e26,
	0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c502, 0x2f32042, 0xa17769b,
	0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a02, 0x3fc93, 0x5620023,
	0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c, 0x407f75c, 0xbaab133,
	0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea7, 0x3293ac0, 0xcdc98aa,
	0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29,
	0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72, 0x73e1c35, 0xee70fbc,
	0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85, 0x27de188, 0x66f70b8,
	0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae914, 0x2f3ec51, 0x3826b59,
	0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x823d9d2, 0x8213f39,
	0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4a, 0xf5ddc3d, 0x3786689,
	0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a729, 0x4be3499, 0x52b23aa,
	0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb048035, 0xe31de66, 0xc6ecaa3,
	0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a7529, 0xcb7beb1, 0xb2a78a1,
	0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff658, 0xe3d6511, 0xc7d76f,
	0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72,
	0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d32411, 0xb04a838, 0xd760d2d,
	0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11e, 0x20bca9a, 0x66f496b,
	0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a,
	0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56ff, 0x65ef930, 0x21dc4a,
	0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15f, 0x624e62e, 0xa90ae2f,
	0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522b, 0xdc78583, 0x40eeabb,
	0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef34, 0xae2a960, 0x91b8bdc,
	0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0x2413c8e, 0x5425bf9,
	0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633, 0x7c91952, 0xd806dce,
	0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef73, 0x8956f34, 0xe4b5cf2,
	0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7, 0x627b614, 0x7371cca,
	0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc9, 0x9c19bf2, 0x5882229,
	0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b3, 0xe85ff25, 0x408ef57,
	0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113, 0xa4a1769, 0x11fbc6c,
	0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b7, 0x4acbad9, 0x5efc5fa,
	0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc, 0x7bf0fa9, 0x957651,
	0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec,
	0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c12d, 0xf20bd46, 0x1951fa7,
	0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74, 0x99bb618, 0x2db944c,
	0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e74779, 0x576138, 0x9587927,
	0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782d, 0xfc72e0b, 0x701b298,
	0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5d8, 0xf858d3a, 0x942eea8,
	0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a1, 0x8395659, 0x52ed4e2,
	0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c0, 0x6bdf55a, 0x4e4457d,
	0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747b, 0x878558d, 0x7d29aa4,
	0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d7, 0xa5bef68, 0xb7b30d8,
	0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f51951, 0x9d0c177, 0x1c49a78,
}

// Field element operations:

const bottom28Bits = 0xfffffff

// nonZeroToAllOnes returns:
//   0xffffffff for 0 < x <= 2**31
//   0 for x == 0 or x > 2**31.
func nonZeroToAllOnes(x uint32) uint32 {
	return ((x - 1) >> 31) - 1
}
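
// As a worked example of the trick above: for x == 0, x-1 wraps to 0xffffffff,
// so ((x-1)>>31)-1 == 1-1 == 0; for any 0 < x <= 2**31, x-1 stays below 2**31,
// so the shift gives 0 and the final decrement wraps to 0xffffffff. This
// yields a branch-free mask that the code below uses to keep operations
// constant time.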

// p256ReduceCarry adds a multiple of p in order to cancel |carry|,
// which is a term at 2**257.
//
// On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28.
// On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29.
func p256ReduceCarry(inout *[p256Limbs]uint32, carry uint32) {
	carry_mask := nonZeroToAllOnes(carry)

	inout[0] += carry << 1
	inout[3] += 0x10000000 & carry_mask
	// carry < 2**3 thus (carry << 11) < 2**14 and we added 2**28 in the
	// previous line therefore this doesn't underflow.
	inout[3] -= carry << 11
	inout[4] += (0x20000000 - 1) & carry_mask
	inout[5] += (0x10000000 - 1) & carry_mask
	inout[6] += (0x20000000 - 1) & carry_mask
	inout[6] -= carry << 22
	// This may underflow if carry is non-zero but, if so, we'll fix it in the
	// next line.
	inout[7] -= 1 & carry_mask
	inout[7] += carry << 25
}

// p256Sum sets out = in+in2.
//
// On entry, in[i]+in2[i] must not overflow a 32-bit word.
// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
func p256Sum(out, in, in2 *[p256Limbs]uint32) {
	carry := uint32(0)
	for i := 0; ; i++ {
		out[i] = in[i] + in2[i]
		out[i] += carry
		carry = out[i] >> 29
		out[i] &= bottom29Bits

		i++
		if i == p256Limbs {
			break
		}

		out[i] = in[i] + in2[i]
		out[i] += carry
		carry = out[i] >> 28
		out[i] &= bottom28Bits
	}

	p256ReduceCarry(out, carry)
}

const (
	two30m2    = 1<<30 - 1<<2
	two30p13m2 = 1<<30 + 1<<13 - 1<<2
	two31m2    = 1<<31 - 1<<2
	two31m3    = 1<<31 - 1<<3
	two31p24m2 = 1<<31 + 1<<24 - 1<<2
	two30m27m2 = 1<<30 - 1<<27 - 1<<2
)

// p256Zero31 is 0 mod p.
var p256Zero31 = [p256Limbs]uint32{two31m3, two30m2, two31m2, two30p13m2, two31m2, two30m2, two31p24m2, two30m27m2, two31m2}

// p256Diff sets out = in-in2.
//
// On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
// in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
func p256Diff(out, in, in2 *[p256Limbs]uint32) {
	var carry uint32

	for i := 0; ; i++ {
		out[i] = in[i] - in2[i]
		out[i] += p256Zero31[i]
		out[i] += carry
		carry = out[i] >> 29
		out[i] &= bottom29Bits

		i++
		if i == p256Limbs {
			break
		}

		out[i] = in[i] - in2[i]
		out[i] += p256Zero31[i]
		out[i] += carry
		carry = out[i] >> 28
		out[i] &= bottom28Bits
	}

	p256ReduceCarry(out, carry)
}

// p256ReduceDegree sets out = tmp/R mod p where tmp contains 64-bit words with
// the same 29,28,... bit positions as a field element.
//
// The values in field elements are in Montgomery form: x*R mod p where R =
// 2**257. Since we just multiplied two Montgomery values together, the result
// is x*y*R*R mod p. We wish to divide by R in order for the result also to be
// in Montgomery form.
//
// On entry: tmp[i] < 2**64
// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
func p256ReduceDegree(out *[p256Limbs]uint32, tmp [17]uint64) {
	// The following table may be helpful when reading this code:
	//
	// Limb number:   0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10...
	// Width (bits):  29| 28| 29| 28| 29| 28| 29| 28| 29| 28| 29
	// Start bit:     0 | 29| 57| 86|114|143|171|200|228|257|285
	//   (odd phase): 0 | 28| 57| 85|114|142|171|199|228|256|285
	var tmp2 [18]uint32
	var carry, x, xMask uint32

	// tmp contains 64-bit words with the same 29,28,29-bit positions as a
	// field element. So the top of an element of tmp might overlap with
	// another element two positions down. The following loop eliminates
	// this overlap.
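	//
	// For example, tmp[0] can be up to 64 bits wide while limb 0 only holds
	// 29 bits: bits 0..28 stay in tmp2[0], bits 29..56 land in tmp2[1]
	// (which starts at bit 29) and bits 57..63 land in tmp2[2] (which starts
	// at bit 57), exactly as the first few assignments below compute.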
	tmp2[0] = uint32(tmp[0]) & bottom29Bits

	tmp2[1] = uint32(tmp[0]) >> 29
	tmp2[1] |= (uint32(tmp[0]>>32) << 3) & bottom28Bits
	tmp2[1] += uint32(tmp[1]) & bottom28Bits
	carry = tmp2[1] >> 28
	tmp2[1] &= bottom28Bits

	for i := 2; i < 17; i++ {
		tmp2[i] = (uint32(tmp[i-2] >> 32)) >> 25
		tmp2[i] += (uint32(tmp[i-1])) >> 28
		tmp2[i] += (uint32(tmp[i-1]>>32) << 4) & bottom29Bits
		tmp2[i] += uint32(tmp[i]) & bottom29Bits
		tmp2[i] += carry
		carry = tmp2[i] >> 29
		tmp2[i] &= bottom29Bits

		i++
		if i == 17 {
			break
		}
		tmp2[i] = uint32(tmp[i-2]>>32) >> 25
		tmp2[i] += uint32(tmp[i-1]) >> 29
		tmp2[i] += ((uint32(tmp[i-1] >> 32)) << 3) & bottom28Bits
		tmp2[i] += uint32(tmp[i]) & bottom28Bits
		tmp2[i] += carry
		carry = tmp2[i] >> 28
		tmp2[i] &= bottom28Bits
	}

	tmp2[17] = uint32(tmp[15]>>32) >> 25
	tmp2[17] += uint32(tmp[16]) >> 29
	tmp2[17] += uint32(tmp[16]>>32) << 3
	tmp2[17] += carry

	// Montgomery elimination of terms:
	//
	// Since R is 2**257, we can divide by R with a bitwise shift if we can
	// ensure that the right-most 257 bits are all zero. We can make that true
	// by adding multiples of p without affecting the value.
	//
	// So we eliminate limbs from right to left. Since the bottom 29 bits of p
	// are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
	// We can do that for 8 further limbs and then right shift to eliminate the
	// extra factor of R.
	for i := 0; ; i += 2 {
		tmp2[i+1] += tmp2[i] >> 29
		x = tmp2[i] & bottom29Bits
		xMask = nonZeroToAllOnes(x)
		tmp2[i] = 0

		// The bounds calculations for this loop are tricky. Each iteration of
		// the loop eliminates two words by adding values to words to their
		// right.
		//
		// The following table contains the amounts added to each word (as an
		// offset from the value of i at the top of the loop). The amounts are
		// accounted for from the first and second half of the loop separately
		// and are written as, for example, 28 to mean a value <2**28.
		//
		// Word:                  3   4   5   6   7   8   9  10
		// Added in top half:    28  11      29  21  29  28
		//                                       28  29
		//                                           29
		// Added in bottom half:     29  10      28  21  28  28
		//                                           29
		//
		// The value that is currently offset 7 will be offset 5 for the next
		// iteration and then offset 3 for the iteration after that. Therefore
		// the total value added will be the values added at 7, 5 and 3.
		//
		// The following table accumulates these values. The sums at the bottom
		// are written as, for example, 29+28, to mean a value < 2**29+2**28.
		//
		// Word:                  3   4   5   6   7   8   9  10  11  12  13
		//                       28  11  10  29  21  29  28  28  28  28  28
		//                           29  28  11  28  29  28  29  28  29  28
		//                                   29  28  21  21  29  21  29  21
		//                                       10  29  28  21  28  21  28
		//                                       28  29  28  29  28  29  28
		//                                           11  10  29  10  29  10
		//                                           29  28  11  28  11
		//                                                   29      29
		//                       ------------------------------------------
		//                                       30+ 31+ 30+ 31+ 30+
		//                                       28+ 29+ 28+ 29+ 21+
		//                                       21+ 28+ 21+ 28+ 10
		//                                        10 21+  10 21+
		//                                            11      11
		//
		// So the greatest amount is added to tmp2[10] and tmp2[12]. If
		// tmp2[10/12] has an initial value of <2**29, then the maximum value
		// will be < 2**31 + 2**30 + 2**28 + 2**21 + 2**11, which is < 2**32,
		// as required.
		tmp2[i+3] += (x << 10) & bottom28Bits
		tmp2[i+4] += (x >> 18)

		tmp2[i+6] += (x << 21) & bottom29Bits
		tmp2[i+7] += x >> 8

		// At position 200, which is the starting bit position for word 7, we
		// have a factor of 0xf000000 = 2**28 - 2**24.
		tmp2[i+7] += 0x10000000 & xMask
		tmp2[i+8] += (x - 1) & xMask
		tmp2[i+7] -= (x << 24) & bottom28Bits
		tmp2[i+8] -= x >> 4

		tmp2[i+8] += 0x20000000 & xMask
		tmp2[i+8] -= x
		tmp2[i+8] += (x << 28) & bottom29Bits
		tmp2[i+9] += ((x >> 1) - 1) & xMask

		if i+1 == p256Limbs {
			break
		}
		tmp2[i+2] += tmp2[i+1] >> 28
		x = tmp2[i+1] & bottom28Bits
		xMask = nonZeroToAllOnes(x)
		tmp2[i+1] = 0

		tmp2[i+4] += (x << 11) & bottom29Bits
		tmp2[i+5] += (x >> 18)

		tmp2[i+7] += (x << 21) & bottom28Bits
		tmp2[i+8] += x >> 7

		// At position 199, which is the starting bit of the 8th word when
		// dealing with a context starting on an odd word, we have a factor of
		// 0x1e000000 = 2**29 - 2**25. Since we have not updated i, the 8th
		// word from i+1 is i+8.
		tmp2[i+8] += 0x20000000 & xMask
		tmp2[i+9] += (x - 1) & xMask
		tmp2[i+8] -= (x << 25) & bottom29Bits
		tmp2[i+9] -= x >> 4

		tmp2[i+9] += 0x10000000 & xMask
		tmp2[i+9] -= x
		tmp2[i+10] += (x - 1) & xMask
	}

	// We merge the right shift with a carry chain. The words above 2**257 have
	// widths of 28,29,... which we need to correct when copying them down.
	carry = 0
	for i := 0; i < 8; i++ {
		// The maximum value of tmp2[i + 9] occurs on the first iteration and
		// is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is
		// therefore safe.
		out[i] = tmp2[i+9]
		out[i] += carry
		out[i] += (tmp2[i+10] << 28) & bottom29Bits
		carry = out[i] >> 29
		out[i] &= bottom29Bits

		i++
		out[i] = tmp2[i+9] >> 1
		out[i] += carry
		carry = out[i] >> 28
		out[i] &= bottom28Bits
	}

	out[8] = tmp2[17]
	out[8] += carry
	carry = out[8] >> 29
	out[8] &= bottom29Bits

	p256ReduceCarry(out, carry)
}
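
// A note on the shift factors in p256Square and p256Mul below: the product of
// limbs i and j naturally lands at limb position i+j, but because limb widths
// alternate 29 and 28 bits, the start bits only line up exactly when at most
// one of i and j is odd. When both are odd (for example limbs 1 and 3 start at
// bits 29 and 86, which sum to 115, one more than limb 4's start bit of 114),
// the product carries one extra factor of two. Cross terms with i != j also
// appear twice in the schoolbook expansion. That is why, below, even/even and
// even/odd cross terms are shifted by <<1, odd/odd cross terms by <<2, odd
// squares by <<1 and even squares not at all.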

// p256Square sets out=in*in.
//
// On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29.
// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
func p256Square(out, in *[p256Limbs]uint32) {
	var tmp [17]uint64

	tmp[0] = uint64(in[0]) * uint64(in[0])
	tmp[1] = uint64(in[0]) * (uint64(in[1]) << 1)
	tmp[2] = uint64(in[0])*(uint64(in[2])<<1) +
		uint64(in[1])*(uint64(in[1])<<1)
	tmp[3] = uint64(in[0])*(uint64(in[3])<<1) +
		uint64(in[1])*(uint64(in[2])<<1)
	tmp[4] = uint64(in[0])*(uint64(in[4])<<1) +
		uint64(in[1])*(uint64(in[3])<<2) +
		uint64(in[2])*uint64(in[2])
	tmp[5] = uint64(in[0])*(uint64(in[5])<<1) +
		uint64(in[1])*(uint64(in[4])<<1) +
		uint64(in[2])*(uint64(in[3])<<1)
	tmp[6] = uint64(in[0])*(uint64(in[6])<<1) +
		uint64(in[1])*(uint64(in[5])<<2) +
		uint64(in[2])*(uint64(in[4])<<1) +
		uint64(in[3])*(uint64(in[3])<<1)
	tmp[7] = uint64(in[0])*(uint64(in[7])<<1) +
		uint64(in[1])*(uint64(in[6])<<1) +
		uint64(in[2])*(uint64(in[5])<<1) +
		uint64(in[3])*(uint64(in[4])<<1)
	// tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60,
	// which is < 2**64 as required.
	tmp[8] = uint64(in[0])*(uint64(in[8])<<1) +
		uint64(in[1])*(uint64(in[7])<<2) +
		uint64(in[2])*(uint64(in[6])<<1) +
		uint64(in[3])*(uint64(in[5])<<2) +
		uint64(in[4])*uint64(in[4])
	tmp[9] = uint64(in[1])*(uint64(in[8])<<1) +
		uint64(in[2])*(uint64(in[7])<<1) +
		uint64(in[3])*(uint64(in[6])<<1) +
		uint64(in[4])*(uint64(in[5])<<1)
	tmp[10] = uint64(in[2])*(uint64(in[8])<<1) +
		uint64(in[3])*(uint64(in[7])<<2) +
		uint64(in[4])*(uint64(in[6])<<1) +
		uint64(in[5])*(uint64(in[5])<<1)
	tmp[11] = uint64(in[3])*(uint64(in[8])<<1) +
		uint64(in[4])*(uint64(in[7])<<1) +
		uint64(in[5])*(uint64(in[6])<<1)
	tmp[12] = uint64(in[4])*(uint64(in[8])<<1) +
		uint64(in[5])*(uint64(in[7])<<2) +
		uint64(in[6])*uint64(in[6])
	tmp[13] = uint64(in[5])*(uint64(in[8])<<1) +
		uint64(in[6])*(uint64(in[7])<<1)
	tmp[14] = uint64(in[6])*(uint64(in[8])<<1) +
		uint64(in[7])*(uint64(in[7])<<1)
	tmp[15] = uint64(in[7]) * (uint64(in[8]) << 1)
	tmp[16] = uint64(in[8]) * uint64(in[8])

	p256ReduceDegree(out, tmp)
}

// p256Mul sets out=in*in2.
//
// On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
// in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
func p256Mul(out, in, in2 *[p256Limbs]uint32) {
	var tmp [17]uint64

	tmp[0] = uint64(in[0]) * uint64(in2[0])
	tmp[1] = uint64(in[0])*(uint64(in2[1])<<0) +
		uint64(in[1])*(uint64(in2[0])<<0)
	tmp[2] = uint64(in[0])*(uint64(in2[2])<<0) +
		uint64(in[1])*(uint64(in2[1])<<1) +
		uint64(in[2])*(uint64(in2[0])<<0)
	tmp[3] = uint64(in[0])*(uint64(in2[3])<<0) +
		uint64(in[1])*(uint64(in2[2])<<0) +
		uint64(in[2])*(uint64(in2[1])<<0) +
		uint64(in[3])*(uint64(in2[0])<<0)
	tmp[4] = uint64(in[0])*(uint64(in2[4])<<0) +
		uint64(in[1])*(uint64(in2[3])<<1) +
		uint64(in[2])*(uint64(in2[2])<<0) +
		uint64(in[3])*(uint64(in2[1])<<1) +
		uint64(in[4])*(uint64(in2[0])<<0)
	tmp[5] = uint64(in[0])*(uint64(in2[5])<<0) +
		uint64(in[1])*(uint64(in2[4])<<0) +
		uint64(in[2])*(uint64(in2[3])<<0) +
		uint64(in[3])*(uint64(in2[2])<<0) +
		uint64(in[4])*(uint64(in2[1])<<0) +
		uint64(in[5])*(uint64(in2[0])<<0)
	tmp[6] = uint64(in[0])*(uint64(in2[6])<<0) +
		uint64(in[1])*(uint64(in2[5])<<1) +
		uint64(in[2])*(uint64(in2[4])<<0) +
		uint64(in[3])*(uint64(in2[3])<<1) +
		uint64(in[4])*(uint64(in2[2])<<0) +
		uint64(in[5])*(uint64(in2[1])<<1) +
		uint64(in[6])*(uint64(in2[0])<<0)
	tmp[7] = uint64(in[0])*(uint64(in2[7])<<0) +
		uint64(in[1])*(uint64(in2[6])<<0) +
		uint64(in[2])*(uint64(in2[5])<<0) +
		uint64(in[3])*(uint64(in2[4])<<0) +
		uint64(in[4])*(uint64(in2[3])<<0) +
		uint64(in[5])*(uint64(in2[2])<<0) +
		uint64(in[6])*(uint64(in2[1])<<0) +
		uint64(in[7])*(uint64(in2[0])<<0)
	// tmp[8] has the greatest value but doesn't overflow. See logic in
	// p256Square.
	tmp[8] = uint64(in[0])*(uint64(in2[8])<<0) +
		uint64(in[1])*(uint64(in2[7])<<1) +
		uint64(in[2])*(uint64(in2[6])<<0) +
		uint64(in[3])*(uint64(in2[5])<<1) +
		uint64(in[4])*(uint64(in2[4])<<0) +
		uint64(in[5])*(uint64(in2[3])<<1) +
		uint64(in[6])*(uint64(in2[2])<<0) +
		uint64(in[7])*(uint64(in2[1])<<1) +
		uint64(in[8])*(uint64(in2[0])<<0)
	tmp[9] = uint64(in[1])*(uint64(in2[8])<<0) +
		uint64(in[2])*(uint64(in2[7])<<0) +
		uint64(in[3])*(uint64(in2[6])<<0) +
		uint64(in[4])*(uint64(in2[5])<<0) +
		uint64(in[5])*(uint64(in2[4])<<0) +
		uint64(in[6])*(uint64(in2[3])<<0) +
		uint64(in[7])*(uint64(in2[2])<<0) +
		uint64(in[8])*(uint64(in2[1])<<0)
	tmp[10] = uint64(in[2])*(uint64(in2[8])<<0) +
		uint64(in[3])*(uint64(in2[7])<<1) +
		uint64(in[4])*(uint64(in2[6])<<0) +
		uint64(in[5])*(uint64(in2[5])<<1) +
		uint64(in[6])*(uint64(in2[4])<<0) +
		uint64(in[7])*(uint64(in2[3])<<1) +
		uint64(in[8])*(uint64(in2[2])<<0)
	tmp[11] = uint64(in[3])*(uint64(in2[8])<<0) +
		uint64(in[4])*(uint64(in2[7])<<0) +
		uint64(in[5])*(uint64(in2[6])<<0) +
		uint64(in[6])*(uint64(in2[5])<<0) +
		uint64(in[7])*(uint64(in2[4])<<0) +
		uint64(in[8])*(uint64(in2[3])<<0)
	tmp[12] = uint64(in[4])*(uint64(in2[8])<<0) +
		uint64(in[5])*(uint64(in2[7])<<1) +
		uint64(in[6])*(uint64(in2[6])<<0) +
		uint64(in[7])*(uint64(in2[5])<<1) +
		uint64(in[8])*(uint64(in2[4])<<0)
	tmp[13] = uint64(in[5])*(uint64(in2[8])<<0) +
		uint64(in[6])*(uint64(in2[7])<<0) +
		uint64(in[7])*(uint64(in2[6])<<0) +
		uint64(in[8])*(uint64(in2[5])<<0)
	tmp[14] = uint64(in[6])*(uint64(in2[8])<<0) +
		uint64(in[7])*(uint64(in2[7])<<1) +
		uint64(in[8])*(uint64(in2[6])<<0)
	tmp[15] = uint64(in[7])*(uint64(in2[8])<<0) +
		uint64(in[8])*(uint64(in2[7])<<0)
	tmp[16] = uint64(in[8]) * (uint64(in2[8]) << 0)

	p256ReduceDegree(out, tmp)
}

func p256Assign(out, in *[p256Limbs]uint32) {
	*out = *in
}
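
// A note on the exponent used by p256Invert below: the P-256 prime is
// p = 2**256 - 2**224 + 2**192 + 2**96 - 1, so by Fermat's little theorem the
// inverse is in**(p-2) where p-2 = 2**256 - 2**224 + 2**192 + 2**96 - 3,
// which is exactly the exponent built up by the addition chain in the
// function (see the running comments on each step).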

// p256Invert calculates |out| = |in|^{-1}
//
// Based on Fermat's Little Theorem:
//   a^p = a (mod p)
//   a^{p-1} = 1 (mod p)
//   a^{p-2} = a^{-1} (mod p)
func p256Invert(out, in *[p256Limbs]uint32) {
	var ftmp, ftmp2 [p256Limbs]uint32

	// each e_I will hold |in|^{2^I - 1}
	var e2, e4, e8, e16, e32, e64 [p256Limbs]uint32

	p256Square(&ftmp, in)     // 2^1
	p256Mul(&ftmp, in, &ftmp) // 2^2 - 2^0
	p256Assign(&e2, &ftmp)
	p256Square(&ftmp, &ftmp)   // 2^3 - 2^1
	p256Square(&ftmp, &ftmp)   // 2^4 - 2^2
	p256Mul(&ftmp, &ftmp, &e2) // 2^4 - 2^0
	p256Assign(&e4, &ftmp)
	p256Square(&ftmp, &ftmp)   // 2^5 - 2^1
	p256Square(&ftmp, &ftmp)   // 2^6 - 2^2
	p256Square(&ftmp, &ftmp)   // 2^7 - 2^3
	p256Square(&ftmp, &ftmp)   // 2^8 - 2^4
	p256Mul(&ftmp, &ftmp, &e4) // 2^8 - 2^0
	p256Assign(&e8, &ftmp)
	for i := 0; i < 8; i++ {
		p256Square(&ftmp, &ftmp)
	} // 2^16 - 2^8
	p256Mul(&ftmp, &ftmp, &e8) // 2^16 - 2^0
	p256Assign(&e16, &ftmp)
	for i := 0; i < 16; i++ {
		p256Square(&ftmp, &ftmp)
	} // 2^32 - 2^16
	p256Mul(&ftmp, &ftmp, &e16) // 2^32 - 2^0
	p256Assign(&e32, &ftmp)
	for i := 0; i < 32; i++ {
		p256Square(&ftmp, &ftmp)
	} // 2^64 - 2^32
	p256Assign(&e64, &ftmp)
	p256Mul(&ftmp, &ftmp, in) // 2^64 - 2^32 + 2^0
	for i := 0; i < 192; i++ {
		p256Square(&ftmp, &ftmp)
	} // 2^256 - 2^224 + 2^192

	p256Mul(&ftmp2, &e64, &e32) // 2^64 - 2^0
	for i := 0; i < 16; i++ {
		p256Square(&ftmp2, &ftmp2)
	} // 2^80 - 2^16
	p256Mul(&ftmp2, &ftmp2, &e16) // 2^80 - 2^0
	for i := 0; i < 8; i++ {
		p256Square(&ftmp2, &ftmp2)
	} // 2^88 - 2^8
	p256Mul(&ftmp2, &ftmp2, &e8) // 2^88 - 2^0
	for i := 0; i < 4; i++ {
		p256Square(&ftmp2, &ftmp2)
	} // 2^92 - 2^4
	p256Mul(&ftmp2, &ftmp2, &e4) // 2^92 - 2^0
	p256Square(&ftmp2, &ftmp2)   // 2^93 - 2^1
	p256Square(&ftmp2, &ftmp2)   // 2^94 - 2^2
	p256Mul(&ftmp2, &ftmp2, &e2) // 2^94 - 2^0
	p256Square(&ftmp2, &ftmp2)   // 2^95 - 2^1
	p256Square(&ftmp2, &ftmp2)   // 2^96 - 2^2
	p256Mul(&ftmp2, &ftmp2, in)  // 2^96 - 3

	p256Mul(out, &ftmp2, &ftmp) // 2^256 - 2^224 + 2^192 + 2^96 - 3
}

// p256Scalar3 sets out=3*out.
//
// On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
func p256Scalar3(out *[p256Limbs]uint32) {
	var carry uint32

	for i := 0; ; i++ {
		out[i] *= 3
		out[i] += carry
		carry = out[i] >> 29
		out[i] &= bottom29Bits

		i++
		if i == p256Limbs {
			break
		}

		out[i] *= 3
		out[i] += carry
		carry = out[i] >> 28
		out[i] &= bottom28Bits
	}

	p256ReduceCarry(out, carry)
}

// p256Scalar4 sets out=4*out.
//
// On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
func p256Scalar4(out *[p256Limbs]uint32) {
	var carry, nextCarry uint32

	for i := 0; ; i++ {
		nextCarry = out[i] >> 27
		out[i] <<= 2
		out[i] &= bottom29Bits
		out[i] += carry
		carry = nextCarry + (out[i] >> 29)
		out[i] &= bottom29Bits

		i++
		if i == p256Limbs {
			break
		}
		nextCarry = out[i] >> 26
		out[i] <<= 2
		out[i] &= bottom28Bits
		out[i] += carry
		carry = nextCarry + (out[i] >> 28)
		out[i] &= bottom28Bits
	}

	p256ReduceCarry(out, carry)
}

// p256Scalar8 sets out=8*out.
//
// On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
func p256Scalar8(out *[p256Limbs]uint32) {
	var carry, nextCarry uint32

	for i := 0; ; i++ {
		nextCarry = out[i] >> 26
		out[i] <<= 3
		out[i] &= bottom29Bits
		out[i] += carry
		carry = nextCarry + (out[i] >> 29)
		out[i] &= bottom29Bits

		i++
		if i == p256Limbs {
			break
		}
		nextCarry = out[i] >> 25
		out[i] <<= 3
		out[i] &= bottom28Bits
		out[i] += carry
		carry = nextCarry + (out[i] >> 28)
		out[i] &= bottom28Bits
	}

	p256ReduceCarry(out, carry)
}

// Group operations:
//
// Elements of the elliptic curve group are represented in Jacobian
// coordinates: (x, y, z). An affine point (x', y') is x'=x/z**2, y'=y/z**3 in
// Jacobian form.
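//
// The point at infinity has no Jacobian representation with z != 0; in this
// file it appears as the all-zero triple (the initial value of the
// accumulators below), and the scalar multiplication routines handle it with
// explicit masks because the addition formulas below do not cover it.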

// p256PointDouble sets {xOut,yOut,zOut} = 2*{x,y,z}.
//
// See https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
func p256PointDouble(xOut, yOut, zOut, x, y, z *[p256Limbs]uint32) {
	var delta, gamma, alpha, beta, tmp, tmp2 [p256Limbs]uint32

	p256Square(&delta, z)
	p256Square(&gamma, y)
	p256Mul(&beta, x, &gamma)

	p256Sum(&tmp, x, &delta)
	p256Diff(&tmp2, x, &delta)
	p256Mul(&alpha, &tmp, &tmp2)
	p256Scalar3(&alpha)

	p256Sum(&tmp, y, z)
	p256Square(&tmp, &tmp)
	p256Diff(&tmp, &tmp, &gamma)
	p256Diff(zOut, &tmp, &delta)

	p256Scalar4(&beta)
	p256Square(xOut, &alpha)
	p256Diff(xOut, xOut, &beta)
	p256Diff(xOut, xOut, &beta)

	p256Diff(&tmp, &beta, xOut)
	p256Mul(&tmp, &alpha, &tmp)
	p256Square(&tmp2, &gamma)
	p256Scalar8(&tmp2)
	p256Diff(yOut, &tmp, &tmp2)
}

// p256PointAddMixed sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,1}.
// (i.e. the second point is affine.)
//
// See https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
//
// Note that this function does not handle P+P, infinity+P nor P+infinity
// correctly.
func p256PointAddMixed(xOut, yOut, zOut, x1, y1, z1, x2, y2 *[p256Limbs]uint32) {
	var z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32

	p256Square(&z1z1, z1)
	p256Sum(&tmp, z1, z1)

	p256Mul(&u2, x2, &z1z1)
	p256Mul(&z1z1z1, z1, &z1z1)
	p256Mul(&s2, y2, &z1z1z1)
	p256Diff(&h, &u2, x1)
	p256Sum(&i, &h, &h)
	p256Square(&i, &i)
	p256Mul(&j, &h, &i)
	p256Diff(&r, &s2, y1)
	p256Sum(&r, &r, &r)
	p256Mul(&v, x1, &i)

	p256Mul(zOut, &tmp, &h)
	p256Square(&rr, &r)
	p256Diff(xOut, &rr, &j)
	p256Diff(xOut, xOut, &v)
	p256Diff(xOut, xOut, &v)

	p256Diff(&tmp, &v, xOut)
	p256Mul(yOut, &tmp, &r)
	p256Mul(&tmp, y1, &j)
	p256Diff(yOut, yOut, &tmp)
	p256Diff(yOut, yOut, &tmp)
}

// p256PointAdd sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,z2}.
//
// See https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
//
// Note that this function does not handle P+P, infinity+P nor P+infinity
// correctly.
func p256PointAdd(xOut, yOut, zOut, x1, y1, z1, x2, y2, z2 *[p256Limbs]uint32) {
	var z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32

	p256Square(&z1z1, z1)
	p256Square(&z2z2, z2)
	p256Mul(&u1, x1, &z2z2)

	p256Sum(&tmp, z1, z2)
	p256Square(&tmp, &tmp)
	p256Diff(&tmp, &tmp, &z1z1)
	p256Diff(&tmp, &tmp, &z2z2)

	p256Mul(&z2z2z2, z2, &z2z2)
	p256Mul(&s1, y1, &z2z2z2)

	p256Mul(&u2, x2, &z1z1)
	p256Mul(&z1z1z1, z1, &z1z1)
	p256Mul(&s2, y2, &z1z1z1)
	p256Diff(&h, &u2, &u1)
	p256Sum(&i, &h, &h)
	p256Square(&i, &i)
	p256Mul(&j, &h, &i)
	p256Diff(&r, &s2, &s1)
	p256Sum(&r, &r, &r)
	p256Mul(&v, &u1, &i)

	p256Mul(zOut, &tmp, &h)
	p256Square(&rr, &r)
	p256Diff(xOut, &rr, &j)
	p256Diff(xOut, xOut, &v)
	p256Diff(xOut, xOut, &v)

	p256Diff(&tmp, &v, xOut)
	p256Mul(yOut, &tmp, &r)
	p256Mul(&tmp, &s1, &j)
	p256Diff(yOut, yOut, &tmp)
	p256Diff(yOut, yOut, &tmp)
}
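
// Why the addition functions above exclude P+P and the point at infinity: when
// both inputs represent the same point, u1 == u2 and s1 == s2, so h and r are
// zero and the formulas yield a result with zOut == 0 (the infinity encoding)
// rather than 2P; an all-zero "infinity" input collapses the intermediates in
// the same way. The callers below arrange, via the scalar being less than the
// group order and via the nIsInfinityMask/pIsNoninfiniteMask logic, never to
// rely on those cases.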

// p256CopyConditional sets out=in if mask = 0xffffffff in constant time.
//
// On entry: mask is either 0 or 0xffffffff.
func p256CopyConditional(out, in *[p256Limbs]uint32, mask uint32) {
	for i := 0; i < p256Limbs; i++ {
		tmp := mask & (in[i] ^ out[i])
		out[i] ^= tmp
	}
}

// p256SelectAffinePoint sets {out_x,out_y} to the index'th entry of table.
// On entry: index < 16, table[0] must be zero.
func p256SelectAffinePoint(xOut, yOut *[p256Limbs]uint32, table []uint32, index uint32) {
	for i := range xOut {
		xOut[i] = 0
	}
	for i := range yOut {
		yOut[i] = 0
	}

	for i := uint32(1); i < 16; i++ {
		mask := i ^ index
		mask |= mask >> 2
		mask |= mask >> 1
		mask &= 1
		mask--
		for j := range xOut {
			xOut[j] |= table[0] & mask
			table = table[1:]
		}
		for j := range yOut {
			yOut[j] |= table[0] & mask
			table = table[1:]
		}
	}
}

// p256SelectJacobianPoint sets {out_x,out_y,out_z} to the index'th entry of
// table.
// On entry: index < 16, table[0] must be zero.
func p256SelectJacobianPoint(xOut, yOut, zOut *[p256Limbs]uint32, table *[16][3][p256Limbs]uint32, index uint32) {
	for i := range xOut {
		xOut[i] = 0
	}
	for i := range yOut {
		yOut[i] = 0
	}
	for i := range zOut {
		zOut[i] = 0
	}

	// The implicit value at index 0 is all zero. We don't need to perform that
	// iteration of the loop because we already set out_* to zero.
	for i := uint32(1); i < 16; i++ {
		mask := i ^ index
		mask |= mask >> 2
		mask |= mask >> 1
		mask &= 1
		mask--
		for j := range xOut {
			xOut[j] |= table[i][0][j] & mask
		}
		for j := range yOut {
			yOut[j] |= table[i][1][j] & mask
		}
		for j := range zOut {
			zOut[j] |= table[i][2][j] & mask
		}
	}
}
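
// A worked example of the selection mask used in both functions above: index
// and the loop counter i are 4-bit values, so i^index is zero exactly when
// they match. Folding with mask |= mask>>2, mask |= mask>>1 and mask &= 1
// reduces that to a single bit which is 1 iff i != index, and the final
// decrement turns it into 0 for non-matching entries and 0xffffffff for the
// matching one. Every table entry is read either way, so the lookup leaks
// neither the index nor any timing difference.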

// p256GetBit returns the bit'th bit of scalar.
func p256GetBit(scalar *[32]uint8, bit uint) uint32 {
	return uint32(((scalar[bit>>3]) >> (bit & 7)) & 1)
}

// p256ScalarBaseMult sets {xOut,yOut,zOut} = scalar*G where scalar is a
// little-endian number. Note that the value of scalar must be less than the
// order of the group.
func p256ScalarBaseMult(xOut, yOut, zOut *[p256Limbs]uint32, scalar *[32]uint8) {
	nIsInfinityMask := ^uint32(0)
	var pIsNoninfiniteMask, mask, tableOffset uint32
	var px, py, tx, ty, tz [p256Limbs]uint32

	for i := range xOut {
		xOut[i] = 0
	}
	for i := range yOut {
		yOut[i] = 0
	}
	for i := range zOut {
		zOut[i] = 0
	}

	// The loop adds bits at positions 0, 64, 128 and 192, followed by
	// positions 32, 96, 160 and 224 and does this 32 times.
	for i := uint(0); i < 32; i++ {
		if i != 0 {
			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
		}
		tableOffset = 0
		for j := uint(0); j <= 32; j += 32 {
			bit0 := p256GetBit(scalar, 31-i+j)
			bit1 := p256GetBit(scalar, 95-i+j)
			bit2 := p256GetBit(scalar, 159-i+j)
			bit3 := p256GetBit(scalar, 223-i+j)
			index := bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3)

			p256SelectAffinePoint(&px, &py, p256Precomputed[tableOffset:], index)
			tableOffset += 30 * p256Limbs

			// Since scalar is less than the order of the group, we know that
			// {xOut,yOut,zOut} != {px,py,1}, unless both are zero, which we
			// handle below.
			p256PointAddMixed(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py)
			// The result of pointAddMixed is incorrect if {xOut,yOut,zOut} is
			// zero (a.k.a. the point at infinity). We handle that situation by
			// copying the point from the table.
			p256CopyConditional(xOut, &px, nIsInfinityMask)
			p256CopyConditional(yOut, &py, nIsInfinityMask)
			p256CopyConditional(zOut, &p256One, nIsInfinityMask)

			// Equally, the result is also wrong if the point from the table is
			// zero, which happens when the index is zero. We handle that by
			// only copying from {tx,ty,tz} to {xOut,yOut,zOut} if index != 0.
			pIsNoninfiniteMask = nonZeroToAllOnes(index)
			mask = pIsNoninfiniteMask & ^nIsInfinityMask
			p256CopyConditional(xOut, &tx, mask)
			p256CopyConditional(yOut, &ty, mask)
			p256CopyConditional(zOut, &tz, mask)
			// If p was not zero, then n is now non-zero.
			nIsInfinityMask &^= pIsNoninfiniteMask
		}
	}
}

// p256PointToAffine converts a Jacobian point to an affine point. If the input
// is the point at infinity then it returns (0, 0) in constant time.
func p256PointToAffine(xOut, yOut, x, y, z *[p256Limbs]uint32) {
	var zInv, zInvSq [p256Limbs]uint32

	p256Invert(&zInv, z)
	p256Square(&zInvSq, &zInv)
	p256Mul(xOut, x, &zInvSq)
	p256Mul(&zInv, &zInv, &zInvSq)
	p256Mul(yOut, y, &zInv)
}

// p256ToAffine returns a pair of *big.Int containing the affine representation
// of {x,y,z}.
func p256ToAffine(x, y, z *[p256Limbs]uint32) (xOut, yOut *big.Int) {
	var xx, yy [p256Limbs]uint32
	p256PointToAffine(&xx, &yy, x, y, z)
	return p256ToBig(&xx), p256ToBig(&yy)
}

// p256ScalarMult sets {xOut,yOut,zOut} = scalar*{x,y}.
func p256ScalarMult(xOut, yOut, zOut, x, y *[p256Limbs]uint32, scalar *[32]uint8) {
	var px, py, pz, tx, ty, tz [p256Limbs]uint32
	var precomp [16][3][p256Limbs]uint32
	var nIsInfinityMask, index, pIsNoninfiniteMask, mask uint32

	// We precompute 0,1,2,... times {x,y}.
	precomp[1][0] = *x
	precomp[1][1] = *y
	precomp[1][2] = p256One

	for i := 2; i < 16; i += 2 {
		p256PointDouble(&precomp[i][0], &precomp[i][1], &precomp[i][2], &precomp[i/2][0], &precomp[i/2][1], &precomp[i/2][2])
		p256PointAddMixed(&precomp[i+1][0], &precomp[i+1][1], &precomp[i+1][2], &precomp[i][0], &precomp[i][1], &precomp[i][2], x, y)
	}

	for i := range xOut {
		xOut[i] = 0
	}
	for i := range yOut {
		yOut[i] = 0
	}
	for i := range zOut {
		zOut[i] = 0
	}
	nIsInfinityMask = ^uint32(0)

	// We add in a window of four bits each iteration and do this 64 times.
	for i := 0; i < 64; i++ {
		if i != 0 {
			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
		}

		index = uint32(scalar[31-i/2])
		if (i & 1) == 1 {
			index &= 15
		} else {
			index >>= 4
		}

		// See the comments in scalarBaseMult about handling infinities.
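		// Briefly: if the accumulator is still at infinity, the freshly
		// selected point (or zero, when index == 0) is copied in directly; if
		// the selected table entry is the implicit zero at index 0, the result
		// of the addition below is discarded; only when both points are
		// non-zero is the sum {tx,ty,tz} kept.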
		p256SelectJacobianPoint(&px, &py, &pz, &precomp, index)
		p256PointAdd(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py, &pz)
		p256CopyConditional(xOut, &px, nIsInfinityMask)
		p256CopyConditional(yOut, &py, nIsInfinityMask)
		p256CopyConditional(zOut, &pz, nIsInfinityMask)

		pIsNoninfiniteMask = nonZeroToAllOnes(index)
		mask = pIsNoninfiniteMask & ^nIsInfinityMask
		p256CopyConditional(xOut, &tx, mask)
		p256CopyConditional(yOut, &ty, mask)
		p256CopyConditional(zOut, &tz, mask)
		nIsInfinityMask &^= pIsNoninfiniteMask
	}
}

// p256FromBig sets out = R*in.
func p256FromBig(out *[p256Limbs]uint32, in *big.Int) {
	tmp := new(big.Int).Lsh(in, 257)
	tmp.Mod(tmp, p256Params.P)

	for i := 0; i < p256Limbs; i++ {
		if bits := tmp.Bits(); len(bits) > 0 {
			out[i] = uint32(bits[0]) & bottom29Bits
		} else {
			out[i] = 0
		}
		tmp.Rsh(tmp, 29)

		i++
		if i == p256Limbs {
			break
		}

		if bits := tmp.Bits(); len(bits) > 0 {
			out[i] = uint32(bits[0]) & bottom28Bits
		} else {
			out[i] = 0
		}
		tmp.Rsh(tmp, 28)
	}
}

// p256ToBig returns a *big.Int containing the value of in.
func p256ToBig(in *[p256Limbs]uint32) *big.Int {
	result, tmp := new(big.Int), new(big.Int)

	result.SetInt64(int64(in[p256Limbs-1]))
	for i := p256Limbs - 2; i >= 0; i-- {
		if (i & 1) == 0 {
			result.Lsh(result, 29)
		} else {
			result.Lsh(result, 28)
		}
		tmp.SetInt64(int64(in[i]))
		result.Add(result, tmp)
	}

	result.Mul(result, p256RInverse)
	result.Mod(result, p256Params.P)
	return result
}
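
// A note on the two conversion helpers above: p256FromBig multiplies by
// R = 2**257 (the Lsh by 257) before packing into 29/28-bit limbs, and
// p256ToBig multiplies the unpacked value by p256RInverse, so a round trip
// such as the sketch below (fe, x and y are illustrative names, not part of
// the original code) returns the input reduced modulo p:
//
//	var fe [p256Limbs]uint32
//	p256FromBig(&fe, x)
//	y := p256ToBig(&fe) // y == x mod p, since R*RInverse == 1 (mod p)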