github.com/remobjects/goldbaselibrary@v0.0.0-20230924164425-d458680a936b/Source/Gold/crypto/elliptic/p256_elements.go

github.com/remobjects/goldbaselibrary@v0.0.0-20230924164425-d458680a936b/Source/Gold/crypto/elliptic/p256_elements.go (about)

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !amd64
     6  //##,!arm64
     7  
     8  package elliptic
     9  
    10  // This file contains a constant-time, 32-bit implementation of P256.
    11  
    12  import (
    13  	"math/big"
    14  )
    15  
    16  type p256Curve struct {
    17  	*CurveParams
    18  }
    19  
    20  var (
    21  	p256Params *CurveParams
    22  
    23  	// RInverse contains 1/R mod p - the inverse of the Montgomery constant
    24  	// (2**257).
    25  	p256RInverse *big.Int
    26  )
    27  
    28  func initP256() {
    29  	// See FIPS 186-3, section D.2.3
    30  	p256Params = &CurveParams{Name: "P-256"}
    31  	p256Params.P, _ = new(big.Int).SetString("115792089210356248762697446949407573530086143415290314195533631308867097853951", 10)
    32  	p256Params.N, _ = new(big.Int).SetString("115792089210356248762697446949407573529996955224135760342422259061068512044369", 10)
    33  	p256Params.B, _ = new(big.Int).SetString("5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b", 16)
    34  	p256Params.Gx, _ = new(big.Int).SetString("6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296", 16)
    35  	p256Params.Gy, _ = new(big.Int).SetString("4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5", 16)
    36  	p256Params.BitSize = 256
    37  
    38  	p256RInverse, _ = new(big.Int).SetString("7fffffff00000001fffffffe8000000100000000ffffffff0000000180000000", 16)
    39  
    40  	// Arch-specific initialization, i.e. let a platform dynamically pick a P256 implementation
    41  	initP256Arch()
    42  }
    43  
    44  func (curve p256Curve) Params() *CurveParams {
    45  	return curve.CurveParams
    46  }
    47  
    48  // p256GetScalar endian-swaps the big-endian scalar value from in and writes it
    49  // to out. If the scalar is equal or greater than the order of the group, it's
    50  // reduced modulo that order.
    51  func p256GetScalar(out *[32]byte, in []byte) {
    52  	n := new(big.Int).SetBytes(in)
    53  	var scalarBytes []byte
    54  
    55  	if n.Cmp(p256Params.N) >= 0 {
    56  		n.Mod(n, p256Params.N)
    57  		scalarBytes = n.Bytes()
    58  	} else {
    59  		scalarBytes = in
    60  	}
    61  
    62  	for i, v := range scalarBytes {
    63  		out[len(scalarBytes)-(1+i)] = v
    64  	}
    65  }
    66  
    67  func (p256Curve) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
    68  	var scalarReversed [32]byte
    69  	p256GetScalar(&scalarReversed, scalar)
    70  
    71  	var x1, y1, z1 [p256Limbs]uint32
    72  	p256ScalarBaseMult(&x1, &y1, &z1, &scalarReversed)
    73  	return p256ToAffine(&x1, &y1, &z1)
    74  }
    75  
    76  func (p256Curve) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
    77  	var scalarReversed [32]byte
    78  	p256GetScalar(&scalarReversed, scalar)
    79  
    80  	var px, py, x1, y1, z1 [p256Limbs]uint32
    81  	p256FromBig(&px, bigX)
    82  	p256FromBig(&py, bigY)
    83  	p256ScalarMult(&x1, &y1, &z1, &px, &py, &scalarReversed)
    84  	return p256ToAffine(&x1, &y1, &z1)
    85  }
    86  
    87  // Field elements are represented as nine, unsigned 32-bit words.
    88  //
    89  // The value of an field element is:
    90  //   x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228)
    91  //
    92  // That is, each limb is alternately 29 or 28-bits wide in little-endian
    93  // order.
    94  //
    95  // This means that a field element hits 2**257, rather than 2**256 as we would
    96  // like. A 28, 29, ... pattern would cause us to hit 2**256, but that causes
    97  // problems when multiplying as terms end up one bit short of a limb which
    98  // would require much bit-shifting to correct.
    99  //
   100  // Finally, the values stored in a field element are in Montgomery form. So the
   101  // value |y| is stored as (y*R) mod p, where p is the P-256 prime and R is
   102  // 2**257.
   103  
   104  const (
   105  	p256Limbs    = 9
   106  	bottom29Bits = 0x1fffffff
   107  )
   108  
   109  var (
   110  	// p256One is the number 1 as a field element.
   111  	p256One  = [p256Limbs]uint32{2, 0, 0, 0xffff800, 0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff, 0}
   112  	p256Zero = [p256Limbs]uint32{0, 0, 0, 0, 0, 0, 0, 0, 0}
   113  	// p256P is the prime modulus as a field element.
   114  	p256P = [p256Limbs]uint32{0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff, 0, 0, 0x200000, 0xf000000, 0xfffffff}
   115  	// p2562P is the twice prime modulus as a field element.
   116  	p2562P = [p256Limbs]uint32{0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff, 0, 0, 0x400000, 0xe000000, 0x1fffffff}
   117  )
   118  
   119  // p256Precomputed contains precomputed values to aid the calculation of scalar
   120  // multiples of the base point, G. It's actually two, equal length, tables
   121  // concatenated.
   122  //
   123  // The first table contains (x,y) field element pairs for 16 multiples of the
   124  // base point, G.
   125  //
   126  //   Index  |  Index (binary) | Value
   127  //       0  |           0000  | 0G (all zeros, omitted)
   128  //       1  |           0001  | G
   129  //       2  |           0010  | 2**64G
   130  //       3  |           0011  | 2**64G + G
   131  //       4  |           0100  | 2**128G
   132  //       5  |           0101  | 2**128G + G
   133  //       6  |           0110  | 2**128G + 2**64G
   134  //       7  |           0111  | 2**128G + 2**64G + G
   135  //       8  |           1000  | 2**192G
   136  //       9  |           1001  | 2**192G + G
   137  //      10  |           1010  | 2**192G + 2**64G
   138  //      11  |           1011  | 2**192G + 2**64G + G
   139  //      12  |           1100  | 2**192G + 2**128G
   140  //      13  |           1101  | 2**192G + 2**128G + G
   141  //      14  |           1110  | 2**192G + 2**128G + 2**64G
   142  //      15  |           1111  | 2**192G + 2**128G + 2**64G + G
   143  //
   144  // The second table follows the same style, but the terms are 2**32G,
   145  // 2**96G, 2**160G, 2**224G.
   146  //
   147  // This is ~2KB of data.
   148  var p256Precomputed = [p256Limbs * 2 * 15 * 2]uint32{
   149  	0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7edc, 0xd4a6eab, 0x3120bee,
   150  	0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba21, 0x14b10bb, 0xae3fe3,
   151  	0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe49073, 0x3fa36cc, 0x5ebcd2c,
   152  	0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea12446, 0xe1ade1e, 0xec91f22,
   153  	0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109, 0xa267a00, 0xb57c050,
   154  	0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0x7d6dee7, 0x2976e4b,
   155  	0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a5a9, 0x843a649, 0xc3ab0fa,
   156  	0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11, 0x58c43df, 0xf423fc2,
   157  	0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db40f, 0x83e277d, 0xb0dd609,
   158  	0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5, 0xe10c9e, 0x33ab581,
   159  	0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f, 0x48764cd, 0x76dbcca,
   160  	0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b20, 0x4ba3173, 0xc168c33,
   161  	0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0, 0x65dd7ff, 0x3a1e4f6,
   162  	0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f077, 0xa6add89, 0x4894acd,
   163  	0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0,
   164  	0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c, 0xda0cf5b, 0x812e881,
   165  	0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51, 0xc22be3e, 0xe35e65a,
   166  	0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9, 0x1c5a839, 0x47a1e26,
   167  	0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c502, 0x2f32042, 0xa17769b,
   168  	0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a02, 0x3fc93, 0x5620023,
   169  	0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c, 0x407f75c, 0xbaab133,
   170  	0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea7, 0x3293ac0, 0xcdc98aa,
   171  	0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29,
   172  	0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72, 0x73e1c35, 0xee70fbc,
   173  	0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85, 0x27de188, 0x66f70b8,
   174  	0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae914, 0x2f3ec51, 0x3826b59,
   175  	0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x823d9d2, 0x8213f39,
   176  	0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4a, 0xf5ddc3d, 0x3786689,
   177  	0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a729, 0x4be3499, 0x52b23aa,
   178  	0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb048035, 0xe31de66, 0xc6ecaa3,
   179  	0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a7529, 0xcb7beb1, 0xb2a78a1,
   180  	0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff658, 0xe3d6511, 0xc7d76f,
   181  	0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72,
   182  	0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d32411, 0xb04a838, 0xd760d2d,
   183  	0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11e, 0x20bca9a, 0x66f496b,
   184  	0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a,
   185  	0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56ff, 0x65ef930, 0x21dc4a,
   186  	0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15f, 0x624e62e, 0xa90ae2f,
   187  	0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522b, 0xdc78583, 0x40eeabb,
   188  	0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef34, 0xae2a960, 0x91b8bdc,
   189  	0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0x2413c8e, 0x5425bf9,
   190  	0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633, 0x7c91952, 0xd806dce,
   191  	0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef73, 0x8956f34, 0xe4b5cf2,
   192  	0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7, 0x627b614, 0x7371cca,
   193  	0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc9, 0x9c19bf2, 0x5882229,
   194  	0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b3, 0xe85ff25, 0x408ef57,
   195  	0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113, 0xa4a1769, 0x11fbc6c,
   196  	0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b7, 0x4acbad9, 0x5efc5fa,
   197  	0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc, 0x7bf0fa9, 0x957651,
   198  	0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec,
   199  	0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c12d, 0xf20bd46, 0x1951fa7,
   200  	0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74, 0x99bb618, 0x2db944c,
   201  	0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e74779, 0x576138, 0x9587927,
   202  	0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782d, 0xfc72e0b, 0x701b298,
   203  	0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5d8, 0xf858d3a, 0x942eea8,
   204  	0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a1, 0x8395659, 0x52ed4e2,
   205  	0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c0, 0x6bdf55a, 0x4e4457d,
   206  	0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747b, 0x878558d, 0x7d29aa4,
   207  	0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d7, 0xa5bef68, 0xb7b30d8,
   208  	0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f51951, 0x9d0c177, 0x1c49a78,
   209  }
   210  
   211  // Field element operations:
   212  
   213  // nonZeroToAllOnes returns:
   214  //   0xffffffff for 0 < x <= 2**31
   215  //   0 for x == 0 or x > 2**31.
   216  func nonZeroToAllOnes(x uint32) uint32 {
   217  	return ((x - 1) >> 31) - 1
   218  }
   219  
   220  // p256ReduceCarry adds a multiple of p in order to cancel |carry|,
   221  // which is a term at 2**257.
   222  //
   223  // On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28.
   224  // On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29.
   225  func p256ReduceCarry(inout *[p256Limbs]uint32, carry uint32) {
   226  	carry_mask := nonZeroToAllOnes(carry)
   227  
   228  	inout[0] += carry << 1
   229  	inout[3] += 0x10000000 & carry_mask
   230  	// carry < 2**3 thus (carry << 11) < 2**14 and we added 2**28 in the
   231  	// previous line therefore this doesn't underflow.
   232  	inout[3] -= carry << 11
   233  	inout[4] += (0x20000000 - 1) & carry_mask
   234  	inout[5] += (0x10000000 - 1) & carry_mask
   235  	inout[6] += (0x20000000 - 1) & carry_mask
   236  	inout[6] -= carry << 22
   237  	// This may underflow if carry is non-zero but, if so, we'll fix it in the
   238  	// next line.
   239  	inout[7] -= 1 & carry_mask
   240  	inout[7] += carry << 25
   241  }
   242  
   243  // p256Sum sets out = in+in2.
   244  //
   245  // On entry, in[i]+in2[i] must not overflow a 32-bit word.
   246  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
   247  func p256Sum(out, in, in2 *[p256Limbs]uint32) {
   248  	carry := uint32(0)
   249  	for i := 0; ; i++ {
   250  		out[i] = in[i] + in2[i]
   251  		out[i] += carry
   252  		carry = out[i] >> 29
   253  		out[i] &= bottom29Bits
   254  
   255  		i++
   256  		if i == p256Limbs {
   257  			break
   258  		}
   259  
   260  		out[i] = in[i] + in2[i]
   261  		out[i] += carry
   262  		carry = out[i] >> 28
   263  		out[i] &= bottom28Bits
   264  	}
   265  
   266  	p256ReduceCarry(out, carry)
   267  }
   268  
   269  const (
   270  	two30m2    = 1<<30 - 1<<2
   271  	two30p13m2 = 1<<30 + 1<<13 - 1<<2
   272  	two31m2    = 1<<31 - 1<<2
   273  	two31p24m2 = 1<<31 + 1<<24 - 1<<2
   274  	two30m27m2 = 1<<30 - 1<<27 - 1<<2
   275  )
   276  
   277  // p256Zero31 is 0 mod p.
   278  var p256Zero31 = [p256Limbs]uint32{two31m3, two30m2, two31m2, two30p13m2, two31m2, two30m2, two31p24m2, two30m27m2, two31m2}
   279  
   280  // p256Diff sets out = in-in2.
   281  //
   282  // On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
   283  //           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
   284  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   285  func p256Diff(out, in, in2 *[p256Limbs]uint32) {
   286  	var carry uint32
   287  
   288  	for i := 0; ; i++ {
   289  		out[i] = in[i] - in2[i]
   290  		out[i] += p256Zero31[i]
   291  		out[i] += carry
   292  		carry = out[i] >> 29
   293  		out[i] &= bottom29Bits
   294  
   295  		i++
   296  		if i == p256Limbs {
   297  			break
   298  		}
   299  
   300  		out[i] = in[i] - in2[i]
   301  		out[i] += p256Zero31[i]
   302  		out[i] += carry
   303  		carry = out[i] >> 28
   304  		out[i] &= bottom28Bits
   305  	}
   306  
   307  	p256ReduceCarry(out, carry)
   308  }
   309  
   310  // p256ReduceDegree sets out = tmp/R mod p where tmp contains 64-bit words with
   311  // the same 29,28,... bit positions as an field element.
   312  //
   313  // The values in field elements are in Montgomery form: x*R mod p where R =
   314  // 2**257. Since we just multiplied two Montgomery values together, the result
   315  // is x*y*R*R mod p. We wish to divide by R in order for the result also to be
   316  // in Montgomery form.
   317  //
   318  // On entry: tmp[i] < 2**64
   319  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
   320  func p256ReduceDegree(out *[p256Limbs]uint32, tmp [17]uint64) {
   321  	// The following table may be helpful when reading this code:
   322  	//
   323  	// Limb number:   0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10...
   324  	// Width (bits):  29| 28| 29| 28| 29| 28| 29| 28| 29| 28| 29
   325  	// Start bit:     0 | 29| 57| 86|114|143|171|200|228|257|285
   326  	//   (odd phase): 0 | 28| 57| 85|114|142|171|199|228|256|285
   327  	var tmp2 [18]uint32
   328  	var carry, x, xMask uint32
   329  
   330  	// tmp contains 64-bit words with the same 29,28,29-bit positions as an
   331  	// field element. So the top of an element of tmp might overlap with
   332  	// another element two positions down. The following loop eliminates
   333  	// this overlap.
   334  	tmp2[0] = uint32(tmp[0]) & bottom29Bits
   335  
   336  	tmp2[1] = uint32(tmp[0]) >> 29
   337  	tmp2[1] |= (uint32(tmp[0]>>32) << 3) & bottom28Bits
   338  	tmp2[1] += uint32(tmp[1]) & bottom28Bits
   339  	carry = tmp2[1] >> 28
   340  	tmp2[1] &= bottom28Bits
   341  
   342  	for i := 2; i < 17; i++ {
   343  		tmp2[i] = (uint32(tmp[i-2] >> 32)) >> 25
   344  		tmp2[i] += (uint32(tmp[i-1])) >> 28
   345  		tmp2[i] += (uint32(tmp[i-1]>>32) << 4) & bottom29Bits
   346  		tmp2[i] += uint32(tmp[i]) & bottom29Bits
   347  		tmp2[i] += carry
   348  		carry = tmp2[i] >> 29
   349  		tmp2[i] &= bottom29Bits
   350  
   351  		i++
   352  		if i == 17 {
   353  			break
   354  		}
   355  		tmp2[i] = uint32(tmp[i-2]>>32) >> 25
   356  		tmp2[i] += uint32(tmp[i-1]) >> 29
   357  		tmp2[i] += ((uint32(tmp[i-1] >> 32)) << 3) & bottom28Bits
   358  		tmp2[i] += uint32(tmp[i]) & bottom28Bits
   359  		tmp2[i] += carry
   360  		carry = tmp2[i] >> 28
   361  		tmp2[i] &= bottom28Bits
   362  	}
   363  
   364  	tmp2[17] = uint32(tmp[15]>>32) >> 25
   365  	tmp2[17] += uint32(tmp[16]) >> 29
   366  	tmp2[17] += uint32(tmp[16]>>32) << 3
   367  	tmp2[17] += carry
   368  
   369  	// Montgomery elimination of terms:
   370  	//
   371  	// Since R is 2**257, we can divide by R with a bitwise shift if we can
   372  	// ensure that the right-most 257 bits are all zero. We can make that true
   373  	// by adding multiplies of p without affecting the value.
   374  	//
   375  	// So we eliminate limbs from right to left. Since the bottom 29 bits of p
   376  	// are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
   377  	// We can do that for 8 further limbs and then right shift to eliminate the
   378  	// extra factor of R.
   379  	for i := 0; ; i += 2 {
   380  		tmp2[i+1] += tmp2[i] >> 29
   381  		x = tmp2[i] & bottom29Bits
   382  		xMask = nonZeroToAllOnes(x)
   383  		tmp2[i] = 0
   384  
   385  		// The bounds calculations for this loop are tricky. Each iteration of
   386  		// the loop eliminates two words by adding values to words to their
   387  		// right.
   388  		//
   389  		// The following table contains the amounts added to each word (as an
   390  		// offset from the value of i at the top of the loop). The amounts are
   391  		// accounted for from the first and second half of the loop separately
   392  		// and are written as, for example, 28 to mean a value <2**28.
   393  		//
   394  		// Word:                   3   4   5   6   7   8   9   10
   395  		// Added in top half:     28  11      29  21  29  28
   396  		//                                        28  29
   397  		//                                            29
   398  		// Added in bottom half:      29  10      28  21  28   28
   399  		//                                            29
   400  		//
   401  		// The value that is currently offset 7 will be offset 5 for the next
   402  		// iteration and then offset 3 for the iteration after that. Therefore
   403  		// the total value added will be the values added at 7, 5 and 3.
   404  		//
   405  		// The following table accumulates these values. The sums at the bottom
   406  		// are written as, for example, 29+28, to mean a value < 2**29+2**28.
   407  		//
   408  		// Word:                   3   4   5   6   7   8   9  10  11  12  13
   409  		//                        28  11  10  29  21  29  28  28  28  28  28
   410  		//                            29  28  11  28  29  28  29  28  29  28
   411  		//                                    29  28  21  21  29  21  29  21
   412  		//                                        10  29  28  21  28  21  28
   413  		//                                        28  29  28  29  28  29  28
   414  		//                                            11  10  29  10  29  10
   415  		//                                            29  28  11  28  11
   416  		//                                                    29      29
   417  		//                        --------------------------------------------
   418  		//                                                30+ 31+ 30+ 31+ 30+
   419  		//                                                28+ 29+ 28+ 29+ 21+
   420  		//                                                21+ 28+ 21+ 28+ 10
   421  		//                                                10  21+ 10  21+
   422  		//                                                    11      11
   423  		//
   424  		// So the greatest amount is added to tmp2[10] and tmp2[12]. If
   425  		// tmp2[10/12] has an initial value of <2**29, then the maximum value
   426  		// will be < 2**31 + 2**30 + 2**28 + 2**21 + 2**11, which is < 2**32,
   427  		// as required.
   428  		tmp2[i+3] += (x << 10) & bottom28Bits
   429  		tmp2[i+4] += (x >> 18)
   430  
   431  		tmp2[i+6] += (x << 21) & bottom29Bits
   432  		tmp2[i+7] += x >> 8
   433  
   434  		// At position 200, which is the starting bit position for word 7, we
   435  		// have a factor of 0xf000000 = 2**28 - 2**24.
   436  		tmp2[i+7] += 0x10000000 & xMask
   437  		tmp2[i+8] += (x - 1) & xMask
   438  		tmp2[i+7] -= (x << 24) & bottom28Bits
   439  		tmp2[i+8] -= x >> 4
   440  
   441  		tmp2[i+8] += 0x20000000 & xMask
   442  		tmp2[i+8] -= x
   443  		tmp2[i+8] += (x << 28) & bottom29Bits
   444  		tmp2[i+9] += ((x >> 1) - 1) & xMask
   445  
   446  		if i+1 == p256Limbs {
   447  			break
   448  		}
   449  		tmp2[i+2] += tmp2[i+1] >> 28
   450  		x = tmp2[i+1] & bottom28Bits
   451  		xMask = nonZeroToAllOnes(x)
   452  		tmp2[i+1] = 0
   453  
   454  		tmp2[i+4] += (x << 11) & bottom29Bits
   455  		tmp2[i+5] += (x >> 18)
   456  
   457  		tmp2[i+7] += (x << 21) & bottom28Bits
   458  		tmp2[i+8] += x >> 7
   459  
   460  		// At position 199, which is the starting bit of the 8th word when
   461  		// dealing with a context starting on an odd word, we have a factor of
   462  		// 0x1e000000 = 2**29 - 2**25. Since we have not updated i, the 8th
   463  		// word from i+1 is i+8.
   464  		tmp2[i+8] += 0x20000000 & xMask
   465  		tmp2[i+9] += (x - 1) & xMask
   466  		tmp2[i+8] -= (x << 25) & bottom29Bits
   467  		tmp2[i+9] -= x >> 4
   468  
   469  		tmp2[i+9] += 0x10000000 & xMask
   470  		tmp2[i+9] -= x
   471  		tmp2[i+10] += (x - 1) & xMask
   472  	}
   473  
   474  	// We merge the right shift with a carry chain. The words above 2**257 have
   475  	// widths of 28,29,... which we need to correct when copying them down.
   476  	carry = 0
   477  	for i := 0; i < 8; i++ {
   478  		// The maximum value of tmp2[i + 9] occurs on the first iteration and
   479  		// is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is
   480  		// therefore safe.
   481  		out[i] = tmp2[i+9]
   482  		out[i] += carry
   483  		out[i] += (tmp2[i+10] << 28) & bottom29Bits
   484  		carry = out[i] >> 29
   485  		out[i] &= bottom29Bits
   486  
   487  		i++
   488  		out[i] = tmp2[i+9] >> 1
   489  		out[i] += carry
   490  		carry = out[i] >> 28
   491  		out[i] &= bottom28Bits
   492  	}
   493  
   494  	out[8] = tmp2[17]
   495  	out[8] += carry
   496  	carry = out[8] >> 29
   497  	out[8] &= bottom29Bits
   498  
   499  	p256ReduceCarry(out, carry)
   500  }
   501  
   502  // p256Square sets out=in*in.
   503  //
   504  // On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29.
   505  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   506  func p256Square(out, in *[p256Limbs]uint32) {
   507  	var tmp [17]uint64
   508  
   509  	tmp[0] = uint64(in[0]) * uint64(in[0])
   510  	tmp[1] = uint64(in[0]) * (uint64(in[1]) << 1)
   511  	tmp[2] = uint64(in[0])*(uint64(in[2])<<1) +
   512  		uint64(in[1])*(uint64(in[1])<<1)
   513  	tmp[3] = uint64(in[0])*(uint64(in[3])<<1) +
   514  		uint64(in[1])*(uint64(in[2])<<1)
   515  	tmp[4] = uint64(in[0])*(uint64(in[4])<<1) +
   516  		uint64(in[1])*(uint64(in[3])<<2) +
   517  		uint64(in[2])*uint64(in[2])
   518  	tmp[5] = uint64(in[0])*(uint64(in[5])<<1) +
   519  		uint64(in[1])*(uint64(in[4])<<1) +
   520  		uint64(in[2])*(uint64(in[3])<<1)
   521  	tmp[6] = uint64(in[0])*(uint64(in[6])<<1) +
   522  		uint64(in[1])*(uint64(in[5])<<2) +
   523  		uint64(in[2])*(uint64(in[4])<<1) +
   524  		uint64(in[3])*(uint64(in[3])<<1)
   525  	tmp[7] = uint64(in[0])*(uint64(in[7])<<1) +
   526  		uint64(in[1])*(uint64(in[6])<<1) +
   527  		uint64(in[2])*(uint64(in[5])<<1) +
   528  		uint64(in[3])*(uint64(in[4])<<1)
   529  	// tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60,
   530  	// which is < 2**64 as required.
   531  	tmp[8] = uint64(in[0])*(uint64(in[8])<<1) +
   532  		uint64(in[1])*(uint64(in[7])<<2) +
   533  		uint64(in[2])*(uint64(in[6])<<1) +
   534  		uint64(in[3])*(uint64(in[5])<<2) +
   535  		uint64(in[4])*uint64(in[4])
   536  	tmp[9] = uint64(in[1])*(uint64(in[8])<<1) +
   537  		uint64(in[2])*(uint64(in[7])<<1) +
   538  		uint64(in[3])*(uint64(in[6])<<1) +
   539  		uint64(in[4])*(uint64(in[5])<<1)
   540  	tmp[10] = uint64(in[2])*(uint64(in[8])<<1) +
   541  		uint64(in[3])*(uint64(in[7])<<2) +
   542  		uint64(in[4])*(uint64(in[6])<<1) +
   543  		uint64(in[5])*(uint64(in[5])<<1)
   544  	tmp[11] = uint64(in[3])*(uint64(in[8])<<1) +
   545  		uint64(in[4])*(uint64(in[7])<<1) +
   546  		uint64(in[5])*(uint64(in[6])<<1)
   547  	tmp[12] = uint64(in[4])*(uint64(in[8])<<1) +
   548  		uint64(in[5])*(uint64(in[7])<<2) +
   549  		uint64(in[6])*uint64(in[6])
   550  	tmp[13] = uint64(in[5])*(uint64(in[8])<<1) +
   551  		uint64(in[6])*(uint64(in[7])<<1)
   552  	tmp[14] = uint64(in[6])*(uint64(in[8])<<1) +
   553  		uint64(in[7])*(uint64(in[7])<<1)
   554  	tmp[15] = uint64(in[7]) * (uint64(in[8]) << 1)
   555  	tmp[16] = uint64(in[8]) * uint64(in[8])
   556  
   557  	p256ReduceDegree(out, tmp)
   558  }
   559  
   560  // p256Mul sets out=in*in2.
   561  //
   562  // On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
   563  //           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
   564  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   565  func p256Mul(out, in, in2 *[p256Limbs]uint32) {
   566  	var tmp [17]uint64
   567  
   568  	tmp[0] = uint64(in[0]) * uint64(in2[0])
   569  	tmp[1] = uint64(in[0])*(uint64(in2[1])<<0) +
   570  		uint64(in[1])*(uint64(in2[0])<<0)
   571  	tmp[2] = uint64(in[0])*(uint64(in2[2])<<0) +
   572  		uint64(in[1])*(uint64(in2[1])<<1) +
   573  		uint64(in[2])*(uint64(in2[0])<<0)
   574  	tmp[3] = uint64(in[0])*(uint64(in2[3])<<0) +
   575  		uint64(in[1])*(uint64(in2[2])<<0) +
   576  		uint64(in[2])*(uint64(in2[1])<<0) +
   577  		uint64(in[3])*(uint64(in2[0])<<0)
   578  	tmp[4] = uint64(in[0])*(uint64(in2[4])<<0) +
   579  		uint64(in[1])*(uint64(in2[3])<<1) +
   580  		uint64(in[2])*(uint64(in2[2])<<0) +
   581  		uint64(in[3])*(uint64(in2[1])<<1) +
   582  		uint64(in[4])*(uint64(in2[0])<<0)
   583  	tmp[5] = uint64(in[0])*(uint64(in2[5])<<0) +
   584  		uint64(in[1])*(uint64(in2[4])<<0) +
   585  		uint64(in[2])*(uint64(in2[3])<<0) +
   586  		uint64(in[3])*(uint64(in2[2])<<0) +
   587  		uint64(in[4])*(uint64(in2[1])<<0) +
   588  		uint64(in[5])*(uint64(in2[0])<<0)
   589  	tmp[6] = uint64(in[0])*(uint64(in2[6])<<0) +
   590  		uint64(in[1])*(uint64(in2[5])<<1) +
   591  		uint64(in[2])*(uint64(in2[4])<<0) +
   592  		uint64(in[3])*(uint64(in2[3])<<1) +
   593  		uint64(in[4])*(uint64(in2[2])<<0) +
   594  		uint64(in[5])*(uint64(in2[1])<<1) +
   595  		uint64(in[6])*(uint64(in2[0])<<0)
   596  	tmp[7] = uint64(in[0])*(uint64(in2[7])<<0) +
   597  		uint64(in[1])*(uint64(in2[6])<<0) +
   598  		uint64(in[2])*(uint64(in2[5])<<0) +
   599  		uint64(in[3])*(uint64(in2[4])<<0) +
   600  		uint64(in[4])*(uint64(in2[3])<<0) +
   601  		uint64(in[5])*(uint64(in2[2])<<0) +
   602  		uint64(in[6])*(uint64(in2[1])<<0) +
   603  		uint64(in[7])*(uint64(in2[0])<<0)
   604  	// tmp[8] has the greatest value but doesn't overflow. See logic in
   605  	// p256Square.
   606  	tmp[8] = uint64(in[0])*(uint64(in2[8])<<0) +
   607  		uint64(in[1])*(uint64(in2[7])<<1) +
   608  		uint64(in[2])*(uint64(in2[6])<<0) +
   609  		uint64(in[3])*(uint64(in2[5])<<1) +
   610  		uint64(in[4])*(uint64(in2[4])<<0) +
   611  		uint64(in[5])*(uint64(in2[3])<<1) +
   612  		uint64(in[6])*(uint64(in2[2])<<0) +
   613  		uint64(in[7])*(uint64(in2[1])<<1) +
   614  		uint64(in[8])*(uint64(in2[0])<<0)
   615  	tmp[9] = uint64(in[1])*(uint64(in2[8])<<0) +
   616  		uint64(in[2])*(uint64(in2[7])<<0) +
   617  		uint64(in[3])*(uint64(in2[6])<<0) +
   618  		uint64(in[4])*(uint64(in2[5])<<0) +
   619  		uint64(in[5])*(uint64(in2[4])<<0) +
   620  		uint64(in[6])*(uint64(in2[3])<<0) +
   621  		uint64(in[7])*(uint64(in2[2])<<0) +
   622  		uint64(in[8])*(uint64(in2[1])<<0)
   623  	tmp[10] = uint64(in[2])*(uint64(in2[8])<<0) +
   624  		uint64(in[3])*(uint64(in2[7])<<1) +
   625  		uint64(in[4])*(uint64(in2[6])<<0) +
   626  		uint64(in[5])*(uint64(in2[5])<<1) +
   627  		uint64(in[6])*(uint64(in2[4])<<0) +
   628  		uint64(in[7])*(uint64(in2[3])<<1) +
   629  		uint64(in[8])*(uint64(in2[2])<<0)
   630  	tmp[11] = uint64(in[3])*(uint64(in2[8])<<0) +
   631  		uint64(in[4])*(uint64(in2[7])<<0) +
   632  		uint64(in[5])*(uint64(in2[6])<<0) +
   633  		uint64(in[6])*(uint64(in2[5])<<0) +
   634  		uint64(in[7])*(uint64(in2[4])<<0) +
   635  		uint64(in[8])*(uint64(in2[3])<<0)
   636  	tmp[12] = uint64(in[4])*(uint64(in2[8])<<0) +
   637  		uint64(in[5])*(uint64(in2[7])<<1) +
   638  		uint64(in[6])*(uint64(in2[6])<<0) +
   639  		uint64(in[7])*(uint64(in2[5])<<1) +
   640  		uint64(in[8])*(uint64(in2[4])<<0)
   641  	tmp[13] = uint64(in[5])*(uint64(in2[8])<<0) +
   642  		uint64(in[6])*(uint64(in2[7])<<0) +
   643  		uint64(in[7])*(uint64(in2[6])<<0) +
   644  		uint64(in[8])*(uint64(in2[5])<<0)
   645  	tmp[14] = uint64(in[6])*(uint64(in2[8])<<0) +
   646  		uint64(in[7])*(uint64(in2[7])<<1) +
   647  		uint64(in[8])*(uint64(in2[6])<<0)
   648  	tmp[15] = uint64(in[7])*(uint64(in2[8])<<0) +
   649  		uint64(in[8])*(uint64(in2[7])<<0)
   650  	tmp[16] = uint64(in[8]) * (uint64(in2[8]) << 0)
   651  
   652  	p256ReduceDegree(out, tmp)
   653  }
   654  
   655  func p256Assign(out, in *[p256Limbs]uint32) {
   656  	for i := 0; i < p256Limbs; i++ {
   657  		out[i] = in[i]
   658  	}
   659  	// elements change
   660  	//*out = *in
   661  }
   662  
   663  // p256Invert calculates |out| = |in|^{-1}
   664  //
   665  // Based on Fermat's Little Theorem:
   666  //   a^p = a (mod p)
   667  //   a^{p-1} = 1 (mod p)
   668  //   a^{p-2} = a^{-1} (mod p)
   669  func p256Invert(out, in *[p256Limbs]uint32) {
   670  	var ftmp, ftmp2 [p256Limbs]uint32
   671  
   672  	// each e_I will hold |in|^{2^I - 1}
   673  	var e2, e4, e8, e16, e32, e64 [p256Limbs]uint32
   674  
   675  	p256Square(&ftmp, in)     // 2^1
   676  	p256Mul(&ftmp, in, &ftmp) // 2^2 - 2^0
   677  	p256Assign(&e2, &ftmp)
   678  	p256Square(&ftmp, &ftmp)   // 2^3 - 2^1
   679  	p256Square(&ftmp, &ftmp)   // 2^4 - 2^2
   680  	p256Mul(&ftmp, &ftmp, &e2) // 2^4 - 2^0
   681  	p256Assign(&e4, &ftmp)
   682  	p256Square(&ftmp, &ftmp)   // 2^5 - 2^1
   683  	p256Square(&ftmp, &ftmp)   // 2^6 - 2^2
   684  	p256Square(&ftmp, &ftmp)   // 2^7 - 2^3
   685  	p256Square(&ftmp, &ftmp)   // 2^8 - 2^4
   686  	p256Mul(&ftmp, &ftmp, &e4) // 2^8 - 2^0
   687  	p256Assign(&e8, &ftmp)
   688  	for i := 0; i < 8; i++ {
   689  		p256Square(&ftmp, &ftmp)
   690  	} // 2^16 - 2^8
   691  	p256Mul(&ftmp, &ftmp, &e8) // 2^16 - 2^0
   692  	p256Assign(&e16, &ftmp)
   693  	for i := 0; i < 16; i++ {
   694  		p256Square(&ftmp, &ftmp)
   695  	} // 2^32 - 2^16
   696  	p256Mul(&ftmp, &ftmp, &e16) // 2^32 - 2^0
   697  	p256Assign(&e32, &ftmp)
   698  	for i := 0; i < 32; i++ {
   699  		p256Square(&ftmp, &ftmp)
   700  	} // 2^64 - 2^32
   701  	p256Assign(&e64, &ftmp)
   702  	p256Mul(&ftmp, &ftmp, in) // 2^64 - 2^32 + 2^0
   703  	for i := 0; i < 192; i++ {
   704  		p256Square(&ftmp, &ftmp)
   705  	} // 2^256 - 2^224 + 2^192
   706  
   707  	p256Mul(&ftmp2, &e64, &e32) // 2^64 - 2^0
   708  	for i := 0; i < 16; i++ {
   709  		p256Square(&ftmp2, &ftmp2)
   710  	} // 2^80 - 2^16
   711  	p256Mul(&ftmp2, &ftmp2, &e16) // 2^80 - 2^0
   712  	for i := 0; i < 8; i++ {
   713  		p256Square(&ftmp2, &ftmp2)
   714  	} // 2^88 - 2^8
   715  	p256Mul(&ftmp2, &ftmp2, &e8) // 2^88 - 2^0
   716  	for i := 0; i < 4; i++ {
   717  		p256Square(&ftmp2, &ftmp2)
   718  	} // 2^92 - 2^4
   719  	p256Mul(&ftmp2, &ftmp2, &e4) // 2^92 - 2^0
   720  	p256Square(&ftmp2, &ftmp2)   // 2^93 - 2^1
   721  	p256Square(&ftmp2, &ftmp2)   // 2^94 - 2^2
   722  	p256Mul(&ftmp2, &ftmp2, &e2) // 2^94 - 2^0
   723  	p256Square(&ftmp2, &ftmp2)   // 2^95 - 2^1
   724  	p256Square(&ftmp2, &ftmp2)   // 2^96 - 2^2
   725  	p256Mul(&ftmp2, &ftmp2, in)  // 2^96 - 3
   726  
   727  	p256Mul(out, &ftmp2, &ftmp) // 2^256 - 2^224 + 2^192 + 2^96 - 3
   728  }
   729  
   730  // p256Scalar3 sets out=3*out.
   731  //
   732  // On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   733  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   734  func p256Scalar3(out *[p256Limbs]uint32) {
   735  	var carry uint32
   736  
   737  	for i := 0; ; i++ {
   738  		out[i] *= 3
   739  		out[i] += carry
   740  		carry = out[i] >> 29
   741  		out[i] &= bottom29Bits
   742  
   743  		i++
   744  		if i == p256Limbs {
   745  			break
   746  		}
   747  
   748  		out[i] *= 3
   749  		out[i] += carry
   750  		carry = out[i] >> 28
   751  		out[i] &= bottom28Bits
   752  	}
   753  
   754  	p256ReduceCarry(out, carry)
   755  }
   756  
   757  // p256Scalar4 sets out=4*out.
   758  //
   759  // On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   760  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   761  func p256Scalar4(out *[p256Limbs]uint32) {
   762  	var carry, nextCarry uint32
   763  
   764  	for i := 0; ; i++ {
   765  		nextCarry = out[i] >> 27
   766  		out[i] <<= 2
   767  		out[i] &= bottom29Bits
   768  		out[i] += carry
   769  		carry = nextCarry + (out[i] >> 29)
   770  		out[i] &= bottom29Bits
   771  
   772  		i++
   773  		if i == p256Limbs {
   774  			break
   775  		}
   776  		nextCarry = out[i] >> 26
   777  		out[i] <<= 2
   778  		out[i] &= bottom28Bits
   779  		out[i] += carry
   780  		carry = nextCarry + (out[i] >> 28)
   781  		out[i] &= bottom28Bits
   782  	}
   783  
   784  	p256ReduceCarry(out, carry)
   785  }
   786  
   787  // p256Scalar8 sets out=8*out.
   788  //
   789  // On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   790  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   791  func p256Scalar8(out *[p256Limbs]uint32) {
   792  	var carry, nextCarry uint32
   793  
   794  	for i := 0; ; i++ {
   795  		nextCarry = out[i] >> 26
   796  		out[i] <<= 3
   797  		out[i] &= bottom29Bits
   798  		out[i] += carry
   799  		carry = nextCarry + (out[i] >> 29)
   800  		out[i] &= bottom29Bits
   801  
   802  		i++
   803  		if i == p256Limbs {
   804  			break
   805  		}
   806  		nextCarry = out[i] >> 25
   807  		out[i] <<= 3
   808  		out[i] &= bottom28Bits
   809  		out[i] += carry
   810  		carry = nextCarry + (out[i] >> 28)
   811  		out[i] &= bottom28Bits
   812  	}
   813  
   814  	p256ReduceCarry(out, carry)
   815  }
   816  
   817  // Group operations:
   818  //
   819  // Elements of the elliptic curve group are represented in Jacobian
   820  // coordinates: (x, y, z). An affine point (x', y') is x'=x/z**2, y'=y/z**3 in
   821  // Jacobian form.
   822  
   823  // p256PointDouble sets {xOut,yOut,zOut} = 2*{x,y,z}.
   824  //
   825  // See https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
   826  func p256PointDouble(xOut, yOut, zOut, x, y, z *[p256Limbs]uint32) {
   827  	var delta, gamma, alpha, beta, tmp, tmp2 [p256Limbs]uint32
   828  
   829  	p256Square(&delta, z)
   830  	p256Square(&gamma, y)
   831  	p256Mul(&beta, x, &gamma)
   832  
   833  	p256Sum(&tmp, x, &delta)
   834  	p256Diff(&tmp2, x, &delta)
   835  	p256Mul(&alpha, &tmp, &tmp2)
   836  	p256Scalar3(&alpha)
   837  
   838  	p256Sum(&tmp, y, z)
   839  	p256Square(&tmp, &tmp)
   840  	p256Diff(&tmp, &tmp, &gamma)
   841  	p256Diff(zOut, &tmp, &delta)
   842  
   843  	p256Scalar4(&beta)
   844  	p256Square(xOut, &alpha)
   845  	p256Diff(xOut, xOut, &beta)
   846  	p256Diff(xOut, xOut, &beta)
   847  
   848  	p256Diff(&tmp, &beta, xOut)
   849  	p256Mul(&tmp, &alpha, &tmp)
   850  	p256Square(&tmp2, &gamma)
   851  	p256Scalar8(&tmp2)
   852  	p256Diff(yOut, &tmp, &tmp2)
   853  }
   854  
   855  // p256PointAddMixed sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,1}.
   856  // (i.e. the second point is affine.)
   857  //
   858  // See https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
   859  //
   860  // Note that this function does not handle P+P, infinity+P nor P+infinity
   861  // correctly.
   862  func p256PointAddMixed(xOut, yOut, zOut, x1, y1, z1, x2, y2 *[p256Limbs]uint32) {
   863  	var z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32
   864  
   865  	p256Square(&z1z1, z1)
   866  	p256Sum(&tmp, z1, z1)
   867  
   868  	p256Mul(&u2, x2, &z1z1)
   869  	p256Mul(&z1z1z1, z1, &z1z1)
   870  	p256Mul(&s2, y2, &z1z1z1)
   871  	p256Diff(&h, &u2, x1)
   872  	p256Sum(&i, &h, &h)
   873  	p256Square(&i, &i)
   874  	p256Mul(&j, &h, &i)
   875  	p256Diff(&r, &s2, y1)
   876  	p256Sum(&r, &r, &r)
   877  	p256Mul(&v, x1, &i)
   878  
   879  	p256Mul(zOut, &tmp, &h)
   880  	p256Square(&rr, &r)
   881  	p256Diff(xOut, &rr, &j)
   882  	p256Diff(xOut, xOut, &v)
   883  	p256Diff(xOut, xOut, &v)
   884  
   885  	p256Diff(&tmp, &v, xOut)
   886  	p256Mul(yOut, &tmp, &r)
   887  	p256Mul(&tmp, y1, &j)
   888  	p256Diff(yOut, yOut, &tmp)
   889  	p256Diff(yOut, yOut, &tmp)
   890  }
   891  
   892  // p256PointAdd sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,z2}.
   893  //
   894  // See https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
   895  //
   896  // Note that this function does not handle P+P, infinity+P nor P+infinity
   897  // correctly.
   898  func p256PointAdd(xOut, yOut, zOut, x1, y1, z1, x2, y2, z2 *[p256Limbs]uint32) {
   899  	var z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32
   900  
   901  	p256Square(&z1z1, z1)
   902  	p256Square(&z2z2, z2)
   903  	p256Mul(&u1, x1, &z2z2)
   904  
   905  	p256Sum(&tmp, z1, z2)
   906  	p256Square(&tmp, &tmp)
   907  	p256Diff(&tmp, &tmp, &z1z1)
   908  	p256Diff(&tmp, &tmp, &z2z2)
   909  
   910  	p256Mul(&z2z2z2, z2, &z2z2)
   911  	p256Mul(&s1, y1, &z2z2z2)
   912  
   913  	p256Mul(&u2, x2, &z1z1)
   914  	p256Mul(&z1z1z1, z1, &z1z1)
   915  	p256Mul(&s2, y2, &z1z1z1)
   916  	p256Diff(&h, &u2, &u1)
   917  	p256Sum(&i, &h, &h)
   918  	p256Square(&i, &i)
   919  	p256Mul(&j, &h, &i)
   920  	p256Diff(&r, &s2, &s1)
   921  	p256Sum(&r, &r, &r)
   922  	p256Mul(&v, &u1, &i)
   923  
   924  	p256Mul(zOut, &tmp, &h)
   925  	p256Square(&rr, &r)
   926  	p256Diff(xOut, &rr, &j)
   927  	p256Diff(xOut, xOut, &v)
   928  	p256Diff(xOut, xOut, &v)
   929  
   930  	p256Diff(&tmp, &v, xOut)
   931  	p256Mul(yOut, &tmp, &r)
   932  	p256Mul(&tmp, &s1, &j)
   933  	p256Diff(yOut, yOut, &tmp)
   934  	p256Diff(yOut, yOut, &tmp)
   935  }
   936  
   937  // p256CopyConditional sets out=in if mask = 0xffffffff in constant time.
   938  //
   939  // On entry: mask is either 0 or 0xffffffff.
   940  func p256CopyConditional(out, in *[p256Limbs]uint32, mask uint32) {
   941  	for i := 0; i < p256Limbs; i++ {
   942  		tmp := mask & (in[i] ^ out[i])
   943  		out[i] ^= tmp
   944  	}
   945  }
   946  
   947  // p256SelectAffinePoint sets {out_x,out_y} to the index'th entry of table.
   948  // On entry: index < 16, table[0] must be zero.
   949  func p256SelectAffinePoint(xOut, yOut *[p256Limbs]uint32, table []uint32, index uint32) {
   950  	for i := range xOut {
   951  		xOut[i] = 0
   952  	}
   953  	for i := range yOut {
   954  		yOut[i] = 0
   955  	}
   956  
   957  	for i := uint32(1); i < 16; i++ {
   958  		mask := i ^ index
   959  		mask |= mask >> 2
   960  		mask |= mask >> 1
   961  		mask &= 1
   962  		mask--
   963  		for j := range xOut {
   964  			xOut[j] |= table[0] & mask
   965  			table = table[1:]
   966  		}
   967  		for j := range yOut {
   968  			yOut[j] |= table[0] & mask
   969  			table = table[1:]
   970  		}
   971  	}
   972  }
   973  
   974  // p256SelectJacobianPoint sets {out_x,out_y,out_z} to the index'th entry of
   975  // table.
   976  // On entry: index < 16, table[0] must be zero.
   977  func p256SelectJacobianPoint(xOut, yOut, zOut *[p256Limbs]uint32, table *[16][3][p256Limbs]uint32, index uint32) {
   978  	for i := range xOut {
   979  		xOut[i] = 0
   980  	}
   981  	for i := range yOut {
   982  		yOut[i] = 0
   983  	}
   984  	for i := range zOut {
   985  		zOut[i] = 0
   986  	}
   987  
   988  	// The implicit value at index 0 is all zero. We don't need to perform that
   989  	// iteration of the loop because we already set out_* to zero.
   990  	for i := uint32(1); i < 16; i++ {
   991  		mask := i ^ index
   992  		mask |= mask >> 2
   993  		mask |= mask >> 1
   994  		mask &= 1
   995  		mask--
   996  		for j := range xOut {
   997  			xOut[j] |= table[i][0][j] & mask
   998  		}
   999  		for j := range yOut {
  1000  			yOut[j] |= table[i][1][j] & mask
  1001  		}
  1002  		for j := range zOut {
  1003  			zOut[j] |= table[i][2][j] & mask
  1004  		}
  1005  	}
  1006  }
  1007  
  1008  // p256GetBit returns the bit'th bit of scalar.
  1009  func p256GetBit(scalar *[32]uint8, bit uint) uint32 {
  1010  	return uint32(((scalar[bit>>3]) >> (bit & 7)) & 1)
  1011  }
  1012  
  1013  // p256ScalarBaseMult sets {xOut,yOut,zOut} = scalar*G where scalar is a
  1014  // little-endian number. Note that the value of scalar must be less than the
  1015  // order of the group.
  1016  func p256ScalarBaseMult(xOut, yOut, zOut *[p256Limbs]uint32, scalar *[32]uint8) {
  1017  	nIsInfinityMask := ^uint32(0)
  1018  	var pIsNoninfiniteMask, mask, tableOffset uint32
  1019  	var px, py, tx, ty, tz [p256Limbs]uint32
  1020  
  1021  	for i := range xOut {
  1022  		xOut[i] = 0
  1023  	}
  1024  	for i := range yOut {
  1025  		yOut[i] = 0
  1026  	}
  1027  	for i := range zOut {
  1028  		zOut[i] = 0
  1029  	}
  1030  
  1031  	// The loop adds bits at positions 0, 64, 128 and 192, followed by
  1032  	// positions 32,96,160 and 224 and does this 32 times.
  1033  	for i := uint(0); i < 32; i++ {
  1034  		if i != 0 {
  1035  			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1036  		}
  1037  		tableOffset = 0
  1038  		for j := uint(0); j <= 32; j += 32 {
  1039  			bit0 := p256GetBit(scalar, 31-i+j)
  1040  			bit1 := p256GetBit(scalar, 95-i+j)
  1041  			bit2 := p256GetBit(scalar, 159-i+j)
  1042  			bit3 := p256GetBit(scalar, 223-i+j)
  1043  			index := bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3)
  1044  
  1045  			p256SelectAffinePoint(&px, &py, p256Precomputed[tableOffset:], index)
  1046  			tableOffset += 30 * p256Limbs
  1047  
  1048  			// Since scalar is less than the order of the group, we know that
  1049  			// {xOut,yOut,zOut} != {px,py,1}, unless both are zero, which we handle
  1050  			// below.
  1051  			p256PointAddMixed(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py)
  1052  			// The result of pointAddMixed is incorrect if {xOut,yOut,zOut} is zero
  1053  			// (a.k.a.  the point at infinity). We handle that situation by
  1054  			// copying the point from the table.
  1055  			p256CopyConditional(xOut, &px, nIsInfinityMask)
  1056  			p256CopyConditional(yOut, &py, nIsInfinityMask)
  1057  			p256CopyConditional(zOut, &p256One, nIsInfinityMask)
  1058  
  1059  			// Equally, the result is also wrong if the point from the table is
  1060  			// zero, which happens when the index is zero. We handle that by
  1061  			// only copying from {tx,ty,tz} to {xOut,yOut,zOut} if index != 0.
  1062  			pIsNoninfiniteMask = nonZeroToAllOnes(index)
  1063  			mask = pIsNoninfiniteMask & ^nIsInfinityMask
  1064  			p256CopyConditional(xOut, &tx, mask)
  1065  			p256CopyConditional(yOut, &ty, mask)
  1066  			p256CopyConditional(zOut, &tz, mask)
  1067  			// If p was not zero, then n is now non-zero.
  1068  			nIsInfinityMask &^= pIsNoninfiniteMask
  1069  		}
  1070  	}
  1071  }
  1072  
  1073  // p256PointToAffine converts a Jacobian point to an affine point. If the input
  1074  // is the point at infinity then it returns (0, 0) in constant time.
  1075  func p256PointToAffine(xOut, yOut, x, y, z *[p256Limbs]uint32) {
  1076  	var zInv, zInvSq [p256Limbs]uint32
  1077  
  1078  	p256Invert(&zInv, z)
  1079  	p256Square(&zInvSq, &zInv)
  1080  	p256Mul(xOut, x, &zInvSq)
  1081  	p256Mul(&zInv, &zInv, &zInvSq)
  1082  	p256Mul(yOut, y, &zInv)
  1083  }
  1084  
  1085  // p256ToAffine returns a pair of *big.Int containing the affine representation
  1086  // of {x,y,z}.
  1087  func p256ToAffine(x, y, z *[p256Limbs]uint32) (xOut, yOut *big.Int) {
  1088  	var xx, yy [p256Limbs]uint32
  1089  	p256PointToAffine(&xx, &yy, x, y, z)
  1090  	return p256ToBig(&xx), p256ToBig(&yy)
  1091  }
  1092  
  1093  // p256ScalarMult sets {xOut,yOut,zOut} = scalar*{x,y}.
  1094  func p256ScalarMult(xOut, yOut, zOut, x, y *[p256Limbs]uint32, scalar *[32]uint8) {
  1095  	var px, py, pz, tx, ty, tz [p256Limbs]uint32
  1096  	var precomp [16][3][p256Limbs]uint32
  1097  	var nIsInfinityMask, index, pIsNoninfiniteMask, mask uint32
  1098  
  1099  	// We precompute 0,1,2,... times {x,y}.
  1100  	precomp[1][0] = *x
  1101  	precomp[1][1] = *y
  1102  	precomp[1][2] = p256One
  1103  
  1104  	for i := 2; i < 16; i += 2 {
  1105  		p256PointDouble(&precomp[i][0], &precomp[i][1], &precomp[i][2], &precomp[i/2][0], &precomp[i/2][1], &precomp[i/2][2])
  1106  		p256PointAddMixed(&precomp[i+1][0], &precomp[i+1][1], &precomp[i+1][2], &precomp[i][0], &precomp[i][1], &precomp[i][2], x, y)
  1107  	}
  1108  
  1109  	for i := range xOut {
  1110  		xOut[i] = 0
  1111  	}
  1112  	for i := range yOut {
  1113  		yOut[i] = 0
  1114  	}
  1115  	for i := range zOut {
  1116  		zOut[i] = 0
  1117  	}
  1118  	nIsInfinityMask = ^uint32(0)
  1119  
  1120  	// We add in a window of four bits each iteration and do this 64 times.
  1121  	for i := 0; i < 64; i++ {
  1122  		if i != 0 {
  1123  			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1124  			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1125  			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1126  			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1127  		}
  1128  
  1129  		index = uint32(scalar[31-i/2])
  1130  		if (i & 1) == 1 {
  1131  			index &= 15
  1132  		} else {
  1133  			index >>= 4
  1134  		}
  1135  
  1136  		// See the comments in scalarBaseMult about handling infinities.
  1137  		p256SelectJacobianPoint(&px, &py, &pz, &precomp, index)
  1138  		p256PointAdd(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py, &pz)
  1139  		p256CopyConditional(xOut, &px, nIsInfinityMask)
  1140  		p256CopyConditional(yOut, &py, nIsInfinityMask)
  1141  		p256CopyConditional(zOut, &pz, nIsInfinityMask)
  1142  
  1143  		pIsNoninfiniteMask = nonZeroToAllOnes(index)
  1144  		mask = pIsNoninfiniteMask & ^nIsInfinityMask
  1145  		p256CopyConditional(xOut, &tx, mask)
  1146  		p256CopyConditional(yOut, &ty, mask)
  1147  		p256CopyConditional(zOut, &tz, mask)
  1148  		nIsInfinityMask &^= pIsNoninfiniteMask
  1149  	}
  1150  }
  1151  
  1152  // p256FromBig sets out = R*in.
  1153  func p256FromBig(out *[p256Limbs]uint32, in *big.Int) {
  1154  	tmp := new(big.Int).Lsh(in, 257)
  1155  	tmp.Mod(tmp, p256Params.P)
  1156  
  1157  	for i := 0; i < p256Limbs; i++ {
  1158  		if bits := tmp.Bits(); len(bits) > 0 {
  1159  			out[i] = uint32(bits[0]) & bottom29Bits
  1160  		} else {
  1161  			out[i] = 0
  1162  		}
  1163  		tmp.Rsh(tmp, 29)
  1164  
  1165  		i++
  1166  		if i == p256Limbs {
  1167  			break
  1168  		}
  1169  
  1170  		if bits := tmp.Bits(); len(bits) > 0 {
  1171  			out[i] = uint32(bits[0]) & bottom28Bits
  1172  		} else {
  1173  			out[i] = 0
  1174  		}
  1175  		tmp.Rsh(tmp, 28)
  1176  	}
  1177  }
  1178  
  1179  // p256ToBig returns a *big.Int containing the value of in.
  1180  func p256ToBig(in *[p256Limbs]uint32) *big.Int {
  1181  	result, tmp := new(big.Int), new(big.Int)
  1182  
  1183  	result.SetInt64(int64(in[p256Limbs-1]))
  1184  	for i := p256Limbs - 2; i >= 0; i-- {
  1185  		if (i & 1) == 0 {
  1186  			result.Lsh(result, 29)
  1187  		} else {
  1188  			result.Lsh(result, 28)
  1189  		}
  1190  		tmp.SetInt64(int64(in[i]))
  1191  		result.Add(result, tmp)
  1192  	}
  1193  
  1194  	result.Mul(result, p256RInverse)
  1195  	result.Mod(result, p256Params.P)
  1196  	return result
  1197  }