github.com/fisco-bcos/crypto@v0.0.0-20200202032121-bd8ab0b5d4f1/elliptic/p256.go (about)

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !amd64,!arm64
     6  
     7  package elliptic
     8  
     9  // This file contains a constant-time, 32-bit implementation of P256.
    10  
    11  import (
    12  	"math/big"
    13  )
    14  
    15  type p256Curve struct {
    16  	*CurveParams
    17  }
    18  
    19  var (
    20  	p256Params *CurveParams
    21  
    22  	// RInverse contains 1/R mod p - the inverse of the Montgomery constant
    23  	// (2**257).
    24  	p256RInverse *big.Int
    25  )
    26  
    27  func initP256() {
    28  	// See FIPS 186-3, section D.2.3
    29  	p256Params = &CurveParams{Name: "P-256"}
    30  	p256Params.P, _ = new(big.Int).SetString("115792089210356248762697446949407573530086143415290314195533631308867097853951", 10)
    31  	p256Params.N, _ = new(big.Int).SetString("115792089210356248762697446949407573529996955224135760342422259061068512044369", 10)
    32  	p256Params.A, _ = new(big.Int).SetString("ffffffff00000001000000000000000000000000fffffffffffffffffffffffc", 16)
    33  	p256Params.B, _ = new(big.Int).SetString("5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b", 16)
    34  	p256Params.Gx, _ = new(big.Int).SetString("6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296", 16)
    35  	p256Params.Gy, _ = new(big.Int).SetString("4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5", 16)
    36  	p256Params.BitSize = 256
    37  
    38  	p256RInverse, _ = new(big.Int).SetString("7fffffff00000001fffffffe8000000100000000ffffffff0000000180000000", 16)
    39  
    40  	// Arch-specific initialization, i.e. let a platform dynamically pick a P256 implementation
    41  	initP256Arch()
    42  }
    43  
    44  func (curve p256Curve) Params() *CurveParams {
    45  	return curve.CurveParams
    46  }
    47  
    48  // p256GetScalar endian-swaps the big-endian scalar value from in and writes it
    49  // to out. If the scalar is equal or greater than the order of the group, it's
    50  // reduced modulo that order.
    51  func p256GetScalar(out *[32]byte, in []byte) {
    52  	n := new(big.Int).SetBytes(in)
    53  	var scalarBytes []byte
    54  
    55  	if n.Cmp(p256Params.N) >= 0 {
    56  		n.Mod(n, p256Params.N)
    57  		scalarBytes = n.Bytes()
    58  	} else {
    59  		scalarBytes = in
    60  	}
    61  
    62  	for i, v := range scalarBytes {
    63  		out[len(scalarBytes)-(1+i)] = v
    64  	}
    65  }
    66  
    67  func (p256Curve) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
    68  	var scalarReversed [32]byte
    69  	p256GetScalar(&scalarReversed, scalar)
    70  
    71  	var x1, y1, z1 [p256Limbs]uint32
    72  	p256ScalarBaseMult(&x1, &y1, &z1, &scalarReversed)
    73  	return p256ToAffine(&x1, &y1, &z1)
    74  }
    75  
    76  func (p256Curve) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
    77  	var scalarReversed [32]byte
    78  	p256GetScalar(&scalarReversed, scalar)
    79  
    80  	var px, py, x1, y1, z1 [p256Limbs]uint32
    81  	p256FromBig(&px, bigX)
    82  	p256FromBig(&py, bigY)
    83  	p256ScalarMult(&x1, &y1, &z1, &px, &py, &scalarReversed)
    84  	return p256ToAffine(&x1, &y1, &z1)
    85  }
    86  
    87  // Field elements are represented as nine, unsigned 32-bit words.
    88  //
    89  // The value of an field element is:
    90  //   x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228)
    91  //
    92  // That is, each limb is alternately 29 or 28-bits wide in little-endian
    93  // order.
    94  //
    95  // This means that a field element hits 2**257, rather than 2**256 as we would
    96  // like. A 28, 29, ... pattern would cause us to hit 2**256, but that causes
    97  // problems when multiplying as terms end up one bit short of a limb which
    98  // would require much bit-shifting to correct.
    99  //
   100  // Finally, the values stored in a field element are in Montgomery form. So the
   101  // value |y| is stored as (y*R) mod p, where p is the P-256 prime and R is
   102  // 2**257.
   103  
   104  const (
   105  	p256Limbs    = 9
   106  	bottom29Bits = 0x1fffffff
   107  )
   108  
   109  var (
   110  	// p256One is the number 1 as a field element.
   111  	p256One  = [p256Limbs]uint32{2, 0, 0, 0xffff800, 0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff, 0}
   112  	p256Zero = [p256Limbs]uint32{0, 0, 0, 0, 0, 0, 0, 0, 0}
   113  	// p256P is the prime modulus as a field element.
   114  	p256P = [p256Limbs]uint32{0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff, 0, 0, 0x200000, 0xf000000, 0xfffffff}
   115  	// p2562P is the twice prime modulus as a field element.
   116  	p2562P = [p256Limbs]uint32{0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff, 0, 0, 0x400000, 0xe000000, 0x1fffffff}
   117  )
   118  
   119  // p256Precomputed contains precomputed values to aid the calculation of scalar
   120  // multiples of the base point, G. It's actually two, equal length, tables
   121  // concatenated.
   122  //
   123  // The first table contains (x,y) field element pairs for 16 multiples of the
   124  // base point, G.
   125  //
   126  //   Index  |  Index (binary) | Value
   127  //       0  |           0000  | 0G (all zeros, omitted)
   128  //       1  |           0001  | G
   129  //       2  |           0010  | 2**64G
   130  //       3  |           0011  | 2**64G + G
   131  //       4  |           0100  | 2**128G
   132  //       5  |           0101  | 2**128G + G
   133  //       6  |           0110  | 2**128G + 2**64G
   134  //       7  |           0111  | 2**128G + 2**64G + G
   135  //       8  |           1000  | 2**192G
   136  //       9  |           1001  | 2**192G + G
   137  //      10  |           1010  | 2**192G + 2**64G
   138  //      11  |           1011  | 2**192G + 2**64G + G
   139  //      12  |           1100  | 2**192G + 2**128G
   140  //      13  |           1101  | 2**192G + 2**128G + G
   141  //      14  |           1110  | 2**192G + 2**128G + 2**64G
   142  //      15  |           1111  | 2**192G + 2**128G + 2**64G + G
   143  //
   144  // The second table follows the same style, but the terms are 2**32G,
   145  // 2**96G, 2**160G, 2**224G.
   146  //
   147  // This is ~2KB of data.
   148  var p256Precomputed = [p256Limbs * 2 * 15 * 2]uint32{
   149  	0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7edc, 0xd4a6eab, 0x3120bee,
   150  	0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba21, 0x14b10bb, 0xae3fe3,
   151  	0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe49073, 0x3fa36cc, 0x5ebcd2c,
   152  	0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea12446, 0xe1ade1e, 0xec91f22,
   153  	0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109, 0xa267a00, 0xb57c050,
   154  	0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0x7d6dee7, 0x2976e4b,
   155  	0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a5a9, 0x843a649, 0xc3ab0fa,
   156  	0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11, 0x58c43df, 0xf423fc2,
   157  	0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db40f, 0x83e277d, 0xb0dd609,
   158  	0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5, 0xe10c9e, 0x33ab581,
   159  	0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f, 0x48764cd, 0x76dbcca,
   160  	0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b20, 0x4ba3173, 0xc168c33,
   161  	0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0, 0x65dd7ff, 0x3a1e4f6,
   162  	0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f077, 0xa6add89, 0x4894acd,
   163  	0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0,
   164  	0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c, 0xda0cf5b, 0x812e881,
   165  	0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51, 0xc22be3e, 0xe35e65a,
   166  	0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9, 0x1c5a839, 0x47a1e26,
   167  	0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c502, 0x2f32042, 0xa17769b,
   168  	0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a02, 0x3fc93, 0x5620023,
   169  	0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c, 0x407f75c, 0xbaab133,
   170  	0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea7, 0x3293ac0, 0xcdc98aa,
   171  	0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29,
   172  	0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72, 0x73e1c35, 0xee70fbc,
   173  	0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85, 0x27de188, 0x66f70b8,
   174  	0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae914, 0x2f3ec51, 0x3826b59,
   175  	0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x823d9d2, 0x8213f39,
   176  	0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4a, 0xf5ddc3d, 0x3786689,
   177  	0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a729, 0x4be3499, 0x52b23aa,
   178  	0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb048035, 0xe31de66, 0xc6ecaa3,
   179  	0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a7529, 0xcb7beb1, 0xb2a78a1,
   180  	0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff658, 0xe3d6511, 0xc7d76f,
   181  	0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72,
   182  	0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d32411, 0xb04a838, 0xd760d2d,
   183  	0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11e, 0x20bca9a, 0x66f496b,
   184  	0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a,
   185  	0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56ff, 0x65ef930, 0x21dc4a,
   186  	0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15f, 0x624e62e, 0xa90ae2f,
   187  	0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522b, 0xdc78583, 0x40eeabb,
   188  	0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef34, 0xae2a960, 0x91b8bdc,
   189  	0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0x2413c8e, 0x5425bf9,
   190  	0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633, 0x7c91952, 0xd806dce,
   191  	0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef73, 0x8956f34, 0xe4b5cf2,
   192  	0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7, 0x627b614, 0x7371cca,
   193  	0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc9, 0x9c19bf2, 0x5882229,
   194  	0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b3, 0xe85ff25, 0x408ef57,
   195  	0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113, 0xa4a1769, 0x11fbc6c,
   196  	0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b7, 0x4acbad9, 0x5efc5fa,
   197  	0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc, 0x7bf0fa9, 0x957651,
   198  	0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec,
   199  	0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c12d, 0xf20bd46, 0x1951fa7,
   200  	0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74, 0x99bb618, 0x2db944c,
   201  	0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e74779, 0x576138, 0x9587927,
   202  	0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782d, 0xfc72e0b, 0x701b298,
   203  	0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5d8, 0xf858d3a, 0x942eea8,
   204  	0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a1, 0x8395659, 0x52ed4e2,
   205  	0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c0, 0x6bdf55a, 0x4e4457d,
   206  	0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747b, 0x878558d, 0x7d29aa4,
   207  	0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d7, 0xa5bef68, 0xb7b30d8,
   208  	0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f51951, 0x9d0c177, 0x1c49a78,
   209  }
   210  
   211  // Field element operations:
   212  
   213  // nonZeroToAllOnes returns:
   214  //   0xffffffff for 0 < x <= 2**31
   215  //   0 for x == 0 or x > 2**31.
   216  func nonZeroToAllOnes(x uint32) uint32 {
   217  	return ((x - 1) >> 31) - 1
   218  }
   219  
   220  // p256ReduceCarry adds a multiple of p in order to cancel |carry|,
   221  // which is a term at 2**257.
   222  //
   223  // On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28.
   224  // On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29.
   225  func p256ReduceCarry(inout *[p256Limbs]uint32, carry uint32) {
   226  	carry_mask := nonZeroToAllOnes(carry)
   227  
   228  	inout[0] += carry << 1
   229  	inout[3] += 0x10000000 & carry_mask
   230  	// carry < 2**3 thus (carry << 11) < 2**14 and we added 2**28 in the
   231  	// previous line therefore this doesn't underflow.
   232  	inout[3] -= carry << 11
   233  	inout[4] += (0x20000000 - 1) & carry_mask
   234  	inout[5] += (0x10000000 - 1) & carry_mask
   235  	inout[6] += (0x20000000 - 1) & carry_mask
   236  	inout[6] -= carry << 22
   237  	// This may underflow if carry is non-zero but, if so, we'll fix it in the
   238  	// next line.
   239  	inout[7] -= 1 & carry_mask
   240  	inout[7] += carry << 25
   241  }
   242  
   243  // p256Sum sets out = in+in2.
   244  //
   245  // On entry, in[i]+in2[i] must not overflow a 32-bit word.
   246  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
   247  func p256Sum(out, in, in2 *[p256Limbs]uint32) {
   248  	carry := uint32(0)
   249  	for i := 0; ; i++ {
   250  		out[i] = in[i] + in2[i]
   251  		out[i] += carry
   252  		carry = out[i] >> 29
   253  		out[i] &= bottom29Bits
   254  
   255  		i++
   256  		if i == p256Limbs {
   257  			break
   258  		}
   259  
   260  		out[i] = in[i] + in2[i]
   261  		out[i] += carry
   262  		carry = out[i] >> 28
   263  		out[i] &= bottom28Bits
   264  	}
   265  
   266  	p256ReduceCarry(out, carry)
   267  }
   268  
   269  const (
   270  	two30m2    = 1<<30 - 1<<2
   271  	two30p13m2 = 1<<30 + 1<<13 - 1<<2
   272  	two31m2    = 1<<31 - 1<<2
   273  	two31p24m2 = 1<<31 + 1<<24 - 1<<2
   274  	two30m27m2 = 1<<30 - 1<<27 - 1<<2
   275  )
   276  
   277  // p256Zero31 is 0 mod p.
   278  var p256Zero31 = [p256Limbs]uint32{two31m3, two30m2, two31m2, two30p13m2, two31m2, two30m2, two31p24m2, two30m27m2, two31m2}
   279  
   280  // p256Diff sets out = in-in2.
   281  //
   282  // On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
   283  //           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
   284  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   285  func p256Diff(out, in, in2 *[p256Limbs]uint32) {
   286  	var carry uint32
   287  
   288  	for i := 0; ; i++ {
   289  		out[i] = in[i] - in2[i]
   290  		out[i] += p256Zero31[i]
   291  		out[i] += carry
   292  		carry = out[i] >> 29
   293  		out[i] &= bottom29Bits
   294  
   295  		i++
   296  		if i == p256Limbs {
   297  			break
   298  		}
   299  
   300  		out[i] = in[i] - in2[i]
   301  		out[i] += p256Zero31[i]
   302  		out[i] += carry
   303  		carry = out[i] >> 28
   304  		out[i] &= bottom28Bits
   305  	}
   306  
   307  	p256ReduceCarry(out, carry)
   308  }
   309  
   310  // p256ReduceDegree sets out = tmp/R mod p where tmp contains 64-bit words with
   311  // the same 29,28,... bit positions as an field element.
   312  //
   313  // The values in field elements are in Montgomery form: x*R mod p where R =
   314  // 2**257. Since we just multiplied two Montgomery values together, the result
   315  // is x*y*R*R mod p. We wish to divide by R in order for the result also to be
   316  // in Montgomery form.
   317  //
   318  // On entry: tmp[i] < 2**64
   319  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
   320  func p256ReduceDegree(out *[p256Limbs]uint32, tmp [17]uint64) {
   321  	// The following table may be helpful when reading this code:
   322  	//
   323  	// Limb number:   0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10...
   324  	// Width (bits):  29| 28| 29| 28| 29| 28| 29| 28| 29| 28| 29
   325  	// Start bit:     0 | 29| 57| 86|114|143|171|200|228|257|285
   326  	//   (odd phase): 0 | 28| 57| 85|114|142|171|199|228|256|285
   327  	var tmp2 [18]uint32
   328  	var carry, x, xMask uint32
   329  
   330  	// tmp contains 64-bit words with the same 29,28,29-bit positions as an
   331  	// field element. So the top of an element of tmp might overlap with
   332  	// another element two positions down. The following loop eliminates
   333  	// this overlap.
   334  	tmp2[0] = uint32(tmp[0]) & bottom29Bits
   335  
   336  	tmp2[1] = uint32(tmp[0]) >> 29
   337  	tmp2[1] |= (uint32(tmp[0]>>32) << 3) & bottom28Bits
   338  	tmp2[1] += uint32(tmp[1]) & bottom28Bits
   339  	carry = tmp2[1] >> 28
   340  	tmp2[1] &= bottom28Bits
   341  
   342  	for i := 2; i < 17; i++ {
   343  		tmp2[i] = (uint32(tmp[i-2] >> 32)) >> 25
   344  		tmp2[i] += (uint32(tmp[i-1])) >> 28
   345  		tmp2[i] += (uint32(tmp[i-1]>>32) << 4) & bottom29Bits
   346  		tmp2[i] += uint32(tmp[i]) & bottom29Bits
   347  		tmp2[i] += carry
   348  		carry = tmp2[i] >> 29
   349  		tmp2[i] &= bottom29Bits
   350  
   351  		i++
   352  		if i == 17 {
   353  			break
   354  		}
   355  		tmp2[i] = uint32(tmp[i-2]>>32) >> 25
   356  		tmp2[i] += uint32(tmp[i-1]) >> 29
   357  		tmp2[i] += ((uint32(tmp[i-1] >> 32)) << 3) & bottom28Bits
   358  		tmp2[i] += uint32(tmp[i]) & bottom28Bits
   359  		tmp2[i] += carry
   360  		carry = tmp2[i] >> 28
   361  		tmp2[i] &= bottom28Bits
   362  	}
   363  
   364  	tmp2[17] = uint32(tmp[15]>>32) >> 25
   365  	tmp2[17] += uint32(tmp[16]) >> 29
   366  	tmp2[17] += uint32(tmp[16]>>32) << 3
   367  	tmp2[17] += carry
   368  
   369  	// Montgomery elimination of terms:
   370  	//
   371  	// Since R is 2**257, we can divide by R with a bitwise shift if we can
   372  	// ensure that the right-most 257 bits are all zero. We can make that true
   373  	// by adding multiplies of p without affecting the value.
   374  	//
   375  	// So we eliminate limbs from right to left. Since the bottom 29 bits of p
   376  	// are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
   377  	// We can do that for 8 further limbs and then right shift to eliminate the
   378  	// extra factor of R.
   379  	for i := 0; ; i += 2 {
   380  		tmp2[i+1] += tmp2[i] >> 29
   381  		x = tmp2[i] & bottom29Bits
   382  		xMask = nonZeroToAllOnes(x)
   383  		tmp2[i] = 0
   384  
   385  		// The bounds calculations for this loop are tricky. Each iteration of
   386  		// the loop eliminates two words by adding values to words to their
   387  		// right.
   388  		//
   389  		// The following table contains the amounts added to each word (as an
   390  		// offset from the value of i at the top of the loop). The amounts are
   391  		// accounted for from the first and second half of the loop separately
   392  		// and are written as, for example, 28 to mean a value <2**28.
   393  		//
   394  		// Word:                   3   4   5   6   7   8   9   10
   395  		// Added in top half:     28  11      29  21  29  28
   396  		//                                        28  29
   397  		//                                            29
   398  		// Added in bottom half:      29  10      28  21  28   28
   399  		//                                            29
   400  		//
   401  		// The value that is currently offset 7 will be offset 5 for the next
   402  		// iteration and then offset 3 for the iteration after that. Therefore
   403  		// the total value added will be the values added at 7, 5 and 3.
   404  		//
   405  		// The following table accumulates these values. The sums at the bottom
   406  		// are written as, for example, 29+28, to mean a value < 2**29+2**28.
   407  		//
   408  		// Word:                   3   4   5   6   7   8   9  10  11  12  13
   409  		//                        28  11  10  29  21  29  28  28  28  28  28
   410  		//                            29  28  11  28  29  28  29  28  29  28
   411  		//                                    29  28  21  21  29  21  29  21
   412  		//                                        10  29  28  21  28  21  28
   413  		//                                        28  29  28  29  28  29  28
   414  		//                                            11  10  29  10  29  10
   415  		//                                            29  28  11  28  11
   416  		//                                                    29      29
   417  		//                        --------------------------------------------
   418  		//                                                30+ 31+ 30+ 31+ 30+
   419  		//                                                28+ 29+ 28+ 29+ 21+
   420  		//                                                21+ 28+ 21+ 28+ 10
   421  		//                                                10  21+ 10  21+
   422  		//                                                    11      11
   423  		//
   424  		// So the greatest amount is added to tmp2[10] and tmp2[12]. If
   425  		// tmp2[10/12] has an initial value of <2**29, then the maximum value
   426  		// will be < 2**31 + 2**30 + 2**28 + 2**21 + 2**11, which is < 2**32,
   427  		// as required.
   428  		tmp2[i+3] += (x << 10) & bottom28Bits
   429  		tmp2[i+4] += (x >> 18)
   430  
   431  		tmp2[i+6] += (x << 21) & bottom29Bits
   432  		tmp2[i+7] += x >> 8
   433  
   434  		// At position 200, which is the starting bit position for word 7, we
   435  		// have a factor of 0xf000000 = 2**28 - 2**24.
   436  		tmp2[i+7] += 0x10000000 & xMask
   437  		tmp2[i+8] += (x - 1) & xMask
   438  		tmp2[i+7] -= (x << 24) & bottom28Bits
   439  		tmp2[i+8] -= x >> 4
   440  
   441  		tmp2[i+8] += 0x20000000 & xMask
   442  		tmp2[i+8] -= x
   443  		tmp2[i+8] += (x << 28) & bottom29Bits
   444  		tmp2[i+9] += ((x >> 1) - 1) & xMask
   445  
   446  		if i+1 == p256Limbs {
   447  			break
   448  		}
   449  		tmp2[i+2] += tmp2[i+1] >> 28
   450  		x = tmp2[i+1] & bottom28Bits
   451  		xMask = nonZeroToAllOnes(x)
   452  		tmp2[i+1] = 0
   453  
   454  		tmp2[i+4] += (x << 11) & bottom29Bits
   455  		tmp2[i+5] += (x >> 18)
   456  
   457  		tmp2[i+7] += (x << 21) & bottom28Bits
   458  		tmp2[i+8] += x >> 7
   459  
   460  		// At position 199, which is the starting bit of the 8th word when
   461  		// dealing with a context starting on an odd word, we have a factor of
   462  		// 0x1e000000 = 2**29 - 2**25. Since we have not updated i, the 8th
   463  		// word from i+1 is i+8.
   464  		tmp2[i+8] += 0x20000000 & xMask
   465  		tmp2[i+9] += (x - 1) & xMask
   466  		tmp2[i+8] -= (x << 25) & bottom29Bits
   467  		tmp2[i+9] -= x >> 4
   468  
   469  		tmp2[i+9] += 0x10000000 & xMask
   470  		tmp2[i+9] -= x
   471  		tmp2[i+10] += (x - 1) & xMask
   472  	}
   473  
   474  	// We merge the right shift with a carry chain. The words above 2**257 have
   475  	// widths of 28,29,... which we need to correct when copying them down.
   476  	carry = 0
   477  	for i := 0; i < 8; i++ {
   478  		// The maximum value of tmp2[i + 9] occurs on the first iteration and
   479  		// is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is
   480  		// therefore safe.
   481  		out[i] = tmp2[i+9]
   482  		out[i] += carry
   483  		out[i] += (tmp2[i+10] << 28) & bottom29Bits
   484  		carry = out[i] >> 29
   485  		out[i] &= bottom29Bits
   486  
   487  		i++
   488  		out[i] = tmp2[i+9] >> 1
   489  		out[i] += carry
   490  		carry = out[i] >> 28
   491  		out[i] &= bottom28Bits
   492  	}
   493  
   494  	out[8] = tmp2[17]
   495  	out[8] += carry
   496  	carry = out[8] >> 29
   497  	out[8] &= bottom29Bits
   498  
   499  	p256ReduceCarry(out, carry)
   500  }
   501  
   502  // p256Square sets out=in*in.
   503  //
   504  // On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29.
   505  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   506  func p256Square(out, in *[p256Limbs]uint32) {
   507  	var tmp [17]uint64
   508  
   509  	tmp[0] = uint64(in[0]) * uint64(in[0])
   510  	tmp[1] = uint64(in[0]) * (uint64(in[1]) << 1)
   511  	tmp[2] = uint64(in[0])*(uint64(in[2])<<1) +
   512  		uint64(in[1])*(uint64(in[1])<<1)
   513  	tmp[3] = uint64(in[0])*(uint64(in[3])<<1) +
   514  		uint64(in[1])*(uint64(in[2])<<1)
   515  	tmp[4] = uint64(in[0])*(uint64(in[4])<<1) +
   516  		uint64(in[1])*(uint64(in[3])<<2) +
   517  		uint64(in[2])*uint64(in[2])
   518  	tmp[5] = uint64(in[0])*(uint64(in[5])<<1) +
   519  		uint64(in[1])*(uint64(in[4])<<1) +
   520  		uint64(in[2])*(uint64(in[3])<<1)
   521  	tmp[6] = uint64(in[0])*(uint64(in[6])<<1) +
   522  		uint64(in[1])*(uint64(in[5])<<2) +
   523  		uint64(in[2])*(uint64(in[4])<<1) +
   524  		uint64(in[3])*(uint64(in[3])<<1)
   525  	tmp[7] = uint64(in[0])*(uint64(in[7])<<1) +
   526  		uint64(in[1])*(uint64(in[6])<<1) +
   527  		uint64(in[2])*(uint64(in[5])<<1) +
   528  		uint64(in[3])*(uint64(in[4])<<1)
   529  	// tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60,
   530  	// which is < 2**64 as required.
   531  	tmp[8] = uint64(in[0])*(uint64(in[8])<<1) +
   532  		uint64(in[1])*(uint64(in[7])<<2) +
   533  		uint64(in[2])*(uint64(in[6])<<1) +
   534  		uint64(in[3])*(uint64(in[5])<<2) +
   535  		uint64(in[4])*uint64(in[4])
   536  	tmp[9] = uint64(in[1])*(uint64(in[8])<<1) +
   537  		uint64(in[2])*(uint64(in[7])<<1) +
   538  		uint64(in[3])*(uint64(in[6])<<1) +
   539  		uint64(in[4])*(uint64(in[5])<<1)
   540  	tmp[10] = uint64(in[2])*(uint64(in[8])<<1) +
   541  		uint64(in[3])*(uint64(in[7])<<2) +
   542  		uint64(in[4])*(uint64(in[6])<<1) +
   543  		uint64(in[5])*(uint64(in[5])<<1)
   544  	tmp[11] = uint64(in[3])*(uint64(in[8])<<1) +
   545  		uint64(in[4])*(uint64(in[7])<<1) +
   546  		uint64(in[5])*(uint64(in[6])<<1)
   547  	tmp[12] = uint64(in[4])*(uint64(in[8])<<1) +
   548  		uint64(in[5])*(uint64(in[7])<<2) +
   549  		uint64(in[6])*uint64(in[6])
   550  	tmp[13] = uint64(in[5])*(uint64(in[8])<<1) +
   551  		uint64(in[6])*(uint64(in[7])<<1)
   552  	tmp[14] = uint64(in[6])*(uint64(in[8])<<1) +
   553  		uint64(in[7])*(uint64(in[7])<<1)
   554  	tmp[15] = uint64(in[7]) * (uint64(in[8]) << 1)
   555  	tmp[16] = uint64(in[8]) * uint64(in[8])
   556  
   557  	p256ReduceDegree(out, tmp)
   558  }
   559  
   560  // p256Mul sets out=in*in2.
   561  //
   562  // On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
   563  //           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
   564  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   565  func p256Mul(out, in, in2 *[p256Limbs]uint32) {
   566  	var tmp [17]uint64
   567  
   568  	tmp[0] = uint64(in[0]) * uint64(in2[0])
   569  	tmp[1] = uint64(in[0])*(uint64(in2[1])<<0) +
   570  		uint64(in[1])*(uint64(in2[0])<<0)
   571  	tmp[2] = uint64(in[0])*(uint64(in2[2])<<0) +
   572  		uint64(in[1])*(uint64(in2[1])<<1) +
   573  		uint64(in[2])*(uint64(in2[0])<<0)
   574  	tmp[3] = uint64(in[0])*(uint64(in2[3])<<0) +
   575  		uint64(in[1])*(uint64(in2[2])<<0) +
   576  		uint64(in[2])*(uint64(in2[1])<<0) +
   577  		uint64(in[3])*(uint64(in2[0])<<0)
   578  	tmp[4] = uint64(in[0])*(uint64(in2[4])<<0) +
   579  		uint64(in[1])*(uint64(in2[3])<<1) +
   580  		uint64(in[2])*(uint64(in2[2])<<0) +
   581  		uint64(in[3])*(uint64(in2[1])<<1) +
   582  		uint64(in[4])*(uint64(in2[0])<<0)
   583  	tmp[5] = uint64(in[0])*(uint64(in2[5])<<0) +
   584  		uint64(in[1])*(uint64(in2[4])<<0) +
   585  		uint64(in[2])*(uint64(in2[3])<<0) +
   586  		uint64(in[3])*(uint64(in2[2])<<0) +
   587  		uint64(in[4])*(uint64(in2[1])<<0) +
   588  		uint64(in[5])*(uint64(in2[0])<<0)
   589  	tmp[6] = uint64(in[0])*(uint64(in2[6])<<0) +
   590  		uint64(in[1])*(uint64(in2[5])<<1) +
   591  		uint64(in[2])*(uint64(in2[4])<<0) +
   592  		uint64(in[3])*(uint64(in2[3])<<1) +
   593  		uint64(in[4])*(uint64(in2[2])<<0) +
   594  		uint64(in[5])*(uint64(in2[1])<<1) +
   595  		uint64(in[6])*(uint64(in2[0])<<0)
   596  	tmp[7] = uint64(in[0])*(uint64(in2[7])<<0) +
   597  		uint64(in[1])*(uint64(in2[6])<<0) +
   598  		uint64(in[2])*(uint64(in2[5])<<0) +
   599  		uint64(in[3])*(uint64(in2[4])<<0) +
   600  		uint64(in[4])*(uint64(in2[3])<<0) +
   601  		uint64(in[5])*(uint64(in2[2])<<0) +
   602  		uint64(in[6])*(uint64(in2[1])<<0) +
   603  		uint64(in[7])*(uint64(in2[0])<<0)
   604  	// tmp[8] has the greatest value but doesn't overflow. See logic in
   605  	// p256Square.
   606  	tmp[8] = uint64(in[0])*(uint64(in2[8])<<0) +
   607  		uint64(in[1])*(uint64(in2[7])<<1) +
   608  		uint64(in[2])*(uint64(in2[6])<<0) +
   609  		uint64(in[3])*(uint64(in2[5])<<1) +
   610  		uint64(in[4])*(uint64(in2[4])<<0) +
   611  		uint64(in[5])*(uint64(in2[3])<<1) +
   612  		uint64(in[6])*(uint64(in2[2])<<0) +
   613  		uint64(in[7])*(uint64(in2[1])<<1) +
   614  		uint64(in[8])*(uint64(in2[0])<<0)
   615  	tmp[9] = uint64(in[1])*(uint64(in2[8])<<0) +
   616  		uint64(in[2])*(uint64(in2[7])<<0) +
   617  		uint64(in[3])*(uint64(in2[6])<<0) +
   618  		uint64(in[4])*(uint64(in2[5])<<0) +
   619  		uint64(in[5])*(uint64(in2[4])<<0) +
   620  		uint64(in[6])*(uint64(in2[3])<<0) +
   621  		uint64(in[7])*(uint64(in2[2])<<0) +
   622  		uint64(in[8])*(uint64(in2[1])<<0)
   623  	tmp[10] = uint64(in[2])*(uint64(in2[8])<<0) +
   624  		uint64(in[3])*(uint64(in2[7])<<1) +
   625  		uint64(in[4])*(uint64(in2[6])<<0) +
   626  		uint64(in[5])*(uint64(in2[5])<<1) +
   627  		uint64(in[6])*(uint64(in2[4])<<0) +
   628  		uint64(in[7])*(uint64(in2[3])<<1) +
   629  		uint64(in[8])*(uint64(in2[2])<<0)
   630  	tmp[11] = uint64(in[3])*(uint64(in2[8])<<0) +
   631  		uint64(in[4])*(uint64(in2[7])<<0) +
   632  		uint64(in[5])*(uint64(in2[6])<<0) +
   633  		uint64(in[6])*(uint64(in2[5])<<0) +
   634  		uint64(in[7])*(uint64(in2[4])<<0) +
   635  		uint64(in[8])*(uint64(in2[3])<<0)
   636  	tmp[12] = uint64(in[4])*(uint64(in2[8])<<0) +
   637  		uint64(in[5])*(uint64(in2[7])<<1) +
   638  		uint64(in[6])*(uint64(in2[6])<<0) +
   639  		uint64(in[7])*(uint64(in2[5])<<1) +
   640  		uint64(in[8])*(uint64(in2[4])<<0)
   641  	tmp[13] = uint64(in[5])*(uint64(in2[8])<<0) +
   642  		uint64(in[6])*(uint64(in2[7])<<0) +
   643  		uint64(in[7])*(uint64(in2[6])<<0) +
   644  		uint64(in[8])*(uint64(in2[5])<<0)
   645  	tmp[14] = uint64(in[6])*(uint64(in2[8])<<0) +
   646  		uint64(in[7])*(uint64(in2[7])<<1) +
   647  		uint64(in[8])*(uint64(in2[6])<<0)
   648  	tmp[15] = uint64(in[7])*(uint64(in2[8])<<0) +
   649  		uint64(in[8])*(uint64(in2[7])<<0)
   650  	tmp[16] = uint64(in[8]) * (uint64(in2[8]) << 0)
   651  
   652  	p256ReduceDegree(out, tmp)
   653  }
   654  
   655  func p256Assign(out, in *[p256Limbs]uint32) {
   656  	*out = *in
   657  }
   658  
   659  // p256Invert calculates |out| = |in|^{-1}
   660  //
   661  // Based on Fermat's Little Theorem:
   662  //   a^p = a (mod p)
   663  //   a^{p-1} = 1 (mod p)
   664  //   a^{p-2} = a^{-1} (mod p)
   665  func p256Invert(out, in *[p256Limbs]uint32) {
   666  	var ftmp, ftmp2 [p256Limbs]uint32
   667  
   668  	// each e_I will hold |in|^{2^I - 1}
   669  	var e2, e4, e8, e16, e32, e64 [p256Limbs]uint32
   670  
   671  	p256Square(&ftmp, in)     // 2^1
   672  	p256Mul(&ftmp, in, &ftmp) // 2^2 - 2^0
   673  	p256Assign(&e2, &ftmp)
   674  	p256Square(&ftmp, &ftmp)   // 2^3 - 2^1
   675  	p256Square(&ftmp, &ftmp)   // 2^4 - 2^2
   676  	p256Mul(&ftmp, &ftmp, &e2) // 2^4 - 2^0
   677  	p256Assign(&e4, &ftmp)
   678  	p256Square(&ftmp, &ftmp)   // 2^5 - 2^1
   679  	p256Square(&ftmp, &ftmp)   // 2^6 - 2^2
   680  	p256Square(&ftmp, &ftmp)   // 2^7 - 2^3
   681  	p256Square(&ftmp, &ftmp)   // 2^8 - 2^4
   682  	p256Mul(&ftmp, &ftmp, &e4) // 2^8 - 2^0
   683  	p256Assign(&e8, &ftmp)
   684  	for i := 0; i < 8; i++ {
   685  		p256Square(&ftmp, &ftmp)
   686  	} // 2^16 - 2^8
   687  	p256Mul(&ftmp, &ftmp, &e8) // 2^16 - 2^0
   688  	p256Assign(&e16, &ftmp)
   689  	for i := 0; i < 16; i++ {
   690  		p256Square(&ftmp, &ftmp)
   691  	} // 2^32 - 2^16
   692  	p256Mul(&ftmp, &ftmp, &e16) // 2^32 - 2^0
   693  	p256Assign(&e32, &ftmp)
   694  	for i := 0; i < 32; i++ {
   695  		p256Square(&ftmp, &ftmp)
   696  	} // 2^64 - 2^32
   697  	p256Assign(&e64, &ftmp)
   698  	p256Mul(&ftmp, &ftmp, in) // 2^64 - 2^32 + 2^0
   699  	for i := 0; i < 192; i++ {
   700  		p256Square(&ftmp, &ftmp)
   701  	} // 2^256 - 2^224 + 2^192
   702  
   703  	p256Mul(&ftmp2, &e64, &e32) // 2^64 - 2^0
   704  	for i := 0; i < 16; i++ {
   705  		p256Square(&ftmp2, &ftmp2)
   706  	} // 2^80 - 2^16
   707  	p256Mul(&ftmp2, &ftmp2, &e16) // 2^80 - 2^0
   708  	for i := 0; i < 8; i++ {
   709  		p256Square(&ftmp2, &ftmp2)
   710  	} // 2^88 - 2^8
   711  	p256Mul(&ftmp2, &ftmp2, &e8) // 2^88 - 2^0
   712  	for i := 0; i < 4; i++ {
   713  		p256Square(&ftmp2, &ftmp2)
   714  	} // 2^92 - 2^4
   715  	p256Mul(&ftmp2, &ftmp2, &e4) // 2^92 - 2^0
   716  	p256Square(&ftmp2, &ftmp2)   // 2^93 - 2^1
   717  	p256Square(&ftmp2, &ftmp2)   // 2^94 - 2^2
   718  	p256Mul(&ftmp2, &ftmp2, &e2) // 2^94 - 2^0
   719  	p256Square(&ftmp2, &ftmp2)   // 2^95 - 2^1
   720  	p256Square(&ftmp2, &ftmp2)   // 2^96 - 2^2
   721  	p256Mul(&ftmp2, &ftmp2, in)  // 2^96 - 3
   722  
   723  	p256Mul(out, &ftmp2, &ftmp) // 2^256 - 2^224 + 2^192 + 2^96 - 3
   724  }
   725  
   726  // p256Scalar3 sets out=3*out.
   727  //
   728  // On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   729  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   730  func p256Scalar3(out *[p256Limbs]uint32) {
   731  	var carry uint32
   732  
   733  	for i := 0; ; i++ {
   734  		out[i] *= 3
   735  		out[i] += carry
   736  		carry = out[i] >> 29
   737  		out[i] &= bottom29Bits
   738  
   739  		i++
   740  		if i == p256Limbs {
   741  			break
   742  		}
   743  
   744  		out[i] *= 3
   745  		out[i] += carry
   746  		carry = out[i] >> 28
   747  		out[i] &= bottom28Bits
   748  	}
   749  
   750  	p256ReduceCarry(out, carry)
   751  }
   752  
   753  // p256Scalar4 sets out=4*out.
   754  //
   755  // On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   756  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   757  func p256Scalar4(out *[p256Limbs]uint32) {
   758  	var carry, nextCarry uint32
   759  
   760  	for i := 0; ; i++ {
   761  		nextCarry = out[i] >> 27
   762  		out[i] <<= 2
   763  		out[i] &= bottom29Bits
   764  		out[i] += carry
   765  		carry = nextCarry + (out[i] >> 29)
   766  		out[i] &= bottom29Bits
   767  
   768  		i++
   769  		if i == p256Limbs {
   770  			break
   771  		}
   772  		nextCarry = out[i] >> 26
   773  		out[i] <<= 2
   774  		out[i] &= bottom28Bits
   775  		out[i] += carry
   776  		carry = nextCarry + (out[i] >> 28)
   777  		out[i] &= bottom28Bits
   778  	}
   779  
   780  	p256ReduceCarry(out, carry)
   781  }
   782  
   783  // p256Scalar8 sets out=8*out.
   784  //
   785  // On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   786  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   787  func p256Scalar8(out *[p256Limbs]uint32) {
   788  	var carry, nextCarry uint32
   789  
   790  	for i := 0; ; i++ {
   791  		nextCarry = out[i] >> 26
   792  		out[i] <<= 3
   793  		out[i] &= bottom29Bits
   794  		out[i] += carry
   795  		carry = nextCarry + (out[i] >> 29)
   796  		out[i] &= bottom29Bits
   797  
   798  		i++
   799  		if i == p256Limbs {
   800  			break
   801  		}
   802  		nextCarry = out[i] >> 25
   803  		out[i] <<= 3
   804  		out[i] &= bottom28Bits
   805  		out[i] += carry
   806  		carry = nextCarry + (out[i] >> 28)
   807  		out[i] &= bottom28Bits
   808  	}
   809  
   810  	p256ReduceCarry(out, carry)
   811  }
   812  
   813  // Group operations:
   814  //
   815  // Elements of the elliptic curve group are represented in Jacobian
   816  // coordinates: (x, y, z). An affine point (x', y') is x'=x/z**2, y'=y/z**3 in
   817  // Jacobian form.
   818  
   819  // p256PointDouble sets {xOut,yOut,zOut} = 2*{x,y,z}.
   820  //
   821  // See https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
   822  func p256PointDouble(xOut, yOut, zOut, x, y, z *[p256Limbs]uint32) {
   823  	var delta, gamma, alpha, beta, tmp, tmp2 [p256Limbs]uint32
   824  
   825  	p256Square(&delta, z)
   826  	p256Square(&gamma, y)
   827  	p256Mul(&beta, x, &gamma)
   828  
   829  	p256Sum(&tmp, x, &delta)
   830  	p256Diff(&tmp2, x, &delta)
   831  	p256Mul(&alpha, &tmp, &tmp2)
   832  	p256Scalar3(&alpha)
   833  
   834  	p256Sum(&tmp, y, z)
   835  	p256Square(&tmp, &tmp)
   836  	p256Diff(&tmp, &tmp, &gamma)
   837  	p256Diff(zOut, &tmp, &delta)
   838  
   839  	p256Scalar4(&beta)
   840  	p256Square(xOut, &alpha)
   841  	p256Diff(xOut, xOut, &beta)
   842  	p256Diff(xOut, xOut, &beta)
   843  
   844  	p256Diff(&tmp, &beta, xOut)
   845  	p256Mul(&tmp, &alpha, &tmp)
   846  	p256Square(&tmp2, &gamma)
   847  	p256Scalar8(&tmp2)
   848  	p256Diff(yOut, &tmp, &tmp2)
   849  }
   850  
   851  // p256PointAddMixed sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,1}.
   852  // (i.e. the second point is affine.)
   853  //
   854  // See https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
   855  //
   856  // Note that this function does not handle P+P, infinity+P nor P+infinity
   857  // correctly.
   858  func p256PointAddMixed(xOut, yOut, zOut, x1, y1, z1, x2, y2 *[p256Limbs]uint32) {
   859  	var z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32
   860  
   861  	p256Square(&z1z1, z1)
   862  	p256Sum(&tmp, z1, z1)
   863  
   864  	p256Mul(&u2, x2, &z1z1)
   865  	p256Mul(&z1z1z1, z1, &z1z1)
   866  	p256Mul(&s2, y2, &z1z1z1)
   867  	p256Diff(&h, &u2, x1)
   868  	p256Sum(&i, &h, &h)
   869  	p256Square(&i, &i)
   870  	p256Mul(&j, &h, &i)
   871  	p256Diff(&r, &s2, y1)
   872  	p256Sum(&r, &r, &r)
   873  	p256Mul(&v, x1, &i)
   874  
   875  	p256Mul(zOut, &tmp, &h)
   876  	p256Square(&rr, &r)
   877  	p256Diff(xOut, &rr, &j)
   878  	p256Diff(xOut, xOut, &v)
   879  	p256Diff(xOut, xOut, &v)
   880  
   881  	p256Diff(&tmp, &v, xOut)
   882  	p256Mul(yOut, &tmp, &r)
   883  	p256Mul(&tmp, y1, &j)
   884  	p256Diff(yOut, yOut, &tmp)
   885  	p256Diff(yOut, yOut, &tmp)
   886  }
   887  
   888  // p256PointAdd sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,z2}.
   889  //
   890  // See https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
   891  //
   892  // Note that this function does not handle P+P, infinity+P nor P+infinity
   893  // correctly.
   894  func p256PointAdd(xOut, yOut, zOut, x1, y1, z1, x2, y2, z2 *[p256Limbs]uint32) {
   895  	var z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32
   896  
   897  	p256Square(&z1z1, z1)
   898  	p256Square(&z2z2, z2)
   899  	p256Mul(&u1, x1, &z2z2)
   900  
   901  	p256Sum(&tmp, z1, z2)
   902  	p256Square(&tmp, &tmp)
   903  	p256Diff(&tmp, &tmp, &z1z1)
   904  	p256Diff(&tmp, &tmp, &z2z2)
   905  
   906  	p256Mul(&z2z2z2, z2, &z2z2)
   907  	p256Mul(&s1, y1, &z2z2z2)
   908  
   909  	p256Mul(&u2, x2, &z1z1)
   910  	p256Mul(&z1z1z1, z1, &z1z1)
   911  	p256Mul(&s2, y2, &z1z1z1)
   912  	p256Diff(&h, &u2, &u1)
   913  	p256Sum(&i, &h, &h)
   914  	p256Square(&i, &i)
   915  	p256Mul(&j, &h, &i)
   916  	p256Diff(&r, &s2, &s1)
   917  	p256Sum(&r, &r, &r)
   918  	p256Mul(&v, &u1, &i)
   919  
   920  	p256Mul(zOut, &tmp, &h)
   921  	p256Square(&rr, &r)
   922  	p256Diff(xOut, &rr, &j)
   923  	p256Diff(xOut, xOut, &v)
   924  	p256Diff(xOut, xOut, &v)
   925  
   926  	p256Diff(&tmp, &v, xOut)
   927  	p256Mul(yOut, &tmp, &r)
   928  	p256Mul(&tmp, &s1, &j)
   929  	p256Diff(yOut, yOut, &tmp)
   930  	p256Diff(yOut, yOut, &tmp)
   931  }
   932  
   933  // p256CopyConditional sets out=in if mask = 0xffffffff in constant time.
   934  //
   935  // On entry: mask is either 0 or 0xffffffff.
   936  func p256CopyConditional(out, in *[p256Limbs]uint32, mask uint32) {
   937  	for i := 0; i < p256Limbs; i++ {
   938  		tmp := mask & (in[i] ^ out[i])
   939  		out[i] ^= tmp
   940  	}
   941  }
   942  
   943  // p256SelectAffinePoint sets {out_x,out_y} to the index'th entry of table.
   944  // On entry: index < 16, table[0] must be zero.
   945  func p256SelectAffinePoint(xOut, yOut *[p256Limbs]uint32, table []uint32, index uint32) {
   946  	for i := range xOut {
   947  		xOut[i] = 0
   948  	}
   949  	for i := range yOut {
   950  		yOut[i] = 0
   951  	}
   952  
   953  	for i := uint32(1); i < 16; i++ {
   954  		mask := i ^ index
   955  		mask |= mask >> 2
   956  		mask |= mask >> 1
   957  		mask &= 1
   958  		mask--
   959  		for j := range xOut {
   960  			xOut[j] |= table[0] & mask
   961  			table = table[1:]
   962  		}
   963  		for j := range yOut {
   964  			yOut[j] |= table[0] & mask
   965  			table = table[1:]
   966  		}
   967  	}
   968  }
   969  
   970  // p256SelectJacobianPoint sets {out_x,out_y,out_z} to the index'th entry of
   971  // table.
   972  // On entry: index < 16, table[0] must be zero.
   973  func p256SelectJacobianPoint(xOut, yOut, zOut *[p256Limbs]uint32, table *[16][3][p256Limbs]uint32, index uint32) {
   974  	for i := range xOut {
   975  		xOut[i] = 0
   976  	}
   977  	for i := range yOut {
   978  		yOut[i] = 0
   979  	}
   980  	for i := range zOut {
   981  		zOut[i] = 0
   982  	}
   983  
   984  	// The implicit value at index 0 is all zero. We don't need to perform that
   985  	// iteration of the loop because we already set out_* to zero.
   986  	for i := uint32(1); i < 16; i++ {
   987  		mask := i ^ index
   988  		mask |= mask >> 2
   989  		mask |= mask >> 1
   990  		mask &= 1
   991  		mask--
   992  		for j := range xOut {
   993  			xOut[j] |= table[i][0][j] & mask
   994  		}
   995  		for j := range yOut {
   996  			yOut[j] |= table[i][1][j] & mask
   997  		}
   998  		for j := range zOut {
   999  			zOut[j] |= table[i][2][j] & mask
  1000  		}
  1001  	}
  1002  }
  1003  
  1004  // p256GetBit returns the bit'th bit of scalar.
  1005  func p256GetBit(scalar *[32]uint8, bit uint) uint32 {
  1006  	return uint32(((scalar[bit>>3]) >> (bit & 7)) & 1)
  1007  }
  1008  
  1009  // p256ScalarBaseMult sets {xOut,yOut,zOut} = scalar*G where scalar is a
  1010  // little-endian number. Note that the value of scalar must be less than the
  1011  // order of the group.
  1012  func p256ScalarBaseMult(xOut, yOut, zOut *[p256Limbs]uint32, scalar *[32]uint8) {
  1013  	nIsInfinityMask := ^uint32(0)
  1014  	var pIsNoninfiniteMask, mask, tableOffset uint32
  1015  	var px, py, tx, ty, tz [p256Limbs]uint32
  1016  
  1017  	for i := range xOut {
  1018  		xOut[i] = 0
  1019  	}
  1020  	for i := range yOut {
  1021  		yOut[i] = 0
  1022  	}
  1023  	for i := range zOut {
  1024  		zOut[i] = 0
  1025  	}
  1026  
  1027  	// The loop adds bits at positions 0, 64, 128 and 192, followed by
  1028  	// positions 32,96,160 and 224 and does this 32 times.
  1029  	for i := uint(0); i < 32; i++ {
  1030  		if i != 0 {
  1031  			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1032  		}
  1033  		tableOffset = 0
  1034  		for j := uint(0); j <= 32; j += 32 {
  1035  			bit0 := p256GetBit(scalar, 31-i+j)
  1036  			bit1 := p256GetBit(scalar, 95-i+j)
  1037  			bit2 := p256GetBit(scalar, 159-i+j)
  1038  			bit3 := p256GetBit(scalar, 223-i+j)
  1039  			index := bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3)
  1040  
  1041  			p256SelectAffinePoint(&px, &py, p256Precomputed[tableOffset:], index)
  1042  			tableOffset += 30 * p256Limbs
  1043  
  1044  			// Since scalar is less than the order of the group, we know that
  1045  			// {xOut,yOut,zOut} != {px,py,1}, unless both are zero, which we handle
  1046  			// below.
  1047  			p256PointAddMixed(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py)
  1048  			// The result of pointAddMixed is incorrect if {xOut,yOut,zOut} is zero
  1049  			// (a.k.a.  the point at infinity). We handle that situation by
  1050  			// copying the point from the table.
  1051  			p256CopyConditional(xOut, &px, nIsInfinityMask)
  1052  			p256CopyConditional(yOut, &py, nIsInfinityMask)
  1053  			p256CopyConditional(zOut, &p256One, nIsInfinityMask)
  1054  
  1055  			// Equally, the result is also wrong if the point from the table is
  1056  			// zero, which happens when the index is zero. We handle that by
  1057  			// only copying from {tx,ty,tz} to {xOut,yOut,zOut} if index != 0.
  1058  			pIsNoninfiniteMask = nonZeroToAllOnes(index)
  1059  			mask = pIsNoninfiniteMask & ^nIsInfinityMask
  1060  			p256CopyConditional(xOut, &tx, mask)
  1061  			p256CopyConditional(yOut, &ty, mask)
  1062  			p256CopyConditional(zOut, &tz, mask)
  1063  			// If p was not zero, then n is now non-zero.
  1064  			nIsInfinityMask &^= pIsNoninfiniteMask
  1065  		}
  1066  	}
  1067  }
  1068  
  1069  // p256PointToAffine converts a Jacobian point to an affine point. If the input
  1070  // is the point at infinity then it returns (0, 0) in constant time.
  1071  func p256PointToAffine(xOut, yOut, x, y, z *[p256Limbs]uint32) {
  1072  	var zInv, zInvSq [p256Limbs]uint32
  1073  
  1074  	p256Invert(&zInv, z)
  1075  	p256Square(&zInvSq, &zInv)
  1076  	p256Mul(xOut, x, &zInvSq)
  1077  	p256Mul(&zInv, &zInv, &zInvSq)
  1078  	p256Mul(yOut, y, &zInv)
  1079  }
  1080  
  1081  // p256ToAffine returns a pair of *big.Int containing the affine representation
  1082  // of {x,y,z}.
  1083  func p256ToAffine(x, y, z *[p256Limbs]uint32) (xOut, yOut *big.Int) {
  1084  	var xx, yy [p256Limbs]uint32
  1085  	p256PointToAffine(&xx, &yy, x, y, z)
  1086  	return p256ToBig(&xx), p256ToBig(&yy)
  1087  }
  1088  
  1089  // p256ScalarMult sets {xOut,yOut,zOut} = scalar*{x,y}.
  1090  func p256ScalarMult(xOut, yOut, zOut, x, y *[p256Limbs]uint32, scalar *[32]uint8) {
  1091  	var px, py, pz, tx, ty, tz [p256Limbs]uint32
  1092  	var precomp [16][3][p256Limbs]uint32
  1093  	var nIsInfinityMask, index, pIsNoninfiniteMask, mask uint32
  1094  
  1095  	// We precompute 0,1,2,... times {x,y}.
  1096  	precomp[1][0] = *x
  1097  	precomp[1][1] = *y
  1098  	precomp[1][2] = p256One
  1099  
  1100  	for i := 2; i < 16; i += 2 {
  1101  		p256PointDouble(&precomp[i][0], &precomp[i][1], &precomp[i][2], &precomp[i/2][0], &precomp[i/2][1], &precomp[i/2][2])
  1102  		p256PointAddMixed(&precomp[i+1][0], &precomp[i+1][1], &precomp[i+1][2], &precomp[i][0], &precomp[i][1], &precomp[i][2], x, y)
  1103  	}
  1104  
  1105  	for i := range xOut {
  1106  		xOut[i] = 0
  1107  	}
  1108  	for i := range yOut {
  1109  		yOut[i] = 0
  1110  	}
  1111  	for i := range zOut {
  1112  		zOut[i] = 0
  1113  	}
  1114  	nIsInfinityMask = ^uint32(0)
  1115  
  1116  	// We add in a window of four bits each iteration and do this 64 times.
  1117  	for i := 0; i < 64; i++ {
  1118  		if i != 0 {
  1119  			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1120  			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1121  			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1122  			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1123  		}
  1124  
  1125  		index = uint32(scalar[31-i/2])
  1126  		if (i & 1) == 1 {
  1127  			index &= 15
  1128  		} else {
  1129  			index >>= 4
  1130  		}
  1131  
  1132  		// See the comments in scalarBaseMult about handling infinities.
  1133  		p256SelectJacobianPoint(&px, &py, &pz, &precomp, index)
  1134  		p256PointAdd(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py, &pz)
  1135  		p256CopyConditional(xOut, &px, nIsInfinityMask)
  1136  		p256CopyConditional(yOut, &py, nIsInfinityMask)
  1137  		p256CopyConditional(zOut, &pz, nIsInfinityMask)
  1138  
  1139  		pIsNoninfiniteMask = nonZeroToAllOnes(index)
  1140  		mask = pIsNoninfiniteMask & ^nIsInfinityMask
  1141  		p256CopyConditional(xOut, &tx, mask)
  1142  		p256CopyConditional(yOut, &ty, mask)
  1143  		p256CopyConditional(zOut, &tz, mask)
  1144  		nIsInfinityMask &^= pIsNoninfiniteMask
  1145  	}
  1146  }
  1147  
  1148  // p256FromBig sets out = R*in.
  1149  func p256FromBig(out *[p256Limbs]uint32, in *big.Int) {
  1150  	tmp := new(big.Int).Lsh(in, 257)
  1151  	tmp.Mod(tmp, p256Params.P)
  1152  
  1153  	for i := 0; i < p256Limbs; i++ {
  1154  		if bits := tmp.Bits(); len(bits) > 0 {
  1155  			out[i] = uint32(bits[0]) & bottom29Bits
  1156  		} else {
  1157  			out[i] = 0
  1158  		}
  1159  		tmp.Rsh(tmp, 29)
  1160  
  1161  		i++
  1162  		if i == p256Limbs {
  1163  			break
  1164  		}
  1165  
  1166  		if bits := tmp.Bits(); len(bits) > 0 {
  1167  			out[i] = uint32(bits[0]) & bottom28Bits
  1168  		} else {
  1169  			out[i] = 0
  1170  		}
  1171  		tmp.Rsh(tmp, 28)
  1172  	}
  1173  }
  1174  
  1175  // p256ToBig returns a *big.Int containing the value of in.
  1176  func p256ToBig(in *[p256Limbs]uint32) *big.Int {
  1177  	result, tmp := new(big.Int), new(big.Int)
  1178  
  1179  	result.SetInt64(int64(in[p256Limbs-1]))
  1180  	for i := p256Limbs - 2; i >= 0; i-- {
  1181  		if (i & 1) == 0 {
  1182  			result.Lsh(result, 29)
  1183  		} else {
  1184  			result.Lsh(result, 28)
  1185  		}
  1186  		tmp.SetInt64(int64(in[i]))
  1187  		result.Add(result, tmp)
  1188  	}
  1189  
  1190  	result.Mul(result, p256RInverse)
  1191  	result.Mod(result, p256Params.P)
  1192  	return result
  1193  }