github.com/hxx258456/ccgo@v0.0.5-0.20230213014102-48b35f46f66f/sm2/p256.go (about)

     1  //go:build !amd64 && !arm64
     2  // +build !amd64,!arm64
     3  
     4  package sm2
     5  
     6  import (
     7  	"crypto/elliptic"
     8  	"fmt"
     9  	"math/big"
    10  )
    11  
    12  // See https://www.imperialviolet.org/2010/12/04/ecc.html ([1]) for background.
    13  // See also "Efficient and Secure Elliptic Curve Cryptography Implementation of Curve P-256" for group-level optimizations.
    14  // The SM2 P-256 curve parameters are defined in GB/T 32918.5-2017 (part 5).
    15  
    16  type p256Curve struct {
    17  	*elliptic.CurveParams
    18  }
    19  
    20  var (
    21  	p256Params *elliptic.CurveParams
    22  
    23  	// p256RInverse contains 1/R mod p, the inverse of the Montgomery
    24  	// constant R = 2**257.
    25  	p256RInverse *big.Int
    26  )
    27  
    28  func initP256() {
    29  	p256Params = &elliptic.CurveParams{Name: "SM2-P-256"}
    30  	// 2**256 - 2**224 - 2**96 + 2**64 - 1
    31  	p256Params.P, _ = new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)
    32  	p256Params.N, _ = new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123", 16)
    33  	p256Params.B, _ = new(big.Int).SetString("28E9FA9E9D9F5E344D5A9E4BCF6509A7F39789F515AB8F92DDBCBD414D940E93", 16)
    34  	p256Params.Gx, _ = new(big.Int).SetString("32C4AE2C1F1981195F9904466A39C9948FE30BBFF2660BE1715A4589334C74C7", 16)
    35  	p256Params.Gy, _ = new(big.Int).SetString("BC3736A2F4F6779C59BDCEE36B692153D0A9877CC62A474002DF32E52139F0A0", 16)
    36  	p256Params.BitSize = 256
    37  
    38  	// ModInverse(2**257, P)
    39  	// p256RInverse = big.NewInt(0)
    40  	// r, _ := new(big.Int).SetString("20000000000000000000000000000000000000000000000000000000000000000", 16)
    41  	// p256RInverse.ModInverse(r, p256.P)
    42  	// fmt.Printf("%s\n", hex.EncodeToString(p256RInverse.Bytes()))
    43  	p256RInverse, _ = new(big.Int).SetString("7ffffffd80000002fffffffe000000017ffffffe800000037ffffffc80000002", 16)
    44  
    45  	// Arch-specific initialization: lets the platform dynamically pick an optimized P-256 implementation.
    46  	initP256Arch()
    47  }
    48  
    49  func (curve p256Curve) Params() *elliptic.CurveParams {
    50  	return curve.CurveParams
    51  }
    52  
    53  // p256GetScalar endian-swaps the big-endian scalar value from in and writes it
    54  // to out. If the scalar is equal to or greater than the order of the group, it
    55  // is reduced modulo that order.
    56  func p256GetScalar(out *[32]byte, in []byte) {
    57  	n := new(big.Int).SetBytes(in)
    58  	var scalarBytes []byte
    59  
    60  	if n.Cmp(p256.N) >= 0 || len(in) > len(out) {
    61  		n.Mod(n, p256.N)
    62  		scalarBytes = n.Bytes()
    63  	} else {
    64  		scalarBytes = in
    65  	}
    66  
    67  	for i, v := range scalarBytes {
    68  		out[len(scalarBytes)-(1+i)] = v
    69  	}
    70  }
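
        // Illustrative only (not used by the implementation): reading back the
        // little-endian bytes written by p256GetScalar recovers the input scalar
        // reduced modulo the group order. The helper name is hypothetical; it
        // assumes initP256 has run and len(in) <= 32.
        func exampleScalarRoundTrip(in []byte) *big.Int {
        	var reversed [32]byte
        	p256GetScalar(&reversed, in)

        	// Undo the byte reversal so the value reads as big-endian again; the
        	// result equals new(big.Int).SetBytes(in) reduced mod p256.N.
        	var be [32]byte
        	for i, v := range reversed {
        		be[31-i] = v
        	}
        	return new(big.Int).SetBytes(be[:])
        }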
    71  
    72  func (p256Curve) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
    73  	var scalarReversed [32]byte
    74  	p256GetScalar(&scalarReversed, scalar)
    75  
    76  	var x1, y1, z1 [p256Limbs]uint32
    77  	p256ScalarBaseMult(&x1, &y1, &z1, &scalarReversed)
    78  	return p256ToAffine(&x1, &y1, &z1)
    79  }
    80  
    81  func (p256Curve) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
    82  	var scalarReversed [32]byte
    83  	p256GetScalar(&scalarReversed, scalar)
    84  
    85  	var px, py, x1, y1, z1 [p256Limbs]uint32
    86  	p256FromBig(&px, bigX)
    87  	p256FromBig(&py, bigY)
    88  	p256ScalarMult(&x1, &y1, &z1, &px, &py, &scalarReversed)
    89  	return p256ToAffine(&x1, &y1, &z1)
    90  }
    91  
    92  // Field elements are represented as nine unsigned 32-bit words.
    93  //
    94  // The value of a field element is:
    95  //   x[0] + (x[1] * 2**29) + (x[2] * 2**57) + (x[3] * 2**86) + (x[4] * 2**114) + (x[5] * 2**143) + (x[6] * 2**171) + (x[7] * 2**200) + (x[8] * 2**228)
    96  //
    97  // That is, the limbs are alternately 29 and 28 bits wide, in little-endian
    98  // order.
    99  //
   100  // This means that a field element hits 2**257, rather than 2**256 as we would
   101  // like. A 28, 29, ... pattern would cause us to hit 2**256, but that causes
   102  // problems when multiplying as terms end up one bit short of a limb which
   103  // would require much bit-shifting to correct.
   104  //
   105  // Finally, the values stored in a field element are in Montgomery form. So the
   106  // value |y| is stored as (y*R) mod p, where p is the SM2 P-256 prime and R is
   107  // 2**257.
   108  
   109  const (
   110  	p256Limbs    = 9
   111  	bottom28Bits = 0xfffffff
   112  	bottom29Bits = 0x1fffffff
   113  )
   114  
   115  var (
   116  	// p256One is the number 1 as a field element.
   117  	p256One  = [p256Limbs]uint32{2, 0, 0x1fffff00, 0x7ff, 0, 0, 0, 0x2000000, 0}
   118  	p256Zero = [p256Limbs]uint32{0, 0, 0, 0, 0, 0, 0, 0, 0}
   119  	// p256P is the prime modulus as a field element.
   120  	p256P = [p256Limbs]uint32{0x1fffffff, 0xfffffff, 0x7f, 0xffffc00, 0x1fffffff, 0xfffffff, 0x1fffffff, 0xeffffff, 0xfffffff}
   121  	// p2562P is twice the prime modulus as a field element.
   122  	p2562P = [p256Limbs]uint32{0x1ffffffe, 0xfffffff, 0xff, 0xffff800, 0x1fffffff, 0xfffffff, 0x1fffffff, 0xdffffff, 0x1fffffff}
   123  	// p256b is the curve parameter b as a field element.
   124  	p256b = [p256Limbs]uint32{0x1781ba84, 0xd230632, 0x1537ab90, 0x9bcd74d, 0xe1e38e7, 0x5417a94, 0x12149e60, 0x17441c5, 0x481fc31}
   125  )
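
        // Illustrative only (not used by the implementation): a round trip through
        // the limb representation described above. p256FromBig stores in*R mod p and
        // p256ToBig multiplies by p256RInverse, so the Montgomery factor cancels.
        // The helper name is hypothetical; it assumes initP256 has run.
        func exampleLimbRoundTrip(v *big.Int) *big.Int {
        	var limbs [p256Limbs]uint32
        	p256FromBig(&limbs, v)   // limbs now hold v*R mod p, packed into 29/28-bit words
        	return p256ToBig(&limbs) // recovers v mod p
        }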
   126  
   127  // p256Precomputed contains precomputed values to aid the calculation of scalar
   128  // multiples of the base point, G. It's actually two, equal length, tables
   129  // concatenated.
   130  //
   131  // The first table contains (x,y) field element pairs for 16 multiples of the
   132  // base point, G.
   133  //
   134  //   Index  |  Index (binary) | Value
   135  //       0  |           0000  | 0G (all zeros, omitted)
   136  //       1  |           0001  | G
   137  //       2  |           0010  | 2**64G
   138  //       3  |           0011  | 2**64G + G
   139  //       4  |           0100  | 2**128G
   140  //       5  |           0101  | 2**128G + G
   141  //       6  |           0110  | 2**128G + 2**64G
   142  //       7  |           0111  | 2**128G + 2**64G + G
   143  //       8  |           1000  | 2**192G
   144  //       9  |           1001  | 2**192G + G
   145  //      10  |           1010  | 2**192G + 2**64G
   146  //      11  |           1011  | 2**192G + 2**64G + G
   147  //      12  |           1100  | 2**192G + 2**128G
   148  //      13  |           1101  | 2**192G + 2**128G + G
   149  //      14  |           1110  | 2**192G + 2**128G + 2**64G
   150  //      15  |           1111  | 2**192G + 2**128G + 2**64G + G
   151  //
   152  // The second table follows the same style, but the terms are 2**32G,
   153  // 2**96G, 2**160G, 2**224G. The binary index below selects among those terms.
   154  //      16  |           0001  | 2**32G
   155  //      17  |           0010  | 2**96G
   156  //      18  |           0011  | 2**96G + 2**32G
   157  //      19  |           0100  | 2**160G
   158  //      20  |           0101  | 2**160G + 2**32G
   159  //      21  |           0110  | 2**160G + 2**96G
   160  //      22  |           0111  | 2**160G + 2**96G + 2**32G
   161  //      23  |           1000  | 2**224G
   162  //      24  |           1001  | 2**224G + 2**32G
   163  //      25  |           1010  | 2**224G + 2**96G
   164  //      26  |           1011  | 2**224G + 2**96G + 2**32G
   165  //      27  |           1100  | 2**224G + 2**160G
   166  //      28  |           1101  | 2**224G + 2**160G + 2**32G
   167  //      29  |           1110  | 2**224G + 2**160G + 2**96G
   168  //      30  |           1111  | 2**224G + 2**160G + 2**96G + 2**32G
   169  // This is ~2KB of data.
   170  // The table was generated with precompute(1) (first half) and
   171  // precompute(2**32) (second half); see the precompute helper below.
   172  var p256Precomputed = [p256Limbs * 2 * 15 * 2]uint32{
   173  	0x830053d, 0x328990f, 0x6c04fe1, 0xc0f72e5, 0x1e19f3c, 0x666b093, 0x175a87b, 0xec38276, 0x222cf4b,
   174  	0x185a1bba, 0x354e593, 0x1295fac1, 0xf2bc469, 0x47c60fa, 0xc19b8a9, 0xf63533e, 0x903ae6b, 0xc79acba,
   175  	0x15b061a4, 0x33e020b, 0xdffb34b, 0xfcf2c8, 0x16582e08, 0x262f203, 0xfb34381, 0xa55452, 0x604f0ff,
   176  	0x41f1f90, 0xd64ced2, 0xee377bf, 0x75f05f0, 0x189467ae, 0xe2244e, 0x1e7700e8, 0x3fbc464, 0x9612d2e,
   177  	0x1341b3b8, 0xee84e23, 0x1edfa5b4, 0x14e6030, 0x19e87be9, 0x92f533c, 0x1665d96c, 0x226653e, 0xa238d3e,
   178  	0xf5c62c, 0x95bb7a, 0x1f0e5a41, 0x28789c3, 0x1f251d23, 0x8726609, 0xe918910, 0x8096848, 0xf63d028,
   179  	0x152296a1, 0x9f561a8, 0x14d376fb, 0x898788a, 0x61a95fb, 0xa59466d, 0x159a003d, 0x1ad1698, 0x93cca08,
   180  	0x1b314662, 0x706e006, 0x11ce1e30, 0x97b710, 0x172fbc0d, 0x8f50158, 0x11c7ffe7, 0xd182cce, 0xc6ad9e8,
   181  	0x12ea31b2, 0xc4e4f38, 0x175b0d96, 0xec06337, 0x75a9c12, 0xb001fdf, 0x93e82f5, 0x34607de, 0xb8035ed,
   182  	0x17f97924, 0x75cf9e6, 0xdceaedd, 0x2529924, 0x1a10c5ff, 0xb1a54dc, 0x19464d8, 0x2d1997, 0xde6a110,
   183  	0x1e276ee5, 0x95c510c, 0x1aca7c7a, 0xfe48aca, 0x121ad4d9, 0xe4132c6, 0x8239b9d, 0x40ea9cd, 0x816c7b,
   184  	0x632d7a4, 0xa679813, 0x5911fcf, 0x82b0f7c, 0x57b0ad5, 0xbef65, 0xd541365, 0x7f9921f, 0xc62e7a,
   185  	0x3f4b32d, 0x58e50e1, 0x6427aed, 0xdcdda67, 0xe8c2d3e, 0x6aa54a4, 0x18df4c35, 0x49a6a8e, 0x3cd3d0c,
   186  	0xd7adf2, 0xcbca97, 0x1bda5f2d, 0x3258579, 0x606b1e6, 0x6fc1b5b, 0x1ac27317, 0x503ca16, 0xa677435,
   187  	0x57bc73, 0x3992a42, 0xbab987b, 0xfab25eb, 0x128912a4, 0x90a1dc4, 0x1402d591, 0x9ffbcfc, 0xaa48856,
   188  	0x7a7c2dc, 0xcefd08a, 0x1b29bda6, 0xa785641, 0x16462d8c, 0x76241b7, 0x79b6c3b, 0x204ae18, 0xf41212b,
   189  	0x1f567a4d, 0xd6ce6db, 0xedf1784, 0x111df34, 0x85d7955, 0x55fc189, 0x1b7ae265, 0xf9281ac, 0xded7740,
   190  	0xf19468b, 0x83763bb, 0x8ff7234, 0x3da7df8, 0x9590ac3, 0xdc96f2a, 0x16e44896, 0x7931009, 0x99d5acc,
   191  	0x10f7b842, 0xaef5e84, 0xc0310d7, 0xdebac2c, 0x2a7b137, 0x4342344, 0x19633649, 0x3a10624, 0x4b4cb56,
   192  	0x1d809c59, 0xac007f, 0x1f0f4bcd, 0xa1ab06e, 0xc5042cf, 0x82c0c77, 0x76c7563, 0x22c30f3, 0x3bf1568,
   193  	0x7a895be, 0xfcca554, 0x12e90e4c, 0x7b4ab5f, 0x13aeb76b, 0x5887e2c, 0x1d7fe1e3, 0x908c8e3, 0x95800ee,
   194  	0xb36bd54, 0xf08905d, 0x4e73ae8, 0xf5a7e48, 0xa67cb0, 0x50e1067, 0x1b944a0a, 0xf29c83a, 0xb23cfb9,
   195  	0xbe1db1, 0x54de6e8, 0xd4707f2, 0x8ebcc2d, 0x2c77056, 0x1568ce4, 0x15fcc849, 0x4069712, 0xe2ed85f,
   196  	0x2c5ff09, 0x42a6929, 0x628e7ea, 0xbd5b355, 0xaf0bd79, 0xaa03699, 0xdb99816, 0x4379cef, 0x81d57b,
   197  	0x11237f01, 0xe2a820b, 0xfd53b95, 0x6beb5ee, 0x1aeb790c, 0xe470d53, 0x2c2cfee, 0x1c1d8d8, 0xa520fc4,
   198  	0x1518e034, 0xa584dd4, 0x29e572b, 0xd4594fc, 0x141a8f6f, 0x8dfccf3, 0x5d20ba3, 0x2eb60c3, 0x9f16eb0,
   199  	0x11cec356, 0xf039f84, 0x1b0990c1, 0xc91e526, 0x10b65bae, 0xf0616e8, 0x173fa3ff, 0xec8ccf9, 0xbe32790,
   200  	0x11da3e79, 0xe2f35c7, 0x908875c, 0xdacf7bd, 0x538c165, 0x8d1487f, 0x7c31aed, 0x21af228, 0x7e1689d,
   201  	0xdfc23ca, 0x24f15dc, 0x25ef3c4, 0x35248cd, 0x99a0f43, 0xa4b6ecc, 0xd066b3, 0x2481152, 0x37a7688,
   202  	0x15a444b6, 0xb62300c, 0x4b841b, 0xa655e79, 0xd53226d, 0xbeb348a, 0x127f3c2, 0xb989247, 0x71a277d,
   203  	0x19e9dfcb, 0xb8f92d0, 0xe2d226c, 0x390a8b0, 0x183cc462, 0x7bd8167, 0x1f32a552, 0x5e02db4, 0xa146ee9,
   204  	0x1a003957, 0x1c95f61, 0x1eeec155, 0x26f811f, 0xf9596ba, 0x3082bfb, 0x96df083, 0x3e3a289, 0x7e2d8be,
   205  	0x157a63e0, 0x99b8941, 0x1da7d345, 0xcc6cd0, 0x10beed9a, 0x48e83c0, 0x13aa2e25, 0x7cad710, 0x4029988,
   206  	0x13dfa9dd, 0xb94f884, 0x1f4adfef, 0xb88543, 0x16f5f8dc, 0xa6a67f4, 0x14e274e2, 0x5e56cf4, 0x2f24ef,
   207  	0x1e9ef967, 0xfe09bad, 0xfe079b3, 0xcc0ae9e, 0xb3edf6d, 0x3e961bc, 0x130d7831, 0x31043d6, 0xba986f9,
   208  	0x1d28055, 0x65240ca, 0x4971fa3, 0x81b17f8, 0x11ec34a5, 0x8366ddc, 0x1471809, 0xfa5f1c6, 0xc911e15,
   209  	0x8849491, 0xcf4c2e2, 0x14471b91, 0x39f75be, 0x445c21e, 0xf1585e9, 0x72cc11f, 0x4c79f0c, 0xe5522e1,
   210  	0x1874c1ee, 0x4444211, 0x7914884, 0x3d1b133, 0x25ba3c, 0x4194f65, 0x1c0457ef, 0xac4899d, 0xe1fa66c,
   211  	0x130a7918, 0x9b8d312, 0x4b1c5c8, 0x61ccac3, 0x18c8aa6f, 0xe93cb0a, 0xdccb12c, 0xde10825, 0x969737d,
   212  	0xf58c0c3, 0x7cee6a9, 0xc2c329a, 0xc7f9ed9, 0x107b3981, 0x696a40e, 0x152847ff, 0x4d88754, 0xb141f47,
   213  	0x5a16ffe, 0x3a7870a, 0x18667659, 0x3b72b03, 0xb1c9435, 0x9285394, 0xa00005a, 0x37506c, 0x2edc0bb,
   214  	0x19afe392, 0xeb39cac, 0x177ef286, 0xdf87197, 0x19f844ed, 0x31fe8, 0x15f9bfd, 0x80dbec, 0x342e96e,
   215  	0x497aced, 0xe88e909, 0x1f5fa9ba, 0x530a6ee, 0x1ef4e3f1, 0x69ffd12, 0x583006d, 0x2ecc9b1, 0x362db70,
   216  	0x18c7bdc5, 0xf4bb3c5, 0x1c90b957, 0xf067c09, 0x9768f2b, 0xf73566a, 0x1939a900, 0x198c38a, 0x202a2a1,
   217  	0x4bbf5a6, 0x4e265bc, 0x1f44b6e7, 0x185ca49, 0xa39e81b, 0x24aff5b, 0x4acc9c2, 0x638bdd3, 0xb65b2a8,
   218  	0x6def8be, 0xb94537a, 0x10b81dee, 0xe00ec55, 0x2f2cdf7, 0xc20622d, 0x2d20f36, 0xe03c8c9, 0x898ea76,
   219  	0x8e3921b, 0x8905bff, 0x1e94b6c8, 0xee7ad86, 0x154797f2, 0xa620863, 0x3fbd0d9, 0x1f3caab, 0x30c24bd,
   220  	0x19d3892f, 0x59c17a2, 0x1ab4b0ae, 0xf8714ee, 0x90c4098, 0xa9c800d, 0x1910236b, 0xea808d3, 0x9ae2f31,
   221  	0x1a15ad64, 0xa48c8d1, 0x184635a4, 0xb725ef1, 0x11921dcc, 0x3f866df, 0x16c27568, 0xbdf580a, 0xb08f55c,
   222  	0x186ee1c, 0xb1627fa, 0x34e82f6, 0x933837e, 0xf311be5, 0xfedb03b, 0x167f72cd, 0xa5469c0, 0x9c82531,
   223  	0xb92a24b, 0x14fdc8b, 0x141980d1, 0xbdc3a49, 0x7e02bb1, 0xaf4e6dd, 0x106d99e1, 0xd4616fc, 0x93c2717,
   224  	0x1c0a0507, 0xc6d5fed, 0x9a03d8b, 0xa1d22b0, 0x127853e3, 0xc4ac6b8, 0x1a048cf7, 0x9afb72c, 0x65d485d,
   225  	0x72d5998, 0xe9fa744, 0xe49e82c, 0x253cf80, 0x5f777ce, 0xa3799a5, 0x17270cbb, 0xc1d1ef0, 0xdf74977,
   226  	0x114cb859, 0xfa8e037, 0xb8f3fe5, 0xc734cc6, 0x70d3d61, 0xeadac62, 0x12093dd0, 0x9add67d, 0x87200d6,
   227  	0x175bcbb, 0xb29b49f, 0x1806b79c, 0x12fb61f, 0x170b3a10, 0x3aaf1cf, 0xa224085, 0x79d26af, 0x97759e2,
   228  	0x92e19f1, 0xb32714d, 0x1f00d9f1, 0xc728619, 0x9e6f627, 0xe745e24, 0x18ea4ace, 0xfc60a41, 0x125f5b2,
   229  	0xc3cf512, 0x39ed486, 0xf4d15fa, 0xf9167fd, 0x1c1f5dd5, 0xc21a53e, 0x1897930, 0x957a112, 0x21059a0,
   230  	0x1f9e3ddc, 0xa4dfced, 0x8427f6f, 0x726fbe7, 0x1ea658f8, 0x2fdcd4c, 0x17e9b66f, 0xb2e7c2e, 0x39923bf,
   231  	0x1bae104, 0x3973ce5, 0xc6f264c, 0x3511b84, 0x124195d7, 0x11996bd, 0x20be23d, 0xdc437c4, 0x4b4f16b,
   232  	0x11902a0, 0x6c29cc9, 0x1d5ffbe6, 0xdb0b4c7, 0x10144c14, 0x2f2b719, 0x301189, 0x2343336, 0xa0bf2ac,
   233  }
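
        // Illustrative only: the layout consumed by p256SelectAffinePoint. Each half
        // of p256Precomputed holds 15 affine points, stored as nine x limbs followed
        // by nine y limbs. exampleAffineEntry (a hypothetical helper) copies out
        // entry i (1 <= i <= 15) of one half.
        func exampleAffineEntry(table []uint32, i int) (x, y [p256Limbs]uint32) {
        	off := (i - 1) * 2 * p256Limbs
        	copy(x[:], table[off:off+p256Limbs])
        	copy(y[:], table[off+p256Limbs:off+2*p256Limbs])
        	return
        }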
   234  
   235  func precompute(params *elliptic.CurveParams, base *big.Int) {
   236  	// Generation helper: prints the limb form of base, 2**64*base, 2**128*base, 2**192*base and all of their sums, in p256Precomputed order.
   237  	var values [4]*big.Int
   238  
   239  	values[0] = base
   240  	for i := 1; i < 4; i++ {
   241  		values[i] = new(big.Int)
   242  		values[i].Lsh(values[i-1], 64)
   243  	}
   244  	for i := 0; i < 4; i++ {
   245  		x, y := params.ScalarBaseMult(values[i].Bytes())
   246  		printPoint(params, x, y)
   247  		v := new(big.Int)
   248  		switch i {
   249  		case 1:
   250  			v.Add(values[0], values[1])
   251  			x, y := params.ScalarBaseMult(v.Bytes())
   252  			printPoint(params, x, y)
   253  		case 2:
   254  			v.Add(values[0], values[2])
   255  			x, y := params.ScalarBaseMult(v.Bytes())
   256  			printPoint(params, x, y)
   257  			v.Add(values[1], values[2])
   258  			x, y = params.ScalarBaseMult(v.Bytes())
   259  			printPoint(params, x, y)
   260  			v.Add(values[0], v)
   261  			x, y = params.ScalarBaseMult(v.Bytes())
   262  			printPoint(params, x, y)
   263  		case 3:
   264  			v.Add(values[0], values[3])
   265  			x, y := params.ScalarBaseMult(v.Bytes())
   266  			printPoint(params, x, y)
   267  			v.Add(values[1], values[3])
   268  			x, y = params.ScalarBaseMult(v.Bytes())
   269  			printPoint(params, x, y)
   270  			v.Add(values[0], v)
   271  			x, y = params.ScalarBaseMult(v.Bytes())
   272  			printPoint(params, x, y)
   273  			v.Add(values[2], values[3])
   274  			x, y = params.ScalarBaseMult(v.Bytes())
   275  			printPoint(params, x, y)
   276  			v.Add(values[0], v)
   277  			x, y = params.ScalarBaseMult(v.Bytes())
   278  			printPoint(params, x, y)
   279  			v.Add(values[2], values[3])
   280  			v.Add(v, values[1])
   281  			x, y = params.ScalarBaseMult(v.Bytes())
   282  			printPoint(params, x, y)
   283  			v.Add(v, values[0])
   284  			x, y = params.ScalarBaseMult(v.Bytes())
   285  			printPoint(params, x, y)
   286  		}
   287  	}
   288  }
   289  
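        // printPoint prints the limb (field element) form of x and then y; it is a
        // debug/generation helper used when rebuilding p256Precomputed.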
   290  func printPoint(params *elliptic.CurveParams, x, y *big.Int) {
   291  	var out [p256Limbs]uint32
   292  	p256FromBigAgainstP(&out, x, params.P)
   293  	printp256Limbs(&out)
   294  	p256FromBigAgainstP(&out, y, params.P)
   295  	printp256Limbs(&out)
   296  }
   297  
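        // printp256Limbs prints the nine limbs of a field element as a row of hex literals.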
   298  func printp256Limbs(one *[p256Limbs]uint32) {
   299  	for i := 0; i < p256Limbs; i++ {
   300  		fmt.Printf("0x%x, ", one[i])
   301  	}
   302  	fmt.Println()
   303  }
   304  
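        // print1to7 prints the field-element (Montgomery) form of the integers 1..7;
        // another debug/generation helper.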
   305  func print1to7(params *elliptic.CurveParams) {
   306  	var out [p256Limbs]uint32
   307  	for i := 1; i < 8; i++ {
   308  		value := big.NewInt(int64(i))
   309  		p256FromBigAgainstP(&out, value, params.P)
   310  		printp256Limbs(&out)
   311  	}
   312  }
   313  
   314  // Field element operations:
   315  
   316  // nonZeroToAllOnes returns:
   317  //   0xffffffff for 0 < x <= 2**31
   318  //   0 for x == 0 or x > 2**31.
   319  func nonZeroToAllOnes(x uint32) uint32 {
   320  	return ((x - 1) >> 31) - 1
   321  }
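
        // Illustrative only: how nonZeroToAllOnes is used below to build branch-free
        // selections. exampleSelect (hypothetical) returns a when sel is a non-zero
        // value <= 2**31 and b when sel is zero, without a data-dependent branch.
        func exampleSelect(a, b, sel uint32) uint32 {
        	mask := nonZeroToAllOnes(sel) // 0xffffffff or 0
        	return (a & mask) | (b &^ mask)
        }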
   322  
   323  // p256ReduceCarry adds a multiple of p in order to cancel |carry|,
   324  // which is a term at 2**257.
   325  //
   326  // On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28.
   327  // On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29.
   328  func p256ReduceCarry(inout *[p256Limbs]uint32, carry uint32) {
   329  	carryMask := nonZeroToAllOnes(carry)
   330  	inout[0] += carry << 1
   331  	// 2**30 = 0x40000000, this doesn't underflow
   332  	inout[2] -= carry << 8
   333  	inout[2] += 0x20000000 & carryMask
   334  
   335  	inout[3] -= 1 & carryMask
   336  	inout[3] += carry << 11
   337  
   338  	// 2**29 = 0x20000000, this doesn't underflow: 0xfffffff + 0x2000000 = 0x11ffffff < 0x20000000
   339  	inout[7] += carry << 25
   340  }
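
        // Illustrative check (not part of the implementation): p256ReduceCarry adds
        // carry * (2**257 mod p) to the limbs, so the represented value is unchanged
        // modulo p. exampleReduceCarryInvariant (hypothetical) verifies this with
        // big.Int, assuming initP256 has run and the entry conditions above hold.
        func exampleReduceCarryInvariant(in [p256Limbs]uint32, carry uint32) bool {
        	before := limbsToBig(&in)
        	before.Add(before, new(big.Int).Lsh(big.NewInt(int64(carry)), 257))
        	p256ReduceCarry(&in, carry)
        	after := limbsToBig(&in)
        	return new(big.Int).Mod(before, p256.P).Cmp(new(big.Int).Mod(after, p256.P)) == 0
        }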
   341  
   342  // p256Sum sets out = in+in2.
   343  //
   344  // On entry, in[i]+in2[i] must not overflow a 32-bit word.
   345  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
   346  func p256Sum(out, in, in2 *[p256Limbs]uint32) {
   347  	carry := uint32(0)
   348  	for i := 0; ; i++ {
   349  		out[i] = in[i] + in2[i]
   350  		out[i] += carry
   351  		carry = out[i] >> 29
   352  		out[i] &= bottom29Bits
   353  
   354  		i++
   355  		if i == p256Limbs {
   356  			break
   357  		}
   358  
   359  		out[i] = in[i] + in2[i]
   360  		out[i] += carry
   361  		carry = out[i] >> 28
   362  		out[i] &= bottom28Bits
   363  	}
   364  
   365  	p256ReduceCarry(out, carry)
   366  }
   367  
   368  // p256Zero31 is 0 mod p.
   369  // {two31m3, two30m2, two31p10m2, two30m13m2, two31m2, two30m2, two31m2, two30m27m2, two31m2}
   370  var p256Zero31 = [p256Limbs]uint32{0x7FFFFFF8, 0x3FFFFFFC, 0x800003FC, 0x3FFFDFFC, 0x7FFFFFFC, 0x3FFFFFFC, 0x7FFFFFFC, 0x37FFFFFC, 0x7FFFFFFC}
   371  
   372  func limbsToBig(in *[p256Limbs]uint32) *big.Int {
   373  	result, tmp := new(big.Int), new(big.Int)
   374  
   375  	result.SetInt64(int64(in[p256Limbs-1]))
   376  	for i := p256Limbs - 2; i >= 0; i-- {
   377  		if (i & 1) == 0 {
   378  			result.Lsh(result, 29)
   379  		} else {
   380  			result.Lsh(result, 28)
   381  		}
   382  		tmp.SetInt64(int64(in[i]))
   383  		result.Add(result, tmp)
   384  	}
   385  	return result
   386  }
   387  
   388  // p256GetZero31 calculates the limbs of p256Zero31.
   389  func p256GetZero31(out *[p256Limbs]uint32) {
   390  	tmp := big.NewInt(0)
   391  	result := limbsToBig(&[p256Limbs]uint32{1 << 31, 1 << 30, 1 << 31, 1 << 30, 1 << 31, 1 << 30, 1 << 31, 1 << 30, 1 << 31})
   392  	tmp = tmp.Mod(result, p256.P)
   393  	tmp = tmp.Sub(result, tmp)
   394  	for i := 0; i < 9; i++ {
   395  		if bits := tmp.Bits(); len(bits) > 0 {
   396  			out[i] = uint32(bits[0]) & 0x7fffffff
   397  			if out[i] < 0x70000000 {
   398  				out[i] += 0x80000000
   399  			}
   400  		} else {
   401  			out[i] = 0x80000000
   402  		}
   403  		tmp.Sub(tmp, big.NewInt(int64(out[i])))
   404  		tmp.Rsh(tmp, 29)
   405  		i++
   406  		if i == p256Limbs {
   407  			break
   408  		}
   409  
   410  		if bits := tmp.Bits(); len(bits) > 0 {
   411  			out[i] = uint32(bits[0]) & 0x3fffffff
   412  			if out[i] < 0x30000000 {
   413  				out[i] += 0x40000000
   414  			}
   415  		} else {
   416  			out[i] = 0x40000000
   417  		}
   418  		tmp.Sub(tmp, big.NewInt(int64(out[i])))
   419  		tmp.Rsh(tmp, 28)
   420  	}
   421  }
   422  
   423  // p256Diff sets out = in-in2.
   424  //
   425  // On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
   426  //           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
   427  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   428  func p256Diff(out, in, in2 *[p256Limbs]uint32) {
   429  	var carry uint32
   430  
   431  	for i := 0; ; i++ {
   432  		out[i] = in[i] - in2[i]
   433  		out[i] += p256Zero31[i]
   434  		out[i] += carry
   435  		carry = out[i] >> 29
   436  		out[i] &= bottom29Bits
   437  		i++
   438  		if i == p256Limbs {
   439  			break
   440  		}
   441  
   442  		out[i] = in[i] - in2[i]
   443  		out[i] += p256Zero31[i]
   444  		out[i] += carry
   445  		carry = out[i] >> 28
   446  		out[i] &= bottom28Bits
   447  	}
   448  
   449  	p256ReduceCarry(out, carry)
   450  }
   451  
   452  // p256ReduceDegree sets out = tmp/R mod p where tmp contains 64-bit words with
   453  // the same 29,28,... bit positions as a field element.
   454  //
   455  // The values in field elements are in Montgomery form: x*R mod p where R =
   456  // 2**257. Since we just multiplied two Montgomery values together, the result
   457  // is x*y*R*R mod p. We wish to divide by R in order for the result also to be
   458  // in Montgomery form.
   459  //
   460  // On entry: tmp[i] < 2**64
   461  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
   462  func p256ReduceDegree(out *[p256Limbs]uint32, tmp [17]uint64) {
   463  	// The following table may be helpful when reading this code:
   464  	//
   465  	// Limb number:   0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10...
   466  	// Width (bits):  29| 28| 29| 28| 29| 28| 29| 28| 29| 28| 29
   467  	// Start bit:     0 | 29| 57| 86|114|143|171|200|228|257|285
   468  	//   (odd phase): 0 | 28| 57| 85|114|142|171|199|228|256|285
   469  	var tmp2 [18]uint32
   470  	var carry, x, xMask uint32
   471  
   472  	// tmp contains 64-bit words with the same 29,28,... bit positions as a
   473  	// field element. So the top of an element of tmp might overlap with
   474  	// another element two positions down. The following loop eliminates
   475  	// this overlap.
   476  	tmp2[0] = uint32(tmp[0]) & bottom29Bits
   477  
   478  	tmp2[1] = uint32(tmp[0]) >> 29
   479  	tmp2[1] |= (uint32(tmp[0]>>32) << 3) & bottom28Bits
   480  	tmp2[1] += uint32(tmp[1]) & bottom28Bits
   481  	carry = tmp2[1] >> 28
   482  	tmp2[1] &= bottom28Bits
   483  
   484  	for i := 2; i < 17; i++ {
   485  		tmp2[i] = (uint32(tmp[i-2] >> 32)) >> 25
   486  		tmp2[i] += (uint32(tmp[i-1])) >> 28
   487  		tmp2[i] += (uint32(tmp[i-1]>>32) << 4) & bottom29Bits
   488  		tmp2[i] += uint32(tmp[i]) & bottom29Bits
   489  		tmp2[i] += carry
   490  		carry = tmp2[i] >> 29
   491  		tmp2[i] &= bottom29Bits
   492  
   493  		i++
   494  		if i == 17 {
   495  			break
   496  		}
   497  		tmp2[i] = uint32(tmp[i-2]>>32) >> 25
   498  		tmp2[i] += uint32(tmp[i-1]) >> 29
   499  		tmp2[i] += ((uint32(tmp[i-1] >> 32)) << 3) & bottom28Bits
   500  		tmp2[i] += uint32(tmp[i]) & bottom28Bits
   501  		tmp2[i] += carry
   502  		carry = tmp2[i] >> 28
   503  		tmp2[i] &= bottom28Bits
   504  	}
   505  
   506  	tmp2[17] = uint32(tmp[15]>>32) >> 25
   507  	tmp2[17] += uint32(tmp[16]) >> 29
   508  	tmp2[17] += uint32(tmp[16]>>32) << 3
   509  	tmp2[17] += carry
   510  
   511  	// Montgomery elimination of terms:
   512  	//
   513  	// Since R is 2**257, we can divide by R with a bitwise shift if we can
   514  	// ensure that the right-most 257 bits are all zero. We can make that true
   515  	// by adding multiplies of p without affecting the value.
   516  	//
   517  	// So we eliminate limbs from right to left. Since the bottom 29 bits of p
   518  	// are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
   519  	// We can do that for 8 further limbs and then right shift to eliminate the
   520  	// extra factor of R.
   521  	for i := 0; ; i += 2 {
   522  		tmp2[i+1] += tmp2[i] >> 29
   523  		x = tmp2[i] & bottom29Bits
   524  		xMask = nonZeroToAllOnes(x)
   525  		tmp2[i] = 0
   526  
   527  		// The bounds calculations for this loop are tricky. Each iteration of
   528  		// the loop eliminates two words by adding values to words to their
   529  		// right.
   530  		//
   531  		// The following table contains the amounts added to each word (as an
   532  		// offset from the value of i at the top of the loop). The amounts are
   533  		// accounted for from the first and second half of the loop separately
   534  		// and are written as, for example, 28 to mean a value <2**28.
   535  		//
   536  		// Word:                2   3   4   5   6   7   8   9   10
   537  		// Added in top half:   29 28  29  29  29  29  29  28
   538  		//                             29  28  29  28  29
   539  		//                                             29
   540  		// Added in bottom half:   28  29  28  28  28  29  28   28
   541  		//                                 28  29  28  29  28
   542  		//
   543  		//
   544  		// The following table accumulates these values. The sums at the bottom
   545  		// are written as, for example, 29+28, to mean a value < 2**29+2**28.
   546  		//
   547  		// Word:   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
   548  		//        29  28  29  29  29  29  29  28  28  28  28  28  28  28  28  28
   549  		//            28  29  28  29  28  29  28  29  28  29  28  29  28  29
   550  		//                29  28  28  28  29  28  29  28  29  28  29  28  29
   551  		//                29  28  29  28  29  29  29  29  29  29  29  29  29
   552  		//                    28  29  29  29  28  29  28  29  28  29  28
   553  		//                    28  29  28  29  28  29  28  29  28  29
   554  		//                        29  28  29  28  29  28  29  28  29
   555  		//                        29  28  28  29  29  29  29  29  29
   556  		//                            28  29  28  28  28  28  28
   557  		//                            28  29  28  29  28  29
   558  		//                                29  28  29  28  29
   559  		//                                29  28  29  28  29
   560  		//                                29  28  29
   561  		//                                        29
   562  		//         -------------------------------------------------
   563  		// According to the table, tmp2[6] through tmp2[14] may, given their initial
   564  		// values, overflow a 32-bit word, so we have to normalize them on every iteration.
   565  		// This costs more CPU than the NIST P-256 code.
   566  		//
   567  
   568  		tmp2[i+2] += (x << 7) & bottom29Bits
   569  		tmp2[i+3] += (x >> 22)
   570  
   571  		// At position 86, which is the starting bit position for word 3, we
   572  		// have a factor of 0xffffc00 = 2**28 - 2**10
   573  		tmp2[i+3] += 0x10000000 & xMask
   574  		tmp2[i+4] += (x - 1) & xMask
   575  		tmp2[i+3] -= (x << 10) & bottom28Bits
   576  		tmp2[i+4] -= x >> 18
   577  
   578  		tmp2[i+4] += 0x20000000 & xMask
   579  		tmp2[i+4] -= x
   580  		tmp2[i+5] += (x - 1) & xMask
   581  
   582  		tmp2[i+5] += 0x10000000 & xMask
   583  		tmp2[i+5] -= x
   584  		tmp2[i+6] += (x - 1) & xMask
   585  
   586  		tmp2[i+6] += 0x20000000 & xMask
   587  		tmp2[i+6] -= x
   588  		tmp2[i+7] += (x - 1) & xMask
   589  
   590  		// At position 200, which is the starting bit position for word 7, we
   591  		// have a factor of 0xeffffff = 2**28 - 2**24 - 1
   592  		tmp2[i+7] += 0x10000000 & xMask
   593  		tmp2[i+7] -= x
   594  		tmp2[i+8] += (x - 1) & xMask
   595  		tmp2[i+7] -= (x << 24) & bottom28Bits
   596  		tmp2[i+8] -= x >> 4
   597  
   598  		tmp2[i+8] += 0x20000000 & xMask
   599  		tmp2[i+8] -= x
   600  		tmp2[i+8] += (x << 28) & bottom29Bits
   601  		tmp2[i+9] += ((x >> 1) - 1) & xMask
   602  
   603  		if i+1 == p256Limbs {
   604  			break
   605  		}
   606  
   607  		tmp2[i+2] += tmp2[i+1] >> 28
   608  		x = tmp2[i+1] & bottom28Bits
   609  		xMask = nonZeroToAllOnes(x)
   610  		tmp2[i+1] = 0
   611  
   612  		tmp2[i+3] += (x << 7) & bottom28Bits
   613  		tmp2[i+4] += (x >> 21)
   614  
   615  		// At position 85, which is the starting bit position for word 3, we
   616  		// have a factor of 0x1ffff800 = 2**29 - 2**11
   617  		tmp2[i+4] += 0x20000000 & xMask
   618  		tmp2[i+5] += (x - 1) & xMask
   619  		tmp2[i+4] -= (x << 11) & bottom29Bits
   620  		tmp2[i+5] -= x >> 18
   621  
   622  		tmp2[i+5] += 0x10000000 & xMask
   623  		tmp2[i+5] -= x
   624  		tmp2[i+6] += (x - 1) & xMask
   625  
   626  		tmp2[i+6] += 0x20000000 & xMask
   627  		tmp2[i+6] -= x
   628  		tmp2[i+7] += (x - 1) & xMask
   629  
   630  		tmp2[i+7] += 0x10000000 & xMask
   631  		tmp2[i+7] -= x
   632  		tmp2[i+8] += (x - 1) & xMask
   633  
   634  		// At position 199, which is the starting bit position for word 7, we
   635  		// have a factor of 0x1dffffff = 2**29 - 2**25 - 1
   636  		tmp2[i+8] += 0x20000000 & xMask
   637  		tmp2[i+8] -= x
   638  		tmp2[i+9] += (x - 1) & xMask
   639  		tmp2[i+8] -= (x << 25) & bottom29Bits
   640  		tmp2[i+9] -= x >> 4
   641  
   642  		tmp2[i+9] += 0x10000000 & xMask
   643  		tmp2[i+9] -= x
   644  		tmp2[i+10] += (x - 1) & xMask
   645  
   646  		// Normalize the limbs below to avoid overflowing a 32-bit word in the next iteration.
   647  		tmp2[i+7] += tmp2[i+6] >> 29
   648  		tmp2[i+6] = tmp2[i+6] & bottom29Bits
   649  
   650  		tmp2[i+8] += tmp2[i+7] >> 28
   651  		tmp2[i+7] = tmp2[i+7] & bottom28Bits
   652  
   653  		tmp2[i+9] += tmp2[i+8] >> 29
   654  		tmp2[i+8] = tmp2[i+8] & bottom29Bits
   655  
   656  		tmp2[i+10] += tmp2[i+9] >> 28
   657  		tmp2[i+9] = tmp2[i+9] & bottom28Bits
   658  	}
   659  
   660  	// We merge the right shift with a carry chain. The words above 2**257 have
   661  	// widths of 28,29,... which we need to correct when copying them down.
   662  	carry = 0
   663  	for i := 0; i < 8; i++ {
   664  		// The maximum value of tmp2[i + 9] occurs on the first iteration and
   665  		// is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is
   666  		// therefore safe.
   667  		out[i] = tmp2[i+9]
   668  		out[i] += carry
   669  		out[i] += (tmp2[i+10] << 28) & bottom29Bits
   670  		carry = out[i] >> 29
   671  		out[i] &= bottom29Bits
   672  
   673  		i++
   674  		out[i] = tmp2[i+9] >> 1
   675  		out[i] += carry
   676  		carry = out[i] >> 28
   677  		out[i] &= bottom28Bits
   678  	}
   679  
   680  	out[8] = tmp2[17]
   681  	out[8] += carry
   682  	carry = out[8] >> 29
   683  	out[8] &= bottom29Bits
   684  
   685  	p256ReduceCarry(out, carry)
   686  }
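
        // Illustrative only: what "dividing by R" means in big.Int terms. If t is the
        // plain integer product of two Montgomery-form values (t = xR * yR), then
        // multiplying by p256RInverse modulo p yields x*y*R mod p, the Montgomery form
        // of x*y. exampleDivideByR (hypothetical) assumes initP256 has run.
        func exampleDivideByR(t *big.Int) *big.Int {
        	return new(big.Int).Mod(new(big.Int).Mul(t, p256RInverse), p256.P)
        }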
   687  
   688  // p256Square sets out=in*in.
   689  //
   690  // On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29.
   691  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   692  func p256Square(out, in *[p256Limbs]uint32) {
   693  	var tmp [17]uint64
   694  
   695  	tmp[0] = uint64(in[0]) * uint64(in[0])
   696  	tmp[1] = uint64(in[0]) * (uint64(in[1]) << 1)
   697  	tmp[2] = uint64(in[0])*(uint64(in[2])<<1) +
   698  		uint64(in[1])*(uint64(in[1])<<1)
   699  	tmp[3] = uint64(in[0])*(uint64(in[3])<<1) +
   700  		uint64(in[1])*(uint64(in[2])<<1)
   701  	tmp[4] = uint64(in[0])*(uint64(in[4])<<1) +
   702  		uint64(in[1])*(uint64(in[3])<<2) +
   703  		uint64(in[2])*uint64(in[2])
   704  	tmp[5] = uint64(in[0])*(uint64(in[5])<<1) +
   705  		uint64(in[1])*(uint64(in[4])<<1) +
   706  		uint64(in[2])*(uint64(in[3])<<1)
   707  	tmp[6] = uint64(in[0])*(uint64(in[6])<<1) +
   708  		uint64(in[1])*(uint64(in[5])<<2) +
   709  		uint64(in[2])*(uint64(in[4])<<1) +
   710  		uint64(in[3])*(uint64(in[3])<<1)
   711  	tmp[7] = uint64(in[0])*(uint64(in[7])<<1) +
   712  		uint64(in[1])*(uint64(in[6])<<1) +
   713  		uint64(in[2])*(uint64(in[5])<<1) +
   714  		uint64(in[3])*(uint64(in[4])<<1)
   715  	// tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60,
   716  	// which is < 2**64 as required.
   717  	tmp[8] = uint64(in[0])*(uint64(in[8])<<1) +
   718  		uint64(in[1])*(uint64(in[7])<<2) +
   719  		uint64(in[2])*(uint64(in[6])<<1) +
   720  		uint64(in[3])*(uint64(in[5])<<2) +
   721  		uint64(in[4])*uint64(in[4])
   722  	tmp[9] = uint64(in[1])*(uint64(in[8])<<1) +
   723  		uint64(in[2])*(uint64(in[7])<<1) +
   724  		uint64(in[3])*(uint64(in[6])<<1) +
   725  		uint64(in[4])*(uint64(in[5])<<1)
   726  	tmp[10] = uint64(in[2])*(uint64(in[8])<<1) +
   727  		uint64(in[3])*(uint64(in[7])<<2) +
   728  		uint64(in[4])*(uint64(in[6])<<1) +
   729  		uint64(in[5])*(uint64(in[5])<<1)
   730  	tmp[11] = uint64(in[3])*(uint64(in[8])<<1) +
   731  		uint64(in[4])*(uint64(in[7])<<1) +
   732  		uint64(in[5])*(uint64(in[6])<<1)
   733  	tmp[12] = uint64(in[4])*(uint64(in[8])<<1) +
   734  		uint64(in[5])*(uint64(in[7])<<2) +
   735  		uint64(in[6])*uint64(in[6])
   736  	tmp[13] = uint64(in[5])*(uint64(in[8])<<1) +
   737  		uint64(in[6])*(uint64(in[7])<<1)
   738  	tmp[14] = uint64(in[6])*(uint64(in[8])<<1) +
   739  		uint64(in[7])*(uint64(in[7])<<1)
   740  	tmp[15] = uint64(in[7]) * (uint64(in[8]) << 1)
   741  	tmp[16] = uint64(in[8]) * uint64(in[8])
   742  
   743  	p256ReduceDegree(out, tmp)
   744  }
   745  
   746  // p256Mul sets out=in*in2.
   747  //
   748  // On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
   749  //           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
   750  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   751  func p256Mul(out, in, in2 *[p256Limbs]uint32) {
   752  	var tmp [17]uint64
   753  
   754  	tmp[0] = uint64(in[0]) * uint64(in2[0])
   755  	tmp[1] = uint64(in[0])*(uint64(in2[1])<<0) + //2**29
   756  		uint64(in[1])*(uint64(in2[0])<<0)
   757  	tmp[2] = uint64(in[0])*(uint64(in2[2])<<0) + //2**57
   758  		uint64(in[1])*(uint64(in2[1])<<1) +
   759  		uint64(in[2])*(uint64(in2[0])<<0)
   760  	tmp[3] = uint64(in[0])*(uint64(in2[3])<<0) + //2**86
   761  		uint64(in[1])*(uint64(in2[2])<<0) +
   762  		uint64(in[2])*(uint64(in2[1])<<0) +
   763  		uint64(in[3])*(uint64(in2[0])<<0)
   764  	tmp[4] = uint64(in[0])*(uint64(in2[4])<<0) + //2**114
   765  		uint64(in[1])*(uint64(in2[3])<<1) +
   766  		uint64(in[2])*(uint64(in2[2])<<0) +
   767  		uint64(in[3])*(uint64(in2[1])<<1) +
   768  		uint64(in[4])*(uint64(in2[0])<<0)
   769  	tmp[5] = uint64(in[0])*(uint64(in2[5])<<0) + //2**143
   770  		uint64(in[1])*(uint64(in2[4])<<0) +
   771  		uint64(in[2])*(uint64(in2[3])<<0) +
   772  		uint64(in[3])*(uint64(in2[2])<<0) +
   773  		uint64(in[4])*(uint64(in2[1])<<0) +
   774  		uint64(in[5])*(uint64(in2[0])<<0)
   775  	tmp[6] = uint64(in[0])*(uint64(in2[6])<<0) + //2**171
   776  		uint64(in[1])*(uint64(in2[5])<<1) +
   777  		uint64(in[2])*(uint64(in2[4])<<0) +
   778  		uint64(in[3])*(uint64(in2[3])<<1) +
   779  		uint64(in[4])*(uint64(in2[2])<<0) +
   780  		uint64(in[5])*(uint64(in2[1])<<1) +
   781  		uint64(in[6])*(uint64(in2[0])<<0)
   782  	tmp[7] = uint64(in[0])*(uint64(in2[7])<<0) + //2**200
   783  		uint64(in[1])*(uint64(in2[6])<<0) +
   784  		uint64(in[2])*(uint64(in2[5])<<0) +
   785  		uint64(in[3])*(uint64(in2[4])<<0) +
   786  		uint64(in[4])*(uint64(in2[3])<<0) +
   787  		uint64(in[5])*(uint64(in2[2])<<0) +
   788  		uint64(in[6])*(uint64(in2[1])<<0) +
   789  		uint64(in[7])*(uint64(in2[0])<<0)
   790  	// tmp[8] has the greatest value but doesn't overflow. See logic in
   791  	// p256Square.
   792  	tmp[8] = uint64(in[0])*(uint64(in2[8])<<0) + // 2**228
   793  		uint64(in[1])*(uint64(in2[7])<<1) +
   794  		uint64(in[2])*(uint64(in2[6])<<0) +
   795  		uint64(in[3])*(uint64(in2[5])<<1) +
   796  		uint64(in[4])*(uint64(in2[4])<<0) +
   797  		uint64(in[5])*(uint64(in2[3])<<1) +
   798  		uint64(in[6])*(uint64(in2[2])<<0) +
   799  		uint64(in[7])*(uint64(in2[1])<<1) +
   800  		uint64(in[8])*(uint64(in2[0])<<0)
   801  	tmp[9] = uint64(in[1])*(uint64(in2[8])<<0) + //2**257
   802  		uint64(in[2])*(uint64(in2[7])<<0) +
   803  		uint64(in[3])*(uint64(in2[6])<<0) +
   804  		uint64(in[4])*(uint64(in2[5])<<0) +
   805  		uint64(in[5])*(uint64(in2[4])<<0) +
   806  		uint64(in[6])*(uint64(in2[3])<<0) +
   807  		uint64(in[7])*(uint64(in2[2])<<0) +
   808  		uint64(in[8])*(uint64(in2[1])<<0)
   809  	tmp[10] = uint64(in[2])*(uint64(in2[8])<<0) + //2**285
   810  		uint64(in[3])*(uint64(in2[7])<<1) +
   811  		uint64(in[4])*(uint64(in2[6])<<0) +
   812  		uint64(in[5])*(uint64(in2[5])<<1) +
   813  		uint64(in[6])*(uint64(in2[4])<<0) +
   814  		uint64(in[7])*(uint64(in2[3])<<1) +
   815  		uint64(in[8])*(uint64(in2[2])<<0)
   816  	tmp[11] = uint64(in[3])*(uint64(in2[8])<<0) + //2**314
   817  		uint64(in[4])*(uint64(in2[7])<<0) +
   818  		uint64(in[5])*(uint64(in2[6])<<0) +
   819  		uint64(in[6])*(uint64(in2[5])<<0) +
   820  		uint64(in[7])*(uint64(in2[4])<<0) +
   821  		uint64(in[8])*(uint64(in2[3])<<0)
   822  	tmp[12] = uint64(in[4])*(uint64(in2[8])<<0) + //2**342
   823  		uint64(in[5])*(uint64(in2[7])<<1) +
   824  		uint64(in[6])*(uint64(in2[6])<<0) +
   825  		uint64(in[7])*(uint64(in2[5])<<1) +
   826  		uint64(in[8])*(uint64(in2[4])<<0)
   827  	tmp[13] = uint64(in[5])*(uint64(in2[8])<<0) + //2**371
   828  		uint64(in[6])*(uint64(in2[7])<<0) +
   829  		uint64(in[7])*(uint64(in2[6])<<0) +
   830  		uint64(in[8])*(uint64(in2[5])<<0)
   831  	tmp[14] = uint64(in[6])*(uint64(in2[8])<<0) + //2**399
   832  		uint64(in[7])*(uint64(in2[7])<<1) +
   833  		uint64(in[8])*(uint64(in2[6])<<0)
   834  	tmp[15] = uint64(in[7])*(uint64(in2[8])<<0) + //2**428
   835  		uint64(in[8])*(uint64(in2[7])<<0)
   836  	tmp[16] = uint64(in[8]) * (uint64(in2[8]) << 0) //2**456
   837  
   838  	p256ReduceDegree(out, tmp)
   839  }
   840  
   841  func p256Assign(out, in *[p256Limbs]uint32) {
   842  	*out = *in
   843  }
   844  
   845  // p256Invert calculates |out| = |in|^{-1}
   846  //
   847  // Based on Fermat's Little Theorem:
   848  //   a^p = a (mod p)
   849  //   a^{p-1} = 1 (mod p)
   850  //   a^{p-2} = a^{-1} (mod p)
   851  func p256Invert(out, in *[p256Limbs]uint32) {
   852  	var ftmp, ftmp2 [p256Limbs]uint32
   853  
   854  	// each e_I will hold |in|^{2^I - 1}
   855  	var e2, e4, e8, e16, e32, e64 [p256Limbs]uint32
   856  	// 2^32-2
   857  	var e32m2 [p256Limbs]uint32
   858  
   859  	p256Square(&ftmp, in) // 2^1
   860  	p256Assign(&ftmp2, &ftmp)
   861  	p256Mul(&ftmp, in, &ftmp) // 2^2 - 2^0
   862  	p256Assign(&e2, &ftmp)
   863  	p256Square(&ftmp, &ftmp) // 2^3 - 2^1
   864  	p256Square(&ftmp, &ftmp) // 2^4 - 2^2
   865  	p256Assign(&e32m2, &ftmp)
   866  	p256Mul(&e32m2, &e32m2, &ftmp2) // 2^4 - 2^2 + 2^1 = 2^4 - 2
   867  	p256Mul(&ftmp, &ftmp, &e2)      // 2^4 - 2^0
   868  	p256Assign(&e4, &ftmp)
   869  	for i := 0; i < 4; i++ {
   870  		p256Square(&ftmp, &ftmp)
   871  	} // 2^8 - 2^4
   872  	p256Mul(&e32m2, &e32m2, &ftmp) // 2^8 - 2
   873  
   874  	p256Mul(&ftmp, &ftmp, &e4) // 2^8 - 2^0
   875  	p256Assign(&e8, &ftmp)
   876  	for i := 0; i < 8; i++ {
   877  		p256Square(&ftmp, &ftmp)
   878  	} // 2^16 - 2^8
   879  	p256Mul(&e32m2, &e32m2, &ftmp) // 2^16 - 2
   880  	p256Mul(&ftmp, &ftmp, &e8)     // 2^16 - 2^0
   881  	p256Assign(&e16, &ftmp)
   882  	for i := 0; i < 16; i++ {
   883  		p256Square(&ftmp, &ftmp)
   884  	} // 2^32 - 2^16
   885  	p256Mul(&e32m2, &e32m2, &ftmp) // 2^32 - 2
   886  
   887  	p256Mul(&ftmp, &ftmp, &e16) // 2^32 - 2^0
   888  	p256Assign(&e32, &ftmp)
   889  	for i := 0; i < 32; i++ {
   890  		p256Square(&ftmp, &ftmp)
   891  	} // 2^64 - 2^32
   892  	p256Assign(&e64, &ftmp)
   893  	p256Mul(&e64, &e64, &e32) // 2^64 - 2^0
   894  	p256Assign(&ftmp, &e64)
   895  
   896  	for i := 0; i < 64; i++ {
   897  		p256Square(&ftmp, &ftmp)
   898  	} // 2^128 - 2^64
   899  	p256Mul(&ftmp, &ftmp, &e64) // 2^128 - 1
   900  
   901  	for i := 0; i < 96; i++ {
   902  		p256Square(&ftmp, &ftmp)
   903  	} // 2^224 - 2^96
   904  
   905  	p256Assign(&ftmp2, &e32m2)
   906  	for i := 0; i < 224; i++ {
   907  		p256Square(&ftmp2, &ftmp2)
   908  	} // 2^256 - 2^225
   909  
   910  	p256Mul(&ftmp, &ftmp, &ftmp2) // 2^256 - 2^224 - 2^96
   911  
   912  	p256Assign(&ftmp2, &e32)
   913  
   914  	for i := 0; i < 16; i++ {
   915  		p256Square(&ftmp2, &ftmp2)
   916  	} // 2^48 - 2^16
   917  	p256Mul(&ftmp2, &e16, &ftmp2) // 2^48 - 2^0
   918  
   919  	for i := 0; i < 8; i++ {
   920  		p256Square(&ftmp2, &ftmp2)
   921  	} // 2^56 - 2^8
   922  	p256Mul(&ftmp2, &e8, &ftmp2) // 2^56 - 2^0
   923  
   924  	for i := 0; i < 4; i++ {
   925  		p256Square(&ftmp2, &ftmp2)
   926  	} // 2^60 - 2^4
   927  	p256Mul(&ftmp2, &e4, &ftmp2) // 2^60 - 2^0
   928  
   929  	for i := 0; i < 2; i++ {
   930  		p256Square(&ftmp2, &ftmp2)
   931  	} // 2^62 - 2^2
   932  
   933  	p256Mul(&ftmp2, &e2, &ftmp2) // 2^62 - 2^0
   934  	for i := 0; i < 2; i++ {
   935  		p256Square(&ftmp2, &ftmp2)
   936  	} // 2^64 - 2^2
   937  	p256Mul(&ftmp2, in, &ftmp2) // 2^64 - 3
   938  	p256Mul(out, &ftmp2, &ftmp) // 2^256 - 2^224 - 2^96 + 2^64 - 3
   939  }
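
        // Illustrative check (not part of the implementation): multiplying a non-zero
        // field element by its p256Invert output must give the Montgomery form of 1,
        // i.e. a value congruent to p256One mod p. exampleInvertCheck (hypothetical)
        // assumes initP256 has run.
        func exampleInvertCheck(in *[p256Limbs]uint32) bool {
        	var inv, prod [p256Limbs]uint32
        	p256Invert(&inv, in)
        	p256Mul(&prod, in, &inv) // Montgomery product: (x*R)*(x^-1*R)/R = R mod p
        	got := new(big.Int).Mod(limbsToBig(&prod), p256.P)
        	want := new(big.Int).Mod(limbsToBig(&p256One), p256.P)
        	return got.Cmp(want) == 0
        }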
   940  
   941  // p256Scalar3 sets out=3*out.
   942  //
   943  // On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   944  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   945  func p256Scalar3(out *[p256Limbs]uint32) {
   946  	var carry uint32
   947  
   948  	for i := 0; ; i++ {
   949  		out[i] *= 3
   950  		out[i] += carry
   951  		carry = out[i] >> 29
   952  		out[i] &= bottom29Bits
   953  
   954  		i++
   955  		if i == p256Limbs {
   956  			break
   957  		}
   958  
   959  		out[i] *= 3
   960  		out[i] += carry
   961  		carry = out[i] >> 28
   962  		out[i] &= bottom28Bits
   963  	}
   964  
   965  	p256ReduceCarry(out, carry)
   966  }
   967  
   968  // p256Scalar4 sets out=4*out.
   969  //
   970  // On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   971  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   972  func p256Scalar4(out *[p256Limbs]uint32) {
   973  	var carry, nextCarry uint32
   974  
   975  	for i := 0; ; i++ {
   976  		nextCarry = out[i] >> 27
   977  		out[i] <<= 2
   978  		out[i] &= bottom29Bits
   979  		out[i] += carry
   980  		carry = nextCarry + (out[i] >> 29)
   981  		out[i] &= bottom29Bits
   982  
   983  		i++
   984  		if i == p256Limbs {
   985  			break
   986  		}
   987  		nextCarry = out[i] >> 26
   988  		out[i] <<= 2
   989  		out[i] &= bottom28Bits
   990  		out[i] += carry
   991  		carry = nextCarry + (out[i] >> 28)
   992  		out[i] &= bottom28Bits
   993  	}
   994  
   995  	p256ReduceCarry(out, carry)
   996  }
   997  
   998  // p256Scalar8 sets out=8*out.
   999  //
  1000  // On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
  1001  // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
  1002  func p256Scalar8(out *[p256Limbs]uint32) {
  1003  	var carry, nextCarry uint32
  1004  
  1005  	for i := 0; ; i++ {
  1006  		nextCarry = out[i] >> 26
  1007  		out[i] <<= 3
  1008  		out[i] &= bottom29Bits
  1009  		out[i] += carry
  1010  		carry = nextCarry + (out[i] >> 29)
  1011  		out[i] &= bottom29Bits
  1012  
  1013  		i++
  1014  		if i == p256Limbs {
  1015  			break
  1016  		}
  1017  		nextCarry = out[i] >> 25
  1018  		out[i] <<= 3
  1019  		out[i] &= bottom28Bits
  1020  		out[i] += carry
  1021  		carry = nextCarry + (out[i] >> 28)
  1022  		out[i] &= bottom28Bits
  1023  	}
  1024  
  1025  	p256ReduceCarry(out, carry)
  1026  }
  1027  
  1028  // Group operations:
  1029  //
  1030  // Elements of the elliptic curve group are represented in Jacobian
  1031  // coordinates: (x, y, z). An affine point (x', y') is x'=x/z**2, y'=y/z**3 in
  1032  // Jacobian form.
  1033  
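        // Illustrative only: the Jacobian-to-affine relation above written with
        // big.Int (the implementation uses p256PointToAffine on limbs instead). The
        // helper name is hypothetical; it assumes z != 0 and that initP256 has run.
        func exampleJacobianToAffine(x, y, z *big.Int) (xAff, yAff *big.Int) {
        	zInv := new(big.Int).ModInverse(z, p256.P)
        	zInv2 := new(big.Int).Mul(zInv, zInv)
        	xAff = new(big.Int).Mod(new(big.Int).Mul(x, zInv2), p256.P)
        	zInv3 := new(big.Int).Mul(zInv2, zInv)
        	yAff = new(big.Int).Mod(new(big.Int).Mul(y, zInv3), p256.P)
        	return
        }
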
  1034  // p256PointDouble sets {xOut,yOut,zOut} = 2*{x,y,z}.
  1035  //
  1036  // See https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
  1037  func p256PointDouble(xOut, yOut, zOut, x, y, z *[p256Limbs]uint32) {
  1038  	var delta, gamma, alpha, beta, tmp, tmp2 [p256Limbs]uint32
  1039  
  1040  	p256Square(&delta, z)     // delta = z^2
  1041  	p256Square(&gamma, y)     // gamma = y^2
  1042  	p256Mul(&beta, x, &gamma) // beta = x * gamma = x * y^2
  1043  
  1044  	p256Sum(&tmp, x, &delta)     // tmp = x + delta = x + z^2
  1045  	p256Diff(&tmp2, x, &delta)   // tmp2 = x - delta = x - z^2
  1046  	p256Mul(&alpha, &tmp, &tmp2) // alpha = tmp * tmp2 = (x + z^2) * (x - z^2) = x^2 - z^4
  1047  	p256Scalar3(&alpha)          // alpha = alpha * 3 = 3*(x^2 - z^4)
  1048  
  1049  	p256Sum(&tmp, y, z)          // tmp = y+z
  1050  	p256Square(&tmp, &tmp)       // tmp = (y+z)^2
  1051  	p256Diff(&tmp, &tmp, &gamma) // tmp = tmp - gamma = (y+z)^2 - y^2
  1052  	p256Diff(zOut, &tmp, &delta) // zOut = tmp - delta = (y+z)^2 - y^2 - z^2
  1053  
  1054  	p256Scalar4(&beta)          // beta = beta * 4 = 4 * x * y^2
  1055  	p256Square(xOut, &alpha)    // xOut = alpha ^ 2 = (3*(x^2 - z^4))^2
  1056  	p256Diff(xOut, xOut, &beta) // xOut = xOut - beta = (3*(x^2 - z^4))^2 - 4 * x * y^2
  1057  	p256Diff(xOut, xOut, &beta) // xOut = xOut - beta = (3*(x^2 - z^4))^2 - 8 * x * y^2
  1058  
  1059  	p256Diff(&tmp, &beta, xOut) // tmp = beta - xOut
  1060  	p256Mul(&tmp, &alpha, &tmp) // tmp = 3*(x^2 - z^4) * (beta - xOut)
  1061  	p256Square(&tmp2, &gamma)   // tmp2 = gamma^2 = y^4
  1062  	p256Scalar8(&tmp2)          // tmp2 = 8*tmp2 = 8*y^4
  1063  	p256Diff(yOut, &tmp, &tmp2) // yOut = (3*x^2 - 3*z^4) * (beta - xOut) - 8*y^4
  1064  }
  1065  
  1066  // p256PointAddMixed sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,1}.
  1067  // (i.e. the second point is affine.)
  1068  //
  1069  // See https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
  1070  //
  1071  // Note that this function does not handle P+P, infinity+P nor P+infinity
  1072  // correctly.
  1073  func p256PointAddMixed(xOut, yOut, zOut, x1, y1, z1, x2, y2 *[p256Limbs]uint32) {
  1074  	var z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32
  1075  
  1076  	p256Square(&z1z1, z1)
  1077  	p256Sum(&tmp, z1, z1)
  1078  
  1079  	p256Mul(&u2, x2, &z1z1)
  1080  	p256Mul(&z1z1z1, z1, &z1z1)
  1081  	p256Mul(&s2, y2, &z1z1z1)
  1082  	p256Diff(&h, &u2, x1)
  1083  	p256Sum(&i, &h, &h)
  1084  	p256Square(&i, &i)
  1085  	p256Mul(&j, &h, &i)
  1086  	p256Diff(&r, &s2, y1)
  1087  	p256Sum(&r, &r, &r)
  1088  	p256Mul(&v, x1, &i)
  1089  
  1090  	p256Mul(zOut, &tmp, &h)
  1091  	p256Square(&rr, &r)
  1092  	p256Diff(xOut, &rr, &j)
  1093  	p256Diff(xOut, xOut, &v)
  1094  	p256Diff(xOut, xOut, &v)
  1095  
  1096  	p256Diff(&tmp, &v, xOut)
  1097  	p256Mul(yOut, &tmp, &r)
  1098  	p256Mul(&tmp, y1, &j)
  1099  	p256Diff(yOut, yOut, &tmp)
  1100  	p256Diff(yOut, yOut, &tmp)
  1101  }
  1102  
  1103  // p256PointAdd sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,z2}.
  1104  //
  1105  // See https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
  1106  //
  1107  // Note that this function does not handle P+P, infinity+P nor P+infinity
  1108  // correctly.
  1109  func p256PointAdd(xOut, yOut, zOut, x1, y1, z1, x2, y2, z2 *[p256Limbs]uint32) {
  1110  	var z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32
  1111  
  1112  	p256Square(&z1z1, z1)
  1113  	p256Square(&z2z2, z2)
  1114  	p256Mul(&u1, x1, &z2z2)
  1115  
  1116  	p256Sum(&tmp, z1, z2)
  1117  	p256Square(&tmp, &tmp)
  1118  	p256Diff(&tmp, &tmp, &z1z1)
  1119  	p256Diff(&tmp, &tmp, &z2z2)
  1120  
  1121  	p256Mul(&z2z2z2, z2, &z2z2)
  1122  	p256Mul(&s1, y1, &z2z2z2)
  1123  
  1124  	p256Mul(&u2, x2, &z1z1)
  1125  	p256Mul(&z1z1z1, z1, &z1z1)
  1126  	p256Mul(&s2, y2, &z1z1z1)
  1127  	p256Diff(&h, &u2, &u1)
  1128  	p256Sum(&i, &h, &h)
  1129  	p256Square(&i, &i)
  1130  	p256Mul(&j, &h, &i)
  1131  	p256Diff(&r, &s2, &s1)
  1132  	p256Sum(&r, &r, &r)
  1133  	p256Mul(&v, &u1, &i)
  1134  
  1135  	p256Mul(zOut, &tmp, &h)
  1136  	p256Square(&rr, &r)
  1137  	p256Diff(xOut, &rr, &j)
  1138  	p256Diff(xOut, xOut, &v)
  1139  	p256Diff(xOut, xOut, &v)
  1140  
  1141  	p256Diff(&tmp, &v, xOut)
  1142  	p256Mul(yOut, &tmp, &r)
  1143  	p256Mul(&tmp, &s1, &j)
  1144  	p256Diff(yOut, yOut, &tmp)
  1145  	p256Diff(yOut, yOut, &tmp)
  1146  }
  1147  
  1148  // p256CopyConditional sets out=in if mask = 0xffffffff in constant time.
  1149  //
  1150  // On entry: mask is either 0 or 0xffffffff.
  1151  func p256CopyConditional(out, in *[p256Limbs]uint32, mask uint32) {
  1152  	for i := 0; i < p256Limbs; i++ {
  1153  		tmp := mask & (in[i] ^ out[i])
  1154  		out[i] ^= tmp
  1155  	}
  1156  }
  1157  
  1158  // p256SelectAffinePoint sets {out_x,out_y} to the index'th entry of table.
  1159  // On entry: index < 16, table[0] must be zero.
  1160  // The table is accessed in constant time so the index is not leaked.
  1161  func p256SelectAffinePoint(xOut, yOut *[p256Limbs]uint32, table []uint32, index uint32) {
  1162  	for i := range xOut {
  1163  		xOut[i] = 0
  1164  	}
  1165  	for i := range yOut {
  1166  		yOut[i] = 0
  1167  	}
  1168  
  1169  	for i := uint32(1); i < 16; i++ {
  1170  		mask := i ^ index // mask is zero when i equals index, otherwise non-zero. mask = {b3, b2, b1, b0}, ignore unused bits.
  1171  		mask |= mask >> 2 // mask = {b3, b2, b1 | b3, b0 | b2}
  1172  		mask |= mask >> 1 // mask = {b3, b2 | b3, b1 | b2 | b3, b0 | b1 | b2 | b3}
  1173  		mask &= 1         // mask = {0, 0, 0, b0 | b1 | b2 | b3}
  1174  		mask--            // mask = 0xffffffff when i equals index, otherwise 0x00000000
  1175  		for j := range xOut {
  1176  			xOut[j] |= table[0] & mask
  1177  			table = table[1:]
  1178  		}
  1179  		for j := range yOut {
  1180  			yOut[j] |= table[0] & mask
  1181  			table = table[1:]
  1182  		}
  1183  	}
  1184  }
  1185  
  1186  // p256SelectJacobianPoint sets {out_x,out_y,out_z} to the index'th entry of
  1187  // table.
  1188  // On entry: index < 16, table[0] must be zero.
  1189  func p256SelectJacobianPoint(xOut, yOut, zOut *[p256Limbs]uint32, table *[16][3][p256Limbs]uint32, index uint32) {
  1190  	for i := range xOut {
  1191  		xOut[i] = 0
  1192  	}
  1193  	for i := range yOut {
  1194  		yOut[i] = 0
  1195  	}
  1196  	for i := range zOut {
  1197  		zOut[i] = 0
  1198  	}
  1199  
  1200  	// The implicit value at index 0 is all zero. We don't need to perform that
  1201  	// iteration of the loop because we already set out_* to zero.
  1202  	for i := uint32(1); i < 16; i++ {
  1203  		mask := i ^ index
  1204  		mask |= mask >> 2
  1205  		mask |= mask >> 1
  1206  		mask &= 1
  1207  		mask--
  1208  		for j := range xOut {
  1209  			xOut[j] |= table[i][0][j] & mask
  1210  		}
  1211  		for j := range yOut {
  1212  			yOut[j] |= table[i][1][j] & mask
  1213  		}
  1214  		for j := range zOut {
  1215  			zOut[j] |= table[i][2][j] & mask
  1216  		}
  1217  	}
  1218  }
  1219  
  1220  // p256GetBit returns the bit'th bit of scalar.
  1221  func p256GetBit(scalar *[32]uint8, bit uint) uint32 {
  1222  	return uint32(((scalar[bit>>3]) >> (bit & 7)) & 1)
  1223  }
  1224  
  1225  // p256ScalarBaseMult sets {xOut,yOut,zOut} = scalar*G where scalar is a
  1226  // little-endian number. Note that the value of scalar must be less than the
  1227  // order of the group.
  1228  func p256ScalarBaseMult(xOut, yOut, zOut *[p256Limbs]uint32, scalar *[32]uint8) {
  1229  	nIsInfinityMask := ^uint32(0)
  1230  	var pIsNoninfiniteMask, mask, tableOffset uint32
  1231  	var px, py, tx, ty, tz [p256Limbs]uint32
  1232  
  1233  	for i := range xOut {
  1234  		xOut[i] = 0
  1235  	}
  1236  	for i := range yOut {
  1237  		yOut[i] = 0
  1238  	}
  1239  	for i := range zOut {
  1240  		zOut[i] = 0
  1241  	}
  1242  
  1243  	// The loop adds bits at positions 0, 64, 128 and 192, followed by
  1244  	// positions 32,96,160 and 224 and does this 32 times.
  1245  	for i := uint(0); i < 32; i++ {
  1246  		if i != 0 {
  1247  			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1248  		}
  1249  		tableOffset = 0
  1250  		for j := uint(0); j <= 32; j += 32 {
  1251  			bit0 := p256GetBit(scalar, 31-i+j)
  1252  			bit1 := p256GetBit(scalar, 95-i+j)
  1253  			bit2 := p256GetBit(scalar, 159-i+j)
  1254  			bit3 := p256GetBit(scalar, 223-i+j)
  1255  			index := bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3)
  1256  
  1257  			p256SelectAffinePoint(&px, &py, p256Precomputed[tableOffset:], index)
  1258  			tableOffset += 30 * p256Limbs
  1259  
  1260  			// Since scalar is less than the order of the group, we know that
  1261  			// {xOut,yOut,zOut} != {px,py,1}, unless both are zero, which we handle
  1262  			// below.
  1263  			p256PointAddMixed(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py)
  1264  			// The result of pointAddMixed is incorrect if {xOut,yOut,zOut} is zero
  1265  			// (a.k.a.  the point at infinity). We handle that situation by
  1266  			// copying the point from the table.
  1267  			p256CopyConditional(xOut, &px, nIsInfinityMask)
  1268  			p256CopyConditional(yOut, &py, nIsInfinityMask)
  1269  			p256CopyConditional(zOut, &p256One, nIsInfinityMask)
  1270  
  1271  			// Equally, the result is also wrong if the point from the table is
  1272  			// zero, which happens when the index is zero. We handle that by
  1273  			// only copying from {tx,ty,tz} to {xOut,yOut,zOut} if index != 0.
  1274  			pIsNoninfiniteMask = nonZeroToAllOnes(index)
  1275  			mask = pIsNoninfiniteMask & ^nIsInfinityMask
  1276  			p256CopyConditional(xOut, &tx, mask)
  1277  			p256CopyConditional(yOut, &ty, mask)
  1278  			p256CopyConditional(zOut, &tz, mask)
  1279  			// If p was not zero, then n is now non-zero.
  1280  			nIsInfinityMask &^= pIsNoninfiniteMask
  1281  		}
  1282  	}
  1283  }
  1284  
  1285  // p256PointToAffine converts a Jacobian point to an affine point. If the input
  1286  // is the point at infinity then it returns (0, 0) in constant time.
  1287  func p256PointToAffine(xOut, yOut, x, y, z *[p256Limbs]uint32) {
  1288  	var zInv, zInvSq [p256Limbs]uint32
  1289  
  1290  	p256Invert(&zInv, z)
  1291  	p256Square(&zInvSq, &zInv)
  1292  	p256Mul(xOut, x, &zInvSq)
  1293  	p256Mul(&zInv, &zInv, &zInvSq)
  1294  	p256Mul(yOut, y, &zInv)
  1295  }
  1296  
  1297  // p256ToAffine returns a pair of *big.Int containing the affine representation
  1298  // of {x,y,z}.
  1299  func p256ToAffine(x, y, z *[p256Limbs]uint32) (xOut, yOut *big.Int) {
  1300  	var xx, yy [p256Limbs]uint32
  1301  	p256PointToAffine(&xx, &yy, x, y, z)
  1302  	return p256ToBig(&xx), p256ToBig(&yy)
  1303  }
  1304  
  1305  // p256ScalarMult sets {xOut,yOut,zOut} = scalar*{x,y}.
  1306  func p256ScalarMult(xOut, yOut, zOut, x, y *[p256Limbs]uint32, scalar *[32]uint8) {
  1307  	var px, py, pz, tx, ty, tz [p256Limbs]uint32
  1308  	var precomp [16][3][p256Limbs]uint32
  1309  	var nIsInfinityMask, index, pIsNoninfiniteMask, mask uint32
  1310  
  1311  	// We precompute 0,1,2,... times {x,y}.
  1312  	precomp[1][0] = *x
  1313  	precomp[1][1] = *y
  1314  	precomp[1][2] = p256One
  1315  
  1316  	for i := 2; i < 16; i += 2 {
  1317  		p256PointDouble(&precomp[i][0], &precomp[i][1], &precomp[i][2], &precomp[i/2][0], &precomp[i/2][1], &precomp[i/2][2])
  1318  		p256PointAddMixed(&precomp[i+1][0], &precomp[i+1][1], &precomp[i+1][2], &precomp[i][0], &precomp[i][1], &precomp[i][2], x, y)
  1319  	}
  1320  
  1321  	for i := range xOut {
  1322  		xOut[i] = 0
  1323  	}
  1324  	for i := range yOut {
  1325  		yOut[i] = 0
  1326  	}
  1327  	for i := range zOut {
  1328  		zOut[i] = 0
  1329  	}
  1330  	nIsInfinityMask = ^uint32(0)
  1331  
  1332  	// We add in a window of four bits each iteration and do this 64 times.
  1333  	for i := 0; i < 64; i++ {
  1334  		if i != 0 {
  1335  			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1336  			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1337  			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1338  			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1339  		}
  1340  
  1341  		index = uint32(scalar[31-i/2])
  1342  		if (i & 1) == 1 {
  1343  			index &= 15
  1344  		} else {
  1345  			index >>= 4
  1346  		}
  1347  
  1348  		// See the comments in scalarBaseMult about handling infinities.
  1349  		p256SelectJacobianPoint(&px, &py, &pz, &precomp, index)
  1350  		p256PointAdd(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py, &pz)
  1351  		p256CopyConditional(xOut, &px, nIsInfinityMask)
  1352  		p256CopyConditional(yOut, &py, nIsInfinityMask)
  1353  		p256CopyConditional(zOut, &pz, nIsInfinityMask)
  1354  
  1355  		pIsNoninfiniteMask = nonZeroToAllOnes(index)
  1356  		mask = pIsNoninfiniteMask & ^nIsInfinityMask
  1357  		p256CopyConditional(xOut, &tx, mask)
  1358  		p256CopyConditional(yOut, &ty, mask)
  1359  		p256CopyConditional(zOut, &tz, mask)
  1360  		nIsInfinityMask &^= pIsNoninfiniteMask
  1361  	}
  1362  }
  1363  
  1364  // p256FromBig sets out = R*in.
  1365  func p256FromBig(out *[p256Limbs]uint32, in *big.Int) {
  1366  	p256FromBigAgainstP(out, in, p256.P)
  1367  }
  1368  
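        // p256FromBigAgainstP sets out = R*in mod p for the supplied prime p.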
  1369  func p256FromBigAgainstP(out *[p256Limbs]uint32, in *big.Int, p *big.Int) {
  1370  	tmp := new(big.Int).Lsh(in, 257)
  1371  	tmp.Mod(tmp, p)
  1372  
  1373  	for i := 0; i < p256Limbs; i++ {
  1374  		if bits := tmp.Bits(); len(bits) > 0 {
  1375  			out[i] = uint32(bits[0]) & bottom29Bits
  1376  		} else {
  1377  			out[i] = 0
  1378  		}
  1379  		tmp.Rsh(tmp, 29)
  1380  
  1381  		i++
  1382  		if i == p256Limbs {
  1383  			break
  1384  		}
  1385  
  1386  		if bits := tmp.Bits(); len(bits) > 0 {
  1387  			out[i] = uint32(bits[0]) & bottom28Bits
  1388  		} else {
  1389  			out[i] = 0
  1390  		}
  1391  		tmp.Rsh(tmp, 28)
  1392  	}
  1393  }
  1394  
  1395  // p256ToBig returns a *big.Int containing the value of in.
  1396  func p256ToBig(in *[p256Limbs]uint32) *big.Int {
  1397  	result := limbsToBig(in)
  1398  	result.Mul(result, p256RInverse)
  1399  	result.Mod(result, p256.P)
  1400  	return result
  1401  }