github.com/euank/go@v0.0.0-20160829210321-495514729181/src/crypto/elliptic/p256_amd64.go (about) 1 // Copyright 2015 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // This file contains the Go wrapper for the constant-time, 64-bit assembly 6 // implementation of P256. The optimizations performed here are described in 7 // detail in: 8 // S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with 9 // 256-bit primes" 10 // http://link.springer.com/article/10.1007%2Fs13389-014-0090-x 11 // https://eprint.iacr.org/2013/816.pdf 12 13 // +build amd64 14 15 package elliptic 16 17 import ( 18 "math/big" 19 "sync" 20 ) 21 22 type ( 23 p256Curve struct { 24 *CurveParams 25 } 26 27 p256Point struct { 28 xyz [12]uint64 29 } 30 ) 31 32 var ( 33 p256 p256Curve 34 p256Precomputed *[37][64 * 8]uint64 35 precomputeOnce sync.Once 36 ) 37 38 func initP256() { 39 // See FIPS 186-3, section D.2.3 40 p256.CurveParams = &CurveParams{Name: "P-256"} 41 p256.P, _ = new(big.Int).SetString("115792089210356248762697446949407573530086143415290314195533631308867097853951", 10) 42 p256.N, _ = new(big.Int).SetString("115792089210356248762697446949407573529996955224135760342422259061068512044369", 10) 43 p256.B, _ = new(big.Int).SetString("5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b", 16) 44 p256.Gx, _ = new(big.Int).SetString("6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296", 16) 45 p256.Gy, _ = new(big.Int).SetString("4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5", 16) 46 p256.BitSize = 256 47 } 48 49 func (curve p256Curve) Params() *CurveParams { 50 return curve.CurveParams 51 } 52 53 // Functions implemented in p256_asm_amd64.s 54 // Montgomery multiplication modulo P256 55 func p256Mul(res, in1, in2 []uint64) 56 57 // Montgomery square modulo P256 58 func p256Sqr(res, in []uint64) 59 60 // Montgomery multiplication by 1 61 func p256FromMont(res, in []uint64) 62 63 // iff cond == 1 val <- -val 64 func p256NegCond(val []uint64, cond int) 65 66 // if cond == 0 res <- b; else res <- a 67 func p256MovCond(res, a, b []uint64, cond int) 68 69 // Endianness swap 70 func p256BigToLittle(res []uint64, in []byte) 71 func p256LittleToBig(res []byte, in []uint64) 72 73 // Constant time table access 74 func p256Select(point, table []uint64, idx int) 75 func p256SelectBase(point, table []uint64, idx int) 76 77 // Montgomery multiplication modulo Ord(G) 78 func p256OrdMul(res, in1, in2 []uint64) 79 80 // Montgomery square modulo Ord(G), repeated n times 81 func p256OrdSqr(res, in []uint64, n int) 82 83 // Point add with in2 being affine point 84 // If sign == 1 -> in2 = -in2 85 // If sel == 0 -> res = in1 86 // if zero == 0 -> res = in2 87 func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int) 88 89 // Point add 90 func p256PointAddAsm(res, in1, in2 []uint64) 91 92 // Point double 93 func p256PointDoubleAsm(res, in []uint64) 94 95 func (curve p256Curve) Inverse(k *big.Int) *big.Int { 96 if k.Sign() < 0 { 97 // This should never happen. 98 k = new(big.Int).Neg(k) 99 } 100 101 if k.Cmp(p256.N) >= 0 { 102 // This should never happen. 103 k = new(big.Int).Mod(k, p256.N) 104 } 105 106 // table will store precomputed powers of x. The four words at index 107 // 4×i store x^(i+1). 108 var table [4 * 15]uint64 109 110 x := make([]uint64, 4) 111 fromBig(x[:], k) 112 // This code operates in the Montgomery domain where R = 2^256 mod n 113 // and n is the order of the scalar field. (See initP256 for the 114 // value.) Elements in the Montgomery domain take the form a×R and 115 // multiplication of x and y in the calculates (x × y × R^-1) mod n. RR 116 // is R×R mod n thus the Montgomery multiplication x and RR gives x×R, 117 // i.e. converts x into the Montgomery domain. 118 RR := []uint64{0x83244c95be79eea2, 0x4699799c49bd6fa6, 0x2845b2392b6bec59, 0x66e12d94f3d95620} 119 p256OrdMul(table[:4], x, RR) 120 121 // Prepare the table, no need in constant time access, because the 122 // power is not a secret. (Entry 0 is never used.) 123 for i := 2; i < 16; i += 2 { 124 p256OrdSqr(table[4*(i-1):], table[4*((i/2)-1):], 1) 125 p256OrdMul(table[4*i:], table[4*(i-1):], table[:4]) 126 } 127 128 x[0] = table[4*14+0] // f 129 x[1] = table[4*14+1] 130 x[2] = table[4*14+2] 131 x[3] = table[4*14+3] 132 133 p256OrdSqr(x, x, 4) 134 p256OrdMul(x, x, table[4*14:4*14+4]) // ff 135 t := make([]uint64, 4, 4) 136 t[0] = x[0] 137 t[1] = x[1] 138 t[2] = x[2] 139 t[3] = x[3] 140 141 p256OrdSqr(x, x, 8) 142 p256OrdMul(x, x, t) // ffff 143 t[0] = x[0] 144 t[1] = x[1] 145 t[2] = x[2] 146 t[3] = x[3] 147 148 p256OrdSqr(x, x, 16) 149 p256OrdMul(x, x, t) // ffffffff 150 t[0] = x[0] 151 t[1] = x[1] 152 t[2] = x[2] 153 t[3] = x[3] 154 155 p256OrdSqr(x, x, 64) // ffffffff0000000000000000 156 p256OrdMul(x, x, t) // ffffffff00000000ffffffff 157 p256OrdSqr(x, x, 32) // ffffffff00000000ffffffff00000000 158 p256OrdMul(x, x, t) // ffffffff00000000ffffffffffffffff 159 160 // Remaining 32 windows 161 expLo := [32]byte{0xb, 0xc, 0xe, 0x6, 0xf, 0xa, 0xa, 0xd, 0xa, 0x7, 0x1, 0x7, 0x9, 0xe, 0x8, 0x4, 0xf, 0x3, 0xb, 0x9, 0xc, 0xa, 0xc, 0x2, 0xf, 0xc, 0x6, 0x3, 0x2, 0x5, 0x4, 0xf} 162 for i := 0; i < 32; i++ { 163 p256OrdSqr(x, x, 4) 164 p256OrdMul(x, x, table[4*(expLo[i]-1):]) 165 } 166 167 // Multiplying by one in the Montgomery domain converts a Montgomery 168 // value out of the domain. 169 one := []uint64{1, 0, 0, 0} 170 p256OrdMul(x, x, one) 171 172 xOut := make([]byte, 32) 173 p256LittleToBig(xOut, x) 174 return new(big.Int).SetBytes(xOut) 175 } 176 177 // fromBig converts a *big.Int into a format used by this code. 178 func fromBig(out []uint64, big *big.Int) { 179 for i := range out { 180 out[i] = 0 181 } 182 183 for i, v := range big.Bits() { 184 out[i] = uint64(v) 185 } 186 } 187 188 // p256GetScalar endian-swaps the big-endian scalar value from in and writes it 189 // to out. If the scalar is equal or greater than the order of the group, it's 190 // reduced modulo that order. 191 func p256GetScalar(out []uint64, in []byte) { 192 n := new(big.Int).SetBytes(in) 193 194 if n.Cmp(p256.N) >= 0 { 195 n.Mod(n, p256.N) 196 } 197 fromBig(out, n) 198 } 199 200 // p256Mul operates in a Montgomery domain with R = 2^256 mod p, where p is the 201 // underlying field of the curve. (See initP256 for the value.) Thus rr here is 202 // R×R mod p. See comment in Inverse about how this is used. 203 var rr = []uint64{0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd} 204 205 func maybeReduceModP(in *big.Int) *big.Int { 206 if in.Cmp(p256.P) < 0 { 207 return in 208 } 209 return new(big.Int).Mod(in, p256.P) 210 } 211 212 func (curve p256Curve) CombinedMult(bigX, bigY *big.Int, baseScalar, scalar []byte) (x, y *big.Int) { 213 scalarReversed := make([]uint64, 4) 214 var r1, r2 p256Point 215 p256GetScalar(scalarReversed, baseScalar) 216 r1.p256BaseMult(scalarReversed) 217 218 p256GetScalar(scalarReversed, scalar) 219 fromBig(r2.xyz[0:4], maybeReduceModP(bigX)) 220 fromBig(r2.xyz[4:8], maybeReduceModP(bigY)) 221 p256Mul(r2.xyz[0:4], r2.xyz[0:4], rr[:]) 222 p256Mul(r2.xyz[4:8], r2.xyz[4:8], rr[:]) 223 224 // This sets r2's Z value to 1, in the Montgomery domain. 225 r2.xyz[8] = 0x0000000000000001 226 r2.xyz[9] = 0xffffffff00000000 227 r2.xyz[10] = 0xffffffffffffffff 228 r2.xyz[11] = 0x00000000fffffffe 229 230 r2.p256ScalarMult(scalarReversed) 231 p256PointAddAsm(r1.xyz[:], r1.xyz[:], r2.xyz[:]) 232 return r1.p256PointToAffine() 233 } 234 235 func (curve p256Curve) ScalarBaseMult(scalar []byte) (x, y *big.Int) { 236 scalarReversed := make([]uint64, 4) 237 p256GetScalar(scalarReversed, scalar) 238 239 var r p256Point 240 r.p256BaseMult(scalarReversed) 241 return r.p256PointToAffine() 242 } 243 244 func (curve p256Curve) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) { 245 scalarReversed := make([]uint64, 4) 246 p256GetScalar(scalarReversed, scalar) 247 248 var r p256Point 249 fromBig(r.xyz[0:4], maybeReduceModP(bigX)) 250 fromBig(r.xyz[4:8], maybeReduceModP(bigY)) 251 p256Mul(r.xyz[0:4], r.xyz[0:4], rr[:]) 252 p256Mul(r.xyz[4:8], r.xyz[4:8], rr[:]) 253 // This sets r2's Z value to 1, in the Montgomery domain. 254 r.xyz[8] = 0x0000000000000001 255 r.xyz[9] = 0xffffffff00000000 256 r.xyz[10] = 0xffffffffffffffff 257 r.xyz[11] = 0x00000000fffffffe 258 259 r.p256ScalarMult(scalarReversed) 260 return r.p256PointToAffine() 261 } 262 263 func (p *p256Point) p256PointToAffine() (x, y *big.Int) { 264 zInv := make([]uint64, 4) 265 zInvSq := make([]uint64, 4) 266 p256Inverse(zInv, p.xyz[8:12]) 267 p256Sqr(zInvSq, zInv) 268 p256Mul(zInv, zInv, zInvSq) 269 270 p256Mul(zInvSq, p.xyz[0:4], zInvSq) 271 p256Mul(zInv, p.xyz[4:8], zInv) 272 273 p256FromMont(zInvSq, zInvSq) 274 p256FromMont(zInv, zInv) 275 276 xOut := make([]byte, 32) 277 yOut := make([]byte, 32) 278 p256LittleToBig(xOut, zInvSq) 279 p256LittleToBig(yOut, zInv) 280 281 return new(big.Int).SetBytes(xOut), new(big.Int).SetBytes(yOut) 282 } 283 284 // p256Inverse sets out to in^-1 mod p. 285 func p256Inverse(out, in []uint64) { 286 var stack [6 * 4]uint64 287 p2 := stack[4*0 : 4*0+4] 288 p4 := stack[4*1 : 4*1+4] 289 p8 := stack[4*2 : 4*2+4] 290 p16 := stack[4*3 : 4*3+4] 291 p32 := stack[4*4 : 4*4+4] 292 293 p256Sqr(out, in) 294 p256Mul(p2, out, in) // 3*p 295 296 p256Sqr(out, p2) 297 p256Sqr(out, out) 298 p256Mul(p4, out, p2) // f*p 299 300 p256Sqr(out, p4) 301 p256Sqr(out, out) 302 p256Sqr(out, out) 303 p256Sqr(out, out) 304 p256Mul(p8, out, p4) // ff*p 305 306 p256Sqr(out, p8) 307 308 for i := 0; i < 7; i++ { 309 p256Sqr(out, out) 310 } 311 p256Mul(p16, out, p8) // ffff*p 312 313 p256Sqr(out, p16) 314 for i := 0; i < 15; i++ { 315 p256Sqr(out, out) 316 } 317 p256Mul(p32, out, p16) // ffffffff*p 318 319 p256Sqr(out, p32) 320 321 for i := 0; i < 31; i++ { 322 p256Sqr(out, out) 323 } 324 p256Mul(out, out, in) 325 326 for i := 0; i < 32*4; i++ { 327 p256Sqr(out, out) 328 } 329 p256Mul(out, out, p32) 330 331 for i := 0; i < 32; i++ { 332 p256Sqr(out, out) 333 } 334 p256Mul(out, out, p32) 335 336 for i := 0; i < 16; i++ { 337 p256Sqr(out, out) 338 } 339 p256Mul(out, out, p16) 340 341 for i := 0; i < 8; i++ { 342 p256Sqr(out, out) 343 } 344 p256Mul(out, out, p8) 345 346 p256Sqr(out, out) 347 p256Sqr(out, out) 348 p256Sqr(out, out) 349 p256Sqr(out, out) 350 p256Mul(out, out, p4) 351 352 p256Sqr(out, out) 353 p256Sqr(out, out) 354 p256Mul(out, out, p2) 355 356 p256Sqr(out, out) 357 p256Sqr(out, out) 358 p256Mul(out, out, in) 359 } 360 361 func (p *p256Point) p256StorePoint(r *[16 * 4 * 3]uint64, index int) { 362 copy(r[index*12:], p.xyz[:]) 363 } 364 365 func boothW5(in uint) (int, int) { 366 var s uint = ^((in >> 5) - 1) 367 var d uint = (1 << 6) - in - 1 368 d = (d & s) | (in & (^s)) 369 d = (d >> 1) + (d & 1) 370 return int(d), int(s & 1) 371 } 372 373 func boothW7(in uint) (int, int) { 374 var s uint = ^((in >> 7) - 1) 375 var d uint = (1 << 8) - in - 1 376 d = (d & s) | (in & (^s)) 377 d = (d >> 1) + (d & 1) 378 return int(d), int(s & 1) 379 } 380 381 func initTable() { 382 p256Precomputed = new([37][64 * 8]uint64) 383 384 basePoint := []uint64{ 385 0x79e730d418a9143c, 0x75ba95fc5fedb601, 0x79fb732b77622510, 0x18905f76a53755c6, 386 0xddf25357ce95560a, 0x8b4ab8e4ba19e45c, 0xd2e88688dd21f325, 0x8571ff1825885d85, 387 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe, 388 } 389 t1 := make([]uint64, 12) 390 t2 := make([]uint64, 12) 391 copy(t2, basePoint) 392 393 zInv := make([]uint64, 4) 394 zInvSq := make([]uint64, 4) 395 for j := 0; j < 64; j++ { 396 copy(t1, t2) 397 for i := 0; i < 37; i++ { 398 // The window size is 7 so we need to double 7 times. 399 if i != 0 { 400 for k := 0; k < 7; k++ { 401 p256PointDoubleAsm(t1, t1) 402 } 403 } 404 // Convert the point to affine form. (Its values are 405 // still in Montgomery form however.) 406 p256Inverse(zInv, t1[8:12]) 407 p256Sqr(zInvSq, zInv) 408 p256Mul(zInv, zInv, zInvSq) 409 410 p256Mul(t1[:4], t1[:4], zInvSq) 411 p256Mul(t1[4:8], t1[4:8], zInv) 412 413 copy(t1[8:12], basePoint[8:12]) 414 // Update the table entry 415 copy(p256Precomputed[i][j*8:], t1[:8]) 416 } 417 if j == 0 { 418 p256PointDoubleAsm(t2, basePoint) 419 } else { 420 p256PointAddAsm(t2, t2, basePoint) 421 } 422 } 423 } 424 425 func (p *p256Point) p256BaseMult(scalar []uint64) { 426 precomputeOnce.Do(initTable) 427 428 wvalue := (scalar[0] << 1) & 0xff 429 sel, sign := boothW7(uint(wvalue)) 430 p256SelectBase(p.xyz[0:8], p256Precomputed[0][0:], sel) 431 p256NegCond(p.xyz[4:8], sign) 432 433 // (This is one, in the Montgomery domain.) 434 p.xyz[8] = 0x0000000000000001 435 p.xyz[9] = 0xffffffff00000000 436 p.xyz[10] = 0xffffffffffffffff 437 p.xyz[11] = 0x00000000fffffffe 438 439 var t0 p256Point 440 // (This is one, in the Montgomery domain.) 441 t0.xyz[8] = 0x0000000000000001 442 t0.xyz[9] = 0xffffffff00000000 443 t0.xyz[10] = 0xffffffffffffffff 444 t0.xyz[11] = 0x00000000fffffffe 445 446 index := uint(6) 447 zero := sel 448 449 for i := 1; i < 37; i++ { 450 if index < 192 { 451 wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0xff 452 } else { 453 wvalue = (scalar[index/64] >> (index % 64)) & 0xff 454 } 455 index += 7 456 sel, sign = boothW7(uint(wvalue)) 457 p256SelectBase(t0.xyz[0:8], p256Precomputed[i][0:], sel) 458 p256PointAddAffineAsm(p.xyz[0:12], p.xyz[0:12], t0.xyz[0:8], sign, sel, zero) 459 zero |= sel 460 } 461 } 462 463 func (p *p256Point) p256ScalarMult(scalar []uint64) { 464 // precomp is a table of precomputed points that stores powers of p 465 // from p^1 to p^16. 466 var precomp [16 * 4 * 3]uint64 467 var t0, t1, t2, t3 p256Point 468 469 // Prepare the table 470 p.p256StorePoint(&precomp, 0) // 1 471 472 p256PointDoubleAsm(t0.xyz[:], p.xyz[:]) 473 p256PointDoubleAsm(t1.xyz[:], t0.xyz[:]) 474 p256PointDoubleAsm(t2.xyz[:], t1.xyz[:]) 475 p256PointDoubleAsm(t3.xyz[:], t2.xyz[:]) 476 t0.p256StorePoint(&precomp, 1) // 2 477 t1.p256StorePoint(&precomp, 3) // 4 478 t2.p256StorePoint(&precomp, 7) // 8 479 t3.p256StorePoint(&precomp, 15) // 16 480 481 p256PointAddAsm(t0.xyz[:], t0.xyz[:], p.xyz[:]) 482 p256PointAddAsm(t1.xyz[:], t1.xyz[:], p.xyz[:]) 483 p256PointAddAsm(t2.xyz[:], t2.xyz[:], p.xyz[:]) 484 t0.p256StorePoint(&precomp, 2) // 3 485 t1.p256StorePoint(&precomp, 4) // 5 486 t2.p256StorePoint(&precomp, 8) // 9 487 488 p256PointDoubleAsm(t0.xyz[:], t0.xyz[:]) 489 p256PointDoubleAsm(t1.xyz[:], t1.xyz[:]) 490 t0.p256StorePoint(&precomp, 5) // 6 491 t1.p256StorePoint(&precomp, 9) // 10 492 493 p256PointAddAsm(t2.xyz[:], t0.xyz[:], p.xyz[:]) 494 p256PointAddAsm(t1.xyz[:], t1.xyz[:], p.xyz[:]) 495 t2.p256StorePoint(&precomp, 6) // 7 496 t1.p256StorePoint(&precomp, 10) // 11 497 498 p256PointDoubleAsm(t0.xyz[:], t0.xyz[:]) 499 p256PointDoubleAsm(t2.xyz[:], t2.xyz[:]) 500 t0.p256StorePoint(&precomp, 11) // 12 501 t2.p256StorePoint(&precomp, 13) // 14 502 503 p256PointAddAsm(t0.xyz[:], t0.xyz[:], p.xyz[:]) 504 p256PointAddAsm(t2.xyz[:], t2.xyz[:], p.xyz[:]) 505 t0.p256StorePoint(&precomp, 12) // 13 506 t2.p256StorePoint(&precomp, 14) // 15 507 508 // Start scanning the window from top bit 509 index := uint(254) 510 var sel, sign int 511 512 wvalue := (scalar[index/64] >> (index % 64)) & 0x3f 513 sel, _ = boothW5(uint(wvalue)) 514 515 p256Select(p.xyz[0:12], precomp[0:], sel) 516 zero := sel 517 518 for index > 4 { 519 index -= 5 520 p256PointDoubleAsm(p.xyz[:], p.xyz[:]) 521 p256PointDoubleAsm(p.xyz[:], p.xyz[:]) 522 p256PointDoubleAsm(p.xyz[:], p.xyz[:]) 523 p256PointDoubleAsm(p.xyz[:], p.xyz[:]) 524 p256PointDoubleAsm(p.xyz[:], p.xyz[:]) 525 526 if index < 192 { 527 wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x3f 528 } else { 529 wvalue = (scalar[index/64] >> (index % 64)) & 0x3f 530 } 531 532 sel, sign = boothW5(uint(wvalue)) 533 534 p256Select(t0.xyz[0:], precomp[0:], sel) 535 p256NegCond(t0.xyz[4:8], sign) 536 p256PointAddAsm(t1.xyz[:], p.xyz[:], t0.xyz[:]) 537 p256MovCond(t1.xyz[0:12], t1.xyz[0:12], p.xyz[0:12], sel) 538 p256MovCond(p.xyz[0:12], t1.xyz[0:12], t0.xyz[0:12], zero) 539 zero |= sel 540 } 541 542 p256PointDoubleAsm(p.xyz[:], p.xyz[:]) 543 p256PointDoubleAsm(p.xyz[:], p.xyz[:]) 544 p256PointDoubleAsm(p.xyz[:], p.xyz[:]) 545 p256PointDoubleAsm(p.xyz[:], p.xyz[:]) 546 p256PointDoubleAsm(p.xyz[:], p.xyz[:]) 547 548 wvalue = (scalar[0] << 1) & 0x3f 549 sel, sign = boothW5(uint(wvalue)) 550 551 p256Select(t0.xyz[0:], precomp[0:], sel) 552 p256NegCond(t0.xyz[4:8], sign) 553 p256PointAddAsm(t1.xyz[:], p.xyz[:], t0.xyz[:]) 554 p256MovCond(t1.xyz[0:12], t1.xyz[0:12], p.xyz[0:12], sel) 555 p256MovCond(p.xyz[0:12], t1.xyz[0:12], t0.xyz[0:12], zero) 556 }