// Source: github.com/emmansun/gmsm@v0.29.1/internal/sm2ec/sm2p256_asm.go

// It is by standing on the shoulders of giants.

// This file contains the Go wrapper for the constant-time, 64-bit assembly
// implementation of P256. The optimizations performed here are described in
// detail in:
// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
// 256-bit primes"
// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
// https://eprint.iacr.org/2013/816.pdf

//go:build (amd64 || arm64 || s390x || ppc64le) && !purego

package sm2ec

import (
	_ "embed"
	"errors"
	"math/bits"
	"runtime"
	"unsafe"

	"golang.org/x/sys/cpu"
)

// p256Element is a base field element in [0, P-1] in the Montgomery domain
// (with R = 2²⁵⁶), stored as four 64-bit limbs in little-endian limb order.
type p256Element [4]uint64

// p256One is one in the Montgomery domain.
var p256One = p256Element{0x0000000000000001, 0x00000000ffffffff, 0x0000000000000000, 0x0000000100000000}

// p256Zero is the zero field element. It is the Z coordinate that
// NewSM2P256Point gives the point at infinity, and the value isInfinity
// compares Z against.
var p256Zero = p256Element{}

// p256P is 2^256 - 2^224 - 2^96 + 2^64 - 1.
var p256P = p256Element{0xffffffffffffffff, 0xffffffff00000000,
	0xffffffffffffffff, 0xfffffffeffffffff}

// SM2P256Point is an SM2 P-256 point. The zero value should not be assumed to
// be valid (although it is in this implementation).
type SM2P256Point struct {
	// (X:Y:Z) are Jacobian coordinates where x = X/Z² and y = Y/Z³. The point
	// at infinity can be represented by any set of coordinates with Z = 0.
	x, y, z p256Element
}

// NewSM2P256Point returns a new SM2P256Point representing the point at infinity.
func NewSM2P256Point() *SM2P256Point {
	return &SM2P256Point{
		x: p256One, y: p256One, z: p256Zero,
	}
}

// SetGenerator sets p to the canonical generator and returns p.
53 func (p *SM2P256Point) SetGenerator() *SM2P256Point { 54 p.x = p256Element{0x61328990f418029e, 0x3e7981eddca6c050, 55 0xd6a1ed99ac24c3c3, 0x91167a5ee1c13b05} 56 p.y = p256Element{0xc1354e593c2d0ddd, 0xc1f5e5788d3295fa, 57 0x8d4cfb066e2a48f8, 0x63cd65d481d735bd} 58 p.z = p256One 59 return p 60 } 61 62 // Set sets p = q and returns p. 63 func (p *SM2P256Point) Set(q *SM2P256Point) *SM2P256Point { 64 p.x, p.y, p.z = q.x, q.y, q.z 65 return p 66 } 67 68 const p256ElementLength = 32 69 const p256UncompressedLength = 1 + 2*p256ElementLength 70 const p256CompressedLength = 1 + p256ElementLength 71 72 // SetBytes sets p to the compressed, uncompressed, or infinity value encoded in 73 // b, as specified in SEC 1, Version 2.0, Section 2.3.4. If the point is not on 74 // the curve, it returns nil and an error, and the receiver is unchanged. 75 // Otherwise, it returns p. 76 func (p *SM2P256Point) SetBytes(b []byte) (*SM2P256Point, error) { 77 // p256Mul operates in the Montgomery domain with R = 2²⁵⁶ mod p. Thus rr 78 // here is R in the Montgomery domain, or R×R mod p. See comment in 79 // P256OrdInverse about how this is used. 80 rr := p256Element{0x0000000200000003, 0x00000002ffffffff, 81 0x0000000100000001, 0x0000000400000002} 82 83 switch { 84 // Point at infinity. 85 case len(b) == 1 && b[0] == 0: 86 return p.Set(NewSM2P256Point()), nil 87 88 // Uncompressed form. 89 case len(b) == p256UncompressedLength && b[0] == 4: 90 var r SM2P256Point 91 p256BigToLittle(&r.x, (*[32]byte)(b[1:33])) 92 p256BigToLittle(&r.y, (*[32]byte)(b[33:65])) 93 if p256LessThanP(&r.x) == 0 || p256LessThanP(&r.y) == 0 { 94 return nil, errors.New("invalid P256 element encoding") 95 } 96 p256Mul(&r.x, &r.x, &rr) 97 p256Mul(&r.y, &r.y, &rr) 98 if err := p256CheckOnCurve(&r.x, &r.y); err != nil { 99 return nil, err 100 } 101 r.z = p256One 102 return p.Set(&r), nil 103 104 // Compressed form. 
105 case len(b) == p256CompressedLength && (b[0] == 2 || b[0] == 3): 106 var r SM2P256Point 107 p256BigToLittle(&r.x, (*[32]byte)(b[1:33])) 108 if p256LessThanP(&r.x) == 0 { 109 return nil, errors.New("invalid P256 element encoding") 110 } 111 p256Mul(&r.x, &r.x, &rr) 112 113 // y² = x³ - 3x + b 114 p256Polynomial(&r.y, &r.x) 115 if !p256Sqrt(&r.y, &r.y) { 116 return nil, errors.New("invalid P256 compressed point encoding") 117 } 118 119 // Select the positive or negative root, as indicated by the least 120 // significant bit, based on the encoding type byte. 121 yy := new(p256Element) 122 p256FromMont(yy, &r.y) 123 cond := int(yy[0]&1) ^ int(b[0]&1) 124 p256NegCond(&r.y, cond) 125 126 r.z = p256One 127 return p.Set(&r), nil 128 129 default: 130 return nil, errors.New("invalid P256 point encoding") 131 } 132 } 133 134 // p256Polynomial sets y2 to x³ - 3x + b, and returns y2. 135 func p256Polynomial(y2, x *p256Element) *p256Element { 136 x3 := new(p256Element) 137 p256Sqr(x3, x, 1) 138 p256Mul(x3, x3, x) 139 140 threeX := new(p256Element) 141 p256Add(threeX, x, x) 142 p256Add(threeX, threeX, x) 143 p256NegCond(threeX, 1) 144 145 p256B := &p256Element{0x90d230632bc0dd42, 0x71cf379ae9b537ab, 146 0x527981505ea51c3c, 0x240fe188ba20e2c8} 147 148 p256Add(x3, x3, threeX) 149 p256Add(x3, x3, p256B) 150 151 *y2 = *x3 152 return y2 153 } 154 155 func p256CheckOnCurve(x, y *p256Element) error { 156 // y² = x³ - 3x + b 157 rhs := p256Polynomial(new(p256Element), x) 158 lhs := new(p256Element) 159 p256Sqr(lhs, y, 1) 160 if p256Equal(lhs, rhs) != 1 { 161 return errors.New("point not on SM2 P256 curve") 162 } 163 return nil 164 } 165 166 // p256LessThanP returns 1 if x < p, and 0 otherwise. Note that a p256Element is 167 // not allowed to be equal to or greater than p, so if this function returns 0 168 // then x is invalid. 
func p256LessThanP(x *p256Element) int {
	// Compute x - p limb by limb and keep only the borrow: the final borrow
	// is 1 exactly when x < p.
	var b uint64
	_, b = bits.Sub64(x[0], p256P[0], b)
	_, b = bits.Sub64(x[1], p256P[1], b)
	_, b = bits.Sub64(x[2], p256P[2], b)
	_, b = bits.Sub64(x[3], p256P[3], b)
	return int(b)
}

// p256Add sets res = x + y, with one conditional subtraction of p applied by
// mask rather than by branch. res may be the same element as x or y, since
// res is only written after both inputs have been fully read.
func p256Add(res, x, y *p256Element) {
	var c, b uint64
	// t1 = x + y, with carry-out c.
	t1 := make([]uint64, 4)
	t1[0], c = bits.Add64(x[0], y[0], 0)
	t1[1], c = bits.Add64(x[1], y[1], c)
	t1[2], c = bits.Add64(x[2], y[2], c)
	t1[3], c = bits.Add64(x[3], y[3], c)
	// t2 = t1 - p, with borrow-out b.
	t2 := make([]uint64, 4)
	t2[0], b = bits.Sub64(t1[0], p256P[0], 0)
	t2[1], b = bits.Sub64(t1[1], p256P[1], b)
	t2[2], b = bits.Sub64(t1[2], p256P[2], b)
	t2[3], b = bits.Sub64(t1[3], p256P[3], b)
	// Three options:
	//   - x+y < p
	//     then c is 0, b is 1, and t1 is correct
	//   - p <= x+y < 2^256
	//     then c is 0, b is 0, and t2 is correct
	//   - 2^256 <= x+y
	//     then c is 1, b is 1, and t2 is correct
	//
	// So t2 is wanted exactly when c == b; t2Mask is then all ones, and all
	// zeros otherwise.
	t2Mask := (c ^ b) - 1
	res[0] = (t1[0] & ^t2Mask) | (t2[0] & t2Mask)
	res[1] = (t1[1] & ^t2Mask) | (t2[1] & t2Mask)
	res[2] = (t1[2] & ^t2Mask) | (t2[2] & t2Mask)
	res[3] = (t1[3] & ^t2Mask) | (t2[3] & t2Mask)
}

// p256Sqrt sets e to a square root of x. If x is not a square, p256Sqrt returns
// false and e is unchanged. e and x can overlap: e is written only after the
// last read of x.
func p256Sqrt(e, x *p256Element) (isSquare bool) {
	z, t0, t1, t2, t3, t4 := new(p256Element), new(p256Element), new(p256Element), new(p256Element), new(p256Element), new(p256Element)

	// Since p = 3 mod 4, exponentiation by (p + 1) / 4 yields a square root candidate.
	//
	// The sequence of 13 multiplications and 253 squarings is derived from the
	// following addition chain generated with github.com/mmcloughlin/addchain v0.4.0.
	//
	//	_10      = 2*1
	//	_11      = 1 + _10
	//	_110     = 2*_11
	//	_111     = 1 + _110
	//	_1110    = 2*_111
	//	_1111    = 1 + _1110
	//	_11110   = 2*_1111
	//	_111100  = 2*_11110
	//	_1111000 = 2*_111100
	//	i19      = (_1111000 << 3 + _111100) << 5 + _1111000
	//	x31      = (i19 << 2 + _11110) << 14 + i19 + _111
	//	i42      = x31 << 4
	//	i73      = i42 << 31
	//	i74      = i42 + i73
	//	i171     = (i73 << 32 + i74) << 62 + i74 + _1111
	//	return     (i171 << 32 + 1) << 62
	//
	// In the trailing comments below, Square/Mul name the field operation each
	// assembly call performs; p256Sqr(res, in, n) squares n times in one call.
	p256Sqr(z, x, 1)    // z.Square(x)
	p256Mul(z, x, z)    // z.Mul(x, z)
	p256Sqr(z, z, 1)    // z.Square(z)
	p256Mul(t0, x, z)   // t0.Mul(x, z)
	p256Sqr(z, t0, 1)   // z.Square(t0)
	p256Mul(z, x, z)    // z.Mul(x, z)
	p256Sqr(t2, z, 1)   // t2.Square(z)
	p256Sqr(t3, t2, 1)  // t3.Square(t2)
	p256Sqr(t1, t3, 1)  // t1.Square(t3)
	p256Sqr(t4, t1, 3)  // t4.Square(t1), then t4.Square(t4) twice more
	p256Mul(t3, t3, t4) // t3.Mul(t3, t4)
	p256Sqr(t3, t3, 5)  // t3.Square(t3), 5 times
	p256Mul(t1, t1, t3) // t1.Mul(t1, t3)
	p256Sqr(t3, t1, 2)  // t3.Square(t1), then t3.Square(t3) once more
	p256Mul(t2, t2, t3) // t2.Mul(t2, t3)
	p256Sqr(t2, t2, 14) // t2.Square(t2), 14 times
	p256Mul(t1, t1, t2) // t1.Mul(t1, t2)

	p256Mul(t0, t0, t1) // t0.Mul(t0, t1)
	p256Sqr(t0, t0, 4)  // t0.Square(t0), 4 times
	p256Sqr(t1, t0, 31) // t1.Square(t0), then t1.Square(t1) 30 more times
	p256Mul(t0, t0, t1) // t0.Mul(t0, t1)
	p256Sqr(t1, t1, 32) // t1.Square(t1), 32 times

	p256Mul(t1, t0, t1) // t1.Mul(t0, t1)
	p256Sqr(t1, t1, 62) // t1.Square(t1), 62 times
	p256Mul(t0, t0, t1) // t0.Mul(t0, t1)
	p256Mul(z, z, t0)   // z.Mul(z, t0)
	p256Sqr(z, z, 32)   // z.Square(z), 32 times
	p256Mul(z, z, x)    // z.Mul(x, z)
	p256Sqr(z, z, 62)   // z.Square(z), 62 times

	// z is only a candidate: accept it only if squaring it gives back x.
	p256Sqr(t1, z, 1)
	if p256Equal(t1, x) != 1 {
		return false
	}
	*e = *z
	return true
}

// The following assembly functions are implemented in p256_asm_*.s

// amd64 assembly uses ADCX/ADOX/MULX
var supportBMI2 = cpu.X86.HasADX && cpu.X86.HasBMI2

// supportAVX2 records whether the CPU reports AVX2.
// NOTE(review): not read by any Go code visible here; presumably consulted by
// the amd64 assembly — confirm.
var supportAVX2 = cpu.X86.HasAVX2

// Montgomery multiplication. Sets res = in1 * in2 * R⁻¹ mod p.
//
//go:noescape
func p256Mul(res, in1, in2 *p256Element)

// Montgomery square, repeated n times (n >= 1).
//
//go:noescape
func p256Sqr(res, in *p256Element, n int)

// Montgomery multiplication by R⁻¹, or 1 outside the domain.
// Sets res = in * R⁻¹, bringing res out of the Montgomery domain.
//
//go:noescape
func p256FromMont(res, in *p256Element)

// If cond is not 0, sets val = -val mod p.
//
//go:noescape
func p256NegCond(val *p256Element, cond int)

// If cond is 0, sets res = b, otherwise sets res = a.
//
//go:noescape
func p256MovCond(res, a, b *SM2P256Point, cond int)

// p256BigToLittle loads the 32-byte big-endian value in into the limbs of res.
// NOTE(review): semantics inferred from the name and from its use on SEC 1
// coordinate bytes in SetBytes; confirm against the assembly.
//
//go:noescape
func p256BigToLittle(res *p256Element, in *[32]byte)

// p256LittleToBig stores the limbs of in into res as 32 big-endian bytes.
// NOTE(review): semantics inferred from the name and from its use when
// producing SEC 1 encodings; confirm against the assembly.
//
//go:noescape
func p256LittleToBig(res *[32]byte, in *p256Element)

// p256OrdBigToLittle is the scalar counterpart of p256BigToLittle.
// NOTE(review): inferred from the name and its use on 32-byte big-endian
// scalars; confirm against the assembly.
//
//go:noescape
func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)

// p256OrdLittleToBig is the scalar counterpart of p256LittleToBig.
// NOTE(review): inferred from the name; confirm against the assembly.
//
//go:noescape
func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)

// p256OrdReduce ensures s is in the range [0, ord(G)-1].
//
//go:noescape
func p256OrdReduce(s *p256OrdElement)

// p256Table is a table of the first 32 multiples of a point. Points are stored
// at an index offset of -1 so [8]P is at index 7, P is at 0, and [32]P is at 31.
// [0]P is the point at infinity and it's not stored.
type p256Table [32]SM2P256Point

// p256Select sets res to the point at index idx in the table.
// idx must be in [0, limit-1]. It executes in constant time.
//
// NOTE(review): p256ScalarMult calls this with limit = 32 and with idx taken
// from boothW6, whose digit ranges over 0 .. 32. That suggests idx is really a
// 1-based multiple with 0 selecting no entry, rather than a 0-based index in
// [0, limit-1] — confirm against the assembly.
//
//go:noescape
func p256Select(res *SM2P256Point, table *p256Table, idx, limit int)

// p256AffinePoint is a point in affine coordinates (x, y). x and y are still
// Montgomery domain elements. The point can't be the point at infinity.
type p256AffinePoint struct {
	x, y p256Element
}

// p256AffineTable is a table of the first 32 multiples of a point. Points are
// stored at an index offset of -1 like in p256Table, and [0]P is not stored.
type p256AffineTable [32]p256AffinePoint

// p256Precomputed is a series of precomputed multiples of G, the canonical
// generator. The first p256AffineTable contains multiples of G. The second one
// multiples of [2⁶]G, the third one of [2¹²]G, and so on, where each successive
// table is the previous table doubled six times. Six is the width of the
// window used in p256BaseMult, and having each table already pre-doubled lets
// us avoid the doublings between windows entirely. This table MUST NOT be
// modified, as it aliases into p256PrecomputedEmbed below.
var p256Precomputed *[43]p256AffineTable

// p256PrecomputedEmbed holds the raw contents of p256_asm_table.bin, embedded
// at build time. init points p256Precomputed at (or derives it from) this data.
//
//go:embed p256_asm_table.bin
var p256PrecomputedEmbed string

// leUint64 decodes the first eight bytes of b as a little-endian uint64.
// It panics if len(b) < 8.
func leUint64(b []byte) uint64 {
	_ = b[7] // bounds check hint to compiler; see golang.org/issue/14808
	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
}

// init makes p256Precomputed usable. On every architecture except s390x the
// embedded bytes are reinterpreted in place as the table, without copying,
// which is why the table must never be written to. On s390x each 8-byte group
// is first decoded with leUint64 into a fresh array and the table points at
// that copy instead.
// NOTE(review): presumably the file stores limbs little-endian and s390x is
// the only big-endian target in the build constraint — confirm.
func init() {
	// The first word of a string header is its data pointer; view it as such.
	p256PrecomputedPtr := (*unsafe.Pointer)(unsafe.Pointer(&p256PrecomputedEmbed))
	if runtime.GOARCH == "s390x" {
		// 43 tables × 32 points × 2 coordinates × 4 limbs.
		var newTable [43 * 32 * 2 * 4]uint64
		for i, x := range (*[43 * 32 * 2 * 4][8]byte)(*p256PrecomputedPtr) {
			newTable[i] = leUint64(x[:])
		}
		newTablePtr := unsafe.Pointer(&newTable)
		p256PrecomputedPtr = &newTablePtr
	}
	p256Precomputed = (*[43]p256AffineTable)(*p256PrecomputedPtr)
}

// p256SelectAffine sets res to the point at index idx in the table.
// idx must be in [0, 31]. It executes in constant time.
//
// NOTE(review): p256BaseMult passes idx straight from boothW6, whose digit
// ranges over 0 .. 32, so idx looks like a 1-based multiple with 0 selecting
// no entry rather than a 0-based index — confirm against the assembly.
//
//go:noescape
func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)

// Point addition with an affine point and constant time conditions.
// If zero is 0, sets res = in2. If sel is 0, sets res = in1.
// If sign is not 0, sets res = in1 + -in2. Otherwise, sets res = in1 + in2
//
//go:noescape
func p256PointAddAffineAsm(res, in1 *SM2P256Point, in2 *p256AffinePoint, sign, sel, zero int)

// Point addition. Sets res = in1 + in2. Returns one if the two input points
// were equal and zero otherwise. If in1 or in2 are the point at infinity, res
// and the return value are undefined.
//
//go:noescape
func p256PointAddAsm(res, in1, in2 *SM2P256Point) int

// Point doubling. Sets res = in + in. in can be the point at infinity.
//
//go:noescape
func p256PointDoubleAsm(res, in *SM2P256Point)

// Point doubling 6 times. in can be the point at infinity.
433 // 434 //go:noescape 435 func p256PointDouble6TimesAsm(res, in *SM2P256Point) 436 437 // p256OrdElement is a P-256 scalar field element in [0, ord(G)-1] in the 438 // Montgomery domain (with R 2²⁵⁶) as four uint64 limbs in little-endian order. 439 type p256OrdElement [4]uint64 440 441 // Add sets q = p1 + p2, and returns q. The points may overlap. 442 func (q *SM2P256Point) Add(r1, r2 *SM2P256Point) *SM2P256Point { 443 var sum, double SM2P256Point 444 r1IsInfinity := r1.isInfinity() 445 r2IsInfinity := r2.isInfinity() 446 pointsEqual := p256PointAddAsm(&sum, r1, r2) 447 p256PointDoubleAsm(&double, r1) 448 p256MovCond(&sum, &double, &sum, pointsEqual) 449 p256MovCond(&sum, r1, &sum, r2IsInfinity) 450 p256MovCond(&sum, r2, &sum, r1IsInfinity) 451 return q.Set(&sum) 452 } 453 454 // Double sets q = p + p, and returns q. The points may overlap. 455 func (q *SM2P256Point) Double(p *SM2P256Point) *SM2P256Point { 456 var double SM2P256Point 457 p256PointDoubleAsm(&double, p) 458 return q.Set(&double) 459 } 460 461 // ScalarBaseMult sets r = scalar * generator, where scalar is a 32-byte big 462 // endian value, and returns r. If scalar is not 32 bytes long, ScalarBaseMult 463 // returns an error and the receiver is unchanged. 464 func (r *SM2P256Point) ScalarBaseMult(scalar []byte) (*SM2P256Point, error) { 465 if len(scalar) != 32 { 466 return nil, errors.New("invalid scalar length") 467 } 468 scalarReversed := new(p256OrdElement) 469 p256OrdBigToLittle(scalarReversed, (*[32]byte)(scalar)) 470 p256OrdReduce(scalarReversed) 471 r.p256BaseMult(scalarReversed) 472 return r, nil 473 } 474 475 // ScalarMult sets r = scalar * q, where scalar is a 32-byte big endian value, 476 // and returns r. If scalar is not 32 bytes long, ScalarBaseMult returns an 477 // error and the receiver is unchanged. 
478 func (r *SM2P256Point) ScalarMult(q *SM2P256Point, scalar []byte) (*SM2P256Point, error) { 479 if len(scalar) != 32 { 480 return nil, errors.New("invalid scalar length") 481 } 482 scalarReversed := new(p256OrdElement) 483 p256OrdBigToLittle(scalarReversed, (*[32]byte)(scalar)) 484 p256OrdReduce(scalarReversed) 485 r.Set(q).p256ScalarMult(scalarReversed) 486 return r, nil 487 } 488 489 // uint64IsZero returns 1 if x is zero and zero otherwise. 490 func uint64IsZero(x uint64) int { 491 x = ^x 492 x &= x >> 32 493 x &= x >> 16 494 x &= x >> 8 495 x &= x >> 4 496 x &= x >> 2 497 x &= x >> 1 498 return int(x & 1) 499 } 500 501 // p256Equal returns 1 if a and b are equal and 0 otherwise. 502 func p256Equal(a, b *p256Element) int { 503 var acc uint64 504 for i := range a { 505 acc |= a[i] ^ b[i] 506 } 507 return uint64IsZero(acc) 508 } 509 510 // isInfinity returns 1 if p is the point at infinity and 0 otherwise. 511 func (p *SM2P256Point) isInfinity() int { 512 return p256Equal(&p.z, &p256Zero) 513 } 514 515 // Bytes returns the uncompressed or infinity encoding of p, as specified in 516 // SEC 1, Version 2.0, Section 2.3.3. Note that the encoding of the point at 517 // infinity is shorter than all other encodings. 518 func (p *SM2P256Point) Bytes() []byte { 519 // This function is outlined to make the allocations inline in the caller 520 // rather than happen on the heap. 521 var out [p256UncompressedLength]byte 522 return p.bytes(&out) 523 } 524 525 func (p *SM2P256Point) bytes(out *[p256UncompressedLength]byte) []byte { 526 // The proper representation of the point at infinity is a single zero byte. 527 if p.isInfinity() == 1 { 528 return append(out[:0], 0) 529 } 530 531 x, y := new(p256Element), new(p256Element) 532 p.affineFromMont(x, y) 533 534 out[0] = 4 // Uncompressed form. 
535 p256LittleToBig((*[32]byte)(out[1:33]), x) 536 p256LittleToBig((*[32]byte)(out[33:65]), y) 537 538 return out[:] 539 } 540 541 // affineFromMont sets (x, y) to the affine coordinates of p, converted out of the 542 // Montgomery domain. 543 func (p *SM2P256Point) affineFromMont(x, y *p256Element) { 544 p256Inverse(y, &p.z) 545 p256Sqr(x, y, 1) 546 p256Mul(y, y, x) 547 548 p256Mul(x, &p.x, x) 549 p256Mul(y, &p.y, y) 550 551 p256FromMont(x, x) 552 p256FromMont(y, y) 553 } 554 555 // BytesX returns the encoding of the x-coordinate of p, as specified in SEC 1, 556 // Version 2.0, Section 2.3.5, or an error if p is the point at infinity. 557 func (p *SM2P256Point) BytesX() ([]byte, error) { 558 // This function is outlined to make the allocations inline in the caller 559 // rather than happen on the heap. 560 var out [p256ElementLength]byte 561 return p.bytesX(&out) 562 } 563 564 func (p *SM2P256Point) bytesX(out *[p256ElementLength]byte) ([]byte, error) { 565 if p.isInfinity() == 1 { 566 return nil, errors.New("SM2 point is the point at infinity") 567 } 568 569 x := new(p256Element) 570 p256Inverse(x, &p.z) 571 p256Sqr(x, x, 1) 572 p256Mul(x, &p.x, x) 573 p256FromMont(x, x) 574 p256LittleToBig((*[32]byte)(out[:]), x) 575 576 return out[:], nil 577 } 578 579 // BytesCompressed returns the compressed or infinity encoding of p, as 580 // specified in SEC 1, Version 2.0, Section 2.3.3. Note that the encoding of the 581 // point at infinity is shorter than all other encodings. 582 func (p *SM2P256Point) BytesCompressed() []byte { 583 // This function is outlined to make the allocations inline in the caller 584 // rather than happen on the heap. 
585 var out [p256CompressedLength]byte 586 return p.bytesCompressed(&out) 587 } 588 589 func (p *SM2P256Point) bytesCompressed(out *[p256CompressedLength]byte) []byte { 590 if p.isInfinity() == 1 { 591 return append(out[:0], 0) 592 } 593 594 x, y := new(p256Element), new(p256Element) 595 p.affineFromMont(x, y) 596 597 out[0] = 2 | byte(y[0]&1) 598 p256LittleToBig((*[32]byte)(out[1:33]), x) 599 600 return out[:] 601 } 602 603 // Select sets q to p1 if cond == 1, and to p2 if cond == 0. 604 func (q *SM2P256Point) Select(p1, p2 *SM2P256Point, cond int) *SM2P256Point { 605 p256MovCond(q, p1, p2, cond) 606 return q 607 } 608 609 // p256Inverse sets out to in⁻¹ mod p. If in is zero, out will be zero. 610 func p256Inverse(out, in *p256Element) { 611 // Inversion is calculated through exponentiation by p - 2, per Fermat's 612 // little theorem. 613 // 614 // The sequence of 14 multiplications and 255 squarings is derived from the 615 // following addition chain generated with github.com/mmcloughlin/addchain 616 // v0.4.0. 617 // 618 // _10 = 2*1 619 // _11 = 1 + _10 620 // _110 = 2*_11 621 // _111 = 1 + _110 622 // _111000 = _111 << 3 623 // _111111 = _111 + _111000 624 // _1111110 = 2*_111111 625 // _1111111 = 1 + _1111110 626 // x12 = _1111110 << 5 + _111111 627 // x24 = x12 << 12 + x12 628 // x31 = x24 << 7 + _1111111 629 // i39 = x31 << 2 630 // i68 = i39 << 29 631 // x62 = x31 + i68 632 // i71 = i68 << 2 633 // x64 = i39 + i71 + _11 634 // i265 = ((i71 << 32 + x64) << 64 + x64) << 94 635 // return (x62 + i265) << 2 + 1 636 // Allocate Temporaries. 
637 var ( 638 t0 = new(p256Element) 639 t1 = new(p256Element) 640 t2 = new(p256Element) 641 ) 642 // Step 1: z = x^0x2 643 //z.Sqr(x) 644 p256Sqr(out, in, 1) 645 646 // Step 2: t0 = x^0x3 647 // t0.Mul(x, z) 648 p256Mul(t0, in, out) 649 650 // Step 3: z = x^0x6 651 // z.Sqr(t0) 652 p256Sqr(out, t0, 1) 653 654 // Step 4: z = x^0x7 655 // z.Mul(x, z) 656 p256Mul(out, in, out) 657 658 // Step 7: t1 = x^0x38 659 //t1.Sqr(z) 660 //for s := 1; s < 3; s++ { 661 // t1.Sqr(t1) 662 //} 663 p256Sqr(t1, out, 3) 664 665 // Step 8: t1 = x^0x3f 666 //t1.Mul(z, t1) 667 p256Mul(t1, out, t1) 668 669 // Step 9: t2 = x^0x7e 670 //t2.Sqr(t1) 671 p256Sqr(t2, t1, 1) 672 673 // Step 10: z = x^0x7f 674 //z.Mul(x, t2) 675 p256Mul(out, in, t2) 676 677 // Step 15: t2 = x^0xfc0 678 //for s := 0; s < 5; s++ { 679 // t2.Sqr(t2) 680 //} 681 p256Sqr(t2, t2, 5) 682 683 // Step 16: t1 = x^0xfff 684 //t1.Mul(t1, t2) 685 p256Mul(t1, t1, t2) 686 687 // Step 28: t2 = x^0xfff000 688 //t2.Sqr(t1) 689 //for s := 1; s < 12; s++ { 690 // t2.Sqr(t2) 691 //} 692 p256Sqr(t2, t1, 12) 693 694 // Step 29: t1 = x^0xffffff 695 //t1.Mul(t1, t2) 696 p256Mul(t1, t1, t2) 697 698 // Step 36: t1 = x^0x7fffff80 699 //for s := 0; s < 7; s++ { 700 // t1.Sqr(t1) 701 //} 702 p256Sqr(t1, t1, 7) 703 704 // Step 37: z = x^0x7fffffff 705 //z.Mul(z, t1) 706 p256Mul(out, out, t1) 707 708 // Step 39: t2 = x^0x1fffffffc 709 //t2.Sqr(z) 710 //for s := 1; s < 2; s++ { 711 // t2.Sqr(t2) 712 //} 713 p256Sqr(t2, out, 2) 714 715 // Step 68: t1 = x^0x3fffffff80000000 716 //t1.Sqr(t2) 717 //for s := 1; s < 29; s++ { 718 // t1.Sqr(t1) 719 //} 720 p256Sqr(t1, t2, 29) 721 722 // Step 69: z = x^0x3fffffffffffffff 723 //z.Mul(z, t1) 724 p256Mul(out, out, t1) 725 726 // Step 71: t1 = x^0xfffffffe00000000 727 //for s := 0; s < 2; s++ { 728 // t1.Sqr(t1) 729 //} 730 p256Sqr(t1, t1, 2) 731 732 // Step 72: t2 = x^0xfffffffffffffffc 733 //t2.Mul(t2, t1) 734 p256Mul(t2, t2, t1) 735 736 // Step 73: t0 = x^0xffffffffffffffff 737 //t0.Mul(t0, t2) 738 
p256Mul(t0, t0, t2) 739 740 // Step 105: t1 = x^0xfffffffe0000000000000000 741 //for s := 0; s < 32; s++ { 742 // t1.Sqr(t1) 743 //} 744 p256Sqr(t1, t1, 32) 745 746 // Step 106: t1 = x^0xfffffffeffffffffffffffff 747 //t1.Mul(t0, t1) 748 p256Mul(t1, t0, t1) 749 750 // Step 170: t1 = x^0xfffffffeffffffffffffffff0000000000000000 751 //for s := 0; s < 64; s++ { 752 // t1.Sqr(t1) 753 //} 754 p256Sqr(t1, t1, 64) 755 756 // Step 171: t0 = x^0xfffffffeffffffffffffffffffffffffffffffff 757 //t0.Mul(t0, t1) 758 p256Mul(t0, t0, t1) 759 760 // Step 265: t0 = x^0x3fffffffbfffffffffffffffffffffffffffffffc00000000000000000000000 761 //for s := 0; s < 94; s++ { 762 // t0.Sqr(t0) 763 //} 764 p256Sqr(t0, t0, 94) 765 766 // Step 266: z = x^0x3fffffffbfffffffffffffffffffffffffffffffc00000003fffffffffffffff 767 //z.Mul(z, t0) 768 p256Mul(out, out, t0) 769 770 // Step 268: z = x^0xfffffffeffffffffffffffffffffffffffffffff00000000fffffffffffffffc 771 //for s := 0; s < 2; s++ { 772 // z.Sqr(z) 773 //} 774 p256Sqr(out, out, 2) 775 776 // Step 269: z = x^0xfffffffeffffffffffffffffffffffffffffffff00000000fffffffffffffffd 777 //z.Mul(x, z) 778 p256Mul(out, in, out) 779 } 780 781 // This function takes those six bits as an integer (0 .. 63), writing the 782 // recoded digit to *sign (0 for positive, 1 for negative) and *digit (absolute 783 // value, in the range 0 .. 16). Note that this integer essentially provides 784 // the input bits "shifted to the left" by one position: for example, the input 785 // to compute the least significant recoded digit, given that there's no bit 786 // b_-1, has to be b_4 b_3 b_2 b_1 b_0 0. 
787 // 788 // Reference: 789 // https://github.com/openssl/openssl/blob/master/crypto/ec/ecp_nistputil.c 790 // https://github.com/google/boringssl/blob/master/crypto/fipsmodule/ec/util.c 791 func boothW5(in uint) (int, int) { 792 var s uint = ^((in >> 5) - 1) // sets all bits to MSB(in), 'in' seen as 6-bit value 793 var d uint = (1 << 6) - in - 1 // d = 63 - in, or d = ^in & 0x3f 794 d = (d & s) | (in & (^s)) // d = in if in < 2^5; otherwise, d = 63 - in 795 d = (d >> 1) + (d & 1) // d = (d + 1) / 2 796 return int(d), int(s & 1) 797 } 798 799 func boothW6(in uint) (int, int) { 800 var s uint = ^((in >> 6) - 1) 801 var d uint = (1 << 7) - in - 1 802 d = (d & s) | (in & (^s)) 803 d = (d >> 1) + (d & 1) 804 return int(d), int(s & 1) 805 } 806 807 func (p *SM2P256Point) p256BaseMult(scalar *p256OrdElement) { 808 var t0 p256AffinePoint 809 810 wvalue := (scalar[0] << 1) & 0x7f 811 sel, sign := boothW6(uint(wvalue)) 812 p256SelectAffine(&t0, &p256Precomputed[0], sel) 813 p.x, p.y, p.z = t0.x, t0.y, p256One 814 p256NegCond(&p.y, sign) 815 816 index := uint(5) 817 zero := sel 818 819 for i := 1; i < 43; i++ { 820 if index >= 192 { 821 wvalue = (scalar[3] >> (index & 63)) & 0x7f 822 } else if index >= 128 { 823 wvalue = ((scalar[2] >> (index & 63)) + (scalar[3] << (64 - (index & 63)))) & 0x7f 824 } else if index >= 64 { 825 wvalue = ((scalar[1] >> (index & 63)) + (scalar[2] << (64 - (index & 63)))) & 0x7f 826 } else { 827 wvalue = ((scalar[0] >> (index & 63)) + (scalar[1] << (64 - (index & 63)))) & 0x7f 828 } 829 index += 6 830 sel, sign = boothW6(uint(wvalue)) 831 p256SelectAffine(&t0, &p256Precomputed[i], sel) 832 p256PointAddAffineAsm(p, p, &t0, sign, sel, zero) 833 zero |= sel 834 } 835 836 // If the whole scalar was zero, set to the point at infinity. 
837 p256MovCond(p, p, NewSM2P256Point(), zero) 838 } 839 840 func (p *SM2P256Point) p256ScalarMult(scalar *p256OrdElement) { 841 // precomp is a table of precomputed points that stores powers of p 842 // from p^1 to p^32. 843 var precomp p256Table 844 var t0, t1 SM2P256Point 845 846 // Prepare the table 847 precomp[0] = *p // 1 848 849 p256PointDoubleAsm(&precomp[1], p) //2 850 p256PointAddAsm(&precomp[2], &precomp[1], p) //3 851 p256PointDoubleAsm(&precomp[3], &precomp[1]) //4 852 p256PointAddAsm(&precomp[4], &precomp[3], p) //5 853 p256PointDoubleAsm(&precomp[5], &precomp[2]) //6 854 p256PointAddAsm(&precomp[6], &precomp[5], p) //7 855 p256PointDoubleAsm(&precomp[7], &precomp[3]) //8 856 p256PointAddAsm(&precomp[8], &precomp[7], p) //9 857 p256PointDoubleAsm(&precomp[9], &precomp[4]) //10 858 p256PointAddAsm(&precomp[10], &precomp[9], p) //11 859 p256PointDoubleAsm(&precomp[11], &precomp[5]) //12 860 p256PointAddAsm(&precomp[12], &precomp[11], p) //13 861 p256PointDoubleAsm(&precomp[13], &precomp[6]) //14 862 p256PointAddAsm(&precomp[14], &precomp[13], p) //15 863 p256PointDoubleAsm(&precomp[15], &precomp[7]) //16 864 865 p256PointAddAsm(&precomp[16], &precomp[15], p) //17 866 p256PointDoubleAsm(&precomp[17], &precomp[8]) //18 867 p256PointAddAsm(&precomp[18], &precomp[17], p) //19 868 p256PointDoubleAsm(&precomp[19], &precomp[9]) //20 869 p256PointAddAsm(&precomp[20], &precomp[19], p) //21 870 p256PointDoubleAsm(&precomp[21], &precomp[10]) //22 871 p256PointAddAsm(&precomp[22], &precomp[21], p) //23 872 p256PointDoubleAsm(&precomp[23], &precomp[11]) //24 873 p256PointAddAsm(&precomp[24], &precomp[23], p) //25 874 p256PointDoubleAsm(&precomp[25], &precomp[12]) //26 875 p256PointAddAsm(&precomp[26], &precomp[25], p) //27 876 p256PointDoubleAsm(&precomp[27], &precomp[13]) //28 877 p256PointAddAsm(&precomp[28], &precomp[27], p) //29 878 p256PointDoubleAsm(&precomp[29], &precomp[14]) //30 879 p256PointAddAsm(&precomp[30], &precomp[29], p) //31 880 
p256PointDoubleAsm(&precomp[31], &precomp[15]) //32 881 882 // Start scanning the window from top bit 883 index := uint(251) 884 var sel, sign int 885 886 wvalue := (scalar[index/64] >> (index % 64)) & 0x7f 887 sel, _ = boothW6(uint(wvalue)) 888 889 p256Select(p, &precomp, sel, 32) 890 zero := sel 891 892 for index > 5 { 893 index -= 6 894 895 p256PointDouble6TimesAsm(p, p) 896 897 if index >= 192 { 898 wvalue = (scalar[3] >> (index & 63)) & 0x7f 899 } else if index >= 128 { 900 wvalue = ((scalar[2] >> (index & 63)) + (scalar[3] << (64 - (index & 63)))) & 0x7f 901 } else if index >= 64 { 902 wvalue = ((scalar[1] >> (index & 63)) + (scalar[2] << (64 - (index & 63)))) & 0x7f 903 } else { 904 wvalue = ((scalar[0] >> (index & 63)) + (scalar[1] << (64 - (index & 63)))) & 0x7f 905 } 906 907 sel, sign = boothW6(uint(wvalue)) 908 909 p256Select(&t0, &precomp, sel, 32) 910 p256NegCond(&t0.y, sign) 911 p256PointAddAsm(&t1, p, &t0) 912 p256MovCond(&t1, &t1, p, sel) 913 p256MovCond(p, &t1, &t0, zero) 914 zero |= sel 915 } 916 p256PointDouble6TimesAsm(p, p) 917 918 wvalue = (scalar[0] << 1) & 0x7f 919 sel, sign = boothW6(uint(wvalue)) 920 921 p256Select(&t0, &precomp, sel, 32) 922 p256NegCond(&t0.y, sign) 923 p256PointAddAsm(&t1, p, &t0) 924 p256MovCond(&t1, &t1, p, sel) 925 p256MovCond(p, &t1, &t0, zero) 926 }