github.com/icodeface/tls@v0.0.0-20230910023335-34df9250cd12/internal/x/crypto/curve25519/mont25519_amd64.go (about)

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build amd64,!gccgo,!appengine
     6  
     7  package curve25519
     8  
     9  // These functions are implemented in the .s files. The names of the functions
    10  // in the rest of the file are also taken from the SUPERCOP sources to help
    11  // people following along.
    12  
    13  //go:noescape
    14  
    15  func cswap(inout *[5]uint64, v uint64)
    16  
    17  //go:noescape
    18  
    19  func ladderstep(inout *[5][5]uint64)
    20  
    21  //go:noescape
    22  
    23  func freeze(inout *[5]uint64)
    24  
    25  //go:noescape
    26  
    27  func mul(dest, a, b *[5]uint64)
    28  
    29  //go:noescape
    30  
    31  func square(out, in *[5]uint64)
    32  
    33  // mladder uses a Montgomery ladder to calculate (xr/zr) *= s.
    34  func mladder(xr, zr *[5]uint64, s *[32]byte) {
    35  	var work [5][5]uint64
    36  
    37  	work[0] = *xr
    38  	setint(&work[1], 1)
    39  	setint(&work[2], 0)
    40  	work[3] = *xr
    41  	setint(&work[4], 1)
    42  
    43  	j := uint(6)
    44  	var prevbit byte
    45  
    46  	for i := 31; i >= 0; i-- {
    47  		for j < 8 {
    48  			bit := ((*s)[i] >> j) & 1
    49  			swap := bit ^ prevbit
    50  			prevbit = bit
    51  			cswap(&work[1], uint64(swap))
    52  			ladderstep(&work)
    53  			j--
    54  		}
    55  		j = 7
    56  	}
    57  
    58  	*xr = work[1]
    59  	*zr = work[2]
    60  }
    61  
    62  func scalarMult(out, in, base *[32]byte) {
    63  	var e [32]byte
    64  	copy(e[:], (*in)[:])
    65  	e[0] &= 248
    66  	e[31] &= 127
    67  	e[31] |= 64
    68  
    69  	var t, z [5]uint64
    70  	unpack(&t, base)
    71  	mladder(&t, &z, &e)
    72  	invert(&z, &z)
    73  	mul(&t, &t, &z)
    74  	pack(out, &t)
    75  }
    76  
    77  func setint(r *[5]uint64, v uint64) {
    78  	r[0] = v
    79  	r[1] = 0
    80  	r[2] = 0
    81  	r[3] = 0
    82  	r[4] = 0
    83  }
    84  
    85  // unpack sets r = x where r consists of 5, 51-bit limbs in little-endian
    86  // order.
    87  func unpack(r *[5]uint64, x *[32]byte) {
    88  	r[0] = uint64(x[0]) |
    89  		uint64(x[1])<<8 |
    90  		uint64(x[2])<<16 |
    91  		uint64(x[3])<<24 |
    92  		uint64(x[4])<<32 |
    93  		uint64(x[5])<<40 |
    94  		uint64(x[6]&7)<<48
    95  
    96  	r[1] = uint64(x[6])>>3 |
    97  		uint64(x[7])<<5 |
    98  		uint64(x[8])<<13 |
    99  		uint64(x[9])<<21 |
   100  		uint64(x[10])<<29 |
   101  		uint64(x[11])<<37 |
   102  		uint64(x[12]&63)<<45
   103  
   104  	r[2] = uint64(x[12])>>6 |
   105  		uint64(x[13])<<2 |
   106  		uint64(x[14])<<10 |
   107  		uint64(x[15])<<18 |
   108  		uint64(x[16])<<26 |
   109  		uint64(x[17])<<34 |
   110  		uint64(x[18])<<42 |
   111  		uint64(x[19]&1)<<50
   112  
   113  	r[3] = uint64(x[19])>>1 |
   114  		uint64(x[20])<<7 |
   115  		uint64(x[21])<<15 |
   116  		uint64(x[22])<<23 |
   117  		uint64(x[23])<<31 |
   118  		uint64(x[24])<<39 |
   119  		uint64(x[25]&15)<<47
   120  
   121  	r[4] = uint64(x[25])>>4 |
   122  		uint64(x[26])<<4 |
   123  		uint64(x[27])<<12 |
   124  		uint64(x[28])<<20 |
   125  		uint64(x[29])<<28 |
   126  		uint64(x[30])<<36 |
   127  		uint64(x[31]&127)<<44
   128  }
   129  
   130  // pack sets out = x where out is the usual, little-endian form of the 5,
   131  // 51-bit limbs in x.
   132  func pack(out *[32]byte, x *[5]uint64) {
   133  	t := *x
   134  	freeze(&t)
   135  
   136  	out[0] = byte(t[0])
   137  	out[1] = byte(t[0] >> 8)
   138  	out[2] = byte(t[0] >> 16)
   139  	out[3] = byte(t[0] >> 24)
   140  	out[4] = byte(t[0] >> 32)
   141  	out[5] = byte(t[0] >> 40)
   142  	out[6] = byte(t[0] >> 48)
   143  
   144  	out[6] ^= byte(t[1]<<3) & 0xf8
   145  	out[7] = byte(t[1] >> 5)
   146  	out[8] = byte(t[1] >> 13)
   147  	out[9] = byte(t[1] >> 21)
   148  	out[10] = byte(t[1] >> 29)
   149  	out[11] = byte(t[1] >> 37)
   150  	out[12] = byte(t[1] >> 45)
   151  
   152  	out[12] ^= byte(t[2]<<6) & 0xc0
   153  	out[13] = byte(t[2] >> 2)
   154  	out[14] = byte(t[2] >> 10)
   155  	out[15] = byte(t[2] >> 18)
   156  	out[16] = byte(t[2] >> 26)
   157  	out[17] = byte(t[2] >> 34)
   158  	out[18] = byte(t[2] >> 42)
   159  	out[19] = byte(t[2] >> 50)
   160  
   161  	out[19] ^= byte(t[3]<<1) & 0xfe
   162  	out[20] = byte(t[3] >> 7)
   163  	out[21] = byte(t[3] >> 15)
   164  	out[22] = byte(t[3] >> 23)
   165  	out[23] = byte(t[3] >> 31)
   166  	out[24] = byte(t[3] >> 39)
   167  	out[25] = byte(t[3] >> 47)
   168  
   169  	out[25] ^= byte(t[4]<<4) & 0xf0
   170  	out[26] = byte(t[4] >> 4)
   171  	out[27] = byte(t[4] >> 12)
   172  	out[28] = byte(t[4] >> 20)
   173  	out[29] = byte(t[4] >> 28)
   174  	out[30] = byte(t[4] >> 36)
   175  	out[31] = byte(t[4] >> 44)
   176  }
   177  
   178  // invert calculates r = x^-1 mod p using Fermat's little theorem.
   179  func invert(r *[5]uint64, x *[5]uint64) {
   180  	var z2, z9, z11, z2_5_0, z2_10_0, z2_20_0, z2_50_0, z2_100_0, t [5]uint64
   181  
   182  	square(&z2, x)        /* 2 */
   183  	square(&t, &z2)       /* 4 */
   184  	square(&t, &t)        /* 8 */
   185  	mul(&z9, &t, x)       /* 9 */
   186  	mul(&z11, &z9, &z2)   /* 11 */
   187  	square(&t, &z11)      /* 22 */
   188  	mul(&z2_5_0, &t, &z9) /* 2^5 - 2^0 = 31 */
   189  
   190  	square(&t, &z2_5_0)      /* 2^6 - 2^1 */
   191  	for i := 1; i < 5; i++ { /* 2^20 - 2^10 */
   192  		square(&t, &t)
   193  	}
   194  	mul(&z2_10_0, &t, &z2_5_0) /* 2^10 - 2^0 */
   195  
   196  	square(&t, &z2_10_0)      /* 2^11 - 2^1 */
   197  	for i := 1; i < 10; i++ { /* 2^20 - 2^10 */
   198  		square(&t, &t)
   199  	}
   200  	mul(&z2_20_0, &t, &z2_10_0) /* 2^20 - 2^0 */
   201  
   202  	square(&t, &z2_20_0)      /* 2^21 - 2^1 */
   203  	for i := 1; i < 20; i++ { /* 2^40 - 2^20 */
   204  		square(&t, &t)
   205  	}
   206  	mul(&t, &t, &z2_20_0) /* 2^40 - 2^0 */
   207  
   208  	square(&t, &t)            /* 2^41 - 2^1 */
   209  	for i := 1; i < 10; i++ { /* 2^50 - 2^10 */
   210  		square(&t, &t)
   211  	}
   212  	mul(&z2_50_0, &t, &z2_10_0) /* 2^50 - 2^0 */
   213  
   214  	square(&t, &z2_50_0)      /* 2^51 - 2^1 */
   215  	for i := 1; i < 50; i++ { /* 2^100 - 2^50 */
   216  		square(&t, &t)
   217  	}
   218  	mul(&z2_100_0, &t, &z2_50_0) /* 2^100 - 2^0 */
   219  
   220  	square(&t, &z2_100_0)      /* 2^101 - 2^1 */
   221  	for i := 1; i < 100; i++ { /* 2^200 - 2^100 */
   222  		square(&t, &t)
   223  	}
   224  	mul(&t, &t, &z2_100_0) /* 2^200 - 2^0 */
   225  
   226  	square(&t, &t)            /* 2^201 - 2^1 */
   227  	for i := 1; i < 50; i++ { /* 2^250 - 2^50 */
   228  		square(&t, &t)
   229  	}
   230  	mul(&t, &t, &z2_50_0) /* 2^250 - 2^0 */
   231  
   232  	square(&t, &t) /* 2^251 - 2^1 */
   233  	square(&t, &t) /* 2^252 - 2^2 */
   234  	square(&t, &t) /* 2^253 - 2^3 */
   235  
   236  	square(&t, &t) /* 2^254 - 2^4 */
   237  
   238  	square(&t, &t)   /* 2^255 - 2^5 */
   239  	mul(r, &t, &z11) /* 2^255 - 21 */
   240  }