github.com/cloudflare/circl@v1.5.0/pke/kyber/internal/common/amd64.go

//go:build amd64 && !purego
// +build amd64,!purego

package common

import (
	"golang.org/x/sys/cpu"
)

// ZetasAVX2 contains all ζ used in NTT (like the Zetas array), but also
// the values int16(zeta * 62209) for each zeta, which are used in
// Montgomery reduction.  There is some duplication and reordering as
// compared to Zetas to make it more convenient for use with AVX2.
var ZetasAVX2 = [...]int16{
	// level 1: int16(Zetas[1]*62209) and Zetas[1]
	31499, 2571,

	// level 2
	//
	// int16(Zetas[2]*62209), Zetas[2], int16(Zetas[3]*62209), Zetas[3]
	14746, 2970, 788, 1812,

	// level 3, like level 2.
	13525, 1493, -12402, 1422, 28191, 287, -16694, 202,

	0, 0, // padding

	// layer 4. offset: 1*16
	//
	// The precomputed multiplication and zetas are grouped by 16 at a
	// time as used in the set of butterflies, etc.
	-20906, -20906, -20906, -20906, -20906, -20906, -20906, -20906,
	27758, 27758, 27758, 27758, 27758, 27758, 27758, 27758,
	3158, 3158, 3158, 3158, 3158, 3158, 3158, 3158,
	622, 622, 622, 622, 622, 622, 622, 622,
	-3799, -3799, -3799, -3799, -3799, -3799, -3799, -3799,
	-15690, -15690, -15690, -15690, -15690, -15690, -15690, -15690,
	1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577,
	182, 182, 182, 182, 182, 182, 182, 182,
	10690, 10690, 10690, 10690, 10690, 10690, 10690, 10690,
	1359, 1359, 1359, 1359, 1359, 1359, 1359, 1359,
	962, 962, 962, 962, 962, 962, 962, 962,
	2127, 2127, 2127, 2127, 2127, 2127, 2127, 2127,
	-11201, -11201, -11201, -11201, -11201, -11201, -11201, -11201,
	31164, 31164, 31164, 31164, 31164, 31164, 31164, 31164,
	1855, 1855, 1855, 1855, 1855, 1855, 1855, 1855,
	1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468,

	// layer 5. offset: 9*16
	-5827, -5827, -5827, -5827, 17364, 17364, 17364, 17364,
	-26360, -26360, -26360, -26360, -29057, -29057, -29057, -29057,
	573, 573, 573, 573, 2004, 2004, 2004, 2004,
	264, 264, 264, 264, 383, 383, 383, 383,
	5572, 5572, 5572, 5572, -1102, -1102, -1102, -1102,
	21439, 21439, 21439, 21439, -26241, -26241, -26241, -26241,
	2500, 2500, 2500, 2500, 1458, 1458, 1458, 1458,
	1727, 1727, 1727, 1727, 3199, 3199, 3199, 3199,
	-28072, -28072, -28072, -28072, 24313, 24313, 24313, 24313,
	-10532, -10532, -10532, -10532, 8800, 8800, 8800, 8800,
	2648, 2648, 2648, 2648, 1017, 1017, 1017, 1017,
	732, 732, 732, 732, 608, 608, 608, 608,
	18427, 18427, 18427, 18427, 8859, 8859, 8859, 8859,
	26676, 26676, 26676, 26676, -16162, -16162, -16162, -16162,
	1787, 1787, 1787, 1787, 411, 411, 411, 411,
	3124, 3124, 3124, 3124, 1758, 1758, 1758, 1758,

	// layer 6. offset: 17*16
	-5689, -5689, -6516, -6516, 1497, 1497, 30967, 30967,
	-23564, -23564, 20179, 20179, 20711, 20711, 25081, 25081,
	1223, 1223, 652, 652, 2777, 2777, 1015, 1015,
	2036, 2036, 1491, 1491, 3047, 3047, 1785, 1785,
	-12796, -12796, 26617, 26617, 16065, 16065, -12441, -12441,
	9135, 9135, -649, -649, -25986, -25986, 27837, 27837,
	516, 516, 3321, 3321, 3009, 3009, 2663, 2663,
	1711, 1711, 2167, 2167, 126, 126, 1469, 1469,
	19884, 19884, -28249, -28249, -15886, -15886, -8898, -8898,
	-28309, -28309, 9076, 9076, -30198, -30198, 18250, 18250,
	2476, 2476, 3239, 3239, 3058, 3058, 830, 830,
	107, 107, 1908, 1908, 3082, 3082, 2378, 2378,
	13427, 13427, 14017, 14017, -29155, -29155, -12756, -12756,
	16832, 16832, 4312, 4312, -24155, -24155, -17914, -17914,
	2931, 2931, 961, 961, 1821, 1821, 2604, 2604,
	448, 448, 2264, 2264, 677, 677, 2054, 2054,

	// layer 7. offset: 25*16
	-334, 11182, -11477, 13387, -32226, -14233, 20494, -21655,
	-27738, 13131, 945, -4586, -14882, 23093, 6182, 5493,
	2226, 430, 555, 843, 2078, 871, 1550, 105,
	422, 587, 177, 3094, 3038, 2869, 1574, 1653,
	32011, -32502, 10631, 30318, 29176, -18741, -28761, 12639,
	-18485, 20100, 17561, 18525, -14430, 19529, -5275, -12618,
	3083, 778, 1159, 3182, 2552, 1483, 2727, 1119,
	1739, 644, 2457, 349, 418, 329, 3173, 3254,
	-31183, 20297, 25435, 2146, -7382, 15356, 24392, -32384,
	-20926, -6279, 10946, -14902, 24215, -11044, 16990, 14470,
	817, 1097, 603, 610, 1322, 2044, 1864, 384,
	2114, 3193, 1218, 1994, 2455, 220, 2142, 1670,
	10336, -21497, -7933, -20198, -22501, 23211, 10907, -17442,
	31637, -23859, 28644, -20257, 23998, 7757, -17422, 23132,
	2144, 1799, 2051, 794, 1819, 2475, 2459, 478,
	3221, 3021, 996, 991, 958, 1869, 1522, 1628,

	// layer 1 inverse
	23132, -17422, 7757, 23998, -20257, 28644, -23859, 31637,
	-17442, 10907, 23211, -22501, -20198, -7933, -21497, 10336,
	1628, 1522, 1869, 958, 991, 996, 3021, 3221,
	478, 2459, 2475, 1819, 794, 2051, 1799, 2144,
	14470, 16990, -11044, 24215, -14902, 10946, -6279, -20926,
	-32384, 24392, 15356, -7382, 2146, 25435, 20297, -31183,
	1670, 2142, 220, 2455, 1994, 1218, 3193, 2114,
	384, 1864, 2044, 1322, 610, 603, 1097, 817,
	-12618, -5275, 19529, -14430, 18525, 17561, 20100, -18485,
	12639, -28761, -18741, 29176, 30318, 10631, -32502, 32011,
	3254, 3173, 329, 418, 349, 2457, 644, 1739,
	1119, 2727, 1483, 2552, 3182, 1159, 778, 3083,
	5493, 6182, 23093, -14882, -4586, 945, 13131, -27738,
	-21655, 20494, -14233, -32226, 13387, -11477, 11182, -334,
	1653, 1574, 2869, 3038, 3094, 177, 587, 422,
	105, 1550, 871, 2078, 843, 555, 430, 2226,

	// layer 2 inverse
	-17914, -17914, -24155, -24155, 4312, 4312, 16832, 16832,
	-12756, -12756, -29155, -29155, 14017, 14017, 13427, 13427,
	2054, 2054, 677, 677, 2264, 2264, 448, 448,
	2604, 2604, 1821, 1821, 961, 961, 2931, 2931,
	18250, 18250, -30198, -30198, 9076, 9076, -28309, -28309,
	-8898, -8898, -15886, -15886, -28249, -28249, 19884, 19884,
	2378, 2378, 3082, 3082, 1908, 1908, 107, 107,
	830, 830, 3058, 3058, 3239, 3239, 2476, 2476,
	27837, 27837, -25986, -25986, -649, -649, 9135, 9135,
	-12441, -12441, 16065, 16065, 26617, 26617, -12796, -12796,
	1469, 1469, 126, 126, 2167, 2167, 1711, 1711,
	2663, 2663, 3009, 3009, 3321, 3321, 516, 516,
	25081, 25081, 20711, 20711, 20179, 20179, -23564, -23564,
	30967, 30967, 1497, 1497, -6516, -6516, -5689, -5689,
	1785, 1785, 3047, 3047, 1491, 1491, 2036, 2036,
	1015, 1015, 2777, 2777, 652, 652, 1223, 1223,

	// layer 3 inverse
	-16162, -16162, -16162, -16162, 26676, 26676, 26676, 26676,
	8859, 8859, 8859, 8859, 18427, 18427, 18427, 18427,
	1758, 1758, 1758, 1758, 3124, 3124, 3124, 3124,
	411, 411, 411, 411, 1787, 1787, 1787, 1787,
	8800, 8800, 8800, 8800, -10532, -10532, -10532, -10532,
	24313, 24313, 24313, 24313, -28072, -28072, -28072, -28072,
	608, 608, 608, 608, 732, 732, 732, 732,
	1017, 1017, 1017, 1017, 2648, 2648, 2648, 2648,
	-26241, -26241, -26241, -26241, 21439, 21439, 21439, 21439,
	-1102, -1102, -1102, -1102, 5572, 5572, 5572, 5572,
	3199, 3199, 3199, 3199, 1727, 1727, 1727, 1727,
	1458, 1458, 1458, 1458, 2500, 2500, 2500, 2500,
	-29057, -29057, -29057, -29057, -26360, -26360, -26360, -26360,
	17364, 17364, 17364, 17364, -5827, -5827, -5827, -5827,
	383, 383, 383, 383, 264, 264, 264, 264,
	2004, 2004, 2004, 2004, 573, 573, 573, 573,

	// layer 4 inverse
	31164, 31164, 31164, 31164, 31164, 31164, 31164, 31164,
	-11201, -11201, -11201, -11201, -11201, -11201, -11201, -11201,
	1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468,
	1855, 1855, 1855, 1855, 1855, 1855, 1855, 1855,
	1359, 1359, 1359, 1359, 1359, 1359, 1359, 1359,
	10690, 10690, 10690, 10690, 10690, 10690, 10690, 10690,
	2127, 2127, 2127, 2127, 2127, 2127, 2127, 2127,
	962, 962, 962, 962, 962, 962, 962, 962,
	-15690, -15690, -15690, -15690, -15690, -15690, -15690, -15690,
	-3799, -3799, -3799, -3799, -3799, -3799, -3799, -3799,
	182, 182, 182, 182, 182, 182, 182, 182,
	1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577,
	27758, 27758, 27758, 27758, 27758, 27758, 27758, 27758,
	-20906, -20906, -20906, -20906, -20906, -20906, -20906, -20906,
	622, 622, 622, 622, 622, 622, 622, 622,
	3158, 3158, 3158, 3158, 3158, 3158, 3158, 3158,

	// layer 5 inverse
	-16694, 202, 28191, 287, -12402, 1422, 13525, 1493,

	// layer 6 inverse
	788, 1812, 14746, 2970,

	// layer 7 inverse
	31499, 2571,
}
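// Illustrative sketch (the helper below is hypothetical and not part of this
// package's API): why ZetasAVX2 pairs each zeta with int16(zeta * 62209).
// 62209 = q⁻¹ mod 2¹⁶ for q = 3329, so e.g. the first pair above satisfies
// int16(2571 * 62209) = 31499.  With the companion precomputed, a Montgomery
// multiplication by zeta needs only one low-half and one high-half 16-bit
// product, which is the shape the AVX2 code exploits vector-wide with
// VPMULLW/VPMULHW.  A scalar model, valid while |x·zeta| < 2¹⁵·q:
func montMulByZetaSketch(x, zeta, zetaCompanion int16) int16 {
	m := x * zetaCompanion                      // ≡ x·zeta·q⁻¹ (mod 2¹⁶); int16 wraps
	hi := int16((int32(x) * int32(zeta)) >> 16) // high half of x·zeta
	return hi - int16((int32(m)*Q)>>16)         // = (x·zeta − m·q)/2¹⁶ ≡ x·zeta·2⁻¹⁶ (mod q)
}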

// Sets p to a + b.  Does not normalize coefficients.
func (p *Poly) Add(a, b *Poly) {
	if cpu.X86.HasAVX2 {
		addAVX2(
			(*[N]int16)(p),
			(*[N]int16)(a),
			(*[N]int16)(b),
		)
	} else {
		p.addGeneric(a, b)
	}
}

// Sets p to a - b.  Does not normalize coefficients.
func (p *Poly) Sub(a, b *Poly) {
	if cpu.X86.HasAVX2 {
		subAVX2(
			(*[N]int16)(p),
			(*[N]int16)(a),
			(*[N]int16)(b),
		)
	} else {
		p.subGeneric(a, b)
	}
}
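// Illustrative sketch (hypothetical helper, not part of this package): because
// Add and Sub do not normalize, each call can grow the coefficient bound by up
// to q, so a caller that accumulates several polynomials (as in a matrix-vector
// product) must reduce again before int16 arithmetic could overflow:
func accumulateSketch(acc *Poly, terms []Poly) {
	for i := range terms {
		acc.Add(acc, &terms[i])
	}
	acc.BarrettReduce() // bring the coefficients back into {0, …, q}
}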

// Executes an in-place forward "NTT" on p.
//
// Assumes the coefficients are in absolute value ≤q.  The resulting
// coefficients are in absolute value ≤7q.  If the input is in Montgomery
// form, then the result is in Montgomery form and so (by linearity of the NTT)
// if the input is in regular form, then the result is also in regular form.
// The order of coefficients will be "tangled". These can be put back into
// their proper order by calling Detangle().
func (p *Poly) NTT() {
	if cpu.X86.HasAVX2 {
		nttAVX2((*[N]int16)(p))
	} else {
		p.nttGeneric()
	}
}

// Executes an in-place inverse "NTT" on p and multiplies by the Montgomery
// factor R.
//
// Requires coefficients to be in "tangled" order, see Tangle().
// Assumes the coefficients are in absolute value ≤q.  The resulting
// coefficients are in absolute value ≤q.  If the input is in Montgomery
// form, then the result is in Montgomery form and so (by linearity)
// if the input is in regular form, then the result is also in regular form.
func (p *Poly) InvNTT() {
	if cpu.X86.HasAVX2 {
		invNttAVX2((*[N]int16)(p))
	} else {
		p.invNTTGeneric()
	}
}
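// For reference (assuming the usual R = 2¹⁶ of 16-bit Montgomery arithmetic):
// R mod q = 65536 mod 3329 = 2285, so a coefficient x in regular form is
// represented as x·2285 mod q in Montgomery form.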

// Sets p to the "pointwise" multiplication of a and b.
//
// That is: InvNTT(p) = InvNTT(a) * InvNTT(b).  Assumes a and b are in
// Montgomery form.  Products between coefficients of a and b must be strictly
// bounded in absolute value by 2¹⁵q.  p will be in Montgomery form and
// bounded in absolute value by 2q.
//
// Requires a and b to be in "tangled" order, see Tangle().  p will be in
// tangled order as well.
func (p *Poly) MulHat(a, b *Poly) {
	if cpu.X86.HasAVX2 {
		mulHatAVX2(
			(*[N]int16)(p),
			(*[N]int16)(a),
			(*[N]int16)(b),
		)
	} else {
		p.mulHatGeneric(a, b)
	}
}
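// Illustrative sketch (hypothetical helper, not part of this package's API):
// how the methods above are typically combined to multiply two polynomials,
// assuming a and b start out in the standard coefficient order (note that a
// and b are modified in place).  The Montgomery-form bookkeeping is glossed
// over; see the comments on MulHat and InvNTT for the precise requirements.
func mulSketch(p, a, b *Poly) {
	a.NTT() // into the NTT domain; coefficients grow to ≤7q, order becomes "tangled"
	b.NTT()
	a.BarrettReduce() // reduce to ≤q so that coefficient products stay below the
	b.BarrettReduce() // 2¹⁵·q bound that MulHat demands
	p.MulHat(a, b)    // pointwise product; p is tangled and bounded by 2q
	p.BarrettReduce() // InvNTT wants coefficients bounded by q in absolute value
	p.InvNTT()        // back out of the NTT domain, multiplied by the Montgomery factor R
	// Whether p then needs Detangle depends on what the caller does next;
	// see Tangle and Detangle below.
}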

// Puts p into the right form to be used with (among others) InvNTT().
func (p *Poly) Tangle() {
	if cpu.X86.HasAVX2 {
		tangleAVX2((*[N]int16)(p))
	}

	// When AVX2 is not available, we use the standard order.
}

// Puts p back into standard form.
func (p *Poly) Detangle() {
	if cpu.X86.HasAVX2 {
		detangleAVX2((*[N]int16)(p))
	}

	// When AVX2 is not available, we use the standard order.
}
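// Illustrative sketch (hypothetical helper): Tangle and Detangle are inverse
// permutations of the coefficients, and both are no-ops when AVX2 is
// unavailable, so a round trip must leave any polynomial unchanged on every
// platform:
func tangleRoundTripSketch(p *Poly) bool {
	q := *p
	q.Tangle()
	q.Detangle()
	return q == *p
}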

// Almost normalizes coefficients.
//
// Ensures each coefficient is in {0, …, q}.
func (p *Poly) BarrettReduce() {
	if cpu.X86.HasAVX2 {
		barrettReduceAVX2((*[N]int16)(p))
	} else {
		p.barrettReduceGeneric()
	}
}
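// Illustrative scalar model of the per-coefficient reduction behind
// BarrettReduce (the actual generic and AVX2 implementations live elsewhere in
// this package; this sketch only shows the technique).  20159 = ⌈2²⁶/q⌉, so
// (x·20159)>>26 approximates ⌊x/q⌋ closely enough that subtracting that many
// copies of q lands in {0, …, q} for every int16 input:
func barrettReduceSketch(x int16) int16 {
	t := (int32(x) * 20159) >> 26 // ≈ ⌊x/q⌋
	return int16(int32(x) - t*Q)  // congruent to x modulo q, in {0, …, q}
}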

// Normalizes coefficients.
//
// Ensures each coefficient is in {0, …, q-1}.
func (p *Poly) Normalize() {
	if cpu.X86.HasAVX2 {
		normalizeAVX2((*[N]int16)(p))
	} else {
		p.normalizeGeneric()
	}
}
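// Illustrative sketch (hypothetical helper): the step from the almost-normalized
// range {0, …, q} down to {0, …, q-1} is a single conditional subtraction of q,
// done branchlessly so that it runs in constant time:
func csubqSketch(x int16) int16 {
	x -= Q
	x += (x >> 15) & Q // adds q back exactly when the subtraction went negative
	return x
}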