github.com/cloudflare/circl@v1.5.0/pke/kyber/internal/common/amd64.go (about) 1 //go:build amd64 && !purego 2 // +build amd64,!purego 3 4 package common 5 6 import ( 7 "golang.org/x/sys/cpu" 8 ) 9 10 // ZetasAVX2 contains all ζ used in NTT (like the Zetas array), but also 11 // the values int16(zeta * 62209) for each zeta, which is used in 12 // Montgomery reduction. There is some duplication and reordering as 13 // compared to Zetas to make it more convenient for use with AVX2. 14 var ZetasAVX2 = [...]int16{ 15 // level 1: int16(Zetas[1]*62209) and Zetas[1] 16 31499, 2571, 17 18 // level 2 19 // 20 // int16(Zetas[2]*62209), Zetas[2], int16(Zetas[3]*62209), Zetas[3] 21 14746, 2970, 788, 1812, 22 23 // level 3, like level 2. 24 13525, 1493, -12402, 1422, 28191, 287, -16694, 202, 25 26 0, 0, // padding 27 28 // layer 4. offset: 1*16 29 // 30 // The precomputed multiplication and zetas are grouped by 16 at a 31 // time as used in the set of butterflies, etc. 32 -20906, -20906, -20906, -20906, -20906, -20906, -20906, -20906, 33 27758, 27758, 27758, 27758, 27758, 27758, 27758, 27758, 34 3158, 3158, 3158, 3158, 3158, 3158, 3158, 3158, 35 622, 622, 622, 622, 622, 622, 622, 622, 36 -3799, -3799, -3799, -3799, -3799, -3799, -3799, -3799, 37 -15690, -15690, -15690, -15690, -15690, -15690, -15690, -15690, 38 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577, 39 182, 182, 182, 182, 182, 182, 182, 182, 40 10690, 10690, 10690, 10690, 10690, 10690, 10690, 10690, 41 1359, 1359, 1359, 1359, 1359, 1359, 1359, 1359, 42 962, 962, 962, 962, 962, 962, 962, 962, 43 2127, 2127, 2127, 2127, 2127, 2127, 2127, 2127, 44 -11201, -11201, -11201, -11201, -11201, -11201, -11201, -11201, 45 31164, 31164, 31164, 31164, 31164, 31164, 31164, 31164, 46 1855, 1855, 1855, 1855, 1855, 1855, 1855, 1855, 47 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468, 48 49 // layer 5. offset: 9*16 50 -5827, -5827, -5827, -5827, 17364, 17364, 17364, 17364, 51 -26360, -26360, -26360, -26360, -29057, -29057, -29057, -29057, 52 573, 573, 573, 573, 2004, 2004, 2004, 2004, 53 264, 264, 264, 264, 383, 383, 383, 383, 54 5572, 5572, 5572, 5572, -1102, -1102, -1102, -1102, 55 21439, 21439, 21439, 21439, -26241, -26241, -26241, -26241, 56 2500, 2500, 2500, 2500, 1458, 1458, 1458, 1458, 57 1727, 1727, 1727, 1727, 3199, 3199, 3199, 3199, 58 -28072, -28072, -28072, -28072, 24313, 24313, 24313, 24313, 59 -10532, -10532, -10532, -10532, 8800, 8800, 8800, 8800, 60 2648, 2648, 2648, 2648, 1017, 1017, 1017, 1017, 61 732, 732, 732, 732, 608, 608, 608, 608, 62 18427, 18427, 18427, 18427, 8859, 8859, 8859, 8859, 63 26676, 26676, 26676, 26676, -16162, -16162, -16162, -16162, 64 1787, 1787, 1787, 1787, 411, 411, 411, 411, 65 3124, 3124, 3124, 3124, 1758, 1758, 1758, 1758, 66 67 // layer 6. offset: 17*16 68 -5689, -5689, -6516, -6516, 1497, 1497, 30967, 30967, 69 -23564, -23564, 20179, 20179, 20711, 20711, 25081, 25081, 70 1223, 1223, 652, 652, 2777, 2777, 1015, 1015, 71 2036, 2036, 1491, 1491, 3047, 3047, 1785, 1785, 72 -12796, -12796, 26617, 26617, 16065, 16065, -12441, -12441, 73 9135, 9135, -649, -649, -25986, -25986, 27837, 27837, 74 516, 516, 3321, 3321, 3009, 3009, 2663, 2663, 75 1711, 1711, 2167, 2167, 126, 126, 1469, 1469, 76 19884, 19884, -28249, -28249, -15886, -15886, -8898, -8898, 77 -28309, -28309, 9076, 9076, -30198, -30198, 18250, 18250, 78 2476, 2476, 3239, 3239, 3058, 3058, 830, 830, 79 107, 107, 1908, 1908, 3082, 3082, 2378, 2378, 80 13427, 13427, 14017, 14017, -29155, -29155, -12756, -12756, 81 16832, 16832, 4312, 4312, -24155, -24155, -17914, -17914, 82 2931, 2931, 961, 961, 1821, 1821, 2604, 2604, 83 448, 448, 2264, 2264, 677, 677, 2054, 2054, 84 85 // layer 7. offset: 25*16 86 -334, 11182, -11477, 13387, -32226, -14233, 20494, -21655, 87 -27738, 13131, 945, -4586, -14882, 23093, 6182, 5493, 88 2226, 430, 555, 843, 2078, 871, 1550, 105, 89 422, 587, 177, 3094, 3038, 2869, 1574, 1653, 90 32011, -32502, 10631, 30318, 29176, -18741, -28761, 12639, 91 -18485, 20100, 17561, 18525, -14430, 19529, -5275, -12618, 92 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 93 1739, 644, 2457, 349, 418, 329, 3173, 3254, 94 -31183, 20297, 25435, 2146, -7382, 15356, 24392, -32384, 95 -20926, -6279, 10946, -14902, 24215, -11044, 16990, 14470, 96 817, 1097, 603, 610, 1322, 2044, 1864, 384, 97 2114, 3193, 1218, 1994, 2455, 220, 2142, 1670, 98 10336, -21497, -7933, -20198, -22501, 23211, 10907, -17442, 99 31637, -23859, 28644, -20257, 23998, 7757, -17422, 23132, 100 2144, 1799, 2051, 794, 1819, 2475, 2459, 478, 101 3221, 3021, 996, 991, 958, 1869, 1522, 1628, 102 103 // layer 1 inverse 104 23132, -17422, 7757, 23998, -20257, 28644, -23859, 31637, 105 -17442, 10907, 23211, -22501, -20198, -7933, -21497, 10336, 106 1628, 1522, 1869, 958, 991, 996, 3021, 3221, 107 478, 2459, 2475, 1819, 794, 2051, 1799, 2144, 108 14470, 16990, -11044, 24215, -14902, 10946, -6279, -20926, 109 -32384, 24392, 15356, -7382, 2146, 25435, 20297, -31183, 110 1670, 2142, 220, 2455, 1994, 1218, 3193, 2114, 111 384, 1864, 2044, 1322, 610, 603, 1097, 817, 112 -12618, -5275, 19529, -14430, 18525, 17561, 20100, -18485, 113 12639, -28761, -18741, 29176, 30318, 10631, -32502, 32011, 114 3254, 3173, 329, 418, 349, 2457, 644, 1739, 115 1119, 2727, 1483, 2552, 3182, 1159, 778, 3083, 116 5493, 6182, 23093, -14882, -4586, 945, 13131, -27738, 117 -21655, 20494, -14233, -32226, 13387, -11477, 11182, -334, 118 1653, 1574, 2869, 3038, 3094, 177, 587, 422, 119 105, 1550, 871, 2078, 843, 555, 430, 2226, 120 121 // layer 2 inverse 122 -17914, -17914, -24155, -24155, 4312, 4312, 16832, 16832, 123 -12756, -12756, -29155, -29155, 14017, 14017, 13427, 13427, 124 2054, 2054, 677, 677, 2264, 2264, 448, 448, 125 2604, 2604, 1821, 1821, 961, 961, 2931, 2931, 126 18250, 18250, -30198, -30198, 9076, 9076, -28309, -28309, 127 -8898, -8898, -15886, -15886, -28249, -28249, 19884, 19884, 128 2378, 2378, 3082, 3082, 1908, 1908, 107, 107, 129 830, 830, 3058, 3058, 3239, 3239, 2476, 2476, 130 27837, 27837, -25986, -25986, -649, -649, 9135, 9135, 131 -12441, -12441, 16065, 16065, 26617, 26617, -12796, -12796, 132 1469, 1469, 126, 126, 2167, 2167, 1711, 1711, 133 2663, 2663, 3009, 3009, 3321, 3321, 516, 516, 134 25081, 25081, 20711, 20711, 20179, 20179, -23564, -23564, 135 30967, 30967, 1497, 1497, -6516, -6516, -5689, -5689, 136 1785, 1785, 3047, 3047, 1491, 1491, 2036, 2036, 137 1015, 1015, 2777, 2777, 652, 652, 1223, 1223, 138 139 // layer 3 inverse 140 -16162, -16162, -16162, -16162, 26676, 26676, 26676, 26676, 141 8859, 8859, 8859, 8859, 18427, 18427, 18427, 18427, 142 1758, 1758, 1758, 1758, 3124, 3124, 3124, 3124, 143 411, 411, 411, 411, 1787, 1787, 1787, 1787, 144 8800, 8800, 8800, 8800, -10532, -10532, -10532, -10532, 145 24313, 24313, 24313, 24313, -28072, -28072, -28072, -28072, 146 608, 608, 608, 608, 732, 732, 732, 732, 147 1017, 1017, 1017, 1017, 2648, 2648, 2648, 2648, 148 -26241, -26241, -26241, -26241, 21439, 21439, 21439, 21439, 149 -1102, -1102, -1102, -1102, 5572, 5572, 5572, 5572, 150 3199, 3199, 3199, 3199, 1727, 1727, 1727, 1727, 151 1458, 1458, 1458, 1458, 2500, 2500, 2500, 2500, 152 -29057, -29057, -29057, -29057, -26360, -26360, -26360, -26360, 153 17364, 17364, 17364, 17364, -5827, -5827, -5827, -5827, 154 383, 383, 383, 383, 264, 264, 264, 264, 155 2004, 2004, 2004, 2004, 573, 573, 573, 573, 156 157 // layer 4 inverse 158 31164, 31164, 31164, 31164, 31164, 31164, 31164, 31164, 159 -11201, -11201, -11201, -11201, -11201, -11201, -11201, -11201, 160 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468, 161 1855, 1855, 1855, 1855, 1855, 1855, 1855, 1855, 162 1359, 1359, 1359, 1359, 1359, 1359, 1359, 1359, 163 10690, 10690, 10690, 10690, 10690, 10690, 10690, 10690, 164 2127, 2127, 2127, 2127, 2127, 2127, 2127, 2127, 165 962, 962, 962, 962, 962, 962, 962, 962, 166 -15690, -15690, -15690, -15690, -15690, -15690, -15690, -15690, 167 -3799, -3799, -3799, -3799, -3799, -3799, -3799, -3799, 168 182, 182, 182, 182, 182, 182, 182, 182, 169 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577, 170 27758, 27758, 27758, 27758, 27758, 27758, 27758, 27758, 171 -20906, -20906, -20906, -20906, -20906, -20906, -20906, -20906, 172 622, 622, 622, 622, 622, 622, 622, 622, 173 3158, 3158, 3158, 3158, 3158, 3158, 3158, 3158, 174 175 // layer 5 inverse 176 -16694, 202, 28191, 287, -12402, 1422, 13525, 1493, 177 178 // layer 6 inverse 179 788, 1812, 14746, 2970, 180 181 // layer 7 inverse 182 31499, 2571, 183 } 184 185 // Sets p to a + b. Does not normalize coefficients. 186 func (p *Poly) Add(a, b *Poly) { 187 if cpu.X86.HasAVX2 { 188 addAVX2( 189 (*[N]int16)(p), 190 (*[N]int16)(a), 191 (*[N]int16)(b), 192 ) 193 } else { 194 p.addGeneric(a, b) 195 } 196 } 197 198 // Sets p to a - b. Does not normalize coefficients. 199 func (p *Poly) Sub(a, b *Poly) { 200 if cpu.X86.HasAVX2 { 201 subAVX2( 202 (*[N]int16)(p), 203 (*[N]int16)(a), 204 (*[N]int16)(b), 205 ) 206 } else { 207 p.subGeneric(a, b) 208 } 209 } 210 211 // Executes an in-place forward "NTT" on p. 212 // 213 // Assumes the coefficients are in absolute value ≤q. The resulting 214 // coefficients are in absolute value ≤7q. If the input is in Montgomery 215 // form, then the result is in Montgomery form and so (by linearity of the NTT) 216 // if the input is in regular form, then the result is also in regular form. 217 // The order of coefficients will be "tangled". These can be put back into 218 // their proper order by calling Detangle(). 219 func (p *Poly) NTT() { 220 if cpu.X86.HasAVX2 { 221 nttAVX2((*[N]int16)(p)) 222 } else { 223 p.nttGeneric() 224 } 225 } 226 227 // Executes an in-place inverse "NTT" on p and multiply by the Montgomery 228 // factor R. 229 // 230 // Requires coefficients to be in "tangled" order, see Tangle(). 231 // Assumes the coefficients are in absolute value ≤q. The resulting 232 // coefficients are in absolute value ≤q. If the input is in Montgomery 233 // form, then the result is in Montgomery form and so (by linearity) 234 // if the input is in regular form, then the result is also in regular form. 235 func (p *Poly) InvNTT() { 236 if cpu.X86.HasAVX2 { 237 invNttAVX2((*[N]int16)(p)) 238 } else { 239 p.invNTTGeneric() 240 } 241 } 242 243 // Sets p to the "pointwise" multiplication of a and b. 244 // 245 // That is: InvNTT(p) = InvNTT(a) * InvNTT(b). Assumes a and b are in 246 // Montgomery form. Products between coefficients of a and b must be strictly 247 // bounded in absolute value by 2¹⁵q. p will be in Montgomery form and 248 // bounded in absolute value by 2q. 249 // 250 // Requires a and b to be in "tangled" order, see Tangle(). p will be in 251 // tangled order as well. 252 func (p *Poly) MulHat(a, b *Poly) { 253 if cpu.X86.HasAVX2 { 254 mulHatAVX2( 255 (*[N]int16)(p), 256 (*[N]int16)(a), 257 (*[N]int16)(b), 258 ) 259 } else { 260 p.mulHatGeneric(a, b) 261 } 262 } 263 264 // Puts p into the right form to be used with (among others) InvNTT(). 265 func (p *Poly) Tangle() { 266 if cpu.X86.HasAVX2 { 267 tangleAVX2((*[N]int16)(p)) 268 } 269 270 // When AVX2 is not available, we use the standard order. 271 } 272 273 // Puts p back into standard form. 274 func (p *Poly) Detangle() { 275 if cpu.X86.HasAVX2 { 276 detangleAVX2((*[N]int16)(p)) 277 } 278 279 // When AVX2 is not available, we use the standard order. 280 } 281 282 // Almost normalizes coefficients. 283 // 284 // Ensures each coefficient is in {0, …, q}. 285 func (p *Poly) BarrettReduce() { 286 if cpu.X86.HasAVX2 { 287 barrettReduceAVX2((*[N]int16)(p)) 288 } else { 289 p.barrettReduceGeneric() 290 } 291 } 292 293 // Normalizes coefficients. 294 // 295 // Ensures each coefficient is in {0, …, q-1}. 296 func (p *Poly) Normalize() { 297 if cpu.X86.HasAVX2 { 298 normalizeAVX2((*[N]int16)(p)) 299 } else { 300 p.normalizeGeneric() 301 } 302 }