github.com/consensys/gnark-crypto@v0.14.0/ecc/bls12-378/multiexp.go

// Copyright 2020 Consensys Software Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Code generated by consensys/gnark-crypto DO NOT EDIT

package bls12378

import (
	"errors"
	"github.com/consensys/gnark-crypto/ecc"
	"github.com/consensys/gnark-crypto/ecc/bls12-378/fr"
	"github.com/consensys/gnark-crypto/internal/parallel"
	"math"
	"runtime"
)

// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
//
// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
func (p *G1Affine) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) {
	var _p G1Jac
	if _, err := _p.MultiExp(points, scalars, config); err != nil {
		return nil, err
	}
	p.FromJacobian(&_p)
	return p, nil
}
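// The sketch below is an illustrative usage example, not part of the generated
// code: it computes Σ scalars[i]·points[i] for four copies of the G1 generator
// with scalars 1, 2, 3, 4, and checks the result against 10·g built by repeated
// addition. It assumes the package-level Generators() helper, which lives
// elsewhere in this package.
func multiExpUsageSketchG1() error {
	_, _, g1Aff, _ := Generators()
	points := []G1Affine{g1Aff, g1Aff, g1Aff, g1Aff}
	scalars := []fr.Element{fr.NewElement(1), fr.NewElement(2), fr.NewElement(3), fr.NewElement(4)}

	// NbTasks is optional; values <= 0 default to 2×NumCPU (see G1Jac.MultiExp below).
	var res G1Affine
	if _, err := res.MultiExp(points, scalars, ecc.MultiExpConfig{NbTasks: 4}); err != nil {
		return err
	}

	// naive reference: 10·g by repeated Jacobian addition
	var gj, ref G1Jac
	gj.FromAffine(&g1Aff)
	for i := 0; i < 10; i++ {
		ref.AddAssign(&gj)
	}
	var refAff G1Affine
	refAff.FromJacobian(&ref)

	if !res.Equal(&refAff) {
		return errors.New("multiexp usage sketch: mismatch")
	}
	return nil
}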
// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
//
// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
	// TODO @gbotrel replace the ecc.MultiExpConfig by an Option pattern for maintainability.
	// note:
	// each of the msmCX methods is the same, except for the c constant it declares.
	// duplicating these methods (through template generation) allows us to declare the buckets on the stack.
	// the choice of c needs to be improved:
	// there is a theoretical value that gives optimal asymptotics,
	// but in practice other factors come into play, including:
	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
	// * number of CPUs
	// * cache friendliness (which depends on the host, G1 or G2...)
	//   --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.

	// for each msmCX:
	// step 1
	// we compute, for each scalar, over c-bit wide windows, nbChunks digits.
	// if a digit is larger than 2^{c-1}, then we propagate a carry of 1 to the next window and subtract
	// 2^{c} from the current digit, making it negative. (a worked decomposition is sketched after this function.)
	// negative digits will be processed in the next step as adding -G into the bucket instead of G
	// (computing -G is cheap, and this saves us half of the buckets).
	// step 2
	// buckets are declared on the stack;
	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1).
	// we use extended Jacobian formulas here as they are faster than mixed addition.
	// msmProcessChunk places points into buckets based on their selector and sends the weighted bucket sum on the given channel.
	// step 3
	// reduce the buckets' weighted sums into our result (msmReduceChunk).

	// ensure len(points) == len(scalars)
	nbPoints := len(points)
	if nbPoints != len(scalars) {
		return nil, errors.New("len(points) != len(scalars)")
	}

	// if NbTasks is not set, use all available CPUs
	if config.NbTasks <= 0 {
		config.NbTasks = runtime.NumCPU() * 2
	} else if config.NbTasks > 1024 {
		return nil, errors.New("invalid config: config.NbTasks > 1024")
	}

	// here, we compute the best C for nbPoints;
	// we split recursively until nbChunks(c) >= nbTasks.
	bestC := func(nbPoints int) uint64 {
		// implemented msmC methods (the c we use must be in this slice)
		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
		var C uint64
		// approximate cost (in group operations):
		// cost = bits/c * (nbPoints + 2^{c})
		// this needs to be verified empirically;
		// for example, on a MBP 2016, for a G2 MultiExp with > 8M points, hand-picking c gives better results.
		min := math.MaxFloat64
		for _, c := range implementedCs {
			cc := (fr.Bits + 1) * (nbPoints + (1 << c))
			cost := float64(cc) / float64(c)
			if cost < min {
				min = cost
				C = c
			}
		}
		return C
	}

	C := bestC(nbPoints)
	nbChunks := int(computeNbChunks(C))

	// should we recursively split the msm in half? (see below)
	// we want to minimize the execution time of the algorithm;
	// splitting the msm will **add** operations, but if it allows us to use more CPUs, it might be worth it.

	// costFunction returns a metric that represents the "wall time" of the algorithm
	costFunction := func(nbTasks, nbCpus, costPerTask int) int {
		// cost of the reduction of all tasks (msmReduceChunk)
		totalCost := nbTasks

		// cost of the computation of each task (msmProcessChunk)
		for nbTasks >= nbCpus {
			nbTasks -= nbCpus
			totalCost += costPerTask
		}
		if nbTasks > 0 {
			totalCost += costPerTask
		}
		return totalCost
	}

	// costPerTask is the approximate number of group ops per task
	costPerTask := func(c uint64, nbPoints int) int { return nbPoints + int(1<<c) }

	costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints))

	cPostSplit := bestC(nbPoints / 2)
	nbChunksPostSplit := int(computeNbChunks(cPostSplit))
	costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2))

	// if the cost of the split msm is lower than the cost of the non-split msm, we split
	if costPostSplit < costPreSplit {
		config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0))
		var _p G1Jac
		chDone := make(chan struct{}, 1)
		go func() {
			_p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config)
			close(chDone)
		}()
		p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config)
		<-chDone
		p.AddAssign(&_p)
		return p, nil
	}

	// if we don't split, we use the best C we found
	_innerMsmG1(p, C, points, scalars, config)

	return p, nil
}
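// signedDigitsSketch is an illustrative sketch of the step-1 decomposition
// described at the top of MultiExp, restricted to scalars that fit in one
// word (the real implementation is partitionScalars, further below): any
// window digit above 2^{c-1}-1 is replaced by digit-2^c and a carry of 1 is
// propagated to the next window.
func signedDigitsSketch(scalar uint64, c uint64) []int64 {
	maxDigit := int64(1<<(c-1)) - 1 // largest digit we keep positive
	var digitsOut []int64
	carry := int64(0)
	for scalar != 0 || carry != 0 {
		digit := int64(scalar&((1<<c)-1)) + carry
		scalar >>= c
		carry = 0
		if digit > maxDigit {
			digit -= 1 << c // make the digit negative...
			carry = 1       // ...and compensate in the next window
		}
		digitsOut = append(digitsOut, digit)
	}
	return digitsOut
}

// For example, signedDigitsSketch(255, 4) returns [-1, 0, 1]:
// 255 = -1·16^0 + 0·16^1 + 1·16^2, and every digit now has absolute value
// at most 2^{c-1}, which is what halves the number of buckets in step 2.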
func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac {
	// partition the scalars
	digits, chunkStats := partitionScalars(scalars, c, config.NbTasks)

	nbChunks := computeNbChunks(c)

	// for each chunk, spawn one goroutine that'll loop through all the scalars in the
	// corresponding bit-window.
	// note that buckets is an array allocated on the stack and this is critical for performance.

	// each goroutine sends its result on the chChunks[i] channel
	chChunks := make([]chan g1JacExtended, nbChunks)
	for i := 0; i < len(chChunks); i++ {
		chChunks[i] = make(chan g1JacExtended, 1)
	}

	// we use a semaphore to limit the number of goroutines running concurrently
	// (only if NbTasks < NumCPU)
	var sem chan struct{}
	if config.NbTasks < runtime.NumCPU() {
		// we add nbChunks to the capacity because if a chunk is overweight we split it in two
		sem = make(chan struct{}, config.NbTasks+int(nbChunks))
		for i := 0; i < config.NbTasks; i++ {
			sem <- struct{}{}
		}
		defer func() {
			close(sem)
		}()
	}

	// the last chunk may be processed with a different method than the rest, as it could be smaller.
	n := len(points)
	for j := int(nbChunks - 1); j >= 0; j-- {
		processChunk := getChunkProcessorG1(c, chunkStats[j])
		if j == int(nbChunks-1) {
			processChunk = getChunkProcessorG1(lastC(c), chunkStats[j])
		}
		if chunkStats[j].weight >= 115 {
			// we split this chunk across more goroutines since it has more work to do than the others;
			// otherwise, its goroutine would finish much later than the others.
			chSplit := make(chan g1JacExtended, 2)
			split := n / 2

			if sem != nil {
				sem <- struct{}{} // add another token to the semaphore, since we split in two
			}
			go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem)
			go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem)
			go func(chunkID int) {
				s1 := <-chSplit
				s2 := <-chSplit
				close(chSplit)
				s1.add(&s2)
				chChunks[chunkID] <- s1
			}(j)
			continue
		}
		go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem)
	}

	return msmReduceChunkG1Affine(p, int(c), chChunks[:])
}
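// bucketSumSketch mirrors, on plain integers, what the chunk processors
// dispatched above do with curve points (an illustrative sketch, not the
// actual processChunkG1Jacobian code): signed digits are accumulated into
// 2^{c-1} buckets (bucket k holds everything to be multiplied by k+1), and
// the weighted sum Σ (k+1)·bucket[k] is obtained in a single backwards pass
// with a running suffix sum.
func bucketSumSketch(digits []int, nbBuckets int) int {
	buckets := make([]int, nbBuckets)
	for _, d := range digits {
		switch {
		case d > 0:
			buckets[d-1]++ // add G to the bucket
		case d < 0:
			buckets[-d-1]-- // add -G: negating a point is cheap
		}
	}
	// Σ_k (k+1)·bucket[k] equals the sum of all suffix sums:
	// bucket[k] is counted once in each of the k+1 suffixes that contain it.
	runningSum, total := 0, 0
	for k := nbBuckets - 1; k >= 0; k-- {
		runningSum += buckets[k]
		total += runningSum
	}
	return total
}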
// getChunkProcessorG1 decides, depending on the window size c and the chunk's
// statistics, which algorithm is best suited to process the chunk, and returns it.
func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16, sem chan struct{}) {
	switch c {

	case 2:
		return processChunkG1Jacobian[bucketg1JacExtendedC2]
	case 3:
		return processChunkG1Jacobian[bucketg1JacExtendedC3]
	case 4:
		return processChunkG1Jacobian[bucketg1JacExtendedC4]
	case 5:
		return processChunkG1Jacobian[bucketg1JacExtendedC5]
	case 6:
		return processChunkG1Jacobian[bucketg1JacExtendedC6]
	case 7:
		return processChunkG1Jacobian[bucketg1JacExtendedC7]
	case 8:
		return processChunkG1Jacobian[bucketg1JacExtendedC8]
	case 9:
		return processChunkG1Jacobian[bucketg1JacExtendedC9]
	case 10:
		const batchSize = 80
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// clear indicator that the batch affine method is not appropriate here.
			return processChunkG1Jacobian[bucketg1JacExtendedC10]
		}
		return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10]
	case 11:
		const batchSize = 150
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// clear indicator that the batch affine method is not appropriate here.
			return processChunkG1Jacobian[bucketg1JacExtendedC11]
		}
		return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11]
	case 12:
		const batchSize = 200
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// clear indicator that the batch affine method is not appropriate here.
			return processChunkG1Jacobian[bucketg1JacExtendedC12]
		}
		return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12]
	case 13:
		const batchSize = 350
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// clear indicator that the batch affine method is not appropriate here.
			return processChunkG1Jacobian[bucketg1JacExtendedC13]
		}
		return processChunkG1BatchAffine[bucketg1JacExtendedC13, bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13]
	case 14:
		const batchSize = 400
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// clear indicator that the batch affine method is not appropriate here.
			return processChunkG1Jacobian[bucketg1JacExtendedC14]
		}
		return processChunkG1BatchAffine[bucketg1JacExtendedC14, bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14]
	case 15:
		const batchSize = 500
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// clear indicator that the batch affine method is not appropriate here.
			return processChunkG1Jacobian[bucketg1JacExtendedC15]
		}
		return processChunkG1BatchAffine[bucketg1JacExtendedC15, bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15]
	case 16:
		const batchSize = 640
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// clear indicator that the batch affine method is not appropriate here.
			return processChunkG1Jacobian[bucketg1JacExtendedC16]
		}
		return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16]
	default:
		// panic("will not happen, c != previous values is not generated by templates")
		return processChunkG1Jacobian[bucketg1JacExtendedC16]
	}
}
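// The batch affine processors selected above amortize field inversions with
// Montgomery's batch-inversion trick. batchInvertSketch is an illustrative,
// self-contained version of that trick on fr elements (the real code works
// on base-field coordinates and is generated elsewhere): n inversions cost
// one inversion plus roughly 3(n-1) multiplications. All inputs must be
// non-zero.
func batchInvertSketch(a []fr.Element) []fr.Element {
	res := make([]fr.Element, len(a))
	// prefix products: res[i] = a[0]·a[1]·...·a[i-1]
	acc := fr.One()
	for i := 0; i < len(a); i++ {
		res[i].Set(&acc)
		acc.Mul(&acc, &a[i])
	}
	// a single inversion of the full product
	acc.Inverse(&acc)
	// unwind: a[i]^{-1} = (a[0]·...·a[i-1]) · (a[0]·...·a[i])^{-1}
	for i := len(a) - 1; i >= 0; i-- {
		res[i].Mul(&res[i], &acc)
		acc.Mul(&acc, &a[i])
	}
	return res
}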
// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp
func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac {
	var _p g1JacExtended
	totalj := <-chChunks[len(chChunks)-1]
	_p.Set(&totalj)
	for j := len(chChunks) - 2; j >= 0; j-- {
		for l := 0; l < c; l++ {
			_p.double(&_p)
		}
		totalj := <-chChunks[j]
		_p.add(&totalj)
	}

	return p.unsafeFromJacExtended(&_p)
}
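// hornerReduceSketch mirrors msmReduceChunkG1Affine above on plain integers
// (illustrative sketch): given per-chunk sums s_0..s_{k-1}, it evaluates
// Σ_j s_j·2^{j·c} Horner-style, as ((s_{k-1}·2^c + s_{k-2})·2^c + ...) + s_0,
// where each ·2^c is c doublings in the real code.
func hornerReduceSketch(chunkSums []int64, c int) int64 {
	res := chunkSums[len(chunkSums)-1]
	for j := len(chunkSums) - 2; j >= 0; j-- {
		for l := 0; l < c; l++ {
			res *= 2 // plays the role of _p.double(&_p)
		}
		res += chunkSums[j] // plays the role of _p.add(&totalj)
	}
	return res
}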
// Fold computes the multi-exponentiation \sum_{i=0}^{len(points)-1} points[i] *
// combinationCoeff^i and stores the result in p. It returns an error if the
// configuration is invalid.
func (p *G1Affine) Fold(points []G1Affine, combinationCoeff fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) {
	var _p G1Jac
	if _, err := _p.Fold(points, combinationCoeff, config); err != nil {
		return nil, err
	}
	p.FromJacobian(&_p)
	return p, nil
}

// Fold computes the multi-exponentiation \sum_{i=0}^{len(points)-1} points[i] *
// combinationCoeff^i and stores the result in p. It returns an error if the
// configuration is invalid.
func (p *G1Jac) Fold(points []G1Affine, combinationCoeff fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
	scalars := make([]fr.Element, len(points))
	scalar := fr.NewElement(1)
	for i := 0; i < len(points); i++ {
		scalars[i].Set(&scalar)
		scalar.Mul(&scalar, &combinationCoeff)
	}
	return p.MultiExp(points, scalars, config)
}
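// foldSketchG1 is an illustrative usage example for Fold (not generated
// code): with three copies of the generator and combinationCoeff = 2, Fold
// computes (1 + 2 + 4)·g = 7·g. It assumes the package-level Generators()
// helper.
func foldSketchG1() error {
	_, _, g1Aff, _ := Generators()
	points := []G1Affine{g1Aff, g1Aff, g1Aff}

	var folded G1Jac
	if _, err := folded.Fold(points, fr.NewElement(2), ecc.MultiExpConfig{}); err != nil {
		return err
	}

	// reference: 7·g by repeated addition
	var gj, ref G1Jac
	gj.FromAffine(&g1Aff)
	for i := 0; i < 7; i++ {
		ref.AddAssign(&gj)
	}

	if !folded.Equal(&ref) {
		return errors.New("fold sketch: mismatch")
	}
	return nil
}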
// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
//
// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
	var _p G2Jac
	if _, err := _p.MultiExp(points, scalars, config); err != nil {
		return nil, err
	}
	p.FromJacobian(&_p)
	return p, nil
}

// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
//
// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
	// TODO @gbotrel replace the ecc.MultiExpConfig by an Option pattern for maintainability.
	// note:
	// each of the msmCX methods is the same, except for the c constant it declares.
	// duplicating these methods (through template generation) allows us to declare the buckets on the stack.
	// the choice of c needs to be improved:
	// there is a theoretical value that gives optimal asymptotics,
	// but in practice other factors come into play, including:
	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
	// * number of CPUs
	// * cache friendliness (which depends on the host, G1 or G2...)
	//   --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.

	// for each msmCX:
	// step 1
	// we compute, for each scalar, over c-bit wide windows, nbChunks digits.
	// if a digit is larger than 2^{c-1}, then we propagate a carry of 1 to the next window and subtract
	// 2^{c} from the current digit, making it negative.
	// negative digits will be processed in the next step as adding -G into the bucket instead of G
	// (computing -G is cheap, and this saves us half of the buckets).
	// step 2
	// buckets are declared on the stack;
	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1).
	// we use extended Jacobian formulas here as they are faster than mixed addition.
	// msmProcessChunk places points into buckets based on their selector and sends the weighted bucket sum on the given channel.
	// step 3
	// reduce the buckets' weighted sums into our result (msmReduceChunk).

	// ensure len(points) == len(scalars)
	nbPoints := len(points)
	if nbPoints != len(scalars) {
		return nil, errors.New("len(points) != len(scalars)")
	}

	// if NbTasks is not set, use all available CPUs
	if config.NbTasks <= 0 {
		config.NbTasks = runtime.NumCPU() * 2
	} else if config.NbTasks > 1024 {
		return nil, errors.New("invalid config: config.NbTasks > 1024")
	}

	// here, we compute the best C for nbPoints;
	// we split recursively until nbChunks(c) >= nbTasks.
	bestC := func(nbPoints int) uint64 {
		// implemented msmC methods (the c we use must be in this slice)
		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
		var C uint64
		// approximate cost (in group operations):
		// cost = bits/c * (nbPoints + 2^{c})
		// this needs to be verified empirically;
		// for example, on a MBP 2016, for a G2 MultiExp with > 8M points, hand-picking c gives better results.
		min := math.MaxFloat64
		for _, c := range implementedCs {
			cc := (fr.Bits + 1) * (nbPoints + (1 << c))
			cost := float64(cc) / float64(c)
			if cost < min {
				min = cost
				C = c
			}
		}
		return C
	}

	C := bestC(nbPoints)
	nbChunks := int(computeNbChunks(C))

	// should we recursively split the msm in half? (see below)
	// we want to minimize the execution time of the algorithm;
	// splitting the msm will **add** operations, but if it allows us to use more CPUs, it might be worth it.

	// costFunction returns a metric that represents the "wall time" of the algorithm
	costFunction := func(nbTasks, nbCpus, costPerTask int) int {
		// cost of the reduction of all tasks (msmReduceChunk)
		totalCost := nbTasks

		// cost of the computation of each task (msmProcessChunk)
		for nbTasks >= nbCpus {
			nbTasks -= nbCpus
			totalCost += costPerTask
		}
		if nbTasks > 0 {
			totalCost += costPerTask
		}
		return totalCost
	}

	// costPerTask is the approximate number of group ops per task
	costPerTask := func(c uint64, nbPoints int) int { return nbPoints + int(1<<c) }

	costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints))

	cPostSplit := bestC(nbPoints / 2)
	nbChunksPostSplit := int(computeNbChunks(cPostSplit))
	costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2))

	// if the cost of the split msm is lower than the cost of the non-split msm, we split
	if costPostSplit < costPreSplit {
		config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0))
		var _p G2Jac
		chDone := make(chan struct{}, 1)
		go func() {
			_p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config)
			close(chDone)
		}()
		p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config)
		<-chDone
		p.AddAssign(&_p)
		return p, nil
	}

	// if we don't split, we use the best C we found
	_innerMsmG2(p, C, points, scalars, config)

	return p, nil
}
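// splitCostSketch reproduces, on plain integers, the split decision made in
// both MultiExp variants above (an illustrative rewrite, not the code the
// library runs): the wall-time model is nbTasks (reduction) plus
// ceil(nbTasks/nbCpus) rounds of costPerTask ≈ nbPoints + 2^c, and the msm
// is split in half when the post-split cost is lower.
func splitCostSketch(nbPoints, nbCpus int, c, cHalf uint64, nbChunks, nbChunksHalf int) (pre, post int) {
	cost := func(nbTasks, costPerTask int) int {
		rounds := (nbTasks + nbCpus - 1) / nbCpus // same result as the subtraction loop in costFunction
		return nbTasks + rounds*costPerTask
	}
	pre = cost(nbChunks, nbPoints+int(1<<c))
	post = cost(2*nbChunksHalf, nbPoints/2+int(1<<cHalf))
	return pre, post // split if post < pre
}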
func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac {
	// partition the scalars
	digits, chunkStats := partitionScalars(scalars, c, config.NbTasks)

	nbChunks := computeNbChunks(c)

	// for each chunk, spawn one goroutine that'll loop through all the scalars in the
	// corresponding bit-window.
	// note that buckets is an array allocated on the stack and this is critical for performance.

	// each goroutine sends its result on the chChunks[i] channel
	chChunks := make([]chan g2JacExtended, nbChunks)
	for i := 0; i < len(chChunks); i++ {
		chChunks[i] = make(chan g2JacExtended, 1)
	}

	// we use a semaphore to limit the number of goroutines running concurrently
	// (only if NbTasks < NumCPU)
	var sem chan struct{}
	if config.NbTasks < runtime.NumCPU() {
		// we add nbChunks to the capacity because if a chunk is overweight we split it in two
		sem = make(chan struct{}, config.NbTasks+int(nbChunks))
		for i := 0; i < config.NbTasks; i++ {
			sem <- struct{}{}
		}
		defer func() {
			close(sem)
		}()
	}

	// the last chunk may be processed with a different method than the rest, as it could be smaller.
	n := len(points)
	for j := int(nbChunks - 1); j >= 0; j-- {
		processChunk := getChunkProcessorG2(c, chunkStats[j])
		if j == int(nbChunks-1) {
			processChunk = getChunkProcessorG2(lastC(c), chunkStats[j])
		}
		if chunkStats[j].weight >= 115 {
			// we split this chunk across more goroutines since it has more work to do than the others;
			// otherwise, its goroutine would finish much later than the others.
			chSplit := make(chan g2JacExtended, 2)
			split := n / 2

			if sem != nil {
				sem <- struct{}{} // add another token to the semaphore, since we split in two
			}
			go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem)
			go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem)
			go func(chunkID int) {
				s1 := <-chSplit
				s2 := <-chSplit
				close(chSplit)
				s1.add(&s2)
				chChunks[chunkID] <- s1
			}(j)
			continue
		}
		go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem)
	}

	return msmReduceChunkG2Affine(p, int(c), chChunks[:])
}
// getChunkProcessorG2 decides, depending on the window size c and the chunk's
// statistics, which algorithm is best suited to process the chunk, and returns it.
func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16, sem chan struct{}) {
	switch c {

	case 2:
		return processChunkG2Jacobian[bucketg2JacExtendedC2]
	case 3:
		return processChunkG2Jacobian[bucketg2JacExtendedC3]
	case 4:
		return processChunkG2Jacobian[bucketg2JacExtendedC4]
	case 5:
		return processChunkG2Jacobian[bucketg2JacExtendedC5]
	case 6:
		return processChunkG2Jacobian[bucketg2JacExtendedC6]
	case 7:
		return processChunkG2Jacobian[bucketg2JacExtendedC7]
	case 8:
		return processChunkG2Jacobian[bucketg2JacExtendedC8]
	case 9:
		return processChunkG2Jacobian[bucketg2JacExtendedC9]
	case 10:
		const batchSize = 80
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// clear indicator that the batch affine method is not appropriate here.
			return processChunkG2Jacobian[bucketg2JacExtendedC10]
		}
		return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10]
	case 11:
		const batchSize = 150
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// clear indicator that the batch affine method is not appropriate here.
			return processChunkG2Jacobian[bucketg2JacExtendedC11]
		}
		return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11]
	case 12:
		const batchSize = 200
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// clear indicator that the batch affine method is not appropriate here.
			return processChunkG2Jacobian[bucketg2JacExtendedC12]
		}
		return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12]
	case 13:
		const batchSize = 350
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// clear indicator that the batch affine method is not appropriate here.
			return processChunkG2Jacobian[bucketg2JacExtendedC13]
		}
		return processChunkG2BatchAffine[bucketg2JacExtendedC13, bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13]
	case 14:
		const batchSize = 400
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// clear indicator that the batch affine method is not appropriate here.
			return processChunkG2Jacobian[bucketg2JacExtendedC14]
		}
		return processChunkG2BatchAffine[bucketg2JacExtendedC14, bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14]
	case 15:
		const batchSize = 500
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// clear indicator that the batch affine method is not appropriate here.
			return processChunkG2Jacobian[bucketg2JacExtendedC15]
		}
		return processChunkG2BatchAffine[bucketg2JacExtendedC15, bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15]
	case 16:
		const batchSize = 640
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// clear indicator that the batch affine method is not appropriate here.
			return processChunkG2Jacobian[bucketg2JacExtendedC16]
		}
		return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16]
	default:
		// panic("will not happen, c != previous values is not generated by templates")
		return processChunkG2Jacobian[bucketg2JacExtendedC16]
	}
}

// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp
func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac {
	var _p g2JacExtended
	totalj := <-chChunks[len(chChunks)-1]
	_p.Set(&totalj)
	for j := len(chChunks) - 2; j >= 0; j-- {
		for l := 0; l < c; l++ {
			_p.double(&_p)
		}
		totalj := <-chChunks[j]
		_p.add(&totalj)
	}

	return p.unsafeFromJacExtended(&_p)
}

// Fold computes the multi-exponentiation \sum_{i=0}^{len(points)-1} points[i] *
// combinationCoeff^i and stores the result in p. It returns an error if the
// configuration is invalid.
func (p *G2Affine) Fold(points []G2Affine, combinationCoeff fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
	var _p G2Jac
	if _, err := _p.Fold(points, combinationCoeff, config); err != nil {
		return nil, err
	}
	p.FromJacobian(&_p)
	return p, nil
}
// Fold computes the multi-exponentiation \sum_{i=0}^{len(points)-1} points[i] *
// combinationCoeff^i and stores the result in p. It returns an error if the
// configuration is invalid.
func (p *G2Jac) Fold(points []G2Affine, combinationCoeff fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
	scalars := make([]fr.Element, len(points))
	scalar := fr.NewElement(1)
	for i := 0; i < len(points); i++ {
		scalars[i].Set(&scalar)
		scalar.Mul(&scalar, &combinationCoeff)
	}
	return p.MultiExp(points, scalars, config)
}

// selector stores the index, mask and shifts needed to select bits from a scalar;
// it is used during the multiExp algorithm and batch scalar multiplication.
type selector struct {
	index uint64 // index in the multi-word scalar to select bits from
	mask  uint64 // mask (c-bit wide)
	shift uint64 // shift needed to get our bits in the low positions

	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
	maskHigh        uint64 // same as mask, for index+1
	shiftHigh       uint64 // same as shift, for index+1
}
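// multiWordSelectSketch shows how the selector fields extract a c-bit window
// that straddles two 64-bit words (illustrative sketch; partitionScalars
// below does the equivalent with precomputed masks): lo and hi are adjacent
// words of the scalar, and shift is the window's bit offset within lo.
func multiWordSelectSketch(lo, hi uint64, c, shift uint64) uint64 {
	digit := (lo >> shift) & ((1 << c) - 1) // low bits of the window, from the first word
	if shift > 64-c {
		// the window runs past the word boundary: fetch the missing high bits
		nbBitsHigh := shift - (64 - c)
		maskHigh := (uint64(1) << nbBitsHigh) - 1
		shiftHigh := c - nbBitsHigh
		digit += (hi & maskHigh) << shiftHigh
	}
	return digit
}

// For example, with c = 5 and shift = 60 the window covers bits 60..63 of lo
// and bit 0 of hi: nbBitsHigh = 1, maskHigh = 0b1, shiftHigh = 4.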
// computeNbChunks returns the number of chunks for a given window size c;
// the last chunk may be bigger to accommodate a potential carry from the NAF decomposition.
func computeNbChunks(c uint64) uint64 {
	return (fr.Bits + c - 1) / c
}

// lastC returns the window size of the last chunk of a scalar;
// this last window should accommodate a carry (from the NAF decomposition):
// it is == c if we have 1 available bit,
// it is > c if we have 0 available bits,
// and it is < c if we have 2+ available bits.
func lastC(c uint64) uint64 {
	nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits
	return c + 1 - nbAvailableBits
}
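// chunkLayoutSketch tabulates computeNbChunks and lastC for a hypothetical
// 253-bit scalar field (253 is an illustrative stand-in, not necessarily
// fr.Bits for this curve). Each entry is {c, nbChunks, lastC}; with bits =
// 253 it yields {4, 64, 2}, {8, 32, 6} and {16, 16, 14}: e.g. for c = 16,
// the 16 chunks span 256 bits, 3 bits are spare, so lastC = 16 + 1 - 3 = 14.
func chunkLayoutSketch() [][3]uint64 {
	const bits = 253 // stand-in for fr.Bits
	out := make([][3]uint64, 0, 3)
	for _, c := range []uint64{4, 8, 16} {
		n := (bits + c - 1) / c      // computeNbChunks for this bit count
		last := c + 1 - (n*c - bits) // lastC for this bit count
		out = append(out, [3]uint64{c, n, last})
	}
	return out
}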
type chunkStat struct {
	// relative weight of work compared to other chunks. 100.0 -> nominal weight.
	weight float32

	// percentage of buckets filled in the window
	ppBucketFilled float32
	nbBucketFilled int
}

// partitionScalars computes, for each scalar, over c-bit wide windows, nbChunks digits.
// if a digit is larger than 2^{c-1}, then we propagate a carry of 1 to the next window and subtract
// 2^{c} from the current digit, making it negative.
// negative digits can be processed in a later step as adding -G into the bucket instead of G
// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication).
// (the digit packing used here is decoded in the sketch after this function.)
func partitionScalars(scalars []fr.Element, c uint64, nbTasks int) ([]uint16, []chunkStat) {
	// no benefit here to having more tasks than CPUs
	if nbTasks > runtime.NumCPU() {
		nbTasks = runtime.NumCPU()
	}

	// number of c-bit radixes in a scalar
	nbChunks := computeNbChunks(c)

	digits := make([]uint16, len(scalars)*int(nbChunks))

	mask := uint64((1 << c) - 1) // low c bits are 1
	max := int(1<<(c-1)) - 1     // max value (inclusive) we want for our digits
	cDivides64 := (64 % c) == 0  // if c doesn't divide 64, we may need to select over multiple words

	// compute offset and word selector / shift to select the right bits of our windows
	selectors := make([]selector, nbChunks)
	for chunk := uint64(0); chunk < nbChunks; chunk++ {
		jc := uint64(chunk * c)
		d := selector{}
		d.index = jc / 64
		d.shift = jc - (d.index * 64)
		d.mask = mask << d.shift
		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
		if d.multiWordSelect {
			nbBitsHigh := d.shift - uint64(64-c)
			d.maskHigh = (1 << nbBitsHigh) - 1
			d.shiftHigh = (c - nbBitsHigh)
		}
		selectors[chunk] = d
	}

	parallel.Execute(len(scalars), func(start, end int) {
		for i := start; i < end; i++ {
			if scalars[i].IsZero() {
				// everything is 0, no need to process this scalar
				continue
			}
			scalar := scalars[i].Bits()

			var carry int

			// for each chunk in the scalar, compute the current digit and an eventual carry
			for chunk := uint64(0); chunk < nbChunks-1; chunk++ {
				s := selectors[chunk]

				// init with carry if any
				digit := carry
				carry = 0

				// digit = value of the c-bit window
				digit += int((scalar[s.index] & s.mask) >> s.shift)

				if s.multiWordSelect {
					// we are selecting bits over 2 words
					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
				}

				// if the digit is larger than 2^{c-1}, then we propagate a carry of 1 to the next window
				// and subtract 2^{c} from the current digit, making it negative.
				if digit > max {
					digit -= (1 << c)
					carry = 1
				}

				// if the digit is zero, it has no impact on the result
				if digit == 0 {
					continue
				}

				var bits uint16
				if digit > 0 {
					bits = uint16(digit) << 1
				} else {
					bits = (uint16(-digit-1) << 1) + 1
				}
				digits[int(chunk)*len(scalars)+i] = bits
			}

			// for the last chunk, we don't want to borrow from a next window
			// (but it may have a larger max value)
			chunk := nbChunks - 1
			s := selectors[chunk]
			// init with carry if any
			digit := carry
			// digit = value of the c-bit window
			digit += int((scalar[s.index] & s.mask) >> s.shift)
			if s.multiWordSelect {
				// we are selecting bits over 2 words
				digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
			}
			digits[int(chunk)*len(scalars)+i] = uint16(digit) << 1
		}

	}, nbTasks)

	// aggregate chunk stats
	chunkStats := make([]chunkStat, nbChunks)
	if c <= 9 {
		// no need to compute stats for small window sizes
		return digits, chunkStats
	}
	parallel.Execute(len(chunkStats), func(start, end int) {
		// for each chunk, compute its statistics
		for chunkID := start; chunkID < end; chunkID++ {
			// indicates if a bucket is hit
			var b bitSetC16

			// digits for the chunk
			chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)]

			totalOps := 0
			nz := 0 // non-zero bucket count
			for _, digit := range chunkDigits {
				if digit == 0 {
					continue
				}
				totalOps++
				bucketID := digit >> 1
				if digit&1 == 0 {
					bucketID -= 1
				}
				if !b[bucketID] {
					nz++
					b[bucketID] = true
				}
			}
			chunkStats[chunkID].weight = float32(totalOps) // count the number of ops for now; the weight is normalized below
			chunkStats[chunkID].ppBucketFilled = (float32(nz) * 100.0) / float32(int(1<<(c-1)))
			chunkStats[chunkID].nbBucketFilled = nz
		}
	}, nbTasks)

	totalOps := float32(0.0)
	for _, stat := range chunkStats {
		totalOps += stat.weight
	}

	target := totalOps / float32(nbChunks)
	if target != 0.0 {
		// if target == 0, all the scalars are 0 everywhere and there is no work to be done.
		for i := 0; i < len(chunkStats); i++ {
			chunkStats[i].weight = (chunkStats[i].weight * 100.0) / target
		}
	}

	return digits, chunkStats
}
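// decodeDigitSketch inverts the digit packing used by partitionScalars above
// (illustrative sketch): a non-zero entry stores the digit's magnitude and
// sign as
//	positive digit d -> uint16(d) << 1
//	negative digit d -> uint16(-d-1)<<1 + 1
// so the low bit flags a negative digit and the remaining bits index the
// bucket (bucket k holds magnitude k+1, matching the stats loop above).
func decodeDigitSketch(bits uint16) int {
	if bits == 0 {
		return 0 // zero digits are skipped entirely
	}
	if bits&1 == 1 {
		return -int(bits>>1) - 1 // odd entries encode negative digits
	}
	return int(bits >> 1) // even entries encode positive digits
}

// e.g. a digit of 3 is stored as 6 and decoded back to 3; a digit of -1 is
// stored as 1 and decoded back to -1.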