github.com/consensys/gnark-crypto@v0.14.0/ecc/bls12-378/multiexp.go

// Copyright 2020 Consensys Software Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Code generated by consensys/gnark-crypto DO NOT EDIT

package bls12378

import (
	"errors"
	"github.com/consensys/gnark-crypto/ecc"
	"github.com/consensys/gnark-crypto/ecc/bls12-378/fr"
	"github.com/consensys/gnark-crypto/internal/parallel"
	"math"
	"math/big" // used only by the illustrative sketches below
	"runtime"
)

// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
//
// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
func (p *G1Affine) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) {
	var _p G1Jac
	if _, err := _p.MultiExp(points, scalars, config); err != nil {
		return nil, err
	}
	p.FromJacobian(&_p)
	return p, nil
}
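
// Illustrative usage of the affine wrapper above (a sketch, not part of the
// generated code): compute Σ scalars[i]·points[i] with bounded parallelism.
// Generators, fr.NewElement and ecc.MultiExpConfig are existing package APIs;
// the input sizes and values here are arbitrary.
func exampleMultiExpG1Usage() (G1Affine, error) {
	_, _, g1Aff, _ := Generators()
	points := make([]G1Affine, 4)
	scalars := make([]fr.Element, 4)
	for i := range points {
		points[i] = g1Aff                         // toy input: reuse the generator
		scalars[i] = fr.NewElement(uint64(i + 1)) // scalars 1, 2, 3, 4
	}
	var res G1Affine
	// NbTasks caps the number of parallel tasks; <= 0 lets MultiExp pick a default.
	if _, err := res.MultiExp(points, scalars, ecc.MultiExpConfig{NbTasks: 2}); err != nil {
		return G1Affine{}, err
	}
	return res, nil // here: (1+2+3+4)·G = 10·G
}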

// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
//
// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
	// TODO @gbotrel replace the ecc.MultiExpConfig by an Option pattern for maintainability.
	// note:
	// each of the msmCX methods is the same, except for the c constant it declares
	// duplicating (through template generation) these methods allows declaring the buckets on the stack
	// the choice of c needs to be improved:
	// there is a theoretical value that gives optimal asymptotics
	// but in practice, other factors come into play, including:
	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
	// * number of CPUs
	// * cache friendliness (which depends on the host, G1 or G2... )
	//	--> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.

	// for each msmCX
	// step 1
	// we compute, for each scalar, nbChunks digits over c-bit wide windows
	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
	// 2^{c} from the current digit, making it negative.
	// negative digits will be processed in the next step as adding -G into the bucket instead of G
	// (computing -G is cheap, and this saves us half of the buckets)
	// step 2
	// buckets are declared on the stack
	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
	// we use jacobian extended formulas here as they are faster than mixed addition
	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum on the given channel
	// step 3
	// reduce the buckets' weighted sums into our result (msmReduceChunk)

	// ensure len(points) == len(scalars)
	nbPoints := len(points)
	if nbPoints != len(scalars) {
		return nil, errors.New("len(points) != len(scalars)")
	}

	// if nbTasks is not set, use all available CPUs
	if config.NbTasks <= 0 {
		config.NbTasks = runtime.NumCPU() * 2
	} else if config.NbTasks > 1024 {
		return nil, errors.New("invalid config: config.NbTasks > 1024")
	}

	// here, we compute the best C for nbPoints
	// we split recursively until nbChunks(c) >= nbTasks
	bestC := func(nbPoints int) uint64 {
		// implemented msmC methods (the c we use must be in this slice)
		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
		var C uint64
		// approximate cost (in group operations)
		// cost = bits/c * (nbPoints + 2^{c})
		// this needs to be verified empirically.
		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand-picking c gives better results
		min := math.MaxFloat64
		for _, c := range implementedCs {
			cc := (fr.Bits + 1) * (nbPoints + (1 << c))
			cost := float64(cc) / float64(c)
			if cost < min {
				min = cost
				C = c
			}
		}
		return C
	}

	C := bestC(nbPoints)
	nbChunks := int(computeNbChunks(C))

	// should we recursively split the msm in half? (see below)
	// we want to minimize the execution time of the algorithm;
	// splitting the msm will **add** operations, but if it allows us to use more CPUs, it might be worth it.

	// costFunction returns a metric that represents the "wall time" of the algorithm
	costFunction := func(nbTasks, nbCpus, costPerTask int) int {
		// cost for the reduction of all tasks (msmReduceChunk)
		totalCost := nbTasks

		// cost for the computation of each task (msmProcessChunk)
		for nbTasks >= nbCpus {
			nbTasks -= nbCpus
			totalCost += costPerTask
		}
		if nbTasks > 0 {
			totalCost += costPerTask
		}
		return totalCost
	}

	// costPerTask is the approximate number of group ops per task
	costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) }

	costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints))

	cPostSplit := bestC(nbPoints / 2)
	nbChunksPostSplit := int(computeNbChunks(cPostSplit))
	costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2))

	// if the cost of the split msm is lower than the cost of the non-split msm, we split
	if costPostSplit < costPreSplit {
		config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0))
		var _p G1Jac
		chDone := make(chan struct{}, 1)
		go func() {
			_p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config)
			close(chDone)
		}()
		p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config)
		<-chDone
		p.AddAssign(&_p)
		return p, nil
	}

	// if we don't split, we use the best C we found
	_innerMsmG1(p, C, points, scalars, config)

	return p, nil
}
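
// naiveMultiExpG1 is an illustrative reference (a sketch, not part of the
// generated code): it evaluates Σ scalars[i]·points[i] one term at a time,
// i.e. n full scalar multiplications, versus roughly n + 2^{c-1} group ops
// per chunk for the bucket method above. It is only useful to cross-check
// MultiExp results on small inputs.
func naiveMultiExpG1(points []G1Affine, scalars []fr.Element) G1Jac {
	var acc G1Jac // zero value (Z == 0) is the point at infinity
	var s big.Int
	for i := range points {
		if scalars[i].IsZero() {
			continue // zero scalar: no contribution
		}
		scalars[i].BigInt(&s) // lift the field element to an integer
		var base, term G1Jac
		base.FromAffine(&points[i])
		term.ScalarMultiplication(&base, &s)
		acc.AddAssign(&term)
	}
	return acc
}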

func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac {
	// partition the scalars
	digits, chunkStats := partitionScalars(scalars, c, config.NbTasks)

	nbChunks := computeNbChunks(c)

	// for each chunk, spawn one goroutine that'll loop through all the scalars in the
	// corresponding bit-window
	// note that buckets is an array allocated on the stack and this is critical for performance

	// each goroutine sends its result on the chChunks[i] channel
	chChunks := make([]chan g1JacExtended, nbChunks)
	for i := 0; i < len(chChunks); i++ {
		chChunks[i] = make(chan g1JacExtended, 1)
	}

	// we use a semaphore to limit the number of goroutines running concurrently
	// (only if nbTasks < nbCPU)
	var sem chan struct{}
	if config.NbTasks < runtime.NumCPU() {
		// we add nbChunks because if a chunk is overweight we split it in two
		sem = make(chan struct{}, config.NbTasks+int(nbChunks))
		for i := 0; i < config.NbTasks; i++ {
			sem <- struct{}{}
		}
		defer func() {
			close(sem)
		}()
	}

	// the last chunk may be processed with a different method than the rest, as it could be smaller.
	n := len(points)
	for j := int(nbChunks - 1); j >= 0; j-- {
		processChunk := getChunkProcessorG1(c, chunkStats[j])
		if j == int(nbChunks-1) {
			processChunk = getChunkProcessorG1(lastC(c), chunkStats[j])
		}
		if chunkStats[j].weight >= 115 {
			// this chunk is overweight (nominal weight is 100), so we split it across two goroutines;
			// otherwise this goroutine would finish much later than the others.
			chSplit := make(chan g1JacExtended, 2)
			split := n / 2

			if sem != nil {
				sem <- struct{}{} // add another token to the semaphore, since we split in two.
			}
			go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem)
			go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem)
			go func(chunkID int) {
				s1 := <-chSplit
				s2 := <-chSplit
				close(chSplit)
				s1.add(&s2)
				chChunks[chunkID] <- s1
			}(j)
			continue
		}
		go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem)
	}

	return msmReduceChunkG1Affine(p, int(c), chChunks[:])
}
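
// processChunkSketchG1 is a serial, simplified sketch (not part of the
// generated code) of what a chunk processor does for one c-bit window; the
// real processChunkG1Jacobian / processChunkG1BatchAffine variants live in
// generated files and keep the buckets on the stack. Digits use the signed
// encoding produced by partitionScalars: digit 0 means "skip", otherwise
// bit 0 is the sign and the remaining bits select the bucket.
func processChunkSketchG1(c uint64, points []G1Affine, digits []uint16) g1JacExtended {
	// 2^{c-1} buckets are enough because digits are signed (see step 1 above)
	buckets := make([]g1JacExtended, 1<<(c-1))
	for i := range buckets {
		buckets[i].setInfinity()
	}
	for i := range digits {
		bits := digits[i]
		if bits == 0 {
			continue
		}
		if bits&1 == 0 {
			buckets[(bits>>1)-1].addMixed(&points[i]) // positive digit d: bucket d-1 gets +P
		} else {
			buckets[bits>>1].subMixed(&points[i]) // negative digit -d: bucket d-1 gets -P
		}
	}
	// weighted sum Σ (k+1)·buckets[k] via the running-sum trick:
	// two additions per bucket instead of one scalar multiplication per bucket.
	var runningSum, total g1JacExtended
	runningSum.setInfinity()
	total.setInfinity()
	for k := len(buckets) - 1; k >= 0; k-- {
		if !buckets[k].ZZ.IsZero() { // skip empty buckets
			runningSum.add(&buckets[k])
		}
		total.add(&runningSum)
	}
	return total
}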

// getChunkProcessorG1 decides, depending on the window size c and the chunk's
// statistics, which algorithm is best suited to process the chunk.
func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16, sem chan struct{}) {
	switch c {

	case 2:
		return processChunkG1Jacobian[bucketg1JacExtendedC2]
	case 3:
		return processChunkG1Jacobian[bucketg1JacExtendedC3]
	case 4:
		return processChunkG1Jacobian[bucketg1JacExtendedC4]
	case 5:
		return processChunkG1Jacobian[bucketg1JacExtendedC5]
	case 6:
		return processChunkG1Jacobian[bucketg1JacExtendedC6]
	case 7:
		return processChunkG1Jacobian[bucketg1JacExtendedC7]
	case 8:
		return processChunkG1Jacobian[bucketg1JacExtendedC8]
	case 9:
		return processChunkG1Jacobian[bucketg1JacExtendedC9]
	case 10:
		const batchSize = 80
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// a clear indicator that the batch affine method is not appropriate here.
			return processChunkG1Jacobian[bucketg1JacExtendedC10]
		}
		return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10]
	case 11:
		const batchSize = 150
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// a clear indicator that the batch affine method is not appropriate here.
			return processChunkG1Jacobian[bucketg1JacExtendedC11]
		}
		return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11]
	case 12:
		const batchSize = 200
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// a clear indicator that the batch affine method is not appropriate here.
			return processChunkG1Jacobian[bucketg1JacExtendedC12]
		}
		return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12]
	case 13:
		const batchSize = 350
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// a clear indicator that the batch affine method is not appropriate here.
			return processChunkG1Jacobian[bucketg1JacExtendedC13]
		}
		return processChunkG1BatchAffine[bucketg1JacExtendedC13, bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13]
	case 14:
		const batchSize = 400
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// a clear indicator that the batch affine method is not appropriate here.
			return processChunkG1Jacobian[bucketg1JacExtendedC14]
		}
		return processChunkG1BatchAffine[bucketg1JacExtendedC14, bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14]
	case 15:
		const batchSize = 500
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// a clear indicator that the batch affine method is not appropriate here.
			return processChunkG1Jacobian[bucketg1JacExtendedC15]
		}
		return processChunkG1BatchAffine[bucketg1JacExtendedC15, bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15]
	case 16:
		const batchSize = 640
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// a clear indicator that the batch affine method is not appropriate here.
			return processChunkG1Jacobian[bucketg1JacExtendedC16]
		}
		return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16]
	default:
		// panic("will not happen c != previous values is not generated by templates")
		return processChunkG1Jacobian[bucketg1JacExtendedC16]
	}
}
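
// Why a batch affine variant at all: adding an affine point into an affine
// bucket needs one base-field inversion per addition, which only pays off if
// the cost of inversions is shared. The batch processors amortize a single
// inversion over many queued additions using Montgomery's trick, sketched
// here on fr elements (an illustration only; the real code applies it to
// base-field coordinates, and all inputs are assumed non-zero; fr's own
// BatchInvert implements the same idea):
func montgomeryBatchInvertSketch(in []fr.Element) []fr.Element {
	res := make([]fr.Element, len(in))
	if len(in) == 0 {
		return res
	}
	// forward pass: prefix[i] = in[0]·…·in[i]
	prefix := make([]fr.Element, len(in))
	prefix[0] = in[0]
	for i := 1; i < len(in); i++ {
		prefix[i].Mul(&prefix[i-1], &in[i])
	}
	// a single inversion for the whole batch
	var inv fr.Element
	inv.Inverse(&prefix[len(in)-1])
	// backward pass: peel off one factor at a time
	for i := len(in) - 1; i >= 1; i-- {
		res[i].Mul(&inv, &prefix[i-1]) // res[i] = in[i]^{-1}
		inv.Mul(&inv, &in[i])          // now inv = (in[0]·…·in[i-1])^{-1}
	}
	res[0] = inv
	return res
}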

// msmReduceChunkG1Affine reduces the weighted sums of the buckets into the result of the multiExp
func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac {
	var _p g1JacExtended
	totalj := <-chChunks[len(chChunks)-1]
	_p.Set(&totalj)
	for j := len(chChunks) - 2; j >= 0; j-- {
		for l := 0; l < c; l++ {
			_p.double(&_p)
		}
		totalj := <-chChunks[j]
		_p.add(&totalj)
	}

	return p.unsafeFromJacExtended(&_p)
}
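
// The loop above is Horner's rule in the exponent: with chunk sums
// S_0 … S_{m-1} (least-significant window first), the result is
// ((S_{m-1}·2^c + S_{m-2})·2^c + …)·2^c + S_0, where each ·2^c costs c
// doublings. A scalar-level sketch of the same recombination (not part of
// the generated code), with big.Int values standing in for group elements;
// it assumes a non-empty input:
func hornerRecombineSketch(chunkSums []uint64, c int) *big.Int {
	acc := new(big.Int).SetUint64(chunkSums[len(chunkSums)-1])
	tmp := new(big.Int)
	for j := len(chunkSums) - 2; j >= 0; j-- {
		acc.Lsh(acc, uint(c)) // the c doublings
		acc.Add(acc, tmp.SetUint64(chunkSums[j]))
	}
	return acc
}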

// Fold computes the multi-exponentiation \sum_{i=0}^{len(points)-1} points[i] *
// combinationCoeff^i and stores the result in p. It returns an error if the
// configuration is invalid.
func (p *G1Affine) Fold(points []G1Affine, combinationCoeff fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) {
	var _p G1Jac
	if _, err := _p.Fold(points, combinationCoeff, config); err != nil {
		return nil, err
	}
	p.FromJacobian(&_p)
	return p, nil
}

// Fold computes the multi-exponentiation \sum_{i=0}^{len(points)-1} points[i] *
// combinationCoeff^i and stores the result in p. It returns an error if the
// configuration is invalid.
func (p *G1Jac) Fold(points []G1Affine, combinationCoeff fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
	scalars := make([]fr.Element, len(points))
	scalar := fr.NewElement(1)
	for i := 0; i < len(points); i++ {
		scalars[i].Set(&scalar)
		scalar.Mul(&scalar, &combinationCoeff)
	}
	return p.MultiExp(points, scalars, config)
}
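
// Illustrative usage of Fold (a sketch, not part of the generated code):
// batch several point checks into one by taking a linear combination with
// powers of a single coefficient r, i.e. P_0 + r·P_1 + r²·P_2 + ….
// Choosing r is the caller's responsibility (e.g. a Fiat-Shamir challenge).
func exampleFoldG1(points []G1Affine, r fr.Element) (G1Affine, error) {
	var folded G1Affine
	if _, err := folded.Fold(points, r, ecc.MultiExpConfig{}); err != nil {
		return G1Affine{}, err
	}
	return folded, nil
}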

// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
//
// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
	var _p G2Jac
	if _, err := _p.MultiExp(points, scalars, config); err != nil {
		return nil, err
	}
	p.FromJacobian(&_p)
	return p, nil
}

// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
//
// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
	// TODO @gbotrel replace the ecc.MultiExpConfig by an Option pattern for maintainability.
	// note:
	// each of the msmCX methods is the same, except for the c constant it declares
	// duplicating (through template generation) these methods allows declaring the buckets on the stack
	// the choice of c needs to be improved:
	// there is a theoretical value that gives optimal asymptotics
	// but in practice, other factors come into play, including:
	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
	// * number of CPUs
	// * cache friendliness (which depends on the host, G1 or G2... )
	//	--> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.

	// for each msmCX
	// step 1
	// we compute, for each scalar, nbChunks digits over c-bit wide windows
	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
	// 2^{c} from the current digit, making it negative.
	// negative digits will be processed in the next step as adding -G into the bucket instead of G
	// (computing -G is cheap, and this saves us half of the buckets)
	// step 2
	// buckets are declared on the stack
	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
	// we use jacobian extended formulas here as they are faster than mixed addition
	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum on the given channel
	// step 3
	// reduce the buckets' weighted sums into our result (msmReduceChunk)

	// ensure len(points) == len(scalars)
	nbPoints := len(points)
	if nbPoints != len(scalars) {
		return nil, errors.New("len(points) != len(scalars)")
	}

	// if nbTasks is not set, use all available CPUs
	if config.NbTasks <= 0 {
		config.NbTasks = runtime.NumCPU() * 2
	} else if config.NbTasks > 1024 {
		return nil, errors.New("invalid config: config.NbTasks > 1024")
	}

	// here, we compute the best C for nbPoints
	// we split recursively until nbChunks(c) >= nbTasks
	bestC := func(nbPoints int) uint64 {
		// implemented msmC methods (the c we use must be in this slice)
		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
		var C uint64
		// approximate cost (in group operations)
		// cost = bits/c * (nbPoints + 2^{c})
		// this needs to be verified empirically.
		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand-picking c gives better results
		min := math.MaxFloat64
		for _, c := range implementedCs {
			cc := (fr.Bits + 1) * (nbPoints + (1 << c))
			cost := float64(cc) / float64(c)
			if cost < min {
				min = cost
				C = c
			}
		}
		return C
	}

	C := bestC(nbPoints)
	nbChunks := int(computeNbChunks(C))

	// should we recursively split the msm in half? (see below)
	// we want to minimize the execution time of the algorithm;
	// splitting the msm will **add** operations, but if it allows us to use more CPUs, it might be worth it.

	// costFunction returns a metric that represents the "wall time" of the algorithm
	costFunction := func(nbTasks, nbCpus, costPerTask int) int {
		// cost for the reduction of all tasks (msmReduceChunk)
		totalCost := nbTasks

		// cost for the computation of each task (msmProcessChunk)
		for nbTasks >= nbCpus {
			nbTasks -= nbCpus
			totalCost += costPerTask
		}
		if nbTasks > 0 {
			totalCost += costPerTask
		}
		return totalCost
	}

	// costPerTask is the approximate number of group ops per task
	costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) }

	costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints))

	cPostSplit := bestC(nbPoints / 2)
	nbChunksPostSplit := int(computeNbChunks(cPostSplit))
	costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2))

	// if the cost of the split msm is lower than the cost of the non-split msm, we split
	if costPostSplit < costPreSplit {
		config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0))
		var _p G2Jac
		chDone := make(chan struct{}, 1)
		go func() {
			_p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config)
			close(chDone)
		}()
		p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config)
		<-chDone
		p.AddAssign(&_p)
		return p, nil
	}

	// if we don't split, we use the best C we found
	_innerMsmG2(p, C, points, scalars, config)

	return p, nil
}

func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac {
	// partition the scalars
	digits, chunkStats := partitionScalars(scalars, c, config.NbTasks)

	nbChunks := computeNbChunks(c)

	// for each chunk, spawn one goroutine that'll loop through all the scalars in the
	// corresponding bit-window
	// note that buckets is an array allocated on the stack and this is critical for performance

	// each goroutine sends its result on the chChunks[i] channel
	chChunks := make([]chan g2JacExtended, nbChunks)
	for i := 0; i < len(chChunks); i++ {
		chChunks[i] = make(chan g2JacExtended, 1)
	}

	// we use a semaphore to limit the number of goroutines running concurrently
	// (only if nbTasks < nbCPU)
	var sem chan struct{}
	if config.NbTasks < runtime.NumCPU() {
		// we add nbChunks because if a chunk is overweight we split it in two
		sem = make(chan struct{}, config.NbTasks+int(nbChunks))
		for i := 0; i < config.NbTasks; i++ {
			sem <- struct{}{}
		}
		defer func() {
			close(sem)
		}()
	}

	// the last chunk may be processed with a different method than the rest, as it could be smaller.
	n := len(points)
	for j := int(nbChunks - 1); j >= 0; j-- {
		processChunk := getChunkProcessorG2(c, chunkStats[j])
		if j == int(nbChunks-1) {
			processChunk = getChunkProcessorG2(lastC(c), chunkStats[j])
		}
		if chunkStats[j].weight >= 115 {
			// this chunk is overweight (nominal weight is 100), so we split it across two goroutines;
			// otherwise this goroutine would finish much later than the others.
			chSplit := make(chan g2JacExtended, 2)
			split := n / 2

			if sem != nil {
				sem <- struct{}{} // add another token to the semaphore, since we split in two.
			}
			go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem)
			go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem)
			go func(chunkID int) {
				s1 := <-chSplit
				s2 := <-chSplit
				close(chSplit)
				s1.add(&s2)
				chChunks[chunkID] <- s1
			}(j)
			continue
		}
		go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem)
	}

	return msmReduceChunkG2Affine(p, int(c), chChunks[:])
}

// getChunkProcessorG2 decides, depending on the window size c and the chunk's
// statistics, which algorithm is best suited to process the chunk.
func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16, sem chan struct{}) {
	switch c {

	case 2:
		return processChunkG2Jacobian[bucketg2JacExtendedC2]
	case 3:
		return processChunkG2Jacobian[bucketg2JacExtendedC3]
	case 4:
		return processChunkG2Jacobian[bucketg2JacExtendedC4]
	case 5:
		return processChunkG2Jacobian[bucketg2JacExtendedC5]
	case 6:
		return processChunkG2Jacobian[bucketg2JacExtendedC6]
	case 7:
		return processChunkG2Jacobian[bucketg2JacExtendedC7]
	case 8:
		return processChunkG2Jacobian[bucketg2JacExtendedC8]
	case 9:
		return processChunkG2Jacobian[bucketg2JacExtendedC9]
	case 10:
		const batchSize = 80
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// a clear indicator that the batch affine method is not appropriate here.
			return processChunkG2Jacobian[bucketg2JacExtendedC10]
		}
		return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10]
	case 11:
		const batchSize = 150
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// a clear indicator that the batch affine method is not appropriate here.
			return processChunkG2Jacobian[bucketg2JacExtendedC11]
		}
		return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11]
	case 12:
		const batchSize = 200
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// a clear indicator that the batch affine method is not appropriate here.
			return processChunkG2Jacobian[bucketg2JacExtendedC12]
		}
		return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12]
	case 13:
		const batchSize = 350
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// a clear indicator that the batch affine method is not appropriate here.
			return processChunkG2Jacobian[bucketg2JacExtendedC13]
		}
		return processChunkG2BatchAffine[bucketg2JacExtendedC13, bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13]
	case 14:
		const batchSize = 400
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// a clear indicator that the batch affine method is not appropriate here.
			return processChunkG2Jacobian[bucketg2JacExtendedC14]
		}
		return processChunkG2BatchAffine[bucketg2JacExtendedC14, bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14]
	case 15:
		const batchSize = 500
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// a clear indicator that the batch affine method is not appropriate here.
			return processChunkG2Jacobian[bucketg2JacExtendedC15]
		}
		return processChunkG2BatchAffine[bucketg2JacExtendedC15, bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15]
	case 16:
		const batchSize = 640
		// here we could check some chunk statistic (deviation, ...) to determine if calling
		// the batch affine version is worth it.
		if stat.nbBucketFilled < batchSize {
			// a clear indicator that the batch affine method is not appropriate here.
			return processChunkG2Jacobian[bucketg2JacExtendedC16]
		}
		return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16]
	default:
		// panic("will not happen c != previous values is not generated by templates")
		return processChunkG2Jacobian[bucketg2JacExtendedC16]
	}
}

// msmReduceChunkG2Affine reduces the weighted sums of the buckets into the result of the multiExp
func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac {
	var _p g2JacExtended
	totalj := <-chChunks[len(chChunks)-1]
	_p.Set(&totalj)
	for j := len(chChunks) - 2; j >= 0; j-- {
		for l := 0; l < c; l++ {
			_p.double(&_p)
		}
		totalj := <-chChunks[j]
		_p.add(&totalj)
	}

	return p.unsafeFromJacExtended(&_p)
}

// Fold computes the multi-exponentiation \sum_{i=0}^{len(points)-1} points[i] *
// combinationCoeff^i and stores the result in p. It returns an error if the
// configuration is invalid.
func (p *G2Affine) Fold(points []G2Affine, combinationCoeff fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
	var _p G2Jac
	if _, err := _p.Fold(points, combinationCoeff, config); err != nil {
		return nil, err
	}
	p.FromJacobian(&_p)
	return p, nil
}

// Fold computes the multi-exponentiation \sum_{i=0}^{len(points)-1} points[i] *
// combinationCoeff^i and stores the result in p. It returns an error if the
// configuration is invalid.
func (p *G2Jac) Fold(points []G2Affine, combinationCoeff fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
	scalars := make([]fr.Element, len(points))
	scalar := fr.NewElement(1)
	for i := 0; i < len(points); i++ {
		scalars[i].Set(&scalar)
		scalar.Mul(&scalar, &combinationCoeff)
	}
	return p.MultiExp(points, scalars, config)
}

// selector stores the index, mask and shifts needed to select bits from a scalar;
// it is used during the multiExp algorithm and the batch scalar multiplication
type selector struct {
	index uint64 // index in the multi-word scalar to select bits from
	mask  uint64 // mask (c-bit wide)
	shift uint64 // shift needed to get our bits on low positions

	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
	maskHigh        uint64 // same as mask, for index+1
	shiftHigh       uint64 // same as shift, for index+1
}
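
// selectBits is an illustrative sketch (not part of the generated code) of
// how a selector is applied: it extracts one c-bit window from the
// little-endian words of a scalar, stitching bits from two consecutive words
// when the window straddles a 64-bit boundary (the multiWordSelect case).
func selectBits(scalar [fr.Limbs]uint64, s selector) uint64 {
	digit := (scalar[s.index] & s.mask) >> s.shift
	if s.multiWordSelect {
		// e.g. with c = 5, the window over bits 62..66 takes bits 62..63 from
		// word s.index and bits 0..2 from word s.index+1.
		digit |= (scalar[s.index+1] & s.maskHigh) << s.shiftHigh
	}
	return digit
}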

// computeNbChunks returns the number of chunks for a given window size c;
// the last chunk may be bigger to accommodate a potential carry from the NAF decomposition
func computeNbChunks(c uint64) uint64 {
	return (fr.Bits + c - 1) / c
}

// lastC returns the last window size for a scalar;
// this last window should accommodate a carry (from the NAF decomposition)
// it can be == c if we have 1 available bit
// it can be > c if we have 0 available bits
// it can be < c if we have 2+ available bits
func lastC(c uint64) uint64 {
	nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits
	return c + 1 - nbAvailableBits
}
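
// A worked example for the two helpers above (the concrete value of fr.Bits
// is curve-specific): if fr.Bits were 253 and c = 16, computeNbChunks(16)
// returns ⌈253/16⌉ = 16 chunks covering 256 bits, i.e. 3 spare bits, so
// lastC(16) = 16 + 1 - 3 = 14: the top window is narrower than c, yet still
// leaves room for the final carry of the signed-digit decomposition.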

type chunkStat struct {
	// relative weight of work compared to other chunks. 100.0 -> nominal weight.
	weight float32

	// percentage of buckets filled in the window;
	ppBucketFilled float32
	nbBucketFilled int
}

// partitionScalars computes, for each scalar, nbChunks digits over c-bit wide windows;
// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
// 2^{c} from the current digit, making it negative.
// negative digits can be processed in a later step as adding -G into the bucket instead of G
// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
func partitionScalars(scalars []fr.Element, c uint64, nbTasks int) ([]uint16, []chunkStat) {
	// no benefit here to have more tasks than CPUs
	if nbTasks > runtime.NumCPU() {
		nbTasks = runtime.NumCPU()
	}

	// number of c-bit radixes in a scalar
	nbChunks := computeNbChunks(c)

	digits := make([]uint16, len(scalars)*int(nbChunks))

	mask := uint64((1 << c) - 1) // low c bits are 1
	max := int(1<<(c-1)) - 1     // max value (inclusive) we want for our digits
	cDivides64 := (64 % c) == 0  // if c doesn't divide 64, we may need to select over multiple words

	// compute offset and word selector / shift to select the right bits of our windows
	selectors := make([]selector, nbChunks)
	for chunk := uint64(0); chunk < nbChunks; chunk++ {
		jc := uint64(chunk * c)
		d := selector{}
		d.index = jc / 64
		d.shift = jc - (d.index * 64)
		d.mask = mask << d.shift
		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
		if d.multiWordSelect {
			nbBitsHigh := d.shift - uint64(64-c)
			d.maskHigh = (1 << nbBitsHigh) - 1
			d.shiftHigh = (c - nbBitsHigh)
		}
		selectors[chunk] = d
	}

	parallel.Execute(len(scalars), func(start, end int) {
		for i := start; i < end; i++ {
			if scalars[i].IsZero() {
				// everything is 0, no need to process this scalar
				continue
			}
			scalar := scalars[i].Bits()

			var carry int

			// for each chunk in the scalar, compute the current digit and a carry, if any
			for chunk := uint64(0); chunk < nbChunks-1; chunk++ {
				s := selectors[chunk]

				// init with carry if any
				digit := carry
				carry = 0

				// digit = value of the c-bit window
				digit += int((scalar[s.index] & s.mask) >> s.shift)

				if s.multiWordSelect {
					// we are selecting bits over 2 words
					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
				}

				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
				// 2^{c} from the current digit, making it negative.
				if digit > max {
					digit -= (1 << c)
					carry = 1
				}

				// if digit is zero, no impact on result
				if digit == 0 {
					continue
				}

				var bits uint16
				if digit > 0 {
					bits = uint16(digit) << 1
				} else {
					bits = (uint16(-digit-1) << 1) + 1
				}
				digits[int(chunk)*len(scalars)+i] = bits
			}

			// for the last chunk, we don't want to borrow from the next window
			// (but it may have a larger max value)
			chunk := nbChunks - 1
			s := selectors[chunk]
			// init with carry if any
			digit := carry
			// digit = value of the c-bit window
			digit += int((scalar[s.index] & s.mask) >> s.shift)
			if s.multiWordSelect {
				// we are selecting bits over 2 words
				digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
			}
			digits[int(chunk)*len(scalars)+i] = uint16(digit) << 1
		}

	}, nbTasks)

	// aggregate chunk stats
	chunkStats := make([]chunkStat, nbChunks)
	if c <= 9 {
		// no need to compute stats for small window sizes
		return digits, chunkStats
	}
	parallel.Execute(len(chunkStats), func(start, end int) {
		// for each chunk compute the statistics
		for chunkID := start; chunkID < end; chunkID++ {
			// indicates if a bucket is hit.
			var b bitSetC16

			// digits for the chunk
			chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)]

			totalOps := 0
			nz := 0 // non-zero buckets count
			for _, digit := range chunkDigits {
				if digit == 0 {
					continue
				}
				totalOps++
				bucketID := digit >> 1
				if digit&1 == 0 {
					bucketID -= 1
				}
				if !b[bucketID] {
					nz++
					b[bucketID] = true
				}
			}
			chunkStats[chunkID].weight = float32(totalOps) // count the number of ops for now; the relative weight is computed below
			chunkStats[chunkID].ppBucketFilled = (float32(nz) * 100.0) / float32(int(1<<(c-1)))
			chunkStats[chunkID].nbBucketFilled = nz
		}
	}, nbTasks)

	totalOps := float32(0.0)
	for _, stat := range chunkStats {
		totalOps += stat.weight
	}

	target := totalOps / float32(nbChunks)
	if target != 0.0 {
		// if target == 0, all the scalars are 0 and there is no work to be done.
		for i := 0; i < len(chunkStats); i++ {
			chunkStats[i].weight = (chunkStats[i].weight * 100.0) / target
		}
	}

	return digits, chunkStats
}
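
// recombineDigitsSketch is an illustrative soundness check (a sketch, not
// part of the generated code): decoding the digits of one scalar and
// evaluating Σ digit_k · 2^{k·c} must give back the integer value of
// scalars[scalarIdx].
func recombineDigitsSketch(scalarIdx int, scalars []fr.Element, digits []uint16, c uint64) *big.Int {
	nbChunks := computeNbChunks(c)
	acc := new(big.Int)
	term := new(big.Int)
	for chunk := uint64(0); chunk < nbChunks; chunk++ {
		bits := digits[int(chunk)*len(scalars)+scalarIdx]
		var digit int64
		if bits&1 == 0 {
			digit = int64(bits >> 1) // even encoding: positive digit d stored as d<<1
		} else {
			digit = -int64(bits>>1) - 1 // odd encoding: negative digit -d stored as ((d-1)<<1)|1
		}
		term.SetInt64(digit)
		term.Lsh(term, uint(chunk)*uint(c)) // digit · 2^{chunk·c}
		acc.Add(acc, term)
	}
	return acc
}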