github.com/consensys/gnark-crypto@v0.14.0/ecc/bw6-761/multiexp.go

// Copyright 2020 Consensys Software Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Code generated by consensys/gnark-crypto DO NOT EDIT

package bw6761

import (
    "errors"
    "math"
    "runtime"

    "github.com/consensys/gnark-crypto/ecc"
    "github.com/consensys/gnark-crypto/ecc/bw6-761/fr"
    "github.com/consensys/gnark-crypto/internal/parallel"
)

// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
//
// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
func (p *G1Affine) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) {
    var _p G1Jac
    if _, err := _p.MultiExp(points, scalars, config); err != nil {
        return nil, err
    }
    p.FromJacobian(&_p)
    return p, nil
}
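
// multiExpUsageSketch is a minimal illustrative sketch (hypothetical helper,
// not part of the generated API) showing how the method above is typically
// called: points and scalars must have equal length, and leaving NbTasks at
// zero lets MultiExp default to runtime.NumCPU() * 2.
func multiExpUsageSketch(points []G1Affine, scalars []fr.Element) (*G1Affine, error) {
    var res G1Affine
    // an empty config is valid; MultiExp fills in the defaults.
    return res.MultiExp(points, scalars, ecc.MultiExpConfig{})
}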

// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
//
// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
    // TODO @gbotrel replace the ecc.MultiExpConfig by an Option pattern for maintainability.
    // note:
    // each of the msmCX methods is the same, except for the c constant it declares.
    // duplicating these methods (through template generation) allows declaring the buckets on the stack.
    // the choice of c needs to be improved:
    // there is a theoretical value that gives optimal asymptotics,
    // but in practice other factors come into play, including:
    // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
    // * number of CPUs
    // * cache friendliness (which depends on the host, G1 or G2...)
    //   --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.

    // for each msmCX
    // step 1
    // we compute, for each scalar, over c-bit wide windows, nbChunks digits
    // if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
    // 2^{c} from the current digit, making it negative.
    // negative digits will be processed in the next step as adding -G into the bucket instead of G
    // (computing -G is cheap, and this saves us half of the buckets)
    // step 2
    // buckets are declared on the stack
    // notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
    // we use extended Jacobian formulas here as they are faster than mixed addition
    // msmProcessChunk places points into buckets based on their selector and sends the weighted bucket sum on the given channel
    // step 3
    // reduce the buckets' weighted sums into our result (msmReduceChunk)
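
    // worked example of step 1 (illustrative, added for clarity): with c = 4,
    // the largest stored digit is 2^{c-1}-1 = 7. a window holding 13 becomes
    // 13 - 2^4 = -3 with a carry of 1 into the next window, so the bucket
    // accumulates -G instead of G; every digit then lies in [-8, 7], which is
    // why 2^{c-1} buckets suffice.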

    // ensure len(points) == len(scalars)
    nbPoints := len(points)
    if nbPoints != len(scalars) {
        return nil, errors.New("len(points) != len(scalars)")
    }

    // if nbTasks is not set, use all available CPUs
    if config.NbTasks <= 0 {
        config.NbTasks = runtime.NumCPU() * 2
    } else if config.NbTasks > 1024 {
        return nil, errors.New("invalid config: config.NbTasks > 1024")
    }

    // here we compute the best C for nbPoints
    // we split recursively until nbChunks(c) >= nbTasks
    bestC := func(nbPoints int) uint64 {
        // implemented msmC methods (the c we use must be in this slice)
        implementedCs := []uint64{4, 5, 8, 10, 16}
        var C uint64
        // approximate cost (in group operations)
        // cost = bits/c * (nbPoints + 2^{c})
        // this needs to be verified empirically.
        // for example, on a MBP 2016, for a G2 MultiExp with > 8M points, hand-picking c gives better results
        min := math.MaxFloat64
        for _, c := range implementedCs {
            cc := (fr.Bits + 1) * (nbPoints + (1 << c))
            cost := float64(cc) / float64(c)
            if cost < min {
                min = cost
                C = c
            }
        }
        return C
    }

    C := bestC(nbPoints)
    nbChunks := int(computeNbChunks(C))

    // should we recursively split the msm in half? (see below)
    // we want to minimize the execution time of the algorithm;
    // splitting the msm will **add** operations, but if it allows using more CPUs, it might be worth it.

    // costFunction returns a metric that represents the "wall time" of the algorithm
    costFunction := func(nbTasks, nbCpus, costPerTask int) int {
        // cost of the reduction of all tasks (msmReduceChunk)
        totalCost := nbTasks

        // cost of the computation of each task (msmProcessChunk)
        for nbTasks >= nbCpus {
            nbTasks -= nbCpus
            totalCost += costPerTask
        }
        if nbTasks > 0 {
            totalCost += costPerTask
        }
        return totalCost
    }

    // costPerTask is the approximate number of group ops per task
    costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) }

    costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints))

    cPostSplit := bestC(nbPoints / 2)
    nbChunksPostSplit := int(computeNbChunks(cPostSplit))
    costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2))

    // if the cost of the split msm is lower than the cost of the non-split msm, we split
    if costPostSplit < costPreSplit {
        config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0))
        var _p G1Jac
        chDone := make(chan struct{}, 1)
        go func() {
            _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config)
            close(chDone)
        }()
        p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config)
        <-chDone
        p.AddAssign(&_p)
        return p, nil
    }

    // if we don't split, we use the best C we found
    _innerMsmG1(p, C, points, scalars, config)

    return p, nil
}
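
// estimateMsmCost is an illustrative standalone version (hypothetical helper,
// not part of the generated API) of the heuristic used by bestC above: the
// approximate number of group operations for an MSM over nbPoints points with
// a c-bit window is (fr.Bits+1)/c * (nbPoints + 2^c) — the first factor counts
// the windows in a scalar, the second counts one bucket addition per point
// plus the reduction of the 2^c buckets.
func estimateMsmCost(c uint64, nbPoints int) float64 {
    windows := float64(fr.Bits+1) / float64(c) // number of c-bit windows in a scalar
    perWindow := float64(nbPoints + (1 << c))  // bucket additions + bucket reduction
    return windows * perWindow
}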

func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac {
    // partition the scalars
    digits, chunkStats := partitionScalars(scalars, c, config.NbTasks)

    nbChunks := computeNbChunks(c)

    // for each chunk, spawn one goroutine that'll loop through all the scalars in the
    // corresponding bit-window
    // note that buckets is an array allocated on the stack and this is critical for performance

    // each goroutine sends its result on the chChunks[i] channel
    chChunks := make([]chan g1JacExtended, nbChunks)
    for i := 0; i < len(chChunks); i++ {
        chChunks[i] = make(chan g1JacExtended, 1)
    }

    // we use a semaphore to limit the number of goroutines running concurrently
    // (only if nbTasks < nbCPU)
    var sem chan struct{}
    if config.NbTasks < runtime.NumCPU() {
        // we add nbChunks because if a chunk is overweight we split it in two
        sem = make(chan struct{}, config.NbTasks+int(nbChunks))
        for i := 0; i < config.NbTasks; i++ {
            sem <- struct{}{}
        }
        defer func() {
            close(sem)
        }()
    }

    // the last chunk may be processed with a different method than the rest, as it could be smaller.
    n := len(points)
    for j := int(nbChunks - 1); j >= 0; j-- {
        processChunk := getChunkProcessorG1(c, chunkStats[j])
        if j == int(nbChunks-1) {
            processChunk = getChunkProcessorG1(lastC(c), chunkStats[j])
        }
        if chunkStats[j].weight >= 115 {
            // we split this chunk into more goroutines since it has more work to do than the others;
            // otherwise its goroutine would finish much later than the others.
            chSplit := make(chan g1JacExtended, 2)
            split := n / 2

            if sem != nil {
                sem <- struct{}{} // add another token to the semaphore, since we split in two
            }
            go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem)
            go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem)
            go func(chunkID int) {
                s1 := <-chSplit
                s2 := <-chSplit
                close(chSplit)
                s1.add(&s2)
                chChunks[chunkID] <- s1
            }(j)
            continue
        }
        go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem)
    }

    return msmReduceChunkG1Affine(p, int(c), chChunks[:])
}

// getChunkProcessorG1 decides, depending on the window size c and the chunk's
// statistics, which algorithm is best to process the chunk.
func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16, sem chan struct{}) {
    switch c {

    case 2:
        return processChunkG1Jacobian[bucketg1JacExtendedC2]
    case 3:
        return processChunkG1Jacobian[bucketg1JacExtendedC3]
    case 4:
        return processChunkG1Jacobian[bucketg1JacExtendedC4]
    case 5:
        return processChunkG1Jacobian[bucketg1JacExtendedC5]
    case 8:
        return processChunkG1Jacobian[bucketg1JacExtendedC8]
    case 10:
        const batchSize = 80
        // here we could check some chunk statistic (deviation, ...) to determine if calling
        // the batch affine version is worth it.
        if stat.nbBucketFilled < batchSize {
            // clear indicator that the batch affine method is not appropriate here.
            return processChunkG1Jacobian[bucketg1JacExtendedC10]
        }
        return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10]
    case 16:
        const batchSize = 640
        // here we could check some chunk statistic (deviation, ...) to determine if calling
        // the batch affine version is worth it.
        if stat.nbBucketFilled < batchSize {
            // clear indicator that the batch affine method is not appropriate here.
            return processChunkG1Jacobian[bucketg1JacExtendedC16]
        }
        return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16]
    default:
        // panic("will not happen: a c outside the values above is not generated by templates")
        return processChunkG1Jacobian[bucketg1JacExtendedC16]
    }
}

// msmReduceChunkG1Affine reduces the weighted sums of the buckets into the result of the multiExp
func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac {
    var _p g1JacExtended
    totalj := <-chChunks[len(chChunks)-1]
    _p.Set(&totalj)
    for j := len(chChunks) - 2; j >= 0; j-- {
        for l := 0; l < c; l++ {
            _p.double(&_p)
        }
        totalj := <-chChunks[j]
        _p.add(&totalj)
    }

    return p.unsafeFromJacExtended(&_p)
}
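
// reduceChunksScalar is an illustrative scalar analogue (hypothetical helper,
// not part of the generated API) of msmReduceChunkG1Affine above: given
// per-chunk sums s_j of c-bit windows, it recombines them as
// sum_j 2^{c*j} * s_j via Horner's rule; in the group, each left shift by c
// becomes c doublings.
func reduceChunksScalar(c int, chunkSums []uint64) uint64 {
    acc := chunkSums[len(chunkSums)-1]
    for j := len(chunkSums) - 2; j >= 0; j-- {
        acc <<= c // multiply by 2^c, i.e. c doublings in the group
        acc += chunkSums[j]
    }
    return acc
}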

// Fold computes the multi-exponentiation \sum_{i=0}^{len(points)-1} points[i] *
// combinationCoeff^i and stores the result in p. It returns an error if the
// configuration is invalid.
func (p *G1Affine) Fold(points []G1Affine, combinationCoeff fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) {
    var _p G1Jac
    if _, err := _p.Fold(points, combinationCoeff, config); err != nil {
        return nil, err
    }
    p.FromJacobian(&_p)
    return p, nil
}

// Fold computes the multi-exponentiation \sum_{i=0}^{len(points)-1} points[i] *
// combinationCoeff^i and stores the result in p. It returns an error if the
// configuration is invalid.
func (p *G1Jac) Fold(points []G1Affine, combinationCoeff fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
    scalars := make([]fr.Element, len(points))
    scalar := fr.NewElement(1)
    for i := 0; i < len(points); i++ {
        scalars[i].Set(&scalar)
        scalar.Mul(&scalar, &combinationCoeff)
    }
    return p.MultiExp(points, scalars, config)
}
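
// foldUsageSketch is a minimal illustrative sketch (hypothetical helper, not
// part of the generated API): Fold materializes the power sequence
// (1, x, x^2, ...) and delegates to MultiExp, so it computes
// sum_i points[i] * x^i in a single call.
func foldUsageSketch(points []G1Affine, x fr.Element) (*G1Affine, error) {
    var res G1Affine
    return res.Fold(points, x, ecc.MultiExpConfig{})
}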

// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
//
// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
    var _p G2Jac
    if _, err := _p.MultiExp(points, scalars, config); err != nil {
        return nil, err
    }
    p.FromJacobian(&_p)
    return p, nil
}

// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
//
// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
    // TODO @gbotrel replace the ecc.MultiExpConfig by an Option pattern for maintainability.
    // note:
    // each of the msmCX methods is the same, except for the c constant it declares.
    // duplicating these methods (through template generation) allows declaring the buckets on the stack.
    // the choice of c needs to be improved:
    // there is a theoretical value that gives optimal asymptotics,
    // but in practice other factors come into play, including:
    // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
    // * number of CPUs
    // * cache friendliness (which depends on the host, G1 or G2...)
    //   --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.

    // for each msmCX
    // step 1
    // we compute, for each scalar, over c-bit wide windows, nbChunks digits
    // if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
    // 2^{c} from the current digit, making it negative.
    // negative digits will be processed in the next step as adding -G into the bucket instead of G
    // (computing -G is cheap, and this saves us half of the buckets)
    // step 2
    // buckets are declared on the stack
    // notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
    // we use extended Jacobian formulas here as they are faster than mixed addition
    // msmProcessChunk places points into buckets based on their selector and sends the weighted bucket sum on the given channel
    // step 3
    // reduce the buckets' weighted sums into our result (msmReduceChunk)

    // ensure len(points) == len(scalars)
    nbPoints := len(points)
    if nbPoints != len(scalars) {
        return nil, errors.New("len(points) != len(scalars)")
    }

    // if nbTasks is not set, use all available CPUs
    if config.NbTasks <= 0 {
        config.NbTasks = runtime.NumCPU() * 2
    } else if config.NbTasks > 1024 {
        return nil, errors.New("invalid config: config.NbTasks > 1024")
    }

    // here we compute the best C for nbPoints
    // we split recursively until nbChunks(c) >= nbTasks
    bestC := func(nbPoints int) uint64 {
        // implemented msmC methods (the c we use must be in this slice)
        implementedCs := []uint64{4, 5, 8, 10, 16}
        var C uint64
        // approximate cost (in group operations)
        // cost = bits/c * (nbPoints + 2^{c})
        // this needs to be verified empirically.
        // for example, on a MBP 2016, for a G2 MultiExp with > 8M points, hand-picking c gives better results
        min := math.MaxFloat64
        for _, c := range implementedCs {
            cc := (fr.Bits + 1) * (nbPoints + (1 << c))
            cost := float64(cc) / float64(c)
            if cost < min {
                min = cost
                C = c
            }
        }
        return C
    }

    C := bestC(nbPoints)
    nbChunks := int(computeNbChunks(C))

    // should we recursively split the msm in half? (see below)
    // we want to minimize the execution time of the algorithm;
    // splitting the msm will **add** operations, but if it allows using more CPUs, it might be worth it.

    // costFunction returns a metric that represents the "wall time" of the algorithm
    costFunction := func(nbTasks, nbCpus, costPerTask int) int {
        // cost of the reduction of all tasks (msmReduceChunk)
        totalCost := nbTasks

        // cost of the computation of each task (msmProcessChunk)
        for nbTasks >= nbCpus {
            nbTasks -= nbCpus
            totalCost += costPerTask
        }
        if nbTasks > 0 {
            totalCost += costPerTask
        }
        return totalCost
    }

    // costPerTask is the approximate number of group ops per task
    costPerTask := func(c uint64, nbPoints int) int { return (nbPoints + int((1 << c))) }

    costPreSplit := costFunction(nbChunks, config.NbTasks, costPerTask(C, nbPoints))

    cPostSplit := bestC(nbPoints / 2)
    nbChunksPostSplit := int(computeNbChunks(cPostSplit))
    costPostSplit := costFunction(nbChunksPostSplit*2, config.NbTasks, costPerTask(cPostSplit, nbPoints/2))

    // if the cost of the split msm is lower than the cost of the non-split msm, we split
    if costPostSplit < costPreSplit {
        config.NbTasks = int(math.Ceil(float64(config.NbTasks) / 2.0))
        var _p G2Jac
        chDone := make(chan struct{}, 1)
        go func() {
            _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config)
            close(chDone)
        }()
        p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config)
        <-chDone
        p.AddAssign(&_p)
        return p, nil
    }

    // if we don't split, we use the best C we found
    _innerMsmG2(p, C, points, scalars, config)

    return p, nil
}

func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac {
    // partition the scalars
    digits, chunkStats := partitionScalars(scalars, c, config.NbTasks)

    nbChunks := computeNbChunks(c)

    // for each chunk, spawn one goroutine that'll loop through all the scalars in the
    // corresponding bit-window
    // note that buckets is an array allocated on the stack and this is critical for performance

    // each goroutine sends its result on the chChunks[i] channel
    chChunks := make([]chan g2JacExtended, nbChunks)
    for i := 0; i < len(chChunks); i++ {
        chChunks[i] = make(chan g2JacExtended, 1)
    }

    // we use a semaphore to limit the number of goroutines running concurrently
    // (only if nbTasks < nbCPU)
    var sem chan struct{}
    if config.NbTasks < runtime.NumCPU() {
        // we add nbChunks because if a chunk is overweight we split it in two
        sem = make(chan struct{}, config.NbTasks+int(nbChunks))
        for i := 0; i < config.NbTasks; i++ {
            sem <- struct{}{}
        }
        defer func() {
            close(sem)
        }()
    }

    // the last chunk may be processed with a different method than the rest, as it could be smaller.
    n := len(points)
    for j := int(nbChunks - 1); j >= 0; j-- {
        processChunk := getChunkProcessorG2(c, chunkStats[j])
        if j == int(nbChunks-1) {
            processChunk = getChunkProcessorG2(lastC(c), chunkStats[j])
        }
        if chunkStats[j].weight >= 115 {
            // we split this chunk into more goroutines since it has more work to do than the others;
            // otherwise its goroutine would finish much later than the others.
            chSplit := make(chan g2JacExtended, 2)
            split := n / 2

            if sem != nil {
                sem <- struct{}{} // add another token to the semaphore, since we split in two
            }
            go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split], sem)
            go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n], sem)
            go func(chunkID int) {
                s1 := <-chSplit
                s2 := <-chSplit
                close(chSplit)
                s1.add(&s2)
                chChunks[chunkID] <- s1
            }(j)
            continue
        }
        go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n], sem)
    }

    return msmReduceChunkG2Affine(p, int(c), chChunks[:])
}

// getChunkProcessorG2 decides, depending on the window size c and the chunk's
// statistics, which algorithm is best to process the chunk.
func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16, sem chan struct{}) {
    switch c {

    case 2:
        return processChunkG2Jacobian[bucketg2JacExtendedC2]
    case 3:
        return processChunkG2Jacobian[bucketg2JacExtendedC3]
    case 4:
        return processChunkG2Jacobian[bucketg2JacExtendedC4]
    case 5:
        return processChunkG2Jacobian[bucketg2JacExtendedC5]
    case 8:
        return processChunkG2Jacobian[bucketg2JacExtendedC8]
    case 10:
        const batchSize = 80
        // here we could check some chunk statistic (deviation, ...) to determine if calling
        // the batch affine version is worth it.
        if stat.nbBucketFilled < batchSize {
            // clear indicator that the batch affine method is not appropriate here.
            return processChunkG2Jacobian[bucketg2JacExtendedC10]
        }
        return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10]
    case 16:
        const batchSize = 640
        // here we could check some chunk statistic (deviation, ...) to determine if calling
        // the batch affine version is worth it.
        if stat.nbBucketFilled < batchSize {
            // clear indicator that the batch affine method is not appropriate here.
            return processChunkG2Jacobian[bucketg2JacExtendedC16]
        }
        return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16]
    default:
        // panic("will not happen: a c outside the values above is not generated by templates")
        return processChunkG2Jacobian[bucketg2JacExtendedC16]
    }
}

// msmReduceChunkG2Affine reduces the weighted sums of the buckets into the result of the multiExp
func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac {
    var _p g2JacExtended
    totalj := <-chChunks[len(chChunks)-1]
    _p.Set(&totalj)
    for j := len(chChunks) - 2; j >= 0; j-- {
        for l := 0; l < c; l++ {
            _p.double(&_p)
        }
        totalj := <-chChunks[j]
        _p.add(&totalj)
    }

    return p.unsafeFromJacExtended(&_p)
}

// Fold computes the multi-exponentiation \sum_{i=0}^{len(points)-1} points[i] *
// combinationCoeff^i and stores the result in p. It returns an error if the
// configuration is invalid.
func (p *G2Affine) Fold(points []G2Affine, combinationCoeff fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
    var _p G2Jac
    if _, err := _p.Fold(points, combinationCoeff, config); err != nil {
        return nil, err
    }
    p.FromJacobian(&_p)
    return p, nil
}

// Fold computes the multi-exponentiation \sum_{i=0}^{len(points)-1} points[i] *
// combinationCoeff^i and stores the result in p. It returns an error if the
// configuration is invalid.
func (p *G2Jac) Fold(points []G2Affine, combinationCoeff fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
    scalars := make([]fr.Element, len(points))
    scalar := fr.NewElement(1)
    for i := 0; i < len(points); i++ {
        scalars[i].Set(&scalar)
        scalar.Mul(&scalar, &combinationCoeff)
    }
    return p.MultiExp(points, scalars, config)
}

// selector stores the index, mask and shifts needed to select bits from a scalar.
// it is used during the multiExp algorithm and the batch scalar multiplication.
type selector struct {
    index uint64 // index in the multi-word scalar to select bits from
    mask  uint64 // mask (c-bit wide)
    shift uint64 // shift needed to get our bits in the low positions

    multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
    maskHigh        uint64 // same as mask, for index+1
    shiftHigh       uint64 // same as shift, for index+1
}

// computeNbChunks returns the number of chunks for a given window size c;
// the last chunk may be bigger to accommodate a potential carry from the NAF decomposition.
func computeNbChunks(c uint64) uint64 {
    return (fr.Bits + c - 1) / c
}

// lastC returns the window size of the last chunk of a scalar;
// this last window should accommodate a carry (from the NAF decomposition).
// it can be == c if we have 1 available bit,
// it can be > c if we have 0 available bits,
// and it can be < c if we have 2+ available bits.
func lastC(c uint64) uint64 {
    nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits
    return c + 1 - nbAvailableBits
}
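
// worked example (illustrative, added for clarity): on this curve fr.Bits is
// 377, so for c = 16 computeNbChunks returns (377+15)/16 = 24 chunks covering
// 24*16 = 384 bits. 384 - 377 = 7 bits are spare, hence
// lastC(16) = 16 + 1 - 7 = 10: the most significant chunk is processed with a
// 10-bit window, one of the sizes handled by getChunkProcessorG1/G2.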

type chunkStat struct {
    // relative weight of work compared to other chunks. 100.0 -> nominal weight.
    weight float32

    // percentage of buckets filled in the window
    ppBucketFilled float32
    nbBucketFilled int
}

// partitionScalars computes, for each scalar, over c-bit wide windows, nbChunks digits.
// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
// 2^{c} from the current digit, making it negative.
// negative digits can be processed in a later step as adding -G into the bucket instead of G
// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
func partitionScalars(scalars []fr.Element, c uint64, nbTasks int) ([]uint16, []chunkStat) {
    // no benefit here to having more tasks than CPUs
    if nbTasks > runtime.NumCPU() {
        nbTasks = runtime.NumCPU()
    }

    // number of c-bit radixes in a scalar
    nbChunks := computeNbChunks(c)

    digits := make([]uint16, len(scalars)*int(nbChunks))

    mask := uint64((1 << c) - 1) // low c bits are 1
    max := int(1<<(c-1)) - 1     // max value (inclusive) we want for our digits
    cDivides64 := (64 % c) == 0  // if c doesn't divide 64, we may need to select over multiple words

    // compute offset and word selector / shift to select the right bits of our windows
    selectors := make([]selector, nbChunks)
    for chunk := uint64(0); chunk < nbChunks; chunk++ {
        jc := uint64(chunk * c)
        d := selector{}
        d.index = jc / 64
        d.shift = jc - (d.index * 64)
        d.mask = mask << d.shift
        d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
        if d.multiWordSelect {
            nbBitsHigh := d.shift - uint64(64-c)
            d.maskHigh = (1 << nbBitsHigh) - 1
            d.shiftHigh = (c - nbBitsHigh)
        }
        selectors[chunk] = d
    }

    parallel.Execute(len(scalars), func(start, end int) {
        for i := start; i < end; i++ {
            if scalars[i].IsZero() {
                // everything is 0, no need to process this scalar
                continue
            }
            scalar := scalars[i].Bits()

            var carry int

            // for each chunk in the scalar, compute the current digit and an eventual carry
            for chunk := uint64(0); chunk < nbChunks-1; chunk++ {
                s := selectors[chunk]

                // init with carry if any
                digit := carry
                carry = 0

                // digit = value of the c-bit window
                digit += int((scalar[s.index] & s.mask) >> s.shift)

                if s.multiWordSelect {
                    // we are selecting bits over 2 words
                    digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
                }

                // if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window
                // and subtract 2^{c} from the current digit, making it negative.
                if digit > max {
                    digit -= (1 << c)
                    carry = 1
                }

                // if the digit is zero, it has no impact on the result
                if digit == 0 {
                    continue
                }

                var bits uint16
                if digit > 0 {
                    bits = uint16(digit) << 1
                } else {
                    bits = (uint16(-digit-1) << 1) + 1
                }
                digits[int(chunk)*len(scalars)+i] = bits
            }

            // for the last chunk, we don't want to borrow from a next window
            // (but it may have a larger max value)
            chunk := nbChunks - 1
            s := selectors[chunk]
            // init with carry if any
            digit := carry
            // digit = value of the c-bit window
            digit += int((scalar[s.index] & s.mask) >> s.shift)
            if s.multiWordSelect {
                // we are selecting bits over 2 words
                digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
            }
            digits[int(chunk)*len(scalars)+i] = uint16(digit) << 1
        }

    }, nbTasks)

    // aggregate chunk stats
    chunkStats := make([]chunkStat, nbChunks)
    if c <= 9 {
        // no need to compute stats for small window sizes
        return digits, chunkStats
    }
    parallel.Execute(len(chunkStats), func(start, end int) {
        // for each chunk compute the statistics
        for chunkID := start; chunkID < end; chunkID++ {
            // indicates if a bucket is hit.
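            // reminder on the encoding (see the loop above): a positive digit d
            // is stored as d<<1 (even), a negative digit d as ((-d-1)<<1)|1
            // (odd); both +d and -d therefore decode to bucket index |d|-1,
            // which is how 2^{c-1} buckets cover all 2^c digit values.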
            var b bitSetC16

            // digits for the chunk
            chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)]

            totalOps := 0
            nz := 0 // non-zero buckets count
            for _, digit := range chunkDigits {
                if digit == 0 {
                    continue
                }
                totalOps++
                bucketID := digit >> 1
                if digit&1 == 0 {
                    bucketID -= 1
                }
                if !b[bucketID] {
                    nz++
                    b[bucketID] = true
                }
            }
            chunkStats[chunkID].weight = float32(totalOps) // count the number of ops for now; the relative weight is computed below
            chunkStats[chunkID].ppBucketFilled = (float32(nz) * 100.0) / float32(int(1<<(c-1)))
            chunkStats[chunkID].nbBucketFilled = nz
        }
    }, nbTasks)

    totalOps := float32(0.0)
    for _, stat := range chunkStats {
        totalOps += stat.weight
    }

    target := totalOps / float32(nbChunks)
    if target != 0.0 {
        // if target == 0, all the scalars are 0 everywhere; there is no work to be done.
        for i := 0; i < len(chunkStats); i++ {
            chunkStats[i].weight = (chunkStats[i].weight * 100.0) / target
        }
    }

    return digits, chunkStats
}
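
// decodeDigit is an illustrative helper (hypothetical, not part of the
// generated file): it inverts the signed-digit encoding produced by
// partitionScalars, where a positive digit d is stored as d<<1, a negative
// digit d as ((-d-1)<<1)|1, and a zero entry means the window contributes
// nothing.
func decodeDigit(bits uint16) int {
    if bits&1 == 1 {
        // odd: negative digit
        return -int(bits>>1) - 1
    }
    // even: positive digit (or zero)
    return int(bits >> 1)
}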