github.com/sunriselayer/sunrise-da@v0.13.1-sr3/das/coordinator.go

package das

import (
	"context"
	"sync"
	"time"

	libhead "github.com/celestiaorg/go-header"

	"github.com/sunriselayer/sunrise-da/header"
	"github.com/sunriselayer/sunrise-da/share/p2p/shrexsub"
)

// samplingCoordinator runs and coordinates sampling workers and updates the current sampling state
type samplingCoordinator struct {
	concurrencyLimit int
	samplingTimeout  time.Duration

	getter      libhead.Getter[*header.ExtendedHeader]
	sampleFn    sampleFn
	broadcastFn shrexsub.BroadcastFn

	state coordinatorState

	// resultCh fans in sampling results from workers to the coordinator
	resultCh chan result
	// updHeadCh signals the coordinator to update the network head height
	updHeadCh chan *header.ExtendedHeader
	// waitCh signals the coordinator to block, giving external callers safe access to its state
	waitCh chan *sync.WaitGroup

	// workersWg waits for all worker goroutines to finish
	workersWg sync.WaitGroup
	metrics   *metrics
	// done signals that the coordinator's run loop has exited
	done
}

// result carries errors back to the coordinator after a worker finishes its job
type result struct {
	job
	// failed holds heights that failed sampling, mapped to their failure count
	failed map[uint64]int
	err    error
}

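// newSamplingCoordinator constructs a coordinator from the DASer Parameters and the
// sampling and broadcast callbacks. All channels are unbuffered, so senders block until
// the run loop is ready to receive.
//
// A minimal usage sketch, assuming the caller supplies params, getter, sampleFn,
// broadcastFn and a previously stored checkpoint cp (illustrative names only):
//
//	sc := newSamplingCoordinator(params, getter, sampleFn, broadcastFn)
//	go sc.run(ctx, cp)          // start the coordination loop
//	sc.listen(ctx, newHead)     // notify for every header received via subscription
//	stats, err := sc.stats(ctx) // concurrency-safe snapshot of sampling progress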
func newSamplingCoordinator(
	params Parameters,
	getter libhead.Getter[*header.ExtendedHeader],
	sample sampleFn,
	broadcast shrexsub.BroadcastFn,
) *samplingCoordinator {
	return &samplingCoordinator{
		concurrencyLimit: params.ConcurrencyLimit,
		samplingTimeout:  params.SampleTimeout,
		getter:           getter,
		sampleFn:         sample,
		broadcastFn:      broadcast,
		state:            newCoordinatorState(params),
		resultCh:         make(chan result),
		updHeadCh:        make(chan *header.ExtendedHeader),
		waitCh:           make(chan *sync.WaitGroup),
		done:             newDone("sampling coordinator"),
	}
}

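// run resumes sampling from the given checkpoint and coordinates workers until ctx is
// canceled. It restarts the workers recorded in the checkpoint, then keeps the worker
// pool filled while reacting to new heads, worker results, and stats requests.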
func (sc *samplingCoordinator) run(ctx context.Context, cp checkpoint) {
	sc.state.resumeFromCheckpoint(cp)

	// resume workers
	for _, wk := range cp.Workers {
		sc.runWorker(ctx, sc.state.newJob(wk.JobType, wk.From, wk.To))
	}

	for {
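		// schedule new jobs until the concurrency limit is reached or no job is available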
		for !sc.concurrencyLimitReached() {
			next, found := sc.state.nextJob()
			if !found {
				break
			}
			sc.runWorker(ctx, next)
		}

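		// wait for the next event: a new network head, a worker result, a stats request,
		// or shutdown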
		select {
		case head := <-sc.updHeadCh:
			if sc.state.isNewHead(head.Height()) {
				// run the worker without the regular concurrency limit restrictions to reduce delay
				if !sc.recentJobsLimitReached() {
					sc.runWorker(ctx, sc.state.recentJob(head))
				}
				sc.state.updateHead(head.Height())
				sc.metrics.observeNewHead(ctx)
			}
		case res := <-sc.resultCh:
			sc.state.handleResult(res)
		case wg := <-sc.waitCh:
			wg.Wait()
		case <-ctx.Done():
			sc.workersWg.Wait()
			sc.indicateDone()
			return
		}
	}
}

// runWorker runs the job in a separate worker goroutine
func (sc *samplingCoordinator) runWorker(ctx context.Context, j job) {
	w := newWorker(j, sc.getter, sc.sampleFn, sc.broadcastFn, sc.metrics)
	sc.state.putInProgress(j.id, w.getState)

	// launch the worker goroutine
	sc.workersWg.Add(1)
	go func() {
		defer sc.workersWg.Done()
		w.run(ctx, sc.samplingTimeout, sc.resultCh)
	}()
}

// listen notifies the coordinator about a new network head received via subscription.
func (sc *samplingCoordinator) listen(ctx context.Context, h *header.ExtendedHeader) {
	select {
	case sc.updHeadCh <- h:
	case <-ctx.Done():
	}
}

// stats pauses the coordinator to read its stats in a concurrency-safe manner
func (sc *samplingCoordinator) stats(ctx context.Context) (SamplingStats, error) {
	var wg sync.WaitGroup
	wg.Add(1)
	defer wg.Done()

	select {
	case sc.waitCh <- &wg:
	case <-ctx.Done():
		return SamplingStats{}, ctx.Err()
	}

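	// the run loop is now blocked on wg.Wait, so reading the state directly is safe;
	// the deferred wg.Done releases the coordinator once this call returns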
	return sc.state.unsafeStats(), nil
}

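// getCheckpoint snapshots the current sampling state as a checkpoint that can be stored
// and later passed back to run to resume sampling.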
func (sc *samplingCoordinator) getCheckpoint(ctx context.Context) (checkpoint, error) {
	stats, err := sc.stats(ctx)
	if err != nil {
		return checkpoint{}, err
	}
	return newCheckpoint(stats), nil
}

// concurrencyLimitReached indicates whether concurrencyLimit has been reached
func (sc *samplingCoordinator) concurrencyLimitReached() bool {
	return len(sc.state.inProgress) >= sc.concurrencyLimit
}

// recentJobsLimitReached indicates whether the concurrency limit for recent jobs has been reached.
// Recent-head jobs may use up to twice the regular limit so that sampling of new heads is not delayed.
func (sc *samplingCoordinator) recentJobsLimitReached() bool {
	return len(sc.state.inProgress) >= 2*sc.concurrencyLimit
}