github.com/celestiaorg/celestia-node@v0.15.0-beta.1/das/state.go

package das

import (
	"context"
	"sync/atomic"
	"time"

	"github.com/celestiaorg/celestia-node/header"
)

// coordinatorState represents the current state of the sampling process
type coordinatorState struct {
	// sampleFrom is the height from which the DASer will start sampling
	sampleFrom uint64
	// samplingRange is the maximum number of headers processed in one job.
	samplingRange uint64

	// inProgress keeps track of running workers by job ID
	inProgress map[int]func() workerState

	// retryStrategy implements the retry backoff policy
	retryStrategy retryStrategy
	// failed stores the heights of failed headers mapped to their retry attempt state
	failed map[uint64]retryAttempt
	// inRetry stores (height -> retry attempt) of failed headers that are currently being retried by
	// workers
	inRetry map[uint64]retryAttempt

	// nextJobID is a unique identifier that will be used for the creation of the next job
	nextJobID int
	// all headers below next have already been sent to workers
	next uint64
	// networkHead is the height of the latest known network head
	networkHead uint64

	// catchUpDone indicates whether all known headers have been sampled
	catchUpDone atomic.Bool
	// catchUpDoneCh is closed once all known headers are sampled, unblocking waiters
	catchUpDoneCh chan struct{}
}

// retryAttempt represents a retry attempt with a backoff delay.
type retryAttempt struct {
	// count specifies the number of retry attempts made so far.
	count int
	// after specifies the time for the next retry attempt.
	after time.Time
}

// newCoordinatorState initializes the state for the samplingCoordinator
func newCoordinatorState(params Parameters) coordinatorState {
	return coordinatorState{
		sampleFrom:    params.SampleFrom,
		samplingRange: params.SamplingRange,
		inProgress:    make(map[int]func() workerState),
		retryStrategy: newRetryStrategy(exponentialBackoff(
			defaultBackoffInitialInterval,
			defaultBackoffMultiplier,
			defaultBackoffMaxRetryCount)),
		failed:        make(map[uint64]retryAttempt),
		inRetry:       make(map[uint64]retryAttempt),
		nextJobID:     0,
		next:          params.SampleFrom,
		networkHead:   params.SampleFrom,
		catchUpDoneCh: make(chan struct{}),
	}
}

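// resumeFromCheckpoint restores sampling progress from a stored checkpoint:
// the next height to sample, the last known network head and previously failed
// heights, which are scheduled for retry without an initial backoff delay.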
func (s *coordinatorState) resumeFromCheckpoint(c checkpoint) {
	s.next = c.SampleFrom
	s.networkHead = c.NetworkHead

	for h, count := range c.Failed {
		// resumed retries should start without backoff delay
		s.failed[h] = retryAttempt{
			count: count,
			after: time.Now(),
		}
	}
}

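// handleResult removes the finished job from inProgress, dispatches the result
// to the matching handler based on job type and re-evaluates whether catch-up
// is done.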
func (s *coordinatorState) handleResult(res result) {
	delete(s.inProgress, res.id)

	switch res.jobType {
	case recentJob, catchupJob:
		s.handleRecentOrCatchupResult(res)
	case retryJob:
		s.handleRetryResult(res)
	}

	s.checkDone()
}

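// handleRecentOrCatchupResult drops previously failed heights within the job's
// range that the worker did not report as failed again, and schedules all
// reported failures for retry with a fresh attempt count according to the
// retry strategy.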
func (s *coordinatorState) handleRecentOrCatchupResult(res result) {
	// check if the worker retried any of the previously failed heights
	for h := range s.failed {
		if h < res.from || h > res.to {
			continue
		}

		if res.failed[h] == 0 {
			delete(s.failed, h)
		}
	}

	// update failed heights
	for h := range res.failed {
		nextRetry, _ := s.retryStrategy.nextRetry(retryAttempt{}, time.Now())
		s.failed[h] = nextRetry
	}
}

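// handleRetryResult processes the result of a retry job: heights that failed
// again are moved back to the failed map with their retry count carried
// forward (a warning is logged once the maximum attempt count is exceeded),
// and the processed range is cleared from inRetry.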
func (s *coordinatorState) handleRetryResult(res result) {
	// move heights that have failed again back to the failed map, keeping their retry count;
	// they will be picked up by retry workers later
	for h := range res.failed {
		lastRetry := s.inRetry[h]
		// height will be retried after backoff
		nextRetry, retryExceeded := s.retryStrategy.nextRetry(lastRetry, time.Now())
		if retryExceeded {
			log.Warnw("header exceeded maximum amount of sampling attempts",
				"height", h,
				"attempts", nextRetry.count)
		}
		s.failed[h] = nextRetry
	}

	// processed heights have either been moved back to the failed map or succeeded; clean up inRetry
	for h := res.from; h <= res.to; h++ {
		delete(s.inRetry, h)
	}
}

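// isNewHead reports whether newHead is higher than the currently known network
// head; heights at or below the known head are logged and ignored.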
func (s *coordinatorState) isNewHead(newHead uint64) bool {
	// seen this header before
	if newHead <= s.networkHead {
		log.Warnf("received head height: %v, which is lower than or equal to the previously known: %v", newHead, s.networkHead)
		return false
	}
	return true
}

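// updateHead advances the known network head and re-evaluates the catch-up
// state; while the head is still at its initial sampleFrom value, the first
// received header is logged as the start of sampling.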
func (s *coordinatorState) updateHead(newHead uint64) {
	if s.networkHead == s.sampleFrom {
		log.Infow("found first header, starting sampling")
	}

	// log the transition before overwriting the previously known head,
	// so from_height reflects the old value
	log.Debugw("updated head", "from_height", s.networkHead, "to_height", newHead)
	s.networkHead = newHead
	s.checkDone()
}

// recentJob creates a job to process a recent header.
func (s *coordinatorState) recentJob(header *header.ExtendedHeader) job {
	// advance next to prevent a catchup job from processing the same height
	if s.next == header.Height() {
		s.next++
	}
	s.nextJobID++
	return job{
		id:      s.nextJobID,
		jobType: recentJob,
		header:  header,
		from:    header.Height(),
		to:      header.Height(),
	}
}

// nextJob returns the next retry or catchup job according to priority (retry -> catchup)
func (s *coordinatorState) nextJob() (next job, found bool) {
	// check if any retry jobs are available
	if job, found := s.retryJob(); found {
		return job, found
	}

	// if no retry jobs, make a catchup job
	return s.catchupJob()
}

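// Illustrative example (values assumed, not taken from the original source):
// with samplingRange = 100, next = 1 and networkHead = 250, consecutive calls
// to catchupJob produce the ranges [1,100], [101,200] and [201,250], assuming
// no recent headers arrive in between; after that, no catchup job is returned
// until the network head advances.
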
// catchupJob creates a catchup job if catchup is not finished
func (s *coordinatorState) catchupJob() (next job, found bool) {
	if s.next > s.networkHead {
		return job{}, false
	}

	to := s.next + s.samplingRange - 1
	if to > s.networkHead {
		to = s.networkHead
	}
	j := s.newJob(catchupJob, s.next, to)
	s.next = to + 1
	return j, true
}

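// Note: the failed map is iterated in Go's randomized map order, so retryable
// heights are not picked in any particular sequence; heights whose backoff
// deadline has not yet passed are skipped and stay in the failed map.
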
// retryJob creates a job to retry a previously failed header
func (s *coordinatorState) retryJob() (next job, found bool) {
	for h, attempt := range s.failed {
		if !attempt.canRetry() {
			// height will be retried later
			continue
		}

		// move the height from failed into inRetry
		delete(s.failed, h)
		s.inRetry[h] = attempt
		j := s.newJob(retryJob, h, h)
		return j, true
	}

	return job{}, false
}

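// putInProgress registers a worker state accessor under the given job ID so
// that unsafeStats can report the progress of running jobs.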
func (s *coordinatorState) putInProgress(jobID int, getState func() workerState) {
	s.inProgress[jobID] = getState
}

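// newJob creates a job of the given type with a fresh job ID, covering the
// [from, to] height range.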
func (s *coordinatorState) newJob(jobType jobType, from, to uint64) job {
	s.nextJobID++
	return job{
		id:      s.nextJobID,
		jobType: jobType,
		from:    from,
		to:      to,
	}
}

// unsafeStats collects coordinator stats without thread-safety
func (s *coordinatorState) unsafeStats() SamplingStats {
	workers := make([]WorkerStats, 0, len(s.inProgress))
	lowestFailedOrInProgress := s.next
	failed := make(map[uint64]int)

	// gather worker stats
	for _, getStats := range s.inProgress {
		wstats := getStats()
		var errMsg string
		if wstats.err != nil {
			errMsg = wstats.err.Error()
		}
		workers = append(workers, WorkerStats{
			JobType: wstats.job.jobType,
			Curr:    wstats.curr,
			From:    wstats.from,
			To:      wstats.to,
			ErrMsg:  errMsg,
		})

		for h := range wstats.failed {
			failed[h]++
			if h < lowestFailedOrInProgress {
				lowestFailedOrInProgress = h
			}
		}

		if wstats.curr < lowestFailedOrInProgress {
			lowestFailedOrInProgress = wstats.curr
		}
	}

	// account for heights awaiting retry: lower lowestFailedOrInProgress to the minimum failed height
	for h, retry := range s.failed {
		failed[h] += retry.count
		if h < lowestFailedOrInProgress {
			lowestFailedOrInProgress = h
		}
	}

	for h, retry := range s.inRetry {
		failed[h] += retry.count
	}

	return SamplingStats{
		SampledChainHead: lowestFailedOrInProgress - 1,
		CatchupHead:      s.next - 1,
		NetworkHead:      s.networkHead,
		Failed:           failed,
		Workers:          workers,
		Concurrency:      len(workers),
		CatchUpDone:      s.catchUpDone.Load(),
		IsRunning:        len(workers) > 0 || s.catchUpDone.Load(),
	}
}

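// checkDone closes catchUpDoneCh once no jobs are in progress, no heights are
// left to retry and all heights up to the network head have been handed out;
// if new work appears after that, the done state is reset.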
func (s *coordinatorState) checkDone() {
	if len(s.inProgress) == 0 && len(s.failed) == 0 && s.next > s.networkHead {
		if s.catchUpDone.CompareAndSwap(false, true) {
			close(s.catchUpDoneCh)
		}
		return
	}

	if s.catchUpDone.Load() {
		// recreate the channel before resetting the done flag
		s.catchUpDoneCh = make(chan struct{})
		s.catchUpDone.Store(false)
	}
}

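// Illustrative usage sketch (not part of the original source; the state
// variable is hypothetical): callers can bound the wait for initial catch-up
// with a context deadline, for example:
//
//	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
//	defer cancel()
//	if err := state.waitCatchUp(ctx); err != nil {
//		return err // context cancelled or deadline exceeded
//	}
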
// waitCatchUp blocks until the sampling process indicates that catch-up is
// done, or until the context is cancelled.
func (s *coordinatorState) waitCatchUp(ctx context.Context) error {
	if s.catchUpDone.Load() {
		return nil
	}
	select {
	case <-s.catchUpDoneCh:
	case <-ctx.Done():
		return ctx.Err()
	}
	return nil
}

// canRetry reports whether the time stored in "after" has already passed.
func (r retryAttempt) canRetry() bool {
	return r.after.Before(time.Now())
}