github.com/sunriselayer/sunrise-da@v0.13.1-sr3/das/state.go

package das

import (
	"context"
	"sync/atomic"
	"time"

	"github.com/sunriselayer/sunrise-da/header"
)

// coordinatorState represents the current state of the sampling process
type coordinatorState struct {
	// sampleFrom is the height from which the DASer will start sampling
	sampleFrom uint64
	// samplingRange is the maximum number of headers processed in one job.
	samplingRange uint64

	// keeps track of running workers
	inProgress map[int]func() workerState

	// retryStrategy implements retry backoff
	retryStrategy retryStrategy
	// failed stores heights of failed headers with the number of retry attempts as the value
	failed map[uint64]retryAttempt
	// inRetry stores (height -> attempt count) of failed headers that are currently being retried by
	// workers
	inRetry map[uint64]retryAttempt

	// nextJobID is a unique identifier that will be used for the creation of the next job
	nextJobID int
	// all headers before next were sent to workers
	next uint64
	// networkHead is the height of the latest known network head
	networkHead uint64

	// catchUpDone indicates if all headers are sampled
	catchUpDone atomic.Bool
	// catchUpDoneCh blocks until all headers are sampled
	catchUpDoneCh chan struct{}
}

// retryAttempt represents a retry attempt with a backoff delay.
type retryAttempt struct {
	// count specifies the number of retry attempts made so far.
	count int
	// after specifies the time for the next retry attempt.
	after time.Time
}

// newCoordinatorState initializes state for the samplingCoordinator
func newCoordinatorState(params Parameters) coordinatorState {
	return coordinatorState{
		sampleFrom:    params.SampleFrom,
		samplingRange: params.SamplingRange,
		inProgress:    make(map[int]func() workerState),
		retryStrategy: newRetryStrategy(exponentialBackoff(
			defaultBackoffInitialInterval,
			defaultBackoffMultiplier,
			defaultBackoffMaxRetryCount)),
		failed:        make(map[uint64]retryAttempt),
		inRetry:       make(map[uint64]retryAttempt),
		nextJobID:     0,
		next:          params.SampleFrom,
		networkHead:   params.SampleFrom,
		catchUpDoneCh: make(chan struct{}),
	}
}

func (s *coordinatorState) resumeFromCheckpoint(c checkpoint) {
	s.next = c.SampleFrom
	s.networkHead = c.NetworkHead

	for h, count := range c.Failed {
		// resumed retries should start without backoff delay
		s.failed[h] = retryAttempt{
			count: count,
			after: time.Now(),
		}
	}
}

func (s *coordinatorState) handleResult(res result) {
	delete(s.inProgress, res.id)

	switch res.jobType {
	case recentJob, catchupJob:
		s.handleRecentOrCatchupResult(res)
	case retryJob:
		s.handleRetryResult(res)
	}

	s.checkDone()
}

func (s *coordinatorState) handleRecentOrCatchupResult(res result) {
	// check if the worker retried any of the previously failed heights
	for h := range s.failed {
		if h < res.from || h > res.to {
			continue
		}

		if res.failed[h] == 0 {
			delete(s.failed, h)
		}
	}

	// update failed heights
	for h := range res.failed {
		nextRetry, _ := s.retryStrategy.nextRetry(retryAttempt{}, time.Now())
		s.failed[h] = nextRetry
	}
}

func (s *coordinatorState) handleRetryResult(res result) {
	// move heights that have failed again back to the failed map, keeping their retry count;
	// they will be picked up by retry workers later
	for h := range res.failed {
		lastRetry := s.inRetry[h]
		// height will be retried after backoff
		nextRetry, retryExceeded := s.retryStrategy.nextRetry(lastRetry, time.Now())
		if retryExceeded {
			log.Warnw("header exceeded maximum amount of sampling attempts",
				"height", h,
				"attempts", nextRetry.count)
		}
		s.failed[h] = nextRetry
	}

	// processed heights were either moved to the failed map or succeeded; clean up inRetry
	for h := res.from; h <= res.to; h++ {
		delete(s.inRetry, h)
	}
}
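
// exampleResumeFromCheckpoint is an illustrative sketch (hypothetical, not
// used elsewhere in the package) of how a stored checkpoint feeds back into
// the state: failed heights are restored with their previous attempt count
// and no backoff delay, so they become retryable right away and move into
// inRetry as soon as a retry job is handed out. The Parameters and checkpoint
// literals below assume only the fields referenced in this file.
func exampleResumeFromCheckpoint() {
	s := newCoordinatorState(Parameters{SampleFrom: 1, SamplingRange: 10})
	s.resumeFromCheckpoint(checkpoint{
		SampleFrom:  101,
		NetworkHead: 200,
		Failed:      map[uint64]int{42: 3}, // height 42 has already failed 3 times
	})

	// height 42 is immediately eligible for a retry job and moves into inRetry
	if j, found := s.retryJob(); found {
		log.Debugw("resumed retry", "height", j.from, "attempts", s.inRetry[j.from].count)
	}
}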

func (s *coordinatorState) isNewHead(newHead uint64) bool {
	// seen this header before
	if newHead <= s.networkHead {
		log.Warnf("received head height: %v, which is lower than or the same as the previously known: %v", newHead, s.networkHead)
		return false
	}
	return true
}

func (s *coordinatorState) updateHead(newHead uint64) {
	if s.networkHead == s.sampleFrom {
		log.Infow("found first header, starting sampling")
	}

	// log the previous head before overwriting it
	log.Debugw("updated head", "from_height", s.networkHead, "to_height", newHead)
	s.networkHead = newHead
	s.checkDone()
}

// recentJob creates a job to process a recent header.
func (s *coordinatorState) recentJob(header *header.ExtendedHeader) job {
	// move next to prevent the catchup job from processing the same height
	if s.next == header.Height() {
		s.next++
	}
	s.nextJobID++
	return job{
		id:      s.nextJobID,
		jobType: recentJob,
		header:  header,
		from:    header.Height(),
		to:      header.Height(),
	}
}

// nextJob returns the next retry or catchup job according to priority (retry -> catchup)
func (s *coordinatorState) nextJob() (next job, found bool) {
	// check if any retry jobs are available
	if job, found := s.retryJob(); found {
		return job, found
	}

	// if no retry jobs, make a catchup job
	return s.catchupJob()
}

// catchupJob creates a catchup job if catchup is not finished
func (s *coordinatorState) catchupJob() (next job, found bool) {
	if s.next > s.networkHead {
		return job{}, false
	}

	to := s.next + s.samplingRange - 1
	if to > s.networkHead {
		to = s.networkHead
	}
	j := s.newJob(catchupJob, s.next, to)
	s.next = to + 1
	return j, true
}

// retryJob creates a job to retry a previously failed header
func (s *coordinatorState) retryJob() (next job, found bool) {
	for h, attempt := range s.failed {
		if !attempt.canRetry() {
			// height will be retried later
			continue
		}

		// move header from failed into retry
		delete(s.failed, h)
		s.inRetry[h] = attempt
		j := s.newJob(retryJob, h, h)
		return j, true
	}

	return job{}, false
}

func (s *coordinatorState) putInProgress(jobID int, getState func() workerState) {
	s.inProgress[jobID] = getState
}

func (s *coordinatorState) newJob(jobType jobType, from, to uint64) job {
	s.nextJobID++
	return job{
		id:      s.nextJobID,
		jobType: jobType,
		from:    from,
		to:      to,
	}
}
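
// exampleJobPriority is an illustrative sketch (hypothetical, not used
// elsewhere in the package) of the scheduling order implemented above: retry
// jobs are handed out before catchup jobs, and catchup jobs advance next one
// samplingRange-sized window at a time until the network head is reached. The
// Parameters literal assumes only the SampleFrom and SamplingRange fields
// referenced in this file.
func exampleJobPriority() {
	s := newCoordinatorState(Parameters{SampleFrom: 1, SamplingRange: 10})

	// a new network head makes heights [1, 25] eligible for catchup sampling
	if s.isNewHead(25) {
		s.updateHead(25)
	}

	// a previously failed height whose backoff delay has already expired
	s.failed[7] = retryAttempt{count: 1, after: time.Now().Add(-time.Second)}

	first, _ := s.nextJob()  // retry job for height 7: retries take priority
	second, _ := s.nextJob() // catchup job covering heights [1, 10]
	third, _ := s.nextJob()  // catchup job covering heights [11, 20]

	log.Debugw("scheduled jobs",
		"first", first.jobType,
		"second", second.jobType,
		"third_from", third.from,
		"third_to", third.to)
}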

// unsafeStats collects coordinator stats without thread-safety
func (s *coordinatorState) unsafeStats() SamplingStats {
	workers := make([]WorkerStats, 0, len(s.inProgress))
	lowestFailedOrInProgress := s.next
	failed := make(map[uint64]int)

	// gather worker stats
	for _, getStats := range s.inProgress {
		wstats := getStats()
		var errMsg string
		if wstats.err != nil {
			errMsg = wstats.err.Error()
		}
		workers = append(workers, WorkerStats{
			JobType: wstats.job.jobType,
			Curr:    wstats.curr,
			From:    wstats.from,
			To:      wstats.to,
			ErrMsg:  errMsg,
		})

		for h := range wstats.failed {
			failed[h]++
			if h < lowestFailedOrInProgress {
				lowestFailedOrInProgress = h
			}
		}

		if wstats.curr < lowestFailedOrInProgress {
			lowestFailedOrInProgress = wstats.curr
		}
	}

	// account for failed heights that are not currently being worked on
	for h, retry := range s.failed {
		failed[h] += retry.count
		if h < lowestFailedOrInProgress {
			lowestFailedOrInProgress = h
		}
	}

	for h, retry := range s.inRetry {
		failed[h] += retry.count
	}

	return SamplingStats{
		SampledChainHead: lowestFailedOrInProgress - 1,
		CatchupHead:      s.next - 1,
		NetworkHead:      s.networkHead,
		Failed:           failed,
		Workers:          workers,
		Concurrency:      len(workers),
		CatchUpDone:      s.catchUpDone.Load(),
		IsRunning:        len(workers) > 0 || s.catchUpDone.Load(),
	}
}

func (s *coordinatorState) checkDone() {
	if len(s.inProgress) == 0 && len(s.failed) == 0 && s.next > s.networkHead {
		if s.catchUpDone.CompareAndSwap(false, true) {
			close(s.catchUpDoneCh)
		}
		return
	}

	if s.catchUpDone.Load() {
		// overwrite channel before storing done flag
		s.catchUpDoneCh = make(chan struct{})
		s.catchUpDone.Store(false)
	}
}

// waitCatchUp waits for the sampling process to indicate catchup is done
func (s *coordinatorState) waitCatchUp(ctx context.Context) error {
	if s.catchUpDone.Load() {
		return nil
	}
	select {
	case <-s.catchUpDoneCh:
	case <-ctx.Done():
		return ctx.Err()
	}
	return nil
}

// canRetry returns true if the time stored in "after" has passed.
func (r retryAttempt) canRetry() bool {
	return r.after.Before(time.Now())
}
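
// exampleWaitCatchUp is an illustrative sketch (hypothetical, not used
// elsewhere in the package) of the catch-up signalling above: once there are
// no in-progress workers, no failed heights, and next has moved past the
// network head, checkDone closes catchUpDoneCh and waitCatchUp unblocks.
// Access to coordinatorState is assumed to be serialized by its caller, as
// unsafeStats notes.
func exampleWaitCatchUp(ctx context.Context) error {
	s := newCoordinatorState(Parameters{SampleFrom: 1, SamplingRange: 10})
	s.updateHead(5)

	// pretend every height up to the network head has been handed out and
	// sampled successfully
	s.next = s.networkHead + 1
	s.checkDone() // closes catchUpDoneCh: nothing in progress, nothing failed

	// returns immediately because the catch-up flag is already set
	return s.waitCatchUp(ctx)
}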