github.com/koko1123/flow-go-1@v0.29.6/module/jobqueue/consumer.go (about) 1 package jobqueue 2 3 import ( 4 "errors" 5 "fmt" 6 "sync" 7 8 "github.com/rs/zerolog" 9 "go.uber.org/atomic" 10 11 "github.com/koko1123/flow-go-1/module" 12 "github.com/koko1123/flow-go-1/storage" 13 ) 14 15 type Worker interface { 16 // returned error must be unexpected fatal error 17 Run(job module.Job) error 18 } 19 20 type Consumer struct { 21 mu sync.Mutex 22 log zerolog.Logger 23 24 // Storage 25 jobs module.Jobs // storage to read jobs from 26 progress storage.ConsumerProgress // to resume from first unprocessed job after restarting 27 28 // dependency 29 worker Worker // to process job and notify consumer when finish processing a job 30 31 // Config 32 maxProcessing uint64 // max number of jobs to be processed concurrently 33 maxSearchAhead uint64 // max number of jobs beyond processedIndex to process. 0 means no limit 34 35 // State Variables 36 running bool // a signal to control whether to start processing more jobs. Useful for waiting 37 // until the workers are ready 38 isChecking *atomic.Bool // allow only one process checking job processable 39 // are ready, and stop when shutting down. 40 runningJobs sync.WaitGroup // to wait for all existing jobs to finish for graceful shutdown 41 42 processedIndex uint64 43 processings map[uint64]*jobStatus // keep track of the status of each on going job 44 processingsIndex map[module.JobID]uint64 // lookup the index of the job, useful when fast forwarding the 45 // `processed` variable 46 } 47 48 func NewConsumer( 49 log zerolog.Logger, 50 jobs module.Jobs, 51 progress storage.ConsumerProgress, 52 worker Worker, 53 maxProcessing uint64, 54 maxSearchAhead uint64, 55 ) *Consumer { 56 return &Consumer{ 57 log: log.With().Str("sub_module", "job_queue").Logger(), 58 59 // store dependency 60 jobs: jobs, 61 progress: progress, 62 worker: worker, 63 64 // update config 65 maxProcessing: maxProcessing, 66 maxSearchAhead: maxSearchAhead, 67 68 // init state variables 69 running: false, 70 isChecking: atomic.NewBool(false), 71 processedIndex: 0, 72 processings: make(map[uint64]*jobStatus), 73 processingsIndex: make(map[module.JobID]uint64), 74 } 75 } 76 77 // Start starts consuming the jobs from the job queue. 78 func (c *Consumer) Start(defaultIndex uint64) error { 79 c.mu.Lock() 80 defer c.mu.Unlock() 81 82 if c.running { 83 return nil 84 } 85 86 c.running = true 87 88 // on startup, sync with storage for the processed index 89 // to ensure the consistency 90 processedIndex, err := c.progress.ProcessedIndex() 91 if errors.Is(err, storage.ErrNotFound) { 92 err := c.progress.InitProcessedIndex(defaultIndex) 93 if errors.Is(err, storage.ErrAlreadyExists) { 94 return fmt.Errorf("processed index has already been inited, no effect for the second time. default index: %v", 95 defaultIndex) 96 } 97 98 if err != nil { 99 return fmt.Errorf("could not init processed index: %w", err) 100 } 101 102 processedIndex = defaultIndex 103 104 c.log.Warn().Uint64("processed index", processedIndex). 105 Msg("processed index not found, initialized.") 106 } else if err != nil { 107 return fmt.Errorf("could not read processed index: %w", err) 108 } 109 110 c.processedIndex = processedIndex 111 112 c.checkProcessable() 113 114 c.log.Info(). 115 Uint64("processed", processedIndex). 116 Msg("consumer started") 117 return nil 118 } 119 120 // Stop stops consuming jobs from the job queue. 121 // It blocks until the existing worker finish processing the job 122 // Note, it won't stop the existing worker from finishing their job 123 func (c *Consumer) Stop() { 124 c.mu.Lock() 125 c.running = false 126 // not to use `defer`, otherwise runningJobs.Wait will hold the lock and cause deadlock 127 c.mu.Unlock() 128 129 c.log.Info().Msg("stopping consumer") 130 c.runningJobs.Wait() 131 c.log.Info().Msg("consumer stopped") 132 } 133 134 // Size returns number of in-memory jobs that consumer is processing. 135 func (c *Consumer) Size() uint { 136 c.mu.Lock() 137 defer c.mu.Unlock() 138 139 return uint(len(c.processings)) 140 } 141 142 // LastProcessedIndex returns the last processed job index 143 func (c *Consumer) LastProcessedIndex() uint64 { 144 c.mu.Lock() 145 defer c.mu.Unlock() 146 147 return c.processedIndex 148 } 149 150 // NotifyJobIsDone let the consumer know a job has been finished, so that consumer will take 151 // the next job from the job queue if there are workers available. It returns the last processed job index. 152 func (c *Consumer) NotifyJobIsDone(jobID module.JobID) uint64 { 153 c.mu.Lock() 154 defer c.mu.Unlock() 155 c.log.Debug().Str("job_id", string(jobID)).Msg("finishing job") 156 157 if c.doneJob(jobID) { 158 c.checkProcessable() 159 } 160 161 return c.processedIndex 162 } 163 164 // Check allows the job publisher to notify the consumer that a new job has been added, so that 165 // the consumer can check if the job is processable 166 // since multiple checks at the same time are unnecessary, we could only keep one check by checking. 167 // an atomic isChecking value. 168 func (c *Consumer) Check() { 169 if !c.isChecking.CompareAndSwap(false, true) { 170 // other process is checking, we could exit and rely on that process to check 171 // processable jobs 172 return 173 } 174 175 // still need to lock here, since checkProcessable might update the state vars. 176 c.mu.Lock() 177 defer c.mu.Unlock() 178 179 c.checkProcessable() 180 181 c.isChecking.Store(false) 182 } 183 184 // checkProcessable is a wrap of the `run` function with logging 185 func (c *Consumer) checkProcessable() { 186 c.log.Debug().Msg("checking processable jobs") 187 188 processingCount, err := c.run() 189 if err != nil { 190 c.log.Error().Err(err).Msg("failed to check processables") 191 return 192 } 193 194 if processingCount > 0 { 195 c.log.Info().Int64("processing", processingCount).Msg("processing jobs") 196 } else { 197 c.log.Debug().Bool("running", c.running).Msg("no job found") 198 } 199 200 } 201 202 // run checks if there are processable jobs and process them by giving 203 // them to the callback functions. 204 // this function is passive, it won't trigger itself, but can only be 205 // triggered by either Start or NotifyJobIsDone 206 func (c *Consumer) run() (int64, error) { 207 processedFrom := c.processedIndex 208 processables, processedTo, err := c.processableJobs() 209 if err != nil { 210 return 0, fmt.Errorf("could not query processable jobs: %w", err) 211 } 212 213 c.log.Debug(). 214 Uint64("processed_from", processedFrom). 215 Uint64("processed_to", processedTo). 216 Int("processables", len(processables)). 217 Bool("running", c.running). 218 Msg("running") 219 220 for _, indexedJob := range processables { 221 jobID := indexedJob.job.ID() 222 223 c.processingsIndex[jobID] = indexedJob.index 224 c.processings[indexedJob.index] = &jobStatus{ 225 jobID: jobID, 226 done: false, 227 } 228 229 c.runningJobs.Add(1) 230 go func(j *jobAtIndex) { 231 err := c.worker.Run(j.job) 232 if err != nil { 233 c.log.Fatal().Err(err).Msg("could not run the job") 234 } 235 c.runningJobs.Done() 236 }(indexedJob) 237 } 238 239 err = c.progress.SetProcessedIndex(processedTo) 240 if err != nil { 241 return 0, fmt.Errorf("could not set processed index %v, %w", processedTo, err) 242 } 243 244 for index := c.processedIndex + 1; index <= processedTo; index++ { 245 jobStatus, ok := c.processings[index] 246 if !ok { 247 continue 248 } 249 250 delete(c.processings, index) 251 delete(c.processingsIndex, jobStatus.jobID) 252 } 253 254 c.processedIndex = processedTo 255 256 return int64(len(processables)), nil 257 } 258 259 func (c *Consumer) processableJobs() ([]*jobAtIndex, uint64, error) { 260 processables, processedTo, err := processableJobs( 261 c.jobs, 262 c.processings, 263 c.maxProcessing, 264 c.maxSearchAhead, 265 c.processedIndex, 266 ) 267 268 if err != nil { 269 return nil, 0, err 270 } 271 272 // if the consumer has been stopped, we allow the existing worker to update the progressed index 273 // but won't return any new job for processing 274 if !c.running { 275 return nil, processedTo, nil 276 } 277 278 return processables, processedTo, nil 279 } 280 281 // processableJobs check the worker's capacity and if sufficient, read 282 // jobs from the storage, return the processable jobs, and the processed 283 // index 284 func processableJobs(jobs module.Jobs, processings map[uint64]*jobStatus, maxProcessing uint64, maxSearchAhead uint64, processedIndex uint64) ([]*jobAtIndex, uint64, 285 error) { 286 processables := make([]*jobAtIndex, 0) 287 288 // count how many jobs are still processing, 289 // in order to decide whether to process a new job 290 processing := uint64(0) 291 292 // determine if the consumer should pause processing new jobs because it's too far ahead of 293 // the lowest in progress index 294 shouldPause := func(index uint64) bool { 295 if maxSearchAhead == 0 { 296 return false 297 } 298 299 return index-processedIndex > maxSearchAhead 300 } 301 302 // if still have processing capacity, find the next processable job 303 for i := processedIndex + 1; processing < maxProcessing && !shouldPause(i); i++ { 304 status, ok := processings[i] 305 306 // if no worker is processing the next job, try to read it and process 307 if !ok { 308 // take one job 309 job, err := jobs.AtIndex(i) 310 311 // if there is no more job at this index, we could stop 312 if errors.Is(err, storage.ErrNotFound) { 313 break 314 } 315 316 // exception 317 if err != nil { 318 return nil, 0, fmt.Errorf("could not read job at index %v, %w", i, err) 319 } 320 321 processing++ 322 323 processables = append(processables, &jobAtIndex{ 324 job: job, 325 index: i, 326 }) 327 continue 328 } 329 330 // only increment the processing variable when 331 // the job is not done, meaning still processing 332 if !status.done { 333 processing++ 334 continue 335 } 336 337 if i == processedIndex+1 { 338 processedIndex++ 339 } 340 } 341 342 return processables, processedIndex, nil 343 } 344 345 // doneJob updates the internal state to mark the job has been processed 346 // return true if the job is changed from processing to finished 347 // return false if the job is already finished, or removed 348 func (c *Consumer) doneJob(jobID module.JobID) bool { 349 // lock 350 index, ok := c.processingsIndex[jobID] 351 if !ok { 352 // job must has been processed 353 return false 354 } 355 356 status, ok := c.processings[index] 357 if !ok { 358 // must be a bug, if went here 359 c.log.Fatal().Msgf("bug, job (%v) can not be found by index (%v)", jobID, index) 360 return false 361 } 362 363 if status.done { 364 // job has been done already 365 return false 366 } 367 368 status.done = true 369 return true 370 } 371 372 type jobAtIndex struct { 373 job module.Job 374 index uint64 375 } 376 377 type jobStatus struct { 378 jobID module.JobID 379 done bool 380 }