github.com/onflow/flow-go@v0.33.17/module/jobqueue/consumer.go (about)

     1  package jobqueue
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"sync"
     7  
     8  	"github.com/rs/zerolog"
     9  	"go.uber.org/atomic"
    10  
    11  	"github.com/onflow/flow-go/module"
    12  	"github.com/onflow/flow-go/storage"
    13  )
    14  
    15  type Worker interface {
    16  	// returned error must be unexpected fatal error
    17  	Run(job module.Job) error
    18  }
    19  
    20  type Consumer struct {
    21  	mu  sync.Mutex
    22  	log zerolog.Logger
    23  
    24  	// Storage
    25  	jobs     module.Jobs              // storage to read jobs from
    26  	progress storage.ConsumerProgress // to resume from first unprocessed job after restarting
    27  
    28  	// dependency
    29  	worker Worker // to process job and notify consumer when finish processing a job
    30  
    31  	// Config
    32  	maxProcessing  uint64 // max number of jobs to be processed concurrently
    33  	maxSearchAhead uint64 // max number of jobs beyond processedIndex to process. 0 means no limit
    34  
    35  	// State Variables
    36  	running bool // a signal to control whether to start processing more jobs. Useful for waiting
    37  	// until the workers are ready
    38  	isChecking *atomic.Bool // allow only one process checking job processable
    39  	// are ready, and stop when shutting down.
    40  	runningJobs sync.WaitGroup // to wait for all existing jobs to finish for graceful shutdown
    41  
    42  	processedIndex   uint64
    43  	processings      map[uint64]*jobStatus   // keep track of the status of each on going job
    44  	processingsIndex map[module.JobID]uint64 // lookup the index of the job, useful when fast forwarding the
    45  	// `processed` variable
    46  
    47  	started *atomic.Bool // only allow the consumer to be started once, and forbid calls to Check before Start
    48  }
    49  
    50  func NewConsumer(
    51  	log zerolog.Logger,
    52  	jobs module.Jobs,
    53  	progress storage.ConsumerProgress,
    54  	worker Worker,
    55  	maxProcessing uint64,
    56  	maxSearchAhead uint64,
    57  	defaultIndex uint64,
    58  ) (*Consumer, error) {
    59  
    60  	processedIndex, err := readProcessedIndex(log, progress, defaultIndex)
    61  	if err != nil {
    62  		return nil, fmt.Errorf("could not read processed index: %w", err)
    63  	}
    64  
    65  	return &Consumer{
    66  		log: log.With().Str("sub_module", "job_queue").Logger(),
    67  
    68  		// store dependency
    69  		jobs:     jobs,
    70  		progress: progress,
    71  		worker:   worker,
    72  
    73  		// update config
    74  		maxProcessing:  maxProcessing,
    75  		maxSearchAhead: maxSearchAhead,
    76  
    77  		// init state variables
    78  		running:          false,
    79  		isChecking:       atomic.NewBool(false),
    80  		started:          atomic.NewBool(false),
    81  		processedIndex:   processedIndex,
    82  		processings:      make(map[uint64]*jobStatus),
    83  		processingsIndex: make(map[module.JobID]uint64),
    84  	}, nil
    85  }
    86  
    87  func readProcessedIndex(log zerolog.Logger, progress storage.ConsumerProgress, defaultIndex uint64) (uint64, error) {
    88  	// on startup, sync with storage for the processed index
    89  	// to ensure the consistency
    90  	processedIndex, err := progress.ProcessedIndex()
    91  	if errors.Is(err, storage.ErrNotFound) {
    92  		err := progress.InitProcessedIndex(defaultIndex)
    93  		if errors.Is(err, storage.ErrAlreadyExists) {
    94  			return 0, fmt.Errorf("processed index has already been inited, no effect for the second time. default index: %v",
    95  				defaultIndex)
    96  		}
    97  
    98  		if err != nil {
    99  			return 0, fmt.Errorf("could not init processed index: %w", err)
   100  		}
   101  
   102  		log.Warn().Uint64("processed index", processedIndex).
   103  			Msg("processed index not found, initialized.")
   104  		return defaultIndex, nil
   105  	}
   106  
   107  	if err != nil {
   108  		return 0, fmt.Errorf("could not read processed index: %w", err)
   109  	}
   110  
   111  	return processedIndex, nil
   112  }
   113  
   114  // Start starts consuming the jobs from the job queue.
   115  func (c *Consumer) Start() error {
   116  	c.mu.Lock()
   117  	defer c.mu.Unlock()
   118  
   119  	if !c.started.CompareAndSwap(false, true) {
   120  		return fmt.Errorf("consumer has already been started")
   121  	}
   122  	c.running = true
   123  
   124  	c.log.Info().
   125  		Uint64("processed", c.processedIndex).
   126  		Msg("consumer started")
   127  
   128  	c.checkProcessable()
   129  
   130  	return nil
   131  }
   132  
   133  // Stop stops consuming jobs from the job queue.
   134  // It blocks until the existing worker finish processing the job
   135  // Note, it won't stop the existing worker from finishing their job
   136  func (c *Consumer) Stop() {
   137  	c.mu.Lock()
   138  	c.running = false
   139  	// not to use `defer`, otherwise runningJobs.Wait will hold the lock and cause deadlock
   140  	c.mu.Unlock()
   141  
   142  	c.log.Info().Msg("stopping consumer")
   143  	c.runningJobs.Wait()
   144  	c.log.Info().Msg("consumer stopped")
   145  }
   146  
   147  // Size returns number of in-memory jobs that consumer is processing.
   148  func (c *Consumer) Size() uint {
   149  	c.mu.Lock()
   150  	defer c.mu.Unlock()
   151  
   152  	return uint(len(c.processings))
   153  }
   154  
   155  // LastProcessedIndex returns the last processed job index
   156  func (c *Consumer) LastProcessedIndex() uint64 {
   157  	c.mu.Lock()
   158  	defer c.mu.Unlock()
   159  
   160  	return c.processedIndex
   161  }
   162  
   163  // NotifyJobIsDone let the consumer know a job has been finished, so that consumer will take
   164  // the next job from the job queue if there are workers available. It returns the last processed job index.
   165  func (c *Consumer) NotifyJobIsDone(jobID module.JobID) uint64 {
   166  	c.mu.Lock()
   167  	defer c.mu.Unlock()
   168  	c.log.Debug().Str("job_id", string(jobID)).Msg("finishing job")
   169  
   170  	if c.doneJob(jobID) {
   171  		c.checkProcessable()
   172  	}
   173  
   174  	return c.processedIndex
   175  }
   176  
   177  // Check allows the job publisher to notify the consumer that a new job has been added, so that
   178  // the consumer can check if the job is processable
   179  // since multiple checks at the same time are unnecessary, we could only keep one check by checking.
   180  // an atomic isChecking value.
   181  func (c *Consumer) Check() {
   182  	if !c.started.Load() {
   183  		// Check is not allowed before the consumer is started
   184  		c.log.Warn().Msg("ignoring Check before Start")
   185  		return
   186  	}
   187  
   188  	if !c.isChecking.CompareAndSwap(false, true) {
   189  		// other process is checking, we could exit and rely on that process to check
   190  		// processable jobs
   191  		return
   192  	}
   193  
   194  	// still need to lock here, since checkProcessable might update the state vars.
   195  	c.mu.Lock()
   196  	defer c.mu.Unlock()
   197  
   198  	c.checkProcessable()
   199  
   200  	c.isChecking.Store(false)
   201  }
   202  
   203  // checkProcessable is a wrap of the `run` function with logging
   204  func (c *Consumer) checkProcessable() {
   205  	c.log.Debug().Msg("checking processable jobs")
   206  
   207  	processingCount, err := c.run()
   208  	if err != nil {
   209  		c.log.Error().Err(err).Msg("failed to check processables")
   210  		return
   211  	}
   212  
   213  	if processingCount > 0 {
   214  		c.log.Info().Int64("processing", processingCount).Msg("processing jobs")
   215  	} else {
   216  		c.log.Debug().Bool("running", c.running).Msg("no job found")
   217  	}
   218  
   219  }
   220  
   221  // run checks if there are processable jobs and process them by giving
   222  // them to the callback functions.
   223  // this function is passive, it won't trigger itself, but can only be
   224  // triggered by either Start or NotifyJobIsDone
   225  func (c *Consumer) run() (int64, error) {
   226  	processedFrom := c.processedIndex
   227  	processables, processedTo, err := c.processableJobs()
   228  	if err != nil {
   229  		return 0, fmt.Errorf("could not query processable jobs: %w", err)
   230  	}
   231  
   232  	c.log.Debug().
   233  		Uint64("processed_from", processedFrom).
   234  		Uint64("processed_to", processedTo).
   235  		Int("processables", len(processables)).
   236  		Bool("running", c.running).
   237  		Msg("running")
   238  
   239  	for _, indexedJob := range processables {
   240  		jobID := indexedJob.job.ID()
   241  
   242  		c.processingsIndex[jobID] = indexedJob.index
   243  		c.processings[indexedJob.index] = &jobStatus{
   244  			jobID: jobID,
   245  			done:  false,
   246  		}
   247  
   248  		c.runningJobs.Add(1)
   249  		go func(j *jobAtIndex) {
   250  			err := c.worker.Run(j.job)
   251  			if err != nil {
   252  				c.log.Fatal().Err(err).Msg("could not run the job")
   253  			}
   254  			c.runningJobs.Done()
   255  		}(indexedJob)
   256  	}
   257  
   258  	err = c.progress.SetProcessedIndex(processedTo)
   259  	if err != nil {
   260  		return 0, fmt.Errorf("could not set processed index %v, %w", processedTo, err)
   261  	}
   262  
   263  	for index := c.processedIndex + 1; index <= processedTo; index++ {
   264  		jobStatus, ok := c.processings[index]
   265  		if !ok {
   266  			continue
   267  		}
   268  
   269  		delete(c.processings, index)
   270  		delete(c.processingsIndex, jobStatus.jobID)
   271  	}
   272  
   273  	c.processedIndex = processedTo
   274  
   275  	return int64(len(processables)), nil
   276  }
   277  
   278  func (c *Consumer) processableJobs() ([]*jobAtIndex, uint64, error) {
   279  	processables, processedTo, err := processableJobs(
   280  		c.jobs,
   281  		c.processings,
   282  		c.maxProcessing,
   283  		c.maxSearchAhead,
   284  		c.processedIndex,
   285  	)
   286  
   287  	if err != nil {
   288  		return nil, 0, err
   289  	}
   290  
   291  	// if the consumer has been stopped, we allow the existing worker to update the progressed index
   292  	// but won't return any new job for processing
   293  	if !c.running {
   294  		return nil, processedTo, nil
   295  	}
   296  
   297  	return processables, processedTo, nil
   298  }
   299  
   300  // processableJobs check the worker's capacity and if sufficient, read
   301  // jobs from the storage, return the processable jobs, and the processed
   302  // index
   303  func processableJobs(jobs module.Jobs, processings map[uint64]*jobStatus, maxProcessing uint64, maxSearchAhead uint64, processedIndex uint64) ([]*jobAtIndex, uint64,
   304  	error) {
   305  	processables := make([]*jobAtIndex, 0)
   306  
   307  	// count how many jobs are still processing,
   308  	// in order to decide whether to process a new job
   309  	processing := uint64(0)
   310  
   311  	// determine if the consumer should pause processing new jobs because it's too far ahead of
   312  	// the lowest in progress index
   313  	shouldPause := func(index uint64) bool {
   314  		if maxSearchAhead == 0 {
   315  			return false
   316  		}
   317  
   318  		return index-processedIndex > maxSearchAhead
   319  	}
   320  
   321  	// if still have processing capacity, find the next processable job
   322  	for i := processedIndex + 1; processing < maxProcessing && !shouldPause(i); i++ {
   323  		status, ok := processings[i]
   324  
   325  		// if no worker is processing the next job, try to read it and process
   326  		if !ok {
   327  			// take one job
   328  			job, err := jobs.AtIndex(i)
   329  
   330  			// if there is no more job at this index, we could stop
   331  			if errors.Is(err, storage.ErrNotFound) {
   332  				break
   333  			}
   334  
   335  			// exception
   336  			if err != nil {
   337  				return nil, 0, fmt.Errorf("could not read job at index %v, %w", i, err)
   338  			}
   339  
   340  			processing++
   341  
   342  			processables = append(processables, &jobAtIndex{
   343  				job:   job,
   344  				index: i,
   345  			})
   346  			continue
   347  		}
   348  
   349  		// only increment the processing variable when
   350  		// the job is not done, meaning still processing
   351  		if !status.done {
   352  			processing++
   353  			continue
   354  		}
   355  
   356  		if i == processedIndex+1 {
   357  			processedIndex++
   358  		}
   359  	}
   360  
   361  	return processables, processedIndex, nil
   362  }
   363  
   364  // doneJob updates the internal state to mark the job has been processed
   365  // return true if the job is changed from processing to finished
   366  // return false if the job is already finished, or removed
   367  func (c *Consumer) doneJob(jobID module.JobID) bool {
   368  	// lock
   369  	index, ok := c.processingsIndex[jobID]
   370  	if !ok {
   371  		// job must has been processed
   372  		return false
   373  	}
   374  
   375  	status, ok := c.processings[index]
   376  	if !ok {
   377  		// must be a bug, if went here
   378  		c.log.Fatal().Msgf("bug, job (%v) can not be found by index (%v)", jobID, index)
   379  		return false
   380  	}
   381  
   382  	if status.done {
   383  		// job has been done already
   384  		return false
   385  	}
   386  
   387  	status.done = true
   388  	return true
   389  }
   390  
   391  type jobAtIndex struct {
   392  	job   module.Job
   393  	index uint64
   394  }
   395  
   396  type jobStatus struct {
   397  	jobID module.JobID
   398  	done  bool
   399  }