github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/module/jobqueue/consumer.go

github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/module/jobqueue/consumer.go (about)

     1  package jobqueue
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"sync"
     7  
     8  	"github.com/rs/zerolog"
     9  	"go.uber.org/atomic"
    10  
    11  	"github.com/onflow/flow-go/module"
    12  	"github.com/onflow/flow-go/storage"
    13  )
    14  
    15  type Worker interface {
    16  	// returned error must be unexpected fatal error
    17  	Run(job module.Job) error
    18  }
    19  
    20  type Consumer struct {
    21  	mu  sync.Mutex
    22  	log zerolog.Logger
    23  
    24  	// Storage
    25  	jobs     module.Jobs              // storage to read jobs from
    26  	progress storage.ConsumerProgress // to resume from first unprocessed job after restarting
    27  
    28  	// dependency
    29  	worker Worker // to process job and notify consumer when finish processing a job
    30  
    31  	// Config
    32  	maxProcessing  uint64 // max number of jobs to be processed concurrently
    33  	maxSearchAhead uint64 // max number of jobs beyond processedIndex to process. 0 means no limit
    34  
    35  	// State Variables
    36  	running bool // a signal to control whether to start processing more jobs. Useful for waiting
    37  	// until the workers are ready
    38  	isChecking *atomic.Bool // allow only one process checking job processable
    39  	// are ready, and stop when shutting down.
    40  	runningJobs sync.WaitGroup // to wait for all existing jobs to finish for graceful shutdown
    41  
    42  	processedIndex   uint64
    43  	processings      map[uint64]*jobStatus   // keep track of the status of each on going job
    44  	processingsIndex map[module.JobID]uint64 // lookup the index of the job, useful when fast forwarding the
    45  	// `processed` variable
    46  
    47  	started *atomic.Bool // only allow the consumer to be started once, and forbid calls to Check before Start
    48  }
    49  
    50  func NewConsumer(
    51  	log zerolog.Logger,
    52  	jobs module.Jobs,
    53  	progress storage.ConsumerProgress,
    54  	worker Worker,
    55  	maxProcessing uint64,
    56  	maxSearchAhead uint64,
    57  	defaultIndex uint64,
    58  ) (*Consumer, error) {
    59  
    60  	processedIndex, err := readProcessedIndex(log, progress, defaultIndex)
    61  	if err != nil {
    62  		return nil, fmt.Errorf("could not read processed index: %w", err)
    63  	}
    64  
    65  	return &Consumer{
    66  		log: log.With().Str("sub_module", "job_queue").Logger(),
    67  
    68  		// store dependency
    69  		jobs:     jobs,
    70  		progress: progress,
    71  		worker:   worker,
    72  
    73  		// update config
    74  		maxProcessing:  maxProcessing,
    75  		maxSearchAhead: maxSearchAhead,
    76  
    77  		// init state variables
    78  		running:          false,
    79  		isChecking:       atomic.NewBool(false),
    80  		started:          atomic.NewBool(false),
    81  		processedIndex:   processedIndex,
    82  		processings:      make(map[uint64]*jobStatus),
    83  		processingsIndex: make(map[module.JobID]uint64),
    84  	}, nil
    85  }
    86  
    87  func readProcessedIndex(log zerolog.Logger, progress storage.ConsumerProgress, defaultIndex uint64) (uint64, error) {
    88  	// on startup, sync with storage for the processed index
    89  	// to ensure the consistency
    90  	processedIndex, err := progress.ProcessedIndex()
    91  	if errors.Is(err, storage.ErrNotFound) {
    92  		err := progress.InitProcessedIndex(defaultIndex)
    93  		if errors.Is(err, storage.ErrAlreadyExists) {
    94  			return 0, fmt.Errorf("processed index has already been inited, no effect for the second time. default index: %v",
    95  				defaultIndex)
    96  		}
    97  
    98  		if err != nil {
    99  			return 0, fmt.Errorf("could not init processed index: %w", err)
   100  		}
   101  
   102  		log.Warn().Uint64("processed index", processedIndex).
   103  			Msg("processed index not found, initialized.")
   104  		return defaultIndex, nil
   105  	}
   106  
   107  	if err != nil {
   108  		return 0, fmt.Errorf("could not read processed index: %w", err)
   109  	}
   110  
   111  	return processedIndex, nil
   112  }
   113  
   114  // Start starts consuming the jobs from the job queue.
   115  func (c *Consumer) Start() error {
   116  	c.mu.Lock()
   117  	defer c.mu.Unlock()
   118  
   119  	if !c.started.CompareAndSwap(false, true) {
   120  		return fmt.Errorf("consumer has already been started")
   121  	}
   122  	c.running = true
   123  
   124  	c.log.Info().
   125  		Uint64("processed", c.processedIndex).
   126  		Msg("consumer started")
   127  
   128  	c.checkProcessable()
   129  
   130  	return nil
   131  }
   132  
   133  // Stop stops consuming jobs from the job queue.
   134  // It blocks until the existing worker finish processing the job
   135  // Note, it won't stop the existing worker from finishing their job
   136  func (c *Consumer) Stop() {
   137  	c.mu.Lock()
   138  	c.running = false
   139  	// not to use `defer`, otherwise runningJobs.Wait will hold the lock and cause deadlock
   140  	c.mu.Unlock()
   141  
   142  	c.log.Info().Msg("stopping consumer")
   143  	c.runningJobs.Wait()
   144  	c.log.Info().Msg("consumer stopped")
   145  }
   146  
   147  // Size returns number of in-memory jobs that consumer is processing.
   148  func (c *Consumer) Size() uint {
   149  	c.mu.Lock()
   150  	defer c.mu.Unlock()
   151  
   152  	return uint(len(c.processings))
   153  }
   154  
   155  // LastProcessedIndex returns the last processed job index
   156  func (c *Consumer) LastProcessedIndex() uint64 {
   157  	c.mu.Lock()
   158  	defer c.mu.Unlock()
   159  
   160  	return c.processedIndex
   161  }
   162  
   163  // NotifyJobIsDone let the consumer know a job has been finished, so that consumer will take
   164  // the next job from the job queue if there are workers available. It returns the last processed job index.
   165  func (c *Consumer) NotifyJobIsDone(jobID module.JobID) uint64 {
   166  	c.mu.Lock()
   167  	defer c.mu.Unlock()
   168  	c.log.Debug().Str("job_id", string(jobID)).Msg("finishing job")
   169  
   170  	if c.doneJob(jobID) {
   171  		c.checkProcessable()
   172  	}
   173  
   174  	return c.processedIndex
   175  }
   176  
   177  // Check allows the job publisher to notify the consumer that a new job has been added, so that
   178  // the consumer can check if the job is processable
   179  // since multiple checks at the same time are unnecessary, we could only keep one check by checking.
   180  // an atomic isChecking value.
   181  func (c *Consumer) Check() {
   182  	if !c.started.Load() {
   183  		// Check is not allowed before the consumer is started
   184  		c.log.Warn().Msg("ignoring Check before Start")
   185  		return
   186  	}
   187  
   188  	if !c.isChecking.CompareAndSwap(false, true) {
   189  		// other process is checking, we could exit and rely on that process to check
   190  		// processable jobs
   191  		return
   192  	}
   193  
   194  	// still need to lock here, since checkProcessable might update the state vars.
   195  	c.mu.Lock()
   196  	defer c.mu.Unlock()
   197  
   198  	c.checkProcessable()
   199  
   200  	c.isChecking.Store(false)
   201  }
   202  
   203  // checkProcessable is a wrap of the `run` function with logging
   204  func (c *Consumer) checkProcessable() {
   205  	c.log.Debug().Msg("checking processable jobs")
   206  
   207  	processingCount, err := c.run()
   208  	if err != nil {
   209  		c.log.Error().Err(err).Msg("failed to check processables")
   210  		return
   211  	}
   212  
   213  	if processingCount > 0 {
   214  		c.log.Info().Int64("processing", processingCount).Msg("processing jobs")
   215  	} else {
   216  		c.log.Debug().Bool("running", c.running).Msg("no job found")
   217  	}
   218  }
   219  
   220  // run checks if there are processable jobs and process them by giving
   221  // them to the callback functions.
   222  // this function is passive, it won't trigger itself, but can only be
   223  // triggered by either Start or NotifyJobIsDone
   224  func (c *Consumer) run() (int64, error) {
   225  	processedFrom := c.processedIndex
   226  	processables, processedTo, err := c.processableJobs()
   227  	if err != nil {
   228  		return 0, fmt.Errorf("could not query processable jobs: %w", err)
   229  	}
   230  
   231  	c.log.Debug().
   232  		Uint64("processed_from", processedFrom).
   233  		Uint64("processed_to", processedTo).
   234  		Int("processables", len(processables)).
   235  		Bool("running", c.running).
   236  		Msg("running")
   237  
   238  	for _, indexedJob := range processables {
   239  		jobID := indexedJob.job.ID()
   240  
   241  		c.processingsIndex[jobID] = indexedJob.index
   242  		c.processings[indexedJob.index] = &jobStatus{
   243  			jobID: jobID,
   244  			done:  false,
   245  		}
   246  
   247  		c.runningJobs.Add(1)
   248  		go func(j *jobAtIndex) {
   249  			err := c.worker.Run(j.job)
   250  			if err != nil {
   251  				c.log.Fatal().Err(err).Msg("could not run the job")
   252  			}
   253  			c.runningJobs.Done()
   254  		}(indexedJob)
   255  	}
   256  
   257  	err = c.progress.SetProcessedIndex(processedTo)
   258  	if err != nil {
   259  		return 0, fmt.Errorf("could not set processed index %v, %w", processedTo, err)
   260  	}
   261  
   262  	for index := c.processedIndex + 1; index <= processedTo; index++ {
   263  		jobStatus, ok := c.processings[index]
   264  		if !ok {
   265  			continue
   266  		}
   267  
   268  		delete(c.processings, index)
   269  		delete(c.processingsIndex, jobStatus.jobID)
   270  	}
   271  
   272  	c.processedIndex = processedTo
   273  
   274  	return int64(len(processables)), nil
   275  }
   276  
   277  func (c *Consumer) processableJobs() ([]*jobAtIndex, uint64, error) {
   278  	processables, processedTo, err := processableJobs(
   279  		c.jobs,
   280  		c.processings,
   281  		c.maxProcessing,
   282  		c.maxSearchAhead,
   283  		c.processedIndex,
   284  	)
   285  
   286  	if err != nil {
   287  		return nil, 0, err
   288  	}
   289  
   290  	// if the consumer has been stopped, we allow the existing worker to update the progressed index
   291  	// but won't return any new job for processing
   292  	if !c.running {
   293  		return nil, processedTo, nil
   294  	}
   295  
   296  	return processables, processedTo, nil
   297  }
   298  
   299  // processableJobs check the worker's capacity and if sufficient, read
   300  // jobs from the storage, return the processable jobs, and the processed
   301  // index
   302  func processableJobs(jobs module.Jobs, processings map[uint64]*jobStatus, maxProcessing uint64, maxSearchAhead uint64, processedIndex uint64) ([]*jobAtIndex, uint64,
   303  	error) {
   304  	processables := make([]*jobAtIndex, 0)
   305  
   306  	// count how many jobs are still processing,
   307  	// in order to decide whether to process a new job
   308  	processing := uint64(0)
   309  
   310  	// determine if the consumer should pause processing new jobs because it's too far ahead of
   311  	// the lowest in progress index
   312  	shouldPause := func(index uint64) bool {
   313  		if maxSearchAhead == 0 {
   314  			return false
   315  		}
   316  
   317  		return index-processedIndex > maxSearchAhead
   318  	}
   319  
   320  	// if still have processing capacity, find the next processable job
   321  	for i := processedIndex + 1; processing < maxProcessing && !shouldPause(i); i++ {
   322  		status, ok := processings[i]
   323  
   324  		// if no worker is processing the next job, try to read it and process
   325  		if !ok {
   326  			// take one job
   327  			job, err := jobs.AtIndex(i)
   328  
   329  			// if there is no more job at this index, we could stop
   330  			if errors.Is(err, storage.ErrNotFound) {
   331  				break
   332  			}
   333  
   334  			// exception
   335  			if err != nil {
   336  				return nil, 0, fmt.Errorf("could not read job at index %v, %w", i, err)
   337  			}
   338  
   339  			processing++
   340  
   341  			processables = append(processables, &jobAtIndex{
   342  				job:   job,
   343  				index: i,
   344  			})
   345  			continue
   346  		}
   347  
   348  		// only increment the processing variable when
   349  		// the job is not done, meaning still processing
   350  		if !status.done {
   351  			processing++
   352  			continue
   353  		}
   354  
   355  		if i == processedIndex+1 {
   356  			processedIndex++
   357  		}
   358  	}
   359  
   360  	return processables, processedIndex, nil
   361  }
   362  
   363  // doneJob updates the internal state to mark the job has been processed
   364  // return true if the job is changed from processing to finished
   365  // return false if the job is already finished, or removed
   366  func (c *Consumer) doneJob(jobID module.JobID) bool {
   367  	// lock
   368  	index, ok := c.processingsIndex[jobID]
   369  	if !ok {
   370  		// job must has been processed
   371  		return false
   372  	}
   373  
   374  	status, ok := c.processings[index]
   375  	if !ok {
   376  		// must be a bug, if went here
   377  		c.log.Fatal().Msgf("bug, job (%v) can not be found by index (%v)", jobID, index)
   378  		return false
   379  	}
   380  
   381  	if status.done {
   382  		// job has been done already
   383  		return false
   384  	}
   385  
   386  	status.done = true
   387  	return true
   388  }
   389  
   390  type jobAtIndex struct {
   391  	job   module.Job
   392  	index uint64
   393  }
   394  
   395  type jobStatus struct {
   396  	jobID module.JobID
   397  	done  bool
   398  }