github.com/koko1123/flow-go-1@v0.29.6/module/jobqueue/consumer.go (about)

     1  package jobqueue
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"sync"
     7  
     8  	"github.com/rs/zerolog"
     9  	"go.uber.org/atomic"
    10  
    11  	"github.com/koko1123/flow-go-1/module"
    12  	"github.com/koko1123/flow-go-1/storage"
    13  )
    14  
    15  type Worker interface {
    16  	// returned error must be unexpected fatal error
    17  	Run(job module.Job) error
    18  }
    19  
    20  type Consumer struct {
    21  	mu  sync.Mutex
    22  	log zerolog.Logger
    23  
    24  	// Storage
    25  	jobs     module.Jobs              // storage to read jobs from
    26  	progress storage.ConsumerProgress // to resume from first unprocessed job after restarting
    27  
    28  	// dependency
    29  	worker Worker // to process job and notify consumer when finish processing a job
    30  
    31  	// Config
    32  	maxProcessing  uint64 // max number of jobs to be processed concurrently
    33  	maxSearchAhead uint64 // max number of jobs beyond processedIndex to process. 0 means no limit
    34  
    35  	// State Variables
    36  	running bool // a signal to control whether to start processing more jobs. Useful for waiting
    37  	// until the workers are ready
    38  	isChecking *atomic.Bool // allow only one process checking job processable
    39  	// are ready, and stop when shutting down.
    40  	runningJobs sync.WaitGroup // to wait for all existing jobs to finish for graceful shutdown
    41  
    42  	processedIndex   uint64
    43  	processings      map[uint64]*jobStatus   // keep track of the status of each on going job
    44  	processingsIndex map[module.JobID]uint64 // lookup the index of the job, useful when fast forwarding the
    45  	// `processed` variable
    46  }
    47  
    48  func NewConsumer(
    49  	log zerolog.Logger,
    50  	jobs module.Jobs,
    51  	progress storage.ConsumerProgress,
    52  	worker Worker,
    53  	maxProcessing uint64,
    54  	maxSearchAhead uint64,
    55  ) *Consumer {
    56  	return &Consumer{
    57  		log: log.With().Str("sub_module", "job_queue").Logger(),
    58  
    59  		// store dependency
    60  		jobs:     jobs,
    61  		progress: progress,
    62  		worker:   worker,
    63  
    64  		// update config
    65  		maxProcessing:  maxProcessing,
    66  		maxSearchAhead: maxSearchAhead,
    67  
    68  		// init state variables
    69  		running:          false,
    70  		isChecking:       atomic.NewBool(false),
    71  		processedIndex:   0,
    72  		processings:      make(map[uint64]*jobStatus),
    73  		processingsIndex: make(map[module.JobID]uint64),
    74  	}
    75  }
    76  
    77  // Start starts consuming the jobs from the job queue.
    78  func (c *Consumer) Start(defaultIndex uint64) error {
    79  	c.mu.Lock()
    80  	defer c.mu.Unlock()
    81  
    82  	if c.running {
    83  		return nil
    84  	}
    85  
    86  	c.running = true
    87  
    88  	// on startup, sync with storage for the processed index
    89  	// to ensure the consistency
    90  	processedIndex, err := c.progress.ProcessedIndex()
    91  	if errors.Is(err, storage.ErrNotFound) {
    92  		err := c.progress.InitProcessedIndex(defaultIndex)
    93  		if errors.Is(err, storage.ErrAlreadyExists) {
    94  			return fmt.Errorf("processed index has already been inited, no effect for the second time. default index: %v",
    95  				defaultIndex)
    96  		}
    97  
    98  		if err != nil {
    99  			return fmt.Errorf("could not init processed index: %w", err)
   100  		}
   101  
   102  		processedIndex = defaultIndex
   103  
   104  		c.log.Warn().Uint64("processed index", processedIndex).
   105  			Msg("processed index not found, initialized.")
   106  	} else if err != nil {
   107  		return fmt.Errorf("could not read processed index: %w", err)
   108  	}
   109  
   110  	c.processedIndex = processedIndex
   111  
   112  	c.checkProcessable()
   113  
   114  	c.log.Info().
   115  		Uint64("processed", processedIndex).
   116  		Msg("consumer started")
   117  	return nil
   118  }
   119  
   120  // Stop stops consuming jobs from the job queue.
   121  // It blocks until the existing worker finish processing the job
   122  // Note, it won't stop the existing worker from finishing their job
   123  func (c *Consumer) Stop() {
   124  	c.mu.Lock()
   125  	c.running = false
   126  	// not to use `defer`, otherwise runningJobs.Wait will hold the lock and cause deadlock
   127  	c.mu.Unlock()
   128  
   129  	c.log.Info().Msg("stopping consumer")
   130  	c.runningJobs.Wait()
   131  	c.log.Info().Msg("consumer stopped")
   132  }
   133  
   134  // Size returns number of in-memory jobs that consumer is processing.
   135  func (c *Consumer) Size() uint {
   136  	c.mu.Lock()
   137  	defer c.mu.Unlock()
   138  
   139  	return uint(len(c.processings))
   140  }
   141  
   142  // LastProcessedIndex returns the last processed job index
   143  func (c *Consumer) LastProcessedIndex() uint64 {
   144  	c.mu.Lock()
   145  	defer c.mu.Unlock()
   146  
   147  	return c.processedIndex
   148  }
   149  
   150  // NotifyJobIsDone let the consumer know a job has been finished, so that consumer will take
   151  // the next job from the job queue if there are workers available. It returns the last processed job index.
   152  func (c *Consumer) NotifyJobIsDone(jobID module.JobID) uint64 {
   153  	c.mu.Lock()
   154  	defer c.mu.Unlock()
   155  	c.log.Debug().Str("job_id", string(jobID)).Msg("finishing job")
   156  
   157  	if c.doneJob(jobID) {
   158  		c.checkProcessable()
   159  	}
   160  
   161  	return c.processedIndex
   162  }
   163  
   164  // Check allows the job publisher to notify the consumer that a new job has been added, so that
   165  // the consumer can check if the job is processable
   166  // since multiple checks at the same time are unnecessary, we could only keep one check by checking.
   167  // an atomic isChecking value.
   168  func (c *Consumer) Check() {
   169  	if !c.isChecking.CompareAndSwap(false, true) {
   170  		// other process is checking, we could exit and rely on that process to check
   171  		// processable jobs
   172  		return
   173  	}
   174  
   175  	// still need to lock here, since checkProcessable might update the state vars.
   176  	c.mu.Lock()
   177  	defer c.mu.Unlock()
   178  
   179  	c.checkProcessable()
   180  
   181  	c.isChecking.Store(false)
   182  }
   183  
   184  // checkProcessable is a wrap of the `run` function with logging
   185  func (c *Consumer) checkProcessable() {
   186  	c.log.Debug().Msg("checking processable jobs")
   187  
   188  	processingCount, err := c.run()
   189  	if err != nil {
   190  		c.log.Error().Err(err).Msg("failed to check processables")
   191  		return
   192  	}
   193  
   194  	if processingCount > 0 {
   195  		c.log.Info().Int64("processing", processingCount).Msg("processing jobs")
   196  	} else {
   197  		c.log.Debug().Bool("running", c.running).Msg("no job found")
   198  	}
   199  
   200  }
   201  
   202  // run checks if there are processable jobs and process them by giving
   203  // them to the callback functions.
   204  // this function is passive, it won't trigger itself, but can only be
   205  // triggered by either Start or NotifyJobIsDone
   206  func (c *Consumer) run() (int64, error) {
   207  	processedFrom := c.processedIndex
   208  	processables, processedTo, err := c.processableJobs()
   209  	if err != nil {
   210  		return 0, fmt.Errorf("could not query processable jobs: %w", err)
   211  	}
   212  
   213  	c.log.Debug().
   214  		Uint64("processed_from", processedFrom).
   215  		Uint64("processed_to", processedTo).
   216  		Int("processables", len(processables)).
   217  		Bool("running", c.running).
   218  		Msg("running")
   219  
   220  	for _, indexedJob := range processables {
   221  		jobID := indexedJob.job.ID()
   222  
   223  		c.processingsIndex[jobID] = indexedJob.index
   224  		c.processings[indexedJob.index] = &jobStatus{
   225  			jobID: jobID,
   226  			done:  false,
   227  		}
   228  
   229  		c.runningJobs.Add(1)
   230  		go func(j *jobAtIndex) {
   231  			err := c.worker.Run(j.job)
   232  			if err != nil {
   233  				c.log.Fatal().Err(err).Msg("could not run the job")
   234  			}
   235  			c.runningJobs.Done()
   236  		}(indexedJob)
   237  	}
   238  
   239  	err = c.progress.SetProcessedIndex(processedTo)
   240  	if err != nil {
   241  		return 0, fmt.Errorf("could not set processed index %v, %w", processedTo, err)
   242  	}
   243  
   244  	for index := c.processedIndex + 1; index <= processedTo; index++ {
   245  		jobStatus, ok := c.processings[index]
   246  		if !ok {
   247  			continue
   248  		}
   249  
   250  		delete(c.processings, index)
   251  		delete(c.processingsIndex, jobStatus.jobID)
   252  	}
   253  
   254  	c.processedIndex = processedTo
   255  
   256  	return int64(len(processables)), nil
   257  }
   258  
   259  func (c *Consumer) processableJobs() ([]*jobAtIndex, uint64, error) {
   260  	processables, processedTo, err := processableJobs(
   261  		c.jobs,
   262  		c.processings,
   263  		c.maxProcessing,
   264  		c.maxSearchAhead,
   265  		c.processedIndex,
   266  	)
   267  
   268  	if err != nil {
   269  		return nil, 0, err
   270  	}
   271  
   272  	// if the consumer has been stopped, we allow the existing worker to update the progressed index
   273  	// but won't return any new job for processing
   274  	if !c.running {
   275  		return nil, processedTo, nil
   276  	}
   277  
   278  	return processables, processedTo, nil
   279  }
   280  
   281  // processableJobs check the worker's capacity and if sufficient, read
   282  // jobs from the storage, return the processable jobs, and the processed
   283  // index
   284  func processableJobs(jobs module.Jobs, processings map[uint64]*jobStatus, maxProcessing uint64, maxSearchAhead uint64, processedIndex uint64) ([]*jobAtIndex, uint64,
   285  	error) {
   286  	processables := make([]*jobAtIndex, 0)
   287  
   288  	// count how many jobs are still processing,
   289  	// in order to decide whether to process a new job
   290  	processing := uint64(0)
   291  
   292  	// determine if the consumer should pause processing new jobs because it's too far ahead of
   293  	// the lowest in progress index
   294  	shouldPause := func(index uint64) bool {
   295  		if maxSearchAhead == 0 {
   296  			return false
   297  		}
   298  
   299  		return index-processedIndex > maxSearchAhead
   300  	}
   301  
   302  	// if still have processing capacity, find the next processable job
   303  	for i := processedIndex + 1; processing < maxProcessing && !shouldPause(i); i++ {
   304  		status, ok := processings[i]
   305  
   306  		// if no worker is processing the next job, try to read it and process
   307  		if !ok {
   308  			// take one job
   309  			job, err := jobs.AtIndex(i)
   310  
   311  			// if there is no more job at this index, we could stop
   312  			if errors.Is(err, storage.ErrNotFound) {
   313  				break
   314  			}
   315  
   316  			// exception
   317  			if err != nil {
   318  				return nil, 0, fmt.Errorf("could not read job at index %v, %w", i, err)
   319  			}
   320  
   321  			processing++
   322  
   323  			processables = append(processables, &jobAtIndex{
   324  				job:   job,
   325  				index: i,
   326  			})
   327  			continue
   328  		}
   329  
   330  		// only increment the processing variable when
   331  		// the job is not done, meaning still processing
   332  		if !status.done {
   333  			processing++
   334  			continue
   335  		}
   336  
   337  		if i == processedIndex+1 {
   338  			processedIndex++
   339  		}
   340  	}
   341  
   342  	return processables, processedIndex, nil
   343  }
   344  
   345  // doneJob updates the internal state to mark the job has been processed
   346  // return true if the job is changed from processing to finished
   347  // return false if the job is already finished, or removed
   348  func (c *Consumer) doneJob(jobID module.JobID) bool {
   349  	// lock
   350  	index, ok := c.processingsIndex[jobID]
   351  	if !ok {
   352  		// job must has been processed
   353  		return false
   354  	}
   355  
   356  	status, ok := c.processings[index]
   357  	if !ok {
   358  		// must be a bug, if went here
   359  		c.log.Fatal().Msgf("bug, job (%v) can not be found by index (%v)", jobID, index)
   360  		return false
   361  	}
   362  
   363  	if status.done {
   364  		// job has been done already
   365  		return false
   366  	}
   367  
   368  	status.done = true
   369  	return true
   370  }
   371  
   372  type jobAtIndex struct {
   373  	job   module.Job
   374  	index uint64
   375  }
   376  
   377  type jobStatus struct {
   378  	jobID module.JobID
   379  	done  bool
   380  }