go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/server/tq/tqtesting/scheduler.go

go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/server/tq/tqtesting/scheduler.go (about)

     1  // Copyright 2020 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tqtesting
    16  
    17  import (
    18  	"container/heap"
    19  	"context"
    20  	"fmt"
    21  	"math"
    22  	"sort"
    23  	"strings"
    24  	"sync"
    25  	"time"
    26  
    27  	taskspb "cloud.google.com/go/cloudtasks/apiv2/cloudtaskspb"
    28  	"cloud.google.com/go/pubsub/apiv1/pubsubpb"
    29  	"google.golang.org/grpc/codes"
    30  	"google.golang.org/grpc/status"
    31  	"google.golang.org/protobuf/proto"
    32  
    33  	"go.chromium.org/luci/common/clock"
    34  	"go.chromium.org/luci/common/data/stringset"
    35  
    36  	"go.chromium.org/luci/server/tq/internal/reminder"
    37  )
    38  
// ClockTag tags the clock used in scheduler's sleep.
//
// Run attaches this tag (via clock.Tag) to the context it passes to
// clock.After while it waits for the ETA of the earliest pending task.
// Presumably this lets tests driving a fake clock recognize the scheduler's
// timers -- confirm against the clock package documentation.
const ClockTag = "tq-scheduler-sleep"
    41  
// Scheduler knows how to execute submitted tasks when they are due.
//
// This is a very primitive in-memory unholy hybrid of Cloud Tasks and PubSub
// services that can be used in tests and on localhost.
//
// Must be configured before the first Run call. Can be reconfigured between
// Run calls, but changing the configuration while Run is running is not
// allowed.
//
// Scheduler implements tq.Submitter interface.
type Scheduler struct {
	// Executor knows how to execute tasks when their ETA arrives.
	Executor Executor

	// MaxAttempts is the maximum number of attempts for a task, including the
	// first attempt.
	//
	// If negative the number of attempts is unlimited.
	//
	// Default is 20 (used when the field is zero).
	MaxAttempts int

	// MinBackoff is an initial retry delay for failed tasks.
	//
	// It is doubled after each failed attempt until it reaches MaxBackoff after
	// which it stays constant. Concretely the delay before a retry is
	// MinBackoff * 2^Attempts, where Attempts is the number of attempts made so
	// far, i.e. the first retry happens 2*MinBackoff after the first failure.
	//
	// Default is 1 sec (used when the field is zero).
	MinBackoff time.Duration

	// MaxBackoff is an upper limit on a retry delay.
	//
	// Default is 5 min (used when the field is zero).
	MaxBackoff time.Duration

	// TaskSucceeded is called from within the executor's `done` callback whenever
	// a task finishes successfully, perhaps after a bunch of retries.
	//
	// It is called without holding the scheduler's lock.
	//
	// Receives the same context as passed to Run.
	TaskSucceeded func(ctx context.Context, task *Task)

	// TaskFailed is called from within the executor's `done` callback whenever
	// a task fails after being attempted MaxAttempts times.
	//
	// It is called without holding the scheduler's lock.
	//
	// Receives the same context as passed to Run.
	TaskFailed func(ctx context.Context, task *Task)

	m                sync.Mutex         // a global lock protecting everything
	clock            clock.Clock        // used to make sure only one clock is used
	nextID           int64              // for generating task names
	seen             stringset.Set      // explicitly given names of all tasks ever submitted (generated names are not recorded)
	tasks            tasksHeap          // scheduled tasks, earliest to execute first
	executing        map[*Task]struct{} // tasks being executed right now
	recentlyFinished []*Task            // tasks recently finished and not yet examined by Run
	wg               sync.WaitGroup     // tracks 'executing' set
	wakeUp           chan struct{}      // used to wake up Run, non-nil only while Run is running
}
    98  
// Task represents an enqueued or executing task.
//
// Exactly one of Task or Message is populated by Submit, depending on whether
// the task was submitted as a Cloud Tasks request or as a PubSub publish
// request.
type Task struct {
	Payload proto.Message // a clone of the original AddTask payload, if available

	Task    *taskspb.Task           // a clone of the Cloud Tasks task as passed to Submit
	Message *pubsubpb.PubsubMessage // a clone of the PubSub message as passed to Submit

	Name  string    // full task name (perhaps generated)
	Class string    // TaskClass.ID passed in RegisterTaskClass.
	ETA   time.Time // when the task is due, always set at now or in future

	Finished  time.Time // when the task finished last execution attempt
	Attempts  int       // 0 initially, incremented before each execution attempt
	Executing bool      // true if executing right now

	index int // index in tasksHeap, set to -1 when the task is popped from it
}
   116  
   117  // Copy makes a shallow copy of the task.
   118  func (t *Task) Copy() *Task {
   119  	cpy := *t
   120  	return &cpy
   121  }
   122  
// TaskList is a collection of tasks.
//
// It is what Scheduler.Tasks returns and what TasksCollector appends to. Its
// methods offer simple filtering and sorting helpers.
type TaskList []*Task
   125  
   126  // Payloads returns a list with individual task payloads.
   127  func (tl TaskList) Payloads() []proto.Message {
   128  	p := make([]proto.Message, len(tl))
   129  	for i, t := range tl {
   130  		p[i] = t.Payload
   131  	}
   132  	return p
   133  }
   134  
   135  // Filter returns a new task list with tasks matching the filter.
   136  func (tl TaskList) Filter(cb func(*Task) bool) TaskList {
   137  	var out TaskList
   138  	for _, t := range tl {
   139  		if cb(t) {
   140  			out = append(out, t)
   141  		}
   142  	}
   143  	return out
   144  }
   145  
   146  // Executing returns a list of tasks executing right now.
   147  func (tl TaskList) Executing() TaskList {
   148  	return tl.Filter(func(t *Task) bool { return t.Executing })
   149  }
   150  
   151  // Pending returns a list of tasks waiting execution.
   152  func (tl TaskList) Pending() TaskList {
   153  	return tl.Filter(func(t *Task) bool { return !t.Executing })
   154  }
   155  
   156  // SortByETA sorts the list in-place by ETA.
   157  //
   158  // The full sorting key is
   159  // (!task.Executing, task.ETA, task.Class, task.Name)
   160  //
   161  // Returns it to allow chaining calls.
   162  func (tl TaskList) SortByETA() TaskList {
   163  	sort.Slice(tl, func(i, j int) bool {
   164  		switch l, r := tl[i], tl[j]; {
   165  		case l.Executing && !r.Executing:
   166  			return true
   167  		case !l.Executing && r.Executing:
   168  			return false
   169  		case !l.ETA.Equal(r.ETA):
   170  			return l.ETA.Before(r.ETA)
   171  		case l.Class != r.Class:
   172  			return l.Class < r.Class
   173  		default:
   174  			return l.Name < r.Name
   175  		}
   176  	})
   177  	return tl
   178  }
   179  
   180  // TasksCollector returns a callback that adds tasks to the given list.
   181  //
   182  // Can be passed as TaskSucceeded or TaskFailed callback to the Scheduler.
   183  //
   184  // Synchronizes access to the list internally, but the list should be read
   185  // from only when the Scheduler is paused.
   186  func TasksCollector(tl *TaskList) func(context.Context, *Task) {
   187  	var m sync.Mutex
   188  	return func(_ context.Context, t *Task) {
   189  		m.Lock()
   190  		*tl = append(*tl, t.Copy())
   191  		m.Unlock()
   192  	}
   193  }
   194  
// Executor knows how to execute tasks when their ETA arrives.
type Executor interface {
	// Execute is called from Run to execute the task.
	//
	// The executor may execute the task right away in a blocking way or dispatch
	// it to some other goroutine. Either way it must call `done` callback when it
	// is done executing the task, indicating whether the task should be
	// reenqueued for a retry.
	//
	// Calling done(false) reports a success. Calling done(true) asks for a
	// retry: the task is put back into the queue if it still has attempts left,
	// otherwise it is reported as failed.
	//
	// Run waits for `done` callbacks of all tasks it has dispatched before
	// returning, so an executor that never calls `done` blocks Run forever.
	//
	// It is safe to call Scheduler's Submit from inside Execute.
	//
	// Receives the exact same context as Run(...), in particular this context
	// is canceled when Run is done.
	Execute(ctx context.Context, t *Task, done func(retry bool))
}
   210  
   211  // Submit schedules a task for later execution.
   212  func (s *Scheduler) Submit(ctx context.Context, p *reminder.Payload) error {
   213  	// Validate the request and transform it into *Task. Note that this validation
   214  	// is pretty sloppy. It validates only things Scheduler depends on. It doesn't
   215  	// validate full conformance to Cloud APIs.
   216  	var task *Task
   217  	var namePrefix string
   218  	var err error
   219  	switch {
   220  	case p.CreateTaskRequest != nil:
   221  		task, namePrefix, err = s.prepCloudTasksTask(ctx, p.CreateTaskRequest)
   222  	case p.PublishRequest != nil:
   223  		task, namePrefix, err = s.prepPubSubTask(ctx, p.PublishRequest)
   224  	default:
   225  		err = status.Errorf(codes.InvalidArgument, "unrecognized payload kind")
   226  	}
   227  	if err != nil {
   228  		return err
   229  	}
   230  
   231  	task.Class = p.TaskClass
   232  	if p.Raw != nil {
   233  		task.Payload = proto.Clone(p.Raw)
   234  	}
   235  
   236  	s.m.Lock()
   237  	defer s.m.Unlock()
   238  
   239  	s.checkClockLocked(ctx)
   240  
   241  	if s.seen == nil {
   242  		s.seen = stringset.New(1)
   243  	}
   244  	if s.executing == nil {
   245  		s.executing = make(map[*Task]struct{}, 1)
   246  	}
   247  
   248  	if task.Name == "" {
   249  		task.Name = fmt.Sprintf("%s/generated-task-id-%08d", namePrefix, s.nextID)
   250  		s.nextID++
   251  	} else if !s.seen.Add(task.Name) {
   252  		return status.Errorf(codes.AlreadyExists, "task %q already exists", task.Name)
   253  	}
   254  
   255  	s.enqueueLocked(task)
   256  	return nil
   257  }
   258  
   259  // prepCloudTasksTask makes *Task out of a Cloud Tasks request.
   260  func (s *Scheduler) prepCloudTasksTask(ctx context.Context, req *taskspb.CreateTaskRequest) (*Task, string, error) {
   261  	if req.Parent == "" {
   262  		return nil, "", status.Errorf(codes.InvalidArgument, "no Parent in the request")
   263  	}
   264  	if req.Task == nil {
   265  		return nil, "", status.Errorf(codes.InvalidArgument, "no Task in the request")
   266  	}
   267  	if req.Task.Name != "" && !strings.HasPrefix(req.Task.Name, req.Parent+"/tasks/") {
   268  		return nil, "", status.Errorf(codes.InvalidArgument, "bad task name")
   269  	}
   270  
   271  	task := &Task{
   272  		Task: proto.Clone(req.Task).(*taskspb.Task),
   273  		Name: req.Task.Name,
   274  		ETA:  req.Task.ScheduleTime.AsTime(),
   275  	}
   276  	if now := clock.Now(ctx); task.ETA.Before(now) {
   277  		task.ETA = now
   278  	}
   279  
   280  	return task, req.Parent + "/tasks/", nil
   281  }
   282  
   283  // prepPubSubTask makes *Task out of Cloud PubSub request.
   284  func (s *Scheduler) prepPubSubTask(ctx context.Context, req *pubsubpb.PublishRequest) (*Task, string, error) {
   285  	if req.Topic == "" {
   286  		return nil, "", status.Errorf(codes.InvalidArgument, "no Topic in the request")
   287  	}
   288  	if len(req.Messages) != 1 {
   289  		return nil, "", status.Errorf(codes.InvalidArgument, "expecting 1 message, got %d", len(req.Messages))
   290  	}
   291  	return &Task{
   292  		Message: proto.Clone(req.Messages[0]).(*pubsubpb.PubsubMessage),
   293  		ETA:     clock.Now(ctx),
   294  	}, req.Topic + "/messages/", nil
   295  }
   296  
   297  // Tasks returns a snapshot of the scheduler state.
   298  //
   299  // Recalculates it from scratch, so it is a pretty expensive call.
   300  //
   301  // Tasks are ordered by ETA: currently executing tasks first, then scheduled
   302  // tasks.
   303  func (s *Scheduler) Tasks() TaskList {
   304  	s.m.Lock()
   305  	defer s.m.Unlock()
   306  
   307  	tasks := make(TaskList, 0, len(s.tasks)+len(s.executing))
   308  	for _, t := range s.tasks {
   309  		tasks = append(tasks, t.Copy())
   310  	}
   311  	for t := range s.executing {
   312  		tasks = append(tasks, t.Copy())
   313  	}
   314  
   315  	return tasks.SortByETA()
   316  }
   317  
// Run executes the scheduler's loop until the context is canceled or one of
// the stop conditions are hit.
//
// By default executes tasks serially. Pass ParallelExecute() option to execute
// them asynchronously.
//
// Upon exit all executing tasks have finished, there still may be pending
// tasks.
//
// Panics if Run is already running (perhaps in another goroutine) or if `ctx`
// carries a clock different from the one this Scheduler has already been used
// with.
func (s *Scheduler) Run(ctx context.Context, opts ...RunOption) {
	// Mark the scheduler as running. A non-nil wakeUp channel serves as the
	// "Run is running" flag. The channel has a buffer of 1 so that a wake-up
	// signal sent while Run is busy is not lost.
	func() {
		s.m.Lock()
		defer s.m.Unlock()
		s.checkClockLocked(ctx)
		if s.wakeUp != nil {
			panic("Run is already running")
		}
		s.wakeUp = make(chan struct{}, 1)
	}()

	// Mark the scheduler as not running anymore. Deferred calls run in the
	// reverse order, so this happens after s.wg.Wait() below, i.e. when no
	// `done` callbacks are pending.
	defer func() {
		s.m.Lock()
		defer s.m.Unlock()
		close(s.wakeUp)
		s.wakeUp = nil
		s.recentlyFinished = nil
	}()

	// Waits for all initiated executing tasks to finish before returning.
	defer s.wg.Wait()

	// Figure out if tasks should be dispatched to separate goroutines.
	parallelExec := false
	for _, opt := range opts {
		if _, ok := opt.(parallelExecute); ok {
			parallelExec = true
			break
		}
	}

	for ctx.Err() == nil {
		// Stop conditions are reexamined on every iteration, in particular after
		// each wake-up caused by a finished task.
		if s.shouldStop(opts) {
			return
		}
		switch task, nextETA, taskDone := s.tryDequeueTask(ctx); {
		case task != nil:
			// Pass the task to the executor. It may either execute it right away
			// or asynchronously later. Either way, when it is done it will call
			// the finalization callback.
			if !parallelExec {
				s.Executor.Execute(ctx, task, taskDone)
			} else {
				go func() { s.Executor.Execute(ctx, task, taskDone) }()
			}
		case !nextETA.IsZero():
			// There are pending tasks, but none is due yet. Sleep until the
			// earliest ETA or until something changes (a task is submitted or
			// finishes).
			//
			// NOTE(review): there is no explicit ctx.Done() case here, presumably
			// the channel returned by clock.After also fires when the context is
			// canceled -- confirm against the clock package.
			select {
			case <-s.wakeUp:
			case <-clock.After(clock.Tag(ctx, ClockTag), nextETA.Sub(clock.Now(ctx))):
			}
		default:
			// The queue is empty. Wait until something changes or the context is
			// canceled.
			select {
			case <-s.wakeUp:
			case <-ctx.Done():
			}
		}
	}
}
   384  
   385  // enqueueLocked adds the task to the task heap and wakes up the scheduler.
   386  func (s *Scheduler) enqueueLocked(task *Task) {
   387  	heap.Push(&s.tasks, task)
   388  	s.wakeUpLocked()
   389  }
   390  
   391  // wakeUpLocked signals s.wakeUp channel.
   392  //
   393  // This would wake up Run if it is listening or does nothing if wakeUp is nil
   394  // (i.e. Run is not running).
   395  func (s *Scheduler) wakeUpLocked() {
   396  	select {
   397  	case s.wakeUp <- struct{}{}:
   398  	default:
   399  	}
   400  }
   401  
// tryDequeueTask pops the earliest task if it is ready for execution.
//
// A task is executable if it has ETA <= now. If no tasks are ready, returns
// ETA of the earliest task or time.Time{} if the queue is empty.
//
// If pops a task, returns a callback that must be called (perhaps
// asynchronously) when the task finishes execution. The popped task is marked
// as executing, has its attempt counter incremented and is registered in
// s.executing and s.wg until the callback is called.
func (s *Scheduler) tryDequeueTask(ctx context.Context) (t *Task, eta time.Time, done func(retry bool)) {
	s.m.Lock()
	defer s.m.Unlock()

	if len(s.tasks) == 0 {
		return nil, time.Time{}, nil
	}
	// s.tasks[0] is the top of the heap, i.e. the task with the earliest ETA.
	if eta := s.tasks[0].ETA; eta.After(clock.Now(ctx)) {
		return nil, eta, nil
	}

	task := heap.Pop(&s.tasks).(*Task)
	task.Attempts++
	task.Executing = true
	s.executing[task] = struct{}{}
	s.wg.Add(1) // balanced by s.wg.Done() in the callback below

	return task, time.Time{}, func(retry bool) {
		// Runs last, after the TaskSucceeded/TaskFailed callback has returned.
		defer s.wg.Done()

		reenqueued := false

		s.m.Lock()
		defer func() {
			// Release the lock before calling user-supplied callbacks, so they
			// are free to call back into the Scheduler.
			s.m.Unlock()
			if !reenqueued {
				switch {
				case !retry && s.TaskSucceeded != nil:
					s.TaskSucceeded(ctx, task)
				case retry && s.TaskFailed != nil:
					// A retry was requested, but the task has no attempts left.
					s.TaskFailed(ctx, task)
				}
			}
		}()

		task.Executing = false
		task.Finished = clock.Now(ctx)
		delete(s.executing, task)

		if retry {
			if ok, delay := s.evalRetryLocked(task); ok {
				task.ETA = clock.Now(ctx).Add(delay)
				s.enqueueLocked(task)
				reenqueued = true
			}
		}

		// The task is done for good (either succeeded or ran out of attempts).
		if !reenqueued {
			s.recentlyFinished = append(s.recentlyFinished, task)
			s.wakeUpLocked() // to let Run examine stop conditions
		}
	}
}
   462  
   463  // evalRetryLocked decides if a task should be retried and when.
   464  func (s *Scheduler) evalRetryLocked(t *Task) (retry bool, delay time.Duration) {
   465  	maxAttempts := s.MaxAttempts
   466  	if maxAttempts == 0 {
   467  		maxAttempts = 20
   468  	}
   469  
   470  	minBackoff := s.MinBackoff
   471  	if minBackoff == 0 {
   472  		minBackoff = time.Second
   473  	}
   474  
   475  	maxBackoff := s.MaxBackoff
   476  	if maxBackoff == 0 {
   477  		maxBackoff = 5 * time.Minute
   478  	}
   479  
   480  	if maxAttempts > 0 && t.Attempts >= maxAttempts {
   481  		return false, 0
   482  	}
   483  
   484  	delay = time.Duration(math.Pow(2, float64(t.Attempts))) * minBackoff
   485  	if delay > maxBackoff {
   486  		delay = maxBackoff
   487  	}
   488  	return true, delay
   489  }
   490  
   491  // shouldStop returns true if the scheduler should stop now.
   492  func (s *Scheduler) shouldStop(opts []RunOption) bool {
   493  	s.m.Lock()
   494  	defer s.m.Unlock()
   495  
   496  	recentlyFinished := s.recentlyFinished
   497  	s.recentlyFinished = s.recentlyFinished[:0]
   498  
   499  	for _, opt := range opts {
   500  		switch v := opt.(type) {
   501  		case stopWhenDrained:
   502  			if len(s.tasks) == 0 && len(s.executing) == 0 {
   503  				return true
   504  			}
   505  		case stopAfter:
   506  			for _, t := range recentlyFinished {
   507  				if v.examine(t) {
   508  					return true
   509  				}
   510  			}
   511  		case stopBefore:
   512  			if len(s.tasks) > 0 && v.examine(s.tasks[0]) {
   513  				return true
   514  			}
   515  		}
   516  	}
   517  	return false
   518  }
   519  
   520  // checkClockLocked panics if `ctx` uses an unexpected clock.
   521  func (s *Scheduler) checkClockLocked(ctx context.Context) {
   522  	clock := clock.Get(ctx)
   523  	if s.clock == nil {
   524  		s.clock = clock
   525  	} else if s.clock != clock {
   526  		panic("multiple clocks used with a single Scheduler, this is dangerous")
   527  	}
   528  }
   529  
   530  ////////////////////////////////////////////////////////////////////////////////
   531  
// tasksHeap is a heap of scheduled tasks, the implementation is copy-pasted
// from the godoc.
//
// Tasks are ordered by ETA, with ties broken by Name. The methods below
// implement the interface used by container/heap functions and keep the
// `index` field of each task in sync with the task's position in the slice.
type tasksHeap []*Task
   535  
   536  func (th tasksHeap) Len() int { return len(th) }
   537  
   538  func (th tasksHeap) Less(i, j int) bool {
   539  	l, r := th[i], th[j]
   540  	if l.ETA.Equal(r.ETA) {
   541  		return l.Name < r.Name
   542  	}
   543  	return l.ETA.Before(r.ETA)
   544  }
   545  
   546  func (th tasksHeap) Swap(i, j int) {
   547  	th[i], th[j] = th[j], th[i]
   548  	th[i].index = i
   549  	th[j].index = j
   550  }
   551  
   552  func (th *tasksHeap) Push(x any) {
   553  	n := len(*th)
   554  	item := x.(*Task)
   555  	item.index = n
   556  	*th = append(*th, item)
   557  }
   558  
   559  func (th *tasksHeap) Pop() any {
   560  	old := *th
   561  	n := len(old)
   562  	item := old[n-1]
   563  	old[n-1] = nil  // avoid memory leak
   564  	item.index = -1 // for safety
   565  	*th = old[0 : n-1]
   566  	return item
   567  }