go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/server/tq/tqtesting/scheduler.go (about) 1 // Copyright 2020 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tqtesting 16 17 import ( 18 "container/heap" 19 "context" 20 "fmt" 21 "math" 22 "sort" 23 "strings" 24 "sync" 25 "time" 26 27 taskspb "cloud.google.com/go/cloudtasks/apiv2/cloudtaskspb" 28 "cloud.google.com/go/pubsub/apiv1/pubsubpb" 29 "google.golang.org/grpc/codes" 30 "google.golang.org/grpc/status" 31 "google.golang.org/protobuf/proto" 32 33 "go.chromium.org/luci/common/clock" 34 "go.chromium.org/luci/common/data/stringset" 35 36 "go.chromium.org/luci/server/tq/internal/reminder" 37 ) 38 39 // ClockTag tags the clock used in scheduler's sleep. 40 const ClockTag = "tq-scheduler-sleep" 41 42 // Scheduler knows how to execute submitted tasks when they are due. 43 // 44 // This is a very primitive in-memory unholy hybrid of Cloud Tasks and PubSub 45 // services that can be used in tests and on localhost. 46 // 47 // Must be configured before the first Run call.Can be reconfigured between Run 48 // calls, but changing the configuration while Run is running is not allowed. 49 // 50 // Scheduler implements tq.Submitter interface. 51 type Scheduler struct { 52 // Executor knows how to execute tasks when their ETA arrives. 53 Executor Executor 54 55 // MaxAttempts is the maximum number of attempts for a task, including the 56 // first attempt. 57 // 58 // If negative the number of attempts is unlimited. 59 // 60 // Default is 20. 61 MaxAttempts int 62 63 // MinBackoff is an initial retry delay for failed tasks. 64 // 65 // It is doubled after each failed attempt until it reaches MaxBackoff after 66 // which it stays constant. 67 // 68 // Default is 1 sec. 69 MinBackoff time.Duration 70 71 // MaxBackoff is an upper limit on a retry delay. 72 // 73 // Default is 5 min. 74 MaxBackoff time.Duration 75 76 // TaskSucceeded is called from within the executor's `done` callback whenever 77 // a task finishes successfully, perhaps after a bunch of retries. 78 // 79 // Receives the same context as passed to Run. 80 TaskSucceeded func(ctx context.Context, task *Task) 81 82 // TaskFailed is called from within the executor's `done` callback whenever 83 // a task fails after being attempted MaxAttempts times. 84 // 85 // Receives the same context as passed to Run. 86 TaskFailed func(ctx context.Context, task *Task) 87 88 m sync.Mutex // a global lock protecting everything 89 clock clock.Clock // used to make sure only one clock is used 90 nextID int64 // for generating task names 91 seen stringset.Set // names of all tasks scheduled ever 92 tasks tasksHeap // scheduled tasks, earliest to execute first 93 executing map[*Task]struct{} // tasks being executed right now 94 recentlyFinished []*Task // tasks recently finished and not yet examined by Run 95 wg sync.WaitGroup // tracks 'executing' set 96 wakeUp chan struct{} // used to wake up Run 97 } 98 99 // Task represents an enqueued or executing task. 100 type Task struct { 101 Payload proto.Message // a clone of the original AddTask payload, if available 102 103 Task *taskspb.Task // a clone of the Cloud Tasks task as passed to Submit 104 Message *pubsubpb.PubsubMessage // a clone of the PubSub message as passed to Submit 105 106 Name string // full task name (perhaps generated) 107 Class string // TaskClass.ID passed in RegisterTaskClass. 108 ETA time.Time // when the task is due, always set at now or in future 109 110 Finished time.Time // when the task finished last execution attempt 111 Attempts int // 0 initially, incremented before each execution attempt 112 Executing bool // true if executing right now 113 114 index int // index in tasksHeap 115 } 116 117 // Copy makes a shallow copy of the task. 118 func (t *Task) Copy() *Task { 119 cpy := *t 120 return &cpy 121 } 122 123 // TaskList is a collection of tasks. 124 type TaskList []*Task 125 126 // Payloads returns a list with individual task payloads. 127 func (tl TaskList) Payloads() []proto.Message { 128 p := make([]proto.Message, len(tl)) 129 for i, t := range tl { 130 p[i] = t.Payload 131 } 132 return p 133 } 134 135 // Filter returns a new task list with tasks matching the filter. 136 func (tl TaskList) Filter(cb func(*Task) bool) TaskList { 137 var out TaskList 138 for _, t := range tl { 139 if cb(t) { 140 out = append(out, t) 141 } 142 } 143 return out 144 } 145 146 // Executing returns a list of tasks executing right now. 147 func (tl TaskList) Executing() TaskList { 148 return tl.Filter(func(t *Task) bool { return t.Executing }) 149 } 150 151 // Pending returns a list of tasks waiting execution. 152 func (tl TaskList) Pending() TaskList { 153 return tl.Filter(func(t *Task) bool { return !t.Executing }) 154 } 155 156 // SortByETA sorts the list in-place by ETA. 157 // 158 // The full sorting key is 159 // (!task.Executing, task.ETA, task.Class, task.Name) 160 // 161 // Returns it to allow chaining calls. 162 func (tl TaskList) SortByETA() TaskList { 163 sort.Slice(tl, func(i, j int) bool { 164 switch l, r := tl[i], tl[j]; { 165 case l.Executing && !r.Executing: 166 return true 167 case !l.Executing && r.Executing: 168 return false 169 case !l.ETA.Equal(r.ETA): 170 return l.ETA.Before(r.ETA) 171 case l.Class != r.Class: 172 return l.Class < r.Class 173 default: 174 return l.Name < r.Name 175 } 176 }) 177 return tl 178 } 179 180 // TasksCollector returns a callback that adds tasks to the given list. 181 // 182 // Can be passed as TaskSucceeded or TaskFailed callback to the Scheduler. 183 // 184 // Synchronizes access to the list internally, but the list should be read 185 // from only when the Scheduler is paused. 186 func TasksCollector(tl *TaskList) func(context.Context, *Task) { 187 var m sync.Mutex 188 return func(_ context.Context, t *Task) { 189 m.Lock() 190 *tl = append(*tl, t.Copy()) 191 m.Unlock() 192 } 193 } 194 195 // Executor knows how to execute tasks when their ETA arrives. 196 type Executor interface { 197 // Execute is called from Run to execute the task. 198 // 199 // The executor may execute the task right away in a blocking way or dispatch 200 // it to some other goroutine. Either way it must call `done` callback when it 201 // is done executing the task, indicating whether the task should be 202 // reenqueued for a retry. 203 // 204 // It is safe to call Scheduler's Submit from inside Execute. 205 // 206 // Receives the exact same context as Run(...), in particular this context 207 // is canceled when Run is done. 208 Execute(ctx context.Context, t *Task, done func(retry bool)) 209 } 210 211 // Submit schedules a task for later execution. 212 func (s *Scheduler) Submit(ctx context.Context, p *reminder.Payload) error { 213 // Validate the request and transform it into *Task. Note that this validation 214 // is pretty sloppy. It validates only things Scheduler depends on. It doesn't 215 // validate full conformance to Cloud APIs. 216 var task *Task 217 var namePrefix string 218 var err error 219 switch { 220 case p.CreateTaskRequest != nil: 221 task, namePrefix, err = s.prepCloudTasksTask(ctx, p.CreateTaskRequest) 222 case p.PublishRequest != nil: 223 task, namePrefix, err = s.prepPubSubTask(ctx, p.PublishRequest) 224 default: 225 err = status.Errorf(codes.InvalidArgument, "unrecognized payload kind") 226 } 227 if err != nil { 228 return err 229 } 230 231 task.Class = p.TaskClass 232 if p.Raw != nil { 233 task.Payload = proto.Clone(p.Raw) 234 } 235 236 s.m.Lock() 237 defer s.m.Unlock() 238 239 s.checkClockLocked(ctx) 240 241 if s.seen == nil { 242 s.seen = stringset.New(1) 243 } 244 if s.executing == nil { 245 s.executing = make(map[*Task]struct{}, 1) 246 } 247 248 if task.Name == "" { 249 task.Name = fmt.Sprintf("%s/generated-task-id-%08d", namePrefix, s.nextID) 250 s.nextID++ 251 } else if !s.seen.Add(task.Name) { 252 return status.Errorf(codes.AlreadyExists, "task %q already exists", task.Name) 253 } 254 255 s.enqueueLocked(task) 256 return nil 257 } 258 259 // prepCloudTasksTask makes *Task out of a Cloud Tasks request. 260 func (s *Scheduler) prepCloudTasksTask(ctx context.Context, req *taskspb.CreateTaskRequest) (*Task, string, error) { 261 if req.Parent == "" { 262 return nil, "", status.Errorf(codes.InvalidArgument, "no Parent in the request") 263 } 264 if req.Task == nil { 265 return nil, "", status.Errorf(codes.InvalidArgument, "no Task in the request") 266 } 267 if req.Task.Name != "" && !strings.HasPrefix(req.Task.Name, req.Parent+"/tasks/") { 268 return nil, "", status.Errorf(codes.InvalidArgument, "bad task name") 269 } 270 271 task := &Task{ 272 Task: proto.Clone(req.Task).(*taskspb.Task), 273 Name: req.Task.Name, 274 ETA: req.Task.ScheduleTime.AsTime(), 275 } 276 if now := clock.Now(ctx); task.ETA.Before(now) { 277 task.ETA = now 278 } 279 280 return task, req.Parent + "/tasks/", nil 281 } 282 283 // prepPubSubTask makes *Task out of Cloud PubSub request. 284 func (s *Scheduler) prepPubSubTask(ctx context.Context, req *pubsubpb.PublishRequest) (*Task, string, error) { 285 if req.Topic == "" { 286 return nil, "", status.Errorf(codes.InvalidArgument, "no Topic in the request") 287 } 288 if len(req.Messages) != 1 { 289 return nil, "", status.Errorf(codes.InvalidArgument, "expecting 1 message, got %d", len(req.Messages)) 290 } 291 return &Task{ 292 Message: proto.Clone(req.Messages[0]).(*pubsubpb.PubsubMessage), 293 ETA: clock.Now(ctx), 294 }, req.Topic + "/messages/", nil 295 } 296 297 // Tasks returns a snapshot of the scheduler state. 298 // 299 // Recalculates it from scratch, so it is a pretty expensive call. 300 // 301 // Tasks are ordered by ETA: currently executing tasks first, then scheduled 302 // tasks. 303 func (s *Scheduler) Tasks() TaskList { 304 s.m.Lock() 305 defer s.m.Unlock() 306 307 tasks := make(TaskList, 0, len(s.tasks)+len(s.executing)) 308 for _, t := range s.tasks { 309 tasks = append(tasks, t.Copy()) 310 } 311 for t := range s.executing { 312 tasks = append(tasks, t.Copy()) 313 } 314 315 return tasks.SortByETA() 316 } 317 318 // Run executes the scheduler's loop until the context is canceled or one of 319 // the stop conditions are hit. 320 // 321 // By default executes tasks serially. Pass ParallelExecute() option to execute 322 // them asynchronously. 323 // 324 // Upon exit all executing tasks has finished, there still may be pending tasks. 325 // 326 // Panics if Run is already running (perhaps in another goroutine). 327 func (s *Scheduler) Run(ctx context.Context, opts ...RunOption) { 328 func() { 329 s.m.Lock() 330 defer s.m.Unlock() 331 s.checkClockLocked(ctx) 332 if s.wakeUp != nil { 333 panic("Run is already running") 334 } 335 s.wakeUp = make(chan struct{}, 1) 336 }() 337 338 defer func() { 339 s.m.Lock() 340 defer s.m.Unlock() 341 close(s.wakeUp) 342 s.wakeUp = nil 343 s.recentlyFinished = nil 344 }() 345 346 // Waits for all initiated executing tasks to finish before returning. 347 defer s.wg.Wait() 348 349 parallelExec := false 350 for _, opt := range opts { 351 if _, ok := opt.(parallelExecute); ok { 352 parallelExec = true 353 break 354 } 355 } 356 357 for ctx.Err() == nil { 358 if s.shouldStop(opts) { 359 return 360 } 361 switch task, nextETA, taskDone := s.tryDequeueTask(ctx); { 362 case task != nil: 363 // Pass the task to the executor. It may either execute it right away 364 // or asynchronously later. Either way, when it is done it will call 365 // the finalization callback. 366 if !parallelExec { 367 s.Executor.Execute(ctx, task, taskDone) 368 } else { 369 go func() { s.Executor.Execute(ctx, task, taskDone) }() 370 } 371 case !nextETA.IsZero(): 372 select { 373 case <-s.wakeUp: 374 case <-clock.After(clock.Tag(ctx, ClockTag), nextETA.Sub(clock.Now(ctx))): 375 } 376 default: 377 select { 378 case <-s.wakeUp: 379 case <-ctx.Done(): 380 } 381 } 382 } 383 } 384 385 // enqueueLocked adds the task to the task heap and wakes up the scheduler. 386 func (s *Scheduler) enqueueLocked(task *Task) { 387 heap.Push(&s.tasks, task) 388 s.wakeUpLocked() 389 } 390 391 // wakeUpLocked signals s.wakeUp channel. 392 // 393 // This would wake up Run if it is listening or does nothing if wakeUp is nil 394 // (i.e. Run is not running). 395 func (s *Scheduler) wakeUpLocked() { 396 select { 397 case s.wakeUp <- struct{}{}: 398 default: 399 } 400 } 401 402 // tryDequeueTask pops the earliest task if it is ready for execution. 403 // 404 // A task is executable if it has ETA <= now. If no tasks are ready, returns 405 // ETA of the earliest task or time.Time{} if the queue is empty. 406 // 407 // If pops a task, returns a callback that must be called (perhaps 408 // asynchronously) when the task finishes execution. 409 func (s *Scheduler) tryDequeueTask(ctx context.Context) (t *Task, eta time.Time, done func(retry bool)) { 410 s.m.Lock() 411 defer s.m.Unlock() 412 413 if len(s.tasks) == 0 { 414 return nil, time.Time{}, nil 415 } 416 if eta := s.tasks[0].ETA; eta.After(clock.Now(ctx)) { 417 return nil, eta, nil 418 } 419 420 task := heap.Pop(&s.tasks).(*Task) 421 task.Attempts++ 422 task.Executing = true 423 s.executing[task] = struct{}{} 424 s.wg.Add(1) 425 426 return task, time.Time{}, func(retry bool) { 427 defer s.wg.Done() 428 429 reenqueued := false 430 431 s.m.Lock() 432 defer func() { 433 s.m.Unlock() 434 if !reenqueued { 435 switch { 436 case !retry && s.TaskSucceeded != nil: 437 s.TaskSucceeded(ctx, task) 438 case retry && s.TaskFailed != nil: 439 s.TaskFailed(ctx, task) 440 } 441 } 442 }() 443 444 task.Executing = false 445 task.Finished = clock.Now(ctx) 446 delete(s.executing, task) 447 448 if retry { 449 if ok, delay := s.evalRetryLocked(task); ok { 450 task.ETA = clock.Now(ctx).Add(delay) 451 s.enqueueLocked(task) 452 reenqueued = true 453 } 454 } 455 456 if !reenqueued { 457 s.recentlyFinished = append(s.recentlyFinished, task) 458 s.wakeUpLocked() // to let Run examine stop conditions 459 } 460 } 461 } 462 463 // evalRetryLocked decides if a task should be retried and when. 464 func (s *Scheduler) evalRetryLocked(t *Task) (retry bool, delay time.Duration) { 465 maxAttempts := s.MaxAttempts 466 if maxAttempts == 0 { 467 maxAttempts = 20 468 } 469 470 minBackoff := s.MinBackoff 471 if minBackoff == 0 { 472 minBackoff = time.Second 473 } 474 475 maxBackoff := s.MaxBackoff 476 if maxBackoff == 0 { 477 maxBackoff = 5 * time.Minute 478 } 479 480 if maxAttempts > 0 && t.Attempts >= maxAttempts { 481 return false, 0 482 } 483 484 delay = time.Duration(math.Pow(2, float64(t.Attempts))) * minBackoff 485 if delay > maxBackoff { 486 delay = maxBackoff 487 } 488 return true, delay 489 } 490 491 // shouldStop returns true if the scheduler should stop now. 492 func (s *Scheduler) shouldStop(opts []RunOption) bool { 493 s.m.Lock() 494 defer s.m.Unlock() 495 496 recentlyFinished := s.recentlyFinished 497 s.recentlyFinished = s.recentlyFinished[:0] 498 499 for _, opt := range opts { 500 switch v := opt.(type) { 501 case stopWhenDrained: 502 if len(s.tasks) == 0 && len(s.executing) == 0 { 503 return true 504 } 505 case stopAfter: 506 for _, t := range recentlyFinished { 507 if v.examine(t) { 508 return true 509 } 510 } 511 case stopBefore: 512 if len(s.tasks) > 0 && v.examine(s.tasks[0]) { 513 return true 514 } 515 } 516 } 517 return false 518 } 519 520 // checkClockLocked panics if `ctx` uses an unexpected clock. 521 func (s *Scheduler) checkClockLocked(ctx context.Context) { 522 clock := clock.Get(ctx) 523 if s.clock == nil { 524 s.clock = clock 525 } else if s.clock != clock { 526 panic("multiple clocks used with a single Scheduler, this is dangerous") 527 } 528 } 529 530 //////////////////////////////////////////////////////////////////////////////// 531 532 // tasksHeap is a heap of scheduled tasks, the implementation is copy-pasted 533 // from the godoc. 534 type tasksHeap []*Task 535 536 func (th tasksHeap) Len() int { return len(th) } 537 538 func (th tasksHeap) Less(i, j int) bool { 539 l, r := th[i], th[j] 540 if l.ETA.Equal(r.ETA) { 541 return l.Name < r.Name 542 } 543 return l.ETA.Before(r.ETA) 544 } 545 546 func (th tasksHeap) Swap(i, j int) { 547 th[i], th[j] = th[j], th[i] 548 th[i].index = i 549 th[j].index = j 550 } 551 552 func (th *tasksHeap) Push(x any) { 553 n := len(*th) 554 item := x.(*Task) 555 item.index = n 556 *th = append(*th, item) 557 } 558 559 func (th *tasksHeap) Pop() any { 560 old := *th 561 n := len(old) 562 item := old[n-1] 563 old[n-1] = nil // avoid memory leak 564 item.index = -1 // for safety 565 *th = old[0 : n-1] 566 return item 567 }