github.com/hugh712/snapd@v0.0.0-20200910133618-1a99902bd583/overlord/state/taskrunner.go

// -*- Mode: Go; indent-tabs-mode: t -*-

/*
 * Copyright (C) 2016 Canonical Ltd
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 3 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */

package state

import (
	"sync"
	"time"

	"gopkg.in/tomb.v2"

	"github.com/snapcore/snapd/logger"
)

// HandlerFunc is the type of function for the handlers
type HandlerFunc func(task *Task, tomb *tomb.Tomb) error

// Retry is returned from a handler to signal that it is ok to rerun the
// task at a later point. It's to be used also when a task goroutine
// is asked to stop through its tomb. After can be used to indicate
// how much to postpone the retry, 0 (the default) means at the next
// ensure pass and is what should be used if stopped through its tomb.
// Reason is an optional explanation of the conflict.
type Retry struct {
	After  time.Duration
	Reason string
}

func (r *Retry) Error() string {
	return "task should be retried"
}

type blockedFunc func(t *Task, running []*Task) bool

// TaskRunner controls the running of goroutines to execute known task kinds.
type TaskRunner struct {
	state *State

	// locking
	mu       sync.Mutex
	handlers map[string]handlerPair
	optional []optionalHandler
	cleanups map[string]HandlerFunc
	stopped  bool

	blocked     []blockedFunc
	someBlocked bool

	// optional callback executed on task errors
	taskErrorCallback func(err error)

	// go-routines lifecycle
	tombs map[string]*tomb.Tomb
}

type handlerPair struct {
	do, undo HandlerFunc
}

type optionalHandler struct {
	match func(t *Task) bool
	handlerPair
}

// NewTaskRunner creates a new TaskRunner
func NewTaskRunner(s *State) *TaskRunner {
	return &TaskRunner{
		state:    s,
		handlers: make(map[string]handlerPair),
		cleanups: make(map[string]HandlerFunc),
		tombs:    make(map[string]*tomb.Tomb),
	}
}

// OnTaskError sets an error callback executed when any task errors out.
func (r *TaskRunner) OnTaskError(f func(err error)) {
	r.taskErrorCallback = f
}

// AddHandler registers the functions to concurrently call for doing and
// undoing tasks of the given kind. The undo handler may be nil.
func (r *TaskRunner) AddHandler(kind string, do, undo HandlerFunc) {
	r.mu.Lock()
	defer r.mu.Unlock()

	r.handlers[kind] = handlerPair{do, undo}
}
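
// The sketch below is illustrative only and not part of the original file: it
// shows how a caller would typically register a do/undo handler pair and ask
// for a retry from the do handler. The task kind "example-kind" and the
// function name exampleHandlerRegistration are assumptions made for the
// example.
func exampleHandlerRegistration(runner *TaskRunner) {
	runner.AddHandler("example-kind",
		func(t *Task, tb *tomb.Tomb) error {
			// Not ready yet: ask the runner to call this handler
			// again, no earlier than five seconds from now.
			return &Retry{After: 5 * time.Second, Reason: "waiting for a resource"}
		},
		func(t *Task, tb *tomb.Tomb) error {
			// Undo handler: revert whatever the do handler changed.
			return nil
		})
}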

// AddOptionalHandler registers functions for doing and undoing tasks that match
// the given predicate if no explicit handler was registered for the task kind.
func (r *TaskRunner) AddOptionalHandler(match func(t *Task) bool, do, undo HandlerFunc) {
	r.optional = append(r.optional, optionalHandler{match, handlerPair{do, undo}})
}

func (r *TaskRunner) handlerPair(t *Task) handlerPair {
	if handler, ok := r.handlers[t.Kind()]; ok {
		return handler
	}
	for _, h := range r.optional {
		if h.match(t) {
			return h.handlerPair
		}
	}
	return handlerPair{}
}

// KnownTaskKinds returns all task kinds handled by this runner.
func (r *TaskRunner) KnownTaskKinds() []string {
	kinds := make([]string, 0, len(r.handlers))
	for h := range r.handlers {
		kinds = append(kinds, h)
	}
	return kinds
}

// AddCleanup registers a function to be called after the change completes,
// for cleaning up data left behind by tasks of the specified kind.
// The provided function will be called no matter what the final status of the
// task is. This mechanism enables keeping data around for a potential undo
// until there's no more chance of the task being undone.
//
// The cleanup function is run concurrently with other cleanup functions,
// despite any wait ordering between the tasks. If it returns an error,
// it will be retried later.
//
// The handler for tasks of the provided kind must have been previously
// registered before AddCleanup is called for it.
func (r *TaskRunner) AddCleanup(kind string, cleanup HandlerFunc) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if _, ok := r.handlers[kind]; !ok {
		panic("internal error: attempted to register cleanup for unknown task kind")
	}
	r.cleanups[kind] = cleanup
}

// SetBlocked sets a predicate function to decide whether to block a task from
// running based on the current running tasks. It can be used to control task
// serialisation.
func (r *TaskRunner) SetBlocked(pred func(t *Task, running []*Task) bool) {
	r.mu.Lock()
	defer r.mu.Unlock()

	r.blocked = []blockedFunc{pred}
}

// AddBlocked adds a predicate function to decide whether to block a task from
// running based on the current running tasks. It can be used to control task
// serialisation. All added predicates are considered in turn until one returns
// true, or none.
func (r *TaskRunner) AddBlocked(pred func(t *Task, running []*Task) bool) {
	r.mu.Lock()
	defer r.mu.Unlock()

	r.blocked = append(r.blocked, pred)
}
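
// The sketch below is illustrative only and not part of the original file: it
// shows a blocked predicate that serialises tasks of an assumed kind
// "example-serial" so that at most one of them runs at a time. The kind name
// and the function name exampleSerialisedKind are assumptions made for the
// example.
func exampleSerialisedKind(runner *TaskRunner) {
	runner.AddBlocked(func(t *Task, running []*Task) bool {
		if t.Kind() != "example-serial" {
			return false
		}
		for _, other := range running {
			if other.Kind() == "example-serial" {
				// Another task of the same kind is already
				// running, so block this one for now.
				return true
			}
		}
		return false
	})
}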

// run must be called with the state lock in place
func (r *TaskRunner) run(t *Task) {
	var handler HandlerFunc
	var accuRuntime func(dur time.Duration)
	switch t.Status() {
	case DoStatus:
		t.SetStatus(DoingStatus)
		fallthrough
	case DoingStatus:
		handler = r.handlerPair(t).do
		accuRuntime = t.accumulateDoingTime

	case UndoStatus:
		t.SetStatus(UndoingStatus)
		fallthrough
	case UndoingStatus:
		handler = r.handlerPair(t).undo
		accuRuntime = t.accumulateUndoingTime

	default:
		panic("internal error: attempted to run task in status " + t.Status().String())
	}
	if handler == nil {
		panic("internal error: attempted to run task with nil handler for status " + t.Status().String())
	}

	t.At(time.Time{}) // clear schedule
	tomb := &tomb.Tomb{}
	r.tombs[t.ID()] = tomb
	tomb.Go(func() error {
		// Capture the error result with tomb.Kill so we can
		// use tomb.Err uniformly to consider either it or an
		// overriding previous Kill reason.
		t0 := time.Now()
		tomb.Kill(handler(t, tomb))
		t1 := time.Now()

		// Locks must be acquired in the same order everywhere.
		r.mu.Lock()
		defer r.mu.Unlock()
		r.state.Lock()
		defer r.state.Unlock()
		accuRuntime(t1.Sub(t0))

		delete(r.tombs, t.ID())

		// some tasks were blocked, now there's a chance the
		// blocked predicate will change its value
		if r.someBlocked {
			r.state.EnsureBefore(0)
		}

		err := tomb.Err()
		switch err.(type) {
		case nil:
			// we are ok
		case *Retry:
			// preserve
		default:
			if r.stopped {
				// we are shutting down, errors might be due
				// to cancellations, to be safe retry
				err = &Retry{}
			}
		}

		switch x := err.(type) {
		case *Retry:
			// Handler asked to be called again later.
			// TODO Allow postponing retries past the next Ensure.
			if t.Status() == AbortStatus {
				// Would work without it but might take two ensures.
				r.tryUndo(t)
			} else if x.After != 0 {
				t.At(timeNow().Add(x.After))
			}
		case nil:
			var next []*Task
			switch t.Status() {
			case DoingStatus:
				t.SetStatus(DoneStatus)
				fallthrough
			case DoneStatus:
				next = t.HaltTasks()
			case AbortStatus:
				// It was actually Done if it got here.
				t.SetStatus(UndoStatus)
				r.state.EnsureBefore(0)
			case UndoingStatus:
				t.SetStatus(UndoneStatus)
				fallthrough
			case UndoneStatus:
				next = t.WaitTasks()
			}
			if len(next) > 0 {
				r.state.EnsureBefore(0)
			}
		default:
			r.abortLanes(t.Change(), t.Lanes())
			t.SetStatus(ErrorStatus)
			t.Errorf("%s", err)
			// ensure the error is available in the global log too
			logger.Noticef("[change %s %q task] failed: %v", t.Change().ID(), t.Summary(), err)
			if r.taskErrorCallback != nil {
				r.taskErrorCallback(err)
			}
		}

		return nil
	})
}

func (r *TaskRunner) clean(t *Task) {
	if !t.Change().IsReady() {
		// Whole Change is not ready so don't run cleanups yet.
		return
	}

	cleanup, ok := r.cleanups[t.Kind()]
	if !ok {
		t.SetClean()
		return
	}

	tomb := &tomb.Tomb{}
	r.tombs[t.ID()] = tomb
	tomb.Go(func() error {
		tomb.Kill(cleanup(t, tomb))

		// Locks must be acquired in the same order everywhere.
		r.mu.Lock()
		defer r.mu.Unlock()
		r.state.Lock()
		defer r.state.Unlock()

		delete(r.tombs, t.ID())

		if tomb.Err() != nil {
			logger.Debugf("Cleaning task %s: %s", t.ID(), tomb.Err())
		} else {
			t.SetClean()
		}
		return nil
	})
}

func (r *TaskRunner) abortLanes(chg *Change, lanes []int) {
	chg.AbortLanes(lanes)
	ensureScheduled := false
	for _, t := range chg.Tasks() {
		status := t.Status()
		if status == AbortStatus {
			if tb, ok := r.tombs[t.ID()]; ok {
				tb.Kill(nil)
			}
		}
		if !ensureScheduled && !status.Ready() {
			ensureScheduled = true
			r.state.EnsureBefore(0)
		}
	}
}

// tryUndo replaces the status of a knowingly aborted task.
func (r *TaskRunner) tryUndo(t *Task) {
	if t.Status() == AbortStatus && r.handlerPair(t).undo == nil {
		// Cannot undo but it was stopped in flight.
		// Hold so it doesn't look like it finished.
		t.SetStatus(HoldStatus)
		if len(t.WaitTasks()) > 0 {
			r.state.EnsureBefore(0)
		}
	} else {
		t.SetStatus(UndoStatus)
		r.state.EnsureBefore(0)
	}
}
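
// The sketch below is illustrative only and not part of the original file: it
// shows how a cleanup is registered after a handler for the same kind, as
// AddCleanup requires; clean above only runs cleanups once the whole change
// is ready. The kind name "example-kind" and the function name
// exampleCleanupRegistration are assumptions made for the example.
func exampleCleanupRegistration(runner *TaskRunner) {
	runner.AddHandler("example-kind",
		func(t *Task, tb *tomb.Tomb) error { return nil },
		nil)
	runner.AddCleanup("example-kind", func(t *Task, tb *tomb.Tomb) error {
		// Discard any data that was only kept around for a potential undo.
		return nil
	})
}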

// Ensure starts new goroutines for all known tasks with no pending
// dependencies.
// Note that Ensure will lock the state.
func (r *TaskRunner) Ensure() error {
	r.mu.Lock()
	defer r.mu.Unlock()

	if r.stopped {
		// we are stopping, don't run another ensure
		return nil
	}

	// Locks must be acquired in the same order everywhere.
	r.state.Lock()
	defer r.state.Unlock()

	r.someBlocked = false
	running := make([]*Task, 0, len(r.tombs))
	for tid := range r.tombs {
		t := r.state.Task(tid)
		if t != nil {
			running = append(running, t)
		}
	}

	ensureTime := timeNow()
	nextTaskTime := time.Time{}
ConsiderTasks:
	for _, t := range r.state.Tasks() {
		handlers := r.handlerPair(t)
		if handlers.do == nil {
			// Handled by a different runner instance.
			continue
		}

		tb := r.tombs[t.ID()]

		if t.Status() == AbortStatus {
			if tb != nil {
				tb.Kill(nil)
				continue
			}
			r.tryUndo(t)
		}

		if tb != nil {
			// Already being handled.
			continue
		}

		status := t.Status()
		if status.Ready() {
			if !t.IsClean() {
				r.clean(t)
			}
			continue
		}

		if mustWait(t) {
			// Dependencies still unhandled.
			continue
		}

		if status == UndoStatus && handlers.undo == nil {
			// Although this has no dependencies itself, it must have waited
			// above too since follow-up tasks may have handlers again.
			// Cannot undo. Revert to done status.
			t.SetStatus(DoneStatus)
			if len(t.WaitTasks()) > 0 {
				r.state.EnsureBefore(0)
			}
			continue
		}

		// skip tasks scheduled for later and also track the earliest one
		tWhen := t.AtTime()
		if !tWhen.IsZero() && ensureTime.Before(tWhen) {
			if nextTaskTime.IsZero() || nextTaskTime.After(tWhen) {
				nextTaskTime = tWhen
			}
			continue
		}

		// check if any of the blocked predicates returns true
		// and skip the task if so
		for _, blocked := range r.blocked {
			if blocked(t, running) {
				r.someBlocked = true
				continue ConsiderTasks
			}
		}

		logger.Debugf("Running task %s on %s: %s", t.ID(), t.Status(), t.Summary())
		r.run(t)

		running = append(running, t)
	}

	// schedule next Ensure no later than the next task time
	if !nextTaskTime.IsZero() {
		r.state.EnsureBefore(nextTaskTime.Sub(ensureTime))
	}

	return nil
}

// mustWait returns whether task t must wait for other tasks to be done.
func mustWait(t *Task) bool {
	switch t.Status() {
	case DoStatus:
		for _, wt := range t.WaitTasks() {
			if wt.Status() != DoneStatus {
				return true
			}
		}
	case UndoStatus:
		for _, ht := range t.HaltTasks() {
			if !ht.Status().Ready() {
				return true
			}
		}
	}
	return false
}

// wait expects to be called with the r.mu lock held
func (r *TaskRunner) wait() {
	for len(r.tombs) > 0 {
		for _, t := range r.tombs {
			r.mu.Unlock()
			t.Wait()
			r.mu.Lock()
			break
		}
	}
}

// Stop kills all concurrent activities and returns after that's done.
func (r *TaskRunner) Stop() {
	r.mu.Lock()
	defer r.mu.Unlock()

	r.stopped = true

	for _, tb := range r.tombs {
		tb.Kill(nil)
	}

	r.wait()
}

// Wait waits for all concurrent activities and returns after that's done.
func (r *TaskRunner) Wait() {
	r.mu.Lock()
	defer r.mu.Unlock()

	r.wait()
}
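
// The sketch below is illustrative only and not part of the original file: it
// shows how a runner is typically driven, with Ensure called while the state
// is unlocked and Stop used for shutdown. In snapd the Ensure loop is driven
// by the overlord; the kind, change and summary strings here are assumptions
// made for the example, and New(nil) creates a state without a backend as in
// this package's tests.
func exampleRunnerLoop() {
	st := New(nil)
	runner := NewTaskRunner(st)
	runner.AddHandler("example-kind",
		func(t *Task, tb *tomb.Tomb) error { return nil },
		nil)

	st.Lock()
	chg := st.NewChange("example-change", "An example change")
	chg.AddTask(st.NewTask("example-kind", "An example task"))
	st.Unlock()

	// One pass: start goroutines for runnable tasks, then wait for them.
	if err := runner.Ensure(); err != nil {
		logger.Noticef("ensure failed: %v", err)
	}
	runner.Wait()
	runner.Stop()
}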

// StopKinds kills all concurrent tasks of the given kinds and returns
// after that's done.
func (r *TaskRunner) StopKinds(kind ...string) {
	r.mu.Lock()
	defer r.mu.Unlock()

	kinds := make(map[string]bool, len(kind))
	for _, k := range kind {
		kinds[k] = true
	}

	var tombs []*tomb.Tomb
	// Locks must be acquired in the same order everywhere:
	// r.mu, r.state
	r.state.Lock()
	for tid, tb := range r.tombs {
		task := r.state.Task(tid)
		if task == nil || !kinds[task.Kind()] {
			continue
		}
		tombs = append(tombs, tb)
		tb.Kill(nil)
	}
	r.state.Unlock()

	for _, tb := range tombs {
		r.mu.Unlock()
		tb.Wait()
		r.mu.Lock()
	}
}
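
// The sketch below is illustrative only and not part of the original file: it
// shows StopKinds being used to kill and wait for tasks of selected kinds
// only, leaving any other running tasks untouched. The kind names are
// assumptions made for the example.
func exampleSelectiveStop(runner *TaskRunner) {
	// Kill the tombs of matching running tasks and wait for them to exit.
	runner.StopKinds("example-download", "example-verify")
}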