github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/util/stop/stopper.go (about) 1 // Copyright 2014 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package stop 12 13 import ( 14 "context" 15 "fmt" 16 "net/http" 17 "sort" 18 "strings" 19 "sync" 20 "time" 21 22 "github.com/cockroachdb/cockroach/pkg/roachpb" 23 "github.com/cockroachdb/cockroach/pkg/settings" 24 "github.com/cockroachdb/cockroach/pkg/util/caller" 25 "github.com/cockroachdb/cockroach/pkg/util/log" 26 "github.com/cockroachdb/cockroach/pkg/util/quotapool" 27 "github.com/cockroachdb/cockroach/pkg/util/syncutil" 28 "github.com/cockroachdb/cockroach/pkg/util/tracing" 29 "github.com/cockroachdb/errors" 30 opentracing "github.com/opentracing/opentracing-go" 31 ) 32 33 const asyncTaskNamePrefix = "[async] " 34 35 // ErrThrottled is returned from RunLimitedAsyncTask in the event that there 36 // is no more capacity for async tasks, as limited by the semaphore. 37 var ErrThrottled = errors.New("throttled on async limiting semaphore") 38 39 // ErrUnavailable indicates that the server is quiescing and is unable to 40 // process new work. 41 var ErrUnavailable = &roachpb.NodeUnavailableError{} 42 43 func register(s *Stopper) { 44 trackedStoppers.Lock() 45 trackedStoppers.stoppers = append(trackedStoppers.stoppers, s) 46 trackedStoppers.Unlock() 47 } 48 49 func unregister(s *Stopper) { 50 trackedStoppers.Lock() 51 defer trackedStoppers.Unlock() 52 sl := trackedStoppers.stoppers 53 for i, tracked := range sl { 54 if tracked == s { 55 trackedStoppers.stoppers = sl[:i+copy(sl[i:], sl[i+1:])] 56 return 57 } 58 } 59 panic("attempt to unregister untracked stopper") 60 } 61 62 var trackedStoppers struct { 63 syncutil.Mutex 64 stoppers []*Stopper 65 } 66 67 // HandleDebug responds with the list of stopper tasks actively running. 68 func HandleDebug(w http.ResponseWriter, r *http.Request) { 69 w.Header().Set("Content-Type", "text/plain; charset=utf-8") 70 trackedStoppers.Lock() 71 defer trackedStoppers.Unlock() 72 for _, s := range trackedStoppers.stoppers { 73 s.mu.Lock() 74 fmt.Fprintf(w, "%p: %d tasks\n%s", s, s.mu.numTasks, s.runningTasksLocked()) 75 s.mu.Unlock() 76 } 77 } 78 79 // Closer is an interface for objects to attach to the stopper to 80 // be closed once the stopper completes. 81 type Closer interface { 82 Close() 83 } 84 85 // CloserFn is type that allows any function to be a Closer. 86 type CloserFn func() 87 88 // Close implements the Closer interface. 89 func (f CloserFn) Close() { 90 f() 91 } 92 93 // A Stopper provides a channel-based mechanism to stop an arbitrary 94 // array of workers. Each worker is registered with the stopper via 95 // the RunWorker() method. The system further allows execution of functions 96 // through RunTask() and RunAsyncTask(). 97 // 98 // Stopping occurs in two phases: the first is the request to stop, which moves 99 // the stopper into a quiescing phase. While quiescing, calls to RunTask() & 100 // RunAsyncTask() don't execute the function passed in and return ErrUnavailable. 101 // When all outstanding tasks have been completed, the stopper 102 // closes its stopper channel, which signals all live workers that it's safe to 103 // shut down. When all workers have shutdown, the stopper is complete. 104 // 105 // An arbitrary list of objects implementing the Closer interface may 106 // be added to the stopper via AddCloser(), to be closed after the 107 // stopper has stopped. 108 type Stopper struct { 109 quiescer chan struct{} // Closed when quiescing 110 stopper chan struct{} // Closed when stopping 111 stopped chan struct{} // Closed when stopped completely 112 onPanic func(interface{}) // called with recover() on panic on any goroutine 113 stop sync.WaitGroup // Incremented for outstanding workers 114 mu struct { 115 syncutil.Mutex 116 quiesce *sync.Cond // Conditional variable to wait for outstanding tasks 117 quiescing bool // true when Stop() has been called 118 numTasks int // number of outstanding tasks 119 tasks TaskMap 120 closers []Closer 121 idAlloc int 122 qCancels map[int]func() 123 sCancels map[int]func() 124 125 stopCalled bool // turns all but first call to Stop into noop 126 } 127 } 128 129 // An Option can be passed to NewStopper. 130 type Option interface { 131 apply(*Stopper) 132 } 133 134 type optionPanicHandler func(interface{}) 135 136 func (oph optionPanicHandler) apply(stopper *Stopper) { 137 stopper.onPanic = oph 138 } 139 140 // OnPanic is an option which lets the Stopper recover from all panics using 141 // the provided panic handler. 142 // 143 // When Stop() is invoked during stack unwinding, OnPanic is also invoked, but 144 // Stop() may not have carried out its duties. 145 func OnPanic(handler func(interface{})) Option { 146 return optionPanicHandler(handler) 147 } 148 149 // NewStopper returns an instance of Stopper. 150 func NewStopper(options ...Option) *Stopper { 151 s := &Stopper{ 152 quiescer: make(chan struct{}), 153 stopper: make(chan struct{}), 154 stopped: make(chan struct{}), 155 } 156 157 s.mu.tasks = TaskMap{} 158 s.mu.qCancels = map[int]func(){} 159 s.mu.sCancels = map[int]func(){} 160 161 for _, opt := range options { 162 opt.apply(s) 163 } 164 165 s.mu.quiesce = sync.NewCond(&s.mu) 166 register(s) 167 return s 168 } 169 170 // Recover is used internally by Stopper to provide a hook for recovery of 171 // panics on goroutines started by the Stopper. It can also be invoked 172 // explicitly (via "defer s.Recover()") on goroutines that are created outside 173 // of Stopper. 174 func (s *Stopper) Recover(ctx context.Context) { 175 if r := recover(); r != nil { 176 if s.onPanic != nil { 177 s.onPanic(r) 178 return 179 } 180 if sv := settings.TODO(); sv != nil { 181 log.ReportPanic(ctx, sv, r, 1) 182 } 183 panic(r) 184 } 185 } 186 187 // RunWorker runs the supplied function as a "worker" to be stopped 188 // by the stopper. The function <f> is run in a goroutine. 189 func (s *Stopper) RunWorker(ctx context.Context, f func(context.Context)) { 190 s.stop.Add(1) 191 go func() { 192 // Remove any associated span; we need to ensure this because the 193 // worker may run longer than the caller which presumably closes 194 // any spans it has created. 195 ctx = opentracing.ContextWithSpan(ctx, nil) 196 defer s.Recover(ctx) 197 defer s.stop.Done() 198 f(ctx) 199 }() 200 } 201 202 // AddCloser adds an object to close after the stopper has been stopped. 203 // 204 // WARNING: memory resources acquired by this method will stay around for 205 // the lifetime of the Stopper. Use with care to avoid leaking memory. 206 func (s *Stopper) AddCloser(c Closer) { 207 s.mu.Lock() 208 defer s.mu.Unlock() 209 select { 210 case <-s.stopper: 211 // Close immediately. 212 c.Close() 213 default: 214 s.mu.closers = append(s.mu.closers, c) 215 } 216 } 217 218 // WithCancelOnQuiesce returns a child context which is canceled when the 219 // returned cancel function is called or when the Stopper begins to quiesce, 220 // whichever happens first. 221 // 222 // Canceling this context releases resources associated with it, so code should 223 // call cancel as soon as the operations running in this Context complete. 224 func (s *Stopper) WithCancelOnQuiesce(ctx context.Context) (context.Context, func()) { 225 return s.withCancel(ctx, s.mu.qCancels, s.quiescer) 226 } 227 228 // WithCancelOnStop returns a child context which is canceled when the 229 // returned cancel function is called or when the Stopper begins to stop, 230 // whichever happens first. 231 // 232 // Canceling this context releases resources associated with it, so code should 233 // call cancel as soon as the operations running in this Context complete. 234 func (s *Stopper) WithCancelOnStop(ctx context.Context) (context.Context, func()) { 235 return s.withCancel(ctx, s.mu.sCancels, s.stopper) 236 } 237 238 func (s *Stopper) withCancel( 239 ctx context.Context, cancels map[int]func(), cancelCh chan struct{}, 240 ) (context.Context, func()) { 241 var cancel func() 242 ctx, cancel = context.WithCancel(ctx) 243 s.mu.Lock() 244 defer s.mu.Unlock() 245 select { 246 case <-cancelCh: 247 // Cancel immediately. 248 cancel() 249 return ctx, func() {} 250 default: 251 id := s.mu.idAlloc 252 s.mu.idAlloc++ 253 cancels[id] = cancel 254 return ctx, func() { 255 cancel() 256 s.mu.Lock() 257 defer s.mu.Unlock() 258 delete(cancels, id) 259 } 260 } 261 } 262 263 // RunTask adds one to the count of tasks left to quiesce in the system. 264 // Any worker which is a "first mover" when starting tasks must call this method 265 // before starting work on a new task. First movers include goroutines launched 266 // to do periodic work and the kv/db.go gateway which accepts external client 267 // requests. 268 // 269 // taskName is used as the "operation" field of the span opened for this task 270 // and is visible in traces. It's also part of reports printed by stoppers 271 // waiting to stop. The convention is 272 // <package name>.<struct name>: <succinct description of the task's action> 273 // 274 // Returns an error to indicate that the system is currently quiescing and 275 // function f was not called. 276 func (s *Stopper) RunTask(ctx context.Context, taskName string, f func(context.Context)) error { 277 if !s.runPrelude(taskName) { 278 return ErrUnavailable 279 } 280 281 // Call f. 282 defer s.Recover(ctx) 283 defer s.runPostlude(taskName) 284 285 f(ctx) 286 return nil 287 } 288 289 // RunTaskWithErr is like RunTask(), but takes in a callback that can return an 290 // error. The error is returned to the caller. 291 func (s *Stopper) RunTaskWithErr( 292 ctx context.Context, taskName string, f func(context.Context) error, 293 ) error { 294 if !s.runPrelude(taskName) { 295 return ErrUnavailable 296 } 297 298 // Call f. 299 defer s.Recover(ctx) 300 defer s.runPostlude(taskName) 301 302 return f(ctx) 303 } 304 305 // RunAsyncTask is like RunTask, except the callback is run in a goroutine. The 306 // method doesn't block for the callback to finish execution. 307 func (s *Stopper) RunAsyncTask( 308 ctx context.Context, taskName string, f func(context.Context), 309 ) error { 310 taskName = asyncTaskNamePrefix + taskName 311 if !s.runPrelude(taskName) { 312 return ErrUnavailable 313 } 314 315 ctx, span := tracing.ForkCtxSpan(ctx, taskName) 316 317 // Call f. 318 go func() { 319 defer s.Recover(ctx) 320 defer s.runPostlude(taskName) 321 defer tracing.FinishSpan(span) 322 323 f(ctx) 324 }() 325 return nil 326 } 327 328 // RunLimitedAsyncTask runs function f in a goroutine, using the given 329 // channel as a semaphore to limit the number of tasks that are run 330 // concurrently to the channel's capacity. If wait is true, blocks 331 // until the semaphore is available in order to push back on callers 332 // that may be trying to create many tasks. If wait is false, returns 333 // immediately with an error if the semaphore is not 334 // available. It is the caller's responsibility to ensure that sem is 335 // closed when the stopper is quiesced. For quotapools which live for the 336 // lifetime of the stopper, it is generally best to register the sem with the 337 // stopper using AddCloser. 338 func (s *Stopper) RunLimitedAsyncTask( 339 ctx context.Context, taskName string, sem *quotapool.IntPool, wait bool, f func(context.Context), 340 ) (err error) { 341 // Wait for permission to run from the semaphore. 342 var alloc *quotapool.IntAlloc 343 if wait { 344 alloc, err = sem.Acquire(ctx, 1) 345 } else { 346 alloc, err = sem.TryAcquire(ctx, 1) 347 } 348 if errors.Is(err, quotapool.ErrNotEnoughQuota) { 349 err = ErrThrottled 350 } else if quotapool.HasErrClosed(err) { 351 err = ErrUnavailable 352 } 353 if err != nil { 354 return err 355 } 356 defer func() { 357 // If the err is non-nil then we know that we did not start the async task 358 // and thus we need to release the acquired quota. If it is nil then we 359 // did start the task and it will release the quota. 360 if err != nil { 361 alloc.Release() 362 } 363 }() 364 365 // Check for canceled context: it's possible to get the semaphore even 366 // if the context is canceled. 367 if ctx.Err() != nil { 368 return ctx.Err() 369 } 370 if !s.runPrelude(taskName) { 371 return ErrUnavailable 372 } 373 374 ctx, span := tracing.ForkCtxSpan(ctx, taskName) 375 376 go func() { 377 defer s.Recover(ctx) 378 defer s.runPostlude(taskName) 379 defer alloc.Release() 380 defer tracing.FinishSpan(span) 381 382 f(ctx) 383 }() 384 return nil 385 } 386 387 func (s *Stopper) runPrelude(taskName string) bool { 388 s.mu.Lock() 389 defer s.mu.Unlock() 390 if s.mu.quiescing { 391 return false 392 } 393 s.mu.numTasks++ 394 s.mu.tasks[taskName]++ 395 return true 396 } 397 398 func (s *Stopper) runPostlude(taskName string) { 399 s.mu.Lock() 400 defer s.mu.Unlock() 401 s.mu.numTasks-- 402 s.mu.tasks[taskName]-- 403 s.mu.quiesce.Broadcast() 404 } 405 406 // NumTasks returns the number of active tasks. 407 func (s *Stopper) NumTasks() int { 408 s.mu.Lock() 409 defer s.mu.Unlock() 410 return s.mu.numTasks 411 } 412 413 // A TaskMap is returned by RunningTasks(). 414 type TaskMap map[string]int 415 416 // String implements fmt.Stringer and returns a sorted multi-line listing of 417 // the TaskMap. 418 func (tm TaskMap) String() string { 419 var lines []string 420 for location, num := range tm { 421 lines = append(lines, fmt.Sprintf("%-6d %s", num, location)) 422 } 423 sort.Sort(sort.Reverse(sort.StringSlice(lines))) 424 return strings.Join(lines, "\n") 425 } 426 427 // RunningTasks returns a map containing the count of running tasks keyed by 428 // call site. 429 func (s *Stopper) RunningTasks() TaskMap { 430 s.mu.Lock() 431 defer s.mu.Unlock() 432 return s.runningTasksLocked() 433 } 434 435 func (s *Stopper) runningTasksLocked() TaskMap { 436 m := TaskMap{} 437 for k := range s.mu.tasks { 438 if s.mu.tasks[k] == 0 { 439 continue 440 } 441 m[k] = s.mu.tasks[k] 442 } 443 return m 444 } 445 446 // Stop signals all live workers to stop and then waits for each to 447 // confirm it has stopped. 448 func (s *Stopper) Stop(ctx context.Context) { 449 s.mu.Lock() 450 stopCalled := s.mu.stopCalled 451 s.mu.stopCalled = true 452 s.mu.Unlock() 453 454 if stopCalled { 455 return 456 } 457 458 defer s.Recover(ctx) 459 defer unregister(s) 460 461 if log.V(1) { 462 file, line, _ := caller.Lookup(1) 463 log.Infof(ctx, 464 "stop has been called from %s:%d, stopping or quiescing all running tasks", file, line) 465 } 466 // Don't bother doing stuff cleanly if we're panicking, that would likely 467 // block. Instead, best effort only. This cleans up the stack traces, 468 // avoids stalls and helps some tests in `./cli` finish cleanly (where 469 // panics happen on purpose). 470 if r := recover(); r != nil { 471 go s.Quiesce(ctx) 472 close(s.stopper) 473 close(s.stopped) 474 s.mu.Lock() 475 for _, c := range s.mu.closers { 476 go c.Close() 477 } 478 s.mu.Unlock() 479 panic(r) 480 } 481 482 s.Quiesce(ctx) 483 s.mu.Lock() 484 for _, cancel := range s.mu.sCancels { 485 cancel() 486 } 487 close(s.stopper) 488 s.mu.Unlock() 489 490 s.stop.Wait() 491 s.mu.Lock() 492 defer s.mu.Unlock() 493 for _, c := range s.mu.closers { 494 c.Close() 495 } 496 close(s.stopped) 497 } 498 499 // ShouldQuiesce returns a channel which will be closed when Stop() has been 500 // invoked and outstanding tasks should begin to quiesce. 501 func (s *Stopper) ShouldQuiesce() <-chan struct{} { 502 if s == nil { 503 // A nil stopper will never signal ShouldQuiesce, but will also never panic. 504 return nil 505 } 506 return s.quiescer 507 } 508 509 // ShouldStop returns a channel which will be closed when Stop() has been 510 // invoked and outstanding tasks have quiesced. 511 func (s *Stopper) ShouldStop() <-chan struct{} { 512 if s == nil { 513 // A nil stopper will never signal ShouldStop, but will also never panic. 514 return nil 515 } 516 return s.stopper 517 } 518 519 // IsStopped returns a channel which will be closed after Stop() has 520 // been invoked to full completion, meaning all workers have completed 521 // and all closers have been closed. 522 func (s *Stopper) IsStopped() <-chan struct{} { 523 if s == nil { 524 return nil 525 } 526 return s.stopped 527 } 528 529 // Quiesce moves the stopper to state quiescing and waits until all 530 // tasks complete. This is used from Stop() and unittests. 531 func (s *Stopper) Quiesce(ctx context.Context) { 532 defer s.Recover(ctx) 533 s.mu.Lock() 534 defer s.mu.Unlock() 535 for _, cancel := range s.mu.qCancels { 536 cancel() 537 } 538 if !s.mu.quiescing { 539 log.Infof(ctx, "quiescing") 540 s.mu.quiescing = true 541 close(s.quiescer) 542 } 543 for s.mu.numTasks > 0 { 544 t := time.AfterFunc(5*time.Second, func() { 545 // If we're waiting for 5+s without a task terminating, log the ones 546 // that remain. 547 log.Infof(ctx, "quiescing; tasks left:\n%s", s.RunningTasks()) 548 }) 549 // Unlock s.mu, wait for the signal, and lock s.mu. 550 s.mu.quiesce.Wait() 551 t.Stop() 552 } 553 }