github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/watchdog/watchdog.go

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package watchdog is responsible for monitoring the sentry for tasks that may
    16  // potentially be stuck or looping indeterminately, causing hard-to-debug hangs in
    17  // the untrusted app.
    18  //
    19  // It works by periodically querying all tasks to check whether they are in user
    20  // mode (RunUser), kernel mode (RunSys), or blocked in the kernel (OffCPU). Tasks
    21  // that have been running in kernel mode for a long time in the same syscall
    22  // without blocking are considered stuck and are reported.
    23  //
    24  // When a stuck task is detected, the watchdog can take one of the following actions:
    25  //  1. LogWarning: Logs a warning message followed by a stack dump of all goroutines.
    26  //     If a task continues to be stuck, the message repeats every minute, unless
    27  //     a new stuck task is detected.
    28  //  2. Panic: Same as above, followed by a call to panic().
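        //
        // A minimal usage sketch (illustrative, not taken from the original sources;
        // it assumes a *kernel.Kernel value named k is already available):
        //
        //	w := watchdog.New(k, watchdog.DefaultOpts)
        //	w.Start()
        //	defer w.Stop()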
    29  package watchdog
    30  
    31  import (
    32  	"bytes"
    33  	"fmt"
    34  	"time"
    35  
    36  	"github.com/metacubex/gvisor/pkg/abi/linux"
    37  	"github.com/metacubex/gvisor/pkg/log"
    38  	"github.com/metacubex/gvisor/pkg/metric"
    39  	"github.com/metacubex/gvisor/pkg/sentry/kernel"
    40  	ktime "github.com/metacubex/gvisor/pkg/sentry/kernel/time"
    41  	"github.com/metacubex/gvisor/pkg/sync"
    42  )
    43  
    44  // Opts configures the watchdog.
    45  type Opts struct {
    46  	// TaskTimeout is the amount of time to allow a task to execute the
    47  	// same syscall without blocking before it's declared stuck.
    48  	TaskTimeout time.Duration
    49  
    50  	// TaskTimeoutAction indicates what action to take when a stuck task
    51  	// is detected.
    52  	TaskTimeoutAction Action
    53  
    54  	// StartupTimeout is the amount of time to allow between watchdog
    55  	// creation and calling watchdog.Start.
    56  	StartupTimeout time.Duration
    57  
    58  	// StartupTimeoutAction indicates what action to take when
    59  	// watchdog.Start is not called within the timeout.
    60  	StartupTimeoutAction Action
    61  }
    62  
    63  // DefaultOpts is a default set of options for the watchdog.
    64  var DefaultOpts = Opts{
    65  	// Task timeout.
    66  	TaskTimeout:       3 * time.Minute,
    67  	TaskTimeoutAction: LogWarning,
    68  
    69  	// Startup timeout.
    70  	StartupTimeout:       30 * time.Second,
    71  	StartupTimeoutAction: LogWarning,
    72  }
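
        // As an illustrative sketch (not part of the original sources), a caller that
        // prefers to crash the sandbox when a stuck task is found could override the
        // defaults along these lines, where k is an assumed *kernel.Kernel:
        //
        //	opts := watchdog.Opts{
        //		TaskTimeout:          5 * time.Minute,
        //		TaskTimeoutAction:    watchdog.Panic,
        //		StartupTimeout:       30 * time.Second,
        //		StartupTimeoutAction: watchdog.LogWarning,
        //	}
        //	w := watchdog.New(k, opts)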
    73  
    74  // descheduleThreshold is the amount of time scheduling needs to be off before the entire wait period
    75  // is discounted from the task's last update time. It's set high enough that small scheduling delays
    76  // won't trigger it.
    77  const descheduleThreshold = 1 * time.Second
    78  
    79  // Amount of time to wait before dumping the stack to the log again when the same task(s) remains stuck.
    80  var stackDumpSameTaskPeriod = time.Minute
    81  
    82  // Action defines what action to take when a stuck task is detected.
    83  type Action int
    84  
    85  const (
    86  	// LogWarning logs a warning message followed by a stack trace.
    87  	LogWarning Action = iota
    88  
    89  	// Panic will do the same logging as LogWarning and panic().
    90  	Panic
    91  )
    92  
    93  // Set implements flag.Value.
    94  func (a *Action) Set(v string) error {
    95  	switch v {
    96  	case "log", "logwarning":
    97  		*a = LogWarning
    98  	case "panic":
    99  		*a = Panic
   100  	default:
   101  		return fmt.Errorf("invalid watchdog action %q", v)
   102  	}
   103  	return nil
   104  }
   105  
   106  // Get implements flag.Getter.
   107  func (a *Action) Get() any {
   108  	return *a
   109  }
   110  
   111  // String returns Action's string representation.
   112  func (a Action) String() string {
   113  	switch a {
   114  	case LogWarning:
   115  		return "logWarning"
   116  	case Panic:
   117  		return "panic"
   118  	default:
   119  		panic(fmt.Sprintf("Invalid watchdog action: %d", a))
   120  	}
   121  }
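
        // Because *Action implements flag.Value, it can be wired directly to a
        // command-line flag. A hypothetical wiring sketch (the flag name is made up):
        //
        //	var action watchdog.Action
        //	flag.Var(&action, "watchdog-action", "watchdog action to take: log|panic")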
   122  
   123  // Watchdog is the main watchdog type. It controls a goroutine that periodically
   124  // analyzes all tasks and reports if any of them appear to be stuck.
   125  type Watchdog struct {
   126  	// Configuration options are embedded.
   127  	Opts
   128  
   129  	// period indicates how often to check all tasks. It's calculated based on
   130  	// opts.TaskTimeout.
   131  	period time.Duration
   132  
   133  	// k is where the tasks come from.
   134  	k *kernel.Kernel
   135  
   136  	// stop is used to notify the watchdog that it should stop.
   137  	stop chan struct{}
   138  
   139  	// done is used to notify when the watchdog has stopped.
   140  	done chan struct{}
   141  
   142  	// offenders map contains all tasks that are currently stuck.
   143  	offenders map[*kernel.Task]*offender
   144  
   145  	// lastStackDump tracks the last time a stack dump was generated to prevent
   146  	// spamming the log.
   147  	lastStackDump time.Time
   148  
   149  	// lastRun is set to the last time the watchdog executed a monitoring loop.
   150  	lastRun ktime.Time
   151  
   152  	// mu protects the fields below.
   153  	mu sync.Mutex
   154  
   155  	// running is true if the watchdog is running.
   156  	running bool
   157  
   158  	// startCalled is true if Start has ever been called. It remains true
   159  	// even if Stop is called.
   160  	startCalled bool
   161  }
   162  
   163  type offender struct {
   164  	lastUpdateTime ktime.Time
   165  }
   166  
   167  // New creates a new watchdog.
   168  func New(k *kernel.Kernel, opts Opts) *Watchdog {
   169  	// 4 is arbitrary, just don't want to prolong 'TaskTimeout' too much.
   170  	period := opts.TaskTimeout / 4
   171  	w := &Watchdog{
   172  		Opts:      opts,
   173  		k:         k,
   174  		period:    period,
   175  		offenders: make(map[*kernel.Task]*offender),
   176  		stop:      make(chan struct{}),
   177  		done:      make(chan struct{}),
   178  	}
   179  
   180  	// Handle StartupTimeout if it exists.
   181  	if w.StartupTimeout > 0 {
   182  		log.Infof("Watchdog waiting %v for startup", w.StartupTimeout)
   183  		go w.waitForStart() // S/R-SAFE: watchdog is stopped during save and restarted after restore.
   184  	}
   185  
   186  	return w
   187  }
   188  
   189  // Start starts the watchdog.
   190  func (w *Watchdog) Start() {
   191  	w.mu.Lock()
   192  	defer w.mu.Unlock()
   193  	w.startCalled = true
   194  
   195  	if w.running {
   196  		return
   197  	}
   198  
   199  	if w.TaskTimeout == 0 {
   200  		log.Infof("Watchdog task timeout disabled")
   201  		return
   202  	}
   203  	w.lastRun = w.k.MonotonicClock().Now()
   204  
   205  	log.Infof("Starting watchdog, period: %v, timeout: %v, action: %v", w.period, w.TaskTimeout, w.TaskTimeoutAction)
   206  	go w.loop() // S/R-SAFE: watchdog is stopped during save and restarted after restore.
   207  	w.running = true
   208  }
   209  
   210  // Stop requests the watchdog to stop and waits for it to do so.
   211  func (w *Watchdog) Stop() {
   212  	if w.TaskTimeout == 0 {
   213  		return
   214  	}
   215  
   216  	w.mu.Lock()
   217  	defer w.mu.Unlock()
   218  	if !w.running {
   219  		return
   220  	}
   221  	log.Infof("Stopping watchdog")
   222  	w.stop <- struct{}{}
   223  	<-w.done
   224  	w.running = false
   225  	log.Infof("Watchdog stopped")
   226  }
   227  
   228  // waitForStart waits for Start to be called and takes action if it does not
   229  // happen within the startup timeout.
   230  func (w *Watchdog) waitForStart() {
   231  	<-time.After(w.StartupTimeout)
   232  	w.mu.Lock()
   233  	defer w.mu.Unlock()
   234  	if w.startCalled {
   235  		// We are fine.
   236  		return
   237  	}
   238  
   239  	metric.WeirdnessMetric.Increment(&metric.WeirdnessTypeWatchdogStuckStartup)
   240  
   241  	var buf bytes.Buffer
   242  	buf.WriteString(fmt.Sprintf("Watchdog.Start() not called within %s", w.StartupTimeout))
   243  	w.doAction(w.StartupTimeoutAction, false, &buf)
   244  }
   245  
   246  // loop is the main watchdog routine. It only returns when 'Stop()' is called.
   247  func (w *Watchdog) loop() {
   248  	// Loop until someone stops it.
   249  	for {
   250  		select {
   251  		case <-w.stop:
   252  			w.done <- struct{}{}
   253  			return
   254  		case <-time.After(w.period):
   255  			w.runTurn()
   256  		}
   257  	}
   258  }
   259  
   260  // runTurn runs a single pass over all tasks and reports anything it finds.
   261  func (w *Watchdog) runTurn() {
   262  	// Someone needs to watch the watchdog. The call below can get stuck if there
   263  	// is a deadlock affecting root's PID namespace mutex. Run it in a goroutine
   264  	// and report if it takes too long to return.
   265  	var tasks []*kernel.Task
   266  	done := make(chan struct{})
   267  	go func() { // S/R-SAFE: watchdog is stopped and restarted during S/R.
   268  		tasks = w.k.TaskSet().Root.Tasks()
   269  		close(done)
   270  	}()
   271  
   272  	select {
   273  	case <-done:
   274  	case <-time.After(w.TaskTimeout):
   275  		// Report if the watchdog is not making progress.
   276  		// No one is watching the watchdog watcher though.
   277  		w.reportStuckWatchdog()
   278  		<-done
   279  	}
   280  
   281  	newOffenders := make(map[*kernel.Task]*offender)
   282  	newTaskFound := false
   283  	now := ktime.FromNanoseconds(int64(w.k.CPUClockNow() * uint64(linux.ClockTick)))
   284  
   285  	// The process may be running with a low CPU limit, making tasks appear stuck because
   286  	// they are starved of CPU cycles. As an estimate, tasks could have been starved
   287  	// since the last time the watchdog ran. If the watchdog detects that scheduling
   288  	// is off, it discounts the entire duration since the last run from 'lastUpdateTime'.
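        	// As a worked example: with the default TaskTimeout of 3 minutes the period is 45s;
        	// if the loop wakes up 55s after the previous run (10s past the expected period,
        	// above the 1s descheduleThreshold), the full 55s is discounted from each task's
        	// elapsed time.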
   289  	discount := time.Duration(0)
   290  	if now.Sub(w.lastRun.Add(w.period)) > descheduleThreshold {
   291  		discount = now.Sub(w.lastRun)
   292  	}
   293  	w.lastRun = now
   294  
   295  	log.Infof("Watchdog starting loop, tasks: %d, discount: %v", len(tasks), discount)
   296  	for _, t := range tasks {
   297  		tsched := t.TaskGoroutineSchedInfo()
   298  
   299  		// An offender is a task running inside the kernel for longer than the specified timeout.
   300  		if tsched.State == kernel.TaskGoroutineRunningSys {
   301  			lastUpdateTime := ktime.FromNanoseconds(int64(tsched.Timestamp * uint64(linux.ClockTick)))
   302  			elapsed := now.Sub(lastUpdateTime) - discount
   303  			if elapsed > w.TaskTimeout {
   304  				tc, ok := w.offenders[t]
   305  				if !ok {
   306  					// New stuck task detected.
   307  					//
   308  					// Note that tasks blocked doing IO may be considered stuck in kernel,
   309  					// unless they are surrounded by
   310  					// Task.UninterruptibleSleepStart/Finish.
   311  					tc = &offender{lastUpdateTime: lastUpdateTime}
   312  					metric.WeirdnessMetric.Increment(&metric.WeirdnessTypeWatchdogStuckTasks)
   313  					newTaskFound = true
   314  				}
   315  				newOffenders[t] = tc
   316  			}
   317  		}
   318  	}
   319  	if len(newOffenders) > 0 {
   320  		w.report(newOffenders, newTaskFound, now)
   321  	}
   322  
   323  	// Remember which tasks have been reported.
   324  	w.offenders = newOffenders
   325  }
   326  
   327  // report takes appropriate action when a stuck task is detected.
   328  func (w *Watchdog) report(offenders map[*kernel.Task]*offender, newTaskFound bool, now ktime.Time) {
   329  	var buf bytes.Buffer
   330  	buf.WriteString(fmt.Sprintf("Sentry detected %d stuck task(s):\n", len(offenders)))
   331  	for t, o := range offenders {
   332  		tid := w.k.TaskSet().Root.IDOfTask(t)
   333  		buf.WriteString(fmt.Sprintf("\tTask tid: %v (goroutine %d), entered RunSys state %v ago.\n", tid, t.GoroutineID(), now.Sub(o.lastUpdateTime)))
   334  	}
   335  	buf.WriteString("Search for 'goroutine <id>' in the stack dump to find the offending goroutine(s)")
   336  
   337  	// Force stack dump only if a new task is detected.
   338  	w.doAction(w.TaskTimeoutAction, newTaskFound, &buf)
   339  }
   340  
   341  func (w *Watchdog) reportStuckWatchdog() {
   342  	var buf bytes.Buffer
   343  	buf.WriteString("Watchdog goroutine is stuck")
   344  	w.doAction(w.TaskTimeoutAction, false, &buf)
   345  }
   346  
   347  // doAction will take the given action. If the action is LogWarning, the stack
   348  // is not always dumped to the log to prevent log flooding. "forceStack"
   349  // guarantees that the stack will be dumped regardless.
   350  func (w *Watchdog) doAction(action Action, forceStack bool, msg *bytes.Buffer) {
   351  	switch action {
   352  	case LogWarning:
   353  		// Dump the stack only if forced or if enough time has passed since the last
   354  		// time a stack dump was generated.
   355  		if !forceStack && time.Since(w.lastStackDump) < stackDumpSameTaskPeriod {
   356  			msg.WriteString("\n...[stack dump skipped]...")
   357  			log.Warningf(msg.String())
   358  			return
   359  		}
   360  		log.TracebackAll(msg.String())
   361  		w.lastStackDump = time.Now()
   362  
   363  	case Panic:
   364  		// Panic will skip over running tasks, which are likely the culprits here. So manually
   365  		// dump all stacks before panicking.
   366  		log.TracebackAll(msg.String())
   367  
   368  		// Attempt to flush metrics; time out and move on in case metrics are stuck as well.
   369  		metricsEmitted := make(chan struct{}, 1)
   370  		go func() { // S/R-SAFE: watchdog is stopped during save and restarted after restore.
   371  			// Flush metrics before killing process.
   372  			metric.EmitMetricUpdate()
   373  			metricsEmitted <- struct{}{}
   374  		}()
   375  		select {
   376  		case <-metricsEmitted:
   377  		case <-time.After(1 * time.Second):
   378  		}
   379  		panic(fmt.Sprintf("%s\nStacks for running Gs are skipped while panicking.", msg.String()))
   380  
   381  	default:
   382  		panic(fmt.Sprintf("Unknown watchdog action %v", action))
   383  
   384  	}
   385  }