github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/watchdog/watchdog.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package watchdog is responsible for monitoring the sentry for tasks that may
// potentially be stuck or looping indefinitely, causing hard-to-debug hangs in
// the untrusted app.
//
// It works by periodically querying all tasks to check whether they are in user
// mode (RunUser), kernel mode (RunSys), or blocked in the kernel (OffCPU). Tasks
// that have been running in kernel mode for a long time in the same syscall
// without blocking are considered stuck and are reported.
//
// When a stuck task is detected, the watchdog can take one of the following actions:
//		1. LogWarning: Logs a warning message followed by a stack dump of all goroutines.
//			 If a task continues to be stuck, the message will repeat every minute, unless
//			 a new stuck task is detected.
//		2. Panic: same as above, followed by panic().
//
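// A minimal usage sketch (k is assumed to be the sentry's *kernel.Kernel,
// obtained elsewhere):
//
//	w := watchdog.New(k, watchdog.DefaultOpts)
//	w.Start()
//	defer w.Stop()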
package watchdog

import (
	"bytes"
	"fmt"
	"time"

	"github.com/SagerNet/gvisor/pkg/abi/linux"
	"github.com/SagerNet/gvisor/pkg/log"
	"github.com/SagerNet/gvisor/pkg/metric"
	"github.com/SagerNet/gvisor/pkg/sentry/kernel"
	ktime "github.com/SagerNet/gvisor/pkg/sentry/kernel/time"
	"github.com/SagerNet/gvisor/pkg/sync"
)

// Opts configures the watchdog.
type Opts struct {
	// TaskTimeout is the amount of time to allow a task to execute the
	// same syscall without blocking before it's declared stuck.
	TaskTimeout time.Duration

	// TaskTimeoutAction indicates what action to take when a stuck task
	// is detected.
	TaskTimeoutAction Action

	// StartupTimeout is the amount of time to allow between watchdog
	// creation and calling watchdog.Start.
	StartupTimeout time.Duration

	// StartupTimeoutAction indicates what action to take when
	// watchdog.Start is not called within the timeout.
	StartupTimeoutAction Action
}

// DefaultOpts is a default set of options for the watchdog.
var DefaultOpts = Opts{
	// Task timeout.
	TaskTimeout:       3 * time.Minute,
	TaskTimeoutAction: LogWarning,

	// Startup timeout.
	StartupTimeout:       30 * time.Second,
	StartupTimeoutAction: LogWarning,
}

// descheduleThreshold is the amount of time scheduling needs to be off before the entire wait period
// is discounted from the task's last update time. It's set high enough that small scheduling delays won't
// trigger it.
const descheduleThreshold = 1 * time.Second

// stackDumpSameTaskPeriod is the amount of time to wait before dumping the stack to the log again
// when the same task(s) remains stuck.
var stackDumpSameTaskPeriod = time.Minute

// Action defines what action to take when a stuck task is detected.
type Action int

const (
	// LogWarning logs a warning message followed by a stack trace.
	LogWarning Action = iota

	// Panic will do the same logging as LogWarning and then panic().
	Panic
)

// Set implements flag.Value. Accepted values are "log", "logwarning", and "panic".
func (a *Action) Set(v string) error {
	switch v {
	case "log", "logwarning":
		*a = LogWarning
	case "panic":
		*a = Panic
	default:
		return fmt.Errorf("invalid watchdog action %q", v)
	}
	return nil
}

// Get implements flag.Getter.
func (a *Action) Get() interface{} {
	return *a
}

// String returns Action's string representation.
func (a Action) String() string {
	switch a {
	case LogWarning:
		return "logWarning"
	case Panic:
		return "panic"
	default:
		panic(fmt.Sprintf("Invalid watchdog action: %d", a))
	}
}
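
// An *Action satisfies flag.Value (pointer-receiver Set plus String above), so
// it can be registered as a command-line flag. A small sketch, using a
// hypothetical flag name:
//
//	action := watchdog.LogWarning
//	flag.Var(&action, "watchdog-action", "action to take when a stuck task is detected")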

// Watchdog is the main watchdog class. It controls a goroutine that periodically
// analyses all tasks and reports if any of them appear to be stuck.
type Watchdog struct {
	// Configuration options are embedded.
	Opts

	// period indicates how often to check all tasks. It's calculated based on
	// opts.TaskTimeout.
	period time.Duration

	// k is where the tasks come from.
	k *kernel.Kernel

	// stop is used to notify the watchdog that it should stop.
	stop chan struct{}

	// done is used to notify when the watchdog has stopped.
	done chan struct{}

	// offenders map contains all tasks that are currently stuck.
	offenders map[*kernel.Task]*offender

	// lastStackDump tracks the last time a stack dump was generated to prevent
	// spamming the log.
	lastStackDump time.Time

	// lastRun is set to the last time the watchdog executed a monitoring loop.
	lastRun ktime.Time

	// mu protects the fields below.
	mu sync.Mutex

	// running is true if the watchdog is running.
	running bool

	// startCalled is true if Start has ever been called. It remains true
	// even if Stop is called.
	startCalled bool
}

type offender struct {
	lastUpdateTime ktime.Time
}

// New creates a new watchdog.
func New(k *kernel.Kernel, opts Opts) *Watchdog {
	// The divisor 4 is arbitrary; it just keeps the check period from
	// prolonging 'TaskTimeout' detection too much.
	period := opts.TaskTimeout / 4
	w := &Watchdog{
		Opts:      opts,
		k:         k,
		period:    period,
		offenders: make(map[*kernel.Task]*offender),
		stop:      make(chan struct{}),
		done:      make(chan struct{}),
	}

	// Handle StartupTimeout if it exists.
	if w.StartupTimeout > 0 {
		log.Infof("Watchdog waiting %v for startup", w.StartupTimeout)
		go w.waitForStart() // S/R-SAFE: watchdog is stopped during save and restarted after restore.
	}

	return w
}

// Start starts the watchdog.
func (w *Watchdog) Start() {
	w.mu.Lock()
	defer w.mu.Unlock()
	w.startCalled = true

	if w.running {
		return
	}

	if w.TaskTimeout == 0 {
		log.Infof("Watchdog task timeout disabled")
		return
	}
	w.lastRun = w.k.MonotonicClock().Now()

	log.Infof("Starting watchdog, period: %v, timeout: %v, action: %v", w.period, w.TaskTimeout, w.TaskTimeoutAction)
	go w.loop() // S/R-SAFE: watchdog is stopped during save and restarted after restore.
	w.running = true
}

// Stop requests the watchdog to stop and waits for it.
func (w *Watchdog) Stop() {
	if w.TaskTimeout == 0 {
		return
	}

	w.mu.Lock()
	defer w.mu.Unlock()
	if !w.running {
		return
	}
	log.Infof("Stopping watchdog")
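	// Signal the loop goroutine to exit and wait for it to acknowledge on done.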
	w.stop <- struct{}{}
	<-w.done
	w.running = false
	log.Infof("Watchdog stopped")
}

// waitForStart waits for Start to be called and takes action if it does not
// happen within the startup timeout.
func (w *Watchdog) waitForStart() {
	<-time.After(w.StartupTimeout)
	w.mu.Lock()
	defer w.mu.Unlock()
	if w.startCalled {
		// We are fine.
		return
	}

	metric.WeirdnessMetric.Increment("watchdog_stuck_startup")

	var buf bytes.Buffer
	buf.WriteString(fmt.Sprintf("Watchdog.Start() not called within %s", w.StartupTimeout))
	w.doAction(w.StartupTimeoutAction, false, &buf)
}

// loop is the main watchdog routine. It only returns when 'Stop()' is called.
func (w *Watchdog) loop() {
	// Loop until someone stops it.
	for {
		select {
		case <-w.stop:
			w.done <- struct{}{}
			return
		case <-time.After(w.period):
			w.runTurn()
		}
	}
}

// runTurn runs a single pass over all tasks and reports anything it finds.
func (w *Watchdog) runTurn() {
	// Someone needs to watch the watchdog. The call below can get stuck if there
	// is a deadlock affecting root's PID namespace mutex. Run it in a goroutine
	// and report if it takes too long to return.
	var tasks []*kernel.Task
	done := make(chan struct{})
	go func() { // S/R-SAFE: watchdog is stopped and restarted during S/R.
		tasks = w.k.TaskSet().Root.Tasks()
		close(done)
	}()

	select {
	case <-done:
	case <-time.After(w.TaskTimeout):
		// Report if the watchdog is not making progress.
		// No one is watching the watchdog watcher though.
		w.reportStuckWatchdog()
		<-done
	}

	newOffenders := make(map[*kernel.Task]*offender)
	newTaskFound := false
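	// CPUClockNow is expressed in clock ticks; multiplying by linux.ClockTick
	// converts it to nanoseconds before wrapping it in a ktime.Time.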
	now := ktime.FromNanoseconds(int64(w.k.CPUClockNow() * uint64(linux.ClockTick)))

	// The process may be running with a low CPU limit, making tasks appear stuck
	// because they are starved of CPU cycles. The estimate is that tasks could have
	// been starved since the last time the watchdog ran. If the watchdog detects that
	// scheduling is off, it discounts the entire duration since the last run from
	// 'lastUpdateTime'.
	discount := time.Duration(0)
	if now.Sub(w.lastRun.Add(w.period)) > descheduleThreshold {
		discount = now.Sub(w.lastRun)
	}
	w.lastRun = now

	log.Infof("Watchdog starting loop, tasks: %d, discount: %v", len(tasks), discount)
	for _, t := range tasks {
		tsched := t.TaskGoroutineSchedInfo()

		// An offender is a task running inside the kernel for longer than the specified timeout.
		if tsched.State == kernel.TaskGoroutineRunningSys {
			lastUpdateTime := ktime.FromNanoseconds(int64(tsched.Timestamp * uint64(linux.ClockTick)))
			elapsed := now.Sub(lastUpdateTime) - discount
			if elapsed > w.TaskTimeout {
				tc, ok := w.offenders[t]
				if !ok {
					// New stuck task detected.
					//
					// Note that tasks blocked doing IO may be considered stuck in kernel,
					// unless they are surrounded by
					// Task.UninterruptibleSleepStart/Finish.
					tc = &offender{lastUpdateTime: lastUpdateTime}
					metric.WeirdnessMetric.Increment("watchdog_stuck_tasks")
					newTaskFound = true
				}
				newOffenders[t] = tc
			}
		}
	}
	if len(newOffenders) > 0 {
		w.report(newOffenders, newTaskFound, now)
	}

	// Remember which tasks have been reported.
	w.offenders = newOffenders
}

// report takes appropriate action when a stuck task is detected.
func (w *Watchdog) report(offenders map[*kernel.Task]*offender, newTaskFound bool, now ktime.Time) {
	var buf bytes.Buffer
	buf.WriteString(fmt.Sprintf("Sentry detected %d stuck task(s):\n", len(offenders)))
	for t, o := range offenders {
		tid := w.k.TaskSet().Root.IDOfTask(t)
		buf.WriteString(fmt.Sprintf("\tTask tid: %v (goroutine %d), entered RunSys state %v ago.\n", tid, t.GoroutineID(), now.Sub(o.lastUpdateTime)))
	}
	buf.WriteString("Search for 'goroutine <id>' in the stack dump to find the offending goroutine(s)")

	// Force stack dump only if a new task is detected.
	w.doAction(w.TaskTimeoutAction, newTaskFound, &buf)
}

func (w *Watchdog) reportStuckWatchdog() {
	var buf bytes.Buffer
	buf.WriteString("Watchdog goroutine is stuck")
	w.doAction(w.TaskTimeoutAction, false, &buf)
}

// doAction will take the given action. If the action is LogWarning, the stack
// is not always dumped to the log to prevent log flooding. "forceStack"
// guarantees that the stack will be dumped regardless.
func (w *Watchdog) doAction(action Action, forceStack bool, msg *bytes.Buffer) {
	switch action {
	case LogWarning:
		// Dump the stack only if forced or enough time has passed since the last
		// time a stack dump was generated.
		if !forceStack && time.Since(w.lastStackDump) < stackDumpSameTaskPeriod {
			msg.WriteString("\n...[stack dump skipped]...")
			log.Warningf(msg.String())
			return
		}
		log.TracebackAll(msg.String())
		w.lastStackDump = time.Now()

	case Panic:
		// Panic will skip over running tasks, which are likely the culprit here. So
		// manually dump all stacks before panic'ing.
		log.TracebackAll(msg.String())

		// Attempt to flush metrics, then time out and move on in case metrics are stuck as well.
		metricsEmitted := make(chan struct{}, 1)
		go func() { // S/R-SAFE: watchdog is stopped during save and restarted after restore.
			// Flush metrics before killing the process.
			metric.EmitMetricUpdate()
			metricsEmitted <- struct{}{}
		}()
		select {
		case <-metricsEmitted:
		case <-time.After(1 * time.Second):
		}
		panic(fmt.Sprintf("%s\nStacks for running Gs are skipped while panicking.", msg.String()))

	default:
		panic(fmt.Sprintf("Unknown watchdog action %v", action))
	}
}