github.com/google/syzkaller@v0.0.0-20251211124644-a066d2bc4b02/vm/vm.go (about)

     1  // Copyright 2015 syzkaller project authors. All rights reserved.
     2  // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
     3  
     4  // Package vm provides an abstract test machine (VM, physical machine, etc)
     5  // interface for the rest of the system.
     6  // For convenience test machines are subsequently collectively called VMs.
     7  // Package wraps vmimpl package interface with some common functionality
     8  // and higher-level interface.
     9  package vm
    10  
    11  import (
    12  	"bytes"
    13  	"context"
    14  	"errors"
    15  	"fmt"
    16  	"io"
    17  	"os"
    18  	"path/filepath"
    19  	"strings"
    20  	"sync/atomic"
    21  	"time"
    22  
    23  	"github.com/google/syzkaller/pkg/log"
    24  	"github.com/google/syzkaller/pkg/mgrconfig"
    25  	"github.com/google/syzkaller/pkg/osutil"
    26  	"github.com/google/syzkaller/pkg/report"
    27  	"github.com/google/syzkaller/pkg/report/crash"
    28  	"github.com/google/syzkaller/pkg/stat"
    29  	"github.com/google/syzkaller/sys/targets"
    30  	"github.com/google/syzkaller/vm/dispatcher"
    31  	"github.com/google/syzkaller/vm/vmimpl"
    32  
    33  	// Import all VM implementations, so that users only need to import vm.
    34  	_ "github.com/google/syzkaller/vm/adb"
    35  	_ "github.com/google/syzkaller/vm/bhyve"
    36  	_ "github.com/google/syzkaller/vm/cuttlefish"
    37  	_ "github.com/google/syzkaller/vm/gce"
    38  	_ "github.com/google/syzkaller/vm/gvisor"
    39  	_ "github.com/google/syzkaller/vm/isolated"
    40  	_ "github.com/google/syzkaller/vm/proxyapp"
    41  	_ "github.com/google/syzkaller/vm/qemu"
    42  	_ "github.com/google/syzkaller/vm/starnix"
    43  	_ "github.com/google/syzkaller/vm/virtualbox"
    44  	_ "github.com/google/syzkaller/vm/vmm"
    45  	_ "github.com/google/syzkaller/vm/vmware"
    46  )
    47  
// Pool is a collection of homogeneous test machines created from a single
// manager config. It wraps a vmimpl.Pool with common bookkeeping
// (per-instance workdirs, active instance counting, output statistics).
type Pool struct {
	impl               vmimpl.Pool      // underlying VM-type-specific pool implementation
	typ                vmimpl.Type      // static properties of the VM type (Overcommit, Preemptible, etc)
	workdir            string           // manager workdir; per-instance temp dirs are created inside it
	template           string           // optional dir copied into each instance workdir as "template"
	timeouts           targets.Timeouts // target-specific timeouts/scaling used by monitoring
	count              int              // number of VMs in the pool (reduced to 1 in debug mode)
	activeCount        int32            // number of currently open instances (updated atomically)
	snapshot           bool             // whether VMs are run in snapshot mode
	hostFuzzer         bool             // whether the fuzzer process runs on the host (from SysTarget)
	statOutputReceived *stat.Val        // total bytes of VM console output received
}
    60  
// Instance represents one booted test machine created from a Pool.
type Instance struct {
	pool          *Pool           // pool this instance belongs to
	impl          vmimpl.Instance // VM-type-specific instance implementation
	workdir       string          // per-instance temp dir, removed on Close
	index         int             // index of this instance within the pool [0, pool.count)
	snapshotSetup bool            // set after a successful SetupSnapshot call
	onClose       func()          // decrements pool.activeCount; invoked from Close
}
    69  
var (
	// Shutdown/ErrTimeout are re-exported from vmimpl for convenience.
	// The blank assignments statically assert that vmimpl error types
	// implement the corresponding interfaces declared below.
	Shutdown                = vmimpl.Shutdown
	ErrTimeout              = vmimpl.ErrTimeout
	_          BootErrorer  = vmimpl.BootError{}
	_          InfraErrorer = vmimpl.InfraError{}
)
    76  
    77  func ShutdownCtx() context.Context {
    78  	ctx, done := context.WithCancel(context.Background())
    79  	go func() {
    80  		<-Shutdown
    81  		done()
    82  	}()
    83  	return ctx
    84  }
    85  
// BootErrorer is implemented by errors that indicate a kernel boot failure
// (title + kernel output), as opposed to infrastructure errors.
type BootErrorer interface {
	BootError() (string, []byte)
}
    89  
// InfraErrorer is implemented by errors that indicate an infrastructure
// problem (title + output) rather than a kernel problem.
type InfraErrorer interface {
	InfraError() (string, []byte)
}
    93  
    94  // vmType splits the VM type from any suffix (separated by ":"). This is mostly
    95  // useful for the "proxyapp" type, where pkg/build needs to specify/handle
    96  // sub-types.
    97  func vmType(fullName string) string {
    98  	name, _, _ := strings.Cut(fullName, ":")
    99  	return name
   100  }
   101  
   102  // AllowsOvercommit returns if the instance type allows overcommit of instances
   103  // (i.e. creation of instances out-of-thin-air). Overcommit is used during image
   104  // and patch testing in syz-ci when it just asks for more than specified in config
   105  // instances. Generally virtual machines (qemu, gce) support overcommit,
   106  // while physical machines (adb, isolated) do not. Strictly speaking, we should
   107  // never use overcommit and use only what's specified in config, because we
   108  // override resource limits specified in config (e.g. can OOM). But it works and
   109  // makes lots of things much simpler.
   110  func AllowsOvercommit(typ string) bool {
   111  	return vmimpl.Types[vmType(typ)].Overcommit
   112  }
   113  
   114  // Create creates a VM pool that can be used to create individual VMs.
   115  func Create(cfg *mgrconfig.Config, debug bool) (*Pool, error) {
   116  	typ, ok := vmimpl.Types[vmType(cfg.Type)]
   117  	if !ok {
   118  		return nil, fmt.Errorf("unknown instance type '%v'", cfg.Type)
   119  	}
   120  	env := &vmimpl.Env{
   121  		Name:      cfg.Name,
   122  		OS:        cfg.TargetOS,
   123  		Arch:      cfg.TargetVMArch,
   124  		Workdir:   cfg.Workdir,
   125  		Image:     cfg.Image,
   126  		SSHKey:    cfg.SSHKey,
   127  		SSHUser:   cfg.SSHUser,
   128  		Timeouts:  cfg.Timeouts,
   129  		Snapshot:  cfg.Snapshot,
   130  		Debug:     debug,
   131  		Config:    cfg.VM,
   132  		KernelSrc: cfg.KernelSrc,
   133  	}
   134  	impl, err := typ.Ctor(env)
   135  	if err != nil {
   136  		return nil, err
   137  	}
   138  	count := impl.Count()
   139  	if debug && count > 1 {
   140  		log.Logf(0, "limiting number of VMs from %v to 1 in debug mode", count)
   141  		count = 1
   142  	}
   143  	return &Pool{
   144  		impl:       impl,
   145  		typ:        typ,
   146  		workdir:    env.Workdir,
   147  		template:   cfg.WorkdirTemplate,
   148  		timeouts:   cfg.Timeouts,
   149  		count:      count,
   150  		snapshot:   cfg.Snapshot,
   151  		hostFuzzer: cfg.SysTarget.HostFuzzer,
   152  		statOutputReceived: stat.New("vm output", "Bytes of VM console output received",
   153  			stat.Graph("traffic"), stat.Rate{}, stat.FormatMB),
   154  	}, nil
   155  }
   156  
// Count returns the number of VMs in the pool.
func (pool *Pool) Count() int {
	return pool.count
}
   160  
   161  func (pool *Pool) Create(ctx context.Context, index int) (*Instance, error) {
   162  	if index < 0 || index >= pool.count {
   163  		return nil, fmt.Errorf("invalid VM index %v (count %v)", index, pool.count)
   164  	}
   165  	workdir, err := osutil.ProcessTempDir(pool.workdir)
   166  	if err != nil {
   167  		return nil, fmt.Errorf("failed to create instance temp dir: %w", err)
   168  	}
   169  	if pool.template != "" {
   170  		if err := osutil.CopyDirRecursively(pool.template, filepath.Join(workdir, "template")); err != nil {
   171  			return nil, err
   172  		}
   173  	}
   174  	impl, err := pool.impl.Create(ctx, workdir, index)
   175  	if err != nil {
   176  		os.RemoveAll(workdir)
   177  		return nil, err
   178  	}
   179  	atomic.AddInt32(&pool.activeCount, 1)
   180  	return &Instance{
   181  		pool:    pool,
   182  		impl:    impl,
   183  		workdir: workdir,
   184  		index:   index,
   185  		onClose: func() { atomic.AddInt32(&pool.activeCount, -1) },
   186  	}, nil
   187  }
   188  
   189  // TODO: Integration or end-to-end testing is needed.
   190  //
   191  //	https://github.com/google/syzkaller/pull/3269#discussion_r967650801
   192  func (pool *Pool) Close() error {
   193  	if pool.activeCount != 0 {
   194  		panic("all the instances should be closed before pool.Close()")
   195  	}
   196  	if closer, ok := pool.impl.(io.Closer); ok {
   197  		return closer.Close()
   198  	}
   199  	return nil
   200  }
   201  
   202  // SetupSnapshot must be called once before calling RunSnapshot.
   203  // Input is copied into the VM in an implementation defined way and is interpreted by executor.
   204  func (inst *Instance) SetupSnapshot(input []byte) error {
   205  	impl, ok := inst.impl.(snapshotter)
   206  	if !ok {
   207  		return errors.New("this VM type does not support snapshot mode")
   208  	}
   209  	if inst.snapshotSetup {
   210  		return fmt.Errorf("SetupSnapshot called twice")
   211  	}
   212  	inst.snapshotSetup = true
   213  	return impl.SetupSnapshot(input)
   214  }
   215  
   216  // RunSnapshot runs one input in snapshotting mode.
   217  // Input is copied into the VM in an implementation defined way and is interpreted by executor.
   218  // Result is the result provided by the executor.
   219  // Output is the kernel console output during execution of the input.
   220  func (inst *Instance) RunSnapshot(input []byte) (result, output []byte, err error) {
   221  	impl, ok := inst.impl.(snapshotter)
   222  	if !ok {
   223  		return nil, nil, errors.New("this VM type does not support snapshot mode")
   224  	}
   225  	if !inst.snapshotSetup {
   226  		return nil, nil, fmt.Errorf("RunSnapshot without SetupSnapshot")
   227  	}
   228  	// Executor has own timeout logic, so use a slightly larger timeout here.
   229  	timeout := inst.pool.timeouts.Program / 5 * 7
   230  	return impl.RunSnapshot(timeout, input)
   231  }
   232  
// snapshotter is an optional interface implemented by VM types that
// support snapshot-mode execution (see SetupSnapshot/RunSnapshot).
type snapshotter interface {
	SetupSnapshot([]byte) error
	RunSnapshot(time.Duration, []byte) ([]byte, []byte, error)
}
   237  
   238  func (inst *Instance) Copy(hostSrc string) (string, error) {
   239  	return inst.impl.Copy(hostSrc)
   240  }
   241  
   242  func (inst *Instance) Forward(port int) (string, error) {
   243  	return inst.impl.Forward(port)
   244  }
   245  
// ExitCondition is a bitmask describing which command exit modes
// Instance.Run should treat as normal rather than as a crash.
type ExitCondition int

const (
	// The program is allowed to exit after timeout.
	ExitTimeout = ExitCondition(1 << iota)
	// The program is allowed to exit with no errors.
	ExitNormal
	// The program is allowed to exit with errors.
	ExitError
)
   256  
// RunOptions alters the behavior of Instance.Run.
// Construct it via the With* option helpers below.
type RunOptions struct {
	// exitCondition says which exit modes should be considered as errors/OK.
	exitCondition ExitCondition
	// beforeContext is how many bytes BEFORE the crash description to keep in the report.
	beforeContext int
	// afterContext is how many bytes AFTER the crash description to keep in the report.
	afterContext int
	// An early notification that the command has finished / VM crashed.
	earlyFinishCb   func()
	// injectExecuting notifies the monitor that programs are still being
	// executed (resets the no-output detection timer).
	injectExecuting <-chan bool
	// tickerPeriod is how often the no-output condition is checked
	// (scaled by the target timeouts in monitorExecution).
	tickerPeriod    time.Duration
}
   269  
   270  func WithExitCondition(exitCondition ExitCondition) func(*RunOptions) {
   271  	return func(opts *RunOptions) {
   272  		opts.exitCondition = exitCondition
   273  	}
   274  }
   275  
   276  func WithBeforeContext(beforeContext int) func(*RunOptions) {
   277  	return func(opts *RunOptions) {
   278  		opts.beforeContext = beforeContext
   279  	}
   280  }
   281  
   282  func WithInjectExecuting(injectExecuting <-chan bool) func(*RunOptions) {
   283  	return func(opts *RunOptions) {
   284  		opts.injectExecuting = injectExecuting
   285  	}
   286  }
   287  
   288  func WithEarlyFinishCb(cb func()) func(*RunOptions) {
   289  	return func(opts *RunOptions) {
   290  		opts.earlyFinishCb = cb
   291  	}
   292  }
   293  
   294  // Run runs cmd inside of the VM (think of ssh cmd) and monitors command execution
   295  // and the kernel console output. It detects kernel oopses in output, lost connections, hangs, etc.
   296  // Returns command+kernel output and a non-symbolized crash report (nil if no error happens).
   297  func (inst *Instance) Run(ctx context.Context, reporter *report.Reporter, command string, opts ...func(*RunOptions)) (
   298  	[]byte, []*report.Report, error) {
   299  	runOptions := &RunOptions{
   300  		beforeContext: 128 << 10,
   301  		afterContext:  128 << 10,
   302  		tickerPeriod:  10 * time.Second,
   303  	}
   304  	for _, opt := range opts {
   305  		opt(runOptions)
   306  	}
   307  
   308  	outc, errc, err := inst.impl.Run(ctx, command)
   309  	if err != nil {
   310  		return nil, nil, err
   311  	}
   312  	mon := &monitor{
   313  		RunOptions:      runOptions,
   314  		inst:            inst,
   315  		outc:            outc,
   316  		errc:            errc,
   317  		reporter:        reporter,
   318  		lastExecuteTime: time.Now(),
   319  	}
   320  	reps := mon.monitorExecution()
   321  	return mon.output, reps, nil
   322  }
   323  
   324  func (inst *Instance) Info() ([]byte, error) {
   325  	if ii, ok := inst.impl.(vmimpl.Infoer); ok {
   326  		return ii.Info()
   327  	}
   328  	return nil, nil
   329  }
   330  
// diagnose asks the VM implementation to collect additional diagnostic
// output for the first (most relevant) report. The bool result indicates
// whether the caller should wait for more console output.
func (inst *Instance) diagnose(reps []*report.Report) ([]byte, bool) {
	if len(reps) == 0 {
		panic("reps is empty")
	}
	return inst.impl.Diagnose(reps[0])
}
   337  
// Index returns the index of this instance within its pool.
func (inst *Instance) Index() int {
	return inst.index
}
   341  
   342  func (inst *Instance) Close() error {
   343  	err := inst.impl.Close()
   344  	if retErr := os.RemoveAll(inst.workdir); err == nil {
   345  		err = retErr
   346  	}
   347  	inst.onClose()
   348  	return err
   349  }
   350  
// Dispatcher is a dispatcher.Pool specialized for *Instance.
type Dispatcher = dispatcher.Pool[*Instance]

// NewDispatcher creates a Dispatcher over all VMs of the pool
// with def as the default runner.
func NewDispatcher(pool *Pool, def dispatcher.Runner[*Instance]) *Dispatcher {
	return dispatcher.NewPool(pool.count, pool.Create, def)
}
   356  
// monitor tracks one command execution inside a VM: it accumulates console
// output, detects crash reports in it, and watches for hangs/lost connections.
type monitor struct {
	*RunOptions
	inst     *Instance
	outc     <-chan []byte // console output stream (set to nil once closed)
	errc     <-chan error  // command completion/error notification
	reporter *report.Reporter
	// output is at most mon.beforeContext + len(report) + afterContext bytes.
	output []byte
	// curPos in the output to scan for the matches.
	curPos          int
	lastExecuteTime time.Time
	// extractCalled is used to prevent multiple extractError calls.
	extractCalled bool
}
   371  
// monitorExecution is the main monitoring loop: it multiplexes command
// completion (errc), console output (outc), liveness notifications and a
// periodic no-output check, and returns crash reports extracted from the
// output (nil if nothing noteworthy happened).
func (mon *monitor) monitorExecution() []*report.Report {
	ticker := time.NewTicker(mon.tickerPeriod * mon.inst.pool.timeouts.Scale)
	defer ticker.Stop()
	// Guarantee the early-finish notification fires even on paths that
	// return before extractErrors (which also fires and clears it).
	defer func() {
		if mon.earlyFinishCb != nil {
			mon.earlyFinishCb()
		}
	}()
	for {
		select {
		case err := <-mon.errc:
			switch err {
			case nil:
				// The program has exited without errors,
				// but wait for kernel output in case there is some delayed oops.
				crash := ""
				if mon.exitCondition&ExitNormal == 0 {
					crash = lostConnectionCrash
				}
				return mon.extractErrors(crash)
			case ErrTimeout:
				if mon.exitCondition&ExitTimeout == 0 {
					return mon.extractErrors(timeoutCrash)
				}
				return nil
			default:
				// Note: connection lost can race with a kernel oops message.
				// In such case we want to return the kernel oops.
				crash := ""
				if mon.exitCondition&ExitError == 0 {
					crash = lostConnectionCrash
				}
				return mon.extractErrors(crash)
			}
		case out, ok := <-mon.outc:
			if !ok {
				// Output stream ended; keep waiting on the other channels.
				mon.outc = nil
				continue
			}
			mon.inst.pool.statOutputReceived.Add(len(out))
			if rep, done := mon.appendOutput(out); done {
				return rep
			}
		case <-mon.injectExecuting:
			// Externally injected signal that programs are still executing.
			mon.lastExecuteTime = time.Now()
		case <-ticker.C:
			// Detect both "no output whatsoever" and "kernel episodically prints
			// something to console, but fuzzer is not actually executing programs".
			if time.Since(mon.lastExecuteTime) > mon.inst.pool.timeouts.NoOutput {
				return mon.extractErrors(noOutputCrash)
			}
		case <-Shutdown:
			return nil
		}
	}
}
   428  
// appendOutput accumulates a chunk of console output, checks the new data for
// crashes, and bounds the buffer size. It returns (reports, true) once a
// crash is detected; (nil, false) otherwise.
func (mon *monitor) appendOutput(out []byte) ([]*report.Report, bool) {
	lastPos := len(mon.output)
	mon.output = append(mon.output, out...)
	// syz-execprog prints this line while making progress, treat it as liveness.
	if bytes.Contains(mon.output[lastPos:], []byte(executedProgramsStart)) {
		mon.lastExecuteTime = time.Now()
	}
	if mon.reporter.ContainsCrash(mon.output[mon.curPos:]) {
		return mon.extractErrors("unknown error"), true
	}
	// Once the buffer exceeds 2x beforeContext, keep only the last
	// beforeContext bytes (shifted to the front of the same backing array).
	if len(mon.output) > 2*mon.beforeContext {
		copy(mon.output, mon.output[len(mon.output)-mon.beforeContext:])
		mon.output = mon.output[:mon.beforeContext]
	}
	// Find the starting position for crash matching on the next iteration.
	// We step back from the end of output by maxErrorLength to handle the case
	// when a crash line is currently split/incomplete. And then we try to find
	// the preceding '\n' to have a full line. This is required to handle
	// the case when a particular pattern is ignored as crash, but a suffix
	// of the pattern is detected as crash (e.g. "ODEBUG:" is trimmed to "BUG:").
	mon.curPos = len(mon.output) - maxErrorLength
	for i := 0; i < maxErrorLength; i++ {
		if mon.curPos <= 0 || mon.output[mon.curPos-1] == '\n' {
			break
		}
		mon.curPos--
	}
	mon.curPos = max(mon.curPos, 0)
	return nil, false
}
   458  
// extractErrors finalizes monitoring: it runs VM diagnosis, optionally waits
// for trailing console output, and parses crash reports from the accumulated
// output. defaultError is the report title used if no crash is parsed from
// the output ("" means no error is expected). Must be called at most once.
func (mon *monitor) extractErrors(defaultError string) []*report.Report {
	if mon.extractCalled {
		panic("extractError called twice")
	}
	mon.extractCalled = true
	if mon.earlyFinishCb != nil {
		mon.earlyFinishCb()
		// Clear it so the deferred call in monitorExecution doesn't fire twice.
		mon.earlyFinishCb = nil
	}
	diagOutput, diagWait := []byte{}, false
	if defaultError != "" {
		diagOutput, diagWait = mon.inst.diagnose(mon.createReports(defaultError))
	}
	// Give it some time to finish writing the error message.
	// But don't wait for "no output", we already waited enough.
	if defaultError != noOutputCrash || diagWait {
		mon.waitForOutput()
	}
	// Check the executorPreemptedStr only for preemptible instances since executor can print
	// the string spuriously in some cases (gets SIGTERM from test program somehow).
	if mon.inst.pool.typ.Preemptible && bytes.Contains(mon.output, []byte(executorPreemptedStr)) {
		return nil
	}
	if defaultError == "" && mon.reporter.ContainsCrash(mon.output[mon.curPos:]) {
		// We did not call Diagnose above because we thought there is no error, so call it now.
		diagOutput, diagWait = mon.inst.diagnose(mon.createReports(defaultError))
		if diagWait {
			mon.waitForOutput()
		}
	}
	reps := mon.createReports(defaultError)
	if len(reps) == 0 {
		return nil
	}
	// Attach diagnosis output to the first (most relevant) report.
	if len(diagOutput) > 0 {
		reps[0].Output = append(reps[0].Output, vmDiagnosisStart...)
		reps[0].Output = append(reps[0].Output, diagOutput...)
	}
	return reps
}
   499  
   500  func (mon *monitor) createReports(defaultError string) []*report.Report {
   501  	curPos := mon.curPos
   502  	var res []*report.Report
   503  	for {
   504  		rep := mon.reporter.ParseFrom(mon.output, curPos)
   505  		if rep == nil {
   506  			if defaultError == "" || len(res) > 0 {
   507  				return res
   508  			}
   509  			typ := crash.UnknownType
   510  			if defaultError == lostConnectionCrash {
   511  				typ = crash.LostConnection
   512  			}
   513  			return []*report.Report{{
   514  				Title:      defaultError,
   515  				Output:     mon.output,
   516  				Suppressed: report.IsSuppressed(mon.reporter, mon.output),
   517  				Type:       typ,
   518  			}}
   519  		}
   520  		curPos = rep.SkipPos
   521  		start := max(rep.StartPos-mon.beforeContext, 0)
   522  		end := min(rep.EndPos+mon.afterContext, len(rep.Output))
   523  		rep.Output = rep.Output[start:end]
   524  		rep.StartPos -= start
   525  		rep.EndPos -= start
   526  		if len(res) == 0 || (len(res) > 0 && !rep.Corrupted && !rep.Suppressed) {
   527  			res = append(res, rep)
   528  		}
   529  	}
   530  }
   531  
   532  func (mon *monitor) waitForOutput() {
   533  	timer := time.NewTimer(vmimpl.WaitForOutputTimeout * mon.inst.pool.timeouts.Scale)
   534  	defer timer.Stop()
   535  	for {
   536  		select {
   537  		case out, ok := <-mon.outc:
   538  			if !ok {
   539  				return
   540  			}
   541  			mon.output = append(mon.output, out...)
   542  		case <-timer.C:
   543  			return
   544  		case <-Shutdown:
   545  			return
   546  		}
   547  	}
   548  }
   549  
const (
	// maxErrorLength bounds how far appendOutput steps back in the output
	// when searching for the start of a potentially split crash line.
	maxErrorLength = 256

	// Synthetic crash titles used when the kernel produced no oops.
	lostConnectionCrash = "lost connection to test machine"
	noOutputCrash       = "no output from test machine"
	timeoutCrash        = "timed out"

	executorPreemptedStr  = "SYZ-EXECUTOR: PREEMPTED" // printed when a preemptible VM is preempted
	vmDiagnosisStart      = "\nVM DIAGNOSIS:\n"       // separates diagnosis output appended to reports
	executedProgramsStart = "executed programs:" // syz-execprog output
)