github.com/google/syzkaller@v0.0.0-20251211124644-a066d2bc4b02/vm/vm.go (about) 1 // Copyright 2015 syzkaller project authors. All rights reserved. 2 // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. 3 4 // Package vm provides an abstract test machine (VM, physical machine, etc) 5 // interface for the rest of the system. 6 // For convenience test machines are subsequently collectively called VMs. 7 // Package wraps vmimpl package interface with some common functionality 8 // and higher-level interface. 9 package vm 10 11 import ( 12 "bytes" 13 "context" 14 "errors" 15 "fmt" 16 "io" 17 "os" 18 "path/filepath" 19 "strings" 20 "sync/atomic" 21 "time" 22 23 "github.com/google/syzkaller/pkg/log" 24 "github.com/google/syzkaller/pkg/mgrconfig" 25 "github.com/google/syzkaller/pkg/osutil" 26 "github.com/google/syzkaller/pkg/report" 27 "github.com/google/syzkaller/pkg/report/crash" 28 "github.com/google/syzkaller/pkg/stat" 29 "github.com/google/syzkaller/sys/targets" 30 "github.com/google/syzkaller/vm/dispatcher" 31 "github.com/google/syzkaller/vm/vmimpl" 32 33 // Import all VM implementations, so that users only need to import vm. 34 _ "github.com/google/syzkaller/vm/adb" 35 _ "github.com/google/syzkaller/vm/bhyve" 36 _ "github.com/google/syzkaller/vm/cuttlefish" 37 _ "github.com/google/syzkaller/vm/gce" 38 _ "github.com/google/syzkaller/vm/gvisor" 39 _ "github.com/google/syzkaller/vm/isolated" 40 _ "github.com/google/syzkaller/vm/proxyapp" 41 _ "github.com/google/syzkaller/vm/qemu" 42 _ "github.com/google/syzkaller/vm/starnix" 43 _ "github.com/google/syzkaller/vm/virtualbox" 44 _ "github.com/google/syzkaller/vm/vmm" 45 _ "github.com/google/syzkaller/vm/vmware" 46 ) 47 48 type Pool struct { 49 impl vmimpl.Pool 50 typ vmimpl.Type 51 workdir string 52 template string 53 timeouts targets.Timeouts 54 count int 55 activeCount int32 56 snapshot bool 57 hostFuzzer bool 58 statOutputReceived *stat.Val 59 } 60 61 type Instance struct { 62 pool *Pool 63 impl vmimpl.Instance 64 workdir string 65 index int 66 snapshotSetup bool 67 onClose func() 68 } 69 70 var ( 71 Shutdown = vmimpl.Shutdown 72 ErrTimeout = vmimpl.ErrTimeout 73 _ BootErrorer = vmimpl.BootError{} 74 _ InfraErrorer = vmimpl.InfraError{} 75 ) 76 77 func ShutdownCtx() context.Context { 78 ctx, done := context.WithCancel(context.Background()) 79 go func() { 80 <-Shutdown 81 done() 82 }() 83 return ctx 84 } 85 86 type BootErrorer interface { 87 BootError() (string, []byte) 88 } 89 90 type InfraErrorer interface { 91 InfraError() (string, []byte) 92 } 93 94 // vmType splits the VM type from any suffix (separated by ":"). This is mostly 95 // useful for the "proxyapp" type, where pkg/build needs to specify/handle 96 // sub-types. 97 func vmType(fullName string) string { 98 name, _, _ := strings.Cut(fullName, ":") 99 return name 100 } 101 102 // AllowsOvercommit returns if the instance type allows overcommit of instances 103 // (i.e. creation of instances out-of-thin-air). Overcommit is used during image 104 // and patch testing in syz-ci when it just asks for more than specified in config 105 // instances. Generally virtual machines (qemu, gce) support overcommit, 106 // while physical machines (adb, isolated) do not. Strictly speaking, we should 107 // never use overcommit and use only what's specified in config, because we 108 // override resource limits specified in config (e.g. can OOM). But it works and 109 // makes lots of things much simpler. 110 func AllowsOvercommit(typ string) bool { 111 return vmimpl.Types[vmType(typ)].Overcommit 112 } 113 114 // Create creates a VM pool that can be used to create individual VMs. 115 func Create(cfg *mgrconfig.Config, debug bool) (*Pool, error) { 116 typ, ok := vmimpl.Types[vmType(cfg.Type)] 117 if !ok { 118 return nil, fmt.Errorf("unknown instance type '%v'", cfg.Type) 119 } 120 env := &vmimpl.Env{ 121 Name: cfg.Name, 122 OS: cfg.TargetOS, 123 Arch: cfg.TargetVMArch, 124 Workdir: cfg.Workdir, 125 Image: cfg.Image, 126 SSHKey: cfg.SSHKey, 127 SSHUser: cfg.SSHUser, 128 Timeouts: cfg.Timeouts, 129 Snapshot: cfg.Snapshot, 130 Debug: debug, 131 Config: cfg.VM, 132 KernelSrc: cfg.KernelSrc, 133 } 134 impl, err := typ.Ctor(env) 135 if err != nil { 136 return nil, err 137 } 138 count := impl.Count() 139 if debug && count > 1 { 140 log.Logf(0, "limiting number of VMs from %v to 1 in debug mode", count) 141 count = 1 142 } 143 return &Pool{ 144 impl: impl, 145 typ: typ, 146 workdir: env.Workdir, 147 template: cfg.WorkdirTemplate, 148 timeouts: cfg.Timeouts, 149 count: count, 150 snapshot: cfg.Snapshot, 151 hostFuzzer: cfg.SysTarget.HostFuzzer, 152 statOutputReceived: stat.New("vm output", "Bytes of VM console output received", 153 stat.Graph("traffic"), stat.Rate{}, stat.FormatMB), 154 }, nil 155 } 156 157 func (pool *Pool) Count() int { 158 return pool.count 159 } 160 161 func (pool *Pool) Create(ctx context.Context, index int) (*Instance, error) { 162 if index < 0 || index >= pool.count { 163 return nil, fmt.Errorf("invalid VM index %v (count %v)", index, pool.count) 164 } 165 workdir, err := osutil.ProcessTempDir(pool.workdir) 166 if err != nil { 167 return nil, fmt.Errorf("failed to create instance temp dir: %w", err) 168 } 169 if pool.template != "" { 170 if err := osutil.CopyDirRecursively(pool.template, filepath.Join(workdir, "template")); err != nil { 171 return nil, err 172 } 173 } 174 impl, err := pool.impl.Create(ctx, workdir, index) 175 if err != nil { 176 os.RemoveAll(workdir) 177 return nil, err 178 } 179 atomic.AddInt32(&pool.activeCount, 1) 180 return &Instance{ 181 pool: pool, 182 impl: impl, 183 workdir: workdir, 184 index: index, 185 onClose: func() { atomic.AddInt32(&pool.activeCount, -1) }, 186 }, nil 187 } 188 189 // TODO: Integration or end-to-end testing is needed. 190 // 191 // https://github.com/google/syzkaller/pull/3269#discussion_r967650801 192 func (pool *Pool) Close() error { 193 if pool.activeCount != 0 { 194 panic("all the instances should be closed before pool.Close()") 195 } 196 if closer, ok := pool.impl.(io.Closer); ok { 197 return closer.Close() 198 } 199 return nil 200 } 201 202 // SetupSnapshot must be called once before calling RunSnapshot. 203 // Input is copied into the VM in an implementation defined way and is interpreted by executor. 204 func (inst *Instance) SetupSnapshot(input []byte) error { 205 impl, ok := inst.impl.(snapshotter) 206 if !ok { 207 return errors.New("this VM type does not support snapshot mode") 208 } 209 if inst.snapshotSetup { 210 return fmt.Errorf("SetupSnapshot called twice") 211 } 212 inst.snapshotSetup = true 213 return impl.SetupSnapshot(input) 214 } 215 216 // RunSnapshot runs one input in snapshotting mode. 217 // Input is copied into the VM in an implementation defined way and is interpreted by executor. 218 // Result is the result provided by the executor. 219 // Output is the kernel console output during execution of the input. 220 func (inst *Instance) RunSnapshot(input []byte) (result, output []byte, err error) { 221 impl, ok := inst.impl.(snapshotter) 222 if !ok { 223 return nil, nil, errors.New("this VM type does not support snapshot mode") 224 } 225 if !inst.snapshotSetup { 226 return nil, nil, fmt.Errorf("RunSnapshot without SetupSnapshot") 227 } 228 // Executor has own timeout logic, so use a slightly larger timeout here. 229 timeout := inst.pool.timeouts.Program / 5 * 7 230 return impl.RunSnapshot(timeout, input) 231 } 232 233 type snapshotter interface { 234 SetupSnapshot([]byte) error 235 RunSnapshot(time.Duration, []byte) ([]byte, []byte, error) 236 } 237 238 func (inst *Instance) Copy(hostSrc string) (string, error) { 239 return inst.impl.Copy(hostSrc) 240 } 241 242 func (inst *Instance) Forward(port int) (string, error) { 243 return inst.impl.Forward(port) 244 } 245 246 type ExitCondition int 247 248 const ( 249 // The program is allowed to exit after timeout. 250 ExitTimeout = ExitCondition(1 << iota) 251 // The program is allowed to exit with no errors. 252 ExitNormal 253 // The program is allowed to exit with errors. 254 ExitError 255 ) 256 257 type RunOptions struct { 258 // exitCondition says which exit modes should be considered as errors/OK 259 exitCondition ExitCondition 260 // BeforeContext is how many bytes BEFORE the crash description to keep in the report. 261 beforeContext int 262 // afterContext is how many bytes AFTER the crash description to keep in the report. 263 afterContext int 264 // An early notification that the command has finished / VM crashed. 265 earlyFinishCb func() 266 injectExecuting <-chan bool 267 tickerPeriod time.Duration 268 } 269 270 func WithExitCondition(exitCondition ExitCondition) func(*RunOptions) { 271 return func(opts *RunOptions) { 272 opts.exitCondition = exitCondition 273 } 274 } 275 276 func WithBeforeContext(beforeContext int) func(*RunOptions) { 277 return func(opts *RunOptions) { 278 opts.beforeContext = beforeContext 279 } 280 } 281 282 func WithInjectExecuting(injectExecuting <-chan bool) func(*RunOptions) { 283 return func(opts *RunOptions) { 284 opts.injectExecuting = injectExecuting 285 } 286 } 287 288 func WithEarlyFinishCb(cb func()) func(*RunOptions) { 289 return func(opts *RunOptions) { 290 opts.earlyFinishCb = cb 291 } 292 } 293 294 // Run runs cmd inside of the VM (think of ssh cmd) and monitors command execution 295 // and the kernel console output. It detects kernel oopses in output, lost connections, hangs, etc. 296 // Returns command+kernel output and a non-symbolized crash report (nil if no error happens). 297 func (inst *Instance) Run(ctx context.Context, reporter *report.Reporter, command string, opts ...func(*RunOptions)) ( 298 []byte, []*report.Report, error) { 299 runOptions := &RunOptions{ 300 beforeContext: 128 << 10, 301 afterContext: 128 << 10, 302 tickerPeriod: 10 * time.Second, 303 } 304 for _, opt := range opts { 305 opt(runOptions) 306 } 307 308 outc, errc, err := inst.impl.Run(ctx, command) 309 if err != nil { 310 return nil, nil, err 311 } 312 mon := &monitor{ 313 RunOptions: runOptions, 314 inst: inst, 315 outc: outc, 316 errc: errc, 317 reporter: reporter, 318 lastExecuteTime: time.Now(), 319 } 320 reps := mon.monitorExecution() 321 return mon.output, reps, nil 322 } 323 324 func (inst *Instance) Info() ([]byte, error) { 325 if ii, ok := inst.impl.(vmimpl.Infoer); ok { 326 return ii.Info() 327 } 328 return nil, nil 329 } 330 331 func (inst *Instance) diagnose(reps []*report.Report) ([]byte, bool) { 332 if len(reps) == 0 { 333 panic("reps is empty") 334 } 335 return inst.impl.Diagnose(reps[0]) 336 } 337 338 func (inst *Instance) Index() int { 339 return inst.index 340 } 341 342 func (inst *Instance) Close() error { 343 err := inst.impl.Close() 344 if retErr := os.RemoveAll(inst.workdir); err == nil { 345 err = retErr 346 } 347 inst.onClose() 348 return err 349 } 350 351 type Dispatcher = dispatcher.Pool[*Instance] 352 353 func NewDispatcher(pool *Pool, def dispatcher.Runner[*Instance]) *Dispatcher { 354 return dispatcher.NewPool(pool.count, pool.Create, def) 355 } 356 357 type monitor struct { 358 *RunOptions 359 inst *Instance 360 outc <-chan []byte 361 errc <-chan error 362 reporter *report.Reporter 363 // output is at most mon.beforeContext + len(report) + afterContext bytes. 364 output []byte 365 // curPos in the output to scan for the matches. 366 curPos int 367 lastExecuteTime time.Time 368 // extractCalled is used to prevent multiple extractError calls. 369 extractCalled bool 370 } 371 372 func (mon *monitor) monitorExecution() []*report.Report { 373 ticker := time.NewTicker(mon.tickerPeriod * mon.inst.pool.timeouts.Scale) 374 defer ticker.Stop() 375 defer func() { 376 if mon.earlyFinishCb != nil { 377 mon.earlyFinishCb() 378 } 379 }() 380 for { 381 select { 382 case err := <-mon.errc: 383 switch err { 384 case nil: 385 // The program has exited without errors, 386 // but wait for kernel output in case there is some delayed oops. 387 crash := "" 388 if mon.exitCondition&ExitNormal == 0 { 389 crash = lostConnectionCrash 390 } 391 return mon.extractErrors(crash) 392 case ErrTimeout: 393 if mon.exitCondition&ExitTimeout == 0 { 394 return mon.extractErrors(timeoutCrash) 395 } 396 return nil 397 default: 398 // Note: connection lost can race with a kernel oops message. 399 // In such case we want to return the kernel oops. 400 crash := "" 401 if mon.exitCondition&ExitError == 0 { 402 crash = lostConnectionCrash 403 } 404 return mon.extractErrors(crash) 405 } 406 case out, ok := <-mon.outc: 407 if !ok { 408 mon.outc = nil 409 continue 410 } 411 mon.inst.pool.statOutputReceived.Add(len(out)) 412 if rep, done := mon.appendOutput(out); done { 413 return rep 414 } 415 case <-mon.injectExecuting: 416 mon.lastExecuteTime = time.Now() 417 case <-ticker.C: 418 // Detect both "no output whatsoever" and "kernel episodically prints 419 // something to console, but fuzzer is not actually executing programs". 420 if time.Since(mon.lastExecuteTime) > mon.inst.pool.timeouts.NoOutput { 421 return mon.extractErrors(noOutputCrash) 422 } 423 case <-Shutdown: 424 return nil 425 } 426 } 427 } 428 429 func (mon *monitor) appendOutput(out []byte) ([]*report.Report, bool) { 430 lastPos := len(mon.output) 431 mon.output = append(mon.output, out...) 432 if bytes.Contains(mon.output[lastPos:], []byte(executedProgramsStart)) { 433 mon.lastExecuteTime = time.Now() 434 } 435 if mon.reporter.ContainsCrash(mon.output[mon.curPos:]) { 436 return mon.extractErrors("unknown error"), true 437 } 438 if len(mon.output) > 2*mon.beforeContext { 439 copy(mon.output, mon.output[len(mon.output)-mon.beforeContext:]) 440 mon.output = mon.output[:mon.beforeContext] 441 } 442 // Find the starting position for crash matching on the next iteration. 443 // We step back from the end of output by maxErrorLength to handle the case 444 // when a crash line is currently split/incomplete. And then we try to find 445 // the preceding '\n' to have a full line. This is required to handle 446 // the case when a particular pattern is ignored as crash, but a suffix 447 // of the pattern is detected as crash (e.g. "ODEBUG:" is trimmed to "BUG:"). 448 mon.curPos = len(mon.output) - maxErrorLength 449 for i := 0; i < maxErrorLength; i++ { 450 if mon.curPos <= 0 || mon.output[mon.curPos-1] == '\n' { 451 break 452 } 453 mon.curPos-- 454 } 455 mon.curPos = max(mon.curPos, 0) 456 return nil, false 457 } 458 459 func (mon *monitor) extractErrors(defaultError string) []*report.Report { 460 if mon.extractCalled { 461 panic("extractError called twice") 462 } 463 mon.extractCalled = true 464 if mon.earlyFinishCb != nil { 465 mon.earlyFinishCb() 466 mon.earlyFinishCb = nil 467 } 468 diagOutput, diagWait := []byte{}, false 469 if defaultError != "" { 470 diagOutput, diagWait = mon.inst.diagnose(mon.createReports(defaultError)) 471 } 472 // Give it some time to finish writing the error message. 473 // But don't wait for "no output", we already waited enough. 474 if defaultError != noOutputCrash || diagWait { 475 mon.waitForOutput() 476 } 477 // Check the executorPreemptedStr only for preemptible instances since executor can print 478 // the string spuriously in some cases (gets SIGTERM from test program somehow). 479 if mon.inst.pool.typ.Preemptible && bytes.Contains(mon.output, []byte(executorPreemptedStr)) { 480 return nil 481 } 482 if defaultError == "" && mon.reporter.ContainsCrash(mon.output[mon.curPos:]) { 483 // We did not call Diagnose above because we thought there is no error, so call it now. 484 diagOutput, diagWait = mon.inst.diagnose(mon.createReports(defaultError)) 485 if diagWait { 486 mon.waitForOutput() 487 } 488 } 489 reps := mon.createReports(defaultError) 490 if len(reps) == 0 { 491 return nil 492 } 493 if len(diagOutput) > 0 { 494 reps[0].Output = append(reps[0].Output, vmDiagnosisStart...) 495 reps[0].Output = append(reps[0].Output, diagOutput...) 496 } 497 return reps 498 } 499 500 func (mon *monitor) createReports(defaultError string) []*report.Report { 501 curPos := mon.curPos 502 var res []*report.Report 503 for { 504 rep := mon.reporter.ParseFrom(mon.output, curPos) 505 if rep == nil { 506 if defaultError == "" || len(res) > 0 { 507 return res 508 } 509 typ := crash.UnknownType 510 if defaultError == lostConnectionCrash { 511 typ = crash.LostConnection 512 } 513 return []*report.Report{{ 514 Title: defaultError, 515 Output: mon.output, 516 Suppressed: report.IsSuppressed(mon.reporter, mon.output), 517 Type: typ, 518 }} 519 } 520 curPos = rep.SkipPos 521 start := max(rep.StartPos-mon.beforeContext, 0) 522 end := min(rep.EndPos+mon.afterContext, len(rep.Output)) 523 rep.Output = rep.Output[start:end] 524 rep.StartPos -= start 525 rep.EndPos -= start 526 if len(res) == 0 || (len(res) > 0 && !rep.Corrupted && !rep.Suppressed) { 527 res = append(res, rep) 528 } 529 } 530 } 531 532 func (mon *monitor) waitForOutput() { 533 timer := time.NewTimer(vmimpl.WaitForOutputTimeout * mon.inst.pool.timeouts.Scale) 534 defer timer.Stop() 535 for { 536 select { 537 case out, ok := <-mon.outc: 538 if !ok { 539 return 540 } 541 mon.output = append(mon.output, out...) 542 case <-timer.C: 543 return 544 case <-Shutdown: 545 return 546 } 547 } 548 } 549 550 const ( 551 maxErrorLength = 256 552 553 lostConnectionCrash = "lost connection to test machine" 554 noOutputCrash = "no output from test machine" 555 timeoutCrash = "timed out" 556 557 executorPreemptedStr = "SYZ-EXECUTOR: PREEMPTED" 558 vmDiagnosisStart = "\nVM DIAGNOSIS:\n" 559 executedProgramsStart = "executed programs:" // syz-execprog output 560 )