github.com/google/syzkaller@v0.0.0-20240517125934-c0f1611a36d6/syz-manager/manager.go

// Copyright 2015 syzkaller project authors. All rights reserved.
// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.

package main

import (
	"bytes"
	"context"
	"encoding/json"
	"flag"
	"fmt"
	"io"
	"math/rand"
	"net"
	"os"
	"os/exec"
	"path/filepath"
	"sync"
	"sync/atomic"
	"time"

	"github.com/google/syzkaller/dashboard/dashapi"
	"github.com/google/syzkaller/pkg/asset"
	"github.com/google/syzkaller/pkg/corpus"
	"github.com/google/syzkaller/pkg/csource"
	"github.com/google/syzkaller/pkg/db"
	"github.com/google/syzkaller/pkg/flatrpc"
	"github.com/google/syzkaller/pkg/fuzzer"
	"github.com/google/syzkaller/pkg/fuzzer/queue"
	"github.com/google/syzkaller/pkg/gce"
	"github.com/google/syzkaller/pkg/hash"
	"github.com/google/syzkaller/pkg/instance"
	"github.com/google/syzkaller/pkg/ipc"
	"github.com/google/syzkaller/pkg/log"
	"github.com/google/syzkaller/pkg/mgrconfig"
	"github.com/google/syzkaller/pkg/osutil"
	"github.com/google/syzkaller/pkg/report"
	crash_pkg "github.com/google/syzkaller/pkg/report/crash"
	"github.com/google/syzkaller/pkg/repro"
	"github.com/google/syzkaller/pkg/signal"
	"github.com/google/syzkaller/pkg/stats"
	"github.com/google/syzkaller/prog"
	"github.com/google/syzkaller/sys/targets"
	"github.com/google/syzkaller/vm"
)

var (
	flagConfig = flag.String("config", "", "configuration file")
	flagDebug  = flag.Bool("debug", false, "dump all VM output to console")
	flagBench  = flag.String("bench", "", "write execution statistics into this file periodically")
)

type Manager struct {
	cfg             *mgrconfig.Config
	vmPool          *vm.Pool
	target          *prog.Target
	sysTarget       *targets.Target
	reporter        *report.Reporter
	crashdir        string
	serv            *RPCServer
	corpus          *corpus.Corpus
	corpusDB        *db.DB
	corpusDBMu      sync.Mutex // for concurrent operations on corpusDB
	corpusPreloaded chan bool
	firstConnect    atomic.Int64 // unix time, or 0 if not connected
	crashTypes      map[string]bool
	vmStop          chan bool
	enabledFeatures flatrpc.Feature
	checkDone       bool
	fresh           bool
	expertMode      bool
	nextInstanceID  atomic.Uint64

	dash *dashapi.Dashboard

	mu                    sync.Mutex
	fuzzer                atomic.Pointer[fuzzer.Fuzzer]
	phase                 int
	targetEnabledSyscalls map[*prog.Syscall]bool

	disabledHashes   map[string]struct{}
	seeds            [][]byte
	newRepros        [][]byte
	lastMinCorpus    int
	memoryLeakFrames map[string]bool
	dataRaceFrames   map[string]bool
	saturatedCalls   map[string]bool

	needMoreRepros     chan chan bool
	externalReproQueue chan *Crash
	reproRequest       chan chan map[string]bool

	// For checking that the files we are using are not changing under us.
	// Maps file name to modification time.
	usedFiles map[string]time.Time

	assetStorage *asset.Storage

	bootTime stats.AverageValue[time.Duration]

	Stats
}

const (
	// Just started, nothing done yet.
	phaseInit = iota
	// Corpus is loaded and the machine is checked.
	phaseLoadedCorpus
	// Triaged all inputs from the corpus.
	// This is when we start querying the hub and minimizing the persistent corpus.
	phaseTriagedCorpus
	// Made the first request to the hub.
	phaseQueriedHub
	// Triaged all new inputs from the hub.
	// This is when we start reproducing crashes.
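	// Without a configured hub the manager jumps from phaseLoadedCorpus
	// straight to this phase (see fuzzerLoop below).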
	phaseTriagedHub
)

const currentDBVersion = 4

type Crash struct {
	instanceName  string
	fromHub       bool // this crash was created based on a repro from syz-hub
	fromDashboard bool // ... or from the dashboard
	*report.Report
	machineInfo []byte
}

func main() {
	if prog.GitRevision == "" {
		log.Fatalf("bad syz-manager build: build with make, run bin/syz-manager")
	}
	flag.Parse()
	log.EnableLogCaching(1000, 1<<20)
	cfg, err := mgrconfig.LoadFile(*flagConfig)
	if err != nil {
		log.Fatalf("%v", err)
	}
	if cfg.DashboardAddr != "" {
		// This lets us better distinguish logs of individual syz-manager instances.
		log.SetName(cfg.Name)
	}
	RunManager(cfg)
}

func RunManager(cfg *mgrconfig.Config) {
	var vmPool *vm.Pool
	// Type "none" is a special case for debugging/development when the manager
	// does not start any VMs, but instead you start them manually
	// and start syz-fuzzer there.
	if cfg.Type != "none" {
		var err error
		vmPool, err = vm.Create(cfg, *flagDebug)
		if err != nil {
			log.Fatalf("%v", err)
		}
	}

	crashdir := filepath.Join(cfg.Workdir, "crashes")
	osutil.MkdirAll(crashdir)

	reporter, err := report.NewReporter(cfg)
	if err != nil {
		log.Fatalf("%v", err)
	}

	corpusUpdates := make(chan corpus.NewItemEvent, 32)
	mgr := &Manager{
		cfg:                cfg,
		vmPool:             vmPool,
		corpus:             corpus.NewMonitoredCorpus(context.Background(), corpusUpdates),
		corpusPreloaded:    make(chan bool),
		target:             cfg.Target,
		sysTarget:          cfg.SysTarget,
		reporter:           reporter,
		crashdir:           crashdir,
		crashTypes:         make(map[string]bool),
		disabledHashes:     make(map[string]struct{}),
		memoryLeakFrames:   make(map[string]bool),
		dataRaceFrames:     make(map[string]bool),
		fresh:              true,
		vmStop:             make(chan bool),
		externalReproQueue: make(chan *Crash, 10),
		needMoreRepros:     make(chan chan bool),
		reproRequest:       make(chan chan map[string]bool),
		usedFiles:          make(map[string]time.Time),
		saturatedCalls:     make(map[string]bool),
	}

	mgr.initStats()
	go mgr.preloadCorpus()
	mgr.initHTTP() // Creates the HTTP server.
	mgr.collectUsedFiles()
	go mgr.corpusInputHandler(corpusUpdates)

	// Create the RPC server for fuzzers.
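	// Fuzzer processes inside the VMs connect back to this server (through a
	// forwarded port) to report machine-check results and to exchange inputs
	// and coverage signal.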
	mgr.serv, err = startRPCServer(mgr)
	if err != nil {
		log.Fatalf("failed to create rpc server: %v", err)
	}

	if cfg.DashboardAddr != "" {
		mgr.dash, err = dashapi.New(cfg.DashboardClient, cfg.DashboardAddr, cfg.DashboardKey)
		if err != nil {
			log.Fatalf("failed to create dashapi connection: %v", err)
		}
	}

	if !cfg.AssetStorage.IsEmpty() {
		mgr.assetStorage, err = asset.StorageFromConfig(cfg.AssetStorage, mgr.dash)
		if err != nil {
			log.Fatalf("failed to init asset storage: %v", err)
		}
	}

	if *flagBench != "" {
		mgr.initBench()
	}

	go mgr.heartbeatLoop()
	osutil.HandleInterrupts(vm.Shutdown)
	if mgr.vmPool == nil {
		log.Logf(0, "no VMs started (type=none)")
		log.Logf(0, "you are supposed to start syz-fuzzer manually as:")
		log.Logf(0, "syz-fuzzer -manager=manager.ip:%v [other flags as necessary]", mgr.serv.port)
		<-vm.Shutdown
		return
	}
	mgr.vmLoop()
}

func (mgr *Manager) heartbeatLoop() {
	lastTime := time.Now()
	for now := range time.NewTicker(10 * time.Second).C {
		diff := int(now.Sub(lastTime))
		lastTime = now
		if mgr.firstConnect.Load() == 0 {
			continue
		}
		mgr.statFuzzingTime.Add(diff * mgr.statNumFuzzing.Val())
		buf := new(bytes.Buffer)
		for _, stat := range stats.Collect(stats.Console) {
			fmt.Fprintf(buf, "%v=%v ", stat.Name, stat.Value)
		}
		log.Logf(0, "%s", buf.String())
	}
}

func (mgr *Manager) initBench() {
	f, err := os.OpenFile(*flagBench, os.O_WRONLY|os.O_CREATE|os.O_EXCL, osutil.DefaultFilePerm)
	if err != nil {
		log.Fatalf("failed to open bench file: %v", err)
	}
	go func() {
		for range time.NewTicker(time.Minute).C {
			vals := make(map[string]int)
			for _, stat := range stats.Collect(stats.All) {
				vals[stat.Name] = stat.V
			}
			data, err := json.MarshalIndent(vals, "", " ")
			if err != nil {
				log.Fatalf("failed to serialize bench data")
			}
			if _, err := f.Write(append(data, '\n')); err != nil {
				log.Fatalf("failed to write bench data")
			}
		}
	}()
}

type RunResult struct {
	idx   int
	crash *Crash
	err   error
}

type ReproResult struct {
	instances     []int
	report0       *report.Report // the original report we started reproducing
	repro         *repro.Result
	strace        *repro.StraceResult
	stats         *repro.Stats
	err           error
	fromHub       bool
	fromDashboard bool
	originalTitle string // crash title before we started bug reproduction
}

// Manager needs to be refactored (#605).
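// vmLoop is the main scheduling loop of the manager: it boots VM instances,
// splits them between fuzzing and crash reproduction, collects run results,
// and drains outstanding work on shutdown.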
// nolint: gocyclo, gocognit, funlen
func (mgr *Manager) vmLoop() {
	log.Logf(0, "booting test machines...")
	log.Logf(0, "wait for the connection from test machine...")
	instancesPerRepro := 3
	vmCount := mgr.vmPool.Count()
	maxReproVMs := vmCount - mgr.cfg.FuzzingVMs
	if instancesPerRepro > maxReproVMs && maxReproVMs > 0 {
		instancesPerRepro = maxReproVMs
	}
	instances := SequentialResourcePool(vmCount, 5*time.Second)
	runDone := make(chan *RunResult, 1)
	pendingRepro := make(map[*Crash]bool)
	reproducing := make(map[string]bool)
	var reproQueue []*Crash
	reproDone := make(chan *ReproResult, 1)
	stopPending := false
	shutdown := vm.Shutdown
	for shutdown != nil || instances.Len() != vmCount {
		mgr.mu.Lock()
		phase := mgr.phase
		mgr.mu.Unlock()

		for crash := range pendingRepro {
			if reproducing[crash.Title] {
				continue
			}
			delete(pendingRepro, crash)
			if !mgr.needRepro(crash) {
				continue
			}
			log.Logf(1, "loop: add to repro queue '%v'", crash.Title)
			reproducing[crash.Title] = true
			reproQueue = append(reproQueue, crash)
		}

		log.Logf(1, "loop: phase=%v shutdown=%v instances=%v/%v %+v repro: pending=%v reproducing=%v queued=%v",
			phase, shutdown == nil, instances.Len(), vmCount, instances.Snapshot(),
			len(pendingRepro), len(reproducing), len(reproQueue))

		canRepro := func() bool {
			return phase >= phaseTriagedHub && len(reproQueue) != 0 &&
				(mgr.statNumReproducing.Val()+1)*instancesPerRepro <= maxReproVMs
		}

		if shutdown != nil {
			for canRepro() {
				vmIndexes := instances.Take(instancesPerRepro)
				if vmIndexes == nil {
					break
				}
				last := len(reproQueue) - 1
				crash := reproQueue[last]
				reproQueue[last] = nil
				reproQueue = reproQueue[:last]
				mgr.statNumReproducing.Add(1)
				log.Logf(0, "loop: starting repro of '%v' on instances %+v", crash.Title, vmIndexes)
				go func() {
					reproDone <- mgr.runRepro(crash, vmIndexes, instances.Put)
				}()
			}
			for !canRepro() {
				idx := instances.TakeOne()
				if idx == nil {
					break
				}
				log.Logf(1, "loop: starting instance %v", *idx)
				go func() {
					crash, err := mgr.runInstance(*idx)
					runDone <- &RunResult{*idx, crash, err}
				}()
			}
		}

		var stopRequest chan bool
		if !stopPending && canRepro() {
			stopRequest = mgr.vmStop
		}

	wait:
		select {
		case <-instances.Freed:
			// An instance has been released.
		case stopRequest <- true:
			log.Logf(1, "loop: issued stop request")
			stopPending = true
		case res := <-runDone:
			log.Logf(1, "loop: instance %v finished, crash=%v", res.idx, res.crash != nil)
			if res.err != nil && shutdown != nil {
				log.Logf(0, "%v", res.err)
			}
			stopPending = false
			instances.Put(res.idx)
			// On shutdown qemu crashes with "qemu: terminating on signal 2",
			// which we detect as "lost connection". Don't save that as a crash.
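			// Once shutdown is nil, a stop has been requested, so any report
			// received at this point is likely such an artifact.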
			if shutdown != nil && res.crash != nil {
				needRepro := mgr.saveCrash(res.crash)
				if needRepro {
					log.Logf(1, "loop: add pending repro for '%v'", res.crash.Title)
					pendingRepro[res.crash] = true
				}
			}
		case res := <-reproDone:
			mgr.statNumReproducing.Add(-1)
			crepro := false
			title := ""
			if res.repro != nil {
				crepro = res.repro.CRepro
				title = res.repro.Report.Title
			}
			log.Logf(0, "loop: repro on %+v finished '%v', repro=%v crepro=%v desc='%v'"+
				" hub=%v from_dashboard=%v",
				res.instances, res.report0.Title, res.repro != nil, crepro, title,
				res.fromHub, res.fromDashboard,
			)
			if res.err != nil {
				reportReproError(res.err)
			}
			delete(reproducing, res.report0.Title)
			if res.repro == nil {
				if res.fromHub {
					log.Logf(1, "repro '%v' came from syz-hub, not reporting the failure",
						res.report0.Title)
				} else {
					log.Logf(1, "report repro failure of '%v'", res.report0.Title)
					mgr.saveFailedRepro(res.report0, res.stats)
				}
			} else {
				mgr.saveRepro(res)
			}
		case <-shutdown:
			log.Logf(1, "loop: shutting down...")
			shutdown = nil
		case crash := <-mgr.externalReproQueue:
			log.Logf(1, "loop: got repro request")
			pendingRepro[crash] = true
		case reply := <-mgr.needMoreRepros:
			reply <- phase >= phaseTriagedHub &&
				len(reproQueue)+len(pendingRepro)+len(reproducing) == 0
			goto wait
		case reply := <-mgr.reproRequest:
			repros := make(map[string]bool)
			for title := range reproducing {
				repros[title] = true
			}
			reply <- repros
			goto wait
		}
	}
}

func reportReproError(err error) {
	shutdown := false
	select {
	case <-vm.Shutdown:
		shutdown = true
	default:
	}

	switch err {
	case repro.ErrNoPrograms:
		// This is not extraordinary as programs are collected via SSH.
		log.Logf(0, "repro failed: %v", err)
		return
	case repro.ErrNoVMs:
		// This error is to be expected if we're shutting down.
		if shutdown {
			return
		}
	}
	// Report everything else as errors.
	log.Errorf("repro failed: %v", err)
}

func (mgr *Manager) runRepro(crash *Crash, vmIndexes []int, putInstances func(...int)) *ReproResult {
	res, stats, err := repro.Run(crash.Output, mgr.cfg, mgr.enabledFeatures, mgr.reporter, mgr.vmPool, vmIndexes)
	ret := &ReproResult{
		instances:     vmIndexes,
		report0:       crash.Report,
		repro:         res,
		stats:         stats,
		err:           err,
		fromHub:       crash.fromHub,
		fromDashboard: crash.fromDashboard,
		originalTitle: crash.Title,
	}
	if err == nil && res != nil && mgr.cfg.StraceBin != "" {
		// We need only one instance to get strace output, release the rest.
		putInstances(vmIndexes[1:]...)
		defer putInstances(vmIndexes[0])

		const straceAttempts = 2
		for i := 1; i <= straceAttempts; i++ {
			strace := repro.RunStrace(res, mgr.cfg, mgr.reporter, mgr.vmPool, vmIndexes[0])
			sameBug := strace.IsSameBug(res)
			log.Logf(0, "strace run attempt %d/%d for '%s': same bug %v, error %v",
				i, straceAttempts, res.Report.Title, sameBug, strace.Error)
			// We only want to save strace output if it resulted in the same bug.
			// Otherwise, it will be hard to reproduce on syzbot and will confuse users.
			if sameBug {
				ret.strace = strace
				break
			}
		}
	} else {
		putInstances(vmIndexes...)
	}
	return ret
}

type ResourcePool struct {
	ids   []int
	mu    sync.RWMutex
	Freed chan interface{}
}

func SequentialResourcePool(count int, delay time.Duration) *ResourcePool {
	ret := &ResourcePool{Freed: make(chan interface{}, 1)}
	go func() {
		for i := 0; i < count; i++ {
			ret.Put(i)
			time.Sleep(delay)
		}
	}()
	return ret
}

func (pool *ResourcePool) Put(ids ...int) {
	pool.mu.Lock()
	defer pool.mu.Unlock()
	pool.ids = append(pool.ids, ids...)
	// Notify the listener.
	select {
	case pool.Freed <- true:
	default:
	}
}

func (pool *ResourcePool) Len() int {
	pool.mu.RLock()
	defer pool.mu.RUnlock()
	return len(pool.ids)
}

func (pool *ResourcePool) Snapshot() []int {
	pool.mu.RLock()
	defer pool.mu.RUnlock()
	return append([]int{}, pool.ids...)
}

func (pool *ResourcePool) Take(cnt int) []int {
	pool.mu.Lock()
	defer pool.mu.Unlock()
	totalItems := len(pool.ids)
	if totalItems < cnt {
		return nil
	}
	ret := append([]int{}, pool.ids[totalItems-cnt:]...)
	pool.ids = pool.ids[:totalItems-cnt]
	return ret
}

func (pool *ResourcePool) TakeOne() *int {
	ret := pool.Take(1)
	if ret == nil {
		return nil
	}
	return &ret[0]
}

func (mgr *Manager) preloadCorpus() {
	corpusDB, err := db.Open(filepath.Join(mgr.cfg.Workdir, "corpus.db"), true)
	if err != nil {
		if corpusDB == nil {
			log.Fatalf("failed to open corpus database: %v", err)
		}
		log.Errorf("read %v inputs from corpus and got error: %v", len(corpusDB.Records), err)
	}
	mgr.corpusDB = corpusDB

	if seedDir := filepath.Join(mgr.cfg.Syzkaller, "sys", mgr.cfg.TargetOS, "test"); osutil.IsExist(seedDir) {
		seeds, err := os.ReadDir(seedDir)
		if err != nil {
			log.Fatalf("failed to read seeds dir: %v", err)
		}
		for _, seed := range seeds {
			data, err := os.ReadFile(filepath.Join(seedDir, seed.Name()))
			if err != nil {
				log.Fatalf("failed to read seed %v: %v", seed.Name(), err)
			}
			mgr.seeds = append(mgr.seeds, data)
		}
	}
	close(mgr.corpusPreloaded)
}

func (mgr *Manager) loadCorpus() {
	<-mgr.corpusPreloaded
	// By default we don't re-minimize/re-smash programs from the corpus;
	// it takes lots of time on start and is unnecessary.
	// However, on version bumps we can selectively re-minimize/re-smash.
	minimized, smashed := true, true
	switch mgr.corpusDB.Version {
	case 0:
		// Version 0 had broken minimization, so we need to re-minimize.
		minimized = false
		fallthrough
	case 1:
		// Version 1->2: memory is preallocated so lots of mmaps become unnecessary.
		minimized = false
		fallthrough
	case 2:
		// Version 2->3: big-endian hints.
		smashed = false
		fallthrough
	case 3:
		// Version 3->4: to shake things up.
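		// Note: the fallthrough chain applies every migration between the
		// stored version and currentDBVersion.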
		minimized = false
		fallthrough
	case currentDBVersion:
	}
	var candidates []fuzzer.Candidate
	broken := 0
	for key, rec := range mgr.corpusDB.Records {
		drop, item := mgr.loadProg(rec.Val, minimized, smashed)
		if drop {
			mgr.corpusDB.Delete(key)
			broken++
		}
		if item != nil {
			candidates = append(candidates, *item)
		}
	}
	mgr.fresh = len(mgr.corpusDB.Records) == 0
	seeds := 0
	for _, seed := range mgr.seeds {
		_, item := mgr.loadProg(seed, true, false)
		if item != nil {
			candidates = append(candidates, *item)
			seeds++
		}
	}
	log.Logf(0, "%-24v: %v (%v broken, %v seeds)", "corpus", len(candidates), broken, seeds)
	mgr.seeds = nil

	// We duplicate all inputs in the corpus and shuffle the second part.
	// This solves the following problem. A fuzzer can crash while triaging candidates,
	// in such a case it will also lose all cached candidates. Or an input can be somewhat flaky
	// and not give the same coverage on the first try. So we give each input a second chance.
	// Shuffling should alleviate deterministically losing the same inputs when a fuzzer crashes.
	candidates = append(candidates, candidates...)
	shuffle := candidates[len(candidates)/2:]
	rand.Shuffle(len(shuffle), func(i, j int) {
		shuffle[i], shuffle[j] = shuffle[j], shuffle[i]
	})
	if mgr.phase != phaseInit {
		panic(fmt.Sprintf("loadCorpus: bad phase %v", mgr.phase))
	}
	mgr.phase = phaseLoadedCorpus
	mgr.fuzzer.Load().AddCandidates(candidates)
}

// Returns (delete item from the corpus, a fuzzer.Candidate object).
func (mgr *Manager) loadProg(data []byte, minimized, smashed bool) (drop bool, candidate *fuzzer.Candidate) {
	p, disabled, bad := parseProgram(mgr.target, mgr.targetEnabledSyscalls, data)
	if bad != nil {
		return true, nil
	}
	if disabled {
		if mgr.cfg.PreserveCorpus {
			// This program contains a disabled syscall.
			// We won't execute it, but remember its hash so
			// it is not deleted during minimization.
			mgr.disabledHashes[hash.String(data)] = struct{}{}
		} else {
			// We cut out the disabled syscalls and let syz-fuzzer retriage and
			// minimize what remains of the prog. The original prog will be
			// deleted from the corpus.
			leftover := programLeftover(mgr.target, mgr.targetEnabledSyscalls, data)
			if leftover != nil {
				candidate = &fuzzer.Candidate{
					Prog:      leftover,
					Minimized: false,
					Smashed:   smashed,
				}
			}
		}
		return false, candidate
	}
	return false, &fuzzer.Candidate{
		Prog:      p,
		Minimized: minimized,
		Smashed:   smashed,
	}
}

func programLeftover(target *prog.Target, enabled map[*prog.Syscall]bool, data []byte) *prog.Prog {
	p, err := target.Deserialize(data, prog.NonStrict)
	if err != nil {
		panic(fmt.Sprintf("subsequent deserialization failed: %s", data))
	}
	for i := 0; i < len(p.Calls); {
		c := p.Calls[i]
		if !enabled[c.Meta] {
			p.RemoveCall(i)
			continue
		}
		i++
	}
	return p
}

func parseProgram(target *prog.Target, enabled map[*prog.Syscall]bool, data []byte) (
	p *prog.Prog, disabled bool, err error) {
	p, err = target.Deserialize(data, prog.NonStrict)
	if err != nil {
		return
	}
	if len(p.Calls) > prog.MaxCalls {
		return nil, false, fmt.Errorf("longer than %d calls", prog.MaxCalls)
	}
	// For some yet unknown reasons, programs with fail_nth > 0 may sneak in. Ignore them.
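	// (fail_nth enables fault injection for the n-th syscall invocation,
	// which we don't want in corpus programs.)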
	for _, call := range p.Calls {
		if call.Props.FailNth > 0 {
			return nil, false, fmt.Errorf("input has fail_nth > 0")
		}
	}
	for _, c := range p.Calls {
		if !enabled[c.Meta] {
			return p, true, nil
		}
	}
	return p, false, nil
}

func (mgr *Manager) runInstance(index int) (*Crash, error) {
	mgr.checkUsedFiles()
	var maxSignal signal.Signal
	if fuzzer := mgr.fuzzer.Load(); fuzzer != nil {
		maxSignal = fuzzer.Cover.CopyMaxSignal()
	}
	// Use unique instance names to prevent name collisions in case of untimely RPC messages.
	instanceName := fmt.Sprintf("vm-%d", mgr.nextInstanceID.Add(1))
	injectLog := make(chan []byte, 10)
	mgr.serv.createInstance(instanceName, maxSignal, injectLog)

	rep, vmInfo, err := mgr.runInstanceInner(index, instanceName, injectLog)
	machineInfo := mgr.serv.shutdownInstance(instanceName, rep != nil)
	if len(vmInfo) != 0 {
		machineInfo = append(append(vmInfo, '\n'), machineInfo...)
	}

	// An error that is not a VM crash.
	if err != nil {
		return nil, err
	}
	// No crash.
	if rep == nil {
		return nil, nil
	}
	crash := &Crash{
		instanceName: instanceName,
		Report:       rep,
		machineInfo:  machineInfo,
	}
	return crash, nil
}

func (mgr *Manager) runInstanceInner(index int, instanceName string, injectLog <-chan []byte) (
	*report.Report, []byte, error) {
	start := time.Now()

	inst, err := mgr.vmPool.Create(index)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to create instance: %w", err)
	}
	defer inst.Close()

	fwdAddr, err := inst.Forward(mgr.serv.port)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to setup port forwarding: %w", err)
	}

	fuzzerBin, err := inst.Copy(mgr.cfg.FuzzerBin)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to copy binary: %w", err)
	}

	// If ExecutorBin is provided, it means that syz-executor is already in the image,
	// so there is no need to copy it.
	executorBin := mgr.sysTarget.ExecutorBin
	if executorBin == "" {
		executorBin, err = inst.Copy(mgr.cfg.ExecutorBin)
		if err != nil {
			return nil, nil, fmt.Errorf("failed to copy binary: %w", err)
		}
	}

	fuzzerV := 0
	procs := mgr.cfg.Procs
	if *flagDebug {
		fuzzerV = 100
		procs = 1
	}

	// Run the fuzzer binary.
	mgr.bootTime.Save(time.Since(start))
	start = time.Now()
	mgr.statNumFuzzing.Add(1)
	defer mgr.statNumFuzzing.Add(-1)

	args := &instance.FuzzerCmdArgs{
		Fuzzer:    fuzzerBin,
		Executor:  executorBin,
		Name:      instanceName,
		OS:        mgr.cfg.TargetOS,
		Arch:      mgr.cfg.TargetArch,
		FwdAddr:   fwdAddr,
		Sandbox:   mgr.cfg.Sandbox,
		Procs:     procs,
		Verbosity: fuzzerV,
		Cover:     mgr.cfg.Cover,
		Debug:     *flagDebug,
		Test:      false,
		Optional: &instance.OptionalFuzzerArgs{
			Slowdown:   mgr.cfg.Timeouts.Slowdown,
			SandboxArg: mgr.cfg.SandboxArg,
			PprofPort:  inst.PprofPort(),
		},
	}
	cmd := instance.FuzzerCmd(args)
	_, rep, err := inst.Run(mgr.cfg.Timeouts.VMRunningTime, mgr.reporter, cmd,
		vm.ExitTimeout, vm.StopChan(mgr.vmStop), vm.InjectOutput(injectLog),
		vm.EarlyFinishCb(func() {
			// Depending on the crash type and kernel config, fuzzing may continue
			// running for several seconds even after the kernel has printed a crash report.
			// This litters the log and we want to prevent it.
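			// Tell the fuzzer on this instance to stop executing programs.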
			mgr.serv.stopFuzzing(instanceName)
		}),
	)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to run fuzzer: %w", err)
	}
	if rep == nil {
		// This is the only "OK" outcome.
		log.Logf(0, "%s: running for %v, restarting", instanceName, time.Since(start))
		return nil, nil, nil
	}
	vmInfo, err := inst.Info()
	if err != nil {
		vmInfo = []byte(fmt.Sprintf("error getting VM info: %v\n", err))
	}
	return rep, vmInfo, nil
}

func (mgr *Manager) emailCrash(crash *Crash) {
	if len(mgr.cfg.EmailAddrs) == 0 {
		return
	}
	args := []string{"-s", "syzkaller: " + crash.Title}
	args = append(args, mgr.cfg.EmailAddrs...)
	log.Logf(0, "sending email to %v", mgr.cfg.EmailAddrs)

	cmd := exec.Command("mailx", args...)
	cmd.Stdin = bytes.NewReader(crash.Report.Report)
	if _, err := osutil.Run(10*time.Minute, cmd); err != nil {
		log.Logf(0, "failed to send email: %v", err)
	}
}

func (mgr *Manager) saveCrash(crash *Crash) bool {
	if err := mgr.reporter.Symbolize(crash.Report); err != nil {
		log.Errorf("failed to symbolize report: %v", err)
	}
	if crash.Type == crash_pkg.MemoryLeak {
		mgr.mu.Lock()
		mgr.memoryLeakFrames[crash.Frame] = true
		mgr.mu.Unlock()
	}
	if crash.Type == crash_pkg.DataRace {
		mgr.mu.Lock()
		mgr.dataRaceFrames[crash.Frame] = true
		mgr.mu.Unlock()
	}
	flags := ""
	if crash.Corrupted {
		flags += " [corrupted]"
	}
	if crash.Suppressed {
		flags += " [suppressed]"
	}
	log.Logf(0, "%s: crash: %v%v", crash.instanceName, crash.Title, flags)

	if crash.Suppressed {
		// Collect all of them into a single bucket so that it's possible to control and assess them,
		// e.g. if there are some spikes in suppressed reports.
		crash.Title = "suppressed report"
		mgr.statSuppressed.Add(1)
	}

	mgr.statCrashes.Add(1)
	mgr.mu.Lock()
	if !mgr.crashTypes[crash.Title] {
		mgr.crashTypes[crash.Title] = true
		mgr.statCrashTypes.Add(1)
	}
	mgr.mu.Unlock()

	if mgr.dash != nil {
		if crash.Type == crash_pkg.MemoryLeak {
			return true
		}
		dc := &dashapi.Crash{
			BuildID:     mgr.cfg.Tag,
			Title:       crash.Title,
			AltTitles:   crash.AltTitles,
			Corrupted:   crash.Corrupted,
			Suppressed:  crash.Suppressed,
			Recipients:  crash.Recipients.ToDash(),
			Log:         crash.Output,
			Report:      crash.Report.Report,
			MachineInfo: crash.machineInfo,
		}
		setGuiltyFiles(dc, crash.Report)
		resp, err := mgr.dash.ReportCrash(dc)
		if err != nil {
			log.Logf(0, "failed to report crash to dashboard: %v", err)
		} else {
			// Don't store the crash locally if we've successfully
			// uploaded it to the dashboard. It would just eat disk space.
			return resp.NeedRepro
		}
	}

	sig := hash.Hash([]byte(crash.Title))
	id := sig.String()
	dir := filepath.Join(mgr.crashdir, id)
	osutil.MkdirAll(dir)
	if err := osutil.WriteFile(filepath.Join(dir, "description"), []byte(crash.Title+"\n")); err != nil {
		log.Logf(0, "failed to write crash: %v", err)
	}

	// Save up to mgr.cfg.MaxCrashLogs reports, overwriting the oldest once we've reached that number.
	// Newer reports are generally more useful. Overwriting is also needed
	// to be able to understand whether a particular bug still happens or is already fixed.
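	// The loop below picks the first free log slot, or the slot with the
	// oldest log file if all MaxCrashLogs slots are taken.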
	oldestI := 0
	var oldestTime time.Time
	for i := 0; i < mgr.cfg.MaxCrashLogs; i++ {
		info, err := os.Stat(filepath.Join(dir, fmt.Sprintf("log%v", i)))
		if err != nil {
			oldestI = i
			if i == 0 {
				go mgr.emailCrash(crash)
			}
			break
		}
		if oldestTime.IsZero() || info.ModTime().Before(oldestTime) {
			oldestI = i
			oldestTime = info.ModTime()
		}
	}
	writeOrRemove := func(name string, data []byte) {
		filename := filepath.Join(dir, name+fmt.Sprint(oldestI))
		if len(data) == 0 {
			os.Remove(filename)
			return
		}
		osutil.WriteFile(filename, data)
	}
	writeOrRemove("log", crash.Output)
	writeOrRemove("tag", []byte(mgr.cfg.Tag))
	writeOrRemove("report", crash.Report.Report)
	writeOrRemove("machineInfo", crash.machineInfo)
	return mgr.needLocalRepro(crash)
}

const maxReproAttempts = 3

func (mgr *Manager) needLocalRepro(crash *Crash) bool {
	if !mgr.cfg.Reproduce || crash.Corrupted || crash.Suppressed {
		return false
	}
	sig := hash.Hash([]byte(crash.Title))
	dir := filepath.Join(mgr.crashdir, sig.String())
	if osutil.IsExist(filepath.Join(dir, "repro.prog")) {
		return false
	}
	for i := 0; i < maxReproAttempts; i++ {
		if !osutil.IsExist(filepath.Join(dir, fmt.Sprintf("repro%v", i))) {
			return true
		}
	}
	return false
}

func (mgr *Manager) needRepro(crash *Crash) bool {
	if crash.fromHub || crash.fromDashboard {
		return true
	}
	if !mgr.checkDone || (mgr.enabledFeatures&flatrpc.FeatureLeak != 0 &&
		crash.Type != crash_pkg.MemoryLeak) {
		// Leak checking is very slow, don't bother reproducing other crashes on a leak instance.
		return false
	}
	if mgr.dash == nil {
		return mgr.needLocalRepro(crash)
	}
	cid := &dashapi.CrashID{
		BuildID:      mgr.cfg.Tag,
		Title:        crash.Title,
		Corrupted:    crash.Corrupted,
		Suppressed:   crash.Suppressed,
		MayBeMissing: crash.Type == crash_pkg.MemoryLeak, // we did not send the original crash w/o repro
	}
	needRepro, err := mgr.dash.NeedRepro(cid)
	if err != nil {
		log.Logf(0, "dashboard.NeedRepro failed: %v", err)
	}
	return needRepro
}

func truncateReproLog(log []byte) []byte {
	// Repro logs can get quite large and we have trouble sending large API requests (see #4495).
	// Let's truncate the log to a 512KB prefix and 512KB suffix.
	return report.Truncate(log, 512000, 512000)
}

func (mgr *Manager) saveFailedRepro(rep *report.Report, stats *repro.Stats) {
	reproLog := fullReproLog(stats)
	if mgr.dash != nil {
		if rep.Type == crash_pkg.MemoryLeak {
			// Don't send failed leak repro attempts to the dashboard,
			// as we did not send the crash itself.
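			// (saveCrash skips dashboard reporting for memory leaks, see above.)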
			log.Logf(1, "failed repro of '%v': not sending because of the memleak type", rep.Title)
			return
		}
		cid := &dashapi.CrashID{
			BuildID:      mgr.cfg.Tag,
			Title:        rep.Title,
			Corrupted:    rep.Corrupted,
			Suppressed:   rep.Suppressed,
			MayBeMissing: rep.Type == crash_pkg.MemoryLeak,
			ReproLog:     truncateReproLog(reproLog),
		}
		if err := mgr.dash.ReportFailedRepro(cid); err != nil {
			log.Logf(0, "failed to report failed repro to dashboard (log size %d): %v",
				len(reproLog), err)
		} else {
			return
		}
	}
	dir := filepath.Join(mgr.crashdir, hash.String([]byte(rep.Title)))
	osutil.MkdirAll(dir)
	for i := 0; i < maxReproAttempts; i++ {
		name := filepath.Join(dir, fmt.Sprintf("repro%v", i))
		if !osutil.IsExist(name) && len(reproLog) > 0 {
			osutil.WriteFile(name, reproLog)
			break
		}
	}
}

func (mgr *Manager) saveRepro(res *ReproResult) {
	repro := res.repro
	opts := fmt.Sprintf("# %+v\n", repro.Opts)
	progText := repro.Prog.Serialize()

	// Append this repro to the repro list to send to the hub if it didn't come from the hub originally.
	if !res.fromHub {
		progForHub := []byte(fmt.Sprintf("# %+v\n# %v\n# %v\n%s",
			repro.Opts, repro.Report.Title, mgr.cfg.Tag, progText))
		mgr.mu.Lock()
		mgr.newRepros = append(mgr.newRepros, progForHub)
		mgr.mu.Unlock()
	}

	var cprogText []byte
	if repro.CRepro {
		cprog, err := csource.Write(repro.Prog, repro.Opts)
		if err == nil {
			formatted, err := csource.Format(cprog)
			if err == nil {
				cprog = formatted
			}
			cprogText = cprog
		} else {
			log.Logf(0, "failed to write C source: %v", err)
		}
	}

	if mgr.dash != nil {
		// Note: we intentionally don't set Corrupted for reproducers:
		// 1. This is reproducible, so it can be debugged even with a corrupted report.
		// 2. The repro was re-tried 3 times and still got a corrupted report at the end,
		//    so maybe corrupted report detection is broken.
		// 3. Reproduction is expensive, so it's good to persist the result.

		report := repro.Report
		output := report.Output

		var crashFlags dashapi.CrashFlags
		if res.strace != nil {
			// If syzkaller managed to successfully run the repro with strace, send
			// the report and the output generated under strace.
			report = res.strace.Report
			output = res.strace.Output
			crashFlags = dashapi.CrashUnderStrace
		}

		dc := &dashapi.Crash{
			BuildID:       mgr.cfg.Tag,
			Title:         report.Title,
			AltTitles:     report.AltTitles,
			Suppressed:    report.Suppressed,
			Recipients:    report.Recipients.ToDash(),
			Log:           output,
			Flags:         crashFlags,
			Report:        report.Report,
			ReproOpts:     repro.Opts.Serialize(),
			ReproSyz:      progText,
			ReproC:        cprogText,
			ReproLog:      truncateReproLog(fullReproLog(res.stats)),
			Assets:        mgr.uploadReproAssets(repro),
			OriginalTitle: res.originalTitle,
		}
		setGuiltyFiles(dc, report)
		if _, err := mgr.dash.ReportCrash(dc); err != nil {
			log.Logf(0, "failed to report repro to dashboard: %v", err)
		} else {
			// Don't store the crash locally if we've successfully
			// uploaded it to the dashboard. It would just eat disk space.
			return
		}
	}

	rep := repro.Report
	dir := filepath.Join(mgr.crashdir, hash.String([]byte(rep.Title)))
	osutil.MkdirAll(dir)

	if err := osutil.WriteFile(filepath.Join(dir, "description"), []byte(rep.Title+"\n")); err != nil {
		log.Logf(0, "failed to write crash: %v", err)
	}
	osutil.WriteFile(filepath.Join(dir, "repro.prog"), append([]byte(opts), progText...))
	if mgr.cfg.Tag != "" {
		osutil.WriteFile(filepath.Join(dir, "repro.tag"), []byte(mgr.cfg.Tag))
	}
	if len(rep.Output) > 0 {
		osutil.WriteFile(filepath.Join(dir, "repro.log"), rep.Output)
	}
	if len(rep.Report) > 0 {
		osutil.WriteFile(filepath.Join(dir, "repro.report"), rep.Report)
	}
	if len(cprogText) > 0 {
		osutil.WriteFile(filepath.Join(dir, "repro.cprog"), cprogText)
	}
	repro.Prog.ForEachAsset(func(name string, typ prog.AssetType, r io.Reader) {
		fileName := filepath.Join(dir, name+".gz")
		if err := osutil.WriteGzipStream(fileName, r); err != nil {
			log.Logf(0, "failed to write crash asset: type %d, write error %v", typ, err)
		}
	})
	if res.strace != nil {
		// Unlike dashboard reporting, we save strace output separately from the original log.
		if res.strace.Error != nil {
			osutil.WriteFile(filepath.Join(dir, "strace.error"),
				[]byte(fmt.Sprintf("%v", res.strace.Error)))
		}
		if len(res.strace.Output) > 0 {
			osutil.WriteFile(filepath.Join(dir, "strace.log"), res.strace.Output)
		}
	}
	if reproLog := fullReproLog(res.stats); len(reproLog) > 0 {
		osutil.WriteFile(filepath.Join(dir, "repro.stats"), reproLog)
	}
}

func (mgr *Manager) uploadReproAssets(repro *repro.Result) []dashapi.NewAsset {
	if mgr.assetStorage == nil {
		return nil
	}

	ret := []dashapi.NewAsset{}
	repro.Prog.ForEachAsset(func(name string, typ prog.AssetType, r io.Reader) {
		dashTyp, ok := map[prog.AssetType]dashapi.AssetType{
			prog.MountInRepro: dashapi.MountInRepro,
		}[typ]
		if !ok {
			panic("unknown extracted prog asset")
		}
		asset, err := mgr.assetStorage.UploadCrashAsset(r, name, dashTyp, nil)
		if err != nil {
			log.Logf(1, "processing of the asset %v (%v) failed: %v", name, typ, err)
			return
		}
		ret = append(ret, asset)
	})
	return ret
}

func fullReproLog(stats *repro.Stats) []byte {
	if stats == nil {
		return nil
	}
	return []byte(fmt.Sprintf("Extracting prog: %v\nMinimizing prog: %v\n"+
		"Simplifying prog options: %v\nExtracting C: %v\nSimplifying C: %v\n\n\n%s",
		stats.ExtractProgTime, stats.MinimizeProgTime,
		stats.SimplifyProgTime, stats.ExtractCTime, stats.SimplifyCTime, stats.Log))
}

func (mgr *Manager) corpusInputHandler(updates <-chan corpus.NewItemEvent) {
	for update := range updates {
		mgr.serv.updateCoverFilter(update.NewCover)
		if update.Exists {
			// We only save new progs into the corpus.db file.
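			// (An existing program may still contribute new coverage to the filter above.)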
			continue
		}
		mgr.corpusDBMu.Lock()
		mgr.corpusDB.Save(update.Sig, update.ProgData, 0)
		if err := mgr.corpusDB.Flush(); err != nil {
			log.Errorf("failed to save corpus database: %v", err)
		}
		mgr.corpusDBMu.Unlock()
	}
}

func (mgr *Manager) getMinimizedCorpus() (corpus, repros [][]byte) {
	mgr.mu.Lock()
	defer mgr.mu.Unlock()
	mgr.minimizeCorpusLocked()
	items := mgr.corpus.Items()
	corpus = make([][]byte, 0, len(items))
	for _, inp := range items {
		corpus = append(corpus, inp.ProgData)
	}
	repros = mgr.newRepros
	mgr.newRepros = nil
	return
}

func (mgr *Manager) addNewCandidates(candidates []fuzzer.Candidate) {
	if mgr.cfg.Experimental.ResetAccState {
		// Don't accept new candidates -- the execution is already very slow,
		// syz-hub will just overwhelm us.
		return
	}
	mgr.fuzzer.Load().AddCandidates(candidates)
	mgr.mu.Lock()
	defer mgr.mu.Unlock()
	if mgr.phase == phaseTriagedCorpus {
		mgr.phase = phaseQueriedHub
	}
}

func (mgr *Manager) minimizeCorpusLocked() {
	currSize := mgr.corpus.StatProgs.Val()
	if currSize <= mgr.lastMinCorpus*103/100 {
		return
	}
	mgr.corpus.Minimize(mgr.cfg.Cover)
	newSize := mgr.corpus.StatProgs.Val()

	log.Logf(1, "minimized corpus: %v -> %v", currSize, newSize)
	mgr.lastMinCorpus = newSize

	// From time to time we get corpus explosion due to different reasons:
	// generic bugs, per-OS bugs, problems with fallback coverage, kcov bugs, etc.
	// This has a bad effect on the instance and especially on instances
	// connected via the hub. Do some per-syscall sanity checking to prevent this.
	for call, info := range mgr.corpus.CallCover() {
		if mgr.cfg.Cover {
			// If we have fewer than 1K inputs for this call,
			// accept all new inputs unconditionally.
			if info.Count < 1000 {
				continue
			}
			// If we have more than 3K already, don't accept any more.
			// Between 1K and 3K, look at the amount of coverage we are getting from these programs.
			// Empirically, real coverage for the most saturated syscalls is ~30-60
			// per program (even when we have a thousand of them). In the explosion
			// case, coverage tends to be much lower (~0.3-5 per program).
			if info.Count < 3000 && len(info.Cover)/info.Count >= 10 {
				continue
			}
		} else {
			// If we don't have real coverage, signal is weak.
			// If we have more than several hundred, there is something wrong.
			if info.Count < 300 {
				continue
			}
		}
		if mgr.saturatedCalls[call] {
			continue
		}
		mgr.saturatedCalls[call] = true
		log.Logf(0, "coverage for %v has saturated, not accepting more inputs", call)
	}

	// Don't minimize the persistent corpus until fuzzers have triaged all inputs from it.
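	// Otherwise we would delete database records for inputs that simply
	// have not been re-triaged yet.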
	if mgr.phase < phaseTriagedCorpus {
		return
	}
	mgr.corpusDBMu.Lock()
	defer mgr.corpusDBMu.Unlock()
	for key := range mgr.corpusDB.Records {
		ok1 := mgr.corpus.Item(key) != nil
		_, ok2 := mgr.disabledHashes[key]
		if !ok1 && !ok2 {
			mgr.corpusDB.Delete(key)
		}
	}
	mgr.corpusDB.BumpVersion(currentDBVersion)
}

func setGuiltyFiles(crash *dashapi.Crash, report *report.Report) {
	if report.GuiltyFile != "" {
		crash.GuiltyFiles = []string{report.GuiltyFile}
	}
}

func (mgr *Manager) collectSyscallInfo() map[string]*corpus.CallCov {
	mgr.mu.Lock()
	enabledSyscalls := mgr.targetEnabledSyscalls
	mgr.mu.Unlock()

	if enabledSyscalls == nil {
		return nil
	}
	calls := mgr.corpus.CallCover()
	// Add enabled, but not yet covered calls.
	for call := range enabledSyscalls {
		if calls[call.Name] == nil {
			calls[call.Name] = new(corpus.CallCov)
		}
	}
	return calls
}

func (mgr *Manager) currentBugFrames() BugFrames {
	mgr.mu.Lock()
	defer mgr.mu.Unlock()
	frames := BugFrames{
		memoryLeaks: make([]string, 0, len(mgr.memoryLeakFrames)),
		dataRaces:   make([]string, 0, len(mgr.dataRaceFrames)),
	}
	for frame := range mgr.memoryLeakFrames {
		frames.memoryLeaks = append(frames.memoryLeaks, frame)
	}
	for frame := range mgr.dataRaceFrames {
		frames.dataRaces = append(frames.dataRaces, frame)
	}
	return frames
}

func (mgr *Manager) machineChecked(features flatrpc.Feature, enabledSyscalls map[*prog.Syscall]bool,
	opts ipc.ExecOpts) queue.Source {
	mgr.mu.Lock()
	defer mgr.mu.Unlock()
	if mgr.checkDone {
		panic("machineChecked() called twice")
	}
	mgr.checkDone = true
	mgr.enabledFeatures = features
	mgr.targetEnabledSyscalls = enabledSyscalls
	statSyscalls := stats.Create("syscalls", "Number of enabled syscalls",
		stats.Simple, stats.NoGraph, stats.Link("/syscalls"))
	statSyscalls.Add(len(enabledSyscalls))

	rnd := rand.New(rand.NewSource(time.Now().UnixNano()))
	fuzzerObj := fuzzer.NewFuzzer(context.Background(), &fuzzer.Config{
		Corpus:         mgr.corpus,
		BaseOpts:       opts,
		Coverage:       mgr.cfg.Cover,
		FaultInjection: features&flatrpc.FeatureFault != 0,
		Comparisons:    features&flatrpc.FeatureComparisons != 0,
		Collide:        true,
		EnabledCalls:   enabledSyscalls,
		NoMutateCalls:  mgr.cfg.NoMutateCalls,
		FetchRawCover:  mgr.cfg.RawCover,
		Logf: func(level int, msg string, args ...interface{}) {
			if level != 0 {
				return
			}
			log.Logf(level, msg, args...)
		},
		NewInputFilter: func(call string) bool {
			mgr.mu.Lock()
			defer mgr.mu.Unlock()
			return !mgr.saturatedCalls[call]
		},
	}, rnd, mgr.target)
	mgr.fuzzer.Store(fuzzerObj)

	mgr.loadCorpus()
	mgr.firstConnect.Store(time.Now().Unix())
	go mgr.corpusMinimization()
	go mgr.fuzzerLoop(fuzzerObj)
	if mgr.dash != nil {
		go mgr.dashboardReporter()
		if mgr.cfg.Reproduce {
			go mgr.dashboardReproTasks()
		}
	}
	return fuzzerObj
}

func (mgr *Manager) corpusMinimization() {
	for range time.NewTicker(time.Minute).C {
		mgr.mu.Lock()
		mgr.minimizeCorpusLocked()
		mgr.mu.Unlock()
	}
}

func (mgr *Manager) fuzzerSignalRotation() {
	const (
		rotateSignals      = 1000
		timeBetweenRotates = 15 * time.Minute
		// Every X dropped signals may in the worst case lead to up to 3 * X
		// additional triage executions, which in this case constitutes
		// 3000/60000 = 5%.
		execsBetweenRotates = 60000
	)
	lastExecTotal := 0
	lastRotation := time.Now()
	for range time.NewTicker(5 * time.Minute).C {
		if mgr.statExecs.Val()-lastExecTotal < execsBetweenRotates {
			continue
		}
		if time.Since(lastRotation) < timeBetweenRotates {
			continue
		}
		mgr.fuzzer.Load().RotateMaxSignal(rotateSignals)
		lastRotation = time.Now()
		lastExecTotal = mgr.statExecs.Val()
	}
}

func (mgr *Manager) fuzzerLoop(fuzzer *fuzzer.Fuzzer) {
	for ; ; time.Sleep(time.Second / 2) {
		// Distribute new max signal over all instances.
		newSignal, dropSignal := fuzzer.Cover.GrabSignalDelta()
		log.Logf(2, "distributing %d new signal, %d dropped signal",
			len(newSignal), len(dropSignal))
		if len(newSignal)+len(dropSignal) != 0 {
			mgr.serv.distributeSignalDelta(newSignal, dropSignal)
		}

		// Update the state machine.
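		// Candidates drop to zero once triage of the corpus (or of the
		// hub inputs) is complete; that is what advances the phase.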
		if fuzzer.StatCandidates.Val() == 0 {
			mgr.mu.Lock()
			if mgr.phase == phaseLoadedCorpus {
				go mgr.fuzzerSignalRotation()
				if mgr.cfg.HubClient != "" {
					mgr.phase = phaseTriagedCorpus
					go mgr.hubSyncLoop(pickGetter(mgr.cfg.HubKey))
				} else {
					mgr.phase = phaseTriagedHub
				}
			} else if mgr.phase == phaseQueriedHub {
				mgr.phase = phaseTriagedHub
			}
			mgr.mu.Unlock()
		}
	}
}

func (mgr *Manager) hubIsUnreachable() {
	var dash *dashapi.Dashboard
	mgr.mu.Lock()
	if mgr.phase == phaseTriagedCorpus {
		dash = mgr.dash
		mgr.phase = phaseTriagedHub
		log.Errorf("did not manage to connect to syz-hub; moving forward")
	}
	mgr.mu.Unlock()
	if dash != nil {
		mgr.dash.LogError(mgr.cfg.Name, "did not manage to connect to syz-hub")
	}
}

func (mgr *Manager) collectUsedFiles() {
	if mgr.vmPool == nil {
		return
	}
	addUsedFile := func(f string) {
		if f == "" {
			return
		}
		stat, err := os.Stat(f)
		if err != nil {
			log.Fatalf("failed to stat %v: %v", f, err)
		}
		mgr.usedFiles[f] = stat.ModTime()
	}
	cfg := mgr.cfg
	addUsedFile(cfg.FuzzerBin)
	addUsedFile(cfg.ExecprogBin)
	addUsedFile(cfg.ExecutorBin)
	addUsedFile(cfg.SSHKey)
	if vmlinux := filepath.Join(cfg.KernelObj, mgr.sysTarget.KernelObject); osutil.IsExist(vmlinux) {
		addUsedFile(vmlinux)
	}
	if cfg.Image != "9p" {
		addUsedFile(cfg.Image)
	}
}

func (mgr *Manager) checkUsedFiles() {
	for f, mod := range mgr.usedFiles {
		stat, err := os.Stat(f)
		if err != nil {
			log.Fatalf("failed to stat %v: %v", f, err)
		}
		if mod != stat.ModTime() {
			log.Fatalf("file %v that syz-manager uses has been modified by an external program\n"+
				"this can lead to arbitrary syz-manager misbehavior\n"+
				"modification time has changed: %v -> %v\n"+
				"don't modify files that syz-manager uses. exiting to prevent harm",
				f, mod, stat.ModTime())
		}
	}
}

func (mgr *Manager) dashboardReporter() {
	webAddr := publicWebAddr(mgr.cfg.HTTP)
	triageInfoSent := false
	var lastFuzzingTime time.Duration
	var lastCrashes, lastSuppressedCrashes, lastExecs uint64
	for range time.NewTicker(time.Minute).C {
		mgr.mu.Lock()
		req := &dashapi.ManagerStatsReq{
			Name:              mgr.cfg.Name,
			Addr:              webAddr,
			UpTime:            time.Duration(mgr.statUptime.Val()) * time.Second,
			Corpus:            uint64(mgr.corpus.StatProgs.Val()),
			PCs:               uint64(mgr.corpus.StatCover.Val()),
			Cover:             uint64(mgr.corpus.StatSignal.Val()),
			CrashTypes:        uint64(mgr.statCrashTypes.Val()),
			FuzzingTime:       time.Duration(mgr.statFuzzingTime.Val()) - lastFuzzingTime,
			Crashes:           uint64(mgr.statCrashes.Val()) - lastCrashes,
			SuppressedCrashes: uint64(mgr.statSuppressed.Val()) - lastSuppressedCrashes,
			Execs:             uint64(mgr.statExecs.Val()) - lastExecs,
		}
		if mgr.phase >= phaseTriagedCorpus && !triageInfoSent {
			triageInfoSent = true
			req.TriagedCoverage = uint64(mgr.corpus.StatSignal.Val())
			req.TriagedPCs = uint64(mgr.corpus.StatCover.Val())
		}
		mgr.mu.Unlock()

		if err := mgr.dash.UploadManagerStats(req); err != nil {
			log.Logf(0, "failed to upload dashboard stats: %v", err)
			continue
		}
		mgr.mu.Lock()
		lastFuzzingTime += req.FuzzingTime
		lastCrashes += req.Crashes
		lastSuppressedCrashes += req.SuppressedCrashes
		lastExecs += req.Execs
		mgr.mu.Unlock()
	}
}

func (mgr *Manager) dashboardReproTasks() {
	for range time.NewTicker(20 * time.Minute).C {
		needReproReply := make(chan bool)
		mgr.needMoreRepros <- needReproReply
		if !<-needReproReply {
			// We don't need reproducers at the moment.
			continue
		}
		resp, err := mgr.dash.LogToRepro(&dashapi.LogToReproReq{BuildID: mgr.cfg.Tag})
		if err != nil {
			log.Logf(0, "failed to query logs to reproduce: %v", err)
			continue
		}
		if len(resp.CrashLog) > 0 {
			mgr.externalReproQueue <- &Crash{
				fromDashboard: true,
				Report: &report.Report{
					Title:  resp.Title,
					Output: resp.CrashLog,
				},
			}
		}
	}
}

func publicWebAddr(addr string) string {
	_, port, err := net.SplitHostPort(addr)
	if err == nil && port != "" {
		if host, err := os.Hostname(); err == nil {
			addr = net.JoinHostPort(host, port)
		}
		if GCE, err := gce.NewContext(""); err == nil {
			addr = net.JoinHostPort(GCE.ExternalIP, port)
		}
	}
	return "http://" + addr
}