github.com/google/syzkaller@v0.0.0-20251211124644-a066d2bc4b02/pkg/bisect/bisect.go (about) 1 // Copyright 2018 syzkaller project authors. All rights reserved. 2 // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. 3 4 package bisect 5 6 import ( 7 "errors" 8 "fmt" 9 "math" 10 "os" 11 "sort" 12 "time" 13 14 "github.com/google/syzkaller/pkg/build" 15 "github.com/google/syzkaller/pkg/debugtracer" 16 "github.com/google/syzkaller/pkg/hash" 17 "github.com/google/syzkaller/pkg/instance" 18 "github.com/google/syzkaller/pkg/mgrconfig" 19 "github.com/google/syzkaller/pkg/osutil" 20 "github.com/google/syzkaller/pkg/report" 21 "github.com/google/syzkaller/pkg/report/crash" 22 "github.com/google/syzkaller/pkg/vcs" 23 ) 24 25 type Config struct { 26 Trace debugtracer.DebugTracer 27 Fix bool 28 DefaultCompiler string 29 CompilerType string 30 Make string 31 Linker string 32 BinDir string 33 Ccache string 34 Timeout time.Duration 35 Kernel KernelConfig 36 Syzkaller SyzkallerConfig 37 Repro ReproConfig 38 Manager *mgrconfig.Config 39 BuildSemaphore *osutil.Semaphore 40 TestSemaphore *osutil.Semaphore 41 BuildCPUs int 42 // CrossTree specifies whether a cross tree bisection is to take place, i.e. 43 // Kernel.Commit is not reachable from Kernel.Branch. 44 // In this case, bisection starts from their merge base. 45 CrossTree bool 46 } 47 48 type KernelConfig struct { 49 Repo string 50 Branch string 51 Commit string 52 CommitTitle string 53 Cmdline string 54 Sysctl string 55 Config []byte 56 // Baseline configuration is used in commit bisection. If the crash doesn't reproduce 57 // with baseline configuratopm config bisection is run. When triggering configuration 58 // option is found provided baseline configuration is modified according the bisection 59 // results. This new configuration is tested once more with current head. If crash 60 // reproduces with the generated configuration original configuation is replaced with 61 // this minimized one. 62 BaselineConfig []byte 63 Userspace string 64 // Extra commits to cherry pick to older kernel revisions. 65 Backports []vcs.BackportCommit 66 } 67 68 type SyzkallerConfig struct { 69 Repo string 70 Commit string 71 Descriptions string 72 } 73 74 type ReproConfig struct { 75 Opts []byte 76 Syz []byte 77 C []byte 78 } 79 80 type env struct { 81 cfg *Config 82 repo vcs.Repo 83 bisecter vcs.Bisecter 84 minimizer vcs.ConfigMinimizer 85 commit *vcs.Commit 86 head *vcs.Commit 87 kernelConfig []byte 88 inst instance.Env 89 numTests int 90 startTime time.Time 91 buildTime time.Duration 92 testTime time.Duration 93 reportTypes []crash.Type 94 // The current estimate of the reproducer's kernel crashing probability. 95 reproChance float64 96 // The product of our confidence in every bisection step result. 97 confidence float64 98 // Whether we should do 2x more execution runs for every test step. 99 // We could have inferred this data from reproChance, but we want to be 100 // able to react faster to sudden drops of reproducibility than an estimate 101 // can allows us to. 102 flaky bool 103 // A cache of already performed revision tests. 104 results map[string]*testResult 105 buildCfg instance.BuildKernelConfig 106 } 107 108 const MaxNumTests = 20 // number of tests we do per commit 109 110 // Result describes bisection result: 111 // 1. if bisection is conclusive, the single cause/fix commit in Commits 112 // - for cause bisection report is the crash on the cause commit 113 // - for fix bisection report is nil 114 // - Commit is nil 115 // - NoopChange is set if the commit did not cause any change in the kernel binary 116 // (bisection result it most likely wrong) 117 // 118 // 2. Bisected to a release commit 119 // - if bisection is inconclusive, range of potential cause/fix commits in Commits 120 // - report is nil in such case 121 // 122 // 3. Commit is nil 123 // - if the crash still happens on the oldest release/HEAD (for cause/fix bisection correspondingly) 124 // - no commits in Commits 125 // - the crash report on the oldest release/HEAD; 126 // - Commit points to the oldest/latest commit where crash happens. 127 // 128 // 4. Config contains kernel config used for bisection. 129 type Result struct { 130 Commits []*vcs.Commit 131 Report *report.Report 132 Commit *vcs.Commit 133 Config []byte 134 NoopChange bool 135 IsRelease bool 136 Confidence float64 137 } 138 139 // Run does the bisection and returns either the Result, 140 // or, if the crash is not reproduced on the start commit, an error. 141 func Run(cfg *Config) (*Result, error) { 142 if err := checkConfig(cfg); err != nil { 143 return nil, err 144 } 145 cfg.Manager.Cover = false // it's not supported somewhere back in time 146 repo, err := vcs.NewRepo(cfg.Manager.TargetOS, cfg.Manager.Type, cfg.Manager.KernelSrc) 147 if err != nil { 148 return nil, err 149 } 150 inst, err := instance.NewEnv(cfg.Manager, cfg.BuildSemaphore, cfg.TestSemaphore) 151 if err != nil { 152 return nil, err 153 } 154 if _, err = repo.CheckoutBranch(cfg.Kernel.Repo, cfg.Kernel.Branch); err != nil { 155 return nil, &build.InfraError{Title: fmt.Sprintf("%v", err)} 156 } 157 return runImpl(cfg, repo, inst) 158 } 159 160 func runImpl(cfg *Config, repo vcs.Repo, inst instance.Env) (*Result, error) { 161 bisecter, ok := repo.(vcs.Bisecter) 162 if !ok { 163 return nil, fmt.Errorf("bisection is not implemented for %v", cfg.Manager.TargetOS) 164 } 165 minimizer, ok := repo.(vcs.ConfigMinimizer) 166 if !ok && len(cfg.Kernel.BaselineConfig) != 0 { 167 return nil, fmt.Errorf("config minimization is not implemented for %v", cfg.Manager.TargetOS) 168 } 169 env := &env{ 170 cfg: cfg, 171 repo: repo, 172 bisecter: bisecter, 173 minimizer: minimizer, 174 inst: inst, 175 startTime: time.Now(), 176 confidence: 1.0, 177 buildCfg: instance.BuildKernelConfig{ 178 CompilerBin: cfg.DefaultCompiler, 179 MakeBin: cfg.Make, 180 LinkerBin: cfg.Linker, 181 CcacheBin: cfg.Ccache, 182 UserspaceDir: cfg.Kernel.Userspace, 183 CmdlineFile: cfg.Kernel.Cmdline, 184 SysctlFile: cfg.Kernel.Sysctl, 185 KernelConfig: cfg.Kernel.Config, 186 BuildCPUs: cfg.BuildCPUs, 187 }, 188 } 189 head, err := repo.Commit(vcs.HEAD) 190 if err != nil { 191 return nil, err 192 } 193 defer env.repo.SwitchCommit(head.Hash) 194 env.head = head 195 hostname, err := os.Hostname() 196 if err != nil { 197 hostname = "unnamed host" 198 } 199 env.logf("%s starts bisection %s", hostname, env.startTime.String()) 200 if cfg.Fix { 201 env.logf("bisecting fixing commit since %v", cfg.Kernel.Commit) 202 } else { 203 env.logf("bisecting cause commit starting from %v", cfg.Kernel.Commit) 204 } 205 start := time.Now() 206 res, err := env.bisect() 207 if env.flaky { 208 env.logf("reproducer is flaky (%.2f repro chance estimate)", env.reproChance) 209 } 210 env.logf("revisions tested: %v, total time: %v (build: %v, test: %v)", 211 env.numTests, time.Since(start), env.buildTime, env.testTime) 212 if err != nil { 213 env.logf("error: %v", err) 214 return nil, err 215 } 216 if len(res.Commits) == 0 { 217 if cfg.Fix { 218 env.logf("crash still not fixed or there were kernel test errors") 219 } else { 220 env.logf("oldest tested release already had the bug or it had kernel test errors") 221 } 222 223 env.logf("commit msg: %v", res.Commit.Title) 224 if res.Report != nil { 225 env.logf("crash: %v\n%s", res.Report.Title, res.Report.Report) 226 } 227 return res, nil 228 } 229 what := "bad" 230 if cfg.Fix { 231 what = "good" 232 } 233 if len(res.Commits) > 1 { 234 env.logf("bisection is inconclusive, the first %v commit could be any of:", what) 235 for _, com := range res.Commits { 236 env.logf("%v", com.Hash) 237 } 238 return res, nil 239 } 240 com := res.Commits[0] 241 env.logf("first %v commit: %v %v", what, com.Hash, com.Title) 242 env.logf("recipients (to): %q", com.Recipients.GetEmails(vcs.To)) 243 env.logf("recipients (cc): %q", com.Recipients.GetEmails(vcs.Cc)) 244 if res.Report != nil { 245 env.logf("crash: %v\n%s", res.Report.Title, res.Report.Report) 246 } 247 return res, nil 248 } 249 250 func (env *env) bisect() (*Result, error) { 251 err := env.bisecter.PrepareBisect() 252 if err != nil { 253 return nil, err 254 } 255 256 cfg := env.cfg 257 if err := env.inst.CleanKernel(&env.buildCfg); err != nil { 258 return nil, fmt.Errorf("kernel clean failed: %w", err) 259 } 260 env.logf("building syzkaller on %v", cfg.Syzkaller.Commit) 261 if _, err := env.inst.BuildSyzkaller(cfg.Syzkaller.Repo, cfg.Syzkaller.Commit); err != nil { 262 return nil, err 263 } 264 265 cfg.Kernel.Commit, err = env.identifyRewrittenCommit() 266 if err != nil { 267 return nil, err 268 } 269 com, err := env.repo.SwitchCommit(cfg.Kernel.Commit) 270 if err != nil { 271 return nil, err 272 } 273 274 env.logf("ensuring issue is reproducible on original commit %v\n", cfg.Kernel.Commit) 275 env.commit = com 276 env.kernelConfig = cfg.Kernel.Config 277 testRes, err := env.test() 278 if err != nil { 279 return nil, err 280 } else if testRes.verdict != vcs.BisectBad { 281 return nil, fmt.Errorf("the crash wasn't reproduced on the original commit") 282 } 283 env.reportTypes = testRes.types 284 env.reproChance = testRes.badRatio 285 286 testRes1, err := env.minimizeConfig() 287 if err != nil { 288 return nil, fmt.Errorf("config minimization failed: %w", err) 289 } 290 if testRes1 != nil { 291 // If config minimization even partially succeeds, minimizeConfig() 292 // would return a non-nil value of a new report. 293 testRes = testRes1 294 // Overwrite bug's reproducibility - it may be different after config minimization. 295 env.reproChance = testRes.badRatio 296 } 297 298 bad, good, results1, fatalResult, err := env.commitRange() 299 if fatalResult != nil || err != nil { 300 return fatalResult, err 301 } 302 if env.cfg.Fix { 303 env.commit = good 304 } else { 305 env.commit = bad 306 } 307 env.results = map[string]*testResult{cfg.Kernel.Commit: testRes} 308 for _, res := range results1 { 309 env.results[res.com.Hash] = res 310 } 311 commits, err := env.bisecter.Bisect(bad.Hash, good.Hash, cfg.Trace, env.testPredicate) 312 if err != nil { 313 return nil, err 314 } 315 env.logf("accumulated error probability: %0.2f", 1.0-env.confidence) 316 res := &Result{ 317 Commits: commits, 318 Config: env.kernelConfig, 319 Confidence: env.confidence, 320 } 321 if len(commits) == 1 { 322 com := commits[0] 323 testRes := env.results[com.Hash] 324 if testRes == nil { 325 return nil, fmt.Errorf("no result for culprit commit") 326 } 327 res.Report = testRes.rep 328 isRelease, err := env.bisecter.IsRelease(com.Hash) 329 if err != nil { 330 env.logf("failed to detect release: %v", err) 331 } 332 res.IsRelease = isRelease 333 noopChange, err := env.detectNoopChange(com) 334 if err != nil { 335 env.logf("failed to detect noop change: %v", err) 336 } 337 res.NoopChange = noopChange 338 } 339 return res, nil 340 } 341 342 func (env *env) identifyRewrittenCommit() (string, error) { 343 cfg := env.cfg 344 if cfg.Kernel.Commit != "" && cfg.CrossTree { 345 // If the failing commit is on another tree, just take it as is. 346 return cfg.Kernel.Commit, nil 347 } 348 _, err := env.repo.CheckoutBranch(cfg.Kernel.Repo, cfg.Kernel.Branch) 349 if err != nil { 350 return cfg.Kernel.Commit, err 351 } 352 contained, err := env.repo.Contains(cfg.Kernel.Commit) 353 if err != nil || contained { 354 return cfg.Kernel.Commit, err 355 } 356 357 if !cfg.Fix { 358 // If we're doing a cause bisection, we don't really need the commit to be 359 // reachable from cfg.Kernel.Branch. 360 // So let's try to force tag fetch and check if the commit is present in the 361 // repository. 362 env.logf("fetch other tags and check if the commit is present") 363 commit, err := env.repo.CheckoutCommit(cfg.Kernel.Repo, cfg.Kernel.Commit) 364 if err != nil { 365 // Ignore the error because the command will fail if the commit is really not 366 // present in the tree. 367 env.logf("fetch failed with %s", err) 368 } else if commit != nil { 369 return commit.Hash, nil 370 } 371 } 372 373 // We record the tested kernel commit when syzkaller triggers a crash. These commits can become 374 // unreachable after the crash was found, when the history of the tested kernel branch was 375 // rewritten. The commit might have been completely deleted from the branch or just changed in 376 // some way. Some branches like linux-next are often and heavily rewritten (aka rebased). 377 // This can also happen when changing the branch you fuzz in an existing syz-manager config. 378 // This makes sense when a downstream kernel fork rebased on top of a new upstream version and 379 // you don't want syzkaller to report all your old bugs again. 380 if cfg.Kernel.CommitTitle == "" { 381 // This can happen during a manual bisection, when only a hash is given. 382 return cfg.Kernel.Commit, fmt.Errorf( 383 "commit %v not reachable in branch '%v' and no commit title available", 384 cfg.Kernel.Commit, cfg.Kernel.Branch) 385 } 386 commit, err := env.repo.GetCommitByTitle(cfg.Kernel.CommitTitle) 387 if err != nil { 388 return cfg.Kernel.Commit, err 389 } 390 if commit == nil { 391 return cfg.Kernel.Commit, fmt.Errorf( 392 "commit %v not reachable in branch '%v'", cfg.Kernel.Commit, cfg.Kernel.Branch) 393 } 394 env.logf("rewritten commit %v reidentified by title '%v'\n", commit.Hash, cfg.Kernel.CommitTitle) 395 return commit.Hash, nil 396 } 397 398 func (env *env) minimizeConfig() (*testResult, error) { 399 // Find minimal configuration based on baseline to reproduce the crash. 400 testResults := make(map[hash.Sig]*testResult) 401 predMinimize := func(test []byte) (vcs.BisectResult, error) { 402 env.kernelConfig = test 403 testRes, err := env.test() 404 if err != nil { 405 return 0, err 406 } 407 // We want either a > 33% repro probability or at least it should not be 408 // worse than for the non-minimized config. 409 const badRatioThreshold = 1.0 / 3.0 410 if testRes.verdict == vcs.BisectBad && 411 testRes.badRatio < badRatioThreshold && 412 testRes.badRatio < env.reproChance { 413 return vcs.BisectSkip, nil 414 } 415 if testRes.verdict == vcs.BisectBad { 416 // Only remember crashes. 417 testResults[hash.Hash(test)] = testRes 418 } 419 return testRes.verdict, err 420 } 421 minConfig, err := env.minimizer.Minimize(env.cfg.Manager.SysTarget, env.cfg.Kernel.Config, 422 env.cfg.Kernel.BaselineConfig, env.reportTypes, env.cfg.Trace, predMinimize) 423 if err != nil { 424 if errors.Is(err, vcs.ErrBadKconfig) { 425 env.logf("config minimization failed due to bad Kconfig %v\nproceeding with the original config", err) 426 } else { 427 return nil, err 428 } 429 } 430 env.kernelConfig = minConfig 431 return testResults[hash.Hash(minConfig)], nil 432 } 433 434 func (env *env) detectNoopChange(com *vcs.Commit) (bool, error) { 435 testRes := env.results[com.Hash] 436 if testRes.kernelSign == "" || len(com.Parents) != 1 { 437 return false, nil 438 } 439 parent := com.Parents[0] 440 parentRes := env.results[parent] 441 if parentRes == nil { 442 env.logf("parent commit %v wasn't tested", parent) 443 // We could not test the parent commit if it is not based on the previous release 444 // (instead based on an older release, i.e. a very old non-rebased commit 445 // merged into the current release). 446 // TODO: we can use a differnet compiler for this old commit 447 // since effectively it's in the older release, in that case we may not 448 // detect noop change anyway. 449 if _, err := env.repo.SwitchCommit(parent); err != nil { 450 return false, err 451 } 452 _, kernelSign, err := env.build() 453 if err != nil { 454 return false, err 455 } 456 parentRes = &testResult{kernelSign: kernelSign} 457 } 458 env.logf("culprit signature: %v", testRes.kernelSign) 459 env.logf("parent signature: %v", parentRes.kernelSign) 460 return testRes.kernelSign == parentRes.kernelSign, nil 461 } 462 463 func (env *env) commitRange() (*vcs.Commit, *vcs.Commit, []*testResult, *Result, error) { 464 rangeFunc := env.commitRangeForCause 465 if env.cfg.Fix { 466 rangeFunc = env.commitRangeForFix 467 } 468 469 bad, good, results1, err := rangeFunc() 470 if err != nil { 471 return bad, good, results1, nil, err 472 } 473 474 fatalResult, err := env.validateCommitRange(bad, good, results1) 475 return bad, good, results1, fatalResult, err 476 } 477 478 func (env *env) commitRangeForFix() (*vcs.Commit, *vcs.Commit, []*testResult, error) { 479 var results []*testResult 480 startCommit := env.commit 481 if env.cfg.CrossTree { 482 env.logf("determining the merge base between %v and %v", 483 env.commit.Hash, env.head.Hash) 484 bases, err := env.repo.MergeBases(env.commit.Hash, env.head.Hash) 485 if err != nil { 486 return nil, nil, nil, err 487 } 488 if len(bases) != 1 { 489 env.logf("expected 1 merge base, got %d", len(bases)) 490 return nil, nil, nil, fmt.Errorf("expected 1 merge base, got %d", len(bases)) 491 } 492 env.logf("%s/%s is a merge base, check if it has the bug", bases[0].Hash, bases[0].Title) 493 startCommit = bases[0] 494 if _, err := env.repo.SwitchCommit(startCommit.Hash); err != nil { 495 return nil, nil, nil, err 496 } 497 res, err := env.test() 498 if err != nil { 499 return nil, nil, nil, err 500 } 501 results = append(results, res) 502 if res.verdict != vcs.BisectBad { 503 return nil, startCommit, results, nil 504 } 505 } 506 env.logf("testing current HEAD %v", env.head.Hash) 507 if _, err := env.repo.SwitchCommit(env.head.Hash); err != nil { 508 return nil, nil, nil, err 509 } 510 res, err := env.test() 511 if err != nil { 512 return nil, nil, nil, err 513 } 514 results = append(results, res) 515 if res.verdict != vcs.BisectGood { 516 return env.head, nil, results, nil 517 } 518 return env.head, startCommit, results, nil 519 } 520 521 func (env *env) commitRangeForCause() (*vcs.Commit, *vcs.Commit, []*testResult, error) { 522 cfg := env.cfg 523 tags, err := env.bisecter.PreviousReleaseTags(cfg.Kernel.Commit, cfg.CompilerType) 524 if err != nil { 525 return nil, nil, nil, err 526 } 527 if len(tags) == 0 { 528 return nil, nil, nil, fmt.Errorf("no release tags before this commit") 529 } 530 pickedTags := pickReleaseTags(tags) 531 env.logf("picked %v out of %d release tags", pickedTags, len(tags)) 532 533 lastBad := env.commit 534 var results []*testResult 535 for _, tag := range pickedTags { 536 env.logf("testing release %v", tag) 537 com, err := env.repo.SwitchCommit(tag) 538 if err != nil { 539 return nil, nil, nil, err 540 } 541 res, err := env.test() 542 if err != nil { 543 return nil, nil, nil, err 544 } 545 results = append(results, res) 546 if res.verdict == vcs.BisectGood { 547 return lastBad, com, results, nil 548 } 549 if res.verdict == vcs.BisectBad { 550 lastBad = com 551 } 552 } 553 // All tags were vcs.BisectBad or vcs.BisectSkip. 554 return lastBad, nil, results, nil 555 } 556 557 func (env *env) validateCommitRange(bad, good *vcs.Commit, results []*testResult) (*Result, error) { 558 if len(results) < 1 { 559 return nil, fmt.Errorf("commitRange returned no results") 560 } 561 562 if env.cfg.Fix && env.cfg.CrossTree && len(results) < 2 { 563 // For cross-tree bisections, it can be the case that the bug was introduced 564 // after the merge base, so there's no sense to continue the fix bisection. 565 env.logf("reproducer does not crash the merge base, so there's no known bad commit") 566 return &Result{Commit: good, Config: env.kernelConfig}, nil 567 } 568 569 finalResult := results[len(results)-1] // HEAD test for fix, oldest tested test for cause bisection 570 if finalResult.verdict == vcs.BisectBad { 571 // For cause bisection: Oldest tested release already had the bug. Giving up. 572 // For fix bisection: Crash still not fixed on HEAD. Leaving Result.Commits empty causes 573 // syzbot to retry this bisection later. 574 env.logf("crash still not fixed/happens on the oldest tested release") 575 return &Result{Report: finalResult.rep, Commit: bad, Config: env.kernelConfig}, nil 576 } 577 if finalResult.verdict == vcs.BisectSkip { 578 if env.cfg.Fix { 579 // HEAD is moving target. Sometimes changes break syzkaller fuzzing. 580 // Leaving Result.Commits empty so syzbot retries this bisection again later. 581 env.logf("HEAD had kernel build, boot or test errors") 582 return &Result{Report: finalResult.rep, Commit: bad, Config: env.kernelConfig}, nil 583 } 584 // The oldest tested release usually doesn't change. Retrying would give us the same result, 585 // unless we change the syz-ci setup (e.g. new rootfs, new compilers). 586 return nil, fmt.Errorf("oldest tested release had kernel build, boot or test errors") 587 } 588 589 return nil, nil 590 } 591 592 type testResult struct { 593 verdict vcs.BisectResult 594 com *vcs.Commit 595 rep *report.Report 596 types []crash.Type 597 kernelSign string 598 // The ratio of bad/(good+bad) results. 599 badRatio float64 600 // An estimate how much we can trust the result. 601 confidence float64 602 } 603 604 func (env *env) build() (*vcs.Commit, string, error) { 605 current, err := env.repo.Commit(vcs.HEAD) 606 if err != nil { 607 return nil, "", err 608 } 609 610 bisectEnv, err := env.bisecter.EnvForCommit( 611 env.cfg.DefaultCompiler, env.cfg.CompilerType, 612 env.cfg.BinDir, current.Hash, env.kernelConfig, 613 env.cfg.Kernel.Backports, 614 ) 615 if err != nil { 616 return current, "", err 617 } 618 env.logf("testing commit %v %v", current.Hash, env.cfg.CompilerType) 619 buildStart := time.Now() 620 buildCfg := env.buildCfg 621 buildCfg.CompilerBin = bisectEnv.Compiler 622 buildCfg.KernelConfig = bisectEnv.KernelConfig 623 if err := env.inst.CleanKernel(&buildCfg); err != nil { 624 return current, "", fmt.Errorf("kernel clean failed: %w", err) 625 } 626 _, imageDetails, err := env.inst.BuildKernel(&buildCfg) 627 if imageDetails.CompilerID != "" { 628 env.logf("compiler: %v", imageDetails.CompilerID) 629 } 630 if imageDetails.Signature != "" { 631 env.logf("kernel signature: %v", imageDetails.Signature) 632 } 633 env.buildTime += time.Since(buildStart) 634 return current, imageDetails.Signature, err 635 } 636 637 // Note: When this function returns an error, the bisection it was called from is aborted. 638 // Hence recoverable errors must be handled and the callers must treat testResult with care. 639 // e.g. testResult.verdict will be vcs.BisectSkip for a broken build, but err will be nil. 640 func (env *env) test() (*testResult, error) { 641 cfg := env.cfg 642 if cfg.Timeout != 0 && time.Since(env.startTime) > cfg.Timeout { 643 return nil, fmt.Errorf("bisection is taking too long (>%v), aborting", cfg.Timeout) 644 } 645 current, kernelSign, err := env.build() 646 res := &testResult{ 647 verdict: vcs.BisectSkip, 648 com: current, 649 kernelSign: kernelSign, 650 confidence: 1.0, 651 } 652 if current == nil { 653 // This is not recoverable, as the caller must know which commit to skip. 654 return res, fmt.Errorf("couldn't get repo HEAD: %w", err) 655 } 656 if err != nil { 657 errInfo := fmt.Sprintf("failed building %v: ", current.Hash) 658 var verr *osutil.VerboseError 659 var kerr *build.KernelError 660 if errors.As(err, &verr) { 661 errInfo += verr.Error() 662 env.saveDebugFile(current.Hash, 0, verr.Output) 663 } else if errors.As(err, &kerr) { 664 errInfo += string(kerr.Report) 665 env.saveDebugFile(current.Hash, 0, kerr.Output) 666 } else { 667 errInfo += err.Error() 668 env.logf("%v", err) 669 } 670 671 env.logf("%s", errInfo) 672 res.rep = &report.Report{Title: errInfo} 673 return res, nil 674 } 675 676 numTests := MaxNumTests / 2 677 if env.flaky || env.numTests == 0 { 678 // Use twice as many instances if the bug is flaky and during initial testing 679 // (as we don't know yet if it's flaky or not). 680 numTests *= 2 681 } 682 env.numTests++ 683 684 testStart := time.Now() 685 686 results, err := env.inst.Test(numTests, cfg.Repro.Syz, cfg.Repro.Opts, cfg.Repro.C) 687 env.testTime += time.Since(testStart) 688 if err != nil { 689 problem := fmt.Sprintf("repro testing failure: %v", err) 690 env.log(problem) 691 return res, &build.InfraError{Title: problem} 692 } 693 bad, good, infra, rep, types := env.processResults(current, results) 694 res.verdict, err = env.bisectionDecision(len(results), bad, good, infra) 695 if err != nil { 696 return nil, err 697 } 698 if bad+good > 0 { 699 res.badRatio = float64(bad) / float64(bad+good) 700 } 701 if res.verdict == vcs.BisectGood { 702 // The result could be a false negative. 703 res.confidence = 1.0 - math.Pow(1.0-env.reproChance, float64(good)) 704 env.logf("false negative chance: %.3f", 1.0-res.confidence) 705 } 706 if res.verdict == vcs.BisectSkip { 707 res.rep = &report.Report{ 708 Title: fmt.Sprintf("failed testing reproducer on %v", current.Hash), 709 } 710 } else { 711 // Pick the most relevant as the main one. 712 res.rep = rep 713 } 714 res.types = types 715 env.updateFlaky(res) 716 // TODO: when we start supporting boot/test error bisection, we need to make 717 // processResults treat that verdit as "good". 718 return res, nil 719 } 720 721 // testPredicate() is meant to be invoked by bisecter.Bisect(). 722 func (env *env) testPredicate() (vcs.BisectResult, error) { 723 var testRes1 *testResult 724 if env.cfg.Fix { 725 // There's a chance we might test a revision that does not yet contain the bug. 726 // Perform extra checks (see #4117). 727 env.logf("determine whether the revision contains the guilty commit") 728 hadBug, err := env.revisionHadBug() 729 if err == errUnknownBugPresence { 730 // Let's skip the revision just in case. 731 testRes1 = &testResult{verdict: vcs.BisectSkip} 732 } else if err != nil { 733 return 0, err 734 } 735 if !hadBug { 736 // For result consistency, pretend that the kernel crashed. 737 env.logf("the bug was not introduced yet; pretend that kernel crashed") 738 testRes1 = &testResult{verdict: vcs.BisectBad} 739 } 740 } 741 if testRes1 == nil { 742 var err error 743 testRes1, err = env.test() 744 if err != nil { 745 return 0, err 746 } 747 env.postTestResult(testRes1) 748 env.results[testRes1.com.Hash] = testRes1 749 } 750 // For fix bisections, results are inverted. 751 if env.cfg.Fix { 752 switch testRes1.verdict { 753 case vcs.BisectBad: 754 testRes1.verdict = vcs.BisectGood 755 case vcs.BisectGood: 756 testRes1.verdict = vcs.BisectBad 757 } 758 } 759 return testRes1.verdict, nil 760 } 761 762 // If there's a merge from a branch that was based on a much older code revision, 763 // it's likely that the bug was not yet present at all. 764 var errUnknownBugPresence = errors.New("unable to determine whether there was a bug") 765 766 func (env *env) revisionHadBug() (bool, error) { 767 // Check if any already tested revision that is reachable from HEAD crashed. 768 for hash, res := range env.results { 769 if res.rep == nil { 770 continue 771 } 772 ok, err := env.repo.Contains(hash) 773 if err != nil { 774 return false, err 775 } 776 if ok { 777 env.logf("revision %s crashed and is reachable", hash) 778 return true, nil 779 } 780 } 781 782 // TODO: it's also possible to extract useful information from non-crashed runs. 783 // But let's first see how many extra test() runs we get without it. 784 785 // We'll likely change the revision below. Ensure we get back to the original one. 786 curr, err := env.repo.Commit(vcs.HEAD) 787 if err != nil { 788 return false, err 789 } 790 defer env.repo.SwitchCommit(curr.Hash) 791 792 // Check all merge bases between the original bad commit (*) and the current HEAD revision. 793 // If at least one crashed, bug was definitely present. 794 // (*) Using the same bad commit hopefully helps us reuse many of the results. 795 bases, err := env.repo.MergeBases(curr.Hash, env.commit.Hash) 796 if err != nil { 797 return false, fmt.Errorf("failed to get the merge base between %s and %s: %w", 798 curr.Hash, env.commit.Hash, err) 799 } 800 anyResult := false 801 for _, base := range bases { 802 env.logf("checking the merge base %s", base.Hash) 803 res := env.results[base.Hash] 804 if res == nil { 805 env.logf("no existing result, test the revision") 806 env.repo.SwitchCommit(base.Hash) 807 res, err = env.test() 808 if err != nil { 809 return false, err 810 } 811 env.results[base.Hash] = res 812 } 813 if res.verdict == vcs.BisectSkip { 814 continue 815 } 816 anyResult = true 817 if res.rep != nil { 818 // No reason to test other bases. 819 return true, nil 820 } 821 } 822 if anyResult { 823 return false, nil 824 } 825 return false, errUnknownBugPresence 826 } 827 828 func (env *env) bisectionDecision(total, bad, good, infra int) (vcs.BisectResult, error) { 829 // Boot errors, image test errors, skipped crashes. 830 skip := total - bad - good - infra 831 832 wantBadRuns := max(2, (total-infra)/6) // For 10 runs, require 2 crashes. For 20, require 3. 833 wantGoodRuns := total / 2 834 wantTotalRuns := total / 2 835 if env.flaky { 836 // The reproducer works less than 50% of time, so we need really many good results. 837 wantGoodRuns = total * 3 / 4 838 } 839 if bad == 0 && good >= wantGoodRuns { 840 // We need a big enough number of good results, otherwise the chance of a false 841 // positive is too high. 842 return vcs.BisectGood, nil 843 } else if bad >= wantBadRuns && (good+bad) >= wantTotalRuns { 844 // We need enough (good+bad) results to conclude that the kernel revision itself 845 // is not too broken. 846 return vcs.BisectBad, nil 847 } else if infra > skip { 848 // We have been unable to determine a verdict mostly because of infra errors. 849 // Abort the bisection. 850 return vcs.BisectSkip, 851 &build.InfraError{Title: "unable to determine the verdict because of infra errors"} 852 } 853 env.logf("unable to determine the verdict: %d good runs (wanted %d), for bad wanted %d in total, got %d", 854 good, wantGoodRuns, wantTotalRuns, good+bad) 855 return vcs.BisectSkip, nil 856 } 857 858 func (env *env) processResults(current *vcs.Commit, results []instance.EnvTestResult) ( 859 bad, good, infra int, rep *report.Report, types []crash.Type) { 860 var verdicts []string 861 var reports []*report.Report 862 for i, res := range results { 863 if res.Error == nil { 864 good++ 865 verdicts = append(verdicts, "OK") 866 continue 867 } 868 var testError *instance.TestError 869 var crashError *instance.CrashError 870 switch { 871 case errors.As(res.Error, &testError): 872 if testError.Infra { 873 infra++ 874 verdicts = append(verdicts, fmt.Sprintf("infra problem: %v", testError)) 875 } else if testError.Boot { 876 verdicts = append(verdicts, fmt.Sprintf("boot failed: %v", testError)) 877 } else { 878 verdicts = append(verdicts, fmt.Sprintf("basic kernel testing failed: %v", testError)) 879 } 880 output := testError.Output 881 if testError.Report != nil { 882 output = testError.Report.Output 883 } 884 env.saveDebugFile(current.Hash, i, output) 885 case errors.As(res.Error, &crashError): 886 output := crashError.Report.Report 887 if len(output) == 0 { 888 output = crashError.Report.Output 889 } 890 env.saveDebugFile(current.Hash, i, output) 891 if env.isTransientError(crashError.Report) { 892 verdicts = append(verdicts, fmt.Sprintf("ignore: %v", crashError)) 893 break 894 } 895 bad++ 896 reports = append(reports, crashError.Report) 897 verdicts = append(verdicts, fmt.Sprintf("crashed: %v", crashError)) 898 default: 899 infra++ 900 verdicts = append(verdicts, fmt.Sprintf("failed: %v", res.Error)) 901 } 902 } 903 unique := make(map[string]bool) 904 for _, verdict := range verdicts { 905 unique[verdict] = true 906 } 907 if len(unique) == 1 { 908 env.logf("all runs: %v", verdicts[0]) 909 } else { 910 for i, verdict := range verdicts { 911 env.logf("run #%v: %v", i, verdict) 912 } 913 } 914 var others bool 915 rep, types, others = mostFrequentReports(reports) 916 if rep != nil || others { 917 // TODO: set flaky=true or in some other way indicate that the bug 918 // triggers multiple different crashes? 919 env.logf("representative crash: %v, types: %v", rep.Title, types) 920 } 921 return 922 } 923 924 // postTestResult() is to be run after we have got the results of a test() call for a revision. 925 // It updates the estimates of reproducibility and the overall result confidence. 926 func (env *env) postTestResult(res *testResult) { 927 env.confidence *= res.confidence 928 if res.verdict == vcs.BisectBad { 929 // Let's be conservative and only decrease our reproduction likelihood estimate. 930 // As the estimate of each test() can also be flaky, only partially update the result. 931 avg := (env.reproChance + res.badRatio) / 2.0 932 env.reproChance = min(env.reproChance, avg) 933 } 934 } 935 936 // updateFlaky() updates the current flakiness estimate. 937 func (env *env) updateFlaky(res *testResult) { 938 // We require at least 5 good+bad runs for a verdict, so 939 // with a 50% reproducility there's a ~3% chance of a false negative result. 940 // If there are 10 "good" results, that's a ~36% accumulated error probability. 941 // That's already noticeable, so let's do 2x more runs from there. 942 const flakyThreshold = 0.5 943 if res.verdict == vcs.BisectBad && res.badRatio < flakyThreshold { 944 // Once flaky => always treat as flaky. 945 env.flaky = true 946 } 947 } 948 949 // mostFrequentReports() processes the list of run results and determines: 950 // 1) The most representative crash types. 951 // 2) The most representative crash report. 952 // The algorithm is described in code comments. 953 func mostFrequentReports(reports []*report.Report) (*report.Report, []crash.Type, bool) { 954 // First find most frequent report types. 955 type info struct { 956 t crash.Type 957 count int 958 report *report.Report 959 } 960 crashes := 0 961 perType := []*info{} 962 perTypeMap := map[crash.Type]*info{} 963 for _, rep := range reports { 964 if rep.Title == "" { 965 continue 966 } 967 crashes++ 968 if perTypeMap[rep.Type] == nil { 969 obj := &info{ 970 t: rep.Type, 971 report: rep, 972 } 973 perType = append(perType, obj) 974 perTypeMap[rep.Type] = obj 975 } 976 perTypeMap[rep.Type].count++ 977 } 978 sort.Slice(perType, func(i, j int) bool { 979 return perType[i].count > perType[j].count 980 }) 981 // Then pick those that are representative enough. 982 var bestTypes []crash.Type 983 var bestReport *report.Report 984 taken := 0 985 for _, info := range perType { 986 if info.t == crash.Hang && info.count*2 < crashes && len(perType) > 1 { 987 // To pick a Hang as a representative one, require >= 50% 988 // of all crashes to be of this type. 989 // Hang crashes can appear in various parts of the kernel, so 990 // we only want to take them into account only if we are actually 991 // bisecting this kind of a bug. 992 continue 993 } 994 if info.t == crash.LostConnection && len(perType) > 1 { 995 // This crash type is much more often unrelated than not. 996 // Take it only if it's the only crash type. 997 continue 998 } 999 // Take further crash types until we have considered 2/3 of all crashes, but 1000 // no more than 3. 1001 needTaken := (crashes + 2) * 2 / 3 1002 if taken < needTaken && len(bestTypes) < 3 { 1003 if bestReport == nil { 1004 bestReport = info.report 1005 } 1006 bestTypes = append(bestTypes, info.t) 1007 taken += info.count 1008 } 1009 } 1010 return bestReport, bestTypes, len(bestTypes) != len(perType) 1011 } 1012 1013 func (env *env) isTransientError(rep *report.Report) bool { 1014 // If we're not chasing a SYZFATAL error, ignore them. 1015 // Otherwise it indicates some transient problem of the tested kernel revision. 1016 if rep.Type == crash.SyzFailure { 1017 hadSyzFailure := false 1018 for _, t := range env.reportTypes { 1019 hadSyzFailure = hadSyzFailure || t == crash.SyzFailure 1020 } 1021 return len(env.reportTypes) > 0 && !hadSyzFailure 1022 } 1023 // Lost connection is a frequent source of flaky results. 1024 // Ignore if it is was not in the canonical crash types set. 1025 if rep.Type == crash.LostConnection { 1026 hadLostConnection := false 1027 for _, t := range env.reportTypes { 1028 hadLostConnection = hadLostConnection || t == crash.LostConnection 1029 } 1030 return len(env.reportTypes) > 0 && !hadLostConnection 1031 } 1032 // All other errors are okay. 1033 return false 1034 } 1035 1036 func (env *env) saveDebugFile(hash string, idx int, data []byte) { 1037 env.cfg.Trace.SaveFile(fmt.Sprintf("%v.%v", hash, idx), data) 1038 } 1039 1040 func checkConfig(cfg *Config) error { 1041 if !osutil.IsExist(cfg.BinDir) { 1042 return fmt.Errorf("bin dir %v does not exist", cfg.BinDir) 1043 } 1044 if cfg.Kernel.Userspace != "" && !osutil.IsExist(cfg.Kernel.Userspace) { 1045 return fmt.Errorf("userspace dir %v does not exist", cfg.Kernel.Userspace) 1046 } 1047 if cfg.Kernel.Sysctl != "" && !osutil.IsExist(cfg.Kernel.Sysctl) { 1048 return fmt.Errorf("sysctl file %v does not exist", cfg.Kernel.Sysctl) 1049 } 1050 if cfg.Kernel.Cmdline != "" && !osutil.IsExist(cfg.Kernel.Cmdline) { 1051 return fmt.Errorf("cmdline file %v does not exist", cfg.Kernel.Cmdline) 1052 } 1053 return nil 1054 } 1055 1056 func (env *env) log(msg string) { 1057 env.logf("%v", msg) 1058 } 1059 1060 func (env *env) logf(msg string, args ...interface{}) { 1061 if false { 1062 _ = fmt.Sprintf(msg, args...) // enable printf checker 1063 } 1064 env.cfg.Trace.Log(msg, args...) 1065 } 1066 1067 // pickReleaseTags() picks a subset of revisions to test. 1068 // `all` is an ordered list of tags (from newer to older). 1069 func pickReleaseTags(all []string) []string { 1070 if len(all) == 0 { 1071 return nil 1072 } 1073 // First split into x.y.z, x.y.z-1, ... and x.y, x.y-1, ... 1074 var subReleases, releases []string 1075 releaseBegin := false 1076 for _, tag := range all { 1077 v1, _, rc, v3 := vcs.ParseReleaseTag(tag) 1078 if v1 < 0 || rc < 0 && v3 < 0 { 1079 releaseBegin = true 1080 releases = append(releases, tag) 1081 } 1082 if !releaseBegin { 1083 subReleases = append(subReleases, tag) 1084 } 1085 } 1086 var ret []string 1087 // Take 2 latest sub releases. 1088 takeSubReleases := min(2, len(subReleases)) 1089 ret = append(ret, subReleases[:takeSubReleases]...) 1090 // If there are a lot of sub releases, also take the middle one. 1091 if len(subReleases) > 5 { 1092 ret = append(ret, subReleases[len(subReleases)/2]) 1093 } 1094 for i := 0; i < len(releases); i++ { 1095 // Gradually increase step. 1096 step := 1 1097 if i >= 3 { 1098 step = 2 1099 } 1100 if i >= 11 { 1101 step = 3 1102 } 1103 if i%step == 0 || i == len(releases)-1 { 1104 ret = append(ret, releases[i]) 1105 } 1106 } 1107 return ret 1108 }