// Copyright 2024 syzkaller project authors. All rights reserved.
// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.

package fuzzer

import (
	"bytes"
	"fmt"
	"math/rand"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/google/syzkaller/pkg/corpus"
	"github.com/google/syzkaller/pkg/cover"
	"github.com/google/syzkaller/pkg/flatrpc"
	"github.com/google/syzkaller/pkg/fuzzer/queue"
	"github.com/google/syzkaller/pkg/signal"
	"github.com/google/syzkaller/prog"
)

// job is a unit of background fuzzer work (triage, smash, hints, fault injection).
type job interface {
	run(fuzzer *Fuzzer)
}

// jobIntrospector is implemented by jobs that expose progress information.
type jobIntrospector interface {
	getInfo() *JobInfo
}

// JobInfo carries human-readable progress/debug information about a running job.
type JobInfo struct {
	Name  string
	Calls []string
	Type  string
	Execs atomic.Int32 // number of program executions performed by the job so far

	syncBuffer // embedded log buffer; see Logf/Bytes below
}

// ID returns a unique identifier for the job info object (its address).
func (ji *JobInfo) ID() string {
	return fmt.Sprintf("%p", ji)
}

// genProgRequest creates an execution request for a freshly generated random program.
func genProgRequest(fuzzer *Fuzzer, rnd *rand.Rand) *queue.Request {
	p := fuzzer.target.Generate(rnd,
		fuzzer.RecommendedCalls(),
		fuzzer.ChoiceTable())
	return &queue.Request{
		Prog:     p,
		ExecOpts: setFlags(flatrpc.ExecFlagCollectSignal),
		Stat:     fuzzer.statExecGenerate,
	}
}

// mutateProgRequest creates an execution request for a mutation of a random
// corpus program. Returns nil if the corpus is still empty.
func mutateProgRequest(fuzzer *Fuzzer, rnd *rand.Rand) *queue.Request {
	p := fuzzer.Config.Corpus.ChooseProgram(rnd)
	if p == nil {
		return nil
	}
	newP := p.Clone()
	newP.Mutate(rnd,
		prog.RecommendedCalls,
		fuzzer.ChoiceTable(),
		fuzzer.Config.NoMutateCalls,
		fuzzer.Config.Corpus.Programs(),
	)
	return &queue.Request{
		Prog:     newP,
		ExecOpts: setFlags(flatrpc.ExecFlagCollectSignal),
		Stat:     fuzzer.statExecFuzz,
	}
}

// triageJob are programs for which we noticed potential new coverage during
// first execution. But we are not sure yet if the coverage is real or not.
// During triage we understand if these programs in fact give new coverage,
// and if yes, minimize them and add to corpus.
type triageJob struct {
	p        *prog.Prog
	executor queue.ExecutorID // executor of the original run; avoided during deflake
	flags    ProgFlags
	fuzzer   *Fuzzer
	queue    queue.Executor
	// Set of calls that gave potential new coverage.
	calls map[int]*triageCall

	info *JobInfo
}

// triageCall holds per-call triage state; -1 is used as the key for "extra"
// (call-independent) coverage.
type triageCall struct {
	errno     int32
	newSignal signal.Signal

	// Filled after deflake:
	signals         [deflakeNeedRuns]signal.Signal // signals[i] = signal seen in at least i+1 runs
	stableSignal    signal.Signal
	newStableSignal signal.Signal
	cover           cover.Cover
	rawCover        []uint64
}

// As demonstrated in #4639, programs reproduce with a very high, but not 100% probability.
// The triage algorithm must tolerate this, so let's pick the signal that is common
// to 3 out of 5 runs.
// By binomial distribution, a program that reproduces 80% of time will pass deflake()
// with a 94% probability. If it reproduces 90% of time, it passes in 99% of cases.
//
// During corpus triage we are more permissive and require only 2/6 to produce new stable signal.
// Such parameters make 80% flakiness to pass 99% of time, and even 60% flakiness passes 96% of time.
// First, we don't need to be strict during corpus triage since the program has already passed
// the stricter check when it was added to the corpus. So we can do fewer runs during triage,
// and finish it sooner. If the program does not produce any stable signal any more, just flakes,
// (if the kernel code was changed, or configs disabled), then it still should be phased out
// of the corpus eventually.
// Second, even if small percent of programs are dropped from the corpus due to flaky signal,
// later after several restarts we will add them to the corpus again, and it will create lots
// of duplicate work for minimization/hints/smash/fault injection. For example, a program with
// 60% flakiness has 68% chance to pass 3/5 criteria, but it's also likely to be dropped from
// the corpus if we use the same 3/5 criteria during triage. With a large corpus this effect
// can cause re-addition of thousands of programs to the corpus, and hundreds of thousands
// of runs for the additional work. With 2/6 criteria, a program with 60% flakiness has
// 96% chance to be kept in the corpus after retriage.
const (
	deflakeNeedRuns         = 3
	deflakeMaxRuns          = 5
	deflakeNeedCorpusRuns   = 2
	deflakeMinCorpusRuns    = 4
	deflakeMaxCorpusRuns    = 6
	deflakeTotalCorpusRuns  = 20
	deflakeNeedSnapshotRuns = 2
)

// execute runs a single triage request and counts it in the job stats.
func (job *triageJob) execute(req *queue.Request, flags ProgFlags) *queue.Result {
	defer job.info.Execs.Add(1)
	req.Important = true // All triage executions are important.
	return job.fuzzer.executeWithFlags(job.queue, req, flags)
}

func (job *triageJob) run(fuzzer *Fuzzer) {
	fuzzer.statNewInputs.Add(1)
	job.fuzzer = fuzzer
	job.info.Logf("\n%s", job.p.Serialize())
	for call, info := range job.calls {
		job.info.Logf("call #%d [%s]: |new signal|=%d%s",
			call, job.p.CallName(call), info.newSignal.Len(), signalPreview(info.newSignal))
	}

	// Compute input coverage and non-flaky signal for minimization.
	stop := job.deflake(job.execute)
	if stop {
		return
	}
	var wg sync.WaitGroup
	for call, info := range job.calls {
		wg.Add(1)
		// NOTE(review): call/info are captured without shadowing; this relies on
		// Go 1.22+ per-iteration loop variable semantics — confirm go.mod version.
		go func() {
			job.handleCall(call, info)
			wg.Done()
		}()
	}
	wg.Wait()
}

// handleCall processes one deflaked call: minimizes the program if needed,
// spawns follow-up smash/hints/fault-injection jobs, and saves it to the corpus.
func (job *triageJob) handleCall(call int, info *triageCall) {
	if info.newStableSignal.Empty() {
		return
	}

	p := job.p
	if job.flags&ProgMinimized == 0 {
		p, call = job.minimize(call, info)
		if p == nil {
			return
		}
	}
	callName := p.CallName(call)
	if !job.fuzzer.Config.NewInputFilter(callName) {
		return
	}
	if job.flags&ProgSmashed == 0 {
		job.fuzzer.startJob(job.fuzzer.statJobsSmash, &smashJob{
			exec: job.fuzzer.smashQueue,
			p:    p.Clone(),
			info: &JobInfo{
				Name:  p.String(),
				Type:  "smash",
				Calls: []string{p.CallName(call)},
			},
		})
		// call == -1 denotes "extra" coverage, which has no concrete call to
		// hint/fault-inject, hence the call >= 0 guards below.
		if job.fuzzer.Config.Comparisons && call >= 0 {
			job.fuzzer.startJob(job.fuzzer.statJobsHints, &hintsJob{
				exec: job.fuzzer.smashQueue,
				p:    p.Clone(),
				call: call,
				info: &JobInfo{
					Name:  p.String(),
					Type:  "hints",
					Calls: []string{p.CallName(call)},
				},
			})
		}
		if job.fuzzer.Config.FaultInjection && call >= 0 {
			job.fuzzer.startJob(job.fuzzer.statJobsFaultInjection, &faultInjectionJob{
				exec: job.fuzzer.smashQueue,
				p:    p.Clone(),
				call: call,
			})
		}
	}
	job.fuzzer.Logf(2, "added new input for %v to the corpus: %s", callName, p)
	input := corpus.NewInput{
		Prog:     p,
		Call:     call,
		Signal:   info.stableSignal,
		Cover:    info.cover.Serialize(),
		RawCover: info.rawCover,
	}
	job.fuzzer.Config.Corpus.Save(input)
}

// deflake re-executes the program several times (on different executors when
// possible) to separate stable signal from flaky signal. Returns true if the
// fuzzer is stopping and the job should be abandoned.
func (job *triageJob) deflake(exec func(*queue.Request, ProgFlags) *queue.Result) (stop bool) {
	job.info.Logf("deflake started")

	avoid := []queue.ExecutorID{job.executor}
	needRuns := deflakeNeedCorpusRuns
	if job.fuzzer.Config.Snapshot {
		needRuns = deflakeNeedSnapshotRuns
	} else if job.flags&ProgFromCorpus == 0 {
		needRuns = deflakeNeedRuns
	}
	prevTotalNewSignal := 0
	for run := 1; ; run++ {
		totalNewSignal := 0
		indices := make([]int, 0, len(job.calls))
		for call, info := range job.calls {
			indices = append(indices, call)
			totalNewSignal += len(info.newSignal)
		}
		if job.stopDeflake(run, needRuns, prevTotalNewSignal == totalNewSignal) {
			break
		}
		prevTotalNewSignal = totalNewSignal
		result := exec(&queue.Request{
			Prog:            job.p,
			ExecOpts:        setFlags(flatrpc.ExecFlagCollectCover | flatrpc.ExecFlagCollectSignal),
			ReturnAllSignal: indices,
			Avoid:           avoid,
			Stat:            job.fuzzer.statExecTriage,
		}, progInTriage)
		if result.Stop() {
			return true
		}
		avoid = append(avoid, result.Executor)
		if result.Info == nil {
			continue // the program has failed
		}
		deflakeCall := func(call int, res *flatrpc.CallInfo) {
			info := job.calls[call]
			if info == nil {
				// The call was not in the triage set; re-check whether this run
				// produced new coverage for it and add it if so.
				job.fuzzer.triageProgCall(job.p, res, call, &job.calls)
				info = job.calls[call]
			}
			if info == nil || res == nil {
				return
			}
			if len(info.rawCover) == 0 && job.fuzzer.Config.FetchRawCover {
				info.rawCover = res.Cover
			}
			// Since the signal is frequently flaky, we may get some new max signal.
			// Merge it into the new signal we are chasing.
			// Most likely we won't conclude it's stable signal b/c we already have at least one
			// initial run w/o this signal, so if we exit after needRuns runs,
			// it won't be stable. However, it's still possible if we do more than needRuns runs.
			// But also we already observed it and we know it's flaky, so at least doing
			// cover.addRawMaxSignal for it looks useful.
			prio := signalPrio(job.p, res, call)
			newMaxSignal := job.fuzzer.Cover.addRawMaxSignal(res.Signal, prio)
			info.newSignal.Merge(newMaxSignal)
			info.cover.Merge(res.Cover)
			thisSignal := signal.FromRaw(res.Signal, prio)
			// Cascade: signals[j] accumulates signal seen in >= j+1 runs, so
			// signals[needRuns-1] ends up holding the needRuns-stable signal.
			for j := needRuns - 1; j > 0; j-- {
				intersect := info.signals[j-1].Intersection(thisSignal)
				info.signals[j].Merge(intersect)
			}
			info.signals[0].Merge(thisSignal)
		}
		for i, callInfo := range result.Info.Calls {
			deflakeCall(i, callInfo)
		}
		deflakeCall(-1, result.Info.Extra)
	}
	job.info.Logf("deflake complete")
	for call, info := range job.calls {
		info.stableSignal = info.signals[needRuns-1]
		info.newStableSignal = info.newSignal.Intersection(info.stableSignal)
		job.info.Logf("call #%d [%s]: |stable signal|=%d, |new stable signal|=%d%s",
			call, job.p.CallName(call), info.stableSignal.Len(), info.newStableSignal.Len(),
			signalPreview(info.newStableSignal))
	}
	return false
}

// stopDeflake decides whether the deflake loop may stop before the given run.
// noNewSignal says that the previous run did not add any new signal.
func (job *triageJob) stopDeflake(run, needRuns int, noNewSignal bool) bool {
	if job.fuzzer.Config.Snapshot {
		return run >= needRuns+1
	}
	// haveSignal: every call already has some new signal confirmed in needRuns runs.
	haveSignal := true
	for _, call := range job.calls {
		if !call.newSignal.IntersectsWith(call.signals[needRuns-1]) {
			haveSignal = false
		}
	}
	if job.flags&ProgFromCorpus == 0 {
		// For fuzzing programs we stop if we already have the right deflaked signal for all calls,
		// or there's no chance to get coverage common to needRuns for all calls.
		if run >= deflakeMaxRuns {
			return true
		}
		noChance := true
		for _, call := range job.calls {
			if left := deflakeMaxRuns - run; left >= needRuns ||
				call.newSignal.IntersectsWith(call.signals[needRuns-left-1]) {
				noChance = false
			}
		}
		if haveSignal || noChance {
			return true
		}
	} else if run >= deflakeTotalCorpusRuns ||
		noNewSignal && (run >= deflakeMaxCorpusRuns || run >= deflakeMinCorpusRuns && haveSignal) {
		// For programs from the corpus we use a different condition b/c we want to extract
		// as much flaky signal from them as possible. They have large coverage and run
		// in the beginning, gathering flaky signal on them allows to grow max signal quickly
		// and avoid lots of useless executions later. Any bit of flaky coverage discovered
		// later will lead to triage, and if we are unlucky to conclude it's stable also
		// to minimization+smash+hints (potentially thousands of runs).
		// So we run them at least 5 times, or while we are still getting any new signal.
		return true
	}
	return false
}

// minimize shrinks the program while preserving the new stable signal for call.
// Returns (nil, 0) if the fuzzer is stopping.
func (job *triageJob) minimize(call int, info *triageCall) (*prog.Prog, int) {
	job.info.Logf("[call #%d] minimize started", call)
	minimizeAttempts := 3
	if job.fuzzer.Config.Snapshot {
		minimizeAttempts = 2
	}
	stop := false
	mode := prog.MinimizeCorpus
	if job.fuzzer.Config.PatchTest {
		mode = prog.MinimizeCallsOnly
	}
	p, call := prog.Minimize(job.p, call, mode, func(p1 *prog.Prog, call1 int) bool {
		if stop {
			return false
		}
		// A minimization step succeeds only if, within minimizeAttempts runs,
		// the merged signal covers all of the new stable signal.
		var mergedSignal signal.Signal
		for i := 0; i < minimizeAttempts; i++ {
			result := job.execute(&queue.Request{
				Prog:            p1,
				ExecOpts:        setFlags(flatrpc.ExecFlagCollectSignal),
				ReturnAllSignal: []int{call1},
				Stat:            job.fuzzer.statExecMinimize,
			}, 0)
			if result.Stop() {
				stop = true
				return false
			}
			if !reexecutionSuccess(result.Info, info.errno, call1) {
				// The call was not executed or failed.
				continue
			}
			thisSignal := getSignalAndCover(p1, result.Info, call1)
			if mergedSignal.Len() == 0 {
				mergedSignal = thisSignal
			} else {
				mergedSignal.Merge(thisSignal)
			}
			if info.newStableSignal.Intersection(mergedSignal).Len() == info.newStableSignal.Len() {
				job.info.Logf("[call #%d] minimization step success (|calls| = %d)",
					call, len(p1.Calls))
				return true
			}
		}
		job.info.Logf("[call #%d] minimization step failure", call)
		return false
	})
	if stop {
		return nil, 0
	}
	return p, call
}

// reexecutionSuccess reports whether the re-executed program still produced
// signal for call (or extra signal when call == -1).
func reexecutionSuccess(info *flatrpc.ProgInfo, oldErrno int32, call int) bool {
	if info == nil || len(info.Calls) == 0 {
		return false
	}
	if call != -1 {
		// Don't minimize calls from successful to unsuccessful.
		// Successful calls are much more valuable.
		if oldErrno == 0 && info.Calls[call].Error != 0 {
			return false
		}
		return len(info.Calls[call].Signal) != 0
	}
	return info.Extra != nil && len(info.Extra.Signal) != 0
}

// getSignalAndCover extracts the prioritized signal for call from an execution
// result (extra signal when call == -1).
func getSignalAndCover(p *prog.Prog, info *flatrpc.ProgInfo, call int) signal.Signal {
	inf := info.Extra
	if call != -1 {
		inf = info.Calls[call]
	}
	if inf == nil {
		return nil
	}
	return signal.FromRaw(inf.Signal, signalPrio(p, inf, call))
}

// signalPreview renders up to 3 raw signal values for logging, or "" for
// larger/empty signals.
func signalPreview(s signal.Signal) string {
	if s.Len() > 0 && s.Len() <= 3 {
		var sb strings.Builder
		sb.WriteString(" (")
		for i, x := range s.ToRaw() {
			if i > 0 {
				sb.WriteString(", ")
			}
			fmt.Fprintf(&sb, "0x%x", x)
		}
		sb.WriteByte(')')
		return sb.String()
	}
	return ""
}

func (job *triageJob) getInfo() *JobInfo {
	return job.info
}

// smashJob repeatedly mutates a newly-triaged program to explore around it.
type smashJob struct {
	exec queue.Executor
	p    *prog.Prog
	info *JobInfo
}

func (job *smashJob) run(fuzzer *Fuzzer) {
	fuzzer.Logf(2, "smashing the program %s:", job.p)
	job.info.Logf("\n%s", job.p.Serialize())

	const iters = 25
	rnd := fuzzer.rand()
	for i := 0; i < iters; i++ {
		p := job.p.Clone()
		p.Mutate(rnd, prog.RecommendedCalls,
			fuzzer.ChoiceTable(),
			fuzzer.Config.NoMutateCalls,
			fuzzer.Config.Corpus.Programs())
		result := fuzzer.execute(job.exec, &queue.Request{
			Prog:     p,
			ExecOpts: setFlags(flatrpc.ExecFlagCollectSignal),
			Stat:     fuzzer.statExecSmash,
		})
		if result.Stop() {
			return
		}
		job.info.Execs.Add(1)
	}
}

func (job *smashJob) getInfo() *JobInfo {
	return job.info
}

// randomCollide produces a randomized "collide" variant of the program to
// provoke races between calls.
func randomCollide(origP *prog.Prog, rnd *rand.Rand) *prog.Prog {
	if rnd.Intn(5) == 0 {
		// Old-style collide with a 20% probability.
		p, err := prog.DoubleExecCollide(origP, rnd)
		if err == nil {
			return p
		}
	}
	if rnd.Intn(4) == 0 {
		// Duplicate random calls with a 20% probability (25% * 80%).
		p, err := prog.DupCallCollide(origP, rnd)
		if err == nil {
			return p
		}
	}
	p := prog.AssignRandomAsync(origP, rnd)
	if rnd.Intn(2) != 0 {
		prog.AssignRandomRerun(p, rnd)
	}
	return p
}

// faultInjectionJob executes the program with a fault injected into the
// nth occurrence of the target call, for increasing nth.
type faultInjectionJob struct {
	exec queue.Executor
	p    *prog.Prog
	call int
}

func (job *faultInjectionJob) run(fuzzer *Fuzzer) {
	for nth := 1; nth <= 100; nth++ {
		fuzzer.Logf(2, "injecting fault into call %v, step %v",
			job.call, nth)
		newProg := job.p.Clone()
		newProg.Calls[job.call].Props.FailNth = nth
		result := fuzzer.execute(job.exec, &queue.Request{
			Prog: newProg,
			Stat: fuzzer.statExecFaultInject,
		})
		if result.Stop() {
			return
		}
		info := result.Info
		// Stop once the fault site is past the call's actual number of
		// fault-injectable points (the fault was no longer injected).
		if info != nil && len(info.Calls) > job.call &&
			info.Calls[job.call].Flags&flatrpc.CallFlagFaultInjected == 0 {
			break
		}
	}
}

// hintsJob mutates a program based on comparison operands collected from KCOV.
type hintsJob struct {
	exec queue.Executor
	p    *prog.Prog
	call int
	info *JobInfo
}

func (job *hintsJob) run(fuzzer *Fuzzer) {
	// First execute the original program several times to get comparisons from KCOV.
	// Additional executions lets us filter out flaky values, which seem to constitute ~30-40%.
	p := job.p
	job.info.Logf("\n%s", p.Serialize())

	var comps prog.CompMap
	for i := 0; i < 3; i++ {
		result := fuzzer.execute(job.exec, &queue.Request{
			Prog:     p,
			ExecOpts: setFlags(flatrpc.ExecFlagCollectComps),
			Stat:     fuzzer.statExecSeed,
		})
		if result.Stop() {
			return
		}
		job.info.Execs.Add(1)
		if result.Info == nil || len(result.Info.Calls[job.call].Comps) == 0 {
			continue
		}
		got := make(prog.CompMap)
		for _, cmp := range result.Info.Calls[job.call].Comps {
			got.Add(cmp.Pc, cmp.Op1, cmp.Op2, cmp.IsConst)
		}
		if i == 0 {
			comps = got
		} else {
			comps.InplaceIntersect(got)
		}
	}

	job.info.Logf("stable comps: %d", comps.Len())
	fuzzer.hintsLimiter.Limit(comps)
	job.info.Logf("stable comps (after the hints limiter): %d", comps.Len())

	// Then mutate the initial program for every match between
	// a syscall argument and a comparison operand.
	// Execute each of such mutants to check if it gives new coverage.
	p.MutateWithHints(job.call, comps,
		func(p *prog.Prog) bool {
			defer job.info.Execs.Add(1)
			result := fuzzer.execute(job.exec, &queue.Request{
				Prog:     p,
				ExecOpts: setFlags(flatrpc.ExecFlagCollectSignal),
				Stat:     fuzzer.statExecHint,
			})
			return !result.Stop()
		})
}

func (job *hintsJob) getInfo() *JobInfo {
	return job.info
}

// syncBuffer is a mutex-protected log buffer.
type syncBuffer struct {
	mu  sync.Mutex
	buf bytes.Buffer
}

// Logf appends a timestamped, newline-terminated formatted message.
func (sb *syncBuffer) Logf(logFmt string, args ...any) {
	sb.mu.Lock()
	defer sb.mu.Unlock()

	fmt.Fprintf(&sb.buf, "%s: ", time.Now().Format(time.DateTime))
	fmt.Fprintf(&sb.buf, logFmt, args...)
	sb.buf.WriteByte('\n')
}

// Bytes returns the accumulated log contents.
func (sb *syncBuffer) Bytes() []byte {
	sb.mu.Lock()
	defer sb.mu.Unlock()
	return sb.buf.Bytes()
}