github.com/google/syzkaller@v0.0.0-20251211124644-a066d2bc4b02/pkg/fuzzer/job.go (about)

     1  // Copyright 2024 syzkaller project authors. All rights reserved.
     2  // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
     3  
     4  package fuzzer
     5  
     6  import (
     7  	"bytes"
     8  	"fmt"
     9  	"math/rand"
    10  	"strings"
    11  	"sync"
    12  	"sync/atomic"
    13  	"time"
    14  
    15  	"github.com/google/syzkaller/pkg/corpus"
    16  	"github.com/google/syzkaller/pkg/cover"
    17  	"github.com/google/syzkaller/pkg/flatrpc"
    18  	"github.com/google/syzkaller/pkg/fuzzer/queue"
    19  	"github.com/google/syzkaller/pkg/signal"
    20  	"github.com/google/syzkaller/prog"
    21  )
    22  
// job is a unit of follow-up fuzzer work (triage, smash, hints, fault
// injection) scheduled via Fuzzer.startJob.
type job interface {
	run(fuzzer *Fuzzer)
}
    26  
// jobIntrospector is implemented by jobs that expose their JobInfo
// (name, type, execution counter, and log) for introspection.
type jobIntrospector interface {
	getInfo() *JobInfo
}
    30  
// JobInfo carries introspection data about a single job.
type JobInfo struct {
	Name  string       // human-readable job name (typically the program string)
	Calls []string     // names of the calls the job focuses on
	Type  string       // job kind, e.g. "smash" or "hints"
	Execs atomic.Int32 // number of executions performed by the job so far

	// syncBuffer accumulates the job's log lines (written via Logf).
	syncBuffer
}
    39  
// ID returns an identifier for this JobInfo based on its pointer address,
// so it is unique only within the lifetime of the process.
func (ji *JobInfo) ID() string {
	return fmt.Sprintf("%p", ji)
}
    43  
    44  func genProgRequest(fuzzer *Fuzzer, rnd *rand.Rand) *queue.Request {
    45  	p := fuzzer.target.Generate(rnd,
    46  		fuzzer.RecommendedCalls(),
    47  		fuzzer.ChoiceTable())
    48  	return &queue.Request{
    49  		Prog:     p,
    50  		ExecOpts: setFlags(flatrpc.ExecFlagCollectSignal),
    51  		Stat:     fuzzer.statExecGenerate,
    52  	}
    53  }
    54  
    55  func mutateProgRequest(fuzzer *Fuzzer, rnd *rand.Rand) *queue.Request {
    56  	p := fuzzer.Config.Corpus.ChooseProgram(rnd)
    57  	if p == nil {
    58  		return nil
    59  	}
    60  	newP := p.Clone()
    61  	newP.Mutate(rnd,
    62  		prog.RecommendedCalls,
    63  		fuzzer.ChoiceTable(),
    64  		fuzzer.Config.NoMutateCalls,
    65  		fuzzer.Config.Corpus.Programs(),
    66  	)
    67  	return &queue.Request{
    68  		Prog:     newP,
    69  		ExecOpts: setFlags(flatrpc.ExecFlagCollectSignal),
    70  		Stat:     fuzzer.statExecFuzz,
    71  	}
    72  }
    73  
// triageJob are programs for which we noticed potential new coverage during
// first execution. But we are not sure yet if the coverage is real or not.
// During triage we understand if these programs in fact give new coverage,
// and if yes, minimize them and add to corpus.
type triageJob struct {
	p        *prog.Prog       // the program being triaged
	executor queue.ExecutorID // executor of the original run; deflake avoids it
	flags    ProgFlags        // origin flags (minimized/smashed/from corpus/...)
	fuzzer   *Fuzzer          // set at the start of run()
	queue    queue.Executor   // queue used for all triage executions
	// Set of calls that gave potential new coverage.
	calls map[int]*triageCall

	info *JobInfo // introspection info/log for this job
}
    89  
// triageCall holds per-call triage state (call index -1 stands for
// extra coverage, see deflake).
type triageCall struct {
	errno     int32         // errno of the call in the original run
	newSignal signal.Signal // potential new signal that triggered the triage

	// Filled after deflake:
	signals         [deflakeNeedRuns]signal.Signal // signals[i] holds signal seen in at least i+1 runs
	stableSignal    signal.Signal                  // signal common to the required number of runs
	newStableSignal signal.Signal                  // intersection of newSignal with stableSignal
	cover           cover.Cover                    // coverage merged over all deflake runs
	rawCover        []uint64                       // raw PCs (only when Config.FetchRawCover is set)
}
   101  
// As demonstrated in #4639, programs reproduce with a very high, but not 100% probability.
// The triage algorithm must tolerate this, so let's pick the signal that is common
// to 3 out of 5 runs.
// By binomial distribution, a program that reproduces 80% of time will pass deflake()
// with a 94% probability. If it reproduces 90% of time, it passes in 99% of cases.
//
// During corpus triage we are more permissive and require only 2/6 to produce new stable signal.
// Such parameters make 80% flakiness to pass 99% of time, and even 60% flakiness passes 96% of time.
// First, we don't need to be strict during corpus triage since the program has already passed
// the stricter check when it was added to the corpus. So we can do fewer runs during triage,
// and finish it sooner. If the program does not produce any stable signal any more, just flakes,
// (if the kernel code was changed, or configs disabled), then it still should be phased out
// of the corpus eventually.
// Second, even if small percent of programs are dropped from the corpus due to flaky signal,
// later after several restarts we will add them to the corpus again, and it will create lots
// of duplicate work for minimization/hints/smash/fault injection. For example, a program with
// 60% flakiness has 68% chance to pass 3/5 criteria, but it's also likely to be dropped from
// the corpus if we use the same 3/5 criteria during triage. With a large corpus this effect
// can cause re-addition of thousands of programs to the corpus, and hundreds of thousands
// of runs for the additional work. With 2/6 criteria, a program with 60% flakiness has
// 96% chance to be kept in the corpus after retriage.
const (
	deflakeNeedRuns         = 3  // signal must be common to 3 runs (fuzzing programs)
	deflakeMaxRuns          = 5  // at most 5 deflake runs for fuzzing programs
	deflakeNeedCorpusRuns   = 2  // for corpus programs, signal common to 2 runs suffices
	deflakeMinCorpusRuns    = 4  // minimum runs before a corpus program may stop early
	deflakeMaxCorpusRuns    = 6  // stop corpus deflake here once no new signal appears
	deflakeTotalCorpusRuns  = 20 // hard cap on corpus deflake runs
	deflakeNeedSnapshotRuns = 2  // in snapshot mode 2 runs suffice
)
   132  
   133  func (job *triageJob) execute(req *queue.Request, flags ProgFlags) *queue.Result {
   134  	defer job.info.Execs.Add(1)
   135  	req.Important = true // All triage executions are important.
   136  	return job.fuzzer.executeWithFlags(job.queue, req, flags)
   137  }
   138  
// run executes the triage pipeline: log the program, deflake it to compute
// the stable part of the new signal, then process every triaged call
// (minimize / start follow-up jobs / save to corpus) concurrently.
func (job *triageJob) run(fuzzer *Fuzzer) {
	fuzzer.statNewInputs.Add(1)
	job.fuzzer = fuzzer
	job.info.Logf("\n%s", job.p.Serialize())
	for call, info := range job.calls {
		job.info.Logf("call #%d [%s]: |new signal|=%d%s",
			call, job.p.CallName(call), info.newSignal.Len(), signalPreview(info.newSignal))
	}

	// Compute input coverage and non-flaky signal for minimization.
	stop := job.deflake(job.execute)
	if stop {
		// Execution was stopped (shutdown) - abandon the job.
		return
	}
	var wg sync.WaitGroup
	for call, info := range job.calls {
		wg.Add(1)
		// NOTE: the closure captures the loop variables directly, which
		// relies on Go 1.22+ per-iteration loop variable semantics.
		go func() {
			job.handleCall(call, info)
			wg.Done()
		}()
	}
	wg.Wait()
}
   163  
// handleCall finishes triage of a single call after deflake: it minimizes
// the program (unless already minimized), spawns smash/hints/fault-injection
// follow-up jobs (unless already smashed), and saves the input to the corpus.
func (job *triageJob) handleCall(call int, info *triageCall) {
	if info.newStableSignal.Empty() {
		// All of the new signal turned out to be flaky - nothing to add.
		return
	}

	p := job.p
	if job.flags&ProgMinimized == 0 {
		p, call = job.minimize(call, info)
		if p == nil {
			// Minimization was aborted because execution was stopped.
			return
		}
	}
	callName := p.CallName(call)
	if !job.fuzzer.Config.NewInputFilter(callName) {
		return
	}
	if job.flags&ProgSmashed == 0 {
		job.fuzzer.startJob(job.fuzzer.statJobsSmash, &smashJob{
			exec: job.fuzzer.smashQueue,
			p:    p.Clone(),
			info: &JobInfo{
				Name:  p.String(),
				Type:  "smash",
				Calls: []string{p.CallName(call)},
			},
		})
		// Hints and fault injection need a concrete call; call < 0 means
		// extra coverage, which has no call to operate on.
		if job.fuzzer.Config.Comparisons && call >= 0 {
			job.fuzzer.startJob(job.fuzzer.statJobsHints, &hintsJob{
				exec: job.fuzzer.smashQueue,
				p:    p.Clone(),
				call: call,
				info: &JobInfo{
					Name:  p.String(),
					Type:  "hints",
					Calls: []string{p.CallName(call)},
				},
			})
		}
		if job.fuzzer.Config.FaultInjection && call >= 0 {
			job.fuzzer.startJob(job.fuzzer.statJobsFaultInjection, &faultInjectionJob{
				exec: job.fuzzer.smashQueue,
				p:    p.Clone(),
				call: call,
			})
		}
	}
	job.fuzzer.Logf(2, "added new input for %v to the corpus: %s", callName, p)
	input := corpus.NewInput{
		Prog:     p,
		Call:     call,
		Signal:   info.stableSignal,
		Cover:    info.cover.Serialize(),
		RawCover: info.rawCover,
	}
	job.fuzzer.Config.Corpus.Save(input)
}
   220  
// deflake repeatedly re-executes the program (avoiding already-used
// executors) and computes, for every triaged call, the signal that is
// common to the required number of runs (see the deflake* constants and
// stopDeflake for the stopping rules). It fills in stableSignal,
// newStableSignal, cover and rawCover of each triageCall.
// It returns stop=true if execution was stopped (shutdown).
func (job *triageJob) deflake(exec func(*queue.Request, ProgFlags) *queue.Result) (stop bool) {
	job.info.Logf("deflake started")

	// Start by avoiding the executor of the original run to rule out
	// per-executor effects.
	avoid := []queue.ExecutorID{job.executor}
	needRuns := deflakeNeedCorpusRuns
	if job.fuzzer.Config.Snapshot {
		needRuns = deflakeNeedSnapshotRuns
	} else if job.flags&ProgFromCorpus == 0 {
		needRuns = deflakeNeedRuns
	}
	prevTotalNewSignal := 0
	for run := 1; ; run++ {
		totalNewSignal := 0
		indices := make([]int, 0, len(job.calls))
		for call, info := range job.calls {
			indices = append(indices, call)
			totalNewSignal += len(info.newSignal)
		}
		if job.stopDeflake(run, needRuns, prevTotalNewSignal == totalNewSignal) {
			break
		}
		prevTotalNewSignal = totalNewSignal
		result := exec(&queue.Request{
			Prog:            job.p,
			ExecOpts:        setFlags(flatrpc.ExecFlagCollectCover | flatrpc.ExecFlagCollectSignal),
			ReturnAllSignal: indices,
			Avoid:           avoid,
			Stat:            job.fuzzer.statExecTriage,
		}, progInTriage)
		if result.Stop() {
			return true
		}
		avoid = append(avoid, result.Executor)
		if result.Info == nil {
			continue // the program has failed
		}
		// deflakeCall merges one run's result for one call into the
		// per-call state. The signals array forms a "ladder":
		// signals[i] holds signal observed in at least i+1 runs.
		deflakeCall := func(call int, res *flatrpc.CallInfo) {
			info := job.calls[call]
			if info == nil {
				// The call was not triaged initially, but this run may have
				// produced new coverage for it - start tracking it too.
				job.fuzzer.triageProgCall(job.p, res, call, &job.calls)
				info = job.calls[call]
			}
			if info == nil || res == nil {
				return
			}
			if len(info.rawCover) == 0 && job.fuzzer.Config.FetchRawCover {
				info.rawCover = res.Cover
			}
			// Since the signal is frequently flaky, we may get some new new max signal.
			// Merge it into the new signal we are chasing.
			// Most likely we won't conclude it's stable signal b/c we already have at least one
			// initial run w/o this signal, so if we exit after needRuns runs,
			// it won't be stable. However, it's still possible if we do more than needRuns runs.
			// But also we already observed it and we know it's flaky, so at least doing
			// cover.addRawMaxSignal for it looks useful.
			prio := signalPrio(job.p, res, call)
			newMaxSignal := job.fuzzer.Cover.addRawMaxSignal(res.Signal, prio)
			info.newSignal.Merge(newMaxSignal)
			info.cover.Merge(res.Cover)
			thisSignal := signal.FromRaw(res.Signal, prio)
			for j := needRuns - 1; j > 0; j-- {
				intersect := info.signals[j-1].Intersection(thisSignal)
				info.signals[j].Merge(intersect)
			}
			info.signals[0].Merge(thisSignal)
		}
		for i, callInfo := range result.Info.Calls {
			deflakeCall(i, callInfo)
		}
		deflakeCall(-1, result.Info.Extra)
	}
	job.info.Logf("deflake complete")
	for call, info := range job.calls {
		// Signal present in needRuns runs is considered stable.
		info.stableSignal = info.signals[needRuns-1]
		info.newStableSignal = info.newSignal.Intersection(info.stableSignal)
		job.info.Logf("call #%d [%s]: |stable signal|=%d, |new stable signal|=%d%s",
			call, job.p.CallName(call), info.stableSignal.Len(), info.newStableSignal.Len(),
			signalPreview(info.newStableSignal))
	}
	return false
}
   302  
// stopDeflake reports whether deflake() should stop after `run` completed
// runs. needRuns is the number of runs the signal must be common to;
// noNewSignal indicates that the previous run did not add any new signal.
func (job *triageJob) stopDeflake(run, needRuns int, noNewSignal bool) bool {
	if job.fuzzer.Config.Snapshot {
		// In snapshot mode just perform a fixed number of runs.
		return run >= needRuns+1
	}
	// haveSignal: every call already has new signal common to needRuns runs.
	haveSignal := true
	for _, call := range job.calls {
		if !call.newSignal.IntersectsWith(call.signals[needRuns-1]) {
			haveSignal = false
		}
	}
	if job.flags&ProgFromCorpus == 0 {
		// For fuzzing programs we stop if we already have the right deflaked signal for all calls,
		// or there's no chance to get coverage common to needRuns for all calls.
		if run >= deflakeMaxRuns {
			return true
		}
		noChance := true
		for _, call := range job.calls {
			// The call still has a chance if enough runs remain (left >= needRuns),
			// or if the new signal already reached the rung of the signals ladder
			// from which the remaining runs can still promote it to stable.
			if left := deflakeMaxRuns - run; left >= needRuns ||
				call.newSignal.IntersectsWith(call.signals[needRuns-left-1]) {
				noChance = false
			}
		}
		if haveSignal || noChance {
			return true
		}
	} else if run >= deflakeTotalCorpusRuns ||
		noNewSignal && (run >= deflakeMaxCorpusRuns || run >= deflakeMinCorpusRuns && haveSignal) {
		// For programs from the corpus we use a different condition b/c we want to extract
		// as much flaky signal from them as possible. They have large coverage and run
		// in the beginning, gathering flaky signal on them allows to grow max signal quickly
		// and avoid lots of useless executions later. Any bit of flaky coverage discovered
		// later will lead to triage, and if we are unlucky to conclude it's stable also
		// to minimization+smash+hints (potentially thousands of runs).
		// So we run them at least 5 times, or while we are still getting any new signal.
		return true
	}
	return false
}
   342  
// minimize shrinks the program to a smaller variant that still produces all
// of the call's new stable signal. Each minimization candidate is executed
// up to minimizeAttempts times and accepted only if the merged signal of
// those runs covers the whole newStableSignal.
// It returns (nil, 0) if execution was stopped (shutdown).
func (job *triageJob) minimize(call int, info *triageCall) (*prog.Prog, int) {
	job.info.Logf("[call #%d] minimize started", call)
	minimizeAttempts := 3
	if job.fuzzer.Config.Snapshot {
		minimizeAttempts = 2
	}
	stop := false
	mode := prog.MinimizeCorpus
	if job.fuzzer.Config.PatchTest {
		mode = prog.MinimizeCallsOnly
	}
	p, call := prog.Minimize(job.p, call, mode, func(p1 *prog.Prog, call1 int) bool {
		if stop {
			return false
		}
		var mergedSignal signal.Signal
		for i := 0; i < minimizeAttempts; i++ {
			result := job.execute(&queue.Request{
				Prog:            p1,
				ExecOpts:        setFlags(flatrpc.ExecFlagCollectSignal),
				ReturnAllSignal: []int{call1},
				Stat:            job.fuzzer.statExecMinimize,
			}, 0)
			if result.Stop() {
				stop = true
				return false
			}
			if !reexecutionSuccess(result.Info, info.errno, call1) {
				// The call was not executed or failed.
				continue
			}
			// Merge signal over the attempts: the candidate passes as soon as
			// the union of its runs covers the whole new stable signal.
			thisSignal := getSignalAndCover(p1, result.Info, call1)
			if mergedSignal.Len() == 0 {
				mergedSignal = thisSignal
			} else {
				mergedSignal.Merge(thisSignal)
			}
			if info.newStableSignal.Intersection(mergedSignal).Len() == info.newStableSignal.Len() {
				job.info.Logf("[call #%d] minimization step success (|calls| = %d)",
					call, len(p1.Calls))
				return true
			}
		}
		job.info.Logf("[call #%d] minimization step failure", call)
		return false
	})
	if stop {
		return nil, 0
	}
	return p, call
}
   394  
   395  func reexecutionSuccess(info *flatrpc.ProgInfo, oldErrno int32, call int) bool {
   396  	if info == nil || len(info.Calls) == 0 {
   397  		return false
   398  	}
   399  	if call != -1 {
   400  		// Don't minimize calls from successful to unsuccessful.
   401  		// Successful calls are much more valuable.
   402  		if oldErrno == 0 && info.Calls[call].Error != 0 {
   403  			return false
   404  		}
   405  		return len(info.Calls[call].Signal) != 0
   406  	}
   407  	return info.Extra != nil && len(info.Extra.Signal) != 0
   408  }
   409  
   410  func getSignalAndCover(p *prog.Prog, info *flatrpc.ProgInfo, call int) signal.Signal {
   411  	inf := info.Extra
   412  	if call != -1 {
   413  		inf = info.Calls[call]
   414  	}
   415  	if inf == nil {
   416  		return nil
   417  	}
   418  	return signal.FromRaw(inf.Signal, signalPrio(p, inf, call))
   419  }
   420  
   421  func signalPreview(s signal.Signal) string {
   422  	if s.Len() > 0 && s.Len() <= 3 {
   423  		var sb strings.Builder
   424  		sb.WriteString(" (")
   425  		for i, x := range s.ToRaw() {
   426  			if i > 0 {
   427  				sb.WriteString(", ")
   428  			}
   429  			fmt.Fprintf(&sb, "0x%x", x)
   430  		}
   431  		sb.WriteByte(')')
   432  		return sb.String()
   433  	}
   434  	return ""
   435  }
   436  
// getInfo implements jobIntrospector.
func (job *triageJob) getInfo() *JobInfo {
	return job.info
}
   440  
// smashJob repeatedly mutates an interesting (just triaged) program to
// explore its neighborhood in the program space.
type smashJob struct {
	exec queue.Executor // queue to execute the mutants on
	p    *prog.Prog     // seed program to mutate
	info *JobInfo       // introspection info/log for this job
}
   446  
   447  func (job *smashJob) run(fuzzer *Fuzzer) {
   448  	fuzzer.Logf(2, "smashing the program %s:", job.p)
   449  	job.info.Logf("\n%s", job.p.Serialize())
   450  
   451  	const iters = 25
   452  	rnd := fuzzer.rand()
   453  	for i := 0; i < iters; i++ {
   454  		p := job.p.Clone()
   455  		p.Mutate(rnd, prog.RecommendedCalls,
   456  			fuzzer.ChoiceTable(),
   457  			fuzzer.Config.NoMutateCalls,
   458  			fuzzer.Config.Corpus.Programs())
   459  		result := fuzzer.execute(job.exec, &queue.Request{
   460  			Prog:     p,
   461  			ExecOpts: setFlags(flatrpc.ExecFlagCollectSignal),
   462  			Stat:     fuzzer.statExecSmash,
   463  		})
   464  		if result.Stop() {
   465  			return
   466  		}
   467  		job.info.Execs.Add(1)
   468  	}
   469  }
   470  
// getInfo implements jobIntrospector.
func (job *smashJob) getInfo() *JobInfo {
	return job.info
}
   474  
   475  func randomCollide(origP *prog.Prog, rnd *rand.Rand) *prog.Prog {
   476  	if rnd.Intn(5) == 0 {
   477  		// Old-style collide with a 20% probability.
   478  		p, err := prog.DoubleExecCollide(origP, rnd)
   479  		if err == nil {
   480  			return p
   481  		}
   482  	}
   483  	if rnd.Intn(4) == 0 {
   484  		// Duplicate random calls with a 20% probability (25% * 80%).
   485  		p, err := prog.DupCallCollide(origP, rnd)
   486  		if err == nil {
   487  			return p
   488  		}
   489  	}
   490  	p := prog.AssignRandomAsync(origP, rnd)
   491  	if rnd.Intn(2) != 0 {
   492  		prog.AssignRandomRerun(p, rnd)
   493  	}
   494  	return p
   495  }
   496  
// faultInjectionJob re-executes the program while injecting a fault into
// successive fault sites of one call (see run).
type faultInjectionJob struct {
	exec queue.Executor // queue to execute on
	p    *prog.Prog     // program to execute
	call int            // index of the call to inject faults into
}
   502  
// run executes the program with FailNth set to 1, 2, ..., 100 for the target
// call. It stops early when a run reports that no fault was actually injected
// (the nth fault site was never reached) or when execution is stopped.
func (job *faultInjectionJob) run(fuzzer *Fuzzer) {
	for nth := 1; nth <= 100; nth++ {
		fuzzer.Logf(2, "injecting fault into call %v, step %v",
			job.call, nth)
		newProg := job.p.Clone()
		newProg.Calls[job.call].Props.FailNth = nth
		result := fuzzer.execute(job.exec, &queue.Request{
			Prog: newProg,
			Stat: fuzzer.statExecFaultInject,
		})
		if result.Stop() {
			return
		}
		info := result.Info
		// An unset CallFlagFaultInjected means the nth fault site was never
		// reached, so larger nth values cannot be reached either.
		if info != nil && len(info.Calls) > job.call &&
			info.Calls[job.call].Flags&flatrpc.CallFlagFaultInjected == 0 {
			break
		}
	}
}
   523  
// hintsJob implements comparison hints: it collects comparison operands
// observed by KCOV for one call and then mutates the program by substituting
// matching syscall arguments with the compared-against values.
type hintsJob struct {
	exec queue.Executor // queue to execute on
	p    *prog.Prog     // program to collect hints for
	call int            // index of the call of interest (always >= 0, see handleCall)
	info *JobInfo       // introspection info/log for this job
}
   530  
   531  func (job *hintsJob) run(fuzzer *Fuzzer) {
   532  	// First execute the original program several times to get comparisons from KCOV.
   533  	// Additional executions lets us filter out flaky values, which seem to constitute ~30-40%.
   534  	p := job.p
   535  	job.info.Logf("\n%s", p.Serialize())
   536  
   537  	var comps prog.CompMap
   538  	for i := 0; i < 3; i++ {
   539  		result := fuzzer.execute(job.exec, &queue.Request{
   540  			Prog:     p,
   541  			ExecOpts: setFlags(flatrpc.ExecFlagCollectComps),
   542  			Stat:     fuzzer.statExecSeed,
   543  		})
   544  		if result.Stop() {
   545  			return
   546  		}
   547  		job.info.Execs.Add(1)
   548  		if result.Info == nil || len(result.Info.Calls[job.call].Comps) == 0 {
   549  			continue
   550  		}
   551  		got := make(prog.CompMap)
   552  		for _, cmp := range result.Info.Calls[job.call].Comps {
   553  			got.Add(cmp.Pc, cmp.Op1, cmp.Op2, cmp.IsConst)
   554  		}
   555  		if i == 0 {
   556  			comps = got
   557  		} else {
   558  			comps.InplaceIntersect(got)
   559  		}
   560  	}
   561  
   562  	job.info.Logf("stable comps: %d", comps.Len())
   563  	fuzzer.hintsLimiter.Limit(comps)
   564  	job.info.Logf("stable comps (after the hints limiter): %d", comps.Len())
   565  
   566  	// Then mutate the initial program for every match between
   567  	// a syscall argument and a comparison operand.
   568  	// Execute each of such mutants to check if it gives new coverage.
   569  	p.MutateWithHints(job.call, comps,
   570  		func(p *prog.Prog) bool {
   571  			defer job.info.Execs.Add(1)
   572  			result := fuzzer.execute(job.exec, &queue.Request{
   573  				Prog:     p,
   574  				ExecOpts: setFlags(flatrpc.ExecFlagCollectSignal),
   575  				Stat:     fuzzer.statExecHint,
   576  			})
   577  			return !result.Stop()
   578  		})
   579  }
   580  
// getInfo implements jobIntrospector.
func (job *hintsJob) getInfo() *JobInfo {
	return job.info
}
   584  
   585  type syncBuffer struct {
   586  	mu  sync.Mutex
   587  	buf bytes.Buffer
   588  }
   589  
   590  func (sb *syncBuffer) Logf(logFmt string, args ...any) {
   591  	sb.mu.Lock()
   592  	defer sb.mu.Unlock()
   593  
   594  	fmt.Fprintf(&sb.buf, "%s: ", time.Now().Format(time.DateTime))
   595  	fmt.Fprintf(&sb.buf, logFmt, args...)
   596  	sb.buf.WriteByte('\n')
   597  }
   598  
   599  func (sb *syncBuffer) Bytes() []byte {
   600  	sb.mu.Lock()
   601  	defer sb.mu.Unlock()
   602  	return sb.buf.Bytes()
   603  }