github.com/google/syzkaller@v0.0.0-20251211124644-a066d2bc4b02/pkg/manager/seeds.go (about)

     1  // Copyright 2024 syzkaller project authors. All rights reserved.
     2  // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
     3  
     4  package manager
     5  
     6  import (
     7  	"bufio"
     8  	"bytes"
     9  	"errors"
    10  	"fmt"
    11  	"math/rand"
    12  	"os"
    13  	"path/filepath"
    14  	"runtime"
    15  	"strings"
    16  	"sync"
    17  	"time"
    18  
    19  	"github.com/google/syzkaller/pkg/db"
    20  	"github.com/google/syzkaller/pkg/fuzzer"
    21  	"github.com/google/syzkaller/pkg/hash"
    22  	"github.com/google/syzkaller/pkg/log"
    23  	"github.com/google/syzkaller/pkg/mgrconfig"
    24  	"github.com/google/syzkaller/pkg/osutil"
    25  	"github.com/google/syzkaller/prog"
    26  )
    27  
// Seeds is the result of LoadSeeds: the opened corpus database plus the
// programs that should be offered to the fuzzer as candidates.
type Seeds struct {
	// CorpusDB is the database opened from corpus.db in the manager workdir.
	CorpusDB *db.DB
	// Fresh is true when the corpus database contained no records when it was loaded.
	Fresh bool
	// Candidates holds the successfully parsed corpus programs and seed programs.
	Candidates []fuzzer.Candidate
}
    33  
    34  func LoadSeeds(cfg *mgrconfig.Config, immutable bool) (Seeds, error) {
    35  	var info Seeds
    36  	var err error
    37  	info.CorpusDB, err = db.Open(filepath.Join(cfg.Workdir, "corpus.db"), !immutable)
    38  	if err != nil {
    39  		if info.CorpusDB == nil {
    40  			return Seeds{}, fmt.Errorf("failed to open corpus database: %w", err)
    41  		}
    42  		log.Errorf("read %v inputs from corpus and got error: %v", len(info.CorpusDB.Records), err)
    43  	}
    44  	info.Fresh = len(info.CorpusDB.Records) == 0
    45  	corpusFlags := versionToFlags(info.CorpusDB.Version)
    46  	outputs := make(chan *input, 32)
    47  	chErr := make(chan error, 1)
    48  	go func() {
    49  		chErr <- readInputs(cfg, info.CorpusDB, outputs)
    50  		close(outputs)
    51  	}()
    52  
    53  	brokenSeeds := 0
    54  	skippedSeeds := 0
    55  	var brokenCorpus []string
    56  	var candidates []fuzzer.Candidate
    57  	for inp := range outputs {
    58  		if inp.Prog == nil {
    59  			if inp.IsSeed {
    60  				if errors.Is(inp.Err, ErrSkippedTest) {
    61  					skippedSeeds++
    62  					log.Logf(2, "seed %s is skipped: %s", inp.Path, inp.Err)
    63  				} else {
    64  					brokenSeeds++
    65  					log.Logf(0, "seed %s is broken: %s", inp.Path, inp.Err)
    66  				}
    67  			} else {
    68  				brokenCorpus = append(brokenCorpus, inp.Key)
    69  			}
    70  			continue
    71  		}
    72  		flags := corpusFlags
    73  		if inp.IsSeed {
    74  			if _, ok := info.CorpusDB.Records[hash.String(inp.Prog.Serialize())]; ok {
    75  				continue
    76  			}
    77  			// Seeds are not considered "from corpus" (won't be rerun multiple times)
    78  			// b/c they are tried on every start anyway.
    79  			flags = fuzzer.ProgMinimized
    80  		}
    81  		candidates = append(candidates, fuzzer.Candidate{
    82  			Prog:  inp.Prog,
    83  			Flags: flags,
    84  		})
    85  	}
    86  	if err := <-chErr; err != nil {
    87  		return Seeds{}, err
    88  	}
    89  	if len(brokenCorpus)+brokenSeeds != 0 {
    90  		log.Logf(0, "broken programs in the corpus: %v, broken seeds: %v", len(brokenCorpus), brokenSeeds)
    91  	}
    92  	if skippedSeeds != 0 {
    93  		log.Logf(0, "skipped %v seeds", skippedSeeds)
    94  	}
    95  	if !immutable {
    96  		// This needs to be done outside of the loop above to not race with corpusDB reads.
    97  		for _, sig := range brokenCorpus {
    98  			info.CorpusDB.Delete(sig)
    99  		}
   100  		if err := info.CorpusDB.Flush(); err != nil {
   101  			return Seeds{}, fmt.Errorf("failed to save corpus database: %w", err)
   102  		}
   103  	}
   104  	// Switch database to the mode when it does not keep records in memory.
   105  	// We don't need them anymore and they consume lots of memory.
   106  	info.CorpusDB.DiscardData()
   107  	info.Candidates = candidates
   108  	return info, nil
   109  }
   110  
// input is one unit of work for readInputs: raw program bytes together with
// the outcome of parsing them.
type input struct {
	// IsSeed is set for programs read from the seed directory and unset for corpus records.
	IsSeed bool
	// Key is the corpus database key, set for corpus records only.
	Key string
	// Path is the seed file path (sys/<OS>/test/<name>), set for seeds only.
	Path string
	// Data is the serialized program text.
	Data []byte
	// Prog and Err hold the result of ParseSeed on Data.
	Prog *prog.Prog
	Err  error
}
   119  
   120  func readInputs(cfg *mgrconfig.Config, db *db.DB, output chan *input) error {
   121  	procs := runtime.GOMAXPROCS(0)
   122  	inputs := make(chan *input, procs)
   123  	var wg sync.WaitGroup
   124  	wg.Add(procs)
   125  
   126  	defer wg.Wait()
   127  	defer close(inputs)
   128  	for p := 0; p < procs; p++ {
   129  		go func() {
   130  			defer wg.Done()
   131  			for inp := range inputs {
   132  				inp.Prog, inp.Err = ParseSeed(cfg.Target, inp.Data)
   133  				output <- inp
   134  			}
   135  		}()
   136  	}
   137  
   138  	for key, rec := range db.Records {
   139  		inputs <- &input{
   140  			Key:  key,
   141  			Data: rec.Val,
   142  		}
   143  	}
   144  	seedPath := filepath.Join("sys", cfg.TargetOS, "test")
   145  	seedDir := filepath.Join(cfg.Syzkaller, seedPath)
   146  	if osutil.IsExist(seedDir) {
   147  		seeds, err := os.ReadDir(seedDir)
   148  		if err != nil {
   149  			return fmt.Errorf("failed to read seeds dir: %w", err)
   150  		}
   151  		for _, seed := range seeds {
   152  			data, err := os.ReadFile(filepath.Join(seedDir, seed.Name()))
   153  			if err != nil {
   154  				return fmt.Errorf("failed to read seed %v: %w", seed.Name(), err)
   155  			}
   156  			inputs <- &input{
   157  				IsSeed: true,
   158  				Path:   filepath.Join(seedPath, seed.Name()),
   159  				Data:   data,
   160  			}
   161  		}
   162  	}
   163  	return nil
   164  }
   165  
   166  const CurrentDBVersion = 5
   167  
   168  func versionToFlags(version uint64) fuzzer.ProgFlags {
   169  	// By default we don't re-minimize/re-smash programs from corpus,
   170  	// it takes lots of time on start and is unnecessary.
   171  	// However, on version bumps we can selectively re-minimize/re-smash.
   172  	corpusFlags := fuzzer.ProgFromCorpus | fuzzer.ProgMinimized | fuzzer.ProgSmashed
   173  	switch version {
   174  	case 0:
   175  		// Version 0 had broken minimization, so we need to re-minimize.
   176  		corpusFlags &= ^fuzzer.ProgMinimized
   177  		fallthrough
   178  	case 1:
   179  		// Version 1->2: memory is preallocated so lots of mmaps become unnecessary.
   180  		corpusFlags &= ^fuzzer.ProgMinimized
   181  		fallthrough
   182  	case 2:
   183  		// Version 2->3: big-endian hints.
   184  		corpusFlags &= ^fuzzer.ProgSmashed
   185  		fallthrough
   186  	case 3:
   187  		// Version 3->4: to shake things up.
   188  		corpusFlags &= ^fuzzer.ProgMinimized
   189  		fallthrough
   190  	case 4:
   191  		// Version 4->5: fix for comparison argument sign extension.
   192  		// Introduced in 1ba0279d74a35e96e81de87073212d2b20256e8f.
   193  
   194  		// Update (July 2024):
   195  		// We used to reset the fuzzer.ProgSmashed flag here, but it has led to
   196  		// perpetual corpus retriage on slow syzkaller instances. By now, all faster
   197  		// instances must have already bumped their corpus versions, so let's just
   198  		// increase the version to let all others go past the corpus triage stage.
   199  		fallthrough
   200  	case CurrentDBVersion:
   201  	}
   202  	return corpusFlags
   203  }
   204  
   205  func ParseSeed(target *prog.Target, data []byte) (*prog.Prog, error) {
   206  	p, _, err := parseProg(target, data, prog.NonStrict, nil)
   207  	return p, err
   208  }
   209  
   210  func ParseSeedWithRequirements(target *prog.Target, data []byte, reqs map[string]bool) (
   211  	*prog.Prog, map[string]bool, error) {
   212  	return parseProg(target, data, prog.Strict, reqs)
   213  }
   214  
   215  func parseRequires(data []byte) map[string]bool {
   216  	requires := make(map[string]bool)
   217  	for s := bufio.NewScanner(bytes.NewReader(data)); s.Scan(); {
   218  		const prefix = "# requires:"
   219  		line := s.Text()
   220  		if !strings.HasPrefix(line, prefix) {
   221  			continue
   222  		}
   223  		for _, req := range strings.Fields(line[len(prefix):]) {
   224  			positive := true
   225  			if req[0] == '-' {
   226  				positive = false
   227  				req = req[1:]
   228  			}
   229  			requires[req] = positive
   230  		}
   231  	}
   232  	return requires
   233  }
   234  
   235  func checkArch(requires map[string]bool, arch string) bool {
   236  	for req, positive := range requires {
   237  		const prefix = "arch="
   238  		if strings.HasPrefix(req, prefix) &&
   239  			arch != req[len(prefix):] == positive {
   240  			return false
   241  		}
   242  	}
   243  	return true
   244  }
   245  
   246  func MatchRequirements(props, requires map[string]bool) bool {
   247  	for req, positive := range requires {
   248  		if positive {
   249  			if !props[req] {
   250  				return false
   251  			}
   252  			continue
   253  		}
   254  		matched := true
   255  		for _, req1 := range strings.Split(req, ",") {
   256  			if !props[req1] {
   257  				matched = false
   258  			}
   259  		}
   260  		if matched {
   261  			return false
   262  		}
   263  	}
   264  	return true
   265  }
   266  
   267  var ErrSkippedTest = errors.New("skipped test based on constraints")
   268  
   269  func parseProg(target *prog.Target, data []byte, mode prog.DeserializeMode, reqs map[string]bool) (
   270  	*prog.Prog, map[string]bool, error) {
   271  	properties := parseRequires(data)
   272  	// Need to check requirements early, as some programs may fail to deserialize
   273  	// on some arches due to missing syscalls. We also do not want to parse tests
   274  	// that are marked as 'manual'.
   275  	if !checkArch(properties, target.Arch) || !MatchRequirements(properties, reqs) {
   276  		var pairs []string
   277  		for k, v := range properties {
   278  			pairs = append(pairs, fmt.Sprintf("%s=%t", k, v))
   279  		}
   280  		return nil, properties, fmt.Errorf("%w: %s", ErrSkippedTest, strings.Join(pairs, ", "))
   281  	}
   282  	p, err := target.Deserialize(data, mode)
   283  	if err != nil {
   284  		return nil, nil, err
   285  	}
   286  	if len(p.Calls) > prog.MaxCalls {
   287  		return nil, nil, fmt.Errorf("longer than %d calls (%d)", prog.MaxCalls, len(p.Calls))
   288  	}
   289  	// For some yet unknown reasons, programs with fail_nth > 0 may sneak in. Ignore them.
   290  	for _, call := range p.Calls {
   291  		if call.Props.FailNth > 0 {
   292  			return nil, nil, fmt.Errorf("input has fail_nth > 0")
   293  		}
   294  	}
   295  	return p, properties, nil
   296  }
   297  
// FilteredCandidates is the result of FilterCandidates.
type FilteredCandidates struct {
	// Candidates are the programs that still have calls left after filtering.
	Candidates []fuzzer.Candidate
	// ModifiedHashes are the hashes of the original serialized form of every
	// program that contained calls outside of the allowed syscall set.
	ModifiedHashes []string
	// SeedCount is the number of Candidates without the fuzzer.ProgFromCorpus flag.
	SeedCount int
}
   303  
   304  func FilterCandidates(candidates []fuzzer.Candidate, syscalls map[*prog.Syscall]bool,
   305  	dropMinimize bool) FilteredCandidates {
   306  	var ret FilteredCandidates
   307  	for _, item := range candidates {
   308  		if !item.Prog.OnlyContains(syscalls) {
   309  			ret.ModifiedHashes = append(ret.ModifiedHashes, hash.String(item.Prog.Serialize()))
   310  			// We cut out the disabled syscalls and retriage/minimize what remains from the prog.
   311  			// The original prog will be deleted from the corpus.
   312  			if dropMinimize {
   313  				item.Flags &= ^fuzzer.ProgMinimized
   314  			}
   315  			item.Prog.FilterInplace(syscalls)
   316  			if len(item.Prog.Calls) == 0 {
   317  				continue
   318  			}
   319  		}
   320  		if item.Flags&fuzzer.ProgFromCorpus == 0 {
   321  			ret.SeedCount++
   322  		}
   323  		ret.Candidates = append(ret.Candidates, item)
   324  	}
   325  	return ret
   326  }
   327  
   328  // Programs that do more than 15 system calls are to be treated with suspicion and re-minimized.
   329  const ReminimizeThreshold = 15
   330  
   331  // ReminimizeSubset clears the fuzzer.ProgMinimized flag of a small subset of seeds.
   332  // The ultimate objective is to gradually clean up the poorly minimized corpus programs.
   333  // ReminimizeSubset assumes that candidates are sorted in the order of ascending len(Prog.Calls).
   334  func (fc *FilteredCandidates) ReminimizeSubset() int {
   335  	if len(fc.Candidates) == 0 {
   336  		return 0
   337  	}
   338  	// Focus on the top 10% of the largest programs in the corpus.
   339  	threshold := max(ReminimizeThreshold, len(fc.Candidates[len(fc.Candidates)*9/10].Prog.Calls))
   340  	var resetIndices []int
   341  	for i, info := range fc.Candidates {
   342  		if info.Flags&fuzzer.ProgMinimized == 0 {
   343  			continue
   344  		}
   345  		if len(info.Prog.Calls) >= threshold {
   346  			resetIndices = append(resetIndices, i)
   347  		}
   348  	}
   349  	// Reset ProgMinimized for up to 1% of the seed programs.
   350  	reset := min(50, len(resetIndices), max(1, len(fc.Candidates)/100))
   351  	rnd := rand.New(rand.NewSource(time.Now().UnixNano()))
   352  	for _, i := range rnd.Perm(len(resetIndices))[:reset] {
   353  		fc.Candidates[resetIndices[i]].Flags &= ^fuzzer.ProgMinimized
   354  	}
   355  	return reset
   356  }
   357  
   358  // resmashSubset clears fuzzer.ProgSmashes for a subset of seeds.
   359  // We smash the program only once after we add it to the corpus, but it can be that
   360  // either it did not finish before the instance was restarted, or the fuzzing algorithms
   361  // have become smarter over time, or just that kernel code changed over time.
   362  // It would be best to track it in pkg/db, but until it's capable of that, let's just
   363  // re-smash some corpus subset on each syz-manager restart.
   364  func (fc *FilteredCandidates) ResmashSubset() int {
   365  	var indices []int
   366  	for i, info := range fc.Candidates {
   367  		if info.Flags&fuzzer.ProgSmashed == 0 {
   368  			continue
   369  		}
   370  		indices = append(indices, i)
   371  	}
   372  	// Reset ProgSmashed for up to 0.5% of the seed programs.
   373  	reset := min(25, len(indices), max(1, len(fc.Candidates)/200))
   374  	rnd := rand.New(rand.NewSource(time.Now().UnixNano()))
   375  	for _, i := range rnd.Perm(len(indices))[:reset] {
   376  		fc.Candidates[indices[i]].Flags &= ^fuzzer.ProgSmashed
   377  	}
   378  	return reset
   379  }