github.com/google/syzkaller@v0.0.0-20251211124644-a066d2bc4b02/pkg/manager/seeds.go (about) 1 // Copyright 2024 syzkaller project authors. All rights reserved. 2 // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. 3 4 package manager 5 6 import ( 7 "bufio" 8 "bytes" 9 "errors" 10 "fmt" 11 "math/rand" 12 "os" 13 "path/filepath" 14 "runtime" 15 "strings" 16 "sync" 17 "time" 18 19 "github.com/google/syzkaller/pkg/db" 20 "github.com/google/syzkaller/pkg/fuzzer" 21 "github.com/google/syzkaller/pkg/hash" 22 "github.com/google/syzkaller/pkg/log" 23 "github.com/google/syzkaller/pkg/mgrconfig" 24 "github.com/google/syzkaller/pkg/osutil" 25 "github.com/google/syzkaller/prog" 26 ) 27 28 type Seeds struct { 29 CorpusDB *db.DB 30 Fresh bool 31 Candidates []fuzzer.Candidate 32 } 33 34 func LoadSeeds(cfg *mgrconfig.Config, immutable bool) (Seeds, error) { 35 var info Seeds 36 var err error 37 info.CorpusDB, err = db.Open(filepath.Join(cfg.Workdir, "corpus.db"), !immutable) 38 if err != nil { 39 if info.CorpusDB == nil { 40 return Seeds{}, fmt.Errorf("failed to open corpus database: %w", err) 41 } 42 log.Errorf("read %v inputs from corpus and got error: %v", len(info.CorpusDB.Records), err) 43 } 44 info.Fresh = len(info.CorpusDB.Records) == 0 45 corpusFlags := versionToFlags(info.CorpusDB.Version) 46 outputs := make(chan *input, 32) 47 chErr := make(chan error, 1) 48 go func() { 49 chErr <- readInputs(cfg, info.CorpusDB, outputs) 50 close(outputs) 51 }() 52 53 brokenSeeds := 0 54 skippedSeeds := 0 55 var brokenCorpus []string 56 var candidates []fuzzer.Candidate 57 for inp := range outputs { 58 if inp.Prog == nil { 59 if inp.IsSeed { 60 if errors.Is(inp.Err, ErrSkippedTest) { 61 skippedSeeds++ 62 log.Logf(2, "seed %s is skipped: %s", inp.Path, inp.Err) 63 } else { 64 brokenSeeds++ 65 log.Logf(0, "seed %s is broken: %s", inp.Path, inp.Err) 66 } 67 } else { 68 brokenCorpus = append(brokenCorpus, inp.Key) 69 } 70 continue 71 } 72 flags := corpusFlags 73 if inp.IsSeed { 74 if _, ok := info.CorpusDB.Records[hash.String(inp.Prog.Serialize())]; ok { 75 continue 76 } 77 // Seeds are not considered "from corpus" (won't be rerun multiple times) 78 // b/c they are tried on every start anyway. 79 flags = fuzzer.ProgMinimized 80 } 81 candidates = append(candidates, fuzzer.Candidate{ 82 Prog: inp.Prog, 83 Flags: flags, 84 }) 85 } 86 if err := <-chErr; err != nil { 87 return Seeds{}, err 88 } 89 if len(brokenCorpus)+brokenSeeds != 0 { 90 log.Logf(0, "broken programs in the corpus: %v, broken seeds: %v", len(brokenCorpus), brokenSeeds) 91 } 92 if skippedSeeds != 0 { 93 log.Logf(0, "skipped %v seeds", skippedSeeds) 94 } 95 if !immutable { 96 // This needs to be done outside of the loop above to not race with corpusDB reads. 97 for _, sig := range brokenCorpus { 98 info.CorpusDB.Delete(sig) 99 } 100 if err := info.CorpusDB.Flush(); err != nil { 101 return Seeds{}, fmt.Errorf("failed to save corpus database: %w", err) 102 } 103 } 104 // Switch database to the mode when it does not keep records in memory. 105 // We don't need them anymore and they consume lots of memory. 106 info.CorpusDB.DiscardData() 107 info.Candidates = candidates 108 return info, nil 109 } 110 111 type input struct { 112 IsSeed bool 113 Key string 114 Path string 115 Data []byte 116 Prog *prog.Prog 117 Err error 118 } 119 120 func readInputs(cfg *mgrconfig.Config, db *db.DB, output chan *input) error { 121 procs := runtime.GOMAXPROCS(0) 122 inputs := make(chan *input, procs) 123 var wg sync.WaitGroup 124 wg.Add(procs) 125 126 defer wg.Wait() 127 defer close(inputs) 128 for p := 0; p < procs; p++ { 129 go func() { 130 defer wg.Done() 131 for inp := range inputs { 132 inp.Prog, inp.Err = ParseSeed(cfg.Target, inp.Data) 133 output <- inp 134 } 135 }() 136 } 137 138 for key, rec := range db.Records { 139 inputs <- &input{ 140 Key: key, 141 Data: rec.Val, 142 } 143 } 144 seedPath := filepath.Join("sys", cfg.TargetOS, "test") 145 seedDir := filepath.Join(cfg.Syzkaller, seedPath) 146 if osutil.IsExist(seedDir) { 147 seeds, err := os.ReadDir(seedDir) 148 if err != nil { 149 return fmt.Errorf("failed to read seeds dir: %w", err) 150 } 151 for _, seed := range seeds { 152 data, err := os.ReadFile(filepath.Join(seedDir, seed.Name())) 153 if err != nil { 154 return fmt.Errorf("failed to read seed %v: %w", seed.Name(), err) 155 } 156 inputs <- &input{ 157 IsSeed: true, 158 Path: filepath.Join(seedPath, seed.Name()), 159 Data: data, 160 } 161 } 162 } 163 return nil 164 } 165 166 const CurrentDBVersion = 5 167 168 func versionToFlags(version uint64) fuzzer.ProgFlags { 169 // By default we don't re-minimize/re-smash programs from corpus, 170 // it takes lots of time on start and is unnecessary. 171 // However, on version bumps we can selectively re-minimize/re-smash. 172 corpusFlags := fuzzer.ProgFromCorpus | fuzzer.ProgMinimized | fuzzer.ProgSmashed 173 switch version { 174 case 0: 175 // Version 0 had broken minimization, so we need to re-minimize. 176 corpusFlags &= ^fuzzer.ProgMinimized 177 fallthrough 178 case 1: 179 // Version 1->2: memory is preallocated so lots of mmaps become unnecessary. 180 corpusFlags &= ^fuzzer.ProgMinimized 181 fallthrough 182 case 2: 183 // Version 2->3: big-endian hints. 184 corpusFlags &= ^fuzzer.ProgSmashed 185 fallthrough 186 case 3: 187 // Version 3->4: to shake things up. 188 corpusFlags &= ^fuzzer.ProgMinimized 189 fallthrough 190 case 4: 191 // Version 4->5: fix for comparison argument sign extension. 192 // Introduced in 1ba0279d74a35e96e81de87073212d2b20256e8f. 193 194 // Update (July 2024): 195 // We used to reset the fuzzer.ProgSmashed flag here, but it has led to 196 // perpetual corpus retriage on slow syzkaller instances. By now, all faster 197 // instances must have already bumped their corpus versions, so let's just 198 // increase the version to let all others go past the corpus triage stage. 199 fallthrough 200 case CurrentDBVersion: 201 } 202 return corpusFlags 203 } 204 205 func ParseSeed(target *prog.Target, data []byte) (*prog.Prog, error) { 206 p, _, err := parseProg(target, data, prog.NonStrict, nil) 207 return p, err 208 } 209 210 func ParseSeedWithRequirements(target *prog.Target, data []byte, reqs map[string]bool) ( 211 *prog.Prog, map[string]bool, error) { 212 return parseProg(target, data, prog.Strict, reqs) 213 } 214 215 func parseRequires(data []byte) map[string]bool { 216 requires := make(map[string]bool) 217 for s := bufio.NewScanner(bytes.NewReader(data)); s.Scan(); { 218 const prefix = "# requires:" 219 line := s.Text() 220 if !strings.HasPrefix(line, prefix) { 221 continue 222 } 223 for _, req := range strings.Fields(line[len(prefix):]) { 224 positive := true 225 if req[0] == '-' { 226 positive = false 227 req = req[1:] 228 } 229 requires[req] = positive 230 } 231 } 232 return requires 233 } 234 235 func checkArch(requires map[string]bool, arch string) bool { 236 for req, positive := range requires { 237 const prefix = "arch=" 238 if strings.HasPrefix(req, prefix) && 239 arch != req[len(prefix):] == positive { 240 return false 241 } 242 } 243 return true 244 } 245 246 func MatchRequirements(props, requires map[string]bool) bool { 247 for req, positive := range requires { 248 if positive { 249 if !props[req] { 250 return false 251 } 252 continue 253 } 254 matched := true 255 for _, req1 := range strings.Split(req, ",") { 256 if !props[req1] { 257 matched = false 258 } 259 } 260 if matched { 261 return false 262 } 263 } 264 return true 265 } 266 267 var ErrSkippedTest = errors.New("skipped test based on constraints") 268 269 func parseProg(target *prog.Target, data []byte, mode prog.DeserializeMode, reqs map[string]bool) ( 270 *prog.Prog, map[string]bool, error) { 271 properties := parseRequires(data) 272 // Need to check requirements early, as some programs may fail to deserialize 273 // on some arches due to missing syscalls. We also do not want to parse tests 274 // that are marked as 'manual'. 275 if !checkArch(properties, target.Arch) || !MatchRequirements(properties, reqs) { 276 var pairs []string 277 for k, v := range properties { 278 pairs = append(pairs, fmt.Sprintf("%s=%t", k, v)) 279 } 280 return nil, properties, fmt.Errorf("%w: %s", ErrSkippedTest, strings.Join(pairs, ", ")) 281 } 282 p, err := target.Deserialize(data, mode) 283 if err != nil { 284 return nil, nil, err 285 } 286 if len(p.Calls) > prog.MaxCalls { 287 return nil, nil, fmt.Errorf("longer than %d calls (%d)", prog.MaxCalls, len(p.Calls)) 288 } 289 // For some yet unknown reasons, programs with fail_nth > 0 may sneak in. Ignore them. 290 for _, call := range p.Calls { 291 if call.Props.FailNth > 0 { 292 return nil, nil, fmt.Errorf("input has fail_nth > 0") 293 } 294 } 295 return p, properties, nil 296 } 297 298 type FilteredCandidates struct { 299 Candidates []fuzzer.Candidate 300 ModifiedHashes []string 301 SeedCount int 302 } 303 304 func FilterCandidates(candidates []fuzzer.Candidate, syscalls map[*prog.Syscall]bool, 305 dropMinimize bool) FilteredCandidates { 306 var ret FilteredCandidates 307 for _, item := range candidates { 308 if !item.Prog.OnlyContains(syscalls) { 309 ret.ModifiedHashes = append(ret.ModifiedHashes, hash.String(item.Prog.Serialize())) 310 // We cut out the disabled syscalls and retriage/minimize what remains from the prog. 311 // The original prog will be deleted from the corpus. 312 if dropMinimize { 313 item.Flags &= ^fuzzer.ProgMinimized 314 } 315 item.Prog.FilterInplace(syscalls) 316 if len(item.Prog.Calls) == 0 { 317 continue 318 } 319 } 320 if item.Flags&fuzzer.ProgFromCorpus == 0 { 321 ret.SeedCount++ 322 } 323 ret.Candidates = append(ret.Candidates, item) 324 } 325 return ret 326 } 327 328 // Programs that do more than 15 system calls are to be treated with suspicion and re-minimized. 329 const ReminimizeThreshold = 15 330 331 // ReminimizeSubset clears the fuzzer.ProgMinimized flag of a small subset of seeds. 332 // The ultimate objective is to gradually clean up the poorly minimized corpus programs. 333 // ReminimizeSubset assumes that candidates are sorted in the order of ascending len(Prog.Calls). 334 func (fc *FilteredCandidates) ReminimizeSubset() int { 335 if len(fc.Candidates) == 0 { 336 return 0 337 } 338 // Focus on the top 10% of the largest programs in the corpus. 339 threshold := max(ReminimizeThreshold, len(fc.Candidates[len(fc.Candidates)*9/10].Prog.Calls)) 340 var resetIndices []int 341 for i, info := range fc.Candidates { 342 if info.Flags&fuzzer.ProgMinimized == 0 { 343 continue 344 } 345 if len(info.Prog.Calls) >= threshold { 346 resetIndices = append(resetIndices, i) 347 } 348 } 349 // Reset ProgMinimized for up to 1% of the seed programs. 350 reset := min(50, len(resetIndices), max(1, len(fc.Candidates)/100)) 351 rnd := rand.New(rand.NewSource(time.Now().UnixNano())) 352 for _, i := range rnd.Perm(len(resetIndices))[:reset] { 353 fc.Candidates[resetIndices[i]].Flags &= ^fuzzer.ProgMinimized 354 } 355 return reset 356 } 357 358 // resmashSubset clears fuzzer.ProgSmashes for a subset of seeds. 359 // We smash the program only once after we add it to the corpus, but it can be that 360 // either it did not finish before the instance was restarted, or the fuzzing algorithms 361 // have become smarter over time, or just that kernel code changed over time. 362 // It would be best to track it in pkg/db, but until it's capable of that, let's just 363 // re-smash some corpus subset on each syz-manager restart. 364 func (fc *FilteredCandidates) ResmashSubset() int { 365 var indices []int 366 for i, info := range fc.Candidates { 367 if info.Flags&fuzzer.ProgSmashed == 0 { 368 continue 369 } 370 indices = append(indices, i) 371 } 372 // Reset ProgSmashed for up to 0.5% of the seed programs. 373 reset := min(25, len(indices), max(1, len(fc.Candidates)/200)) 374 rnd := rand.New(rand.NewSource(time.Now().UnixNano())) 375 for _, i := range rnd.Perm(len(indices))[:reset] { 376 fc.Candidates[indices[i]].Flags &= ^fuzzer.ProgSmashed 377 } 378 return reset 379 }