github.com/google/syzkaller@v0.0.0-20251211124644-a066d2bc4b02/syz-cluster/series-tracker/main.go (about) 1 // Copyright 2024 syzkaller project authors. All rights reserved. 2 // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. 3 4 package main 5 6 import ( 7 "bytes" 8 "context" 9 "errors" 10 "flag" 11 "fmt" 12 "log" 13 "maps" 14 "path/filepath" 15 "regexp" 16 "slices" 17 "sort" 18 "strings" 19 "time" 20 21 "github.com/google/syzkaller/pkg/email" 22 "github.com/google/syzkaller/pkg/email/lore" 23 "github.com/google/syzkaller/pkg/vcs" 24 "github.com/google/syzkaller/syz-cluster/pkg/api" 25 "github.com/google/syzkaller/syz-cluster/pkg/app" 26 ) 27 28 var ( 29 flagVerbose = flag.Bool("verbose", false, "enable verbose output") 30 ) 31 32 func main() { 33 flag.Parse() 34 ctx := context.Background() 35 manifest := NewManifestSource(`https://lore.kernel.org`) 36 fetcher := &SeriesFetcher{ 37 gitRepoFolder: `/git-repo`, // Set in deployment.yaml. 38 client: app.DefaultClient(), 39 manifest: manifest, 40 archives: archivesToPoll(), 41 } 42 go manifest.Loop(ctx) 43 44 // On start, look at the last week of messages. 45 nextFrom := time.Now().Add(-time.Hour * 24 * 7) 46 for { 47 oldFrom := nextFrom 48 // Then, parse last 30 minutes every 15 minutes. 49 nextFrom = time.Now().Add(-time.Minute * 15) 50 err := fetcher.Update(ctx, oldFrom) 51 if err != nil { 52 app.Errorf("fetching failed: %v", err) 53 } 54 time.Sleep(15 * time.Minute) 55 } 56 } 57 58 func archivesToPoll() []string { 59 cfg, err := app.Config() 60 if err != nil { 61 app.Fatalf("failed to fetch the config: %v", err) 62 } 63 if len(cfg.LoreArchives) == 0 { 64 app.Fatalf("the list of archives to poll is empty") 65 } 66 return cfg.LoreArchives 67 } 68 69 type SeriesFetcher struct { 70 gitRepoFolder string 71 client *api.Client 72 manifest *ManifestSource 73 archives []string 74 } 75 76 func (sf *SeriesFetcher) Update(ctx context.Context, from time.Time) error { 77 log.Printf("querying email threads since %v", from) 78 79 manifest := sf.manifest.Get(ctx) 80 if manifest == nil { 81 return fmt.Errorf("failed to query the manifest data") 82 } 83 var list []lore.EmailReader 84 for _, name := range sf.archives { 85 info, ok := manifest[name] 86 if !ok { 87 return fmt.Errorf("manifest has no info for %q", name) 88 } 89 url := info.LastEpochURL() 90 log.Printf("polling %s", url) 91 92 folderName := sanitizeName(name) 93 if folderName == "" { 94 return fmt.Errorf("invalid archive name: %q", name) 95 } 96 gitRepo := vcs.NewLKMLRepo(filepath.Join(sf.gitRepoFolder, folderName)) 97 // TODO: by querying only the last archive, we risk losing the series that are split between both. 98 // But for now let's ignore this possibility. 99 _, err := gitRepo.Poll(url, "master") 100 if err != nil { 101 return fmt.Errorf("failed to poll %q: %w", url, err) 102 } 103 // We could have been fetching the emails precisely starting from the last Update() attempt, 104 // but since we may only save it once the whole series is there, it's easier to just look at all 105 // the recent messages. 106 repoList, err := lore.ReadArchive(gitRepo, "", from) 107 if err != nil { 108 return err 109 } 110 log.Printf("queried %d emails", len(repoList)) 111 list = append(list, repoList...) 112 } 113 114 var emails []*lore.Email 115 idToReader := map[string]lore.EmailReader{} 116 for _, item := range list { 117 // TODO: this could be done in several threads. 118 email, err := item.Parse(nil, nil) 119 if err != nil { 120 log.Printf("failed to parse email: %v", err) 121 continue 122 } 123 idToReader[email.MessageID] = item 124 emails = append(emails, email) 125 } 126 log.Printf("extracted: %d", len(list)) 127 128 allSeries := lore.PatchSeries(emails) 129 log.Printf("collected %d series", len(allSeries)) 130 131 for _, series := range allSeries { 132 if *flagVerbose { 133 logSeries(series) 134 } 135 err := sf.handleSeries(ctx, series, idToReader) 136 if err != nil { 137 app.Errorf("failed to save the series: %v", err) 138 } 139 } 140 return nil 141 } 142 143 func (sf *SeriesFetcher) handleSeries(ctx context.Context, series *lore.Series, 144 idToReader map[string]lore.EmailReader) error { 145 if series.Corrupted != "" { 146 log.Printf("skipping %s because of %q", series.MessageID, series.Corrupted) 147 return nil 148 } 149 first := series.Patches[0] 150 date := first.Date 151 if date.IsZero() || date.After(time.Now()) { 152 // We cannot fully trust dates from the mailing list as some of them are very weird, e.g. 153 // https://lore.kernel.org/all/20770915-nolibc-run-user-v1-1-3caec61726dc@weissschuh.net/raw. 154 date = time.Now() 155 } 156 apiSeries := &api.Series{ 157 ExtID: series.MessageID, 158 AuthorEmail: first.Author, 159 Title: series.Subject, 160 Version: series.Version, 161 SubjectTags: series.Tags, 162 Link: loreLink(series.MessageID), 163 PublishedAt: date, 164 } 165 sp := seriesProcessor{} 166 for i, patch := range series.Patches { 167 raw, err := idToReader[patch.MessageID].Read() 168 if err != nil { 169 return fmt.Errorf("failed to extract %q: %w", patch.MessageID, err) 170 } 171 body, err := sp.Process(raw) 172 if err != nil { 173 // Fall back to the raw message. 174 body = raw 175 log.Printf("failed to parse %d: %v", i, err) 176 } 177 apiSeries.Patches = append(apiSeries.Patches, api.SeriesPatch{ 178 Seq: patch.Seq, 179 Title: patch.Subject, 180 Link: "https://lore.kernel.org/all/" + patch.MessageID, 181 Body: body, 182 }) 183 } 184 apiSeries.Cc = sp.Emails() 185 ret, err := sf.client.UploadSeries(ctx, apiSeries) 186 if err != nil { 187 return fmt.Errorf("failed to save series: %w", err) 188 } else if !ret.Saved { 189 log.Printf("series %s already exists in the DB", series.MessageID) 190 return nil 191 } 192 _, err = sf.client.UploadSession(ctx, &api.NewSession{ 193 ExtID: series.MessageID, 194 }) 195 if err != nil { 196 return fmt.Errorf("failed to request a fuzzing session: %w", err) 197 } 198 log.Printf("series %s saved to the DB", series.MessageID) 199 return nil 200 } 201 202 func loreLink(messageID string) string { 203 return "https://lore.kernel.org/all/" + strings.Trim(messageID, "<>") 204 } 205 206 type seriesProcessor map[string]struct{} 207 208 var errFailedToParse = errors.New("failed to parse the email") 209 210 func (sp seriesProcessor) Process(raw []byte) ([]byte, error) { 211 msg, err := email.Parse(bytes.NewReader(raw), nil, nil, nil) 212 if err != nil { 213 return raw, fmt.Errorf("%w: %w", errFailedToParse, err) 214 } 215 for _, email := range msg.Cc { 216 sp[email] = struct{}{} 217 } 218 return []byte(msg.Body), nil 219 } 220 221 func (sp seriesProcessor) Emails() []string { 222 list := slices.Collect(maps.Keys(sp)) 223 sort.Strings(list) 224 return list 225 } 226 227 func logSeries(series *lore.Series) { 228 log.Printf("series ID=%s Subject=%s Patches=%d Version=%d Corrupted=%q", 229 series.MessageID, series.Subject, len(series.Patches), series.Version, 230 series.Corrupted) 231 for _, m := range series.Patches { 232 log.Printf(" #%d ID=%s Subject=%s", m.Seq, m.MessageID, m.Subject) 233 } 234 } 235 236 func sanitizeName(str string) string { 237 reg, err := regexp.Compile("[^a-zA-Z0-9]+") 238 if err != nil { 239 return "" 240 } 241 return reg.ReplaceAllString(str, "") 242 }