github.com/google/syzkaller@v0.0.0-20251211124644-a066d2bc4b02/syz-cluster/series-tracker/main.go (about)

     1  // Copyright 2024 syzkaller project authors. All rights reserved.
     2  // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
     3  
     4  package main
     5  
     6  import (
     7  	"bytes"
     8  	"context"
     9  	"errors"
    10  	"flag"
    11  	"fmt"
    12  	"log"
    13  	"maps"
    14  	"path/filepath"
    15  	"regexp"
    16  	"slices"
    17  	"sort"
    18  	"strings"
    19  	"time"
    20  
    21  	"github.com/google/syzkaller/pkg/email"
    22  	"github.com/google/syzkaller/pkg/email/lore"
    23  	"github.com/google/syzkaller/pkg/vcs"
    24  	"github.com/google/syzkaller/syz-cluster/pkg/api"
    25  	"github.com/google/syzkaller/syz-cluster/pkg/app"
    26  )
    27  
    28  var (
    29  	flagVerbose = flag.Bool("verbose", false, "enable verbose output")
    30  )
    31  
    32  func main() {
    33  	flag.Parse()
    34  	ctx := context.Background()
    35  	manifest := NewManifestSource(`https://lore.kernel.org`)
    36  	fetcher := &SeriesFetcher{
    37  		gitRepoFolder: `/git-repo`, // Set in deployment.yaml.
    38  		client:        app.DefaultClient(),
    39  		manifest:      manifest,
    40  		archives:      archivesToPoll(),
    41  	}
    42  	go manifest.Loop(ctx)
    43  
    44  	// On start, look at the last week of messages.
    45  	nextFrom := time.Now().Add(-time.Hour * 24 * 7)
    46  	for {
    47  		oldFrom := nextFrom
    48  		// Then, parse last 30 minutes every 15 minutes.
    49  		nextFrom = time.Now().Add(-time.Minute * 15)
    50  		err := fetcher.Update(ctx, oldFrom)
    51  		if err != nil {
    52  			app.Errorf("fetching failed: %v", err)
    53  		}
    54  		time.Sleep(15 * time.Minute)
    55  	}
    56  }
    57  
    58  func archivesToPoll() []string {
    59  	cfg, err := app.Config()
    60  	if err != nil {
    61  		app.Fatalf("failed to fetch the config: %v", err)
    62  	}
    63  	if len(cfg.LoreArchives) == 0 {
    64  		app.Fatalf("the list of archives to poll is empty")
    65  	}
    66  	return cfg.LoreArchives
    67  }
    68  
    69  type SeriesFetcher struct {
    70  	gitRepoFolder string
    71  	client        *api.Client
    72  	manifest      *ManifestSource
    73  	archives      []string
    74  }
    75  
    76  func (sf *SeriesFetcher) Update(ctx context.Context, from time.Time) error {
    77  	log.Printf("querying email threads since %v", from)
    78  
    79  	manifest := sf.manifest.Get(ctx)
    80  	if manifest == nil {
    81  		return fmt.Errorf("failed to query the manifest data")
    82  	}
    83  	var list []lore.EmailReader
    84  	for _, name := range sf.archives {
    85  		info, ok := manifest[name]
    86  		if !ok {
    87  			return fmt.Errorf("manifest has no info for %q", name)
    88  		}
    89  		url := info.LastEpochURL()
    90  		log.Printf("polling %s", url)
    91  
    92  		folderName := sanitizeName(name)
    93  		if folderName == "" {
    94  			return fmt.Errorf("invalid archive name: %q", name)
    95  		}
    96  		gitRepo := vcs.NewLKMLRepo(filepath.Join(sf.gitRepoFolder, folderName))
    97  		// TODO: by querying only the last archive, we risk losing the series that are split between both.
    98  		// But for now let's ignore this possibility.
    99  		_, err := gitRepo.Poll(url, "master")
   100  		if err != nil {
   101  			return fmt.Errorf("failed to poll %q: %w", url, err)
   102  		}
   103  		// We could have been fetching the emails precisely starting from the last Update() attempt,
   104  		// but since we may only save it once the whole series is there, it's easier to just look at all
   105  		// the recent messages.
   106  		repoList, err := lore.ReadArchive(gitRepo, "", from)
   107  		if err != nil {
   108  			return err
   109  		}
   110  		log.Printf("queried %d emails", len(repoList))
   111  		list = append(list, repoList...)
   112  	}
   113  
   114  	var emails []*lore.Email
   115  	idToReader := map[string]lore.EmailReader{}
   116  	for _, item := range list {
   117  		// TODO: this could be done in several threads.
   118  		email, err := item.Parse(nil, nil)
   119  		if err != nil {
   120  			log.Printf("failed to parse email: %v", err)
   121  			continue
   122  		}
   123  		idToReader[email.MessageID] = item
   124  		emails = append(emails, email)
   125  	}
   126  	log.Printf("extracted: %d", len(list))
   127  
   128  	allSeries := lore.PatchSeries(emails)
   129  	log.Printf("collected %d series", len(allSeries))
   130  
   131  	for _, series := range allSeries {
   132  		if *flagVerbose {
   133  			logSeries(series)
   134  		}
   135  		err := sf.handleSeries(ctx, series, idToReader)
   136  		if err != nil {
   137  			app.Errorf("failed to save the series: %v", err)
   138  		}
   139  	}
   140  	return nil
   141  }
   142  
   143  func (sf *SeriesFetcher) handleSeries(ctx context.Context, series *lore.Series,
   144  	idToReader map[string]lore.EmailReader) error {
   145  	if series.Corrupted != "" {
   146  		log.Printf("skipping %s because of %q", series.MessageID, series.Corrupted)
   147  		return nil
   148  	}
   149  	first := series.Patches[0]
   150  	date := first.Date
   151  	if date.IsZero() || date.After(time.Now()) {
   152  		// We cannot fully trust dates from the mailing list as some of them are very weird, e.g.
   153  		// https://lore.kernel.org/all/20770915-nolibc-run-user-v1-1-3caec61726dc@weissschuh.net/raw.
   154  		date = time.Now()
   155  	}
   156  	apiSeries := &api.Series{
   157  		ExtID:       series.MessageID,
   158  		AuthorEmail: first.Author,
   159  		Title:       series.Subject,
   160  		Version:     series.Version,
   161  		SubjectTags: series.Tags,
   162  		Link:        loreLink(series.MessageID),
   163  		PublishedAt: date,
   164  	}
   165  	sp := seriesProcessor{}
   166  	for i, patch := range series.Patches {
   167  		raw, err := idToReader[patch.MessageID].Read()
   168  		if err != nil {
   169  			return fmt.Errorf("failed to extract %q: %w", patch.MessageID, err)
   170  		}
   171  		body, err := sp.Process(raw)
   172  		if err != nil {
   173  			// Fall back to the raw message.
   174  			body = raw
   175  			log.Printf("failed to parse %d: %v", i, err)
   176  		}
   177  		apiSeries.Patches = append(apiSeries.Patches, api.SeriesPatch{
   178  			Seq:   patch.Seq,
   179  			Title: patch.Subject,
   180  			Link:  "https://lore.kernel.org/all/" + patch.MessageID,
   181  			Body:  body,
   182  		})
   183  	}
   184  	apiSeries.Cc = sp.Emails()
   185  	ret, err := sf.client.UploadSeries(ctx, apiSeries)
   186  	if err != nil {
   187  		return fmt.Errorf("failed to save series: %w", err)
   188  	} else if !ret.Saved {
   189  		log.Printf("series %s already exists in the DB", series.MessageID)
   190  		return nil
   191  	}
   192  	_, err = sf.client.UploadSession(ctx, &api.NewSession{
   193  		ExtID: series.MessageID,
   194  	})
   195  	if err != nil {
   196  		return fmt.Errorf("failed to request a fuzzing session: %w", err)
   197  	}
   198  	log.Printf("series %s saved to the DB", series.MessageID)
   199  	return nil
   200  }
   201  
   202  func loreLink(messageID string) string {
   203  	return "https://lore.kernel.org/all/" + strings.Trim(messageID, "<>")
   204  }
   205  
   206  type seriesProcessor map[string]struct{}
   207  
   208  var errFailedToParse = errors.New("failed to parse the email")
   209  
   210  func (sp seriesProcessor) Process(raw []byte) ([]byte, error) {
   211  	msg, err := email.Parse(bytes.NewReader(raw), nil, nil, nil)
   212  	if err != nil {
   213  		return raw, fmt.Errorf("%w: %w", errFailedToParse, err)
   214  	}
   215  	for _, email := range msg.Cc {
   216  		sp[email] = struct{}{}
   217  	}
   218  	return []byte(msg.Body), nil
   219  }
   220  
   221  func (sp seriesProcessor) Emails() []string {
   222  	list := slices.Collect(maps.Keys(sp))
   223  	sort.Strings(list)
   224  	return list
   225  }
   226  
   227  func logSeries(series *lore.Series) {
   228  	log.Printf("series ID=%s Subject=%s Patches=%d Version=%d Corrupted=%q",
   229  		series.MessageID, series.Subject, len(series.Patches), series.Version,
   230  		series.Corrupted)
   231  	for _, m := range series.Patches {
   232  		log.Printf("  #%d ID=%s Subject=%s", m.Seq, m.MessageID, m.Subject)
   233  	}
   234  }
   235  
   236  func sanitizeName(str string) string {
   237  	reg, err := regexp.Compile("[^a-zA-Z0-9]+")
   238  	if err != nil {
   239  		return ""
   240  	}
   241  	return reg.ReplaceAllString(str, "")
   242  }