github.com/google/syzkaller@v0.0.0-20251211124644-a066d2bc4b02/pkg/manager/repro.go (about)

     1  // Copyright 2024 syzkaller project authors. All rights reserved.
     2  // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
     3  
     4  package manager
     5  
     6  import (
     7  	"context"
     8  	"fmt"
     9  	"maps"
    10  	"slices"
    11  	"sync"
    12  
    13  	"github.com/google/syzkaller/pkg/log"
    14  	"github.com/google/syzkaller/pkg/report"
    15  	"github.com/google/syzkaller/pkg/repro"
    16  	"github.com/google/syzkaller/pkg/stat"
    17  )
    18  
// ReproResult bundles everything produced by a single reproduction attempt
// (see ReproManagerView.RunRepro).
type ReproResult struct {
	Crash  *Crash               // the original crash
	Repro  *repro.Result        // the extracted reproducer; presumably nil if none was found — TODO confirm
	Strace *repro.StraceResult  // strace run output, if any
	Stats  *repro.Stats         // statistics of the reproduction process
	Err    error                // non-nil if the attempt itself failed
}
    26  
// Crash describes a single observed crash that may be scheduled for
// reproduction by ReproLoop.
type Crash struct {
	InstanceIndex int  // presumably the index of the VM instance that crashed — TODO confirm
	FromHub       bool // this crash was created based on a repro from syz-hub
	FromDashboard bool // .. or from dashboard
	Manual        bool // manually requested reproduction (served before others, see popCrash)
	FullRepro     bool // used by the diff fuzzer to do a full scale reproduction
	*report.Report
	// TailReports: additional reports; NOTE(review): presumably ones that
	// followed the main report in the same crash log — confirm with callers.
	TailReports []*report.Report
}
    36  
    37  func (c *Crash) FullTitle() string {
    38  	suffix := ""
    39  	if c.FullRepro {
    40  		suffix = " (full)"
    41  	}
    42  	if c.Report.Title != "" {
    43  		return c.Report.Title + suffix
    44  	}
    45  	// Just use some unique, but stable titles.
    46  	if c.FromDashboard {
    47  		return fmt.Sprintf("dashboard crash %p%s", c, suffix)
    48  	} else if c.FromHub {
    49  		return fmt.Sprintf("crash from hub %p%s", c, suffix)
    50  	}
    51  	panic("the crash is expected to have a report")
    52  }
    53  
// ReproManagerView is the interface ReproLoop requires from the manager.
type ReproManagerView interface {
	// RunRepro performs the reproduction of the crash and returns its result.
	RunRepro(ctx context.Context, crash *Crash) *ReproResult
	// NeedRepro reports whether the crash still needs to be reproduced.
	NeedRepro(crash *Crash) bool
	// ResizeReproPool adjusts the number of VMs dedicated to reproductions.
	ResizeReproPool(size int)
}
    59  
// ReproLoop schedules and executes bug reproductions within a VM budget.
type ReproLoop struct {
	statNumReproducing *stat.Val // "reproducing" console stat (len of reproducing)
	statPending        *stat.Val // "pending" console stat (len of queue)

	onlyOnce  bool             // reproduce each crash title at most once
	mgr       ReproManagerView // the manager side: run repros, resize the pool
	parallel  chan struct{}    // semaphore bounding concurrent reproductions
	pingQueue chan struct{}    // capacity-1 wake-up signal for Loop after Enqueue
	reproVMs  int              // total VM budget for reproductions

	mu          sync.Mutex      // protects all fields below
	queue       []*Crash        // crashes waiting to be reproduced
	reproducing map[string]bool // titles with a reproduction currently running
	enqueued    map[string]bool // titles ever enqueued (consulted in onlyOnce mode)
	attempts    map[string]int  // per-title count of started repro attempts
}
    76  
    77  func NewReproLoop(mgr ReproManagerView, reproVMs int, onlyOnce bool) *ReproLoop {
    78  	ret := &ReproLoop{
    79  		mgr:         mgr,
    80  		onlyOnce:    onlyOnce,
    81  		parallel:    make(chan struct{}, reproVMs),
    82  		reproVMs:    reproVMs,
    83  		reproducing: map[string]bool{},
    84  		pingQueue:   make(chan struct{}, 1),
    85  		enqueued:    map[string]bool{},
    86  		attempts:    map[string]int{},
    87  	}
    88  	ret.statNumReproducing = stat.New("reproducing", "Number of crashes being reproduced",
    89  		stat.Console, stat.NoGraph, func() int {
    90  			ret.mu.Lock()
    91  			defer ret.mu.Unlock()
    92  			return len(ret.reproducing)
    93  		})
    94  	ret.statPending = stat.New("pending", "Number of pending repro tasks",
    95  		stat.Console, stat.NoGraph, func() int {
    96  			ret.mu.Lock()
    97  			defer ret.mu.Unlock()
    98  			return len(ret.queue)
    99  		})
   100  	return ret
   101  }
   102  
   103  func (r *ReproLoop) CanReproMore() bool {
   104  	return len(r.parallel) != 0
   105  }
   106  
   107  func (r *ReproLoop) Reproducing() map[string]bool {
   108  	r.mu.Lock()
   109  	defer r.mu.Unlock()
   110  	return maps.Clone(r.reproducing)
   111  }
   112  
   113  // Empty returns true if there are neither running nor planned bug reproductions.
   114  func (r *ReproLoop) Empty() bool {
   115  	r.mu.Lock()
   116  	defer r.mu.Unlock()
   117  	return len(r.reproducing) == 0 && len(r.queue) == 0
   118  }
   119  
   120  func (r *ReproLoop) Enqueue(crash *Crash) {
   121  	r.mu.Lock()
   122  	defer r.mu.Unlock()
   123  
   124  	title := crash.FullTitle()
   125  	if r.onlyOnce && r.enqueued[title] {
   126  		// Try to reproduce each bug at most 1 time in this mode.
   127  		// Since we don't upload bugs/repros to dashboard, it likely won't have
   128  		// the reproducer even if we succeeded last time, and will repeatedly
   129  		// say it needs a repro.
   130  		return
   131  	}
   132  	log.Logf(1, "scheduled a reproduction of '%v'", title)
   133  	r.enqueued[title] = true
   134  	r.queue = append(r.queue, crash)
   135  
   136  	// Ping the loop.
   137  	select {
   138  	case r.pingQueue <- struct{}{}:
   139  	default:
   140  	}
   141  }
   142  
   143  func (r *ReproLoop) popCrash() *Crash {
   144  	r.mu.Lock()
   145  	defer r.mu.Unlock()
   146  
   147  	newBetter := func(base, new *Crash) bool {
   148  		// If diff fuzzed has requested a full reproduction, do it first.
   149  		if base.FullRepro != new.FullRepro {
   150  			return new.FullRepro
   151  		}
   152  		// The more times we failed, the less likely we are to actually
   153  		// find a reproducer. Give preference to not yet attempted repro runs.
   154  		baseTitle, newTitle := base.FullTitle(), new.FullTitle()
   155  		if r.attempts[baseTitle] != r.attempts[newTitle] {
   156  			return r.attempts[newTitle] < r.attempts[baseTitle]
   157  		}
   158  		// First, serve manual requests.
   159  		if new.Manual != base.Manual {
   160  			return new.Manual
   161  		}
   162  		// Then, deprioritize hub reproducers.
   163  		if new.FromHub != base.FromHub {
   164  			return !new.FromHub
   165  		}
   166  		return false
   167  	}
   168  
   169  	idx := -1
   170  	for i, crash := range r.queue {
   171  		if r.reproducing[crash.FullTitle()] {
   172  			continue
   173  		}
   174  		if idx == -1 || newBetter(r.queue[idx], r.queue[i]) {
   175  			idx = i
   176  		}
   177  	}
   178  	if idx == -1 {
   179  		return nil
   180  	}
   181  	crash := r.queue[idx]
   182  	r.queue = slices.Delete(r.queue, idx, idx+1)
   183  	return crash
   184  }
   185  
// Loop is the main reproduction loop: it repeatedly picks the best crash
// from the queue, waits for a free reproducer slot and runs the reproduction
// in its own goroutine, until ctx is cancelled. It waits for all started
// reproductions before returning.
func (r *ReproLoop) Loop(ctx context.Context) {
	defer log.Logf(1, "repro loop terminated")

	// Pre-fill the parallelism semaphore with as many tokens as the VM
	// budget allows (calculateReproVMs maps repro count -> required VMs).
	count := 0
	for ; r.calculateReproVMs(count+1) <= r.reproVMs; count++ {
		r.parallel <- struct{}{}
	}
	log.Logf(0, "starting bug reproductions (max %d VMs, %d repros)", r.reproVMs, count)

	// Ensure all spawned repro goroutines finish before Loop returns.
	var wg sync.WaitGroup
	defer wg.Wait()

	for {
		crash := r.popCrash()
		// Inner loop: obtain a crash that still needs a repro, blocking on
		// pingQueue only once the queue has been drained.
		for {
			if crash != nil && !r.mgr.NeedRepro(crash) {
				log.Logf(1, "reproduction of %q aborted: it's no longer needed", crash.FullTitle())
				// Now we might not need that many VMs.
				r.mu.Lock()
				r.adjustPoolSizeLocked()
				r.mu.Unlock()

				// Immediately check if there was any other crash in the queue, so that we fall back
				// to waiting on pingQueue only if there were really no other crashes in the queue.
				crash = r.popCrash()
				continue
			}
			if crash != nil {
				break
			}
			// Queue is empty: wait for Enqueue() (or a finishing repro) to ping us.
			select {
			case <-r.pingQueue:
				crash = r.popCrash()
			case <-ctx.Done():
				return
			}
		}

		// Now wait until we can schedule another runner.
		select {
		case <-r.parallel:
		case <-ctx.Done():
			return
		}

		// Mark the title as in-flight before starting the goroutine so that
		// popCrash won't hand out a same-titled crash concurrently.
		title := crash.FullTitle()
		r.mu.Lock()
		r.attempts[title]++
		r.reproducing[title] = true
		r.adjustPoolSizeLocked()
		r.mu.Unlock()

		wg.Add(1)
		go func() {
			defer wg.Done()

			r.handle(ctx, crash)

			r.mu.Lock()
			delete(r.reproducing, title)
			r.adjustPoolSizeLocked()
			r.mu.Unlock()

			// Return the parallelism token and nudge the loop to re-check
			// the queue (a same-titled crash may now be eligible).
			r.parallel <- struct{}{}
			// If the context is cancelled, no one is listening on pingQueue.
			select {
			case r.pingQueue <- struct{}{}:
			default:
			}
		}()
	}
}
   258  
   259  func (r *ReproLoop) calculateReproVMs(repros int) int {
   260  	// Let's allocate 1.33 VMs per a reproducer thread.
   261  	if r.reproVMs == 1 && repros == 1 {
   262  		// With one exception -- if we have only one VM, let's still do one repro.
   263  		return 1
   264  	}
   265  	return (repros*4 + 2) / 3
   266  }
   267  
   268  func (r *ReproLoop) handle(ctx context.Context, crash *Crash) {
   269  	log.Logf(0, "start reproducing '%v'", crash.FullTitle())
   270  
   271  	res := r.mgr.RunRepro(ctx, crash)
   272  
   273  	crepro := false
   274  	title := ""
   275  	if res.Repro != nil {
   276  		crepro = res.Repro.CRepro
   277  		title = res.Repro.Report.Title
   278  	}
   279  	log.Logf(0, "repro finished '%v', repro=%v crepro=%v desc='%v' hub=%v from_dashboard=%v",
   280  		crash.FullTitle(), res.Repro != nil, crepro, title, crash.FromHub, crash.FromDashboard,
   281  	)
   282  }
   283  
   284  func (r *ReproLoop) adjustPoolSizeLocked() {
   285  	// Avoid the +-1 jitter by considering the repro queue size as well.
   286  	// We process same-titled crashes sequentially, so only count unique ones.
   287  	uniqueTitles := maps.Clone(r.reproducing)
   288  	for _, crash := range r.queue {
   289  		uniqueTitles[crash.FullTitle()] = true
   290  	}
   291  
   292  	needRepros := len(uniqueTitles)
   293  	VMs := min(r.reproVMs, r.calculateReproVMs(needRepros))
   294  	r.mgr.ResizeReproPool(VMs)
   295  }