github.com/google/syzkaller@v0.0.0-20251211124644-a066d2bc4b02/pkg/manager/repro.go (about) 1 // Copyright 2024 syzkaller project authors. All rights reserved. 2 // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. 3 4 package manager 5 6 import ( 7 "context" 8 "fmt" 9 "maps" 10 "slices" 11 "sync" 12 13 "github.com/google/syzkaller/pkg/log" 14 "github.com/google/syzkaller/pkg/report" 15 "github.com/google/syzkaller/pkg/repro" 16 "github.com/google/syzkaller/pkg/stat" 17 ) 18 19 type ReproResult struct { 20 Crash *Crash // the original crash 21 Repro *repro.Result 22 Strace *repro.StraceResult 23 Stats *repro.Stats 24 Err error 25 } 26 27 type Crash struct { 28 InstanceIndex int 29 FromHub bool // this crash was created based on a repro from syz-hub 30 FromDashboard bool // .. or from dashboard 31 Manual bool 32 FullRepro bool // used by the diff fuzzer to do a full scale reproduction 33 *report.Report 34 TailReports []*report.Report 35 } 36 37 func (c *Crash) FullTitle() string { 38 suffix := "" 39 if c.FullRepro { 40 suffix = " (full)" 41 } 42 if c.Report.Title != "" { 43 return c.Report.Title + suffix 44 } 45 // Just use some unique, but stable titles. 
46 if c.FromDashboard { 47 return fmt.Sprintf("dashboard crash %p%s", c, suffix) 48 } else if c.FromHub { 49 return fmt.Sprintf("crash from hub %p%s", c, suffix) 50 } 51 panic("the crash is expected to have a report") 52 } 53 54 type ReproManagerView interface { 55 RunRepro(ctx context.Context, crash *Crash) *ReproResult 56 NeedRepro(crash *Crash) bool 57 ResizeReproPool(size int) 58 } 59 60 type ReproLoop struct { 61 statNumReproducing *stat.Val 62 statPending *stat.Val 63 64 onlyOnce bool 65 mgr ReproManagerView 66 parallel chan struct{} 67 pingQueue chan struct{} 68 reproVMs int 69 70 mu sync.Mutex 71 queue []*Crash 72 reproducing map[string]bool 73 enqueued map[string]bool 74 attempts map[string]int 75 } 76 77 func NewReproLoop(mgr ReproManagerView, reproVMs int, onlyOnce bool) *ReproLoop { 78 ret := &ReproLoop{ 79 mgr: mgr, 80 onlyOnce: onlyOnce, 81 parallel: make(chan struct{}, reproVMs), 82 reproVMs: reproVMs, 83 reproducing: map[string]bool{}, 84 pingQueue: make(chan struct{}, 1), 85 enqueued: map[string]bool{}, 86 attempts: map[string]int{}, 87 } 88 ret.statNumReproducing = stat.New("reproducing", "Number of crashes being reproduced", 89 stat.Console, stat.NoGraph, func() int { 90 ret.mu.Lock() 91 defer ret.mu.Unlock() 92 return len(ret.reproducing) 93 }) 94 ret.statPending = stat.New("pending", "Number of pending repro tasks", 95 stat.Console, stat.NoGraph, func() int { 96 ret.mu.Lock() 97 defer ret.mu.Unlock() 98 return len(ret.queue) 99 }) 100 return ret 101 } 102 103 func (r *ReproLoop) CanReproMore() bool { 104 return len(r.parallel) != 0 105 } 106 107 func (r *ReproLoop) Reproducing() map[string]bool { 108 r.mu.Lock() 109 defer r.mu.Unlock() 110 return maps.Clone(r.reproducing) 111 } 112 113 // Empty returns true if there are neither running nor planned bug reproductions. 
func (r *ReproLoop) Empty() bool {
	r.mu.Lock()
	defer r.mu.Unlock()
	// Both in-flight and still queued reproductions count as work.
	return len(r.reproducing) == 0 && len(r.queue) == 0
}

// Enqueue adds the crash to the reproduction queue and wakes up Loop.
// In the onlyOnce mode a crash whose title was already enqueued before is silently dropped.
func (r *ReproLoop) Enqueue(crash *Crash) {
	r.mu.Lock()
	defer r.mu.Unlock()

	title := crash.FullTitle()
	if r.onlyOnce && r.enqueued[title] {
		// Try to reproduce each bug at most 1 time in this mode.
		// Since we don't upload bugs/repros to dashboard, it likely won't have
		// the reproducer even if we succeeded last time, and will repeatedly
		// say it needs a repro.
		return
	}
	log.Logf(1, "scheduled a reproduction of '%v'", title)
	r.enqueued[title] = true
	r.queue = append(r.queue, crash)

	// Ping the loop.
	// The send is non-blocking: if a ping is already pending, one more is not needed.
	select {
	case r.pingQueue <- struct{}{}:
	default:
	}
}

// popCrash removes the best candidate from the queue and returns it.
// Crashes whose title is being reproduced right now are not considered.
// It returns nil if no queued crash is eligible.
func (r *ReproLoop) popCrash() *Crash {
	r.mu.Lock()
	defer r.mu.Unlock()

	// newBetter reports whether new should be preferred over base.
	// The criteria are applied in the order they are written; the first one
	// on which the two crashes differ decides.
	newBetter := func(base, new *Crash) bool {
		// If diff fuzzer has requested a full reproduction, do it first.
		if base.FullRepro != new.FullRepro {
			return new.FullRepro
		}
		// The more times we failed, the less likely we are to actually
		// find a reproducer. Give preference to not yet attempted repro runs.
		baseTitle, newTitle := base.FullTitle(), new.FullTitle()
		if r.attempts[baseTitle] != r.attempts[newTitle] {
			return r.attempts[newTitle] < r.attempts[baseTitle]
		}
		// First, serve manual requests.
		if new.Manual != base.Manual {
			return new.Manual
		}
		// Then, deprioritize hub reproducers.
		if new.FromHub != base.FromHub {
			return !new.FromHub
		}
		// Equal priority: keep base, i.e. the crash that is earlier in the queue.
		return false
	}

	// idx is the position of the best eligible crash seen so far, -1 if none.
	idx := -1
	for i, crash := range r.queue {
		if r.reproducing[crash.FullTitle()] {
			// Same-titled crashes are never reproduced concurrently.
			continue
		}
		if idx == -1 || newBetter(r.queue[idx], r.queue[i]) {
			idx = i
		}
	}
	if idx == -1 {
		return nil
	}
	crash := r.queue[idx]
	r.queue = slices.Delete(r.queue, idx, idx+1)
	return crash
}

// Loop takes crashes from the queue and reproduces them, each in its own goroutine,
// until ctx is cancelled. Before returning it waits for the reproductions that are
// still running.
func (r *ReproLoop) Loop(ctx context.Context) {
	defer log.Logf(1, "repro loop terminated")

	// Create as many slot tokens as there are reproductions that fit into reproVMs
	// according to calculateReproVMs.
	count := 0
	for ; r.calculateReproVMs(count+1) <= r.reproVMs; count++ {
		r.parallel <- struct{}{}
	}
	log.Logf(0, "starting bug reproductions (max %d VMs, %d repros)", r.reproVMs, count)

	// Deferred calls run in the reverse order, so the wait happens before
	// the termination message above is logged.
	var wg sync.WaitGroup
	defer wg.Wait()

	for {
		// Find the next crash that still needs a reproducer,
		// sleeping on pingQueue while there are no candidates.
		crash := r.popCrash()
		for {
			if crash != nil && !r.mgr.NeedRepro(crash) {
				log.Logf(1, "reproduction of %q aborted: it's no longer needed", crash.FullTitle())
				// Now we might not need that many VMs.
				r.mu.Lock()
				r.adjustPoolSizeLocked()
				r.mu.Unlock()

				// Immediately check if there was any other crash in the queue, so that we fall back
				// to waiting on pingQueue only if there were really no other crashes in the queue.
				crash = r.popCrash()
				continue
			}
			if crash != nil {
				break
			}
			select {
			case <-r.pingQueue:
				crash = r.popCrash()
			case <-ctx.Done():
				return
			}
		}

		// Now wait until we can schedule another runner.
		select {
		case <-r.parallel:
		case <-ctx.Done():
			return
		}

		// Record the attempt and mark the title as busy before the goroutine starts,
		// so that popCrash skips same-titled crashes from now on.
		title := crash.FullTitle()
		r.mu.Lock()
		r.attempts[title]++
		r.reproducing[title] = true
		r.adjustPoolSizeLocked()
		r.mu.Unlock()

		wg.Add(1)
		go func() {
			defer wg.Done()

			r.handle(ctx, crash)

			r.mu.Lock()
			delete(r.reproducing, title)
			r.adjustPoolSizeLocked()
			r.mu.Unlock()

			// Give the slot back.
			r.parallel <- struct{}{}
			// If the context is cancelled, no one is listening on pingQueue.
			// That is why the ping must not block.
			select {
			case r.pingQueue <- struct{}{}:
			default:
			}
		}()
	}
}

// calculateReproVMs returns the number of VMs needed to run the given number of
// reproductions concurrently. It is 4/3 of repros rounded up, except that a single
// reproduction is allowed to run on a single VM if that is all we have.
func (r *ReproLoop) calculateReproVMs(repros int) int {
	// Let's allocate 1.33 VMs per a reproducer thread.
	if r.reproVMs == 1 && repros == 1 {
		// With one exception -- if we have only one VM, let's still do one repro.
		return 1
	}
	// Integer form of ceil(repros * 4 / 3).
	return (repros*4 + 2) / 3
}

// handle runs the reproduction of the crash via the manager and logs the outcome.
// The result is not used for anything but logging here.
func (r *ReproLoop) handle(ctx context.Context, crash *Crash) {
	log.Logf(0, "start reproducing '%v'", crash.FullTitle())

	// NOTE(review): the code below assumes that RunRepro never returns nil and that
	// a non-nil res.Repro always carries a Report -- confirm against the implementations.
	res := r.mgr.RunRepro(ctx, crash)

	crepro := false
	title := ""
	if res.Repro != nil {
		crepro = res.Repro.CRepro
		title = res.Repro.Report.Title
	}
	log.Logf(0, "repro finished '%v', repro=%v crepro=%v desc='%v' hub=%v from_dashboard=%v",
		crash.FullTitle(), res.Repro != nil, crepro, title, crash.FromHub, crash.FromDashboard,
	)
}

// adjustPoolSizeLocked asks the manager to resize the repro pool to what the running
// and the queued reproductions need, but to no more than reproVMs.
// It reads r.queue and r.reproducing, so the caller must hold r.mu.
func (r *ReproLoop) adjustPoolSizeLocked() {
	// Avoid the +-1 jitter by considering the repro queue size as well.
	// We process same-titled crashes sequentially, so only count unique ones.
	uniqueTitles := maps.Clone(r.reproducing)
	for _, crash := range r.queue {
		uniqueTitles[crash.FullTitle()] = true
	}

	needRepros := len(uniqueTitles)
	VMs := min(r.reproVMs, r.calculateReproVMs(needRepros))
	r.mgr.ResizeReproPool(VMs)
}