github.com/google/syzkaller@v0.0.0-20251211124644-a066d2bc4b02/vm/dispatcher/pool.go (about) 1 // Copyright 2024 syzkaller project authors. All rights reserved. 2 // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. 3 4 package dispatcher 5 6 import ( 7 "context" 8 "io" 9 "sync" 10 "time" 11 12 "github.com/google/syzkaller/pkg/log" 13 "github.com/google/syzkaller/pkg/stat" 14 ) 15 16 type Instance interface { 17 io.Closer 18 } 19 20 type UpdateInfo func(cb func(info *Info)) 21 type Runner[T Instance] func(ctx context.Context, inst T, updInfo UpdateInfo) 22 type CreateInstance[T Instance] func(context.Context, int) (T, error) 23 24 // Pool[T] provides the functionality of a generic pool of instances. 25 // The instance is assumed to boot, be controlled by one Runner and then be re-created. 26 // The pool is assumed to have one default Runner (e.g. to be used for fuzzing), while a 27 // dynamically controlled sub-pool might be reserved for the arbitrary Runners. 28 type Pool[T Instance] struct { 29 BootErrors chan error 30 BootTime stat.AverageValue[time.Duration] 31 32 creator CreateInstance[T] 33 defaultJob Runner[T] 34 jobs chan Runner[T] 35 36 // The mutex serializes ReserveForRun() and SetDefault() calls. 37 mu *sync.Mutex 38 cv *sync.Cond 39 instances []*poolInstance[T] 40 paused bool 41 } 42 43 const bootErrorChanCap = 16 44 45 func NewPool[T Instance](count int, creator CreateInstance[T], def Runner[T]) *Pool[T] { 46 instances := make([]*poolInstance[T], count) 47 for i := 0; i < count; i++ { 48 inst := &poolInstance[T]{ 49 job: def, 50 idx: i, 51 } 52 inst.reset(func() {}) 53 instances[i] = inst 54 } 55 mu := new(sync.Mutex) 56 return &Pool[T]{ 57 BootErrors: make(chan error, bootErrorChanCap), 58 creator: creator, 59 defaultJob: def, 60 instances: instances, 61 jobs: make(chan Runner[T]), 62 mu: mu, 63 cv: sync.NewCond(mu), 64 } 65 } 66 67 // UpdateDefault forces all VMs to restart. 
func (p *Pool[T]) SetDefault(def Runner[T]) {
	p.mu.Lock()
	defer p.mu.Unlock()
	p.defaultJob = def
	// Restart every non-reserved instance so that it picks up the new runner.
	p.kickDefault()
}

// kickDefault re-assigns the current default job to all non-reserved
// instances, force-restarting those that were already running one.
// Must be called with p.mu held.
func (p *Pool[T]) kickDefault() {
	for _, inst := range p.instances {
		if !inst.reserved() {
			inst.free(p.defaultJob)
		}
	}
}

// TogglePause pauses (paused=true) or resumes (paused=false) the pool.
func (p *Pool[T]) TogglePause(paused bool) {
	p.mu.Lock()
	defer p.mu.Unlock()
	p.paused = paused
	if paused {
		// Restart the non-reserved instances; their runInstance loops will
		// then block in waitUnpaused() until the pool is resumed.
		p.kickDefault()
	} else {
		// Wake up all loops blocked in waitUnpaused().
		p.cv.Broadcast()
	}
}

// waitUnpaused blocks until the pool is no longer paused.
func (p *Pool[T]) waitUnpaused() {
	p.mu.Lock()
	defer p.mu.Unlock()
	for p.paused {
		p.cv.Wait()
	}
}

// Loop runs the pool until ctx is cancelled: each instance is repeatedly
// booted, used for one job and then re-created.
func (p *Pool[T]) Loop(ctx context.Context) {
	var wg sync.WaitGroup
	wg.Add(len(p.instances))
	for _, inst := range p.instances {
		go func() {
			// One full instance lifecycle per iteration.
			for ctx.Err() == nil {
				p.runInstance(ctx, inst)
			}
			wg.Done()
		}()
	}
	wg.Wait()
}

// runInstance performs one full lifecycle of a single instance: boot it,
// obtain a job (either the preset default or, for reserved instances, one
// received over a channel), run the job and return. The caller (Loop)
// re-creates the instance afterwards.
func (p *Pool[T]) runInstance(ctx context.Context, inst *poolInstance[T]) {
	p.waitUnpaused()
	// Per-instance context; cancelling it (via inst.stop) force-restarts
	// the instance.
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
	log.Logf(2, "pool: booting instance %d", inst.idx)

	inst.reset(cancel)

	start := time.Now()
	inst.status(StateBooting)
	defer inst.status(StateOffline)

	obj, err := p.creator(ctx, inst.idx)
	if err != nil {
		p.reportBootError(ctx, err)
		return
	}
	defer obj.Close()

	p.BootTime.Save(time.Since(start))

	inst.status(StateWaiting)
	// The job and jobChan fields are subject to concurrent updates.
	inst.mu.Lock()
	job, jobChan := inst.job, inst.jobChan
	inst.mu.Unlock()

	if job == nil {
		// The instance is reserved: wait until either a custom job arrives
		// (jobChan), the instance is released back to the default job
		// (switchToJob), or the instance is stopped.
		select {
		case newJob := <-jobChan:
			job = newJob
		case newJob := <-inst.switchToJob:
			job = newJob
		case <-ctx.Done():
			return
		}
	}

	inst.status(StateRunning)
	job(ctx, obj, inst.updateInfo)
}

// reportBootError forwards a boot error to the BootErrors channel.
// If the channel is full, it logs a warning and then blocks until the
// error is consumed or ctx is cancelled.
func (p *Pool[T]) reportBootError(ctx context.Context, err error) {
	// First try a non-blocking send.
	select {
	case p.BootErrors <- err:
		return
	default:
		// Print some log message to make it visible.
		log.Logf(0, "WARNING: boot error channel is full!")
	}
	select {
	case p.BootErrors <- err:
	case <-ctx.Done():
		// On context cancellation, no one might be listening on the channel.
	}
}

// ReserveForRun specifies the size of the sub-pool for the execution of custom runners.
// The reserved instances will be booted, but the pool will not start the default runner.
// To unreserve all instances, execute ReserveForRun(0).
func (p *Pool[T]) ReserveForRun(count int) {
	p.mu.Lock()
	defer p.mu.Unlock()

	if count > len(p.instances) {
		panic("trying to reserve more VMs than present")
	}

	// Split the instances by their current reservation status.
	var free, reserved []*poolInstance[T]
	for _, inst := range p.instances {
		if inst.reserved() {
			reserved = append(reserved, inst)
		} else {
			free = append(free, inst)
		}
	}

	// Reserve more instances if the target count grew.
	needReserve := count - len(reserved)
	for i := 0; i < needReserve; i++ {
		log.Logf(2, "pool: reserving instance %d", free[i].idx)
		free[i].reserve(p.jobs)
	}

	// Release instances back to the default job if the target count shrank.
	needFree := len(reserved) - count
	for i := 0; i < needFree; i++ {
		log.Logf(2, "pool: releasing instance %d", reserved[i].idx)
		reserved[i].free(p.defaultJob)
	}
}

// Run blocks until it has found an instance to execute job and until job has finished.
// Returns an error if the job was aborted by cancelling the context.
208 func (p *Pool[T]) Run(ctx context.Context, job Runner[T]) error { 209 done := make(chan error) 210 // Submit the job. 211 select { 212 case p.jobs <- func(jobCtx context.Context, inst T, upd UpdateInfo) { 213 mergedCtx, cancel := mergeContextCancel(jobCtx, ctx) 214 defer cancel() 215 216 job(mergedCtx, inst, upd) 217 done <- mergedCtx.Err() 218 }: 219 case <-ctx.Done(): 220 // If the loop is aborted, no one is going to pick up the job. 221 return ctx.Err() 222 } 223 // Await the job. 224 return <-done 225 } 226 227 func (p *Pool[T]) Total() int { 228 return len(p.instances) 229 } 230 231 type Info struct { 232 State InstanceState 233 Status string 234 LastUpdate time.Time 235 Reserved bool 236 237 // The optional callbacks. 238 MachineInfo func() []byte 239 DetailedStatus func() []byte 240 } 241 242 func (p *Pool[T]) State() []Info { 243 p.mu.Lock() 244 defer p.mu.Unlock() 245 246 ret := make([]Info, len(p.instances)) 247 for i, inst := range p.instances { 248 ret[i] = inst.getInfo() 249 } 250 return ret 251 } 252 253 // poolInstance is not thread safe. 254 type poolInstance[T Instance] struct { 255 mu sync.Mutex 256 info Info 257 idx int 258 259 // Either job or jobChan will be set. 260 job Runner[T] 261 jobChan chan Runner[T] 262 switchToJob chan Runner[T] 263 stop func() 264 } 265 266 type InstanceState int 267 268 const ( 269 StateOffline InstanceState = iota 270 StateBooting 271 StateWaiting 272 StateRunning 273 ) 274 275 // reset() and status() may be called concurrently to all other methods. 276 // Other methods themselves are serialized. 
// reset prepares the instance bookkeeping for a fresh run: it clears the
// info (preserving only the reservation flag), installs the new stop
// callback and allocates a fresh switchToJob channel for this run.
func (pi *poolInstance[T]) reset(stop func()) {
	pi.mu.Lock()
	defer pi.mu.Unlock()

	pi.info = Info{
		State:      StateOffline,
		LastUpdate: time.Now(),
		Reserved:   pi.info.Reserved,
	}
	pi.stop = stop
	// Each run gets its own channel, so stale sends cannot reach a new run.
	pi.switchToJob = make(chan Runner[T])
}

// updateInfo applies upd to the instance info under the lock and bumps
// the LastUpdate timestamp.
func (pi *poolInstance[T]) updateInfo(upd func(*Info)) {
	pi.mu.Lock()
	defer pi.mu.Unlock()
	upd(&pi.info)
	pi.info.LastUpdate = time.Now()
}

// status records the new instance state.
func (pi *poolInstance[T]) status(status InstanceState) {
	pi.updateInfo(func(info *Info) {
		info.State = status
	})
}

// reserved reports whether the instance belongs to the reserved sub-pool
// (i.e. takes jobs from a channel instead of running the default job).
// NOTE(review): reads jobChan without pi.mu; callers appear to hold the
// pool-level mutex — confirm before adding new call sites.
func (pi *poolInstance[T]) reserved() bool {
	return pi.jobChan != nil
}

// getInfo returns a copy of the current instance info.
func (pi *poolInstance[T]) getInfo() Info {
	pi.mu.Lock()
	defer pi.mu.Unlock()
	return pi.info
}

// reserve moves the instance into the reserved sub-pool: it will wait for
// custom jobs on ch and is force-restarted to drop its current job.
func (pi *poolInstance[T]) reserve(ch chan Runner[T]) {
	pi.mu.Lock()
	// If we don't take the lock, it's possible that instance restart would race with job/jobChan update.
	pi.stop()
	pi.jobChan = ch
	pi.job = nil
	pi.info.Reserved = true
	pi.mu.Unlock()
}

// free returns the instance from the reserved sub-pool and assigns it the
// given (default) job.
func (pi *poolInstance[T]) free(job Runner[T]) {
	pi.mu.Lock()
	if pi.job != nil {
		// A change of a default job, let's force restart the instance.
		pi.stop()
	}
	pi.job = job
	pi.jobChan = nil
	// Capture the channel under the lock; reset() may replace it later.
	switchToJob := pi.switchToJob
	pi.info.Reserved = false
	pi.mu.Unlock()

	// Non-blocking send: the instance may not currently be waiting.
	select {
	case switchToJob <- job:
		// Just in case the instance has been waiting.
		return
	default:
	}
}

// mergeContextCancel derives a context from main that is additionally
// cancelled as soon as monitor is done. The returned cancel func must be
// called to release the helper goroutine.
func mergeContextCancel(main, monitor context.Context) (context.Context, func()) {
	withCancel, cancel := context.WithCancel(main)
	go func() {
		select {
		case <-withCancel.Done():
		case <-monitor.Done():
		}
		cancel()
	}()
	return withCancel, cancel
}