github.com/google/syzkaller@v0.0.0-20251211124644-a066d2bc4b02/vm/dispatcher/pool.go (about)

     1  // Copyright 2024 syzkaller project authors. All rights reserved.
     2  // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
     3  
     4  package dispatcher
     5  
     6  import (
     7  	"context"
     8  	"io"
     9  	"sync"
    10  	"time"
    11  
    12  	"github.com/google/syzkaller/pkg/log"
    13  	"github.com/google/syzkaller/pkg/stat"
    14  )
    15  
// Instance is the minimal contract the pool requires from a VM instance:
// the pool only needs to close it once its runner has finished.
type Instance interface {
	io.Closer
}
    19  
// UpdateInfo lets a Runner atomically update the Info record of the instance it runs on.
type UpdateInfo func(cb func(info *Info))

// Runner executes some workload on the booted instance inst until ctx is canceled.
type Runner[T Instance] func(ctx context.Context, inst T, updInfo UpdateInfo)

// CreateInstance creates (boots) the pool instance with the given index.
type CreateInstance[T Instance] func(context.Context, int) (T, error)
    23  
// Pool[T] provides the functionality of a generic pool of instances.
// The instance is assumed to boot, be controlled by one Runner and then be re-created.
// The pool is assumed to have one default Runner (e.g. to be used for fuzzing), while a
// dynamically controlled sub-pool might be reserved for the arbitrary Runners.
type Pool[T Instance] struct {
	// BootErrors delivers instance creation failures to the pool's user
	// (see reportBootError).
	BootErrors chan error
	// BootTime accumulates the observed instance boot durations.
	BootTime stat.AverageValue[time.Duration]

	creator    CreateInstance[T]
	defaultJob Runner[T]
	jobs       chan Runner[T] // jobs submitted via Run(), consumed by reserved instances

	// The mutex serializes ReserveForRun() and SetDefault() calls.
	mu        *sync.Mutex
	cv        *sync.Cond // broadcast on unpause, see TogglePause()/waitUnpaused()
	instances []*poolInstance[T]
	paused    bool
}
    42  
    43  const bootErrorChanCap = 16
    44  
    45  func NewPool[T Instance](count int, creator CreateInstance[T], def Runner[T]) *Pool[T] {
    46  	instances := make([]*poolInstance[T], count)
    47  	for i := 0; i < count; i++ {
    48  		inst := &poolInstance[T]{
    49  			job: def,
    50  			idx: i,
    51  		}
    52  		inst.reset(func() {})
    53  		instances[i] = inst
    54  	}
    55  	mu := new(sync.Mutex)
    56  	return &Pool[T]{
    57  		BootErrors: make(chan error, bootErrorChanCap),
    58  		creator:    creator,
    59  		defaultJob: def,
    60  		instances:  instances,
    61  		jobs:       make(chan Runner[T]),
    62  		mu:         mu,
    63  		cv:         sync.NewCond(mu),
    64  	}
    65  }
    66  
    67  // UpdateDefault forces all VMs to restart.
    68  func (p *Pool[T]) SetDefault(def Runner[T]) {
    69  	p.mu.Lock()
    70  	defer p.mu.Unlock()
    71  	p.defaultJob = def
    72  	p.kickDefault()
    73  }
    74  
    75  func (p *Pool[T]) kickDefault() {
    76  	for _, inst := range p.instances {
    77  		if !inst.reserved() {
    78  			inst.free(p.defaultJob)
    79  		}
    80  	}
    81  }
    82  
    83  func (p *Pool[T]) TogglePause(paused bool) {
    84  	p.mu.Lock()
    85  	defer p.mu.Unlock()
    86  	p.paused = paused
    87  	if paused {
    88  		p.kickDefault()
    89  	} else {
    90  		p.cv.Broadcast()
    91  	}
    92  }
    93  
// waitUnpaused blocks until the pool is not paused
// (the condition variable is broadcast from TogglePause).
func (p *Pool[T]) waitUnpaused() {
	p.mu.Lock()
	defer p.mu.Unlock()
	for p.paused {
		p.cv.Wait()
	}
}
   101  
   102  func (p *Pool[T]) Loop(ctx context.Context) {
   103  	var wg sync.WaitGroup
   104  	wg.Add(len(p.instances))
   105  	for _, inst := range p.instances {
   106  		go func() {
   107  			for ctx.Err() == nil {
   108  				p.runInstance(ctx, inst)
   109  			}
   110  			wg.Done()
   111  		}()
   112  	}
   113  	wg.Wait()
   114  }
   115  
// runInstance performs one full lifecycle of a single instance: boot it via
// p.creator, determine the job to execute and run the job until it returns
// or the instance context is canceled. Loop() calls it again afterwards.
func (p *Pool[T]) runInstance(ctx context.Context, inst *poolInstance[T]) {
	p.waitUnpaused()
	// Per-instance context: canceling it (via inst.stop, installed by reset
	// below) restarts just this one instance.
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
	log.Logf(2, "pool: booting instance %d", inst.idx)

	inst.reset(cancel)

	start := time.Now()
	inst.status(StateBooting)
	defer inst.status(StateOffline)

	obj, err := p.creator(ctx, inst.idx)
	if err != nil {
		p.reportBootError(ctx, err)
		return
	}
	defer obj.Close()

	p.BootTime.Save(time.Since(start))

	inst.status(StateWaiting)
	// The job and jobChan fields are subject to concurrent updates.
	inst.mu.Lock()
	job, jobChan := inst.job, inst.jobChan
	inst.mu.Unlock()

	if job == nil {
		// The instance is reserved: wait for a job submitted via Run()
		// (jobChan) or for free() switching it back to a default job.
		select {
		case newJob := <-jobChan:
			job = newJob
		case newJob := <-inst.switchToJob:
			job = newJob
		case <-ctx.Done():
			return
		}
	}

	inst.status(StateRunning)
	job(ctx, obj, inst.updateInfo)
}
   157  
   158  func (p *Pool[T]) reportBootError(ctx context.Context, err error) {
   159  	select {
   160  	case p.BootErrors <- err:
   161  		return
   162  	default:
   163  		// Print some log message to make it visible.
   164  		log.Logf(0, "WARNING: boot error channel is full!")
   165  	}
   166  	select {
   167  	case p.BootErrors <- err:
   168  	case <-ctx.Done():
   169  		// On context cancellation, no one might be listening on the channel.
   170  	}
   171  }
   172  
   173  // ReserveForRun specifies the size of the sub-pool for the execution of custom runners.
   174  // The reserved instances will be booted, but the pool will not start the default runner.
   175  // To unreserve all instances, execute ReserveForRun(0).
   176  func (p *Pool[T]) ReserveForRun(count int) {
   177  	p.mu.Lock()
   178  	defer p.mu.Unlock()
   179  
   180  	if count > len(p.instances) {
   181  		panic("trying to reserve more VMs than present")
   182  	}
   183  
   184  	var free, reserved []*poolInstance[T]
   185  	for _, inst := range p.instances {
   186  		if inst.reserved() {
   187  			reserved = append(reserved, inst)
   188  		} else {
   189  			free = append(free, inst)
   190  		}
   191  	}
   192  
   193  	needReserve := count - len(reserved)
   194  	for i := 0; i < needReserve; i++ {
   195  		log.Logf(2, "pool: reserving instance %d", free[i].idx)
   196  		free[i].reserve(p.jobs)
   197  	}
   198  
   199  	needFree := len(reserved) - count
   200  	for i := 0; i < needFree; i++ {
   201  		log.Logf(2, "pool: releasing instance %d", reserved[i].idx)
   202  		reserved[i].free(p.defaultJob)
   203  	}
   204  }
   205  
// Run blocks until it has found an instance to execute job and until job has finished.
// Returns an error if the job was aborted by cancelling the context.
func (p *Pool[T]) Run(ctx context.Context, job Runner[T]) error {
	done := make(chan error)
	// Submit the job.
	select {
	case p.jobs <- func(jobCtx context.Context, inst T, upd UpdateInfo) {
		// jobCtx belongs to the instance, ctx to the Run() caller;
		// the job must stop once either of them is canceled.
		mergedCtx, cancel := mergeContextCancel(jobCtx, ctx)
		defer cancel()

		job(mergedCtx, inst, upd)
		// Report whether the job was aborted via context cancellation.
		done <- mergedCtx.Err()
	}:
	case <-ctx.Done():
		// If the loop is aborted, no one is going to pick up the job.
		return ctx.Err()
	}
	// Await the job.
	return <-done
}
   226  
// Total returns the overall number of instances in the pool.
func (p *Pool[T]) Total() int {
	return len(p.instances)
}
   230  
// Info is an externally visible snapshot of a single instance's state.
type Info struct {
	State      InstanceState
	Status     string    // free-form status line, set by the Runner via UpdateInfo
	LastUpdate time.Time // time of the last updateInfo() call
	Reserved   bool      // whether the instance belongs to the reserved sub-pool

	// The optional callbacks.
	MachineInfo    func() []byte
	DetailedStatus func() []byte
}
   241  
   242  func (p *Pool[T]) State() []Info {
   243  	p.mu.Lock()
   244  	defer p.mu.Unlock()
   245  
   246  	ret := make([]Info, len(p.instances))
   247  	for i, inst := range p.instances {
   248  		ret[i] = inst.getInfo()
   249  	}
   250  	return ret
   251  }
   252  
// poolInstance tracks the state of a single instance slot of the pool.
// Concurrency: reset() and status()/updateInfo() may be called concurrently
// to the other methods (they take mu); reserve() and free() are serialized
// by the owning Pool's mutex (see kickDefault/ReserveForRun).
type poolInstance[T Instance] struct {
	mu   sync.Mutex
	info Info
	idx  int // stable index of this slot, passed to CreateInstance

	// Either job or jobChan will be set.
	job         Runner[T]
	jobChan     chan Runner[T] // non-nil iff the instance is reserved
	switchToJob chan Runner[T] // wakes up a runInstance() waiting for a job, see free()
	stop        func()         // cancels the instance context, forcing a restart
}
   265  
// InstanceState describes the lifecycle phase of a pool instance.
type InstanceState int

const (
	StateOffline InstanceState = iota // not booted (or already shut down)
	StateBooting                      // CreateInstance call in progress
	StateWaiting                      // booted, waiting for a job
	StateRunning                      // a Runner is executing
)
   274  
   275  // reset() and status() may be called concurrently to all other methods.
   276  // Other methods themselves are serialized.
   277  func (pi *poolInstance[T]) reset(stop func()) {
   278  	pi.mu.Lock()
   279  	defer pi.mu.Unlock()
   280  
   281  	pi.info = Info{
   282  		State:      StateOffline,
   283  		LastUpdate: time.Now(),
   284  		Reserved:   pi.info.Reserved,
   285  	}
   286  	pi.stop = stop
   287  	pi.switchToJob = make(chan Runner[T])
   288  }
   289  
// updateInfo atomically applies upd to the instance's Info record and
// refreshes the LastUpdate timestamp.
func (pi *poolInstance[T]) updateInfo(upd func(*Info)) {
	pi.mu.Lock()
	defer pi.mu.Unlock()
	upd(&pi.info)
	pi.info.LastUpdate = time.Now()
}
   296  
// status records the new lifecycle state of the instance.
func (pi *poolInstance[T]) status(status InstanceState) {
	pi.updateInfo(func(info *Info) {
		info.State = status
	})
}
   302  
// reserved reports whether the instance belongs to the reserved sub-pool.
// Its callers (kickDefault, ReserveForRun) hold the Pool's mutex, which
// serializes this read with the jobChan updates in reserve()/free().
func (pi *poolInstance[T]) reserved() bool {
	return pi.jobChan != nil
}
   306  
// getInfo returns a consistent copy of the instance's Info record.
func (pi *poolInstance[T]) getInfo() Info {
	pi.mu.Lock()
	defer pi.mu.Unlock()
	return pi.info
}
   312  
   313  func (pi *poolInstance[T]) reserve(ch chan Runner[T]) {
   314  	pi.mu.Lock()
   315  	// If we don't take the lock, it's possible that instance restart would race with job/jobChan update.
   316  	pi.stop()
   317  	pi.jobChan = ch
   318  	pi.job = nil
   319  	pi.info.Reserved = true
   320  	pi.mu.Unlock()
   321  }
   322  
// free takes the instance out of the reserved sub-pool (if it was there) and
// assigns it the given job. If the instance was already running a job, it is
// force-restarted so that it picks up the new one.
func (pi *poolInstance[T]) free(job Runner[T]) {
	pi.mu.Lock()
	if pi.job != nil {
		// A change of a default job, let's force restart the instance.
		pi.stop()
	}
	pi.job = job
	pi.jobChan = nil
	switchToJob := pi.switchToJob
	pi.info.Reserved = false
	pi.mu.Unlock()

	// Non-blocking handoff: it only succeeds if runInstance() is currently
	// parked in its select waiting for a job; otherwise the instance will
	// pick up pi.job on its next cycle.
	select {
	case switchToJob <- job:
		// Just in case the instance has been waiting.
		return
	default:
	}
}
   342  
   343  func mergeContextCancel(main, monitor context.Context) (context.Context, func()) {
   344  	withCancel, cancel := context.WithCancel(main)
   345  	go func() {
   346  		select {
   347  		case <-withCancel.Done():
   348  		case <-monitor.Done():
   349  		}
   350  		cancel()
   351  	}()
   352  	return withCancel, cancel
   353  }