github.com/JimmyHuang454/JLS-go@v0.0.0-20230831150107-90d536585ba0/internal/fuzz/worker.go

     1  // Copyright 2020 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package fuzz
     6  
     7  import (
     8  	"bytes"
     9  	"context"
    10  	"crypto/sha256"
    11  	"encoding/json"
    12  	"errors"
    13  	"fmt"
    14  	"io"
    15  	"os"
    16  	"os/exec"
    17  	"reflect"
    18  	"runtime"
    19  	"sync"
    20  	"time"
    21  )
    22  
    23  const (
    24  	// workerFuzzDuration is the amount of time a worker can spend testing random
    25  	// variations of an input given by the coordinator.
    26  	workerFuzzDuration = 100 * time.Millisecond
    27  
    28  	// workerTimeoutDuration is the amount of time a worker can go without
    29  	// responding to the coordinator before being stopped.
    30  	workerTimeoutDuration = 1 * time.Second
    31  
    32  	// workerExitCode is used as an exit code by fuzz worker processes after an internal error.
    33  	// This distinguishes internal errors from uncontrolled panics and other crashes.
    34  	// Keep in sync with internal/fuzz.workerExitCode.
    35  	workerExitCode = 70
    36  
    37  	// workerSharedMemSize is the maximum size of the shared memory file used to
    38  	// communicate with workers. This limits the size of fuzz inputs.
    39  	workerSharedMemSize = 100 << 20 // 100 MB
    40  )
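        // A quick check on the shift arithmetic above: 100 << 20 is
        // 100 * 2^20 = 104,857,600 bytes, i.e. 100 MiB (illustrative):
        //
        //	fmt.Println(100 << 20) // 104857600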
    41  
    42  // worker manages a worker process running a test binary. The worker object
    43  // exists only in the coordinator (the process started by 'go test -fuzz').
    44  // workerClient is used by the coordinator to send RPCs to the worker process,
    45  // which handles them with workerServer.
    46  type worker struct {
    47  	dir     string   // working directory, same as package directory
    48  	binPath string   // path to test executable
    49  	args    []string // arguments for test executable
    50  	env     []string // environment for test executable
    51  
    52  	coordinator *coordinator
    53  
    54  	memMu chan *sharedMem // mutex guarding shared memory with worker; persists across processes.
    55  
    56  	cmd         *exec.Cmd     // current worker process
    57  	client      *workerClient // used to communicate with worker process
    58  	waitErr     error         // last error returned by wait, set before termC is closed.
    59  	interrupted bool          // true after stop interrupts a running worker.
    60  	termC       chan struct{} // closed by wait when worker process terminates
    61  }
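        // memMu is a mutex implemented as a 1-buffered channel: receiving takes
        // the lock and yields the *sharedMem, sending releases it, and closing
        // the channel (see cleanup) makes later receives report !ok, which
        // client methods translate into errSharedMemClosed. A minimal sketch of
        // the idiom (illustrative, not part of the original file):
        //
        //	mem, ok := <-w.memMu // acquire; ok is false once cleanup has run
        //	if !ok {
        //		return errSharedMemClosed
        //	}
        //	defer func() { w.memMu <- mem }() // release for the next user
        //	// ... mem may be read and written exclusively here ...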
    62  
    63  func newWorker(c *coordinator, dir, binPath string, args, env []string) (*worker, error) {
    64  	mem, err := sharedMemTempFile(workerSharedMemSize)
    65  	if err != nil {
    66  		return nil, err
    67  	}
    68  	memMu := make(chan *sharedMem, 1)
    69  	memMu <- mem
    70  	return &worker{
    71  		dir:         dir,
    72  		binPath:     binPath,
    73  		args:        args,
    74  		env:         env[:len(env):len(env)], // copy on append to ensure workers don't overwrite each other.
    75  		coordinator: c,
    76  		memMu:       memMu,
    77  	}, nil
    78  }
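        // The three-index slice env[:len(env):len(env)] sets capacity equal to
        // length, so a later append must allocate a fresh array rather than
        // write into backing storage shared with another worker. A small
        // illustration of the difference (hypothetical values):
        //
        //	base := make([]string, 1, 4)   // spare capacity
        //	a := append(base, "A=1")       // reuses base's backing array
        //	b := append(base, "B=2")       // reuses it too: b[1] overwrites a[1]
        //	c := append(base[:1:1], "C=3") // capped, so append always copies
        //	_, _, _ = a, b, c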
    79  
    80  // cleanup releases persistent resources associated with the worker.
    81  func (w *worker) cleanup() error {
    82  	mem := <-w.memMu
    83  	if mem == nil {
    84  		return nil
    85  	}
    86  	close(w.memMu)
    87  	return mem.Close()
    88  }
    89  
    90  // coordinate runs the test binary to perform fuzzing.
    91  //
    92  // coordinate loops until ctx is cancelled or a fatal error is encountered.
    93  // If a test process terminates unexpectedly while fuzzing, coordinate will
    94  // attempt to restart and continue unless the termination can be attributed
    95  // to an interruption (from a timer or the user).
    96  //
    97  // While looping, coordinate receives inputs from the coordinator, passes
    98  // those inputs to the worker process, then passes the results back to
    99  // the coordinator.
   100  func (w *worker) coordinate(ctx context.Context) error {
   101  	// Main event loop.
   102  	for {
   103  		// Start or restart the worker if it's not running.
   104  		if !w.isRunning() {
   105  			if err := w.startAndPing(ctx); err != nil {
   106  				return err
   107  			}
   108  		}
   109  
   110  		select {
   111  		case <-ctx.Done():
   112  			// Worker was told to stop.
   113  			err := w.stop()
   114  			if err != nil && !w.interrupted && !isInterruptError(err) {
   115  				return err
   116  			}
   117  			return ctx.Err()
   118  
   119  		case <-w.termC:
   120  			// Worker process terminated unexpectedly while waiting for input.
   121  			err := w.stop()
   122  			if w.interrupted {
   123  				panic("worker interrupted after unexpected termination")
   124  			}
   125  			if err == nil || isInterruptError(err) {
   126  				// Worker stopped, either by exiting with status 0 or after being
   127  				// interrupted with a signal that was not sent by the coordinator.
   128  				//
   129  				// When the user presses ^C, on POSIX platforms, SIGINT is delivered to
   130  				// all processes in the group concurrently, and the worker may see it
   131  				// before the coordinator. The worker should exit 0 gracefully (in
   132  				// theory).
   133  				//
   134  				// This condition is probably intended by the user, so suppress
   135  				// the error.
   136  				return nil
   137  			}
   138  			if exitErr, ok := err.(*exec.ExitError); ok && exitErr.ExitCode() == workerExitCode {
   139  				// Worker exited with a code indicating F.Fuzz was not called correctly,
   140  				// for example, F.Fail was called first.
   141  				return fmt.Errorf("fuzzing process exited unexpectedly due to an internal failure: %w", err)
   142  			}
   143  			// Worker exited non-zero or was terminated by a non-interrupt
   144  			// signal (for example, SIGSEGV) while fuzzing.
   145  			return fmt.Errorf("fuzzing process hung or terminated unexpectedly: %w", err)
   146  			// TODO(jayconrod,katiehockman): if -keepfuzzing, restart worker.
   147  
   148  		case input := <-w.coordinator.inputC:
   149  			// Received input from coordinator.
   150  			args := fuzzArgs{
   151  				Limit:        input.limit,
   152  				Timeout:      input.timeout,
   153  				Warmup:       input.warmup,
   154  				CoverageData: input.coverageData,
   155  			}
   156  			entry, resp, isInternalError, err := w.client.fuzz(ctx, input.entry, args)
   157  			canMinimize := true
   158  			if err != nil {
   159  				// Error communicating with worker.
   160  				w.stop()
   161  				if ctx.Err() != nil {
   162  					// Timeout or interruption.
   163  					return ctx.Err()
   164  				}
   165  				if w.interrupted {
   166  					// Communication error before we stopped the worker.
   167  					// Report an error, but don't record a crasher.
   168  					return fmt.Errorf("communicating with fuzzing process: %v", err)
   169  				}
   170  				if sig, ok := terminationSignal(w.waitErr); ok && !isCrashSignal(sig) {
   171  					// Worker terminated by a signal that probably wasn't caused by a
   172  					// specific input to the fuzz function. For example, on Linux,
   173  					// the kernel (OOM killer) may send SIGKILL to a process using a lot
   174  					// of memory. Or the shell might send SIGHUP when the terminal
   175  					// is closed. Don't record a crasher.
   176  					return fmt.Errorf("fuzzing process terminated by unexpected signal; no crash will be recorded: %v", w.waitErr)
   177  				}
   178  				if isInternalError {
   179  					// An internal error occurred which shouldn't be considered
   180  					// a crash.
   181  					return err
   182  				}
   183  				// Unexpected termination. Set error message and fall through.
   184  				// We'll restart the worker on the next iteration.
   185  				// Don't attempt to minimize this since it crashed the worker.
   186  				resp.Err = fmt.Sprintf("fuzzing process hung or terminated unexpectedly: %v", w.waitErr)
   187  				canMinimize = false
   188  			}
   189  			result := fuzzResult{
   190  				limit:         input.limit,
   191  				count:         resp.Count,
   192  				totalDuration: resp.TotalDuration,
   193  				entryDuration: resp.InterestingDuration,
   194  				entry:         entry,
   195  				crasherMsg:    resp.Err,
   196  				coverageData:  resp.CoverageData,
   197  				canMinimize:   canMinimize,
   198  			}
   199  			w.coordinator.resultC <- result
   200  
   201  		case input := <-w.coordinator.minimizeC:
   202  			// Received input to minimize from coordinator.
   203  			result, err := w.minimize(ctx, input)
   204  			if err != nil {
   205  				// Error minimizing. Send back the original input. If it didn't cause
   206  				// an error before, report it as causing an error now.
   207  				// TODO: double-check this is handled correctly when
   208  				// implementing -keepfuzzing.
   209  				result = fuzzResult{
   210  					entry:       input.entry,
   211  					crasherMsg:  input.crasherMsg,
   212  					canMinimize: false,
   213  					limit:       input.limit,
   214  				}
   215  				if result.crasherMsg == "" {
   216  					result.crasherMsg = err.Error()
   217  				}
   218  			}
   219  			w.coordinator.resultC <- result
   220  		}
   221  	}
   222  }
   223  
   224  // minimize tells a worker process to attempt to find a smaller value that
   225  // either causes an error (if we started minimizing because we found an input
   226  // that causes an error) or preserves new coverage (if we started minimizing
   227  // because we found an input that expands coverage).
   228  func (w *worker) minimize(ctx context.Context, input fuzzMinimizeInput) (min fuzzResult, err error) {
   229  	if w.coordinator.opts.MinimizeTimeout != 0 {
   230  		var cancel func()
   231  		ctx, cancel = context.WithTimeout(ctx, w.coordinator.opts.MinimizeTimeout)
   232  		defer cancel()
   233  	}
   234  
   235  	args := minimizeArgs{
   236  		Limit:        input.limit,
   237  		Timeout:      input.timeout,
   238  		KeepCoverage: input.keepCoverage,
   239  	}
   240  	entry, resp, err := w.client.minimize(ctx, input.entry, args)
   241  	if err != nil {
   242  		// Error communicating with worker.
   243  		w.stop()
   244  		if ctx.Err() != nil || w.interrupted || isInterruptError(w.waitErr) {
   245  			// Worker was interrupted, possibly by the user pressing ^C.
   246  			// Normally, workers can handle interrupts and timeouts gracefully and
   247  			// will return without error. An error here indicates the worker
   248  			// may not have been in a good state, but the error won't be meaningful
   249  			// to the user. Just return the original crasher without logging anything.
   250  			return fuzzResult{
   251  				entry:        input.entry,
   252  				crasherMsg:   input.crasherMsg,
   253  				coverageData: input.keepCoverage,
   254  				canMinimize:  false,
   255  				limit:        input.limit,
   256  			}, nil
   257  		}
   258  		return fuzzResult{
   259  			entry:         entry,
   260  			crasherMsg:    fmt.Sprintf("fuzzing process hung or terminated unexpectedly while minimizing: %v", err),
   261  			canMinimize:   false,
   262  			limit:         input.limit,
   263  			count:         resp.Count,
   264  			totalDuration: resp.Duration,
   265  		}, nil
   266  	}
   267  
   268  	if input.crasherMsg != "" && resp.Err == "" {
   269  		return fuzzResult{}, fmt.Errorf("attempted to minimize a crash but could not reproduce")
   270  	}
   271  
   272  	return fuzzResult{
   273  		entry:         entry,
   274  		crasherMsg:    resp.Err,
   275  		coverageData:  resp.CoverageData,
   276  		canMinimize:   false,
   277  		limit:         input.limit,
   278  		count:         resp.Count,
   279  		totalDuration: resp.Duration,
   280  	}, nil
   281  }
   282  
   283  func (w *worker) isRunning() bool {
   284  	return w.cmd != nil
   285  }
   286  
   287  // startAndPing starts the worker process and sends it a message to make sure it
   288  // can communicate.
   289  //
   290  // startAndPing returns an error if any part of this didn't work, including if
   291  // the context has expired or the worker process was interrupted before it

   292  // responded. Errors that happen after start but before the ping response
   293  // likely indicate that the worker did not call F.Fuzz or called F.Fail first.
   294  // We don't record crashers for these errors.
   295  func (w *worker) startAndPing(ctx context.Context) error {
   296  	if ctx.Err() != nil {
   297  		return ctx.Err()
   298  	}
   299  	if err := w.start(); err != nil {
   300  		return err
   301  	}
   302  	if err := w.client.ping(ctx); err != nil {
   303  		w.stop()
   304  		if ctx.Err() != nil {
   305  			return ctx.Err()
   306  		}
   307  		if isInterruptError(err) {
   308  			// User may have pressed ^C before worker responded.
   309  			return err
   310  		}
   311  		// TODO: record and return stderr.
   312  		return fmt.Errorf("fuzzing process terminated without fuzzing: %w", err)
   313  	}
   314  	return nil
   315  }
   316  
   317  // start runs a new worker process.
   318  //
   319  // If the process couldn't be started, start returns an error. Start won't
   320  // return later termination errors from the process if they occur.
   321  //
   322  // If the process starts successfully, start returns nil. stop must be called
   323  // once later to clean up, even if the process terminates on its own.
   324  //
   325  // When the process terminates, w.waitErr is set to the error (if any), and
   326  // w.termC is closed.
   327  func (w *worker) start() (err error) {
   328  	if w.isRunning() {
   329  		panic("worker already started")
   330  	}
   331  	w.waitErr = nil
   332  	w.interrupted = false
   333  	w.termC = nil
   334  
   335  	cmd := exec.Command(w.binPath, w.args...)
   336  	cmd.Dir = w.dir
   337  	cmd.Env = w.env[:len(w.env):len(w.env)] // copy on append to ensure workers don't overwrite each other.
   338  
   339  	// Create the "fuzz_in" and "fuzz_out" pipes so we can communicate with
   340  	// the worker. We don't use stdin and stdout, since the test binary may
   341  	// do something else with those.
   342  	//
   343  	// Each pipe has a reader and a writer. The coordinator writes to fuzzInW
   344  	// and reads from fuzzOutR. The worker inherits fuzzInR and fuzzOutW.
   345  	// The coordinator closes fuzzInR and fuzzOutW after starting the worker,
   346  	// since we have no further need of them.
   347  	fuzzInR, fuzzInW, err := os.Pipe()
   348  	if err != nil {
   349  		return err
   350  	}
   351  	defer fuzzInR.Close()
   352  	fuzzOutR, fuzzOutW, err := os.Pipe()
   353  	if err != nil {
   354  		fuzzInW.Close()
   355  		return err
   356  	}
   357  	defer fuzzOutW.Close()
   358  	setWorkerComm(cmd, workerComm{fuzzIn: fuzzInR, fuzzOut: fuzzOutW, memMu: w.memMu})
   359  
   360  	// Start the worker process.
   361  	if err := cmd.Start(); err != nil {
   362  		fuzzInW.Close()
   363  		fuzzOutR.Close()
   364  		return err
   365  	}
   366  
   367  	// Worker started successfully.
   368  	// After this, w.client owns fuzzInW and fuzzOutR, so w.client.Close must be
   369  	// called later by stop.
   370  	w.cmd = cmd
   371  	w.termC = make(chan struct{})
   372  	comm := workerComm{fuzzIn: fuzzInW, fuzzOut: fuzzOutR, memMu: w.memMu}
   373  	m := newMutator()
   374  	w.client = newWorkerClient(comm, m)
   375  
   376  	go func() {
   377  		w.waitErr = w.cmd.Wait()
   378  		close(w.termC)
   379  	}()
   380  
   381  	return nil
   382  }
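        // The pipe plumbing in start follows the usual parent/child pattern:
        // create both pipes, hand the child its ends, then close those ends in
        // the parent so EOF propagates once the child exits. The same shape in
        // miniature (hedged sketch; "cat" stands in for the worker binary, and
        // stdin/stdout stand in for the extra descriptors set up by
        // setWorkerComm):
        //
        //	inR, inW, _ := os.Pipe()
        //	outR, outW, _ := os.Pipe()
        //	cmd := exec.Command("cat")
        //	cmd.Stdin, cmd.Stdout = inR, outW
        //	cmd.Start()
        //	inR.Close()  // parent keeps only the write end of in...
        //	outW.Close() // ...and the read end of out
        //	fmt.Fprintln(inW, "hello")
        //	inW.Close()
        //	io.Copy(os.Stdout, outR) // prints "hello", then EOF when cat exits
        //	cmd.Wait()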
   383  
   384  // stop tells the worker process to exit by closing w.client, then blocks until
   385  // it terminates. If the worker doesn't terminate after a short time, stop
   386  // signals it with os.Interrupt (where supported), then os.Kill.
   387  //
   388  // stop returns the error the process terminated with, if any (same as
   389  // w.waitErr).
   390  //
   391  // stop must be called at least once after start returns successfully, even if
   392  // the worker process terminates unexpectedly.
   393  func (w *worker) stop() error {
   394  	if w.termC == nil {
   395  		panic("worker was not started successfully")
   396  	}
   397  	select {
   398  	case <-w.termC:
   399  		// Worker already terminated.
   400  		if w.client == nil {
   401  			// stop already called.
   402  			return w.waitErr
   403  		}
   404  		// Possible unexpected termination.
   405  		w.client.Close()
   406  		w.cmd = nil
   407  		w.client = nil
   408  		return w.waitErr
   409  	default:
   410  		// Worker still running.
   411  	}
   412  
   413  	// Tell the worker to stop by closing fuzz_in. It won't actually stop until it
   414  	// finishes with earlier calls.
   415  	closeC := make(chan struct{})
   416  	go func() {
   417  		w.client.Close()
   418  		close(closeC)
   419  	}()
   420  
   421  	sig := os.Interrupt
   422  	if runtime.GOOS == "windows" {
   423  		// Per https://golang.org/pkg/os/#Signal, “Interrupt is not implemented on
   424  		// Windows; using it with os.Process.Signal will return an error.”
   425  		// Fall back to Kill instead.
   426  		sig = os.Kill
   427  	}
   428  
   429  	t := time.NewTimer(workerTimeoutDuration)
   430  	for {
   431  		select {
   432  		case <-w.termC:
   433  			// Worker terminated.
   434  			t.Stop()
   435  			<-closeC
   436  			w.cmd = nil
   437  			w.client = nil
   438  			return w.waitErr
   439  
   440  		case <-t.C:
   441  			// Timer fired before worker terminated.
   442  			w.interrupted = true
   443  			switch sig {
   444  			case os.Interrupt:
   445  				// Try to stop the worker with SIGINT and wait a little longer.
   446  				w.cmd.Process.Signal(sig)
   447  				sig = os.Kill
   448  				t.Reset(workerTimeoutDuration)
   449  
   450  			case os.Kill:
   451  				// Try to stop the worker with SIGKILL and keep waiting.
   452  				w.cmd.Process.Signal(sig)
   453  				sig = nil
   454  				t.Reset(workerTimeoutDuration)
   455  
   456  			case nil:
   457  				// Still waiting. Print a message to let the user know why.
   458  				fmt.Fprintf(w.coordinator.opts.Log, "waiting for fuzzing process to terminate...\n")
   459  			}
   460  		}
   461  	}
   462  }
   463  
   464  // RunFuzzWorker is called in a worker process to communicate with the
   465  // coordinator process in order to fuzz random inputs. RunFuzzWorker loops
   466  // until the coordinator tells it to stop.
   467  //
   468  // fn is a wrapper on the fuzz function. It may return an error to indicate
   469  // a given input "crashed". The coordinator will also record a crasher if
   470  // the function times out or terminates the process.
   471  //
   472  // RunFuzzWorker returns an error if it could not communicate with the
   473  // coordinator process.
   474  func RunFuzzWorker(ctx context.Context, fn func(CorpusEntry) error) error {
   475  	comm, err := getWorkerComm()
   476  	if err != nil {
   477  		return err
   478  	}
   479  	srv := &workerServer{
   480  		workerComm: comm,
   481  		fuzzFn: func(e CorpusEntry) (time.Duration, error) {
   482  			timer := time.AfterFunc(10*time.Second, func() {
   483  				panic("deadlocked!") // this error message won't be printed
   484  			})
   485  			defer timer.Stop()
   486  			start := time.Now()
   487  			err := fn(e)
   488  			return time.Since(start), err
   489  		},
   490  		m: newMutator(),
   491  	}
   492  	return srv.serve(ctx)
   493  }
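        // The fuzzFn wrapper above arms a watchdog with time.AfterFunc: if the
        // fuzz function does not return within 10 seconds, the timer fires on
        // its own goroutine and panics, crashing the worker so the coordinator
        // records a hang rather than waiting forever. The idiom in isolation
        // (illustrative sketch):
        //
        //	func runWithWatchdog(d time.Duration, fn func()) {
        //		watchdog := time.AfterFunc(d, func() {
        //			panic("deadlocked!") // only fires if fn overruns d
        //		})
        //		defer watchdog.Stop() // disarm on normal return
        //		fn()
        //	}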
   494  
   495  // call is serialized and sent from the coordinator on fuzz_in. It acts as
   496  // a minimalist RPC mechanism. Exactly one of its fields must be set to indicate
   497  // which method to call.
   498  type call struct {
   499  	Ping     *pingArgs
   500  	Fuzz     *fuzzArgs
   501  	Minimize *minimizeArgs
   502  }
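        // On the wire a call is plain JSON produced by encoding/json, and the
        // server dispatches on whichever field is non-null. For example
        // (illustrative):
        //
        //	b, _ := json.Marshal(call{Ping: &pingArgs{}})
        //	// string(b) == `{"Ping":{},"Fuzz":null,"Minimize":null}`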
   503  
   504  // minimizeArgs contains arguments to workerServer.minimize. The value to
   505  // minimize is already in shared memory.
   506  type minimizeArgs struct {
   507  	// Timeout is the time to spend minimizing. This may include time to start up,
   508  	// especially if the input causes the worker process to terminate, requiring
   509  	// repeated restarts.
   510  	Timeout time.Duration
   511  
   512  	// Limit is the maximum number of values to test, without spending more time
   513  	// than Timeout. 0 indicates no limit.
   514  	Limit int64
   515  
   516  	// KeepCoverage is a set of coverage counters the worker should attempt to
   517  	// keep in minimized values. When provided, the worker will reject inputs that
   518  	// don't cause at least one of these bits to be set.
   519  	KeepCoverage []byte
   520  
   521  	// Index is the index of the fuzz target parameter to be minimized.
   522  	Index int
   523  }
   524  
   525  // minimizeResponse contains results from workerServer.minimize.
   526  type minimizeResponse struct {
   527  	// WroteToMem is true if the worker found a smaller input and wrote it to
   528  	// shared memory. If minimizeArgs.KeepCoverage was set, the minimized input
   529  	// preserved at least one coverage bit and did not cause an error.
   530  	// Otherwise, the minimized input caused some error, recorded in Err.
   531  	WroteToMem bool
   532  
   533  	// Err is the error string caused by the value in shared memory, if any.
   534  	Err string
   535  
   536  	// CoverageData is the set of coverage bits activated by the minimized value
   537  	// in shared memory. When set, it contains at least one bit from KeepCoverage.
   538  	// CoverageData will be nil if Err is set or if minimization failed.
   539  	CoverageData []byte
   540  
   541  	// Duration is the time spent minimizing, not including starting or cleaning up.
   542  	Duration time.Duration
   543  
   544  	// Count is the number of values tested.
   545  	Count int64
   546  }
   547  
   548  // fuzzArgs contains arguments to workerServer.fuzz. The value to fuzz is
   549  // passed in shared memory.
   550  type fuzzArgs struct {
   551  	// Timeout is the time to spend fuzzing, not including starting or
   552  	// cleaning up.
   553  	Timeout time.Duration
   554  
   555  	// Limit is the maximum number of values to test, without spending more time
   556  	// than Timeout. 0 indicates no limit.
   557  	Limit int64
   558  
   559  	// Warmup indicates whether this is part of a warmup run, meaning that
   560  	// fuzzing should not occur. If coverageEnabled is true, then coverage data
   561  	// should be reported.
   562  	Warmup bool
   563  
   564  	// CoverageData is the coverage data. If set, the worker should update its
   565  	// local coverage data prior to fuzzing.
   566  	CoverageData []byte
   567  }
   568  
   569  // fuzzResponse contains results from workerServer.fuzz.
   570  type fuzzResponse struct {
   571  	// TotalDuration is the time spent fuzzing, not including starting or cleaning up; InterestingDuration is the time spent on the interesting input.
   572  	TotalDuration       time.Duration
   573  	InterestingDuration time.Duration
   574  
   575  	// Count is the number of values tested.
   576  	Count int64
   577  
   578  	// CoverageData is set if the value in shared memory expands coverage
   579  	// and therefore may be interesting to the coordinator.
   580  	CoverageData []byte
   581  
   582  	// Err is the error string caused by the value in shared memory, which is
   583  	// non-empty if the value in shared memory caused a crash.
   584  	Err string
   585  
   586  	// InternalErr is the error string caused by an internal error in the
   587  	// worker. This shouldn't be considered a crasher.
   588  	InternalErr string
   589  }
   590  
   591  // pingArgs contains arguments to workerServer.ping.
   592  type pingArgs struct{}
   593  
   594  // pingResponse contains results from workerServer.ping.
   595  type pingResponse struct{}
   596  
   597  // workerComm holds pipes and shared memory used for communication
   598  // between the coordinator process (client) and a worker process (server).
   599  // These values are unique to each worker; they are shared only with the
   600  // coordinator, not with other workers.
   601  //
   602  // Access to shared memory is synchronized implicitly over the RPC protocol
   603  // implemented in workerServer and workerClient. During a call, the server
   604  // (the worker) has exclusive access to shared memory; at other times, the
   605  // client (the coordinator) has exclusive access.
   606  type workerComm struct {
   607  	fuzzIn, fuzzOut *os.File
   608  	memMu           chan *sharedMem // mutex guarding shared memory
   609  }
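        // Concretely, the client side of a fuzz call interleaves the channel
        // mutex with the RPC like this (compare workerClient.fuzz below; sketch
        // only):
        //
        //	mem := <-wc.memMu // take the lock
        //	mem.setValue(inp) // stage the input for the worker
        //	wc.memMu <- mem   // hand shared memory to the worker...
        //	err := wc.callLocked(ctx, call{Fuzz: &args}, &resp)
        //	mem = <-wc.memMu  // ...and reclaim it after the call returns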
   610  
   611  // workerServer is a minimalist RPC server, run by fuzz worker processes.
   612  // It allows the coordinator process (using workerClient) to call methods in a
   613  // worker process. This system allows the coordinator to run multiple worker
   614  // processes in parallel and to collect inputs that caused crashes from shared
   615  // memory after a worker process terminates unexpectedly.
   616  type workerServer struct {
   617  	workerComm
   618  	m *mutator
   619  
   620  	// coverageMask is the local coverage data for the worker. It is
   621  	// periodically updated to reflect the data in the coordinator when new
   622  	// coverage is found.
   623  	coverageMask []byte
   624  
   625  	// fuzzFn runs the worker's fuzz target on the given input and returns an
   626  	// error if it finds a crasher (the process may also exit or crash), and the
   627  	// time it took to run the input. It sets a deadline of 10 seconds, at which
   628  	// point it will panic with the assumption that the process is hanging or
   629  	// deadlocked.
   630  	fuzzFn func(CorpusEntry) (time.Duration, error)
   631  }
   632  
   633  // serve reads serialized RPC messages on fuzzIn. When serve receives a message,
   634  // it calls the corresponding method, then sends the serialized result back
   635  // on fuzzOut.
   636  //
   637  // serve handles RPC calls synchronously; it will not attempt to read a message
   638  // until the previous call has finished.
   639  //
   640  // serve returns errors that occurred when communicating over pipes. serve
   641  // does not return errors from method calls; those are passed through serialized
   642  // responses.
   643  func (ws *workerServer) serve(ctx context.Context) error {
   644  	enc := json.NewEncoder(ws.fuzzOut)
   645  	dec := json.NewDecoder(&contextReader{ctx: ctx, r: ws.fuzzIn})
   646  	for {
   647  		var c call
   648  		if err := dec.Decode(&c); err != nil {
   649  			if err == io.EOF || err == ctx.Err() {
   650  				return nil
   651  			} else {
   652  				return err
   653  			}
   654  		}
   655  
   656  		var resp any
   657  		switch {
   658  		case c.Fuzz != nil:
   659  			resp = ws.fuzz(ctx, *c.Fuzz)
   660  		case c.Minimize != nil:
   661  			resp = ws.minimize(ctx, *c.Minimize)
   662  		case c.Ping != nil:
   663  			resp = ws.ping(ctx, *c.Ping)
   664  		default:
   665  			return errors.New("no arguments provided for any call")
   666  		}
   667  
   668  		if err := enc.Encode(resp); err != nil {
   669  			return err
   670  		}
   671  	}
   672  }
   673  
   674  // chainedMutations is how many mutations are applied before the worker
   675  // resets the input to its original state.
   676  // NOTE: this number was picked without much thought. It is low enough that
   677  // it seems to create significant diversity in mutated inputs. We may want
   678  // to consider looking into this more closely once we have a proper performance
   679  // testing framework. Another option is to randomly pick the number of chained
   680  // mutations on each invocation of the workerServer.fuzz method (this appears to
   681  // be what libFuzzer does, although there seems to be no documentation which
   682  // explains why this choice was made.)
   683  const chainedMutations = 5
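        // Chaining also keeps crashers reproducible: the PRNG state is saved in
        // shared memory whenever the chain resets, so the coordinator can
        // replay the exact mutations from (randState, randInc, count) alone.
        // workerClient.fuzz does exactly this (locals renamed for brevity):
        //
        //	numMutations := ((count - 1) % chainedMutations) + 1
        //	for i := int64(0); i < numMutations; i++ {
        //		m.mutate(vals, maxLen)
        //	}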
   684  
   685  // fuzz runs the test function on random variations of the input value in shared
   686  // memory for a limited duration or number of iterations.
   687  //
   688  // fuzz returns early if it finds an input that crashes the fuzz function (with
   689  // fuzzResponse.Err set) or an input that expands coverage (with
   690  // fuzzResponse.InterestingDuration set).
   691  //
   692  // fuzz does not modify the input in shared memory. Instead, it saves the
   693  // initial PRNG state in shared memory and increments a counter in shared
   694  // memory before each call to the test function. The caller may reconstruct
   695  // the crashing input with this information, since the PRNG is deterministic.
   696  func (ws *workerServer) fuzz(ctx context.Context, args fuzzArgs) (resp fuzzResponse) {
   697  	if args.CoverageData != nil {
   698  		if ws.coverageMask != nil && len(args.CoverageData) != len(ws.coverageMask) {
   699  			resp.InternalErr = fmt.Sprintf("unexpected size for CoverageData: got %d, expected %d", len(args.CoverageData), len(ws.coverageMask))
   700  			return resp
   701  		}
   702  		ws.coverageMask = args.CoverageData
   703  	}
   704  	start := time.Now()
   705  	defer func() { resp.TotalDuration = time.Since(start) }()
   706  
   707  	if args.Timeout != 0 {
   708  		var cancel func()
   709  		ctx, cancel = context.WithTimeout(ctx, args.Timeout)
   710  		defer cancel()
   711  	}
   712  	mem := <-ws.memMu
   713  	ws.m.r.save(&mem.header().randState, &mem.header().randInc)
   714  	defer func() {
   715  		resp.Count = mem.header().count
   716  		ws.memMu <- mem
   717  	}()
   718  	if args.Limit > 0 && mem.header().count >= args.Limit {
   719  		resp.InternalErr = fmt.Sprintf("mem.header().count %d already exceeds args.Limit %d", mem.header().count, args.Limit)
   720  		return resp
   721  	}
   722  
   723  	originalVals, err := unmarshalCorpusFile(mem.valueCopy())
   724  	if err != nil {
   725  		resp.InternalErr = err.Error()
   726  		return resp
   727  	}
   728  	vals := make([]any, len(originalVals))
   729  	copy(vals, originalVals)
   730  
   731  	shouldStop := func() bool {
   732  		return args.Limit > 0 && mem.header().count >= args.Limit
   733  	}
   734  	fuzzOnce := func(entry CorpusEntry) (dur time.Duration, cov []byte, errMsg string) {
   735  		mem.header().count++
   736  		var err error
   737  		dur, err = ws.fuzzFn(entry)
   738  		if err != nil {
   739  			errMsg = err.Error()
   740  			if errMsg == "" {
   741  				errMsg = "fuzz function failed with no error message"
   742  			}
   743  			return dur, nil, errMsg
   744  		}
   745  		if ws.coverageMask != nil && countNewCoverageBits(ws.coverageMask, coverageSnapshot) > 0 {
   746  			return dur, coverageSnapshot, ""
   747  		}
   748  		return dur, nil, ""
   749  	}
   750  
   751  	if args.Warmup {
   752  		dur, _, errMsg := fuzzOnce(CorpusEntry{Values: vals})
   753  		if errMsg != "" {
   754  			resp.Err = errMsg
   755  			return resp
   756  		}
   757  		resp.InterestingDuration = dur
   758  		if coverageEnabled {
   759  			resp.CoverageData = coverageSnapshot
   760  		}
   761  		return resp
   762  	}
   763  
   764  	for {
   765  		select {
   766  		case <-ctx.Done():
   767  			return resp
   768  		default:
   769  			if mem.header().count%chainedMutations == 0 {
   770  				copy(vals, originalVals)
   771  				ws.m.r.save(&mem.header().randState, &mem.header().randInc)
   772  			}
   773  			ws.m.mutate(vals, cap(mem.valueRef()))
   774  
   775  			entry := CorpusEntry{Values: vals}
   776  			dur, cov, errMsg := fuzzOnce(entry)
   777  			if errMsg != "" {
   778  				resp.Err = errMsg
   779  				return resp
   780  			}
   781  			if cov != nil {
   782  				resp.CoverageData = cov
   783  				resp.InterestingDuration = dur
   784  				return resp
   785  			}
   786  			if shouldStop() {
   787  				return resp
   788  			}
   789  		}
   790  	}
   791  }
   792  
   793  func (ws *workerServer) minimize(ctx context.Context, args minimizeArgs) (resp minimizeResponse) {
   794  	start := time.Now()
   795  	defer func() { resp.Duration = time.Since(start) }()
   796  	mem := <-ws.memMu
   797  	defer func() { ws.memMu <- mem }()
   798  	vals, err := unmarshalCorpusFile(mem.valueCopy())
   799  	if err != nil {
   800  		panic(err)
   801  	}
   802  	inpHash := sha256.Sum256(mem.valueCopy())
   803  	if args.Timeout != 0 {
   804  		var cancel func()
   805  		ctx, cancel = context.WithTimeout(ctx, args.Timeout)
   806  		defer cancel()
   807  	}
   808  
   809  	// Minimize the values in vals, then write to shared memory. We only write
   810  	// to shared memory after completing minimization.
   811  	success, err := ws.minimizeInput(ctx, vals, mem, args)
   812  	if success {
   813  		writeToMem(vals, mem)
   814  		outHash := sha256.Sum256(mem.valueCopy())
   815  		mem.header().rawInMem = false
   816  		resp.WroteToMem = true
   817  		if err != nil {
   818  			resp.Err = err.Error()
   819  		} else {
   820  			// If the values didn't change during minimization then coverageSnapshot is likely
   821  			// a dirty snapshot which represents the very last step of minimization, not the
   822  			// coverage for the initial input. In that case just return the coverage we were
   823  			// given initially, since it more accurately represents the coverage map for the
   824  			// input we are returning.
   825  			if outHash != inpHash {
   826  				resp.CoverageData = coverageSnapshot
   827  			} else {
   828  				resp.CoverageData = args.KeepCoverage
   829  			}
   830  		}
   831  	}
   832  	return resp
   833  }
   834  
   835  // minimizeInput applies a series of minimizing transformations on the provided
   836  // vals, ensuring that each minimization still causes an error, or keeps
   837  // coverage, in fuzzFn. It uses the context to determine how long to run,
   838  // stopping once the context is cancelled. It returns a bool indicating
   839  // whether minimization was successful and an error if one was found.
   840  func (ws *workerServer) minimizeInput(ctx context.Context, vals []any, mem *sharedMem, args minimizeArgs) (success bool, retErr error) {
   841  	keepCoverage := args.KeepCoverage
   842  	memBytes := mem.valueRef()
   843  	bPtr := &memBytes
   844  	count := &mem.header().count
   845  	shouldStop := func() bool {
   846  		return ctx.Err() != nil ||
   847  			(args.Limit > 0 && *count >= args.Limit)
   848  	}
   849  	if shouldStop() {
   850  		return false, nil
   851  	}
   852  
   853  	// Check that the original value preserves coverage or causes an error.
   854  	// If not, then whatever caused us to think the value was interesting may
   855  	// have been a flake, and we can't minimize it.
   856  	*count++
   857  	_, retErr = ws.fuzzFn(CorpusEntry{Values: vals})
   858  	if keepCoverage != nil {
   859  		if !hasCoverageBit(keepCoverage, coverageSnapshot) || retErr != nil {
   860  			return false, nil
   861  		}
   862  	} else if retErr == nil {
   863  		return false, nil
   864  	}
   865  	mem.header().rawInMem = true
   866  
   867  	// tryMinimized runs the fuzz function with candidate replacing the value
   868  	// at index args.Index. tryMinimized returns whether the input with
   869  	// candidate is interesting for the same reason as the original input: it
   870  	// causes an error if one was expected, or it preserves coverage.
   871  	tryMinimized := func(candidate []byte) bool {
   872  		prev := vals[args.Index]
   873  		switch prev.(type) {
   874  		case []byte:
   875  			vals[args.Index] = candidate
   876  		case string:
   877  			vals[args.Index] = string(candidate)
   878  		default:
   879  			panic("impossible")
   880  		}
   881  		copy(*bPtr, candidate)
   882  		*bPtr = (*bPtr)[:len(candidate)]
   883  		mem.setValueLen(len(candidate))
   884  		*count++
   885  		_, err := ws.fuzzFn(CorpusEntry{Values: vals})
   886  		if err != nil {
   887  			retErr = err
   888  			if keepCoverage != nil {
   889  				// Now that we've found a crash, that's more important than any
   890  				// minimization of interesting inputs that was being done. Clear out
   891  				// keepCoverage to only minimize the crash going forward.
   892  				keepCoverage = nil
   893  			}
   894  			return true
   895  		}
   896  		// Minimization should preserve coverage bits.
   897  		if keepCoverage != nil && isCoverageSubset(keepCoverage, coverageSnapshot) {
   898  			return true
   899  		}
   900  		vals[args.Index] = prev
   901  		return false
   902  	}
   903  	switch v := vals[args.Index].(type) {
   904  	case string:
   905  		minimizeBytes([]byte(v), tryMinimized, shouldStop)
   906  	case []byte:
   907  		minimizeBytes(v, tryMinimized, shouldStop)
   908  	default:
   909  		panic("impossible")
   910  	}
   911  	return true, retErr
   912  }
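        // minimizeBytes (defined elsewhere in this package) feeds progressively
        // smaller candidates to tryMinimized until shouldStop reports true. The
        // general flavor of such a search, as a hedged sketch rather than the
        // actual algorithm:
        //
        //	for chunk := len(v) / 2; chunk > 0; chunk /= 2 {
        //		for i := 0; i+chunk <= len(v) && !stop(); {
        //			cand := append(append([]byte{}, v[:i]...), v[i+chunk:]...)
        //			if try(cand) {
        //				v = cand // still interesting: keep the cut
        //			} else {
        //				i += chunk // not interesting: restore and move on
        //			}
        //		}
        //	}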
   913  
   914  func writeToMem(vals []any, mem *sharedMem) {
   915  	b := marshalCorpusFile(vals...)
   916  	mem.setValue(b)
   917  }
   918  
   919  // ping does nothing. The coordinator calls this method to ensure the worker
   920  // has called F.Fuzz and can communicate.
   921  func (ws *workerServer) ping(ctx context.Context, args pingArgs) pingResponse {
   922  	return pingResponse{}
   923  }
   924  
   925  // workerClient is a minimalist RPC client. The coordinator process uses a
   926  // workerClient to call methods in each worker process (handled by
   927  // workerServer).
   928  type workerClient struct {
   929  	workerComm
   930  	m *mutator
   931  
   932  	// mu is the mutex protecting the workerComm.fuzzIn pipe. This must be
   933  	// locked before making calls to the workerServer. It prevents
   934  	// workerClient.Close from closing fuzzIn while workerClient methods are
   935  	// writing to it concurrently, and prevents multiple callers from writing to
   936  	// fuzzIn concurrently.
   937  	mu sync.Mutex
   938  }
   939  
   940  func newWorkerClient(comm workerComm, m *mutator) *workerClient {
   941  	return &workerClient{workerComm: comm, m: m}
   942  }
   943  
   944  // Close shuts down the connection to the RPC server (the worker process) by
   945  // closing fuzz_in. Close drains fuzz_out (avoiding a SIGPIPE in the worker),
   946  // and closes it after the worker process closes the other end.
   947  func (wc *workerClient) Close() error {
   948  	wc.mu.Lock()
   949  	defer wc.mu.Unlock()
   950  
   951  	// Close fuzzIn. This signals to the server that there are no more calls,
   952  	// and it should exit.
   953  	if err := wc.fuzzIn.Close(); err != nil {
   954  		wc.fuzzOut.Close()
   955  		return err
   956  	}
   957  
   958  	// Drain fuzzOut and close it. When the server exits, the kernel will close
   959  	// its end of fuzzOut, and we'll get EOF.
   960  	if _, err := io.Copy(io.Discard, wc.fuzzOut); err != nil {
   961  		wc.fuzzOut.Close()
   962  		return err
   963  	}
   964  	return wc.fuzzOut.Close()
   965  }
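        // Draining before closing matters here: if the coordinator closed
        // fuzzOut while the worker was still writing a response, the worker
        // could take SIGPIPE. Reading to EOF first lets it finish (sketch):
        //
        //	io.Copy(io.Discard, r) // consume until the writer closes its end
        //	r.Close()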
   966  
   967  // errSharedMemClosed is returned by workerClient methods that cannot access
   968  // shared memory because it was closed and unmapped by another goroutine. That
   969  // can happen when worker.cleanup is called in the worker goroutine while a
   970  // workerClient.fuzz call runs concurrently.
   971  //
   972  // This error should not be reported. It indicates the operation was
   973  // interrupted.
   974  var errSharedMemClosed = errors.New("internal error: shared memory was closed and unmapped")
   975  
   976  // minimize tells the worker to call the minimize method. See
   977  // workerServer.minimize.
   978  func (wc *workerClient) minimize(ctx context.Context, entryIn CorpusEntry, args minimizeArgs) (entryOut CorpusEntry, resp minimizeResponse, retErr error) {
   979  	wc.mu.Lock()
   980  	defer wc.mu.Unlock()
   981  
   982  	mem, ok := <-wc.memMu
   983  	if !ok {
   984  		return CorpusEntry{}, minimizeResponse{}, errSharedMemClosed
   985  	}
   986  	mem.header().count = 0
   987  	inp, err := corpusEntryData(entryIn)
   988  	if err != nil {
   989  		return CorpusEntry{}, minimizeResponse{}, err
   990  	}
   991  	mem.setValue(inp)
   992  	defer func() { wc.memMu <- mem }()
   993  	entryOut = entryIn
   994  	entryOut.Values, err = unmarshalCorpusFile(inp)
   995  	if err != nil {
   996  		return CorpusEntry{}, minimizeResponse{}, fmt.Errorf("workerClient.minimize unmarshaling provided value: %v", err)
   997  	}
   998  	for i, v := range entryOut.Values {
   999  		if !isMinimizable(reflect.TypeOf(v)) {
  1000  			continue
  1001  		}
  1002  
  1003  		wc.memMu <- mem
  1004  		args.Index = i
  1005  		c := call{Minimize: &args}
  1006  		callErr := wc.callLocked(ctx, c, &resp)
  1007  		mem, ok = <-wc.memMu
  1008  		if !ok {
  1009  			return CorpusEntry{}, minimizeResponse{}, errSharedMemClosed
  1010  		}
  1011  
  1012  		if callErr != nil {
  1013  			retErr = callErr
  1014  			if !mem.header().rawInMem {
  1015  				// An unrecoverable error occurred before minimization began.
  1016  				return entryIn, minimizeResponse{}, retErr
  1017  			}
  1018  			// An unrecoverable error occurred during minimization. mem now
  1019  			// holds the raw, unmarshalled bytes of entryIn.Values[i] that
  1020  			// caused the error.
  1021  			switch entryOut.Values[i].(type) {
  1022  			case string:
  1023  				entryOut.Values[i] = string(mem.valueCopy())
  1024  			case []byte:
  1025  				entryOut.Values[i] = mem.valueCopy()
  1026  			default:
  1027  				panic("impossible")
  1028  			}
  1029  			entryOut.Data = marshalCorpusFile(entryOut.Values...)
  1030  			// Stop minimizing; another unrecoverable error is likely to occur.
  1031  			break
  1032  		}
  1033  
  1034  		if resp.WroteToMem {
  1035  			// Minimization succeeded, and mem holds the marshaled data.
  1036  			entryOut.Data = mem.valueCopy()
  1037  			entryOut.Values, err = unmarshalCorpusFile(entryOut.Data)
  1038  			if err != nil {
  1039  				return CorpusEntry{}, minimizeResponse{}, fmt.Errorf("workerClient.minimize unmarshaling minimized value: %v", err)
  1040  			}
  1041  		}
  1042  
  1043  		// Prepare for next iteration of the loop.
  1044  		if args.Timeout != 0 {
  1045  			args.Timeout -= resp.Duration
  1046  			if args.Timeout <= 0 {
  1047  				break
  1048  			}
  1049  		}
  1050  		if args.Limit != 0 {
  1051  			args.Limit -= mem.header().count
  1052  			if args.Limit <= 0 {
  1053  				break
  1054  			}
  1055  		}
  1056  	}
  1057  	resp.Count = mem.header().count
  1058  	h := sha256.Sum256(entryOut.Data)
  1059  	entryOut.Path = fmt.Sprintf("%x", h[:4])
  1060  	return entryOut, resp, retErr
  1061  }
  1062  
  1063  // fuzz tells the worker to call the fuzz method. See workerServer.fuzz.
  1064  func (wc *workerClient) fuzz(ctx context.Context, entryIn CorpusEntry, args fuzzArgs) (entryOut CorpusEntry, resp fuzzResponse, isInternalError bool, err error) {
  1065  	wc.mu.Lock()
  1066  	defer wc.mu.Unlock()
  1067  
  1068  	mem, ok := <-wc.memMu
  1069  	if !ok {
  1070  		return CorpusEntry{}, fuzzResponse{}, true, errSharedMemClosed
  1071  	}
  1072  	mem.header().count = 0
  1073  	inp, err := corpusEntryData(entryIn)
  1074  	if err != nil {
  1075  		return CorpusEntry{}, fuzzResponse{}, true, err
  1076  	}
  1077  	mem.setValue(inp)
  1078  	wc.memMu <- mem
  1079  
  1080  	c := call{Fuzz: &args}
  1081  	callErr := wc.callLocked(ctx, c, &resp)
  1082  	if resp.InternalErr != "" {
  1083  		return CorpusEntry{}, fuzzResponse{}, true, errors.New(resp.InternalErr)
  1084  	}
  1085  	mem, ok = <-wc.memMu
  1086  	if !ok {
  1087  		return CorpusEntry{}, fuzzResponse{}, true, errSharedMemClosed
  1088  	}
  1089  	defer func() { wc.memMu <- mem }()
  1090  	resp.Count = mem.header().count
  1091  
  1092  	if !bytes.Equal(inp, mem.valueRef()) {
  1093  		return CorpusEntry{}, fuzzResponse{}, true, errors.New("workerServer.fuzz modified input")
  1094  	}
  1095  	needEntryOut := callErr != nil || resp.Err != "" ||
  1096  		(!args.Warmup && resp.CoverageData != nil)
  1097  	if needEntryOut {
  1098  		valuesOut, err := unmarshalCorpusFile(inp)
  1099  		if err != nil {
  1100  			return CorpusEntry{}, fuzzResponse{}, true, fmt.Errorf("unmarshaling fuzz input value after call: %v", err)
  1101  		}
  1102  		wc.m.r.restore(mem.header().randState, mem.header().randInc)
  1103  		if !args.Warmup {
  1104  			// Only mutate the valuesOut if fuzzing actually occurred.
  1105  			numMutations := ((resp.Count - 1) % chainedMutations) + 1
  1106  			for i := int64(0); i < numMutations; i++ {
  1107  				wc.m.mutate(valuesOut, cap(mem.valueRef()))
  1108  			}
  1109  		}
  1110  		dataOut := marshalCorpusFile(valuesOut...)
  1111  
  1112  		h := sha256.Sum256(dataOut)
  1113  		name := fmt.Sprintf("%x", h[:4])
  1114  		entryOut = CorpusEntry{
  1115  			Parent:     entryIn.Path,
  1116  			Path:       name,
  1117  			Data:       dataOut,
  1118  			Generation: entryIn.Generation + 1,
  1119  		}
  1120  		if args.Warmup {
  1121  			// The bytes weren't mutated, so if entryIn was a seed corpus value,
  1122  			// then entryOut is too.
  1123  			entryOut.IsSeed = entryIn.IsSeed
  1124  		}
  1125  	}
  1126  
  1127  	return entryOut, resp, false, callErr
  1128  }
  1129  
  1130  // ping tells the worker to call the ping method. See workerServer.ping.
  1131  func (wc *workerClient) ping(ctx context.Context) error {
  1132  	wc.mu.Lock()
  1133  	defer wc.mu.Unlock()
  1134  	c := call{Ping: &pingArgs{}}
  1135  	var resp pingResponse
  1136  	return wc.callLocked(ctx, c, &resp)
  1137  }
  1138  
  1139  // callLocked sends an RPC from the coordinator to the worker process and waits
  1140  // for the response. The call may be cancelled with ctx.
  1141  func (wc *workerClient) callLocked(ctx context.Context, c call, resp any) (err error) {
  1142  	enc := json.NewEncoder(wc.fuzzIn)
  1143  	dec := json.NewDecoder(&contextReader{ctx: ctx, r: wc.fuzzOut})
  1144  	if err := enc.Encode(c); err != nil {
  1145  		return err
  1146  	}
  1147  	return dec.Decode(resp)
  1148  }
  1149  
  1150  // contextReader wraps a Reader with a Context. If the context is cancelled
  1151  // while the underlying reader is blocked, Read returns immediately.
  1152  //
  1153  // This is useful for reading from a pipe. Closing a pipe file descriptor does
  1154  // not unblock pending Reads on that file descriptor. All copies of the pipe's
  1155  // other file descriptor (the write end) must be closed in all processes that
  1156  // inherit it. This is difficult to do correctly in the situation we care about
  1157  // (process group termination).
  1158  type contextReader struct {
  1159  	ctx context.Context
  1160  	r   io.Reader
  1161  }
  1162  
  1163  func (cr *contextReader) Read(b []byte) (int, error) {
  1164  	if ctxErr := cr.ctx.Err(); ctxErr != nil {
  1165  		return 0, ctxErr
  1166  	}
  1167  	done := make(chan struct{})
  1168  
  1169  	// This goroutine may stay blocked after Read returns because the underlying
  1170  	// read is blocked.
  1171  	var n int
  1172  	var err error
  1173  	go func() {
  1174  		n, err = cr.r.Read(b)
  1175  		close(done)
  1176  	}()
  1177  
  1178  	select {
  1179  	case <-cr.ctx.Done():
  1180  		return 0, cr.ctx.Err()
  1181  	case <-done:
  1182  		return n, err
  1183  	}
  1184  }
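        // Usage sketch (illustrative; pipeR is a hypothetical pipe read end):
        // wrapping the reader lets a decode loop observe cancellation even
        // though the blocked pipe read itself cannot be interrupted:
        //
        //	ctx, cancel := context.WithCancel(context.Background())
        //	dec := json.NewDecoder(&contextReader{ctx: ctx, r: pipeR})
        //	go func() { time.Sleep(time.Second); cancel() }()
        //	var v any
        //	err := dec.Decode(&v) // returns ctx.Err() once cancel runs
        //
        // The reading goroutine may outlive Read, as noted above; that leak is
        // accepted because worker processes are short-lived.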