github.com/JimmyHuang454/JLS-go@v0.0.0-20230831150107-90d536585ba0/internal/fuzz/worker.go

// Copyright 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package fuzz

import (
	"bytes"
	"context"
	"crypto/sha256"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"os"
	"os/exec"
	"reflect"
	"runtime"
	"sync"
	"time"
)

const (
	// workerFuzzDuration is the amount of time a worker can spend testing random
	// variations of an input given by the coordinator.
	workerFuzzDuration = 100 * time.Millisecond

	// workerTimeoutDuration is the amount of time a worker can go without
	// responding to the coordinator before being stopped.
	workerTimeoutDuration = 1 * time.Second

	// workerExitCode is used as an exit code by fuzz worker processes after an
	// internal error. This distinguishes internal errors from uncontrolled
	// panics and other crashes. Keep in sync with internal/fuzz.workerExitCode.
	workerExitCode = 70

	// workerSharedMemSize is the maximum size of the shared memory file used to
	// communicate with workers. This limits the size of fuzz inputs.
	workerSharedMemSize = 100 << 20 // 100 MB
)

// worker manages a worker process running a test binary. The worker object
// exists only in the coordinator (the process started by 'go test -fuzz').
// workerClient is used by the coordinator to send RPCs to the worker process,
// which handles them with workerServer.
type worker struct {
	dir     string   // working directory, same as package directory
	binPath string   // path to test executable
	args    []string // arguments for test executable
	env     []string // environment for test executable

	coordinator *coordinator

	memMu chan *sharedMem // mutex guarding shared memory with worker; persists across processes.

	cmd         *exec.Cmd     // current worker process
	client      *workerClient // used to communicate with worker process
	waitErr     error         // last error returned by wait, set before termC is closed.
	interrupted bool          // true after stop interrupts a running worker.
	termC       chan struct{} // closed by wait when worker process terminates
}

func newWorker(c *coordinator, dir, binPath string, args, env []string) (*worker, error) {
	mem, err := sharedMemTempFile(workerSharedMemSize)
	if err != nil {
		return nil, err
	}
	memMu := make(chan *sharedMem, 1)
	memMu <- mem
	return &worker{
		dir:         dir,
		binPath:     binPath,
		args:        args,
		env:         env[:len(env):len(env)], // copy on append to ensure workers don't overwrite each other.
		coordinator: c,
		memMu:       memMu,
	}, nil
}

// cleanup releases persistent resources associated with the worker.
func (w *worker) cleanup() error {
	mem := <-w.memMu
	if mem == nil {
		return nil
	}
	close(w.memMu)
	return mem.Close()
}
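
// Locking convention (illustrative sketch): memMu is a 1-buffered channel used
// as a mutex whose critical sections can span RPC calls and even process
// boundaries. Receiving takes the lock and yields the *sharedMem; sending it
// back releases the lock:
//
//	mem := <-w.memMu                  // acquire
//	defer func() { w.memMu <- mem }() // release
//
// cleanup closes the channel instead of refilling it, so a later
// mem, ok := <-w.memMu reports ok == false (see errSharedMemClosed below).
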
// coordinate runs the test binary to perform fuzzing.
//
// coordinate loops until ctx is cancelled or a fatal error is encountered.
// If a test process terminates unexpectedly while fuzzing, coordinate will
// attempt to restart and continue unless the termination can be attributed
// to an interruption (from a timer or the user).
//
// While looping, coordinate receives inputs from the coordinator, passes
// those inputs to the worker process, then passes the results back to
// the coordinator.
func (w *worker) coordinate(ctx context.Context) error {
	// Main event loop.
	for {
		// Start or restart the worker if it's not running.
		if !w.isRunning() {
			if err := w.startAndPing(ctx); err != nil {
				return err
			}
		}

		select {
		case <-ctx.Done():
			// Worker was told to stop.
			err := w.stop()
			if err != nil && !w.interrupted && !isInterruptError(err) {
				return err
			}
			return ctx.Err()

		case <-w.termC:
			// Worker process terminated unexpectedly while waiting for input.
			err := w.stop()
			if w.interrupted {
				panic("worker interrupted after unexpected termination")
			}
			if err == nil || isInterruptError(err) {
				// Worker stopped, either by exiting with status 0 or after being
				// interrupted with a signal that was not sent by the coordinator.
				//
				// When the user presses ^C, on POSIX platforms, SIGINT is delivered to
				// all processes in the group concurrently, and the worker may see it
				// before the coordinator. The worker should exit 0 gracefully (in
				// theory).
				//
				// This condition is probably intended by the user, so suppress
				// the error.
				return nil
			}
			if exitErr, ok := err.(*exec.ExitError); ok && exitErr.ExitCode() == workerExitCode {
				// Worker exited with a code indicating F.Fuzz was not called correctly,
				// for example, F.Fail was called first.
				return fmt.Errorf("fuzzing process exited unexpectedly due to an internal failure: %w", err)
			}
			// Worker exited non-zero or was terminated by a non-interrupt
			// signal (for example, SIGSEGV) while fuzzing.
			return fmt.Errorf("fuzzing process hung or terminated unexpectedly: %w", err)
			// TODO(jayconrod,katiehockman): if -keepfuzzing, restart worker.

		case input := <-w.coordinator.inputC:
			// Received input from coordinator.
			args := fuzzArgs{
				Limit:        input.limit,
				Timeout:      input.timeout,
				Warmup:       input.warmup,
				CoverageData: input.coverageData,
			}
			entry, resp, isInternalError, err := w.client.fuzz(ctx, input.entry, args)
			canMinimize := true
			if err != nil {
				// Error communicating with worker.
				w.stop()
				if ctx.Err() != nil {
					// Timeout or interruption.
					return ctx.Err()
				}
				if w.interrupted {
					// Communication error before we stopped the worker.
					// Report an error, but don't record a crasher.
					return fmt.Errorf("communicating with fuzzing process: %v", err)
				}
				if sig, ok := terminationSignal(w.waitErr); ok && !isCrashSignal(sig) {
					// Worker terminated by a signal that probably wasn't caused by a
					// specific input to the fuzz function. For example, on Linux,
					// the kernel (OOM killer) may send SIGKILL to a process using a lot
					// of memory. Or the shell might send SIGHUP when the terminal
					// is closed. Don't record a crasher.
					return fmt.Errorf("fuzzing process terminated by unexpected signal; no crash will be recorded: %v", w.waitErr)
				}
				if isInternalError {
					// An internal error occurred which shouldn't be considered
					// a crash.
					return err
				}
				// Unexpected termination. Set error message and fall through.
				// We'll restart the worker on the next iteration.
				// Don't attempt to minimize this since it crashed the worker.
				resp.Err = fmt.Sprintf("fuzzing process hung or terminated unexpectedly: %v", w.waitErr)
				canMinimize = false
			}
			result := fuzzResult{
				limit:         input.limit,
				count:         resp.Count,
				totalDuration: resp.TotalDuration,
				entryDuration: resp.InterestingDuration,
				entry:         entry,
				crasherMsg:    resp.Err,
				coverageData:  resp.CoverageData,
				canMinimize:   canMinimize,
			}
			w.coordinator.resultC <- result

		case input := <-w.coordinator.minimizeC:
			// Received input to minimize from coordinator.
			result, err := w.minimize(ctx, input)
			if err != nil {
				// Error minimizing. Send back the original input. If it didn't cause
				// an error before, report it as causing an error now.
				// TODO: double-check this is handled correctly when
				// implementing -keepfuzzing.
				result = fuzzResult{
					entry:       input.entry,
					crasherMsg:  input.crasherMsg,
					canMinimize: false,
					limit:       input.limit,
				}
				if result.crasherMsg == "" {
					result.crasherMsg = err.Error()
				}
			}
			w.coordinator.resultC <- result
		}
	}
}
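
// Data flow (illustrative sketch): each round trip through the loop above
// forwards one job from the coordinator to the worker process and one result
// back, e.g. for the fuzzing case:
//
//	input := <-w.coordinator.inputC // job from the coordinator
//	entry, resp, _, err := w.client.fuzz(ctx, input.entry, fuzzArgs{Limit: input.limit})
//	w.coordinator.resultC <- fuzzResult{entry: entry, crasherMsg: resp.Err}
//
// The select also watches ctx.Done() and w.termC, so a stuck or crashed
// worker never blocks the coordinator.
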
// minimize tells a worker process to attempt to find a smaller value that
// either causes an error (if we started minimizing because we found an input
// that causes an error) or preserves new coverage (if we started minimizing
// because we found an input that expands coverage).
func (w *worker) minimize(ctx context.Context, input fuzzMinimizeInput) (min fuzzResult, err error) {
	if w.coordinator.opts.MinimizeTimeout != 0 {
		var cancel func()
		ctx, cancel = context.WithTimeout(ctx, w.coordinator.opts.MinimizeTimeout)
		defer cancel()
	}

	args := minimizeArgs{
		Limit:        input.limit,
		Timeout:      input.timeout,
		KeepCoverage: input.keepCoverage,
	}
	entry, resp, err := w.client.minimize(ctx, input.entry, args)
	if err != nil {
		// Error communicating with worker.
		w.stop()
		if ctx.Err() != nil || w.interrupted || isInterruptError(w.waitErr) {
			// Worker was interrupted, possibly by the user pressing ^C.
			// Normally, workers can handle interrupts and timeouts gracefully and
			// will return without error. An error here indicates the worker
			// may not have been in a good state, but the error won't be meaningful
			// to the user. Just return the original crasher without logging anything.
			return fuzzResult{
				entry:        input.entry,
				crasherMsg:   input.crasherMsg,
				coverageData: input.keepCoverage,
				canMinimize:  false,
				limit:        input.limit,
			}, nil
		}
		return fuzzResult{
			entry:         entry,
			crasherMsg:    fmt.Sprintf("fuzzing process hung or terminated unexpectedly while minimizing: %v", err),
			canMinimize:   false,
			limit:         input.limit,
			count:         resp.Count,
			totalDuration: resp.Duration,
		}, nil
	}

	if input.crasherMsg != "" && resp.Err == "" {
		return fuzzResult{}, fmt.Errorf("attempted to minimize a crash but could not reproduce")
	}

	return fuzzResult{
		entry:         entry,
		crasherMsg:    resp.Err,
		coverageData:  resp.CoverageData,
		canMinimize:   false,
		limit:         input.limit,
		count:         resp.Count,
		totalDuration: resp.Duration,
	}, nil
}

func (w *worker) isRunning() bool {
	return w.cmd != nil
}

// startAndPing starts the worker process and sends it a message to make sure it
// can communicate.
//
// startAndPing returns an error if any part of this didn't work, including if
// the context is expired or the worker process was interrupted before it
// responded. Errors that happen after start but before the ping response
// likely indicate that the worker did not call F.Fuzz or called F.Fail first.
// We don't record crashers for these errors.
func (w *worker) startAndPing(ctx context.Context) error {
	if ctx.Err() != nil {
		return ctx.Err()
	}
	if err := w.start(); err != nil {
		return err
	}
	if err := w.client.ping(ctx); err != nil {
		w.stop()
		if ctx.Err() != nil {
			return ctx.Err()
		}
		if isInterruptError(err) {
			// User may have pressed ^C before worker responded.
			return err
		}
		// TODO: record and return stderr.
		return fmt.Errorf("fuzzing process terminated without fuzzing: %w", err)
	}
	return nil
}

// start runs a new worker process.
//
// If the process couldn't be started, start returns an error. start won't
// return later termination errors from the process if they occur.
//
// If the process starts successfully, start returns nil. stop must be called
// once later to clean up, even if the process terminates on its own.
//
// When the process terminates, w.waitErr is set to the error (if any), and
// w.termC is closed.
func (w *worker) start() (err error) {
	if w.isRunning() {
		panic("worker already started")
	}
	w.waitErr = nil
	w.interrupted = false
	w.termC = nil

	cmd := exec.Command(w.binPath, w.args...)
	cmd.Dir = w.dir
	cmd.Env = w.env[:len(w.env):len(w.env)] // copy on append to ensure workers don't overwrite each other.

	// Create the "fuzz_in" and "fuzz_out" pipes so we can communicate with
	// the worker. We don't use stdin and stdout, since the test binary may
	// do something else with those.
	//
	// Each pipe has a reader and a writer. The coordinator writes to fuzzInW
	// and reads from fuzzOutR. The worker inherits fuzzInR and fuzzOutW.
	// The coordinator closes fuzzInR and fuzzOutW after starting the worker,
	// since we have no further need of them.
	fuzzInR, fuzzInW, err := os.Pipe()
	if err != nil {
		return err
	}
	defer fuzzInR.Close()
	fuzzOutR, fuzzOutW, err := os.Pipe()
	if err != nil {
		fuzzInW.Close()
		return err
	}
	defer fuzzOutW.Close()
	setWorkerComm(cmd, workerComm{fuzzIn: fuzzInR, fuzzOut: fuzzOutW, memMu: w.memMu})

	// Start the worker process.
	if err := cmd.Start(); err != nil {
		fuzzInW.Close()
		fuzzOutR.Close()
		return err
	}

	// Worker started successfully.
	// After this, w.client owns fuzzInW and fuzzOutR, so w.client.Close must be
	// called later by stop.
	w.cmd = cmd
	w.termC = make(chan struct{})
	comm := workerComm{fuzzIn: fuzzInW, fuzzOut: fuzzOutR, memMu: w.memMu}
	m := newMutator()
	w.client = newWorkerClient(comm, m)

	go func() {
		w.waitErr = w.cmd.Wait()
		close(w.termC)
	}()

	return nil
}
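
// Pipe topology after start (illustrative diagram, reconstructed from the
// comments above):
//
//	coordinator                       worker
//	fuzzInW  --- "fuzz_in" pipe  ---> fuzzInR   (serialized calls)
//	fuzzOutR <--- "fuzz_out" pipe --- fuzzOutW  (serialized responses)
//
// The coordinator closes fuzzInR and fuzzOutW once the child is running, so
// an EOF on fuzz_out means the worker closed its end or exited.
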
// stop tells the worker process to exit by closing w.client, then blocks until
// it terminates. If the worker doesn't terminate after a short time, stop
// signals it with os.Interrupt (where supported), then os.Kill.
//
// stop returns the error the process terminated with, if any (same as
// w.waitErr).
//
// stop must be called at least once after start returns successfully, even if
// the worker process terminates unexpectedly.
func (w *worker) stop() error {
	if w.termC == nil {
		panic("worker was not started successfully")
	}
	select {
	case <-w.termC:
		// Worker already terminated.
		if w.client == nil {
			// stop already called.
			return w.waitErr
		}
		// Possible unexpected termination.
		w.client.Close()
		w.cmd = nil
		w.client = nil
		return w.waitErr
	default:
		// Worker still running.
	}

	// Tell the worker to stop by closing fuzz_in. It won't actually stop until it
	// finishes with earlier calls.
	closeC := make(chan struct{})
	go func() {
		w.client.Close()
		close(closeC)
	}()

	sig := os.Interrupt
	if runtime.GOOS == "windows" {
		// Per https://golang.org/pkg/os/#Signal, “Interrupt is not implemented on
		// Windows; using it with os.Process.Signal will return an error.”
		// Fall back to Kill instead.
		sig = os.Kill
	}

	t := time.NewTimer(workerTimeoutDuration)
	for {
		select {
		case <-w.termC:
			// Worker terminated.
			t.Stop()
			<-closeC
			w.cmd = nil
			w.client = nil
			return w.waitErr

		case <-t.C:
			// Timer fired before worker terminated.
			w.interrupted = true
			switch sig {
			case os.Interrupt:
				// Try to stop the worker with SIGINT and wait a little longer.
				w.cmd.Process.Signal(sig)
				sig = os.Kill
				t.Reset(workerTimeoutDuration)

			case os.Kill:
				// Try to stop the worker with SIGKILL and keep waiting.
				w.cmd.Process.Signal(sig)
				sig = nil
				t.Reset(workerTimeoutDuration)

			case nil:
				// Still waiting. Print a message to let the user know why.
				fmt.Fprintf(w.coordinator.opts.Log, "waiting for fuzzing process to terminate...\n")
			}
		}
	}
}

// RunFuzzWorker is called in a worker process to communicate with the
// coordinator process in order to fuzz random inputs. RunFuzzWorker loops
// until the coordinator tells it to stop.
//
// fn is a wrapper on the fuzz function. It may return an error to indicate
// a given input "crashed". The coordinator will also record a crasher if
// the function times out or terminates the process.
//
// RunFuzzWorker returns an error if it could not communicate with the
// coordinator process.
func RunFuzzWorker(ctx context.Context, fn func(CorpusEntry) error) error {
	comm, err := getWorkerComm()
	if err != nil {
		return err
	}
	srv := &workerServer{
		workerComm: comm,
		fuzzFn: func(e CorpusEntry) (time.Duration, error) {
			timer := time.AfterFunc(10*time.Second, func() {
				panic("deadlocked!") // this error message won't be printed
			})
			defer timer.Stop()
			start := time.Now()
			err := fn(e)
			return time.Since(start), err
		},
		m: newMutator(),
	}
	return srv.serve(ctx)
}

// call is serialized and sent from the coordinator on fuzz_in. It acts as
// a minimalist RPC mechanism. Exactly one of its fields must be set to indicate
// which method to call.
type call struct {
	Ping     *pingArgs
	Fuzz     *fuzzArgs
	Minimize *minimizeArgs
}
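
// Wire format (illustrative sketch): calls and responses are plain JSON
// values, one per line, written by encoding/json. A fuzz call with a 100ms
// budget is encoded on fuzz_in roughly as:
//
//	{"Ping":null,"Fuzz":{"Timeout":100000000,"Limit":0,"Warmup":false,"CoverageData":null},"Minimize":null}
//
// time.Duration fields marshal as integer nanoseconds and []byte fields as
// base64 strings; the nil fields identify the methods *not* being called.
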
// minimizeArgs contains arguments to workerServer.minimize. The value to
// minimize is already in shared memory.
type minimizeArgs struct {
	// Timeout is the time to spend minimizing. This may include time to start
	// up, especially if the input causes the worker process to terminate,
	// requiring repeated restarts.
	Timeout time.Duration

	// Limit is the maximum number of values to test, without spending more
	// time than Timeout. 0 indicates no limit.
	Limit int64

	// KeepCoverage is a set of coverage counters the worker should attempt to
	// keep in minimized values. When provided, the worker will reject inputs that
	// don't cause at least one of these bits to be set.
	KeepCoverage []byte

	// Index is the index of the fuzz target parameter to be minimized.
	Index int
}

// minimizeResponse contains results from workerServer.minimize.
type minimizeResponse struct {
	// WroteToMem is true if the worker found a smaller input and wrote it to
	// shared memory. If minimizeArgs.KeepCoverage was set, the minimized input
	// preserved at least one coverage bit and did not cause an error.
	// Otherwise, the minimized input caused some error, recorded in Err.
	WroteToMem bool

	// Err is the error string caused by the value in shared memory, if any.
	Err string

	// CoverageData is the set of coverage bits activated by the minimized value
	// in shared memory. When set, it contains at least one bit from KeepCoverage.
	// CoverageData will be nil if Err is set or if minimization failed.
	CoverageData []byte

	// Duration is the time spent minimizing, not including starting or cleaning up.
	Duration time.Duration

	// Count is the number of values tested.
	Count int64
}

// fuzzArgs contains arguments to workerServer.fuzz. The value to fuzz is
// passed in shared memory.
type fuzzArgs struct {
	// Timeout is the time to spend fuzzing, not including starting or
	// cleaning up.
	Timeout time.Duration

	// Limit is the maximum number of values to test, without spending more
	// time than Timeout. 0 indicates no limit.
	Limit int64

	// Warmup indicates whether this is part of a warmup run, meaning that
	// fuzzing should not occur. If coverageEnabled is true, then coverage data
	// should be reported.
	Warmup bool

	// CoverageData is the coverage data. If set, the worker should update its
	// local coverage data prior to fuzzing.
	CoverageData []byte
}

// fuzzResponse contains results from workerServer.fuzz.
type fuzzResponse struct {
	// TotalDuration is the time spent fuzzing, not including starting or
	// cleaning up. InterestingDuration is the time spent running the input
	// that was reported as interesting (a crasher or new coverage).
	TotalDuration       time.Duration
	InterestingDuration time.Duration

	// Count is the number of values tested.
	Count int64

	// CoverageData is set if the value in shared memory expands coverage
	// and therefore may be interesting to the coordinator.
	CoverageData []byte

	// Err is the error string caused by the value in shared memory, which is
	// non-empty if the value in shared memory caused a crash.
	Err string

	// InternalErr is the error string caused by an internal error in the
	// worker. This shouldn't be considered a crasher.
	InternalErr string
}

// pingArgs contains arguments to workerServer.ping.
type pingArgs struct{}

// pingResponse contains results from workerServer.ping.
type pingResponse struct{}
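
// Response example (illustrative, values made up): a worker that found a
// crasher on its 137th input might answer on fuzz_out with:
//
//	{"TotalDuration":52000000,"InterestingDuration":1800000,"Count":137,"CoverageData":null,"Err":"runtime error: index out of range [8] with length 8","InternalErr":""}
//
// The crashing input itself is not in the response; the coordinator
// reconstructs it from the PRNG state and count left in shared memory (see
// workerClient.fuzz).
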
// workerComm holds pipes and shared memory used for communication
// between the coordinator process (client) and a worker process (server).
// These values are unique to each worker; they are shared only with the
// coordinator, not with other workers.
//
// Access to shared memory is synchronized implicitly over the RPC protocol
// implemented in workerServer and workerClient. During a call, the server
// (worker) has exclusive access to shared memory; at other times, the client
// (coordinator) has exclusive access.
type workerComm struct {
	fuzzIn, fuzzOut *os.File
	memMu           chan *sharedMem // mutex guarding shared memory
}

// workerServer is a minimalist RPC server, run by fuzz worker processes.
// It allows the coordinator process (using workerClient) to call methods in a
// worker process. This system allows the coordinator to run multiple worker
// processes in parallel and to collect inputs that caused crashes from shared
// memory after a worker process terminates unexpectedly.
type workerServer struct {
	workerComm
	m *mutator

	// coverageMask is the local coverage data for the worker. It is
	// periodically updated to reflect the data in the coordinator when new
	// coverage is found.
	coverageMask []byte

	// fuzzFn runs the worker's fuzz target on the given input and returns an
	// error if it finds a crasher (the process may also exit or crash), and the
	// time it took to run the input. It sets a deadline of 10 seconds, at which
	// point it will panic with the assumption that the process is hanging or
	// deadlocked.
	fuzzFn func(CorpusEntry) (time.Duration, error)
}

// serve reads serialized RPC messages on fuzzIn. When serve receives a message,
// it calls the corresponding method, then sends the serialized result back
// on fuzzOut.
//
// serve handles RPC calls synchronously; it will not attempt to read a message
// until the previous call has finished.
//
// serve returns errors that occurred when communicating over pipes. serve
// does not return errors from method calls; those are passed through serialized
// responses.
func (ws *workerServer) serve(ctx context.Context) error {
	enc := json.NewEncoder(ws.fuzzOut)
	dec := json.NewDecoder(&contextReader{ctx: ctx, r: ws.fuzzIn})
	for {
		var c call
		if err := dec.Decode(&c); err != nil {
			if err == io.EOF || err == ctx.Err() {
				return nil
			}
			return err
		}

		var resp any
		switch {
		case c.Fuzz != nil:
			resp = ws.fuzz(ctx, *c.Fuzz)
		case c.Minimize != nil:
			resp = ws.minimize(ctx, *c.Minimize)
		case c.Ping != nil:
			resp = ws.ping(ctx, *c.Ping)
		default:
			return errors.New("no arguments provided for any call")
		}

		if err := enc.Encode(resp); err != nil {
			return err
		}
	}
}
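
// Round trip (illustrative sketch): because serve is strictly synchronous, a
// client can pair every Encode with exactly one Decode:
//
//	enc := json.NewEncoder(fuzzInW)  // write end of fuzz_in
//	dec := json.NewDecoder(fuzzOutR) // read end of fuzz_out
//	enc.Encode(call{Ping: &pingArgs{}})
//	var resp pingResponse
//	dec.Decode(&resp) // blocks until the server finishes the call
//
// This is the pattern workerClient.callLocked implements below.
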
// chainedMutations is how many mutations are applied before the worker
// resets the input to its original state.
// NOTE: this number was picked without much thought. It is low enough that
// it seems to create a significant diversity in mutated inputs. We may want
// to consider looking into this more closely once we have a proper performance
// testing framework. Another option is to randomly pick the number of chained
// mutations on each invocation of the workerServer.fuzz method (this appears to
// be what libFuzzer does, although there seems to be no documentation which
// explains why this choice was made.)
const chainedMutations = 5

// fuzz runs the test function on random variations of the input value in shared
// memory for a limited duration or number of iterations.
//
// fuzz returns early if it finds an input that crashes the fuzz function (with
// fuzzResponse.Err set) or an input that expands coverage (with
// fuzzResponse.InterestingDuration set).
//
// fuzz does not modify the input in shared memory. Instead, it saves the
// initial PRNG state in shared memory and increments a counter in shared
// memory before each call to the test function. The caller may reconstruct
// the crashing input with this information, since the PRNG is deterministic.
func (ws *workerServer) fuzz(ctx context.Context, args fuzzArgs) (resp fuzzResponse) {
	if args.CoverageData != nil {
		if ws.coverageMask != nil && len(args.CoverageData) != len(ws.coverageMask) {
			resp.InternalErr = fmt.Sprintf("unexpected size for CoverageData: got %d, expected %d", len(args.CoverageData), len(ws.coverageMask))
			return resp
		}
		ws.coverageMask = args.CoverageData
	}
	start := time.Now()
	defer func() { resp.TotalDuration = time.Since(start) }()

	if args.Timeout != 0 {
		var cancel func()
		ctx, cancel = context.WithTimeout(ctx, args.Timeout)
		defer cancel()
	}
	mem := <-ws.memMu
	ws.m.r.save(&mem.header().randState, &mem.header().randInc)
	defer func() {
		resp.Count = mem.header().count
		ws.memMu <- mem
	}()
	if args.Limit > 0 && mem.header().count >= args.Limit {
		resp.InternalErr = fmt.Sprintf("mem.header().count %d already exceeds args.Limit %d", mem.header().count, args.Limit)
		return resp
	}

	originalVals, err := unmarshalCorpusFile(mem.valueCopy())
	if err != nil {
		resp.InternalErr = err.Error()
		return resp
	}
	vals := make([]any, len(originalVals))
	copy(vals, originalVals)

	shouldStop := func() bool {
		return args.Limit > 0 && mem.header().count >= args.Limit
	}
	fuzzOnce := func(entry CorpusEntry) (dur time.Duration, cov []byte, errMsg string) {
		mem.header().count++
		var err error
		dur, err = ws.fuzzFn(entry)
		if err != nil {
			errMsg = err.Error()
			if errMsg == "" {
				errMsg = "fuzz function failed with no input"
			}
			return dur, nil, errMsg
		}
		if ws.coverageMask != nil && countNewCoverageBits(ws.coverageMask, coverageSnapshot) > 0 {
			return dur, coverageSnapshot, ""
		}
		return dur, nil, ""
	}

	if args.Warmup {
		dur, _, errMsg := fuzzOnce(CorpusEntry{Values: vals})
		if errMsg != "" {
			resp.Err = errMsg
			return resp
		}
		resp.InterestingDuration = dur
		if coverageEnabled {
			resp.CoverageData = coverageSnapshot
		}
		return resp
	}

	for {
		select {
		case <-ctx.Done():
			return resp
		default:
			if mem.header().count%chainedMutations == 0 {
				copy(vals, originalVals)
				ws.m.r.save(&mem.header().randState, &mem.header().randInc)
			}
			ws.m.mutate(vals, cap(mem.valueRef()))

			entry := CorpusEntry{Values: vals}
			dur, cov, errMsg := fuzzOnce(entry)
			if errMsg != "" {
				resp.Err = errMsg
				return resp
			}
			if cov != nil {
				resp.CoverageData = cov
				resp.InterestingDuration = dur
				return resp
			}
			if shouldStop() {
				return resp
			}
		}
	}
}
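
// Mutation schedule (worked example): with chainedMutations = 5 and a fresh
// count of 0, the loop above behaves as:
//
//	count 1..5:  reset at count 0, then 1, 2, 3, 4, 5 stacked mutations
//	count 6..10: reset at count 5, then 1, 2, 3, 4, 5 stacked mutations
//
// so the input tested at count n carries ((n-1)%chainedMutations)+1 mutations
// from the most recent reset. workerClient.fuzz (below) uses exactly this
// formula, plus the PRNG state saved at each reset, to replay a crasher.
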
func (ws *workerServer) minimize(ctx context.Context, args minimizeArgs) (resp minimizeResponse) {
	start := time.Now()
	defer func() { resp.Duration = time.Since(start) }()
	mem := <-ws.memMu
	defer func() { ws.memMu <- mem }()
	vals, err := unmarshalCorpusFile(mem.valueCopy())
	if err != nil {
		panic(err)
	}
	inpHash := sha256.Sum256(mem.valueCopy())
	if args.Timeout != 0 {
		var cancel func()
		ctx, cancel = context.WithTimeout(ctx, args.Timeout)
		defer cancel()
	}

	// Minimize the values in vals, then write to shared memory. We only write
	// to shared memory after completing minimization.
	success, err := ws.minimizeInput(ctx, vals, mem, args)
	if success {
		writeToMem(vals, mem)
		outHash := sha256.Sum256(mem.valueCopy())
		mem.header().rawInMem = false
		resp.WroteToMem = true
		if err != nil {
			resp.Err = err.Error()
		} else {
			// If the values didn't change during minimization, then coverageSnapshot
			// is likely a dirty snapshot which represents the very last step of
			// minimization, not the coverage for the initial input. In that case,
			// just return the coverage we were given initially, since it more
			// accurately represents the coverage map for the input we are returning.
			if outHash != inpHash {
				resp.CoverageData = coverageSnapshot
			} else {
				resp.CoverageData = args.KeepCoverage
			}
		}
	}
	return resp
}

// minimizeInput applies a series of minimizing transformations on the provided
// vals, ensuring that each minimization still causes an error, or keeps
// coverage, in fuzzFn. It uses the context to determine how long to run,
// stopping once the context is cancelled. It returns a bool indicating whether
// minimization was successful and an error if one was found.
func (ws *workerServer) minimizeInput(ctx context.Context, vals []any, mem *sharedMem, args minimizeArgs) (success bool, retErr error) {
	keepCoverage := args.KeepCoverage
	memBytes := mem.valueRef()
	bPtr := &memBytes
	count := &mem.header().count
	shouldStop := func() bool {
		return ctx.Err() != nil ||
			(args.Limit > 0 && *count >= args.Limit)
	}
	if shouldStop() {
		return false, nil
	}

	// Check that the original value preserves coverage or causes an error.
	// If not, then whatever caused us to think the value was interesting may
	// have been a flake, and we can't minimize it.
	*count++
	_, retErr = ws.fuzzFn(CorpusEntry{Values: vals})
	if keepCoverage != nil {
		if !hasCoverageBit(keepCoverage, coverageSnapshot) || retErr != nil {
			return false, nil
		}
	} else if retErr == nil {
		return false, nil
	}
	mem.header().rawInMem = true

	// tryMinimized runs the fuzz function with candidate replacing the value
	// at args.Index. tryMinimized returns whether the input with candidate is
	// interesting for the same reason as the original input: it causes an
	// error if one was expected, or it preserves coverage.
	tryMinimized := func(candidate []byte) bool {
		prev := vals[args.Index]
		switch prev.(type) {
		case []byte:
			vals[args.Index] = candidate
		case string:
			vals[args.Index] = string(candidate)
		default:
			panic("impossible")
		}
		copy(*bPtr, candidate)
		*bPtr = (*bPtr)[:len(candidate)]
		mem.setValueLen(len(candidate))
		*count++
		_, err := ws.fuzzFn(CorpusEntry{Values: vals})
		if err != nil {
			retErr = err
			if keepCoverage != nil {
				// Now that we've found a crash, that's more important than any
				// minimization of interesting inputs that was being done. Clear out
				// keepCoverage to only minimize the crash going forward.
				keepCoverage = nil
			}
			return true
		}
		// Minimization should preserve coverage bits.
		if keepCoverage != nil && isCoverageSubset(keepCoverage, coverageSnapshot) {
			return true
		}
		vals[args.Index] = prev
		return false
	}
	switch v := vals[args.Index].(type) {
	case string:
		minimizeBytes([]byte(v), tryMinimized, shouldStop)
	case []byte:
		minimizeBytes(v, tryMinimized, shouldStop)
	default:
		panic("impossible")
	}
	return true, retErr
}
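
// Recovery protocol (illustrative summary): mem.header().rawInMem tracks what
// the shared memory region holds at any moment:
//
//	mem.header().rawInMem = true  // minimizeInput: raw candidate bytes for one value
//	mem.header().rawInMem = false // minimize: a complete marshaled corpus file
//
// If the worker dies mid-minimization, workerClient.minimize (below) inspects
// this flag to decide whether the surviving bytes are a raw replacement for
// entryOut.Values[i] or a full corpus file.
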
func writeToMem(vals []any, mem *sharedMem) {
	b := marshalCorpusFile(vals...)
	mem.setValue(b)
}

// ping does nothing. The coordinator calls this method to ensure the worker
// has called F.Fuzz and can communicate.
func (ws *workerServer) ping(ctx context.Context, args pingArgs) pingResponse {
	return pingResponse{}
}

// workerClient is a minimalist RPC client. The coordinator process uses a
// workerClient to call methods in each worker process (handled by
// workerServer).
type workerClient struct {
	workerComm
	m *mutator

	// mu is the mutex protecting the workerComm.fuzzIn pipe. This must be
	// locked before making calls to the workerServer. It prevents
	// workerClient.Close from closing fuzzIn while workerClient methods are
	// writing to it concurrently, and prevents multiple callers from writing to
	// fuzzIn concurrently.
	mu sync.Mutex
}

func newWorkerClient(comm workerComm, m *mutator) *workerClient {
	return &workerClient{workerComm: comm, m: m}
}

// Close shuts down the connection to the RPC server (the worker process) by
// closing fuzz_in. Close drains fuzz_out (avoiding a SIGPIPE in the worker),
// and closes it after the worker process closes the other end.
func (wc *workerClient) Close() error {
	wc.mu.Lock()
	defer wc.mu.Unlock()

	// Close fuzzIn. This signals to the server that there are no more calls,
	// and it should exit.
	if err := wc.fuzzIn.Close(); err != nil {
		wc.fuzzOut.Close()
		return err
	}

	// Drain fuzzOut and close it. When the server exits, the kernel will close
	// its end of fuzzOut, and we'll get EOF.
	if _, err := io.Copy(io.Discard, wc.fuzzOut); err != nil {
		wc.fuzzOut.Close()
		return err
	}
	return wc.fuzzOut.Close()
}

// errSharedMemClosed is returned by workerClient methods that cannot access
// shared memory because it was closed and unmapped by another goroutine. That
// can happen when worker.cleanup is called in the worker goroutine while a
// workerClient.fuzz call runs concurrently.
//
// This error should not be reported. It indicates the operation was
// interrupted.
var errSharedMemClosed = errors.New("internal error: shared memory was closed and unmapped")
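
// Detection sketch (illustrative): because cleanup closes memMu, a concurrent
// RPC observes the closed channel and returns errSharedMemClosed. A caller
// could pick it out with errors.Is:
//
//	if _, _, _, err := w.client.fuzz(ctx, entry, args); errors.Is(err, errSharedMemClosed) {
//		// shutdown race, not a crasher; drop the in-flight result
//	}
//
// In this file, coordinate instead reaches it through the isInternalError
// branch of its fuzz case.
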
// minimize tells the worker to call the minimize method. See
// workerServer.minimize.
func (wc *workerClient) minimize(ctx context.Context, entryIn CorpusEntry, args minimizeArgs) (entryOut CorpusEntry, resp minimizeResponse, retErr error) {
	wc.mu.Lock()
	defer wc.mu.Unlock()

	mem, ok := <-wc.memMu
	if !ok {
		return CorpusEntry{}, minimizeResponse{}, errSharedMemClosed
	}
	mem.header().count = 0
	inp, err := corpusEntryData(entryIn)
	if err != nil {
		return CorpusEntry{}, minimizeResponse{}, err
	}
	mem.setValue(inp)
	defer func() { wc.memMu <- mem }()
	entryOut = entryIn
	entryOut.Values, err = unmarshalCorpusFile(inp)
	if err != nil {
		return CorpusEntry{}, minimizeResponse{}, fmt.Errorf("workerClient.minimize unmarshaling provided value: %v", err)
	}
	for i, v := range entryOut.Values {
		if !isMinimizable(reflect.TypeOf(v)) {
			continue
		}

		wc.memMu <- mem
		args.Index = i
		c := call{Minimize: &args}
		callErr := wc.callLocked(ctx, c, &resp)
		mem, ok = <-wc.memMu
		if !ok {
			return CorpusEntry{}, minimizeResponse{}, errSharedMemClosed
		}

		if callErr != nil {
			retErr = callErr
			if !mem.header().rawInMem {
				// An unrecoverable error occurred before minimization began.
				return entryIn, minimizeResponse{}, retErr
			}
			// An unrecoverable error occurred during minimization. mem now
			// holds the raw, unmarshalled bytes of entryIn.Values[i] that
			// caused the error.
			switch entryOut.Values[i].(type) {
			case string:
				entryOut.Values[i] = string(mem.valueCopy())
			case []byte:
				entryOut.Values[i] = mem.valueCopy()
			default:
				panic("impossible")
			}
			entryOut.Data = marshalCorpusFile(entryOut.Values...)
			// Stop minimizing; another unrecoverable error is likely to occur.
			break
		}

		if resp.WroteToMem {
			// Minimization succeeded, and mem holds the marshaled data.
			entryOut.Data = mem.valueCopy()
			entryOut.Values, err = unmarshalCorpusFile(entryOut.Data)
			if err != nil {
				return CorpusEntry{}, minimizeResponse{}, fmt.Errorf("workerClient.minimize unmarshaling minimized value: %v", err)
			}
		}

		// Prepare for next iteration of the loop.
		if args.Timeout != 0 {
			args.Timeout -= resp.Duration
			if args.Timeout <= 0 {
				break
			}
		}
		if args.Limit != 0 {
			args.Limit -= mem.header().count
			if args.Limit <= 0 {
				break
			}
		}
	}
	resp.Count = mem.header().count
	h := sha256.Sum256(entryOut.Data)
	entryOut.Path = fmt.Sprintf("%x", h[:4])
	return entryOut, resp, retErr
}
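
// Budgeting (worked example): args.Timeout and args.Limit are shared across
// all minimizable parameters. With MinimizeTimeout = 60s and two minimizable
// values, if the first minimize call reports resp.Duration = 45s, the second
// runs with args.Timeout = 15s; had the first taken the full 60s, the loop
// would stop before the second. args.Limit shrinks the same way, reduced by
// mem.header().count after each call.
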
// fuzz tells the worker to call the fuzz method. See workerServer.fuzz.
func (wc *workerClient) fuzz(ctx context.Context, entryIn CorpusEntry, args fuzzArgs) (entryOut CorpusEntry, resp fuzzResponse, isInternalError bool, err error) {
	wc.mu.Lock()
	defer wc.mu.Unlock()

	mem, ok := <-wc.memMu
	if !ok {
		return CorpusEntry{}, fuzzResponse{}, true, errSharedMemClosed
	}
	mem.header().count = 0
	inp, err := corpusEntryData(entryIn)
	if err != nil {
		return CorpusEntry{}, fuzzResponse{}, true, err
	}
	mem.setValue(inp)
	wc.memMu <- mem

	c := call{Fuzz: &args}
	callErr := wc.callLocked(ctx, c, &resp)
	if resp.InternalErr != "" {
		return CorpusEntry{}, fuzzResponse{}, true, errors.New(resp.InternalErr)
	}
	mem, ok = <-wc.memMu
	if !ok {
		return CorpusEntry{}, fuzzResponse{}, true, errSharedMemClosed
	}
	defer func() { wc.memMu <- mem }()
	resp.Count = mem.header().count

	if !bytes.Equal(inp, mem.valueRef()) {
		return CorpusEntry{}, fuzzResponse{}, true, errors.New("workerServer.fuzz modified input")
	}
	needEntryOut := callErr != nil || resp.Err != "" ||
		(!args.Warmup && resp.CoverageData != nil)
	if needEntryOut {
		valuesOut, err := unmarshalCorpusFile(inp)
		if err != nil {
			return CorpusEntry{}, fuzzResponse{}, true, fmt.Errorf("unmarshaling fuzz input value after call: %v", err)
		}
		wc.m.r.restore(mem.header().randState, mem.header().randInc)
		if !args.Warmup {
			// Only mutate the valuesOut if fuzzing actually occurred.
			numMutations := ((resp.Count - 1) % chainedMutations) + 1
			for i := int64(0); i < numMutations; i++ {
				wc.m.mutate(valuesOut, cap(mem.valueRef()))
			}
		}
		dataOut := marshalCorpusFile(valuesOut...)

		h := sha256.Sum256(dataOut)
		name := fmt.Sprintf("%x", h[:4])
		entryOut = CorpusEntry{
			Parent:     entryIn.Path,
			Path:       name,
			Data:       dataOut,
			Generation: entryIn.Generation + 1,
		}
		if args.Warmup {
			// The bytes weren't mutated, so if entryIn was a seed corpus value,
			// then entryOut is too.
			entryOut.IsSeed = entryIn.IsSeed
		}
	}

	return entryOut, resp, false, callErr
}

// ping tells the worker to call the ping method. See workerServer.ping.
func (wc *workerClient) ping(ctx context.Context) error {
	wc.mu.Lock()
	defer wc.mu.Unlock()
	c := call{Ping: &pingArgs{}}
	var resp pingResponse
	return wc.callLocked(ctx, c, &resp)
}

// callLocked sends an RPC from the coordinator to the worker process and waits
// for the response. callLocked may be cancelled with ctx.
func (wc *workerClient) callLocked(ctx context.Context, c call, resp any) (err error) {
	enc := json.NewEncoder(wc.fuzzIn)
	dec := json.NewDecoder(&contextReader{ctx: ctx, r: wc.fuzzOut})
	if err := enc.Encode(c); err != nil {
		return err
	}
	return dec.Decode(resp)
}

// contextReader wraps a Reader with a Context. If the context is cancelled
// while the underlying reader is blocked, Read returns immediately.
//
// This is useful for reading from a pipe. Closing a pipe file descriptor does
// not unblock pending Reads on that file descriptor. All copies of the pipe's
// other file descriptor (the write end) must be closed in all processes that
// inherit it. This is difficult to do correctly in the situation we care about
// (process group termination).
type contextReader struct {
	ctx context.Context
	r   io.Reader
}

func (cr *contextReader) Read(b []byte) (int, error) {
	if ctxErr := cr.ctx.Err(); ctxErr != nil {
		return 0, ctxErr
	}
	done := make(chan struct{})

	// This goroutine may stay blocked after Read returns because the underlying
	// read is blocked.
	var n int
	var err error
	go func() {
		n, err = cr.r.Read(b)
		close(done)
	}()

	select {
	case <-cr.ctx.Done():
		return 0, cr.ctx.Err()
	case <-done:
		return n, err
	}
}
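
// Usage sketch (illustrative): wrap the read end of a pipe so a blocked Read
// can be abandoned once ctx is done. Note the spawned goroutine may stay
// parked on the pipe until the peer closes the write end:
//
//	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
//	defer cancel()
//	cr := &contextReader{ctx: ctx, r: fuzzOutR}
//	buf := make([]byte, 4096)
//	n, err := cr.Read(buf) // err == context.DeadlineExceeded if the pipe stays silent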