github.com/google/syzkaller@v0.0.0-20251211124644-a066d2bc4b02/vm/gce/gce.go (about)

     1  // Copyright 2016 syzkaller project authors. All rights reserved.
     2  // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
     3  
     4  // Package gce allows to use Google Compute Engine (GCE) virtual machines as VMs.
     5  // It is assumed that syz-manager also runs on GCE as VMs are created in the current project/zone.
     6  //
     7  // See https://cloud.google.com/compute/docs for details.
     8  // In particular, how to build GCE-compatible images:
     9  // https://cloud.google.com/compute/docs/tutorials/building-images
    10  // Working with serial console:
    11  // https://cloud.google.com/compute/docs/instances/interacting-with-serial-console
    12  package gce
    13  
    14  import (
    15  	"archive/tar"
    16  	"bytes"
    17  	"compress/gzip"
    18  	"context"
    19  	"errors"
    20  	"fmt"
    21  	"io"
    22  	"os"
    23  	"path/filepath"
    24  	"strings"
    25  	"time"
    26  
    27  	"github.com/google/syzkaller/pkg/config"
    28  	"github.com/google/syzkaller/pkg/gce"
    29  	"github.com/google/syzkaller/pkg/gcs"
    30  	"github.com/google/syzkaller/pkg/kd"
    31  	"github.com/google/syzkaller/pkg/log"
    32  	"github.com/google/syzkaller/pkg/osutil"
    33  	"github.com/google/syzkaller/pkg/report"
    34  	"github.com/google/syzkaller/sys/targets"
    35  	"github.com/google/syzkaller/vm/vmimpl"
    36  )
    37  
    38  func init() {
    39  	vmimpl.Register("gce", vmimpl.Type{
    40  		Ctor:        ctor,
    41  		Overcommit:  true,
    42  		Preemptible: true,
    43  	})
    44  }
    45  
// Config is the GCE-specific VM configuration.
// Ctor fills in defaults and then loads the user-provided values on top of them.
type Config struct {
	Count         int    `json:"count"`          // number of VMs to use
	ZoneID        string `json:"zone_id"`        // GCE zone (if it's different from that of syz-manager)
	MachineType   string `json:"machine_type"`   // GCE machine type (e.g. "n1-highcpu-2")
	GCSPath       string `json:"gcs_path"`       // GCS path to upload image
	GCEImage      string `json:"gce_image"`      // pre-created GCE image to use
	Preemptible   bool   `json:"preemptible"`    // use preemptible VMs if available (defaults to true)
	DisplayDevice bool   `json:"display_device"` // enable a virtual display device
	// Username to connect to ssh-serialport.googleapis.com.
	// Leave empty for non-OS Login GCP projects.
	// Otherwise take the user from `gcloud compute connect-to-serial-port --dry-run`.
	SerialPortUser string `json:"serial_port_user"`
	// A private key to connect to ssh-serialport.googleapis.com.
	// Leave empty for non-OS Login GCP projects.
	// Otherwise generate one and upload it:
	// `gcloud compute os-login ssh-keys add --key-file some-key.pub`.
	SerialPortKey string `json:"serial_port_key"`
}
    64  
// Pool is the GCE VM pool built by Ctor. It keeps the environment, the parsed
// Config and the GCE context that Create uses to start instances.
type Pool struct {
	env            *vmimpl.Env
	cfg            *Config
	GCE            *gce.Context
	consoleReadCmd string // optional: command to read non-standard kernel console
}
    71  
// instance is a single GCE VM created by Pool.Create, together with the SSH
// options and console state used to run commands on it and read its output.
type instance struct {
	env   *vmimpl.Env
	cfg   *Config
	GCE   *gce.Context
	debug bool
	name  string
	vmimpl.SSHOptions
	gceKey         string // per-instance private ssh key associated with the instance
	closed         chan bool
	consolew       io.WriteCloser
	consoleReadCmd string // optional: command to read non-standard kernel console
	timeouts       targets.Timeouts
}
    85  
    86  func ctor(env *vmimpl.Env) (vmimpl.Pool, error) {
    87  	return Ctor(env, "")
    88  }
    89  
    90  func Ctor(env *vmimpl.Env, consoleReadCmd string) (*Pool, error) {
    91  	if env.Name == "" {
    92  		return nil, fmt.Errorf("config param name is empty (required for GCE)")
    93  	}
    94  	cfg := &Config{
    95  		Count:       1,
    96  		Preemptible: true,
    97  		// Display device is not supported on other platforms.
    98  		DisplayDevice: env.Arch == targets.AMD64,
    99  	}
   100  	if err := config.LoadData(env.Config, cfg); err != nil {
   101  		return nil, fmt.Errorf("failed to parse gce vm config: %w", err)
   102  	}
   103  	if cfg.Count < 1 || cfg.Count > 1000 {
   104  		return nil, fmt.Errorf("invalid config param count: %v, want [1, 1000]", cfg.Count)
   105  	}
   106  	if cfg.MachineType == "" {
   107  		return nil, fmt.Errorf("machine_type parameter is empty")
   108  	}
   109  	if cfg.GCEImage == "" && cfg.GCSPath == "" {
   110  		return nil, fmt.Errorf("gcs_path parameter is empty")
   111  	}
   112  	if cfg.GCEImage == "" && env.Image == "" {
   113  		return nil, fmt.Errorf("config param image is empty (required for GCE)")
   114  	}
   115  	if cfg.GCEImage != "" && env.Image != "" {
   116  		return nil, fmt.Errorf("both image and gce_image are specified")
   117  	}
   118  
   119  	GCE, err := initGCE(cfg.ZoneID)
   120  	if err != nil {
   121  		return nil, err
   122  	}
   123  
   124  	log.Logf(0, "GCE initialized: running on %v, internal IP %v, project %v, zone %v, net %v/%v",
   125  		GCE.Instance, GCE.InternalIP, GCE.ProjectID, GCE.ZoneID, GCE.Network, GCE.Subnetwork)
   126  
   127  	if cfg.GCEImage == "" {
   128  		cfg.GCEImage = env.Name
   129  		gcsImage := filepath.Join(cfg.GCSPath, env.Name+"-image.tar.gz")
   130  		log.Logf(0, "uploading image %v to %v...", env.Image, gcsImage)
   131  		if err := uploadImageToGCS(env.Image, gcsImage); err != nil {
   132  			return nil, err
   133  		}
   134  		log.Logf(0, "creating GCE image %v...", cfg.GCEImage)
   135  		if err := GCE.DeleteImage(cfg.GCEImage); err != nil {
   136  			return nil, fmt.Errorf("failed to delete GCE image: %w", err)
   137  		}
   138  		if err := GCE.CreateImage(cfg.GCEImage, gcsImage, env.OS); err != nil {
   139  			return nil, fmt.Errorf("failed to create GCE image: %w", err)
   140  		}
   141  	}
   142  	pool := &Pool{
   143  		cfg:            cfg,
   144  		env:            env,
   145  		GCE:            GCE,
   146  		consoleReadCmd: consoleReadCmd,
   147  	}
   148  	return pool, nil
   149  }
   150  
   151  func initGCE(zoneID string) (*gce.Context, error) {
   152  	// There happen some transient GCE init errors on and off.
   153  	// Let's try it several times before aborting.
   154  	const (
   155  		gceInitAttempts = 3
   156  		gceInitBackoff  = 5 * time.Second
   157  	)
   158  	var (
   159  		GCE *gce.Context
   160  		err error
   161  	)
   162  	for i := 1; i <= gceInitAttempts; i++ {
   163  		if i > 1 {
   164  			time.Sleep(gceInitBackoff)
   165  		}
   166  		GCE, err = gce.NewContext(zoneID)
   167  		if err == nil {
   168  			return GCE, nil
   169  		}
   170  		log.Logf(0, "init GCE attempt %d/%d failed: %v", i, gceInitAttempts, err)
   171  	}
   172  	return nil, fmt.Errorf("all attempts to init GCE failed: %w", err)
   173  }
   174  
   175  func (pool *Pool) Count() int {
   176  	return pool.cfg.Count
   177  }
   178  
   179  func (pool *Pool) Create(_ context.Context, workdir string, index int) (vmimpl.Instance, error) {
   180  	name := fmt.Sprintf("%v-%v", pool.env.Name, index)
   181  	// Create SSH key for the instance.
   182  	gceKey := filepath.Join(workdir, "key")
   183  	keygen := osutil.Command("ssh-keygen", "-t", "ed25519", "-N", "", "-C", "syzkaller", "-f", gceKey)
   184  	if out, err := keygen.CombinedOutput(); err != nil {
   185  		return nil, fmt.Errorf("failed to execute ssh-keygen: %w\n%s", err, out)
   186  	}
   187  	gceKeyPub, err := os.ReadFile(gceKey + ".pub")
   188  	if err != nil {
   189  		return nil, fmt.Errorf("failed to read file: %w", err)
   190  	}
   191  
   192  	log.Logf(0, "deleting instance: %v", name)
   193  	if err := pool.GCE.DeleteInstance(name, true); err != nil {
   194  		return nil, err
   195  	}
   196  	log.Logf(0, "creating instance: %v", name)
   197  	ip, err := pool.GCE.CreateInstance(name, pool.cfg.MachineType, pool.cfg.GCEImage,
   198  		string(gceKeyPub), pool.cfg.Preemptible, pool.cfg.DisplayDevice)
   199  	if err != nil {
   200  		return nil, err
   201  	}
   202  
   203  	ok := false
   204  	defer func() {
   205  		if !ok {
   206  			pool.GCE.DeleteInstance(name, true)
   207  		}
   208  	}()
   209  	sshKey := pool.env.SSHKey
   210  	sshUser := pool.env.SSHUser
   211  	if sshKey == "GCE" {
   212  		// Assuming image supports GCE ssh fanciness.
   213  		sshKey = gceKey
   214  		sshUser = "syzkaller"
   215  	}
   216  	log.Logf(0, "wait instance to boot: %v (%v)", name, ip)
   217  	inst := &instance{
   218  		env:   pool.env,
   219  		cfg:   pool.cfg,
   220  		debug: pool.env.Debug,
   221  		GCE:   pool.GCE,
   222  		name:  name,
   223  		SSHOptions: vmimpl.SSHOptions{
   224  			Addr: ip,
   225  			Port: 22,
   226  			Key:  sshKey,
   227  			User: sshUser,
   228  		},
   229  
   230  		gceKey: gceKey,
   231  
   232  		closed:         make(chan bool),
   233  		consoleReadCmd: pool.consoleReadCmd,
   234  		timeouts:       pool.env.Timeouts,
   235  	}
   236  	if err := vmimpl.WaitForSSH(5*time.Minute, inst.SSHOptions,
   237  		pool.env.OS, nil, false, pool.env.Debug); err != nil {
   238  		output, outputErr := inst.getSerialPortOutput()
   239  		if outputErr != nil {
   240  			output = []byte(fmt.Sprintf("failed to get boot output: %v", outputErr))
   241  		}
   242  		return nil, vmimpl.MakeBootError(err, output)
   243  	}
   244  	ok = true
   245  	return inst, nil
   246  }
   247  
   248  func (inst *instance) Close() error {
   249  	close(inst.closed)
   250  	err := inst.GCE.DeleteInstance(inst.name, false)
   251  	if inst.consolew != nil {
   252  		err2 := inst.consolew.Close()
   253  		if err == nil {
   254  			err = err2
   255  		}
   256  	}
   257  	return err
   258  }
   259  
   260  func (inst *instance) Forward(port int) (string, error) {
   261  	return fmt.Sprintf("%v:%v", inst.GCE.InternalIP, port), nil
   262  }
   263  
   264  func (inst *instance) Copy(hostSrc string) (string, error) {
   265  	vmDst := "./" + filepath.Base(hostSrc)
   266  	args := append(vmimpl.SCPArgs(true, inst.Key, inst.Port, false),
   267  		hostSrc, inst.User+"@"+inst.Addr+":"+vmDst)
   268  	if err := runCmd(inst.debug, "scp", args...); err != nil {
   269  		return "", err
   270  	}
   271  	return vmDst, nil
   272  }
   273  
   274  func (inst *instance) Run(ctx context.Context, command string) (
   275  	<-chan []byte, <-chan error, error) {
   276  	conRpipe, conWpipe, err := osutil.LongPipe()
   277  	if err != nil {
   278  		return nil, nil, err
   279  	}
   280  
   281  	var conArgs []string
   282  	if inst.consoleReadCmd == "" {
   283  		conArgs = inst.serialPortArgs(false)
   284  	} else {
   285  		conArgs = inst.sshArgs(inst.consoleReadCmd)
   286  	}
   287  	con := osutil.Command("ssh", conArgs...)
   288  	con.Env = []string{}
   289  	con.Stdout = conWpipe
   290  	con.Stderr = conWpipe
   291  	conw, err := con.StdinPipe()
   292  	if err != nil {
   293  		conRpipe.Close()
   294  		conWpipe.Close()
   295  		return nil, nil, err
   296  	}
   297  	if inst.consolew != nil {
   298  		inst.consolew.Close()
   299  	}
   300  	inst.consolew = conw
   301  	if err := con.Start(); err != nil {
   302  		conRpipe.Close()
   303  		conWpipe.Close()
   304  		return nil, nil, fmt.Errorf("failed to connect to console server: %w", err)
   305  	}
   306  	conWpipe.Close()
   307  
   308  	var tee io.Writer
   309  	if inst.debug {
   310  		tee = os.Stdout
   311  	}
   312  	merger := vmimpl.NewOutputMerger(tee)
   313  	var decoder func(data []byte) (int, int, []byte)
   314  	if inst.env.OS == targets.Windows {
   315  		decoder = kd.Decode
   316  	}
   317  	merger.AddDecoder("console", conRpipe, decoder)
   318  	if err := waitForConsoleConnect(merger); err != nil {
   319  		con.Process.Kill()
   320  		merger.Wait()
   321  		return nil, nil, err
   322  	}
   323  	sshRpipe, sshWpipe, err := osutil.LongPipe()
   324  	if err != nil {
   325  		con.Process.Kill()
   326  		merger.Wait()
   327  		sshRpipe.Close()
   328  		return nil, nil, err
   329  	}
   330  	ssh := osutil.Command("ssh", inst.sshArgs(command)...)
   331  	ssh.Stdout = sshWpipe
   332  	ssh.Stderr = sshWpipe
   333  	if err := ssh.Start(); err != nil {
   334  		con.Process.Kill()
   335  		merger.Wait()
   336  		sshRpipe.Close()
   337  		sshWpipe.Close()
   338  		return nil, nil, fmt.Errorf("failed to connect to instance: %w", err)
   339  	}
   340  	sshWpipe.Close()
   341  	merger.Add("ssh", sshRpipe)
   342  
   343  	return vmimpl.Multiplex(ctx, ssh, merger, vmimpl.MultiplexConfig{
   344  		Console: vmimpl.CmdCloser{Cmd: con},
   345  		Close:   inst.closed,
   346  		Debug:   inst.debug,
   347  		Scale:   inst.timeouts.Scale,
   348  		IgnoreError: func(err error) bool {
   349  			var mergeError *vmimpl.MergerError
   350  			if errors.As(err, &mergeError) && mergeError.R == conRpipe {
   351  				// Console connection must never fail. If it does, it's either
   352  				// instance preemption or a GCE bug. In either case, not a kernel bug.
   353  				log.Logf(0, "%v: gce console connection failed with %v", inst.name, mergeError.Err)
   354  				return true
   355  			} else {
   356  				// Check if the instance was terminated due to preemption or host maintenance.
   357  				// vmimpl.Multiplex() already adds a delay, so we've already waited enough
   358  				// to let GCE VM status updates propagate.
   359  				if !inst.GCE.IsInstanceRunning(inst.name) {
   360  					log.Logf(0, "%v: ssh exited but instance is not running", inst.name)
   361  					return true
   362  				}
   363  			}
   364  			return false
   365  		},
   366  	})
   367  }
   368  
   369  func waitForConsoleConnect(merger *vmimpl.OutputMerger) error {
   370  	// We've started the console reading ssh command, but it has not necessary connected yet.
   371  	// If we proceed to running the target command right away, we can miss part
   372  	// of console output. During repro we can crash machines very quickly and
   373  	// would miss beginning of a crash. Before ssh starts piping console output,
   374  	// it usually prints:
   375  	// "serialport: Connected to ... port 1 (session ID: ..., active connections: 1)"
   376  	// So we wait for this line, or at least a minute and at least some output.
   377  	timeout := time.NewTimer(time.Minute)
   378  	defer timeout.Stop()
   379  	connectedMsg := []byte("serialport: Connected")
   380  	permissionDeniedMsg := []byte("Permission denied (publickey)")
   381  	var output []byte
   382  	for {
   383  		select {
   384  		case out := <-merger.Output:
   385  			output = append(output, out...)
   386  			if bytes.Contains(output, connectedMsg) {
   387  				// Just to make sure (otherwise we still see trimmed reports).
   388  				time.Sleep(5 * time.Second)
   389  				return nil
   390  			}
   391  			if bytes.Contains(output, permissionDeniedMsg) {
   392  				// This is a GCE bug.
   393  				return fmt.Errorf("broken console: %s", permissionDeniedMsg)
   394  			}
   395  		case <-timeout.C:
   396  			if len(output) == 0 {
   397  				return fmt.Errorf("broken console: no output")
   398  			}
   399  			return nil
   400  		}
   401  	}
   402  }
   403  
   404  func (inst *instance) Diagnose(rep *report.Report) ([]byte, bool) {
   405  	switch inst.env.OS {
   406  	case targets.Linux:
   407  		output, wait, _ := vmimpl.DiagnoseLinux(rep, inst.ssh)
   408  		return output, wait
   409  	case targets.FreeBSD:
   410  		return vmimpl.DiagnoseFreeBSD(inst.consolew)
   411  	case targets.OpenBSD:
   412  		return vmimpl.DiagnoseOpenBSD(inst.consolew)
   413  	}
   414  	return nil, false
   415  }
   416  
   417  func (inst *instance) ssh(args ...string) ([]byte, error) {
   418  	return osutil.RunCmd(time.Minute, "", "ssh", inst.sshArgs(args...)...)
   419  }
   420  
   421  func (inst *instance) sshArgs(args ...string) []string {
   422  	sshArgs := append(vmimpl.SSHArgs(inst.debug, inst.Key, 22, false), inst.User+"@"+inst.Addr)
   423  	if inst.env.OS == targets.Linux && inst.User != "root" {
   424  		args = []string{"sudo", "bash", "-c", "'" + strings.Join(args, " ") + "'"}
   425  	}
   426  	return append(sshArgs, args...)
   427  }
   428  
   429  func (inst *instance) serialPortArgs(replay bool) []string {
   430  	user := "syzkaller"
   431  	if inst.cfg.SerialPortUser != "" {
   432  		user = inst.cfg.SerialPortUser
   433  	}
   434  	key := inst.gceKey
   435  	if inst.cfg.SerialPortKey != "" {
   436  		key = inst.cfg.SerialPortKey
   437  	}
   438  	replayArg := ""
   439  	if replay {
   440  		replayArg = ".replay-lines=10000"
   441  	}
   442  	conAddr := fmt.Sprintf("%v.%v.%v.%s.port=1%s@%v-ssh-serialport.googleapis.com",
   443  		inst.GCE.ProjectID, inst.GCE.ZoneID, inst.name, user, replayArg, inst.GCE.RegionID)
   444  	conArgs := append(vmimpl.SSHArgs(inst.debug, key, 9600, false), conAddr)
   445  	// TODO(blackgnezdo): Remove this once ssh-serialport.googleapis.com stops using
   446  	// host key algorithm: ssh-rsa.
   447  	return append(conArgs, "-o", "HostKeyAlgorithms=+ssh-rsa")
   448  }
   449  
   450  func (inst *instance) getSerialPortOutput() ([]byte, error) {
   451  	conRpipe, conWpipe, err := osutil.LongPipe()
   452  	if err != nil {
   453  		return nil, err
   454  	}
   455  	defer conRpipe.Close()
   456  	defer conWpipe.Close()
   457  
   458  	con := osutil.Command("ssh", inst.serialPortArgs(true)...)
   459  	con.Env = []string{}
   460  	con.Stdout = conWpipe
   461  	con.Stderr = conWpipe
   462  	if _, err := con.StdinPipe(); err != nil { // SSH would close connection on stdin EOF
   463  		return nil, err
   464  	}
   465  	if err := con.Start(); err != nil {
   466  		return nil, fmt.Errorf("failed to connect to console server: %w", err)
   467  	}
   468  	conWpipe.Close()
   469  	done := make(chan bool)
   470  	go func() {
   471  		timeout := time.NewTimer(time.Minute)
   472  		defer timeout.Stop()
   473  		select {
   474  		case <-done:
   475  		case <-timeout.C:
   476  		}
   477  		con.Process.Kill()
   478  	}()
   479  	var output []byte
   480  	buf := make([]byte, 64<<10)
   481  	for {
   482  		n, err := conRpipe.Read(buf)
   483  		if err != nil || n == 0 {
   484  			break
   485  		}
   486  		output = append(output, buf[:n]...)
   487  	}
   488  	close(done)
   489  	con.Wait()
   490  	return output, nil
   491  }
   492  
   493  func uploadImageToGCS(localImage, gcsImage string) error {
   494  	GCS, err := gcs.NewClient(context.Background())
   495  	if err != nil {
   496  		return fmt.Errorf("failed to create GCS client: %w", err)
   497  	}
   498  	defer GCS.Close()
   499  
   500  	localReader, err := os.Open(localImage)
   501  	if err != nil {
   502  		return fmt.Errorf("failed to open image file: %w", err)
   503  	}
   504  	defer localReader.Close()
   505  	localStat, err := localReader.Stat()
   506  	if err != nil {
   507  		return fmt.Errorf("failed to stat image file: %w", err)
   508  	}
   509  
   510  	gcsWriter, err := GCS.FileWriter(gcsImage, "", "")
   511  	if err != nil {
   512  		return fmt.Errorf("failed to upload image: %w", err)
   513  	}
   514  	defer gcsWriter.Close()
   515  
   516  	gzipWriter := gzip.NewWriter(gcsWriter)
   517  	tarWriter := tar.NewWriter(gzipWriter)
   518  	tarHeader := &tar.Header{
   519  		Name:     "disk.raw",
   520  		Typeflag: tar.TypeReg,
   521  		Mode:     0640,
   522  		Size:     localStat.Size(),
   523  		ModTime:  time.Now(),
   524  		Uname:    "syzkaller",
   525  		Gname:    "syzkaller",
   526  	}
   527  	setGNUFormat(tarHeader)
   528  	if err := tarWriter.WriteHeader(tarHeader); err != nil {
   529  		return fmt.Errorf("failed to write image tar header: %w", err)
   530  	}
   531  	if _, err := io.Copy(tarWriter, localReader); err != nil {
   532  		return fmt.Errorf("failed to write image file: %w", err)
   533  	}
   534  	if err := tarWriter.Close(); err != nil {
   535  		return fmt.Errorf("failed to write image file: %w", err)
   536  	}
   537  	if err := gzipWriter.Close(); err != nil {
   538  		return fmt.Errorf("failed to write image file: %w", err)
   539  	}
   540  	if err := gcsWriter.Close(); err != nil {
   541  		return fmt.Errorf("failed to write image file: %w", err)
   542  	}
   543  	return nil
   544  }
   545  
   546  func runCmd(debug bool, bin string, args ...string) error {
   547  	if debug {
   548  		log.Logf(0, "running command: %v %#v", bin, args)
   549  	}
   550  	output, err := osutil.RunCmd(time.Minute, "", bin, args...)
   551  	if debug {
   552  		log.Logf(0, "result: %v\n%s", err, output)
   553  	}
   554  	return err
   555  }