github.com/google/syzkaller@v0.0.0-20240517125934-c0f1611a36d6/vm/gce/gce.go (about)

     1  // Copyright 2016 syzkaller project authors. All rights reserved.
     2  // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
     3  
     4  // Package gce allows to use Google Compute Engine (GCE) virtual machines as VMs.
     5  // It is assumed that syz-manager also runs on GCE as VMs are created in the current project/zone.
     6  //
     7  // See https://cloud.google.com/compute/docs for details.
     8  // In particular, how to build GCE-compatible images:
     9  // https://cloud.google.com/compute/docs/tutorials/building-images
    10  // Working with serial console:
    11  // https://cloud.google.com/compute/docs/instances/interacting-with-serial-console
    12  package gce
    13  
    14  import (
    15  	"archive/tar"
    16  	"bytes"
    17  	"compress/gzip"
    18  	"errors"
    19  	"fmt"
    20  	"io"
    21  	"os"
    22  	"path/filepath"
    23  	"strings"
    24  	"time"
    25  
    26  	"github.com/google/syzkaller/pkg/config"
    27  	"github.com/google/syzkaller/pkg/gce"
    28  	"github.com/google/syzkaller/pkg/gcs"
    29  	"github.com/google/syzkaller/pkg/kd"
    30  	"github.com/google/syzkaller/pkg/log"
    31  	"github.com/google/syzkaller/pkg/osutil"
    32  	"github.com/google/syzkaller/pkg/report"
    33  	"github.com/google/syzkaller/sys/targets"
    34  	"github.com/google/syzkaller/vm/vmimpl"
    35  )
    36  
// init registers this package's pool constructor with vmimpl under the
// VM type name "gce".
func init() {
	vmimpl.Register("gce", ctor, true)
}
    40  
// Config is the GCE-specific VM configuration that Ctor loads from env.Config.
//
// Before loading, Ctor pre-populates Count with 1, Preemptible with true and
// DisplayDevice with true only when the target arch is amd64. After loading,
// Ctor requires Count to be in [1, 1000], MachineType to be non-empty, and
// exactly one image source: either GCEImage, or the manager's local image
// (in which case GCSPath must be set as the upload destination).
type Config struct {
	Count         int    `json:"count"`          // number of VMs to use
	ZoneID        string `json:"zone_id"`        // GCE zone (if it's different from that of syz-manager)
	MachineType   string `json:"machine_type"`   // GCE machine type (e.g. "n1-highcpu-2")
	GCSPath       string `json:"gcs_path"`       // GCS path to upload image
	GCEImage      string `json:"gce_image"`      // pre-created GCE image to use
	Preemptible   bool   `json:"preemptible"`    // use preemptible VMs if available (defaults to true)
	DisplayDevice bool   `json:"display_device"` // enable a virtual display device
	// Username to connect to ssh-serialport.googleapis.com.
	// Leave empty for non-OS Login GCP projects.
	// Otherwise take the user from `gcloud compute connect-to-serial-port --dry-run`.
	SerialPortUser string `json:"serial_port_user"`
	// A private key to connect to ssh-serialport.googleapis.com.
	// Leave empty for non-OS Login GCP projects.
	// Otherwise generate one and upload it:
	// `gcloud compute os-login ssh-keys add --key-file some-key.pub`.
	SerialPortKey string `json:"serial_port_key"`
}
    59  
// Pool creates GCE-backed VM instances. It is built by Ctor and is what the
// registered constructor returns as a vmimpl.Pool.
//
// env is the generic VM environment passed to Ctor, cfg is the parsed
// GCE-specific config, and GCE is the GCE API context obtained during
// construction. Instances created by the pool share all three.
type Pool struct {
	env            *vmimpl.Env
	cfg            *Config
	GCE            *gce.Context
	consoleReadCmd string // optional: command to read non-standard kernel console
}
    66  
// instance is a single GCE VM created by Pool.Create.
//
// env, cfg, GCE and consoleReadCmd are copied from the owning Pool, and debug
// mirrors env.Debug. name is the GCE instance name ("<env.Name>-<index>") and
// ip is the address returned by GCE.CreateInstance. gceKey is the path of the
// private key generated with ssh-keygen in the instance workdir. sshKey and
// sshUser are what ssh/scp actually use: they come from env, except that an
// env.SSHKey of "GCE" selects gceKey and the "syzkaller" user. closed is
// closed by Close to notify a running Run. consolew is the stdin pipe of the
// most recently started console ssh process (nil until the first Run).
type instance struct {
	env            *vmimpl.Env
	cfg            *Config
	GCE            *gce.Context
	debug          bool
	name           string
	ip             string
	gceKey         string // per-instance private ssh key associated with the instance
	sshKey         string // ssh key
	sshUser        string
	closed         chan bool
	consolew       io.WriteCloser
	consoleReadCmd string // optional: command to read non-standard kernel console
}
    81  
    82  func ctor(env *vmimpl.Env) (vmimpl.Pool, error) {
    83  	return Ctor(env, "")
    84  }
    85  
    86  func Ctor(env *vmimpl.Env, consoleReadCmd string) (*Pool, error) {
    87  	if env.Name == "" {
    88  		return nil, fmt.Errorf("config param name is empty (required for GCE)")
    89  	}
    90  	cfg := &Config{
    91  		Count:       1,
    92  		Preemptible: true,
    93  		// Display device is not supported on other platforms.
    94  		DisplayDevice: env.Arch == targets.AMD64,
    95  	}
    96  	if err := config.LoadData(env.Config, cfg); err != nil {
    97  		return nil, fmt.Errorf("failed to parse gce vm config: %w", err)
    98  	}
    99  	if cfg.Count < 1 || cfg.Count > 1000 {
   100  		return nil, fmt.Errorf("invalid config param count: %v, want [1, 1000]", cfg.Count)
   101  	}
   102  	if env.Debug && cfg.Count > 1 {
   103  		log.Logf(0, "limiting number of VMs from %v to 1 in debug mode", cfg.Count)
   104  		cfg.Count = 1
   105  	}
   106  	if cfg.MachineType == "" {
   107  		return nil, fmt.Errorf("machine_type parameter is empty")
   108  	}
   109  	if cfg.GCEImage == "" && cfg.GCSPath == "" {
   110  		return nil, fmt.Errorf("gcs_path parameter is empty")
   111  	}
   112  	if cfg.GCEImage == "" && env.Image == "" {
   113  		return nil, fmt.Errorf("config param image is empty (required for GCE)")
   114  	}
   115  	if cfg.GCEImage != "" && env.Image != "" {
   116  		return nil, fmt.Errorf("both image and gce_image are specified")
   117  	}
   118  
   119  	GCE, err := initGCE(cfg.ZoneID)
   120  	if err != nil {
   121  		return nil, err
   122  	}
   123  
   124  	log.Logf(0, "GCE initialized: running on %v, internal IP %v, project %v, zone %v, net %v/%v",
   125  		GCE.Instance, GCE.InternalIP, GCE.ProjectID, GCE.ZoneID, GCE.Network, GCE.Subnetwork)
   126  
   127  	if cfg.GCEImage == "" {
   128  		cfg.GCEImage = env.Name
   129  		gcsImage := filepath.Join(cfg.GCSPath, env.Name+"-image.tar.gz")
   130  		log.Logf(0, "uploading image %v to %v...", env.Image, gcsImage)
   131  		if err := uploadImageToGCS(env.Image, gcsImage); err != nil {
   132  			return nil, err
   133  		}
   134  		log.Logf(0, "creating GCE image %v...", cfg.GCEImage)
   135  		if err := GCE.DeleteImage(cfg.GCEImage); err != nil {
   136  			return nil, fmt.Errorf("failed to delete GCE image: %w", err)
   137  		}
   138  		if err := GCE.CreateImage(cfg.GCEImage, gcsImage); err != nil {
   139  			return nil, fmt.Errorf("failed to create GCE image: %w", err)
   140  		}
   141  	}
   142  	pool := &Pool{
   143  		cfg:            cfg,
   144  		env:            env,
   145  		GCE:            GCE,
   146  		consoleReadCmd: consoleReadCmd,
   147  	}
   148  	return pool, nil
   149  }
   150  
   151  func initGCE(zoneID string) (*gce.Context, error) {
   152  	// There happen some transient GCE init errors on and off.
   153  	// Let's try it several times before aborting.
   154  	const (
   155  		gceInitAttempts = 3
   156  		gceInitBackoff  = 5 * time.Second
   157  	)
   158  	var (
   159  		GCE *gce.Context
   160  		err error
   161  	)
   162  	for i := 1; i <= gceInitAttempts; i++ {
   163  		if i > 1 {
   164  			time.Sleep(gceInitBackoff)
   165  		}
   166  		GCE, err = gce.NewContext(zoneID)
   167  		if err == nil {
   168  			return GCE, nil
   169  		}
   170  		log.Logf(0, "init GCE attempt %d/%d failed: %v", i, gceInitAttempts, err)
   171  	}
   172  	return nil, fmt.Errorf("all attempts to init GCE failed: %w", err)
   173  }
   174  
// Count returns the configured number of VMs in the pool
// (Ctor limits it to 1 in debug mode).
func (pool *Pool) Count() int {
	return pool.cfg.Count
}
   178  
   179  func (pool *Pool) Create(workdir string, index int) (vmimpl.Instance, error) {
   180  	name := fmt.Sprintf("%v-%v", pool.env.Name, index)
   181  	// Create SSH key for the instance.
   182  	gceKey := filepath.Join(workdir, "key")
   183  	keygen := osutil.Command("ssh-keygen", "-t", "ed25519", "-N", "", "-C", "syzkaller", "-f", gceKey)
   184  	if out, err := keygen.CombinedOutput(); err != nil {
   185  		return nil, fmt.Errorf("failed to execute ssh-keygen: %w\n%s", err, out)
   186  	}
   187  	gceKeyPub, err := os.ReadFile(gceKey + ".pub")
   188  	if err != nil {
   189  		return nil, fmt.Errorf("failed to read file: %w", err)
   190  	}
   191  
   192  	log.Logf(0, "deleting instance: %v", name)
   193  	if err := pool.GCE.DeleteInstance(name, true); err != nil {
   194  		return nil, err
   195  	}
   196  	log.Logf(0, "creating instance: %v", name)
   197  	ip, err := pool.GCE.CreateInstance(name, pool.cfg.MachineType, pool.cfg.GCEImage,
   198  		string(gceKeyPub), pool.cfg.Preemptible, pool.cfg.DisplayDevice)
   199  	if err != nil {
   200  		return nil, err
   201  	}
   202  
   203  	ok := false
   204  	defer func() {
   205  		if !ok {
   206  			pool.GCE.DeleteInstance(name, true)
   207  		}
   208  	}()
   209  	sshKey := pool.env.SSHKey
   210  	sshUser := pool.env.SSHUser
   211  	if sshKey == "GCE" {
   212  		// Assuming image supports GCE ssh fanciness.
   213  		sshKey = gceKey
   214  		sshUser = "syzkaller"
   215  	}
   216  	log.Logf(0, "wait instance to boot: %v (%v)", name, ip)
   217  	inst := &instance{
   218  		env:            pool.env,
   219  		cfg:            pool.cfg,
   220  		debug:          pool.env.Debug,
   221  		GCE:            pool.GCE,
   222  		name:           name,
   223  		ip:             ip,
   224  		gceKey:         gceKey,
   225  		sshKey:         sshKey,
   226  		sshUser:        sshUser,
   227  		closed:         make(chan bool),
   228  		consoleReadCmd: pool.consoleReadCmd,
   229  	}
   230  	if err := vmimpl.WaitForSSH(pool.env.Debug, 5*time.Minute, ip,
   231  		sshKey, sshUser, pool.env.OS, 22, nil, false); err != nil {
   232  		output, outputErr := inst.getSerialPortOutput()
   233  		if outputErr != nil {
   234  			output = []byte(fmt.Sprintf("failed to get boot output: %v", outputErr))
   235  		}
   236  		return nil, vmimpl.MakeBootError(err, output)
   237  	}
   238  	ok = true
   239  	return inst, nil
   240  }
   241  
   242  func (inst *instance) Close() {
   243  	close(inst.closed)
   244  	inst.GCE.DeleteInstance(inst.name, false)
   245  	if inst.consolew != nil {
   246  		inst.consolew.Close()
   247  	}
   248  }
   249  
   250  func (inst *instance) Forward(port int) (string, error) {
   251  	return fmt.Sprintf("%v:%v", inst.GCE.InternalIP, port), nil
   252  }
   253  
   254  func (inst *instance) Copy(hostSrc string) (string, error) {
   255  	vmDst := "./" + filepath.Base(hostSrc)
   256  	args := append(vmimpl.SCPArgs(true, inst.sshKey, 22, false), hostSrc, inst.sshUser+"@"+inst.ip+":"+vmDst)
   257  	if err := runCmd(inst.debug, "scp", args...); err != nil {
   258  		return "", err
   259  	}
   260  	return vmDst, nil
   261  }
   262  
   263  func (inst *instance) Run(timeout time.Duration, stop <-chan bool, command string) (
   264  	<-chan []byte, <-chan error, error) {
   265  	conRpipe, conWpipe, err := osutil.LongPipe()
   266  	if err != nil {
   267  		return nil, nil, err
   268  	}
   269  
   270  	var conArgs []string
   271  	if inst.consoleReadCmd == "" {
   272  		conArgs = inst.serialPortArgs(false)
   273  	} else {
   274  		conArgs = inst.sshArgs(inst.consoleReadCmd)
   275  	}
   276  	con := osutil.Command("ssh", conArgs...)
   277  	con.Env = []string{}
   278  	con.Stdout = conWpipe
   279  	con.Stderr = conWpipe
   280  	conw, err := con.StdinPipe()
   281  	if err != nil {
   282  		conRpipe.Close()
   283  		conWpipe.Close()
   284  		return nil, nil, err
   285  	}
   286  	if inst.consolew != nil {
   287  		inst.consolew.Close()
   288  	}
   289  	inst.consolew = conw
   290  	if err := con.Start(); err != nil {
   291  		conRpipe.Close()
   292  		conWpipe.Close()
   293  		return nil, nil, fmt.Errorf("failed to connect to console server: %w", err)
   294  	}
   295  	conWpipe.Close()
   296  
   297  	var tee io.Writer
   298  	if inst.debug {
   299  		tee = os.Stdout
   300  	}
   301  	merger := vmimpl.NewOutputMerger(tee)
   302  	var decoder func(data []byte) (int, int, []byte)
   303  	if inst.env.OS == targets.Windows {
   304  		decoder = kd.Decode
   305  	}
   306  	merger.AddDecoder("console", conRpipe, decoder)
   307  	if err := waitForConsoleConnect(merger); err != nil {
   308  		con.Process.Kill()
   309  		merger.Wait()
   310  		return nil, nil, err
   311  	}
   312  	sshRpipe, sshWpipe, err := osutil.LongPipe()
   313  	if err != nil {
   314  		con.Process.Kill()
   315  		merger.Wait()
   316  		sshRpipe.Close()
   317  		return nil, nil, err
   318  	}
   319  	ssh := osutil.Command("ssh", inst.sshArgs(command)...)
   320  	ssh.Stdout = sshWpipe
   321  	ssh.Stderr = sshWpipe
   322  	if err := ssh.Start(); err != nil {
   323  		con.Process.Kill()
   324  		merger.Wait()
   325  		sshRpipe.Close()
   326  		sshWpipe.Close()
   327  		return nil, nil, fmt.Errorf("failed to connect to instance: %w", err)
   328  	}
   329  	sshWpipe.Close()
   330  	merger.Add("ssh", sshRpipe)
   331  
   332  	errc := make(chan error, 1)
   333  	signal := func(err error) {
   334  		select {
   335  		case errc <- err:
   336  		default:
   337  		}
   338  	}
   339  
   340  	go func() {
   341  		select {
   342  		case <-time.After(timeout):
   343  			signal(vmimpl.ErrTimeout)
   344  		case <-stop:
   345  			signal(vmimpl.ErrTimeout)
   346  		case <-inst.closed:
   347  			signal(fmt.Errorf("instance closed"))
   348  		case err := <-merger.Err:
   349  			con.Process.Kill()
   350  			ssh.Process.Kill()
   351  			merger.Wait()
   352  			con.Wait()
   353  			var mergeError *vmimpl.MergerError
   354  			if cmdErr := ssh.Wait(); cmdErr == nil {
   355  				// If the command exited successfully, we got EOF error from merger.
   356  				// But in this case no error has happened and the EOF is expected.
   357  				err = nil
   358  			} else if errors.As(err, &mergeError) && mergeError.R == conRpipe {
   359  				// Console connection must never fail. If it does, it's either
   360  				// instance preemption or a GCE bug. In either case, not a kernel bug.
   361  				log.Logf(0, "%v: gce console connection failed with %v", inst.name, mergeError.Err)
   362  				err = vmimpl.ErrTimeout
   363  			} else {
   364  				// Check if the instance was terminated due to preemption or host maintenance.
   365  				time.Sleep(5 * time.Second) // just to avoid any GCE races
   366  				if !inst.GCE.IsInstanceRunning(inst.name) {
   367  					log.Logf(0, "%v: ssh exited but instance is not running", inst.name)
   368  					err = vmimpl.ErrTimeout
   369  				}
   370  			}
   371  			signal(err)
   372  			return
   373  		}
   374  		con.Process.Kill()
   375  		ssh.Process.Kill()
   376  		merger.Wait()
   377  		con.Wait()
   378  		ssh.Wait()
   379  	}()
   380  	return merger.Output, errc, nil
   381  }
   382  
   383  func waitForConsoleConnect(merger *vmimpl.OutputMerger) error {
   384  	// We've started the console reading ssh command, but it has not necessary connected yet.
   385  	// If we proceed to running the target command right away, we can miss part
   386  	// of console output. During repro we can crash machines very quickly and
   387  	// would miss beginning of a crash. Before ssh starts piping console output,
   388  	// it usually prints:
   389  	// "serialport: Connected to ... port 1 (session ID: ..., active connections: 1)"
   390  	// So we wait for this line, or at least a minute and at least some output.
   391  	timeout := time.NewTimer(time.Minute)
   392  	defer timeout.Stop()
   393  	connectedMsg := []byte("serialport: Connected")
   394  	permissionDeniedMsg := []byte("Permission denied (publickey)")
   395  	var output []byte
   396  	for {
   397  		select {
   398  		case out := <-merger.Output:
   399  			output = append(output, out...)
   400  			if bytes.Contains(output, connectedMsg) {
   401  				// Just to make sure (otherwise we still see trimmed reports).
   402  				time.Sleep(5 * time.Second)
   403  				return nil
   404  			}
   405  			if bytes.Contains(output, permissionDeniedMsg) {
   406  				// This is a GCE bug.
   407  				return fmt.Errorf("broken console: %s", permissionDeniedMsg)
   408  			}
   409  		case <-timeout.C:
   410  			if len(output) == 0 {
   411  				return fmt.Errorf("broken console: no output")
   412  			}
   413  			return nil
   414  		}
   415  	}
   416  }
   417  
   418  func (inst *instance) Diagnose(rep *report.Report) ([]byte, bool) {
   419  	switch inst.env.OS {
   420  	case targets.Linux:
   421  		output, wait, _ := vmimpl.DiagnoseLinux(rep, inst.ssh)
   422  		return output, wait
   423  	case targets.FreeBSD:
   424  		return vmimpl.DiagnoseFreeBSD(inst.consolew)
   425  	case targets.OpenBSD:
   426  		return vmimpl.DiagnoseOpenBSD(inst.consolew)
   427  	}
   428  	return nil, false
   429  }
   430  
   431  func (inst *instance) ssh(args ...string) ([]byte, error) {
   432  	return osutil.RunCmd(time.Minute, "", "ssh", inst.sshArgs(args...)...)
   433  }
   434  
   435  func (inst *instance) sshArgs(args ...string) []string {
   436  	sshArgs := append(vmimpl.SSHArgs(inst.debug, inst.sshKey, 22, false), inst.sshUser+"@"+inst.ip)
   437  	if inst.env.OS == targets.Linux && inst.sshUser != "root" {
   438  		args = []string{"sudo", "bash", "-c", "'" + strings.Join(args, " ") + "'"}
   439  	}
   440  	return append(sshArgs, args...)
   441  }
   442  
   443  func (inst *instance) serialPortArgs(replay bool) []string {
   444  	user := "syzkaller"
   445  	if inst.cfg.SerialPortUser != "" {
   446  		user = inst.cfg.SerialPortUser
   447  	}
   448  	key := inst.gceKey
   449  	if inst.cfg.SerialPortKey != "" {
   450  		key = inst.cfg.SerialPortKey
   451  	}
   452  	replayArg := ""
   453  	if replay {
   454  		replayArg = ".replay-lines=10000"
   455  	}
   456  	conAddr := fmt.Sprintf("%v.%v.%v.%s.port=1%s@%v-ssh-serialport.googleapis.com",
   457  		inst.GCE.ProjectID, inst.GCE.ZoneID, inst.name, user, replayArg, inst.GCE.RegionID)
   458  	conArgs := append(vmimpl.SSHArgs(inst.debug, key, 9600, false), conAddr)
   459  	// TODO(blackgnezdo): Remove this once ssh-serialport.googleapis.com stops using
   460  	// host key algorithm: ssh-rsa.
   461  	return append(conArgs, "-o", "HostKeyAlgorithms=+ssh-rsa")
   462  }
   463  
   464  func (inst *instance) getSerialPortOutput() ([]byte, error) {
   465  	conRpipe, conWpipe, err := osutil.LongPipe()
   466  	if err != nil {
   467  		return nil, err
   468  	}
   469  	defer conRpipe.Close()
   470  	defer conWpipe.Close()
   471  
   472  	con := osutil.Command("ssh", inst.serialPortArgs(true)...)
   473  	con.Env = []string{}
   474  	con.Stdout = conWpipe
   475  	con.Stderr = conWpipe
   476  	if _, err := con.StdinPipe(); err != nil { // SSH would close connection on stdin EOF
   477  		return nil, err
   478  	}
   479  	if err := con.Start(); err != nil {
   480  		return nil, fmt.Errorf("failed to connect to console server: %w", err)
   481  	}
   482  	conWpipe.Close()
   483  	done := make(chan bool)
   484  	go func() {
   485  		timeout := time.NewTimer(time.Minute)
   486  		defer timeout.Stop()
   487  		select {
   488  		case <-done:
   489  		case <-timeout.C:
   490  		}
   491  		con.Process.Kill()
   492  	}()
   493  	var output []byte
   494  	buf := make([]byte, 64<<10)
   495  	for {
   496  		n, err := conRpipe.Read(buf)
   497  		if err != nil || n == 0 {
   498  			break
   499  		}
   500  		output = append(output, buf[:n]...)
   501  	}
   502  	close(done)
   503  	con.Wait()
   504  	return output, nil
   505  }
   506  
   507  func uploadImageToGCS(localImage, gcsImage string) error {
   508  	GCS, err := gcs.NewClient()
   509  	if err != nil {
   510  		return fmt.Errorf("failed to create GCS client: %w", err)
   511  	}
   512  	defer GCS.Close()
   513  
   514  	localReader, err := os.Open(localImage)
   515  	if err != nil {
   516  		return fmt.Errorf("failed to open image file: %w", err)
   517  	}
   518  	defer localReader.Close()
   519  	localStat, err := localReader.Stat()
   520  	if err != nil {
   521  		return fmt.Errorf("failed to stat image file: %w", err)
   522  	}
   523  
   524  	gcsWriter, err := GCS.FileWriter(gcsImage)
   525  	if err != nil {
   526  		return fmt.Errorf("failed to upload image: %w", err)
   527  	}
   528  	defer gcsWriter.Close()
   529  
   530  	gzipWriter := gzip.NewWriter(gcsWriter)
   531  	tarWriter := tar.NewWriter(gzipWriter)
   532  	tarHeader := &tar.Header{
   533  		Name:     "disk.raw",
   534  		Typeflag: tar.TypeReg,
   535  		Mode:     0640,
   536  		Size:     localStat.Size(),
   537  		ModTime:  time.Now(),
   538  		Uname:    "syzkaller",
   539  		Gname:    "syzkaller",
   540  	}
   541  	setGNUFormat(tarHeader)
   542  	if err := tarWriter.WriteHeader(tarHeader); err != nil {
   543  		return fmt.Errorf("failed to write image tar header: %w", err)
   544  	}
   545  	if _, err := io.Copy(tarWriter, localReader); err != nil {
   546  		return fmt.Errorf("failed to write image file: %w", err)
   547  	}
   548  	if err := tarWriter.Close(); err != nil {
   549  		return fmt.Errorf("failed to write image file: %w", err)
   550  	}
   551  	if err := gzipWriter.Close(); err != nil {
   552  		return fmt.Errorf("failed to write image file: %w", err)
   553  	}
   554  	if err := gcsWriter.Close(); err != nil {
   555  		return fmt.Errorf("failed to write image file: %w", err)
   556  	}
   557  	return nil
   558  }
   559  
   560  func runCmd(debug bool, bin string, args ...string) error {
   561  	if debug {
   562  		log.Logf(0, "running command: %v %#v", bin, args)
   563  	}
   564  	output, err := osutil.RunCmd(time.Minute, "", bin, args...)
   565  	if debug {
   566  		log.Logf(0, "result: %v\n%s", err, output)
   567  	}
   568  	return err
   569  }