github.com/google/syzkaller@v0.0.0-20240517125934-c0f1611a36d6/vm/gce/gce.go (about) 1 // Copyright 2016 syzkaller project authors. All rights reserved. 2 // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. 3 4 // Package gce allows to use Google Compute Engine (GCE) virtual machines as VMs. 5 // It is assumed that syz-manager also runs on GCE as VMs are created in the current project/zone. 6 // 7 // See https://cloud.google.com/compute/docs for details. 8 // In particular, how to build GCE-compatible images: 9 // https://cloud.google.com/compute/docs/tutorials/building-images 10 // Working with serial console: 11 // https://cloud.google.com/compute/docs/instances/interacting-with-serial-console 12 package gce 13 14 import ( 15 "archive/tar" 16 "bytes" 17 "compress/gzip" 18 "errors" 19 "fmt" 20 "io" 21 "os" 22 "path/filepath" 23 "strings" 24 "time" 25 26 "github.com/google/syzkaller/pkg/config" 27 "github.com/google/syzkaller/pkg/gce" 28 "github.com/google/syzkaller/pkg/gcs" 29 "github.com/google/syzkaller/pkg/kd" 30 "github.com/google/syzkaller/pkg/log" 31 "github.com/google/syzkaller/pkg/osutil" 32 "github.com/google/syzkaller/pkg/report" 33 "github.com/google/syzkaller/sys/targets" 34 "github.com/google/syzkaller/vm/vmimpl" 35 ) 36 37 func init() { 38 vmimpl.Register("gce", ctor, true) 39 } 40 41 type Config struct { 42 Count int `json:"count"` // number of VMs to use 43 ZoneID string `json:"zone_id"` // GCE zone (if it's different from that of syz-manager) 44 MachineType string `json:"machine_type"` // GCE machine type (e.g. "n1-highcpu-2") 45 GCSPath string `json:"gcs_path"` // GCS path to upload image 46 GCEImage string `json:"gce_image"` // pre-created GCE image to use 47 Preemptible bool `json:"preemptible"` // use preemptible VMs if available (defaults to true) 48 DisplayDevice bool `json:"display_device"` // enable a virtual display device 49 // Username to connect to ssh-serialport.googleapis.com. 50 // Leave empty for non-OS Login GCP projects. 51 // Otherwise take the user from `gcloud compute connect-to-serial-port --dry-run`. 52 SerialPortUser string `json:"serial_port_user"` 53 // A private key to connect to ssh-serialport.googleapis.com. 54 // Leave empty for non-OS Login GCP projects. 55 // Otherwise generate one and upload it: 56 // `gcloud compute os-login ssh-keys add --key-file some-key.pub`. 57 SerialPortKey string `json:"serial_port_key"` 58 } 59 60 type Pool struct { 61 env *vmimpl.Env 62 cfg *Config 63 GCE *gce.Context 64 consoleReadCmd string // optional: command to read non-standard kernel console 65 } 66 67 type instance struct { 68 env *vmimpl.Env 69 cfg *Config 70 GCE *gce.Context 71 debug bool 72 name string 73 ip string 74 gceKey string // per-instance private ssh key associated with the instance 75 sshKey string // ssh key 76 sshUser string 77 closed chan bool 78 consolew io.WriteCloser 79 consoleReadCmd string // optional: command to read non-standard kernel console 80 } 81 82 func ctor(env *vmimpl.Env) (vmimpl.Pool, error) { 83 return Ctor(env, "") 84 } 85 86 func Ctor(env *vmimpl.Env, consoleReadCmd string) (*Pool, error) { 87 if env.Name == "" { 88 return nil, fmt.Errorf("config param name is empty (required for GCE)") 89 } 90 cfg := &Config{ 91 Count: 1, 92 Preemptible: true, 93 // Display device is not supported on other platforms. 94 DisplayDevice: env.Arch == targets.AMD64, 95 } 96 if err := config.LoadData(env.Config, cfg); err != nil { 97 return nil, fmt.Errorf("failed to parse gce vm config: %w", err) 98 } 99 if cfg.Count < 1 || cfg.Count > 1000 { 100 return nil, fmt.Errorf("invalid config param count: %v, want [1, 1000]", cfg.Count) 101 } 102 if env.Debug && cfg.Count > 1 { 103 log.Logf(0, "limiting number of VMs from %v to 1 in debug mode", cfg.Count) 104 cfg.Count = 1 105 } 106 if cfg.MachineType == "" { 107 return nil, fmt.Errorf("machine_type parameter is empty") 108 } 109 if cfg.GCEImage == "" && cfg.GCSPath == "" { 110 return nil, fmt.Errorf("gcs_path parameter is empty") 111 } 112 if cfg.GCEImage == "" && env.Image == "" { 113 return nil, fmt.Errorf("config param image is empty (required for GCE)") 114 } 115 if cfg.GCEImage != "" && env.Image != "" { 116 return nil, fmt.Errorf("both image and gce_image are specified") 117 } 118 119 GCE, err := initGCE(cfg.ZoneID) 120 if err != nil { 121 return nil, err 122 } 123 124 log.Logf(0, "GCE initialized: running on %v, internal IP %v, project %v, zone %v, net %v/%v", 125 GCE.Instance, GCE.InternalIP, GCE.ProjectID, GCE.ZoneID, GCE.Network, GCE.Subnetwork) 126 127 if cfg.GCEImage == "" { 128 cfg.GCEImage = env.Name 129 gcsImage := filepath.Join(cfg.GCSPath, env.Name+"-image.tar.gz") 130 log.Logf(0, "uploading image %v to %v...", env.Image, gcsImage) 131 if err := uploadImageToGCS(env.Image, gcsImage); err != nil { 132 return nil, err 133 } 134 log.Logf(0, "creating GCE image %v...", cfg.GCEImage) 135 if err := GCE.DeleteImage(cfg.GCEImage); err != nil { 136 return nil, fmt.Errorf("failed to delete GCE image: %w", err) 137 } 138 if err := GCE.CreateImage(cfg.GCEImage, gcsImage); err != nil { 139 return nil, fmt.Errorf("failed to create GCE image: %w", err) 140 } 141 } 142 pool := &Pool{ 143 cfg: cfg, 144 env: env, 145 GCE: GCE, 146 consoleReadCmd: consoleReadCmd, 147 } 148 return pool, nil 149 } 150 151 func initGCE(zoneID string) (*gce.Context, error) { 152 // There happen some transient GCE init errors on and off. 153 // Let's try it several times before aborting. 154 const ( 155 gceInitAttempts = 3 156 gceInitBackoff = 5 * time.Second 157 ) 158 var ( 159 GCE *gce.Context 160 err error 161 ) 162 for i := 1; i <= gceInitAttempts; i++ { 163 if i > 1 { 164 time.Sleep(gceInitBackoff) 165 } 166 GCE, err = gce.NewContext(zoneID) 167 if err == nil { 168 return GCE, nil 169 } 170 log.Logf(0, "init GCE attempt %d/%d failed: %v", i, gceInitAttempts, err) 171 } 172 return nil, fmt.Errorf("all attempts to init GCE failed: %w", err) 173 } 174 175 func (pool *Pool) Count() int { 176 return pool.cfg.Count 177 } 178 179 func (pool *Pool) Create(workdir string, index int) (vmimpl.Instance, error) { 180 name := fmt.Sprintf("%v-%v", pool.env.Name, index) 181 // Create SSH key for the instance. 182 gceKey := filepath.Join(workdir, "key") 183 keygen := osutil.Command("ssh-keygen", "-t", "ed25519", "-N", "", "-C", "syzkaller", "-f", gceKey) 184 if out, err := keygen.CombinedOutput(); err != nil { 185 return nil, fmt.Errorf("failed to execute ssh-keygen: %w\n%s", err, out) 186 } 187 gceKeyPub, err := os.ReadFile(gceKey + ".pub") 188 if err != nil { 189 return nil, fmt.Errorf("failed to read file: %w", err) 190 } 191 192 log.Logf(0, "deleting instance: %v", name) 193 if err := pool.GCE.DeleteInstance(name, true); err != nil { 194 return nil, err 195 } 196 log.Logf(0, "creating instance: %v", name) 197 ip, err := pool.GCE.CreateInstance(name, pool.cfg.MachineType, pool.cfg.GCEImage, 198 string(gceKeyPub), pool.cfg.Preemptible, pool.cfg.DisplayDevice) 199 if err != nil { 200 return nil, err 201 } 202 203 ok := false 204 defer func() { 205 if !ok { 206 pool.GCE.DeleteInstance(name, true) 207 } 208 }() 209 sshKey := pool.env.SSHKey 210 sshUser := pool.env.SSHUser 211 if sshKey == "GCE" { 212 // Assuming image supports GCE ssh fanciness. 213 sshKey = gceKey 214 sshUser = "syzkaller" 215 } 216 log.Logf(0, "wait instance to boot: %v (%v)", name, ip) 217 inst := &instance{ 218 env: pool.env, 219 cfg: pool.cfg, 220 debug: pool.env.Debug, 221 GCE: pool.GCE, 222 name: name, 223 ip: ip, 224 gceKey: gceKey, 225 sshKey: sshKey, 226 sshUser: sshUser, 227 closed: make(chan bool), 228 consoleReadCmd: pool.consoleReadCmd, 229 } 230 if err := vmimpl.WaitForSSH(pool.env.Debug, 5*time.Minute, ip, 231 sshKey, sshUser, pool.env.OS, 22, nil, false); err != nil { 232 output, outputErr := inst.getSerialPortOutput() 233 if outputErr != nil { 234 output = []byte(fmt.Sprintf("failed to get boot output: %v", outputErr)) 235 } 236 return nil, vmimpl.MakeBootError(err, output) 237 } 238 ok = true 239 return inst, nil 240 } 241 242 func (inst *instance) Close() { 243 close(inst.closed) 244 inst.GCE.DeleteInstance(inst.name, false) 245 if inst.consolew != nil { 246 inst.consolew.Close() 247 } 248 } 249 250 func (inst *instance) Forward(port int) (string, error) { 251 return fmt.Sprintf("%v:%v", inst.GCE.InternalIP, port), nil 252 } 253 254 func (inst *instance) Copy(hostSrc string) (string, error) { 255 vmDst := "./" + filepath.Base(hostSrc) 256 args := append(vmimpl.SCPArgs(true, inst.sshKey, 22, false), hostSrc, inst.sshUser+"@"+inst.ip+":"+vmDst) 257 if err := runCmd(inst.debug, "scp", args...); err != nil { 258 return "", err 259 } 260 return vmDst, nil 261 } 262 263 func (inst *instance) Run(timeout time.Duration, stop <-chan bool, command string) ( 264 <-chan []byte, <-chan error, error) { 265 conRpipe, conWpipe, err := osutil.LongPipe() 266 if err != nil { 267 return nil, nil, err 268 } 269 270 var conArgs []string 271 if inst.consoleReadCmd == "" { 272 conArgs = inst.serialPortArgs(false) 273 } else { 274 conArgs = inst.sshArgs(inst.consoleReadCmd) 275 } 276 con := osutil.Command("ssh", conArgs...) 277 con.Env = []string{} 278 con.Stdout = conWpipe 279 con.Stderr = conWpipe 280 conw, err := con.StdinPipe() 281 if err != nil { 282 conRpipe.Close() 283 conWpipe.Close() 284 return nil, nil, err 285 } 286 if inst.consolew != nil { 287 inst.consolew.Close() 288 } 289 inst.consolew = conw 290 if err := con.Start(); err != nil { 291 conRpipe.Close() 292 conWpipe.Close() 293 return nil, nil, fmt.Errorf("failed to connect to console server: %w", err) 294 } 295 conWpipe.Close() 296 297 var tee io.Writer 298 if inst.debug { 299 tee = os.Stdout 300 } 301 merger := vmimpl.NewOutputMerger(tee) 302 var decoder func(data []byte) (int, int, []byte) 303 if inst.env.OS == targets.Windows { 304 decoder = kd.Decode 305 } 306 merger.AddDecoder("console", conRpipe, decoder) 307 if err := waitForConsoleConnect(merger); err != nil { 308 con.Process.Kill() 309 merger.Wait() 310 return nil, nil, err 311 } 312 sshRpipe, sshWpipe, err := osutil.LongPipe() 313 if err != nil { 314 con.Process.Kill() 315 merger.Wait() 316 sshRpipe.Close() 317 return nil, nil, err 318 } 319 ssh := osutil.Command("ssh", inst.sshArgs(command)...) 320 ssh.Stdout = sshWpipe 321 ssh.Stderr = sshWpipe 322 if err := ssh.Start(); err != nil { 323 con.Process.Kill() 324 merger.Wait() 325 sshRpipe.Close() 326 sshWpipe.Close() 327 return nil, nil, fmt.Errorf("failed to connect to instance: %w", err) 328 } 329 sshWpipe.Close() 330 merger.Add("ssh", sshRpipe) 331 332 errc := make(chan error, 1) 333 signal := func(err error) { 334 select { 335 case errc <- err: 336 default: 337 } 338 } 339 340 go func() { 341 select { 342 case <-time.After(timeout): 343 signal(vmimpl.ErrTimeout) 344 case <-stop: 345 signal(vmimpl.ErrTimeout) 346 case <-inst.closed: 347 signal(fmt.Errorf("instance closed")) 348 case err := <-merger.Err: 349 con.Process.Kill() 350 ssh.Process.Kill() 351 merger.Wait() 352 con.Wait() 353 var mergeError *vmimpl.MergerError 354 if cmdErr := ssh.Wait(); cmdErr == nil { 355 // If the command exited successfully, we got EOF error from merger. 356 // But in this case no error has happened and the EOF is expected. 357 err = nil 358 } else if errors.As(err, &mergeError) && mergeError.R == conRpipe { 359 // Console connection must never fail. If it does, it's either 360 // instance preemption or a GCE bug. In either case, not a kernel bug. 361 log.Logf(0, "%v: gce console connection failed with %v", inst.name, mergeError.Err) 362 err = vmimpl.ErrTimeout 363 } else { 364 // Check if the instance was terminated due to preemption or host maintenance. 365 time.Sleep(5 * time.Second) // just to avoid any GCE races 366 if !inst.GCE.IsInstanceRunning(inst.name) { 367 log.Logf(0, "%v: ssh exited but instance is not running", inst.name) 368 err = vmimpl.ErrTimeout 369 } 370 } 371 signal(err) 372 return 373 } 374 con.Process.Kill() 375 ssh.Process.Kill() 376 merger.Wait() 377 con.Wait() 378 ssh.Wait() 379 }() 380 return merger.Output, errc, nil 381 } 382 383 func waitForConsoleConnect(merger *vmimpl.OutputMerger) error { 384 // We've started the console reading ssh command, but it has not necessary connected yet. 385 // If we proceed to running the target command right away, we can miss part 386 // of console output. During repro we can crash machines very quickly and 387 // would miss beginning of a crash. Before ssh starts piping console output, 388 // it usually prints: 389 // "serialport: Connected to ... port 1 (session ID: ..., active connections: 1)" 390 // So we wait for this line, or at least a minute and at least some output. 391 timeout := time.NewTimer(time.Minute) 392 defer timeout.Stop() 393 connectedMsg := []byte("serialport: Connected") 394 permissionDeniedMsg := []byte("Permission denied (publickey)") 395 var output []byte 396 for { 397 select { 398 case out := <-merger.Output: 399 output = append(output, out...) 400 if bytes.Contains(output, connectedMsg) { 401 // Just to make sure (otherwise we still see trimmed reports). 402 time.Sleep(5 * time.Second) 403 return nil 404 } 405 if bytes.Contains(output, permissionDeniedMsg) { 406 // This is a GCE bug. 407 return fmt.Errorf("broken console: %s", permissionDeniedMsg) 408 } 409 case <-timeout.C: 410 if len(output) == 0 { 411 return fmt.Errorf("broken console: no output") 412 } 413 return nil 414 } 415 } 416 } 417 418 func (inst *instance) Diagnose(rep *report.Report) ([]byte, bool) { 419 switch inst.env.OS { 420 case targets.Linux: 421 output, wait, _ := vmimpl.DiagnoseLinux(rep, inst.ssh) 422 return output, wait 423 case targets.FreeBSD: 424 return vmimpl.DiagnoseFreeBSD(inst.consolew) 425 case targets.OpenBSD: 426 return vmimpl.DiagnoseOpenBSD(inst.consolew) 427 } 428 return nil, false 429 } 430 431 func (inst *instance) ssh(args ...string) ([]byte, error) { 432 return osutil.RunCmd(time.Minute, "", "ssh", inst.sshArgs(args...)...) 433 } 434 435 func (inst *instance) sshArgs(args ...string) []string { 436 sshArgs := append(vmimpl.SSHArgs(inst.debug, inst.sshKey, 22, false), inst.sshUser+"@"+inst.ip) 437 if inst.env.OS == targets.Linux && inst.sshUser != "root" { 438 args = []string{"sudo", "bash", "-c", "'" + strings.Join(args, " ") + "'"} 439 } 440 return append(sshArgs, args...) 441 } 442 443 func (inst *instance) serialPortArgs(replay bool) []string { 444 user := "syzkaller" 445 if inst.cfg.SerialPortUser != "" { 446 user = inst.cfg.SerialPortUser 447 } 448 key := inst.gceKey 449 if inst.cfg.SerialPortKey != "" { 450 key = inst.cfg.SerialPortKey 451 } 452 replayArg := "" 453 if replay { 454 replayArg = ".replay-lines=10000" 455 } 456 conAddr := fmt.Sprintf("%v.%v.%v.%s.port=1%s@%v-ssh-serialport.googleapis.com", 457 inst.GCE.ProjectID, inst.GCE.ZoneID, inst.name, user, replayArg, inst.GCE.RegionID) 458 conArgs := append(vmimpl.SSHArgs(inst.debug, key, 9600, false), conAddr) 459 // TODO(blackgnezdo): Remove this once ssh-serialport.googleapis.com stops using 460 // host key algorithm: ssh-rsa. 461 return append(conArgs, "-o", "HostKeyAlgorithms=+ssh-rsa") 462 } 463 464 func (inst *instance) getSerialPortOutput() ([]byte, error) { 465 conRpipe, conWpipe, err := osutil.LongPipe() 466 if err != nil { 467 return nil, err 468 } 469 defer conRpipe.Close() 470 defer conWpipe.Close() 471 472 con := osutil.Command("ssh", inst.serialPortArgs(true)...) 473 con.Env = []string{} 474 con.Stdout = conWpipe 475 con.Stderr = conWpipe 476 if _, err := con.StdinPipe(); err != nil { // SSH would close connection on stdin EOF 477 return nil, err 478 } 479 if err := con.Start(); err != nil { 480 return nil, fmt.Errorf("failed to connect to console server: %w", err) 481 } 482 conWpipe.Close() 483 done := make(chan bool) 484 go func() { 485 timeout := time.NewTimer(time.Minute) 486 defer timeout.Stop() 487 select { 488 case <-done: 489 case <-timeout.C: 490 } 491 con.Process.Kill() 492 }() 493 var output []byte 494 buf := make([]byte, 64<<10) 495 for { 496 n, err := conRpipe.Read(buf) 497 if err != nil || n == 0 { 498 break 499 } 500 output = append(output, buf[:n]...) 501 } 502 close(done) 503 con.Wait() 504 return output, nil 505 } 506 507 func uploadImageToGCS(localImage, gcsImage string) error { 508 GCS, err := gcs.NewClient() 509 if err != nil { 510 return fmt.Errorf("failed to create GCS client: %w", err) 511 } 512 defer GCS.Close() 513 514 localReader, err := os.Open(localImage) 515 if err != nil { 516 return fmt.Errorf("failed to open image file: %w", err) 517 } 518 defer localReader.Close() 519 localStat, err := localReader.Stat() 520 if err != nil { 521 return fmt.Errorf("failed to stat image file: %w", err) 522 } 523 524 gcsWriter, err := GCS.FileWriter(gcsImage) 525 if err != nil { 526 return fmt.Errorf("failed to upload image: %w", err) 527 } 528 defer gcsWriter.Close() 529 530 gzipWriter := gzip.NewWriter(gcsWriter) 531 tarWriter := tar.NewWriter(gzipWriter) 532 tarHeader := &tar.Header{ 533 Name: "disk.raw", 534 Typeflag: tar.TypeReg, 535 Mode: 0640, 536 Size: localStat.Size(), 537 ModTime: time.Now(), 538 Uname: "syzkaller", 539 Gname: "syzkaller", 540 } 541 setGNUFormat(tarHeader) 542 if err := tarWriter.WriteHeader(tarHeader); err != nil { 543 return fmt.Errorf("failed to write image tar header: %w", err) 544 } 545 if _, err := io.Copy(tarWriter, localReader); err != nil { 546 return fmt.Errorf("failed to write image file: %w", err) 547 } 548 if err := tarWriter.Close(); err != nil { 549 return fmt.Errorf("failed to write image file: %w", err) 550 } 551 if err := gzipWriter.Close(); err != nil { 552 return fmt.Errorf("failed to write image file: %w", err) 553 } 554 if err := gcsWriter.Close(); err != nil { 555 return fmt.Errorf("failed to write image file: %w", err) 556 } 557 return nil 558 } 559 560 func runCmd(debug bool, bin string, args ...string) error { 561 if debug { 562 log.Logf(0, "running command: %v %#v", bin, args) 563 } 564 output, err := osutil.RunCmd(time.Minute, "", bin, args...) 565 if debug { 566 log.Logf(0, "result: %v\n%s", err, output) 567 } 568 return err 569 }