github.com/google/syzkaller@v0.0.0-20251211124644-a066d2bc4b02/vm/gce/gce.go (about) 1 // Copyright 2016 syzkaller project authors. All rights reserved. 2 // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. 3 4 // Package gce allows to use Google Compute Engine (GCE) virtual machines as VMs. 5 // It is assumed that syz-manager also runs on GCE as VMs are created in the current project/zone. 6 // 7 // See https://cloud.google.com/compute/docs for details. 8 // In particular, how to build GCE-compatible images: 9 // https://cloud.google.com/compute/docs/tutorials/building-images 10 // Working with serial console: 11 // https://cloud.google.com/compute/docs/instances/interacting-with-serial-console 12 package gce 13 14 import ( 15 "archive/tar" 16 "bytes" 17 "compress/gzip" 18 "context" 19 "errors" 20 "fmt" 21 "io" 22 "os" 23 "path/filepath" 24 "strings" 25 "time" 26 27 "github.com/google/syzkaller/pkg/config" 28 "github.com/google/syzkaller/pkg/gce" 29 "github.com/google/syzkaller/pkg/gcs" 30 "github.com/google/syzkaller/pkg/kd" 31 "github.com/google/syzkaller/pkg/log" 32 "github.com/google/syzkaller/pkg/osutil" 33 "github.com/google/syzkaller/pkg/report" 34 "github.com/google/syzkaller/sys/targets" 35 "github.com/google/syzkaller/vm/vmimpl" 36 ) 37 38 func init() { 39 vmimpl.Register("gce", vmimpl.Type{ 40 Ctor: ctor, 41 Overcommit: true, 42 Preemptible: true, 43 }) 44 } 45 46 type Config struct { 47 Count int `json:"count"` // number of VMs to use 48 ZoneID string `json:"zone_id"` // GCE zone (if it's different from that of syz-manager) 49 MachineType string `json:"machine_type"` // GCE machine type (e.g. "n1-highcpu-2") 50 GCSPath string `json:"gcs_path"` // GCS path to upload image 51 GCEImage string `json:"gce_image"` // pre-created GCE image to use 52 Preemptible bool `json:"preemptible"` // use preemptible VMs if available (defaults to true) 53 DisplayDevice bool `json:"display_device"` // enable a virtual display device 54 // Username to connect to ssh-serialport.googleapis.com. 55 // Leave empty for non-OS Login GCP projects. 56 // Otherwise take the user from `gcloud compute connect-to-serial-port --dry-run`. 57 SerialPortUser string `json:"serial_port_user"` 58 // A private key to connect to ssh-serialport.googleapis.com. 59 // Leave empty for non-OS Login GCP projects. 60 // Otherwise generate one and upload it: 61 // `gcloud compute os-login ssh-keys add --key-file some-key.pub`. 62 SerialPortKey string `json:"serial_port_key"` 63 } 64 65 type Pool struct { 66 env *vmimpl.Env 67 cfg *Config 68 GCE *gce.Context 69 consoleReadCmd string // optional: command to read non-standard kernel console 70 } 71 72 type instance struct { 73 env *vmimpl.Env 74 cfg *Config 75 GCE *gce.Context 76 debug bool 77 name string 78 vmimpl.SSHOptions 79 gceKey string // per-instance private ssh key associated with the instance 80 closed chan bool 81 consolew io.WriteCloser 82 consoleReadCmd string // optional: command to read non-standard kernel console 83 timeouts targets.Timeouts 84 } 85 86 func ctor(env *vmimpl.Env) (vmimpl.Pool, error) { 87 return Ctor(env, "") 88 } 89 90 func Ctor(env *vmimpl.Env, consoleReadCmd string) (*Pool, error) { 91 if env.Name == "" { 92 return nil, fmt.Errorf("config param name is empty (required for GCE)") 93 } 94 cfg := &Config{ 95 Count: 1, 96 Preemptible: true, 97 // Display device is not supported on other platforms. 98 DisplayDevice: env.Arch == targets.AMD64, 99 } 100 if err := config.LoadData(env.Config, cfg); err != nil { 101 return nil, fmt.Errorf("failed to parse gce vm config: %w", err) 102 } 103 if cfg.Count < 1 || cfg.Count > 1000 { 104 return nil, fmt.Errorf("invalid config param count: %v, want [1, 1000]", cfg.Count) 105 } 106 if cfg.MachineType == "" { 107 return nil, fmt.Errorf("machine_type parameter is empty") 108 } 109 if cfg.GCEImage == "" && cfg.GCSPath == "" { 110 return nil, fmt.Errorf("gcs_path parameter is empty") 111 } 112 if cfg.GCEImage == "" && env.Image == "" { 113 return nil, fmt.Errorf("config param image is empty (required for GCE)") 114 } 115 if cfg.GCEImage != "" && env.Image != "" { 116 return nil, fmt.Errorf("both image and gce_image are specified") 117 } 118 119 GCE, err := initGCE(cfg.ZoneID) 120 if err != nil { 121 return nil, err 122 } 123 124 log.Logf(0, "GCE initialized: running on %v, internal IP %v, project %v, zone %v, net %v/%v", 125 GCE.Instance, GCE.InternalIP, GCE.ProjectID, GCE.ZoneID, GCE.Network, GCE.Subnetwork) 126 127 if cfg.GCEImage == "" { 128 cfg.GCEImage = env.Name 129 gcsImage := filepath.Join(cfg.GCSPath, env.Name+"-image.tar.gz") 130 log.Logf(0, "uploading image %v to %v...", env.Image, gcsImage) 131 if err := uploadImageToGCS(env.Image, gcsImage); err != nil { 132 return nil, err 133 } 134 log.Logf(0, "creating GCE image %v...", cfg.GCEImage) 135 if err := GCE.DeleteImage(cfg.GCEImage); err != nil { 136 return nil, fmt.Errorf("failed to delete GCE image: %w", err) 137 } 138 if err := GCE.CreateImage(cfg.GCEImage, gcsImage, env.OS); err != nil { 139 return nil, fmt.Errorf("failed to create GCE image: %w", err) 140 } 141 } 142 pool := &Pool{ 143 cfg: cfg, 144 env: env, 145 GCE: GCE, 146 consoleReadCmd: consoleReadCmd, 147 } 148 return pool, nil 149 } 150 151 func initGCE(zoneID string) (*gce.Context, error) { 152 // There happen some transient GCE init errors on and off. 153 // Let's try it several times before aborting. 154 const ( 155 gceInitAttempts = 3 156 gceInitBackoff = 5 * time.Second 157 ) 158 var ( 159 GCE *gce.Context 160 err error 161 ) 162 for i := 1; i <= gceInitAttempts; i++ { 163 if i > 1 { 164 time.Sleep(gceInitBackoff) 165 } 166 GCE, err = gce.NewContext(zoneID) 167 if err == nil { 168 return GCE, nil 169 } 170 log.Logf(0, "init GCE attempt %d/%d failed: %v", i, gceInitAttempts, err) 171 } 172 return nil, fmt.Errorf("all attempts to init GCE failed: %w", err) 173 } 174 175 func (pool *Pool) Count() int { 176 return pool.cfg.Count 177 } 178 179 func (pool *Pool) Create(_ context.Context, workdir string, index int) (vmimpl.Instance, error) { 180 name := fmt.Sprintf("%v-%v", pool.env.Name, index) 181 // Create SSH key for the instance. 182 gceKey := filepath.Join(workdir, "key") 183 keygen := osutil.Command("ssh-keygen", "-t", "ed25519", "-N", "", "-C", "syzkaller", "-f", gceKey) 184 if out, err := keygen.CombinedOutput(); err != nil { 185 return nil, fmt.Errorf("failed to execute ssh-keygen: %w\n%s", err, out) 186 } 187 gceKeyPub, err := os.ReadFile(gceKey + ".pub") 188 if err != nil { 189 return nil, fmt.Errorf("failed to read file: %w", err) 190 } 191 192 log.Logf(0, "deleting instance: %v", name) 193 if err := pool.GCE.DeleteInstance(name, true); err != nil { 194 return nil, err 195 } 196 log.Logf(0, "creating instance: %v", name) 197 ip, err := pool.GCE.CreateInstance(name, pool.cfg.MachineType, pool.cfg.GCEImage, 198 string(gceKeyPub), pool.cfg.Preemptible, pool.cfg.DisplayDevice) 199 if err != nil { 200 return nil, err 201 } 202 203 ok := false 204 defer func() { 205 if !ok { 206 pool.GCE.DeleteInstance(name, true) 207 } 208 }() 209 sshKey := pool.env.SSHKey 210 sshUser := pool.env.SSHUser 211 if sshKey == "GCE" { 212 // Assuming image supports GCE ssh fanciness. 213 sshKey = gceKey 214 sshUser = "syzkaller" 215 } 216 log.Logf(0, "wait instance to boot: %v (%v)", name, ip) 217 inst := &instance{ 218 env: pool.env, 219 cfg: pool.cfg, 220 debug: pool.env.Debug, 221 GCE: pool.GCE, 222 name: name, 223 SSHOptions: vmimpl.SSHOptions{ 224 Addr: ip, 225 Port: 22, 226 Key: sshKey, 227 User: sshUser, 228 }, 229 230 gceKey: gceKey, 231 232 closed: make(chan bool), 233 consoleReadCmd: pool.consoleReadCmd, 234 timeouts: pool.env.Timeouts, 235 } 236 if err := vmimpl.WaitForSSH(5*time.Minute, inst.SSHOptions, 237 pool.env.OS, nil, false, pool.env.Debug); err != nil { 238 output, outputErr := inst.getSerialPortOutput() 239 if outputErr != nil { 240 output = []byte(fmt.Sprintf("failed to get boot output: %v", outputErr)) 241 } 242 return nil, vmimpl.MakeBootError(err, output) 243 } 244 ok = true 245 return inst, nil 246 } 247 248 func (inst *instance) Close() error { 249 close(inst.closed) 250 err := inst.GCE.DeleteInstance(inst.name, false) 251 if inst.consolew != nil { 252 err2 := inst.consolew.Close() 253 if err == nil { 254 err = err2 255 } 256 } 257 return err 258 } 259 260 func (inst *instance) Forward(port int) (string, error) { 261 return fmt.Sprintf("%v:%v", inst.GCE.InternalIP, port), nil 262 } 263 264 func (inst *instance) Copy(hostSrc string) (string, error) { 265 vmDst := "./" + filepath.Base(hostSrc) 266 args := append(vmimpl.SCPArgs(true, inst.Key, inst.Port, false), 267 hostSrc, inst.User+"@"+inst.Addr+":"+vmDst) 268 if err := runCmd(inst.debug, "scp", args...); err != nil { 269 return "", err 270 } 271 return vmDst, nil 272 } 273 274 func (inst *instance) Run(ctx context.Context, command string) ( 275 <-chan []byte, <-chan error, error) { 276 conRpipe, conWpipe, err := osutil.LongPipe() 277 if err != nil { 278 return nil, nil, err 279 } 280 281 var conArgs []string 282 if inst.consoleReadCmd == "" { 283 conArgs = inst.serialPortArgs(false) 284 } else { 285 conArgs = inst.sshArgs(inst.consoleReadCmd) 286 } 287 con := osutil.Command("ssh", conArgs...) 288 con.Env = []string{} 289 con.Stdout = conWpipe 290 con.Stderr = conWpipe 291 conw, err := con.StdinPipe() 292 if err != nil { 293 conRpipe.Close() 294 conWpipe.Close() 295 return nil, nil, err 296 } 297 if inst.consolew != nil { 298 inst.consolew.Close() 299 } 300 inst.consolew = conw 301 if err := con.Start(); err != nil { 302 conRpipe.Close() 303 conWpipe.Close() 304 return nil, nil, fmt.Errorf("failed to connect to console server: %w", err) 305 } 306 conWpipe.Close() 307 308 var tee io.Writer 309 if inst.debug { 310 tee = os.Stdout 311 } 312 merger := vmimpl.NewOutputMerger(tee) 313 var decoder func(data []byte) (int, int, []byte) 314 if inst.env.OS == targets.Windows { 315 decoder = kd.Decode 316 } 317 merger.AddDecoder("console", conRpipe, decoder) 318 if err := waitForConsoleConnect(merger); err != nil { 319 con.Process.Kill() 320 merger.Wait() 321 return nil, nil, err 322 } 323 sshRpipe, sshWpipe, err := osutil.LongPipe() 324 if err != nil { 325 con.Process.Kill() 326 merger.Wait() 327 sshRpipe.Close() 328 return nil, nil, err 329 } 330 ssh := osutil.Command("ssh", inst.sshArgs(command)...) 331 ssh.Stdout = sshWpipe 332 ssh.Stderr = sshWpipe 333 if err := ssh.Start(); err != nil { 334 con.Process.Kill() 335 merger.Wait() 336 sshRpipe.Close() 337 sshWpipe.Close() 338 return nil, nil, fmt.Errorf("failed to connect to instance: %w", err) 339 } 340 sshWpipe.Close() 341 merger.Add("ssh", sshRpipe) 342 343 return vmimpl.Multiplex(ctx, ssh, merger, vmimpl.MultiplexConfig{ 344 Console: vmimpl.CmdCloser{Cmd: con}, 345 Close: inst.closed, 346 Debug: inst.debug, 347 Scale: inst.timeouts.Scale, 348 IgnoreError: func(err error) bool { 349 var mergeError *vmimpl.MergerError 350 if errors.As(err, &mergeError) && mergeError.R == conRpipe { 351 // Console connection must never fail. If it does, it's either 352 // instance preemption or a GCE bug. In either case, not a kernel bug. 353 log.Logf(0, "%v: gce console connection failed with %v", inst.name, mergeError.Err) 354 return true 355 } else { 356 // Check if the instance was terminated due to preemption or host maintenance. 357 // vmimpl.Multiplex() already adds a delay, so we've already waited enough 358 // to let GCE VM status updates propagate. 359 if !inst.GCE.IsInstanceRunning(inst.name) { 360 log.Logf(0, "%v: ssh exited but instance is not running", inst.name) 361 return true 362 } 363 } 364 return false 365 }, 366 }) 367 } 368 369 func waitForConsoleConnect(merger *vmimpl.OutputMerger) error { 370 // We've started the console reading ssh command, but it has not necessary connected yet. 371 // If we proceed to running the target command right away, we can miss part 372 // of console output. During repro we can crash machines very quickly and 373 // would miss beginning of a crash. Before ssh starts piping console output, 374 // it usually prints: 375 // "serialport: Connected to ... port 1 (session ID: ..., active connections: 1)" 376 // So we wait for this line, or at least a minute and at least some output. 377 timeout := time.NewTimer(time.Minute) 378 defer timeout.Stop() 379 connectedMsg := []byte("serialport: Connected") 380 permissionDeniedMsg := []byte("Permission denied (publickey)") 381 var output []byte 382 for { 383 select { 384 case out := <-merger.Output: 385 output = append(output, out...) 386 if bytes.Contains(output, connectedMsg) { 387 // Just to make sure (otherwise we still see trimmed reports). 388 time.Sleep(5 * time.Second) 389 return nil 390 } 391 if bytes.Contains(output, permissionDeniedMsg) { 392 // This is a GCE bug. 393 return fmt.Errorf("broken console: %s", permissionDeniedMsg) 394 } 395 case <-timeout.C: 396 if len(output) == 0 { 397 return fmt.Errorf("broken console: no output") 398 } 399 return nil 400 } 401 } 402 } 403 404 func (inst *instance) Diagnose(rep *report.Report) ([]byte, bool) { 405 switch inst.env.OS { 406 case targets.Linux: 407 output, wait, _ := vmimpl.DiagnoseLinux(rep, inst.ssh) 408 return output, wait 409 case targets.FreeBSD: 410 return vmimpl.DiagnoseFreeBSD(inst.consolew) 411 case targets.OpenBSD: 412 return vmimpl.DiagnoseOpenBSD(inst.consolew) 413 } 414 return nil, false 415 } 416 417 func (inst *instance) ssh(args ...string) ([]byte, error) { 418 return osutil.RunCmd(time.Minute, "", "ssh", inst.sshArgs(args...)...) 419 } 420 421 func (inst *instance) sshArgs(args ...string) []string { 422 sshArgs := append(vmimpl.SSHArgs(inst.debug, inst.Key, 22, false), inst.User+"@"+inst.Addr) 423 if inst.env.OS == targets.Linux && inst.User != "root" { 424 args = []string{"sudo", "bash", "-c", "'" + strings.Join(args, " ") + "'"} 425 } 426 return append(sshArgs, args...) 427 } 428 429 func (inst *instance) serialPortArgs(replay bool) []string { 430 user := "syzkaller" 431 if inst.cfg.SerialPortUser != "" { 432 user = inst.cfg.SerialPortUser 433 } 434 key := inst.gceKey 435 if inst.cfg.SerialPortKey != "" { 436 key = inst.cfg.SerialPortKey 437 } 438 replayArg := "" 439 if replay { 440 replayArg = ".replay-lines=10000" 441 } 442 conAddr := fmt.Sprintf("%v.%v.%v.%s.port=1%s@%v-ssh-serialport.googleapis.com", 443 inst.GCE.ProjectID, inst.GCE.ZoneID, inst.name, user, replayArg, inst.GCE.RegionID) 444 conArgs := append(vmimpl.SSHArgs(inst.debug, key, 9600, false), conAddr) 445 // TODO(blackgnezdo): Remove this once ssh-serialport.googleapis.com stops using 446 // host key algorithm: ssh-rsa. 447 return append(conArgs, "-o", "HostKeyAlgorithms=+ssh-rsa") 448 } 449 450 func (inst *instance) getSerialPortOutput() ([]byte, error) { 451 conRpipe, conWpipe, err := osutil.LongPipe() 452 if err != nil { 453 return nil, err 454 } 455 defer conRpipe.Close() 456 defer conWpipe.Close() 457 458 con := osutil.Command("ssh", inst.serialPortArgs(true)...) 459 con.Env = []string{} 460 con.Stdout = conWpipe 461 con.Stderr = conWpipe 462 if _, err := con.StdinPipe(); err != nil { // SSH would close connection on stdin EOF 463 return nil, err 464 } 465 if err := con.Start(); err != nil { 466 return nil, fmt.Errorf("failed to connect to console server: %w", err) 467 } 468 conWpipe.Close() 469 done := make(chan bool) 470 go func() { 471 timeout := time.NewTimer(time.Minute) 472 defer timeout.Stop() 473 select { 474 case <-done: 475 case <-timeout.C: 476 } 477 con.Process.Kill() 478 }() 479 var output []byte 480 buf := make([]byte, 64<<10) 481 for { 482 n, err := conRpipe.Read(buf) 483 if err != nil || n == 0 { 484 break 485 } 486 output = append(output, buf[:n]...) 487 } 488 close(done) 489 con.Wait() 490 return output, nil 491 } 492 493 func uploadImageToGCS(localImage, gcsImage string) error { 494 GCS, err := gcs.NewClient(context.Background()) 495 if err != nil { 496 return fmt.Errorf("failed to create GCS client: %w", err) 497 } 498 defer GCS.Close() 499 500 localReader, err := os.Open(localImage) 501 if err != nil { 502 return fmt.Errorf("failed to open image file: %w", err) 503 } 504 defer localReader.Close() 505 localStat, err := localReader.Stat() 506 if err != nil { 507 return fmt.Errorf("failed to stat image file: %w", err) 508 } 509 510 gcsWriter, err := GCS.FileWriter(gcsImage, "", "") 511 if err != nil { 512 return fmt.Errorf("failed to upload image: %w", err) 513 } 514 defer gcsWriter.Close() 515 516 gzipWriter := gzip.NewWriter(gcsWriter) 517 tarWriter := tar.NewWriter(gzipWriter) 518 tarHeader := &tar.Header{ 519 Name: "disk.raw", 520 Typeflag: tar.TypeReg, 521 Mode: 0640, 522 Size: localStat.Size(), 523 ModTime: time.Now(), 524 Uname: "syzkaller", 525 Gname: "syzkaller", 526 } 527 setGNUFormat(tarHeader) 528 if err := tarWriter.WriteHeader(tarHeader); err != nil { 529 return fmt.Errorf("failed to write image tar header: %w", err) 530 } 531 if _, err := io.Copy(tarWriter, localReader); err != nil { 532 return fmt.Errorf("failed to write image file: %w", err) 533 } 534 if err := tarWriter.Close(); err != nil { 535 return fmt.Errorf("failed to write image file: %w", err) 536 } 537 if err := gzipWriter.Close(); err != nil { 538 return fmt.Errorf("failed to write image file: %w", err) 539 } 540 if err := gcsWriter.Close(); err != nil { 541 return fmt.Errorf("failed to write image file: %w", err) 542 } 543 return nil 544 } 545 546 func runCmd(debug bool, bin string, args ...string) error { 547 if debug { 548 log.Logf(0, "running command: %v %#v", bin, args) 549 } 550 output, err := osutil.RunCmd(time.Minute, "", bin, args...) 551 if debug { 552 log.Logf(0, "result: %v\n%s", err, output) 553 } 554 return err 555 }