k8s.io/kubernetes@v1.29.3/test/e2e_node/remote/gce/gce_runner.go (about) 1 /* 2 Copyright 2023 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package gce 18 19 import ( 20 "context" 21 "encoding/base64" 22 "errors" 23 "flag" 24 "fmt" 25 "net/http" 26 "os" 27 "path/filepath" 28 "regexp" 29 "sort" 30 "strings" 31 "time" 32 33 "k8s.io/kubernetes/test/e2e_node/remote" 34 35 "github.com/google/uuid" 36 "golang.org/x/oauth2/google" 37 "google.golang.org/api/compute/v1" 38 "google.golang.org/api/option" 39 "k8s.io/apimachinery/pkg/util/wait" 40 "k8s.io/klog/v2" 41 "sigs.k8s.io/yaml" 42 ) 43 44 var _ remote.Runner = (*GCERunner)(nil) 45 46 func init() { 47 remote.RegisterRunner("gce", NewGCERunner) 48 } 49 50 // envs is the type used to collect all node envs. The key is the env name, 51 // and the value is the env value 52 type envs map[string]string 53 54 // String function of flag.Value 55 func (e *envs) String() string { 56 return fmt.Sprint(*e) 57 } 58 59 // Set function of flag.Value 60 func (e *envs) Set(value string) error { 61 if value == "" { 62 return nil 63 } 64 kv := strings.SplitN(value, "=", 2) 65 if len(kv) != 2 { 66 return fmt.Errorf("invalid env string %s", value) 67 } 68 emap := *e 69 emap[kv[0]] = kv[1] 70 return nil 71 } 72 73 // nodeEnvs is the node envs from the flag `node-env`. 74 var nodeEnvs = make(envs) 75 76 var project = flag.String("project", "", "gce project the hosts live in (gce)") 77 var zone = flag.String("zone", "", "gce zone that the hosts live in (gce)") 78 var instanceMetadata = flag.String("instance-metadata", "", "key/value metadata for instances separated by '=' or '<', 'k=v' means the key is 'k' and the value is 'v'; 'k<p' means the key is 'k' and the value is extracted from the local path 'p', e.g. k1=v1,k2<p2 (gce)") 79 var imageProject = flag.String("image-project", "", "gce project the hosts live in (gce)") 80 var instanceType = flag.String("instance-type", "e2-medium", "GCP Machine type to use for test") 81 var preemptibleInstances = flag.Bool("preemptible-instances", false, "If true, gce instances will be configured to be preemptible (gce)") 82 83 func init() { 84 flag.Var(&nodeEnvs, "node-env", "An environment variable passed to instance as metadata, e.g. when '--node-env=PATH=/usr/bin' is specified, there will be an extra instance metadata 'PATH=/usr/bin'.") 85 } 86 87 const ( 88 defaultGCEMachine = "n1-standard-1" 89 acceleratorTypeResourceFormat = "https://www.googleapis.com/compute/v1/projects/%s/zones/%s/acceleratorTypes/%s" 90 ) 91 92 type GCERunner struct { 93 cfg remote.Config 94 gceComputeService *compute.Service 95 gceImages *internalGCEImageConfig 96 } 97 98 func NewGCERunner(cfg remote.Config) remote.Runner { 99 if cfg.InstanceNamePrefix == "" { 100 cfg.InstanceNamePrefix = "tmp-node-e2e-" + uuid.New().String()[:8] 101 } 102 return &GCERunner{cfg: cfg} 103 } 104 105 func (g *GCERunner) Validate() error { 106 if len(g.cfg.Hosts) == 0 && g.cfg.ImageConfigFile == "" && len(g.cfg.Images) == 0 { 107 klog.Fatalf("Must specify one of --image-config-file, --hosts, --images.") 108 } 109 var err error 110 g.gceComputeService, err = getComputeClient() 111 if err != nil { 112 return fmt.Errorf("Unable to create gcloud compute service using defaults. Make sure you are authenticated. %w", err) 113 } 114 115 if g.gceImages, err = g.prepareGceImages(); err != nil { 116 klog.Fatalf("While preparing GCE images: %v", err) 117 } 118 return nil 119 } 120 121 func (g *GCERunner) StartTests(suite remote.TestSuite, archivePath string, results chan *remote.TestResult) (numTests int) { 122 for shortName := range g.gceImages.images { 123 imageConfig := g.gceImages.images[shortName] 124 numTests++ 125 fmt.Printf("Initializing e2e tests using image %s/%s/%s.\n", shortName, imageConfig.project, imageConfig.image) 126 go func(image *internalGCEImage, junitFileName string) { 127 results <- g.testGCEImage(suite, archivePath, image, junitFileName) 128 }(&imageConfig, shortName) 129 } 130 return 131 } 132 133 func getComputeClient() (*compute.Service, error) { 134 const retries = 10 135 const backoff = time.Second * 6 136 137 // Setup the gce client for provisioning instances 138 // Getting credentials on gce jenkins is flaky, so try a couple times 139 var err error 140 var cs *compute.Service 141 for i := 0; i < retries; i++ { 142 if i > 0 { 143 time.Sleep(backoff) 144 } 145 146 var client *http.Client 147 client, err = google.DefaultClient(context.Background(), compute.ComputeScope) 148 if err != nil { 149 continue 150 } 151 152 cs, err = compute.NewService(context.Background(), option.WithHTTPClient(client)) 153 if err != nil { 154 continue 155 } 156 return cs, nil 157 } 158 return nil, err 159 } 160 161 // Accelerator contains type and count about resource. 162 type Accelerator struct { 163 Type string `json:"type,omitempty"` 164 Count int64 `json:"count,omitempty"` 165 } 166 167 // Resources contains accelerators array. 168 type Resources struct { 169 Accelerators []Accelerator `json:"accelerators,omitempty"` 170 } 171 172 // internalGCEImage is an internal GCE image representation for E2E node. 173 type internalGCEImage struct { 174 image string 175 // imageDesc is the description of the image. If empty, the value in the 176 // 'image' will be used. 177 imageDesc string 178 kernelArguments []string 179 project string 180 resources Resources 181 metadata *compute.Metadata 182 machine string 183 } 184 185 type internalGCEImageConfig struct { 186 images map[string]internalGCEImage 187 } 188 189 // GCEImageConfig specifies what images should be run and how for these tests. 190 // It can be created via the `--images` and `--image-project` flags, or by 191 // specifying the `--image-config-file` flag, pointing to a json or yaml file 192 // of the form: 193 // 194 // images: 195 // short-name: 196 // image: gce-image-name 197 // project: gce-image-project 198 // machine: for benchmark only, the machine type (GCE instance) to run test 199 // tests: for benchmark only, a list of ginkgo focus strings to match tests 200 // 201 // TODO(coufon): replace 'image' with 'node' in configurations 202 // and we plan to support testing custom machines other than GCE by specifying Host 203 type GCEImageConfig struct { 204 Images map[string]GCEImage `json:"images"` 205 } 206 207 // GCEImage contains some information about GCE Image. 208 type GCEImage struct { 209 Image string `json:"image,omitempty"` 210 ImageRegex string `json:"image_regex,omitempty"` 211 // ImageFamily is the image family to use. The latest image from the image family will be used, e.g cos-81-lts. 212 ImageFamily string `json:"image_family,omitempty"` 213 ImageDesc string `json:"image_description,omitempty"` 214 KernelArguments []string `json:"kernel_arguments,omitempty"` 215 Project string `json:"project"` 216 Metadata string `json:"metadata"` 217 Machine string `json:"machine,omitempty"` 218 Resources Resources `json:"resources,omitempty"` 219 } 220 221 // Returns an image name based on regex and given GCE project. 222 func (g *GCERunner) getGCEImage(imageRegex, imageFamily string, project string) (string, error) { 223 imageObjs := []imageObj{} 224 imageRe := regexp.MustCompile(imageRegex) 225 if err := g.gceComputeService.Images.List(project).Pages(context.Background(), 226 func(ilc *compute.ImageList) error { 227 for _, instance := range ilc.Items { 228 if imageRegex != "" && !imageRe.MatchString(instance.Name) { 229 continue 230 } 231 if imageFamily != "" && instance.Family != imageFamily { 232 continue 233 } 234 creationTime, err := time.Parse(time.RFC3339, instance.CreationTimestamp) 235 if err != nil { 236 return fmt.Errorf("failed to parse instance creation timestamp %q: %w", instance.CreationTimestamp, err) 237 } 238 io := imageObj{ 239 creationTime: creationTime, 240 name: instance.Name, 241 } 242 imageObjs = append(imageObjs, io) 243 } 244 return nil 245 }, 246 ); err != nil { 247 return "", fmt.Errorf("failed to list images in project %q: %w", project, err) 248 } 249 250 // Pick the latest image after sorting. 251 sort.Sort(byCreationTime(imageObjs)) 252 if len(imageObjs) > 0 { 253 klog.V(4).Infof("found images %+v based on regex %q and family %q in project %q", imageObjs, imageRegex, imageFamily, project) 254 return imageObjs[0].name, nil 255 } 256 return "", fmt.Errorf("found zero images based on regex %q and family %q in project %q", imageRegex, imageFamily, project) 257 } 258 259 func (g *GCERunner) prepareGceImages() (*internalGCEImageConfig, error) { 260 gceImages := &internalGCEImageConfig{ 261 images: make(map[string]internalGCEImage), 262 } 263 264 // Parse images from given config file and convert them to internalGCEImage. 265 if g.cfg.ImageConfigFile != "" { 266 configPath := g.cfg.ImageConfigFile 267 if g.cfg.ImageConfigDir != "" { 268 configPath = filepath.Join(g.cfg.ImageConfigDir, g.cfg.ImageConfigFile) 269 } 270 271 imageConfigData, err := os.ReadFile(configPath) 272 if err != nil { 273 return nil, fmt.Errorf("Could not read image config file provided: %w", err) 274 } 275 // Unmarshal the given image config file. All images for this test run will be organized into a map. 276 // shortName->GCEImage, e.g cos-stable->cos-stable-81-12871-103-0. 277 externalImageConfig := GCEImageConfig{Images: make(map[string]GCEImage)} 278 err = yaml.Unmarshal(imageConfigData, &externalImageConfig) 279 if err != nil { 280 return nil, fmt.Errorf("Could not parse image config file: %w", err) 281 } 282 283 for shortName, imageConfig := range externalImageConfig.Images { 284 var image string 285 if (imageConfig.ImageRegex != "" || imageConfig.ImageFamily != "") && imageConfig.Image == "" { 286 image, err = g.getGCEImage(imageConfig.ImageRegex, imageConfig.ImageFamily, imageConfig.Project) 287 if err != nil { 288 return nil, fmt.Errorf("Could not retrieve a image based on image regex %q and family %q: %v", 289 imageConfig.ImageRegex, imageConfig.ImageFamily, err) 290 } 291 } else { 292 image = imageConfig.Image 293 } 294 // Convert the given image into an internalGCEImage. 295 metadata := imageConfig.Metadata 296 if len(strings.TrimSpace(*instanceMetadata)) > 0 { 297 metadata += "," + *instanceMetadata 298 } 299 gceImage := internalGCEImage{ 300 image: image, 301 imageDesc: imageConfig.ImageDesc, 302 project: imageConfig.Project, 303 metadata: g.getImageMetadata(metadata), 304 kernelArguments: imageConfig.KernelArguments, 305 machine: imageConfig.Machine, 306 resources: imageConfig.Resources, 307 } 308 if gceImage.imageDesc == "" { 309 gceImage.imageDesc = gceImage.image 310 } 311 gceImages.images[shortName] = gceImage 312 } 313 } 314 315 // Allow users to specify additional images via cli flags for local testing 316 // convenience; merge in with config file 317 if len(g.cfg.Images) > 0 { 318 if *imageProject == "" { 319 klog.Fatal("Must specify --image-project if you specify --images") 320 } 321 for _, image := range g.cfg.Images { 322 gceImage := internalGCEImage{ 323 image: image, 324 project: *imageProject, 325 metadata: g.getImageMetadata(*instanceMetadata), 326 } 327 gceImages.images[image] = gceImage 328 } 329 } 330 331 if len(gceImages.images) != 0 && *zone == "" { 332 return nil, errors.New("must specify --zone flag") 333 } 334 // Make sure GCP project is set. Without a project, images can't be retrieved.. 335 for shortName, imageConfig := range gceImages.images { 336 if imageConfig.project == "" { 337 return nil, fmt.Errorf("invalid config for %v; must specify a project", shortName) 338 } 339 } 340 if len(gceImages.images) != 0 { 341 if *project == "" { 342 return nil, errors.New("must specify --project flag to launch images into") 343 } 344 } 345 346 return gceImages, nil 347 } 348 349 type imageObj struct { 350 creationTime time.Time 351 name string 352 } 353 354 type byCreationTime []imageObj 355 356 func (a byCreationTime) Len() int { return len(a) } 357 func (a byCreationTime) Less(i, j int) bool { return a[i].creationTime.After(a[j].creationTime) } 358 func (a byCreationTime) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 359 360 func (g *GCERunner) getImageMetadata(input string) *compute.Metadata { 361 if input == "" { 362 return nil 363 } 364 klog.V(3).Infof("parsing instance metadata: %q", input) 365 raw := g.parseInstanceMetadata(input) 366 klog.V(4).Infof("parsed instance metadata: %v", raw) 367 metadataItems := []*compute.MetadataItems{} 368 for k, v := range raw { 369 val := v 370 metadataItems = append(metadataItems, &compute.MetadataItems{ 371 Key: k, 372 Value: &val, 373 }) 374 } 375 ret := compute.Metadata{Items: metadataItems} 376 return &ret 377 } 378 379 func (g *GCERunner) deleteGCEInstance(host string) { 380 klog.Infof("Deleting instance %q", host) 381 _, err := g.gceComputeService.Instances.Delete(*project, *zone, host).Do() 382 if err != nil { 383 klog.Errorf("Error deleting instance %q: %v", host, err) 384 } 385 } 386 387 func (g *GCERunner) parseInstanceMetadata(str string) map[string]string { 388 metadata := make(map[string]string) 389 ss := strings.Split(str, ",") 390 for _, s := range ss { 391 kv := strings.Split(s, "=") 392 if len(kv) == 2 { 393 metadata[kv[0]] = kv[1] 394 continue 395 } 396 kp := strings.Split(s, "<") 397 if len(kp) != 2 { 398 klog.Fatalf("Invalid instance metadata: %q", s) 399 continue 400 } 401 metaPath := kp[1] 402 if g.cfg.ImageConfigDir != "" { 403 metaPath = filepath.Join(g.cfg.ImageConfigDir, metaPath) 404 } 405 v, err := os.ReadFile(metaPath) 406 if err != nil { 407 klog.Fatalf("Failed to read metadata file %q: %v", metaPath, err) 408 continue 409 } 410 metadata[kp[0]] = ignitionInjectGCEPublicKey(metaPath, string(v)) 411 } 412 for k, v := range nodeEnvs { 413 metadata[k] = v 414 } 415 return metadata 416 } 417 418 // ignitionInjectGCEPublicKey tries to inject the GCE SSH public key into the 419 // provided ignition file path. 420 // 421 // This will only being done if the job has the 422 // IGNITION_INJECT_GCE_SSH_PUBLIC_KEY_FILE environment variable set, while it 423 // tried to replace the GCE_SSH_PUBLIC_KEY_FILE_CONTENT placeholder. 424 func ignitionInjectGCEPublicKey(path string, content string) string { 425 if os.Getenv("IGNITION_INJECT_GCE_SSH_PUBLIC_KEY_FILE") == "" { 426 return content 427 } 428 429 klog.Infof("Injecting SSH public key into ignition") 430 431 const publicKeyEnv = "GCE_SSH_PUBLIC_KEY_FILE" 432 sshPublicKeyFile := os.Getenv(publicKeyEnv) 433 if sshPublicKeyFile == "" { 434 klog.Errorf("Environment variable %s is not set", publicKeyEnv) 435 os.Exit(1) 436 } 437 438 sshPublicKey, err := os.ReadFile(sshPublicKeyFile) 439 if err != nil { 440 klog.ErrorS(err, "unable to read SSH public key file") 441 os.Exit(1) 442 } 443 444 const sshPublicKeyFileContentMarker = "GCE_SSH_PUBLIC_KEY_FILE_CONTENT" 445 key := base64.StdEncoding.EncodeToString(sshPublicKey) 446 base64Marker := base64.StdEncoding.EncodeToString([]byte(sshPublicKeyFileContentMarker)) 447 replacer := strings.NewReplacer( 448 sshPublicKeyFileContentMarker, key, 449 base64Marker, key, 450 ) 451 return replacer.Replace(content) 452 } 453 454 // Provision a gce instance using image and run the tests in archive against the instance. 455 // Delete the instance afterward. 456 func (g *GCERunner) testGCEImage(suite remote.TestSuite, archivePath string, imageConfig *internalGCEImage, junitFileName string) *remote.TestResult { 457 ginkgoFlagsStr := g.cfg.GinkgoFlags 458 459 host, err := g.createGCEInstance(imageConfig) 460 if g.cfg.DeleteInstances { 461 defer g.deleteGCEInstance(host) 462 } 463 if err != nil { 464 return &remote.TestResult{ 465 Err: fmt.Errorf("unable to create gce instance with running docker daemon for image %s. %v", imageConfig.image, err), 466 } 467 } 468 469 // Only delete the files if we are keeping the instance and want it cleaned up. 470 // If we are going to delete the instance, don't bother with cleaning up the files 471 deleteFiles := !g.cfg.DeleteInstances && g.cfg.Cleanup 472 473 if err = g.registerGceHostIP(host); err != nil { 474 return &remote.TestResult{ 475 Err: err, 476 Host: host, 477 ExitOK: false, 478 } 479 } 480 481 output, exitOk, err := remote.RunRemote(remote.RunRemoteConfig{ 482 Suite: suite, 483 Archive: archivePath, 484 Host: host, 485 Cleanup: deleteFiles, 486 ImageDesc: imageConfig.imageDesc, 487 JunitFileName: junitFileName, 488 TestArgs: g.cfg.TestArgs, 489 GinkgoArgs: ginkgoFlagsStr, 490 SystemSpecName: g.cfg.SystemSpecName, 491 ExtraEnvs: g.cfg.ExtraEnvs, 492 RuntimeConfig: g.cfg.RuntimeConfig, 493 }) 494 result := remote.TestResult{ 495 Output: output, 496 Err: err, 497 Host: host, 498 ExitOK: exitOk, 499 } 500 501 // This is a temporary solution to collect serial node serial log. Only port 1 contains useful information. 502 // TODO(random-liu): Extract out and unify log collection logic with cluste e2e. 503 serialPortOutput, err := g.gceComputeService.Instances.GetSerialPortOutput(*project, *zone, host).Port(1).Do() 504 if err != nil { 505 klog.Errorf("Failed to collect serial Output from node %q: %v", host, err) 506 } else { 507 logFilename := "serial-1.log" 508 err := remote.WriteLog(host, logFilename, serialPortOutput.Contents) 509 if err != nil { 510 klog.Errorf("Failed to write serial Output from node %q to %q: %v", host, logFilename, err) 511 } 512 } 513 return &result 514 } 515 516 // Provision a gce instance using image 517 func (g *GCERunner) createGCEInstance(imageConfig *internalGCEImage) (string, error) { 518 p, err := g.gceComputeService.Projects.Get(*project).Do() 519 if err != nil { 520 return "", fmt.Errorf("failed to get project info %q: %w", *project, err) 521 } 522 // Use default service account 523 serviceAccount := p.DefaultServiceAccount 524 klog.V(1).Infof("Creating instance %+v with service account %q", *imageConfig, serviceAccount) 525 name := g.imageToInstanceName(imageConfig) 526 i := &compute.Instance{ 527 Name: name, 528 MachineType: g.machineType(imageConfig.machine), 529 NetworkInterfaces: []*compute.NetworkInterface{ 530 { 531 AccessConfigs: []*compute.AccessConfig{ 532 { 533 Type: "ONE_TO_ONE_NAT", 534 Name: "External NAT", 535 }, 536 }}, 537 }, 538 Disks: []*compute.AttachedDisk{ 539 { 540 AutoDelete: true, 541 Boot: true, 542 Type: "PERSISTENT", 543 InitializeParams: &compute.AttachedDiskInitializeParams{ 544 SourceImage: g.sourceImage(imageConfig.image, imageConfig.project), 545 DiskSizeGb: 20, 546 }, 547 }, 548 }, 549 ServiceAccounts: []*compute.ServiceAccount{ 550 { 551 Email: serviceAccount, 552 Scopes: []string{ 553 "https://www.googleapis.com/auth/cloud-platform", 554 }, 555 }, 556 }, 557 } 558 559 scheduling := compute.Scheduling{ 560 Preemptible: *preemptibleInstances, 561 } 562 for _, accelerator := range imageConfig.resources.Accelerators { 563 if i.GuestAccelerators == nil { 564 autoRestart := true 565 i.GuestAccelerators = []*compute.AcceleratorConfig{} 566 scheduling.OnHostMaintenance = "TERMINATE" 567 scheduling.AutomaticRestart = &autoRestart 568 } 569 aType := fmt.Sprintf(acceleratorTypeResourceFormat, *project, *zone, accelerator.Type) 570 ac := &compute.AcceleratorConfig{ 571 AcceleratorCount: accelerator.Count, 572 AcceleratorType: aType, 573 } 574 i.GuestAccelerators = append(i.GuestAccelerators, ac) 575 } 576 i.Scheduling = &scheduling 577 i.Metadata = imageConfig.metadata 578 var insertionOperationName string 579 if _, err := g.gceComputeService.Instances.Get(*project, *zone, i.Name).Do(); err != nil { 580 op, err := g.gceComputeService.Instances.Insert(*project, *zone, i).Do() 581 582 if err != nil { 583 ret := fmt.Sprintf("could not create instance %s: API error: %v", name, err) 584 if op != nil { 585 ret = fmt.Sprintf("%s: %v", ret, op.Error) 586 } 587 return "", fmt.Errorf(ret) 588 } else if op.Error != nil { 589 var errs []string 590 for _, insertErr := range op.Error.Errors { 591 errs = append(errs, fmt.Sprintf("%+v", insertErr)) 592 } 593 return "", fmt.Errorf("could not create instance %s: %+v", name, errs) 594 595 } 596 insertionOperationName = op.Name 597 } 598 instanceRunning := false 599 var instance *compute.Instance 600 for i := 0; i < 30 && !instanceRunning; i++ { 601 if i > 0 { 602 time.Sleep(time.Second * 20) 603 } 604 var insertionOperation *compute.Operation 605 insertionOperation, err = g.gceComputeService.ZoneOperations.Get(*project, *zone, insertionOperationName).Do() 606 if err != nil { 607 continue 608 } 609 if strings.ToUpper(insertionOperation.Status) != "DONE" { 610 err = fmt.Errorf("instance insert operation %s not in state DONE, was %s", name, insertionOperation.Status) 611 continue 612 } 613 if insertionOperation.Error != nil { 614 var errs []string 615 for _, insertErr := range insertionOperation.Error.Errors { 616 errs = append(errs, fmt.Sprintf("%+v", insertErr)) 617 } 618 return name, fmt.Errorf("could not create instance %s: %+v", name, errs) 619 } 620 621 instance, err = g.gceComputeService.Instances.Get(*project, *zone, name).Do() 622 if err != nil { 623 continue 624 } 625 if strings.ToUpper(instance.Status) != "RUNNING" { 626 err = fmt.Errorf("instance %s not in state RUNNING, was %s", name, instance.Status) 627 continue 628 } 629 externalIP := g.getExternalIP(instance) 630 if len(externalIP) > 0 { 631 remote.AddHostnameIP(name, externalIP) 632 } 633 634 var output string 635 output, err = remote.SSH(name, "sh", "-c", 636 "'systemctl list-units --type=service --state=running | grep -e containerd -e crio'") 637 if err != nil { 638 err = fmt.Errorf("instance %s not running containerd/crio daemon - Command failed: %s", name, output) 639 continue 640 } 641 if !strings.Contains(output, "containerd.service") && 642 !strings.Contains(output, "crio.service") { 643 err = fmt.Errorf("instance %s not running containerd/crio daemon: %s", name, output) 644 continue 645 } 646 instanceRunning = true 647 } 648 // If instance didn't reach running state in time, return with error now. 649 if err != nil { 650 return name, err 651 } 652 // Instance reached running state in time, make sure that cloud-init is complete 653 if g.isCloudInitUsed(imageConfig.metadata) { 654 cloudInitFinished := false 655 for i := 0; i < 60 && !cloudInitFinished; i++ { 656 if i > 0 { 657 time.Sleep(time.Second * 20) 658 } 659 var finished string 660 finished, err = remote.SSH(name, "ls", "/var/lib/cloud/instance/boot-finished") 661 if err != nil { 662 err = fmt.Errorf("instance %s has not finished cloud-init script: %s", name, finished) 663 continue 664 } 665 cloudInitFinished = true 666 } 667 } 668 669 // apply additional kernel arguments to the instance 670 if len(imageConfig.kernelArguments) > 0 { 671 klog.Info("Update kernel arguments") 672 if err := g.updateKernelArguments(instance, imageConfig.image, imageConfig.kernelArguments); err != nil { 673 return name, err 674 } 675 } 676 677 return name, err 678 } 679 680 func (g *GCERunner) isCloudInitUsed(metadata *compute.Metadata) bool { 681 if metadata == nil { 682 return false 683 } 684 for _, item := range metadata.Items { 685 if item.Key == "user-data" && item.Value != nil && strings.HasPrefix(*item.Value, "#cloud-config") { 686 return true 687 } 688 } 689 return false 690 } 691 692 func (g *GCERunner) sourceImage(image, imageProject string) string { 693 return fmt.Sprintf("projects/%s/global/images/%s", imageProject, image) 694 } 695 696 func (g *GCERunner) imageToInstanceName(imageConfig *internalGCEImage) string { 697 if imageConfig.machine == "" { 698 return g.cfg.InstanceNamePrefix + "-" + imageConfig.image 699 } 700 // For benchmark test, node name has the format 'machine-image-uuid' to run 701 // different machine types with the same image in parallel 702 return imageConfig.machine + "-" + imageConfig.image + "-" + uuid.New().String()[:8] 703 } 704 705 func (g *GCERunner) registerGceHostIP(host string) error { 706 instance, err := g.gceComputeService.Instances.Get(*project, *zone, host).Do() 707 if err != nil { 708 return err 709 } 710 if strings.ToUpper(instance.Status) != "RUNNING" { 711 return fmt.Errorf("instance %s not in state RUNNING, was %s", host, instance.Status) 712 } 713 externalIP := g.getExternalIP(instance) 714 if len(externalIP) > 0 { 715 remote.AddHostnameIP(host, externalIP) 716 } 717 return nil 718 } 719 func (g *GCERunner) getExternalIP(instance *compute.Instance) string { 720 for i := range instance.NetworkInterfaces { 721 ni := instance.NetworkInterfaces[i] 722 for j := range ni.AccessConfigs { 723 ac := ni.AccessConfigs[j] 724 if len(ac.NatIP) > 0 { 725 return ac.NatIP 726 } 727 } 728 } 729 return "" 730 } 731 func (g *GCERunner) updateKernelArguments(instance *compute.Instance, image string, kernelArgs []string) error { 732 kernelArgsString := strings.Join(kernelArgs, " ") 733 734 var cmd []string 735 if strings.Contains(image, "cos") { 736 cmd = []string{ 737 "dir=$(mktemp -d)", 738 "mount /dev/sda12 ${dir}", 739 fmt.Sprintf("sed -i -e \"s|cros_efi|cros_efi %s|g\" ${dir}/efi/boot/grub.cfg", kernelArgsString), 740 "umount ${dir}", 741 "rmdir ${dir}", 742 } 743 } 744 745 if strings.Contains(image, "ubuntu") { 746 cmd = []string{ 747 fmt.Sprintf("echo \"GRUB_CMDLINE_LINUX_DEFAULT=%s ${GRUB_CMDLINE_LINUX_DEFAULT}\" > /etc/default/grub.d/99-additional-arguments.cfg", kernelArgsString), 748 "/usr/sbin/update-grub", 749 } 750 } 751 752 if len(cmd) == 0 { 753 klog.Warningf("The image %s does not support adding an additional kernel arguments", image) 754 return nil 755 } 756 757 out, err := remote.SSH(instance.Name, "sh", "-c", fmt.Sprintf("'%s'", strings.Join(cmd, "&&"))) 758 if err != nil { 759 klog.Errorf("failed to run command %s: out: %s, Err: %v", cmd, out, err) 760 return err 761 } 762 763 if err := g.rebootInstance(instance); err != nil { 764 return err 765 } 766 767 return nil 768 } 769 770 func (g *GCERunner) machineType(machine string) string { 771 var ret string 772 if machine == "" && *instanceType != "" { 773 ret = *instanceType 774 } else if machine != "" { 775 ret = machine 776 } else { 777 ret = defaultGCEMachine 778 } 779 return fmt.Sprintf("zones/%s/machineTypes/%s", *zone, ret) 780 } 781 func (g *GCERunner) rebootInstance(instance *compute.Instance) error { 782 // wait until the instance will not response to SSH 783 klog.Info("Reboot the node and wait for instance not to be available via SSH") 784 if waitErr := wait.PollImmediate(5*time.Second, 5*time.Minute, func() (bool, error) { 785 if _, err := remote.SSH(instance.Name, "reboot"); err != nil { 786 return true, nil 787 } 788 789 return false, nil 790 }); waitErr != nil { 791 return fmt.Errorf("the instance %s still response to SSH: %v", instance.Name, waitErr) 792 } 793 794 // wait until the instance will response again to SSH 795 klog.Info("Wait for instance to be available via SSH") 796 if waitErr := wait.PollImmediate(30*time.Second, 5*time.Minute, func() (bool, error) { 797 if _, err := remote.SSH(instance.Name, "sh", "-c", "date"); err != nil { 798 return false, nil 799 } 800 return true, nil 801 }); waitErr != nil { 802 return fmt.Errorf("the instance %s does not response to SSH: %v", instance.Name, waitErr) 803 } 804 805 return nil 806 }