k8s.io/kubernetes@v1.29.3/test/e2e_node/remote/gce/gce_runner.go (about)

     1  /*
     2  Copyright 2023 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package gce
    18  
    19  import (
    20  	"context"
    21  	"encoding/base64"
    22  	"errors"
    23  	"flag"
    24  	"fmt"
    25  	"net/http"
    26  	"os"
    27  	"path/filepath"
    28  	"regexp"
    29  	"sort"
    30  	"strings"
    31  	"time"
    32  
    33  	"k8s.io/kubernetes/test/e2e_node/remote"
    34  
    35  	"github.com/google/uuid"
    36  	"golang.org/x/oauth2/google"
    37  	"google.golang.org/api/compute/v1"
    38  	"google.golang.org/api/option"
    39  	"k8s.io/apimachinery/pkg/util/wait"
    40  	"k8s.io/klog/v2"
    41  	"sigs.k8s.io/yaml"
    42  )
    43  
// Compile-time assertion that *GCERunner satisfies the remote.Runner interface.
var _ remote.Runner = (*GCERunner)(nil)

// init registers NewGCERunner with the remote package under the name "gce".
func init() {
	remote.RegisterRunner("gce", NewGCERunner)
}
    49  
    50  // envs is the type used to collect all node envs. The key is the env name,
    51  // and the value is the env value
    52  type envs map[string]string
    53  
    54  // String function of flag.Value
    55  func (e *envs) String() string {
    56  	return fmt.Sprint(*e)
    57  }
    58  
    59  // Set function of flag.Value
    60  func (e *envs) Set(value string) error {
    61  	if value == "" {
    62  		return nil
    63  	}
    64  	kv := strings.SplitN(value, "=", 2)
    65  	if len(kv) != 2 {
    66  		return fmt.Errorf("invalid env string %s", value)
    67  	}
    68  	emap := *e
    69  	emap[kv[0]] = kv[1]
    70  	return nil
    71  }
    72  
// nodeEnvs is the node envs from the flag `node-env`. Every pair collected
// here is copied into the instance metadata by parseInstanceMetadata.
var nodeEnvs = make(envs)
    75  
    76  var project = flag.String("project", "", "gce project the hosts live in (gce)")
    77  var zone = flag.String("zone", "", "gce zone that the hosts live in (gce)")
    78  var instanceMetadata = flag.String("instance-metadata", "", "key/value metadata for instances separated by '=' or '<', 'k=v' means the key is 'k' and the value is 'v'; 'k<p' means the key is 'k' and the value is extracted from the local path 'p', e.g. k1=v1,k2<p2  (gce)")
    79  var imageProject = flag.String("image-project", "", "gce project the hosts live in  (gce)")
    80  var instanceType = flag.String("instance-type", "e2-medium", "GCP Machine type to use for test")
    81  var preemptibleInstances = flag.Bool("preemptible-instances", false, "If true, gce instances will be configured to be preemptible  (gce)")
    82  
// init registers the repeatable `node-env` flag, which collects its
// NAME=VALUE arguments into nodeEnvs.
func init() {
	flag.Var(&nodeEnvs, "node-env", "An environment variable passed to instance as metadata, e.g. when '--node-env=PATH=/usr/bin' is specified, there will be an extra instance metadata 'PATH=/usr/bin'.")
}
    86  
// defaultGCEMachine is the machine type machineType falls back to when neither
// the image config nor the --instance-type flag provides one.
//
// acceleratorTypeResourceFormat is the format string for an accelerator type
// resource URL; createGCEInstance fills it with the project, the zone and the
// accelerator type, in that order.
const (
	defaultGCEMachine             = "n1-standard-1"
	acceleratorTypeResourceFormat = "https://www.googleapis.com/compute/v1/projects/%s/zones/%s/acceleratorTypes/%s"
)
    91  
// GCERunner is the remote.Runner implementation registered under the name
// "gce". It holds the runner configuration (cfg), the compute API client
// (gceComputeService) and the resolved set of images to test (gceImages);
// the latter two are populated by Validate.
type GCERunner struct {
	cfg               remote.Config
	gceComputeService *compute.Service
	gceImages         *internalGCEImageConfig
}
    97  
    98  func NewGCERunner(cfg remote.Config) remote.Runner {
    99  	if cfg.InstanceNamePrefix == "" {
   100  		cfg.InstanceNamePrefix = "tmp-node-e2e-" + uuid.New().String()[:8]
   101  	}
   102  	return &GCERunner{cfg: cfg}
   103  }
   104  
   105  func (g *GCERunner) Validate() error {
   106  	if len(g.cfg.Hosts) == 0 && g.cfg.ImageConfigFile == "" && len(g.cfg.Images) == 0 {
   107  		klog.Fatalf("Must specify one of --image-config-file, --hosts, --images.")
   108  	}
   109  	var err error
   110  	g.gceComputeService, err = getComputeClient()
   111  	if err != nil {
   112  		return fmt.Errorf("Unable to create gcloud compute service using defaults.  Make sure you are authenticated. %w", err)
   113  	}
   114  
   115  	if g.gceImages, err = g.prepareGceImages(); err != nil {
   116  		klog.Fatalf("While preparing GCE images: %v", err)
   117  	}
   118  	return nil
   119  }
   120  
   121  func (g *GCERunner) StartTests(suite remote.TestSuite, archivePath string, results chan *remote.TestResult) (numTests int) {
   122  	for shortName := range g.gceImages.images {
   123  		imageConfig := g.gceImages.images[shortName]
   124  		numTests++
   125  		fmt.Printf("Initializing e2e tests using image %s/%s/%s.\n", shortName, imageConfig.project, imageConfig.image)
   126  		go func(image *internalGCEImage, junitFileName string) {
   127  			results <- g.testGCEImage(suite, archivePath, image, junitFileName)
   128  		}(&imageConfig, shortName)
   129  	}
   130  	return
   131  }
   132  
   133  func getComputeClient() (*compute.Service, error) {
   134  	const retries = 10
   135  	const backoff = time.Second * 6
   136  
   137  	// Setup the gce client for provisioning instances
   138  	// Getting credentials on gce jenkins is flaky, so try a couple times
   139  	var err error
   140  	var cs *compute.Service
   141  	for i := 0; i < retries; i++ {
   142  		if i > 0 {
   143  			time.Sleep(backoff)
   144  		}
   145  
   146  		var client *http.Client
   147  		client, err = google.DefaultClient(context.Background(), compute.ComputeScope)
   148  		if err != nil {
   149  			continue
   150  		}
   151  
   152  		cs, err = compute.NewService(context.Background(), option.WithHTTPClient(client))
   153  		if err != nil {
   154  			continue
   155  		}
   156  		return cs, nil
   157  	}
   158  	return nil, err
   159  }
   160  
   161  // Accelerator contains type and count about resource.
   162  type Accelerator struct {
   163  	Type  string `json:"type,omitempty"`
   164  	Count int64  `json:"count,omitempty"`
   165  }
   166  
   167  // Resources contains accelerators array.
   168  type Resources struct {
   169  	Accelerators []Accelerator `json:"accelerators,omitempty"`
   170  }
   171  
// internalGCEImage is an internal GCE image representation for E2E node.
//
// image and project identify the boot disk source image; resources lists the
// accelerators to attach; metadata becomes the instance metadata; machine, if
// set, selects the machine type and switches the instance name to the
// benchmark naming scheme; kernelArguments, if non-empty, are applied to the
// instance after it is running.
type internalGCEImage struct {
	image string
	// imageDesc is the description of the image. If empty, the value in the
	// 'image' will be used.
	imageDesc       string
	kernelArguments []string
	project         string
	resources       Resources
	metadata        *compute.Metadata
	machine         string
}
   184  
// internalGCEImageConfig is the resolved set of images to test. Entries that
// come from the image config file are keyed by their short name; entries that
// come from the --images flag are keyed by the image name itself.
type internalGCEImageConfig struct {
	images map[string]internalGCEImage
}
   188  
// GCEImageConfig specifies what images should be run and how for these tests.
// It can be created via the `--images` and `--image-project` flags, or by
// specifying the `--image-config-file` flag, pointing to a json or yaml file
// of the form:
//
//	images:
//	  short-name:
//	    image: gce-image-name
//	    project: gce-image-project
//	    machine: for benchmark only, the machine type (GCE instance) to run test
//	    tests: for benchmark only, a list of ginkgo focus strings to match tests
//
// NOTE(review): GCEImage declares no field for the `tests` key shown above, so
// it appears to be ignored when the file is parsed here — confirm.
//
// TODO(coufon): replace 'image' with 'node' in configurations
// and we plan to support testing custom machines other than GCE by specifying Host
type GCEImageConfig struct {
	Images map[string]GCEImage `json:"images"`
}
   206  
// GCEImage contains some information about GCE Image.
//
// Image names the image explicitly. When Image is empty and ImageRegex and/or
// ImageFamily is set, the most recently created image in Project matching them
// is used instead. ImageDesc defaults to the resolved image name when empty.
// Metadata uses the same comma separated syntax as the --instance-metadata
// flag, whose value is appended to it. KernelArguments, Machine and Resources
// are copied unchanged into the internal image representation.
type GCEImage struct {
	Image      string `json:"image,omitempty"`
	ImageRegex string `json:"image_regex,omitempty"`
	// ImageFamily is the image family to use. The latest image from the image family will be used, e.g cos-81-lts.
	ImageFamily     string    `json:"image_family,omitempty"`
	ImageDesc       string    `json:"image_description,omitempty"`
	KernelArguments []string  `json:"kernel_arguments,omitempty"`
	Project         string    `json:"project"`
	Metadata        string    `json:"metadata"`
	Machine         string    `json:"machine,omitempty"`
	Resources       Resources `json:"resources,omitempty"`
}
   220  
   221  // Returns an image name based on regex and given GCE project.
   222  func (g *GCERunner) getGCEImage(imageRegex, imageFamily string, project string) (string, error) {
   223  	imageObjs := []imageObj{}
   224  	imageRe := regexp.MustCompile(imageRegex)
   225  	if err := g.gceComputeService.Images.List(project).Pages(context.Background(),
   226  		func(ilc *compute.ImageList) error {
   227  			for _, instance := range ilc.Items {
   228  				if imageRegex != "" && !imageRe.MatchString(instance.Name) {
   229  					continue
   230  				}
   231  				if imageFamily != "" && instance.Family != imageFamily {
   232  					continue
   233  				}
   234  				creationTime, err := time.Parse(time.RFC3339, instance.CreationTimestamp)
   235  				if err != nil {
   236  					return fmt.Errorf("failed to parse instance creation timestamp %q: %w", instance.CreationTimestamp, err)
   237  				}
   238  				io := imageObj{
   239  					creationTime: creationTime,
   240  					name:         instance.Name,
   241  				}
   242  				imageObjs = append(imageObjs, io)
   243  			}
   244  			return nil
   245  		},
   246  	); err != nil {
   247  		return "", fmt.Errorf("failed to list images in project %q: %w", project, err)
   248  	}
   249  
   250  	// Pick the latest image after sorting.
   251  	sort.Sort(byCreationTime(imageObjs))
   252  	if len(imageObjs) > 0 {
   253  		klog.V(4).Infof("found images %+v based on regex %q and family %q in project %q", imageObjs, imageRegex, imageFamily, project)
   254  		return imageObjs[0].name, nil
   255  	}
   256  	return "", fmt.Errorf("found zero images based on regex %q and family %q in project %q", imageRegex, imageFamily, project)
   257  }
   258  
   259  func (g *GCERunner) prepareGceImages() (*internalGCEImageConfig, error) {
   260  	gceImages := &internalGCEImageConfig{
   261  		images: make(map[string]internalGCEImage),
   262  	}
   263  
   264  	// Parse images from given config file and convert them to internalGCEImage.
   265  	if g.cfg.ImageConfigFile != "" {
   266  		configPath := g.cfg.ImageConfigFile
   267  		if g.cfg.ImageConfigDir != "" {
   268  			configPath = filepath.Join(g.cfg.ImageConfigDir, g.cfg.ImageConfigFile)
   269  		}
   270  
   271  		imageConfigData, err := os.ReadFile(configPath)
   272  		if err != nil {
   273  			return nil, fmt.Errorf("Could not read image config file provided: %w", err)
   274  		}
   275  		// Unmarshal the given image config file. All images for this test run will be organized into a map.
   276  		// shortName->GCEImage, e.g cos-stable->cos-stable-81-12871-103-0.
   277  		externalImageConfig := GCEImageConfig{Images: make(map[string]GCEImage)}
   278  		err = yaml.Unmarshal(imageConfigData, &externalImageConfig)
   279  		if err != nil {
   280  			return nil, fmt.Errorf("Could not parse image config file: %w", err)
   281  		}
   282  
   283  		for shortName, imageConfig := range externalImageConfig.Images {
   284  			var image string
   285  			if (imageConfig.ImageRegex != "" || imageConfig.ImageFamily != "") && imageConfig.Image == "" {
   286  				image, err = g.getGCEImage(imageConfig.ImageRegex, imageConfig.ImageFamily, imageConfig.Project)
   287  				if err != nil {
   288  					return nil, fmt.Errorf("Could not retrieve a image based on image regex %q and family %q: %v",
   289  						imageConfig.ImageRegex, imageConfig.ImageFamily, err)
   290  				}
   291  			} else {
   292  				image = imageConfig.Image
   293  			}
   294  			// Convert the given image into an internalGCEImage.
   295  			metadata := imageConfig.Metadata
   296  			if len(strings.TrimSpace(*instanceMetadata)) > 0 {
   297  				metadata += "," + *instanceMetadata
   298  			}
   299  			gceImage := internalGCEImage{
   300  				image:           image,
   301  				imageDesc:       imageConfig.ImageDesc,
   302  				project:         imageConfig.Project,
   303  				metadata:        g.getImageMetadata(metadata),
   304  				kernelArguments: imageConfig.KernelArguments,
   305  				machine:         imageConfig.Machine,
   306  				resources:       imageConfig.Resources,
   307  			}
   308  			if gceImage.imageDesc == "" {
   309  				gceImage.imageDesc = gceImage.image
   310  			}
   311  			gceImages.images[shortName] = gceImage
   312  		}
   313  	}
   314  
   315  	// Allow users to specify additional images via cli flags for local testing
   316  	// convenience; merge in with config file
   317  	if len(g.cfg.Images) > 0 {
   318  		if *imageProject == "" {
   319  			klog.Fatal("Must specify --image-project if you specify --images")
   320  		}
   321  		for _, image := range g.cfg.Images {
   322  			gceImage := internalGCEImage{
   323  				image:    image,
   324  				project:  *imageProject,
   325  				metadata: g.getImageMetadata(*instanceMetadata),
   326  			}
   327  			gceImages.images[image] = gceImage
   328  		}
   329  	}
   330  
   331  	if len(gceImages.images) != 0 && *zone == "" {
   332  		return nil, errors.New("must specify --zone flag")
   333  	}
   334  	// Make sure GCP project is set. Without a project, images can't be retrieved..
   335  	for shortName, imageConfig := range gceImages.images {
   336  		if imageConfig.project == "" {
   337  			return nil, fmt.Errorf("invalid config for %v; must specify a project", shortName)
   338  		}
   339  	}
   340  	if len(gceImages.images) != 0 {
   341  		if *project == "" {
   342  			return nil, errors.New("must specify --project flag to launch images into")
   343  		}
   344  	}
   345  
   346  	return gceImages, nil
   347  }
   348  
   349  type imageObj struct {
   350  	creationTime time.Time
   351  	name         string
   352  }
   353  
   354  type byCreationTime []imageObj
   355  
   356  func (a byCreationTime) Len() int           { return len(a) }
   357  func (a byCreationTime) Less(i, j int) bool { return a[i].creationTime.After(a[j].creationTime) }
   358  func (a byCreationTime) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
   359  
   360  func (g *GCERunner) getImageMetadata(input string) *compute.Metadata {
   361  	if input == "" {
   362  		return nil
   363  	}
   364  	klog.V(3).Infof("parsing instance metadata: %q", input)
   365  	raw := g.parseInstanceMetadata(input)
   366  	klog.V(4).Infof("parsed instance metadata: %v", raw)
   367  	metadataItems := []*compute.MetadataItems{}
   368  	for k, v := range raw {
   369  		val := v
   370  		metadataItems = append(metadataItems, &compute.MetadataItems{
   371  			Key:   k,
   372  			Value: &val,
   373  		})
   374  	}
   375  	ret := compute.Metadata{Items: metadataItems}
   376  	return &ret
   377  }
   378  
   379  func (g *GCERunner) deleteGCEInstance(host string) {
   380  	klog.Infof("Deleting instance %q", host)
   381  	_, err := g.gceComputeService.Instances.Delete(*project, *zone, host).Do()
   382  	if err != nil {
   383  		klog.Errorf("Error deleting instance %q: %v", host, err)
   384  	}
   385  }
   386  
   387  func (g *GCERunner) parseInstanceMetadata(str string) map[string]string {
   388  	metadata := make(map[string]string)
   389  	ss := strings.Split(str, ",")
   390  	for _, s := range ss {
   391  		kv := strings.Split(s, "=")
   392  		if len(kv) == 2 {
   393  			metadata[kv[0]] = kv[1]
   394  			continue
   395  		}
   396  		kp := strings.Split(s, "<")
   397  		if len(kp) != 2 {
   398  			klog.Fatalf("Invalid instance metadata: %q", s)
   399  			continue
   400  		}
   401  		metaPath := kp[1]
   402  		if g.cfg.ImageConfigDir != "" {
   403  			metaPath = filepath.Join(g.cfg.ImageConfigDir, metaPath)
   404  		}
   405  		v, err := os.ReadFile(metaPath)
   406  		if err != nil {
   407  			klog.Fatalf("Failed to read metadata file %q: %v", metaPath, err)
   408  			continue
   409  		}
   410  		metadata[kp[0]] = ignitionInjectGCEPublicKey(metaPath, string(v))
   411  	}
   412  	for k, v := range nodeEnvs {
   413  		metadata[k] = v
   414  	}
   415  	return metadata
   416  }
   417  
   418  // ignitionInjectGCEPublicKey tries to inject the GCE SSH public key into the
   419  // provided ignition file path.
   420  //
   421  // This will only being done if the job has the
   422  // IGNITION_INJECT_GCE_SSH_PUBLIC_KEY_FILE environment variable set, while it
   423  // tried to replace the GCE_SSH_PUBLIC_KEY_FILE_CONTENT placeholder.
   424  func ignitionInjectGCEPublicKey(path string, content string) string {
   425  	if os.Getenv("IGNITION_INJECT_GCE_SSH_PUBLIC_KEY_FILE") == "" {
   426  		return content
   427  	}
   428  
   429  	klog.Infof("Injecting SSH public key into ignition")
   430  
   431  	const publicKeyEnv = "GCE_SSH_PUBLIC_KEY_FILE"
   432  	sshPublicKeyFile := os.Getenv(publicKeyEnv)
   433  	if sshPublicKeyFile == "" {
   434  		klog.Errorf("Environment variable %s is not set", publicKeyEnv)
   435  		os.Exit(1)
   436  	}
   437  
   438  	sshPublicKey, err := os.ReadFile(sshPublicKeyFile)
   439  	if err != nil {
   440  		klog.ErrorS(err, "unable to read SSH public key file")
   441  		os.Exit(1)
   442  	}
   443  
   444  	const sshPublicKeyFileContentMarker = "GCE_SSH_PUBLIC_KEY_FILE_CONTENT"
   445  	key := base64.StdEncoding.EncodeToString(sshPublicKey)
   446  	base64Marker := base64.StdEncoding.EncodeToString([]byte(sshPublicKeyFileContentMarker))
   447  	replacer := strings.NewReplacer(
   448  		sshPublicKeyFileContentMarker, key,
   449  		base64Marker, key,
   450  	)
   451  	return replacer.Replace(content)
   452  }
   453  
   454  // Provision a gce instance using image and run the tests in archive against the instance.
   455  // Delete the instance afterward.
   456  func (g *GCERunner) testGCEImage(suite remote.TestSuite, archivePath string, imageConfig *internalGCEImage, junitFileName string) *remote.TestResult {
   457  	ginkgoFlagsStr := g.cfg.GinkgoFlags
   458  
   459  	host, err := g.createGCEInstance(imageConfig)
   460  	if g.cfg.DeleteInstances {
   461  		defer g.deleteGCEInstance(host)
   462  	}
   463  	if err != nil {
   464  		return &remote.TestResult{
   465  			Err: fmt.Errorf("unable to create gce instance with running docker daemon for image %s.  %v", imageConfig.image, err),
   466  		}
   467  	}
   468  
   469  	// Only delete the files if we are keeping the instance and want it cleaned up.
   470  	// If we are going to delete the instance, don't bother with cleaning up the files
   471  	deleteFiles := !g.cfg.DeleteInstances && g.cfg.Cleanup
   472  
   473  	if err = g.registerGceHostIP(host); err != nil {
   474  		return &remote.TestResult{
   475  			Err:    err,
   476  			Host:   host,
   477  			ExitOK: false,
   478  		}
   479  	}
   480  
   481  	output, exitOk, err := remote.RunRemote(remote.RunRemoteConfig{
   482  		Suite:          suite,
   483  		Archive:        archivePath,
   484  		Host:           host,
   485  		Cleanup:        deleteFiles,
   486  		ImageDesc:      imageConfig.imageDesc,
   487  		JunitFileName:  junitFileName,
   488  		TestArgs:       g.cfg.TestArgs,
   489  		GinkgoArgs:     ginkgoFlagsStr,
   490  		SystemSpecName: g.cfg.SystemSpecName,
   491  		ExtraEnvs:      g.cfg.ExtraEnvs,
   492  		RuntimeConfig:  g.cfg.RuntimeConfig,
   493  	})
   494  	result := remote.TestResult{
   495  		Output: output,
   496  		Err:    err,
   497  		Host:   host,
   498  		ExitOK: exitOk,
   499  	}
   500  
   501  	// This is a temporary solution to collect serial node serial log. Only port 1 contains useful information.
   502  	// TODO(random-liu): Extract out and unify log collection logic with cluste e2e.
   503  	serialPortOutput, err := g.gceComputeService.Instances.GetSerialPortOutput(*project, *zone, host).Port(1).Do()
   504  	if err != nil {
   505  		klog.Errorf("Failed to collect serial Output from node %q: %v", host, err)
   506  	} else {
   507  		logFilename := "serial-1.log"
   508  		err := remote.WriteLog(host, logFilename, serialPortOutput.Contents)
   509  		if err != nil {
   510  			klog.Errorf("Failed to write serial Output from node %q to %q: %v", host, logFilename, err)
   511  		}
   512  	}
   513  	return &result
   514  }
   515  
   516  // Provision a gce instance using image
   517  func (g *GCERunner) createGCEInstance(imageConfig *internalGCEImage) (string, error) {
   518  	p, err := g.gceComputeService.Projects.Get(*project).Do()
   519  	if err != nil {
   520  		return "", fmt.Errorf("failed to get project info %q: %w", *project, err)
   521  	}
   522  	// Use default service account
   523  	serviceAccount := p.DefaultServiceAccount
   524  	klog.V(1).Infof("Creating instance %+v  with service account %q", *imageConfig, serviceAccount)
   525  	name := g.imageToInstanceName(imageConfig)
   526  	i := &compute.Instance{
   527  		Name:        name,
   528  		MachineType: g.machineType(imageConfig.machine),
   529  		NetworkInterfaces: []*compute.NetworkInterface{
   530  			{
   531  				AccessConfigs: []*compute.AccessConfig{
   532  					{
   533  						Type: "ONE_TO_ONE_NAT",
   534  						Name: "External NAT",
   535  					},
   536  				}},
   537  		},
   538  		Disks: []*compute.AttachedDisk{
   539  			{
   540  				AutoDelete: true,
   541  				Boot:       true,
   542  				Type:       "PERSISTENT",
   543  				InitializeParams: &compute.AttachedDiskInitializeParams{
   544  					SourceImage: g.sourceImage(imageConfig.image, imageConfig.project),
   545  					DiskSizeGb:  20,
   546  				},
   547  			},
   548  		},
   549  		ServiceAccounts: []*compute.ServiceAccount{
   550  			{
   551  				Email: serviceAccount,
   552  				Scopes: []string{
   553  					"https://www.googleapis.com/auth/cloud-platform",
   554  				},
   555  			},
   556  		},
   557  	}
   558  
   559  	scheduling := compute.Scheduling{
   560  		Preemptible: *preemptibleInstances,
   561  	}
   562  	for _, accelerator := range imageConfig.resources.Accelerators {
   563  		if i.GuestAccelerators == nil {
   564  			autoRestart := true
   565  			i.GuestAccelerators = []*compute.AcceleratorConfig{}
   566  			scheduling.OnHostMaintenance = "TERMINATE"
   567  			scheduling.AutomaticRestart = &autoRestart
   568  		}
   569  		aType := fmt.Sprintf(acceleratorTypeResourceFormat, *project, *zone, accelerator.Type)
   570  		ac := &compute.AcceleratorConfig{
   571  			AcceleratorCount: accelerator.Count,
   572  			AcceleratorType:  aType,
   573  		}
   574  		i.GuestAccelerators = append(i.GuestAccelerators, ac)
   575  	}
   576  	i.Scheduling = &scheduling
   577  	i.Metadata = imageConfig.metadata
   578  	var insertionOperationName string
   579  	if _, err := g.gceComputeService.Instances.Get(*project, *zone, i.Name).Do(); err != nil {
   580  		op, err := g.gceComputeService.Instances.Insert(*project, *zone, i).Do()
   581  
   582  		if err != nil {
   583  			ret := fmt.Sprintf("could not create instance %s: API error: %v", name, err)
   584  			if op != nil {
   585  				ret = fmt.Sprintf("%s: %v", ret, op.Error)
   586  			}
   587  			return "", fmt.Errorf(ret)
   588  		} else if op.Error != nil {
   589  			var errs []string
   590  			for _, insertErr := range op.Error.Errors {
   591  				errs = append(errs, fmt.Sprintf("%+v", insertErr))
   592  			}
   593  			return "", fmt.Errorf("could not create instance %s: %+v", name, errs)
   594  
   595  		}
   596  		insertionOperationName = op.Name
   597  	}
   598  	instanceRunning := false
   599  	var instance *compute.Instance
   600  	for i := 0; i < 30 && !instanceRunning; i++ {
   601  		if i > 0 {
   602  			time.Sleep(time.Second * 20)
   603  		}
   604  		var insertionOperation *compute.Operation
   605  		insertionOperation, err = g.gceComputeService.ZoneOperations.Get(*project, *zone, insertionOperationName).Do()
   606  		if err != nil {
   607  			continue
   608  		}
   609  		if strings.ToUpper(insertionOperation.Status) != "DONE" {
   610  			err = fmt.Errorf("instance insert operation %s not in state DONE, was %s", name, insertionOperation.Status)
   611  			continue
   612  		}
   613  		if insertionOperation.Error != nil {
   614  			var errs []string
   615  			for _, insertErr := range insertionOperation.Error.Errors {
   616  				errs = append(errs, fmt.Sprintf("%+v", insertErr))
   617  			}
   618  			return name, fmt.Errorf("could not create instance %s: %+v", name, errs)
   619  		}
   620  
   621  		instance, err = g.gceComputeService.Instances.Get(*project, *zone, name).Do()
   622  		if err != nil {
   623  			continue
   624  		}
   625  		if strings.ToUpper(instance.Status) != "RUNNING" {
   626  			err = fmt.Errorf("instance %s not in state RUNNING, was %s", name, instance.Status)
   627  			continue
   628  		}
   629  		externalIP := g.getExternalIP(instance)
   630  		if len(externalIP) > 0 {
   631  			remote.AddHostnameIP(name, externalIP)
   632  		}
   633  
   634  		var output string
   635  		output, err = remote.SSH(name, "sh", "-c",
   636  			"'systemctl list-units  --type=service  --state=running | grep -e containerd -e crio'")
   637  		if err != nil {
   638  			err = fmt.Errorf("instance %s not running containerd/crio daemon - Command failed: %s", name, output)
   639  			continue
   640  		}
   641  		if !strings.Contains(output, "containerd.service") &&
   642  			!strings.Contains(output, "crio.service") {
   643  			err = fmt.Errorf("instance %s not running containerd/crio daemon: %s", name, output)
   644  			continue
   645  		}
   646  		instanceRunning = true
   647  	}
   648  	// If instance didn't reach running state in time, return with error now.
   649  	if err != nil {
   650  		return name, err
   651  	}
   652  	// Instance reached running state in time, make sure that cloud-init is complete
   653  	if g.isCloudInitUsed(imageConfig.metadata) {
   654  		cloudInitFinished := false
   655  		for i := 0; i < 60 && !cloudInitFinished; i++ {
   656  			if i > 0 {
   657  				time.Sleep(time.Second * 20)
   658  			}
   659  			var finished string
   660  			finished, err = remote.SSH(name, "ls", "/var/lib/cloud/instance/boot-finished")
   661  			if err != nil {
   662  				err = fmt.Errorf("instance %s has not finished cloud-init script: %s", name, finished)
   663  				continue
   664  			}
   665  			cloudInitFinished = true
   666  		}
   667  	}
   668  
   669  	// apply additional kernel arguments to the instance
   670  	if len(imageConfig.kernelArguments) > 0 {
   671  		klog.Info("Update kernel arguments")
   672  		if err := g.updateKernelArguments(instance, imageConfig.image, imageConfig.kernelArguments); err != nil {
   673  			return name, err
   674  		}
   675  	}
   676  
   677  	return name, err
   678  }
   679  
   680  func (g *GCERunner) isCloudInitUsed(metadata *compute.Metadata) bool {
   681  	if metadata == nil {
   682  		return false
   683  	}
   684  	for _, item := range metadata.Items {
   685  		if item.Key == "user-data" && item.Value != nil && strings.HasPrefix(*item.Value, "#cloud-config") {
   686  			return true
   687  		}
   688  	}
   689  	return false
   690  }
   691  
   692  func (g *GCERunner) sourceImage(image, imageProject string) string {
   693  	return fmt.Sprintf("projects/%s/global/images/%s", imageProject, image)
   694  }
   695  
   696  func (g *GCERunner) imageToInstanceName(imageConfig *internalGCEImage) string {
   697  	if imageConfig.machine == "" {
   698  		return g.cfg.InstanceNamePrefix + "-" + imageConfig.image
   699  	}
   700  	// For benchmark test, node name has the format 'machine-image-uuid' to run
   701  	// different machine types with the same image in parallel
   702  	return imageConfig.machine + "-" + imageConfig.image + "-" + uuid.New().String()[:8]
   703  }
   704  
   705  func (g *GCERunner) registerGceHostIP(host string) error {
   706  	instance, err := g.gceComputeService.Instances.Get(*project, *zone, host).Do()
   707  	if err != nil {
   708  		return err
   709  	}
   710  	if strings.ToUpper(instance.Status) != "RUNNING" {
   711  		return fmt.Errorf("instance %s not in state RUNNING, was %s", host, instance.Status)
   712  	}
   713  	externalIP := g.getExternalIP(instance)
   714  	if len(externalIP) > 0 {
   715  		remote.AddHostnameIP(host, externalIP)
   716  	}
   717  	return nil
   718  }
   719  func (g *GCERunner) getExternalIP(instance *compute.Instance) string {
   720  	for i := range instance.NetworkInterfaces {
   721  		ni := instance.NetworkInterfaces[i]
   722  		for j := range ni.AccessConfigs {
   723  			ac := ni.AccessConfigs[j]
   724  			if len(ac.NatIP) > 0 {
   725  				return ac.NatIP
   726  			}
   727  		}
   728  	}
   729  	return ""
   730  }
   731  func (g *GCERunner) updateKernelArguments(instance *compute.Instance, image string, kernelArgs []string) error {
   732  	kernelArgsString := strings.Join(kernelArgs, " ")
   733  
   734  	var cmd []string
   735  	if strings.Contains(image, "cos") {
   736  		cmd = []string{
   737  			"dir=$(mktemp -d)",
   738  			"mount /dev/sda12 ${dir}",
   739  			fmt.Sprintf("sed -i -e \"s|cros_efi|cros_efi %s|g\" ${dir}/efi/boot/grub.cfg", kernelArgsString),
   740  			"umount ${dir}",
   741  			"rmdir ${dir}",
   742  		}
   743  	}
   744  
   745  	if strings.Contains(image, "ubuntu") {
   746  		cmd = []string{
   747  			fmt.Sprintf("echo \"GRUB_CMDLINE_LINUX_DEFAULT=%s ${GRUB_CMDLINE_LINUX_DEFAULT}\" > /etc/default/grub.d/99-additional-arguments.cfg", kernelArgsString),
   748  			"/usr/sbin/update-grub",
   749  		}
   750  	}
   751  
   752  	if len(cmd) == 0 {
   753  		klog.Warningf("The image %s does not support adding an additional kernel arguments", image)
   754  		return nil
   755  	}
   756  
   757  	out, err := remote.SSH(instance.Name, "sh", "-c", fmt.Sprintf("'%s'", strings.Join(cmd, "&&")))
   758  	if err != nil {
   759  		klog.Errorf("failed to run command %s: out: %s, Err: %v", cmd, out, err)
   760  		return err
   761  	}
   762  
   763  	if err := g.rebootInstance(instance); err != nil {
   764  		return err
   765  	}
   766  
   767  	return nil
   768  }
   769  
   770  func (g *GCERunner) machineType(machine string) string {
   771  	var ret string
   772  	if machine == "" && *instanceType != "" {
   773  		ret = *instanceType
   774  	} else if machine != "" {
   775  		ret = machine
   776  	} else {
   777  		ret = defaultGCEMachine
   778  	}
   779  	return fmt.Sprintf("zones/%s/machineTypes/%s", *zone, ret)
   780  }
   781  func (g *GCERunner) rebootInstance(instance *compute.Instance) error {
   782  	// wait until the instance will not response to SSH
   783  	klog.Info("Reboot the node and wait for instance not to be available via SSH")
   784  	if waitErr := wait.PollImmediate(5*time.Second, 5*time.Minute, func() (bool, error) {
   785  		if _, err := remote.SSH(instance.Name, "reboot"); err != nil {
   786  			return true, nil
   787  		}
   788  
   789  		return false, nil
   790  	}); waitErr != nil {
   791  		return fmt.Errorf("the instance %s still response to SSH: %v", instance.Name, waitErr)
   792  	}
   793  
   794  	// wait until the instance will response again to SSH
   795  	klog.Info("Wait for instance to be available via SSH")
   796  	if waitErr := wait.PollImmediate(30*time.Second, 5*time.Minute, func() (bool, error) {
   797  		if _, err := remote.SSH(instance.Name, "sh", "-c", "date"); err != nil {
   798  			return false, nil
   799  		}
   800  		return true, nil
   801  	}); waitErr != nil {
   802  		return fmt.Errorf("the instance %s does not response to SSH: %v", instance.Name, waitErr)
   803  	}
   804  
   805  	return nil
   806  }