k8s.io/kubernetes@v1.29.3/test/e2e_node/remote/gce/gce_runner.go (about)

    17  package gce
    19  import (
    20  	"context"
    21  	"encoding/base64"
    22  	"errors"
    23  	"flag"
    24  	"fmt"
    25  	"net/http"
    26  	"os"
    27  	"path/filepath"
    28  	"regexp"
    29  	"sort"
    30  	"strings"
    31  	"time"
    33  	"k8s.io/kubernetes/test/e2e_node/remote"
    35  	"github.com/google/uuid"
    36  	"golang.org/x/oauth2/google"
    37  	"google.golang.org/api/compute/v1"
    38  	"google.golang.org/api/option"
    39  	"k8s.io/apimachinery/pkg/util/wait"
    40  	"k8s.io/klog/v2"
    41  	"sigs.k8s.io/yaml"
    42  )
// Compile-time assertion that *GCERunner satisfies the remote.Runner interface.
var _ remote.Runner = (*GCERunner)(nil)

// init registers NewGCERunner with the remote package under the name "gce".
func init() {
	remote.RegisterRunner("gce", NewGCERunner)
}
    50  // envs is the type used to collect all node envs. The key is the env name,
    51  // and the value is the env value
    52  type envs map[string]string
    54  // String function of flag.Value
    55  func (e *envs) String() string {
    56  	return fmt.Sprint(*e)
    57  }
    59  // Set function of flag.Value
    60  func (e *envs) Set(value string) error {
    61  	if value == "" {
    62  		return nil
    63  	}
    64  	kv := strings.SplitN(value, "=", 2)
    65  	if len(kv) != 2 {
    66  		return fmt.Errorf("invalid env string %s", value)
    67  	}
    68  	emap := *e
    69  	emap[kv[0]] = kv[1]
    70  	return nil
    71  }
    73  // nodeEnvs is the node envs from the flag `node-env`.
    74  var nodeEnvs = make(envs)
    76  var project = flag.String("project", "", "gce project the hosts live in (gce)")
    77  var zone = flag.String("zone", "", "gce zone that the hosts live in (gce)")
    78  var instanceMetadata = flag.String("instance-metadata", "", "key/value metadata for instances separated by '=' or '<', 'k=v' means the key is 'k' and the value is 'v'; 'k<p' means the key is 'k' and the value is extracted from the local path 'p', e.g. k1=v1,k2<p2  (gce)")
    79  var imageProject = flag.String("image-project", "", "gce project the hosts live in  (gce)")
    80  var instanceType = flag.String("instance-type", "e2-medium", "GCP Machine type to use for test")
    81  var preemptibleInstances = flag.Bool("preemptible-instances", false, "If true, gce instances will be configured to be preemptible  (gce)")
    83  func init() {
    84  	flag.Var(&nodeEnvs, "node-env", "An environment variable passed to instance as metadata, e.g. when '--node-env=PATH=/usr/bin' is specified, there will be an extra instance metadata 'PATH=/usr/bin'.")
    85  }
const (
	// defaultGCEMachine is the machine type machineType falls back to when
	// neither the image config nor the --instance-type flag names one.
	defaultGCEMachine = "n1-standard-1"
	// acceleratorTypeResourceFormat is the format string for an accelerator
	// type resource URL; its verbs are, in order, project, zone and
	// accelerator type.
	acceleratorTypeResourceFormat = "https://www.googleapis.com/compute/v1/projects/%s/zones/%s/acceleratorTypes/%s"
)
// GCERunner is the remote.Runner registered under the name "gce". It creates
// GCE instances from the configured images and runs the test suite on them.
//
// cfg is the configuration handed to NewGCERunner. gceComputeService and
// gceImages are populated by Validate and are nil before it is called.
type GCERunner struct {
	cfg               remote.Config
	gceComputeService *compute.Service
	gceImages         *internalGCEImageConfig
}
    98  func NewGCERunner(cfg remote.Config) remote.Runner {
    99  	if cfg.InstanceNamePrefix == "" {
   100  		cfg.InstanceNamePrefix = "tmp-node-e2e-" + uuid.New().String()[:8]
   101  	}
   102  	return &GCERunner{cfg: cfg}
   103  }
   105  func (g *GCERunner) Validate() error {
   106  	if len(g.cfg.Hosts) == 0 && g.cfg.ImageConfigFile == "" && len(g.cfg.Images) == 0 {
   107  		klog.Fatalf("Must specify one of --image-config-file, --hosts, --images.")
   108  	}
   109  	var err error
   110  	g.gceComputeService, err = getComputeClient()
   111  	if err != nil {
   112  		return fmt.Errorf("Unable to create gcloud compute service using defaults.  Make sure you are authenticated. %w", err)
   113  	}
   115  	if g.gceImages, err = g.prepareGceImages(); err != nil {
   116  		klog.Fatalf("While preparing GCE images: %v", err)
   117  	}
   118  	return nil
   119  }
   121  func (g *GCERunner) StartTests(suite remote.TestSuite, archivePath string, results chan *remote.TestResult) (numTests int) {
   122  	for shortName := range g.gceImages.images {
   123  		imageConfig := g.gceImages.images[shortName]
   124  		numTests++
   125  		fmt.Printf("Initializing e2e tests using image %s/%s/%s.\n", shortName, imageConfig.project, imageConfig.image)
   126  		go func(image *internalGCEImage, junitFileName string) {
   127  			results <- g.testGCEImage(suite, archivePath, image, junitFileName)
   128  		}(&imageConfig, shortName)
   129  	}
   130  	return
   131  }
   133  func getComputeClient() (*compute.Service, error) {
   134  	const retries = 10
   135  	const backoff = time.Second * 6
   137  	// Setup the gce client for provisioning instances
   138  	// Getting credentials on gce jenkins is flaky, so try a couple times
   139  	var err error
   140  	var cs *compute.Service
   141  	for i := 0; i < retries; i++ {
   142  		if i > 0 {
   143  			time.Sleep(backoff)
   144  		}
   146  		var client *http.Client
   147  		client, err = google.DefaultClient(context.Background(), compute.ComputeScope)
   148  		if err != nil {
   149  			continue
   150  		}
   152  		cs, err = compute.NewService(context.Background(), option.WithHTTPClient(client))
   153  		if err != nil {
   154  			continue
   155  		}
   156  		return cs, nil
   157  	}
   158  	return nil, err
   159  }
// Accelerator describes one kind of accelerator to attach to an instance:
// Type is the accelerator type name that createGCEInstance substitutes into
// the accelerator type resource URL, and Count is how many of them to attach.
type Accelerator struct {
	Type  string `json:"type,omitempty"`
	Count int64  `json:"count,omitempty"`
}

// Resources lists the accelerators requested for an image's instance.
type Resources struct {
	Accelerators []Accelerator `json:"accelerators,omitempty"`
}
// internalGCEImage is an internal GCE image representation for E2E node.
//
// image is the GCE image name and project the GCE project that holds it; both
// are combined by sourceImage into the boot disk's source image. metadata is
// attached to the created instance as-is. machine, when non-empty, selects
// the machine type and switches instance naming to the benchmark format.
// resources lists the accelerators to attach, and kernelArguments are applied
// to the instance after it is up.
type internalGCEImage struct {
	image string
	// imageDesc is the description of the image. If empty, the value in the
	// 'image' will be used.
	imageDesc       string
	kernelArguments []string
	project         string
	resources       Resources
	metadata        *compute.Metadata
	machine         string
}

// internalGCEImageConfig holds every image selected for a test run, keyed by
// the image's short name (or by the image name for images given via flags).
type internalGCEImageConfig struct {
	images map[string]internalGCEImage
}
// GCEImageConfig specifies what images should be run and how for these tests.
// It can be created via the `--images` and `--image-project` flags, or by
// specifying the `--image-config-file` flag, pointing to a json or yaml file
// of the form:
//
//	images:
//	  short-name:
//	    image: gce-image-name
//	    project: gce-image-project
//	    machine: for benchmark only, the machine type (GCE instance) to run test
//	    tests: for benchmark only, a list of ginkgo focus strings to match tests
//
// TODO(coufon): replace 'image' with 'node' in configurations
// and we plan to support testing custom machines other than GCE by specifying Host
type GCEImageConfig struct {
	Images map[string]GCEImage `json:"images"`
}

// GCEImage contains some information about GCE Image.
//
// Image names the image directly. When Image is empty and ImageRegex or
// ImageFamily is set, prepareGceImages resolves the image by listing the
// images in Project and picking the most recently created match. Metadata is
// a comma-separated list of 'k=v' / 'k<path' entries in the same format as
// the --instance-metadata flag. ImageDesc falls back to the resolved image
// name when empty.
type GCEImage struct {
	Image      string `json:"image,omitempty"`
	ImageRegex string `json:"image_regex,omitempty"`
	// ImageFamily is the image family to use. The latest image from the image family will be used, e.g cos-81-lts.
	ImageFamily     string    `json:"image_family,omitempty"`
	ImageDesc       string    `json:"image_description,omitempty"`
	KernelArguments []string  `json:"kernel_arguments,omitempty"`
	Project         string    `json:"project"`
	Metadata        string    `json:"metadata"`
	Machine         string    `json:"machine,omitempty"`
	Resources       Resources `json:"resources,omitempty"`
}
   221  // Returns an image name based on regex and given GCE project.
   222  func (g *GCERunner) getGCEImage(imageRegex, imageFamily string, project string) (string, error) {
   223  	imageObjs := []imageObj{}
   224  	imageRe := regexp.MustCompile(imageRegex)
   225  	if err := g.gceComputeService.Images.List(project).Pages(context.Background(),
   226  		func(ilc *compute.ImageList) error {
   227  			for _, instance := range ilc.Items {
   228  				if imageRegex != "" && !imageRe.MatchString(instance.Name) {
   229  					continue
   230  				}
   231  				if imageFamily != "" && instance.Family != imageFamily {
   232  					continue
   233  				}
   234  				creationTime, err := time.Parse(time.RFC3339, instance.CreationTimestamp)
   235  				if err != nil {
   236  					return fmt.Errorf("failed to parse instance creation timestamp %q: %w", instance.CreationTimestamp, err)
   237  				}
   238  				io := imageObj{
   239  					creationTime: creationTime,
   240  					name:         instance.Name,
   241  				}
   242  				imageObjs = append(imageObjs, io)
   243  			}
   244  			return nil
   245  		},
   246  	); err != nil {
   247  		return "", fmt.Errorf("failed to list images in project %q: %w", project, err)
   248  	}
   250  	// Pick the latest image after sorting.
   251  	sort.Sort(byCreationTime(imageObjs))
   252  	if len(imageObjs) > 0 {
   253  		klog.V(4).Infof("found images %+v based on regex %q and family %q in project %q", imageObjs, imageRegex, imageFamily, project)
   254  		return imageObjs[0].name, nil
   255  	}
   256  	return "", fmt.Errorf("found zero images based on regex %q and family %q in project %q", imageRegex, imageFamily, project)
   257  }
   259  func (g *GCERunner) prepareGceImages() (*internalGCEImageConfig, error) {
   260  	gceImages := &internalGCEImageConfig{
   261  		images: make(map[string]internalGCEImage),
   262  	}
   264  	// Parse images from given config file and convert them to internalGCEImage.
   265  	if g.cfg.ImageConfigFile != "" {
   266  		configPath := g.cfg.ImageConfigFile
   267  		if g.cfg.ImageConfigDir != "" {
   268  			configPath = filepath.Join(g.cfg.ImageConfigDir, g.cfg.ImageConfigFile)
   269  		}
   271  		imageConfigData, err := os.ReadFile(configPath)
   272  		if err != nil {
   273  			return nil, fmt.Errorf("Could not read image config file provided: %w", err)
   274  		}
   275  		// Unmarshal the given image config file. All images for this test run will be organized into a map.
   276  		// shortName->GCEImage, e.g cos-stable->cos-stable-81-12871-103-0.
   277  		externalImageConfig := GCEImageConfig{Images: make(map[string]GCEImage)}
   278  		err = yaml.Unmarshal(imageConfigData, &externalImageConfig)
   279  		if err != nil {
   280  			return nil, fmt.Errorf("Could not parse image config file: %w", err)
   281  		}
   283  		for shortName, imageConfig := range externalImageConfig.Images {
   284  			var image string
   285  			if (imageConfig.ImageRegex != "" || imageConfig.ImageFamily != "") && imageConfig.Image == "" {
   286  				image, err = g.getGCEImage(imageConfig.ImageRegex, imageConfig.ImageFamily, imageConfig.Project)
   287  				if err != nil {
   288  					return nil, fmt.Errorf("Could not retrieve a image based on image regex %q and family %q: %v",
   289  						imageConfig.ImageRegex, imageConfig.ImageFamily, err)
   290  				}
   291  			} else {
   292  				image = imageConfig.Image
   293  			}
   294  			// Convert the given image into an internalGCEImage.
   295  			metadata := imageConfig.Metadata
   296  			if len(strings.TrimSpace(*instanceMetadata)) > 0 {
   297  				metadata += "," + *instanceMetadata
   298  			}
   299  			gceImage := internalGCEImage{
   300  				image:           image,
   301  				imageDesc:       imageConfig.ImageDesc,
   302  				project:         imageConfig.Project,
   303  				metadata:        g.getImageMetadata(metadata),
   304  				kernelArguments: imageConfig.KernelArguments,
   305  				machine:         imageConfig.Machine,
   306  				resources:       imageConfig.Resources,
   307  			}
   308  			if gceImage.imageDesc == "" {
   309  				gceImage.imageDesc = gceImage.image
   310  			}
   311  			gceImages.images[shortName] = gceImage
   312  		}
   313  	}
   315  	// Allow users to specify additional images via cli flags for local testing
   316  	// convenience; merge in with config file
   317  	if len(g.cfg.Images) > 0 {
   318  		if *imageProject == "" {
   319  			klog.Fatal("Must specify --image-project if you specify --images")
   320  		}
   321  		for _, image := range g.cfg.Images {
   322  			gceImage := internalGCEImage{
   323  				image:    image,
   324  				project:  *imageProject,
   325  				metadata: g.getImageMetadata(*instanceMetadata),
   326  			}
   327  			gceImages.images[image] = gceImage
   328  		}
   329  	}
   331  	if len(gceImages.images) != 0 && *zone == "" {
   332  		return nil, errors.New("must specify --zone flag")
   333  	}
   334  	// Make sure GCP project is set. Without a project, images can't be retrieved..
   335  	for shortName, imageConfig := range gceImages.images {
   336  		if imageConfig.project == "" {
   337  			return nil, fmt.Errorf("invalid config for %v; must specify a project", shortName)
   338  		}
   339  	}
   340  	if len(gceImages.images) != 0 {
   341  		if *project == "" {
   342  			return nil, errors.New("must specify --project flag to launch images into")
   343  		}
   344  	}
   346  	return gceImages, nil
   347  }
   349  type imageObj struct {
   350  	creationTime time.Time
   351  	name         string
   352  }
   354  type byCreationTime []imageObj
   356  func (a byCreationTime) Len() int           { return len(a) }
   357  func (a byCreationTime) Less(i, j int) bool { return a[i].creationTime.After(a[j].creationTime) }
   358  func (a byCreationTime) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
   360  func (g *GCERunner) getImageMetadata(input string) *compute.Metadata {
   361  	if input == "" {
   362  		return nil
   363  	}
   364  	klog.V(3).Infof("parsing instance metadata: %q", input)
   365  	raw := g.parseInstanceMetadata(input)
   366  	klog.V(4).Infof("parsed instance metadata: %v", raw)
   367  	metadataItems := []*compute.MetadataItems{}
   368  	for k, v := range raw {
   369  		val := v
   370  		metadataItems = append(metadataItems, &compute.MetadataItems{
   371  			Key:   k,
   372  			Value: &val,
   373  		})
   374  	}
   375  	ret := compute.Metadata{Items: metadataItems}
   376  	return &ret
   377  }
   379  func (g *GCERunner) deleteGCEInstance(host string) {
   380  	klog.Infof("Deleting instance %q", host)
   381  	_, err := g.gceComputeService.Instances.Delete(*project, *zone, host).Do()
   382  	if err != nil {
   383  		klog.Errorf("Error deleting instance %q: %v", host, err)
   384  	}
   385  }
   387  func (g *GCERunner) parseInstanceMetadata(str string) map[string]string {
   388  	metadata := make(map[string]string)
   389  	ss := strings.Split(str, ",")
   390  	for _, s := range ss {
   391  		kv := strings.Split(s, "=")
   392  		if len(kv) == 2 {
   393  			metadata[kv[0]] = kv[1]
   394  			continue
   395  		}
   396  		kp := strings.Split(s, "<")
   397  		if len(kp) != 2 {
   398  			klog.Fatalf("Invalid instance metadata: %q", s)
   399  			continue
   400  		}
   401  		metaPath := kp[1]
   402  		if g.cfg.ImageConfigDir != "" {
   403  			metaPath = filepath.Join(g.cfg.ImageConfigDir, metaPath)
   404  		}
   405  		v, err := os.ReadFile(metaPath)
   406  		if err != nil {
   407  			klog.Fatalf("Failed to read metadata file %q: %v", metaPath, err)
   408  			continue
   409  		}
   410  		metadata[kp[0]] = ignitionInjectGCEPublicKey(metaPath, string(v))
   411  	}
   412  	for k, v := range nodeEnvs {
   413  		metadata[k] = v
   414  	}
   415  	return metadata
   416  }
   418  // ignitionInjectGCEPublicKey tries to inject the GCE SSH public key into the
   419  // provided ignition file path.
   420  //
   421  // This will only being done if the job has the
   422  // IGNITION_INJECT_GCE_SSH_PUBLIC_KEY_FILE environment variable set, while it
   423  // tried to replace the GCE_SSH_PUBLIC_KEY_FILE_CONTENT placeholder.
   424  func ignitionInjectGCEPublicKey(path string, content string) string {
   425  	if os.Getenv("IGNITION_INJECT_GCE_SSH_PUBLIC_KEY_FILE") == "" {
   426  		return content
   427  	}
   429  	klog.Infof("Injecting SSH public key into ignition")
   431  	const publicKeyEnv = "GCE_SSH_PUBLIC_KEY_FILE"
   432  	sshPublicKeyFile := os.Getenv(publicKeyEnv)
   433  	if sshPublicKeyFile == "" {
   434  		klog.Errorf("Environment variable %s is not set", publicKeyEnv)
   435  		os.Exit(1)
   436  	}
   438  	sshPublicKey, err := os.ReadFile(sshPublicKeyFile)
   439  	if err != nil {
   440  		klog.ErrorS(err, "unable to read SSH public key file")
   441  		os.Exit(1)
   442  	}
   444  	const sshPublicKeyFileContentMarker = "GCE_SSH_PUBLIC_KEY_FILE_CONTENT"
   445  	key := base64.StdEncoding.EncodeToString(sshPublicKey)
   446  	base64Marker := base64.StdEncoding.EncodeToString([]byte(sshPublicKeyFileContentMarker))
   447  	replacer := strings.NewReplacer(
   448  		sshPublicKeyFileContentMarker, key,
   449  		base64Marker, key,
   450  	)
   451  	return replacer.Replace(content)
   452  }
   454  // Provision a gce instance using image and run the tests in archive against the instance.
   455  // Delete the instance afterward.
   456  func (g *GCERunner) testGCEImage(suite remote.TestSuite, archivePath string, imageConfig *internalGCEImage, junitFileName string) *remote.TestResult {
   457  	ginkgoFlagsStr := g.cfg.GinkgoFlags
   459  	host, err := g.createGCEInstance(imageConfig)
   460  	if g.cfg.DeleteInstances {
   461  		defer g.deleteGCEInstance(host)
   462  	}
   463  	if err != nil {
   464  		return &remote.TestResult{
   465  			Err: fmt.Errorf("unable to create gce instance with running docker daemon for image %s.  %v", imageConfig.image, err),
   466  		}
   467  	}
   469  	// Only delete the files if we are keeping the instance and want it cleaned up.
   470  	// If we are going to delete the instance, don't bother with cleaning up the files
   471  	deleteFiles := !g.cfg.DeleteInstances && g.cfg.Cleanup
   473  	if err = g.registerGceHostIP(host); err != nil {
   474  		return &remote.TestResult{
   475  			Err:    err,
   476  			Host:   host,
   477  			ExitOK: false,
   478  		}
   479  	}
   481  	output, exitOk, err := remote.RunRemote(remote.RunRemoteConfig{
   482  		Suite:          suite,
   483  		Archive:        archivePath,
   484  		Host:           host,
   485  		Cleanup:        deleteFiles,
   486  		ImageDesc:      imageConfig.imageDesc,
   487  		JunitFileName:  junitFileName,
   488  		TestArgs:       g.cfg.TestArgs,
   489  		GinkgoArgs:     ginkgoFlagsStr,
   490  		SystemSpecName: g.cfg.SystemSpecName,
   491  		ExtraEnvs:      g.cfg.ExtraEnvs,
   492  		RuntimeConfig:  g.cfg.RuntimeConfig,
   493  	})
   494  	result := remote.TestResult{
   495  		Output: output,
   496  		Err:    err,
   497  		Host:   host,
   498  		ExitOK: exitOk,
   499  	}
   501  	// This is a temporary solution to collect serial node serial log. Only port 1 contains useful information.
   502  	// TODO(random-liu): Extract out and unify log collection logic with cluste e2e.
   503  	serialPortOutput, err := g.gceComputeService.Instances.GetSerialPortOutput(*project, *zone, host).Port(1).Do()
   504  	if err != nil {
   505  		klog.Errorf("Failed to collect serial Output from node %q: %v", host, err)
   506  	} else {
   507  		logFilename := "serial-1.log"
   508  		err := remote.WriteLog(host, logFilename, serialPortOutput.Contents)
   509  		if err != nil {
   510  			klog.Errorf("Failed to write serial Output from node %q to %q: %v", host, logFilename, err)
   511  		}
   512  	}
   513  	return &result
   514  }
   516  // Provision a gce instance using image
   517  func (g *GCERunner) createGCEInstance(imageConfig *internalGCEImage) (string, error) {
   518  	p, err := g.gceComputeService.Projects.Get(*project).Do()
   519  	if err != nil {
   520  		return "", fmt.Errorf("failed to get project info %q: %w", *project, err)
   521  	}
   522  	// Use default service account
   523  	serviceAccount := p.DefaultServiceAccount
   524  	klog.V(1).Infof("Creating instance %+v  with service account %q", *imageConfig, serviceAccount)
   525  	name := g.imageToInstanceName(imageConfig)
   526  	i := &compute.Instance{
   527  		Name:        name,
   528  		MachineType: g.machineType(imageConfig.machine),
   529  		NetworkInterfaces: []*compute.NetworkInterface{
   530  			{
   531  				AccessConfigs: []*compute.AccessConfig{
   532  					{
   533  						Type: "ONE_TO_ONE_NAT",
   534  						Name: "External NAT",
   535  					},
   536  				}},
   537  		},
   538  		Disks: []*compute.AttachedDisk{
   539  			{
   540  				AutoDelete: true,
   541  				Boot:       true,
   542  				Type:       "PERSISTENT",
   543  				InitializeParams: &compute.AttachedDiskInitializeParams{
   544  					SourceImage: g.sourceImage(imageConfig.image, imageConfig.project),
   545  					DiskSizeGb:  20,
   546  				},
   547  			},
   548  		},
   549  		ServiceAccounts: []*compute.ServiceAccount{
   550  			{
   551  				Email: serviceAccount,
   552  				Scopes: []string{
   553  					"https://www.googleapis.com/auth/cloud-platform",
   554  				},
   555  			},
   556  		},
   557  	}
   559  	scheduling := compute.Scheduling{
   560  		Preemptible: *preemptibleInstances,
   561  	}
   562  	for _, accelerator := range imageConfig.resources.Accelerators {
   563  		if i.GuestAccelerators == nil {
   564  			autoRestart := true
   565  			i.GuestAccelerators = []*compute.AcceleratorConfig{}
   566  			scheduling.OnHostMaintenance = "TERMINATE"
   567  			scheduling.AutomaticRestart = &autoRestart
   568  		}
   569  		aType := fmt.Sprintf(acceleratorTypeResourceFormat, *project, *zone, accelerator.Type)
   570  		ac := &compute.AcceleratorConfig{
   571  			AcceleratorCount: accelerator.Count,
   572  			AcceleratorType:  aType,
   573  		}
   574  		i.GuestAccelerators = append(i.GuestAccelerators, ac)
   575  	}
   576  	i.Scheduling = &scheduling
   577  	i.Metadata = imageConfig.metadata
   578  	var insertionOperationName string
   579  	if _, err := g.gceComputeService.Instances.Get(*project, *zone, i.Name).Do(); err != nil {
   580  		op, err := g.gceComputeService.Instances.Insert(*project, *zone, i).Do()
   582  		if err != nil {
   583  			ret := fmt.Sprintf("could not create instance %s: API error: %v", name, err)
   584  			if op != nil {
   585  				ret = fmt.Sprintf("%s: %v", ret, op.Error)
   586  			}
   587  			return "", fmt.Errorf(ret)
   588  		} else if op.Error != nil {
   589  			var errs []string
   590  			for _, insertErr := range op.Error.Errors {
   591  				errs = append(errs, fmt.Sprintf("%+v", insertErr))
   592  			}
   593  			return "", fmt.Errorf("could not create instance %s: %+v", name, errs)
   595  		}
   596  		insertionOperationName = op.Name
   597  	}
   598  	instanceRunning := false
   599  	var instance *compute.Instance
   600  	for i := 0; i < 30 && !instanceRunning; i++ {
   601  		if i > 0 {
   602  			time.Sleep(time.Second * 20)
   603  		}
   604  		var insertionOperation *compute.Operation
   605  		insertionOperation, err = g.gceComputeService.ZoneOperations.Get(*project, *zone, insertionOperationName).Do()
   606  		if err != nil {
   607  			continue
   608  		}
   609  		if strings.ToUpper(insertionOperation.Status) != "DONE" {
   610  			err = fmt.Errorf("instance insert operation %s not in state DONE, was %s", name, insertionOperation.Status)
   611  			continue
   612  		}
   613  		if insertionOperation.Error != nil {
   614  			var errs []string
   615  			for _, insertErr := range insertionOperation.Error.Errors {
   616  				errs = append(errs, fmt.Sprintf("%+v", insertErr))
   617  			}
   618  			return name, fmt.Errorf("could not create instance %s: %+v", name, errs)
   619  		}
   621  		instance, err = g.gceComputeService.Instances.Get(*project, *zone, name).Do()
   622  		if err != nil {
   623  			continue
   624  		}
   625  		if strings.ToUpper(instance.Status) != "RUNNING" {
   626  			err = fmt.Errorf("instance %s not in state RUNNING, was %s", name, instance.Status)
   627  			continue
   628  		}
   629  		externalIP := g.getExternalIP(instance)
   630  		if len(externalIP) > 0 {
   631  			remote.AddHostnameIP(name, externalIP)
   632  		}
   634  		var output string
   635  		output, err = remote.SSH(name, "sh", "-c",
   636  			"'systemctl list-units  --type=service  --state=running | grep -e containerd -e crio'")
   637  		if err != nil {
   638  			err = fmt.Errorf("instance %s not running containerd/crio daemon - Command failed: %s", name, output)
   639  			continue
   640  		}
   641  		if !strings.Contains(output, "containerd.service") &&
   642  			!strings.Contains(output, "crio.service") {
   643  			err = fmt.Errorf("instance %s not running containerd/crio daemon: %s", name, output)
   644  			continue
   645  		}
   646  		instanceRunning = true
   647  	}
   648  	// If instance didn't reach running state in time, return with error now.
   649  	if err != nil {
   650  		return name, err
   651  	}
   652  	// Instance reached running state in time, make sure that cloud-init is complete
   653  	if g.isCloudInitUsed(imageConfig.metadata) {
   654  		cloudInitFinished := false
   655  		for i := 0; i < 60 && !cloudInitFinished; i++ {
   656  			if i > 0 {
   657  				time.Sleep(time.Second * 20)
   658  			}
   659  			var finished string
   660  			finished, err = remote.SSH(name, "ls", "/var/lib/cloud/instance/boot-finished")
   661  			if err != nil {
   662  				err = fmt.Errorf("instance %s has not finished cloud-init script: %s", name, finished)
   663  				continue
   664  			}
   665  			cloudInitFinished = true
   666  		}
   667  	}
   669  	// apply additional kernel arguments to the instance
   670  	if len(imageConfig.kernelArguments) > 0 {
   671  		klog.Info("Update kernel arguments")
   672  		if err := g.updateKernelArguments(instance, imageConfig.image, imageConfig.kernelArguments); err != nil {
   673  			return name, err
   674  		}
   675  	}
   677  	return name, err
   678  }
   680  func (g *GCERunner) isCloudInitUsed(metadata *compute.Metadata) bool {
   681  	if metadata == nil {
   682  		return false
   683  	}
   684  	for _, item := range metadata.Items {
   685  		if item.Key == "user-data" && item.Value != nil && strings.HasPrefix(*item.Value, "#cloud-config") {
   686  			return true
   687  		}
   688  	}
   689  	return false
   690  }
   692  func (g *GCERunner) sourceImage(image, imageProject string) string {
   693  	return fmt.Sprintf("projects/%s/global/images/%s", imageProject, image)
   694  }
   696  func (g *GCERunner) imageToInstanceName(imageConfig *internalGCEImage) string {
   697  	if imageConfig.machine == "" {
   698  		return g.cfg.InstanceNamePrefix + "-" + imageConfig.image
   699  	}
   700  	// For benchmark test, node name has the format 'machine-image-uuid' to run
   701  	// different machine types with the same image in parallel
   702  	return imageConfig.machine + "-" + imageConfig.image + "-" + uuid.New().String()[:8]
   703  }
   705  func (g *GCERunner) registerGceHostIP(host string) error {
   706  	instance, err := g.gceComputeService.Instances.Get(*project, *zone, host).Do()
   707  	if err != nil {
   708  		return err
   709  	}
   710  	if strings.ToUpper(instance.Status) != "RUNNING" {
   711  		return fmt.Errorf("instance %s not in state RUNNING, was %s", host, instance.Status)
   712  	}
   713  	externalIP := g.getExternalIP(instance)
   714  	if len(externalIP) > 0 {
   715  		remote.AddHostnameIP(host, externalIP)
   716  	}
   717  	return nil
   718  }
   719  func (g *GCERunner) getExternalIP(instance *compute.Instance) string {
   720  	for i := range instance.NetworkInterfaces {
   721  		ni := instance.NetworkInterfaces[i]
   722  		for j := range ni.AccessConfigs {
   723  			ac := ni.AccessConfigs[j]
   724  			if len(ac.NatIP) > 0 {
   725  				return ac.NatIP
   726  			}
   727  		}
   728  	}
   729  	return ""
   730  }
   731  func (g *GCERunner) updateKernelArguments(instance *compute.Instance, image string, kernelArgs []string) error {
   732  	kernelArgsString := strings.Join(kernelArgs, " ")
   734  	var cmd []string
   735  	if strings.Contains(image, "cos") {
   736  		cmd = []string{
   737  			"dir=$(mktemp -d)",
   738  			"mount /dev/sda12 ${dir}",
   739  			fmt.Sprintf("sed -i -e \"s|cros_efi|cros_efi %s|g\" ${dir}/efi/boot/grub.cfg", kernelArgsString),
   740  			"umount ${dir}",
   741  			"rmdir ${dir}",
   742  		}
   743  	}
   745  	if strings.Contains(image, "ubuntu") {
   746  		cmd = []string{
   747  			fmt.Sprintf("echo \"GRUB_CMDLINE_LINUX_DEFAULT=%s ${GRUB_CMDLINE_LINUX_DEFAULT}\" > /etc/default/grub.d/99-additional-arguments.cfg", kernelArgsString),
   748  			"/usr/sbin/update-grub",
   749  		}
   750  	}
   752  	if len(cmd) == 0 {
   753  		klog.Warningf("The image %s does not support adding an additional kernel arguments", image)
   754  		return nil
   755  	}
   757  	out, err := remote.SSH(instance.Name, "sh", "-c", fmt.Sprintf("'%s'", strings.Join(cmd, "&&")))
   758  	if err != nil {
   759  		klog.Errorf("failed to run command %s: out: %s, Err: %v", cmd, out, err)
   760  		return err
   761  	}
   763  	if err := g.rebootInstance(instance); err != nil {
   764  		return err
   765  	}
   767  	return nil
   768  }
   770  func (g *GCERunner) machineType(machine string) string {
   771  	var ret string
   772  	if machine == "" && *instanceType != "" {
   773  		ret = *instanceType
   774  	} else if machine != "" {
   775  		ret = machine
   776  	} else {
   777  		ret = defaultGCEMachine
   778  	}
   779  	return fmt.Sprintf("zones/%s/machineTypes/%s", *zone, ret)
   780  }
   781  func (g *GCERunner) rebootInstance(instance *compute.Instance) error {
   782  	// wait until the instance will not response to SSH
   783  	klog.Info("Reboot the node and wait for instance not to be available via SSH")
   784  	if waitErr := wait.PollImmediate(5*time.Second, 5*time.Minute, func() (bool, error) {
   785  		if _, err := remote.SSH(instance.Name, "reboot"); err != nil {
   786  			return true, nil
   787  		}
   789  		return false, nil
   790  	}); waitErr != nil {
   791  		return fmt.Errorf("the instance %s still response to SSH: %v", instance.Name, waitErr)
   792  	}
   794  	// wait until the instance will response again to SSH
   795  	klog.Info("Wait for instance to be available via SSH")
   796  	if waitErr := wait.PollImmediate(30*time.Second, 5*time.Minute, func() (bool, error) {
   797  		if _, err := remote.SSH(instance.Name, "sh", "-c", "date"); err != nil {
   798  			return false, nil
   799  		}
   800  		return true, nil
   801  	}); waitErr != nil {
   802  		return fmt.Errorf("the instance %s does not response to SSH: %v", instance.Name, waitErr)
   803  	}
   805  	return nil
   806  }