github.com/maxgio92/test-infra@v0.1.0/kubetest/main.go (about)

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package main
    18  
    19  import (
    20  	"context"
    21  	"encoding/json"
    22  	"errors"
    23  	"flag"
    24  	"fmt"
    25  	"log"
    26  	"math/rand"
    27  	"os"
    28  	"os/exec"
    29  	"os/signal"
    30  	"path/filepath"
    31  	"regexp"
    32  	"strconv"
    33  	"strings"
    34  	"time"
    35  
    36  	"github.com/spf13/pflag"
    37  	"github.com/maxgio92/test-infra/kubetest/boskos/client"
    38  
    39  	"github.com/maxgio92/test-infra/kubetest/conformance"
    40  	"github.com/maxgio92/test-infra/kubetest/kind"
    41  	"github.com/maxgio92/test-infra/kubetest/process"
    42  	"github.com/maxgio92/test-infra/kubetest/util"
    43  )
    44  
// defaultGinkgoParallel is the runner count quoted in the --ginkgo-parallel
// flag help text as the default.
// Hardcoded in ginkgo-e2e.sh
const defaultGinkgoParallel = 25
    47  
// Package-level state shared by the kubetest steps in this file.
var (
	// artifacts is the "_artifacts" directory under $WORKSPACE.
	artifacts = filepath.Join(os.Getenv("WORKSPACE"), "_artifacts")
	// boskos is the Boskos client created with $JOB_NAME; the error returned
	// by the constructor is discarded here.
	boskos, _ = client.NewClient(os.Getenv("JOB_NAME"), "http://boskos.test-pods.svc.cluster.local.", "", "")
	// control is built from the default values below; main rebuilds it once
	// the flags have been parsed.
	control   = process.NewControl(timeout, interrupt, terminate, verbose)
	gitTag    = ""                              // initializing default zero value. ldflags will populate this during build time.
	interrupt = time.NewTimer(time.Duration(0)) // interrupt testing at this time.
	terminate = time.NewTimer(time.Duration(0)) // terminate testing at this time.
	// timeout is set by the --timeout flag; zero means no limit is applied.
	timeout   = time.Duration(0)
	// verbose is set by the --verbose-commands flag.
	verbose   = false
)
    58  
// options holds the values of kubetest's command-line flags. defineFlags
// binds the fields to their flags; a few fields (for example up and
// flushMemAfterBuild) are also overridden later by main and complete.
type options struct {
	build                buildStrategy
	boskosWaitDuration   time.Duration
	charts               bool
	checkLeaks           bool
	checkSkew            bool
	cluster              string
	clusterIPRange       string
	deployment           string
	down                 bool
	dump                 string
	dumpPreTestLogs      string
	extract              extractStrategies
	extractCIBucket      string
	extractReleaseBucket string
	extractSource        bool
	flushMemAfterBuild   bool
	focusRegex           string
	gcpCloudSdk          string
	gcpMasterImage       string
	gcpMasterSize        string
	gcpNetwork           string
	gcpNodeImage         string
	gcpImageFamily       string
	gcpImageProject      string
	gcpNodes             string
	gcpNodeSize          string
	gcpProject           string
	gcpProjectType       string
	gcpServiceAccount    string
	// gcpSSHProxyInstanceName is the name of the vm instance which ip address will be used to set the
	// KUBE_SSH_BASTION env. If set, it will result in proxying ssh connections in tests through the
	// "bastion". It's useful for clusters with nodes without public ssh access, e.g. nodes without
	// public ip addresses. Works only for gcp providers (gce, gke).
	gcpSSHProxyInstanceName string
	gcpRegion               string
	gcpZone                 string
	ginkgoParallel          ginkgoParallelValue
	kubecfg                 string
	kubemark                bool
	kubemarkMasterSize      string
	kubemarkNodes           string // TODO(fejta): switch to int after migration
	logexporterGCSPath      string
	metadataSources         string
	noAllowDup              bool
	nodeArgs                string
	nodeTestArgs            string
	nodeTests               bool
	preTestCmd              string
	postTestCmd             string
	provider                string
	publish                 string
	runtimeConfig           string
	save                    string
	skew                    bool
	skipRegex               string
	soak                    bool
	soakDuration            time.Duration
	stage                   stageStrategy
	storageTestDriverPath   string
	test                    bool
	testArgs                string
	testCmd                 string
	testCmdName             string
	testCmdArgs             []string
	up                      bool
	upgradeArgs             string
	version                 bool
}
   128  
// defineFlags registers kubetest's command-line flags and returns the options
// struct that they populate. It only registers flags; parsing is done by the
// caller.
//
// Most flags are registered on the standard library flag package's default
// set. --test-cmd-args is registered through pflag instead, and --timeout and
// --verbose-commands write to the package-level timeout and verbose variables
// rather than to a field of options.
func defineFlags() *options {
	o := options{}
	flag.Var(&o.build, "build", "Rebuild k8s binaries, optionally forcing (release|quick|bazel) strategy")
	flag.DurationVar(&o.boskosWaitDuration, "boskos-wait-duration", 5*time.Minute, "Defines how long it waits until quit getting Boskos resoure, default 5 minutes")
	flag.BoolVar(&o.charts, "charts", false, "If true, run charts tests")
	flag.BoolVar(&o.checkSkew, "check-version-skew", true, "Verify client and server versions match")
	flag.BoolVar(&o.checkLeaks, "check-leaked-resources", false, "Ensure project ends with the same resources")
	flag.StringVar(&o.cluster, "cluster", "", "Cluster name. Must be set for --deployment=gke (TODO: other deployments).")
	flag.StringVar(&o.clusterIPRange, "cluster-ip-range", "", "Specifies CLUSTER_IP_RANGE value during --up and --test (only relevant for --deployment=bash). Auto-calculated if empty.")
	// NOTE(review): getDeployer also accepts "aksengine" and "aks", which this help text does not list.
	flag.StringVar(&o.deployment, "deployment", "bash", "Choices: none/bash/conformance/gke/kind/kops/node/local")
	flag.BoolVar(&o.down, "down", false, "If true, tear down the cluster before exiting.")
	flag.StringVar(&o.dump, "dump", "", "If set, dump bring-up and cluster logs to this location on test or cluster-up failure")
	flag.StringVar(&o.dumpPreTestLogs, "dump-pre-test-logs", "", "If set, dump cluster logs to this location before running tests")
	flag.Var(&o.extract, "extract", "Extract k8s binaries from the specified release location")
	flag.StringVar(&o.extractCIBucket, "extract-ci-bucket", "k8s-release-dev", "Extract k8s CI binaries from the specified GCS bucket")
	flag.StringVar(&o.extractReleaseBucket, "extract-release-bucket", "kubernetes-release", "Extract k8s release binaries from the specified GCS bucket")
	flag.BoolVar(&o.extractSource, "extract-source", false, "Extract k8s src together with other tarballs")
	flag.BoolVar(&o.flushMemAfterBuild, "flush-mem-after-build", false, "If true, try to flush container memory after building")
	flag.Var(&o.ginkgoParallel, "ginkgo-parallel", fmt.Sprintf("Run Ginkgo tests in parallel, default %d runners. Use --ginkgo-parallel=N to specify an exact count.", defaultGinkgoParallel))
	flag.StringVar(&o.gcpCloudSdk, "gcp-cloud-sdk", "", "Install/upgrade google-cloud-sdk to the gs:// path if set")
	flag.StringVar(&o.gcpProject, "gcp-project", "", "For use with gcloud commands")
	flag.StringVar(&o.gcpProjectType, "gcp-project-type", "", "Explicitly indicate which project type to select from boskos")
	flag.StringVar(&o.gcpServiceAccount, "gcp-service-account", "", "Service account to activate before using gcloud")
	flag.StringVar(&o.gcpZone, "gcp-zone", "", "For use with gcloud commands")
	flag.StringVar(&o.gcpRegion, "gcp-region", "", "For use with gcloud commands")
	flag.StringVar(&o.gcpNetwork, "gcp-network", "", "Cluster network. Must be set for --deployment=gke (TODO: other deployments).")
	flag.StringVar(&o.gcpMasterImage, "gcp-master-image", "", "Master image type (cos|debian on GCE, n/a on GKE)")
	flag.StringVar(&o.gcpMasterSize, "gcp-master-size", "", "(--provider=gce only) Size of master to create (e.g n1-standard-1). Auto-calculated if left empty.")
	flag.StringVar(&o.gcpNodeImage, "gcp-node-image", "", "Node image type (cos|container_vm on GKE, cos|debian on GCE)")
	// Unlike the other GCP flags, the next two are registered without the "gcp-" prefix.
	flag.StringVar(&o.gcpImageFamily, "image-family", "", "Node image family from which to use the latest image, required when --gcp-node-image=CUSTOM")
	flag.StringVar(&o.gcpImageProject, "image-project", "", "Project containing node image family, required when --gcp-node-image=CUSTOM")
	flag.StringVar(&o.gcpNodes, "gcp-nodes", "", "(--provider=gce only) Number of nodes to create.")
	flag.StringVar(&o.gcpNodeSize, "gcp-node-size", "", "(--provider=gce only) Size of nodes to create (e.g n1-standard-1).")
	flag.StringVar(&o.gcpSSHProxyInstanceName, "gcp-ssh-proxy-instance-name", "", "(--provider=gce|gke only) If set, will result in proxing the ssh connections via the provided instance name while running tests")
	flag.StringVar(&o.kubecfg, "kubeconfig", "", "The location of a kubeconfig file.")
	flag.StringVar(&o.focusRegex, "ginkgo-focus", "", "The ginkgo regex to focus. Currently only respected for (dind).")
	flag.StringVar(&o.skipRegex, "ginkgo-skip", "", "The ginkgo regex to skip. Currently only respected for (dind).")
	flag.BoolVar(&o.kubemark, "kubemark", false, "If true, run kubemark tests.")
	flag.StringVar(&o.kubemarkMasterSize, "kubemark-master-size", "", "Kubemark master size (only relevant if --kubemark=true). Auto-calculated based on '--kubemark-nodes' if left empty.")
	flag.StringVar(&o.kubemarkNodes, "kubemark-nodes", "5", "Number of kubemark nodes to start (only relevant if --kubemark=true).")
	flag.StringVar(&o.logexporterGCSPath, "logexporter-gcs-path", "", "Path to the GCS artifacts directory to dump logs from nodes. Logexporter gets enabled if this is non-empty")
	flag.StringVar(&o.metadataSources, "metadata-sources", "images.json", "Comma-separated list of files inside ./artifacts to merge into metadata.json")
	flag.StringVar(&o.nodeArgs, "node-args", "", "Args for node e2e tests.")
	flag.StringVar(&o.nodeTestArgs, "node-test-args", "", "Test args specifically for node e2e tests.")
	flag.BoolVar(&o.noAllowDup, "no-allow-dup", false, "if set --allow-dup will not be passed to push-build and --stage will error if the build already exists on the gcs path")
	flag.BoolVar(&o.nodeTests, "node-tests", false, "If true, run node-e2e tests.")
	flag.StringVar(&o.preTestCmd, "pre-test-cmd", "", "If set, run the provided command before running any tests.")
	flag.StringVar(&o.postTestCmd, "post-test-cmd", "", "If set, run the provided command after running all the tests.")
	flag.StringVar(&o.provider, "provider", "", "Kubernetes provider such as gce, gke, aws, etc")
	flag.StringVar(&o.publish, "publish", "", "Publish version to the specified gs:// path on success")
	flag.StringVar(&o.runtimeConfig, "runtime-config", "", "If set, API versions can be turned on or off while bringing up the API server.")
	// --registry and --stage-suffix write into fields of the stage strategy itself.
	flag.StringVar(&o.stage.dockerRegistry, "registry", "", "Push images to the specified docker registry (e.g. gcr.io/a-test-project)")
	flag.StringVar(&o.save, "save", "", "Save credentials to gs:// path on --up if set (or load from there if not --up)")
	flag.BoolVar(&o.skew, "skew", false, "If true, run tests in another version at ../kubernetes/kubernetes_skew")
	flag.BoolVar(&o.soak, "soak", false, "If true, job runs in soak mode")
	flag.DurationVar(&o.soakDuration, "soak-duration", 7*24*time.Hour, "Maximum age of a soak cluster before it gets recycled")
	flag.Var(&o.stage, "stage", "Upload binaries to gs://bucket/devel/job-suffix if set")
	flag.StringVar(&o.stage.versionSuffix, "stage-suffix", "", "Append suffix to staged version when set")
	flag.StringVar(&o.storageTestDriverPath, "storage-testdriver-repo-path", "", "Relative path for external e2e test driver config in the csi driver repo")
	flag.BoolVar(&o.test, "test", false, "Run Ginkgo tests.")
	flag.StringVar(&o.testArgs, "test_args", "", "Space-separated list of arguments to pass to Ginkgo test runner.")
	flag.StringVar(&o.testCmd, "test-cmd", "", "command to run against the cluster instead of Ginkgo e2e tests")
	flag.StringVar(&o.testCmdName, "test-cmd-name", "", "name to log the test command as in xml results")
	// --timeout is stored in the package-level timeout variable, not in options.
	flag.DurationVar(&timeout, "timeout", time.Duration(0), "Terminate testing after the timeout duration (s/m/h)")
	flag.BoolVar(&o.up, "up", false, "If true, start the e2e cluster. If cluster is already up, recreate it.")
	flag.StringVar(&o.upgradeArgs, "upgrade_args", "", "If set, run upgrade tests before other tests")
	flag.BoolVar(&o.version, "version", false, "Command to print version")

	// The "-v" flag was also used by glog, which is used by k8s.io/client-go. Duplicate flags cause panics.
	// 1. Even if we could convince glog to change, they have too many consumers to ever do so.
	// 2. The glog lib parses flags during init. It is impossible to dynamically rewrite the args before they're parsed by glog.
	// 3. The glog lib takes an int value, so "-v false" is an error.
	// 4. It's possible, but unlikely, we could convince k8s.io/client-go to use a logging shim, because a library shouldn't force a logging implementation. This would take a major version release for the lib.
	//
	// The most reasonable solution is to accept that we shouldn't have made a single-letter global, and rename all references to this variable.
	flag.BoolVar(&verbose, "verbose-commands", true, "If true, print all command output.")

	// go flag does not support StringArrayVar
	pflag.StringArrayVar(&o.testCmdArgs, "test-cmd-args", []string{}, "args for test-cmd")
	return &o
}
   210  
// suite is the "kubetest" test suite that this file passes to control.XMLWrap
// for each step and to control.WriteXML when a dump location is set.
var suite util.TestSuite = util.TestSuite{Name: "kubetest"}
   212  
   213  func validWorkingDirectory() error {
   214  	cwd, err := os.Getwd()
   215  	if err != nil {
   216  		return fmt.Errorf("could not get pwd: %w", err)
   217  	}
   218  	acwd, err := filepath.Abs(cwd)
   219  	if err != nil {
   220  		return fmt.Errorf("failed to convert %s to an absolute path: %w", cwd, err)
   221  	}
   222  	// This also matches "kubernetes_skew" for upgrades.
   223  	if !strings.Contains(filepath.Base(acwd), "kubernetes") {
   224  		return fmt.Errorf("must run from kubernetes directory root. current: %s", acwd)
   225  	}
   226  	return nil
   227  }
   228  
// deployer is the set of operations kubetest needs from a cluster deployment
// strategy. getDeployer selects the implementation from the --deployment flag.
type deployer interface {
	Up() error
	IsUp() error
	DumpClusterLogs(localPath, gcsPath string) error
	TestSetup() error
	// Down tears the deployment down; complete calls it from its ^C handler
	// when --down is set.
	Down() error
	// GetClusterCreated returns the creation time that complete compares
	// against --soak-duration to decide whether to recreate a soak cluster.
	GetClusterCreated(gcpProject string) (time.Time, error)
	KubectlCommand() (*exec.Cmd, error)
}
   238  
// publisher is an optional interface implemented by deployers that want to
// publish status on success.
type publisher interface {
	// Publish is called when the tests were successful; the deployer should publish a success file.
	Publish() error
}
   244  
   245  func getDeployer(o *options) (deployer, error) {
   246  	switch o.deployment {
   247  	case "bash":
   248  		return newBash(&o.clusterIPRange, o.gcpProject, o.gcpZone, o.gcpSSHProxyInstanceName, o.provider), nil
   249  	case "conformance":
   250  		return conformance.NewDeployer(o.kubecfg)
   251  	case "gke":
   252  		return newGKE(o.provider, o.gcpProject, o.gcpZone, o.gcpRegion, o.gcpNetwork, o.gcpNodeImage, o.gcpImageFamily, o.gcpImageProject, o.cluster, o.gcpSSHProxyInstanceName, &o.testArgs, &o.upgradeArgs)
   253  	case "kind":
   254  		return kind.NewDeployer(control, string(o.build))
   255  	case "kops":
   256  		return newKops(o.provider, o.gcpProject, o.cluster)
   257  	case "node":
   258  		return nodeDeploy{provider: o.provider}, nil
   259  	case "none":
   260  		return noneDeploy{}, nil
   261  	case "local":
   262  		return newLocalCluster(), nil
   263  	case "aksengine":
   264  		return newAKSEngine()
   265  	case "aks":
   266  		return newAksDeployer()
   267  	default:
   268  		return nil, fmt.Errorf("unknown deployment strategy %q", o.deployment)
   269  	}
   270  }
   271  
   272  func validateFlags(o *options) error {
   273  	if !o.extract.Enabled() && o.extractSource {
   274  		return errors.New("--extract-source flag cannot be passed without --extract")
   275  	}
   276  	return nil
   277  }
   278  
   279  func main() {
   280  	log.SetFlags(log.LstdFlags | log.Lshortfile)
   281  	log.Printf("Running kubetest version: %s\n", gitTag)
   282  
   283  	// Initialize global pseudo random generator. Initializing it to select random AWS Zones.
   284  	rand.Seed(time.Now().UnixNano())
   285  
   286  	pflag.CommandLine = pflag.NewFlagSet(os.Args[0], pflag.ContinueOnError)
   287  	o := defineFlags()
   288  	pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
   289  	if err := pflag.CommandLine.Parse(os.Args[1:]); err != nil {
   290  		log.Fatalf("Flag parse failed: %v", err)
   291  	}
   292  
   293  	if err := validateFlags(o); err != nil {
   294  		log.Fatalf("Flags validation failed. err: %v", err)
   295  	}
   296  
   297  	if o.version {
   298  		log.Printf("kubetest version: %s\n", gitTag)
   299  		return
   300  	}
   301  
   302  	control = process.NewControl(timeout, interrupt, terminate, verbose)
   303  
   304  	// do things when we know we are running in the kubetest image
   305  	if os.Getenv("KUBETEST_IN_DOCKER") == "true" {
   306  		o.flushMemAfterBuild = true
   307  	}
   308  	// sanity fix for kind deployer, not set for other deployers to avoid
   309  	// breaking changes...
   310  	if o.deployment == "kind" {
   311  		// always default --dump for kind, in CI use $ARTIFACTS
   312  		artifacts := os.Getenv("ARTIFACTS")
   313  		if artifacts == "" {
   314  			artifacts = "./_artifacts"
   315  		}
   316  		o.dump = artifacts
   317  	}
   318  
   319  	err := complete(o)
   320  
   321  	if boskos.HasResource() {
   322  		if berr := boskos.ReleaseAll("dirty"); berr != nil {
   323  			log.Fatalf("[Boskos] Fail To Release: %v, kubetest err: %v", berr, err)
   324  		}
   325  	}
   326  
   327  	if err != nil {
   328  		log.Fatalf("Something went wrong: %v", err)
   329  	}
   330  }
   331  
   332  func complete(o *options) error {
   333  	if !terminate.Stop() {
   334  		<-terminate.C // Drain the value if necessary.
   335  	}
   336  	if !interrupt.Stop() {
   337  		<-interrupt.C // Drain value
   338  	}
   339  
   340  	if timeout > 0 {
   341  		log.Printf("Limiting testing to %s", timeout)
   342  		interrupt.Reset(timeout)
   343  	}
   344  
   345  	if o.dump != "" {
   346  		defer writeMetadata(o.dump, o.metadataSources)
   347  		defer control.WriteXML(&suite, o.dump, time.Now())
   348  	}
   349  	if o.logexporterGCSPath != "" {
   350  		o.testArgs += fmt.Sprintf(" --logexporter-gcs-path=%s", o.logexporterGCSPath)
   351  	}
   352  	if err := control.XMLWrap(&suite, "Prepare", func() error { return prepare(o) }); err != nil {
   353  		return fmt.Errorf("failed to prepare test environment: %w", err)
   354  	}
   355  	// Get the deployer before we acquire k8s so any additional flag
   356  	// verifications happen early.
   357  	var deploy deployer
   358  	err := control.XMLWrap(&suite, "GetDeployer", func() error {
   359  		d, err := getDeployer(o)
   360  		deploy = d
   361  		return err
   362  	})
   363  	if err != nil {
   364  		return fmt.Errorf("error creating deployer: %w", err)
   365  	}
   366  
   367  	// Check soaking before run tests
   368  	if o.soak {
   369  		if created, err := deploy.GetClusterCreated(o.gcpProject); err != nil {
   370  			// continue, but log the error
   371  			log.Printf("deploy %v, GetClusterCreated failed: %v", o.deployment, err)
   372  		} else {
   373  			if time.Now().After(created.Add(o.soakDuration)) {
   374  				// flip up on - which will tear down previous cluster and start a new one
   375  				log.Printf("Previous soak cluster created at %v, will recreate the cluster", created)
   376  				o.up = true
   377  			}
   378  		}
   379  	}
   380  
   381  	if err := acquireKubernetes(o, deploy); err != nil {
   382  		return fmt.Errorf("failed to acquire k8s binaries: %w", err)
   383  	}
   384  	if o.extract.Enabled() {
   385  		// If we specified `--extract-source` we will already be in the correct directory
   386  		if !o.extractSource {
   387  			if err := os.Chdir("kubernetes"); err != nil {
   388  				return fmt.Errorf("failed to chdir to kubernetes dir: %w", err)
   389  			}
   390  		}
   391  	}
   392  	if err := validWorkingDirectory(); err != nil {
   393  		return fmt.Errorf("called from invalid working directory: %w", err)
   394  	}
   395  
   396  	if o.down {
   397  		// listen for signals such as ^C and gracefully attempt to clean up
   398  		c := make(chan os.Signal, 1)
   399  		signal.Notify(c, os.Interrupt)
   400  		go func() {
   401  			for range c {
   402  				log.Print("Captured ^C, gracefully attempting to cleanup resources..")
   403  				if err = deploy.Down(); err != nil {
   404  					log.Printf("Tearing down deployment failed: %v", err)
   405  				}
   406  				if err != nil {
   407  					os.Exit(1)
   408  				}
   409  
   410  				os.Exit(2)
   411  			}
   412  		}()
   413  	}
   414  
   415  	if err := run(deploy, *o); err != nil {
   416  		return err
   417  	}
   418  
   419  	// Publish the successfully tested version when requested
   420  	if o.publish != "" {
   421  		if err := publish(o.publish); err != nil {
   422  			return err
   423  		}
   424  	}
   425  	return nil
   426  }
   427  
   428  func acquireKubernetes(o *options, d deployer) error {
   429  	// Potentially build kubernetes
   430  	if o.build.Enabled() {
   431  		var err error
   432  		// kind deployer manages build
   433  		if k, ok := d.(*kind.Deployer); ok {
   434  			err = control.XMLWrap(&suite, "Build", k.Build)
   435  		} else if c, ok := d.(*aksEngineDeployer); ok { // Azure deployer
   436  			err = control.XMLWrap(&suite, "Build", func() error {
   437  				return c.Build(o.build)
   438  			})
   439  		} else {
   440  			err = control.XMLWrap(&suite, "Build", o.build.Build)
   441  		}
   442  		if o.flushMemAfterBuild {
   443  			util.FlushMem()
   444  		}
   445  		if err != nil {
   446  			return err
   447  		}
   448  	}
   449  
   450  	// Potentially stage build binaries somewhere on GCS
   451  	if o.stage.Enabled() {
   452  		if err := control.XMLWrap(&suite, "Stage", func() error {
   453  			return o.stage.Stage(o.noAllowDup)
   454  		}); err != nil {
   455  			return err
   456  		}
   457  	}
   458  
   459  	// Potentially download existing binaries and extract them.
   460  	if o.extract.Enabled() {
   461  		err := control.XMLWrap(&suite, "Extract", func() error {
   462  			// Should we restore a previous state?
   463  			// Restore if we are not upping the cluster
   464  			if o.save != "" {
   465  				if !o.up {
   466  					// Restore version and .kube/config from --up
   467  					log.Printf("Overwriting extract strategy to load kubeconfig and version from %s", o.save)
   468  					o.extract = extractStrategies{
   469  						extractStrategy{
   470  							mode:   load,
   471  							option: o.save,
   472  						},
   473  					}
   474  				}
   475  			}
   476  
   477  			// New deployment, extract new version
   478  			return o.extract.Extract(o.gcpProject, o.gcpZone, o.gcpRegion, o.extractCIBucket, o.extractReleaseBucket, o.extractSource)
   479  		})
   480  		if err != nil {
   481  			return err
   482  		}
   483  	}
   484  	return nil
   485  }
   486  
   487  // Returns the k8s version name
   488  func findVersion() string {
   489  	// The version may be in a version file
   490  	if _, err := os.Stat("version"); err == nil {
   491  		b, err := os.ReadFile("version")
   492  		if err == nil {
   493  			return strings.TrimSpace(string(b))
   494  		}
   495  		log.Printf("Failed to read version: %v", err)
   496  	}
   497  
   498  	// We can also get it from the git repo.
   499  	if _, err := os.Stat("hack/lib/version.sh"); err == nil {
   500  		// TODO(fejta): do this in go. At least we removed the upload-to-gcs.sh dep.
   501  		gross := `. hack/lib/version.sh && KUBE_ROOT=. kube::version::get_version_vars && echo "${KUBE_GIT_VERSION-}"`
   502  		b, err := control.Output(exec.Command("bash", "-c", gross))
   503  		if err == nil {
   504  			return strings.TrimSpace(string(b))
   505  		}
   506  		log.Printf("Failed to get_version_vars: %v", err)
   507  	}
   508  
   509  	return "unknown" // Sad trombone
   510  }
   511  
   512  // maybeMergeMetadata will add new keyvals into the map; quietly eats errors.
   513  func maybeMergeJSON(meta map[string]string, path string) {
   514  	if data, err := os.ReadFile(path); err == nil {
   515  		json.Unmarshal(data, &meta)
   516  	}
   517  }
   518  
   519  // Write metadata.json, including version and env arg data.
   520  func writeMetadata(path, metadataSources string) error {
   521  	m := make(map[string]string)
   522  
   523  	// Look for any sources of metadata and load 'em
   524  	for _, f := range strings.Split(metadataSources, ",") {
   525  		maybeMergeJSON(m, filepath.Join(path, f))
   526  	}
   527  
   528  	ver := findVersion()
   529  	m["job-version"] = ver // TODO(krzyzacy): retire
   530  	m["revision"] = ver
   531  	m["kubetest-version"] = gitTag
   532  	re := regexp.MustCompile(`^BUILD_METADATA_(.+)$`)
   533  	for _, e := range os.Environ() {
   534  		p := strings.SplitN(e, "=", 2)
   535  		r := re.FindStringSubmatch(p[0])
   536  		if r == nil {
   537  			continue
   538  		}
   539  		k, v := strings.ToLower(r[1]), p[1]
   540  		m[k] = v
   541  	}
   542  	f, err := os.Create(filepath.Join(path, "metadata.json"))
   543  	if err != nil {
   544  		return err
   545  	}
   546  	defer f.Close()
   547  	e := json.NewEncoder(f)
   548  	return e.Encode(m)
   549  }
   550  
   551  // Install cloudsdk tarball to location, updating PATH
   552  func installGcloud(tarball string, location string) error {
   553  
   554  	if err := os.MkdirAll(location, 0775); err != nil {
   555  		return err
   556  	}
   557  
   558  	if err := control.FinishRunning(exec.Command("tar", "xzf", tarball, "-C", location)); err != nil {
   559  		return err
   560  	}
   561  
   562  	if err := control.FinishRunning(exec.Command(filepath.Join(location, "google-cloud-sdk", "install.sh"), "--disable-installation-options", "--bash-completion=false", "--path-update=false", "--usage-reporting=false")); err != nil {
   563  		return err
   564  	}
   565  
   566  	if err := util.InsertPath(filepath.Join(location, "google-cloud-sdk", "bin")); err != nil {
   567  		return err
   568  	}
   569  
   570  	if err := control.FinishRunning(exec.Command("gcloud", "components", "install", "alpha")); err != nil {
   571  		return err
   572  	}
   573  
   574  	if err := control.FinishRunning(exec.Command("gcloud", "components", "install", "beta")); err != nil {
   575  		return err
   576  	}
   577  
   578  	if err := control.FinishRunning(exec.Command("gcloud", "info")); err != nil {
   579  		return err
   580  	}
   581  	return nil
   582  }
   583  
   584  func migrateGcpEnvAndOptions(o *options) error {
   585  	var network string
   586  	var zone string
   587  	switch o.provider {
   588  	case "gke":
   589  		network = "KUBE_GKE_NETWORK"
   590  		zone = "ZONE"
   591  	default:
   592  		network = "KUBE_GCE_NETWORK"
   593  		zone = "KUBE_GCE_ZONE"
   594  	}
   595  	return util.MigrateOptions([]util.MigratedOption{
   596  		{
   597  			Env:    "PROJECT",
   598  			Option: &o.gcpProject,
   599  			Name:   "--gcp-project",
   600  		},
   601  		{
   602  			Env:    zone,
   603  			Option: &o.gcpZone,
   604  			Name:   "--gcp-zone",
   605  		},
   606  		{
   607  			Env:    "REGION",
   608  			Option: &o.gcpRegion,
   609  			Name:   "--gcp-region",
   610  		},
   611  		{
   612  			Env:    "GOOGLE_APPLICATION_CREDENTIALS",
   613  			Option: &o.gcpServiceAccount,
   614  			Name:   "--gcp-service-account",
   615  		},
   616  		{
   617  			Env:    network,
   618  			Option: &o.gcpNetwork,
   619  			Name:   "--gcp-network",
   620  		},
   621  		{
   622  			Env:    "KUBE_NODE_OS_DISTRIBUTION",
   623  			Option: &o.gcpNodeImage,
   624  			Name:   "--gcp-node-image",
   625  		},
   626  		{
   627  			Env:    "KUBE_MASTER_OS_DISTRIBUTION",
   628  			Option: &o.gcpMasterImage,
   629  			Name:   "--gcp-master-image",
   630  		},
   631  		{
   632  			Env:    "NUM_NODES",
   633  			Option: &o.gcpNodes,
   634  			Name:   "--gcp-nodes",
   635  		},
   636  		{
   637  			Env:    "NODE_SIZE",
   638  			Option: &o.gcpNodeSize,
   639  			Name:   "--gcp-node-size",
   640  		},
   641  		{
   642  			Env:    "MASTER_SIZE",
   643  			Option: &o.gcpMasterSize,
   644  			Name:   "--gcp-master-size",
   645  		},
   646  		{
   647  			Env:      "CLOUDSDK_BUCKET",
   648  			Option:   &o.gcpCloudSdk,
   649  			Name:     "--gcp-cloud-sdk",
   650  			SkipPush: true,
   651  		},
   652  	})
   653  }
   654  
   655  func prepareGcp(o *options) error {
   656  	if err := migrateGcpEnvAndOptions(o); err != nil {
   657  		return err
   658  	}
   659  	// Must happen before any gcloud commands
   660  	if err := activateServiceAccount(o.gcpServiceAccount); err != nil {
   661  		return err
   662  	}
   663  
   664  	if o.provider == "gce" {
   665  		if distro := os.Getenv("KUBE_OS_DISTRIBUTION"); distro != "" {
   666  			log.Printf("Please use --gcp-master-image=%s --gcp-node-image=%s (instead of deprecated KUBE_OS_DISTRIBUTION)",
   667  				distro, distro)
   668  			// Note: KUBE_OS_DISTRIBUTION takes precedence over
   669  			// KUBE_{MASTER,NODE}_OS_DISTRIBUTION, so override here
   670  			// after the migration above.
   671  			o.gcpNodeImage = distro
   672  			o.gcpMasterImage = distro
   673  			if err := os.Setenv("KUBE_NODE_OS_DISTRIBUTION", distro); err != nil {
   674  				return fmt.Errorf("could not set KUBE_NODE_OS_DISTRIBUTION=%s: %w", distro, err)
   675  			}
   676  			if err := os.Setenv("KUBE_MASTER_OS_DISTRIBUTION", distro); err != nil {
   677  				return fmt.Errorf("could not set KUBE_MASTER_OS_DISTRIBUTION=%s: %w", distro, err)
   678  			}
   679  		}
   680  
   681  		hasGCPImageFamily, hasGCPImageProject := len(o.gcpImageFamily) != 0, len(o.gcpImageProject) != 0
   682  		if hasGCPImageFamily != hasGCPImageProject {
   683  			return fmt.Errorf("--image-family and --image-project must be both set or unset")
   684  		}
   685  		if hasGCPImageFamily && hasGCPImageProject {
   686  			out, err := control.Output(exec.Command("gcloud", "compute", "images", "describe-from-family", o.gcpImageFamily, "--project", o.gcpImageProject))
   687  			if err != nil {
   688  				return fmt.Errorf("failed to get latest image from family %q in project %q: %s", o.gcpImageFamily, o.gcpImageProject, err)
   689  			}
   690  			latestImage := ""
   691  			latestImageRegexp := regexp.MustCompile(`^name: *(\S+)`)
   692  			for _, line := range strings.Split(string(out), "\n") {
   693  				matches := latestImageRegexp.FindStringSubmatch(line)
   694  				if len(matches) == 2 {
   695  					latestImage = matches[1]
   696  					break
   697  				}
   698  			}
   699  			if len(latestImage) == 0 {
   700  				return fmt.Errorf("failed to get latest image from family %q in project %q", o.gcpImageFamily, o.gcpImageProject)
   701  			}
   702  			if o.deployment == "node" {
   703  				o.nodeArgs += fmt.Sprintf(" --images=%s --image-project=%s", latestImage, o.gcpImageProject)
   704  			} else {
   705  				os.Setenv("KUBE_GCE_NODE_IMAGE", latestImage)
   706  				os.Setenv("KUBE_GCE_NODE_PROJECT", o.gcpImageProject)
   707  			}
   708  		}
   709  	} else if o.provider == "gke" {
   710  		if o.deployment == "" {
   711  			o.deployment = "gke"
   712  		}
   713  		if o.deployment != "gke" {
   714  			return fmt.Errorf("expected --deployment=gke for --provider=gke, found --deployment=%s", o.deployment)
   715  		}
   716  		if o.gcpMasterImage != "" {
   717  			return fmt.Errorf("expected --gcp-master-image to be empty for --provider=gke, found --gcp-master-image=%s", o.gcpMasterImage)
   718  		}
   719  		if o.gcpNodes != "" {
   720  			return fmt.Errorf("--gcp-nodes cannot be set on GKE, use --gke-shape instead")
   721  		}
   722  		if o.gcpNodeSize != "" {
   723  			return fmt.Errorf("--gcp-node-size cannot be set on GKE, use --gke-shape instead")
   724  		}
   725  		if o.gcpMasterSize != "" {
   726  			return fmt.Errorf("--gcp-master-size cannot be set on GKE, where it's auto-computed")
   727  		}
   728  
   729  		// TODO(kubernetes/test-infra#3536): This is used by the
   730  		// ginkgo-e2e.sh wrapper.
   731  		nod := o.gcpNodeImage
   732  		if nod == "container_vm" {
   733  			// gcloud container clusters create understands
   734  			// "container_vm", e2es understand "debian".
   735  			nod = "debian"
   736  		}
   737  		if nod == "cos_containerd" {
   738  			// gcloud container clusters create understands
   739  			// "cos_containerd", e2es only understand
   740  			// "gci"/"cos",
   741  			nod = "gci"
   742  		}
   743  		os.Setenv("NODE_OS_DISTRIBUTION", nod)
   744  	}
   745  	if o.gcpProject == "" {
   746  		log.Print("--gcp-project is missing, trying to fetch a project from boskos.\n" +
   747  			"(for local runs please set --gcp-project to your dev project)")
   748  
   749  		var resType string
   750  		if o.gcpProjectType != "" {
   751  			resType = o.gcpProjectType
   752  		} else if o.provider == "gke" {
   753  			resType = "gke-project"
   754  		} else {
   755  			resType = "gce-project"
   756  		}
   757  
   758  		log.Printf("provider %v, will acquire project type %v from boskos", o.provider, resType)
   759  
   760  		// let's retry 5min to get next available resource
   761  		ctx, cancel := context.WithTimeout(context.Background(), o.boskosWaitDuration)
   762  		defer cancel()
   763  		p, err := boskos.AcquireWait(ctx, resType, "free", "busy")
   764  		if err != nil {
   765  			return fmt.Errorf("--provider=%s boskos failed to acquire project: %w", o.provider, err)
   766  		}
   767  
   768  		if p == nil {
   769  			return fmt.Errorf("boskos does not have a free %s at the moment", resType)
   770  		}
   771  
   772  		go func(c *client.Client, proj string) {
   773  			for range time.Tick(time.Minute * 5) {
   774  				if err := c.UpdateOne(p.Name, "busy", nil); err != nil {
   775  					log.Printf("[Boskos] Update of %s failed with %v", p.Name, err)
   776  				}
   777  			}
   778  		}(boskos, p.Name)
   779  		o.gcpProject = p.Name
   780  	}
   781  
   782  	if err := os.Setenv("CLOUDSDK_CORE_PRINT_UNHANDLED_TRACEBACKS", "1"); err != nil {
   783  		return fmt.Errorf("could not set CLOUDSDK_CORE_PRINT_UNHANDLED_TRACEBACKS=1: %w", err)
   784  	}
   785  
   786  	if err := control.FinishRunning(exec.Command("gcloud", "config", "set", "project", o.gcpProject)); err != nil {
   787  		return fmt.Errorf("fail to set project %s : err %w", o.gcpProject, err)
   788  	}
   789  
   790  	// TODO(krzyzacy):Remove this when we retire migrateGcpEnvAndOptions
   791  	// Note that a lot of scripts are still depend on this env in k/k repo.
   792  	if err := os.Setenv("PROJECT", o.gcpProject); err != nil {
   793  		return fmt.Errorf("fail to set env var PROJECT %s : err %w", o.gcpProject, err)
   794  	}
   795  
   796  	// Ensure ssh keys exist
   797  	log.Print("Checking existing of GCP ssh keys...")
   798  	k := filepath.Join(util.Home(".ssh"), "google_compute_engine")
   799  	if _, err := os.Stat(k); err != nil {
   800  		return err
   801  	}
   802  	pk := k + ".pub"
   803  	if _, err := os.Stat(pk); err != nil {
   804  		return err
   805  	}
   806  
   807  	log.Printf("Checking presence of public key in %s", o.gcpProject)
   808  	if out, err := control.Output(exec.Command("gcloud", "compute", "--project="+o.gcpProject, "project-info", "describe")); err != nil {
   809  		return err
   810  	} else if b, err := os.ReadFile(pk); err != nil {
   811  		return err
   812  	} else if !strings.Contains(string(out), string(b)) {
   813  		log.Print("Uploading public ssh key to project metadata...")
   814  		if err = control.FinishRunning(exec.Command("gcloud", "compute", "--project="+o.gcpProject, "config-ssh")); err != nil {
   815  			return err
   816  		}
   817  	}
   818  
   819  	// Install custom gcloud version if necessary
   820  	if o.gcpCloudSdk != "" {
   821  		for i := 0; i < 3; i++ {
   822  			if err := control.FinishRunning(exec.Command("gsutil", "-mq", "cp", "-r", o.gcpCloudSdk, util.Home())); err == nil {
   823  				break // Success!
   824  			}
   825  			time.Sleep(1 << uint(i) * time.Second)
   826  		}
   827  		for _, f := range []string{util.Home(".gsutil"), util.Home("repo"), util.Home("cloudsdk")} {
   828  			if _, err := os.Stat(f); err == nil || !os.IsNotExist(err) {
   829  				if err = os.RemoveAll(f); err != nil {
   830  					return err
   831  				}
   832  			}
   833  		}
   834  
   835  		install := util.Home("repo", "google-cloud-sdk.tar.gz")
   836  		if strings.HasSuffix(o.gcpCloudSdk, ".tar.gz") {
   837  			install = util.Home(filepath.Base(o.gcpCloudSdk))
   838  		} else {
   839  			if err := os.Rename(util.Home(filepath.Base(o.gcpCloudSdk)), util.Home("repo")); err != nil {
   840  				return err
   841  			}
   842  
   843  			// Controls which gcloud components to install.
   844  			pop, err := util.PushEnv("CLOUDSDK_COMPONENT_MANAGER_SNAPSHOT_URL", "file://"+util.Home("repo", "components-2.json"))
   845  			if err != nil {
   846  				return err
   847  			}
   848  			defer pop()
   849  		}
   850  
   851  		if err := installGcloud(install, util.Home("cloudsdk")); err != nil {
   852  			return err
   853  		}
   854  		// gcloud creds may have changed
   855  		if err := activateServiceAccount(o.gcpServiceAccount); err != nil {
   856  			return err
   857  		}
   858  	}
   859  
   860  	if o.kubemark {
   861  		if p := os.Getenv("KUBEMARK_BAZEL_BUILD"); strings.ToLower(p) == "y" {
   862  			// we need docker-credential-gcr to get authed properly
   863  			// https://github.com/bazelbuild/rules_docker#authorization
   864  			if err := control.FinishRunning(exec.Command("gcloud", "components", "install", "docker-credential-gcr")); err != nil {
   865  				return err
   866  			}
   867  			if err := control.FinishRunning(exec.Command("docker-credential-gcr", "configure-docker")); err != nil {
   868  				return err
   869  			}
   870  		}
   871  	}
   872  
   873  	return nil
   874  }
   875  
   876  func prepareAws(o *options) error {
   877  	// gcloud creds may have changed
   878  	if err := activateServiceAccount(o.gcpServiceAccount); err != nil {
   879  		return err
   880  	}
   881  	return control.FinishRunning(exec.Command("pip", "install", "awscli"))
   882  }
   883  
   884  // Activate GOOGLE_APPLICATION_CREDENTIALS if set or do nothing.
   885  func activateServiceAccount(path string) error {
   886  	if path == "" {
   887  		return nil
   888  	}
   889  	return control.FinishRunning(exec.Command("gcloud", "auth", "activate-service-account", "--key-file="+path))
   890  }
   891  
   892  func prepare(o *options) error {
   893  	if err := util.MigrateOptions([]util.MigratedOption{
   894  		{
   895  			Env:    "KUBERNETES_PROVIDER",
   896  			Option: &o.provider,
   897  			Name:   "--provider",
   898  		},
   899  		{
   900  			Env:    "CLUSTER_NAME",
   901  			Option: &o.cluster,
   902  			Name:   "--cluster",
   903  		},
   904  	}); err != nil {
   905  		return err
   906  	}
   907  	if err := prepareGinkgoParallel(&o.ginkgoParallel); err != nil {
   908  		return err
   909  	}
   910  
   911  	switch o.provider {
   912  	case "gce", "gke", "node":
   913  		if err := prepareGcp(o); err != nil {
   914  			return err
   915  		}
   916  	case "aws":
   917  		if err := prepareAws(o); err != nil {
   918  			return err
   919  		}
   920  	}
   921  
   922  	if o.kubemark {
   923  		if err := util.MigrateOptions([]util.MigratedOption{
   924  			{
   925  				Env:    "KUBEMARK_NUM_NODES",
   926  				Option: &o.kubemarkNodes,
   927  				Name:   "--kubemark-nodes",
   928  			},
   929  			{
   930  				Env:    "KUBEMARK_MASTER_SIZE",
   931  				Option: &o.kubemarkMasterSize,
   932  				Name:   "--kubemark-master-size",
   933  			},
   934  		}); err != nil {
   935  			return err
   936  		}
   937  	}
   938  
   939  	if err := os.MkdirAll(artifacts, 0777); err != nil { // Create artifacts
   940  		return err
   941  	}
   942  
   943  	return nil
   944  }
   945  
   946  type ginkgoParallelValue struct {
   947  	v int // 0 == not set (defaults to 1)
   948  }
   949  
   950  func (v *ginkgoParallelValue) IsBoolFlag() bool {
   951  	return true
   952  }
   953  
   954  func (v *ginkgoParallelValue) String() string {
   955  	if v.v == 0 {
   956  		return "1"
   957  	}
   958  	return strconv.Itoa(v.v)
   959  }
   960  
   961  func (v *ginkgoParallelValue) Set(s string) error {
   962  	if s == "" {
   963  		v.v = 0
   964  		return nil
   965  	}
   966  	if s == "true" {
   967  		v.v = defaultGinkgoParallel
   968  		return nil
   969  	}
   970  	p, err := strconv.Atoi(s)
   971  	if err != nil {
   972  		return fmt.Errorf("--ginkgo-parallel must be an integer, found %q", s)
   973  	}
   974  	if p < 1 {
   975  		return fmt.Errorf("--ginkgo-parallel must be >= 1, found %d", p)
   976  	}
   977  	v.v = p
   978  	return nil
   979  }
   980  
   981  func (v *ginkgoParallelValue) Type() string {
   982  	return "ginkgoParallelValue"
   983  }
   984  
   985  func (v *ginkgoParallelValue) Get() int {
   986  	if v.v == 0 {
   987  		return 1
   988  	}
   989  	return v.v
   990  }
   991  
   992  var _ flag.Value = &ginkgoParallelValue{}
   993  
   994  // Hand migrate this option. GINKGO_PARALLEL => GINKGO_PARALLEL_NODES=25
   995  func prepareGinkgoParallel(v *ginkgoParallelValue) error {
   996  	if p := os.Getenv("GINKGO_PARALLEL"); strings.ToLower(p) == "y" {
   997  		log.Printf("Please use kubetest --ginkgo-parallel (instead of deprecated GINKGO_PARALLEL=y)")
   998  		if err := v.Set("true"); err != nil {
   999  			return err
  1000  		}
  1001  		os.Unsetenv("GINKGO_PARALLEL")
  1002  	}
  1003  	if p := os.Getenv("GINKGO_PARALLEL_NODES"); p != "" {
  1004  		log.Printf("Please use kubetest --ginkgo-parallel=%s (instead of deprecated GINKGO_PARALLEL_NODES=%s)", p, p)
  1005  		if err := v.Set(p); err != nil {
  1006  			return err
  1007  		}
  1008  	}
  1009  	os.Setenv("GINKGO_PARALLEL_NODES", v.String())
  1010  	return nil
  1011  }
  1012  
  1013  func publish(pub string) error {
  1014  	v, err := os.ReadFile("version")
  1015  	if err != nil {
  1016  		return err
  1017  	}
  1018  	log.Printf("Set %s version to %s", pub, string(v))
  1019  	return gcsWrite(pub, v)
  1020  }