k8s.io/test-infra@v0.0.0-20240520184403-27c6b4c223d8/kubetest/main.go (about)

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package main
    18  
    19  import (
    20  	"context"
    21  	"encoding/json"
    22  	"errors"
    23  	"flag"
    24  	"fmt"
    25  	"log"
    26  	"os"
    27  	"os/exec"
    28  	"os/signal"
    29  	"path/filepath"
    30  	"regexp"
    31  	"strconv"
    32  	"strings"
    33  	"time"
    34  
    35  	"github.com/spf13/pflag"
    36  	"k8s.io/test-infra/kubetest/boskos/client"
    37  
    38  	"k8s.io/test-infra/kubetest/conformance"
    39  	"k8s.io/test-infra/kubetest/kind"
    40  	"k8s.io/test-infra/kubetest/process"
    41  	"k8s.io/test-infra/kubetest/util"
    42  )
    43  
// defaultGinkgoParallel is the runner count quoted in the --ginkgo-parallel
// help text. Hardcoded in ginkgo-e2e.sh
const defaultGinkgoParallel = 25
    46  
// Package-level state shared by the kubetest helpers in this file.
var (
	// artifacts is $WORKSPACE/_artifacts.
	artifacts = filepath.Join(os.Getenv("WORKSPACE"), "_artifacts")
	// boskos is built from $JOB_NAME and a fixed in-cluster boskos URL; the
	// error returned by the constructor is discarded here.
	boskos, _ = client.NewClient(os.Getenv("JOB_NAME"), "http://boskos.test-pods.svc.cluster.local.", "", "")
	// control is created from the zero-value settings below and is rebuilt in
	// main once the flags have been parsed.
	control   = process.NewControl(timeout, interrupt, terminate, verbose)
	gitTag    = ""                              // initializing default zero value. ldflags will populate this during build time.
	interrupt = time.NewTimer(time.Duration(0)) // interrupt testing at this time.
	terminate = time.NewTimer(time.Duration(0)) // terminate testing at this time.
	// timeout is bound to the --timeout flag by defineFlags.
	timeout   = time.Duration(0)
	// verbose is bound to the --verbose-commands flag by defineFlags.
	verbose   = false
)
    57  
// options holds the values of the kubetest command-line flags. defineFlags
// binds each field to its flag; main and complete adjust a few of them
// afterwards (for example dump, flushMemAfterBuild and up).
type options struct {
	build                buildStrategy
	boskosWaitDuration   time.Duration
	charts               bool
	checkLeaks           bool
	checkSkew            bool
	cluster              string
	clusterIPRange       string
	deployment           string
	down                 bool
	dump                 string
	dumpPreTestLogs      string
	extract              extractStrategies
	extractCIBucket      string
	extractReleaseBucket string
	extractSource        bool
	flushMemAfterBuild   bool
	focusRegex           string
	gcpCloudSdk          string
	gcpMasterImage       string
	gcpMasterSize        string
	gcpNetwork           string
	gcpNodeImage         string
	gcpImageFamily       string
	gcpImageProject      string
	gcpNodes             string
	gcpNodeSize          string
	gcpProject           string
	gcpProjectType       string
	gcpServiceAccount    string
	// gcpSSHProxyInstanceName is the name of the vm instance which ip address will be used to set the
	// KUBE_SSH_BASTION env. If set, it will result in proxying ssh connections in tests through the
	// "bastion". It's useful for clusters with nodes without public ssh access, e.g. nodes without
	// public ip addresses. Works only for gcp providers (gce, gke).
	gcpSSHProxyInstanceName string
	gcpRegion               string
	gcpZone                 string
	ginkgoParallel          ginkgoParallelValue
	kubecfg                 string
	kubemark                bool
	kubemarkMasterSize      string
	kubemarkNodes           string // TODO(fejta): switch to int after migration
	logexporterGCSPath      string
	metadataSources         string
	noAllowDup              bool
	nodeArgs                string
	nodeTestArgs            string
	nodeTests               bool
	preTestCmd              string
	postTestCmd             string
	provider                string
	publish                 string
	runtimeConfig           string
	save                    string
	skew                    bool
	skipDumpClusterLogs     bool
	skipRegex               string
	soak                    bool
	soakDuration            time.Duration
	stage                   stageStrategy
	storageTestDriverPath   string
	test                    bool
	testArgs                string
	testCmd                 string
	testCmdName             string
	testCmdArgs             []string
	up                      bool
	upgradeArgs             string
	version                 bool
}
   128  
// defineFlags registers every kubetest command-line flag and returns the
// options value the flags are bound to.
//
// Almost all flags are registered on the standard library's default flag set;
// --test-cmd-args is registered through pflag because the standard flag
// package has no string-array flag. --timeout and --verbose-commands are bound
// to the package-level timeout and verbose variables rather than to a field of
// the returned options. Nothing is parsed here; the caller does that.
func defineFlags() *options {
	o := options{}
	flag.Var(&o.build, "build", "Rebuild k8s binaries, optionally forcing (release|quick|bazel) strategy")
	flag.DurationVar(&o.boskosWaitDuration, "boskos-wait-duration", 5*time.Minute, "Defines how long it waits until quit getting Boskos resoure, default 5 minutes")
	flag.BoolVar(&o.charts, "charts", false, "If true, run charts tests")
	flag.BoolVar(&o.checkSkew, "check-version-skew", true, "Verify client and server versions match")
	flag.BoolVar(&o.checkLeaks, "check-leaked-resources", false, "Ensure project ends with the same resources")
	flag.StringVar(&o.cluster, "cluster", "", "Cluster name. Must be set for --deployment=gke (TODO: other deployments).")
	flag.StringVar(&o.clusterIPRange, "cluster-ip-range", "", "Specifies CLUSTER_IP_RANGE value during --up and --test (only relevant for --deployment=bash). Auto-calculated if empty.")
	flag.StringVar(&o.deployment, "deployment", "bash", "Choices: none/bash/conformance/gke/kind/kops/node/local")
	flag.BoolVar(&o.down, "down", false, "If true, tear down the cluster before exiting.")
	flag.StringVar(&o.dump, "dump", "", "If set, dump bring-up and cluster logs to this location on test or cluster-up failure")
	flag.StringVar(&o.dumpPreTestLogs, "dump-pre-test-logs", "", "If set, dump cluster logs to this location before running tests")
	flag.Var(&o.extract, "extract", "Extract k8s binaries from the specified release location")
	flag.StringVar(&o.extractCIBucket, "extract-ci-bucket", "k8s-release-dev", "Extract k8s CI binaries from the specified GCS bucket")
	flag.StringVar(&o.extractReleaseBucket, "extract-release-bucket", "kubernetes-release", "Extract k8s release binaries from the specified GCS bucket")
	flag.BoolVar(&o.extractSource, "extract-source", false, "Extract k8s src together with other tarballs")
	flag.BoolVar(&o.flushMemAfterBuild, "flush-mem-after-build", false, "If true, try to flush container memory after building")
	flag.Var(&o.ginkgoParallel, "ginkgo-parallel", fmt.Sprintf("Run Ginkgo tests in parallel, default %d runners. Use --ginkgo-parallel=N to specify an exact count.", defaultGinkgoParallel))
	flag.StringVar(&o.gcpCloudSdk, "gcp-cloud-sdk", "", "Install/upgrade google-cloud-sdk to the gs:// path if set")
	flag.StringVar(&o.gcpProject, "gcp-project", "", "For use with gcloud commands")
	flag.StringVar(&o.gcpProjectType, "gcp-project-type", "", "Explicitly indicate which project type to select from boskos")
	flag.StringVar(&o.gcpServiceAccount, "gcp-service-account", "", "Service account to activate before using gcloud")
	flag.StringVar(&o.gcpZone, "gcp-zone", "", "For use with gcloud commands")
	flag.StringVar(&o.gcpRegion, "gcp-region", "", "For use with gcloud commands")
	flag.StringVar(&o.gcpNetwork, "gcp-network", "", "Cluster network. Must be set for --deployment=gke (TODO: other deployments).")
	flag.StringVar(&o.gcpMasterImage, "gcp-master-image", "", "Master image type (cos|debian on GCE, n/a on GKE)")
	flag.StringVar(&o.gcpMasterSize, "gcp-master-size", "", "(--provider=gce only) Size of master to create (e.g n1-standard-1). Auto-calculated if left empty.")
	flag.StringVar(&o.gcpNodeImage, "gcp-node-image", "", "Node image type (cos|container_vm on GKE, cos|debian on GCE)")
	flag.StringVar(&o.gcpImageFamily, "image-family", "", "Node image family from which to use the latest image, required when --gcp-node-image=CUSTOM")
	flag.StringVar(&o.gcpImageProject, "image-project", "", "Project containing node image family, required when --gcp-node-image=CUSTOM")
	flag.StringVar(&o.gcpNodes, "gcp-nodes", "", "(--provider=gce only) Number of nodes to create.")
	flag.StringVar(&o.gcpNodeSize, "gcp-node-size", "", "(--provider=gce only) Size of nodes to create (e.g n1-standard-1).")
	flag.StringVar(&o.gcpSSHProxyInstanceName, "gcp-ssh-proxy-instance-name", "", "(--provider=gce|gke only) If set, will result in proxing the ssh connections via the provided instance name while running tests")
	flag.StringVar(&o.kubecfg, "kubeconfig", "", "The location of a kubeconfig file.")
	flag.StringVar(&o.focusRegex, "ginkgo-focus", "", "The ginkgo regex to focus. Currently only respected for (dind).")
	flag.StringVar(&o.skipRegex, "ginkgo-skip", "", "The ginkgo regex to skip. Currently only respected for (dind).")
	flag.BoolVar(&o.kubemark, "kubemark", false, "If true, run kubemark tests.")
	flag.StringVar(&o.kubemarkMasterSize, "kubemark-master-size", "", "Kubemark master size (only relevant if --kubemark=true). Auto-calculated based on '--kubemark-nodes' if left empty.")
	flag.StringVar(&o.kubemarkNodes, "kubemark-nodes", "5", "Number of kubemark nodes to start (only relevant if --kubemark=true).")
	flag.StringVar(&o.logexporterGCSPath, "logexporter-gcs-path", "", "Path to the GCS artifacts directory to dump logs from nodes. Logexporter gets enabled if this is non-empty")
	flag.StringVar(&o.metadataSources, "metadata-sources", "images.json", "Comma-separated list of files inside ./artifacts to merge into metadata.json")
	flag.StringVar(&o.nodeArgs, "node-args", "", "Args for node e2e tests.")
	flag.StringVar(&o.nodeTestArgs, "node-test-args", "", "Test args specifically for node e2e tests.")
	flag.BoolVar(&o.noAllowDup, "no-allow-dup", false, "if set --allow-dup will not be passed to push-build and --stage will error if the build already exists on the gcs path")
	flag.BoolVar(&o.nodeTests, "node-tests", false, "If true, run node-e2e tests.")
	flag.StringVar(&o.preTestCmd, "pre-test-cmd", "", "If set, run the provided command before running any tests.")
	flag.StringVar(&o.postTestCmd, "post-test-cmd", "", "If set, run the provided command after running all the tests.")
	flag.StringVar(&o.provider, "provider", "", "Kubernetes provider such as gce, gke, aws, etc")
	flag.StringVar(&o.publish, "publish", "", "Publish version to the specified gs:// path on success")
	flag.StringVar(&o.runtimeConfig, "runtime-config", "", "If set, API versions can be turned on or off while bringing up the API server.")
	flag.StringVar(&o.stage.dockerRegistry, "registry", "", "Push images to the specified docker registry (e.g. gcr.io/a-test-project)")
	flag.StringVar(&o.save, "save", "", "Save credentials to gs:// path on --up if set (or load from there if not --up)")
	flag.BoolVar(&o.skew, "skew", false, "If true, run tests in another version at ../kubernetes/kubernetes_skew")
	flag.BoolVar(&o.skipDumpClusterLogs, "skip-dump-cluster-logs", false, "If true, skip the cluster log dumping")
	flag.BoolVar(&o.soak, "soak", false, "If true, job runs in soak mode")
	flag.DurationVar(&o.soakDuration, "soak-duration", 7*24*time.Hour, "Maximum age of a soak cluster before it gets recycled")
	flag.Var(&o.stage, "stage", "Upload binaries to gs://bucket/devel/job-suffix if set")
	flag.StringVar(&o.stage.versionSuffix, "stage-suffix", "", "Append suffix to staged version when set")
	flag.StringVar(&o.storageTestDriverPath, "storage-testdriver-repo-path", "", "Relative path for external e2e test driver config in the csi driver repo")
	flag.BoolVar(&o.test, "test", false, "Run Ginkgo tests.")
	flag.StringVar(&o.testArgs, "test_args", "", "Space-separated list of arguments to pass to Ginkgo test runner.")
	flag.StringVar(&o.testCmd, "test-cmd", "", "command to run against the cluster instead of Ginkgo e2e tests")
	flag.StringVar(&o.testCmdName, "test-cmd-name", "", "name to log the test command as in xml results")
	// --timeout writes to the package-level timeout, not to a field of o.
	flag.DurationVar(&timeout, "timeout", time.Duration(0), "Terminate testing after the timeout duration (s/m/h)")
	flag.BoolVar(&o.up, "up", false, "If true, start the e2e cluster. If cluster is already up, recreate it.")
	flag.StringVar(&o.upgradeArgs, "upgrade_args", "", "If set, run upgrade tests before other tests")
	flag.BoolVar(&o.version, "version", false, "Command to print version")

	// The "-v" flag was also used by glog, which is used by k8s.io/client-go. Duplicate flags cause panics.
	// 1. Even if we could convince glog to change, they have too many consumers to ever do so.
	// 2. The glog lib parses flags during init. It is impossible to dynamically rewrite the args before they're parsed by glog.
	// 3. The glog lib takes an int value, so "-v false" is an error.
	// 4. It's possible, but unlikely, we could convince k8s.io/client-go to use a logging shim, because a library shouldn't force a logging implementation. This would take a major version release for the lib.
	//
	// The most reasonable solution is to accept that we shouldn't have made a single-letter global, and rename all references to this variable.
	flag.BoolVar(&verbose, "verbose-commands", true, "If true, print all command output.")

	// go flag does not support StringArrayVar
	pflag.StringArrayVar(&o.testCmdArgs, "test-cmd-args", []string{}, "args for test-cmd")
	return &o
}
   211  
   212  var suite util.TestSuite = util.TestSuite{Name: "kubetest"}
   213  
   214  func validWorkingDirectory() error {
   215  	cwd, err := os.Getwd()
   216  	if err != nil {
   217  		return fmt.Errorf("could not get pwd: %w", err)
   218  	}
   219  	acwd, err := filepath.Abs(cwd)
   220  	if err != nil {
   221  		return fmt.Errorf("failed to convert %s to an absolute path: %w", cwd, err)
   222  	}
   223  	// This also matches "kubernetes_skew" for upgrades.
   224  	if !strings.Contains(filepath.Base(acwd), "kubernetes") {
   225  		return fmt.Errorf("must run from kubernetes directory root. current: %s", acwd)
   226  	}
   227  	return nil
   228  }
   229  
// deployer is the set of operations kubetest requires from a cluster
// deployment strategy. getDeployer selects the implementation from the
// --deployment flag.
type deployer interface {
	Up() error
	IsUp() error
	DumpClusterLogs(localPath, gcsPath string) error
	TestSetup() error
	// Down is called by the ^C handler that complete installs when --down is
	// set.
	Down() error
	// GetClusterCreated is consulted by complete in --soak mode: when the
	// returned time plus --soak-duration is in the past, the cluster is
	// recreated.
	GetClusterCreated(gcpProject string) (time.Time, error)
	KubectlCommand() (*exec.Cmd, error)
}
   239  
// publisher is an optional interface for deployers that want to publish their
// status after a successful run.
type publisher interface {
	// Publish is called when the tests were successful; the deployer should publish a success file
	Publish() error
}
   245  
   246  func getDeployer(o *options) (deployer, error) {
   247  	switch o.deployment {
   248  	case "bash":
   249  		return newBash(&o.clusterIPRange, o.gcpProject, o.gcpZone, o.gcpSSHProxyInstanceName, o.provider), nil
   250  	case "conformance":
   251  		return conformance.NewDeployer(o.kubecfg)
   252  	case "gke":
   253  		return newGKE(o.provider, o.gcpProject, o.gcpZone, o.gcpRegion, o.gcpNetwork, o.gcpNodeImage, o.gcpImageFamily, o.gcpImageProject, o.cluster, o.gcpSSHProxyInstanceName, &o.testArgs, &o.upgradeArgs)
   254  	case "kind":
   255  		return kind.NewDeployer(control, string(o.build))
   256  	case "kops":
   257  		return newKops(o.provider, o.gcpProject, o.cluster)
   258  	case "node":
   259  		return nodeDeploy{provider: o.provider}, nil
   260  	case "none":
   261  		return noneDeploy{}, nil
   262  	case "local":
   263  		return newLocalCluster(), nil
   264  	case "aksengine":
   265  		return newAKSEngine()
   266  	case "aks":
   267  		return newAksDeployer()
   268  	default:
   269  		return nil, fmt.Errorf("unknown deployment strategy %q", o.deployment)
   270  	}
   271  }
   272  
   273  func validateFlags(o *options) error {
   274  	if !o.extract.Enabled() && o.extractSource {
   275  		return errors.New("--extract-source flag cannot be passed without --extract")
   276  	}
   277  	return nil
   278  }
   279  
   280  func main() {
   281  	log.SetFlags(log.LstdFlags | log.Lshortfile)
   282  	log.Printf("Running kubetest version: %s\n", gitTag)
   283  
   284  	pflag.CommandLine = pflag.NewFlagSet(os.Args[0], pflag.ContinueOnError)
   285  	o := defineFlags()
   286  	pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
   287  	if err := pflag.CommandLine.Parse(os.Args[1:]); err != nil {
   288  		log.Fatalf("Flag parse failed: %v", err)
   289  	}
   290  
   291  	if err := validateFlags(o); err != nil {
   292  		log.Fatalf("Flags validation failed. err: %v", err)
   293  	}
   294  
   295  	if o.version {
   296  		log.Printf("kubetest version: %s\n", gitTag)
   297  		return
   298  	}
   299  
   300  	control = process.NewControl(timeout, interrupt, terminate, verbose)
   301  
   302  	// do things when we know we are running in the kubetest image
   303  	if os.Getenv("KUBETEST_IN_DOCKER") == "true" {
   304  		o.flushMemAfterBuild = true
   305  	}
   306  	// sanity fix for kind deployer, not set for other deployers to avoid
   307  	// breaking changes...
   308  	if o.deployment == "kind" {
   309  		// always default --dump for kind, in CI use $ARTIFACTS
   310  		artifacts := os.Getenv("ARTIFACTS")
   311  		if artifacts == "" {
   312  			artifacts = "./_artifacts"
   313  		}
   314  		o.dump = artifacts
   315  	}
   316  
   317  	err := complete(o)
   318  
   319  	if boskos.HasResource() {
   320  		if berr := boskos.ReleaseAll("dirty"); berr != nil {
   321  			log.Fatalf("[Boskos] Fail To Release: %v, kubetest err: %v", berr, err)
   322  		}
   323  	}
   324  
   325  	if err != nil {
   326  		log.Fatalf("Something went wrong: %v", err)
   327  	}
   328  }
   329  
   330  func complete(o *options) error {
   331  	if !terminate.Stop() {
   332  		<-terminate.C // Drain the value if necessary.
   333  	}
   334  	if !interrupt.Stop() {
   335  		<-interrupt.C // Drain value
   336  	}
   337  
   338  	if timeout > 0 {
   339  		log.Printf("Limiting testing to %s", timeout)
   340  		interrupt.Reset(timeout)
   341  	}
   342  
   343  	if o.dump != "" {
   344  		defer writeMetadata(o.dump, o.metadataSources)
   345  		defer control.WriteXML(&suite, o.dump, time.Now())
   346  	}
   347  	if o.logexporterGCSPath != "" {
   348  		o.testArgs += fmt.Sprintf(" --logexporter-gcs-path=%s", o.logexporterGCSPath)
   349  	}
   350  	if err := control.XMLWrap(&suite, "Prepare", func() error { return prepare(o) }); err != nil {
   351  		return fmt.Errorf("failed to prepare test environment: %w", err)
   352  	}
   353  	// Get the deployer before we acquire k8s so any additional flag
   354  	// verifications happen early.
   355  	var deploy deployer
   356  	err := control.XMLWrap(&suite, "GetDeployer", func() error {
   357  		d, err := getDeployer(o)
   358  		deploy = d
   359  		return err
   360  	})
   361  	if err != nil {
   362  		return fmt.Errorf("error creating deployer: %w", err)
   363  	}
   364  
   365  	// Check soaking before run tests
   366  	if o.soak {
   367  		if created, err := deploy.GetClusterCreated(o.gcpProject); err != nil {
   368  			// continue, but log the error
   369  			log.Printf("deploy %v, GetClusterCreated failed: %v", o.deployment, err)
   370  		} else {
   371  			if time.Now().After(created.Add(o.soakDuration)) {
   372  				// flip up on - which will tear down previous cluster and start a new one
   373  				log.Printf("Previous soak cluster created at %v, will recreate the cluster", created)
   374  				o.up = true
   375  			}
   376  		}
   377  	}
   378  
   379  	if err := acquireKubernetes(o, deploy); err != nil {
   380  		return fmt.Errorf("failed to acquire k8s binaries: %w", err)
   381  	}
   382  	if o.extract.Enabled() {
   383  		// If we specified `--extract-source` we will already be in the correct directory
   384  		if !o.extractSource {
   385  			if err := os.Chdir("kubernetes"); err != nil {
   386  				return fmt.Errorf("failed to chdir to kubernetes dir: %w", err)
   387  			}
   388  		}
   389  	}
   390  	if err := validWorkingDirectory(); err != nil {
   391  		return fmt.Errorf("called from invalid working directory: %w", err)
   392  	}
   393  
   394  	if o.down {
   395  		// listen for signals such as ^C and gracefully attempt to clean up
   396  		c := make(chan os.Signal, 1)
   397  		signal.Notify(c, os.Interrupt)
   398  		go func() {
   399  			for range c {
   400  				log.Print("Captured ^C, gracefully attempting to cleanup resources..")
   401  				if err = deploy.Down(); err != nil {
   402  					log.Printf("Tearing down deployment failed: %v", err)
   403  				}
   404  				if err != nil {
   405  					os.Exit(1)
   406  				}
   407  
   408  				os.Exit(2)
   409  			}
   410  		}()
   411  	}
   412  
   413  	if err := run(deploy, *o); err != nil {
   414  		return err
   415  	}
   416  
   417  	// Publish the successfully tested version when requested
   418  	if o.publish != "" {
   419  		if err := publish(o.publish); err != nil {
   420  			return err
   421  		}
   422  	}
   423  	return nil
   424  }
   425  
   426  func acquireKubernetes(o *options, d deployer) error {
   427  	// Potentially build kubernetes
   428  	if o.build.Enabled() {
   429  		var err error
   430  		// kind deployer manages build
   431  		if k, ok := d.(*kind.Deployer); ok {
   432  			err = control.XMLWrap(&suite, "Build", k.Build)
   433  		} else if c, ok := d.(*aksEngineDeployer); ok { // Azure deployer
   434  			err = control.XMLWrap(&suite, "Build", func() error {
   435  				return c.Build(o.build)
   436  			})
   437  		} else {
   438  			err = control.XMLWrap(&suite, "Build", o.build.Build)
   439  		}
   440  		if o.flushMemAfterBuild {
   441  			util.FlushMem()
   442  		}
   443  		if err != nil {
   444  			return err
   445  		}
   446  	}
   447  
   448  	// Potentially stage build binaries somewhere on GCS
   449  	if o.stage.Enabled() {
   450  		if err := control.XMLWrap(&suite, "Stage", func() error {
   451  			return o.stage.Stage(o.noAllowDup)
   452  		}); err != nil {
   453  			return err
   454  		}
   455  	}
   456  
   457  	// Potentially download existing binaries and extract them.
   458  	if o.extract.Enabled() {
   459  		err := control.XMLWrap(&suite, "Extract", func() error {
   460  			// Should we restore a previous state?
   461  			// Restore if we are not upping the cluster
   462  			if o.save != "" {
   463  				if !o.up {
   464  					// Restore version and .kube/config from --up
   465  					log.Printf("Overwriting extract strategy to load kubeconfig and version from %s", o.save)
   466  					o.extract = extractStrategies{
   467  						extractStrategy{
   468  							mode:   load,
   469  							option: o.save,
   470  						},
   471  					}
   472  				}
   473  			}
   474  
   475  			// New deployment, extract new version
   476  			return o.extract.Extract(o.gcpProject, o.gcpZone, o.gcpRegion, o.extractCIBucket, o.extractReleaseBucket, o.extractSource)
   477  		})
   478  		if err != nil {
   479  			return err
   480  		}
   481  	}
   482  	return nil
   483  }
   484  
   485  // Returns the k8s version name
   486  func findVersion() string {
   487  	// The version may be in a version file
   488  	if _, err := os.Stat("version"); err == nil {
   489  		b, err := os.ReadFile("version")
   490  		if err == nil {
   491  			return strings.TrimSpace(string(b))
   492  		}
   493  		log.Printf("Failed to read version: %v", err)
   494  	}
   495  
   496  	// We can also get it from the git repo.
   497  	if _, err := os.Stat("hack/lib/version.sh"); err == nil {
   498  		// TODO(fejta): do this in go. At least we removed the upload-to-gcs.sh dep.
   499  		gross := `. hack/lib/version.sh && KUBE_ROOT=. kube::version::get_version_vars && echo "${KUBE_GIT_VERSION-}"`
   500  		b, err := control.Output(exec.Command("bash", "-c", gross))
   501  		if err == nil {
   502  			return strings.TrimSpace(string(b))
   503  		}
   504  		log.Printf("Failed to get_version_vars: %v", err)
   505  	}
   506  
   507  	return "unknown" // Sad trombone
   508  }
   509  
   510  // maybeMergeMetadata will add new keyvals into the map; quietly eats errors.
   511  func maybeMergeJSON(meta map[string]string, path string) {
   512  	if data, err := os.ReadFile(path); err == nil {
   513  		json.Unmarshal(data, &meta)
   514  	}
   515  }
   516  
   517  // Write metadata.json, including version and env arg data.
   518  func writeMetadata(path, metadataSources string) error {
   519  	m := make(map[string]string)
   520  
   521  	// Look for any sources of metadata and load 'em
   522  	for _, f := range strings.Split(metadataSources, ",") {
   523  		maybeMergeJSON(m, filepath.Join(path, f))
   524  	}
   525  
   526  	ver := findVersion()
   527  	m["job-version"] = ver // TODO(krzyzacy): retire
   528  	m["revision"] = ver
   529  	m["kubetest-version"] = gitTag
   530  	re := regexp.MustCompile(`^BUILD_METADATA_(.+)$`)
   531  	for _, e := range os.Environ() {
   532  		p := strings.SplitN(e, "=", 2)
   533  		r := re.FindStringSubmatch(p[0])
   534  		if r == nil {
   535  			continue
   536  		}
   537  		k, v := strings.ToLower(r[1]), p[1]
   538  		m[k] = v
   539  	}
   540  	f, err := os.Create(filepath.Join(path, "metadata.json"))
   541  	if err != nil {
   542  		return err
   543  	}
   544  	defer f.Close()
   545  	e := json.NewEncoder(f)
   546  	return e.Encode(m)
   547  }
   548  
   549  // Install cloudsdk tarball to location, updating PATH
   550  func installGcloud(tarball string, location string) error {
   551  
   552  	if err := os.MkdirAll(location, 0775); err != nil {
   553  		return err
   554  	}
   555  
   556  	if err := control.FinishRunning(exec.Command("tar", "xzf", tarball, "-C", location)); err != nil {
   557  		return err
   558  	}
   559  
   560  	if err := control.FinishRunning(exec.Command(filepath.Join(location, "google-cloud-sdk", "install.sh"), "--disable-installation-options", "--bash-completion=false", "--path-update=false", "--usage-reporting=false")); err != nil {
   561  		return err
   562  	}
   563  
   564  	if err := util.InsertPath(filepath.Join(location, "google-cloud-sdk", "bin")); err != nil {
   565  		return err
   566  	}
   567  
   568  	if err := control.FinishRunning(exec.Command("gcloud", "components", "install", "alpha")); err != nil {
   569  		return err
   570  	}
   571  
   572  	if err := control.FinishRunning(exec.Command("gcloud", "components", "install", "beta")); err != nil {
   573  		return err
   574  	}
   575  
   576  	if err := control.FinishRunning(exec.Command("gcloud", "info")); err != nil {
   577  		return err
   578  	}
   579  	return nil
   580  }
   581  
   582  func migrateGcpEnvAndOptions(o *options) error {
   583  	var network string
   584  	var zone string
   585  	switch o.provider {
   586  	case "gke":
   587  		network = "KUBE_GKE_NETWORK"
   588  		zone = "ZONE"
   589  	default:
   590  		network = "KUBE_GCE_NETWORK"
   591  		zone = "KUBE_GCE_ZONE"
   592  	}
   593  	return util.MigrateOptions([]util.MigratedOption{
   594  		{
   595  			Env:    "PROJECT",
   596  			Option: &o.gcpProject,
   597  			Name:   "--gcp-project",
   598  		},
   599  		{
   600  			Env:    zone,
   601  			Option: &o.gcpZone,
   602  			Name:   "--gcp-zone",
   603  		},
   604  		{
   605  			Env:    "REGION",
   606  			Option: &o.gcpRegion,
   607  			Name:   "--gcp-region",
   608  		},
   609  		{
   610  			Env:    "GOOGLE_APPLICATION_CREDENTIALS",
   611  			Option: &o.gcpServiceAccount,
   612  			Name:   "--gcp-service-account",
   613  		},
   614  		{
   615  			Env:    network,
   616  			Option: &o.gcpNetwork,
   617  			Name:   "--gcp-network",
   618  		},
   619  		{
   620  			Env:    "KUBE_NODE_OS_DISTRIBUTION",
   621  			Option: &o.gcpNodeImage,
   622  			Name:   "--gcp-node-image",
   623  		},
   624  		{
   625  			Env:    "KUBE_MASTER_OS_DISTRIBUTION",
   626  			Option: &o.gcpMasterImage,
   627  			Name:   "--gcp-master-image",
   628  		},
   629  		{
   630  			Env:    "NUM_NODES",
   631  			Option: &o.gcpNodes,
   632  			Name:   "--gcp-nodes",
   633  		},
   634  		{
   635  			Env:    "NODE_SIZE",
   636  			Option: &o.gcpNodeSize,
   637  			Name:   "--gcp-node-size",
   638  		},
   639  		{
   640  			Env:    "MASTER_SIZE",
   641  			Option: &o.gcpMasterSize,
   642  			Name:   "--gcp-master-size",
   643  		},
   644  		{
   645  			Env:      "CLOUDSDK_BUCKET",
   646  			Option:   &o.gcpCloudSdk,
   647  			Name:     "--gcp-cloud-sdk",
   648  			SkipPush: true,
   649  		},
   650  	})
   651  }
   652  
   653  func prepareGcp(o *options) error {
   654  	if err := migrateGcpEnvAndOptions(o); err != nil {
   655  		return err
   656  	}
   657  	// Must happen before any gcloud commands
   658  	if err := activateServiceAccount(o.gcpServiceAccount); err != nil {
   659  		return err
   660  	}
   661  
   662  	if o.provider == "gce" {
   663  		if distro := os.Getenv("KUBE_OS_DISTRIBUTION"); distro != "" {
   664  			log.Printf("Please use --gcp-master-image=%s --gcp-node-image=%s (instead of deprecated KUBE_OS_DISTRIBUTION)",
   665  				distro, distro)
   666  			// Note: KUBE_OS_DISTRIBUTION takes precedence over
   667  			// KUBE_{MASTER,NODE}_OS_DISTRIBUTION, so override here
   668  			// after the migration above.
   669  			o.gcpNodeImage = distro
   670  			o.gcpMasterImage = distro
   671  			if err := os.Setenv("KUBE_NODE_OS_DISTRIBUTION", distro); err != nil {
   672  				return fmt.Errorf("could not set KUBE_NODE_OS_DISTRIBUTION=%s: %w", distro, err)
   673  			}
   674  			if err := os.Setenv("KUBE_MASTER_OS_DISTRIBUTION", distro); err != nil {
   675  				return fmt.Errorf("could not set KUBE_MASTER_OS_DISTRIBUTION=%s: %w", distro, err)
   676  			}
   677  		}
   678  
   679  		hasGCPImageFamily, hasGCPImageProject := len(o.gcpImageFamily) != 0, len(o.gcpImageProject) != 0
   680  		if hasGCPImageFamily != hasGCPImageProject {
   681  			return fmt.Errorf("--image-family and --image-project must be both set or unset")
   682  		}
   683  		if hasGCPImageFamily && hasGCPImageProject {
   684  			out, err := control.Output(exec.Command("gcloud", "compute", "images", "describe-from-family", o.gcpImageFamily, "--project", o.gcpImageProject))
   685  			if err != nil {
   686  				return fmt.Errorf("failed to get latest image from family %q in project %q: %s", o.gcpImageFamily, o.gcpImageProject, err)
   687  			}
   688  			latestImage := ""
   689  			latestImageRegexp := regexp.MustCompile(`^name: *(\S+)`)
   690  			for _, line := range strings.Split(string(out), "\n") {
   691  				matches := latestImageRegexp.FindStringSubmatch(line)
   692  				if len(matches) == 2 {
   693  					latestImage = matches[1]
   694  					break
   695  				}
   696  			}
   697  			if len(latestImage) == 0 {
   698  				return fmt.Errorf("failed to get latest image from family %q in project %q", o.gcpImageFamily, o.gcpImageProject)
   699  			}
   700  			if o.deployment == "node" {
   701  				o.nodeArgs += fmt.Sprintf(" --images=%s --image-project=%s", latestImage, o.gcpImageProject)
   702  			} else {
   703  				os.Setenv("KUBE_GCE_NODE_IMAGE", latestImage)
   704  				os.Setenv("KUBE_GCE_NODE_PROJECT", o.gcpImageProject)
   705  			}
   706  		}
   707  	} else if o.provider == "gke" {
   708  		if o.deployment == "" {
   709  			o.deployment = "gke"
   710  		}
   711  		if o.deployment != "gke" {
   712  			return fmt.Errorf("expected --deployment=gke for --provider=gke, found --deployment=%s", o.deployment)
   713  		}
   714  		if o.gcpMasterImage != "" {
   715  			return fmt.Errorf("expected --gcp-master-image to be empty for --provider=gke, found --gcp-master-image=%s", o.gcpMasterImage)
   716  		}
   717  		if o.gcpNodes != "" {
   718  			return fmt.Errorf("--gcp-nodes cannot be set on GKE, use --gke-shape instead")
   719  		}
   720  		if o.gcpNodeSize != "" {
   721  			return fmt.Errorf("--gcp-node-size cannot be set on GKE, use --gke-shape instead")
   722  		}
   723  		if o.gcpMasterSize != "" {
   724  			return fmt.Errorf("--gcp-master-size cannot be set on GKE, where it's auto-computed")
   725  		}
   726  
   727  		// TODO(kubernetes/test-infra#3536): This is used by the
   728  		// ginkgo-e2e.sh wrapper.
   729  		nod := o.gcpNodeImage
   730  		if nod == "container_vm" {
   731  			// gcloud container clusters create understands
   732  			// "container_vm", e2es understand "debian".
   733  			nod = "debian"
   734  		}
   735  		if nod == "cos_containerd" {
   736  			// gcloud container clusters create understands
   737  			// "cos_containerd", e2es only understand
   738  			// "gci"/"cos",
   739  			nod = "gci"
   740  		}
   741  		os.Setenv("NODE_OS_DISTRIBUTION", nod)
   742  	}
   743  	if o.gcpProject == "" {
   744  		log.Print("--gcp-project is missing, trying to fetch a project from boskos.\n" +
   745  			"(for local runs please set --gcp-project to your dev project)")
   746  
   747  		var resType string
   748  		if o.gcpProjectType != "" {
   749  			resType = o.gcpProjectType
   750  		} else if o.provider == "gke" {
   751  			resType = "gke-project"
   752  		} else {
   753  			resType = "gce-project"
   754  		}
   755  
   756  		log.Printf("provider %v, will acquire project type %v from boskos", o.provider, resType)
   757  
   758  		// let's retry 5min to get next available resource
   759  		ctx, cancel := context.WithTimeout(context.Background(), o.boskosWaitDuration)
   760  		defer cancel()
   761  		p, err := boskos.AcquireWait(ctx, resType, "free", "busy")
   762  		if err != nil {
   763  			return fmt.Errorf("--provider=%s boskos failed to acquire project: %w", o.provider, err)
   764  		}
   765  
   766  		if p == nil {
   767  			return fmt.Errorf("boskos does not have a free %s at the moment", resType)
   768  		}
   769  
   770  		go func(c *client.Client, proj string) {
   771  			for range time.Tick(time.Minute * 5) {
   772  				if err := c.UpdateOne(p.Name, "busy", nil); err != nil {
   773  					log.Printf("[Boskos] Update of %s failed with %v", p.Name, err)
   774  				}
   775  			}
   776  		}(boskos, p.Name)
   777  		o.gcpProject = p.Name
   778  	}
   779  
   780  	if err := os.Setenv("CLOUDSDK_CORE_PRINT_UNHANDLED_TRACEBACKS", "1"); err != nil {
   781  		return fmt.Errorf("could not set CLOUDSDK_CORE_PRINT_UNHANDLED_TRACEBACKS=1: %w", err)
   782  	}
   783  
   784  	if err := control.FinishRunning(exec.Command("gcloud", "config", "set", "project", o.gcpProject)); err != nil {
   785  		return fmt.Errorf("fail to set project %s : err %w", o.gcpProject, err)
   786  	}
   787  
   788  	// TODO(krzyzacy):Remove this when we retire migrateGcpEnvAndOptions
   789  	// Note that a lot of scripts are still depend on this env in k/k repo.
   790  	if err := os.Setenv("PROJECT", o.gcpProject); err != nil {
   791  		return fmt.Errorf("fail to set env var PROJECT %s : err %w", o.gcpProject, err)
   792  	}
   793  
   794  	// Ensure ssh keys exist
   795  	log.Print("Checking existing of GCP ssh keys...")
   796  	k := filepath.Join(util.Home(".ssh"), "google_compute_engine")
   797  	if _, err := os.Stat(k); err != nil {
   798  		return err
   799  	}
   800  	pk := k + ".pub"
   801  	if _, err := os.Stat(pk); err != nil {
   802  		return err
   803  	}
   804  
   805  	log.Printf("Checking presence of public key in %s", o.gcpProject)
   806  	if out, err := control.Output(exec.Command("gcloud", "compute", "--project="+o.gcpProject, "project-info", "describe")); err != nil {
   807  		return err
   808  	} else if b, err := os.ReadFile(pk); err != nil {
   809  		return err
   810  	} else if !strings.Contains(string(out), string(b)) {
   811  		log.Print("Uploading public ssh key to project metadata...")
   812  		if err = control.FinishRunning(exec.Command("gcloud", "compute", "--project="+o.gcpProject, "config-ssh")); err != nil {
   813  			return err
   814  		}
   815  	}
   816  
   817  	// Install custom gcloud version if necessary
   818  	if o.gcpCloudSdk != "" {
   819  		for i := 0; i < 3; i++ {
   820  			if err := control.FinishRunning(exec.Command("gsutil", "-mq", "cp", "-r", o.gcpCloudSdk, util.Home())); err == nil {
   821  				break // Success!
   822  			}
   823  			time.Sleep(1 << uint(i) * time.Second)
   824  		}
   825  		for _, f := range []string{util.Home(".gsutil"), util.Home("repo"), util.Home("cloudsdk")} {
   826  			if _, err := os.Stat(f); err == nil || !os.IsNotExist(err) {
   827  				if err = os.RemoveAll(f); err != nil {
   828  					return err
   829  				}
   830  			}
   831  		}
   832  
   833  		install := util.Home("repo", "google-cloud-sdk.tar.gz")
   834  		if strings.HasSuffix(o.gcpCloudSdk, ".tar.gz") {
   835  			install = util.Home(filepath.Base(o.gcpCloudSdk))
   836  		} else {
   837  			if err := os.Rename(util.Home(filepath.Base(o.gcpCloudSdk)), util.Home("repo")); err != nil {
   838  				return err
   839  			}
   840  
   841  			// Controls which gcloud components to install.
   842  			pop, err := util.PushEnv("CLOUDSDK_COMPONENT_MANAGER_SNAPSHOT_URL", "file://"+util.Home("repo", "components-2.json"))
   843  			if err != nil {
   844  				return err
   845  			}
   846  			defer pop()
   847  		}
   848  
   849  		if err := installGcloud(install, util.Home("cloudsdk")); err != nil {
   850  			return err
   851  		}
   852  		// gcloud creds may have changed
   853  		if err := activateServiceAccount(o.gcpServiceAccount); err != nil {
   854  			return err
   855  		}
   856  	}
   857  
   858  	if o.kubemark {
   859  		if p := os.Getenv("KUBEMARK_BAZEL_BUILD"); strings.ToLower(p) == "y" {
   860  			// we need docker-credential-gcr to get authed properly
   861  			// https://github.com/bazelbuild/rules_docker#authorization
   862  			if err := control.FinishRunning(exec.Command("gcloud", "components", "install", "docker-credential-gcr")); err != nil {
   863  				return err
   864  			}
   865  			if err := control.FinishRunning(exec.Command("docker-credential-gcr", "configure-docker")); err != nil {
   866  				return err
   867  			}
   868  		}
   869  	}
   870  
   871  	return nil
   872  }
   873  
   874  func prepareAws(o *options) error {
   875  	// gcloud creds may have changed
   876  	if err := activateServiceAccount(o.gcpServiceAccount); err != nil {
   877  		return err
   878  	}
   879  	return control.FinishRunning(exec.Command("pip", "install", "awscli"))
   880  }
   881  
   882  // Activate GOOGLE_APPLICATION_CREDENTIALS if set or do nothing.
   883  func activateServiceAccount(path string) error {
   884  	if path == "" {
   885  		return nil
   886  	}
   887  	return control.FinishRunning(exec.Command("gcloud", "auth", "activate-service-account", "--key-file="+path))
   888  }
   889  
   890  func prepare(o *options) error {
   891  	if err := util.MigrateOptions([]util.MigratedOption{
   892  		{
   893  			Env:    "KUBERNETES_PROVIDER",
   894  			Option: &o.provider,
   895  			Name:   "--provider",
   896  		},
   897  		{
   898  			Env:    "CLUSTER_NAME",
   899  			Option: &o.cluster,
   900  			Name:   "--cluster",
   901  		},
   902  	}); err != nil {
   903  		return err
   904  	}
   905  	if err := prepareGinkgoParallel(&o.ginkgoParallel); err != nil {
   906  		return err
   907  	}
   908  
   909  	switch o.provider {
   910  	case "gce", "gke", "node":
   911  		if err := prepareGcp(o); err != nil {
   912  			return err
   913  		}
   914  	case "aws":
   915  		if err := prepareAws(o); err != nil {
   916  			return err
   917  		}
   918  	}
   919  
   920  	if o.kubemark {
   921  		if err := util.MigrateOptions([]util.MigratedOption{
   922  			{
   923  				Env:    "KUBEMARK_NUM_NODES",
   924  				Option: &o.kubemarkNodes,
   925  				Name:   "--kubemark-nodes",
   926  			},
   927  			{
   928  				Env:    "KUBEMARK_MASTER_SIZE",
   929  				Option: &o.kubemarkMasterSize,
   930  				Name:   "--kubemark-master-size",
   931  			},
   932  		}); err != nil {
   933  			return err
   934  		}
   935  	}
   936  
   937  	if err := os.MkdirAll(artifacts, 0777); err != nil { // Create artifacts
   938  		return err
   939  	}
   940  
   941  	return nil
   942  }
   943  
   944  type ginkgoParallelValue struct {
   945  	v int // 0 == not set (defaults to 1)
   946  }
   947  
   948  func (v *ginkgoParallelValue) IsBoolFlag() bool {
   949  	return true
   950  }
   951  
   952  func (v *ginkgoParallelValue) String() string {
   953  	if v.v == 0 {
   954  		return "1"
   955  	}
   956  	return strconv.Itoa(v.v)
   957  }
   958  
   959  func (v *ginkgoParallelValue) Set(s string) error {
   960  	if s == "" {
   961  		v.v = 0
   962  		return nil
   963  	}
   964  	if s == "true" {
   965  		v.v = defaultGinkgoParallel
   966  		return nil
   967  	}
   968  	p, err := strconv.Atoi(s)
   969  	if err != nil {
   970  		return fmt.Errorf("--ginkgo-parallel must be an integer, found %q", s)
   971  	}
   972  	if p < 1 {
   973  		return fmt.Errorf("--ginkgo-parallel must be >= 1, found %d", p)
   974  	}
   975  	v.v = p
   976  	return nil
   977  }
   978  
   979  func (v *ginkgoParallelValue) Type() string {
   980  	return "ginkgoParallelValue"
   981  }
   982  
   983  func (v *ginkgoParallelValue) Get() int {
   984  	if v.v == 0 {
   985  		return 1
   986  	}
   987  	return v.v
   988  }
   989  
   990  var _ flag.Value = &ginkgoParallelValue{}
   991  
   992  // Hand migrate this option. GINKGO_PARALLEL => GINKGO_PARALLEL_NODES=25
   993  func prepareGinkgoParallel(v *ginkgoParallelValue) error {
   994  	if p := os.Getenv("GINKGO_PARALLEL"); strings.ToLower(p) == "y" {
   995  		log.Printf("Please use kubetest --ginkgo-parallel (instead of deprecated GINKGO_PARALLEL=y)")
   996  		if err := v.Set("true"); err != nil {
   997  			return err
   998  		}
   999  		os.Unsetenv("GINKGO_PARALLEL")
  1000  	}
  1001  	if p := os.Getenv("GINKGO_PARALLEL_NODES"); p != "" {
  1002  		log.Printf("Please use kubetest --ginkgo-parallel=%s (instead of deprecated GINKGO_PARALLEL_NODES=%s)", p, p)
  1003  		if err := v.Set(p); err != nil {
  1004  			return err
  1005  		}
  1006  	}
  1007  	os.Setenv("GINKGO_PARALLEL_NODES", v.String())
  1008  	return nil
  1009  }
  1010  
  1011  func publish(pub string) error {
  1012  	v, err := os.ReadFile("version")
  1013  	if err != nil {
  1014  		return err
  1015  	}
  1016  	log.Printf("Set %s version to %s", pub, string(v))
  1017  	return gcsWrite(pub, v)
  1018  }