github.com/abayer/test-infra@v0.0.5/kubetest/main.go (about)

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package main
    18  
    19  import (
    20  	"encoding/json"
    21  	"errors"
    22  	"flag"
    23  	"fmt"
    24  	"io/ioutil"
    25  	"log"
    26  	"math/rand"
    27  	"os"
    28  	"os/exec"
    29  	"os/signal"
    30  	"path/filepath"
    31  	"regexp"
    32  	"strconv"
    33  	"strings"
    34  	"time"
    35  
    36  	"github.com/spf13/pflag"
    37  
    38  	"k8s.io/test-infra/boskos/client"
    39  	"k8s.io/test-infra/kubetest/conformance"
    40  	"k8s.io/test-infra/kubetest/dind"
    41  	"k8s.io/test-infra/kubetest/process"
    42  	"k8s.io/test-infra/kubetest/util"
    43  )
    44  
// defaultGinkgoParallel is the runner count quoted in the --ginkgo-parallel
// help text as the default.
// Hardcoded in ginkgo-e2e.sh
const defaultGinkgoParallel = 25
    47  
// Package-level state shared by the kubetest workflow.
//
// interrupt and terminate are created with a zero duration, so they fire
// immediately; complete stops and drains both before optionally re-arming
// interrupt with the --timeout value.
//
// timeout and verbose are bound to the --timeout and --verbose-commands flags
// in defineFlags. control is built here from their initial values and is
// recreated in main once the flags have been parsed.
//
// boskos is a client for the "http://boskos" endpoint, identified by the
// JOB_NAME environment variable.
var (
	artifacts = filepath.Join(os.Getenv("WORKSPACE"), "_artifacts")
	interrupt = time.NewTimer(time.Duration(0)) // interrupt testing at this time.
	terminate = time.NewTimer(time.Duration(0)) // terminate testing at this time.
	verbose   = false
	timeout   = time.Duration(0)
	boskos    = client.NewClient(os.Getenv("JOB_NAME"), "http://boskos")
	control   = process.NewControl(timeout, interrupt, terminate, verbose)
)
    57  
// options collects the settings that defineFlags binds to command-line flags.
// A single value is created by defineFlags and then threaded through the rest
// of the workflow (validateFlags, complete, getDeployer, ...), some of which
// mutate it (for example complete may flip up and append to testArgs).
type options struct {
	build               buildStrategy
	buildFederation     buildFederationStrategy
	charts              bool
	checkLeaks          bool
	checkSkew           bool
	cluster             string
	clusterIPRange      string
	deployment          string
	dindImage           string
	down                bool
	dump                string
	extract             extractStrategies
	extractFederation   extractFederationStrategies
	extractSource       bool
	federation          bool
	flushMemAfterBuild  bool
	focusRegex          string
	gcpCloudSdk         string
	gcpMasterImage      string
	gcpMasterSize       string
	gcpNetwork          string
	gcpNodeImage        string
	gcpImageFamily      string
	gcpImageProject     string
	gcpNodes            string
	gcpNodeSize         string
	gcpProject          string
	gcpProjectType      string
	gcpServiceAccount   string
	gcpRegion           string
	gcpZone             string
	ginkgoParallel      ginkgoParallelValue
	kubecfg             string
	kubemark            bool
	kubemarkMasterSize  string
	kubemarkNodes       string // TODO(fejta): switch to int after migration
	logexporterGCSPath  string
	metadataSources     string
	multiClusters       multiClusterDeployment
	multipleFederations bool
	noAllowDup          bool
	nodeArgs            string
	nodeTestArgs        string
	nodeTests           bool
	perfTests           bool
	provider            string
	publish             string
	runtimeConfig       string
	save                string
	skew                bool
	skipRegex           string
	soak                bool
	soakDuration        time.Duration
	sshUser             string
	stage               stageStrategy
	stageFederation     stageFederationStrategy
	test                bool
	testArgs            string
	testCmd             string
	testCmdName         string
	testCmdArgs         []string
	up                  bool
	upgradeArgs         string
}
   123  
// defineFlags registers kubetest's command-line flags and returns the options
// value they populate.
//
// Almost all flags are registered on the standard library flag package; the
// repeatable --test-cmd-args flag is registered on pflag because the standard
// package has no string-array flag. Two flags (--timeout and
// --verbose-commands) are bound to the package-level timeout and verbose
// variables rather than to fields of options.
//
// This function only registers flags; parsing is done by the caller.
func defineFlags() *options {
	o := options{}
	flag.Var(&o.build, "build", "Rebuild k8s binaries, optionally forcing (release|quick|bazel|dind) strategy")
	flag.Var(&o.buildFederation, "build-federation", "Rebuild federation binaries, optionally forcing (release|quick|bazel) strategy")
	flag.BoolVar(&o.charts, "charts", false, "If true, run charts tests")
	flag.BoolVar(&o.checkSkew, "check-version-skew", true, "Verify client and server versions match")
	flag.BoolVar(&o.checkLeaks, "check-leaked-resources", false, "Ensure project ends with the same resources")
	flag.StringVar(&o.cluster, "cluster", "", "Cluster name. Must be set for --deployment=gke (TODO: other deployments).")
	flag.StringVar(&o.clusterIPRange, "cluster-ip-range", "", "Specifies CLUSTER_IP_RANGE value during --up and --test (only relevant for --deployment=bash). Auto-calculated if empty.")
	flag.StringVar(&o.deployment, "deployment", "bash", "Choices: none/bash/conformance/dind/gke/kops/kubernetes-anywhere/node/local")
	flag.StringVar(&o.dindImage, "dind-image", "", "The dind image to use to start a cluster. Defaults to the docker tag produced by bazel.")
	flag.BoolVar(&o.down, "down", false, "If true, tear down the cluster before exiting.")
	flag.StringVar(&o.dump, "dump", "", "If set, dump cluster logs to this location on test or cluster-up failure")
	flag.Var(&o.extract, "extract", "Extract k8s binaries from the specified release location")
	flag.Var(&o.extractFederation, "extract-federation", "Extract federation binaries from the specified release location")
	flag.BoolVar(&o.extractSource, "extract-source", false, "Extract k8s src together with other tarballs")
	flag.BoolVar(&o.federation, "federation", false, "If true, start/tear down the federation control plane along with the clusters. To only start/tear down the federation control plane, specify --deployment=none")
	flag.BoolVar(&o.flushMemAfterBuild, "flush-mem-after-build", false, "If true, try to flush container memory after building")
	flag.Var(&o.ginkgoParallel, "ginkgo-parallel", fmt.Sprintf("Run Ginkgo tests in parallel, default %d runners. Use --ginkgo-parallel=N to specify an exact count.", defaultGinkgoParallel))
	flag.StringVar(&o.gcpCloudSdk, "gcp-cloud-sdk", "", "Install/upgrade google-cloud-sdk to the gs:// path if set")
	flag.StringVar(&o.gcpProject, "gcp-project", "", "For use with gcloud commands")
	flag.StringVar(&o.gcpProjectType, "gcp-project-type", "", "Explicitly indicate which project type to select from boskos")
	flag.StringVar(&o.gcpServiceAccount, "gcp-service-account", "", "Service account to activate before using gcloud")
	flag.StringVar(&o.gcpZone, "gcp-zone", "", "For use with gcloud commands")
	flag.StringVar(&o.gcpRegion, "gcp-region", "", "For use with gcloud commands")
	flag.StringVar(&o.gcpNetwork, "gcp-network", "", "Cluster network. Must be set for --deployment=gke (TODO: other deployments).")
	flag.StringVar(&o.gcpMasterImage, "gcp-master-image", "", "Master image type (cos|debian on GCE, n/a on GKE)")
	flag.StringVar(&o.gcpMasterSize, "gcp-master-size", "", "(--provider=gce only) Size of master to create (e.g n1-standard-1). Auto-calculated if left empty.")
	flag.StringVar(&o.gcpNodeImage, "gcp-node-image", "", "Node image type (cos|container_vm on GKE, cos|debian on GCE)")
	flag.StringVar(&o.gcpImageFamily, "image-family", "", "Node image family from which to use the latest image, required when --gcp-node-image=CUSTOM")
	flag.StringVar(&o.gcpImageProject, "image-project", "", "Project containing node image family, required when --gcp-node-image=CUSTOM")
	flag.StringVar(&o.gcpNodes, "gcp-nodes", "", "(--provider=gce only) Number of nodes to create.")
	flag.StringVar(&o.gcpNodeSize, "gcp-node-size", "", "(--provider=gce only) Size of nodes to create (e.g n1-standard-1).")
	flag.StringVar(&o.kubecfg, "kubeconfig", "", "The location of a kubeconfig file.")
	flag.StringVar(&o.focusRegex, "ginkgo-focus", "", "The ginkgo regex to focus. Currently only respected for (dind).")
	flag.StringVar(&o.skipRegex, "ginkgo-skip", "", "The ginkgo regex to skip. Currently only respected for (dind).")
	flag.BoolVar(&o.kubemark, "kubemark", false, "If true, run kubemark tests.")
	flag.StringVar(&o.kubemarkMasterSize, "kubemark-master-size", "", "Kubemark master size (only relevant if --kubemark=true). Auto-calculated based on '--kubemark-nodes' if left empty.")
	flag.StringVar(&o.kubemarkNodes, "kubemark-nodes", "5", "Number of kubemark nodes to start (only relevant if --kubemark=true).")
	flag.StringVar(&o.logexporterGCSPath, "logexporter-gcs-path", "", "Path to the GCS artifacts directory to dump logs from nodes. Logexporter gets enabled if this is non-empty")
	flag.StringVar(&o.metadataSources, "metadata-sources", "images.json", "Comma-separated list of files inside ./artifacts to merge into metadata.json")
	flag.Var(&o.multiClusters, "multi-clusters", "If set, bring up/down multiple clusters specified. Format is [Zone1:]Cluster1[,[ZoneN:]ClusterN]]*. Zone is optional and default zone is used if zone is not specified")
	flag.BoolVar(&o.multipleFederations, "multiple-federations", false, "If true, enable running multiple federation control planes in parallel")
	flag.StringVar(&o.nodeArgs, "node-args", "", "Args for node e2e tests.")
	flag.StringVar(&o.nodeTestArgs, "node-test-args", "", "Test args specifically for node e2e tests.")
	flag.BoolVar(&o.noAllowDup, "no-allow-dup", false, "if set --allow-dup will not be passed to push-build and --stage will error if the build already exists on the gcs path")
	flag.BoolVar(&o.nodeTests, "node-tests", false, "If true, run node-e2e tests.")
	flag.BoolVar(&o.perfTests, "perf-tests", false, "If true, run tests from perf-tests repo.")
	flag.StringVar(&o.provider, "provider", "", "Kubernetes provider such as gce, gke, aws, etc")
	flag.StringVar(&o.publish, "publish", "", "Publish version to the specified gs:// path on success")
	flag.StringVar(&o.runtimeConfig, "runtime-config", "batch/v2alpha1=true", "If set, API versions can be turned on or off while bringing up the API server.")
	// --registry and --stage-suffix write directly into fields of o.stage,
	// which is itself the flag.Value behind --stage.
	flag.StringVar(&o.stage.dockerRegistry, "registry", "", "Push images to the specified docker registry (e.g. gcr.io/a-test-project)")
	flag.StringVar(&o.save, "save", "", "Save credentials to gs:// path on --up if set (or load from there if not --up)")
	flag.BoolVar(&o.skew, "skew", false, "If true, run tests in another version at ../kubernetes/hack/e2e.go")
	flag.BoolVar(&o.soak, "soak", false, "If true, job runs in soak mode")
	flag.DurationVar(&o.soakDuration, "soak-duration", 7*24*time.Hour, "Maximum age of a soak cluster before it gets recycled")
	flag.Var(&o.stage, "stage", "Upload binaries to gs://bucket/devel/job-suffix if set")
	flag.Var(&o.stageFederation, "stage-federation", "Upload federation binaries to gs://bucket/devel/job-suffix if set")
	flag.StringVar(&o.stage.versionSuffix, "stage-suffix", "", "Append suffix to staged version when set")
	flag.BoolVar(&o.test, "test", false, "Run Ginkgo tests.")
	flag.StringVar(&o.testArgs, "test_args", "", "Space-separated list of arguments to pass to Ginkgo test runner.")
	flag.StringVar(&o.testCmd, "test-cmd", "", "command to run against the cluster instead of Ginkgo e2e tests")
	flag.StringVar(&o.testCmdName, "test-cmd-name", "", "name to log the test command as in xml results")
	// --timeout is stored in the package-level timeout variable, not in o.
	flag.DurationVar(&timeout, "timeout", time.Duration(0), "Terminate testing after the timeout duration (s/m/h)")
	flag.BoolVar(&o.up, "up", false, "If true, start the e2e cluster. If cluster is already up, recreate it.")
	flag.StringVar(&o.upgradeArgs, "upgrade_args", "", "If set, run upgrade tests before other tests")

	// The "-v" flag was also used by glog, which is used by k8s.io/client-go. Duplicate flags cause panics.
	// 1. Even if we could convince glog to change, they have too many consumers to ever do so.
	// 2. The glog lib parses flags during init. It is impossible to dynamically rewrite the args before they're parsed by glog.
	// 3. The glog lib takes an int value, so "-v false" is an error.
	// 4. It's possible, but unlikely, we could convince k8s.io/client-go to use a logging shim, because a library shouldn't force a logging implementation. This would take a major version release for the lib.
	//
	// The most reasonable solution is to accept that we shouldn't have made a single-letter global, and rename all references to this variable.
	flag.BoolVar(&verbose, "verbose-commands", true, "If true, print all command output.")

	// go flag does not support StringArrayVar
	pflag.StringArrayVar(&o.testCmdArgs, "test-cmd-args", []string{}, "args for test-cmd")
	return &o
}
   204  
// suite is the test suite handed to control.XMLWrap for every wrapped step and
// to control.WriteXML when --dump is set.
var suite util.TestSuite
   206  
   207  func validWorkingDirectory() error {
   208  	cwd, err := os.Getwd()
   209  	if err != nil {
   210  		return fmt.Errorf("could not get pwd: %v", err)
   211  	}
   212  	acwd, err := filepath.Abs(cwd)
   213  	if err != nil {
   214  		return fmt.Errorf("failed to convert %s to an absolute path: %v", cwd, err)
   215  	}
   216  	// This also matches "kubernetes_skew" for upgrades.
   217  	if !strings.Contains(filepath.Base(acwd), "kubernetes") {
   218  		return fmt.Errorf("must run from kubernetes directory root: %v", acwd)
   219  	}
   220  	return nil
   221  }
   222  
// deployer is the set of cluster operations kubetest needs from a
// --deployment strategy. getDeployer returns the implementation matching the
// selected strategy.
//
// NOTE(review): the precise behavior of each method is defined by the
// individual implementations, which are not part of this file.
type deployer interface {
	// Up presumably brings the cluster up.
	Up() error
	// IsUp presumably reports, via a nil error, that the cluster is reachable.
	IsUp() error
	// DumpClusterLogs presumably collects cluster logs into localPath and/or gcsPath.
	DumpClusterLogs(localPath, gcsPath string) error
	// TestSetup presumably prepares the environment before tests run.
	TestSetup() error
	// Down tears the cluster down; complete also calls it when ^C is
	// captured and --down is set.
	Down() error
	// GetClusterCreated returns the cluster's creation time. complete uses
	// it in soak mode to decide whether the cluster has outlived
	// --soak-duration and must be recreated.
	GetClusterCreated(gcpProject string) (time.Time, error)
}
   231  
// publisher is implemented by deployers that want to publish status on success.
// It is an optional interface, separate from deployer, so a deployer only
// provides it when it has something to publish.
type publisher interface {
	// Publish is called when the tests were successful; the deployer should publish a success file
	Publish() error
}
   237  
   238  func getDeployer(o *options) (deployer, error) {
   239  	switch o.deployment {
   240  	case "bash":
   241  		return newBash(&o.clusterIPRange), nil
   242  	case "conformance":
   243  		return conformance.NewDeployer(o.kubecfg)
   244  	case "dind":
   245  		return dind.NewDeployer(o.kubecfg, o.dindImage, control)
   246  	case "gke":
   247  		return newGKE(o.provider, o.gcpProject, o.gcpZone, o.gcpRegion, o.gcpNetwork, o.gcpNodeImage, o.gcpImageFamily, o.gcpImageProject, o.cluster, &o.testArgs, &o.upgradeArgs)
   248  	case "kops":
   249  		return newKops(o.provider, o.gcpProject, o.cluster)
   250  	case "kubernetes-anywhere":
   251  		if o.multiClusters.Enabled() {
   252  			return newKubernetesAnywhereMultiCluster(o.gcpProject, o.gcpZone, o.multiClusters)
   253  		}
   254  		return newKubernetesAnywhere(o.gcpProject, o.gcpZone)
   255  	case "node":
   256  		return nodeDeploy{}, nil
   257  	case "none":
   258  		return noneDeploy{}, nil
   259  	case "local":
   260  		return newLocalCluster(), nil
   261  	default:
   262  		return nil, fmt.Errorf("unknown deployment strategy %q", o.deployment)
   263  	}
   264  }
   265  
   266  func validateFlags(o *options) error {
   267  	if o.multiClusters.Enabled() && o.deployment != "kubernetes-anywhere" {
   268  		return errors.New("--multi-clusters flag cannot be passed with deployments other than 'kubernetes-anywhere'")
   269  	}
   270  	if !o.extract.Enabled() && o.extractSource {
   271  		return errors.New("--extract-source flag cannot be passed without --extract")
   272  	}
   273  	return nil
   274  }
   275  
// main parses and validates the command line, runs the kubetest workflow via
// complete, and releases any held boskos resources before reporting the
// outcome. Any failure terminates the process through log.Fatalf.
func main() {
	log.SetFlags(log.LstdFlags | log.Lshortfile)

	// Initialize global pseudo random generator. Initializing it to select random AWS Zones.
	rand.Seed(time.Now().UnixNano())

	// Flags are registered (mostly on the standard flag package) by
	// defineFlags, then folded into a fresh pflag set that does the parsing.
	pflag.CommandLine = pflag.NewFlagSet(os.Args[0], pflag.ContinueOnError)
	o := defineFlags()
	pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
	if err := pflag.CommandLine.Parse(os.Args[1:]); err != nil {
		log.Fatalf("Flag parse failed: %v", err)
	}

	if err := validateFlags(o); err != nil {
		log.Fatalf("Flags validation failed. err: %v", err)
	}

	// Recreate control now that timeout and verbose hold their parsed values;
	// the package-level initializer only saw the defaults.
	control = process.NewControl(timeout, interrupt, terminate, verbose)

	// do things when we know we are running in the kubetest image
	if os.Getenv("KUBETEST_IN_DOCKER") == "true" {
		o.flushMemAfterBuild = true
	}

	err := complete(o)

	// Release boskos resources as "dirty" whether or not complete succeeded.
	// A release failure is fatal and is reported together with the workflow
	// error, if any.
	if boskos.HasResource() {
		if berr := boskos.ReleaseAll("dirty"); berr != nil {
			log.Fatalf("[Boskos] Fail To Release: %v, kubetest err: %v", berr, err)
		}
	}

	if err != nil {
		log.Fatalf("Something went wrong: %v", err)
	}
}
   312  
// complete runs the whole kubetest workflow for the parsed options: it arms
// the timeout timer, prepares the environment, selects the deployer, acquires
// the k8s and federation binaries, verifies the working directory, runs the
// requested steps and finally publishes the tested version when --publish is
// set.
//
// It may mutate o (testArgs, up). The first failing step aborts the workflow
// and its error is returned, wrapped with context for the preparation and
// acquisition steps.
func complete(o *options) error {
	// Both timers were created with a zero duration, so they may already have
	// fired; stop them and discard any pending tick before (re)arming.
	if !terminate.Stop() {
		<-terminate.C // Drain the value if necessary.
	}
	if !interrupt.Stop() {
		<-interrupt.C // Drain value
	}

	if timeout > 0 {
		log.Printf("Limiting testing to %s", timeout)
		interrupt.Reset(timeout)
	}

	if o.dump != "" {
		// Deferred calls run in reverse order: the XML is written first, then
		// the metadata. time.Now() is evaluated here, when the defer statement
		// executes, not when WriteXML eventually runs. The error returned by
		// writeMetadata is discarded by defer.
		defer writeMetadata(o.dump, o.metadataSources)
		defer control.WriteXML(&suite, o.dump, time.Now())
	}
	if o.logexporterGCSPath != "" {
		o.testArgs += fmt.Sprintf(" --logexporter-gcs-path=%s", o.logexporterGCSPath)
	}
	if err := prepare(o); err != nil {
		return fmt.Errorf("failed to prepare test environment: %v", err)
	}
	if err := prepareFederation(o); err != nil {
		return fmt.Errorf("failed to prepare federation test environment: %v", err)
	}
	// Get the deployer before we acquire k8s so any additional flag
	// verifications happen early.
	deploy, err := getDeployer(o)
	if err != nil {
		return fmt.Errorf("error creating deployer: %v", err)
	}

	// Check soaking before run tests
	if o.soak {
		if created, err := deploy.GetClusterCreated(o.gcpProject); err != nil {
			// continue, but log the error
			log.Printf("deploy %v, GetClusterCreated failed: %v", o.deployment, err)
		} else {
			// The cluster is recycled once it is older than --soak-duration.
			if time.Now().After(created.Add(o.soakDuration)) {
				// flip up on - which will tear down previous cluster and start a new one
				log.Printf("Previous soak cluster created at %v, will recreate the cluster", created)
				o.up = true
			}
		}
	}

	if err := acquireKubernetes(o); err != nil {
		return fmt.Errorf("failed to acquire k8s binaries: %v", err)
	}
	if err := acquireFederation(o); err != nil {
		return fmt.Errorf("failed to acquire federation binaries: %v", err)
	}
	if o.extract.Enabled() {
		// If we specified `--extract-source` we will already be in the correct directory
		if !o.extractSource {
			if err := os.Chdir("kubernetes"); err != nil {
				return fmt.Errorf("failed to chdir to kubernetes dir: %v", err)
			}
		}
	}
	if err := validWorkingDirectory(); err != nil {
		return fmt.Errorf("called from invalid working directory: %v", err)
	}

	if o.down {
		// listen for signals such as ^C and gracefully attempt to clean up
		c := make(chan os.Signal, 1)
		signal.Notify(c, os.Interrupt)
		go func() {
			// The loop body always ends in os.Exit, so at most one signal is
			// ever handled.
			for range c {
				log.Print("Captured ^C, gracefully attempting to cleanup resources..")
				var fedErr, err error
				if o.federation {
					if fedErr = fedDown(); fedErr != nil {
						log.Printf("Tearing down federation failed: %v", fedErr)
					}
				}
				if err = deploy.Down(); err != nil {
					log.Printf("Tearing down deployment failed: %v", err)
				}
				// Exit status 1 means teardown itself failed.
				if fedErr != nil || err != nil {
					os.Exit(1)
				}

				// Exit status 2 means the run was interrupted but teardown
				// succeeded.
				os.Exit(2)
			}
		}()
	}

	// run receives a copy of the options, not the pointer.
	if err := run(deploy, *o); err != nil {
		return err
	}

	// Publish the successfully tested version when requested
	if o.publish != "" {
		if err := publish(o.publish); err != nil {
			return err
		}
	}
	return nil
}
   415  
   416  func acquireKubernetes(o *options) error {
   417  	// Potentially build kubernetes
   418  	if o.build.Enabled() {
   419  		err := control.XMLWrap(&suite, "Build", o.build.Build)
   420  		if o.flushMemAfterBuild {
   421  			util.FlushMem()
   422  		}
   423  		if err != nil {
   424  			return err
   425  		}
   426  	}
   427  
   428  	// Potentially stage build binaries somewhere on GCS
   429  	if o.stage.Enabled() {
   430  		if o.build == "dind" {
   431  			return fmt.Errorf("staging dind images isn't supported yet")
   432  		}
   433  		if err := control.XMLWrap(&suite, "Stage", func() error {
   434  			return o.stage.Stage(o.federation, o.noAllowDup)
   435  		}); err != nil {
   436  			return err
   437  		}
   438  	}
   439  
   440  	// Potentially download existing binaries and extract them.
   441  	if o.extract.Enabled() {
   442  		err := control.XMLWrap(&suite, "Extract", func() error {
   443  			// Should we restore a previous state?
   444  			// Restore if we are not upping the cluster or we are bringing up
   445  			// a federation control plane without the federated clusters.
   446  			if o.save != "" {
   447  				if !o.up {
   448  					// Restore version and .kube/config from --up
   449  					log.Printf("Overwriting extract strategy to load kubeconfig and version from %s", o.save)
   450  					o.extract = extractStrategies{
   451  						extractStrategy{
   452  							mode:   load,
   453  							option: o.save,
   454  						},
   455  					}
   456  				} else if o.federation && o.up && o.deployment == "none" {
   457  					// Only restore .kube/config from previous --up, use the regular
   458  					// extraction strategy to restore version.
   459  					log.Printf("Load kubeconfig from %s", o.save)
   460  					loadKubeconfig(o.save)
   461  				}
   462  			}
   463  
   464  			// New deployment, extract new version
   465  			return o.extract.Extract(o.gcpProject, o.gcpZone, o.gcpRegion, o.extractSource)
   466  		})
   467  		if err != nil {
   468  			return err
   469  		}
   470  	}
   471  	return nil
   472  }
   473  
   474  func acquireFederation(o *options) error {
   475  	// Potentially build federation
   476  	if o.buildFederation.Enabled() {
   477  		err := control.XMLWrap(&suite, "BuildFederation", o.buildFederation.Build)
   478  		if o.flushMemAfterBuild {
   479  			util.FlushMem()
   480  		}
   481  		if err != nil {
   482  			return err
   483  		}
   484  	}
   485  
   486  	// Potentially stage federation binaries somewhere on GCS
   487  	if o.stageFederation.Enabled() {
   488  		if err := control.XMLWrap(&suite, "StageFederation", func() error {
   489  			return o.stageFederation.Stage()
   490  		}); err != nil {
   491  			return err
   492  		}
   493  	}
   494  
   495  	// Potentially download existing federation binaries and extract them.
   496  	if o.extractFederation.Enabled() {
   497  		err := control.XMLWrap(&suite, "ExtractFederation", func() error {
   498  			return o.extractFederation.Extract(o.gcpProject, o.gcpZone)
   499  		})
   500  		return err
   501  	}
   502  	return nil
   503  }
   504  
   505  // Returns the k8s version name
   506  func findVersion() string {
   507  	// The version may be in a version file
   508  	if _, err := os.Stat("version"); err == nil {
   509  		b, err := ioutil.ReadFile("version")
   510  		if err == nil {
   511  			return strings.TrimSpace(string(b))
   512  		}
   513  		log.Printf("Failed to read version: %v", err)
   514  	}
   515  
   516  	// We can also get it from the git repo.
   517  	if _, err := os.Stat("hack/lib/version.sh"); err == nil {
   518  		// TODO(fejta): do this in go. At least we removed the upload-to-gcs.sh dep.
   519  		gross := `. hack/lib/version.sh && KUBE_ROOT=. kube::version::get_version_vars && echo "${KUBE_GIT_VERSION-}"`
   520  		b, err := control.Output(exec.Command("bash", "-c", gross))
   521  		if err == nil {
   522  			return strings.TrimSpace(string(b))
   523  		}
   524  		log.Printf("Failed to get_version_vars: %v", err)
   525  	}
   526  
   527  	return "unknown" // Sad trombone
   528  }
   529  
   530  // maybeMergeMetadata will add new keyvals into the map; quietly eats errors.
   531  func maybeMergeJSON(meta map[string]string, path string) {
   532  	if data, err := ioutil.ReadFile(path); err == nil {
   533  		json.Unmarshal(data, &meta)
   534  	}
   535  }
   536  
   537  // Write metadata.json, including version and env arg data.
   538  func writeMetadata(path, metadataSources string) error {
   539  	m := make(map[string]string)
   540  
   541  	// Look for any sources of metadata and load 'em
   542  	for _, f := range strings.Split(metadataSources, ",") {
   543  		maybeMergeJSON(m, filepath.Join(path, f))
   544  	}
   545  
   546  	ver := findVersion()
   547  	m["version"] = ver // TODO(fejta): retire
   548  	m["job-version"] = ver
   549  	re := regexp.MustCompile(`^BUILD_METADATA_(.+)$`)
   550  	for _, e := range os.Environ() {
   551  		p := strings.SplitN(e, "=", 2)
   552  		r := re.FindStringSubmatch(p[0])
   553  		if r == nil {
   554  			continue
   555  		}
   556  		k, v := strings.ToLower(r[1]), p[1]
   557  		m[k] = v
   558  	}
   559  	f, err := os.Create(filepath.Join(path, "metadata.json"))
   560  	if err != nil {
   561  		return err
   562  	}
   563  	defer f.Close()
   564  	e := json.NewEncoder(f)
   565  	return e.Encode(m)
   566  }
   567  
   568  // Install cloudsdk tarball to location, updating PATH
   569  func installGcloud(tarball string, location string) error {
   570  
   571  	if err := os.MkdirAll(location, 0775); err != nil {
   572  		return err
   573  	}
   574  
   575  	if err := control.FinishRunning(exec.Command("tar", "xzf", tarball, "-C", location)); err != nil {
   576  		return err
   577  	}
   578  
   579  	if err := control.FinishRunning(exec.Command(filepath.Join(location, "google-cloud-sdk", "install.sh"), "--disable-installation-options", "--bash-completion=false", "--path-update=false", "--usage-reporting=false")); err != nil {
   580  		return err
   581  	}
   582  
   583  	if err := util.InsertPath(filepath.Join(location, "google-cloud-sdk", "bin")); err != nil {
   584  		return err
   585  	}
   586  
   587  	if err := control.FinishRunning(exec.Command("gcloud", "components", "install", "alpha")); err != nil {
   588  		return err
   589  	}
   590  
   591  	if err := control.FinishRunning(exec.Command("gcloud", "components", "install", "beta")); err != nil {
   592  		return err
   593  	}
   594  
   595  	if err := control.FinishRunning(exec.Command("gcloud", "info")); err != nil {
   596  		return err
   597  	}
   598  	return nil
   599  }
   600  
   601  func migrateGcpEnvAndOptions(o *options) error {
   602  	var network string
   603  	var zone string
   604  	switch o.provider {
   605  	case "gke":
   606  		network = "KUBE_GKE_NETWORK"
   607  		zone = "ZONE"
   608  	default:
   609  		network = "KUBE_GCE_NETWORK"
   610  		zone = "KUBE_GCE_ZONE"
   611  	}
   612  	return util.MigrateOptions([]util.MigratedOption{
   613  		{
   614  			Env:    "PROJECT",
   615  			Option: &o.gcpProject,
   616  			Name:   "--gcp-project",
   617  		},
   618  		{
   619  			Env:    zone,
   620  			Option: &o.gcpZone,
   621  			Name:   "--gcp-zone",
   622  		},
   623  		{
   624  			Env:    "REGION",
   625  			Option: &o.gcpRegion,
   626  			Name:   "--gcp-region",
   627  		},
   628  		{
   629  			Env:    "GOOGLE_APPLICATION_CREDENTIALS",
   630  			Option: &o.gcpServiceAccount,
   631  			Name:   "--gcp-service-account",
   632  		},
   633  		{
   634  			Env:    network,
   635  			Option: &o.gcpNetwork,
   636  			Name:   "--gcp-network",
   637  		},
   638  		{
   639  			Env:    "KUBE_NODE_OS_DISTRIBUTION",
   640  			Option: &o.gcpNodeImage,
   641  			Name:   "--gcp-node-image",
   642  		},
   643  		{
   644  			Env:    "KUBE_MASTER_OS_DISTRIBUTION",
   645  			Option: &o.gcpMasterImage,
   646  			Name:   "--gcp-master-image",
   647  		},
   648  		{
   649  			Env:    "NUM_NODES",
   650  			Option: &o.gcpNodes,
   651  			Name:   "--gcp-nodes",
   652  		},
   653  		{
   654  			Env:    "NODE_SIZE",
   655  			Option: &o.gcpNodeSize,
   656  			Name:   "--gcp-node-size",
   657  		},
   658  		{
   659  			Env:    "MASTER_SIZE",
   660  			Option: &o.gcpMasterSize,
   661  			Name:   "--gcp-master-size",
   662  		},
   663  		{
   664  			Env:      "CLOUDSDK_BUCKET",
   665  			Option:   &o.gcpCloudSdk,
   666  			Name:     "--gcp-cloud-sdk",
   667  			SkipPush: true,
   668  		},
   669  	})
   670  }
   671  
   672  func prepareGcp(o *options) error {
   673  	if err := migrateGcpEnvAndOptions(o); err != nil {
   674  		return err
   675  	}
   676  	if o.provider == "gce" {
   677  		if distro := os.Getenv("KUBE_OS_DISTRIBUTION"); distro != "" {
   678  			log.Printf("Please use --gcp-master-image=%s --gcp-node-image=%s (instead of deprecated KUBE_OS_DISTRIBUTION)",
   679  				distro, distro)
   680  			// Note: KUBE_OS_DISTRIBUTION takes precedence over
   681  			// KUBE_{MASTER,NODE}_OS_DISTRIBUTION, so override here
   682  			// after the migration above.
   683  			o.gcpNodeImage = distro
   684  			o.gcpMasterImage = distro
   685  			if err := os.Setenv("KUBE_NODE_OS_DISTRIBUTION", distro); err != nil {
   686  				return fmt.Errorf("could not set KUBE_NODE_OS_DISTRIBUTION=%s: %v", distro, err)
   687  			}
   688  			if err := os.Setenv("KUBE_MASTER_OS_DISTRIBUTION", distro); err != nil {
   689  				return fmt.Errorf("could not set KUBE_MASTER_OS_DISTRIBUTION=%s: %v", distro, err)
   690  			}
   691  		}
   692  
   693  		hasGCPImageFamily, hasGCPImageProject := len(o.gcpImageFamily) != 0, len(o.gcpImageProject) != 0
   694  		if hasGCPImageFamily != hasGCPImageProject {
   695  			return fmt.Errorf("--image-family and --image-project must be both set or unset")
   696  		}
   697  		if hasGCPImageFamily && hasGCPImageProject {
   698  			out, err := control.Output(exec.Command("gcloud", "compute", "images", "describe-from-family", o.gcpImageFamily, "--project", o.gcpImageProject))
   699  			if err != nil {
   700  				return fmt.Errorf("failed to get latest image from family %q in project %q: %s", o.gcpImageFamily, o.gcpImageProject, err)
   701  			}
   702  			latestImage := ""
   703  			latestImageRegexp := regexp.MustCompile("^name: *(\\S+)")
   704  			for _, line := range strings.Split(string(out), "\n") {
   705  				matches := latestImageRegexp.FindStringSubmatch(line)
   706  				if len(matches) == 2 {
   707  					latestImage = matches[1]
   708  					break
   709  				}
   710  			}
   711  			if len(latestImage) == 0 {
   712  				return fmt.Errorf("failed to get latest image from family %q in project %q", o.gcpImageFamily, o.gcpImageProject)
   713  			}
   714  			if o.deployment == "node" {
   715  				o.nodeArgs += fmt.Sprintf(" --images=%s --image-project=%s", latestImage, o.gcpImageProject)
   716  			} else {
   717  				os.Setenv("KUBE_GCE_NODE_IMAGE", latestImage)
   718  				os.Setenv("KUBE_GCE_NODE_PROJECT", o.gcpImageProject)
   719  			}
   720  		}
   721  	} else if o.provider == "gke" {
   722  		if o.deployment == "" {
   723  			o.deployment = "gke"
   724  		}
   725  		if o.deployment != "gke" {
   726  			return fmt.Errorf("expected --deployment=gke for --provider=gke, found --deployment=%s", o.deployment)
   727  		}
   728  		if o.gcpNodeImage == "" {
   729  			return fmt.Errorf("--gcp-node-image must be set for GKE")
   730  		}
   731  		if o.gcpMasterImage != "" {
   732  			return fmt.Errorf("expected --gcp-master-image to be empty for --provider=gke, found --gcp-master-image=%s", o.gcpMasterImage)
   733  		}
   734  		if o.gcpNodes != "" {
   735  			return fmt.Errorf("--gcp-nodes cannot be set on GKE, use --gke-shape instead")
   736  		}
   737  		if o.gcpNodeSize != "" {
   738  			return fmt.Errorf("--gcp-node-size cannot be set on GKE, use --gke-shape instead")
   739  		}
   740  		if o.gcpMasterSize != "" {
   741  			return fmt.Errorf("--gcp-master-size cannot be set on GKE, where it's auto-computed")
   742  		}
   743  
   744  		// TODO(kubernetes/test-infra#3536): This is used by the
   745  		// ginkgo-e2e.sh wrapper.
   746  		nod := o.gcpNodeImage
   747  		if nod == "container_vm" {
   748  			// gcloud container clusters create understands
   749  			// "container_vm", e2es understand "debian".
   750  			nod = "debian"
   751  		}
   752  		if nod == "cos_containerd" {
   753  			// gcloud container clusters create understands
   754  			// "cos_containerd", e2es only understand
   755  			// "gci"/"cos",
   756  			nod = "gci"
   757  		}
   758  		os.Setenv("NODE_OS_DISTRIBUTION", nod)
   759  	}
   760  	if o.gcpProject == "" {
   761  		log.Print("--gcp-project is missing, trying to fetch a project from boskos.\n" +
   762  			"(for local runs please set --gcp-project to your dev project)")
   763  
   764  		var resType string
   765  		if o.gcpProjectType != "" {
   766  			resType = o.gcpProjectType
   767  		} else if o.provider == "gke" {
   768  			resType = "gke-project"
   769  		} else {
   770  			resType = "gce-project"
   771  		}
   772  
   773  		log.Printf("provider %v, will acquire project type %v from boskos", o.provider, resType)
   774  
   775  		p, err := boskos.Acquire(resType, "free", "busy")
   776  		if err != nil {
   777  			return fmt.Errorf("--provider=%s boskos failed to acquire project: %v", o.provider, err)
   778  		}
   779  
   780  		if p == nil {
   781  			return fmt.Errorf("boskos does not have a free %s at the moment", resType)
   782  		}
   783  
   784  		go func(c *client.Client, proj string) {
   785  			for range time.Tick(time.Minute * 5) {
   786  				if err := c.UpdateOne(p.Name, "busy", nil); err != nil {
   787  					log.Printf("[Boskos] Update %s failed with %v", p, err)
   788  				}
   789  			}
   790  		}(boskos, p.Name)
   791  		o.gcpProject = p.Name
   792  	}
   793  
   794  	if err := os.Setenv("CLOUDSDK_CORE_PRINT_UNHANDLED_TRACEBACKS", "1"); err != nil {
   795  		return fmt.Errorf("could not set CLOUDSDK_CORE_PRINT_UNHANDLED_TRACEBACKS=1: %v", err)
   796  	}
   797  
   798  	if err := control.FinishRunning(exec.Command("gcloud", "config", "set", "project", o.gcpProject)); err != nil {
   799  		return fmt.Errorf("fail to set project %s : err %v", o.gcpProject, err)
   800  	}
   801  
   802  	// TODO(krzyzacy):Remove this when we retire migrateGcpEnvAndOptions
   803  	// Note that a lot of scripts are still depend on this env in k/k repo.
   804  	if err := os.Setenv("PROJECT", o.gcpProject); err != nil {
   805  		return fmt.Errorf("fail to set env var PROJECT %s : err %v", o.gcpProject, err)
   806  	}
   807  
   808  	// gcloud creds may have changed
   809  	if err := activateServiceAccount(o.gcpServiceAccount); err != nil {
   810  		return err
   811  	}
   812  
   813  	// Ensure ssh keys exist
   814  	log.Print("Checking existing of GCP ssh keys...")
   815  	k := filepath.Join(util.Home(".ssh"), "google_compute_engine")
   816  	if _, err := os.Stat(k); err != nil {
   817  		return err
   818  	}
   819  	pk := k + ".pub"
   820  	if _, err := os.Stat(pk); err != nil {
   821  		return err
   822  	}
   823  
   824  	log.Printf("Checking presence of public key in %s", o.gcpProject)
   825  	if out, err := control.Output(exec.Command("gcloud", "compute", "--project="+o.gcpProject, "project-info", "describe")); err != nil {
   826  		return err
   827  	} else if b, err := ioutil.ReadFile(pk); err != nil {
   828  		return err
   829  	} else if !strings.Contains(string(out), string(b)) {
   830  		log.Print("Uploading public ssh key to project metadata...")
   831  		if err = control.FinishRunning(exec.Command("gcloud", "compute", "--project="+o.gcpProject, "config-ssh")); err != nil {
   832  			return err
   833  		}
   834  	}
   835  
   836  	// Install custom gcloud version if necessary
   837  	if o.gcpCloudSdk != "" {
   838  		for i := 0; i < 3; i++ {
   839  			if err := control.FinishRunning(exec.Command("gsutil", "-mq", "cp", "-r", o.gcpCloudSdk, util.Home())); err == nil {
   840  				break // Success!
   841  			}
   842  			time.Sleep(1 << uint(i) * time.Second)
   843  		}
   844  		for _, f := range []string{util.Home(".gsutil"), util.Home("repo"), util.Home("cloudsdk")} {
   845  			if _, err := os.Stat(f); err == nil || !os.IsNotExist(err) {
   846  				if err = os.RemoveAll(f); err != nil {
   847  					return err
   848  				}
   849  			}
   850  		}
   851  
   852  		install := util.Home("repo", "google-cloud-sdk.tar.gz")
   853  		if strings.HasSuffix(o.gcpCloudSdk, ".tar.gz") {
   854  			install = util.Home(filepath.Base(o.gcpCloudSdk))
   855  		} else {
   856  			if err := os.Rename(util.Home(filepath.Base(o.gcpCloudSdk)), util.Home("repo")); err != nil {
   857  				return err
   858  			}
   859  
   860  			// Controls which gcloud components to install.
   861  			pop, err := util.PushEnv("CLOUDSDK_COMPONENT_MANAGER_SNAPSHOT_URL", "file://"+util.Home("repo", "components-2.json"))
   862  			if err != nil {
   863  				return err
   864  			}
   865  			defer pop()
   866  		}
   867  
   868  		if err := installGcloud(install, util.Home("cloudsdk")); err != nil {
   869  			return err
   870  		}
   871  		// gcloud creds may have changed
   872  		if err := activateServiceAccount(o.gcpServiceAccount); err != nil {
   873  			return err
   874  		}
   875  	}
   876  
   877  	if o.kubemark {
   878  		if p := os.Getenv("KUBEMARK_BAZEL_BUILD"); strings.ToLower(p) == "y" {
   879  			// we need docker-credential-gcr to get authed properly
   880  			// https://github.com/bazelbuild/rules_docker#authorization
   881  			if err := control.FinishRunning(exec.Command("gcloud", "components", "install", "docker-credential-gcr")); err != nil {
   882  				return err
   883  			}
   884  			if err := control.FinishRunning(exec.Command("docker-credential-gcr", "configure-docker")); err != nil {
   885  				return err
   886  			}
   887  		}
   888  	}
   889  
   890  	return nil
   891  }
   892  
   893  func prepareAws(o *options) error {
   894  	// gcloud creds may have changed
   895  	if err := activateServiceAccount(o.gcpServiceAccount); err != nil {
   896  		return err
   897  	}
   898  	return control.FinishRunning(exec.Command("pip", "install", "awscli"))
   899  }
   900  
   901  // Activate GOOGLE_APPLICATION_CREDENTIALS if set or do nothing.
   902  func activateServiceAccount(path string) error {
   903  	if path == "" {
   904  		return nil
   905  	}
   906  	return control.FinishRunning(exec.Command("gcloud", "auth", "activate-service-account", "--key-file="+path))
   907  }
   908  
   909  // Make all artifacts world readable.
   910  // The root user winds up owning the files when the container exists.
   911  // Ensure that other users can read these files at that time.
   912  func chmodArtifacts() error {
   913  	return control.FinishRunning(exec.Command("chmod", "-R", "o+r", artifacts))
   914  }
   915  
   916  func prepare(o *options) error {
   917  	if err := util.MigrateOptions([]util.MigratedOption{
   918  		{
   919  			Env:    "KUBERNETES_PROVIDER",
   920  			Option: &o.provider,
   921  			Name:   "--provider",
   922  		},
   923  		{
   924  			Env:    "CLUSTER_NAME",
   925  			Option: &o.cluster,
   926  			Name:   "--cluster",
   927  		},
   928  	}); err != nil {
   929  		return err
   930  	}
   931  	if err := prepareGinkgoParallel(&o.ginkgoParallel); err != nil {
   932  		return err
   933  	}
   934  
   935  	switch o.provider {
   936  	case "gce", "gke", "kubernetes-anywhere", "node":
   937  		if err := prepareGcp(o); err != nil {
   938  			return err
   939  		}
   940  	case "aws":
   941  		if err := prepareAws(o); err != nil {
   942  			return err
   943  		}
   944  	}
   945  
   946  	if o.kubemark {
   947  		if err := util.MigrateOptions([]util.MigratedOption{
   948  			{
   949  				Env:    "KUBEMARK_NUM_NODES",
   950  				Option: &o.kubemarkNodes,
   951  				Name:   "--kubemark-nodes",
   952  			},
   953  			{
   954  				Env:    "KUBEMARK_MASTER_SIZE",
   955  				Option: &o.kubemarkMasterSize,
   956  				Name:   "--kubemark-master-size",
   957  			},
   958  		}); err != nil {
   959  			return err
   960  		}
   961  	}
   962  
   963  	if err := os.MkdirAll(artifacts, 0777); err != nil { // Create artifacts
   964  		return err
   965  	}
   966  
   967  	return nil
   968  }
   969  
   970  func prepareFederation(o *options) error {
   971  	if o.multipleFederations {
   972  		// TODO(fejta): use boskos to grab a federation cluster
   973  		// Note: EXECUTOR_NUMBER and NODE_NAME are Jenkins
   974  		// specific environment variables. So this doesn't work
   975  		// when we move away from Jenkins.
   976  		execNum := os.Getenv("EXECUTOR_NUMBER")
   977  		if execNum == "" {
   978  			execNum = "0"
   979  		}
   980  		suffix := fmt.Sprintf("%s-%s", os.Getenv("NODE_NAME"), execNum)
   981  		federationName := fmt.Sprintf("e2e-f8n-%s", suffix)
   982  		federationSystemNamespace := fmt.Sprintf("f8n-system-%s", suffix)
   983  		err := os.Setenv("FEDERATION_NAME", federationName)
   984  		if err != nil {
   985  			return err
   986  		}
   987  		return os.Setenv("FEDERATION_NAMESPACE", federationSystemNamespace)
   988  	}
   989  	return nil
   990  }
   991  
   992  type ginkgoParallelValue struct {
   993  	v int // 0 == not set (defaults to 1)
   994  }
   995  
   996  func (v *ginkgoParallelValue) IsBoolFlag() bool {
   997  	return true
   998  }
   999  
  1000  func (v *ginkgoParallelValue) String() string {
  1001  	if v.v == 0 {
  1002  		return "1"
  1003  	}
  1004  	return strconv.Itoa(v.v)
  1005  }
  1006  
  1007  func (v *ginkgoParallelValue) Set(s string) error {
  1008  	if s == "" {
  1009  		v.v = 0
  1010  		return nil
  1011  	}
  1012  	if s == "true" {
  1013  		v.v = defaultGinkgoParallel
  1014  		return nil
  1015  	}
  1016  	p, err := strconv.Atoi(s)
  1017  	if err != nil {
  1018  		return fmt.Errorf("--ginkgo-parallel must be an integer, found %q", s)
  1019  	}
  1020  	if p < 1 {
  1021  		return fmt.Errorf("--ginkgo-parallel must be >= 1, found %d", p)
  1022  	}
  1023  	v.v = p
  1024  	return nil
  1025  }
  1026  
  1027  func (v *ginkgoParallelValue) Type() string {
  1028  	return "ginkgoParallelValue"
  1029  }
  1030  
  1031  func (v *ginkgoParallelValue) Get() int {
  1032  	if v.v == 0 {
  1033  		return 1
  1034  	}
  1035  	return v.v
  1036  }
  1037  
  1038  var _ flag.Value = &ginkgoParallelValue{}
  1039  
  1040  // Hand migrate this option. GINKGO_PARALLEL => GINKGO_PARALLEL_NODES=25
  1041  func prepareGinkgoParallel(v *ginkgoParallelValue) error {
  1042  	if p := os.Getenv("GINKGO_PARALLEL"); strings.ToLower(p) == "y" {
  1043  		log.Printf("Please use kubetest --ginkgo-parallel (instead of deprecated GINKGO_PARALLEL=y)")
  1044  		if err := v.Set("true"); err != nil {
  1045  			return err
  1046  		}
  1047  		os.Unsetenv("GINKGO_PARALLEL")
  1048  	}
  1049  	if p := os.Getenv("GINKGO_PARALLEL_NODES"); p != "" {
  1050  		log.Printf("Please use kubetest --ginkgo-parallel=%s (instead of deprecated GINKGO_PARALLEL_NODES=%s)", p, p)
  1051  		if err := v.Set(p); err != nil {
  1052  			return err
  1053  		}
  1054  	}
  1055  	os.Setenv("GINKGO_PARALLEL_NODES", v.String())
  1056  	return nil
  1057  }
  1058  
  1059  func publish(pub string) error {
  1060  	v, err := ioutil.ReadFile("version")
  1061  	if err != nil {
  1062  		return err
  1063  	}
  1064  	log.Printf("Set %s version to %s", pub, string(v))
  1065  	return gcsWrite(pub, v)
  1066  }