github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachprod/vm/gce/gcloud.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package gce
    12  
    13  import (
    14  	"bytes"
    15  	"context"
    16  	"encoding/json"
    17  	"fmt"
    18  	"os"
    19  	"os/exec"
    20  	"regexp"
    21  	"strings"
    22  	"time"
    23  
    24  	"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/config"
    25  	"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/vm"
    26  	"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/vm/flagstub"
    27  	"github.com/cockroachdb/errors"
    28  	"github.com/spf13/pflag"
    29  	"golang.org/x/sync/errgroup"
    30  )
    31  
    32  const (
    33  	defaultProject = "cockroach-ephemeral"
    34  	// ProviderName is gce.
    35  	ProviderName = "gce"
    36  )
    37  
    38  // DefaultProject returns the default GCE project.
    39  func DefaultProject() string {
    40  	return defaultProject
    41  }
    42  
    43  // projects for which a cron GC job exists.
    44  var projectsWithGC = []string{defaultProject, "andrei-jepsen"}
    45  
    46  // init will inject the GCE provider into vm.Providers, but only if the gcloud tool is available on the local path.
    47  func init() {
    48  	var p vm.Provider = &Provider{}
    49  	if _, err := exec.LookPath("gcloud"); err != nil {
    50  		p = flagstub.New(p, "please install the gcloud CLI utilities "+
    51  			"(https://cloud.google.com/sdk/downloads)")
    52  	} else {
    53  		gceP := makeProvider()
    54  		p = &gceP
    55  	}
    56  	vm.Providers[ProviderName] = p
    57  }
    58  
    59  func runJSONCommand(args []string, parsed interface{}) error {
    60  	cmd := exec.Command("gcloud", args...)
    61  
    62  	rawJSON, err := cmd.Output()
    63  	if err != nil {
    64  		var stderr []byte
    65  		if exitErr := (*exec.ExitError)(nil); errors.As(err, &exitErr) {
    66  			stderr = exitErr.Stderr
    67  		}
    68  		// TODO(peter,ajwerner): Remove this hack once gcloud behaves when adding
    69  		// new zones.
    70  		if matched, _ := regexp.Match(`.*Unknown zone`, stderr); !matched {
    71  			return errors.Errorf("failed to run: gcloud %s: %s\nstdout: %s\nstderr: %s",
    72  				strings.Join(args, " "), err, bytes.TrimSpace(rawJSON), bytes.TrimSpace(stderr))
    73  		}
    74  	}
    75  
    76  	if err := json.Unmarshal(rawJSON, &parsed); err != nil {
    77  		return errors.Wrapf(err, "failed to parse json %s", rawJSON)
    78  	}
    79  
    80  	return nil
    81  }
    82  
    83  // Used to parse the gcloud responses
    84  type jsonVM struct {
    85  	Name              string
    86  	Labels            map[string]string
    87  	CreationTimestamp time.Time
    88  	NetworkInterfaces []struct {
    89  		Network       string
    90  		NetworkIP     string
    91  		AccessConfigs []struct {
    92  			Name  string
    93  			NatIP string
    94  		}
    95  	}
    96  	MachineType string
    97  	Zone        string
    98  }
    99  
   100  // Convert the JSON VM data into our common VM type
   101  func (jsonVM *jsonVM) toVM(project string, opts *providerOpts) (ret *vm.VM) {
   102  	var vmErrors []error
   103  	var err error
   104  
   105  	// Check "lifetime" label.
   106  	var lifetime time.Duration
   107  	if lifetimeStr, ok := jsonVM.Labels["lifetime"]; ok {
   108  		if lifetime, err = time.ParseDuration(lifetimeStr); err != nil {
   109  			vmErrors = append(vmErrors, vm.ErrNoExpiration)
   110  		}
   111  	} else {
   112  		vmErrors = append(vmErrors, vm.ErrNoExpiration)
   113  	}
   114  
   115  	// lastComponent splits a url path and returns only the last part. This is
   116  	// used because some of the fields in jsonVM are defined using URLs like:
   117  	//  "https://www.googleapis.com/compute/v1/projects/cockroach-shared/zones/us-east1-b/machineTypes/n1-standard-16"
   118  	// We want to strip this down to "n1-standard-16", so we only want the last
   119  	// component.
   120  	lastComponent := func(url string) string {
   121  		s := strings.Split(url, "/")
   122  		return s[len(s)-1]
   123  	}
   124  
   125  	// Extract network information
   126  	var publicIP, privateIP, vpc string
   127  	if len(jsonVM.NetworkInterfaces) == 0 {
   128  		vmErrors = append(vmErrors, vm.ErrBadNetwork)
   129  	} else {
   130  		privateIP = jsonVM.NetworkInterfaces[0].NetworkIP
   131  		if len(jsonVM.NetworkInterfaces[0].AccessConfigs) == 0 {
   132  			vmErrors = append(vmErrors, vm.ErrBadNetwork)
   133  		} else {
   134  			_ = jsonVM.NetworkInterfaces[0].AccessConfigs[0].Name // silence unused warning
   135  			publicIP = jsonVM.NetworkInterfaces[0].AccessConfigs[0].NatIP
   136  			vpc = lastComponent(jsonVM.NetworkInterfaces[0].Network)
   137  		}
   138  	}
   139  
   140  	machineType := lastComponent(jsonVM.MachineType)
   141  	zone := lastComponent(jsonVM.Zone)
   142  	remoteUser := config.SharedUser
   143  	if !opts.useSharedUser {
   144  		// N.B. gcloud uses the local username to log into instances rather
   145  		// than the username on the authenticated Google account but we set
   146  		// up the shared user at cluster creation time. Allow use of the
   147  		// local username if requested.
   148  		remoteUser = config.OSUser.Username
   149  	}
   150  	return &vm.VM{
   151  		Name:        jsonVM.Name,
   152  		CreatedAt:   jsonVM.CreationTimestamp,
   153  		Errors:      vmErrors,
   154  		DNS:         fmt.Sprintf("%s.%s.%s", jsonVM.Name, zone, project),
   155  		Lifetime:    lifetime,
   156  		PrivateIP:   privateIP,
   157  		Provider:    ProviderName,
   158  		ProviderID:  jsonVM.Name,
   159  		PublicIP:    publicIP,
   160  		RemoteUser:  remoteUser,
   161  		VPC:         vpc,
   162  		MachineType: machineType,
   163  		Zone:        zone,
   164  		Project:     project,
   165  	}
   166  }
   167  
   168  type jsonAuth struct {
   169  	Account string
   170  	Status  string
   171  }
   172  
   173  // User-configurable, provider-specific options
   174  type providerOpts struct {
   175  	// projects represent the GCE projects to operate on. Accessed through
   176  	// GetProject() or GetProjects() depending on whether the command accepts
   177  	// multiple projects or a single one.
   178  	projects       []string
   179  	ServiceAccount string
   180  	MachineType    string
   181  	Zones          []string
   182  	Image          string
   183  	SSDCount       int
   184  
   185  	// useSharedUser indicates that the shared user rather than the personal
   186  	// user should be used to ssh into the remote machines.
   187  	useSharedUser bool
   188  }
   189  
   190  // projectsVal is the implementation for the --gce-projects flag. It populates
   191  // opts.projects.
   192  type projectsVal struct {
   193  	acceptMultipleProjects bool
   194  	opts                   *providerOpts
   195  }
   196  
   197  // defaultZones is the list of  zones used by default for cluster creation.
   198  // If the geo flag is specified, nodes are distributed between zones.
   199  var defaultZones = []string{
   200  	"us-east1-b",
   201  	"us-west1-b",
   202  	"europe-west2-b",
   203  }
   204  
   205  // Set is part of the pflag.Value interface.
   206  func (v projectsVal) Set(projects string) error {
   207  	if projects == "" {
   208  		return fmt.Errorf("empty GCE project")
   209  	}
   210  	prj := strings.Split(projects, ",")
   211  	if !v.acceptMultipleProjects && len(prj) > 1 {
   212  		return fmt.Errorf("multiple GCE projects not supported for command")
   213  	}
   214  	v.opts.projects = prj
   215  	return nil
   216  }
   217  
   218  // Type is part of the pflag.Value interface.
   219  func (v projectsVal) Type() string {
   220  	if v.acceptMultipleProjects {
   221  		return "comma-separated list of GCE projects"
   222  	}
   223  	return "GCE project name"
   224  }
   225  
   226  // String is part of the pflag.Value interface.
   227  func (v projectsVal) String() string {
   228  	return strings.Join(v.opts.projects, ",")
   229  }
   230  
   231  func makeProviderOpts() providerOpts {
   232  	project := os.Getenv("GCE_PROJECT")
   233  	if project == "" {
   234  		project = defaultProject
   235  	}
   236  	return providerOpts{
   237  		// projects needs space for one project, which is set by the flags for
   238  		// commands that accept a single project.
   239  		projects: []string{project},
   240  	}
   241  }
   242  
   243  // GetProject returns the GCE project on which we're configured to operate.
   244  // If multiple projects were configured, this panics.
   245  func (p *Provider) GetProject() string {
   246  	o := p.opts
   247  	if len(o.projects) > 1 {
   248  		panic(fmt.Sprintf(
   249  			"multiple projects not supported (%d specified)", len(o.projects)))
   250  	}
   251  	return o.projects[0]
   252  }
   253  
   254  // GetProjects returns the list of GCE projects on which we're configured to
   255  // operate.
   256  func (p *Provider) GetProjects() []string {
   257  	return p.opts.projects
   258  }
   259  
   260  func (o *providerOpts) ConfigureCreateFlags(flags *pflag.FlagSet) {
   261  	flags.StringVar(&o.MachineType, "machine-type", "n1-standard-4", "DEPRECATED")
   262  	_ = flags.MarkDeprecated("machine-type", "use "+ProviderName+"-machine-type instead")
   263  	flags.StringSliceVar(&o.Zones, "zones", nil, "DEPRECATED")
   264  	_ = flags.MarkDeprecated("zones", "use "+ProviderName+"-zones instead")
   265  
   266  	flags.StringVar(&o.ServiceAccount, ProviderName+"-service-account",
   267  		os.Getenv("GCE_SERVICE_ACCOUNT"), "Service account to use")
   268  	flags.StringVar(&o.MachineType, ProviderName+"-machine-type", "n1-standard-4",
   269  		"Machine type (see https://cloud.google.com/compute/docs/machine-types)")
   270  	flags.StringSliceVar(&o.Zones, ProviderName+"-zones", nil,
   271  		fmt.Sprintf("Zones for cluster. If zones are formatted as AZ:N where N is an integer, the zone\n"+
   272  			"will be repeated N times. If > 1 zone specified, nodes will be geo-distributed\n"+
   273  			"regardless of geo (default [%s])",
   274  			strings.Join(defaultZones, ",")))
   275  	flags.StringVar(&o.Image, ProviderName+"-image", "ubuntu-1604-xenial-v20200129",
   276  		"Image to use to create the vm, ubuntu-1904-disco-v20191008 is a more modern image")
   277  	flags.IntVar(&o.SSDCount, ProviderName+"-local-ssd-count", 1,
   278  		"Number of local SSDs to create on GCE instance.")
   279  }
   280  
   281  func (o *providerOpts) ConfigureClusterFlags(flags *pflag.FlagSet, opt vm.MultipleProjectsOption) {
   282  	var usage string
   283  	if opt == vm.SingleProject {
   284  		usage = "GCE project to manage"
   285  	} else {
   286  		usage = "List of GCE projects to manage"
   287  	}
   288  
   289  	flags.Var(
   290  		projectsVal{
   291  			acceptMultipleProjects: opt == vm.AcceptMultipleProjects,
   292  			opts:                   o,
   293  		},
   294  		ProviderName+"-project", /* name */
   295  		usage)
   296  
   297  	flags.BoolVar(&o.useSharedUser,
   298  		ProviderName+"-use-shared-user", true,
   299  		fmt.Sprintf("use the shared user %q for ssh rather than your user %q",
   300  			config.SharedUser, config.OSUser.Username))
   301  }
   302  
   303  // Provider is the GCE implementation of the vm.Provider interface.
   304  type Provider struct {
   305  	opts providerOpts
   306  }
   307  
   308  func makeProvider() Provider {
   309  	return Provider{opts: makeProviderOpts()}
   310  }
   311  
   312  // CleanSSH TODO(peter): document
   313  func (p *Provider) CleanSSH() error {
   314  	for _, prj := range p.GetProjects() {
   315  		args := []string{"compute", "config-ssh", "--project", prj, "--quiet", "--remove"}
   316  		cmd := exec.Command("gcloud", args...)
   317  
   318  		output, err := cmd.CombinedOutput()
   319  		if err != nil {
   320  			return errors.Wrapf(err, "Command: gcloud %s\nOutput: %s", args, output)
   321  		}
   322  	}
   323  	return nil
   324  }
   325  
   326  // ConfigSSH TODO(peter): document
   327  func (p *Provider) ConfigSSH() error {
   328  	for _, prj := range p.GetProjects() {
   329  		args := []string{"compute", "config-ssh", "--project", prj, "--quiet"}
   330  		cmd := exec.Command("gcloud", args...)
   331  
   332  		output, err := cmd.CombinedOutput()
   333  		if err != nil {
   334  			return errors.Wrapf(err, "Command: gcloud %s\nOutput: %s", args, output)
   335  		}
   336  	}
   337  	return nil
   338  }
   339  
   340  // Create TODO(peter): document
   341  func (p *Provider) Create(names []string, opts vm.CreateOpts) error {
   342  	project := p.GetProject()
   343  	var gcJob bool
   344  	for _, prj := range projectsWithGC {
   345  		if prj == p.GetProject() {
   346  			gcJob = true
   347  			break
   348  		}
   349  	}
   350  	if !gcJob {
   351  		fmt.Printf("WARNING: --lifetime functionality requires "+
   352  			"`roachprod gc --gce-project=%s` cronjob\n", project)
   353  	}
   354  
   355  	zones, err := vm.ExpandZonesFlag(p.opts.Zones)
   356  	if err != nil {
   357  		return err
   358  	}
   359  	if len(zones) == 0 {
   360  		if opts.GeoDistributed {
   361  			zones = defaultZones
   362  		} else {
   363  			zones = []string{defaultZones[0]}
   364  		}
   365  	}
   366  
   367  	// Fixed args.
   368  	args := []string{
   369  		"compute", "instances", "create",
   370  		"--subnet", "default",
   371  		"--maintenance-policy", "MIGRATE",
   372  		"--scopes", "default,storage-rw",
   373  		"--image", p.opts.Image,
   374  		"--image-project", "ubuntu-os-cloud",
   375  		"--boot-disk-size", "10",
   376  		"--boot-disk-type", "pd-ssd",
   377  	}
   378  
   379  	if project == defaultProject && p.opts.ServiceAccount == "" {
   380  		p.opts.ServiceAccount = "21965078311-compute@developer.gserviceaccount.com"
   381  	}
   382  	if p.opts.ServiceAccount != "" {
   383  		args = append(args, "--service-account", p.opts.ServiceAccount)
   384  	}
   385  
   386  	extraMountOpts := ""
   387  	// Dynamic args.
   388  	if opts.SSDOpts.UseLocalSSD {
   389  		// n2-class and c2-class GCP machines cannot be requested with only 1
   390  		// SSD; minimum number of actual SSDs is 2.
   391  		// TODO(pbardea): This is more general for machine types that
   392  		// come in different sizes.
   393  		// See: https://cloud.google.com/compute/docs/disks/
   394  		n2MachineTypes := regexp.MustCompile("^[cn]2-.+-16")
   395  		if n2MachineTypes.MatchString(p.opts.MachineType) && p.opts.SSDCount < 2 {
   396  			fmt.Fprint(os.Stderr, "WARNING: SSD count must be at least 2 for n2 and c2 machine types with 16vCPU. Setting --gce-local-ssd-count to 2.\n")
   397  			p.opts.SSDCount = 2
   398  		}
   399  		for i := 0; i < p.opts.SSDCount; i++ {
   400  			args = append(args, "--local-ssd", "interface=NVME")
   401  		}
   402  		if opts.SSDOpts.NoExt4Barrier {
   403  			extraMountOpts = "nobarrier"
   404  		}
   405  	}
   406  
   407  	// Create GCE startup script file.
   408  	filename, err := writeStartupScript(extraMountOpts)
   409  	if err != nil {
   410  		return errors.Wrapf(err, "could not write GCE startup script to temp file")
   411  	}
   412  	defer func() {
   413  		_ = os.Remove(filename)
   414  	}()
   415  
   416  	args = append(args, "--machine-type", p.opts.MachineType)
   417  	args = append(args, "--labels", fmt.Sprintf("lifetime=%s", opts.Lifetime))
   418  
   419  	args = append(args, "--metadata-from-file", fmt.Sprintf("startup-script=%s", filename))
   420  	args = append(args, "--project", project)
   421  
   422  	var g errgroup.Group
   423  
   424  	nodeZones := vm.ZonePlacement(len(zones), len(names))
   425  	zoneHostNames := make([][]string, len(zones))
   426  	for i, name := range names {
   427  		zone := nodeZones[i]
   428  		zoneHostNames[zone] = append(zoneHostNames[zone], name)
   429  	}
   430  	for i, zoneHosts := range zoneHostNames {
   431  		argsWithZone := append(args[:len(args):len(args)], "--zone", zones[i])
   432  		argsWithZone = append(argsWithZone, zoneHosts...)
   433  		g.Go(func() error {
   434  			cmd := exec.Command("gcloud", argsWithZone...)
   435  
   436  			output, err := cmd.CombinedOutput()
   437  			if err != nil {
   438  				return errors.Wrapf(err, "Command: gcloud %s\nOutput: %s", args, output)
   439  			}
   440  			return nil
   441  		})
   442  
   443  	}
   444  
   445  	return g.Wait()
   446  }
   447  
   448  // Delete TODO(peter): document
   449  func (p *Provider) Delete(vms vm.List) error {
   450  	// Map from project to map of zone to list of machines in that project/zone.
   451  	projectZoneMap := make(map[string]map[string][]string)
   452  	for _, v := range vms {
   453  		if v.Provider != ProviderName {
   454  			return errors.Errorf("%s received VM instance from %s", ProviderName, v.Provider)
   455  		}
   456  		if projectZoneMap[v.Project] == nil {
   457  			projectZoneMap[v.Project] = make(map[string][]string)
   458  		}
   459  
   460  		projectZoneMap[v.Project][v.Zone] = append(projectZoneMap[v.Project][v.Zone], v.Name)
   461  	}
   462  
   463  	var g errgroup.Group
   464  	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
   465  	defer cancel()
   466  	for project, zoneMap := range projectZoneMap {
   467  		for zone, names := range zoneMap {
   468  			args := []string{
   469  				"compute", "instances", "delete",
   470  				"--delete-disks", "all",
   471  			}
   472  
   473  			args = append(args, "--project", project)
   474  			args = append(args, "--zone", zone)
   475  			args = append(args, names...)
   476  
   477  			g.Go(func() error {
   478  				cmd := exec.CommandContext(ctx, "gcloud", args...)
   479  
   480  				output, err := cmd.CombinedOutput()
   481  				if err != nil {
   482  					return errors.Wrapf(err, "Command: gcloud %s\nOutput: %s", args, output)
   483  				}
   484  				return nil
   485  			})
   486  		}
   487  	}
   488  
   489  	return g.Wait()
   490  }
   491  
   492  // Extend TODO(peter): document
   493  func (p *Provider) Extend(vms vm.List, lifetime time.Duration) error {
   494  	// The gcloud command only takes a single instance.  Unlike Delete() above, we have to
   495  	// perform the iteration here.
   496  	for _, v := range vms {
   497  		args := []string{"compute", "instances", "add-labels"}
   498  
   499  		args = append(args, "--project", v.Project)
   500  		args = append(args, "--zone", v.Zone)
   501  		args = append(args, "--labels", fmt.Sprintf("lifetime=%s", lifetime))
   502  		args = append(args, v.Name)
   503  
   504  		cmd := exec.Command("gcloud", args...)
   505  
   506  		output, err := cmd.CombinedOutput()
   507  		if err != nil {
   508  			return errors.Wrapf(err, "Command: gcloud %s\nOutput: %s", args, output)
   509  		}
   510  	}
   511  	return nil
   512  }
   513  
   514  // FindActiveAccount TODO(peter): document
   515  func (p *Provider) FindActiveAccount() (string, error) {
   516  	args := []string{"auth", "list", "--format", "json", "--filter", "status~ACTIVE"}
   517  
   518  	accounts := make([]jsonAuth, 0)
   519  	if err := runJSONCommand(args, &accounts); err != nil {
   520  		return "", err
   521  	}
   522  
   523  	if len(accounts) != 1 {
   524  		return "", fmt.Errorf("no active accounts found, please configure gcloud")
   525  	}
   526  
   527  	if !strings.HasSuffix(accounts[0].Account, config.EmailDomain) {
   528  		return "", fmt.Errorf("active account %q does no belong to domain %s",
   529  			accounts[0].Account, config.EmailDomain)
   530  	}
   531  	_ = accounts[0].Status // silence unused warning
   532  
   533  	username := strings.Split(accounts[0].Account, "@")[0]
   534  	return username, nil
   535  }
   536  
   537  // Flags TODO(peter): document
   538  func (p *Provider) Flags() vm.ProviderFlags {
   539  	return &p.opts
   540  }
   541  
   542  // List queries gcloud to produce a list of VM info objects.
   543  func (p *Provider) List() (vm.List, error) {
   544  	var vms vm.List
   545  	for _, prj := range p.GetProjects() {
   546  		args := []string{"compute", "instances", "list", "--project", prj, "--format", "json"}
   547  
   548  		// Run the command, extracting the JSON payload
   549  		jsonVMS := make([]jsonVM, 0)
   550  		if err := runJSONCommand(args, &jsonVMS); err != nil {
   551  			return nil, err
   552  		}
   553  
   554  		// Now, convert the json payload into our common VM type
   555  		for _, jsonVM := range jsonVMS {
   556  			vms = append(vms, *jsonVM.toVM(prj, &p.opts))
   557  		}
   558  	}
   559  
   560  	return vms, nil
   561  }
   562  
   563  // Name TODO(peter): document
   564  func (p *Provider) Name() string {
   565  	return ProviderName
   566  }
   567  
   568  // Active is part of the vm.Provider interface.
   569  func (p *Provider) Active() bool {
   570  	return true
   571  }