github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachprod/vm/gce/gcloud.go (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package gce 12 13 import ( 14 "bytes" 15 "context" 16 "encoding/json" 17 "fmt" 18 "os" 19 "os/exec" 20 "regexp" 21 "strings" 22 "time" 23 24 "github.com/cockroachdb/cockroach/pkg/cmd/roachprod/config" 25 "github.com/cockroachdb/cockroach/pkg/cmd/roachprod/vm" 26 "github.com/cockroachdb/cockroach/pkg/cmd/roachprod/vm/flagstub" 27 "github.com/cockroachdb/errors" 28 "github.com/spf13/pflag" 29 "golang.org/x/sync/errgroup" 30 ) 31 32 const ( 33 defaultProject = "cockroach-ephemeral" 34 // ProviderName is gce. 35 ProviderName = "gce" 36 ) 37 38 // DefaultProject returns the default GCE project. 39 func DefaultProject() string { 40 return defaultProject 41 } 42 43 // projects for which a cron GC job exists. 44 var projectsWithGC = []string{defaultProject, "andrei-jepsen"} 45 46 // init will inject the GCE provider into vm.Providers, but only if the gcloud tool is available on the local path. 47 func init() { 48 var p vm.Provider = &Provider{} 49 if _, err := exec.LookPath("gcloud"); err != nil { 50 p = flagstub.New(p, "please install the gcloud CLI utilities "+ 51 "(https://cloud.google.com/sdk/downloads)") 52 } else { 53 gceP := makeProvider() 54 p = &gceP 55 } 56 vm.Providers[ProviderName] = p 57 } 58 59 func runJSONCommand(args []string, parsed interface{}) error { 60 cmd := exec.Command("gcloud", args...) 61 62 rawJSON, err := cmd.Output() 63 if err != nil { 64 var stderr []byte 65 if exitErr := (*exec.ExitError)(nil); errors.As(err, &exitErr) { 66 stderr = exitErr.Stderr 67 } 68 // TODO(peter,ajwerner): Remove this hack once gcloud behaves when adding 69 // new zones. 70 if matched, _ := regexp.Match(`.*Unknown zone`, stderr); !matched { 71 return errors.Errorf("failed to run: gcloud %s: %s\nstdout: %s\nstderr: %s", 72 strings.Join(args, " "), err, bytes.TrimSpace(rawJSON), bytes.TrimSpace(stderr)) 73 } 74 } 75 76 if err := json.Unmarshal(rawJSON, &parsed); err != nil { 77 return errors.Wrapf(err, "failed to parse json %s", rawJSON) 78 } 79 80 return nil 81 } 82 83 // Used to parse the gcloud responses 84 type jsonVM struct { 85 Name string 86 Labels map[string]string 87 CreationTimestamp time.Time 88 NetworkInterfaces []struct { 89 Network string 90 NetworkIP string 91 AccessConfigs []struct { 92 Name string 93 NatIP string 94 } 95 } 96 MachineType string 97 Zone string 98 } 99 100 // Convert the JSON VM data into our common VM type 101 func (jsonVM *jsonVM) toVM(project string, opts *providerOpts) (ret *vm.VM) { 102 var vmErrors []error 103 var err error 104 105 // Check "lifetime" label. 106 var lifetime time.Duration 107 if lifetimeStr, ok := jsonVM.Labels["lifetime"]; ok { 108 if lifetime, err = time.ParseDuration(lifetimeStr); err != nil { 109 vmErrors = append(vmErrors, vm.ErrNoExpiration) 110 } 111 } else { 112 vmErrors = append(vmErrors, vm.ErrNoExpiration) 113 } 114 115 // lastComponent splits a url path and returns only the last part. This is 116 // used because some of the fields in jsonVM are defined using URLs like: 117 // "https://www.googleapis.com/compute/v1/projects/cockroach-shared/zones/us-east1-b/machineTypes/n1-standard-16" 118 // We want to strip this down to "n1-standard-16", so we only want the last 119 // component. 120 lastComponent := func(url string) string { 121 s := strings.Split(url, "/") 122 return s[len(s)-1] 123 } 124 125 // Extract network information 126 var publicIP, privateIP, vpc string 127 if len(jsonVM.NetworkInterfaces) == 0 { 128 vmErrors = append(vmErrors, vm.ErrBadNetwork) 129 } else { 130 privateIP = jsonVM.NetworkInterfaces[0].NetworkIP 131 if len(jsonVM.NetworkInterfaces[0].AccessConfigs) == 0 { 132 vmErrors = append(vmErrors, vm.ErrBadNetwork) 133 } else { 134 _ = jsonVM.NetworkInterfaces[0].AccessConfigs[0].Name // silence unused warning 135 publicIP = jsonVM.NetworkInterfaces[0].AccessConfigs[0].NatIP 136 vpc = lastComponent(jsonVM.NetworkInterfaces[0].Network) 137 } 138 } 139 140 machineType := lastComponent(jsonVM.MachineType) 141 zone := lastComponent(jsonVM.Zone) 142 remoteUser := config.SharedUser 143 if !opts.useSharedUser { 144 // N.B. gcloud uses the local username to log into instances rather 145 // than the username on the authenticated Google account but we set 146 // up the shared user at cluster creation time. Allow use of the 147 // local username if requested. 148 remoteUser = config.OSUser.Username 149 } 150 return &vm.VM{ 151 Name: jsonVM.Name, 152 CreatedAt: jsonVM.CreationTimestamp, 153 Errors: vmErrors, 154 DNS: fmt.Sprintf("%s.%s.%s", jsonVM.Name, zone, project), 155 Lifetime: lifetime, 156 PrivateIP: privateIP, 157 Provider: ProviderName, 158 ProviderID: jsonVM.Name, 159 PublicIP: publicIP, 160 RemoteUser: remoteUser, 161 VPC: vpc, 162 MachineType: machineType, 163 Zone: zone, 164 Project: project, 165 } 166 } 167 168 type jsonAuth struct { 169 Account string 170 Status string 171 } 172 173 // User-configurable, provider-specific options 174 type providerOpts struct { 175 // projects represent the GCE projects to operate on. Accessed through 176 // GetProject() or GetProjects() depending on whether the command accepts 177 // multiple projects or a single one. 178 projects []string 179 ServiceAccount string 180 MachineType string 181 Zones []string 182 Image string 183 SSDCount int 184 185 // useSharedUser indicates that the shared user rather than the personal 186 // user should be used to ssh into the remote machines. 187 useSharedUser bool 188 } 189 190 // projectsVal is the implementation for the --gce-projects flag. It populates 191 // opts.projects. 192 type projectsVal struct { 193 acceptMultipleProjects bool 194 opts *providerOpts 195 } 196 197 // defaultZones is the list of zones used by default for cluster creation. 198 // If the geo flag is specified, nodes are distributed between zones. 199 var defaultZones = []string{ 200 "us-east1-b", 201 "us-west1-b", 202 "europe-west2-b", 203 } 204 205 // Set is part of the pflag.Value interface. 206 func (v projectsVal) Set(projects string) error { 207 if projects == "" { 208 return fmt.Errorf("empty GCE project") 209 } 210 prj := strings.Split(projects, ",") 211 if !v.acceptMultipleProjects && len(prj) > 1 { 212 return fmt.Errorf("multiple GCE projects not supported for command") 213 } 214 v.opts.projects = prj 215 return nil 216 } 217 218 // Type is part of the pflag.Value interface. 219 func (v projectsVal) Type() string { 220 if v.acceptMultipleProjects { 221 return "comma-separated list of GCE projects" 222 } 223 return "GCE project name" 224 } 225 226 // String is part of the pflag.Value interface. 227 func (v projectsVal) String() string { 228 return strings.Join(v.opts.projects, ",") 229 } 230 231 func makeProviderOpts() providerOpts { 232 project := os.Getenv("GCE_PROJECT") 233 if project == "" { 234 project = defaultProject 235 } 236 return providerOpts{ 237 // projects needs space for one project, which is set by the flags for 238 // commands that accept a single project. 239 projects: []string{project}, 240 } 241 } 242 243 // GetProject returns the GCE project on which we're configured to operate. 244 // If multiple projects were configured, this panics. 245 func (p *Provider) GetProject() string { 246 o := p.opts 247 if len(o.projects) > 1 { 248 panic(fmt.Sprintf( 249 "multiple projects not supported (%d specified)", len(o.projects))) 250 } 251 return o.projects[0] 252 } 253 254 // GetProjects returns the list of GCE projects on which we're configured to 255 // operate. 256 func (p *Provider) GetProjects() []string { 257 return p.opts.projects 258 } 259 260 func (o *providerOpts) ConfigureCreateFlags(flags *pflag.FlagSet) { 261 flags.StringVar(&o.MachineType, "machine-type", "n1-standard-4", "DEPRECATED") 262 _ = flags.MarkDeprecated("machine-type", "use "+ProviderName+"-machine-type instead") 263 flags.StringSliceVar(&o.Zones, "zones", nil, "DEPRECATED") 264 _ = flags.MarkDeprecated("zones", "use "+ProviderName+"-zones instead") 265 266 flags.StringVar(&o.ServiceAccount, ProviderName+"-service-account", 267 os.Getenv("GCE_SERVICE_ACCOUNT"), "Service account to use") 268 flags.StringVar(&o.MachineType, ProviderName+"-machine-type", "n1-standard-4", 269 "Machine type (see https://cloud.google.com/compute/docs/machine-types)") 270 flags.StringSliceVar(&o.Zones, ProviderName+"-zones", nil, 271 fmt.Sprintf("Zones for cluster. If zones are formatted as AZ:N where N is an integer, the zone\n"+ 272 "will be repeated N times. If > 1 zone specified, nodes will be geo-distributed\n"+ 273 "regardless of geo (default [%s])", 274 strings.Join(defaultZones, ","))) 275 flags.StringVar(&o.Image, ProviderName+"-image", "ubuntu-1604-xenial-v20200129", 276 "Image to use to create the vm, ubuntu-1904-disco-v20191008 is a more modern image") 277 flags.IntVar(&o.SSDCount, ProviderName+"-local-ssd-count", 1, 278 "Number of local SSDs to create on GCE instance.") 279 } 280 281 func (o *providerOpts) ConfigureClusterFlags(flags *pflag.FlagSet, opt vm.MultipleProjectsOption) { 282 var usage string 283 if opt == vm.SingleProject { 284 usage = "GCE project to manage" 285 } else { 286 usage = "List of GCE projects to manage" 287 } 288 289 flags.Var( 290 projectsVal{ 291 acceptMultipleProjects: opt == vm.AcceptMultipleProjects, 292 opts: o, 293 }, 294 ProviderName+"-project", /* name */ 295 usage) 296 297 flags.BoolVar(&o.useSharedUser, 298 ProviderName+"-use-shared-user", true, 299 fmt.Sprintf("use the shared user %q for ssh rather than your user %q", 300 config.SharedUser, config.OSUser.Username)) 301 } 302 303 // Provider is the GCE implementation of the vm.Provider interface. 304 type Provider struct { 305 opts providerOpts 306 } 307 308 func makeProvider() Provider { 309 return Provider{opts: makeProviderOpts()} 310 } 311 312 // CleanSSH TODO(peter): document 313 func (p *Provider) CleanSSH() error { 314 for _, prj := range p.GetProjects() { 315 args := []string{"compute", "config-ssh", "--project", prj, "--quiet", "--remove"} 316 cmd := exec.Command("gcloud", args...) 317 318 output, err := cmd.CombinedOutput() 319 if err != nil { 320 return errors.Wrapf(err, "Command: gcloud %s\nOutput: %s", args, output) 321 } 322 } 323 return nil 324 } 325 326 // ConfigSSH TODO(peter): document 327 func (p *Provider) ConfigSSH() error { 328 for _, prj := range p.GetProjects() { 329 args := []string{"compute", "config-ssh", "--project", prj, "--quiet"} 330 cmd := exec.Command("gcloud", args...) 331 332 output, err := cmd.CombinedOutput() 333 if err != nil { 334 return errors.Wrapf(err, "Command: gcloud %s\nOutput: %s", args, output) 335 } 336 } 337 return nil 338 } 339 340 // Create TODO(peter): document 341 func (p *Provider) Create(names []string, opts vm.CreateOpts) error { 342 project := p.GetProject() 343 var gcJob bool 344 for _, prj := range projectsWithGC { 345 if prj == p.GetProject() { 346 gcJob = true 347 break 348 } 349 } 350 if !gcJob { 351 fmt.Printf("WARNING: --lifetime functionality requires "+ 352 "`roachprod gc --gce-project=%s` cronjob\n", project) 353 } 354 355 zones, err := vm.ExpandZonesFlag(p.opts.Zones) 356 if err != nil { 357 return err 358 } 359 if len(zones) == 0 { 360 if opts.GeoDistributed { 361 zones = defaultZones 362 } else { 363 zones = []string{defaultZones[0]} 364 } 365 } 366 367 // Fixed args. 368 args := []string{ 369 "compute", "instances", "create", 370 "--subnet", "default", 371 "--maintenance-policy", "MIGRATE", 372 "--scopes", "default,storage-rw", 373 "--image", p.opts.Image, 374 "--image-project", "ubuntu-os-cloud", 375 "--boot-disk-size", "10", 376 "--boot-disk-type", "pd-ssd", 377 } 378 379 if project == defaultProject && p.opts.ServiceAccount == "" { 380 p.opts.ServiceAccount = "21965078311-compute@developer.gserviceaccount.com" 381 } 382 if p.opts.ServiceAccount != "" { 383 args = append(args, "--service-account", p.opts.ServiceAccount) 384 } 385 386 extraMountOpts := "" 387 // Dynamic args. 388 if opts.SSDOpts.UseLocalSSD { 389 // n2-class and c2-class GCP machines cannot be requested with only 1 390 // SSD; minimum number of actual SSDs is 2. 391 // TODO(pbardea): This is more general for machine types that 392 // come in different sizes. 393 // See: https://cloud.google.com/compute/docs/disks/ 394 n2MachineTypes := regexp.MustCompile("^[cn]2-.+-16") 395 if n2MachineTypes.MatchString(p.opts.MachineType) && p.opts.SSDCount < 2 { 396 fmt.Fprint(os.Stderr, "WARNING: SSD count must be at least 2 for n2 and c2 machine types with 16vCPU. Setting --gce-local-ssd-count to 2.\n") 397 p.opts.SSDCount = 2 398 } 399 for i := 0; i < p.opts.SSDCount; i++ { 400 args = append(args, "--local-ssd", "interface=NVME") 401 } 402 if opts.SSDOpts.NoExt4Barrier { 403 extraMountOpts = "nobarrier" 404 } 405 } 406 407 // Create GCE startup script file. 408 filename, err := writeStartupScript(extraMountOpts) 409 if err != nil { 410 return errors.Wrapf(err, "could not write GCE startup script to temp file") 411 } 412 defer func() { 413 _ = os.Remove(filename) 414 }() 415 416 args = append(args, "--machine-type", p.opts.MachineType) 417 args = append(args, "--labels", fmt.Sprintf("lifetime=%s", opts.Lifetime)) 418 419 args = append(args, "--metadata-from-file", fmt.Sprintf("startup-script=%s", filename)) 420 args = append(args, "--project", project) 421 422 var g errgroup.Group 423 424 nodeZones := vm.ZonePlacement(len(zones), len(names)) 425 zoneHostNames := make([][]string, len(zones)) 426 for i, name := range names { 427 zone := nodeZones[i] 428 zoneHostNames[zone] = append(zoneHostNames[zone], name) 429 } 430 for i, zoneHosts := range zoneHostNames { 431 argsWithZone := append(args[:len(args):len(args)], "--zone", zones[i]) 432 argsWithZone = append(argsWithZone, zoneHosts...) 433 g.Go(func() error { 434 cmd := exec.Command("gcloud", argsWithZone...) 435 436 output, err := cmd.CombinedOutput() 437 if err != nil { 438 return errors.Wrapf(err, "Command: gcloud %s\nOutput: %s", args, output) 439 } 440 return nil 441 }) 442 443 } 444 445 return g.Wait() 446 } 447 448 // Delete TODO(peter): document 449 func (p *Provider) Delete(vms vm.List) error { 450 // Map from project to map of zone to list of machines in that project/zone. 451 projectZoneMap := make(map[string]map[string][]string) 452 for _, v := range vms { 453 if v.Provider != ProviderName { 454 return errors.Errorf("%s received VM instance from %s", ProviderName, v.Provider) 455 } 456 if projectZoneMap[v.Project] == nil { 457 projectZoneMap[v.Project] = make(map[string][]string) 458 } 459 460 projectZoneMap[v.Project][v.Zone] = append(projectZoneMap[v.Project][v.Zone], v.Name) 461 } 462 463 var g errgroup.Group 464 ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) 465 defer cancel() 466 for project, zoneMap := range projectZoneMap { 467 for zone, names := range zoneMap { 468 args := []string{ 469 "compute", "instances", "delete", 470 "--delete-disks", "all", 471 } 472 473 args = append(args, "--project", project) 474 args = append(args, "--zone", zone) 475 args = append(args, names...) 476 477 g.Go(func() error { 478 cmd := exec.CommandContext(ctx, "gcloud", args...) 479 480 output, err := cmd.CombinedOutput() 481 if err != nil { 482 return errors.Wrapf(err, "Command: gcloud %s\nOutput: %s", args, output) 483 } 484 return nil 485 }) 486 } 487 } 488 489 return g.Wait() 490 } 491 492 // Extend TODO(peter): document 493 func (p *Provider) Extend(vms vm.List, lifetime time.Duration) error { 494 // The gcloud command only takes a single instance. Unlike Delete() above, we have to 495 // perform the iteration here. 496 for _, v := range vms { 497 args := []string{"compute", "instances", "add-labels"} 498 499 args = append(args, "--project", v.Project) 500 args = append(args, "--zone", v.Zone) 501 args = append(args, "--labels", fmt.Sprintf("lifetime=%s", lifetime)) 502 args = append(args, v.Name) 503 504 cmd := exec.Command("gcloud", args...) 505 506 output, err := cmd.CombinedOutput() 507 if err != nil { 508 return errors.Wrapf(err, "Command: gcloud %s\nOutput: %s", args, output) 509 } 510 } 511 return nil 512 } 513 514 // FindActiveAccount TODO(peter): document 515 func (p *Provider) FindActiveAccount() (string, error) { 516 args := []string{"auth", "list", "--format", "json", "--filter", "status~ACTIVE"} 517 518 accounts := make([]jsonAuth, 0) 519 if err := runJSONCommand(args, &accounts); err != nil { 520 return "", err 521 } 522 523 if len(accounts) != 1 { 524 return "", fmt.Errorf("no active accounts found, please configure gcloud") 525 } 526 527 if !strings.HasSuffix(accounts[0].Account, config.EmailDomain) { 528 return "", fmt.Errorf("active account %q does no belong to domain %s", 529 accounts[0].Account, config.EmailDomain) 530 } 531 _ = accounts[0].Status // silence unused warning 532 533 username := strings.Split(accounts[0].Account, "@")[0] 534 return username, nil 535 } 536 537 // Flags TODO(peter): document 538 func (p *Provider) Flags() vm.ProviderFlags { 539 return &p.opts 540 } 541 542 // List queries gcloud to produce a list of VM info objects. 543 func (p *Provider) List() (vm.List, error) { 544 var vms vm.List 545 for _, prj := range p.GetProjects() { 546 args := []string{"compute", "instances", "list", "--project", prj, "--format", "json"} 547 548 // Run the command, extracting the JSON payload 549 jsonVMS := make([]jsonVM, 0) 550 if err := runJSONCommand(args, &jsonVMS); err != nil { 551 return nil, err 552 } 553 554 // Now, convert the json payload into our common VM type 555 for _, jsonVM := range jsonVMS { 556 vms = append(vms, *jsonVM.toVM(prj, &p.opts)) 557 } 558 } 559 560 return vms, nil 561 } 562 563 // Name TODO(peter): document 564 func (p *Provider) Name() string { 565 return ProviderName 566 } 567 568 // Active is part of the vm.Provider interface. 569 func (p *Provider) Active() bool { 570 return true 571 }