github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachprod/vm/gce/utils.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package gce
    12  
    13  import (
    14  	"bufio"
    15  	"bytes"
    16  	"fmt"
    17  	"io"
    18  	"io/ioutil"
    19  	"os"
    20  	"os/exec"
    21  	"text/template"
    22  
    23  	"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/vm"
    24  	"github.com/cockroachdb/errors"
    25  )
    26  
    27  const (
    28  	dnsProject = "cockroach-shared"
    29  	dnsZone    = "roachprod"
    30  )
    31  
    32  // Subdomain is the DNS subdomain to in which to maintain cluster node names.
    33  var Subdomain = func() string {
    34  	if d, ok := os.LookupEnv("ROACHPROD_DNS"); ok {
    35  		return d
    36  	}
    37  	return "roachprod.crdb.io"
    38  }()
    39  
// gceLocalSSDStartupScriptTemplate is the text/template source of the bash
// startup script used to set up a GCE machine for roachprod use. The rendered
// script, in order:
//
//   - Walks /dev/disk/by-id/google-local-* and, for every device not already
//     listed in /etc/fstab, formats it as ext4, mounts it at
//     /mnt/data<disknum> and appends it to /etc/fstab. If no such device
//     exists, it only creates the directory /mnt/data1.
//   - Makes /mnt/data1 world-accessible (chmod 777). Note that only
//     /mnt/data1 is chmoded, not the other mount points.
//   - Stops sshguard, raises sshd's MaxStartups and LogLevel, and restarts
//     sshd.
//   - Raises the open-file limit, configures TCP keepalives, and enables core
//     dumps, which are written under /mnt/data1/cores.
//   - Touches /mnt/data1/.roachprod-initialized as its final step.
//
// This is a template because the instantiator needs to optionally configure the
// mounting options. The script cannot take arguments since it is to be invoked
// by the gcloud tool which cannot pass args. The only template parameter is
// .ExtraMountOpts which, when non-empty, is appended to the default
// "discard,defaults" mount options.
//
// NOTE(review): the sed expression that removes kernel.core_pattern from
// /etc/sysctl.conf contains doubled backslashes even though this is a raw
// string literal; confirm that it matches as intended.
const gceLocalSSDStartupScriptTemplate = `#!/usr/bin/env bash
# Script for setting up a GCE machine for roachprod use.

mount_opts="discard,defaults"
{{if .ExtraMountOpts}}mount_opts="${mount_opts},{{.ExtraMountOpts}}"{{end}}

disknum=0
for d in $(ls /dev/disk/by-id/google-local-*); do
  let "disknum++"
  grep -e "${d}" /etc/fstab > /dev/null
  if [ $? -ne 0 ]; then
    echo "Disk ${disknum}: ${d} not mounted, creating..."
    mountpoint="/mnt/data${disknum}"
    sudo mkdir -p "${mountpoint}"
    sudo mkfs.ext4 -F ${d}
    sudo mount -o ${mount_opts} ${d} ${mountpoint}
    echo "${d} ${mountpoint} ext4 ${mount_opts} 1 1" | sudo tee -a /etc/fstab
  else
    echo "Disk ${disknum}: ${d} already mounted, skipping..."
  fi
done
if [ "${disknum}" -eq "0" ]; then
  echo "No disks mounted, creating /mnt/data1"
  sudo mkdir -p /mnt/data1
fi

sudo chmod 777 /mnt/data1
# sshguard can prevent frequent ssh connections to the same host. Disable it.
sudo service sshguard stop
# increase the number of concurrent unauthenticated connections to the sshd
# daemon. See https://en.wikibooks.org/wiki/OpenSSH/Cookbook/Load_Balancing.
# By default, only 10 unauthenticated connections are permitted before sshd
# starts randomly dropping connections.
sudo sh -c 'echo "MaxStartups 64:30:128" >> /etc/ssh/sshd_config'
# Crank up the logging for issues such as:
# https://github.com/cockroachdb/cockroach/issues/36929
sudo sed -i'' 's/LogLevel.*$/LogLevel DEBUG3/' /etc/ssh/sshd_config
sudo service sshd restart
# increase the default maximum number of open file descriptors for
# root and non-root users. Load generators running a lot of concurrent
# workers bump into this often.
sudo sh -c 'echo "root - nofile 65536\n* - nofile 65536" > /etc/security/limits.d/10-roachprod-nofiles.conf'

# Send TCP keepalives every minute since GCE will terminate idle connections
# after 10m. Note that keepalives still need to be requested by the application
# with the SO_KEEPALIVE socket option.
cat <<EOF > /etc/sysctl.d/99-roachprod-tcp-keepalive.conf
net.ipv4.tcp_keepalive_time=60
net.ipv4.tcp_keepalive_intvl=60
net.ipv4.tcp_keepalive_probes=5
EOF

# Enable core dumps
cat <<EOF > /etc/security/limits.d/core_unlimited.conf
* soft core unlimited
* hard core unlimited
root soft core unlimited
root hard core unlimited
EOF

mkdir -p /mnt/data1/cores
chmod a+w /mnt/data1/cores
CORE_PATTERN="/mnt/data1/cores/core.%e.%p.%h.%t"
echo "$CORE_PATTERN" > /proc/sys/kernel/core_pattern
sed -i'~' 's/enabled=1/enabled=0/' /etc/default/apport
sed -i'~' '/.*kernel\\.core_pattern.*/c\\' /etc/sysctl.conf
echo "kernel.core_pattern=$CORE_PATTERN" >> /etc/sysctl.conf

sysctl --system  # reload sysctl settings

sudo touch /mnt/data1/.roachprod-initialized
`
   118  
   119  // writeStartupScript writes the startup script to a temp file.
   120  // Returns the path to the file.
   121  // After use, the caller should delete the temp file.
   122  //
   123  // extraMountOpts, if not empty, is appended to the default mount options. It is
   124  // a comma-separated list of options for the "mount -o" flag.
   125  func writeStartupScript(extraMountOpts string) (string, error) {
   126  	type tmplParams struct {
   127  		ExtraMountOpts string
   128  	}
   129  
   130  	args := tmplParams{ExtraMountOpts: extraMountOpts}
   131  
   132  	tmpfile, err := ioutil.TempFile("", "gce-startup-script")
   133  	if err != nil {
   134  		return "", err
   135  	}
   136  	defer tmpfile.Close()
   137  
   138  	t := template.Must(template.New("start").Parse(gceLocalSSDStartupScriptTemplate))
   139  	if err := t.Execute(tmpfile, args); err != nil {
   140  		return "", err
   141  	}
   142  	return tmpfile.Name(), nil
   143  }
   144  
   145  // SyncDNS replaces the configured DNS zone with the supplied hosts.
   146  func SyncDNS(vms vm.List) error {
   147  	if Subdomain == "" {
   148  		return nil
   149  	}
   150  
   151  	f, err := ioutil.TempFile(os.ExpandEnv("$HOME/.roachprod/"), "dns.bind")
   152  	if err != nil {
   153  		return err
   154  	}
   155  	defer f.Close()
   156  	defer func() {
   157  		if err := os.Remove(f.Name()); err != nil {
   158  			fmt.Fprintf(os.Stderr, "removing %s failed: %v", f.Name(), err)
   159  		}
   160  	}()
   161  	for _, vm := range vms {
   162  		if len(vm.Name) < 60 {
   163  			fmt.Fprintf(f, "%s 60 IN A %s\n", vm.Name, vm.PublicIP)
   164  		}
   165  	}
   166  	f.Close()
   167  
   168  	args := []string{"--project", dnsProject, "dns", "record-sets", "import",
   169  		"-z", dnsZone, "--delete-all-existing", "--zone-file-format", f.Name()}
   170  	cmd := exec.Command("gcloud", args...)
   171  	output, err := cmd.CombinedOutput()
   172  
   173  	return errors.Wrapf(err, "Command: gcloud %s\nOutput: %s", args, output)
   174  }
   175  
   176  // GetUserAuthorizedKeys retreives reads a list of user public keys from the
   177  // gcloud cockroach-ephemeral project and returns them formatted for use in
   178  // an authorized_keys file.
   179  func GetUserAuthorizedKeys() (authorizedKeys []byte, err error) {
   180  	var outBuf bytes.Buffer
   181  	// The below command will return a stream of user:pubkey as text.
   182  	cmd := exec.Command("gcloud", "compute", "project-info", "describe",
   183  		"--project=cockroach-ephemeral",
   184  		"--format=value(commonInstanceMetadata.ssh-keys)")
   185  	cmd.Stderr = os.Stderr
   186  	cmd.Stdout = &outBuf
   187  	if err := cmd.Run(); err != nil {
   188  		return nil, err
   189  	}
   190  	// Initialize a bufio.Reader with a large enough buffer that we will never
   191  	// expect a line prefix when processing lines and can return an error if a
   192  	// call to ReadLine ever returns a prefix.
   193  	var pubKeyBuf bytes.Buffer
   194  	r := bufio.NewReaderSize(&outBuf, 1<<16 /* 64 kB */)
   195  	for {
   196  		line, isPrefix, err := r.ReadLine()
   197  		if err == io.EOF {
   198  			break
   199  		}
   200  		if err != nil {
   201  			return nil, err
   202  		}
   203  		if isPrefix {
   204  			return nil, fmt.Errorf("unexpectedly failed to read public key line")
   205  		}
   206  		if len(line) == 0 {
   207  			continue
   208  		}
   209  		colonIdx := bytes.IndexRune(line, ':')
   210  		if colonIdx == -1 {
   211  			return nil, fmt.Errorf("malformed public key line %q", string(line))
   212  		}
   213  		// Skip users named "root" or "ubuntu" which don't correspond to humans
   214  		// and should be removed from the gcloud project.
   215  		if name := string(line[:colonIdx]); name == "root" || name == "ubuntu" {
   216  			continue
   217  		}
   218  		pubKeyBuf.Write(line[colonIdx+1:])
   219  		pubKeyBuf.WriteRune('\n')
   220  	}
   221  	return pubKeyBuf.Bytes(), nil
   222  }