github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachprod/vm/gce/utils.go (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package gce 12 13 import ( 14 "bufio" 15 "bytes" 16 "fmt" 17 "io" 18 "io/ioutil" 19 "os" 20 "os/exec" 21 "text/template" 22 23 "github.com/cockroachdb/cockroach/pkg/cmd/roachprod/vm" 24 "github.com/cockroachdb/errors" 25 ) 26 27 const ( 28 dnsProject = "cockroach-shared" 29 dnsZone = "roachprod" 30 ) 31 32 // Subdomain is the DNS subdomain to in which to maintain cluster node names. 33 var Subdomain = func() string { 34 if d, ok := os.LookupEnv("ROACHPROD_DNS"); ok { 35 return d 36 } 37 return "roachprod.crdb.io" 38 }() 39 40 // Startup script used to find/format/mount all local SSDs in GCE. 41 // Each disk is mounted to /mnt/data<disknum> and chmoded to all users. 42 // 43 // This is a template because the instantiator needs to optionally configure the 44 // mounting options. The script cannot take arguments since it is to be invoked 45 // by the gcloud tool which cannot pass args. 46 const gceLocalSSDStartupScriptTemplate = `#!/usr/bin/env bash 47 # Script for setting up a GCE machine for roachprod use. 48 49 mount_opts="discard,defaults" 50 {{if .ExtraMountOpts}}mount_opts="${mount_opts},{{.ExtraMountOpts}}"{{end}} 51 52 disknum=0 53 for d in $(ls /dev/disk/by-id/google-local-*); do 54 let "disknum++" 55 grep -e "${d}" /etc/fstab > /dev/null 56 if [ $? -ne 0 ]; then 57 echo "Disk ${disknum}: ${d} not mounted, creating..." 58 mountpoint="/mnt/data${disknum}" 59 sudo mkdir -p "${mountpoint}" 60 sudo mkfs.ext4 -F ${d} 61 sudo mount -o ${mount_opts} ${d} ${mountpoint} 62 echo "${d} ${mountpoint} ext4 ${mount_opts} 1 1" | sudo tee -a /etc/fstab 63 else 64 echo "Disk ${disknum}: ${d} already mounted, skipping..." 65 fi 66 done 67 if [ "${disknum}" -eq "0" ]; then 68 echo "No disks mounted, creating /mnt/data1" 69 sudo mkdir -p /mnt/data1 70 fi 71 72 sudo chmod 777 /mnt/data1 73 # sshguard can prevent frequent ssh connections to the same host. Disable it. 74 sudo service sshguard stop 75 # increase the number of concurrent unauthenticated connections to the sshd 76 # daemon. See https://en.wikibooks.org/wiki/OpenSSH/Cookbook/Load_Balancing. 77 # By default, only 10 unauthenticated connections are permitted before sshd 78 # starts randomly dropping connections. 79 sudo sh -c 'echo "MaxStartups 64:30:128" >> /etc/ssh/sshd_config' 80 # Crank up the logging for issues such as: 81 # https://github.com/cockroachdb/cockroach/issues/36929 82 sudo sed -i'' 's/LogLevel.*$/LogLevel DEBUG3/' /etc/ssh/sshd_config 83 sudo service sshd restart 84 # increase the default maximum number of open file descriptors for 85 # root and non-root users. Load generators running a lot of concurrent 86 # workers bump into this often. 87 sudo sh -c 'echo "root - nofile 65536\n* - nofile 65536" > /etc/security/limits.d/10-roachprod-nofiles.conf' 88 89 # Send TCP keepalives every minute since GCE will terminate idle connections 90 # after 10m. Note that keepalives still need to be requested by the application 91 # with the SO_KEEPALIVE socket option. 92 cat <<EOF > /etc/sysctl.d/99-roachprod-tcp-keepalive.conf 93 net.ipv4.tcp_keepalive_time=60 94 net.ipv4.tcp_keepalive_intvl=60 95 net.ipv4.tcp_keepalive_probes=5 96 EOF 97 98 # Enable core dumps 99 cat <<EOF > /etc/security/limits.d/core_unlimited.conf 100 * soft core unlimited 101 * hard core unlimited 102 root soft core unlimited 103 root hard core unlimited 104 EOF 105 106 mkdir -p /mnt/data1/cores 107 chmod a+w /mnt/data1/cores 108 CORE_PATTERN="/mnt/data1/cores/core.%e.%p.%h.%t" 109 echo "$CORE_PATTERN" > /proc/sys/kernel/core_pattern 110 sed -i'~' 's/enabled=1/enabled=0/' /etc/default/apport 111 sed -i'~' '/.*kernel\\.core_pattern.*/c\\' /etc/sysctl.conf 112 echo "kernel.core_pattern=$CORE_PATTERN" >> /etc/sysctl.conf 113 114 sysctl --system # reload sysctl settings 115 116 sudo touch /mnt/data1/.roachprod-initialized 117 ` 118 119 // writeStartupScript writes the startup script to a temp file. 120 // Returns the path to the file. 121 // After use, the caller should delete the temp file. 122 // 123 // extraMountOpts, if not empty, is appended to the default mount options. It is 124 // a comma-separated list of options for the "mount -o" flag. 125 func writeStartupScript(extraMountOpts string) (string, error) { 126 type tmplParams struct { 127 ExtraMountOpts string 128 } 129 130 args := tmplParams{ExtraMountOpts: extraMountOpts} 131 132 tmpfile, err := ioutil.TempFile("", "gce-startup-script") 133 if err != nil { 134 return "", err 135 } 136 defer tmpfile.Close() 137 138 t := template.Must(template.New("start").Parse(gceLocalSSDStartupScriptTemplate)) 139 if err := t.Execute(tmpfile, args); err != nil { 140 return "", err 141 } 142 return tmpfile.Name(), nil 143 } 144 145 // SyncDNS replaces the configured DNS zone with the supplied hosts. 146 func SyncDNS(vms vm.List) error { 147 if Subdomain == "" { 148 return nil 149 } 150 151 f, err := ioutil.TempFile(os.ExpandEnv("$HOME/.roachprod/"), "dns.bind") 152 if err != nil { 153 return err 154 } 155 defer f.Close() 156 defer func() { 157 if err := os.Remove(f.Name()); err != nil { 158 fmt.Fprintf(os.Stderr, "removing %s failed: %v", f.Name(), err) 159 } 160 }() 161 for _, vm := range vms { 162 if len(vm.Name) < 60 { 163 fmt.Fprintf(f, "%s 60 IN A %s\n", vm.Name, vm.PublicIP) 164 } 165 } 166 f.Close() 167 168 args := []string{"--project", dnsProject, "dns", "record-sets", "import", 169 "-z", dnsZone, "--delete-all-existing", "--zone-file-format", f.Name()} 170 cmd := exec.Command("gcloud", args...) 171 output, err := cmd.CombinedOutput() 172 173 return errors.Wrapf(err, "Command: gcloud %s\nOutput: %s", args, output) 174 } 175 176 // GetUserAuthorizedKeys retreives reads a list of user public keys from the 177 // gcloud cockroach-ephemeral project and returns them formatted for use in 178 // an authorized_keys file. 179 func GetUserAuthorizedKeys() (authorizedKeys []byte, err error) { 180 var outBuf bytes.Buffer 181 // The below command will return a stream of user:pubkey as text. 182 cmd := exec.Command("gcloud", "compute", "project-info", "describe", 183 "--project=cockroach-ephemeral", 184 "--format=value(commonInstanceMetadata.ssh-keys)") 185 cmd.Stderr = os.Stderr 186 cmd.Stdout = &outBuf 187 if err := cmd.Run(); err != nil { 188 return nil, err 189 } 190 // Initialize a bufio.Reader with a large enough buffer that we will never 191 // expect a line prefix when processing lines and can return an error if a 192 // call to ReadLine ever returns a prefix. 193 var pubKeyBuf bytes.Buffer 194 r := bufio.NewReaderSize(&outBuf, 1<<16 /* 64 kB */) 195 for { 196 line, isPrefix, err := r.ReadLine() 197 if err == io.EOF { 198 break 199 } 200 if err != nil { 201 return nil, err 202 } 203 if isPrefix { 204 return nil, fmt.Errorf("unexpectedly failed to read public key line") 205 } 206 if len(line) == 0 { 207 continue 208 } 209 colonIdx := bytes.IndexRune(line, ':') 210 if colonIdx == -1 { 211 return nil, fmt.Errorf("malformed public key line %q", string(line)) 212 } 213 // Skip users named "root" or "ubuntu" which don't correspond to humans 214 // and should be removed from the gcloud project. 215 if name := string(line[:colonIdx]); name == "root" || name == "ubuntu" { 216 continue 217 } 218 pubKeyBuf.Write(line[colonIdx+1:]) 219 pubKeyBuf.WriteRune('\n') 220 } 221 return pubKeyBuf.Bytes(), nil 222 }