github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachprod/vm/aws/support.go (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package aws 12 13 import ( 14 "bytes" 15 "encoding/json" 16 "io/ioutil" 17 "log" 18 "os/exec" 19 "strings" 20 "text/template" 21 22 "github.com/cockroachdb/cockroach/pkg/cmd/roachprod/vm" 23 "github.com/cockroachdb/errors" 24 ) 25 26 // Both M5 and I3 machines expose their EBS or local SSD volumes as NVMe block 27 // devices, but the actual device numbers vary a bit between the two types. 28 // This user-data script will create a filesystem, mount the data volume, and 29 // chmod 777. 30 // https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/nvme-ebs-volumes.html 31 // 32 // This is a template because the instantiator needs to optionally configure the 33 // mounting options. The script cannot take arguments since it is to be invoked 34 // by the aws tool which cannot pass args. 35 const awsStartupScriptTemplate = `#!/usr/bin/env bash 36 # Script for setting up a AWS machine for roachprod use. 37 38 set -x 39 sudo apt-get update 40 sudo apt-get install -qy --no-install-recommends mdadm 41 42 mount_opts="discard,defaults" 43 {{if .ExtraMountOpts}}mount_opts="${mount_opts},{{.ExtraMountOpts}}"{{end}} 44 45 disks=() 46 mountpoint="/mnt/data1" 47 # On different machine types, the drives are either called nvme... or xvdd. 48 for d in $(ls /dev/nvme?n1 /dev/xvdd); do 49 if ! mount | grep ${d}; then 50 disks+=("${d}") 51 echo "Disk ${d} not mounted, creating..." 52 else 53 echo "Disk ${d} already mounted, skipping..." 54 fi 55 done 56 if [ "${#disks[@]}" -eq "0" ]; then 57 echo "No disks mounted, creating ${mountpoint}" 58 mkdir -p ${mountpoint} 59 chmod 777 ${mountpoint} 60 elif [ "${#disks[@]}" -eq "1" ]; then 61 echo "One disk mounted, creating ${mountpoint}" 62 mkdir -p ${mountpoint} 63 disk=${disks[0]} 64 mkfs.ext4 -E nodiscard ${disk} 65 mount -o ${mount_opts} ${disk} ${mountpoint} 66 chmod 777 ${mountpoint} 67 echo "${disk} ${mountpoint} ext4 ${mount_opts} 1 1" | tee -a /etc/fstab 68 else 69 echo "${#disks[@]} disks mounted, creating ${mountpoint} using RAID 0" 70 mkdir -p ${mountpoint} 71 raiddisk="/dev/md0" 72 mdadm --create ${raiddisk} --level=0 --raid-devices=${#disks[@]} "${disks[@]}" 73 mkfs.ext4 -E nodiscard ${raiddisk} 74 mount -o ${mount_opts} ${raiddisk} ${mountpoint} 75 chmod 777 ${mountpoint} 76 echo "${raiddisk} ${mountpoint} ext4 ${mount_opts} 1 1" | tee -a /etc/fstab 77 fi 78 79 sudo apt-get install -qy chrony 80 echo -e "\nserver 169.254.169.123 prefer iburst" | sudo tee -a /etc/chrony/chrony.conf 81 echo -e "\nmakestep 0.1 3" | sudo tee -a /etc/chrony/chrony.conf 82 sudo /etc/init.d/chrony restart 83 sudo chronyc -a waitsync 30 0.01 | sudo tee -a /root/chrony.log 84 85 # sshguard can prevent frequent ssh connections to the same host. Disable it. 86 sudo service sshguard stop 87 # increase the number of concurrent unauthenticated connections to the sshd 88 # daemon. See https://en.wikibooks.org/wiki/OpenSSH/Cookbook/Load_Balancing. 89 # By default, only 10 unauthenticated connections are permitted before sshd 90 # starts randomly dropping connections. 91 sudo sh -c 'echo "MaxStartups 64:30:128" >> /etc/ssh/sshd_config' 92 # Crank up the logging for issues such as: 93 # https://github.com/cockroachdb/cockroach/issues/36929 94 sudo sed -i'' 's/LogLevel.*$/LogLevel DEBUG3/' /etc/ssh/sshd_config 95 sudo service sshd restart 96 # increase the default maximum number of open file descriptors for 97 # root and non-root users. Load generators running a lot of concurrent 98 # workers bump into this often. 99 sudo sh -c 'echo "root - nofile 65536\n* - nofile 65536" > /etc/security/limits.d/10-roachprod-nofiles.conf' 100 101 # Enable core dumps 102 cat <<EOF > /etc/security/limits.d/core_unlimited.conf 103 * soft core unlimited 104 * hard core unlimited 105 root soft core unlimited 106 root hard core unlimited 107 EOF 108 109 mkdir -p /mnt/data1/cores 110 chmod a+w /mnt/data1/cores 111 CORE_PATTERN="/mnt/data1/cores/core.%e.%p.%h.%t" 112 echo "$CORE_PATTERN" > /proc/sys/kernel/core_pattern 113 sed -i'~' 's/enabled=1/enabled=0/' /etc/default/apport 114 sed -i'~' '/.*kernel\\.core_pattern.*/c\\' /etc/sysctl.conf 115 echo "kernel.core_pattern=$CORE_PATTERN" >> /etc/sysctl.conf 116 117 sysctl --system # reload sysctl settings 118 119 sudo touch /mnt/data1/.roachprod-initialized 120 ` 121 122 // writeStartupScript writes the startup script to a temp file. 123 // Returns the path to the file. 124 // After use, the caller should delete the temp file. 125 // 126 // extraMountOpts, if not empty, is appended to the default mount options. It is 127 // a comma-separated list of options for the "mount -o" flag. 128 func writeStartupScript(extraMountOpts string) (string, error) { 129 type tmplParams struct { 130 ExtraMountOpts string 131 } 132 133 args := tmplParams{ExtraMountOpts: extraMountOpts} 134 135 tmpfile, err := ioutil.TempFile("", "aws-startup-script") 136 if err != nil { 137 return "", err 138 } 139 defer tmpfile.Close() 140 141 t := template.Must(template.New("start").Parse(awsStartupScriptTemplate)) 142 if err := t.Execute(tmpfile, args); err != nil { 143 return "", err 144 } 145 return tmpfile.Name(), nil 146 } 147 148 // runCommand is used to invoke an AWS command. 149 func (p *Provider) runCommand(args []string) ([]byte, error) { 150 151 if p.opts.Profile != "" { 152 args = append(args[:len(args):len(args)], "--profile", p.opts.Profile) 153 } 154 var stderrBuf bytes.Buffer 155 cmd := exec.Command("aws", args...) 156 cmd.Stderr = &stderrBuf 157 output, err := cmd.Output() 158 if err != nil { 159 if exitErr := (*exec.ExitError)(nil); errors.As(err, &exitErr) { 160 log.Println(string(exitErr.Stderr)) 161 } 162 return nil, errors.Wrapf(err, "failed to run: aws %s: stderr: %v", 163 strings.Join(args, " "), stderrBuf.String()) 164 } 165 return output, nil 166 } 167 168 // runJSONCommand invokes an aws command and parses the json output. 169 func (p *Provider) runJSONCommand(args []string, parsed interface{}) error { 170 // Force json output in case the user has overridden the default behavior. 171 args = append(args[:len(args):len(args)], "--output", "json") 172 rawJSON, err := p.runCommand(args) 173 if err != nil { 174 return err 175 } 176 if err := json.Unmarshal(rawJSON, &parsed); err != nil { 177 return errors.Wrapf(err, "failed to parse json %s", rawJSON) 178 } 179 180 return nil 181 } 182 183 // regionMap collates VM instances by their region. 184 func regionMap(vms vm.List) (map[string]vm.List, error) { 185 // Fan out the work by region 186 byRegion := make(map[string]vm.List) 187 for _, m := range vms { 188 region, err := zoneToRegion(m.Zone) 189 if err != nil { 190 return nil, err 191 } 192 byRegion[region] = append(byRegion[region], m) 193 } 194 return byRegion, nil 195 } 196 197 // zoneToRegion converts an availability zone like us-east-2a to the zone name us-east-2 198 func zoneToRegion(zone string) (string, error) { 199 return zone[0 : len(zone)-1], nil 200 }