// Copyright 2017 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package platform declares the Machine, Cluster and Flight interfaces,
// the option structs shared by their implementations, and helpers that
// operate on machines over SSH.
package platform

import (
	"bytes"
	"fmt"
	"io"
	"path/filepath"
	"sync"
	"time"

	"github.com/coreos/pkg/capnslog"
	"golang.org/x/crypto/ssh"
	"golang.org/x/net/context"

	"github.com/coreos/mantle/platform/conf"
	"github.com/coreos/mantle/util"
)

// sshRetries and sshTimeout are handed to util.Retry by CheckMachine while
// it waits for a machine to answer over SSH.
// NOTE(review): presumably the attempt count and the delay between
// attempts; confirm against util.Retry.
const (
	sshRetries = 30
	sshTimeout = 10 * time.Second
)

var (
	// plog is the capnslog package logger for this package.
	plog = capnslog.NewPackageLogger("github.com/coreos/mantle", "platform")
)

// Name is a unique identifier for a platform.
type Name string

// Machine represents a Container Linux instance.
type Machine interface {
	// ID returns the platform-specific machine identifier.
	ID() string

	// IP returns the machine's public IP.
	IP() string

	// PrivateIP returns the machine's private IP.
	PrivateIP() string

	// RuntimeConf returns the cluster's runtime configuration.
	RuntimeConf() RuntimeConfig

	// SSHClient establishes a new SSH connection to the machine.
	SSHClient() (*ssh.Client, error)

	// PasswordSSHClient establishes a new SSH connection using the provided credentials.
	PasswordSSHClient(user string, password string) (*ssh.Client, error)

	// SSH runs a single command over a new SSH connection. Callers in this
	// file treat the first returned slice as the command's stdout and the
	// second as its stderr.
	SSH(cmd string) ([]byte, []byte, error)

	// Reboot restarts the machine and waits for it to come back.
	Reboot() error

	// Destroy terminates the machine and frees associated resources. It should log
	// any failures; since they are not actionable, it does not return an error.
	Destroy()

	// ConsoleOutput returns the machine's console output if available,
	// or an empty string. Only expected to be valid after Destroy().
	ConsoleOutput() string

	// JournalOutput returns the machine's journal output if available,
	// or an empty string. Only expected to be valid after Destroy().
	JournalOutput() string
}

// Cluster represents a cluster of machines within a single Flight.
type Cluster interface {
	// Platform returns the name of the platform.
	Platform() Name

	// Name returns a unique name for the Cluster.
	Name() string

	// NewMachine creates a new Container Linux machine.
	NewMachine(userdata *conf.UserData) (Machine, error)

	// Machines returns a slice of the active machines in the Cluster.
	Machines() []Machine

	// GetDiscoveryURL returns a new etcd discovery URL.
	GetDiscoveryURL(size int) (string, error)

	// Destroy terminates each machine in the cluster and frees any other
	// associated resources. It should log any failures; since they are not
	// actionable, it does not return an error.
	Destroy()

	// ConsoleOutput returns a map of console output from destroyed
	// cluster machines.
	ConsoleOutput() map[string]string

	// JournalOutput returns a map of journal output from destroyed
	// cluster machines.
	JournalOutput() map[string]string

	// IgnitionVersion returns the version of Ignition supported by the
	// cluster.
	IgnitionVersion() string
}

// Flight represents a group of Clusters within a single platform.
type Flight interface {
	// NewCluster creates a new Cluster.
	NewCluster(rconf *RuntimeConfig) (Cluster, error)

	// Name returns a unique name for the Flight.
	Name() string

	// Platform returns the name of the platform.
	Platform() Name

	// Clusters returns a slice of the active Clusters.
	Clusters() []Cluster

	// Destroy terminates each cluster and frees any other associated
	// resources. It should log any failures; since they are not
	// actionable, it does not return an error.
	Destroy()
}

// SystemdDropin is a userdata-type-agnostic struct representing a systemd
// drop-in.
// NOTE(review): presumably Unit names the unit being overridden, Name the
// drop-in file and Contents its body; confirm against the code that
// renders userdata.
type SystemdDropin struct {
	Unit     string
	Name     string
	Contents string
}

// Options contains the base options for all clusters.
type Options struct {
	BaseName        string
	Distribution    string
	IgnitionVersion string
	SystemdDropins  []SystemdDropin

	// OSContainer is an image pull spec that can be given to the pivot service
	// in RHCOS machines to perform machine content upgrades.
	// When specified, additional files & units will be automatically generated
	// inside of RenderUserData.
	OSContainer string
}

// RuntimeConfig contains cluster-specific configuration.
161 type RuntimeConfig struct { 162 OutputDir string 163 164 NoSSHKeyInUserData bool // don't inject SSH key into Ignition/cloud-config 165 NoSSHKeyInMetadata bool // don't add SSH key to platform metadata 166 NoEnableSelinux bool // don't enable selinux when starting or rebooting a machine 167 AllowFailedUnits bool // don't fail CheckMachine if a systemd unit has failed 168 } 169 170 // Wrap a StdoutPipe as a io.ReadCloser 171 type sshPipe struct { 172 s *ssh.Session 173 c *ssh.Client 174 err *bytes.Buffer 175 io.Reader 176 } 177 178 func (p *sshPipe) Close() error { 179 if err := p.s.Wait(); err != nil { 180 return fmt.Errorf("%s: %s", err, p.err) 181 } 182 if err := p.s.Close(); err != nil { 183 return err 184 } 185 return p.c.Close() 186 } 187 188 // Copy a file between two machines in a cluster. 189 func TransferFile(src Machine, srcPath string, dst Machine, dstPath string) error { 190 srcPipe, err := ReadFile(src, srcPath) 191 if err != nil { 192 return err 193 } 194 defer srcPipe.Close() 195 196 if err := InstallFile(srcPipe, dst, dstPath); err != nil { 197 return err 198 } 199 return nil 200 } 201 202 // ReadFile returns a io.ReadCloser that streams the requested file. The 203 // caller should close the reader when finished. 
204 func ReadFile(m Machine, path string) (io.ReadCloser, error) { 205 client, err := m.SSHClient() 206 if err != nil { 207 return nil, fmt.Errorf("failed creating SSH client: %v", err) 208 } 209 210 session, err := client.NewSession() 211 if err != nil { 212 client.Close() 213 return nil, fmt.Errorf("failed creating SSH session: %v", err) 214 } 215 216 // connect session stdout 217 stdoutPipe, err := session.StdoutPipe() 218 if err != nil { 219 session.Close() 220 client.Close() 221 return nil, err 222 } 223 224 // collect stderr 225 errBuf := bytes.NewBuffer(nil) 226 session.Stderr = errBuf 227 228 // stream file to stdout 229 err = session.Start(fmt.Sprintf("sudo cat %s", path)) 230 if err != nil { 231 session.Close() 232 client.Close() 233 return nil, err 234 } 235 236 // pass stdoutPipe as a io.ReadCloser that cleans up the ssh session 237 // on when closed. 238 return &sshPipe{session, client, errBuf, stdoutPipe}, nil 239 } 240 241 // InstallFile copies data from in to the path to on m. 242 func InstallFile(in io.Reader, m Machine, to string) error { 243 dir := filepath.Dir(to) 244 out, stderr, err := m.SSH(fmt.Sprintf("sudo mkdir -p %s", dir)) 245 if err != nil { 246 return fmt.Errorf("failed creating directory %s: %s: %s", dir, stderr, err) 247 } 248 249 client, err := m.SSHClient() 250 if err != nil { 251 return fmt.Errorf("failed creating SSH client: %v", err) 252 } 253 254 defer client.Close() 255 256 session, err := client.NewSession() 257 if err != nil { 258 return fmt.Errorf("failed creating SSH session: %v", err) 259 } 260 261 defer session.Close() 262 263 // write file to fs from stdin 264 session.Stdin = in 265 out, err = session.CombinedOutput(fmt.Sprintf("sudo install -m 0755 /dev/stdin %s", to)) 266 if err != nil { 267 return fmt.Errorf("failed executing install: %q: %v", out, err) 268 } 269 270 return nil 271 } 272 273 // NewMachines spawns n instances in cluster c, with 274 // each instance passed the same userdata. 
275 func NewMachines(c Cluster, userdata *conf.UserData, n int) ([]Machine, error) { 276 var wg sync.WaitGroup 277 278 mchan := make(chan Machine, n) 279 errchan := make(chan error, n) 280 281 for i := 0; i < n; i++ { 282 wg.Add(1) 283 go func() { 284 defer wg.Done() 285 m, err := c.NewMachine(userdata) 286 if err != nil { 287 errchan <- err 288 } 289 if m != nil { 290 mchan <- m 291 } 292 }() 293 } 294 295 wg.Wait() 296 close(mchan) 297 close(errchan) 298 299 machs := []Machine{} 300 301 for m := range mchan { 302 machs = append(machs, m) 303 } 304 305 if firsterr, ok := <-errchan; ok { 306 for _, m := range machs { 307 m.Destroy() 308 } 309 return nil, firsterr 310 } 311 312 return machs, nil 313 } 314 315 // CheckMachine tests a machine for various error conditions such as ssh 316 // being available and no systemd units failing at the time ssh is reachable. 317 // It also ensures the remote system is running Container Linux by CoreOS or 318 // Red Hat CoreOS. 319 // 320 // TODO(mischief): better error messages. 321 func CheckMachine(ctx context.Context, m Machine) error { 322 // ensure ssh works and the system is ready 323 sshChecker := func() error { 324 if err := ctx.Err(); err != nil { 325 return err 326 } 327 out, stderr, err := m.SSH("systemctl is-system-running") 328 if !bytes.Contains([]byte("initializing starting running stopping"), out) { 329 return nil // stop retrying if the system went haywire 330 } 331 if err != nil { 332 return fmt.Errorf("could not check if machine is running: %s: %v: %s", out, err, stderr) 333 } 334 return nil 335 } 336 337 if err := util.Retry(sshRetries, sshTimeout, sshChecker); err != nil { 338 return fmt.Errorf("ssh unreachable: %v", err) 339 } 340 341 out, stderr, err := m.SSH(`. 
/etc/os-release && echo "$ID-$VARIANT_ID"`) 342 if err != nil { 343 return fmt.Errorf("no /etc/os-release file: %v: %s", err, stderr) 344 } 345 346 // ensure we're talking to a supported system 347 switch string(out) { 348 case `coreos-`, `rhcos-`, `fedora-coreos`: 349 break 350 default: 351 return fmt.Errorf("not a supported instance: %v", string(out)) 352 } 353 354 if !m.RuntimeConf().AllowFailedUnits { 355 // ensure no systemd units failed during boot 356 out, stderr, err = m.SSH("systemctl --no-legend --state failed list-units") 357 if err != nil { 358 return fmt.Errorf("systemctl: %s: %v: %s", out, err, stderr) 359 } 360 if len(out) > 0 { 361 return fmt.Errorf("some systemd units failed:\n%s", out) 362 } 363 } 364 365 return ctx.Err() 366 }