github.com/coreos/mantle@v0.13.0/platform/platform.go (about)

     1  // Copyright 2017 CoreOS, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package platform
    16  
    17  import (
    18  	"bytes"
    19  	"fmt"
    20  	"io"
    21  	"path/filepath"
    22  	"sync"
    23  	"time"
    24  
    25  	"github.com/coreos/pkg/capnslog"
    26  	"golang.org/x/crypto/ssh"
    27  	"golang.org/x/net/context"
    28  
    29  	"github.com/coreos/mantle/platform/conf"
    30  	"github.com/coreos/mantle/util"
    31  )
    32  
// Retry parameters handed to util.Retry by CheckMachine while it waits for a
// machine to answer over SSH.
const (
	// sshRetries is the first argument to util.Retry
	// (presumably the maximum number of attempts — confirm against util.Retry).
	sshRetries = 30
	// sshTimeout is the second argument to util.Retry
	// (presumably the delay between attempts — confirm against util.Retry).
	sshTimeout = 10 * time.Second
)
    37  
var (
	// plog is the capnslog package logger registered for the "platform"
	// package of the github.com/coreos/mantle repository.
	plog = capnslog.NewPackageLogger("github.com/coreos/mantle", "platform")
)
    41  
// Name is a unique identifier for a platform. It is the value returned by
// Cluster.Platform and Flight.Platform.
type Name string
    44  
// Machine represents a Container Linux instance.
type Machine interface {
	// ID returns the platform-specific machine identifier.
	ID() string

	// IP returns the machine's public IP.
	IP() string

	// PrivateIP returns the machine's private IP.
	PrivateIP() string

	// RuntimeConf returns the cluster's runtime configuration.
	RuntimeConf() RuntimeConfig

	// SSHClient establishes a new SSH connection to the machine. The caller
	// owns the returned client and is responsible for closing it.
	SSHClient() (*ssh.Client, error)

	// PasswordSSHClient establishes a new SSH connection using the provided credentials.
	PasswordSSHClient(user string, password string) (*ssh.Client, error)

	// SSH runs a single command over a new SSH connection. Callers in this
	// package treat the first result as the command's stdout and the second
	// as its stderr.
	SSH(cmd string) ([]byte, []byte, error)

	// Reboot restarts the machine and waits for it to come back.
	Reboot() error

	// Destroy terminates the machine and frees associated resources. It should log
	// any failures; since they are not actionable, it does not return an error.
	Destroy()

	// ConsoleOutput returns the machine's console output if available,
	// or an empty string.  Only expected to be valid after Destroy().
	ConsoleOutput() string

	// JournalOutput returns the machine's journal output if available,
	// or an empty string.  Only expected to be valid after Destroy().
	JournalOutput() string
}
    83  
// Cluster represents a cluster of machines within a single Flight.
type Cluster interface {
	// Platform returns the name of the platform.
	Platform() Name

	// Name returns a unique name for the Cluster.
	Name() string

	// NewMachine creates a new Container Linux machine configured with the
	// given userdata.
	NewMachine(userdata *conf.UserData) (Machine, error)

	// Machines returns a slice of the active machines in the Cluster.
	Machines() []Machine

	// GetDiscoveryURL returns a new etcd discovery URL
	// (size is presumably the expected etcd cluster size — confirm with
	// implementations).
	GetDiscoveryURL(size int) (string, error)

	// Destroy terminates each machine in the cluster and frees any other
	// associated resources. It should log any failures; since they are not
	// actionable, it does not return an error.
	Destroy()

	// ConsoleOutput returns a map of console output from destroyed
	// cluster machines.
	ConsoleOutput() map[string]string

	// JournalOutput returns a map of journal output from destroyed
	// cluster machines.
	JournalOutput() map[string]string

	// IgnitionVersion returns the version of Ignition supported by the
	// cluster.
	IgnitionVersion() string
}
   118  
// Flight represents a group of Clusters within a single platform.
type Flight interface {
	// NewCluster creates a new Cluster using the given runtime configuration.
	NewCluster(rconf *RuntimeConfig) (Cluster, error)

	// Name returns a unique name for the Flight.
	Name() string

	// Platform returns the name of the platform.
	Platform() Name

	// Clusters returns a slice of the active Clusters.
	Clusters() []Cluster

	// Destroy terminates each cluster and frees any other associated
	// resources.  It should log any failures; since they are not
	// actionable, it does not return an error.
	Destroy()
}
   138  
// SystemdDropin is a userdata-type-agnostic struct representing a systemd
// drop-in.
type SystemdDropin struct {
	// Unit is presumably the systemd unit the drop-in applies to — confirm
	// against the code that renders userdata.
	Unit string
	// Name is presumably the file name of the drop-in — confirm.
	Name string
	// Contents is presumably the body of the drop-in file — confirm.
	Contents string
}
   145  
// Options contains the base options for all clusters.
type Options struct {
	// NOTE(review): the first three fields are not used in this file; their
	// exact meaning should be confirmed against the platform implementations.
	BaseName        string
	Distribution    string
	IgnitionVersion string
	// SystemdDropins lists the systemd drop-ins associated with these options.
	SystemdDropins []SystemdDropin

	// OSContainer is an image pull spec that can be given to the pivot service
	// in RHCOS machines to perform machine content upgrades.
	// When specified, additional files and units will be automatically
	// generated inside of RenderUserData.
	OSContainer string
}
   159  
// RuntimeConfig contains cluster-specific configuration. Machines expose it
// through Machine.RuntimeConf.
type RuntimeConfig struct {
	// OutputDir is presumably the directory that receives cluster output —
	// it is not used in this file; confirm against platform implementations.
	OutputDir string

	NoSSHKeyInUserData bool // don't inject SSH key into Ignition/cloud-config
	NoSSHKeyInMetadata bool // don't add SSH key to platform metadata
	NoEnableSelinux    bool // don't enable selinux when starting or rebooting a machine
	AllowFailedUnits   bool // don't fail CheckMachine if a systemd unit has failed
}
   169  
// sshPipe wraps the stdout pipe of an SSH session as an io.ReadCloser.
// Reads are served by the embedded io.Reader; the Close method waits for the
// remote command and is responsible for tearing down the session and client.
//
// ReadFile builds this struct with a positional literal, so the field order
// must not change.
type sshPipe struct {
	// s is the session running the remote command.
	s *ssh.Session
	// c is the client connection the session was created on.
	c *ssh.Client
	// err collects the remote command's stderr; it is appended to the error
	// returned by Close when the command fails.
	err *bytes.Buffer
	// Reader is the stdout stream of the remote command.
	io.Reader
}
   177  
   178  func (p *sshPipe) Close() error {
   179  	if err := p.s.Wait(); err != nil {
   180  		return fmt.Errorf("%s: %s", err, p.err)
   181  	}
   182  	if err := p.s.Close(); err != nil {
   183  		return err
   184  	}
   185  	return p.c.Close()
   186  }
   187  
   188  // Copy a file between two machines in a cluster.
   189  func TransferFile(src Machine, srcPath string, dst Machine, dstPath string) error {
   190  	srcPipe, err := ReadFile(src, srcPath)
   191  	if err != nil {
   192  		return err
   193  	}
   194  	defer srcPipe.Close()
   195  
   196  	if err := InstallFile(srcPipe, dst, dstPath); err != nil {
   197  		return err
   198  	}
   199  	return nil
   200  }
   201  
   202  // ReadFile returns a io.ReadCloser that streams the requested file. The
   203  // caller should close the reader when finished.
   204  func ReadFile(m Machine, path string) (io.ReadCloser, error) {
   205  	client, err := m.SSHClient()
   206  	if err != nil {
   207  		return nil, fmt.Errorf("failed creating SSH client: %v", err)
   208  	}
   209  
   210  	session, err := client.NewSession()
   211  	if err != nil {
   212  		client.Close()
   213  		return nil, fmt.Errorf("failed creating SSH session: %v", err)
   214  	}
   215  
   216  	// connect session stdout
   217  	stdoutPipe, err := session.StdoutPipe()
   218  	if err != nil {
   219  		session.Close()
   220  		client.Close()
   221  		return nil, err
   222  	}
   223  
   224  	// collect stderr
   225  	errBuf := bytes.NewBuffer(nil)
   226  	session.Stderr = errBuf
   227  
   228  	// stream file to stdout
   229  	err = session.Start(fmt.Sprintf("sudo cat %s", path))
   230  	if err != nil {
   231  		session.Close()
   232  		client.Close()
   233  		return nil, err
   234  	}
   235  
   236  	// pass stdoutPipe as a io.ReadCloser that cleans up the ssh session
   237  	// on when closed.
   238  	return &sshPipe{session, client, errBuf, stdoutPipe}, nil
   239  }
   240  
   241  // InstallFile copies data from in to the path to on m.
   242  func InstallFile(in io.Reader, m Machine, to string) error {
   243  	dir := filepath.Dir(to)
   244  	out, stderr, err := m.SSH(fmt.Sprintf("sudo mkdir -p %s", dir))
   245  	if err != nil {
   246  		return fmt.Errorf("failed creating directory %s: %s: %s", dir, stderr, err)
   247  	}
   248  
   249  	client, err := m.SSHClient()
   250  	if err != nil {
   251  		return fmt.Errorf("failed creating SSH client: %v", err)
   252  	}
   253  
   254  	defer client.Close()
   255  
   256  	session, err := client.NewSession()
   257  	if err != nil {
   258  		return fmt.Errorf("failed creating SSH session: %v", err)
   259  	}
   260  
   261  	defer session.Close()
   262  
   263  	// write file to fs from stdin
   264  	session.Stdin = in
   265  	out, err = session.CombinedOutput(fmt.Sprintf("sudo install -m 0755 /dev/stdin %s", to))
   266  	if err != nil {
   267  		return fmt.Errorf("failed executing install: %q: %v", out, err)
   268  	}
   269  
   270  	return nil
   271  }
   272  
   273  // NewMachines spawns n instances in cluster c, with
   274  // each instance passed the same userdata.
   275  func NewMachines(c Cluster, userdata *conf.UserData, n int) ([]Machine, error) {
   276  	var wg sync.WaitGroup
   277  
   278  	mchan := make(chan Machine, n)
   279  	errchan := make(chan error, n)
   280  
   281  	for i := 0; i < n; i++ {
   282  		wg.Add(1)
   283  		go func() {
   284  			defer wg.Done()
   285  			m, err := c.NewMachine(userdata)
   286  			if err != nil {
   287  				errchan <- err
   288  			}
   289  			if m != nil {
   290  				mchan <- m
   291  			}
   292  		}()
   293  	}
   294  
   295  	wg.Wait()
   296  	close(mchan)
   297  	close(errchan)
   298  
   299  	machs := []Machine{}
   300  
   301  	for m := range mchan {
   302  		machs = append(machs, m)
   303  	}
   304  
   305  	if firsterr, ok := <-errchan; ok {
   306  		for _, m := range machs {
   307  			m.Destroy()
   308  		}
   309  		return nil, firsterr
   310  	}
   311  
   312  	return machs, nil
   313  }
   314  
   315  // CheckMachine tests a machine for various error conditions such as ssh
   316  // being available and no systemd units failing at the time ssh is reachable.
   317  // It also ensures the remote system is running Container Linux by CoreOS or
   318  // Red Hat CoreOS.
   319  //
   320  // TODO(mischief): better error messages.
   321  func CheckMachine(ctx context.Context, m Machine) error {
   322  	// ensure ssh works and the system is ready
   323  	sshChecker := func() error {
   324  		if err := ctx.Err(); err != nil {
   325  			return err
   326  		}
   327  		out, stderr, err := m.SSH("systemctl is-system-running")
   328  		if !bytes.Contains([]byte("initializing starting running stopping"), out) {
   329  			return nil // stop retrying if the system went haywire
   330  		}
   331  		if err != nil {
   332  			return fmt.Errorf("could not check if machine is running: %s: %v: %s", out, err, stderr)
   333  		}
   334  		return nil
   335  	}
   336  
   337  	if err := util.Retry(sshRetries, sshTimeout, sshChecker); err != nil {
   338  		return fmt.Errorf("ssh unreachable: %v", err)
   339  	}
   340  
   341  	out, stderr, err := m.SSH(`. /etc/os-release && echo "$ID-$VARIANT_ID"`)
   342  	if err != nil {
   343  		return fmt.Errorf("no /etc/os-release file: %v: %s", err, stderr)
   344  	}
   345  
   346  	// ensure we're talking to a supported system
   347  	switch string(out) {
   348  	case `coreos-`, `rhcos-`, `fedora-coreos`:
   349  		break
   350  	default:
   351  		return fmt.Errorf("not a supported instance: %v", string(out))
   352  	}
   353  
   354  	if !m.RuntimeConf().AllowFailedUnits {
   355  		// ensure no systemd units failed during boot
   356  		out, stderr, err = m.SSH("systemctl --no-legend --state failed list-units")
   357  		if err != nil {
   358  			return fmt.Errorf("systemctl: %s: %v: %s", out, err, stderr)
   359  		}
   360  		if len(out) > 0 {
   361  			return fmt.Errorf("some systemd units failed:\n%s", out)
   362  		}
   363  	}
   364  
   365  	return ctx.Err()
   366  }