github.com/jlmeeker/kismatic@v1.10.1-0.20180612190640-57f9005a1f1a/pkg/install/execute.go

     1  package install
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"io"
     7  	"io/ioutil"
     8  	"os"
     9  	"path/filepath"
    10  	"time"
    11  
    12  	"strings"
    13  
    14  	"github.com/apprenda/kismatic/pkg/ansible"
    15  	"github.com/apprenda/kismatic/pkg/install/explain"
    16  	"github.com/apprenda/kismatic/pkg/tls"
    17  	"github.com/apprenda/kismatic/pkg/util"
    18  )
    19  
    20  // The PreFlightExecutor will run pre-flight checks against the
    21  // environment defined in the plan file
    22  type PreFlightExecutor interface {
    23  	RunPreFlightCheck(plan *Plan, nodes ...string) error
    24  	RunNewNodePreFlightCheck(Plan, Node) error
    25  	RunUpgradePreFlightCheck(*Plan, ListableNode) error
    26  }
    27  
    28  // The Executor will carry out the installation plan
    29  type Executor interface {
    30  	PreFlightExecutor
    31  	Install(plan *Plan, restartServices bool, nodes ...string) error
    32  	Reset(plan *Plan, nodes ...string) error
    33  	GenerateCertificates(p *Plan, useExistingCA bool) error
    34  	RunSmokeTest(*Plan) error
    35  	AddNode(plan *Plan, node Node, roles []string, restartServices bool) (*Plan, error)
    36  	RunPlay(name string, plan *Plan, restartServices bool, nodes ...string) error
    37  	AddVolume(*Plan, StorageVolume) error
    38  	DeleteVolume(*Plan, string) error
    39  	UpgradeNodes(plan Plan, nodesToUpgrade []ListableNode, onlineUpgrade bool, maxParallelWorkers int, restartServices bool) error
    40  	ValidateControlPlane(plan Plan) error
    41  	UpgradeClusterServices(plan Plan) error
    42  }
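
        // ansibleExecutor, defined later in this file, is the concrete type returned by
        // NewExecutor, NewPreFlightExecutor, and NewDiagnosticsExecutor.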
    43  
    44  // DiagnosticsExecutor will run diagnostics on the nodes after an install
    45  type DiagnosticsExecutor interface {
    46  	DiagnoseNodes(plan Plan) error
    47  }
    48  
    49  // ExecutorOptions are used to configure the executor
    50  type ExecutorOptions struct {
    51  	// GeneratedAssetsDirectory is the location where generated assets
    52  	// are to be stored
    53  	GeneratedAssetsDirectory string
    54  	// OutputFormat sets the console output format of the executor ("raw" or "simple")
    55  	OutputFormat string
    56  	// Verbose output from the executor
    57  	Verbose bool
    58  	// RunsDirectory is where information about installation runs is kept
    59  	RunsDirectory string
    60  	// DiagnosticsDirecty is the directory where diagnostics information about the cluster will be dumped
    61  	DiagnosticsDirecty string
    62  	// DryRun determines if the executor should actually run the task
    63  	DryRun bool
    64  }
    65  
    66  // NewExecutor returns an executor for performing installations according to the installation plan.
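        // A minimal usage sketch (hypothetical values; assumes a populated *Plan named plan):
        //
        //	opts := ExecutorOptions{
        //		GeneratedAssetsDirectory: "generated", // hypothetical directory
        //		OutputFormat:             "simple",
        //	}
        //	executor, err := NewExecutor(os.Stdout, os.Stderr, opts)
        //	if err != nil {
        //		// handle error
        //	}
        //	if err := executor.Install(plan, false); err != nil {
        //		// handle error
        //	}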
    67  func NewExecutor(stdout io.Writer, errOut io.Writer, options ExecutorOptions) (Executor, error) {
    68  	ansibleDir := "ansible"
    69  	if options.GeneratedAssetsDirectory == "" {
    70  		return nil, fmt.Errorf("GeneratedAssetsDirectory option cannot be empty")
    71  	}
    72  	if options.RunsDirectory == "" {
    73  		options.RunsDirectory = "./runs"
    74  	}
    75  
    76  	// Setup the console output format
    77  	var outFormat ansible.OutputFormat
    78  	switch options.OutputFormat {
    79  	case "raw":
    80  		outFormat = ansible.RawFormat
    81  	case "simple":
    82  		outFormat = ansible.JSONLinesFormat
    83  	default:
    84  		return nil, fmt.Errorf("Output format %q is not supported", options.OutputFormat)
    85  	}
    86  	certsDir := filepath.Join(options.GeneratedAssetsDirectory, "keys")
    87  	pki := &LocalPKI{
    88  		CACsr: filepath.Join(ansibleDir, "playbooks", "tls", "ca-csr.json"),
    89  		GeneratedCertsDirectory: certsDir,
    90  		Log: stdout,
    91  	}
    92  	return &ansibleExecutor{
    93  		options:             options,
    94  		stdout:              stdout,
    95  		consoleOutputFormat: outFormat,
    96  		ansibleDir:          ansibleDir,
    97  		certsDir:            certsDir,
    98  		pki:                 pki,
    99  	}, nil
   100  }
   101  
   102  // NewPreFlightExecutor returns an executor for running the preflight checks
   103  func NewPreFlightExecutor(stdout io.Writer, errOut io.Writer, options ExecutorOptions) (PreFlightExecutor, error) {
   104  	ansibleDir := "ansible"
   105  	if options.RunsDirectory == "" {
   106  		options.RunsDirectory = "./runs"
   107  	}
   108  	// Setup the console output format
   109  	var outFormat ansible.OutputFormat
   110  	switch options.OutputFormat {
   111  	case "raw":
   112  		outFormat = ansible.RawFormat
   113  	case "simple":
   114  		outFormat = ansible.JSONLinesFormat
   115  	default:
   116  		return nil, fmt.Errorf("Output format %q is not supported", options.OutputFormat)
   117  	}
   118  
   119  	return &ansibleExecutor{
   120  		options:             options,
   121  		stdout:              stdout,
   122  		consoleOutputFormat: outFormat,
   123  		ansibleDir:          ansibleDir,
   124  	}, nil
   125  }
   126  
   127  // NewDiagnosticsExecutor returns an executor for running diagnostics against the cluster nodes
   128  func NewDiagnosticsExecutor(stdout io.Writer, errOut io.Writer, options ExecutorOptions) (DiagnosticsExecutor, error) {
   129  	ansibleDir := "ansible"
   130  	if options.RunsDirectory == "" {
   131  		options.RunsDirectory = "./runs"
   132  	}
   133  	if options.DiagnosticsDirecty == "" {
   134  		wd, err := os.Getwd()
   135  		if err != nil {
   136  			return nil, fmt.Errorf("Could not get working directory: %v", err)
   137  		}
   138  		options.DiagnosticsDirecty = filepath.Join(wd, "diagnostics")
   139  	}
   140  
   141  	// Setup the console output format
   142  	var outFormat ansible.OutputFormat
   143  	switch options.OutputFormat {
   144  	case "raw":
   145  		outFormat = ansible.RawFormat
   146  	case "simple":
   147  		outFormat = ansible.JSONLinesFormat
   148  	default:
   149  		return nil, fmt.Errorf("Output format %q is not supported", options.OutputFormat)
   150  	}
   151  
   152  	return &ansibleExecutor{
   153  		options:             options,
   154  		stdout:              stdout,
   155  		consoleOutputFormat: outFormat,
   156  		ansibleDir:          ansibleDir,
   157  	}, nil
   158  }
   159  
   160  type ansibleExecutor struct {
   161  	options             ExecutorOptions
   162  	stdout              io.Writer
   163  	consoleOutputFormat ansible.OutputFormat
   164  	ansibleDir          string
   165  	certsDir            string
   166  	pki                 PKI
   167  
   168  	// Hook for testing purposes; the default implementation is used at runtime
   169  	runnerExplainerFactory func(explain.AnsibleEventExplainer, io.Writer) (ansible.Runner, *explain.AnsibleEventStreamExplainer, error)
   170  }
   171  
   172  type task struct {
   173  	// name of the task used for the runs dir
   174  	name string
   175  	// the inventory of nodes to use
   176  	inventory ansible.Inventory
   177  	// the cluster catalog to use
   178  	clusterCatalog ansible.ClusterCatalog
   179  	// the playbook filename
   180  	playbook string
   181  	// the explainer to use
   182  	explainer explain.AnsibleEventExplainer
   183  	// the plan
   184  	plan Plan
   185  	// run the task on specific nodes
   186  	limit []string
   187  }
   188  
   189  // execute runs the given task, setting up everything that is needed to run ansible.
   190  func (ae *ansibleExecutor) execute(t task) error {
   191  	if ae.options.DryRun {
   192  		return nil
   193  	}
   194  	runDirectory, err := ae.createRunDirectory(t.name)
   195  	if err != nil {
   196  		return fmt.Errorf("error creating working directory for %q: %v", t.name, err)
   197  	}
   198  	// Save the plan file that was used for this execution
   199  	fp := FilePlanner{
   200  		File: filepath.Join(runDirectory, "kismatic-cluster.yaml"),
   201  	}
   202  	if err = fp.Write(&t.plan); err != nil {
   203  		return fmt.Errorf("error recording plan file to %s: %v", fp.File, err)
   204  	}
   205  	ansibleLogFilename := filepath.Join(runDirectory, "ansible.log")
   206  	ansibleLogFile, err := os.Create(ansibleLogFilename)
   207  	if err != nil {
   208  		return fmt.Errorf("error creating ansible log file %q: %v", ansibleLogFilename, err)
   209  	}
   210  	runner, explainer, err := ae.ansibleRunnerWithExplainer(t.explainer, ansibleLogFile, runDirectory)
   211  	if err != nil {
   212  		return err
   213  	}
   214  
   215  	// Start running ansible with the given playbook
   216  	var eventStream <-chan ansible.Event
   217  	if len(t.limit) > 0 {
   218  		eventStream, err = runner.StartPlaybookOnNode(t.playbook, t.inventory, t.clusterCatalog, t.limit...)
   219  	} else {
   220  		eventStream, err = runner.StartPlaybook(t.playbook, t.inventory, t.clusterCatalog)
   221  	}
   222  	if err != nil {
   223  		return fmt.Errorf("error running ansible playbook: %v", err)
   224  	}
   225  	// Ansible blocks until explainer starts reading from stream. Start
   226  	// explainer in a separate goroutine
   227  	go explainer.Explain(eventStream)
   228  
   229  	// Wait until ansible exits
   230  	if err = runner.WaitPlaybook(); err != nil {
   231  		return fmt.Errorf("error running playbook: %v", err)
   232  	}
   233  	return nil
   234  }
   235  
   236  // GenerateCertificates generates keys and certificates for the cluster, if needed
   237  func (ae *ansibleExecutor) GenerateCertificates(p *Plan, useExistingCA bool) error {
   238  	if err := os.MkdirAll(ae.certsDir, 0777); err != nil {
   239  		return fmt.Errorf("error creating directory %s for storing TLS assets: %v", ae.certsDir, err)
   240  	}
   241  
   242  	// Generate cluster Certificate Authority
   243  	util.PrintHeader(ae.stdout, "Configuring Certificates", '=')
   244  
   245  	var clusterCACert *tls.CA
   246  	var err error
   247  	if useExistingCA {
   248  		exists, err := ae.pki.CertificateAuthorityExists()
   249  		if err != nil {
   250  			return fmt.Errorf("error checking if CA exists: %v", err)
   251  		}
   252  		if !exists {
   253  			return errors.New("The Certificate Authority is required, but it was not found.")
   254  		}
   255  		clusterCACert, err = ae.pki.GetClusterCA()
   256  		if err != nil {
   257  			return fmt.Errorf("error reading CA certificate: %v", err)
   258  		}
   259  
   260  	} else {
   261  		clusterCACert, err = ae.pki.GenerateClusterCA(p)
   262  		if err != nil {
   263  			return fmt.Errorf("error generating CA for the cluster: %v", err)
   264  		}
   265  	}
   266  
   267  	proxyClientCACert, err := ae.pki.GenerateProxyClientCA(p)
   268  	if err != nil {
   269  		return fmt.Errorf("error generating CA for the proxy client: %v", err)
   270  	}
   271  
   272  	// Generate node and user certificates
   273  	err = ae.pki.GenerateClusterCertificates(p, clusterCACert, proxyClientCACert)
   274  	if err != nil {
   275  		return fmt.Errorf("error generating certificates for the cluster: %v", err)
   276  	}
   277  
   278  	util.PrettyPrintOk(ae.stdout, "Cluster certificates can be found in the %q directory", ae.options.GeneratedAssetsDirectory)
   279  	return nil
   280  }
   281  
   282  // Install the cluster according to the installation plan
   283  func (ae *ansibleExecutor) Install(p *Plan, restartServices bool, nodes ...string) error {
   284  	// Build the ansible inventory
   285  	cc, err := ae.buildClusterCatalog(p)
   286  	if err != nil {
   287  		return err
   288  	}
   289  	if restartServices {
   290  		cc.EnableRestart()
   291  	}
   292  	t := task{
   293  		name:           "apply",
   294  		playbook:       "kubernetes.yaml",
   295  		plan:           *p,
   296  		inventory:      buildInventoryFromPlan(p),
   297  		clusterCatalog: *cc,
   298  		explainer:      ae.defaultExplainer(),
   299  		limit:          nodes,
   300  	}
   301  	util.PrintHeader(ae.stdout, "Installing Cluster", '=')
   302  	return ae.execute(t)
   303  }
   304  
   305  func (ae *ansibleExecutor) Reset(p *Plan, nodes ...string) error {
   306  	cc, err := ae.buildClusterCatalog(p)
   307  	if err != nil {
   308  		return err
   309  	}
   310  	t := task{
   311  		name:           "reset",
   312  		playbook:       "reset.yaml",
   313  		explainer:      ae.defaultExplainer(),
   314  		plan:           *p,
   315  		inventory:      buildInventoryFromPlan(p),
   316  		clusterCatalog: *cc,
   317  		limit:          nodes,
   318  	}
   319  	util.PrintHeader(ae.stdout, "Resetting Nodes in the Cluster", '=')
   320  	return ae.execute(t)
   321  }
   322  
   323  func (ae *ansibleExecutor) RunSmokeTest(p *Plan) error {
   324  	cc, err := ae.buildClusterCatalog(p)
   325  	if err != nil {
   326  		return err
   327  	}
   328  	t := task{
   329  		name:           "smoketest",
   330  		playbook:       "smoketest.yaml",
   331  		explainer:      ae.defaultExplainer(),
   332  		plan:           *p,
   333  		inventory:      buildInventoryFromPlan(p),
   334  		clusterCatalog: *cc,
   335  	}
   336  	util.PrintHeader(ae.stdout, "Running Smoke Test", '=')
   337  	return ae.execute(t)
   338  }
   339  
   340  // RunPreFlightCheck runs the pre-flight checks against the nodes defined in the plan
   341  func (ae *ansibleExecutor) RunPreFlightCheck(p *Plan, nodes ...string) error {
   342  	cc, err := ae.buildClusterCatalog(p)
   343  	if err != nil {
   344  		return err
   345  	}
   346  	t := task{
   347  		name:           "preflight",
   348  		playbook:       "preflight.yaml",
   349  		inventory:      buildInventoryFromPlan(p),
   350  		clusterCatalog: *cc,
   351  		explainer:      ae.preflightExplainer(),
   352  		plan:           *p,
   353  		limit:          nodes,
   354  	}
   355  	return ae.execute(t)
   356  }
   357  
   358  // RunNewNodePreFlightCheck runs the preflight checks against a new node
   359  func (ae *ansibleExecutor) RunNewNodePreFlightCheck(p Plan, node Node) error {
   360  	cc, err := ae.buildClusterCatalog(&p)
   361  	if err != nil {
   362  		return err
   363  	}
   364  	t := task{
   365  		name:           "copy-inspector",
   366  		playbook:       "copy-inspector.yaml",
   367  		inventory:      buildInventoryFromPlan(&p),
   368  		clusterCatalog: *cc,
   369  		explainer:      ae.preflightExplainer(),
   370  		plan:           p,
   371  	}
   372  	if err := ae.execute(t); err != nil {
   373  		return err
   374  	}
   375  
   376  	p.Worker.ExpectedCount++
   377  	p.Worker.Nodes = append(p.Worker.Nodes, node)
   378  	t = task{
   379  		name:           "add-node-preflight",
   380  		playbook:       "preflight.yaml",
   381  		inventory:      buildInventoryFromPlan(&p),
   382  		clusterCatalog: *cc,
   383  		explainer:      ae.preflightExplainer(),
   384  		plan:           p,
   385  		limit:          []string{node.Host},
   386  	}
   387  	return ae.execute(t)
   388  }
   389  
   390  func (ae *ansibleExecutor) RunUpgradePreFlightCheck(p *Plan, node ListableNode) error {
   391  	inventory := buildInventoryFromPlan(p)
   392  	cc, err := ae.buildClusterCatalog(p)
   393  	if err != nil {
   394  		return err
   395  	}
   396  	t := task{
   397  		name:           "copy-inspector",
   398  		playbook:       "copy-inspector.yaml",
   399  		inventory:      buildInventoryFromPlan(p),
   400  		clusterCatalog: *cc,
   401  		explainer:      ae.preflightExplainer(),
   402  		plan:           *p,
   403  	}
   404  	if err := ae.execute(t); err != nil {
   405  		return err
   406  	}
   407  	t = task{
   408  		name:           "upgrade-preflight",
   409  		playbook:       "upgrade-preflight.yaml",
   410  		explainer:      ae.preflightExplainer(),
   411  		plan:           *p,
   412  		inventory:      inventory,
   413  		clusterCatalog: *cc,
   414  		limit:          []string{node.Node.Host},
   415  	}
   416  	return ae.execute(t)
   417  }
   418  
   419  func (ae *ansibleExecutor) RunPlay(playName string, p *Plan, restartServices bool, nodes ...string) error {
   420  	cc, err := ae.buildClusterCatalog(p)
   421  	if err != nil {
   422  		return err
   423  	}
   424  	if restartServices {
   425  		cc.EnableRestart()
   426  	}
   427  	t := task{
   428  		name:           "step",
   429  		playbook:       playName,
   430  		inventory:      buildInventoryFromPlan(p),
   431  		clusterCatalog: *cc,
   432  		explainer:      ae.defaultExplainer(),
   433  		plan:           *p,
   434  		limit:          nodes,
   435  	}
   436  	return ae.execute(t)
   437  }
   438  
   439  func (ae *ansibleExecutor) AddVolume(plan *Plan, volume StorageVolume) error {
   440  	// Validate that there are enough storage nodes to satisfy the request
   441  	nodesRequired := volume.ReplicateCount * volume.DistributionCount
   442  	if nodesRequired > len(plan.Storage.Nodes) {
   443  		return fmt.Errorf("the requested volume configuration requires %d storage nodes, but the cluster only has %d.", nodesRequired, len(plan.Storage.Nodes))
   444  	}
   445  
   446  	cc, err := ae.buildClusterCatalog(plan)
   447  	if err != nil {
   448  		return err
   449  	}
   450  	// Add storage related vars
   451  	cc.VolumeName = volume.Name
   452  	cc.VolumeReplicaCount = volume.ReplicateCount
   453  	cc.VolumeDistributionCount = volume.DistributionCount
   454  	cc.VolumeStorageClass = volume.StorageClass
   455  	cc.VolumeQuotaGB = volume.SizeGB
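        	// The requested size is converted to bytes using binary units: 1 << (10*3) = 2^30 bytes per GB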
   456  	cc.VolumeQuotaBytes = volume.SizeGB * (1 << (10 * 3))
   457  	cc.VolumeMount = "/"
   458  	cc.VolumeReclaimPolicy = volume.ReclaimPolicy
   459  	cc.VolumeAccessModes = volume.AccessModes
   460  
   461  	// Allow nodes and pods to access volumes
   462  	allowedNodes := plan.Master.Nodes
   463  	allowedNodes = append(allowedNodes, plan.Worker.Nodes...)
   464  	allowedNodes = append(allowedNodes, plan.Ingress.Nodes...)
   465  	allowedNodes = append(allowedNodes, plan.Storage.Nodes...)
   466  
   467  	allowed := volume.AllowAddresses
   468  	allowed = append(allowed, plan.Cluster.Networking.PodCIDRBlock)
   469  	for _, n := range allowedNodes {
   470  		ip := n.IP
   471  		if n.InternalIP != "" {
   472  			ip = n.InternalIP
   473  		}
   474  		allowed = append(allowed, ip)
   475  	}
   476  	cc.VolumeAllowedIPs = strings.Join(allowed, ",")
   477  
   478  	t := task{
   479  		name:           "add-volume",
   480  		playbook:       "volume-add.yaml",
   481  		plan:           *plan,
   482  		inventory:      buildInventoryFromPlan(plan),
   483  		clusterCatalog: *cc,
   484  		explainer:      ae.defaultExplainer(),
   485  	}
   486  	util.PrintHeader(ae.stdout, "Add Persistent Storage Volume", '=')
   487  	return ae.execute(t)
   488  }
   489  
   490  func (ae *ansibleExecutor) DeleteVolume(plan *Plan, name string) error {
   491  	cc, err := ae.buildClusterCatalog(plan)
   492  	if err != nil {
   493  		return err
   494  	}
   495  	// Add storage related vars
   496  	cc.VolumeName = name
   497  	cc.VolumeMount = "/"
   498  
   499  	t := task{
   500  		name:           "delete-volume",
   501  		playbook:       "volume-delete.yaml",
   502  		plan:           *plan,
   503  		inventory:      buildInventoryFromPlan(plan),
   504  		clusterCatalog: *cc,
   505  		explainer:      ae.defaultExplainer(),
   506  	}
   507  	util.PrintHeader(ae.stdout, "Delete Persistent Storage Volume", '=')
   508  	return ae.execute(t)
   509  }
   510  
   511  // UpgradeNodes upgrades the nodes of the cluster in the following phases:
   512  //   1. Etcd nodes
   513  //   2. Master nodes
   514  //   3. Worker nodes (regardless of specialization)
   515  //
   516  // When a node is being upgraded, all the components of the node are upgraded, regardless of
   517  // which phase of the upgrade we are in. For example, when upgrading a node that is both an etcd and master,
   518  // the etcd components and the master components will be upgraded when we are in the upgrade etcd nodes
   519  // phase.
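        // Worker nodes are upgraded in batches of up to maxParallelWorkers, while etcd and
        // master nodes are upgraded one at a time.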
   520  func (ae *ansibleExecutor) UpgradeNodes(plan Plan, nodesToUpgrade []ListableNode, onlineUpgrade bool, maxParallelWorkers int, restartServices bool) error {
   521  	// Nodes can have multiple roles. For this reason, we need to keep track of which nodes
   522  	// have been upgraded to avoid re-upgrading them.
   523  	upgradedNodes := map[string]bool{}
   524  	// Upgrade etcd nodes
   525  	for _, nodeToUpgrade := range nodesToUpgrade {
   526  		for _, role := range nodeToUpgrade.Roles {
   527  			if role == "etcd" {
   528  				node := nodeToUpgrade
   529  				if err := ae.upgradeNodes(plan, onlineUpgrade, restartServices, node); err != nil {
   530  					return fmt.Errorf("error upgrading node %q: %v", node.Node.Host, err)
   531  				}
   532  				upgradedNodes[node.Node.IP] = true
   533  				break
   534  			}
   535  		}
   536  	}
   537  
   538  	// Upgrade master nodes
   539  	for _, nodeToUpgrade := range nodesToUpgrade {
   540  		if upgradedNodes[nodeToUpgrade.Node.IP] {
   541  			continue
   542  		}
   543  		for _, role := range nodeToUpgrade.Roles {
   544  			if role == "master" {
   545  				node := nodeToUpgrade
   546  				if err := ae.upgradeNodes(plan, onlineUpgrade, restartServices, node); err != nil {
   547  					return fmt.Errorf("error upgrading node %q: %v", node.Node.Host, err)
   548  				}
   549  				upgradedNodes[node.Node.IP] = true
   550  				break
   551  			}
   552  		}
   553  	}
   554  
   555  	var limitNodes []ListableNode
   556  	// Upgrade the rest of the nodes
   557  	for n, nodeToUpgrade := range nodesToUpgrade {
   558  		if upgradedNodes[nodeToUpgrade.Node.IP] {
   559  			continue
   560  		}
   561  		for _, role := range nodeToUpgrade.Roles {
   562  			if role != "etcd" && role != "master" {
   563  				node := nodeToUpgrade
   564  				limitNodes = append(limitNodes, node)
   565  				// don't forget to run the remaining nodes if there are fewer than maxParallelWorkers left
   566  				if len(limitNodes) == maxParallelWorkers || n == len(nodesToUpgrade)-1 {
   567  					if err := ae.upgradeNodes(plan, onlineUpgrade, restartServices, limitNodes...); err != nil {
   568  						return fmt.Errorf("error upgrading node %q: %v", node.Node.Host, err)
   569  					}
   570  					// empty the slice
   571  					limitNodes = limitNodes[:0]
   572  				}
   573  				upgradedNodes[node.Node.IP] = true
   574  				break
   575  			}
   576  		}
   577  	}
   578  	return nil
   579  }
   580  
   581  func (ae *ansibleExecutor) upgradeNodes(plan Plan, onlineUpgrade bool, restartServices bool, nodes ...ListableNode) error {
   582  	inventory := buildInventoryFromPlan(&plan)
   583  	cc, err := ae.buildClusterCatalog(&plan)
   584  	if err != nil {
   585  		return err
   586  	}
   587  	cc.OnlineUpgrade = onlineUpgrade
   588  	if restartServices {
   589  		cc.EnableRestart()
   590  	}
   591  	var limit []string
   592  	nodeRoles := make(map[string][]string)
   593  	for _, node := range nodes {
   594  		limit = append(limit, node.Node.Host)
   595  		nodeRoles[node.Node.Host] = node.Roles
   596  	}
   597  	t := task{
   598  		name:           "upgrade-nodes",
   599  		playbook:       "upgrade-nodes.yaml",
   600  		inventory:      inventory,
   601  		clusterCatalog: *cc,
   602  		plan:           plan,
   603  		explainer:      ae.defaultExplainer(),
   604  		limit:          limit,
   605  	}
   606  	if len(limit) == 1 {
   607  		util.PrintHeader(ae.stdout, fmt.Sprintf("Upgrade Node: %s %s", limit, nodes[0].Roles), '=')
   608  	} else { // print the roles for multiple nodes
   609  		util.PrintHeader(ae.stdout, "Upgrade Nodes:", '=')
   610  		util.PrintTable(ae.stdout, nodeRoles)
   611  	}
   612  	return ae.execute(t)
   613  }
   614  
   615  func (ae *ansibleExecutor) ValidateControlPlane(plan Plan) error {
   616  	inventory := buildInventoryFromPlan(&plan)
   617  	cc, err := ae.buildClusterCatalog(&plan)
   618  	if err != nil {
   619  		return err
   620  	}
   621  	t := task{
   622  		name:           "validate-control-plane",
   623  		playbook:       "validate-control-plane.yaml",
   624  		inventory:      inventory,
   625  		clusterCatalog: *cc,
   626  		plan:           plan,
   627  		explainer:      ae.defaultExplainer(),
   628  	}
   629  	return ae.execute(t)
   630  }
   631  
   632  func (ae *ansibleExecutor) UpgradeClusterServices(plan Plan) error {
   633  	inventory := buildInventoryFromPlan(&plan)
   634  	cc, err := ae.buildClusterCatalog(&plan)
   635  	if err != nil {
   636  		return err
   637  	}
   638  	t := task{
   639  		name:           "upgrade-cluster-services",
   640  		playbook:       "upgrade-cluster-services.yaml",
   641  		inventory:      inventory,
   642  		clusterCatalog: *cc,
   643  		plan:           plan,
   644  		explainer:      ae.defaultExplainer(),
   645  	}
   646  	return ae.execute(t)
   647  }
   648  
   649  func (ae *ansibleExecutor) DiagnoseNodes(plan Plan) error {
   650  	inventory := buildInventoryFromPlan(&plan)
   651  	cc, err := ae.buildClusterCatalog(&plan)
   652  	if err != nil {
   653  		return err
   654  	}
   655  	// dateTime will be appended to the diagnostics directory
   656  	now := time.Now().Format("2006-01-02-15-04-05")
   657  	cc.DiagnosticsDirectory = filepath.Join(ae.options.DiagnosticsDirecty, now)
   658  	cc.DiagnosticsDateTime = now
   659  	t := task{
   660  		name:           "diagnose",
   661  		playbook:       "diagnose-nodes.yaml",
   662  		inventory:      inventory,
   663  		clusterCatalog: *cc,
   664  		plan:           plan,
   665  		explainer:      ae.defaultExplainer(),
   666  	}
   667  	return ae.execute(t)
   668  }
   669  
   670  // buildClusterCatalog creates the extra vars that are required for the installation playbook.
   671  func (ae *ansibleExecutor) buildClusterCatalog(p *Plan) (*ansible.ClusterCatalog, error) {
   672  	tlsDir, err := filepath.Abs(ae.certsDir)
   673  	if err != nil {
   674  		return nil, fmt.Errorf("failed to determine absolute path to %s: %v", ae.certsDir, err)
   675  	}
   676  
   677  	dnsIP, err := getDNSServiceIP(p)
   678  	if err != nil {
   679  		return nil, fmt.Errorf("error getting DNS service IP: %v", err)
   680  	}
   681  
   682  	cc := ansible.ClusterCatalog{
   683  		ClusterName:                   p.Cluster.Name,
   684  		AdminPassword:                 p.Cluster.AdminPassword,
   685  		TLSDirectory:                  tlsDir,
   686  		ServicesCIDR:                  p.Cluster.Networking.ServiceCIDRBlock,
   687  		PodCIDR:                       p.Cluster.Networking.PodCIDRBlock,
   688  		DNSServiceIP:                  dnsIP,
   689  		EnableModifyHosts:             p.Cluster.Networking.UpdateHostsFiles,
   690  		EnablePackageInstallation:     !p.Cluster.DisablePackageInstallation,
   691  		KismaticPreflightCheckerLinux: filepath.Join("inspector", "linux", "amd64", "kismatic-inspector"),
   692  		KuberangPath:                  filepath.Join("kuberang", "linux", "amd64", "kuberang"),
   693  		DisconnectedInstallation:      p.Cluster.DisconnectedInstallation,
   694  		HTTPProxy:                     p.Cluster.Networking.HTTPProxy,
   695  		HTTPSProxy:                    p.Cluster.Networking.HTTPSProxy,
   696  		TargetVersion:                 KismaticVersion.String(),
   697  		APIServerOptions:              p.Cluster.APIServerOptions.Overrides,
   698  		KubeControllerManagerOptions:  p.Cluster.KubeControllerManagerOptions.Overrides,
   699  		KubeSchedulerOptions:          p.Cluster.KubeSchedulerOptions.Overrides,
   700  		KubeProxyOptions:              p.Cluster.KubeProxyOptions.Overrides,
   701  		KubeletOptions:                p.Cluster.KubeletOptions.Overrides,
   702  	}
   703  
   704  	// set versions
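        	// The plan version is assumed to carry a leading "v", e.g. "v1.10.3" yields
        	// "1.10.3-0" for yum and "1.10.3-00" for deb packages.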
   705  	cc.Versions.Kubernetes = p.Cluster.Version
   706  	cc.Versions.KubernetesYum = p.Cluster.Version[1:] + "-0"
   707  	cc.Versions.KubernetesDeb = p.Cluster.Version[1:] + "-00"
   708  
   709  	cc.NoProxy = strings.Join(p.AllAddresses(), ",")
   710  	if p.Cluster.Networking.NoProxy != "" {
   711  		cc.NoProxy = cc.NoProxy + "," + p.Cluster.Networking.NoProxy
   712  	}
   713  
   714  	cc.LocalKubeconfigDirectory = filepath.Join(ae.options.GeneratedAssetsDirectory, "kubeconfig")
   715  	// absolute path required for ansible
   716  	generatedDir, err := filepath.Abs(filepath.Join(ae.options.GeneratedAssetsDirectory, "kubeconfig"))
   717  	if err != nil {
   718  		return nil, fmt.Errorf("failed to determine absolute path to %s: %v", filepath.Join(ae.options.GeneratedAssetsDirectory, "kubeconfig"), err)
   719  	}
   720  	cc.LocalKubeconfigDirectory = generatedDir
   721  
   722  	// Setup FQDN or default to first master
   723  	if p.Master.LoadBalancedFQDN != "" {
   724  		cc.LoadBalancedFQDN = p.Master.LoadBalancedFQDN
   725  	} else {
   726  		cc.LoadBalancedFQDN = p.Master.Nodes[0].InternalIP
   727  	}
   728  
   729  	if p.PrivateRegistryProvided() {
   730  		cc.ConfigureDockerWithPrivateRegistry = true
   731  		cc.DockerRegistryServer = p.DockerRegistry.Server
   732  		cc.DockerRegistryCAPath = p.DockerRegistry.CAPath
   733  		cc.DockerRegistryUsername = p.DockerRegistry.Username
   734  		cc.DockerRegistryPassword = p.DockerRegistry.Password
   735  	}
   736  
   737  	// Setup docker options
   738  	cc.Docker.Enabled = !p.Docker.Disable
   739  	cc.Docker.Logs.Driver = p.Docker.Logs.Driver
   740  	cc.Docker.Logs.Opts = p.Docker.Logs.Opts
   741  	cc.Docker.Storage.Driver = p.Docker.Storage.Driver
   742  	cc.Docker.Storage.Opts = p.Docker.Storage.Opts
   743  	cc.Docker.Storage.OptsList = []string{}
   744  	// A formatted list to set in docker daemon.json
   745  	for k, v := range p.Docker.Storage.Opts {
   746  		cc.Docker.Storage.OptsList = append(cc.Docker.Storage.OptsList, fmt.Sprintf("%s=%s", k, v))
   747  	}
   748  	cc.Docker.Storage.DirectLVMBlockDevice = ansible.DirectLVMBlockDevice{
   749  		Path:                        p.Docker.Storage.DirectLVMBlockDevice.Path,
   750  		ThinpoolPercent:             p.Docker.Storage.DirectLVMBlockDevice.ThinpoolPercent,
   751  		ThinpoolMetaPercent:         p.Docker.Storage.DirectLVMBlockDevice.ThinpoolMetaPercent,
   752  		ThinpoolAutoextendThreshold: p.Docker.Storage.DirectLVMBlockDevice.ThinpoolAutoextendThreshold,
   753  		ThinpoolAutoextendPercent:   p.Docker.Storage.DirectLVMBlockDevice.ThinpoolAutoextendPercent,
   754  	}
   755  
   756  	if p.Ingress.Nodes != nil && len(p.Ingress.Nodes) > 0 {
   757  		cc.EnableConfigureIngress = true
   758  	} else {
   759  		cc.EnableConfigureIngress = false
   760  	}
   761  
   762  	if p.NFS != nil {
   763  		for _, n := range p.NFS.Volumes {
   764  			cc.NFSVolumes = append(cc.NFSVolumes, ansible.NFSVolume{
   765  				Path: n.Path,
   766  				Host: n.Host,
   767  			})
   768  		}
   769  	}
   770  
   771  	cc.EnableGluster = p.Storage.Nodes != nil && len(p.Storage.Nodes) > 0
   772  
   773  	cc.CloudProvider = p.Cluster.CloudProvider.Provider
   774  	cc.CloudConfig = p.Cluster.CloudProvider.Config
   775  
   776  	// additional files
   777  	for _, n := range p.AdditionalFiles {
   778  		cc.AdditionalFiles = append(cc.AdditionalFiles, ansible.AdditionalFile{
   779  			Source:      n.Source,
   780  			Destination: n.Destination,
   781  			Hosts:       n.Hosts,
   782  		})
   783  	}
   784  
   785  	// add_ons
   786  	cc.RunPodValidation = p.NetworkConfigured()
   787  	// CNI
   788  	if p.AddOns.CNI != nil && !p.AddOns.CNI.Disable {
   789  		cc.CNI.Enabled = true
   790  		cc.CNI.Provider = p.AddOns.CNI.Provider
   791  		cc.CNI.Options.Portmap.Enabled = !p.AddOns.CNI.Options.Portmap.Disable
   792  		// Calico
   793  		cc.CNI.Options.Calico.Mode = p.AddOns.CNI.Options.Calico.Mode
   794  		cc.CNI.Options.Calico.LogLevel = p.AddOns.CNI.Options.Calico.LogLevel
   795  		cc.CNI.Options.Calico.WorkloadMTU = p.AddOns.CNI.Options.Calico.WorkloadMTU
   796  		cc.CNI.Options.Calico.FelixInputMTU = p.AddOns.CNI.Options.Calico.FelixInputMTU
   797  		cc.CNI.Options.Calico.IPAutodetectionMethod = p.AddOns.CNI.Options.Calico.IPAutodetectionMethod
   798  		// Weave
   799  		cc.CNI.Options.Weave.Password = p.AddOns.CNI.Options.Weave.Password
   800  		if cc.CNI.Provider == cniProviderContiv {
   801  			cc.InsecureNetworkingEtcd = true
   802  		}
   803  	}
   804  
   805  	// DNS
   806  	cc.DNS.Enabled = !p.AddOns.DNS.Disable
   807  	cc.DNS.Provider = p.AddOns.DNS.Provider
   808  	cc.DNS.Options.Replicas = p.AddOns.DNS.Options.Replicas
   809  
   810  	// heapster
   811  	if p.AddOns.HeapsterMonitoring != nil && !p.AddOns.HeapsterMonitoring.Disable {
   812  		cc.Heapster.Enabled = true
   813  		cc.Heapster.Options.Heapster.Replicas = p.AddOns.HeapsterMonitoring.Options.Heapster.Replicas
   814  		cc.Heapster.Options.Heapster.ServiceType = p.AddOns.HeapsterMonitoring.Options.Heapster.ServiceType
   815  		cc.Heapster.Options.Heapster.Sink = p.AddOns.HeapsterMonitoring.Options.Heapster.Sink
   816  		cc.Heapster.Options.InfluxDB.PVCName = p.AddOns.HeapsterMonitoring.Options.InfluxDB.PVCName
   817  	}
   818  
   819  	// metrics-server
   820  	cc.MetricsServer.Enabled = !p.AddOns.MetricsServer.Disable
   821  
   822  	// dashboard
   823  	cc.Dashboard.Enabled = !p.AddOns.Dashboard.Disable
   824  	cc.Dashboard.Options.ServiceType = p.AddOns.Dashboard.Options.ServiceType
   825  	cc.Dashboard.Options.NodePort = p.AddOns.Dashboard.Options.NodePort
   826  
   827  	// package_manager
   828  	if !p.AddOns.PackageManager.Disable {
   829  		// Currently only helm is supported
   830  		switch p.AddOns.PackageManager.Provider {
   831  		case "helm":
   832  			cc.Helm.Enabled = true
   833  		default:
   834  			cc.Helm.Enabled = true
   835  		}
   836  		cc.Helm.Namespace = p.AddOns.PackageManager.Options.Helm.Namespace
   837  	}
   838  
   839  	cc.Rescheduler.Enabled = !p.AddOns.Rescheduler.Disable
   840  
   841  	// merge node labels
   842  	// cannot use inventory file because nodes share roles
   843  	// set it to a map[host][]key=value
   844  	cc.NodeLabels = make(map[string][]string)
   845  	for _, n := range p.getAllNodes() {
   846  		if val, ok := cc.NodeLabels[n.Host]; ok {
   847  			cc.NodeLabels[n.Host] = append(val, keyValueList(n.Labels)...)
   848  		} else {
   849  			cc.NodeLabels[n.Host] = keyValueList(n.Labels)
   850  		}
   851  	}
   852  	// merge node taints
   853  	// cannot use inventory file because nodes share roles
   854  	// set it to a map[host][]key=value:effect
   855  	cc.NodeTaints = make(map[string][]string)
   856  	for _, n := range p.getAllNodes() {
   857  		if val, ok := cc.NodeTaints[n.Host]; ok {
   858  			cc.NodeTaints[n.Host] = append(val, keyValueEffectList(n.Taints)...)
   859  		} else {
   860  			cc.NodeTaints[n.Host] = keyValueEffectList(n.Taints)
   861  		}
   862  	}
   863  
   864  	// setup kubelet node overrides
   865  	cc.KubeletNodeOptions = make(map[string]map[string]string)
   866  	for _, n := range p.GetUniqueNodes() {
   867  		cc.KubeletNodeOptions[n.Host] = n.KubeletOptions.Overrides
   868  	}
   869  
   870  	return &cc, nil
   871  }
   872  
   873  func (ae *ansibleExecutor) createRunDirectory(runName string) (string, error) {
   874  	start := time.Now()
   875  	runDirectory := filepath.Join(ae.options.RunsDirectory, runName, start.Format("2006-01-02-15-04-05"))
   876  	if err := os.MkdirAll(runDirectory, 0777); err != nil {
   877  		return "", fmt.Errorf("error creating directory: %v", err)
   878  	}
   879  	return runDirectory, nil
   880  }
   881  
   882  func (ae *ansibleExecutor) ansibleRunnerWithExplainer(explainer explain.AnsibleEventExplainer, ansibleLog io.Writer, runDirectory string) (ansible.Runner, *explain.AnsibleEventStreamExplainer, error) {
   883  	if ae.runnerExplainerFactory != nil {
   884  		return ae.runnerExplainerFactory(explainer, ansibleLog)
   885  	}
   886  
   887  	// Setup sink for ansible stdout
   888  	var ansibleOut io.Writer
   889  	switch ae.consoleOutputFormat {
   890  	case ansible.JSONLinesFormat:
   891  		ansibleOut = timestampWriter(ansibleLog)
   892  	case ansible.RawFormat:
   893  		ansibleOut = io.MultiWriter(ae.stdout, timestampWriter(ansibleLog))
   894  	}
   895  
   896  	// Send stdout and stderr to ansibleOut
   897  	runner, err := ansible.NewRunner(ansibleOut, ansibleOut, ae.ansibleDir, runDirectory)
   898  	if err != nil {
   899  		return nil, nil, fmt.Errorf("error creating ansible runner: %v", err)
   900  	}
   901  
   902  	streamExplainer := &explain.AnsibleEventStreamExplainer{
   903  		EventExplainer: explainer,
   904  	}
   905  
   906  	return runner, streamExplainer, nil
   907  }
   908  
   909  func (ae *ansibleExecutor) defaultExplainer() explain.AnsibleEventExplainer {
   910  	var out io.Writer
   911  	switch ae.consoleOutputFormat {
   912  	case ansible.JSONLinesFormat:
   913  		out = ae.stdout
   914  	case ansible.RawFormat:
   915  		out = ioutil.Discard
   916  	}
   917  	return explain.DefaultExplainer(ae.options.Verbose, out)
   918  }
   919  
   920  func (ae *ansibleExecutor) preflightExplainer() explain.AnsibleEventExplainer {
   921  	var out io.Writer
   922  	switch ae.consoleOutputFormat {
   923  	case ansible.JSONLinesFormat:
   924  		out = ae.stdout
   925  	case ansible.RawFormat:
   926  		out = ioutil.Discard
   927  	}
   928  	return explain.PreflightExplainer(ae.options.Verbose, out)
   929  }
   930  
   931  func buildInventoryFromPlan(p *Plan) ansible.Inventory {
   932  	etcdNodes := []ansible.Node{}
   933  	for _, n := range p.Etcd.Nodes {
   934  		etcdNodes = append(etcdNodes, installNodeToAnsibleNode(&n, &p.Cluster.SSH))
   935  	}
   936  	masterNodes := []ansible.Node{}
   937  	for _, n := range p.Master.Nodes {
   938  		masterNodes = append(masterNodes, installNodeToAnsibleNode(&n, &p.Cluster.SSH))
   939  	}
   940  	workerNodes := []ansible.Node{}
   941  	for _, n := range p.Worker.Nodes {
   942  		workerNodes = append(workerNodes, installNodeToAnsibleNode(&n, &p.Cluster.SSH))
   943  	}
   944  	ingressNodes := []ansible.Node{}
   945  	if p.Ingress.Nodes != nil {
   946  		for _, n := range p.Ingress.Nodes {
   947  			ingressNodes = append(ingressNodes, installNodeToAnsibleNode(&n, &p.Cluster.SSH))
   948  		}
   949  	}
   950  	storageNodes := []ansible.Node{}
   951  	if p.Storage.Nodes != nil {
   952  		for _, n := range p.Storage.Nodes {
   953  			storageNodes = append(storageNodes, installNodeToAnsibleNode(&n, &p.Cluster.SSH))
   954  		}
   955  	}
   956  
   957  	inventory := ansible.Inventory{
   958  		Roles: []ansible.Role{
   959  			{
   960  				Name:  "etcd",
   961  				Nodes: etcdNodes,
   962  			},
   963  			{
   964  				Name:  "master",
   965  				Nodes: masterNodes,
   966  			},
   967  			{
   968  				Name:  "worker",
   969  				Nodes: workerNodes,
   970  			},
   971  			{
   972  				Name:  "ingress",
   973  				Nodes: ingressNodes,
   974  			},
   975  			{
   976  				Name:  "storage",
   977  				Nodes: storageNodes,
   978  			},
   979  		},
   980  	}
   981  
   982  	return inventory
   983  }
   984  
   985  // installNodeToAnsibleNode converts a plan node to an ansible node
   986  func installNodeToAnsibleNode(n *Node, s *SSHConfig) ansible.Node {
   987  	return ansible.Node{
   988  		Host:          n.Host,
   989  		PublicIP:      n.IP,
   990  		InternalIP:    n.InternalIP,
   991  		SSHPrivateKey: s.Key,
   992  		SSHUser:       s.User,
   993  		SSHPort:       s.Port,
   994  	}
   995  }
   996  
   997  // timestampWriter returns a writer that prepends a timestamp to each line written to it before forwarding it to out
   998  func timestampWriter(out io.Writer) io.Writer {
   999  	pr, pw := io.Pipe()
  1000  	go func(r io.Reader) {
  1001  		lr := util.NewLineReader(r, 64*1024)
  1002  		var (
  1003  			err  error
  1004  			line []byte
  1005  		)
  1006  		for err == nil {
  1007  			line, err = lr.Read()
  1008  			fmt.Fprintf(out, "%s - %s\n", time.Now().UTC().Format("2006-01-02 15:04:05.000-0700"), string(line))
  1009  		}
  1010  		if err != io.EOF {
  1011  			fmt.Printf("Error timestamping ansible logs: %v\n", err)
  1012  		}
  1013  	}(pr)
  1014  	return pw
  1015  }
  1016  
  1017  // keyValueList renders a map of labels as a slice of key=value strings
  1018  func keyValueList(in map[string]string) []string {
  1019  	pairs := make([]string, 0, len(in))
  1020  	for k, v := range in {
  1021  		pairs = append(pairs, fmt.Sprintf("%s=%s", k, v))
  1022  	}
  1023  	return pairs
  1024  }
  1025  
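        // keyValueEffectList renders each taint as a key=value:effect string
        // (e.g. "key=value:NoSchedule").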
  1026  func keyValueEffectList(in []Taint) []string {
  1027  	taints := make([]string, 0, len(in))
  1028  	for _, taint := range in {
  1029  		taints = append(taints, fmt.Sprintf("%s=%s:%s", taint.Key, taint.Value, taint.Effect))
  1030  	}
  1031  	return taints
  1032  }