github.com/jenkins-x/test-infra@v0.0.7/kubetest/dump.go (about)

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package main
    18  
    19  import (
    20  	"bytes"
    21  	"context"
    22  	"fmt"
    23  	"io"
    24  	"log"
    25  	"net"
    26  	"os"
    27  	"path/filepath"
    28  	"strings"
    29  
    30  	"golang.org/x/crypto/ssh"
    31  )
    32  
    33  // logDumper gets all the nodes from a kubernetes cluster and dumps a well-known set of logs
    34  type logDumper struct {
    35  	sshClientFactory sshClientFactory
    36  
    37  	artifactsDir string
    38  
    39  	services []string
    40  	files    []string
    41  }
    42  
    43  // newLogDumper is the constructor for a logDumper
    44  func newLogDumper(sshClientFactory sshClientFactory, artifactsDir string) (*logDumper, error) {
    45  	d := &logDumper{
    46  		sshClientFactory: sshClientFactory,
    47  		artifactsDir:     artifactsDir,
    48  	}
    49  
    50  	d.services = []string{
    51  		"node-problem-detector",
    52  		"kubelet",
    53  		"docker",
    54  		"kops-configuration",
    55  		"protokube",
    56  	}
    57  	d.files = []string{
    58  		"kube-apiserver",
    59  		"kube-scheduler",
    60  		"rescheduler",
    61  		"kube-controller-manager",
    62  		"etcd",
    63  		"etcd-events",
    64  		"glbc",
    65  		"cluster-autoscaler",
    66  		"kube-addon-manager",
    67  		"fluentd",
    68  		"kube-proxy",
    69  		"node-problem-detector",
    70  		"cloud-init-output",
    71  		"startupscript",
    72  		"kern",
    73  		"docker",
    74  	}
    75  
    76  	return d, nil
    77  }
    78  
    79  // DumpAllNodes connects to every node from kubectl get nodes and dumps the logs.
    80  // additionalIPs holds IP addresses of instances found by the deployment tool;
    81  // if the IPs are not found from kubectl get nodes, then these will be dumped also.
    82  // This allows for dumping log on nodes even if they don't register as a kubernetes
    83  // node, or if a node fails to register, or if the whole cluster fails to start.
    84  func (d *logDumper) DumpAllNodes(ctx context.Context, additionalIPs []string) error {
    85  	var dumped []*node
    86  
    87  	nodes, err := kubectlGetNodes("")
    88  	if err != nil {
    89  		log.Printf("Failed to get nodes for dumping via kubectl: %v", err)
    90  	} else {
    91  		for i := range nodes.Items {
    92  			if ctx.Err() != nil {
    93  				log.Printf("stopping dumping nodes: %v", ctx.Err())
    94  				return ctx.Err()
    95  			}
    96  
    97  			node := &nodes.Items[i]
    98  
    99  			ip := ""
   100  			for _, address := range node.Status.Addresses {
   101  				if address.Type == "ExternalIP" {
   102  					ip = address.Address
   103  					break
   104  				}
   105  			}
   106  
   107  			err := d.dumpNode(ctx, node.Metadata.Name, ip)
   108  			if err != nil {
   109  				log.Printf("could not dump node %s (%s): %v", node.Metadata.Name, ip, err)
   110  			} else {
   111  				dumped = append(dumped, node)
   112  			}
   113  		}
   114  	}
   115  
   116  	notDumped := findInstancesNotDumped(additionalIPs, dumped)
   117  	for _, ip := range notDumped {
   118  		if ctx.Err() != nil {
   119  			log.Printf("stopping dumping nodes: %v", ctx.Err())
   120  			return ctx.Err()
   121  		}
   122  
   123  		log.Printf("dumping node not registered in kubernetes: %s", ip)
   124  		err := d.dumpNode(ctx, ip, ip)
   125  		if err != nil {
   126  			log.Printf("error dumping node %s: %v", ip, err)
   127  		}
   128  	}
   129  
   130  	return nil
   131  }
   132  
   133  // findInstancesNotDumped returns ips from the slice that do not appear as any address of the nodes
   134  func findInstancesNotDumped(ips []string, dumped []*node) []string {
   135  	var notDumped []string
   136  	dumpedAddresses := make(map[string]bool)
   137  	for _, node := range dumped {
   138  		for _, address := range node.Status.Addresses {
   139  			dumpedAddresses[address.Address] = true
   140  		}
   141  	}
   142  
   143  	for _, ip := range ips {
   144  		if !dumpedAddresses[ip] {
   145  			notDumped = append(notDumped, ip)
   146  		}
   147  	}
   148  	return notDumped
   149  }
   150  
   151  // DumpNode connects to a node and dumps the logs.
   152  func (d *logDumper) dumpNode(ctx context.Context, name string, ip string) error {
   153  	if ip == "" {
   154  		return fmt.Errorf("could not find address for %v, ", name)
   155  	}
   156  
   157  	log.Printf("Dumping node %s", name)
   158  
   159  	n, err := d.connectToNode(ctx, name, ip)
   160  	if err != nil {
   161  		return fmt.Errorf("could not connect: %v", err)
   162  	}
   163  
   164  	// As long as we connect to the node we will not return an error;
   165  	// a failure to collect a log (or even any logs at all) is not
   166  	// considered an error in dumping the node.
   167  	// TODO(justinsb): clean up / rationalize
   168  	errors := n.dump(ctx)
   169  	for _, e := range errors {
   170  		log.Printf("error dumping node %s: %v", name, e)
   171  	}
   172  
   173  	if err := n.Close(); err != nil {
   174  		log.Printf("error closing connection: %v", err)
   175  	}
   176  
   177  	return nil
   178  }
   179  
   180  // sshClient is an interface abstracting *ssh.Client, which allows us to test it
   181  type sshClient interface {
   182  	io.Closer
   183  
   184  	// ExecPiped runs the command, piping stdout & stderr
   185  	ExecPiped(ctx context.Context, command string, stdout io.Writer, stderr io.Writer) error
   186  }
   187  
   188  // sshClientFactory is an interface abstracting to a node over SSH
   189  type sshClientFactory interface {
   190  	Dial(ctx context.Context, host string) (sshClient, error)
   191  }
   192  
   193  // logDumperNode holds state for a particular node we are dumping
   194  type logDumperNode struct {
   195  	client sshClient
   196  	dumper *logDumper
   197  
   198  	dir string
   199  }
   200  
   201  // connectToNode makes an SSH connection to the node and returns a logDumperNode
   202  func (d *logDumper) connectToNode(ctx context.Context, nodeName string, host string) (*logDumperNode, error) {
   203  	client, err := d.sshClientFactory.Dial(ctx, host)
   204  	if err != nil {
   205  		return nil, fmt.Errorf("unable to SSH to %q: %v", host, err)
   206  	}
   207  	return &logDumperNode{
   208  		client: client,
   209  		dir:    filepath.Join(d.artifactsDir, nodeName),
   210  		dumper: d,
   211  	}, nil
   212  }
   213  
   214  // logDumperNode cleans up any state in the logDumperNode
   215  func (n *logDumperNode) Close() error {
   216  	return n.client.Close()
   217  }
   218  
   219  // dump captures the well-known set of logs
   220  func (n *logDumperNode) dump(ctx context.Context) []error {
   221  	if ctx.Err() != nil {
   222  		return []error{ctx.Err()}
   223  	}
   224  
   225  	var errors []error
   226  
   227  	// Capture kernel log
   228  	if err := n.shellToFile(ctx, "sudo journalctl --output=short-precise -k", filepath.Join(n.dir, "kern.log")); err != nil {
   229  		errors = append(errors, err)
   230  	}
   231  
   232  	// Capture full journal - needed so we can see e.g. disk mounts
   233  	// This does duplicate the other files, but ensures we have all output
   234  	if err := n.shellToFile(ctx, "sudo journalctl --output=short-precise", filepath.Join(n.dir, "journal.log")); err != nil {
   235  		errors = append(errors, err)
   236  	}
   237  
   238  	// Capture logs from any systemd services in our list, that are registered
   239  	services, err := n.listSystemdUnits(ctx)
   240  	if err != nil {
   241  		errors = append(errors, fmt.Errorf("error listing systemd services: %v", err))
   242  	}
   243  	for _, s := range n.dumper.services {
   244  		name := s + ".service"
   245  		for _, service := range services {
   246  			if service == name {
   247  				if err := n.shellToFile(ctx, "sudo journalctl --output=cat -u "+name, filepath.Join(n.dir, s+".log")); err != nil {
   248  					errors = append(errors, err)
   249  				}
   250  			}
   251  		}
   252  	}
   253  
   254  	// Capture any file logs where the files exist
   255  	fileList, err := n.findFiles(ctx, "/var/log")
   256  	if err != nil {
   257  		errors = append(errors, fmt.Errorf("error reading /var/log: %v", err))
   258  	}
   259  	for _, name := range n.dumper.files {
   260  		prefix := "/var/log/" + name + ".log"
   261  		for _, f := range fileList {
   262  			if !strings.HasPrefix(f, prefix) {
   263  				continue
   264  			}
   265  			if err := n.shellToFile(ctx, "sudo cat "+f, filepath.Join(n.dir, filepath.Base(f))); err != nil {
   266  				errors = append(errors, err)
   267  			}
   268  		}
   269  	}
   270  
   271  	return errors
   272  }
   273  
   274  // findFiles lists files under the specified directory (recursively)
   275  func (n *logDumperNode) findFiles(ctx context.Context, dir string) ([]string, error) {
   276  	var stdout bytes.Buffer
   277  	var stderr bytes.Buffer
   278  
   279  	err := n.client.ExecPiped(ctx, "sudo find "+dir+" -print0", &stdout, &stderr)
   280  	if err != nil {
   281  		return nil, fmt.Errorf("error listing %q: %v", dir, err)
   282  	}
   283  
   284  	paths := []string{}
   285  	for _, b := range bytes.Split(stdout.Bytes(), []byte{0}) {
   286  		if len(b) == 0 {
   287  			// Likely the last value
   288  			continue
   289  		}
   290  		paths = append(paths, string(b))
   291  	}
   292  	return paths, nil
   293  }
   294  
   295  // listSystemdUnits returns the list of systemd units on the node
   296  func (n *logDumperNode) listSystemdUnits(ctx context.Context) ([]string, error) {
   297  	var stdout bytes.Buffer
   298  	var stderr bytes.Buffer
   299  
   300  	err := n.client.ExecPiped(ctx, "sudo systemctl list-units -t service --no-pager --no-legend --all", &stdout, &stderr)
   301  	if err != nil {
   302  		return nil, fmt.Errorf("error listing systemd units: %v", err)
   303  	}
   304  
   305  	var services []string
   306  	for _, line := range strings.Split(stdout.String(), "\n") {
   307  		tokens := strings.Fields(line)
   308  		if len(tokens) == 0 || tokens[0] == "" {
   309  			continue
   310  		}
   311  		services = append(services, tokens[0])
   312  	}
   313  	return services, nil
   314  }
   315  
   316  // shellToFile executes a command and copies the output to a file
   317  func (n *logDumperNode) shellToFile(ctx context.Context, command string, destPath string) error {
   318  	if err := os.MkdirAll(filepath.Dir(destPath), 0755); err != nil {
   319  		log.Printf("unable to mkdir on %q: %v", filepath.Dir(destPath), err)
   320  	}
   321  
   322  	f, err := os.Create(destPath)
   323  	if err != nil {
   324  		return fmt.Errorf("error creating file %q: %v", destPath, err)
   325  	}
   326  	defer f.Close()
   327  
   328  	if err := n.client.ExecPiped(ctx, command, f, f); err != nil {
   329  		return fmt.Errorf("error executing command %q: %v", command, err)
   330  	}
   331  
   332  	return nil
   333  }
   334  
   335  // sshClientImplementation is the default implementation of sshClient, binding to a *ssh.Client
   336  type sshClientImplementation struct {
   337  	client *ssh.Client
   338  }
   339  
   340  var _ sshClient = &sshClientImplementation{}
   341  
   342  // ExecPiped implements sshClientImplementation::ExecPiped
   343  func (s *sshClientImplementation) ExecPiped(ctx context.Context, cmd string, stdout io.Writer, stderr io.Writer) error {
   344  	if ctx.Err() != nil {
   345  		return ctx.Err()
   346  	}
   347  
   348  	finished := make(chan error)
   349  	go func() {
   350  		session, err := s.client.NewSession()
   351  		if err != nil {
   352  			finished <- fmt.Errorf("error creating ssh session: %v", err)
   353  			return
   354  		}
   355  		defer session.Close()
   356  
   357  		if verbose {
   358  			log.Printf("Running SSH command: %v", cmd)
   359  		}
   360  
   361  		session.Stdout = stdout
   362  		session.Stderr = stderr
   363  
   364  		finished <- session.Run(cmd)
   365  	}()
   366  
   367  	select {
   368  	case <-ctx.Done():
   369  		log.Print("closing SSH tcp connection due to context completion")
   370  
   371  		// terminate the TCP connection to force a disconnect - we assume everyone is using the same context.
   372  		// We could make this better by sending a signal on the session, waiting and then closing the session,
   373  		// and only if we still haven't succeeded then closing the TCP connection.  This is sufficient for our
   374  		// current usage though - and hopefully that logic will be implemented in the SSH package itself.
   375  		s.Close()
   376  
   377  		<-finished // Wait for cancellation
   378  		return ctx.Err()
   379  
   380  	case err := <-finished:
   381  		return err
   382  	}
   383  }
   384  
   385  // Close implements sshClientImplementation::Close
   386  func (s *sshClientImplementation) Close() error {
   387  	return s.client.Close()
   388  }
   389  
   390  // sshClientFactoryImplementation is the default implementation of sshClientFactory
   391  type sshClientFactoryImplementation struct {
   392  	sshConfig *ssh.ClientConfig
   393  }
   394  
   395  var _ sshClientFactory = &sshClientFactoryImplementation{}
   396  
   397  // Dial implements sshClientFactory::Dial
   398  func (f *sshClientFactoryImplementation) Dial(ctx context.Context, host string) (sshClient, error) {
   399  	addr := host + ":22"
   400  	d := net.Dialer{}
   401  	conn, err := d.DialContext(ctx, "tcp", addr)
   402  	if err != nil {
   403  		return nil, err
   404  	}
   405  
   406  	// We have a TCP connection; we will force-close it to support context cancellation
   407  
   408  	var client *ssh.Client
   409  	finished := make(chan error)
   410  	go func() {
   411  		c, chans, reqs, err := ssh.NewClientConn(conn, addr, f.sshConfig)
   412  		if err == nil {
   413  			client = ssh.NewClient(c, chans, reqs)
   414  		}
   415  		finished <- err
   416  	}()
   417  
   418  	select {
   419  	case <-ctx.Done():
   420  		log.Print("cancelling SSH tcp connection due to context completion")
   421  		conn.Close() // Close the TCP connection to force cancellation
   422  		<-finished   // Wait for cancellation
   423  		return nil, ctx.Err()
   424  	case err := <-finished:
   425  		if err != nil {
   426  			return nil, err
   427  		}
   428  		return &sshClientImplementation{
   429  			client: client,
   430  		}, nil
   431  	}
   432  }