github.com/abayer/test-infra@v0.0.5/kubetest/dump.go (about)

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package main
    18  
    19  import (
    20  	"bytes"
    21  	"context"
    22  	"fmt"
    23  	"io"
    24  	"log"
    25  	"net"
    26  	"os"
    27  	"path/filepath"
    28  	"strings"
    29  
    30  	"golang.org/x/crypto/ssh"
    31  )
    32  
    33  // logDumper gets all the nodes from a kubernetes cluster and dumps a well-known set of logs
    34  type logDumper struct {
    35  	sshClientFactory sshClientFactory
    36  
    37  	artifactsDir string
    38  
    39  	services []string
    40  	files    []string
    41  }
    42  
    43  // newLogDumper is the constructor for a logDumper
    44  func newLogDumper(sshClientFactory sshClientFactory, artifactsDir string) (*logDumper, error) {
    45  	d := &logDumper{
    46  		sshClientFactory: sshClientFactory,
    47  		artifactsDir:     artifactsDir,
    48  	}
    49  
    50  	d.services = []string{
    51  		"node-problem-detector",
    52  		"kubelet",
    53  		"docker",
    54  		"kops-configuration",
    55  		"protokube",
    56  	}
    57  	d.files = []string{
    58  		"kube-apiserver",
    59  		"kube-scheduler",
    60  		"rescheduler",
    61  		"kube-controller-manager",
    62  		"etcd",
    63  		"etcd-events",
    64  		"glbc",
    65  		"cluster-autoscaler",
    66  		"kube-addon-manager",
    67  		"fluentd",
    68  		"kube-proxy",
    69  		"node-problem-detector",
    70  		"cloud-init-output",
    71  		"startupscript",
    72  		"kern",
    73  		"docker",
    74  	}
    75  
    76  	return d, nil
    77  }
    78  
    79  // DumpAllNodes connects to every node from kubectl get nodes and dumps the logs.
    80  // additionalIPs holds IP addresses of instances found by the deployment tool;
    81  // if the IPs are not found from kubectl get nodes, then these will be dumped also.
    82  // This allows for dumping log on nodes even if they don't register as a kubernetes
    83  // node, or if a node fails to register, or if the whole cluster fails to start.
    84  func (d *logDumper) DumpAllNodes(ctx context.Context, additionalIPs []string) error {
    85  	var dumped []*node
    86  
    87  	nodes, err := kubectlGetNodes("")
    88  	if err != nil {
    89  		log.Printf("Failed to get nodes for dumping via kubectl: %v", err)
    90  	} else {
    91  		for i := range nodes.Items {
    92  			if ctx.Err() != nil {
    93  				log.Printf("stopping dumping nodes: %v", ctx.Err())
    94  				return ctx.Err()
    95  			}
    96  
    97  			node := &nodes.Items[i]
    98  
    99  			ip := ""
   100  			for _, address := range node.Status.Addresses {
   101  				if address.Type == "ExternalIP" {
   102  					ip = address.Address
   103  					break
   104  				}
   105  			}
   106  
   107  			err := d.dumpNode(ctx, node.Metadata.Name, ip)
   108  			if err != nil {
   109  				log.Printf("could not dump node %s (%s): %v", node.Metadata.Name, ip, err)
   110  			} else {
   111  				dumped = append(dumped, node)
   112  			}
   113  		}
   114  	}
   115  
   116  	notDumped := findInstancesNotDumped(additionalIPs, dumped)
   117  	for _, ip := range notDumped {
   118  		if ctx.Err() != nil {
   119  			log.Printf("stopping dumping nodes: %v", ctx.Err())
   120  			return ctx.Err()
   121  		}
   122  
   123  		log.Printf("dumping node not registered in kubernetes: %s", ip)
   124  		err := d.dumpNode(ctx, ip, ip)
   125  		if err != nil {
   126  			log.Printf("error dumping node %s: %v", ip, err)
   127  		}
   128  	}
   129  
   130  	return nil
   131  }
   132  
   133  // findInstancesNotDumped returns ips from the slice that do not appear as any address of the nodes
   134  func findInstancesNotDumped(ips []string, dumped []*node) []string {
   135  	var notDumped []string
   136  	dumpedAddresses := make(map[string]bool)
   137  	for _, node := range dumped {
   138  		for _, address := range node.Status.Addresses {
   139  			dumpedAddresses[address.Address] = true
   140  		}
   141  	}
   142  
   143  	for _, ip := range ips {
   144  		if !dumpedAddresses[ip] {
   145  			notDumped = append(notDumped, ip)
   146  		}
   147  	}
   148  	return notDumped
   149  }
   150  
   151  // DumpNode connects to a node and dumps the logs.
   152  func (d *logDumper) dumpNode(ctx context.Context, name string, ip string) error {
   153  	if ip == "" {
   154  		return fmt.Errorf("could not find address for %v, ", name)
   155  	}
   156  
   157  	log.Printf("Dumping node %s", name)
   158  
   159  	n, err := d.connectToNode(ctx, name, ip)
   160  	if err != nil {
   161  		return fmt.Errorf("could not connect: %v", err)
   162  	}
   163  
   164  	// As long as we connect to the node we will not return an error;
   165  	// a failure to collect a log (or even any logs at all) is not
   166  	// considered an error in dumping the node.
   167  	// TODO(justinsb): clean up / rationalize
   168  	errors := n.dump(ctx)
   169  	for _, e := range errors {
   170  		log.Printf("error dumping node %s: %v", name, e)
   171  	}
   172  
   173  	if err := n.Close(); err != nil {
   174  		log.Printf("error closing connection: %v", err)
   175  	}
   176  
   177  	return nil
   178  }
   179  
   180  // sshClient is an interface abstracting *ssh.Client, which allows us to test it
   181  type sshClient interface {
   182  	io.Closer
   183  
   184  	// ExecPiped runs the command, piping stdout & stderr
   185  	ExecPiped(ctx context.Context, command string, stdout io.Writer, stderr io.Writer) error
   186  }
   187  
   188  // sshClientFactory is an interface abstracting to a node over SSH
   189  type sshClientFactory interface {
   190  	Dial(ctx context.Context, host string) (sshClient, error)
   191  }
   192  
   193  // logDumperNode holds state for a particular node we are dumping
   194  type logDumperNode struct {
   195  	client sshClient
   196  	dumper *logDumper
   197  
   198  	dir string
   199  }
   200  
   201  // connectToNode makes an SSH connection to the node and returns a logDumperNode
   202  func (d *logDumper) connectToNode(ctx context.Context, nodeName string, host string) (*logDumperNode, error) {
   203  	client, err := d.sshClientFactory.Dial(ctx, host)
   204  	if err != nil {
   205  		return nil, fmt.Errorf("unable to SSH to %q: %v", host, err)
   206  	}
   207  	return &logDumperNode{
   208  		client: client,
   209  		dir:    filepath.Join(d.artifactsDir, nodeName),
   210  		dumper: d,
   211  	}, nil
   212  }
   213  
   214  // logDumperNode cleans up any state in the logDumperNode
   215  func (n *logDumperNode) Close() error {
   216  	return n.client.Close()
   217  }
   218  
   219  // dump captures the well-known set of logs
   220  func (n *logDumperNode) dump(ctx context.Context) []error {
   221  	if ctx.Err() != nil {
   222  		return []error{ctx.Err()}
   223  	}
   224  
   225  	var errors []error
   226  
   227  	// Capture kernel log
   228  	if err := n.shellToFile(ctx, "sudo journalctl --output=short-precise -k", filepath.Join(n.dir, "kern.log")); err != nil {
   229  		errors = append(errors, err)
   230  	}
   231  
   232  	// Capture logs from any systemd services in our list, that are registered
   233  	services, err := n.listSystemdUnits(ctx)
   234  	if err != nil {
   235  		errors = append(errors, fmt.Errorf("error listing systemd services: %v", err))
   236  	}
   237  	for _, s := range n.dumper.services {
   238  		name := s + ".service"
   239  		for _, service := range services {
   240  			if service == name {
   241  				if err := n.shellToFile(ctx, "sudo journalctl --output=cat -u "+name, filepath.Join(n.dir, s+".log")); err != nil {
   242  					errors = append(errors, err)
   243  				}
   244  			}
   245  		}
   246  	}
   247  
   248  	// Capture any file logs where the files exist
   249  	fileList, err := n.findFiles(ctx, "/var/log")
   250  	if err != nil {
   251  		errors = append(errors, fmt.Errorf("error reading /var/log: %v", err))
   252  	}
   253  	for _, name := range n.dumper.files {
   254  		prefix := "/var/log/" + name + ".log"
   255  		for _, f := range fileList {
   256  			if !strings.HasPrefix(f, prefix) {
   257  				continue
   258  			}
   259  			if err := n.shellToFile(ctx, "sudo cat "+f, filepath.Join(n.dir, filepath.Base(f))); err != nil {
   260  				errors = append(errors, err)
   261  			}
   262  		}
   263  	}
   264  
   265  	return errors
   266  }
   267  
   268  // findFiles lists files under the specified directory (recursively)
   269  func (n *logDumperNode) findFiles(ctx context.Context, dir string) ([]string, error) {
   270  	var stdout bytes.Buffer
   271  	var stderr bytes.Buffer
   272  
   273  	err := n.client.ExecPiped(ctx, "sudo find "+dir+" -print0", &stdout, &stderr)
   274  	if err != nil {
   275  		return nil, fmt.Errorf("error listing %q: %v", dir, err)
   276  	}
   277  
   278  	paths := []string{}
   279  	for _, b := range bytes.Split(stdout.Bytes(), []byte{0}) {
   280  		if len(b) == 0 {
   281  			// Likely the last value
   282  			continue
   283  		}
   284  		paths = append(paths, string(b))
   285  	}
   286  	return paths, nil
   287  }
   288  
   289  // listSystemdUnits returns the list of systemd units on the node
   290  func (n *logDumperNode) listSystemdUnits(ctx context.Context) ([]string, error) {
   291  	var stdout bytes.Buffer
   292  	var stderr bytes.Buffer
   293  
   294  	err := n.client.ExecPiped(ctx, "sudo systemctl list-units -t service --no-pager --no-legend --all", &stdout, &stderr)
   295  	if err != nil {
   296  		return nil, fmt.Errorf("error listing systemd units: %v", err)
   297  	}
   298  
   299  	var services []string
   300  	for _, line := range strings.Split(stdout.String(), "\n") {
   301  		tokens := strings.Fields(line)
   302  		if len(tokens) == 0 || tokens[0] == "" {
   303  			continue
   304  		}
   305  		services = append(services, tokens[0])
   306  	}
   307  	return services, nil
   308  }
   309  
   310  // shellToFile executes a command and copies the output to a file
   311  func (n *logDumperNode) shellToFile(ctx context.Context, command string, destPath string) error {
   312  	if err := os.MkdirAll(filepath.Dir(destPath), 0755); err != nil {
   313  		log.Printf("unable to mkdir on %q: %v", filepath.Dir(destPath), err)
   314  	}
   315  
   316  	f, err := os.Create(destPath)
   317  	if err != nil {
   318  		return fmt.Errorf("error creating file %q: %v", destPath, err)
   319  	}
   320  	defer f.Close()
   321  
   322  	if err := n.client.ExecPiped(ctx, command, f, f); err != nil {
   323  		return fmt.Errorf("error executing command %q: %v", command, err)
   324  	}
   325  
   326  	return nil
   327  }
   328  
   329  // sshClientImplementation is the default implementation of sshClient, binding to a *ssh.Client
   330  type sshClientImplementation struct {
   331  	client *ssh.Client
   332  }
   333  
   334  var _ sshClient = &sshClientImplementation{}
   335  
   336  // ExecPiped implements sshClientImplementation::ExecPiped
   337  func (s *sshClientImplementation) ExecPiped(ctx context.Context, cmd string, stdout io.Writer, stderr io.Writer) error {
   338  	if ctx.Err() != nil {
   339  		return ctx.Err()
   340  	}
   341  
   342  	finished := make(chan error)
   343  	go func() {
   344  		session, err := s.client.NewSession()
   345  		if err != nil {
   346  			finished <- fmt.Errorf("error creating ssh session: %v", err)
   347  			return
   348  		}
   349  		defer session.Close()
   350  
   351  		if verbose {
   352  			log.Printf("Running SSH command: %v", cmd)
   353  		}
   354  
   355  		session.Stdout = stdout
   356  		session.Stderr = stderr
   357  
   358  		finished <- session.Run(cmd)
   359  	}()
   360  
   361  	select {
   362  	case <-ctx.Done():
   363  		log.Print("closing SSH tcp connection due to context completion")
   364  
   365  		// terminate the TCP connection to force a disconnect - we assume everyone is using the same context.
   366  		// We could make this better by sending a signal on the session, waiting and then closing the session,
   367  		// and only if we still haven't succeeded then closing the TCP connection.  This is sufficient for our
   368  		// current usage though - and hopefully that logic will be implemented in the SSH package itself.
   369  		s.Close()
   370  
   371  		<-finished // Wait for cancellation
   372  		return ctx.Err()
   373  
   374  	case err := <-finished:
   375  		return err
   376  	}
   377  }
   378  
   379  // Close implements sshClientImplementation::Close
   380  func (s *sshClientImplementation) Close() error {
   381  	return s.client.Close()
   382  }
   383  
   384  // sshClientFactoryImplementation is the default implementation of sshClientFactory
   385  type sshClientFactoryImplementation struct {
   386  	sshConfig *ssh.ClientConfig
   387  }
   388  
   389  var _ sshClientFactory = &sshClientFactoryImplementation{}
   390  
   391  // Dial implements sshClientFactory::Dial
   392  func (f *sshClientFactoryImplementation) Dial(ctx context.Context, host string) (sshClient, error) {
   393  	addr := host + ":22"
   394  	d := net.Dialer{}
   395  	conn, err := d.DialContext(ctx, "tcp", addr)
   396  	if err != nil {
   397  		return nil, err
   398  	}
   399  
   400  	// We have a TCP connection; we will force-close it to support context cancellation
   401  
   402  	var client *ssh.Client
   403  	finished := make(chan error)
   404  	go func() {
   405  		c, chans, reqs, err := ssh.NewClientConn(conn, addr, f.sshConfig)
   406  		if err == nil {
   407  			client = ssh.NewClient(c, chans, reqs)
   408  		}
   409  		finished <- err
   410  	}()
   411  
   412  	select {
   413  	case <-ctx.Done():
   414  		log.Print("cancelling SSH tcp connection due to context completion")
   415  		conn.Close() // Close the TCP connection to force cancellation
   416  		<-finished   // Wait for cancellation
   417  		return nil, ctx.Err()
   418  	case err := <-finished:
   419  		if err != nil {
   420  			return nil, err
   421  		}
   422  		return &sshClientImplementation{
   423  			client: client,
   424  		}, nil
   425  	}
   426  }