github.com/openshift/installer@v1.4.17/cmd/openshift-install/gather.go (about)

     1  package main
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"io/fs"
     8  	"net"
     9  	"os"
    10  	"path/filepath"
    11  	"strconv"
    12  	"strings"
    13  	"syscall"
    14  	"time"
    15  
    16  	"github.com/sirupsen/logrus"
    17  	"github.com/spf13/cobra"
    18  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    19  	"k8s.io/client-go/rest"
    20  
    21  	configv1 "github.com/openshift/api/config/v1"
    22  	configclient "github.com/openshift/client-go/config/clientset/versioned"
    23  	"github.com/openshift/installer/cmd/openshift-install/command"
    24  	"github.com/openshift/installer/pkg/asset/installconfig"
    25  	assetstore "github.com/openshift/installer/pkg/asset/store"
    26  	"github.com/openshift/installer/pkg/asset/tls"
    27  	"github.com/openshift/installer/pkg/clusterapi"
    28  	serialgather "github.com/openshift/installer/pkg/gather"
    29  	"github.com/openshift/installer/pkg/gather/service"
    30  	"github.com/openshift/installer/pkg/gather/ssh"
    31  	"github.com/openshift/installer/pkg/infrastructure"
    32  	infra "github.com/openshift/installer/pkg/infrastructure/platform"
    33  
    34  	_ "github.com/openshift/installer/pkg/gather/aws"
    35  	_ "github.com/openshift/installer/pkg/gather/azure"
    36  	_ "github.com/openshift/installer/pkg/gather/gcp"
    37  )
    38  
    39  func newGatherCmd(ctx context.Context) *cobra.Command {
    40  	cmd := &cobra.Command{
    41  		Use:   "gather",
    42  		Short: "Gather debugging data for a given installation failure",
    43  		Long: `Gather debugging data for a given installation failure.
    44  
    45  When installation for OpenShift cluster fails, gathering all the data useful for debugging can
    46  become a difficult task. This command helps users to collect the most relevant information that can be used
    47  to debug the installation failures`,
    48  		RunE: func(cmd *cobra.Command, args []string) error {
    49  			return cmd.Help()
    50  		},
    51  	}
    52  	cmd.AddCommand(newGatherBootstrapCmd(ctx))
    53  	return cmd
    54  }
    55  
    56  var gatherBootstrapOpts struct {
    57  	bootstrap    string
    58  	masters      []string
    59  	sshKeys      []string
    60  	skipAnalysis bool
    61  }
    62  
    63  func newGatherBootstrapCmd(ctx context.Context) *cobra.Command {
    64  	cmd := &cobra.Command{
    65  		Use:   "bootstrap",
    66  		Short: "Gather debugging data for a failing-to-bootstrap control plane",
    67  		Args:  cobra.ExactArgs(0),
    68  		Run: func(_ *cobra.Command, _ []string) {
    69  			cleanup := command.SetupFileHook(command.RootOpts.Dir)
    70  			defer cleanup()
    71  			bundlePath, err := runGatherBootstrapCmd(ctx, command.RootOpts.Dir)
    72  			if err != nil {
    73  				logrus.Fatal(err)
    74  			}
    75  
    76  			if !gatherBootstrapOpts.skipAnalysis {
    77  				if err := service.AnalyzeGatherBundle(bundlePath); err != nil {
    78  					logrus.Fatal(err)
    79  				}
    80  			}
    81  
    82  			logrus.Infof("Bootstrap gather logs captured here %q", bundlePath)
    83  		},
    84  	}
    85  	cmd.PersistentFlags().StringVar(&gatherBootstrapOpts.bootstrap, "bootstrap", "", "Hostname or IP of the bootstrap host")
    86  	cmd.PersistentFlags().StringArrayVar(&gatherBootstrapOpts.masters, "master", []string{}, "Hostnames or IPs of all control plane hosts")
    87  	cmd.PersistentFlags().StringArrayVar(&gatherBootstrapOpts.sshKeys, "key", []string{}, "Path to SSH private keys that should be used for authentication. If no key was provided, SSH private keys from user's environment will be used")
    88  	cmd.PersistentFlags().BoolVar(&gatherBootstrapOpts.skipAnalysis, "skipAnalysis", false, "Skip analysis of the gathered data")
    89  	return cmd
    90  }
    91  
    92  func runGatherBootstrapCmd(ctx context.Context, directory string) (string, error) {
    93  	assetStore, err := assetstore.NewStore(directory)
    94  	if err != nil {
    95  		return "", fmt.Errorf("failed to create asset store: %w", err)
    96  	}
    97  	// add the default bootstrap key pair to the sshKeys list
    98  	bootstrapSSHKeyPair := &tls.BootstrapSSHKeyPair{}
    99  	if err := assetStore.Fetch(ctx, bootstrapSSHKeyPair); err != nil {
   100  		return "", fmt.Errorf("failed to fetch %s: %w", bootstrapSSHKeyPair.Name(), err)
   101  	}
   102  	tmpfile, err := os.CreateTemp("", "bootstrap-ssh")
   103  	if err != nil {
   104  		return "", err
   105  	}
   106  	defer os.Remove(tmpfile.Name())
   107  	if _, err := tmpfile.Write(bootstrapSSHKeyPair.Private()); err != nil {
   108  		return "", err
   109  	}
   110  	if err := tmpfile.Close(); err != nil {
   111  		return "", err
   112  	}
   113  	gatherBootstrapOpts.sshKeys = append(gatherBootstrapOpts.sshKeys, tmpfile.Name())
   114  
   115  	ha := &infrastructure.HostAddresses{
   116  		Bootstrap: gatherBootstrapOpts.bootstrap,
   117  		Port:      22,
   118  		Masters:   gatherBootstrapOpts.masters,
   119  	}
   120  
   121  	if ha.Bootstrap == "" && len(ha.Masters) == 0 {
   122  		config := &installconfig.InstallConfig{}
   123  		if err := assetStore.Fetch(ctx, config); err != nil {
   124  			return "", fmt.Errorf("failed to fetch %s: %w", config.Name(), err)
   125  		}
   126  
   127  		provider, err := infra.ProviderForPlatform(config.Config.Platform.Name(), config.Config.EnabledFeatureGates())
   128  		if err != nil {
   129  			return "", fmt.Errorf("error getting infrastructure provider: %w", err)
   130  		}
   131  		if err = provider.ExtractHostAddresses(directory, config.Config, ha); err != nil {
   132  			logrus.Warnf("Failed to extract host addresses: %s", err.Error())
   133  		}
   134  	}
   135  
   136  	if ha.Bootstrap == "" {
   137  		return "", errors.New("must provide bootstrap host address")
   138  	}
   139  
   140  	return gatherBootstrap(ha.Bootstrap, ha.Port, ha.Masters, directory)
   141  }
   142  
   143  func gatherBootstrap(bootstrap string, port int, masters []string, directory string) (string, error) {
   144  	gatherID := time.Now().Format("20060102150405")
   145  	archives := map[string]string{}
   146  
   147  	if capiManifestsBundlePath, err := gatherCAPIArtifacts(directory, gatherID); err != nil {
   148  		// Do not fail the whole gather if we can't find capi manifests (we can be running terraform)
   149  		logrus.Infof("Failed to gather Cluster API manifests: %s", err.Error())
   150  	} else {
   151  		archives[capiManifestsBundlePath] = "clusterapi"
   152  	}
   153  
   154  	serialLogBundle := filepath.Join(directory, fmt.Sprintf("serial-log-bundle-%s.tar.gz", gatherID))
   155  	serialLogBundlePath, err := filepath.Abs(serialLogBundle)
   156  	if err != nil {
   157  		return "", fmt.Errorf("failed to stat log file: %w", err)
   158  	}
   159  
   160  	consoleGather, err := serialgather.New(logrus.StandardLogger(), serialLogBundlePath, bootstrap, masters, directory)
   161  	if err != nil {
   162  		logrus.Infof("Skipping VM console logs gather: %s", err.Error())
   163  	} else {
   164  		logrus.Info("Pulling VM console logs")
   165  		if err := consoleGather.Run(); err != nil {
   166  			logrus.Infof("Failed to gather VM console logs: %s", err.Error())
   167  		} else {
   168  			archives[serialLogBundlePath] = "serial"
   169  		}
   170  	}
   171  
   172  	clusterLogBundlePath, err := pullLogsFromBootstrap(gatherID, bootstrap, port, masters, directory)
   173  	if err != nil {
   174  		logrus.Infof("Failed to gather bootstrap logs: %s", err.Error())
   175  	} else {
   176  		archives[clusterLogBundlePath] = ""
   177  	}
   178  
   179  	if len(archives) == 0 {
   180  		return "", fmt.Errorf("failed to gather VM console and bootstrap logs")
   181  	}
   182  
   183  	logBundlePath := filepath.Join(directory, fmt.Sprintf("log-bundle-%s.tar.gz", gatherID))
   184  	err = serialgather.CombineArchives(logBundlePath, archives)
   185  	if err != nil {
   186  		return "", fmt.Errorf("failed to combine archives: %w", err)
   187  	}
   188  
   189  	return logBundlePath, nil
   190  }
   191  
   192  func pullLogsFromBootstrap(gatherID string, bootstrap string, port int, masters []string, directory string) (string, error) {
   193  	logrus.Info("Pulling debug logs from the bootstrap machine")
   194  	client, err := ssh.NewClient("core", net.JoinHostPort(bootstrap, strconv.Itoa(port)), gatherBootstrapOpts.sshKeys)
   195  	if err != nil {
   196  		if errors.Is(err, syscall.ECONNREFUSED) || errors.Is(err, syscall.ETIMEDOUT) {
   197  			return "", fmt.Errorf("failed to connect to the bootstrap machine: %w", err)
   198  		}
   199  		return "", fmt.Errorf("failed to create SSH client: %w", err)
   200  	}
   201  
   202  	if err := ssh.Run(client, fmt.Sprintf("/usr/local/bin/installer-gather.sh --id %s %s", gatherID, strings.Join(masters, " "))); err != nil {
   203  		return "", fmt.Errorf("failed to run remote command: %w", err)
   204  	}
   205  
   206  	file := filepath.Join(directory, fmt.Sprintf("cluster-log-bundle-%s.tar.gz", gatherID))
   207  	if err := ssh.PullFileTo(client, fmt.Sprintf("/home/core/log-bundle-%s.tar.gz", gatherID), file); err != nil {
   208  		return "", fmt.Errorf("failed to pull log file from remote: %w", err)
   209  	}
   210  
   211  	clusterLogBundlePath, err := filepath.Abs(file)
   212  	if err != nil {
   213  		return "", fmt.Errorf("failed to stat log file: %w", err)
   214  	}
   215  
   216  	return clusterLogBundlePath, nil
   217  }
   218  
   219  func logClusterOperatorConditions(ctx context.Context, config *rest.Config) error {
   220  	client, err := configclient.NewForConfig(config)
   221  	if err != nil {
   222  		return fmt.Errorf("creating a config client: %w", err)
   223  	}
   224  
   225  	operators, err := client.ConfigV1().ClusterOperators().List(ctx, metav1.ListOptions{})
   226  	if err != nil {
   227  		return fmt.Errorf("listing ClusterOperator objects: %w", err)
   228  	}
   229  
   230  	for _, operator := range operators.Items {
   231  		for _, condition := range operator.Status.Conditions {
   232  			if condition.Type == configv1.OperatorUpgradeable {
   233  				continue
   234  			} else if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionTrue {
   235  				continue
   236  			} else if (condition.Type == configv1.OperatorDegraded || condition.Type == configv1.OperatorProgressing) && condition.Status == configv1.ConditionFalse {
   237  				continue
   238  			}
   239  			if condition.Type == configv1.OperatorAvailable || condition.Type == configv1.OperatorDegraded {
   240  				logrus.Errorf("Cluster operator %s %s is %s with %s: %s", operator.ObjectMeta.Name, condition.Type, condition.Status, condition.Reason, condition.Message)
   241  			} else {
   242  				logrus.Infof("Cluster operator %s %s is %s with %s: %s", operator.ObjectMeta.Name, condition.Type, condition.Status, condition.Reason, condition.Message)
   243  			}
   244  		}
   245  	}
   246  
   247  	return nil
   248  }
   249  
   250  func gatherCAPIArtifacts(directory, gatherID string) (string, error) {
   251  	logrus.Infoln("Pulling Cluster API artifacts")
   252  	dir, err := filepath.Abs(directory)
   253  	if err != nil {
   254  		return "", fmt.Errorf("failed to get absolute path for %s: %w", directory, err)
   255  	}
   256  
   257  	capiDir := filepath.Join(dir, clusterapi.ArtifactsDir)
   258  	if _, err := os.Stat(capiDir); err != nil {
   259  		if errors.Is(err, fs.ErrNotExist) {
   260  			return "", fmt.Errorf("either Cluster API manifests not generated or terraform provision")
   261  		}
   262  		return "", fmt.Errorf("failed to stat Cluster API output directory: %w", err)
   263  	}
   264  
   265  	bundleDir := filepath.Join(dir, fmt.Sprintf("capi-artifacts-bundle-%s", gatherID))
   266  	// Symlink the hidden directory so the artifacts are not hidden in the archive
   267  	if err := os.Symlink(capiDir, bundleDir); err != nil {
   268  		return "", fmt.Errorf("failed to copy Cluster API artifacts: %w", err)
   269  	}
   270  	defer os.Remove(bundleDir)
   271  
   272  	var capiArtifacts []string
   273  	manifests, err := filepath.Glob(filepath.Join(bundleDir, "*.yaml"))
   274  	if err != nil {
   275  		return "", fmt.Errorf("failed to gather Cluster API manifests: %w", err)
   276  	}
   277  	capiArtifacts = append(capiArtifacts, manifests...)
   278  
   279  	logs, err := filepath.Glob(filepath.Join(bundleDir, "*.log"))
   280  	if err != nil {
   281  		return "", fmt.Errorf("failed to gather Cluster API control plane logs: %w", err)
   282  	}
   283  	capiArtifacts = append(capiArtifacts, logs...)
   284  
   285  	capiArtifactsBundlePath := fmt.Sprintf("%s.tar.gz", bundleDir)
   286  	if err := serialgather.CreateArchive(capiArtifacts, capiArtifactsBundlePath); err != nil {
   287  		return "", fmt.Errorf("failed to create clusterapi bundle file: %w", err)
   288  	}
   289  	return capiArtifactsBundlePath, nil
   290  }