k8s.io/test-infra@v0.0.0-20240520184403-27c6b4c223d8/logexporter/cmd/main.go (about)

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  // TODO: Make this exporter work for master too, currently facing
    18  // gcloud auth error when run from within a pod on the master.
    19  
    20  package main
    21  
    22  import (
    23  	"bytes"
    24  	"fmt"
    25  	"io"
    26  	"net"
    27  	"net/http"
    28  	"os"
    29  	"os/exec"
    30  	"path/filepath"
    31  	"strings"
    32  	"time"
    33  
    34  	"github.com/spf13/pflag"
    35  	"k8s.io/klog/v2"
    36  )
    37  
// Command-line flags that configure the log exporter. Each variable holds the
// pointer returned by pflag, so it has to be dereferenced to read the value,
// and the value only reflects the command line once pflag.Parse() has run
// (main calls it before anything else).
var (
	cloudProvider        = pflag.String("cloud-provider", "", "Cloud provider for this node (gce/gke/aws/kubemark/..)")
	dumpSystemdJournal   = pflag.Bool("dump-systemd-journal", false, "Whether to dump the full systemd journal")
	enableHollowNodeLogs = pflag.Bool("enable-hollow-node-logs", false, "Enable uploading hollow node logs too. Relevant only for kubemark nodes")
	extraLogFiles        = pflag.StringSlice("extra-log-files", []string{}, "Extra log files to dump")
	extraSystemdServices = pflag.StringSlice("extra-systemd-services", []string{}, "Extra systemd services to dump")
	gcsPath              = pflag.String("gcs-path", "", "Path to the GCS directory under which to upload logs, for eg: gs://my-logs-bucket/logs")
	gcloudAuthFilePath   = pflag.String("gcloud-auth-file-path", "/etc/service-account/service-account.json", "Path to gcloud service account file, for authenticating gsutil to write to GCS bucket")
	useAdc               = pflag.Bool("use-application-default-credentials", false, "Whether to use Application Default Credentials instead of the provided service account file")
	journalPath          = pflag.String("journal-path", "/var/log/journal", "Path where the systemd journal dir is mounted")
	nodeName             = pflag.String("node-name", "", "Name of the node this log exporter is running on")
	sleepDuration        = pflag.Duration("sleep-duration", 60*time.Second, "Duration to sleep before exiting with success. Useful for making pods schedule with hard anti-affinity when run as a job on a k8s cluster")
)
    52  
// Names of the logfiles and systemd services the exporter collects.
//
// Logfile entries are names relative to localLogPath without the ".log"
// suffix; prepareLogfiles appends ".log*" and lets the shell expand the
// pattern, so entries may themselves contain glob characters.
var (
	// localLogPath is the directory the logfile names below are resolved against.
	localLogPath = "/var/log"

	// Node-type specific logfiles.
	// Currently we only handle nodes, and neglect master.
	nodeLogs = []string{"kube-proxy", "node-problem-detector", "fluentd"}

	// Cloud provider specific logfiles.
	// kubemarkLogs is only collected when --enable-hollow-node-logs is set.
	awsLogs      = []string{"cloud-init-output"}
	gceLogs      = []string{"startupscript"}
	kubemarkLogs = []string{"*-hollow-node-*"}

	// System services/kernel related logfiles.
	// kernelLog, initdLogs and supervisordLogs are copied as plain files when no
	// journald config is found; otherwise the systemd lists are dumped through
	// journalctl instead ("kern" selects kernel messages rather than a unit).
	// systemdSetupServices are dumped with the "short-precise" output mode, the
	// other systemd lists with "cat".
	kernelLog            = "kern"
	initdLogs            = []string{"docker"}
	supervisordLogs      = []string{"kubelet", "supervisor/supervisord", "supervisor/kubelet-stdout", "supervisor/kubelet-stderr", "supervisor/docker-stdout", "supervisor/docker-stderr"}
	systemdServices      = []string{"kern", "kubelet", "docker"}
	systemdSetupServices = []string{"kube-node-installation", "kube-node-configuration"}
	nodeSystemdServices  = []string{"node-problem-detector"}
)
    73  
    74  // Check if the config provided through the flags take valid values.
    75  func checkConfigValidity() error {
    76  	klog.Info("Verifying if a valid config has been provided through the flags")
    77  	if *nodeName == "" {
    78  		return fmt.Errorf("Flag --node-name has its value unspecified")
    79  	}
    80  	if *gcsPath == "" {
    81  		return fmt.Errorf("Flag --gcs-path has its value unspecified")
    82  	}
    83  	if !*useAdc {
    84  		if _, err := os.Stat(*gcloudAuthFilePath); err != nil {
    85  			return fmt.Errorf("Could not find the gcloud service account file: %w", err)
    86  		} else if err := runCommand("gcloud", "auth", "activate-service-account", "--key-file="+*gcloudAuthFilePath); err != nil {
    87  			return fmt.Errorf("Failed to activate gcloud service account: %w", err)
    88  		}
    89  	}
    90  	return nil
    91  }
    92  
    93  // Create logfile for systemd service in outputDir with the given journalctl outputMode.
    94  func createSystemdLogfile(service string, outputMode string, outputDir string) error {
    95  	// Generate the journalctl command.
    96  	journalCmdArgs := []string{fmt.Sprintf("--output=%v", outputMode), "-D", *journalPath}
    97  	if service == "kern" {
    98  		journalCmdArgs = append(journalCmdArgs, "-k")
    99  	} else {
   100  		journalCmdArgs = append(journalCmdArgs, "-u", fmt.Sprintf("%v.service", service))
   101  	}
   102  	cmd := exec.Command("journalctl", journalCmdArgs...)
   103  
   104  	// Run the command and record the output to a file.
   105  	output, err := cmd.Output()
   106  	if err != nil {
   107  		return fmt.Errorf("Journalctl command for '%v' service failed: %w", service, err)
   108  	}
   109  	logfile := filepath.Join(outputDir, service+".log")
   110  	if err := os.WriteFile(logfile, output, 0444); err != nil {
   111  		return fmt.Errorf("Writing to file of journalctl logs for '%v' service failed: %w", service, err)
   112  	}
   113  	return nil
   114  }
   115  
   116  // createFullSystemdLogfile creates logfile for full systemd journal in the outputDir.
   117  func createFullSystemdLogfile(outputDir string) error {
   118  	cmd := exec.Command("journalctl", "--output=short-precise", "-D", *journalPath)
   119  	// Run the command and record the output to a file.
   120  	output, err := cmd.Output()
   121  	if err != nil {
   122  		return fmt.Errorf("Journalctl command failed: %w", err)
   123  	}
   124  	logfile := filepath.Join(outputDir, "systemd.log")
   125  	if err := os.WriteFile(logfile, output, 0444); err != nil {
   126  		return fmt.Errorf("Writing full journalctl logs to file failed: %w", err)
   127  	}
   128  	return nil
   129  }
   130  
   131  // Create logfiles for systemd services in outputDir.
   132  func createSystemdLogfiles(outputDir string) {
   133  	services := append(systemdServices, nodeSystemdServices...)
   134  	services = append(services, *extraSystemdServices...)
   135  	for _, service := range services {
   136  		if err := createSystemdLogfile(service, "cat", outputDir); err != nil {
   137  			klog.Warningf("Failed to record journalctl logs: %v", err)
   138  		}
   139  	}
   140  	// Service logs specific to VM setup.
   141  	for _, service := range systemdSetupServices {
   142  		if err := createSystemdLogfile(service, "short-precise", outputDir); err != nil {
   143  			klog.Warningf("Failed to record journalctl logs: %v", err)
   144  		}
   145  	}
   146  	if *dumpSystemdJournal {
   147  		if err := createFullSystemdLogfile(outputDir); err != nil {
   148  			klog.Warningf("Failed to record journalctl logs: %v", err)
   149  		}
   150  	}
   151  }
   152  
   153  // Copy logfiles specific to this node based on the cloud-provider, system services, etc
   154  // to a temporary directory. Also create logfiles for systemd services if journalctl is present.
   155  // We do not expect this function to see an error.
   156  func prepareLogfiles(logDir string) {
   157  	klog.Info("Preparing logfiles relevant to this node")
   158  	logfiles := nodeLogs[:]
   159  	logfiles = append(logfiles, *extraLogFiles...)
   160  
   161  	switch *cloudProvider {
   162  	case "gce", "gke":
   163  		logfiles = append(logfiles, gceLogs...)
   164  	case "aws":
   165  		logfiles = append(logfiles, awsLogs...)
   166  	default:
   167  		klog.Errorf("Unknown cloud provider '%v' provided, skipping any provider specific logs", *cloudProvider)
   168  	}
   169  
   170  	// Grab kubemark logs too, if asked for.
   171  	if *enableHollowNodeLogs {
   172  		logfiles = append(logfiles, kubemarkLogs...)
   173  	}
   174  
   175  	// Select system/service specific logs.
   176  	if _, err := os.Stat("/workspace/etc/systemd/journald.conf"); err == nil {
   177  		klog.Info("Journalctl found on host. Collecting systemd logs")
   178  		createSystemdLogfiles(logDir)
   179  	} else {
   180  		klog.Infof("Journalctl not found on host (%v). Collecting supervisord logs instead", err)
   181  		logfiles = append(logfiles, kernelLog)
   182  		logfiles = append(logfiles, initdLogs...)
   183  		logfiles = append(logfiles, supervisordLogs...)
   184  	}
   185  
   186  	// Copy all the logfiles that exist, to logDir.
   187  	for _, logfile := range logfiles {
   188  		logfileFullPath := filepath.Join(localLogPath, logfile+".log*") // Append .log* to copy rotated logs too.
   189  		cmd := exec.Command("/bin/sh", "-c", fmt.Sprintf("cp %v %v", logfileFullPath, logDir))
   190  		if err := cmd.Run(); err != nil {
   191  			klog.Warningf("Failed to copy any logfiles with pattern '%v': %v", logfileFullPath, err)
   192  		}
   193  	}
   194  }
   195  
   196  func uploadLogfilesToGCS(logDir string) error {
   197  	cmd := exec.Command("/bin/sh", "-c", fmt.Sprintf("ls %v/*", logDir))
   198  	output, err := cmd.Output()
   199  	if err != nil {
   200  		return fmt.Errorf("Could not list any logfiles: %w", err)
   201  	}
   202  	klog.Infof("List of logfiles available: %v", string(output))
   203  
   204  	gcsLogPath := *gcsPath + "/" + *nodeName
   205  	klog.Infof("Uploading logfiles to GCS at path '%v'", gcsLogPath)
   206  	for uploadAttempt := 0; uploadAttempt < 3; uploadAttempt++ {
   207  		// Upload the files with compression (-z) and parallelism (-m) for speeding up.
   208  		if err = runCommand("gsutil", "-m", "-q", "cp", "-c",
   209  			"-z", "log,txt,xml", logDir+"/*", gcsLogPath); err != nil {
   210  			klog.Errorf("Attempt %v to upload to GCS failed: %v", uploadAttempt, err)
   211  			continue
   212  		}
   213  		return writeSuccessMarkerFile()
   214  	}
   215  	return fmt.Errorf("Multiple attempts of gsutil failed, the final one due to: %w", err)
   216  }
   217  
   218  // Write a marker file to GCS named after this node to indicate logexporter's success.
   219  // The directory to which we write this file can then be used as a registry to quickly
   220  // fetch the list of nodes on which logexporter succeeded.
   221  func writeSuccessMarkerFile() error {
   222  	markerFilePath := *gcsPath + "/logexported-nodes-registry/" + *nodeName + ".txt"
   223  	cmd := exec.Command("gsutil", "-q", "cp", "-", markerFilePath)
   224  	stdin, err := cmd.StdinPipe()
   225  	if err != nil {
   226  		return fmt.Errorf("Failed to get stdin pipe to write marker file: %w", err)
   227  	}
   228  	io.WriteString(stdin, "")
   229  	stdin.Close()
   230  	if err = cmd.Run(); err != nil {
   231  		return fmt.Errorf("Failed to write marker file to GCS: %w", err)
   232  	}
   233  	return nil
   234  }
   235  
   236  func runCommand(name string, arg ...string) error {
   237  	klog.Infof("Running: %s %s", name, strings.Join(arg, " "))
   238  	cmd := exec.Command(name, arg...)
   239  	var stderr, stdout bytes.Buffer
   240  	cmd.Stderr, cmd.Stdout = &stderr, &stdout
   241  	err := cmd.Run()
   242  	klog.Infof("Stdout:\n%s\n", stdout.String())
   243  	klog.Infof("Stderr:\n%s\n", stderr.String())
   244  	return err
   245  }
   246  
   247  func dumpNetworkDebugInfo() {
   248  	klog.Info("Dumping network connectivity debug info")
   249  	resolv, err := os.ReadFile("/etc/resolv.conf")
   250  	if err != nil {
   251  		klog.Errorf("Failed to read /etc/resolv.conf: %v", err)
   252  	}
   253  	klog.Infof("/etc/resolv.conf: %q", string(resolv))
   254  	addrs, err := net.LookupHost("kubernetes.default")
   255  	if err != nil {
   256  		klog.Errorf("Failed to resolve kubernetes.default: %v", err)
   257  	}
   258  	klog.Infof("kubernetes.default resolves to: %v", addrs)
   259  	addrs, err = net.LookupHost("google.com")
   260  	if err != nil {
   261  		klog.Errorf("Failed to resolve google.com: %v", err)
   262  	}
   263  	klog.Infof("google.com resolves to: %v", addrs)
   264  	resp, err := http.Get("http://google.com/")
   265  	if err != nil {
   266  		klog.Errorf("Failed to get http://google.com/: %v", err)
   267  	}
   268  	defer resp.Body.Close()
   269  	klog.Infof("GET http://google.com finished with: %v code", resp.StatusCode)
   270  }
   271  
   272  func main() {
   273  	pflag.Parse()
   274  	if err := checkConfigValidity(); err != nil {
   275  		klog.Errorf("Bad config provided: %v", err)
   276  		dumpNetworkDebugInfo()
   277  		klog.Fatalf("Bad config provided: %v", err)
   278  	}
   279  
   280  	localTmpLogPath, err := os.MkdirTemp("/tmp", "k8s-systemd-logs")
   281  	if err != nil {
   282  		klog.Fatalf("Could not create temporary dir locally for copying logs: %v", err)
   283  	}
   284  	defer os.RemoveAll(localTmpLogPath)
   285  
   286  	prepareLogfiles(localTmpLogPath)
   287  	if err := uploadLogfilesToGCS(localTmpLogPath); err != nil {
   288  		klog.Fatalf("Could not upload logs to GCS: %v", err)
   289  	}
   290  	klog.Info("Logs successfully uploaded")
   291  
   292  	klog.Infof("Entering sleep for a duration of %v seconds", *sleepDuration)
   293  	time.Sleep(*sleepDuration)
   294  }