k8s.io/test-infra@v0.0.0-20240520184403-27c6b4c223d8/logexporter/cmd/main.go

/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// TODO: Make this exporter work for master too, currently facing
// gcloud auth error when run from within a pod on the master.

package main

import (
	"bytes"
	"fmt"
	"io"
	"net"
	"net/http"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
	"time"

	"github.com/spf13/pflag"
	"k8s.io/klog/v2"
)

// Initialize the log exporter's configuration related flags.
var (
	cloudProvider        = pflag.String("cloud-provider", "", "Cloud provider for this node (gce/gke/aws/kubemark/..)")
	dumpSystemdJournal   = pflag.Bool("dump-systemd-journal", false, "Whether to dump the full systemd journal")
	enableHollowNodeLogs = pflag.Bool("enable-hollow-node-logs", false, "Enable uploading hollow node logs too. Relevant only for kubemark nodes")
	extraLogFiles        = pflag.StringSlice("extra-log-files", []string{}, "Extra log files to dump")
	extraSystemdServices = pflag.StringSlice("extra-systemd-services", []string{}, "Extra systemd services to dump")
	gcsPath              = pflag.String("gcs-path", "", "Path to the GCS directory under which to upload logs, for eg: gs://my-logs-bucket/logs")
	gcloudAuthFilePath   = pflag.String("gcloud-auth-file-path", "/etc/service-account/service-account.json", "Path to gcloud service account file, for authenticating gsutil to write to GCS bucket")
	useAdc               = pflag.Bool("use-application-default-credentials", false, "Whether to use Application Default Credentials instead of the provided service account file")
	journalPath          = pflag.String("journal-path", "/var/log/journal", "Path where the systemd journal dir is mounted")
	nodeName             = pflag.String("node-name", "", "Name of the node this log exporter is running on")
	sleepDuration        = pflag.Duration("sleep-duration", 60*time.Second, "Duration to sleep before exiting with success. Useful for making pods schedule with hard anti-affinity when run as a job on a k8s cluster")
)

var (
	localLogPath = "/var/log"

	// Node-type specific logfiles.
	// Currently we only handle nodes, and neglect master.
	nodeLogs = []string{"kube-proxy", "node-problem-detector", "fluentd"}

	// Cloud provider specific logfiles.
	awsLogs      = []string{"cloud-init-output"}
	gceLogs      = []string{"startupscript"}
	kubemarkLogs = []string{"*-hollow-node-*"}

	// System services/kernel related logfiles.
	kernelLog            = "kern"
	initdLogs            = []string{"docker"}
	supervisordLogs      = []string{"kubelet", "supervisor/supervisord", "supervisor/kubelet-stdout", "supervisor/kubelet-stderr", "supervisor/docker-stdout", "supervisor/docker-stderr"}
	systemdServices      = []string{"kern", "kubelet", "docker"}
	systemdSetupServices = []string{"kube-node-installation", "kube-node-configuration"}
	nodeSystemdServices  = []string{"node-problem-detector"}
)

// Check if the config provided through the flags takes valid values.
func checkConfigValidity() error {
	klog.Info("Verifying if a valid config has been provided through the flags")
	if *nodeName == "" {
		return fmt.Errorf("Flag --node-name has its value unspecified")
	}
	if *gcsPath == "" {
		return fmt.Errorf("Flag --gcs-path has its value unspecified")
	}
	if !*useAdc {
		if _, err := os.Stat(*gcloudAuthFilePath); err != nil {
			return fmt.Errorf("Could not find the gcloud service account file: %w", err)
		} else if err := runCommand("gcloud", "auth", "activate-service-account", "--key-file="+*gcloudAuthFilePath); err != nil {
			return fmt.Errorf("Failed to activate gcloud service account: %w", err)
		}
	}
	return nil
}

// Create logfile for systemd service in outputDir with the given journalctl outputMode.
func createSystemdLogfile(service string, outputMode string, outputDir string) error {
	// Generate the journalctl command.
	journalCmdArgs := []string{fmt.Sprintf("--output=%v", outputMode), "-D", *journalPath}
	if service == "kern" {
		journalCmdArgs = append(journalCmdArgs, "-k")
	} else {
		journalCmdArgs = append(journalCmdArgs, "-u", fmt.Sprintf("%v.service", service))
	}
	cmd := exec.Command("journalctl", journalCmdArgs...)

	// Run the command and record the output to a file.
	output, err := cmd.Output()
	if err != nil {
		return fmt.Errorf("Journalctl command for '%v' service failed: %w", service, err)
	}
	logfile := filepath.Join(outputDir, service+".log")
	if err := os.WriteFile(logfile, output, 0444); err != nil {
		return fmt.Errorf("Writing to file of journalctl logs for '%v' service failed: %w", service, err)
	}
	return nil
}

// createFullSystemdLogfile creates logfile for full systemd journal in the outputDir.
func createFullSystemdLogfile(outputDir string) error {
	cmd := exec.Command("journalctl", "--output=short-precise", "-D", *journalPath)
	// Run the command and record the output to a file.
	output, err := cmd.Output()
	if err != nil {
		return fmt.Errorf("Journalctl command failed: %w", err)
	}
	logfile := filepath.Join(outputDir, "systemd.log")
	if err := os.WriteFile(logfile, output, 0444); err != nil {
		return fmt.Errorf("Writing full journalctl logs to file failed: %w", err)
	}
	return nil
}

// Create logfiles for systemd services in outputDir.
func createSystemdLogfiles(outputDir string) {
	services := append(systemdServices, nodeSystemdServices...)
	services = append(services, *extraSystemdServices...)
	for _, service := range services {
		if err := createSystemdLogfile(service, "cat", outputDir); err != nil {
			klog.Warningf("Failed to record journalctl logs: %v", err)
		}
	}
	// Service logs specific to VM setup.
	for _, service := range systemdSetupServices {
		if err := createSystemdLogfile(service, "short-precise", outputDir); err != nil {
			klog.Warningf("Failed to record journalctl logs: %v", err)
		}
	}
	if *dumpSystemdJournal {
		if err := createFullSystemdLogfile(outputDir); err != nil {
			klog.Warningf("Failed to record journalctl logs: %v", err)
		}
	}
}

// Copy logfiles specific to this node based on the cloud-provider, system services, etc.
// to a temporary directory. Also create logfiles for systemd services if journalctl is present.
// We do not expect this function to see an error.
func prepareLogfiles(logDir string) {
	klog.Info("Preparing logfiles relevant to this node")
	logfiles := nodeLogs[:]
	logfiles = append(logfiles, *extraLogFiles...)

	switch *cloudProvider {
	case "gce", "gke":
		logfiles = append(logfiles, gceLogs...)
	case "aws":
		logfiles = append(logfiles, awsLogs...)
	default:
		klog.Errorf("Unknown cloud provider '%v' provided, skipping any provider specific logs", *cloudProvider)
	}

	// Grab kubemark logs too, if asked for.
	if *enableHollowNodeLogs {
		logfiles = append(logfiles, kubemarkLogs...)
	}

	// Select system/service specific logs.
	if _, err := os.Stat("/workspace/etc/systemd/journald.conf"); err == nil {
		klog.Info("Journalctl found on host. Collecting systemd logs")
		createSystemdLogfiles(logDir)
	} else {
		klog.Infof("Journalctl not found on host (%v). Collecting supervisord logs instead", err)
		logfiles = append(logfiles, kernelLog)
		logfiles = append(logfiles, initdLogs...)
		logfiles = append(logfiles, supervisordLogs...)
	}

	// Copy all the logfiles that exist, to logDir.
	for _, logfile := range logfiles {
		logfileFullPath := filepath.Join(localLogPath, logfile+".log*") // Append .log* to copy rotated logs too.
		cmd := exec.Command("/bin/sh", "-c", fmt.Sprintf("cp %v %v", logfileFullPath, logDir))
		if err := cmd.Run(); err != nil {
			klog.Warningf("Failed to copy any logfiles with pattern '%v': %v", logfileFullPath, err)
		}
	}
}

// Upload all logfiles collected in logDir to GCS under gcs-path/node-name, retrying up to three times.
func uploadLogfilesToGCS(logDir string) error {
	cmd := exec.Command("/bin/sh", "-c", fmt.Sprintf("ls %v/*", logDir))
	output, err := cmd.Output()
	if err != nil {
		return fmt.Errorf("Could not list any logfiles: %w", err)
	}
	klog.Infof("List of logfiles available: %v", string(output))

	gcsLogPath := *gcsPath + "/" + *nodeName
	klog.Infof("Uploading logfiles to GCS at path '%v'", gcsLogPath)
	for uploadAttempt := 0; uploadAttempt < 3; uploadAttempt++ {
		// Upload the files with compression (-z) and parallelism (-m) for speeding up.
		if err = runCommand("gsutil", "-m", "-q", "cp", "-c",
			"-z", "log,txt,xml", logDir+"/*", gcsLogPath); err != nil {
			klog.Errorf("Attempt %v to upload to GCS failed: %v", uploadAttempt, err)
			continue
		}
		return writeSuccessMarkerFile()
	}
	return fmt.Errorf("Multiple attempts of gsutil failed, the final one due to: %w", err)
}

// Write a marker file to GCS named after this node to indicate logexporter's success.
// The directory to which we write this file can then be used as a registry to quickly
// fetch the list of nodes on which logexporter succeeded.
func writeSuccessMarkerFile() error {
	markerFilePath := *gcsPath + "/logexported-nodes-registry/" + *nodeName + ".txt"
	cmd := exec.Command("gsutil", "-q", "cp", "-", markerFilePath)
	stdin, err := cmd.StdinPipe()
	if err != nil {
		return fmt.Errorf("Failed to get stdin pipe to write marker file: %w", err)
	}
	io.WriteString(stdin, "")
	stdin.Close()
	if err = cmd.Run(); err != nil {
		return fmt.Errorf("Failed to write marker file to GCS: %w", err)
	}
	return nil
}

// Run the given command, logging its stdout and stderr.
func runCommand(name string, arg ...string) error {
	klog.Infof("Running: %s %s", name, strings.Join(arg, " "))
	cmd := exec.Command(name, arg...)
	var stderr, stdout bytes.Buffer
	cmd.Stderr, cmd.Stdout = &stderr, &stdout
	err := cmd.Run()
	klog.Infof("Stdout:\n%s\n", stdout.String())
	klog.Infof("Stderr:\n%s\n", stderr.String())
	return err
}

// Dump basic DNS and HTTP connectivity info to help debug config/auth failures.
func dumpNetworkDebugInfo() {
	klog.Info("Dumping network connectivity debug info")
	resolv, err := os.ReadFile("/etc/resolv.conf")
	if err != nil {
		klog.Errorf("Failed to read /etc/resolv.conf: %v", err)
	}
	klog.Infof("/etc/resolv.conf: %q", string(resolv))
	addrs, err := net.LookupHost("kubernetes.default")
	if err != nil {
		klog.Errorf("Failed to resolve kubernetes.default: %v", err)
	}
	klog.Infof("kubernetes.default resolves to: %v", addrs)
	addrs, err = net.LookupHost("google.com")
	if err != nil {
		klog.Errorf("Failed to resolve google.com: %v", err)
	}
	klog.Infof("google.com resolves to: %v", addrs)
	resp, err := http.Get("http://google.com/")
	if err != nil {
		klog.Errorf("Failed to get http://google.com/: %v", err)
		// Return early: resp is nil on error, so dereferencing it below would panic.
		return
	}
	defer resp.Body.Close()
	klog.Infof("GET http://google.com finished with: %v code", resp.StatusCode)
}

func main() {
	pflag.Parse()
	if err := checkConfigValidity(); err != nil {
		klog.Errorf("Bad config provided: %v", err)
		dumpNetworkDebugInfo()
		klog.Fatalf("Bad config provided: %v", err)
	}

	localTmpLogPath, err := os.MkdirTemp("/tmp", "k8s-systemd-logs")
	if err != nil {
		klog.Fatalf("Could not create temporary dir locally for copying logs: %v", err)
	}
	defer os.RemoveAll(localTmpLogPath)

	prepareLogfiles(localTmpLogPath)
	if err := uploadLogfilesToGCS(localTmpLogPath); err != nil {
		klog.Fatalf("Could not upload logs to GCS: %v", err)
	}
	klog.Info("Logs successfully uploaded")

	klog.Infof("Entering sleep for a duration of %v", *sleepDuration)
	time.Sleep(*sleepDuration)
}
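
// The invocation below is an illustrative sketch only, not taken from this repository's
// manifests: it shows how the flags defined above might be combined when the exporter is
// run as a pod or job on a node. The bucket and node name are placeholders; the auth file
// path and sleep duration shown are simply the flag defaults.
//
//	logexporter \
//	    --node-name="${NODE_NAME}" \
//	    --cloud-provider=gce \
//	    --gcs-path=gs://my-logs-bucket/logs \
//	    --gcloud-auth-file-path=/etc/service-account/service-account.json \
//	    --sleep-duration=60s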