github.com/jenkins-x/test-infra@v0.0.7/kubetest/dump.go (about) 1 /* 2 Copyright 2017 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package main 18 19 import ( 20 "bytes" 21 "context" 22 "fmt" 23 "io" 24 "log" 25 "net" 26 "os" 27 "path/filepath" 28 "strings" 29 30 "golang.org/x/crypto/ssh" 31 ) 32 33 // logDumper gets all the nodes from a kubernetes cluster and dumps a well-known set of logs 34 type logDumper struct { 35 sshClientFactory sshClientFactory 36 37 artifactsDir string 38 39 services []string 40 files []string 41 } 42 43 // newLogDumper is the constructor for a logDumper 44 func newLogDumper(sshClientFactory sshClientFactory, artifactsDir string) (*logDumper, error) { 45 d := &logDumper{ 46 sshClientFactory: sshClientFactory, 47 artifactsDir: artifactsDir, 48 } 49 50 d.services = []string{ 51 "node-problem-detector", 52 "kubelet", 53 "docker", 54 "kops-configuration", 55 "protokube", 56 } 57 d.files = []string{ 58 "kube-apiserver", 59 "kube-scheduler", 60 "rescheduler", 61 "kube-controller-manager", 62 "etcd", 63 "etcd-events", 64 "glbc", 65 "cluster-autoscaler", 66 "kube-addon-manager", 67 "fluentd", 68 "kube-proxy", 69 "node-problem-detector", 70 "cloud-init-output", 71 "startupscript", 72 "kern", 73 "docker", 74 } 75 76 return d, nil 77 } 78 79 // DumpAllNodes connects to every node from kubectl get nodes and dumps the logs. 80 // additionalIPs holds IP addresses of instances found by the deployment tool; 81 // if the IPs are not found from kubectl get nodes, then these will be dumped also. 82 // This allows for dumping log on nodes even if they don't register as a kubernetes 83 // node, or if a node fails to register, or if the whole cluster fails to start. 84 func (d *logDumper) DumpAllNodes(ctx context.Context, additionalIPs []string) error { 85 var dumped []*node 86 87 nodes, err := kubectlGetNodes("") 88 if err != nil { 89 log.Printf("Failed to get nodes for dumping via kubectl: %v", err) 90 } else { 91 for i := range nodes.Items { 92 if ctx.Err() != nil { 93 log.Printf("stopping dumping nodes: %v", ctx.Err()) 94 return ctx.Err() 95 } 96 97 node := &nodes.Items[i] 98 99 ip := "" 100 for _, address := range node.Status.Addresses { 101 if address.Type == "ExternalIP" { 102 ip = address.Address 103 break 104 } 105 } 106 107 err := d.dumpNode(ctx, node.Metadata.Name, ip) 108 if err != nil { 109 log.Printf("could not dump node %s (%s): %v", node.Metadata.Name, ip, err) 110 } else { 111 dumped = append(dumped, node) 112 } 113 } 114 } 115 116 notDumped := findInstancesNotDumped(additionalIPs, dumped) 117 for _, ip := range notDumped { 118 if ctx.Err() != nil { 119 log.Printf("stopping dumping nodes: %v", ctx.Err()) 120 return ctx.Err() 121 } 122 123 log.Printf("dumping node not registered in kubernetes: %s", ip) 124 err := d.dumpNode(ctx, ip, ip) 125 if err != nil { 126 log.Printf("error dumping node %s: %v", ip, err) 127 } 128 } 129 130 return nil 131 } 132 133 // findInstancesNotDumped returns ips from the slice that do not appear as any address of the nodes 134 func findInstancesNotDumped(ips []string, dumped []*node) []string { 135 var notDumped []string 136 dumpedAddresses := make(map[string]bool) 137 for _, node := range dumped { 138 for _, address := range node.Status.Addresses { 139 dumpedAddresses[address.Address] = true 140 } 141 } 142 143 for _, ip := range ips { 144 if !dumpedAddresses[ip] { 145 notDumped = append(notDumped, ip) 146 } 147 } 148 return notDumped 149 } 150 151 // DumpNode connects to a node and dumps the logs. 152 func (d *logDumper) dumpNode(ctx context.Context, name string, ip string) error { 153 if ip == "" { 154 return fmt.Errorf("could not find address for %v, ", name) 155 } 156 157 log.Printf("Dumping node %s", name) 158 159 n, err := d.connectToNode(ctx, name, ip) 160 if err != nil { 161 return fmt.Errorf("could not connect: %v", err) 162 } 163 164 // As long as we connect to the node we will not return an error; 165 // a failure to collect a log (or even any logs at all) is not 166 // considered an error in dumping the node. 167 // TODO(justinsb): clean up / rationalize 168 errors := n.dump(ctx) 169 for _, e := range errors { 170 log.Printf("error dumping node %s: %v", name, e) 171 } 172 173 if err := n.Close(); err != nil { 174 log.Printf("error closing connection: %v", err) 175 } 176 177 return nil 178 } 179 180 // sshClient is an interface abstracting *ssh.Client, which allows us to test it 181 type sshClient interface { 182 io.Closer 183 184 // ExecPiped runs the command, piping stdout & stderr 185 ExecPiped(ctx context.Context, command string, stdout io.Writer, stderr io.Writer) error 186 } 187 188 // sshClientFactory is an interface abstracting to a node over SSH 189 type sshClientFactory interface { 190 Dial(ctx context.Context, host string) (sshClient, error) 191 } 192 193 // logDumperNode holds state for a particular node we are dumping 194 type logDumperNode struct { 195 client sshClient 196 dumper *logDumper 197 198 dir string 199 } 200 201 // connectToNode makes an SSH connection to the node and returns a logDumperNode 202 func (d *logDumper) connectToNode(ctx context.Context, nodeName string, host string) (*logDumperNode, error) { 203 client, err := d.sshClientFactory.Dial(ctx, host) 204 if err != nil { 205 return nil, fmt.Errorf("unable to SSH to %q: %v", host, err) 206 } 207 return &logDumperNode{ 208 client: client, 209 dir: filepath.Join(d.artifactsDir, nodeName), 210 dumper: d, 211 }, nil 212 } 213 214 // logDumperNode cleans up any state in the logDumperNode 215 func (n *logDumperNode) Close() error { 216 return n.client.Close() 217 } 218 219 // dump captures the well-known set of logs 220 func (n *logDumperNode) dump(ctx context.Context) []error { 221 if ctx.Err() != nil { 222 return []error{ctx.Err()} 223 } 224 225 var errors []error 226 227 // Capture kernel log 228 if err := n.shellToFile(ctx, "sudo journalctl --output=short-precise -k", filepath.Join(n.dir, "kern.log")); err != nil { 229 errors = append(errors, err) 230 } 231 232 // Capture full journal - needed so we can see e.g. disk mounts 233 // This does duplicate the other files, but ensures we have all output 234 if err := n.shellToFile(ctx, "sudo journalctl --output=short-precise", filepath.Join(n.dir, "journal.log")); err != nil { 235 errors = append(errors, err) 236 } 237 238 // Capture logs from any systemd services in our list, that are registered 239 services, err := n.listSystemdUnits(ctx) 240 if err != nil { 241 errors = append(errors, fmt.Errorf("error listing systemd services: %v", err)) 242 } 243 for _, s := range n.dumper.services { 244 name := s + ".service" 245 for _, service := range services { 246 if service == name { 247 if err := n.shellToFile(ctx, "sudo journalctl --output=cat -u "+name, filepath.Join(n.dir, s+".log")); err != nil { 248 errors = append(errors, err) 249 } 250 } 251 } 252 } 253 254 // Capture any file logs where the files exist 255 fileList, err := n.findFiles(ctx, "/var/log") 256 if err != nil { 257 errors = append(errors, fmt.Errorf("error reading /var/log: %v", err)) 258 } 259 for _, name := range n.dumper.files { 260 prefix := "/var/log/" + name + ".log" 261 for _, f := range fileList { 262 if !strings.HasPrefix(f, prefix) { 263 continue 264 } 265 if err := n.shellToFile(ctx, "sudo cat "+f, filepath.Join(n.dir, filepath.Base(f))); err != nil { 266 errors = append(errors, err) 267 } 268 } 269 } 270 271 return errors 272 } 273 274 // findFiles lists files under the specified directory (recursively) 275 func (n *logDumperNode) findFiles(ctx context.Context, dir string) ([]string, error) { 276 var stdout bytes.Buffer 277 var stderr bytes.Buffer 278 279 err := n.client.ExecPiped(ctx, "sudo find "+dir+" -print0", &stdout, &stderr) 280 if err != nil { 281 return nil, fmt.Errorf("error listing %q: %v", dir, err) 282 } 283 284 paths := []string{} 285 for _, b := range bytes.Split(stdout.Bytes(), []byte{0}) { 286 if len(b) == 0 { 287 // Likely the last value 288 continue 289 } 290 paths = append(paths, string(b)) 291 } 292 return paths, nil 293 } 294 295 // listSystemdUnits returns the list of systemd units on the node 296 func (n *logDumperNode) listSystemdUnits(ctx context.Context) ([]string, error) { 297 var stdout bytes.Buffer 298 var stderr bytes.Buffer 299 300 err := n.client.ExecPiped(ctx, "sudo systemctl list-units -t service --no-pager --no-legend --all", &stdout, &stderr) 301 if err != nil { 302 return nil, fmt.Errorf("error listing systemd units: %v", err) 303 } 304 305 var services []string 306 for _, line := range strings.Split(stdout.String(), "\n") { 307 tokens := strings.Fields(line) 308 if len(tokens) == 0 || tokens[0] == "" { 309 continue 310 } 311 services = append(services, tokens[0]) 312 } 313 return services, nil 314 } 315 316 // shellToFile executes a command and copies the output to a file 317 func (n *logDumperNode) shellToFile(ctx context.Context, command string, destPath string) error { 318 if err := os.MkdirAll(filepath.Dir(destPath), 0755); err != nil { 319 log.Printf("unable to mkdir on %q: %v", filepath.Dir(destPath), err) 320 } 321 322 f, err := os.Create(destPath) 323 if err != nil { 324 return fmt.Errorf("error creating file %q: %v", destPath, err) 325 } 326 defer f.Close() 327 328 if err := n.client.ExecPiped(ctx, command, f, f); err != nil { 329 return fmt.Errorf("error executing command %q: %v", command, err) 330 } 331 332 return nil 333 } 334 335 // sshClientImplementation is the default implementation of sshClient, binding to a *ssh.Client 336 type sshClientImplementation struct { 337 client *ssh.Client 338 } 339 340 var _ sshClient = &sshClientImplementation{} 341 342 // ExecPiped implements sshClientImplementation::ExecPiped 343 func (s *sshClientImplementation) ExecPiped(ctx context.Context, cmd string, stdout io.Writer, stderr io.Writer) error { 344 if ctx.Err() != nil { 345 return ctx.Err() 346 } 347 348 finished := make(chan error) 349 go func() { 350 session, err := s.client.NewSession() 351 if err != nil { 352 finished <- fmt.Errorf("error creating ssh session: %v", err) 353 return 354 } 355 defer session.Close() 356 357 if verbose { 358 log.Printf("Running SSH command: %v", cmd) 359 } 360 361 session.Stdout = stdout 362 session.Stderr = stderr 363 364 finished <- session.Run(cmd) 365 }() 366 367 select { 368 case <-ctx.Done(): 369 log.Print("closing SSH tcp connection due to context completion") 370 371 // terminate the TCP connection to force a disconnect - we assume everyone is using the same context. 372 // We could make this better by sending a signal on the session, waiting and then closing the session, 373 // and only if we still haven't succeeded then closing the TCP connection. This is sufficient for our 374 // current usage though - and hopefully that logic will be implemented in the SSH package itself. 375 s.Close() 376 377 <-finished // Wait for cancellation 378 return ctx.Err() 379 380 case err := <-finished: 381 return err 382 } 383 } 384 385 // Close implements sshClientImplementation::Close 386 func (s *sshClientImplementation) Close() error { 387 return s.client.Close() 388 } 389 390 // sshClientFactoryImplementation is the default implementation of sshClientFactory 391 type sshClientFactoryImplementation struct { 392 sshConfig *ssh.ClientConfig 393 } 394 395 var _ sshClientFactory = &sshClientFactoryImplementation{} 396 397 // Dial implements sshClientFactory::Dial 398 func (f *sshClientFactoryImplementation) Dial(ctx context.Context, host string) (sshClient, error) { 399 addr := host + ":22" 400 d := net.Dialer{} 401 conn, err := d.DialContext(ctx, "tcp", addr) 402 if err != nil { 403 return nil, err 404 } 405 406 // We have a TCP connection; we will force-close it to support context cancellation 407 408 var client *ssh.Client 409 finished := make(chan error) 410 go func() { 411 c, chans, reqs, err := ssh.NewClientConn(conn, addr, f.sshConfig) 412 if err == nil { 413 client = ssh.NewClient(c, chans, reqs) 414 } 415 finished <- err 416 }() 417 418 select { 419 case <-ctx.Done(): 420 log.Print("cancelling SSH tcp connection due to context completion") 421 conn.Close() // Close the TCP connection to force cancellation 422 <-finished // Wait for cancellation 423 return nil, ctx.Err() 424 case err := <-finished: 425 if err != nil { 426 return nil, err 427 } 428 return &sshClientImplementation{ 429 client: client, 430 }, nil 431 } 432 }