github.com/abayer/test-infra@v0.0.5/kubetest/dump.go (about) 1 /* 2 Copyright 2017 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package main 18 19 import ( 20 "bytes" 21 "context" 22 "fmt" 23 "io" 24 "log" 25 "net" 26 "os" 27 "path/filepath" 28 "strings" 29 30 "golang.org/x/crypto/ssh" 31 ) 32 33 // logDumper gets all the nodes from a kubernetes cluster and dumps a well-known set of logs 34 type logDumper struct { 35 sshClientFactory sshClientFactory 36 37 artifactsDir string 38 39 services []string 40 files []string 41 } 42 43 // newLogDumper is the constructor for a logDumper 44 func newLogDumper(sshClientFactory sshClientFactory, artifactsDir string) (*logDumper, error) { 45 d := &logDumper{ 46 sshClientFactory: sshClientFactory, 47 artifactsDir: artifactsDir, 48 } 49 50 d.services = []string{ 51 "node-problem-detector", 52 "kubelet", 53 "docker", 54 "kops-configuration", 55 "protokube", 56 } 57 d.files = []string{ 58 "kube-apiserver", 59 "kube-scheduler", 60 "rescheduler", 61 "kube-controller-manager", 62 "etcd", 63 "etcd-events", 64 "glbc", 65 "cluster-autoscaler", 66 "kube-addon-manager", 67 "fluentd", 68 "kube-proxy", 69 "node-problem-detector", 70 "cloud-init-output", 71 "startupscript", 72 "kern", 73 "docker", 74 } 75 76 return d, nil 77 } 78 79 // DumpAllNodes connects to every node from kubectl get nodes and dumps the logs. 80 // additionalIPs holds IP addresses of instances found by the deployment tool; 81 // if the IPs are not found from kubectl get nodes, then these will be dumped also. 82 // This allows for dumping log on nodes even if they don't register as a kubernetes 83 // node, or if a node fails to register, or if the whole cluster fails to start. 84 func (d *logDumper) DumpAllNodes(ctx context.Context, additionalIPs []string) error { 85 var dumped []*node 86 87 nodes, err := kubectlGetNodes("") 88 if err != nil { 89 log.Printf("Failed to get nodes for dumping via kubectl: %v", err) 90 } else { 91 for i := range nodes.Items { 92 if ctx.Err() != nil { 93 log.Printf("stopping dumping nodes: %v", ctx.Err()) 94 return ctx.Err() 95 } 96 97 node := &nodes.Items[i] 98 99 ip := "" 100 for _, address := range node.Status.Addresses { 101 if address.Type == "ExternalIP" { 102 ip = address.Address 103 break 104 } 105 } 106 107 err := d.dumpNode(ctx, node.Metadata.Name, ip) 108 if err != nil { 109 log.Printf("could not dump node %s (%s): %v", node.Metadata.Name, ip, err) 110 } else { 111 dumped = append(dumped, node) 112 } 113 } 114 } 115 116 notDumped := findInstancesNotDumped(additionalIPs, dumped) 117 for _, ip := range notDumped { 118 if ctx.Err() != nil { 119 log.Printf("stopping dumping nodes: %v", ctx.Err()) 120 return ctx.Err() 121 } 122 123 log.Printf("dumping node not registered in kubernetes: %s", ip) 124 err := d.dumpNode(ctx, ip, ip) 125 if err != nil { 126 log.Printf("error dumping node %s: %v", ip, err) 127 } 128 } 129 130 return nil 131 } 132 133 // findInstancesNotDumped returns ips from the slice that do not appear as any address of the nodes 134 func findInstancesNotDumped(ips []string, dumped []*node) []string { 135 var notDumped []string 136 dumpedAddresses := make(map[string]bool) 137 for _, node := range dumped { 138 for _, address := range node.Status.Addresses { 139 dumpedAddresses[address.Address] = true 140 } 141 } 142 143 for _, ip := range ips { 144 if !dumpedAddresses[ip] { 145 notDumped = append(notDumped, ip) 146 } 147 } 148 return notDumped 149 } 150 151 // DumpNode connects to a node and dumps the logs. 152 func (d *logDumper) dumpNode(ctx context.Context, name string, ip string) error { 153 if ip == "" { 154 return fmt.Errorf("could not find address for %v, ", name) 155 } 156 157 log.Printf("Dumping node %s", name) 158 159 n, err := d.connectToNode(ctx, name, ip) 160 if err != nil { 161 return fmt.Errorf("could not connect: %v", err) 162 } 163 164 // As long as we connect to the node we will not return an error; 165 // a failure to collect a log (or even any logs at all) is not 166 // considered an error in dumping the node. 167 // TODO(justinsb): clean up / rationalize 168 errors := n.dump(ctx) 169 for _, e := range errors { 170 log.Printf("error dumping node %s: %v", name, e) 171 } 172 173 if err := n.Close(); err != nil { 174 log.Printf("error closing connection: %v", err) 175 } 176 177 return nil 178 } 179 180 // sshClient is an interface abstracting *ssh.Client, which allows us to test it 181 type sshClient interface { 182 io.Closer 183 184 // ExecPiped runs the command, piping stdout & stderr 185 ExecPiped(ctx context.Context, command string, stdout io.Writer, stderr io.Writer) error 186 } 187 188 // sshClientFactory is an interface abstracting to a node over SSH 189 type sshClientFactory interface { 190 Dial(ctx context.Context, host string) (sshClient, error) 191 } 192 193 // logDumperNode holds state for a particular node we are dumping 194 type logDumperNode struct { 195 client sshClient 196 dumper *logDumper 197 198 dir string 199 } 200 201 // connectToNode makes an SSH connection to the node and returns a logDumperNode 202 func (d *logDumper) connectToNode(ctx context.Context, nodeName string, host string) (*logDumperNode, error) { 203 client, err := d.sshClientFactory.Dial(ctx, host) 204 if err != nil { 205 return nil, fmt.Errorf("unable to SSH to %q: %v", host, err) 206 } 207 return &logDumperNode{ 208 client: client, 209 dir: filepath.Join(d.artifactsDir, nodeName), 210 dumper: d, 211 }, nil 212 } 213 214 // logDumperNode cleans up any state in the logDumperNode 215 func (n *logDumperNode) Close() error { 216 return n.client.Close() 217 } 218 219 // dump captures the well-known set of logs 220 func (n *logDumperNode) dump(ctx context.Context) []error { 221 if ctx.Err() != nil { 222 return []error{ctx.Err()} 223 } 224 225 var errors []error 226 227 // Capture kernel log 228 if err := n.shellToFile(ctx, "sudo journalctl --output=short-precise -k", filepath.Join(n.dir, "kern.log")); err != nil { 229 errors = append(errors, err) 230 } 231 232 // Capture logs from any systemd services in our list, that are registered 233 services, err := n.listSystemdUnits(ctx) 234 if err != nil { 235 errors = append(errors, fmt.Errorf("error listing systemd services: %v", err)) 236 } 237 for _, s := range n.dumper.services { 238 name := s + ".service" 239 for _, service := range services { 240 if service == name { 241 if err := n.shellToFile(ctx, "sudo journalctl --output=cat -u "+name, filepath.Join(n.dir, s+".log")); err != nil { 242 errors = append(errors, err) 243 } 244 } 245 } 246 } 247 248 // Capture any file logs where the files exist 249 fileList, err := n.findFiles(ctx, "/var/log") 250 if err != nil { 251 errors = append(errors, fmt.Errorf("error reading /var/log: %v", err)) 252 } 253 for _, name := range n.dumper.files { 254 prefix := "/var/log/" + name + ".log" 255 for _, f := range fileList { 256 if !strings.HasPrefix(f, prefix) { 257 continue 258 } 259 if err := n.shellToFile(ctx, "sudo cat "+f, filepath.Join(n.dir, filepath.Base(f))); err != nil { 260 errors = append(errors, err) 261 } 262 } 263 } 264 265 return errors 266 } 267 268 // findFiles lists files under the specified directory (recursively) 269 func (n *logDumperNode) findFiles(ctx context.Context, dir string) ([]string, error) { 270 var stdout bytes.Buffer 271 var stderr bytes.Buffer 272 273 err := n.client.ExecPiped(ctx, "sudo find "+dir+" -print0", &stdout, &stderr) 274 if err != nil { 275 return nil, fmt.Errorf("error listing %q: %v", dir, err) 276 } 277 278 paths := []string{} 279 for _, b := range bytes.Split(stdout.Bytes(), []byte{0}) { 280 if len(b) == 0 { 281 // Likely the last value 282 continue 283 } 284 paths = append(paths, string(b)) 285 } 286 return paths, nil 287 } 288 289 // listSystemdUnits returns the list of systemd units on the node 290 func (n *logDumperNode) listSystemdUnits(ctx context.Context) ([]string, error) { 291 var stdout bytes.Buffer 292 var stderr bytes.Buffer 293 294 err := n.client.ExecPiped(ctx, "sudo systemctl list-units -t service --no-pager --no-legend --all", &stdout, &stderr) 295 if err != nil { 296 return nil, fmt.Errorf("error listing systemd units: %v", err) 297 } 298 299 var services []string 300 for _, line := range strings.Split(stdout.String(), "\n") { 301 tokens := strings.Fields(line) 302 if len(tokens) == 0 || tokens[0] == "" { 303 continue 304 } 305 services = append(services, tokens[0]) 306 } 307 return services, nil 308 } 309 310 // shellToFile executes a command and copies the output to a file 311 func (n *logDumperNode) shellToFile(ctx context.Context, command string, destPath string) error { 312 if err := os.MkdirAll(filepath.Dir(destPath), 0755); err != nil { 313 log.Printf("unable to mkdir on %q: %v", filepath.Dir(destPath), err) 314 } 315 316 f, err := os.Create(destPath) 317 if err != nil { 318 return fmt.Errorf("error creating file %q: %v", destPath, err) 319 } 320 defer f.Close() 321 322 if err := n.client.ExecPiped(ctx, command, f, f); err != nil { 323 return fmt.Errorf("error executing command %q: %v", command, err) 324 } 325 326 return nil 327 } 328 329 // sshClientImplementation is the default implementation of sshClient, binding to a *ssh.Client 330 type sshClientImplementation struct { 331 client *ssh.Client 332 } 333 334 var _ sshClient = &sshClientImplementation{} 335 336 // ExecPiped implements sshClientImplementation::ExecPiped 337 func (s *sshClientImplementation) ExecPiped(ctx context.Context, cmd string, stdout io.Writer, stderr io.Writer) error { 338 if ctx.Err() != nil { 339 return ctx.Err() 340 } 341 342 finished := make(chan error) 343 go func() { 344 session, err := s.client.NewSession() 345 if err != nil { 346 finished <- fmt.Errorf("error creating ssh session: %v", err) 347 return 348 } 349 defer session.Close() 350 351 if verbose { 352 log.Printf("Running SSH command: %v", cmd) 353 } 354 355 session.Stdout = stdout 356 session.Stderr = stderr 357 358 finished <- session.Run(cmd) 359 }() 360 361 select { 362 case <-ctx.Done(): 363 log.Print("closing SSH tcp connection due to context completion") 364 365 // terminate the TCP connection to force a disconnect - we assume everyone is using the same context. 366 // We could make this better by sending a signal on the session, waiting and then closing the session, 367 // and only if we still haven't succeeded then closing the TCP connection. This is sufficient for our 368 // current usage though - and hopefully that logic will be implemented in the SSH package itself. 369 s.Close() 370 371 <-finished // Wait for cancellation 372 return ctx.Err() 373 374 case err := <-finished: 375 return err 376 } 377 } 378 379 // Close implements sshClientImplementation::Close 380 func (s *sshClientImplementation) Close() error { 381 return s.client.Close() 382 } 383 384 // sshClientFactoryImplementation is the default implementation of sshClientFactory 385 type sshClientFactoryImplementation struct { 386 sshConfig *ssh.ClientConfig 387 } 388 389 var _ sshClientFactory = &sshClientFactoryImplementation{} 390 391 // Dial implements sshClientFactory::Dial 392 func (f *sshClientFactoryImplementation) Dial(ctx context.Context, host string) (sshClient, error) { 393 addr := host + ":22" 394 d := net.Dialer{} 395 conn, err := d.DialContext(ctx, "tcp", addr) 396 if err != nil { 397 return nil, err 398 } 399 400 // We have a TCP connection; we will force-close it to support context cancellation 401 402 var client *ssh.Client 403 finished := make(chan error) 404 go func() { 405 c, chans, reqs, err := ssh.NewClientConn(conn, addr, f.sshConfig) 406 if err == nil { 407 client = ssh.NewClient(c, chans, reqs) 408 } 409 finished <- err 410 }() 411 412 select { 413 case <-ctx.Done(): 414 log.Print("cancelling SSH tcp connection due to context completion") 415 conn.Close() // Close the TCP connection to force cancellation 416 <-finished // Wait for cancellation 417 return nil, ctx.Err() 418 case err := <-finished: 419 if err != nil { 420 return nil, err 421 } 422 return &sshClientImplementation{ 423 client: client, 424 }, nil 425 } 426 }