k8s.io/kubernetes@v1.29.3/test/e2e/framework/debug/dump.go (about) 1 /* 2 Copyright 2014 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package debug 18 19 import ( 20 "context" 21 "fmt" 22 "sort" 23 "time" 24 25 "github.com/onsi/ginkgo/v2" 26 27 v1 "k8s.io/api/core/v1" 28 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 29 "k8s.io/apimachinery/pkg/fields" 30 clientset "k8s.io/client-go/kubernetes" 31 restclient "k8s.io/client-go/rest" 32 "k8s.io/kubernetes/test/e2e/framework" 33 e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics" 34 e2epod "k8s.io/kubernetes/test/e2e/framework/pod" 35 ) 36 37 // EventsLister is a func that lists events. 38 type EventsLister func(opts metav1.ListOptions, ns string) (*v1.EventList, error) 39 40 // dumpEventsInNamespace dumps events in the given namespace. 41 func dumpEventsInNamespace(eventsLister EventsLister, namespace string) { 42 ginkgo.By(fmt.Sprintf("Collecting events from namespace %q.", namespace)) 43 events, err := eventsLister(metav1.ListOptions{}, namespace) 44 framework.ExpectNoError(err, "failed to list events in namespace %q", namespace) 45 46 ginkgo.By(fmt.Sprintf("Found %d events.", len(events.Items))) 47 // Sort events by their first timestamp 48 sortedEvents := events.Items 49 if len(sortedEvents) > 1 { 50 sort.Sort(byFirstTimestamp(sortedEvents)) 51 } 52 for _, e := range sortedEvents { 53 framework.Logf("At %v - event for %v: %v %v: %v", e.FirstTimestamp, e.InvolvedObject.Name, e.Source, e.Reason, e.Message) 54 } 55 // Note that we don't wait for any Cleanup to propagate, which means 56 // that if you delete a bunch of pods right before ending your test, 57 // you may or may not see the killing/deletion/Cleanup events. 58 } 59 60 // DumpAllNamespaceInfo dumps events, pods and nodes information in the given namespace. 61 func DumpAllNamespaceInfo(ctx context.Context, c clientset.Interface, namespace string) { 62 dumpEventsInNamespace(func(opts metav1.ListOptions, ns string) (*v1.EventList, error) { 63 return c.CoreV1().Events(ns).List(ctx, opts) 64 }, namespace) 65 66 e2epod.DumpAllPodInfoForNamespace(ctx, c, namespace, framework.TestContext.ReportDir) 67 68 // If cluster is large, then the following logs are basically useless, because: 69 // 1. it takes tens of minutes or hours to grab all of them 70 // 2. there are so many of them that working with them are mostly impossible 71 // So we dump them only if the cluster is relatively small. 72 maxNodesForDump := framework.TestContext.MaxNodesToGather 73 nodes, err := c.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) 74 if err != nil { 75 framework.Logf("unable to fetch node list: %v", err) 76 return 77 } 78 if len(nodes.Items) <= maxNodesForDump { 79 dumpAllNodeInfo(ctx, c, nodes) 80 } else { 81 framework.Logf("skipping dumping cluster info - cluster too large") 82 } 83 } 84 85 // byFirstTimestamp sorts a slice of events by first timestamp, using their involvedObject's name as a tie breaker. 86 type byFirstTimestamp []v1.Event 87 88 func (o byFirstTimestamp) Len() int { return len(o) } 89 func (o byFirstTimestamp) Swap(i, j int) { o[i], o[j] = o[j], o[i] } 90 91 func (o byFirstTimestamp) Less(i, j int) bool { 92 if o[i].FirstTimestamp.Equal(&o[j].FirstTimestamp) { 93 return o[i].InvolvedObject.Name < o[j].InvolvedObject.Name 94 } 95 return o[i].FirstTimestamp.Before(&o[j].FirstTimestamp) 96 } 97 98 func dumpAllNodeInfo(ctx context.Context, c clientset.Interface, nodes *v1.NodeList) { 99 names := make([]string, len(nodes.Items)) 100 for ix := range nodes.Items { 101 names[ix] = nodes.Items[ix].Name 102 } 103 DumpNodeDebugInfo(ctx, c, names, framework.Logf) 104 } 105 106 // DumpNodeDebugInfo dumps debug information of the given nodes. 107 func DumpNodeDebugInfo(ctx context.Context, c clientset.Interface, nodeNames []string, logFunc func(fmt string, args ...interface{})) { 108 for _, n := range nodeNames { 109 logFunc("\nLogging node info for node %v", n) 110 node, err := c.CoreV1().Nodes().Get(ctx, n, metav1.GetOptions{}) 111 if err != nil { 112 logFunc("Error getting node info %v", err) 113 } 114 logFunc("Node Info: %v", node) 115 116 logFunc("\nLogging kubelet events for node %v", n) 117 for _, e := range getNodeEvents(ctx, c, n) { 118 logFunc("source %v type %v message %v reason %v first ts %v last ts %v, involved obj %+v", 119 e.Source, e.Type, e.Message, e.Reason, e.FirstTimestamp, e.LastTimestamp, e.InvolvedObject) 120 } 121 logFunc("\nLogging pods the kubelet thinks is on node %v", n) 122 podList, err := getKubeletPods(ctx, c, n) 123 if err != nil { 124 logFunc("Unable to retrieve kubelet pods for node %v: %v", n, err) 125 continue 126 } 127 for _, p := range podList.Items { 128 logFunc("%v started at %v (%d+%d container statuses recorded)", p.Name, p.Status.StartTime, len(p.Status.InitContainerStatuses), len(p.Status.ContainerStatuses)) 129 for _, c := range p.Status.InitContainerStatuses { 130 logFunc("\tInit container %v ready: %v, restart count %v", 131 c.Name, c.Ready, c.RestartCount) 132 } 133 for _, c := range p.Status.ContainerStatuses { 134 logFunc("\tContainer %v ready: %v, restart count %v", 135 c.Name, c.Ready, c.RestartCount) 136 } 137 } 138 _, err = e2emetrics.HighLatencyKubeletOperations(ctx, c, 10*time.Second, n, logFunc) 139 framework.ExpectNoError(err) 140 // TODO: Log node resource info 141 } 142 } 143 144 // getKubeletPods retrieves the list of pods on the kubelet. 145 func getKubeletPods(ctx context.Context, c clientset.Interface, node string) (*v1.PodList, error) { 146 var client restclient.Result 147 finished := make(chan struct{}, 1) 148 go func() { 149 // call chain tends to hang in some cases when Node is not ready. Add an artificial timeout for this call. #22165 150 client = c.CoreV1().RESTClient().Get(). 151 Resource("nodes"). 152 SubResource("proxy"). 153 Name(fmt.Sprintf("%v:%v", node, framework.KubeletPort)). 154 Suffix("pods"). 155 Do(ctx) 156 157 finished <- struct{}{} 158 }() 159 select { 160 case <-finished: 161 result := &v1.PodList{} 162 if err := client.Into(result); err != nil { 163 return &v1.PodList{}, err 164 } 165 return result, nil 166 case <-time.After(framework.PodGetTimeout): 167 return &v1.PodList{}, fmt.Errorf("Waiting up to %v for getting the list of pods", framework.PodGetTimeout) 168 } 169 } 170 171 // logNodeEvents logs kubelet events from the given node. This includes kubelet 172 // restart and node unhealthy events. Note that listing events like this will mess 173 // with latency metrics, beware of calling it during a test. 174 func getNodeEvents(ctx context.Context, c clientset.Interface, nodeName string) []v1.Event { 175 selector := fields.Set{ 176 "involvedObject.kind": "Node", 177 "involvedObject.name": nodeName, 178 "involvedObject.namespace": metav1.NamespaceAll, 179 "source": "kubelet", 180 }.AsSelector().String() 181 options := metav1.ListOptions{FieldSelector: selector} 182 events, err := c.CoreV1().Events(metav1.NamespaceSystem).List(ctx, options) 183 if err != nil { 184 framework.Logf("Unexpected error retrieving node events %v", err) 185 return []v1.Event{} 186 } 187 return events.Items 188 }