k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/ooms_tracker.go (about) 1 /* 2 Copyright 2020 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package common 18 19 import ( 20 "context" 21 "fmt" 22 "regexp" 23 "strings" 24 "sync" 25 "time" 26 27 corev1 "k8s.io/api/core/v1" 28 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 29 "k8s.io/apimachinery/pkg/fields" 30 "k8s.io/apimachinery/pkg/runtime" 31 "k8s.io/apimachinery/pkg/watch" 32 clientset "k8s.io/client-go/kubernetes" 33 "k8s.io/client-go/tools/cache" 34 "k8s.io/client-go/tools/pager" 35 "k8s.io/klog/v2" 36 "k8s.io/perf-tests/clusterloader2/pkg/measurement" 37 "k8s.io/perf-tests/clusterloader2/pkg/measurement/util/informer" 38 "k8s.io/perf-tests/clusterloader2/pkg/util" 39 ) 40 41 const ( 42 clusterOOMsTrackerEnabledParamName = "clusterOOMsTrackerEnabled" 43 clusterOOMsTrackerName = "ClusterOOMsTracker" 44 clusterOOMsIgnoredProcessesParamName = "clusterOOMsIgnoredProcesses" 45 informerTimeout = time.Minute 46 oomEventReason = "OOMKilling" 47 initialListPageSize = 10000 48 ) 49 50 var ( 51 oomEventMsgRegex = regexp.MustCompile(`Killed process (\d+) \((.+)\) total-vm:(\d+kB), anon-rss:\d+kB, file-rss:\d+kB.*`) 52 ) 53 54 func init() { 55 if err := measurement.Register(clusterOOMsTrackerName, createClusterOOMsTrackerMeasurement); err != nil { 56 klog.Fatalf("Cannot register %s: %v", clusterOOMsTrackerName, err) 57 } 58 } 59 60 func createClusterOOMsTrackerMeasurement() measurement.Measurement { 61 return &clusterOOMsTrackerMeasurement{} 62 } 63 64 type clusterOOMsTrackerMeasurement struct { 65 selector *util.ObjectSelector 66 msgRegex *regexp.Regexp 67 isRunning bool 68 startTime time.Time 69 stopCh chan struct{} 70 lock sync.Mutex 71 processIgnored map[string]bool 72 resourceVersionRecorded map[string]bool 73 ooms []oomEvent 74 } 75 76 // TODO: Reevaluate if we can add new fields here when node-problem-detector 77 // starts using new events. 78 type oomEvent struct { 79 Node string `json:"node"` 80 Process string `json:"process"` 81 ProcessMemory string `json:"memory"` 82 ProcessID string `json:"pid"` 83 Time time.Time `json:"time"` 84 } 85 86 func (m *clusterOOMsTrackerMeasurement) Execute(config *measurement.Config) ([]measurement.Summary, error) { 87 clusterOOMsTrackerEnabled, err := util.GetBoolOrDefault(config.Params, clusterOOMsTrackerEnabledParamName, false) 88 if err != nil { 89 return nil, fmt.Errorf("problem with getting %s param: %w", clusterOOMsTrackerEnabledParamName, err) 90 } 91 if !clusterOOMsTrackerEnabled { 92 klog.V(1).Info("skipping tracking of OOMs in the cluster") 93 return nil, nil 94 } 95 96 action, err := util.GetString(config.Params, "action") 97 if err != nil { 98 return nil, fmt.Errorf("problem with getting %s param: %w", "action", err) 99 } 100 101 switch action { 102 case "start": 103 if err = m.start(config); err != nil { 104 return nil, fmt.Errorf("starting cluster OOMs measurement problem: %w", err) 105 } 106 return nil, nil 107 case "gather": 108 m.lock.Lock() 109 defer m.lock.Unlock() 110 return m.gather() 111 default: 112 return nil, fmt.Errorf("unknown action %v", action) 113 } 114 } 115 116 func (m *clusterOOMsTrackerMeasurement) Dispose() { 117 m.stop() 118 } 119 120 func (m *clusterOOMsTrackerMeasurement) String() string { 121 return clusterOOMsTrackerName 122 } 123 124 func (m *clusterOOMsTrackerMeasurement) getOOMsTrackerInformer(ctx context.Context, client clientset.Interface) cache.SharedInformer { 125 listFunc := func(options metav1.ListOptions) (runtime.Object, error) { 126 o := metav1.ListOptions{ 127 Limit: 1, 128 } 129 result, err := client.CoreV1().Events(metav1.NamespaceAll).List(ctx, o) 130 if err != nil { 131 return nil, err 132 } 133 result.Continue = "" 134 result.Items = nil 135 return result, nil 136 } 137 watchFunc := func(options metav1.ListOptions) (watch.Interface, error) { 138 options.FieldSelector = m.selector.FieldSelector 139 return client.CoreV1().Events(metav1.NamespaceAll).Watch(ctx, options) 140 } 141 i := cache.NewSharedInformer(&cache.ListWatch{ListFunc: listFunc, WatchFunc: watchFunc}, nil, 0) 142 i.AddEventHandler(cache.ResourceEventHandlerFuncs{ 143 AddFunc: func(obj interface{}) { 144 m.handleOOMEvent(obj) 145 }, 146 UpdateFunc: func(_, obj interface{}) { 147 m.handleOOMEvent(obj) 148 }, 149 DeleteFunc: func(_ interface{}) {}, 150 }) 151 return i 152 } 153 154 func (m *clusterOOMsTrackerMeasurement) handlePriorOOMs(ctx context.Context, client clientset.Interface) error { 155 pg := pager.New(pager.SimplePageFunc(func(opts metav1.ListOptions) (runtime.Object, error) { 156 return client.CoreV1().Events(metav1.NamespaceAll).List(ctx, opts) 157 })) 158 pg.PageSize = initialListPageSize 159 160 if err := pg.EachListItem(ctx, metav1.ListOptions{}, func(obj runtime.Object) error { 161 m.handleOOMEvent(obj) 162 return nil 163 }); err != nil { 164 return err 165 } 166 return nil 167 } 168 169 func (m *clusterOOMsTrackerMeasurement) start(config *measurement.Config) error { 170 if m.isRunning { 171 klog.V(2).Infof("%s: cluster OOMs tracking measurement already running", m) 172 return nil 173 } 174 klog.V(2).Infof("%s: starting cluster OOMs tracking measurement...", m) 175 if err := m.initFields(config); err != nil { 176 return fmt.Errorf("problem with OOMs tracking measurement fields initialization: %w", err) 177 } 178 ctx := context.Background() 179 client := config.ClusterFramework.GetClientSets().GetClient() 180 181 // Watching for OOM events from node-problem-detector below. 182 i := m.getOOMsTrackerInformer(ctx, client) 183 if err := informer.StartAndSync(i, m.stopCh, informerTimeout); err != nil { 184 return fmt.Errorf("problem with OOM events informer starting: %w", err) 185 } 186 187 // Searching for OOM events that happened before the measurement start. 188 // We're running this *after* initiating the informer above because doing 189 // the same in the reverse order might make us miss some OOMs. 190 if err := m.handlePriorOOMs(ctx, client); err != nil { 191 return fmt.Errorf("problem with handling prior OOMs: %w", err) 192 } 193 194 return nil 195 } 196 197 func (m *clusterOOMsTrackerMeasurement) initFields(config *measurement.Config) error { 198 m.isRunning = true 199 m.startTime = time.Now() 200 m.stopCh = make(chan struct{}) 201 m.selector = &util.ObjectSelector{ 202 FieldSelector: fields.Set{"reason": oomEventReason}.AsSelector().String(), 203 Namespace: metav1.NamespaceAll, 204 } 205 m.msgRegex = oomEventMsgRegex 206 m.resourceVersionRecorded = make(map[string]bool) 207 208 ignoredProcessesString, err := util.GetStringOrDefault(config.Params, clusterOOMsIgnoredProcessesParamName, "") 209 if err != nil { 210 return err 211 } 212 m.processIgnored = make(map[string]bool) 213 if ignoredProcessesString != "" { 214 processNames := strings.Split(ignoredProcessesString, ",") 215 for _, processName := range processNames { 216 m.processIgnored[processName] = true 217 } 218 } 219 return nil 220 } 221 222 func (m *clusterOOMsTrackerMeasurement) stop() { 223 if m.isRunning { 224 m.isRunning = false 225 close(m.stopCh) 226 } 227 } 228 229 func (m *clusterOOMsTrackerMeasurement) gather() ([]measurement.Summary, error) { 230 klog.V(2).Infof("%s: gathering cluster OOMs tracking measurement", clusterOOMsTrackerName) 231 if !m.isRunning { 232 return nil, fmt.Errorf("measurement %s has not been started", clusterOOMsTrackerName) 233 } 234 235 m.stop() 236 237 oomData := make(map[string][]oomEvent) 238 oomData["failures"] = make([]oomEvent, 0) 239 oomData["past"] = make([]oomEvent, 0) 240 oomData["ignored"] = make([]oomEvent, 0) 241 242 for _, oom := range m.ooms { 243 if m.startTime.After(oom.Time) { 244 oomData["past"] = append(oomData["past"], oom) 245 continue 246 } 247 if m.processIgnored[oom.Process] { 248 oomData["ignored"] = append(oomData["ignored"], oom) 249 continue 250 } 251 oomData["failures"] = append(oomData["failures"], oom) 252 } 253 254 content, err := util.PrettyPrintJSON(oomData) 255 if err != nil { 256 return nil, fmt.Errorf("OOMs PrettyPrintJSON problem: %w", err) 257 } 258 259 summary := measurement.CreateSummary(clusterOOMsTrackerName, "json", content) 260 if oomFailures := oomData["failures"]; len(oomFailures) > 0 { 261 err = fmt.Errorf("OOMs recorded: %+v", oomFailures) 262 } 263 return []measurement.Summary{summary}, err 264 } 265 266 func (m *clusterOOMsTrackerMeasurement) handleOOMEvent(obj interface{}) { 267 event, ok := obj.(*corev1.Event) 268 if !ok || event.Reason != oomEventReason { 269 return 270 } 271 272 m.lock.Lock() 273 defer m.lock.Unlock() 274 275 if m.resourceVersionRecorded[event.ObjectMeta.ResourceVersion] { 276 // We are catching an OOM event with already recorded resource 277 // version which may happen on relisting the events when a watch 278 // breaks. Because of that, we do not want to register that 279 // OOM more than once. 280 return 281 } 282 m.resourceVersionRecorded[event.ObjectMeta.ResourceVersion] = true 283 284 klog.V(2).Infof("OOM detected: %+v", event) 285 286 oom := oomEvent{ 287 Node: event.InvolvedObject.Name, 288 } 289 if !event.EventTime.IsZero() { 290 oom.Time = event.EventTime.Time 291 } else { 292 oom.Time = event.FirstTimestamp.Time 293 } 294 295 if match := m.msgRegex.FindStringSubmatch(event.Message); len(match) == 4 { 296 oom.ProcessID = match[1] 297 oom.Process = match[2] 298 oom.ProcessMemory = match[3] 299 } else { 300 klog.Warningf(`unrecognized OOM event message pattern; event message contents: "%v"`, event.Message) 301 } 302 303 m.ooms = append(m.ooms, oom) 304 }