k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/ooms_tracker.go (about)

     1  /*
     2  Copyright 2020 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package common
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"regexp"
    23  	"strings"
    24  	"sync"
    25  	"time"
    26  
    27  	corev1 "k8s.io/api/core/v1"
    28  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    29  	"k8s.io/apimachinery/pkg/fields"
    30  	"k8s.io/apimachinery/pkg/runtime"
    31  	"k8s.io/apimachinery/pkg/watch"
    32  	clientset "k8s.io/client-go/kubernetes"
    33  	"k8s.io/client-go/tools/cache"
    34  	"k8s.io/client-go/tools/pager"
    35  	"k8s.io/klog/v2"
    36  	"k8s.io/perf-tests/clusterloader2/pkg/measurement"
    37  	"k8s.io/perf-tests/clusterloader2/pkg/measurement/util/informer"
    38  	"k8s.io/perf-tests/clusterloader2/pkg/util"
    39  )
    40  
const (
	// clusterOOMsTrackerEnabledParamName is the boolean measurement param that
	// switches the tracker on; Execute treats a missing value as false.
	clusterOOMsTrackerEnabledParamName   = "clusterOOMsTrackerEnabled"
	// clusterOOMsTrackerName is the name under which the measurement is
	// registered and under which its summary is created.
	clusterOOMsTrackerName               = "ClusterOOMsTracker"
	// clusterOOMsIgnoredProcessesParamName is the optional measurement param
	// holding a comma-separated list of process names whose OOMs are reported
	// as "ignored" instead of as failures.
	clusterOOMsIgnoredProcessesParamName = "clusterOOMsIgnoredProcesses"
	// informerTimeout is the timeout passed to informer.StartAndSync when the
	// OOM events informer is started.
	informerTimeout                      = time.Minute
	// oomEventReason is the Event.Reason value that identifies an OOM event.
	oomEventReason                       = "OOMKilling"
	// initialListPageSize is the page size used when listing the events that
	// already exist at measurement start.
	initialListPageSize                  = 10000
)
    49  
var (
	// oomEventMsgRegex parses the message of an OOM event. Its three capture
	// groups are, in order: the killed process ID, the process name (the text
	// between the parentheses) and the total-vm size including the "kB" suffix.
	oomEventMsgRegex = regexp.MustCompile(`Killed process (\d+) \((.+)\) total-vm:(\d+kB), anon-rss:\d+kB, file-rss:\d+kB.*`)
)
    53  
    54  func init() {
    55  	if err := measurement.Register(clusterOOMsTrackerName, createClusterOOMsTrackerMeasurement); err != nil {
    56  		klog.Fatalf("Cannot register %s: %v", clusterOOMsTrackerName, err)
    57  	}
    58  }
    59  
    60  func createClusterOOMsTrackerMeasurement() measurement.Measurement {
    61  	return &clusterOOMsTrackerMeasurement{}
    62  }
    63  
// clusterOOMsTrackerMeasurement watches OOM events in the cluster and reports
// them when gathered.
type clusterOOMsTrackerMeasurement struct {
	// selector carries the field selector applied to the events watch.
	selector                *util.ObjectSelector
	// msgRegex extracts the process ID, name and memory from an event message.
	msgRegex                *regexp.Regexp
	// isRunning is true between a successful field initialization and stop.
	isRunning               bool
	// startTime is the moment the measurement was started; OOMs timestamped
	// before it are reported as "past".
	startTime               time.Time
	// stopCh is handed to the informer and closed by stop.
	stopCh                  chan struct{}
	// lock is held by handleOOMEvent while it touches resourceVersionRecorded
	// and ooms, and by Execute for the duration of the "gather" action.
	lock                    sync.Mutex
	// processIgnored is the set of process names whose OOMs are reported as
	// "ignored" rather than as failures.
	processIgnored          map[string]bool
	// resourceVersionRecorded is the set of event resource versions that have
	// already been turned into an entry of ooms.
	resourceVersionRecorded map[string]bool
	// ooms accumulates every OOM recorded so far.
	ooms                    []oomEvent
}
    75  
// oomEvent is the JSON-serializable record of a single OOM kill. Node is taken
// from the event's involved object name; Process, ProcessMemory and ProcessID
// are parsed out of the event message and stay empty when the message does not
// match the expected pattern.
//
// TODO: Reevaluate if we can add new fields here when node-problem-detector
// starts using new events.
type oomEvent struct {
	Node          string    `json:"node"`
	Process       string    `json:"process"`
	ProcessMemory string    `json:"memory"`
	ProcessID     string    `json:"pid"`
	Time          time.Time `json:"time"`
}
    85  
    86  func (m *clusterOOMsTrackerMeasurement) Execute(config *measurement.Config) ([]measurement.Summary, error) {
    87  	clusterOOMsTrackerEnabled, err := util.GetBoolOrDefault(config.Params, clusterOOMsTrackerEnabledParamName, false)
    88  	if err != nil {
    89  		return nil, fmt.Errorf("problem with getting %s param: %w", clusterOOMsTrackerEnabledParamName, err)
    90  	}
    91  	if !clusterOOMsTrackerEnabled {
    92  		klog.V(1).Info("skipping tracking of OOMs in the cluster")
    93  		return nil, nil
    94  	}
    95  
    96  	action, err := util.GetString(config.Params, "action")
    97  	if err != nil {
    98  		return nil, fmt.Errorf("problem with getting %s param: %w", "action", err)
    99  	}
   100  
   101  	switch action {
   102  	case "start":
   103  		if err = m.start(config); err != nil {
   104  			return nil, fmt.Errorf("starting cluster OOMs measurement problem: %w", err)
   105  		}
   106  		return nil, nil
   107  	case "gather":
   108  		m.lock.Lock()
   109  		defer m.lock.Unlock()
   110  		return m.gather()
   111  	default:
   112  		return nil, fmt.Errorf("unknown action %v", action)
   113  	}
   114  }
   115  
// Dispose releases the measurement's resources by stopping it; stop is a no-op
// when the measurement is not running.
func (m *clusterOOMsTrackerMeasurement) Dispose() {
	m.stop()
}
   119  
// String returns the registered name of the measurement.
func (m *clusterOOMsTrackerMeasurement) String() string {
	return clusterOOMsTrackerName
}
   123  
   124  func (m *clusterOOMsTrackerMeasurement) getOOMsTrackerInformer(ctx context.Context, client clientset.Interface) cache.SharedInformer {
   125  	listFunc := func(options metav1.ListOptions) (runtime.Object, error) {
   126  		o := metav1.ListOptions{
   127  			Limit: 1,
   128  		}
   129  		result, err := client.CoreV1().Events(metav1.NamespaceAll).List(ctx, o)
   130  		if err != nil {
   131  			return nil, err
   132  		}
   133  		result.Continue = ""
   134  		result.Items = nil
   135  		return result, nil
   136  	}
   137  	watchFunc := func(options metav1.ListOptions) (watch.Interface, error) {
   138  		options.FieldSelector = m.selector.FieldSelector
   139  		return client.CoreV1().Events(metav1.NamespaceAll).Watch(ctx, options)
   140  	}
   141  	i := cache.NewSharedInformer(&cache.ListWatch{ListFunc: listFunc, WatchFunc: watchFunc}, nil, 0)
   142  	i.AddEventHandler(cache.ResourceEventHandlerFuncs{
   143  		AddFunc: func(obj interface{}) {
   144  			m.handleOOMEvent(obj)
   145  		},
   146  		UpdateFunc: func(_, obj interface{}) {
   147  			m.handleOOMEvent(obj)
   148  		},
   149  		DeleteFunc: func(_ interface{}) {},
   150  	})
   151  	return i
   152  }
   153  
   154  func (m *clusterOOMsTrackerMeasurement) handlePriorOOMs(ctx context.Context, client clientset.Interface) error {
   155  	pg := pager.New(pager.SimplePageFunc(func(opts metav1.ListOptions) (runtime.Object, error) {
   156  		return client.CoreV1().Events(metav1.NamespaceAll).List(ctx, opts)
   157  	}))
   158  	pg.PageSize = initialListPageSize
   159  
   160  	if err := pg.EachListItem(ctx, metav1.ListOptions{}, func(obj runtime.Object) error {
   161  		m.handleOOMEvent(obj)
   162  		return nil
   163  	}); err != nil {
   164  		return err
   165  	}
   166  	return nil
   167  }
   168  
   169  func (m *clusterOOMsTrackerMeasurement) start(config *measurement.Config) error {
   170  	if m.isRunning {
   171  		klog.V(2).Infof("%s: cluster OOMs tracking measurement already running", m)
   172  		return nil
   173  	}
   174  	klog.V(2).Infof("%s: starting cluster OOMs tracking measurement...", m)
   175  	if err := m.initFields(config); err != nil {
   176  		return fmt.Errorf("problem with OOMs tracking measurement fields initialization: %w", err)
   177  	}
   178  	ctx := context.Background()
   179  	client := config.ClusterFramework.GetClientSets().GetClient()
   180  
   181  	// Watching for OOM events from node-problem-detector below.
   182  	i := m.getOOMsTrackerInformer(ctx, client)
   183  	if err := informer.StartAndSync(i, m.stopCh, informerTimeout); err != nil {
   184  		return fmt.Errorf("problem with OOM events informer starting: %w", err)
   185  	}
   186  
   187  	// Searching for OOM events that happened before the measurement start.
   188  	// We're running this *after* initiating the informer above because doing
   189  	// the same in the reverse order might make us miss some OOMs.
   190  	if err := m.handlePriorOOMs(ctx, client); err != nil {
   191  		return fmt.Errorf("problem with handling prior OOMs: %w", err)
   192  	}
   193  
   194  	return nil
   195  }
   196  
   197  func (m *clusterOOMsTrackerMeasurement) initFields(config *measurement.Config) error {
   198  	m.isRunning = true
   199  	m.startTime = time.Now()
   200  	m.stopCh = make(chan struct{})
   201  	m.selector = &util.ObjectSelector{
   202  		FieldSelector: fields.Set{"reason": oomEventReason}.AsSelector().String(),
   203  		Namespace:     metav1.NamespaceAll,
   204  	}
   205  	m.msgRegex = oomEventMsgRegex
   206  	m.resourceVersionRecorded = make(map[string]bool)
   207  
   208  	ignoredProcessesString, err := util.GetStringOrDefault(config.Params, clusterOOMsIgnoredProcessesParamName, "")
   209  	if err != nil {
   210  		return err
   211  	}
   212  	m.processIgnored = make(map[string]bool)
   213  	if ignoredProcessesString != "" {
   214  		processNames := strings.Split(ignoredProcessesString, ",")
   215  		for _, processName := range processNames {
   216  			m.processIgnored[processName] = true
   217  		}
   218  	}
   219  	return nil
   220  }
   221  
   222  func (m *clusterOOMsTrackerMeasurement) stop() {
   223  	if m.isRunning {
   224  		m.isRunning = false
   225  		close(m.stopCh)
   226  	}
   227  }
   228  
   229  func (m *clusterOOMsTrackerMeasurement) gather() ([]measurement.Summary, error) {
   230  	klog.V(2).Infof("%s: gathering cluster OOMs tracking measurement", clusterOOMsTrackerName)
   231  	if !m.isRunning {
   232  		return nil, fmt.Errorf("measurement %s has not been started", clusterOOMsTrackerName)
   233  	}
   234  
   235  	m.stop()
   236  
   237  	oomData := make(map[string][]oomEvent)
   238  	oomData["failures"] = make([]oomEvent, 0)
   239  	oomData["past"] = make([]oomEvent, 0)
   240  	oomData["ignored"] = make([]oomEvent, 0)
   241  
   242  	for _, oom := range m.ooms {
   243  		if m.startTime.After(oom.Time) {
   244  			oomData["past"] = append(oomData["past"], oom)
   245  			continue
   246  		}
   247  		if m.processIgnored[oom.Process] {
   248  			oomData["ignored"] = append(oomData["ignored"], oom)
   249  			continue
   250  		}
   251  		oomData["failures"] = append(oomData["failures"], oom)
   252  	}
   253  
   254  	content, err := util.PrettyPrintJSON(oomData)
   255  	if err != nil {
   256  		return nil, fmt.Errorf("OOMs PrettyPrintJSON problem: %w", err)
   257  	}
   258  
   259  	summary := measurement.CreateSummary(clusterOOMsTrackerName, "json", content)
   260  	if oomFailures := oomData["failures"]; len(oomFailures) > 0 {
   261  		err = fmt.Errorf("OOMs recorded: %+v", oomFailures)
   262  	}
   263  	return []measurement.Summary{summary}, err
   264  }
   265  
   266  func (m *clusterOOMsTrackerMeasurement) handleOOMEvent(obj interface{}) {
   267  	event, ok := obj.(*corev1.Event)
   268  	if !ok || event.Reason != oomEventReason {
   269  		return
   270  	}
   271  
   272  	m.lock.Lock()
   273  	defer m.lock.Unlock()
   274  
   275  	if m.resourceVersionRecorded[event.ObjectMeta.ResourceVersion] {
   276  		// We are catching an OOM event with already recorded resource
   277  		// version which may happen on relisting the events when a watch
   278  		// breaks. Because of that, we do not want to register that
   279  		// OOM more than once.
   280  		return
   281  	}
   282  	m.resourceVersionRecorded[event.ObjectMeta.ResourceVersion] = true
   283  
   284  	klog.V(2).Infof("OOM detected: %+v", event)
   285  
   286  	oom := oomEvent{
   287  		Node: event.InvolvedObject.Name,
   288  	}
   289  	if !event.EventTime.IsZero() {
   290  		oom.Time = event.EventTime.Time
   291  	} else {
   292  		oom.Time = event.FirstTimestamp.Time
   293  	}
   294  
   295  	if match := m.msgRegex.FindStringSubmatch(event.Message); len(match) == 4 {
   296  		oom.ProcessID = match[1]
   297  		oom.Process = match[2]
   298  		oom.ProcessMemory = match[3]
   299  	} else {
   300  		klog.Warningf(`unrecognized OOM event message pattern; event message contents: "%v"`, event.Message)
   301  	}
   302  
   303  	m.ooms = append(m.ooms, oom)
   304  }