github.com/kubewharf/katalyst-core@v0.5.3/pkg/controller/resource-recommend/oom/oom_recorder.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package oom
    18  
    19  import (
    20  	"context"
    21  	"encoding/json"
    22  	"sort"
    23  	"sync"
    24  	"time"
    25  
    26  	"github.com/pkg/errors"
    27  	v1 "k8s.io/api/core/v1"
    28  	"k8s.io/apimachinery/pkg/api/resource"
    29  	"k8s.io/apimachinery/pkg/types"
    30  	"k8s.io/client-go/util/workqueue"
    31  	"k8s.io/klog/v2"
    32  	"sigs.k8s.io/controller-runtime/pkg/client"
    33  )
    34  
    35  const (
    36  	ConfigMapOOMRecordName      = "oom-record"
    37  	ConfigMapDataOOMRecord      = "oom-data"
    38  	ConfigMapOOMRecordNameSpace = "kube-system"
    39  	CacheCleanTimeDurationHour  = 12
    40  	DataRetentionHour           = 168
    41  )
    42  
    43  type Recorder interface {
    44  	ListOOMRecords() []OOMRecord
    45  }
    46  
    47  type OOMRecord struct {
    48  	Namespace string
    49  	Pod       string
    50  	Container string
    51  	Memory    resource.Quantity
    52  	OOMAt     time.Time
    53  }
    54  
    55  type PodOOMRecorder struct {
    56  	client.Client
    57  
    58  	mu sync.Mutex
    59  
    60  	OOMRecordMaxNumber int
    61  	cache              []OOMRecord
    62  	Queue              workqueue.Interface
    63  }
    64  
    65  func (r *PodOOMRecorder) initOOMCacheFromConfigmap() {
    66  	r.mu.Lock()
    67  	defer r.mu.Unlock()
    68  
    69  	oomRecords, err := r.ListOOMRecordsFromConfigmap()
    70  	if err != nil {
    71  		// TODO: add monitor metric
    72  		klog.ErrorS(err, "init cache from configmap failed")
    73  	}
    74  	r.cache = oomRecords
    75  }
    76  
    77  func (r *PodOOMRecorder) ListOOMRecords() []OOMRecord {
    78  	return r.cache
    79  }
    80  
    81  func (r *PodOOMRecorder) cleanOOMRecord() {
    82  	r.mu.Lock()
    83  	defer r.mu.Unlock()
    84  	oomCache := r.ListOOMRecords()
    85  	sort.Slice(oomCache, func(i, j int) bool {
    86  		return oomCache[i].OOMAt.Before(oomCache[j].OOMAt)
    87  	})
    88  	now := time.Now()
    89  	index := 0
    90  	for i := len(oomCache) - 1; i >= 0; i-- {
    91  		if oomCache[i].OOMAt.Before(now.Add(-DataRetentionHour * time.Hour)) {
    92  			break
    93  		}
    94  		index++
    95  		if index >= r.OOMRecordMaxNumber {
    96  			break
    97  		}
    98  	}
    99  	r.cache = oomCache[len(oomCache)-index:]
   100  }
   101  
   102  func (r *PodOOMRecorder) updateOOMRecordCache(oomRecord OOMRecord) bool {
   103  	r.mu.Lock()
   104  	defer r.mu.Unlock()
   105  
   106  	oomCache := r.ListOOMRecords()
   107  	if oomCache == nil {
   108  		oomCache = []OOMRecord{}
   109  	}
   110  
   111  	isFound := false
   112  	isUpdated := false
   113  	for i := range oomCache {
   114  		if oomCache[i].Namespace == oomRecord.Namespace && oomCache[i].Pod == oomRecord.Pod && oomCache[i].Container == oomRecord.Container {
   115  			if oomRecord.Memory.Value() >= oomCache[i].Memory.Value() && !oomRecord.OOMAt.Equal(oomCache[i].OOMAt) {
   116  				oomCache[i].Memory = oomRecord.Memory
   117  				oomCache[i].OOMAt = oomRecord.OOMAt
   118  				isUpdated = true
   119  			}
   120  			isFound = true
   121  			break
   122  		}
   123  	}
   124  
   125  	if !isFound {
   126  		oomCache = append(oomCache, oomRecord)
   127  		isUpdated = true
   128  	}
   129  	if isUpdated {
   130  		r.cache = oomCache
   131  	}
   132  	return isUpdated
   133  }
   134  
   135  func (r *PodOOMRecorder) updateOOMRecordConfigMap() error {
   136  	r.mu.Lock()
   137  	defer r.mu.Unlock()
   138  
   139  	oomCache := r.ListOOMRecords()
   140  	cacheData, err := json.Marshal(oomCache)
   141  	if err != nil {
   142  		return err
   143  	}
   144  	oomConfigMap := &v1.ConfigMap{}
   145  	err = r.Client.Get(context.TODO(), types.NamespacedName{
   146  		Namespace: ConfigMapOOMRecordNameSpace,
   147  		Name:      ConfigMapOOMRecordName,
   148  	}, oomConfigMap)
   149  	if err != nil {
   150  		if client.IgnoreNotFound(err) != nil {
   151  			return err
   152  		}
   153  		oomConfigMap.Name = ConfigMapOOMRecordName
   154  		oomConfigMap.Namespace = ConfigMapOOMRecordNameSpace
   155  		oomConfigMap.Data = map[string]string{
   156  			ConfigMapDataOOMRecord: string(cacheData),
   157  		}
   158  		return r.Client.Create(context.TODO(), oomConfigMap)
   159  	}
   160  	oomConfigMap.Data = map[string]string{
   161  		ConfigMapDataOOMRecord: string(cacheData),
   162  	}
   163  	return r.Client.Update(context.TODO(), oomConfigMap)
   164  }
   165  
   166  func (r *PodOOMRecorder) Run(stopCh <-chan struct{}) error {
   167  	r.initOOMCacheFromConfigmap()
   168  	cleanTicker := time.NewTicker(time.Duration(CacheCleanTimeDurationHour) * time.Hour)
   169  	go func() {
   170  		defer func() {
   171  			if r := recover(); r != nil {
   172  				if r := recover(); r != nil {
   173  					err := errors.Errorf("Run clean oom recorder panic: %v", r.(error))
   174  					klog.Error(err)
   175  					panic(err)
   176  				}
   177  			}
   178  		}()
   179  		for range cleanTicker.C {
   180  			r.cleanOOMRecord()
   181  		}
   182  	}()
   183  	for {
   184  		select {
   185  		case <-stopCh:
   186  			return nil
   187  		default:
   188  		}
   189  
   190  		record, shutdown := r.Queue.Get()
   191  		if shutdown {
   192  			return errors.New("queue of OOMRecord recorder is shutting down ! ")
   193  		}
   194  		oomRecord, ok := record.(OOMRecord)
   195  		if !ok {
   196  			klog.Error("type conversion failed")
   197  			r.Queue.Done(record)
   198  			continue
   199  		}
   200  		isUpdated := r.updateOOMRecordCache(oomRecord)
   201  		if !isUpdated {
   202  			r.Queue.Done(record)
   203  			continue
   204  		}
   205  
   206  		err := r.updateOOMRecordConfigMap()
   207  		if err != nil {
   208  			klog.ErrorS(err, "Update oomRecord failed")
   209  		}
   210  		r.Queue.Done(record)
   211  	}
   212  }
   213  
   214  func (r *PodOOMRecorder) ListOOMRecordsFromConfigmap() ([]OOMRecord, error) {
   215  	oomConfigMap := &v1.ConfigMap{}
   216  	err := r.Client.Get(context.TODO(), types.NamespacedName{
   217  		Namespace: ConfigMapOOMRecordNameSpace,
   218  		Name:      ConfigMapOOMRecordName,
   219  	}, oomConfigMap)
   220  	if err != nil {
   221  		return nil, client.IgnoreNotFound(err)
   222  	}
   223  	oomRecords := make([]OOMRecord, 0)
   224  	err = json.Unmarshal([]byte(oomConfigMap.Data[ConfigMapDataOOMRecord]), &oomRecords)
   225  	return oomRecords, err
   226  }