github.com/kubewharf/katalyst-core@v0.5.3/pkg/controller/resource-recommend/oom/oom_recorder.go (about) 1 /* 2 Copyright 2022 The Katalyst Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package oom 18 19 import ( 20 "context" 21 "encoding/json" 22 "sort" 23 "sync" 24 "time" 25 26 "github.com/pkg/errors" 27 v1 "k8s.io/api/core/v1" 28 "k8s.io/apimachinery/pkg/api/resource" 29 "k8s.io/apimachinery/pkg/types" 30 "k8s.io/client-go/util/workqueue" 31 "k8s.io/klog/v2" 32 "sigs.k8s.io/controller-runtime/pkg/client" 33 ) 34 35 const ( 36 ConfigMapOOMRecordName = "oom-record" 37 ConfigMapDataOOMRecord = "oom-data" 38 ConfigMapOOMRecordNameSpace = "kube-system" 39 CacheCleanTimeDurationHour = 12 40 DataRetentionHour = 168 41 ) 42 43 type Recorder interface { 44 ListOOMRecords() []OOMRecord 45 } 46 47 type OOMRecord struct { 48 Namespace string 49 Pod string 50 Container string 51 Memory resource.Quantity 52 OOMAt time.Time 53 } 54 55 type PodOOMRecorder struct { 56 client.Client 57 58 mu sync.Mutex 59 60 OOMRecordMaxNumber int 61 cache []OOMRecord 62 Queue workqueue.Interface 63 } 64 65 func (r *PodOOMRecorder) initOOMCacheFromConfigmap() { 66 r.mu.Lock() 67 defer r.mu.Unlock() 68 69 oomRecords, err := r.ListOOMRecordsFromConfigmap() 70 if err != nil { 71 // TODO: add monitor metric 72 klog.ErrorS(err, "init cache from configmap failed") 73 } 74 r.cache = oomRecords 75 } 76 77 func (r *PodOOMRecorder) ListOOMRecords() []OOMRecord { 78 return r.cache 79 } 80 81 func (r *PodOOMRecorder) cleanOOMRecord() { 82 r.mu.Lock() 83 defer r.mu.Unlock() 84 oomCache := r.ListOOMRecords() 85 sort.Slice(oomCache, func(i, j int) bool { 86 return oomCache[i].OOMAt.Before(oomCache[j].OOMAt) 87 }) 88 now := time.Now() 89 index := 0 90 for i := len(oomCache) - 1; i >= 0; i-- { 91 if oomCache[i].OOMAt.Before(now.Add(-DataRetentionHour * time.Hour)) { 92 break 93 } 94 index++ 95 if index >= r.OOMRecordMaxNumber { 96 break 97 } 98 } 99 r.cache = oomCache[len(oomCache)-index:] 100 } 101 102 func (r *PodOOMRecorder) updateOOMRecordCache(oomRecord OOMRecord) bool { 103 r.mu.Lock() 104 defer r.mu.Unlock() 105 106 oomCache := r.ListOOMRecords() 107 if oomCache == nil { 108 oomCache = []OOMRecord{} 109 } 110 111 isFound := false 112 isUpdated := false 113 for i := range oomCache { 114 if oomCache[i].Namespace == oomRecord.Namespace && oomCache[i].Pod == oomRecord.Pod && oomCache[i].Container == oomRecord.Container { 115 if oomRecord.Memory.Value() >= oomCache[i].Memory.Value() && !oomRecord.OOMAt.Equal(oomCache[i].OOMAt) { 116 oomCache[i].Memory = oomRecord.Memory 117 oomCache[i].OOMAt = oomRecord.OOMAt 118 isUpdated = true 119 } 120 isFound = true 121 break 122 } 123 } 124 125 if !isFound { 126 oomCache = append(oomCache, oomRecord) 127 isUpdated = true 128 } 129 if isUpdated { 130 r.cache = oomCache 131 } 132 return isUpdated 133 } 134 135 func (r *PodOOMRecorder) updateOOMRecordConfigMap() error { 136 r.mu.Lock() 137 defer r.mu.Unlock() 138 139 oomCache := r.ListOOMRecords() 140 cacheData, err := json.Marshal(oomCache) 141 if err != nil { 142 return err 143 } 144 oomConfigMap := &v1.ConfigMap{} 145 err = r.Client.Get(context.TODO(), types.NamespacedName{ 146 Namespace: ConfigMapOOMRecordNameSpace, 147 Name: ConfigMapOOMRecordName, 148 }, oomConfigMap) 149 if err != nil { 150 if client.IgnoreNotFound(err) != nil { 151 return err 152 } 153 oomConfigMap.Name = ConfigMapOOMRecordName 154 oomConfigMap.Namespace = ConfigMapOOMRecordNameSpace 155 oomConfigMap.Data = map[string]string{ 156 ConfigMapDataOOMRecord: string(cacheData), 157 } 158 return r.Client.Create(context.TODO(), oomConfigMap) 159 } 160 oomConfigMap.Data = map[string]string{ 161 ConfigMapDataOOMRecord: string(cacheData), 162 } 163 return r.Client.Update(context.TODO(), oomConfigMap) 164 } 165 166 func (r *PodOOMRecorder) Run(stopCh <-chan struct{}) error { 167 r.initOOMCacheFromConfigmap() 168 cleanTicker := time.NewTicker(time.Duration(CacheCleanTimeDurationHour) * time.Hour) 169 go func() { 170 defer func() { 171 if r := recover(); r != nil { 172 if r := recover(); r != nil { 173 err := errors.Errorf("Run clean oom recorder panic: %v", r.(error)) 174 klog.Error(err) 175 panic(err) 176 } 177 } 178 }() 179 for range cleanTicker.C { 180 r.cleanOOMRecord() 181 } 182 }() 183 for { 184 select { 185 case <-stopCh: 186 return nil 187 default: 188 } 189 190 record, shutdown := r.Queue.Get() 191 if shutdown { 192 return errors.New("queue of OOMRecord recorder is shutting down ! ") 193 } 194 oomRecord, ok := record.(OOMRecord) 195 if !ok { 196 klog.Error("type conversion failed") 197 r.Queue.Done(record) 198 continue 199 } 200 isUpdated := r.updateOOMRecordCache(oomRecord) 201 if !isUpdated { 202 r.Queue.Done(record) 203 continue 204 } 205 206 err := r.updateOOMRecordConfigMap() 207 if err != nil { 208 klog.ErrorS(err, "Update oomRecord failed") 209 } 210 r.Queue.Done(record) 211 } 212 } 213 214 func (r *PodOOMRecorder) ListOOMRecordsFromConfigmap() ([]OOMRecord, error) { 215 oomConfigMap := &v1.ConfigMap{} 216 err := r.Client.Get(context.TODO(), types.NamespacedName{ 217 Namespace: ConfigMapOOMRecordNameSpace, 218 Name: ConfigMapOOMRecordName, 219 }, oomConfigMap) 220 if err != nil { 221 return nil, client.IgnoreNotFound(err) 222 } 223 oomRecords := make([]OOMRecord, 0) 224 err = json.Unmarshal([]byte(oomConfigMap.Data[ConfigMapDataOOMRecord]), &oomRecords) 225 return oomRecords, err 226 }