github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/orm/metamanager/manager.go (about) 1 /* 2 Copyright 2022 The Katalyst Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package metamanager 18 19 import ( 20 "context" 21 "sync" 22 "time" 23 24 v1 "k8s.io/api/core/v1" 25 "k8s.io/apimachinery/pkg/util/wait" 26 "k8s.io/klog/v2" 27 28 "github.com/kubewharf/katalyst-core/pkg/metaserver" 29 "github.com/kubewharf/katalyst-core/pkg/metrics" 30 "github.com/kubewharf/katalyst-core/pkg/util/cgroup/common" 31 "github.com/kubewharf/katalyst-core/pkg/util/native" 32 ) 33 34 type Manager struct { 35 ctx context.Context 36 37 emitter metrics.MetricEmitter 38 39 *metaserver.MetaServer 40 mutex sync.RWMutex 41 42 cachedPods CachedPodListFunc 43 44 podFirstRemoveTime map[string]time.Time 45 46 podAddedFuncs []PodAddedFunc 47 podDeletedFuncs []PodDeletedFunc 48 } 49 50 func NewManager( 51 emitter metrics.MetricEmitter, 52 cachedPods CachedPodListFunc, 53 metaServer *metaserver.MetaServer, 54 ) *Manager { 55 m := &Manager{ 56 emitter: emitter, 57 MetaServer: metaServer, 58 cachedPods: cachedPods, 59 podAddedFuncs: make([]PodAddedFunc, 0), 60 podDeletedFuncs: make([]PodDeletedFunc, 0), 61 podFirstRemoveTime: make(map[string]time.Time), 62 } 63 return m 64 } 65 66 func (m *Manager) Run(ctx context.Context, reconcilePeriod time.Duration) { 67 m.ctx = ctx 68 go wait.Until(m.reconcile, reconcilePeriod, m.ctx.Done()) 69 } 70 71 func (m *Manager) reconcile() { 72 activePods, err := m.MetaServer.GetPodList(m.ctx, native.PodIsActive) 73 if err != nil { 74 klog.Errorf("metamanager reconcile GetPodList fail: %v", err) 75 _ = m.emitter.StoreInt64(MetricReconcileFail, 1, metrics.MetricTypeNameRaw) 76 return 77 } 78 79 // reconcile new pods 80 podsToBeAdded := m.reconcileNewPods(activePods) 81 if len(podsToBeAdded) > 0 { 82 m.notifyAddPods(podsToBeAdded) 83 } 84 85 // reconcile pod terminated and had been deleted 86 podsTobeRemoved := m.reconcileRemovePods(activePods) 87 if len(podsTobeRemoved) > 0 { 88 m.notifyDeletePods(podsTobeRemoved) 89 } 90 } 91 92 // ReconcilePods returns a list of new pods and pod should be deleted 93 func (m *Manager) ReconcilePods() ([]string, map[string]struct{}, error) { 94 activePods, err := m.MetaServer.GetPodList(m.ctx, native.PodIsActive) 95 if err != nil { 96 klog.Errorf("metamanager reconcile GetPodList fail: %v", err) 97 _ = m.emitter.StoreInt64(MetricReconcileFail, 1, metrics.MetricTypeNameRaw) 98 return nil, nil, err 99 } 100 101 // reconcile new pods 102 podsToBeAdded := m.reconcileNewPods(activePods) 103 104 // reconcile pod terminated and had been deleted 105 podsTobeRemoved := m.reconcileRemovePods(activePods) 106 return podsToBeAdded, podsTobeRemoved, nil 107 } 108 109 func (m *Manager) GetPods() []*v1.Pod { 110 activePods, err := m.MetaServer.GetPodList(m.ctx, native.PodIsActive) 111 if err != nil { 112 klog.Errorf("GetPodList fail: %v", err) 113 return []*v1.Pod{} 114 } 115 116 return activePods 117 } 118 119 func (m *Manager) RegistPodAddedFunc(podAddedFunc PodAddedFunc) { 120 m.podAddedFuncs = append(m.podAddedFuncs, podAddedFunc) 121 } 122 123 func (m *Manager) RegistPodDeletedFunc(podDeletedFunc PodDeletedFunc) { 124 m.podDeletedFuncs = append(m.podDeletedFuncs, podDeletedFunc) 125 } 126 127 // reconcileNewPods checks new pods between activePods from metaServer and pods in manager cache 128 func (m *Manager) reconcileNewPods(activePods []*v1.Pod) []string { 129 podsToBeAdded := make([]string, 0) 130 podList := m.cachedPods() 131 132 for _, pod := range activePods { 133 if !podList.Has(string(pod.UID)) { 134 podsToBeAdded = append(podsToBeAdded, string(pod.UID)) 135 } 136 } 137 138 return podsToBeAdded 139 } 140 141 // reconcileRemovePods checks deleted pods between activePods from metaServer and pods in manager cache 142 func (m *Manager) reconcileRemovePods(activePods []*v1.Pod) map[string]struct{} { 143 podsToBeRemoved := make(map[string]struct{}) 144 podList := m.cachedPods() 145 146 for _, pod := range activePods { 147 if podList.Has(string(pod.UID)) { 148 podList = podList.Delete(string(pod.UID)) 149 } 150 } 151 152 // gc pod remove timestamp 153 m.mutex.Lock() 154 for _, pod := range activePods { 155 delete(m.podFirstRemoveTime, string(pod.UID)) 156 } 157 m.mutex.Unlock() 158 159 // check pod can be removed 160 for _, podUID := range podList.UnsortedList() { 161 if m.canPodDelete(podUID) { 162 podsToBeRemoved[podUID] = struct{}{} 163 } 164 } 165 166 return podsToBeRemoved 167 } 168 169 func (m *Manager) notifyAddPods(podUIDs []string) { 170 if len(m.podAddedFuncs) > 0 { 171 klog.V(5).Infof("metaManager notifyAddPods: %v", podUIDs) 172 173 for _, podUID := range podUIDs { 174 for _, addFunc := range m.podAddedFuncs { 175 addFunc(podUID) 176 } 177 } 178 } 179 } 180 181 func (m *Manager) notifyDeletePods(podUIDSet map[string]struct{}) { 182 if len(m.podDeletedFuncs) > 0 { 183 klog.V(5).Infof("metaManager notifyDeletePods: %v", podUIDSet) 184 185 for podUID := range podUIDSet { 186 for _, deleteFuncs := range m.podDeletedFuncs { 187 deleteFuncs(podUID) 188 } 189 } 190 } 191 } 192 193 func (m *Manager) canPodDelete(podUID string) bool { 194 m.mutex.Lock() 195 defer m.mutex.Unlock() 196 // generate pod cgroup path, use cpu as subsystem 197 _, err := common.GetPodAbsCgroupPath(common.CgroupSubsysCPU, podUID) 198 if err != nil { 199 // GetPodAbsCgroupPath return error only if pod cgroup path not exist 200 klog.Warning(err.Error()) 201 delete(m.podFirstRemoveTime, podUID) 202 return true 203 } 204 205 // pod is not exist in metaServer, deletionTimestamp can not be got by pod 206 // first deletion check time should be record 207 firstRemoveTime, ok := m.podFirstRemoveTime[podUID] 208 if !ok { 209 m.podFirstRemoveTime[podUID] = time.Now() 210 } else { 211 if time.Now().After(firstRemoveTime.Add(forceRemoveDuration)) { 212 delete(m.podFirstRemoveTime, podUID) 213 return true 214 } 215 216 return false 217 } 218 219 return false 220 }