github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/orm/metamanager/manager.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package metamanager
    18  
    19  import (
    20  	"context"
    21  	"sync"
    22  	"time"
    23  
    24  	v1 "k8s.io/api/core/v1"
    25  	"k8s.io/apimachinery/pkg/util/wait"
    26  	"k8s.io/klog/v2"
    27  
    28  	"github.com/kubewharf/katalyst-core/pkg/metaserver"
    29  	"github.com/kubewharf/katalyst-core/pkg/metrics"
    30  	"github.com/kubewharf/katalyst-core/pkg/util/cgroup/common"
    31  	"github.com/kubewharf/katalyst-core/pkg/util/native"
    32  )
    33  
// Manager compares the active pods reported by the embedded MetaServer with
// the pod UIDs returned by cachedPods, and invokes the registered callbacks
// for pods that have appeared or can be removed.
type Manager struct {
	// ctx is assigned by Run and passed to MetaServer.GetPodList.
	ctx context.Context

	// emitter records MetricReconcileFail when listing pods fails.
	emitter metrics.MetricEmitter

	*metaserver.MetaServer
	// mutex guards podFirstRemoveTime.
	mutex sync.RWMutex

	// cachedPods returns the set of pod UIDs the manager's owner currently tracks.
	cachedPods CachedPodListFunc

	// podFirstRemoveTime maps a pod UID to the first time canPodDelete saw the
	// pod missing from the active list while its cgroup path still resolved.
	podFirstRemoveTime map[string]time.Time

	// podAddedFuncs and podDeletedFuncs are the callbacks invoked by
	// notifyAddPods and notifyDeletePods respectively.
	podAddedFuncs   []PodAddedFunc
	podDeletedFuncs []PodDeletedFunc
}
    49  
    50  func NewManager(
    51  	emitter metrics.MetricEmitter,
    52  	cachedPods CachedPodListFunc,
    53  	metaServer *metaserver.MetaServer,
    54  ) *Manager {
    55  	m := &Manager{
    56  		emitter:            emitter,
    57  		MetaServer:         metaServer,
    58  		cachedPods:         cachedPods,
    59  		podAddedFuncs:      make([]PodAddedFunc, 0),
    60  		podDeletedFuncs:    make([]PodDeletedFunc, 0),
    61  		podFirstRemoveTime: make(map[string]time.Time),
    62  	}
    63  	return m
    64  }
    65  
    66  func (m *Manager) Run(ctx context.Context, reconcilePeriod time.Duration) {
    67  	m.ctx = ctx
    68  	go wait.Until(m.reconcile, reconcilePeriod, m.ctx.Done())
    69  }
    70  
    71  func (m *Manager) reconcile() {
    72  	activePods, err := m.MetaServer.GetPodList(m.ctx, native.PodIsActive)
    73  	if err != nil {
    74  		klog.Errorf("metamanager reconcile GetPodList fail: %v", err)
    75  		_ = m.emitter.StoreInt64(MetricReconcileFail, 1, metrics.MetricTypeNameRaw)
    76  		return
    77  	}
    78  
    79  	// reconcile new pods
    80  	podsToBeAdded := m.reconcileNewPods(activePods)
    81  	if len(podsToBeAdded) > 0 {
    82  		m.notifyAddPods(podsToBeAdded)
    83  	}
    84  
    85  	// reconcile pod terminated and had been deleted
    86  	podsTobeRemoved := m.reconcileRemovePods(activePods)
    87  	if len(podsTobeRemoved) > 0 {
    88  		m.notifyDeletePods(podsTobeRemoved)
    89  	}
    90  }
    91  
    92  // ReconcilePods returns a list of new pods and pod should be deleted
    93  func (m *Manager) ReconcilePods() ([]string, map[string]struct{}, error) {
    94  	activePods, err := m.MetaServer.GetPodList(m.ctx, native.PodIsActive)
    95  	if err != nil {
    96  		klog.Errorf("metamanager reconcile GetPodList fail: %v", err)
    97  		_ = m.emitter.StoreInt64(MetricReconcileFail, 1, metrics.MetricTypeNameRaw)
    98  		return nil, nil, err
    99  	}
   100  
   101  	// reconcile new pods
   102  	podsToBeAdded := m.reconcileNewPods(activePods)
   103  
   104  	// reconcile pod terminated and had been deleted
   105  	podsTobeRemoved := m.reconcileRemovePods(activePods)
   106  	return podsToBeAdded, podsTobeRemoved, nil
   107  }
   108  
   109  func (m *Manager) GetPods() []*v1.Pod {
   110  	activePods, err := m.MetaServer.GetPodList(m.ctx, native.PodIsActive)
   111  	if err != nil {
   112  		klog.Errorf("GetPodList fail: %v", err)
   113  		return []*v1.Pod{}
   114  	}
   115  
   116  	return activePods
   117  }
   118  
   119  func (m *Manager) RegistPodAddedFunc(podAddedFunc PodAddedFunc) {
   120  	m.podAddedFuncs = append(m.podAddedFuncs, podAddedFunc)
   121  }
   122  
   123  func (m *Manager) RegistPodDeletedFunc(podDeletedFunc PodDeletedFunc) {
   124  	m.podDeletedFuncs = append(m.podDeletedFuncs, podDeletedFunc)
   125  }
   126  
   127  // reconcileNewPods checks new pods between activePods from metaServer and pods in manager cache
   128  func (m *Manager) reconcileNewPods(activePods []*v1.Pod) []string {
   129  	podsToBeAdded := make([]string, 0)
   130  	podList := m.cachedPods()
   131  
   132  	for _, pod := range activePods {
   133  		if !podList.Has(string(pod.UID)) {
   134  			podsToBeAdded = append(podsToBeAdded, string(pod.UID))
   135  		}
   136  	}
   137  
   138  	return podsToBeAdded
   139  }
   140  
   141  // reconcileRemovePods checks deleted pods between activePods from metaServer and pods in manager cache
   142  func (m *Manager) reconcileRemovePods(activePods []*v1.Pod) map[string]struct{} {
   143  	podsToBeRemoved := make(map[string]struct{})
   144  	podList := m.cachedPods()
   145  
   146  	for _, pod := range activePods {
   147  		if podList.Has(string(pod.UID)) {
   148  			podList = podList.Delete(string(pod.UID))
   149  		}
   150  	}
   151  
   152  	// gc pod remove timestamp
   153  	m.mutex.Lock()
   154  	for _, pod := range activePods {
   155  		delete(m.podFirstRemoveTime, string(pod.UID))
   156  	}
   157  	m.mutex.Unlock()
   158  
   159  	// check pod can be removed
   160  	for _, podUID := range podList.UnsortedList() {
   161  		if m.canPodDelete(podUID) {
   162  			podsToBeRemoved[podUID] = struct{}{}
   163  		}
   164  	}
   165  
   166  	return podsToBeRemoved
   167  }
   168  
   169  func (m *Manager) notifyAddPods(podUIDs []string) {
   170  	if len(m.podAddedFuncs) > 0 {
   171  		klog.V(5).Infof("metaManager notifyAddPods: %v", podUIDs)
   172  
   173  		for _, podUID := range podUIDs {
   174  			for _, addFunc := range m.podAddedFuncs {
   175  				addFunc(podUID)
   176  			}
   177  		}
   178  	}
   179  }
   180  
   181  func (m *Manager) notifyDeletePods(podUIDSet map[string]struct{}) {
   182  	if len(m.podDeletedFuncs) > 0 {
   183  		klog.V(5).Infof("metaManager notifyDeletePods: %v", podUIDSet)
   184  
   185  		for podUID := range podUIDSet {
   186  			for _, deleteFuncs := range m.podDeletedFuncs {
   187  				deleteFuncs(podUID)
   188  			}
   189  		}
   190  	}
   191  }
   192  
   193  func (m *Manager) canPodDelete(podUID string) bool {
   194  	m.mutex.Lock()
   195  	defer m.mutex.Unlock()
   196  	// generate pod cgroup path, use cpu as subsystem
   197  	_, err := common.GetPodAbsCgroupPath(common.CgroupSubsysCPU, podUID)
   198  	if err != nil {
   199  		// GetPodAbsCgroupPath return error only if pod cgroup path not exist
   200  		klog.Warning(err.Error())
   201  		delete(m.podFirstRemoveTime, podUID)
   202  		return true
   203  	}
   204  
   205  	// pod is not exist in metaServer, deletionTimestamp can not be got by pod
   206  	// first deletion check time should be record
   207  	firstRemoveTime, ok := m.podFirstRemoveTime[podUID]
   208  	if !ok {
   209  		m.podFirstRemoveTime[podUID] = time.Now()
   210  	} else {
   211  		if time.Now().After(firstRemoveTime.Add(forceRemoveDuration)) {
   212  			delete(m.podFirstRemoveTime, podUID)
   213  			return true
   214  		}
   215  
   216  		return false
   217  	}
   218  
   219  	return false
   220  }