github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/orm/manager.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package orm
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"math"
    23  	"net"
    24  	"os"
    25  	"path/filepath"
    26  	"sync"
    27  	"time"
    28  
    29  	"github.com/opencontainers/selinux/go-selinux"
    30  	"k8s.io/klog/v2"
    31  
    32  	"google.golang.org/grpc"
    33  	v1 "k8s.io/api/core/v1"
    34  	"k8s.io/apimachinery/pkg/util/wait"
    35  	"k8s.io/kubelet/pkg/apis/pluginregistration/v1"
    36  	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
    37  	"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
    38  	maputil "k8s.io/kubernetes/pkg/util/maps"
    39  
    40  	"github.com/kubewharf/katalyst-core/pkg/agent/orm/deviceprovider/kubelet"
    41  	"github.com/kubewharf/katalyst-core/pkg/agent/orm/endpoint"
    42  	"github.com/kubewharf/katalyst-core/pkg/agent/orm/executor"
    43  	"github.com/kubewharf/katalyst-core/pkg/agent/orm/metamanager"
    44  	"github.com/kubewharf/katalyst-core/pkg/agent/orm/server"
    45  	"github.com/kubewharf/katalyst-core/pkg/agent/orm/server/podresources"
    46  	"github.com/kubewharf/katalyst-core/pkg/agent/orm/topology"
    47  	"github.com/kubewharf/katalyst-core/pkg/config"
    48  	"github.com/kubewharf/katalyst-core/pkg/config/generic"
    49  	"github.com/kubewharf/katalyst-core/pkg/metaserver"
    50  	"github.com/kubewharf/katalyst-core/pkg/metrics"
    51  	"github.com/kubewharf/katalyst-core/pkg/util/bitmask"
    52  	cgroupmgr "github.com/kubewharf/katalyst-core/pkg/util/cgroup/manager"
    53  	podresourcesutil "github.com/kubewharf/katalyst-core/pkg/util/kubelet/podresources"
    54  	"github.com/kubewharf/katalyst-core/pkg/util/native"
    55  )
    56  
    57  type ManagerImpl struct {
    58  	ctx context.Context
    59  
    60  	socketname string
    61  	socketdir  string
    62  
    63  	// resource to QRMPlugins and executors
    64  	mutex            sync.RWMutex
    65  	endpoints        map[string]endpoint.EndpointInfo
    66  	resourceExecutor executor.Executor
    67  
    68  	metaManager *metamanager.Manager
    69  
    70  	topologyManager topology.Manager
    71  
    72  	server *grpc.Server
    73  	wg     sync.WaitGroup
    74  
    75  	podAddChan    chan string
    76  	podDeleteChan chan string
    77  
    78  	podResources      *podResourcesChk
    79  	checkpointManager checkpointmanager.CheckpointManager
    80  
    81  	emitter   metrics.MetricEmitter
    82  	qosConfig *generic.QoSConfiguration
    83  
    84  	reconcilePeriod   time.Duration
    85  	resourceNamesMap  map[string]string
    86  	podResourceSocket string
    87  
    88  	devicesProvider podresources.DevicesProvider
    89  }
    90  
    91  func NewManager(socketPath string, emitter metrics.MetricEmitter, metaServer *metaserver.MetaServer, config *config.Configuration) (*ManagerImpl, error) {
    92  	klog.V(2).Infof("new ORM..., socketPath: %v, resourceNameMap: %v, reconcilePeriod: %v", socketPath, config.ORMResourceNamesMap, config.ORMRconcilePeriod)
    93  
    94  	if socketPath == "" || !filepath.IsAbs(socketPath) {
    95  		return nil, fmt.Errorf(errBadSocket+" %s", socketPath)
    96  	}
    97  	dir, file := filepath.Split(socketPath)
    98  
    99  	checkpointManager, err := checkpointmanager.NewCheckpointManager(dir)
   100  	if err != nil {
   101  		return nil, fmt.Errorf("failed to initialize checkpoint manager: %v", err)
   102  	}
   103  
   104  	m := &ManagerImpl{
   105  		socketdir:  dir,
   106  		socketname: file,
   107  
   108  		endpoints:         make(map[string]endpoint.EndpointInfo),
   109  		podResources:      newPodResourcesChk(),
   110  		checkpointManager: checkpointManager,
   111  
   112  		resourceNamesMap: config.ORMResourceNamesMap,
   113  		reconcilePeriod:  config.ORMRconcilePeriod,
   114  
   115  		podAddChan:        make(chan string, config.ORMPodNotifyChanLen),
   116  		podDeleteChan:     make(chan string, config.ORMPodNotifyChanLen),
   117  		emitter:           emitter,
   118  		qosConfig:         config.QoSConfiguration,
   119  		podResourceSocket: config.ORMPodResourcesSocket,
   120  	}
   121  
   122  	m.resourceExecutor = executor.NewExecutor(cgroupmgr.GetManager())
   123  
   124  	metaManager := metamanager.NewManager(emitter, m.podResources.pods, metaServer)
   125  	m.metaManager = metaManager
   126  
   127  	topologyManager, err := topology.NewManager(metaServer.Topology, config.TopologyPolicyName, config.NumericAlignResources)
   128  	if err != nil {
   129  		klog.Error(err)
   130  		return nil, err
   131  	}
   132  	topologyManager.AddHintProvider(m)
   133  	m.topologyManager = topologyManager
   134  
   135  	m.initDeviceProvider(config)
   136  
   137  	if err := m.removeContents(m.socketdir); err != nil {
   138  		err = fmt.Errorf("[ORM] Fail to clean up stale contents under %s: %v", m.socketdir, err)
   139  		klog.Error(err)
   140  		return nil, err
   141  	}
   142  	klog.V(5).Infof("removeContents......")
   143  
   144  	return m, nil
   145  }
   146  
   147  func (m *ManagerImpl) Run(ctx context.Context) {
   148  	klog.V(2).Infof("[ORM] running...")
   149  	m.ctx = ctx
   150  
   151  	// read data from checkpoint
   152  	err := m.readCheckpoint()
   153  	if err != nil {
   154  		klog.Fatalf("[ORM] read checkpoint fail: %v", err)
   155  	}
   156  
   157  	if err = os.MkdirAll(m.socketdir, 0o750); err != nil {
   158  		klog.Fatalf("[ORM] Mkdir socketdir %v fail: %v", m.socketdir, err)
   159  	}
   160  	if selinux.GetEnabled() {
   161  		if err := selinux.SetFileLabel(m.socketdir, KubeletPluginsDirSELinuxLabel); err != nil {
   162  			klog.Warningf("[ORM] Unprivileged containerized plugins might not work. Could not set selinux context on %s: %v", m.socketdir, err)
   163  		}
   164  	}
   165  
   166  	socketPath := filepath.Join(m.socketdir, m.socketname)
   167  	s, err := net.Listen("unix", socketPath)
   168  	if err != nil {
   169  		klog.Fatalf(errListenSocket+" %v", err)
   170  	}
   171  
   172  	m.wg.Add(1)
   173  	m.server = grpc.NewServer([]grpc.ServerOption{}...)
   174  
   175  	pluginapi.RegisterRegistrationServer(m.server, m)
   176  
   177  	klog.V(2).Infof("[ORM] Serving resource plugin registration server on %q", socketPath)
   178  	go func() {
   179  		defer func() {
   180  			m.wg.Done()
   181  
   182  			if err := recover(); err != nil {
   183  				klog.Fatalf("[ORM] Start recover from err: %v", err)
   184  			}
   185  			s.Close()
   186  		}()
   187  		m.server.Serve(s)
   188  	}()
   189  
   190  	klog.V(5).Infof("[ORM] start serve socketPath %v", socketPath)
   191  	go func() {
   192  		m.process()
   193  	}()
   194  
   195  	go wait.Until(m.reconcile, m.reconcilePeriod, m.ctx.Done())
   196  
   197  	m.metaManager.RegistPodAddedFunc(m.onPodAdd)
   198  	m.metaManager.RegistPodDeletedFunc(m.onPodDelete)
   199  
   200  	m.metaManager.Run(ctx, m.reconcilePeriod)
   201  
   202  	go server.ListenAndServePodResources(m.podResourceSocket, m.metaManager, m, m.devicesProvider, m.emitter)
   203  }
   204  
   205  func (m *ManagerImpl) GetHandlerType() string {
   206  	return pluginregistration.ResourcePlugin
   207  }
   208  
   209  func (m *ManagerImpl) GetTopologyHints(pod *v1.Pod, container *v1.Container) map[string][]topology.TopologyHint {
   210  	if pod == nil || container == nil {
   211  		klog.Errorf("[ORM] GetTopologyHints got nil pod: %v or container: %v", pod, container)
   212  		return nil
   213  	}
   214  
   215  	podUID := string(pod.UID)
   216  	contName := container.Name
   217  	containerType, containerIndex, err := GetContainerTypeAndIndex(pod, container)
   218  	if err != nil {
   219  		return nil
   220  	}
   221  
   222  	resourceHints := make(map[string][]topology.TopologyHint)
   223  	for resourceObj, requestedObj := range container.Resources.Requests {
   224  		requested := int(requestedObj.Value())
   225  		resource, err := m.getMappedResourceName(string(resourceObj), container.Resources.Requests)
   226  		if err != nil {
   227  			klog.Errorf("resource %s getMappedResourceName fail: %v", string(resourceObj), err)
   228  			return nil
   229  		}
   230  
   231  		if requestedObj.IsZero() {
   232  			continue
   233  		}
   234  
   235  		allocationInfo := m.podResources.containerResource(podUID, contName, resource)
   236  		if allocationInfo != nil && allocationInfo.ResourceHints != nil && len(allocationInfo.ResourceHints.Hints) > 0 {
   237  
   238  			allocated := int(math.Ceil(allocationInfo.AllocatedQuantity))
   239  
   240  			if allocationInfo.IsScalarResource && allocated >= requested {
   241  				resourceHints[resource] = ParseListOfTopologyHints(allocationInfo.ResourceHints)
   242  				klog.Warningf("[ORM] resource %s already allocated to (pod %s/%s, container %v) with larger number than request: requested: %d, allocated: %d; not to getTopologyHints",
   243  					resource, pod.GetNamespace(), pod.GetName(), container.Name, requested, allocated)
   244  				continue
   245  			} else {
   246  				klog.Warningf("[ORM] resource %s already allocated to (pod %s/%s, container %v) with smaller number than request: requested: %d, allocated: %d; continue to getTopologyHints",
   247  					resource, pod.GetNamespace(), pod.GetName(), container.Name, requested, int(math.Ceil(allocationInfo.AllocatedQuantity)))
   248  			}
   249  		}
   250  
   251  		m.mutex.Lock()
   252  		e, ok := m.endpoints[resource]
   253  		m.mutex.Unlock()
   254  		if !ok || e.Opts == nil || !e.Opts.WithTopologyAlignment {
   255  			klog.V(5).Infof("[ORM] GetTopologyHints resource %s not supported", resource)
   256  			continue
   257  		}
   258  
   259  		resourceReq := &pluginapi.ResourceRequest{
   260  			PodUid:         podUID,
   261  			PodNamespace:   pod.GetNamespace(),
   262  			PodName:        pod.GetName(),
   263  			ContainerName:  container.Name,
   264  			ContainerType:  containerType,
   265  			ContainerIndex: containerIndex,
   266  			PodRole:        pod.Labels[pluginapi.PodRoleLabelKey],
   267  			PodType:        pod.Annotations[pluginapi.PodTypeAnnotationKey],
   268  			Labels:         maputil.CopySS(pod.Labels),
   269  			Annotations:    maputil.CopySS(pod.Annotations),
   270  			// use mapped resource name in "ResourceName" to indicates which endpoint to request
   271  			ResourceName: resource,
   272  			// use original requested resource name in "ResourceRequests" in order to make plugin identity real requested resource name
   273  			ResourceRequests: map[string]float64{string(resourceObj): requestedObj.AsApproximateFloat64()},
   274  		}
   275  
   276  		resp, err := e.E.GetTopologyHints(context.Background(), resourceReq)
   277  		if err != nil {
   278  			klog.Errorf("[ORM] call GetTopologyHints of %s resource plugin for pod: %s/%s, container: %s failed with error: %v",
   279  				resource, pod.GetNamespace(), pod.GetName(), contName, err)
   280  
   281  			resourceHints[resource] = []topology.TopologyHint{}
   282  			continue
   283  		}
   284  
   285  		resourceHints[resource] = ParseListOfTopologyHints(resp.ResourceHints[resource])
   286  
   287  		klog.Infof("[ORM] GetTopologyHints for resource: %s, pod: %s/%s, container: %s, result: %+v",
   288  			resource, pod.Namespace, pod.Name, contName, resourceHints[resource])
   289  	}
   290  
   291  	return resourceHints
   292  }
   293  
   294  func (m *ManagerImpl) GetPodTopologyHints(pod *v1.Pod) map[string][]topology.TopologyHint {
   295  	// [TODO]: implement pod scope get topologyHints for provider and resource plugins.
   296  	return nil
   297  }
   298  
   299  func (m *ManagerImpl) Allocate(pod *v1.Pod, container *v1.Container) error {
   300  	if pod == nil || container == nil {
   301  		return fmt.Errorf("Allocate got nil pod: %v or container: %v", pod, container)
   302  	}
   303  
   304  	err := m.addContainer(pod, container)
   305  	if err != nil {
   306  		return err
   307  	}
   308  
   309  	err = m.syncContainer(pod, container)
   310  	return err
   311  }
   312  
   313  func (m *ManagerImpl) initDeviceProvider(config *config.Configuration) {
   314  	switch config.ORMDevicesProvider {
   315  	case kubeletDevicesProvider:
   316  		p, err := kubelet.NewProvider(config.ORMKubeletPodResourcesEndpoints, podresourcesutil.GetV1Client)
   317  		if err != nil {
   318  			klog.Fatalf("new kubelet devices provider fail: %v", err)
   319  		}
   320  		m.devicesProvider = p
   321  	case NoneDevicesProvider:
   322  		m.devicesProvider = &podresources.DevicesProviderStub{}
   323  	default:
   324  		klog.Fatalf("Unknown ORMDevicesProvider: %s", config.ORMDevicesProvider)
   325  	}
   326  }
   327  
   328  func (m *ManagerImpl) onPodAdd(podUID string) {
   329  	klog.V(5).Infof("[ORM] onPodAdd: %v", podUID)
   330  
   331  	timeout, cancel := context.WithTimeout(m.ctx, 1*time.Second)
   332  	defer cancel()
   333  
   334  	select {
   335  	case m.podAddChan <- podUID:
   336  
   337  	case <-timeout.Done():
   338  		klog.Errorf("[ORM] add pod timeout: %v", podUID)
   339  		_ = m.emitter.StoreInt64(MetricAddPodTimeout, 1, metrics.MetricTypeNameRaw)
   340  	}
   341  }
   342  
   343  func (m *ManagerImpl) onPodDelete(podUID string) {
   344  	klog.V(5).Infof("[ORM] onPodDelete: %v", podUID)
   345  
   346  	timeout, cancel := context.WithTimeout(m.ctx, 1*time.Second)
   347  	defer cancel()
   348  
   349  	select {
   350  	case m.podDeleteChan <- podUID:
   351  
   352  	case <-timeout.Done():
   353  		klog.Errorf("[ORM] delete pod timeout: %v", podUID)
   354  		_ = m.emitter.StoreInt64(MetricDeletePodTImeout, 1, metrics.MetricTypeNameRaw)
   355  	}
   356  }
   357  
   358  func (m *ManagerImpl) process() {
   359  	klog.Infof("[ORM] start process...")
   360  
   361  	for {
   362  		select {
   363  		case podUID := <-m.podAddChan:
   364  			err := m.processAddPod(podUID)
   365  			if err != nil {
   366  				klog.Errorf("[ORM] processAddPod fail, podUID: %v, err: %v", podUID, err)
   367  			}
   368  
   369  		case podUID := <-m.podDeleteChan:
   370  			err := m.processDeletePod(podUID)
   371  			if err != nil {
   372  				klog.Errorf("[ORM] processDeletePod fail, podUID: %v, err: %v", podUID, err)
   373  			}
   374  
   375  		case <-m.ctx.Done():
   376  			klog.Infof("[ORM] ctx done, exit")
   377  			return
   378  		}
   379  	}
   380  }
   381  
   382  func (m *ManagerImpl) processAddPod(podUID string) error {
   383  	pod, err := m.metaManager.MetaServer.GetPod(m.ctx, podUID)
   384  	if err != nil {
   385  		klog.Errorf("[ORM] processAddPod getPod fail, podUID: %v, err: %v", podUID, err)
   386  		return err
   387  	}
   388  
   389  	return m.topologyManager.Admit(pod)
   390  }
   391  
   392  func (m *ManagerImpl) processDeletePod(podUID string) error {
   393  	allSuccess := true
   394  
   395  	m.mutex.Lock()
   396  	for resourceName, endpoint := range m.endpoints {
   397  		_, err := endpoint.E.RemovePod(m.ctx, &pluginapi.RemovePodRequest{
   398  			PodUid: podUID,
   399  		})
   400  		if err != nil {
   401  			allSuccess = false
   402  			klog.Errorf("[ORM] plugin %v remove pod %v fail: %v", resourceName, podUID, err)
   403  		}
   404  	}
   405  	m.mutex.Unlock()
   406  
   407  	if allSuccess {
   408  		m.podResources.deletePod(podUID)
   409  		m.topologyManager.RemovePod(podUID)
   410  	}
   411  
   412  	return m.writeCheckpoint()
   413  }
   414  
   415  func (m *ManagerImpl) addContainer(pod *v1.Pod, container *v1.Container) error {
   416  	klog.V(5).Infof("[ORM] addContainer, pod: %v, container: %v", pod.Name, container.Name)
   417  
   418  	systemCores, err := isPodKatalystQoSLevelSystemCores(m.qosConfig, pod)
   419  	if err != nil {
   420  		klog.Errorf("[ORM] check pod %s qos level fail: %v", pod.Name, err)
   421  		return err
   422  	}
   423  
   424  	if native.CheckDaemonPod(pod) && !systemCores {
   425  		klog.Infof("[ORM] skip pod: %s/%s, container: %s resource allocation",
   426  			pod.Namespace, pod.Name, container.Name)
   427  		return nil
   428  	}
   429  
   430  	containerType, containerIndex, err := GetContainerTypeAndIndex(pod, container)
   431  	if err != nil {
   432  		return err
   433  	}
   434  
   435  	for k, v := range container.Resources.Requests {
   436  		needed := int(v.Value())
   437  		resource, err := m.getMappedResourceName(string(k), container.Resources.Requests)
   438  		if err != nil {
   439  			klog.Errorf("resource %s getMappedResourceName fail: %v", string(k), err)
   440  			return err
   441  		}
   442  
   443  		allocationInfo := m.podResources.containerResource(string(pod.UID), container.Name, resource)
   444  		if allocationInfo != nil {
   445  			allocated := int(math.Ceil(allocationInfo.AllocatedQuantity))
   446  
   447  			if allocationInfo.IsScalarResource && allocated >= needed {
   448  				klog.Infof("[ORM] resource %s already allocated to (pod %s/%s, container %v) with larger number than request: requested: %d, allocated: %d; not to allocate",
   449  					resource, pod.GetNamespace(), pod.GetName(), container.Name, needed, allocated)
   450  				continue
   451  			} else {
   452  				klog.Warningf("[ORM] resource %s already allocated to (pod %s/%s, container %v) with smaller number than request: requested: %d, allocated: %d; continue to allocate",
   453  					resource, pod.GetNamespace(), pod.GetName(), container.Name, needed, allocated)
   454  			}
   455  		}
   456  
   457  		m.mutex.Lock()
   458  		e, ok := m.endpoints[resource]
   459  		m.mutex.Unlock()
   460  		if !ok {
   461  			klog.V(5).Infof("[ORM] addContainer resource %s not supported", resource)
   462  			continue
   463  		}
   464  
   465  		resourceReq := &pluginapi.ResourceRequest{
   466  			PodUid:         string(pod.UID),
   467  			PodNamespace:   pod.GetNamespace(),
   468  			PodName:        pod.GetName(),
   469  			ContainerName:  container.Name,
   470  			ContainerType:  containerType,
   471  			ContainerIndex: containerIndex,
   472  			// PodRole and PodType should be identified by more general annotations
   473  			PodRole: pod.Labels[pluginapi.PodRoleLabelKey],
   474  			PodType: pod.Annotations[pluginapi.PodTypeAnnotationKey],
   475  			// use mapped resource name in "ResourceName" to indicates which endpoint to request
   476  			ResourceName: resource,
   477  			// use original requested resource name in "ResourceRequests" in order to make plugin identity real requested resource name
   478  			ResourceRequests: map[string]float64{resource: v.AsApproximateFloat64()},
   479  			Labels:           maputil.CopySS(pod.Labels),
   480  			Annotations:      maputil.CopySS(pod.Annotations),
   481  		}
   482  
   483  		if e.Opts != nil && e.Opts.WithTopologyAlignment {
   484  			hint := m.topologyManager.GetAffinity(string(pod.UID), container.Name, resource)
   485  
   486  			if hint.NUMANodeAffinity == nil {
   487  				klog.Warningf("[ORM] pod: %s/%s; container: %s allocate resource: %s without numa nodes affinity",
   488  					pod.Namespace, pod.Name, container.Name, resource)
   489  			} else {
   490  				klog.Warningf("[ORM] pod: %s/%s; container: %s allocate resource: %s get hint: %v from store",
   491  					pod.Namespace, pod.Name, container.Name, resource, hint)
   492  			}
   493  
   494  			resourceReq.Hint = ParseTopologyManagerHint(hint)
   495  		}
   496  
   497  		response, err := e.E.Allocate(m.ctx, resourceReq)
   498  		if err != nil {
   499  			err = fmt.Errorf("[ORM] addContainer allocate fail, pod %v, container %v, err: %v", pod.Name, container.Name, err)
   500  			klog.Error(err)
   501  			return err
   502  		}
   503  
   504  		if response.AllocationResult == nil {
   505  			klog.Warningf("[ORM] allocate for pod %v container %v resource %v got nil allocation result", pod.Name, container.Name, resource)
   506  			continue
   507  		}
   508  
   509  		// update
   510  		m.UpdatePodResources(response.AllocationResult.ResourceAllocation, pod, container, resource)
   511  	}
   512  
   513  	// write checkpoint
   514  	return m.writeCheckpoint()
   515  }
   516  
   517  func (m *ManagerImpl) syncContainer(pod *v1.Pod, container *v1.Container) error {
   518  	klog.Infof("[ORM] syncContainer, pod: %v, container: %v", pod.Name, container.Name)
   519  	containerAllResources := m.podResources.containerAllResources(string(pod.UID), container.Name)
   520  	if containerAllResources == nil {
   521  		klog.V(5).Infof("got pod %v container %v resources nil", pod.Name, container.Name)
   522  		return nil
   523  	}
   524  
   525  	err := m.resourceExecutor.UpdateContainerResources(pod, container, containerAllResources)
   526  	if err != nil {
   527  		klog.Errorf("[ORM] UpdateContainerResources fail, pod: %v, container: %v, err: %v", pod.Name, container.Name, err)
   528  		return err
   529  	}
   530  
   531  	return nil
   532  }
   533  
   534  func (m *ManagerImpl) reconcile() {
   535  	klog.V(5).Infof("[ORM] reconcile...")
   536  	resourceAllocationResps := make(map[string]*pluginapi.GetResourcesAllocationResponse)
   537  	activePods, err := m.metaManager.MetaServer.GetPodList(m.ctx, native.PodIsActive)
   538  	if err != nil {
   539  		klog.Errorf("[ORM] getPodList fail: %v", err)
   540  		return
   541  	}
   542  
   543  	m.mutex.Lock()
   544  	for resourceName, e := range m.endpoints {
   545  		if e.E.IsStopped() {
   546  			klog.Warningf("[ORM] skip getResourceAllocation of resource: %s, because plugin stopped", resourceName)
   547  			continue
   548  		} else if !e.Opts.NeedReconcile {
   549  			klog.V(5).Infof("[ORM] skip getResourceAllocation of resource: %s, because plugin needn't reconciling", resourceName)
   550  			continue
   551  		}
   552  		resp, err := e.E.GetResourceAllocation(m.ctx, &pluginapi.GetResourcesAllocationRequest{})
   553  		if err != nil {
   554  			klog.Errorf("[ORM] plugin %s getResourcesAllocation fail: %v", resourceName, err)
   555  			continue
   556  		}
   557  
   558  		resourceAllocationResps[resourceName] = resp
   559  	}
   560  	m.mutex.Unlock()
   561  
   562  	for _, pod := range activePods {
   563  		if pod == nil {
   564  			continue
   565  		}
   566  		systemCores, err := isPodKatalystQoSLevelSystemCores(m.qosConfig, pod)
   567  		if err != nil {
   568  			klog.Errorf("[ORM] check pod %s qos level fail: %v", pod.Name, err)
   569  		}
   570  
   571  		if native.CheckDaemonPod(pod) && !systemCores {
   572  			continue
   573  		}
   574  		for _, container := range pod.Spec.Containers {
   575  
   576  			needsReAllocate := false
   577  			for resourceName, resp := range resourceAllocationResps {
   578  				if resp == nil {
   579  					klog.Warningf("[ORM] resource: %s got nil resourceAllocationResp", resourceName)
   580  					continue
   581  				}
   582  
   583  				isRequested, err := m.IsContainerRequestResource(&container, resourceName)
   584  				if err != nil {
   585  					klog.Errorf("[ORM] IsContainerRequestResource fail, container %v,  resourceName %v, err: %v", container.Name, resourceName, err)
   586  					continue
   587  				}
   588  
   589  				if isRequested {
   590  					if resp.PodResources[string(pod.UID)] != nil && resp.PodResources[string(pod.UID)].ContainerResources[container.Name] != nil {
   591  						resourceAllocations := resp.PodResources[string(pod.UID)].ContainerResources[container.Name]
   592  						m.UpdatePodResources(resourceAllocations.ResourceAllocation, pod, &container, resourceName)
   593  					} else {
   594  						needsReAllocate = true
   595  						m.podResources.deleteResourceAllocationInfo(string(pod.UID), container.Name, resourceName)
   596  					}
   597  				}
   598  			}
   599  			if needsReAllocate && !isSkippedContainer(pod, &container) {
   600  				klog.Infof("[ORM] needs re-allocate resource plugin resources for pod %s/%s, container %s during reconcileState",
   601  					pod.Namespace, pod.Name, container.Name)
   602  				err = m.addContainer(pod, &container)
   603  				if err != nil {
   604  					klog.Errorf("[ORM] re addContainer fail, pod %v container %v, err: %v", pod.Name, container.Name, err)
   605  					continue
   606  				}
   607  			}
   608  
   609  			_ = m.syncContainer(pod, &container)
   610  		}
   611  	}
   612  
   613  	err = m.writeCheckpoint()
   614  	if err != nil {
   615  		klog.Errorf("[ORM] writeCheckpoint: %v", err)
   616  	}
   617  }
   618  
   619  func (m *ManagerImpl) UpdatePodResources(
   620  	resourceAllocation map[string]*pluginapi.ResourceAllocationInfo,
   621  	pod *v1.Pod, container *v1.Container, resource string,
   622  ) {
   623  	for accResourceName, allocationInfo := range resourceAllocation {
   624  		if allocationInfo == nil {
   625  			klog.Warningf("[ORM] allocation request for resources %s - accompanying resource: %s for pod: %s/%s, container: %s got nil allocation information",
   626  				resource, accResourceName, pod.Namespace, pod.Name, container.Name)
   627  			continue
   628  		}
   629  
   630  		klog.V(4).Infof("[ORM] allocation information for resources %s - accompanying resource: %s for pod: %s/%s, container: %s is %v",
   631  			resource, accResourceName, pod.Namespace, pod.Name, container.Name, *allocationInfo)
   632  
   633  		m.podResources.insert(string(pod.UID), container.Name, accResourceName, allocationInfo)
   634  	}
   635  }
   636  
   637  // getMappedResourceName returns mapped resource name of input "resourceName" in m.resourceNamesMap if there is the mapping entry,
   638  // or it will return input "resourceName".
   639  // If both the input "resourceName" and the mapped resource name are requested, it will return error.
   640  func (m *ManagerImpl) getMappedResourceName(resourceName string, requests v1.ResourceList) (string, error) {
   641  	if _, found := m.resourceNamesMap[resourceName]; !found {
   642  		return resourceName, nil
   643  	}
   644  
   645  	mappedResourceName := m.resourceNamesMap[resourceName]
   646  
   647  	_, foundReq := requests[v1.ResourceName(resourceName)]
   648  	_, foundMappedReq := requests[v1.ResourceName(mappedResourceName)]
   649  
   650  	if foundReq && foundMappedReq {
   651  		return mappedResourceName, fmt.Errorf("both %s and mapped %s are requested", resourceName, mappedResourceName)
   652  	}
   653  
   654  	klog.V(5).Infof("[ORM] map resource name: %s to %s", resourceName, mappedResourceName)
   655  
   656  	return mappedResourceName, nil
   657  }
   658  
   659  func (m *ManagerImpl) IsContainerRequestResource(container *v1.Container, resourceName string) (bool, error) {
   660  	if container == nil {
   661  		return false, nil
   662  	}
   663  
   664  	for k := range container.Resources.Requests {
   665  		requestedResourceName, err := m.getMappedResourceName(string(k), container.Resources.Requests)
   666  		if err != nil {
   667  			return false, err
   668  		}
   669  
   670  		if requestedResourceName == resourceName {
   671  			return true, nil
   672  		}
   673  	}
   674  
   675  	return false, nil
   676  }
   677  
   678  func GetContainerTypeAndIndex(pod *v1.Pod, container *v1.Container) (containerType pluginapi.ContainerType, containerIndex uint64, err error) {
   679  	if pod == nil || container == nil {
   680  		err = fmt.Errorf("got nil pod: %v or container: %v", pod, container)
   681  		return
   682  	}
   683  
   684  	foundContainer := false
   685  
   686  	for i, initContainer := range pod.Spec.InitContainers {
   687  		if container.Name == initContainer.Name {
   688  			foundContainer = true
   689  			containerType = pluginapi.ContainerType_INIT
   690  			containerIndex = uint64(i)
   691  			break
   692  		}
   693  	}
   694  
   695  	if !foundContainer {
   696  		mainContainerName := pod.Annotations[MainContainerNameAnnotationKey]
   697  
   698  		if mainContainerName == "" && len(pod.Spec.Containers) > 0 {
   699  			mainContainerName = pod.Spec.Containers[0].Name
   700  		}
   701  
   702  		for i, appContainer := range pod.Spec.Containers {
   703  			if container.Name == appContainer.Name {
   704  				foundContainer = true
   705  
   706  				if container.Name == mainContainerName {
   707  					containerType = pluginapi.ContainerType_MAIN
   708  				} else {
   709  					containerType = pluginapi.ContainerType_SIDECAR
   710  				}
   711  
   712  				containerIndex = uint64(i)
   713  				break
   714  			}
   715  		}
   716  	}
   717  
   718  	if !foundContainer {
   719  		err = fmt.Errorf("GetContainerTypeAndIndex doesn't find container: %s in pod: %s/%s", container.Name, pod.Namespace, pod.Name)
   720  	}
   721  
   722  	return
   723  }
   724  
   725  func isSkippedContainer(pod *v1.Pod, container *v1.Container) bool {
   726  	containerType, _, err := GetContainerTypeAndIndex(pod, container)
   727  	if err != nil {
   728  		klog.Errorf("GetContainerTypeAndIndex failed with error: %v", err)
   729  		return false
   730  	}
   731  
   732  	return containerType == pluginapi.ContainerType_INIT
   733  }
   734  
   735  func isPodKatalystQoSLevelSystemCores(qosConfig *generic.QoSConfiguration, pod *v1.Pod) (bool, error) {
   736  	qosLevel, err := qosConfig.GetQoSLevelForPod(pod)
   737  	if err != nil {
   738  		return false, err
   739  	}
   740  
   741  	return qosLevel == pluginapi.KatalystQoSLevelSystemCores, nil
   742  }
   743  
   744  func ParseListOfTopologyHints(hintsList *pluginapi.ListOfTopologyHints) []topology.TopologyHint {
   745  	if hintsList == nil {
   746  		return nil
   747  	}
   748  
   749  	resultHints := make([]topology.TopologyHint, 0, len(hintsList.Hints))
   750  
   751  	for _, hint := range hintsList.Hints {
   752  		if hint != nil {
   753  
   754  			mask := bitmask.NewEmptyBitMask()
   755  
   756  			for _, node := range hint.Nodes {
   757  				mask.Add(int(node))
   758  			}
   759  
   760  			resultHints = append(resultHints, topology.TopologyHint{
   761  				NUMANodeAffinity: mask,
   762  				Preferred:        hint.Preferred,
   763  			})
   764  		}
   765  	}
   766  
   767  	return resultHints
   768  }
   769  
   770  func ParseTopologyManagerHint(hint topology.TopologyHint) *pluginapi.TopologyHint {
   771  	var nodes []uint64
   772  
   773  	if hint.NUMANodeAffinity != nil {
   774  		bits := hint.NUMANodeAffinity.GetBits()
   775  
   776  		for _, node := range bits {
   777  			nodes = append(nodes, uint64(node))
   778  		}
   779  	}
   780  
   781  	return &pluginapi.TopologyHint{
   782  		Nodes:     nodes,
   783  		Preferred: hint.Preferred,
   784  	}
   785  }