volcano.sh/volcano@v1.9.0/pkg/scheduler/api/node_info.go (about)

     1  /*
     2   Copyright 2021 The Volcano Authors.
     3  
     4   Licensed under the Apache License, Version 2.0 (the "License");
     5   you may not use this file except in compliance with the License.
     6   You may obtain a copy of the License at
     7  
     8       http://www.apache.org/licenses/LICENSE-2.0
     9  
    10   Unless required by applicable law or agreed to in writing, software
    11   distributed under the License is distributed on an "AS IS" BASIS,
    12   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   See the License for the specific language governing permissions and
    14   limitations under the License.
    15  */
    16  
    17  package api
    18  
    19  import (
    20  	"fmt"
    21  	"strconv"
    22  	"time"
    23  
    24  	v1 "k8s.io/api/core/v1"
    25  	"k8s.io/klog/v2"
    26  	k8sframework "k8s.io/kubernetes/pkg/scheduler/framework"
    27  
    28  	"volcano.sh/apis/pkg/apis/scheduling/v1beta1"
    29  
    30  	"volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/gpushare"
    31  	"volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/vgpu"
    32  )
    33  
    34  type AllocateFailError struct {
    35  	Reason string
    36  }
    37  
    38  func (o *AllocateFailError) Error() string {
    39  	return o.Reason
    40  }
    41  
// CSINodeStatusInfo pairs a CSI node name with a boolean status per driver.
//
// CSINodeName is the name of the CSI node, and DriverStatus is keyed by a
// string with a boolean value per key.
// NOTE(review): the keys are presumably CSI driver names and the value
// presumably tells whether the driver is usable on the node — confirm
// against the code that fills this struct.
type CSINodeStatusInfo struct {
	CSINodeName  string
	DriverStatus map[string]bool
}
    46  
// NodeInfo is node level aggregated information.
//
// It couples the Kubernetes node object with the scheduler's resource
// accounting for that node (idle, used, releasing and pipelined amounts), the
// tasks placed on it, NUMA topology data and device bookkeeping.
type NodeInfo struct {
	// Name and Node identify the node. NewNodeInfo and setNode copy Name from
	// Node.Name; Node stays nil when the NodeInfo was built without a node.
	Name string
	Node *v1.Node

	// The state of node
	State NodeState

	// The releasing resource on that node
	Releasing *Resource
	// The pipelined resource on that node
	Pipelined *Resource
	// The idle resource on that node
	Idle *Resource
	// The used resource on that node, including running and terminating
	// pods
	Used *Resource

	// Allocatable and Capacity are built from the node status with
	// OversubscriptionResource added on top (see NewNodeInfo and setNode).
	// ResourceUsage carries the real load usage of the node (see NodeUsage).
	Allocatable   *Resource
	Capacity      *Resource
	ResourceUsage *NodeUsage

	// Tasks holds the node's own copies of the tasks added through AddTask.
	// NumaInfo is the source NUMA topology and NumaSchedulerInfo the
	// scheduler-side view derived from it by RefreshNumaSchedulerInfoByCrd;
	// NumaChgFlag selects how that refresh is applied.
	// RevocableZone is read from the node label v1beta1.RevocableZone.
	Tasks             map[TaskID]*TaskInfo
	NumaInfo          *NumatopoInfo
	NumaChgFlag       NumaChgFlag
	NumaSchedulerInfo *NumatopoInfo
	RevocableZone     string

	// Used to store custom information. setNodeOthersResource stores the
	// sharable GPU device pools here under GPUSharingDevice and
	// vgpu.DeviceName.
	Others map[string]interface{}
	//SharedDevices map[string]SharedDevicePool

	// enable node resource oversubscription
	OversubscriptionNode bool
	// OfflineJobEvicting is parsed from the node annotation of the same name.
	// True means the node's resource usage is too high, so newly dispatched
	// pods can not use the oversubscription resource.
	OfflineJobEvicting bool

	// Resource Oversubscription feature: the Oversubscription Resource reported in annotation
	OversubscriptionResource *Resource

	// ImageStates holds the entry of an image if and only if this image is on the node. The entry can be used for
	// checking an image's existence and advanced usage (e.g., image locality scheduling policy) based on the image
	// state information.
	ImageStates map[string]*k8sframework.ImageStateSummary
}
    92  
    93  // FutureIdle returns resources that will be idle in the future:
    94  //
    95  // That is current idle resources plus released resources minus pipelined resources.
    96  func (ni *NodeInfo) FutureIdle() *Resource {
    97  	return ni.Idle.Clone().Add(ni.Releasing).SubWithoutAssert(ni.Pipelined)
    98  }
    99  
   100  // GetNodeAllocatable return node Allocatable without OversubscriptionResource resource
   101  func (ni *NodeInfo) GetNodeAllocatable() *Resource {
   102  	return NewResource(ni.Node.Status.Allocatable)
   103  }
   104  
// NodeState defines the current state of node.
//
// Phase is what NodeInfo.Ready checks against Ready. Reason is a short text
// explaining a non-ready phase; setNodeState uses "UnInitialized" and
// "NotReady", and the empty string when the node is ready.
type NodeState struct {
	Phase  NodePhase
	Reason string
}
   110  
   111  // NodeUsage defines the real load usage of node
   112  type NodeUsage struct {
   113  	MetricsTime time.Time
   114  	CPUUsageAvg map[string]float64
   115  	MEMUsageAvg map[string]float64
   116  }
   117  
   118  func (nu *NodeUsage) DeepCopy() *NodeUsage {
   119  	newUsage := &NodeUsage{
   120  		CPUUsageAvg: make(map[string]float64),
   121  		MEMUsageAvg: make(map[string]float64),
   122  	}
   123  	newUsage.MetricsTime = nu.MetricsTime
   124  	for k, v := range nu.CPUUsageAvg {
   125  		newUsage.CPUUsageAvg[k] = v
   126  	}
   127  	for k, v := range nu.MEMUsageAvg {
   128  		newUsage.MEMUsageAvg[k] = v
   129  	}
   130  	return newUsage
   131  }
   132  
   133  // NewNodeInfo is used to create new nodeInfo object
   134  func NewNodeInfo(node *v1.Node) *NodeInfo {
   135  	nodeInfo := &NodeInfo{
   136  		Releasing: EmptyResource(),
   137  		Pipelined: EmptyResource(),
   138  		Idle:      EmptyResource(),
   139  		Used:      EmptyResource(),
   140  
   141  		Allocatable:   EmptyResource(),
   142  		Capacity:      EmptyResource(),
   143  		ResourceUsage: &NodeUsage{},
   144  
   145  		OversubscriptionResource: EmptyResource(),
   146  		Tasks:                    make(map[TaskID]*TaskInfo),
   147  
   148  		Others:      make(map[string]interface{}),
   149  		ImageStates: make(map[string]*k8sframework.ImageStateSummary),
   150  	}
   151  
   152  	nodeInfo.setOversubscription(node)
   153  
   154  	if node != nil {
   155  		nodeInfo.Name = node.Name
   156  		nodeInfo.Node = node
   157  		nodeInfo.Idle = NewResource(node.Status.Allocatable).Add(nodeInfo.OversubscriptionResource)
   158  		nodeInfo.Allocatable = NewResource(node.Status.Allocatable).Add(nodeInfo.OversubscriptionResource)
   159  		nodeInfo.Capacity = NewResource(node.Status.Capacity).Add(nodeInfo.OversubscriptionResource)
   160  	}
   161  	nodeInfo.setNodeOthersResource(node)
   162  	nodeInfo.setNodeState(node)
   163  	nodeInfo.setRevocableZone(node)
   164  
   165  	return nodeInfo
   166  }
   167  
   168  // RefreshNumaSchedulerInfoByCrd used to update scheduler numa information based the CRD numatopo
   169  func (ni *NodeInfo) RefreshNumaSchedulerInfoByCrd() {
   170  	if ni.NumaInfo == nil {
   171  		ni.NumaSchedulerInfo = nil
   172  		return
   173  	}
   174  
   175  	tmp := ni.NumaInfo.DeepCopy()
   176  	if ni.NumaChgFlag == NumaInfoMoreFlag {
   177  		ni.NumaSchedulerInfo = tmp
   178  	} else if ni.NumaChgFlag == NumaInfoLessFlag {
   179  		numaResMap := ni.NumaSchedulerInfo.NumaResMap
   180  		for resName, resInfo := range tmp.NumaResMap {
   181  			klog.V(5).Infof("resource %s Allocatable : current %v new %v on node %s",
   182  				resName, numaResMap[resName], resInfo, ni.Name)
   183  			if numaResMap[resName].Allocatable.Size() >= resInfo.Allocatable.Size() {
   184  				numaResMap[resName].Allocatable = resInfo.Allocatable.Clone()
   185  				numaResMap[resName].Capacity = resInfo.Capacity
   186  			}
   187  		}
   188  	}
   189  
   190  	ni.NumaChgFlag = NumaInfoResetFlag
   191  }
   192  
   193  // Clone used to clone nodeInfo Object
   194  func (ni *NodeInfo) Clone() *NodeInfo {
   195  	res := NewNodeInfo(ni.Node)
   196  
   197  	for _, p := range ni.Tasks {
   198  		res.AddTask(p)
   199  	}
   200  	if ni.NumaInfo != nil {
   201  		res.NumaInfo = ni.NumaInfo.DeepCopy()
   202  	}
   203  	if ni.ResourceUsage != nil {
   204  		res.ResourceUsage = ni.ResourceUsage.DeepCopy()
   205  	}
   206  
   207  	if ni.NumaSchedulerInfo != nil {
   208  		res.NumaSchedulerInfo = ni.NumaSchedulerInfo.DeepCopy()
   209  		klog.V(5).Infof("node[%s]", ni.Name)
   210  		for resName, resInfo := range res.NumaSchedulerInfo.NumaResMap {
   211  			klog.V(5).Infof("current resource %s : %v", resName, resInfo)
   212  		}
   213  
   214  		klog.V(5).Infof("current Policies : %v", res.NumaSchedulerInfo.Policies)
   215  	}
   216  
   217  	klog.V(5).Infof("imageStates is %v", res.ImageStates)
   218  
   219  	res.Others = ni.CloneOthers()
   220  	res.ImageStates = ni.CloneImageSummary()
   221  	return res
   222  }
   223  
   224  // Ready returns whether node is ready for scheduling
   225  func (ni *NodeInfo) Ready() bool {
   226  	return ni.State.Phase == Ready
   227  }
   228  
   229  func (ni *NodeInfo) setRevocableZone(node *v1.Node) {
   230  	if node == nil {
   231  		klog.Warningf("the argument node is null.")
   232  		return
   233  	}
   234  
   235  	revocableZone := ""
   236  	if len(node.Labels) > 0 {
   237  		if value, found := node.Labels[v1beta1.RevocableZone]; found {
   238  			revocableZone = value
   239  		}
   240  	}
   241  	ni.RevocableZone = revocableZone
   242  }
   243  
   244  // Check node if enable Oversubscription and set Oversubscription resources
   245  // Only support oversubscription cpu and memory resource for this version
   246  func (ni *NodeInfo) setOversubscription(node *v1.Node) {
   247  	if node == nil {
   248  		return
   249  	}
   250  
   251  	ni.OversubscriptionNode = false
   252  	ni.OfflineJobEvicting = false
   253  	if len(node.Labels) > 0 {
   254  		if value, found := node.Labels[OversubscriptionNode]; found {
   255  			b, err := strconv.ParseBool(value)
   256  			if err == nil {
   257  				ni.OversubscriptionNode = b
   258  			} else {
   259  				ni.OversubscriptionNode = false
   260  			}
   261  			klog.V(5).Infof("Set node %s Oversubscription to %v", node.Name, ni.OversubscriptionNode)
   262  		}
   263  	}
   264  
   265  	if len(node.Annotations) > 0 {
   266  		if value, found := node.Annotations[OfflineJobEvicting]; found {
   267  			b, err := strconv.ParseBool(value)
   268  			if err == nil {
   269  				ni.OfflineJobEvicting = b
   270  			} else {
   271  				ni.OfflineJobEvicting = false
   272  			}
   273  			klog.V(5).Infof("Set node %s OfflineJobEvicting to %v", node.Name, ni.OfflineJobEvicting)
   274  		}
   275  		if value, found := node.Annotations[OversubscriptionCPU]; found {
   276  			ni.OversubscriptionResource.MilliCPU, _ = strconv.ParseFloat(value, 64)
   277  			klog.V(5).Infof("Set node %s Oversubscription CPU to %v", node.Name, ni.OversubscriptionResource.MilliCPU)
   278  		}
   279  		if value, found := node.Annotations[OversubscriptionMemory]; found {
   280  			ni.OversubscriptionResource.Memory, _ = strconv.ParseFloat(value, 64)
   281  			klog.V(5).Infof("Set node %s Oversubscription Memory to %v", node.Name, ni.OversubscriptionResource.Memory)
   282  		}
   283  	}
   284  }
   285  
   286  func (ni *NodeInfo) setNodeState(node *v1.Node) {
   287  	// If node is nil, the node is un-initialized in cache
   288  	if node == nil {
   289  		ni.State = NodeState{
   290  			Phase:  NotReady,
   291  			Reason: "UnInitialized",
   292  		}
   293  		return
   294  	}
   295  
   296  	// set NodeState according to resources
   297  	if ok, resources := ni.Used.LessEqualWithResourcesName(ni.Allocatable, Zero); !ok {
   298  		klog.ErrorS(nil, "Node out of sync", "name", ni.Name, "resources", resources)
   299  	}
   300  
   301  	// If node not ready, e.g. power off
   302  	for _, cond := range node.Status.Conditions {
   303  		if cond.Type == v1.NodeReady && cond.Status != v1.ConditionTrue {
   304  			ni.State = NodeState{
   305  				Phase:  NotReady,
   306  				Reason: "NotReady",
   307  			}
   308  			klog.Warningf("set the node %s status to %s.", node.Name, NotReady.String())
   309  			return
   310  		}
   311  	}
   312  
   313  	// Node is ready (ignore node conditions because of taint/toleration)
   314  	ni.State = NodeState{
   315  		Phase:  Ready,
   316  		Reason: "",
   317  	}
   318  
   319  	klog.V(4).Infof("set the node %s status to %s.", node.Name, Ready.String())
   320  }
   321  
   322  // SetNode sets kubernetes node object to nodeInfo object
   323  func (ni *NodeInfo) SetNode(node *v1.Node) {
   324  	ni.setNodeState(node)
   325  	if !ni.Ready() {
   326  		klog.Warningf("Failed to set node info for %s, phase: %s, reason: %s",
   327  			ni.Name, ni.State.Phase, ni.State.Reason)
   328  		return
   329  	}
   330  
   331  	// Dry run, make sure all fields other than `State` are in the original state.
   332  	copy := ni.Clone()
   333  	copy.setNode(node)
   334  	copy.setNodeState(node)
   335  	if !copy.Ready() {
   336  		klog.Warningf("SetNode makes node %s not ready, phase: %s, reason: %s",
   337  			copy.Name, copy.State.Phase, copy.State.Reason)
   338  		// Set state of node to !Ready, left other fields untouched
   339  		ni.State = copy.State
   340  		return
   341  	}
   342  
   343  	ni.setNode(node)
   344  }
   345  
   346  // setNodeOthersResource initialize sharable devices
   347  func (ni *NodeInfo) setNodeOthersResource(node *v1.Node) {
   348  	ni.Others[GPUSharingDevice] = gpushare.NewGPUDevices(ni.Name, node)
   349  	ni.Others[vgpu.DeviceName] = vgpu.NewGPUDevices(ni.Name, node)
   350  	IgnoredDevicesList.Set(
   351  		ni.Others[GPUSharingDevice].(Devices).GetIgnoredDevices(),
   352  		ni.Others[vgpu.DeviceName].(Devices).GetIgnoredDevices(),
   353  	)
   354  }
   355  
   356  // setNode sets kubernetes node object to nodeInfo object without assertion
   357  func (ni *NodeInfo) setNode(node *v1.Node) {
   358  	ni.setOversubscription(node)
   359  	ni.setNodeOthersResource(node)
   360  	ni.setRevocableZone(node)
   361  
   362  	ni.Name = node.Name
   363  	ni.Node = node
   364  
   365  	ni.Allocatable = NewResource(node.Status.Allocatable).Add(ni.OversubscriptionResource)
   366  	ni.Capacity = NewResource(node.Status.Capacity).Add(ni.OversubscriptionResource)
   367  	ni.Releasing = EmptyResource()
   368  	ni.Pipelined = EmptyResource()
   369  	ni.Idle = NewResource(node.Status.Allocatable).Add(ni.OversubscriptionResource)
   370  	ni.Used = EmptyResource()
   371  
   372  	for _, ti := range ni.Tasks {
   373  		switch ti.Status {
   374  		case Releasing:
   375  			ni.allocateIdleResource(ti)
   376  			ni.Releasing.Add(ti.Resreq)
   377  			ni.Used.Add(ti.Resreq)
   378  			ni.addResource(ti.Pod)
   379  		case Pipelined:
   380  			ni.Pipelined.Add(ti.Resreq)
   381  		default:
   382  			ni.allocateIdleResource(ti)
   383  			ni.Used.Add(ti.Resreq)
   384  			ni.addResource(ti.Pod)
   385  		}
   386  	}
   387  }
   388  
   389  func (ni *NodeInfo) allocateIdleResource(ti *TaskInfo) {
   390  	ok, resources := ti.Resreq.LessEqualWithResourcesName(ni.Idle, Zero)
   391  	if ok {
   392  		ni.Idle.sub(ti.Resreq)
   393  		return
   394  	}
   395  
   396  	ni.Idle.sub(ti.Resreq)
   397  	klog.ErrorS(nil, "Idle resources turn into negative after allocated",
   398  		"nodeName", ni.Name, "task", klog.KObj(ti.Pod), "resources", resources, "idle", ni.Idle.String(), "req", ti.Resreq.String())
   399  }
   400  
   401  // AddTask is used to add a task in nodeInfo object
   402  //
   403  // If error occurs both task and node are guaranteed to be in the original state.
   404  func (ni *NodeInfo) AddTask(task *TaskInfo) error {
   405  	if len(task.NodeName) > 0 && len(ni.Name) > 0 && task.NodeName != ni.Name {
   406  		return fmt.Errorf("task <%v/%v> already on different node <%v>",
   407  			task.Namespace, task.Name, task.NodeName)
   408  	}
   409  
   410  	key := PodKey(task.Pod)
   411  	if _, found := ni.Tasks[key]; found {
   412  		return fmt.Errorf("task <%v/%v> already on node <%v>",
   413  			task.Namespace, task.Name, ni.Name)
   414  	}
   415  
   416  	// Node will hold a copy of task to make sure the status
   417  	// change will not impact resource in node.
   418  	ti := task.Clone()
   419  
   420  	if ni.Node != nil {
   421  		switch ti.Status {
   422  		case Releasing:
   423  			ni.allocateIdleResource(ti)
   424  			ni.Releasing.Add(ti.Resreq)
   425  			ni.Used.Add(ti.Resreq)
   426  			ni.addResource(ti.Pod)
   427  		case Pipelined:
   428  			ni.Pipelined.Add(ti.Resreq)
   429  		case Binding:
   430  			// When task in Binding status, it will bind to node, we should double-check whether idle resources are enough to put task before bind to apiserver.
   431  			if ok, resNames := ti.Resreq.LessEqualWithResourcesName(ni.Idle, Zero); !ok {
   432  				return fmt.Errorf("node %s resources %v are not enough to put task <%s/%s>, idle: %s, req: %s", ni.Name, resNames, ti.Namespace, ti.Name, ni.Idle.String(), ti.Resreq.String())
   433  			}
   434  			ni.allocateIdleResource(ti)
   435  			ni.Used.Add(ti.Resreq)
   436  			ni.addResource(ti.Pod)
   437  		default:
   438  			ni.allocateIdleResource(ti)
   439  			ni.Used.Add(ti.Resreq)
   440  			ni.addResource(ti.Pod)
   441  		}
   442  	}
   443  
   444  	if ni.NumaInfo != nil {
   445  		ni.NumaInfo.AddTask(ti)
   446  	}
   447  
   448  	// Update task node name upon successful task addition.
   449  	task.NodeName = ni.Name
   450  	ti.NodeName = ni.Name
   451  	ni.Tasks[key] = ti
   452  
   453  	return nil
   454  }
   455  
   456  // RemoveTask used to remove a task from nodeInfo object.
   457  //
   458  // If error occurs both task and node are guaranteed to be in the original state.
   459  func (ni *NodeInfo) RemoveTask(ti *TaskInfo) error {
   460  	key := PodKey(ti.Pod)
   461  
   462  	task, found := ni.Tasks[key]
   463  	if !found {
   464  		klog.Warningf("failed to find task <%v/%v> on host <%v>",
   465  			ti.Namespace, ti.Name, ni.Name)
   466  		return nil
   467  	}
   468  
   469  	if ni.Node != nil {
   470  		switch task.Status {
   471  		case Releasing:
   472  			ni.Releasing.Sub(task.Resreq)
   473  			ni.Idle.Add(task.Resreq)
   474  			ni.Used.Sub(task.Resreq)
   475  			ni.subResource(ti.Pod)
   476  		case Pipelined:
   477  			ni.Pipelined.Sub(task.Resreq)
   478  		default:
   479  			ni.Idle.Add(task.Resreq)
   480  			ni.Used.Sub(task.Resreq)
   481  			ni.subResource(ti.Pod)
   482  		}
   483  	}
   484  
   485  	if ni.NumaInfo != nil {
   486  		ni.NumaInfo.RemoveTask(ti)
   487  	}
   488  
   489  	delete(ni.Tasks, key)
   490  
   491  	return nil
   492  }
   493  
   494  // addResource is used to add sharable devices
   495  func (ni *NodeInfo) addResource(pod *v1.Pod) {
   496  	ni.Others[GPUSharingDevice].(Devices).AddResource(pod)
   497  	ni.Others[vgpu.DeviceName].(Devices).AddResource(pod)
   498  }
   499  
   500  // subResource is used to subtract sharable devices
   501  func (ni *NodeInfo) subResource(pod *v1.Pod) {
   502  	ni.Others[GPUSharingDevice].(Devices).SubResource(pod)
   503  	ni.Others[vgpu.DeviceName].(Devices).SubResource(pod)
   504  }
   505  
   506  // UpdateTask is used to update a task in nodeInfo object.
   507  //
   508  // If error occurs both task and node are guaranteed to be in the original state.
   509  func (ni *NodeInfo) UpdateTask(ti *TaskInfo) error {
   510  	if err := ni.RemoveTask(ti); err != nil {
   511  		return err
   512  	}
   513  
   514  	if err := ni.AddTask(ti); err != nil {
   515  		// This should never happen if task removal was successful,
   516  		// because only possible error during task addition is when task is still on a node.
   517  		klog.Fatalf("Failed to add Task <%s,%s> to Node <%s> during task update",
   518  			ti.Namespace, ti.Name, ni.Name)
   519  	}
   520  	return nil
   521  }
   522  
   523  // String returns nodeInfo details in string format
   524  func (ni NodeInfo) String() string {
   525  	tasks := ""
   526  
   527  	i := 0
   528  	for _, task := range ni.Tasks {
   529  		tasks += fmt.Sprintf("\n\t %d: %v", i, task)
   530  		i++
   531  	}
   532  
   533  	return fmt.Sprintf("Node (%s): allocatable<%v> idle <%v>, used <%v>, releasing <%v>, oversubscribution <%v>, "+
   534  		"state <phase %s, reaseon %s>, oversubscributionNode <%v>, offlineJobEvicting <%v>,taints <%v>%s, imageStates %v",
   535  		ni.Name, ni.Allocatable, ni.Idle, ni.Used, ni.Releasing, ni.OversubscriptionResource, ni.State.Phase, ni.State.Reason, ni.OversubscriptionNode, ni.OfflineJobEvicting, ni.Node.Spec.Taints, tasks, ni.ImageStates)
   536  }
   537  
   538  // Pods returns all pods running in that node
   539  func (ni *NodeInfo) Pods() (pods []*v1.Pod) {
   540  	for _, t := range ni.Tasks {
   541  		pods = append(pods, t.Pod)
   542  	}
   543  
   544  	return
   545  }
   546  
   547  // CloneImageSummary Clone Image State
   548  func (ni *NodeInfo) CloneImageSummary() map[string]*k8sframework.ImageStateSummary {
   549  	nodeImageStates := make(map[string]*k8sframework.ImageStateSummary)
   550  	for imageName, summary := range ni.ImageStates {
   551  		newImageSummary := &k8sframework.ImageStateSummary{
   552  			Size:     summary.Size,
   553  			NumNodes: summary.NumNodes,
   554  		}
   555  		nodeImageStates[imageName] = newImageSummary
   556  	}
   557  	return nodeImageStates
   558  }
   559  
   560  // CloneOthers clone other map resources
   561  func (ni *NodeInfo) CloneOthers() map[string]interface{} {
   562  	others := make(map[string]interface{})
   563  	for k, v := range ni.Others {
   564  		others[k] = v
   565  	}
   566  	return others
   567  }
   568  
   569  // Clone clone csi node status info
   570  func (cs *CSINodeStatusInfo) Clone() *CSINodeStatusInfo {
   571  	newcs := &CSINodeStatusInfo{
   572  		CSINodeName:  cs.CSINodeName,
   573  		DriverStatus: make(map[string]bool),
   574  	}
   575  	for k, v := range cs.DriverStatus {
   576  		newcs.DriverStatus[k] = v
   577  	}
   578  	return newcs
   579  }