volcano.sh/volcano@v1.9.0/pkg/scheduler/api/node_info.go (about) 1 /* 2 Copyright 2021 The Volcano Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package api 18 19 import ( 20 "fmt" 21 "strconv" 22 "time" 23 24 v1 "k8s.io/api/core/v1" 25 "k8s.io/klog/v2" 26 k8sframework "k8s.io/kubernetes/pkg/scheduler/framework" 27 28 "volcano.sh/apis/pkg/apis/scheduling/v1beta1" 29 30 "volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/gpushare" 31 "volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/vgpu" 32 ) 33 34 type AllocateFailError struct { 35 Reason string 36 } 37 38 func (o *AllocateFailError) Error() string { 39 return o.Reason 40 } 41 42 type CSINodeStatusInfo struct { 43 CSINodeName string 44 DriverStatus map[string]bool 45 } 46 47 // NodeInfo is node level aggregated information. 48 type NodeInfo struct { 49 Name string 50 Node *v1.Node 51 52 // The state of node 53 State NodeState 54 55 // The releasing resource on that node 56 Releasing *Resource 57 // The pipelined resource on that node 58 Pipelined *Resource 59 // The idle resource on that node 60 Idle *Resource 61 // The used resource on that node, including running and terminating 62 // pods 63 Used *Resource 64 65 Allocatable *Resource 66 Capacity *Resource 67 ResourceUsage *NodeUsage 68 69 Tasks map[TaskID]*TaskInfo 70 NumaInfo *NumatopoInfo 71 NumaChgFlag NumaChgFlag 72 NumaSchedulerInfo *NumatopoInfo 73 RevocableZone string 74 75 // Used to store custom information 76 Others map[string]interface{} 77 //SharedDevices map[string]SharedDevicePool 78 79 // enable node resource oversubscription 80 OversubscriptionNode bool 81 // OfflineJobEvicting true means node resource usage too high then dispatched pod can not use oversubscription resource 82 OfflineJobEvicting bool 83 84 // Resource Oversubscription feature: the Oversubscription Resource reported in annotation 85 OversubscriptionResource *Resource 86 87 // ImageStates holds the entry of an image if and only if this image is on the node. The entry can be used for 88 // checking an image's existence and advanced usage (e.g., image locality scheduling policy) based on the image 89 // state information. 90 ImageStates map[string]*k8sframework.ImageStateSummary 91 } 92 93 // FutureIdle returns resources that will be idle in the future: 94 // 95 // That is current idle resources plus released resources minus pipelined resources. 96 func (ni *NodeInfo) FutureIdle() *Resource { 97 return ni.Idle.Clone().Add(ni.Releasing).SubWithoutAssert(ni.Pipelined) 98 } 99 100 // GetNodeAllocatable return node Allocatable without OversubscriptionResource resource 101 func (ni *NodeInfo) GetNodeAllocatable() *Resource { 102 return NewResource(ni.Node.Status.Allocatable) 103 } 104 105 // NodeState defines the current state of node. 106 type NodeState struct { 107 Phase NodePhase 108 Reason string 109 } 110 111 // NodeUsage defines the real load usage of node 112 type NodeUsage struct { 113 MetricsTime time.Time 114 CPUUsageAvg map[string]float64 115 MEMUsageAvg map[string]float64 116 } 117 118 func (nu *NodeUsage) DeepCopy() *NodeUsage { 119 newUsage := &NodeUsage{ 120 CPUUsageAvg: make(map[string]float64), 121 MEMUsageAvg: make(map[string]float64), 122 } 123 newUsage.MetricsTime = nu.MetricsTime 124 for k, v := range nu.CPUUsageAvg { 125 newUsage.CPUUsageAvg[k] = v 126 } 127 for k, v := range nu.MEMUsageAvg { 128 newUsage.MEMUsageAvg[k] = v 129 } 130 return newUsage 131 } 132 133 // NewNodeInfo is used to create new nodeInfo object 134 func NewNodeInfo(node *v1.Node) *NodeInfo { 135 nodeInfo := &NodeInfo{ 136 Releasing: EmptyResource(), 137 Pipelined: EmptyResource(), 138 Idle: EmptyResource(), 139 Used: EmptyResource(), 140 141 Allocatable: EmptyResource(), 142 Capacity: EmptyResource(), 143 ResourceUsage: &NodeUsage{}, 144 145 OversubscriptionResource: EmptyResource(), 146 Tasks: make(map[TaskID]*TaskInfo), 147 148 Others: make(map[string]interface{}), 149 ImageStates: make(map[string]*k8sframework.ImageStateSummary), 150 } 151 152 nodeInfo.setOversubscription(node) 153 154 if node != nil { 155 nodeInfo.Name = node.Name 156 nodeInfo.Node = node 157 nodeInfo.Idle = NewResource(node.Status.Allocatable).Add(nodeInfo.OversubscriptionResource) 158 nodeInfo.Allocatable = NewResource(node.Status.Allocatable).Add(nodeInfo.OversubscriptionResource) 159 nodeInfo.Capacity = NewResource(node.Status.Capacity).Add(nodeInfo.OversubscriptionResource) 160 } 161 nodeInfo.setNodeOthersResource(node) 162 nodeInfo.setNodeState(node) 163 nodeInfo.setRevocableZone(node) 164 165 return nodeInfo 166 } 167 168 // RefreshNumaSchedulerInfoByCrd used to update scheduler numa information based the CRD numatopo 169 func (ni *NodeInfo) RefreshNumaSchedulerInfoByCrd() { 170 if ni.NumaInfo == nil { 171 ni.NumaSchedulerInfo = nil 172 return 173 } 174 175 tmp := ni.NumaInfo.DeepCopy() 176 if ni.NumaChgFlag == NumaInfoMoreFlag { 177 ni.NumaSchedulerInfo = tmp 178 } else if ni.NumaChgFlag == NumaInfoLessFlag { 179 numaResMap := ni.NumaSchedulerInfo.NumaResMap 180 for resName, resInfo := range tmp.NumaResMap { 181 klog.V(5).Infof("resource %s Allocatable : current %v new %v on node %s", 182 resName, numaResMap[resName], resInfo, ni.Name) 183 if numaResMap[resName].Allocatable.Size() >= resInfo.Allocatable.Size() { 184 numaResMap[resName].Allocatable = resInfo.Allocatable.Clone() 185 numaResMap[resName].Capacity = resInfo.Capacity 186 } 187 } 188 } 189 190 ni.NumaChgFlag = NumaInfoResetFlag 191 } 192 193 // Clone used to clone nodeInfo Object 194 func (ni *NodeInfo) Clone() *NodeInfo { 195 res := NewNodeInfo(ni.Node) 196 197 for _, p := range ni.Tasks { 198 res.AddTask(p) 199 } 200 if ni.NumaInfo != nil { 201 res.NumaInfo = ni.NumaInfo.DeepCopy() 202 } 203 if ni.ResourceUsage != nil { 204 res.ResourceUsage = ni.ResourceUsage.DeepCopy() 205 } 206 207 if ni.NumaSchedulerInfo != nil { 208 res.NumaSchedulerInfo = ni.NumaSchedulerInfo.DeepCopy() 209 klog.V(5).Infof("node[%s]", ni.Name) 210 for resName, resInfo := range res.NumaSchedulerInfo.NumaResMap { 211 klog.V(5).Infof("current resource %s : %v", resName, resInfo) 212 } 213 214 klog.V(5).Infof("current Policies : %v", res.NumaSchedulerInfo.Policies) 215 } 216 217 klog.V(5).Infof("imageStates is %v", res.ImageStates) 218 219 res.Others = ni.CloneOthers() 220 res.ImageStates = ni.CloneImageSummary() 221 return res 222 } 223 224 // Ready returns whether node is ready for scheduling 225 func (ni *NodeInfo) Ready() bool { 226 return ni.State.Phase == Ready 227 } 228 229 func (ni *NodeInfo) setRevocableZone(node *v1.Node) { 230 if node == nil { 231 klog.Warningf("the argument node is null.") 232 return 233 } 234 235 revocableZone := "" 236 if len(node.Labels) > 0 { 237 if value, found := node.Labels[v1beta1.RevocableZone]; found { 238 revocableZone = value 239 } 240 } 241 ni.RevocableZone = revocableZone 242 } 243 244 // Check node if enable Oversubscription and set Oversubscription resources 245 // Only support oversubscription cpu and memory resource for this version 246 func (ni *NodeInfo) setOversubscription(node *v1.Node) { 247 if node == nil { 248 return 249 } 250 251 ni.OversubscriptionNode = false 252 ni.OfflineJobEvicting = false 253 if len(node.Labels) > 0 { 254 if value, found := node.Labels[OversubscriptionNode]; found { 255 b, err := strconv.ParseBool(value) 256 if err == nil { 257 ni.OversubscriptionNode = b 258 } else { 259 ni.OversubscriptionNode = false 260 } 261 klog.V(5).Infof("Set node %s Oversubscription to %v", node.Name, ni.OversubscriptionNode) 262 } 263 } 264 265 if len(node.Annotations) > 0 { 266 if value, found := node.Annotations[OfflineJobEvicting]; found { 267 b, err := strconv.ParseBool(value) 268 if err == nil { 269 ni.OfflineJobEvicting = b 270 } else { 271 ni.OfflineJobEvicting = false 272 } 273 klog.V(5).Infof("Set node %s OfflineJobEvicting to %v", node.Name, ni.OfflineJobEvicting) 274 } 275 if value, found := node.Annotations[OversubscriptionCPU]; found { 276 ni.OversubscriptionResource.MilliCPU, _ = strconv.ParseFloat(value, 64) 277 klog.V(5).Infof("Set node %s Oversubscription CPU to %v", node.Name, ni.OversubscriptionResource.MilliCPU) 278 } 279 if value, found := node.Annotations[OversubscriptionMemory]; found { 280 ni.OversubscriptionResource.Memory, _ = strconv.ParseFloat(value, 64) 281 klog.V(5).Infof("Set node %s Oversubscription Memory to %v", node.Name, ni.OversubscriptionResource.Memory) 282 } 283 } 284 } 285 286 func (ni *NodeInfo) setNodeState(node *v1.Node) { 287 // If node is nil, the node is un-initialized in cache 288 if node == nil { 289 ni.State = NodeState{ 290 Phase: NotReady, 291 Reason: "UnInitialized", 292 } 293 return 294 } 295 296 // set NodeState according to resources 297 if ok, resources := ni.Used.LessEqualWithResourcesName(ni.Allocatable, Zero); !ok { 298 klog.ErrorS(nil, "Node out of sync", "name", ni.Name, "resources", resources) 299 } 300 301 // If node not ready, e.g. power off 302 for _, cond := range node.Status.Conditions { 303 if cond.Type == v1.NodeReady && cond.Status != v1.ConditionTrue { 304 ni.State = NodeState{ 305 Phase: NotReady, 306 Reason: "NotReady", 307 } 308 klog.Warningf("set the node %s status to %s.", node.Name, NotReady.String()) 309 return 310 } 311 } 312 313 // Node is ready (ignore node conditions because of taint/toleration) 314 ni.State = NodeState{ 315 Phase: Ready, 316 Reason: "", 317 } 318 319 klog.V(4).Infof("set the node %s status to %s.", node.Name, Ready.String()) 320 } 321 322 // SetNode sets kubernetes node object to nodeInfo object 323 func (ni *NodeInfo) SetNode(node *v1.Node) { 324 ni.setNodeState(node) 325 if !ni.Ready() { 326 klog.Warningf("Failed to set node info for %s, phase: %s, reason: %s", 327 ni.Name, ni.State.Phase, ni.State.Reason) 328 return 329 } 330 331 // Dry run, make sure all fields other than `State` are in the original state. 332 copy := ni.Clone() 333 copy.setNode(node) 334 copy.setNodeState(node) 335 if !copy.Ready() { 336 klog.Warningf("SetNode makes node %s not ready, phase: %s, reason: %s", 337 copy.Name, copy.State.Phase, copy.State.Reason) 338 // Set state of node to !Ready, left other fields untouched 339 ni.State = copy.State 340 return 341 } 342 343 ni.setNode(node) 344 } 345 346 // setNodeOthersResource initialize sharable devices 347 func (ni *NodeInfo) setNodeOthersResource(node *v1.Node) { 348 ni.Others[GPUSharingDevice] = gpushare.NewGPUDevices(ni.Name, node) 349 ni.Others[vgpu.DeviceName] = vgpu.NewGPUDevices(ni.Name, node) 350 IgnoredDevicesList.Set( 351 ni.Others[GPUSharingDevice].(Devices).GetIgnoredDevices(), 352 ni.Others[vgpu.DeviceName].(Devices).GetIgnoredDevices(), 353 ) 354 } 355 356 // setNode sets kubernetes node object to nodeInfo object without assertion 357 func (ni *NodeInfo) setNode(node *v1.Node) { 358 ni.setOversubscription(node) 359 ni.setNodeOthersResource(node) 360 ni.setRevocableZone(node) 361 362 ni.Name = node.Name 363 ni.Node = node 364 365 ni.Allocatable = NewResource(node.Status.Allocatable).Add(ni.OversubscriptionResource) 366 ni.Capacity = NewResource(node.Status.Capacity).Add(ni.OversubscriptionResource) 367 ni.Releasing = EmptyResource() 368 ni.Pipelined = EmptyResource() 369 ni.Idle = NewResource(node.Status.Allocatable).Add(ni.OversubscriptionResource) 370 ni.Used = EmptyResource() 371 372 for _, ti := range ni.Tasks { 373 switch ti.Status { 374 case Releasing: 375 ni.allocateIdleResource(ti) 376 ni.Releasing.Add(ti.Resreq) 377 ni.Used.Add(ti.Resreq) 378 ni.addResource(ti.Pod) 379 case Pipelined: 380 ni.Pipelined.Add(ti.Resreq) 381 default: 382 ni.allocateIdleResource(ti) 383 ni.Used.Add(ti.Resreq) 384 ni.addResource(ti.Pod) 385 } 386 } 387 } 388 389 func (ni *NodeInfo) allocateIdleResource(ti *TaskInfo) { 390 ok, resources := ti.Resreq.LessEqualWithResourcesName(ni.Idle, Zero) 391 if ok { 392 ni.Idle.sub(ti.Resreq) 393 return 394 } 395 396 ni.Idle.sub(ti.Resreq) 397 klog.ErrorS(nil, "Idle resources turn into negative after allocated", 398 "nodeName", ni.Name, "task", klog.KObj(ti.Pod), "resources", resources, "idle", ni.Idle.String(), "req", ti.Resreq.String()) 399 } 400 401 // AddTask is used to add a task in nodeInfo object 402 // 403 // If error occurs both task and node are guaranteed to be in the original state. 404 func (ni *NodeInfo) AddTask(task *TaskInfo) error { 405 if len(task.NodeName) > 0 && len(ni.Name) > 0 && task.NodeName != ni.Name { 406 return fmt.Errorf("task <%v/%v> already on different node <%v>", 407 task.Namespace, task.Name, task.NodeName) 408 } 409 410 key := PodKey(task.Pod) 411 if _, found := ni.Tasks[key]; found { 412 return fmt.Errorf("task <%v/%v> already on node <%v>", 413 task.Namespace, task.Name, ni.Name) 414 } 415 416 // Node will hold a copy of task to make sure the status 417 // change will not impact resource in node. 418 ti := task.Clone() 419 420 if ni.Node != nil { 421 switch ti.Status { 422 case Releasing: 423 ni.allocateIdleResource(ti) 424 ni.Releasing.Add(ti.Resreq) 425 ni.Used.Add(ti.Resreq) 426 ni.addResource(ti.Pod) 427 case Pipelined: 428 ni.Pipelined.Add(ti.Resreq) 429 case Binding: 430 // When task in Binding status, it will bind to node, we should double-check whether idle resources are enough to put task before bind to apiserver. 431 if ok, resNames := ti.Resreq.LessEqualWithResourcesName(ni.Idle, Zero); !ok { 432 return fmt.Errorf("node %s resources %v are not enough to put task <%s/%s>, idle: %s, req: %s", ni.Name, resNames, ti.Namespace, ti.Name, ni.Idle.String(), ti.Resreq.String()) 433 } 434 ni.allocateIdleResource(ti) 435 ni.Used.Add(ti.Resreq) 436 ni.addResource(ti.Pod) 437 default: 438 ni.allocateIdleResource(ti) 439 ni.Used.Add(ti.Resreq) 440 ni.addResource(ti.Pod) 441 } 442 } 443 444 if ni.NumaInfo != nil { 445 ni.NumaInfo.AddTask(ti) 446 } 447 448 // Update task node name upon successful task addition. 449 task.NodeName = ni.Name 450 ti.NodeName = ni.Name 451 ni.Tasks[key] = ti 452 453 return nil 454 } 455 456 // RemoveTask used to remove a task from nodeInfo object. 457 // 458 // If error occurs both task and node are guaranteed to be in the original state. 459 func (ni *NodeInfo) RemoveTask(ti *TaskInfo) error { 460 key := PodKey(ti.Pod) 461 462 task, found := ni.Tasks[key] 463 if !found { 464 klog.Warningf("failed to find task <%v/%v> on host <%v>", 465 ti.Namespace, ti.Name, ni.Name) 466 return nil 467 } 468 469 if ni.Node != nil { 470 switch task.Status { 471 case Releasing: 472 ni.Releasing.Sub(task.Resreq) 473 ni.Idle.Add(task.Resreq) 474 ni.Used.Sub(task.Resreq) 475 ni.subResource(ti.Pod) 476 case Pipelined: 477 ni.Pipelined.Sub(task.Resreq) 478 default: 479 ni.Idle.Add(task.Resreq) 480 ni.Used.Sub(task.Resreq) 481 ni.subResource(ti.Pod) 482 } 483 } 484 485 if ni.NumaInfo != nil { 486 ni.NumaInfo.RemoveTask(ti) 487 } 488 489 delete(ni.Tasks, key) 490 491 return nil 492 } 493 494 // addResource is used to add sharable devices 495 func (ni *NodeInfo) addResource(pod *v1.Pod) { 496 ni.Others[GPUSharingDevice].(Devices).AddResource(pod) 497 ni.Others[vgpu.DeviceName].(Devices).AddResource(pod) 498 } 499 500 // subResource is used to subtract sharable devices 501 func (ni *NodeInfo) subResource(pod *v1.Pod) { 502 ni.Others[GPUSharingDevice].(Devices).SubResource(pod) 503 ni.Others[vgpu.DeviceName].(Devices).SubResource(pod) 504 } 505 506 // UpdateTask is used to update a task in nodeInfo object. 507 // 508 // If error occurs both task and node are guaranteed to be in the original state. 509 func (ni *NodeInfo) UpdateTask(ti *TaskInfo) error { 510 if err := ni.RemoveTask(ti); err != nil { 511 return err 512 } 513 514 if err := ni.AddTask(ti); err != nil { 515 // This should never happen if task removal was successful, 516 // because only possible error during task addition is when task is still on a node. 517 klog.Fatalf("Failed to add Task <%s,%s> to Node <%s> during task update", 518 ti.Namespace, ti.Name, ni.Name) 519 } 520 return nil 521 } 522 523 // String returns nodeInfo details in string format 524 func (ni NodeInfo) String() string { 525 tasks := "" 526 527 i := 0 528 for _, task := range ni.Tasks { 529 tasks += fmt.Sprintf("\n\t %d: %v", i, task) 530 i++ 531 } 532 533 return fmt.Sprintf("Node (%s): allocatable<%v> idle <%v>, used <%v>, releasing <%v>, oversubscribution <%v>, "+ 534 "state <phase %s, reaseon %s>, oversubscributionNode <%v>, offlineJobEvicting <%v>,taints <%v>%s, imageStates %v", 535 ni.Name, ni.Allocatable, ni.Idle, ni.Used, ni.Releasing, ni.OversubscriptionResource, ni.State.Phase, ni.State.Reason, ni.OversubscriptionNode, ni.OfflineJobEvicting, ni.Node.Spec.Taints, tasks, ni.ImageStates) 536 } 537 538 // Pods returns all pods running in that node 539 func (ni *NodeInfo) Pods() (pods []*v1.Pod) { 540 for _, t := range ni.Tasks { 541 pods = append(pods, t.Pod) 542 } 543 544 return 545 } 546 547 // CloneImageSummary Clone Image State 548 func (ni *NodeInfo) CloneImageSummary() map[string]*k8sframework.ImageStateSummary { 549 nodeImageStates := make(map[string]*k8sframework.ImageStateSummary) 550 for imageName, summary := range ni.ImageStates { 551 newImageSummary := &k8sframework.ImageStateSummary{ 552 Size: summary.Size, 553 NumNodes: summary.NumNodes, 554 } 555 nodeImageStates[imageName] = newImageSummary 556 } 557 return nodeImageStates 558 } 559 560 // CloneOthers clone other map resources 561 func (ni *NodeInfo) CloneOthers() map[string]interface{} { 562 others := make(map[string]interface{}) 563 for k, v := range ni.Others { 564 others[k] = v 565 } 566 return others 567 } 568 569 // Clone clone csi node status info 570 func (cs *CSINodeStatusInfo) Clone() *CSINodeStatusInfo { 571 newcs := &CSINodeStatusInfo{ 572 CSINodeName: cs.CSINodeName, 573 DriverStatus: make(map[string]bool), 574 } 575 for k, v := range cs.DriverStatus { 576 newcs.DriverStatus[k] = v 577 } 578 return newcs 579 }