volcano.sh/volcano@v1.9.0/pkg/scheduler/framework/session.go (about) 1 /* 2 Copyright 2018 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package framework 18 19 import ( 20 "fmt" 21 "reflect" 22 23 v1 "k8s.io/api/core/v1" 24 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 25 "k8s.io/apimachinery/pkg/types" 26 "k8s.io/apimachinery/pkg/util/uuid" 27 "k8s.io/client-go/informers" 28 "k8s.io/client-go/kubernetes" 29 "k8s.io/client-go/rest" 30 "k8s.io/client-go/tools/record" 31 "k8s.io/klog/v2" 32 k8sframework "k8s.io/kubernetes/pkg/scheduler/framework" 33 34 "volcano.sh/apis/pkg/apis/scheduling" 35 schedulingscheme "volcano.sh/apis/pkg/apis/scheduling/scheme" 36 vcv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1" 37 "volcano.sh/volcano/pkg/scheduler/api" 38 "volcano.sh/volcano/pkg/scheduler/cache" 39 "volcano.sh/volcano/pkg/scheduler/conf" 40 "volcano.sh/volcano/pkg/scheduler/metrics" 41 "volcano.sh/volcano/pkg/scheduler/util" 42 ) 43 44 // Session information for the current session 45 type Session struct { 46 UID types.UID 47 48 kubeClient kubernetes.Interface 49 recorder record.EventRecorder 50 cache cache.Cache 51 restConfig *rest.Config 52 informerFactory informers.SharedInformerFactory 53 54 TotalResource *api.Resource 55 // podGroupStatus cache podgroup status during schedule 56 // This should not be mutated after initiated 57 podGroupStatus map[api.JobID]scheduling.PodGroupStatus 58 59 Jobs map[api.JobID]*api.JobInfo 60 Nodes map[string]*api.NodeInfo 61 CSINodesStatus map[string]*api.CSINodeStatusInfo 62 RevocableNodes map[string]*api.NodeInfo 63 Queues map[api.QueueID]*api.QueueInfo 64 NamespaceInfo map[api.NamespaceName]*api.NamespaceInfo 65 66 // NodeMap is like Nodes except that it uses k8s NodeInfo api and should only 67 // be used in k8s compatable api scenarios such as in predicates and nodeorder plugins. 68 NodeMap map[string]*k8sframework.NodeInfo 69 PodLister *PodLister 70 71 Tiers []conf.Tier 72 Configurations []conf.Configuration 73 NodeList []*api.NodeInfo 74 75 plugins map[string]Plugin 76 eventHandlers []*EventHandler 77 jobOrderFns map[string]api.CompareFn 78 queueOrderFns map[string]api.CompareFn 79 taskOrderFns map[string]api.CompareFn 80 clusterOrderFns map[string]api.CompareFn 81 predicateFns map[string]api.PredicateFn 82 prePredicateFns map[string]api.PrePredicateFn 83 bestNodeFns map[string]api.BestNodeFn 84 nodeOrderFns map[string]api.NodeOrderFn 85 batchNodeOrderFns map[string]api.BatchNodeOrderFn 86 nodeMapFns map[string]api.NodeMapFn 87 nodeReduceFns map[string]api.NodeReduceFn 88 preemptableFns map[string]api.EvictableFn 89 reclaimableFns map[string]api.EvictableFn 90 overusedFns map[string]api.ValidateFn 91 // preemptiveFns means whether current queue can reclaim from other queue, 92 // while reclaimableFns means whether current queue's resources can be reclaimed. 93 preemptiveFns map[string]api.ValidateFn 94 allocatableFns map[string]api.AllocatableFn 95 jobReadyFns map[string]api.ValidateFn 96 jobPipelinedFns map[string]api.VoteFn 97 jobValidFns map[string]api.ValidateExFn 98 jobEnqueueableFns map[string]api.VoteFn 99 jobEnqueuedFns map[string]api.JobEnqueuedFn 100 targetJobFns map[string]api.TargetJobFn 101 reservedNodesFns map[string]api.ReservedNodesFn 102 victimTasksFns map[string][]api.VictimTasksFn 103 jobStarvingFns map[string]api.ValidateFn 104 } 105 106 func openSession(cache cache.Cache) *Session { 107 ssn := &Session{ 108 UID: uuid.NewUUID(), 109 kubeClient: cache.Client(), 110 restConfig: cache.ClientConfig(), 111 recorder: cache.EventRecorder(), 112 cache: cache, 113 informerFactory: cache.SharedInformerFactory(), 114 115 TotalResource: api.EmptyResource(), 116 podGroupStatus: map[api.JobID]scheduling.PodGroupStatus{}, 117 118 Jobs: map[api.JobID]*api.JobInfo{}, 119 Nodes: map[string]*api.NodeInfo{}, 120 CSINodesStatus: map[string]*api.CSINodeStatusInfo{}, 121 RevocableNodes: map[string]*api.NodeInfo{}, 122 Queues: map[api.QueueID]*api.QueueInfo{}, 123 124 plugins: map[string]Plugin{}, 125 jobOrderFns: map[string]api.CompareFn{}, 126 queueOrderFns: map[string]api.CompareFn{}, 127 taskOrderFns: map[string]api.CompareFn{}, 128 clusterOrderFns: map[string]api.CompareFn{}, 129 predicateFns: map[string]api.PredicateFn{}, 130 prePredicateFns: map[string]api.PrePredicateFn{}, 131 bestNodeFns: map[string]api.BestNodeFn{}, 132 nodeOrderFns: map[string]api.NodeOrderFn{}, 133 batchNodeOrderFns: map[string]api.BatchNodeOrderFn{}, 134 nodeMapFns: map[string]api.NodeMapFn{}, 135 nodeReduceFns: map[string]api.NodeReduceFn{}, 136 preemptableFns: map[string]api.EvictableFn{}, 137 reclaimableFns: map[string]api.EvictableFn{}, 138 overusedFns: map[string]api.ValidateFn{}, 139 preemptiveFns: map[string]api.ValidateFn{}, 140 allocatableFns: map[string]api.AllocatableFn{}, 141 jobReadyFns: map[string]api.ValidateFn{}, 142 jobPipelinedFns: map[string]api.VoteFn{}, 143 jobValidFns: map[string]api.ValidateExFn{}, 144 jobEnqueueableFns: map[string]api.VoteFn{}, 145 jobEnqueuedFns: map[string]api.JobEnqueuedFn{}, 146 targetJobFns: map[string]api.TargetJobFn{}, 147 reservedNodesFns: map[string]api.ReservedNodesFn{}, 148 victimTasksFns: map[string][]api.VictimTasksFn{}, 149 jobStarvingFns: map[string]api.ValidateFn{}, 150 } 151 152 snapshot := cache.Snapshot() 153 154 ssn.Jobs = snapshot.Jobs 155 for _, job := range ssn.Jobs { 156 if job.PodGroup != nil { 157 ssn.podGroupStatus[job.UID] = *job.PodGroup.Status.DeepCopy() 158 } 159 160 if vjr := ssn.JobValid(job); vjr != nil { 161 if !vjr.Pass { 162 jc := &scheduling.PodGroupCondition{ 163 Type: scheduling.PodGroupUnschedulableType, 164 Status: v1.ConditionTrue, 165 LastTransitionTime: metav1.Now(), 166 TransitionID: string(ssn.UID), 167 Reason: vjr.Reason, 168 Message: vjr.Message, 169 } 170 171 if err := ssn.UpdatePodGroupCondition(job, jc); err != nil { 172 klog.Errorf("Failed to update job condition: %v", err) 173 } 174 } 175 176 delete(ssn.Jobs, job.UID) 177 } 178 } 179 ssn.NodeList = util.GetNodeList(snapshot.Nodes, snapshot.NodeList) 180 ssn.Nodes = snapshot.Nodes 181 ssn.CSINodesStatus = snapshot.CSINodesStatus 182 ssn.RevocableNodes = snapshot.RevocableNodes 183 ssn.Queues = snapshot.Queues 184 ssn.NamespaceInfo = snapshot.NamespaceInfo 185 // calculate all nodes' resource only once in each schedule cycle, other plugins can clone it when need 186 for _, n := range ssn.Nodes { 187 ssn.TotalResource.Add(n.Allocatable) 188 } 189 190 klog.V(3).Infof("Open Session %v with <%d> Job and <%d> Queues", 191 ssn.UID, len(ssn.Jobs), len(ssn.Queues)) 192 193 return ssn 194 } 195 196 // updateQueueStatus updates allocated field in queue status on session close. 197 func updateQueueStatus(ssn *Session) { 198 // calculate allocated resources on each queue 199 var allocatedResources = make(map[api.QueueID]*api.Resource, len(ssn.Queues)) 200 for queueID := range ssn.Queues { 201 allocatedResources[queueID] = &api.Resource{} 202 } 203 for _, job := range ssn.Jobs { 204 for _, runningTask := range job.TaskStatusIndex[api.Running] { 205 allocatedResources[job.Queue].Add(runningTask.Resreq) 206 } 207 } 208 209 // update queue status 210 for queueID := range ssn.Queues { 211 // convert api.Resource to v1.ResourceList 212 var queueStatus = util.ConvertRes2ResList(allocatedResources[queueID]).DeepCopy() 213 if reflect.DeepEqual(ssn.Queues[queueID].Queue.Status.Allocated, queueStatus) { 214 klog.V(5).Infof("Queue <%s> allocated resource keeps equal, no need to update queue status <%v>.", 215 queueID, ssn.Queues[queueID].Queue.Status.Allocated) 216 continue 217 } 218 219 ssn.Queues[queueID].Queue.Status.Allocated = queueStatus 220 221 if err := ssn.cache.UpdateQueueStatus(ssn.Queues[queueID]); err != nil { 222 klog.Errorf("failed to update queue <%s> status: %s", ssn.Queues[queueID].Name, err.Error()) 223 } 224 } 225 } 226 227 func closeSession(ssn *Session) { 228 ju := newJobUpdater(ssn) 229 ju.UpdateAll() 230 231 updateQueueStatus(ssn) 232 233 ssn.Jobs = nil 234 ssn.Nodes = nil 235 ssn.RevocableNodes = nil 236 ssn.plugins = nil 237 ssn.eventHandlers = nil 238 ssn.jobOrderFns = nil 239 ssn.queueOrderFns = nil 240 ssn.clusterOrderFns = nil 241 ssn.NodeList = nil 242 ssn.TotalResource = nil 243 244 klog.V(3).Infof("Close Session %v", ssn.UID) 245 } 246 247 func jobStatus(ssn *Session, jobInfo *api.JobInfo) scheduling.PodGroupStatus { 248 status := jobInfo.PodGroup.Status 249 250 unschedulable := false 251 for _, c := range status.Conditions { 252 if c.Type == scheduling.PodGroupUnschedulableType && 253 c.Status == v1.ConditionTrue && 254 c.TransitionID == string(ssn.UID) { 255 unschedulable = true 256 break 257 } 258 } 259 260 // If running tasks && unschedulable, unknown phase 261 if len(jobInfo.TaskStatusIndex[api.Running]) != 0 && unschedulable { 262 status.Phase = scheduling.PodGroupUnknown 263 } else { 264 allocated := 0 265 for status, tasks := range jobInfo.TaskStatusIndex { 266 if api.AllocatedStatus(status) || status == api.Succeeded { 267 allocated += len(tasks) 268 } 269 } 270 271 // If there're enough allocated resource, it's running 272 if int32(allocated) >= jobInfo.PodGroup.Spec.MinMember { 273 status.Phase = scheduling.PodGroupRunning 274 // If all allocated tasks is succeeded, it's completed 275 if len(jobInfo.TaskStatusIndex[api.Succeeded]) == allocated { 276 status.Phase = scheduling.PodGroupCompleted 277 } 278 } else if jobInfo.PodGroup.Status.Phase != scheduling.PodGroupInqueue { 279 status.Phase = scheduling.PodGroupPending 280 } 281 } 282 283 status.Running = int32(len(jobInfo.TaskStatusIndex[api.Running])) 284 status.Failed = int32(len(jobInfo.TaskStatusIndex[api.Failed])) 285 status.Succeeded = int32(len(jobInfo.TaskStatusIndex[api.Succeeded])) 286 287 return status 288 } 289 290 // Statement returns new statement object 291 func (ssn *Session) Statement() *Statement { 292 return &Statement{ 293 ssn: ssn, 294 } 295 } 296 297 // Pipeline the task to the node in the session 298 func (ssn *Session) Pipeline(task *api.TaskInfo, hostname string) error { 299 // Only update status in session 300 job, found := ssn.Jobs[task.Job] 301 if found { 302 if err := job.UpdateTaskStatus(task, api.Pipelined); err != nil { 303 klog.Errorf("Failed to update task <%v/%v> status to %v when pipeline in Session <%v>: %v", 304 task.Namespace, task.Name, api.Pipelined, ssn.UID, err) 305 return err 306 } 307 } else { 308 klog.Errorf("Failed to find Job <%s> in Session <%s> index when pipeline.", 309 task.Job, ssn.UID) 310 return fmt.Errorf("failed to find job %s when pipeline", task.Job) 311 } 312 313 task.NodeName = hostname 314 315 if node, found := ssn.Nodes[hostname]; found { 316 if err := node.AddTask(task); err != nil { 317 klog.Errorf("Failed to add task <%v/%v> to node <%v> when pipeline in Session <%v>: %v", 318 task.Namespace, task.Name, hostname, ssn.UID, err) 319 return err 320 } 321 klog.V(3).Infof("After pipelined Task <%v/%v> to Node <%v>: idle <%v>, used <%v>, releasing <%v>", 322 task.Namespace, task.Name, node.Name, node.Idle, node.Used, node.Releasing) 323 } else { 324 klog.Errorf("Failed to find Node <%s> in Session <%s> index when pipeline.", 325 hostname, ssn.UID) 326 return fmt.Errorf("failed to find node %s", hostname) 327 } 328 329 for _, eh := range ssn.eventHandlers { 330 if eh.AllocateFunc != nil { 331 eh.AllocateFunc(&Event{ 332 Task: task, 333 }) 334 } 335 } 336 337 return nil 338 } 339 340 // Allocate the task to the node in the session 341 func (ssn *Session) Allocate(task *api.TaskInfo, nodeInfo *api.NodeInfo) (err error) { 342 podVolumes, err := ssn.cache.GetPodVolumes(task, nodeInfo.Node) 343 if err != nil { 344 return err 345 } 346 347 hostname := nodeInfo.Name 348 if err := ssn.cache.AllocateVolumes(task, hostname, podVolumes); err != nil { 349 return err 350 } 351 defer func() { 352 if err != nil { 353 ssn.cache.RevertVolumes(task, podVolumes) 354 } 355 }() 356 357 task.Pod.Spec.NodeName = hostname 358 task.PodVolumes = podVolumes 359 360 // Only update status in session 361 job, found := ssn.Jobs[task.Job] 362 if found { 363 if err := job.UpdateTaskStatus(task, api.Allocated); err != nil { 364 klog.Errorf("Failed to update task <%v/%v> status to %v when binding in Session <%v>: %v", 365 task.Namespace, task.Name, api.Allocated, ssn.UID, err) 366 return err 367 } 368 } else { 369 klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.", 370 task.Job, ssn.UID) 371 return fmt.Errorf("failed to find job %s", task.Job) 372 } 373 374 task.NodeName = hostname 375 376 if node, found := ssn.Nodes[hostname]; found { 377 if err := node.AddTask(task); err != nil { 378 klog.Errorf("Failed to add task <%v/%v> to node <%v> when binding in Session <%v>: %v", 379 task.Namespace, task.Name, hostname, ssn.UID, err) 380 return err 381 } 382 klog.V(3).Infof("After allocated Task <%v/%v> to Node <%v>: idle <%v>, used <%v>, releasing <%v>", 383 task.Namespace, task.Name, node.Name, node.Idle, node.Used, node.Releasing) 384 } else { 385 klog.Errorf("Failed to find Node <%s> in Session <%s> index when binding.", 386 hostname, ssn.UID) 387 return fmt.Errorf("failed to find node %s", hostname) 388 } 389 390 // Callbacks 391 for _, eh := range ssn.eventHandlers { 392 if eh.AllocateFunc != nil { 393 eh.AllocateFunc(&Event{ 394 Task: task, 395 }) 396 } 397 } 398 399 if ssn.JobReady(job) { 400 for _, task := range job.TaskStatusIndex[api.Allocated] { 401 if err := ssn.dispatch(task); err != nil { 402 klog.Errorf("Failed to dispatch task <%v/%v>: %v", 403 task.Namespace, task.Name, err) 404 return err 405 } 406 } 407 } else { 408 ssn.cache.RevertVolumes(task, podVolumes) 409 } 410 411 return nil 412 } 413 414 func (ssn *Session) dispatch(task *api.TaskInfo) error { 415 if err := ssn.cache.AddBindTask(task); err != nil { 416 return err 417 } 418 419 // Update status in session 420 if job, found := ssn.Jobs[task.Job]; found { 421 if err := job.UpdateTaskStatus(task, api.Binding); err != nil { 422 klog.Errorf("Failed to update task <%v/%v> status to %v when binding in Session <%v>: %v", 423 task.Namespace, task.Name, api.Binding, ssn.UID, err) 424 return err 425 } 426 } else { 427 klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.", 428 task.Job, ssn.UID) 429 return fmt.Errorf("failed to find job %s", task.Job) 430 } 431 432 metrics.UpdateTaskScheduleDuration(metrics.Duration(task.Pod.CreationTimestamp.Time)) 433 return nil 434 } 435 436 // Evict the task in the session 437 func (ssn *Session) Evict(reclaimee *api.TaskInfo, reason string) error { 438 if err := ssn.cache.Evict(reclaimee, reason); err != nil { 439 return err 440 } 441 442 // Update status in session 443 job, found := ssn.Jobs[reclaimee.Job] 444 if found { 445 if err := job.UpdateTaskStatus(reclaimee, api.Releasing); err != nil { 446 klog.Errorf("Failed to update task <%v/%v> status to %v when evicting in Session <%v>: %v", 447 reclaimee.Namespace, reclaimee.Name, api.Releasing, ssn.UID, err) 448 return err 449 } 450 } else { 451 klog.Errorf("Failed to find Job <%s> in Session <%s> index when evicting.", 452 reclaimee.Job, ssn.UID) 453 return fmt.Errorf("failed to find job %s", reclaimee.Job) 454 } 455 456 // Update task in node. 457 if node, found := ssn.Nodes[reclaimee.NodeName]; found { 458 if err := node.UpdateTask(reclaimee); err != nil { 459 klog.Errorf("Failed to update task <%v/%v> in Session <%v>: %v", 460 reclaimee.Namespace, reclaimee.Name, ssn.UID, err) 461 return err 462 } 463 } 464 465 for _, eh := range ssn.eventHandlers { 466 if eh.DeallocateFunc != nil { 467 eh.DeallocateFunc(&Event{ 468 Task: reclaimee, 469 }) 470 } 471 } 472 473 return nil 474 } 475 476 // BindPodGroup bind PodGroup to specified cluster 477 func (ssn *Session) BindPodGroup(job *api.JobInfo, cluster string) error { 478 return ssn.cache.BindPodGroup(job, cluster) 479 } 480 481 // UpdatePodGroupCondition update job condition accordingly. 482 func (ssn *Session) UpdatePodGroupCondition(jobInfo *api.JobInfo, cond *scheduling.PodGroupCondition) error { 483 job, ok := ssn.Jobs[jobInfo.UID] 484 if !ok { 485 return fmt.Errorf("failed to find job <%s/%s>", jobInfo.Namespace, jobInfo.Name) 486 } 487 488 index := -1 489 for i, c := range job.PodGroup.Status.Conditions { 490 if c.Type == cond.Type { 491 index = i 492 break 493 } 494 } 495 496 // Update condition to the new condition. 497 if index < 0 { 498 job.PodGroup.Status.Conditions = append(job.PodGroup.Status.Conditions, *cond) 499 } else { 500 job.PodGroup.Status.Conditions[index] = *cond 501 } 502 503 return nil 504 } 505 506 // AddEventHandler add event handlers 507 func (ssn *Session) AddEventHandler(eh *EventHandler) { 508 ssn.eventHandlers = append(ssn.eventHandlers, eh) 509 } 510 511 // UpdateSchedulerNumaInfo update SchedulerNumaInfo 512 func (ssn *Session) UpdateSchedulerNumaInfo(AllocatedSets map[string]api.ResNumaSets) { 513 ssn.cache.UpdateSchedulerNumaInfo(AllocatedSets) 514 } 515 516 // KubeClient returns the kubernetes client 517 func (ssn Session) KubeClient() kubernetes.Interface { 518 return ssn.kubeClient 519 } 520 521 // ClientConfig returns the rest client 522 func (ssn Session) ClientConfig() *rest.Config { 523 return ssn.restConfig 524 } 525 526 // InformerFactory returns the scheduler ShareInformerFactory 527 func (ssn Session) InformerFactory() informers.SharedInformerFactory { 528 return ssn.informerFactory 529 } 530 531 // RecordPodGroupEvent records podGroup events 532 func (ssn Session) RecordPodGroupEvent(podGroup *api.PodGroup, eventType, reason, msg string) { 533 if podGroup == nil { 534 return 535 } 536 537 pg := &vcv1beta1.PodGroup{} 538 if err := schedulingscheme.Scheme.Convert(&podGroup.PodGroup, pg, nil); err != nil { 539 klog.Errorf("Error while converting PodGroup to v1alpha1.PodGroup with error: %v", err) 540 return 541 } 542 ssn.recorder.Eventf(pg, eventType, reason, msg) 543 } 544 545 // String return nodes and jobs information in the session 546 func (ssn Session) String() string { 547 msg := fmt.Sprintf("Session %v: \n", ssn.UID) 548 549 for _, job := range ssn.Jobs { 550 msg = fmt.Sprintf("%s%v\n", msg, job) 551 } 552 553 for _, node := range ssn.Nodes { 554 msg = fmt.Sprintf("%s%v\n", msg, node) 555 } 556 557 return msg 558 }