volcano.sh/volcano@v1.9.0/pkg/controllers/job/job_controller_handler.go (about) 1 /* 2 Copyright 2017 The Volcano Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package job 18 19 import ( 20 "context" 21 "fmt" 22 "reflect" 23 "strconv" 24 25 v1 "k8s.io/api/core/v1" 26 apierrors "k8s.io/apimachinery/pkg/api/errors" 27 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 28 "k8s.io/client-go/tools/cache" 29 "k8s.io/klog/v2" 30 31 batch "volcano.sh/apis/pkg/apis/batch/v1alpha1" 32 bus "volcano.sh/apis/pkg/apis/bus/v1alpha1" 33 "volcano.sh/apis/pkg/apis/helpers" 34 scheduling "volcano.sh/apis/pkg/apis/scheduling/v1beta1" 35 "volcano.sh/volcano/pkg/controllers/apis" 36 jobcache "volcano.sh/volcano/pkg/controllers/cache" 37 jobhelpers "volcano.sh/volcano/pkg/controllers/job/helpers" 38 ) 39 40 func (cc *jobcontroller) addCommand(obj interface{}) { 41 cmd, ok := obj.(*bus.Command) 42 if !ok { 43 klog.Errorf("obj is not Command") 44 return 45 } 46 47 cc.commandQueue.Add(cmd) 48 } 49 50 func (cc *jobcontroller) addJob(obj interface{}) { 51 job, ok := obj.(*batch.Job) 52 if !ok { 53 klog.Errorf("obj is not Job") 54 return 55 } 56 57 req := apis.Request{ 58 Namespace: job.Namespace, 59 JobName: job.Name, 60 61 Event: bus.OutOfSyncEvent, 62 } 63 64 // TODO(k82cn): if failed to add job, the cache should be refresh 65 if err := cc.cache.Add(job); err != nil { 66 klog.Errorf("Failed to add job <%s/%s>: %v in cache", 67 job.Namespace, job.Name, err) 68 } 69 key := jobhelpers.GetJobKeyByReq(&req) 70 queue := cc.getWorkerQueue(key) 71 queue.Add(req) 72 } 73 74 func (cc *jobcontroller) updateJob(oldObj, newObj interface{}) { 75 newJob, ok := newObj.(*batch.Job) 76 if !ok { 77 klog.Errorf("newObj is not Job") 78 return 79 } 80 81 oldJob, ok := oldObj.(*batch.Job) 82 if !ok { 83 klog.Errorf("oldJob is not Job") 84 return 85 } 86 87 // No need to update if ResourceVersion is not changed 88 if newJob.ResourceVersion == oldJob.ResourceVersion { 89 klog.V(6).Infof("No need to update because job is not modified.") 90 return 91 } 92 93 if err := cc.cache.Update(newJob); err != nil { 94 klog.Errorf("UpdateJob - Failed to update job <%s/%s>: %v in cache", 95 newJob.Namespace, newJob.Name, err) 96 } 97 98 // NOTE: Since we only reconcile job based on Spec, we will ignore other attributes 99 // For Job status, it's used internally and always been updated via our controller. 100 if reflect.DeepEqual(newJob.Spec, oldJob.Spec) && newJob.Status.State.Phase == oldJob.Status.State.Phase { 101 klog.V(6).Infof("Job update event is ignored since no update in 'Spec'.") 102 return 103 } 104 105 req := apis.Request{ 106 Namespace: newJob.Namespace, 107 JobName: newJob.Name, 108 Event: bus.OutOfSyncEvent, 109 } 110 key := jobhelpers.GetJobKeyByReq(&req) 111 queue := cc.getWorkerQueue(key) 112 queue.Add(req) 113 } 114 115 func (cc *jobcontroller) deleteJob(obj interface{}) { 116 job, ok := obj.(*batch.Job) 117 if !ok { 118 // If we reached here it means the Job was deleted but its final state is unrecorded. 119 tombstone, ok := obj.(cache.DeletedFinalStateUnknown) 120 if !ok { 121 klog.Errorf("Couldn't get object from tombstone %#v", obj) 122 return 123 } 124 job, ok = tombstone.Obj.(*batch.Job) 125 if !ok { 126 klog.Errorf("Tombstone contained object that is not a volcano Job: %#v", obj) 127 return 128 } 129 } 130 131 if err := cc.cache.Delete(job); err != nil { 132 klog.Errorf("Failed to delete job <%s/%s>: %v in cache", 133 job.Namespace, job.Name, err) 134 } 135 } 136 137 func (cc *jobcontroller) addPod(obj interface{}) { 138 pod, ok := obj.(*v1.Pod) 139 if !ok { 140 klog.Errorf("Failed to convert %v to v1.Pod", obj) 141 return 142 } 143 // Filter out pods that are not created from volcano job 144 if !isControlledBy(pod, helpers.JobKind) { 145 return 146 } 147 148 jobName, found := pod.Annotations[batch.JobNameKey] 149 if !found { 150 klog.Infof("Failed to find jobName of Pod <%s/%s>, skipping", 151 pod.Namespace, pod.Name) 152 return 153 } 154 155 version, found := pod.Annotations[batch.JobVersion] 156 if !found { 157 klog.Infof("Failed to find jobVersion of Pod <%s/%s>, skipping", 158 pod.Namespace, pod.Name) 159 return 160 } 161 162 dVersion, err := strconv.Atoi(version) 163 if err != nil { 164 klog.Infof("Failed to convert jobVersion of Pod <%s/%s> into number, skipping", 165 pod.Namespace, pod.Name) 166 return 167 } 168 169 if pod.DeletionTimestamp != nil { 170 cc.deletePod(pod) 171 return 172 } 173 174 req := apis.Request{ 175 Namespace: pod.Namespace, 176 JobName: jobName, 177 178 Event: bus.OutOfSyncEvent, 179 JobVersion: int32(dVersion), 180 } 181 182 if err := cc.cache.AddPod(pod); err != nil { 183 klog.Errorf("Failed to add Pod <%s/%s>: %v to cache", 184 pod.Namespace, pod.Name, err) 185 } 186 key := jobhelpers.GetJobKeyByReq(&req) 187 queue := cc.getWorkerQueue(key) 188 queue.Add(req) 189 } 190 191 func (cc *jobcontroller) updatePod(oldObj, newObj interface{}) { 192 oldPod, ok := oldObj.(*v1.Pod) 193 if !ok { 194 klog.Errorf("Failed to convert %v to v1.Pod", oldObj) 195 return 196 } 197 198 newPod, ok := newObj.(*v1.Pod) 199 if !ok { 200 klog.Errorf("Failed to convert %v to v1.Pod", newObj) 201 return 202 } 203 204 // Filter out pods that are not created from volcano job 205 if !isControlledBy(newPod, helpers.JobKind) { 206 return 207 } 208 209 if newPod.ResourceVersion == oldPod.ResourceVersion { 210 return 211 } 212 213 if newPod.DeletionTimestamp != nil { 214 cc.deletePod(newObj) 215 return 216 } 217 218 taskName, found := newPod.Annotations[batch.TaskSpecKey] 219 if !found { 220 klog.Infof("Failed to find taskName of Pod <%s/%s>, skipping", 221 newPod.Namespace, newPod.Name) 222 return 223 } 224 225 jobName, found := newPod.Annotations[batch.JobNameKey] 226 if !found { 227 klog.Infof("Failed to find jobName of Pod <%s/%s>, skipping", 228 newPod.Namespace, newPod.Name) 229 return 230 } 231 232 version, found := newPod.Annotations[batch.JobVersion] 233 if !found { 234 klog.Infof("Failed to find jobVersion of Pod <%s/%s>, skipping", 235 newPod.Namespace, newPod.Name) 236 return 237 } 238 239 dVersion, err := strconv.Atoi(version) 240 if err != nil { 241 klog.Infof("Failed to convert jobVersion of Pod into number <%s/%s>, skipping", 242 newPod.Namespace, newPod.Name) 243 return 244 } 245 246 if err := cc.cache.UpdatePod(newPod); err != nil { 247 klog.Errorf("Failed to update Pod <%s/%s>: %v in cache", 248 newPod.Namespace, newPod.Name, err) 249 } 250 251 event := bus.OutOfSyncEvent 252 var exitCode int32 253 254 switch newPod.Status.Phase { 255 case v1.PodFailed: 256 if oldPod.Status.Phase != v1.PodFailed { 257 event = bus.PodFailedEvent 258 // TODO: currently only one container pod is supported by volcano 259 // Once multi containers pod is supported, update accordingly. 260 if len(newPod.Status.ContainerStatuses) > 0 && newPod.Status.ContainerStatuses[0].State.Terminated != nil { 261 exitCode = newPod.Status.ContainerStatuses[0].State.Terminated.ExitCode 262 } 263 } 264 case v1.PodSucceeded: 265 if oldPod.Status.Phase != v1.PodSucceeded && 266 cc.cache.TaskCompleted(jobcache.JobKeyByName(newPod.Namespace, jobName), taskName) { 267 event = bus.TaskCompletedEvent 268 } 269 case v1.PodPending, v1.PodRunning: 270 if cc.cache.TaskFailed(jobcache.JobKeyByName(newPod.Namespace, jobName), taskName) { 271 event = bus.TaskFailedEvent 272 } 273 } 274 275 req := apis.Request{ 276 Namespace: newPod.Namespace, 277 JobName: jobName, 278 TaskName: taskName, 279 280 Event: event, 281 ExitCode: exitCode, 282 JobVersion: int32(dVersion), 283 } 284 285 key := jobhelpers.GetJobKeyByReq(&req) 286 queue := cc.getWorkerQueue(key) 287 queue.Add(req) 288 } 289 290 func (cc *jobcontroller) deletePod(obj interface{}) { 291 pod, ok := obj.(*v1.Pod) 292 if !ok { 293 // If we reached here it means the pod was deleted but its final state is unrecorded. 294 tombstone, ok := obj.(cache.DeletedFinalStateUnknown) 295 if !ok { 296 klog.Errorf("Couldn't get object from tombstone %#v", obj) 297 return 298 } 299 pod, ok = tombstone.Obj.(*v1.Pod) 300 if !ok { 301 klog.Errorf("Tombstone contained object that is not a Pod: %#v", obj) 302 return 303 } 304 } 305 306 // Filter out pods that are not created from volcano job 307 if !isControlledBy(pod, helpers.JobKind) { 308 return 309 } 310 311 taskName, found := pod.Annotations[batch.TaskSpecKey] 312 if !found { 313 klog.Infof("Failed to find taskName of Pod <%s/%s>, skipping", 314 pod.Namespace, pod.Name) 315 return 316 } 317 318 jobName, found := pod.Annotations[batch.JobNameKey] 319 if !found { 320 klog.Infof("Failed to find jobName of Pod <%s/%s>, skipping", 321 pod.Namespace, pod.Name) 322 return 323 } 324 325 version, found := pod.Annotations[batch.JobVersion] 326 if !found { 327 klog.Infof("Failed to find jobVersion of Pod <%s/%s>, skipping", 328 pod.Namespace, pod.Name) 329 return 330 } 331 332 dVersion, err := strconv.Atoi(version) 333 if err != nil { 334 klog.Infof("Failed to convert jobVersion of Pod <%s/%s> into number, skipping", 335 pod.Namespace, pod.Name) 336 return 337 } 338 339 req := apis.Request{ 340 Namespace: pod.Namespace, 341 JobName: jobName, 342 TaskName: taskName, 343 344 Event: bus.PodEvictedEvent, 345 JobVersion: int32(dVersion), 346 } 347 348 if err := cc.cache.DeletePod(pod); err != nil { 349 klog.Errorf("Failed to delete Pod <%s/%s>: %v in cache", 350 pod.Namespace, pod.Name, err) 351 } 352 353 key := jobhelpers.GetJobKeyByReq(&req) 354 queue := cc.getWorkerQueue(key) 355 queue.Add(req) 356 } 357 358 func (cc *jobcontroller) recordJobEvent(namespace, name string, event batch.JobEvent, message string) { 359 job, err := cc.cache.Get(jobcache.JobKeyByName(namespace, name)) 360 if err != nil { 361 klog.Warningf("Failed to find job in cache when reporting job event <%s/%s>: %v", 362 namespace, name, err) 363 return 364 } 365 cc.recorder.Event(job.Job, v1.EventTypeNormal, string(event), message) 366 } 367 368 func (cc *jobcontroller) handleCommands() { 369 for cc.processNextCommand() { 370 } 371 } 372 373 func (cc *jobcontroller) processNextCommand() bool { 374 obj, shutdown := cc.commandQueue.Get() 375 if shutdown { 376 return false 377 } 378 cmd := obj.(*bus.Command) 379 defer cc.commandQueue.Done(cmd) 380 381 if err := cc.vcClient.BusV1alpha1().Commands(cmd.Namespace).Delete(context.TODO(), cmd.Name, metav1.DeleteOptions{}); err != nil { 382 if !apierrors.IsNotFound(err) { 383 klog.Errorf("Failed to delete Command <%s/%s>.", cmd.Namespace, cmd.Name) 384 cc.commandQueue.AddRateLimited(cmd) 385 } 386 return true 387 } 388 cc.recordJobEvent(cmd.Namespace, cmd.TargetObject.Name, 389 batch.CommandIssued, 390 fmt.Sprintf( 391 "Start to execute command %s, and clean it up to make sure executed not more than once.", cmd.Action)) 392 req := apis.Request{ 393 Namespace: cmd.Namespace, 394 JobName: cmd.TargetObject.Name, 395 Event: bus.CommandIssuedEvent, 396 Action: bus.Action(cmd.Action), 397 } 398 399 key := jobhelpers.GetJobKeyByReq(&req) 400 queue := cc.getWorkerQueue(key) 401 queue.Add(req) 402 403 return true 404 } 405 406 func (cc *jobcontroller) updatePodGroup(oldObj, newObj interface{}) { 407 oldPG, ok := oldObj.(*scheduling.PodGroup) 408 if !ok { 409 klog.Errorf("Failed to convert %v to PodGroup", newObj) 410 return 411 } 412 413 newPG, ok := newObj.(*scheduling.PodGroup) 414 if !ok { 415 klog.Errorf("Failed to convert %v to PodGroup", newObj) 416 return 417 } 418 419 jobNameKey := newPG.Name 420 ors := newPG.OwnerReferences 421 for _, or := range ors { 422 if or.Kind == "Job" { 423 jobNameKey = or.Name 424 } 425 } 426 427 _, err := cc.cache.Get(jobcache.JobKeyByName(newPG.Namespace, jobNameKey)) 428 if err != nil && newPG.Annotations != nil { 429 klog.Warningf( 430 "Failed to find job in cache by PodGroup(%s/%s), this may not be a PodGroup for volcano job.", newPG.Namespace, newPG.Name) 431 } 432 433 if newPG.Status.Phase != oldPG.Status.Phase { 434 req := apis.Request{ 435 Namespace: newPG.Namespace, 436 JobName: jobNameKey, 437 } 438 switch newPG.Status.Phase { 439 case scheduling.PodGroupUnknown: 440 req.Event = bus.JobUnknownEvent 441 } 442 key := jobhelpers.GetJobKeyByReq(&req) 443 queue := cc.getWorkerQueue(key) 444 queue.Add(req) 445 } 446 } 447 448 // TODO(k82cn): add handler for PodGroup unschedulable event.