volcano.sh/volcano@v1.9.0/pkg/controllers/cache/cache.go (about) 1 /* 2 Copyright 2019 The Volcano Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package cache 18 19 import ( 20 "fmt" 21 "strconv" 22 "sync" 23 "time" 24 25 "golang.org/x/time/rate" 26 v1 "k8s.io/api/core/v1" 27 "k8s.io/apimachinery/pkg/util/wait" 28 "k8s.io/client-go/util/workqueue" 29 "k8s.io/klog/v2" 30 31 "volcano.sh/apis/pkg/apis/batch/v1alpha1" 32 33 "volcano.sh/volcano/pkg/controllers/apis" 34 ) 35 36 type jobCache struct { 37 sync.Mutex 38 39 jobs map[string]*apis.JobInfo 40 deletedJobs workqueue.RateLimitingInterface 41 } 42 43 func keyFn(ns, name string) string { 44 return fmt.Sprintf("%s/%s", ns, name) 45 } 46 47 // JobKeyByName gets the key for the job name. 48 func JobKeyByName(namespace string, name string) string { 49 return keyFn(namespace, name) 50 } 51 52 // JobKeyByReq gets the key for the job request. 53 func JobKeyByReq(req *apis.Request) string { 54 return keyFn(req.Namespace, req.JobName) 55 } 56 57 // JobKey gets the "ns"/"name" format of the given job. 58 func JobKey(job *v1alpha1.Job) string { 59 return keyFn(job.Namespace, job.Name) 60 } 61 62 func jobTerminated(job *apis.JobInfo) bool { 63 return job.Job == nil && len(job.Pods) == 0 64 } 65 66 func jobKeyOfPod(pod *v1.Pod) (string, error) { 67 jobName, found := pod.Annotations[v1alpha1.JobNameKey] 68 if !found { 69 return "", fmt.Errorf("failed to find job name of pod <%s/%s>", 70 pod.Namespace, pod.Name) 71 } 72 73 return keyFn(pod.Namespace, jobName), nil 74 } 75 76 // New gets the job Cache. 77 func New() Cache { 78 queue := workqueue.NewMaxOfRateLimiter( 79 workqueue.NewItemExponentialFailureRateLimiter(5*time.Millisecond, 180*time.Second), 80 // 10 qps, 100 bucket size. This is only for retry speed and its only the overall factor (not per item) 81 &workqueue.BucketRateLimiter{Limiter: rate.NewLimiter(rate.Limit(10), 100)}, 82 ) 83 84 return &jobCache{ 85 jobs: map[string]*apis.JobInfo{}, 86 deletedJobs: workqueue.NewRateLimitingQueue(queue), 87 } 88 } 89 90 func (jc *jobCache) Get(key string) (*apis.JobInfo, error) { 91 jc.Lock() 92 defer jc.Unlock() 93 94 job, found := jc.jobs[key] 95 if !found { 96 return nil, fmt.Errorf("failed to find job <%s>", key) 97 } 98 99 if job.Job == nil { 100 return nil, fmt.Errorf("job <%s> is not ready", key) 101 } 102 103 return job.Clone(), nil 104 } 105 106 func (jc *jobCache) GetStatus(key string) (*v1alpha1.JobStatus, error) { 107 jc.Lock() 108 defer jc.Unlock() 109 110 job, found := jc.jobs[key] 111 if !found { 112 return nil, fmt.Errorf("failed to find job <%s>", key) 113 } 114 115 if job.Job == nil { 116 return nil, fmt.Errorf("job <%s> is not ready", key) 117 } 118 119 status := job.Job.Status 120 121 return &status, nil 122 } 123 124 func (jc *jobCache) Add(job *v1alpha1.Job) error { 125 jc.Lock() 126 defer jc.Unlock() 127 key := JobKey(job) 128 if jobInfo, found := jc.jobs[key]; found { 129 if jobInfo.Job == nil { 130 jobInfo.SetJob(job) 131 132 return nil 133 } 134 return fmt.Errorf("duplicated jobInfo <%v>", key) 135 } 136 137 jc.jobs[key] = &apis.JobInfo{ 138 Name: job.Name, 139 Namespace: job.Namespace, 140 141 Job: job, 142 Pods: make(map[string]map[string]*v1.Pod), 143 } 144 145 return nil 146 } 147 148 func (jc *jobCache) Update(obj *v1alpha1.Job) error { 149 jc.Lock() 150 defer jc.Unlock() 151 152 key := JobKey(obj) 153 job, found := jc.jobs[key] 154 if !found { 155 return fmt.Errorf("failed to find job <%v>", key) 156 } 157 158 if job.Job != nil { 159 var oldResourceVersion, newResourceVersion uint64 160 var err error 161 if oldResourceVersion, err = strconv.ParseUint(job.Job.ResourceVersion, 10, 64); err != nil { 162 return fmt.Errorf("failed to parase job <%v> resource version <%s>", key, job.Job.ResourceVersion) 163 } 164 165 if newResourceVersion, err = strconv.ParseUint(obj.ResourceVersion, 10, 64); err != nil { 166 return fmt.Errorf("failed to parase job <%v> resource version <%s>", key, obj.ResourceVersion) 167 } 168 if newResourceVersion < oldResourceVersion { 169 return fmt.Errorf("job <%v> has too old resource version: %d (%d)", key, newResourceVersion, oldResourceVersion) 170 } 171 } 172 job.Job = obj 173 return nil 174 } 175 176 func (jc *jobCache) Delete(obj *v1alpha1.Job) error { 177 jc.Lock() 178 defer jc.Unlock() 179 180 key := JobKey(obj) 181 jobInfo, found := jc.jobs[key] 182 if !found { 183 return fmt.Errorf("failed to find job <%v>", key) 184 } 185 jobInfo.Job = nil 186 jc.deleteJob(jobInfo) 187 188 return nil 189 } 190 191 func (jc *jobCache) AddPod(pod *v1.Pod) error { 192 jc.Lock() 193 defer jc.Unlock() 194 195 key, err := jobKeyOfPod(pod) 196 if err != nil { 197 return err 198 } 199 200 job, found := jc.jobs[key] 201 if !found { 202 job = &apis.JobInfo{ 203 Pods: make(map[string]map[string]*v1.Pod), 204 } 205 jc.jobs[key] = job 206 } 207 208 return job.AddPod(pod) 209 } 210 211 func (jc *jobCache) UpdatePod(pod *v1.Pod) error { 212 jc.Lock() 213 defer jc.Unlock() 214 215 key, err := jobKeyOfPod(pod) 216 if err != nil { 217 return err 218 } 219 220 job, found := jc.jobs[key] 221 if !found { 222 job = &apis.JobInfo{ 223 Pods: make(map[string]map[string]*v1.Pod), 224 } 225 jc.jobs[key] = job 226 } 227 228 return job.UpdatePod(pod) 229 } 230 231 func (jc *jobCache) DeletePod(pod *v1.Pod) error { 232 jc.Lock() 233 defer jc.Unlock() 234 235 key, err := jobKeyOfPod(pod) 236 if err != nil { 237 return err 238 } 239 240 job, found := jc.jobs[key] 241 if !found { 242 job = &apis.JobInfo{ 243 Pods: make(map[string]map[string]*v1.Pod), 244 } 245 jc.jobs[key] = job 246 } 247 248 if err := job.DeletePod(pod); err != nil { 249 return err 250 } 251 252 if jobTerminated(job) { 253 jc.deleteJob(job) 254 } 255 256 return nil 257 } 258 259 func (jc *jobCache) Run(stopCh <-chan struct{}) { 260 wait.Until(jc.worker, 0, stopCh) 261 } 262 263 func (jc *jobCache) TaskCompleted(jobKey, taskName string) bool { 264 jc.Lock() 265 defer jc.Unlock() 266 267 var taskReplicas, completed int32 268 269 jobInfo, found := jc.jobs[jobKey] 270 if !found { 271 return false 272 } 273 274 taskPods, found := jobInfo.Pods[taskName] 275 276 if !found { 277 return false 278 } 279 280 if jobInfo.Job == nil { 281 return false 282 } 283 284 for _, task := range jobInfo.Job.Spec.Tasks { 285 if task.Name == taskName { 286 taskReplicas = task.Replicas 287 break 288 } 289 } 290 if taskReplicas <= 0 { 291 return false 292 } 293 294 for _, pod := range taskPods { 295 if pod.Status.Phase == v1.PodSucceeded { 296 completed++ 297 } 298 } 299 return completed >= taskReplicas 300 } 301 302 func (jc *jobCache) TaskFailed(jobKey, taskName string) bool { 303 jc.Lock() 304 defer jc.Unlock() 305 306 var taskReplicas, retried, maxRetry int32 307 308 jobInfo, found := jc.jobs[jobKey] 309 if !found { 310 return false 311 } 312 313 taskPods, found := jobInfo.Pods[taskName] 314 315 if !found || jobInfo.Job == nil { 316 return false 317 } 318 319 for _, task := range jobInfo.Job.Spec.Tasks { 320 if task.Name == taskName { 321 maxRetry = task.MaxRetry 322 taskReplicas = task.Replicas 323 break 324 } 325 } 326 327 // maxRetry == -1 means no limit 328 if taskReplicas == 0 || maxRetry == -1 { 329 return false 330 } 331 332 // Compatible with existing job 333 if maxRetry == 0 { 334 maxRetry = 3 335 } 336 337 for _, pod := range taskPods { 338 if pod.Status.Phase == v1.PodRunning || pod.Status.Phase == v1.PodPending { 339 for j := range pod.Status.InitContainerStatuses { 340 stat := pod.Status.InitContainerStatuses[j] 341 retried += stat.RestartCount 342 } 343 for j := range pod.Status.ContainerStatuses { 344 stat := pod.Status.ContainerStatuses[j] 345 retried += stat.RestartCount 346 } 347 } 348 } 349 return retried >= maxRetry 350 } 351 352 func (jc *jobCache) worker() { 353 for jc.processCleanupJob() { 354 } 355 } 356 357 func (jc *jobCache) processCleanupJob() bool { 358 obj, shutdown := jc.deletedJobs.Get() 359 if shutdown { 360 return false 361 } 362 defer jc.deletedJobs.Done(obj) 363 364 job, ok := obj.(*apis.JobInfo) 365 if !ok { 366 klog.Errorf("failed to convert %v to *apis.JobInfo", obj) 367 return true 368 } 369 370 jc.Mutex.Lock() 371 defer jc.Mutex.Unlock() 372 373 if jobTerminated(job) { 374 jc.deletedJobs.Forget(obj) 375 key := keyFn(job.Namespace, job.Name) 376 delete(jc.jobs, key) 377 klog.V(3).Infof("Job <%s> was deleted.", key) 378 } else { 379 // Retry 380 jc.retryDeleteJob(job) 381 } 382 return true 383 } 384 385 func (jc *jobCache) deleteJob(job *apis.JobInfo) { 386 klog.V(3).Infof("Try to delete Job <%v/%v>", 387 job.Namespace, job.Name) 388 389 jc.deletedJobs.Add(job) 390 } 391 392 func (jc *jobCache) retryDeleteJob(job *apis.JobInfo) { 393 klog.V(3).Infof("Retry to delete Job <%v/%v>", 394 job.Namespace, job.Name) 395 396 jc.deletedJobs.AddRateLimited(job) 397 }