volcano.sh/volcano@v1.9.0/pkg/controllers/cache/cache.go (about)

     1  /*
     2  Copyright 2019 The Volcano Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package cache
    18  
    19  import (
    20  	"fmt"
    21  	"strconv"
    22  	"sync"
    23  	"time"
    24  
    25  	"golang.org/x/time/rate"
    26  	v1 "k8s.io/api/core/v1"
    27  	"k8s.io/apimachinery/pkg/util/wait"
    28  	"k8s.io/client-go/util/workqueue"
    29  	"k8s.io/klog/v2"
    30  
    31  	"volcano.sh/apis/pkg/apis/batch/v1alpha1"
    32  
    33  	"volcano.sh/volcano/pkg/controllers/apis"
    34  )
    35  
    36  type jobCache struct {
    37  	sync.Mutex
    38  
    39  	jobs        map[string]*apis.JobInfo
    40  	deletedJobs workqueue.RateLimitingInterface
    41  }
    42  
    43  func keyFn(ns, name string) string {
    44  	return fmt.Sprintf("%s/%s", ns, name)
    45  }
    46  
    47  // JobKeyByName gets the key for the job name.
    48  func JobKeyByName(namespace string, name string) string {
    49  	return keyFn(namespace, name)
    50  }
    51  
    52  // JobKeyByReq gets the key for the job request.
    53  func JobKeyByReq(req *apis.Request) string {
    54  	return keyFn(req.Namespace, req.JobName)
    55  }
    56  
    57  // JobKey gets the "ns"/"name" format of the given job.
    58  func JobKey(job *v1alpha1.Job) string {
    59  	return keyFn(job.Namespace, job.Name)
    60  }
    61  
    62  func jobTerminated(job *apis.JobInfo) bool {
    63  	return job.Job == nil && len(job.Pods) == 0
    64  }
    65  
    66  func jobKeyOfPod(pod *v1.Pod) (string, error) {
    67  	jobName, found := pod.Annotations[v1alpha1.JobNameKey]
    68  	if !found {
    69  		return "", fmt.Errorf("failed to find job name of pod <%s/%s>",
    70  			pod.Namespace, pod.Name)
    71  	}
    72  
    73  	return keyFn(pod.Namespace, jobName), nil
    74  }
    75  
    76  // New gets the job Cache.
    77  func New() Cache {
    78  	queue := workqueue.NewMaxOfRateLimiter(
    79  		workqueue.NewItemExponentialFailureRateLimiter(5*time.Millisecond, 180*time.Second),
    80  		// 10 qps, 100 bucket size.  This is only for retry speed and its only the overall factor (not per item)
    81  		&workqueue.BucketRateLimiter{Limiter: rate.NewLimiter(rate.Limit(10), 100)},
    82  	)
    83  
    84  	return &jobCache{
    85  		jobs:        map[string]*apis.JobInfo{},
    86  		deletedJobs: workqueue.NewRateLimitingQueue(queue),
    87  	}
    88  }
    89  
    90  func (jc *jobCache) Get(key string) (*apis.JobInfo, error) {
    91  	jc.Lock()
    92  	defer jc.Unlock()
    93  
    94  	job, found := jc.jobs[key]
    95  	if !found {
    96  		return nil, fmt.Errorf("failed to find job <%s>", key)
    97  	}
    98  
    99  	if job.Job == nil {
   100  		return nil, fmt.Errorf("job <%s> is not ready", key)
   101  	}
   102  
   103  	return job.Clone(), nil
   104  }
   105  
   106  func (jc *jobCache) GetStatus(key string) (*v1alpha1.JobStatus, error) {
   107  	jc.Lock()
   108  	defer jc.Unlock()
   109  
   110  	job, found := jc.jobs[key]
   111  	if !found {
   112  		return nil, fmt.Errorf("failed to find job <%s>", key)
   113  	}
   114  
   115  	if job.Job == nil {
   116  		return nil, fmt.Errorf("job <%s> is not ready", key)
   117  	}
   118  
   119  	status := job.Job.Status
   120  
   121  	return &status, nil
   122  }
   123  
   124  func (jc *jobCache) Add(job *v1alpha1.Job) error {
   125  	jc.Lock()
   126  	defer jc.Unlock()
   127  	key := JobKey(job)
   128  	if jobInfo, found := jc.jobs[key]; found {
   129  		if jobInfo.Job == nil {
   130  			jobInfo.SetJob(job)
   131  
   132  			return nil
   133  		}
   134  		return fmt.Errorf("duplicated jobInfo <%v>", key)
   135  	}
   136  
   137  	jc.jobs[key] = &apis.JobInfo{
   138  		Name:      job.Name,
   139  		Namespace: job.Namespace,
   140  
   141  		Job:  job,
   142  		Pods: make(map[string]map[string]*v1.Pod),
   143  	}
   144  
   145  	return nil
   146  }
   147  
   148  func (jc *jobCache) Update(obj *v1alpha1.Job) error {
   149  	jc.Lock()
   150  	defer jc.Unlock()
   151  
   152  	key := JobKey(obj)
   153  	job, found := jc.jobs[key]
   154  	if !found {
   155  		return fmt.Errorf("failed to find job <%v>", key)
   156  	}
   157  
   158  	if job.Job != nil {
   159  		var oldResourceVersion, newResourceVersion uint64
   160  		var err error
   161  		if oldResourceVersion, err = strconv.ParseUint(job.Job.ResourceVersion, 10, 64); err != nil {
   162  			return fmt.Errorf("failed to parase job <%v> resource version <%s>", key, job.Job.ResourceVersion)
   163  		}
   164  
   165  		if newResourceVersion, err = strconv.ParseUint(obj.ResourceVersion, 10, 64); err != nil {
   166  			return fmt.Errorf("failed to parase job <%v> resource version <%s>", key, obj.ResourceVersion)
   167  		}
   168  		if newResourceVersion < oldResourceVersion {
   169  			return fmt.Errorf("job <%v> has too old resource version: %d (%d)", key, newResourceVersion, oldResourceVersion)
   170  		}
   171  	}
   172  	job.Job = obj
   173  	return nil
   174  }
   175  
   176  func (jc *jobCache) Delete(obj *v1alpha1.Job) error {
   177  	jc.Lock()
   178  	defer jc.Unlock()
   179  
   180  	key := JobKey(obj)
   181  	jobInfo, found := jc.jobs[key]
   182  	if !found {
   183  		return fmt.Errorf("failed to find job <%v>", key)
   184  	}
   185  	jobInfo.Job = nil
   186  	jc.deleteJob(jobInfo)
   187  
   188  	return nil
   189  }
   190  
   191  func (jc *jobCache) AddPod(pod *v1.Pod) error {
   192  	jc.Lock()
   193  	defer jc.Unlock()
   194  
   195  	key, err := jobKeyOfPod(pod)
   196  	if err != nil {
   197  		return err
   198  	}
   199  
   200  	job, found := jc.jobs[key]
   201  	if !found {
   202  		job = &apis.JobInfo{
   203  			Pods: make(map[string]map[string]*v1.Pod),
   204  		}
   205  		jc.jobs[key] = job
   206  	}
   207  
   208  	return job.AddPod(pod)
   209  }
   210  
   211  func (jc *jobCache) UpdatePod(pod *v1.Pod) error {
   212  	jc.Lock()
   213  	defer jc.Unlock()
   214  
   215  	key, err := jobKeyOfPod(pod)
   216  	if err != nil {
   217  		return err
   218  	}
   219  
   220  	job, found := jc.jobs[key]
   221  	if !found {
   222  		job = &apis.JobInfo{
   223  			Pods: make(map[string]map[string]*v1.Pod),
   224  		}
   225  		jc.jobs[key] = job
   226  	}
   227  
   228  	return job.UpdatePod(pod)
   229  }
   230  
   231  func (jc *jobCache) DeletePod(pod *v1.Pod) error {
   232  	jc.Lock()
   233  	defer jc.Unlock()
   234  
   235  	key, err := jobKeyOfPod(pod)
   236  	if err != nil {
   237  		return err
   238  	}
   239  
   240  	job, found := jc.jobs[key]
   241  	if !found {
   242  		job = &apis.JobInfo{
   243  			Pods: make(map[string]map[string]*v1.Pod),
   244  		}
   245  		jc.jobs[key] = job
   246  	}
   247  
   248  	if err := job.DeletePod(pod); err != nil {
   249  		return err
   250  	}
   251  
   252  	if jobTerminated(job) {
   253  		jc.deleteJob(job)
   254  	}
   255  
   256  	return nil
   257  }
   258  
   259  func (jc *jobCache) Run(stopCh <-chan struct{}) {
   260  	wait.Until(jc.worker, 0, stopCh)
   261  }
   262  
   263  func (jc *jobCache) TaskCompleted(jobKey, taskName string) bool {
   264  	jc.Lock()
   265  	defer jc.Unlock()
   266  
   267  	var taskReplicas, completed int32
   268  
   269  	jobInfo, found := jc.jobs[jobKey]
   270  	if !found {
   271  		return false
   272  	}
   273  
   274  	taskPods, found := jobInfo.Pods[taskName]
   275  
   276  	if !found {
   277  		return false
   278  	}
   279  
   280  	if jobInfo.Job == nil {
   281  		return false
   282  	}
   283  
   284  	for _, task := range jobInfo.Job.Spec.Tasks {
   285  		if task.Name == taskName {
   286  			taskReplicas = task.Replicas
   287  			break
   288  		}
   289  	}
   290  	if taskReplicas <= 0 {
   291  		return false
   292  	}
   293  
   294  	for _, pod := range taskPods {
   295  		if pod.Status.Phase == v1.PodSucceeded {
   296  			completed++
   297  		}
   298  	}
   299  	return completed >= taskReplicas
   300  }
   301  
   302  func (jc *jobCache) TaskFailed(jobKey, taskName string) bool {
   303  	jc.Lock()
   304  	defer jc.Unlock()
   305  
   306  	var taskReplicas, retried, maxRetry int32
   307  
   308  	jobInfo, found := jc.jobs[jobKey]
   309  	if !found {
   310  		return false
   311  	}
   312  
   313  	taskPods, found := jobInfo.Pods[taskName]
   314  
   315  	if !found || jobInfo.Job == nil {
   316  		return false
   317  	}
   318  
   319  	for _, task := range jobInfo.Job.Spec.Tasks {
   320  		if task.Name == taskName {
   321  			maxRetry = task.MaxRetry
   322  			taskReplicas = task.Replicas
   323  			break
   324  		}
   325  	}
   326  
   327  	// maxRetry == -1 means no limit
   328  	if taskReplicas == 0 || maxRetry == -1 {
   329  		return false
   330  	}
   331  
   332  	// Compatible with existing job
   333  	if maxRetry == 0 {
   334  		maxRetry = 3
   335  	}
   336  
   337  	for _, pod := range taskPods {
   338  		if pod.Status.Phase == v1.PodRunning || pod.Status.Phase == v1.PodPending {
   339  			for j := range pod.Status.InitContainerStatuses {
   340  				stat := pod.Status.InitContainerStatuses[j]
   341  				retried += stat.RestartCount
   342  			}
   343  			for j := range pod.Status.ContainerStatuses {
   344  				stat := pod.Status.ContainerStatuses[j]
   345  				retried += stat.RestartCount
   346  			}
   347  		}
   348  	}
   349  	return retried >= maxRetry
   350  }
   351  
   352  func (jc *jobCache) worker() {
   353  	for jc.processCleanupJob() {
   354  	}
   355  }
   356  
   357  func (jc *jobCache) processCleanupJob() bool {
   358  	obj, shutdown := jc.deletedJobs.Get()
   359  	if shutdown {
   360  		return false
   361  	}
   362  	defer jc.deletedJobs.Done(obj)
   363  
   364  	job, ok := obj.(*apis.JobInfo)
   365  	if !ok {
   366  		klog.Errorf("failed to convert %v to *apis.JobInfo", obj)
   367  		return true
   368  	}
   369  
   370  	jc.Mutex.Lock()
   371  	defer jc.Mutex.Unlock()
   372  
   373  	if jobTerminated(job) {
   374  		jc.deletedJobs.Forget(obj)
   375  		key := keyFn(job.Namespace, job.Name)
   376  		delete(jc.jobs, key)
   377  		klog.V(3).Infof("Job <%s> was deleted.", key)
   378  	} else {
   379  		// Retry
   380  		jc.retryDeleteJob(job)
   381  	}
   382  	return true
   383  }
   384  
   385  func (jc *jobCache) deleteJob(job *apis.JobInfo) {
   386  	klog.V(3).Infof("Try to delete Job <%v/%v>",
   387  		job.Namespace, job.Name)
   388  
   389  	jc.deletedJobs.Add(job)
   390  }
   391  
   392  func (jc *jobCache) retryDeleteJob(job *apis.JobInfo) {
   393  	klog.V(3).Infof("Retry to delete Job <%v/%v>",
   394  		job.Namespace, job.Name)
   395  
   396  	jc.deletedJobs.AddRateLimited(job)
   397  }