github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/common/service.go (about)

     1  // Copyright 2019 The Kubeflow Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //	http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  package common
    15  
    16  import (
    17  	"fmt"
    18  	"strconv"
    19  	"strings"
    20  
    21  	apiv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
    22  	"github.com/kubeflow/training-operator/pkg/controller.v1/control"
    23  	"github.com/kubeflow/training-operator/pkg/controller.v1/expectation"
    24  	"github.com/kubeflow/training-operator/pkg/core"
    25  	commonutil "github.com/kubeflow/training-operator/pkg/util"
    26  	utillabels "github.com/kubeflow/training-operator/pkg/util/labels"
    27  
    28  	"github.com/prometheus/client_golang/prometheus"
    29  	"github.com/prometheus/client_golang/prometheus/promauto"
    30  	log "github.com/sirupsen/logrus"
    31  	v1 "k8s.io/api/core/v1"
    32  	"k8s.io/apimachinery/pkg/api/errors"
    33  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    34  	"k8s.io/apimachinery/pkg/labels"
    35  	"k8s.io/apimachinery/pkg/runtime"
    36  	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    37  )
    38  
    39  var (
    40  	succeededServiceCreationCount = promauto.NewCounter(prometheus.CounterOpts{
    41  		Name: "succeeded_service_creation_total",
    42  		Help: "The total number of succeeded service creation",
    43  	})
    44  	failedServiceCreationCount = promauto.NewCounter(prometheus.CounterOpts{
    45  		Name: "failed_service_creation_total",
    46  		Help: "The total number of failed service creation",
    47  	})
    48  )
    49  
    50  // When a service is created, enqueue the controller that manages it and update its expectations.
    51  func (jc *JobController) AddService(obj interface{}) {
    52  	service := obj.(*v1.Service)
    53  	if service.DeletionTimestamp != nil {
    54  		// on a restart of the controller controller, it's possible a new service shows up in a state that
    55  		// is already pending deletion. Prevent the service from being a creation observation.
    56  		// tc.deleteService(service)
    57  		return
    58  	}
    59  
    60  	// If it has a ControllerRef, that's all that matters.
    61  	if controllerRef := metav1.GetControllerOf(service); controllerRef != nil {
    62  		job := jc.resolveControllerRef(service.Namespace, controllerRef)
    63  		if job == nil {
    64  			return
    65  		}
    66  
    67  		jobKey, err := KeyFunc(job)
    68  		if err != nil {
    69  			return
    70  		}
    71  
    72  		rType, err := utillabels.ReplicaType(service.Labels)
    73  		if err != nil {
    74  			log.Infof("This service maybe not created by %v", jc.Controller.ControllerName())
    75  			return
    76  		}
    77  
    78  		expectationServicesKey := expectation.GenExpectationServicesKey(jobKey, string(rType))
    79  
    80  		jc.Expectations.CreationObserved(expectationServicesKey)
    81  		// TODO: we may need add backoff here
    82  		jc.WorkQueue.Add(jobKey)
    83  
    84  		return
    85  	}
    86  
    87  }
    88  
// UpdateService is the handler for service update events. old and cur must be
// *v1.Service types.
//
// It is currently a no-op: the body is empty, so no job is enqueued and no
// expectation is touched when a service changes. The intended behavior (not
// yet implemented) is to figure out which job(s) manage the service and wake
// them up, including both the old and the new owner when the service's labels
// have changed.
func (jc *JobController) UpdateService(old, cur interface{}) {
	// TODO(CPH): handle this gracefully.
}
    95  
// DeleteService is the handler for service deletion events. obj could be a
// *v1.Service, or a DeletionFinalStateUnknown marker item.
//
// It is currently a no-op: the body is empty, so no job is enqueued and no
// expectation is updated when a service is deleted. The intended behavior
// (not yet implemented) is to enqueue the job that manages the service and
// update its expectations.
func (jc *JobController) DeleteService(obj interface{}) {
	// TODO(CPH): handle this gracefully.
}
   101  
   102  // getServicesForJob returns the set of services that this job should manage.
   103  // It also reconciles ControllerRef by adopting/orphaning.
   104  // Note that the returned services are pointers into the cache.
   105  func (jc *JobController) GetServicesForJob(jobObject interface{}) ([]*v1.Service, error) {
   106  	job, ok := jobObject.(metav1.Object)
   107  	if !ok {
   108  		return nil, fmt.Errorf("job is not of type metav1.Object")
   109  	}
   110  
   111  	// Create selector
   112  	selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{
   113  		MatchLabels: jc.GenLabels(job.GetName()),
   114  	})
   115  
   116  	if err != nil {
   117  		return nil, fmt.Errorf("couldn't convert Job selector: %v", err)
   118  	}
   119  	// List all services to include those that don't match the selector anymore
   120  	// but have a ControllerRef pointing to this controller.
   121  	services, err := jc.ServiceLister.Services(job.GetNamespace()).List(labels.Everything())
   122  	if err != nil {
   123  		return nil, err
   124  	}
   125  
   126  	// If any adoptions are attempted, we should first recheck for deletion
   127  	// with an uncached quorum read sometime after listing services (see #42639).
   128  	canAdoptFunc := RecheckDeletionTimestamp(func() (metav1.Object, error) {
   129  		fresh, err := jc.Controller.GetJobFromInformerCache(job.GetNamespace(), job.GetName())
   130  		if err != nil {
   131  			return nil, err
   132  		}
   133  		if fresh.GetUID() != job.GetUID() {
   134  			return nil, fmt.Errorf("original Job %v/%v is gone: got uid %v, wanted %v", job.GetNamespace(), job.GetName(), fresh.GetUID(), job.GetUID())
   135  		}
   136  		return fresh, nil
   137  	})
   138  	cm := control.NewServiceControllerRefManager(jc.ServiceControl, job, selector, jc.Controller.GetAPIGroupVersionKind(), canAdoptFunc)
   139  	return cm.ClaimServices(services)
   140  }
   141  
// FilterServicesForReplicaType returns the services that belong to the given
// replicaType. It is a thin wrapper that forwards both arguments to
// core.FilterServicesForReplicaType and returns its result and error
// unchanged.
func (jc *JobController) FilterServicesForReplicaType(services []*v1.Service, replicaType string) ([]*v1.Service, error) {
	return core.FilterServicesForReplicaType(services, replicaType)
}
   146  
// GetServiceSlices returns a slice whose elements are themselves slices of
// services. Assume the return object is serviceSlices, then serviceSlices[i]
// is an array of pointers to services corresponding to Services for replica i.
//
// It is a thin wrapper that forwards services, replicas and logger to
// core.GetServiceSlices and returns its result unchanged.
func (jc *JobController) GetServiceSlices(services []*v1.Service, replicas int, logger *log.Entry) [][]*v1.Service {
	return core.GetServiceSlices(services, replicas, logger)
}
   153  
   154  // reconcileServices checks and updates services for each given ReplicaSpec.
   155  // It will requeue the job in case of an error while creating/deleting services.
   156  func (jc *JobController) ReconcileServices(
   157  	job metav1.Object,
   158  	services []*v1.Service,
   159  	rtype apiv1.ReplicaType,
   160  	spec *apiv1.ReplicaSpec) error {
   161  
   162  	// Convert ReplicaType to lower string.
   163  	rt := strings.ToLower(string(rtype))
   164  	replicas := int(*spec.Replicas)
   165  	// Get all services for the type rt.
   166  	services, err := jc.FilterServicesForReplicaType(services, rt)
   167  	if err != nil {
   168  		return err
   169  	}
   170  
   171  	// GetServiceSlices will return enough information here to make decision to add/remove/update resources.
   172  	//
   173  	// For example, let's assume we have services with replica-index 0, 1, 2
   174  	// If replica is 4, return a slice with size 4. [[0],[1],[2],[]], a svc with replica-index 3 will be created.
   175  	//
   176  	// If replica is 1, return a slice with size 3. [[0],[1],[2]], svc with replica-index 1 and 2 are out of range and will be deleted.
   177  	serviceSlices := jc.GetServiceSlices(services, replicas, commonutil.LoggerForReplica(job, rt))
   178  
   179  	for index, serviceSlice := range serviceSlices {
   180  		if len(serviceSlice) > 1 {
   181  			commonutil.LoggerForReplica(job, rt).Warningf("We have too many services for %s %d", rtype, index)
   182  		} else if len(serviceSlice) == 0 {
   183  			commonutil.LoggerForReplica(job, rt).Infof("need to create new service: %s-%d", rtype, index)
   184  			err = jc.CreateNewService(job, rtype, spec, strconv.Itoa(index))
   185  			if err != nil {
   186  				return err
   187  			}
   188  		} else {
   189  			// Check the status of the current svc.
   190  			svc := serviceSlice[0]
   191  
   192  			// check if the index is in the valid range, if not, we should kill the svc
   193  			if index < 0 || index >= replicas {
   194  				err = jc.ServiceControl.DeleteService(svc.Namespace, svc.Name, job.(runtime.Object))
   195  				if err != nil {
   196  					return err
   197  				}
   198  			}
   199  		}
   200  	}
   201  	return nil
   202  }
   203  
// GetPortsFromJob gets the ports of job container. Port could be nil, if distributed communication strategy doesn't need and no other ports that need to be exposed.
//
// It is a thin wrapper that calls core.GetPortsFromJob with spec and the
// controller's default container name (jc.Controller.GetDefaultContainerName)
// and returns the resulting name-to-port map and error unchanged.
func (jc *JobController) GetPortsFromJob(spec *apiv1.ReplicaSpec) (map[string]int32, error) {
	return core.GetPortsFromJob(spec, jc.Controller.GetDefaultContainerName())
}
   208  
   209  // CreateNewService creates a new service for the given index and type.
   210  func (jc *JobController) CreateNewService(job metav1.Object, rtype apiv1.ReplicaType,
   211  	spec *apiv1.ReplicaSpec, index string) error {
   212  	jobKey, err := KeyFunc(job)
   213  	if err != nil {
   214  		utilruntime.HandleError(fmt.Errorf("couldn't get key for job object %#v: %v", job, err))
   215  		return err
   216  	}
   217  
   218  	rt := strings.ToLower(string(rtype))
   219  	labels := jc.GenLabels(job.GetName())
   220  	utillabels.SetReplicaType(labels, rt)
   221  	utillabels.SetReplicaIndexStr(labels, index)
   222  
   223  	ports, err := jc.GetPortsFromJob(spec)
   224  	if err != nil {
   225  		return err
   226  	}
   227  
   228  	service := &v1.Service{
   229  		Spec: v1.ServiceSpec{
   230  			ClusterIP: "None",
   231  			Selector:  labels,
   232  			Ports:     []v1.ServicePort{},
   233  		},
   234  	}
   235  
   236  	// Add service ports to headless service
   237  	for name, port := range ports {
   238  		svcPort := v1.ServicePort{Name: name, Port: port}
   239  		service.Spec.Ports = append(service.Spec.Ports, svcPort)
   240  	}
   241  
   242  	service.Name = GenGeneralName(job.GetName(), rt, index)
   243  	service.Labels = labels
   244  	// Create OwnerReference.
   245  	controllerRef := jc.GenOwnerReference(job)
   246  
   247  	// Creation is expected when there is no error returned
   248  	expectationServicesKey := expectation.GenExpectationServicesKey(jobKey, rt)
   249  	jc.Expectations.RaiseExpectations(expectationServicesKey, 1, 0)
   250  
   251  	err = jc.ServiceControl.CreateServicesWithControllerRef(job.GetNamespace(), service, job.(runtime.Object), controllerRef)
   252  	if err != nil && errors.IsTimeout(err) {
   253  		// Service is created but its initialization has timed out.
   254  		// If the initialization is successful eventually, the
   255  		// controller will observe the creation via the informer.
   256  		// If the initialization fails, or if the service keeps
   257  		// uninitialized for a long time, the informer will not
   258  		// receive any update, and the controller will create a new
   259  		// service when the expectation expires.
   260  		succeededServiceCreationCount.Inc()
   261  		return nil
   262  	} else if err != nil {
   263  		// Since error occurred(the informer won't observe this service),
   264  		// we decrement the expected number of creates
   265  		// and wait until next reconciliation
   266  		jc.Expectations.CreationObserved(expectationServicesKey)
   267  		failedServiceCreationCount.Inc()
   268  		return err
   269  	}
   270  	succeededServiceCreationCount.Inc()
   271  	return nil
   272  }