github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/common/service.go (about) 1 // Copyright 2019 The Kubeflow Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 package common 15 16 import ( 17 "fmt" 18 "strconv" 19 "strings" 20 21 apiv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" 22 "github.com/kubeflow/training-operator/pkg/controller.v1/control" 23 "github.com/kubeflow/training-operator/pkg/controller.v1/expectation" 24 "github.com/kubeflow/training-operator/pkg/core" 25 commonutil "github.com/kubeflow/training-operator/pkg/util" 26 utillabels "github.com/kubeflow/training-operator/pkg/util/labels" 27 28 "github.com/prometheus/client_golang/prometheus" 29 "github.com/prometheus/client_golang/prometheus/promauto" 30 log "github.com/sirupsen/logrus" 31 v1 "k8s.io/api/core/v1" 32 "k8s.io/apimachinery/pkg/api/errors" 33 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 34 "k8s.io/apimachinery/pkg/labels" 35 "k8s.io/apimachinery/pkg/runtime" 36 utilruntime "k8s.io/apimachinery/pkg/util/runtime" 37 ) 38 39 var ( 40 succeededServiceCreationCount = promauto.NewCounter(prometheus.CounterOpts{ 41 Name: "succeeded_service_creation_total", 42 Help: "The total number of succeeded service creation", 43 }) 44 failedServiceCreationCount = promauto.NewCounter(prometheus.CounterOpts{ 45 Name: "failed_service_creation_total", 46 Help: "The total number of failed service creation", 47 }) 48 ) 49 50 // When a service is created, enqueue the controller that manages it and update its expectations. 51 func (jc *JobController) AddService(obj interface{}) { 52 service := obj.(*v1.Service) 53 if service.DeletionTimestamp != nil { 54 // on a restart of the controller controller, it's possible a new service shows up in a state that 55 // is already pending deletion. Prevent the service from being a creation observation. 56 // tc.deleteService(service) 57 return 58 } 59 60 // If it has a ControllerRef, that's all that matters. 61 if controllerRef := metav1.GetControllerOf(service); controllerRef != nil { 62 job := jc.resolveControllerRef(service.Namespace, controllerRef) 63 if job == nil { 64 return 65 } 66 67 jobKey, err := KeyFunc(job) 68 if err != nil { 69 return 70 } 71 72 rType, err := utillabels.ReplicaType(service.Labels) 73 if err != nil { 74 log.Infof("This service maybe not created by %v", jc.Controller.ControllerName()) 75 return 76 } 77 78 expectationServicesKey := expectation.GenExpectationServicesKey(jobKey, string(rType)) 79 80 jc.Expectations.CreationObserved(expectationServicesKey) 81 // TODO: we may need add backoff here 82 jc.WorkQueue.Add(jobKey) 83 84 return 85 } 86 87 } 88 89 // When a service is updated, figure out what job/s manage it and wake them up. 90 // If the labels of the service have changed we need to awaken both the old 91 // and new replica set. old and cur must be *v1.Service types. 92 func (jc *JobController) UpdateService(old, cur interface{}) { 93 // TODO(CPH): handle this gracefully. 94 } 95 96 // When a service is deleted, enqueue the job that manages the service and update its expectations. 97 // obj could be an *v1.Service, or a DeletionFinalStateUnknown marker item. 98 func (jc *JobController) DeleteService(obj interface{}) { 99 // TODO(CPH): handle this gracefully. 100 } 101 102 // getServicesForJob returns the set of services that this job should manage. 103 // It also reconciles ControllerRef by adopting/orphaning. 104 // Note that the returned services are pointers into the cache. 105 func (jc *JobController) GetServicesForJob(jobObject interface{}) ([]*v1.Service, error) { 106 job, ok := jobObject.(metav1.Object) 107 if !ok { 108 return nil, fmt.Errorf("job is not of type metav1.Object") 109 } 110 111 // Create selector 112 selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{ 113 MatchLabels: jc.GenLabels(job.GetName()), 114 }) 115 116 if err != nil { 117 return nil, fmt.Errorf("couldn't convert Job selector: %v", err) 118 } 119 // List all services to include those that don't match the selector anymore 120 // but have a ControllerRef pointing to this controller. 121 services, err := jc.ServiceLister.Services(job.GetNamespace()).List(labels.Everything()) 122 if err != nil { 123 return nil, err 124 } 125 126 // If any adoptions are attempted, we should first recheck for deletion 127 // with an uncached quorum read sometime after listing services (see #42639). 128 canAdoptFunc := RecheckDeletionTimestamp(func() (metav1.Object, error) { 129 fresh, err := jc.Controller.GetJobFromInformerCache(job.GetNamespace(), job.GetName()) 130 if err != nil { 131 return nil, err 132 } 133 if fresh.GetUID() != job.GetUID() { 134 return nil, fmt.Errorf("original Job %v/%v is gone: got uid %v, wanted %v", job.GetNamespace(), job.GetName(), fresh.GetUID(), job.GetUID()) 135 } 136 return fresh, nil 137 }) 138 cm := control.NewServiceControllerRefManager(jc.ServiceControl, job, selector, jc.Controller.GetAPIGroupVersionKind(), canAdoptFunc) 139 return cm.ClaimServices(services) 140 } 141 142 // FilterServicesForReplicaType returns service belong to a replicaType. 143 func (jc *JobController) FilterServicesForReplicaType(services []*v1.Service, replicaType string) ([]*v1.Service, error) { 144 return core.FilterServicesForReplicaType(services, replicaType) 145 } 146 147 // GetServiceSlices returns a slice, which element is the slice of service. 148 // Assume the return object is serviceSlices, then serviceSlices[i] is an 149 // array of pointers to services corresponding to Services for replica i. 150 func (jc *JobController) GetServiceSlices(services []*v1.Service, replicas int, logger *log.Entry) [][]*v1.Service { 151 return core.GetServiceSlices(services, replicas, logger) 152 } 153 154 // reconcileServices checks and updates services for each given ReplicaSpec. 155 // It will requeue the job in case of an error while creating/deleting services. 156 func (jc *JobController) ReconcileServices( 157 job metav1.Object, 158 services []*v1.Service, 159 rtype apiv1.ReplicaType, 160 spec *apiv1.ReplicaSpec) error { 161 162 // Convert ReplicaType to lower string. 163 rt := strings.ToLower(string(rtype)) 164 replicas := int(*spec.Replicas) 165 // Get all services for the type rt. 166 services, err := jc.FilterServicesForReplicaType(services, rt) 167 if err != nil { 168 return err 169 } 170 171 // GetServiceSlices will return enough information here to make decision to add/remove/update resources. 172 // 173 // For example, let's assume we have services with replica-index 0, 1, 2 174 // If replica is 4, return a slice with size 4. [[0],[1],[2],[]], a svc with replica-index 3 will be created. 175 // 176 // If replica is 1, return a slice with size 3. [[0],[1],[2]], svc with replica-index 1 and 2 are out of range and will be deleted. 177 serviceSlices := jc.GetServiceSlices(services, replicas, commonutil.LoggerForReplica(job, rt)) 178 179 for index, serviceSlice := range serviceSlices { 180 if len(serviceSlice) > 1 { 181 commonutil.LoggerForReplica(job, rt).Warningf("We have too many services for %s %d", rtype, index) 182 } else if len(serviceSlice) == 0 { 183 commonutil.LoggerForReplica(job, rt).Infof("need to create new service: %s-%d", rtype, index) 184 err = jc.CreateNewService(job, rtype, spec, strconv.Itoa(index)) 185 if err != nil { 186 return err 187 } 188 } else { 189 // Check the status of the current svc. 190 svc := serviceSlice[0] 191 192 // check if the index is in the valid range, if not, we should kill the svc 193 if index < 0 || index >= replicas { 194 err = jc.ServiceControl.DeleteService(svc.Namespace, svc.Name, job.(runtime.Object)) 195 if err != nil { 196 return err 197 } 198 } 199 } 200 } 201 return nil 202 } 203 204 // GetPortsFromJob gets the ports of job container. Port could be nil, if distributed communication strategy doesn't need and no other ports that need to be exposed. 205 func (jc *JobController) GetPortsFromJob(spec *apiv1.ReplicaSpec) (map[string]int32, error) { 206 return core.GetPortsFromJob(spec, jc.Controller.GetDefaultContainerName()) 207 } 208 209 // CreateNewService creates a new service for the given index and type. 210 func (jc *JobController) CreateNewService(job metav1.Object, rtype apiv1.ReplicaType, 211 spec *apiv1.ReplicaSpec, index string) error { 212 jobKey, err := KeyFunc(job) 213 if err != nil { 214 utilruntime.HandleError(fmt.Errorf("couldn't get key for job object %#v: %v", job, err)) 215 return err 216 } 217 218 rt := strings.ToLower(string(rtype)) 219 labels := jc.GenLabels(job.GetName()) 220 utillabels.SetReplicaType(labels, rt) 221 utillabels.SetReplicaIndexStr(labels, index) 222 223 ports, err := jc.GetPortsFromJob(spec) 224 if err != nil { 225 return err 226 } 227 228 service := &v1.Service{ 229 Spec: v1.ServiceSpec{ 230 ClusterIP: "None", 231 Selector: labels, 232 Ports: []v1.ServicePort{}, 233 }, 234 } 235 236 // Add service ports to headless service 237 for name, port := range ports { 238 svcPort := v1.ServicePort{Name: name, Port: port} 239 service.Spec.Ports = append(service.Spec.Ports, svcPort) 240 } 241 242 service.Name = GenGeneralName(job.GetName(), rt, index) 243 service.Labels = labels 244 // Create OwnerReference. 245 controllerRef := jc.GenOwnerReference(job) 246 247 // Creation is expected when there is no error returned 248 expectationServicesKey := expectation.GenExpectationServicesKey(jobKey, rt) 249 jc.Expectations.RaiseExpectations(expectationServicesKey, 1, 0) 250 251 err = jc.ServiceControl.CreateServicesWithControllerRef(job.GetNamespace(), service, job.(runtime.Object), controllerRef) 252 if err != nil && errors.IsTimeout(err) { 253 // Service is created but its initialization has timed out. 254 // If the initialization is successful eventually, the 255 // controller will observe the creation via the informer. 256 // If the initialization fails, or if the service keeps 257 // uninitialized for a long time, the informer will not 258 // receive any update, and the controller will create a new 259 // service when the expectation expires. 260 succeededServiceCreationCount.Inc() 261 return nil 262 } else if err != nil { 263 // Since error occurred(the informer won't observe this service), 264 // we decrement the expected number of creates 265 // and wait until next reconciliation 266 jc.Expectations.CreationObserved(expectationServicesKey) 267 failedServiceCreationCount.Inc() 268 return err 269 } 270 succeededServiceCreationCount.Inc() 271 return nil 272 }