github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/client/serviceregistration/nsd/nsd.go (about) 1 package nsd 2 3 import ( 4 "context" 5 "errors" 6 "fmt" 7 "strings" 8 9 "github.com/hashicorp/go-hclog" 10 "github.com/hashicorp/go-multierror" 11 "github.com/hashicorp/nomad/client/serviceregistration" 12 "github.com/hashicorp/nomad/nomad/structs" 13 ) 14 15 type ServiceRegistrationHandler struct { 16 log hclog.Logger 17 cfg *ServiceRegistrationHandlerCfg 18 19 // checkWatcher watches checks of services in the Nomad service provider, 20 // and restarts associated tasks in accordance with their check_restart stanza. 21 checkWatcher serviceregistration.CheckWatcher 22 23 // registrationEnabled tracks whether this handler is enabled for 24 // registrations. This is needed as it's possible a client has its config 25 // changed whilst allocations using this provider are running on it. In 26 // this situation we need to be able to deregister services, but disallow 27 // registering new ones. 28 registrationEnabled bool 29 30 // shutDownCh coordinates shutting down the handler and any long-running 31 // processes, such as the RPC retry. 32 shutDownCh chan struct{} 33 } 34 35 // ServiceRegistrationHandlerCfg holds critical information used during the 36 // normal process of the ServiceRegistrationHandler. It is used to keep the 37 // NewServiceRegistrationHandler function signature small and easy to modify. 38 type ServiceRegistrationHandlerCfg struct { 39 40 // Enabled tracks whether this client feature is enabled. 41 Enabled bool 42 43 // Datacenter, NodeID, and Region are all properties of the Nomad client 44 // and are used to perform RPC requests. 45 Datacenter string 46 NodeID string 47 Region string 48 49 // NodeSecret is the secret ID of the node and is used to authenticate RPC 50 // requests. 51 NodeSecret string 52 53 // RPCFn is the client RPC function which is used to perform client to 54 // server service registration RPC calls. This RPC function has basic retry 55 // functionality. 56 RPCFn func(method string, args, resp interface{}) error 57 58 // CheckWatcher watches checks of services in the Nomad service provider, 59 // and restarts associated tasks in accordance with their check_restart stanza. 60 CheckWatcher serviceregistration.CheckWatcher 61 } 62 63 // NewServiceRegistrationHandler returns a ready to use 64 // ServiceRegistrationHandler which implements the serviceregistration.Handler 65 // interface. 66 func NewServiceRegistrationHandler(log hclog.Logger, cfg *ServiceRegistrationHandlerCfg) serviceregistration.Handler { 67 go cfg.CheckWatcher.Run(context.TODO()) 68 return &ServiceRegistrationHandler{ 69 cfg: cfg, 70 log: log.Named("service_registration.nomad"), 71 registrationEnabled: cfg.Enabled, 72 checkWatcher: cfg.CheckWatcher, 73 shutDownCh: make(chan struct{}), 74 } 75 } 76 77 func (s *ServiceRegistrationHandler) RegisterWorkload(workload *serviceregistration.WorkloadServices) error { 78 // Check whether we are enabled or not first. Hitting this likely means 79 // there is a bug within the implicit constraint, or process using it, as 80 // that should guard ever placing an allocation on this client. 81 if !s.registrationEnabled { 82 return errors.New(`service registration provider "nomad" not enabled`) 83 } 84 85 // Collect all errors generating service registrations. 86 var mErr multierror.Error 87 88 registrations := make([]*structs.ServiceRegistration, len(workload.Services)) 89 90 // Iterate over the services and generate a hydrated registration object for 91 // each. All services are part of a single allocation, therefore we cannot 92 // have one failure without all becoming a failure. 93 for i, serviceSpec := range workload.Services { 94 serviceRegistration, err := s.generateNomadServiceRegistration(serviceSpec, workload) 95 if err != nil { 96 mErr.Errors = append(mErr.Errors, err) 97 } else if mErr.ErrorOrNil() == nil { 98 registrations[i] = serviceRegistration 99 } 100 } 101 102 // If we generated any errors, return this to the caller. 103 if err := mErr.ErrorOrNil(); err != nil { 104 return err 105 } 106 107 // Service registrations look ok; startup check watchers as specified. The 108 // astute observer may notice the services are not actually registered yet - 109 // this is the same as the Consul flow so hopefully things just work out. 110 for _, service := range workload.Services { 111 for _, check := range service.Checks { 112 if check.TriggersRestarts() { 113 checkID := string(structs.NomadCheckID(workload.AllocInfo.AllocID, workload.AllocInfo.Group, check)) 114 s.checkWatcher.Watch(workload.AllocInfo.AllocID, workload.Name(), checkID, check, workload.Restarter) 115 } 116 } 117 } 118 119 args := structs.ServiceRegistrationUpsertRequest{ 120 Services: registrations, 121 WriteRequest: structs.WriteRequest{ 122 Region: s.cfg.Region, 123 AuthToken: s.cfg.NodeSecret, 124 }, 125 } 126 127 var resp structs.ServiceRegistrationUpsertResponse 128 129 return s.cfg.RPCFn(structs.ServiceRegistrationUpsertRPCMethod, &args, &resp) 130 } 131 132 // RemoveWorkload iterates the services and removes them from the service 133 // registration state. 134 // 135 // This function works regardless of whether the client has this feature 136 // enabled. This covers situations where the feature is disabled, yet still has 137 // allocations which, when stopped need their registrations removed. 138 func (s *ServiceRegistrationHandler) RemoveWorkload(workload *serviceregistration.WorkloadServices) { 139 for _, serviceSpec := range workload.Services { 140 go s.removeWorkload(workload, serviceSpec) 141 } 142 } 143 144 func (s *ServiceRegistrationHandler) removeWorkload( 145 workload *serviceregistration.WorkloadServices, serviceSpec *structs.Service) { 146 147 // Stop check watcher 148 for _, service := range workload.Services { 149 for _, check := range service.Checks { 150 checkID := string(structs.NomadCheckID(workload.AllocInfo.AllocID, workload.AllocInfo.Group, check)) 151 s.checkWatcher.Unwatch(checkID) 152 } 153 } 154 155 // Generate the consistent ID for this service, so we know what to remove. 156 id := serviceregistration.MakeAllocServiceID(workload.AllocInfo.AllocID, workload.Name(), serviceSpec) 157 158 deleteArgs := structs.ServiceRegistrationDeleteByIDRequest{ 159 ID: id, 160 WriteRequest: structs.WriteRequest{ 161 Region: s.cfg.Region, 162 Namespace: workload.ProviderNamespace, 163 AuthToken: s.cfg.NodeSecret, 164 }, 165 } 166 167 var deleteResp structs.ServiceRegistrationDeleteByIDResponse 168 169 err := s.cfg.RPCFn(structs.ServiceRegistrationDeleteByIDRPCMethod, &deleteArgs, &deleteResp) 170 if err == nil { 171 return 172 } 173 174 // The Nomad API exposes service registration deletion to handle 175 // orphaned service registrations. In the event a service is removed 176 // accidentally that is still running, we will hit this error when we 177 // eventually want to remove it. We therefore want to handle this, 178 // while ensuring the operator can see. 179 if strings.Contains(err.Error(), "service registration not found") { 180 s.log.Info("attempted to delete non-existent service registration", 181 "service_id", id, "namespace", workload.ProviderNamespace) 182 return 183 } 184 185 // Log the error as there is nothing left to do, so the operator can see it 186 // and identify any problems. 187 s.log.Error("failed to delete service registration", 188 "error", err, "service_id", id, "namespace", workload.ProviderNamespace) 189 } 190 191 func (s *ServiceRegistrationHandler) UpdateWorkload(old, new *serviceregistration.WorkloadServices) error { 192 193 // Overwrite the workload with the deduplicated versions. 194 old, new = s.dedupUpdatedWorkload(old, new) 195 196 // Use the register error as an update protection and only ever deregister 197 // when this has completed successfully. In the event of an error, we can 198 // return this to the caller stack without modifying state in a weird half 199 // manner. 200 if len(new.Services) > 0 { 201 if err := s.RegisterWorkload(new); err != nil { 202 return err 203 } 204 } 205 206 if len(old.Services) > 0 { 207 s.RemoveWorkload(old) 208 } 209 210 return nil 211 } 212 213 // dedupUpdatedWorkload works through the request old and new workload to 214 // return a deduplicated set of services. 215 // 216 // This is within its own function to make testing easier. 217 func (s *ServiceRegistrationHandler) dedupUpdatedWorkload( 218 oldWork, newWork *serviceregistration.WorkloadServices) ( 219 *serviceregistration.WorkloadServices, *serviceregistration.WorkloadServices) { 220 221 // Create copies of the old and new workload services. These specifically 222 // ignore the services array so this can be populated as the function 223 // decides what is needed. 224 oldCopy := oldWork.Copy() 225 oldCopy.Services = make([]*structs.Service, 0) 226 227 newCopy := newWork.Copy() 228 newCopy.Services = make([]*structs.Service, 0) 229 230 // Generate and populate a mapping of the new service registration IDs. 231 newIDs := make(map[string]*structs.Service, len(newWork.Services)) 232 233 for _, s := range newWork.Services { 234 newIDs[serviceregistration.MakeAllocServiceID(newWork.AllocInfo.AllocID, newWork.Name(), s)] = s 235 } 236 237 // Iterate through the old services in order to identify whether they can 238 // be modified solely via upsert, or whether they need to be deleted. 239 for _, oldService := range oldWork.Services { 240 241 // Generate the service ID of the old service. If this is not found 242 // within the new mapping then we need to remove it. 243 oldID := serviceregistration.MakeAllocServiceID(oldWork.AllocInfo.AllocID, oldWork.Name(), oldService) 244 newSvc, ok := newIDs[oldID] 245 if !ok { 246 oldCopy.Services = append(oldCopy.Services, oldService) 247 continue 248 } 249 250 // Add the new service into the array for upserting and remove its 251 // entry for the map. Doing it here is efficient as we are already 252 // inside a loop. 253 // 254 // There isn't much point in hashing the old/new services as we would 255 // still need to ensure the service has previously been registered 256 // before discarding it from future RPC calls. The Nomad state handles 257 // performing the diff gracefully, therefore this will still be a 258 // single RPC. 259 newCopy.Services = append(newCopy.Services, newSvc) 260 delete(newIDs, oldID) 261 } 262 263 // Iterate the remaining new IDs to add them to the registration array. It 264 // catches any that didn't get added via the previous loop. 265 for _, newSvc := range newIDs { 266 newCopy.Services = append(newCopy.Services, newSvc) 267 } 268 269 return oldCopy, newCopy 270 } 271 272 // AllocRegistrations is currently a noop implementation as the Nomad provider 273 // does not support health check which is the sole subsystem caller of this 274 // function. 275 func (s *ServiceRegistrationHandler) AllocRegistrations(_ string) (*serviceregistration.AllocRegistration, error) { 276 return nil, nil 277 } 278 279 // UpdateTTL is currently a noop implementation as the Nomad provider does not 280 // support health check which is the sole subsystem caller of this function. 281 func (s *ServiceRegistrationHandler) UpdateTTL(_, _, _, _ string) error { 282 return nil 283 } 284 285 // Shutdown is used to initiate shutdown of the handler. This is specifically 286 // used to exit any routines running retry functions without leaving them 287 // orphaned. 288 func (s *ServiceRegistrationHandler) Shutdown() { close(s.shutDownCh) } 289 290 // generateNomadServiceRegistration is a helper to build the Nomad specific 291 // registration object on a per-service basis. 292 func (s *ServiceRegistrationHandler) generateNomadServiceRegistration( 293 serviceSpec *structs.Service, workload *serviceregistration.WorkloadServices) (*structs.ServiceRegistration, error) { 294 295 // Service address modes default to auto. 296 addrMode := serviceSpec.AddressMode 297 if addrMode == "" { 298 addrMode = structs.AddressModeAuto 299 } 300 301 // Determine the address to advertise based on the mode. 302 ip, port, err := serviceregistration.GetAddress( 303 serviceSpec.Address, addrMode, serviceSpec.PortLabel, workload.Networks, 304 workload.DriverNetwork, workload.Ports, workload.NetworkStatus) 305 if err != nil { 306 return nil, fmt.Errorf("unable to get address for service %q: %v", serviceSpec.Name, err) 307 } 308 309 // Build the tags to use for this registration which is a result of whether 310 // this is a canary, or not. 311 var tags []string 312 313 if workload.Canary && len(serviceSpec.CanaryTags) > 0 { 314 tags = make([]string, len(serviceSpec.CanaryTags)) 315 copy(tags, serviceSpec.CanaryTags) 316 } else { 317 tags = make([]string, len(serviceSpec.Tags)) 318 copy(tags, serviceSpec.Tags) 319 } 320 321 return &structs.ServiceRegistration{ 322 ID: serviceregistration.MakeAllocServiceID(workload.AllocInfo.AllocID, workload.Name(), serviceSpec), 323 ServiceName: serviceSpec.Name, 324 NodeID: s.cfg.NodeID, 325 JobID: workload.AllocInfo.JobID, 326 AllocID: workload.AllocInfo.AllocID, 327 Namespace: workload.ProviderNamespace, 328 Datacenter: s.cfg.Datacenter, 329 Tags: tags, 330 Address: ip, 331 Port: port, 332 }, nil 333 }