github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/client/serviceregistration/nsd/nsd.go (about)

     1  package nsd
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"strings"
     8  
     9  	"github.com/hashicorp/go-hclog"
    10  	"github.com/hashicorp/go-multierror"
    11  	"github.com/hashicorp/nomad/client/serviceregistration"
    12  	"github.com/hashicorp/nomad/nomad/structs"
    13  )
    14  
    15  type ServiceRegistrationHandler struct {
    16  	log hclog.Logger
    17  	cfg *ServiceRegistrationHandlerCfg
    18  
    19  	// checkWatcher watches checks of services in the Nomad service provider,
    20  	// and restarts associated tasks in accordance with their check_restart stanza.
    21  	checkWatcher serviceregistration.CheckWatcher
    22  
    23  	// registrationEnabled tracks whether this handler is enabled for
    24  	// registrations. This is needed as it's possible a client has its config
    25  	// changed whilst allocations using this provider are running on it. In
    26  	// this situation we need to be able to deregister services, but disallow
    27  	// registering new ones.
    28  	registrationEnabled bool
    29  
    30  	// shutDownCh coordinates shutting down the handler and any long-running
    31  	// processes, such as the RPC retry.
    32  	shutDownCh chan struct{}
    33  }
    34  
    35  // ServiceRegistrationHandlerCfg holds critical information used during the
    36  // normal process of the ServiceRegistrationHandler. It is used to keep the
    37  // NewServiceRegistrationHandler function signature small and easy to modify.
    38  type ServiceRegistrationHandlerCfg struct {
    39  
    40  	// Enabled tracks whether this client feature is enabled.
    41  	Enabled bool
    42  
    43  	// Datacenter, NodeID, and Region are all properties of the Nomad client
    44  	// and are used to perform RPC requests.
    45  	Datacenter string
    46  	NodeID     string
    47  	Region     string
    48  
    49  	// NodeSecret is the secret ID of the node and is used to authenticate RPC
    50  	// requests.
    51  	NodeSecret string
    52  
    53  	// RPCFn is the client RPC function which is used to perform client to
    54  	// server service registration RPC calls. This RPC function has basic retry
    55  	// functionality.
    56  	RPCFn func(method string, args, resp interface{}) error
    57  
    58  	// CheckWatcher watches checks of services in the Nomad service provider,
    59  	// and restarts associated tasks in accordance with their check_restart stanza.
    60  	CheckWatcher serviceregistration.CheckWatcher
    61  }
    62  
    63  // NewServiceRegistrationHandler returns a ready to use
    64  // ServiceRegistrationHandler which implements the serviceregistration.Handler
    65  // interface.
    66  func NewServiceRegistrationHandler(log hclog.Logger, cfg *ServiceRegistrationHandlerCfg) serviceregistration.Handler {
    67  	go cfg.CheckWatcher.Run(context.TODO())
    68  	return &ServiceRegistrationHandler{
    69  		cfg:                 cfg,
    70  		log:                 log.Named("service_registration.nomad"),
    71  		registrationEnabled: cfg.Enabled,
    72  		checkWatcher:        cfg.CheckWatcher,
    73  		shutDownCh:          make(chan struct{}),
    74  	}
    75  }
    76  
    77  func (s *ServiceRegistrationHandler) RegisterWorkload(workload *serviceregistration.WorkloadServices) error {
    78  	// Check whether we are enabled or not first. Hitting this likely means
    79  	// there is a bug within the implicit constraint, or process using it, as
    80  	// that should guard ever placing an allocation on this client.
    81  	if !s.registrationEnabled {
    82  		return errors.New(`service registration provider "nomad" not enabled`)
    83  	}
    84  
    85  	// Collect all errors generating service registrations.
    86  	var mErr multierror.Error
    87  
    88  	registrations := make([]*structs.ServiceRegistration, len(workload.Services))
    89  
    90  	// Iterate over the services and generate a hydrated registration object for
    91  	// each. All services are part of a single allocation, therefore we cannot
    92  	// have one failure without all becoming a failure.
    93  	for i, serviceSpec := range workload.Services {
    94  		serviceRegistration, err := s.generateNomadServiceRegistration(serviceSpec, workload)
    95  		if err != nil {
    96  			mErr.Errors = append(mErr.Errors, err)
    97  		} else if mErr.ErrorOrNil() == nil {
    98  			registrations[i] = serviceRegistration
    99  		}
   100  	}
   101  
   102  	// If we generated any errors, return this to the caller.
   103  	if err := mErr.ErrorOrNil(); err != nil {
   104  		return err
   105  	}
   106  
   107  	// Service registrations look ok; startup check watchers as specified. The
   108  	// astute observer may notice the services are not actually registered yet -
   109  	// this is the same as the Consul flow so hopefully things just work out.
   110  	for _, service := range workload.Services {
   111  		for _, check := range service.Checks {
   112  			if check.TriggersRestarts() {
   113  				checkID := string(structs.NomadCheckID(workload.AllocInfo.AllocID, workload.AllocInfo.Group, check))
   114  				s.checkWatcher.Watch(workload.AllocInfo.AllocID, workload.Name(), checkID, check, workload.Restarter)
   115  			}
   116  		}
   117  	}
   118  
   119  	args := structs.ServiceRegistrationUpsertRequest{
   120  		Services: registrations,
   121  		WriteRequest: structs.WriteRequest{
   122  			Region:    s.cfg.Region,
   123  			AuthToken: s.cfg.NodeSecret,
   124  		},
   125  	}
   126  
   127  	var resp structs.ServiceRegistrationUpsertResponse
   128  
   129  	return s.cfg.RPCFn(structs.ServiceRegistrationUpsertRPCMethod, &args, &resp)
   130  }
   131  
   132  // RemoveWorkload iterates the services and removes them from the service
   133  // registration state.
   134  //
   135  // This function works regardless of whether the client has this feature
   136  // enabled. This covers situations where the feature is disabled, yet still has
   137  // allocations which, when stopped need their registrations removed.
   138  func (s *ServiceRegistrationHandler) RemoveWorkload(workload *serviceregistration.WorkloadServices) {
   139  	for _, serviceSpec := range workload.Services {
   140  		go s.removeWorkload(workload, serviceSpec)
   141  	}
   142  }
   143  
   144  func (s *ServiceRegistrationHandler) removeWorkload(
   145  	workload *serviceregistration.WorkloadServices, serviceSpec *structs.Service) {
   146  
   147  	// Stop check watcher
   148  	for _, service := range workload.Services {
   149  		for _, check := range service.Checks {
   150  			checkID := string(structs.NomadCheckID(workload.AllocInfo.AllocID, workload.AllocInfo.Group, check))
   151  			s.checkWatcher.Unwatch(checkID)
   152  		}
   153  	}
   154  
   155  	// Generate the consistent ID for this service, so we know what to remove.
   156  	id := serviceregistration.MakeAllocServiceID(workload.AllocInfo.AllocID, workload.Name(), serviceSpec)
   157  
   158  	deleteArgs := structs.ServiceRegistrationDeleteByIDRequest{
   159  		ID: id,
   160  		WriteRequest: structs.WriteRequest{
   161  			Region:    s.cfg.Region,
   162  			Namespace: workload.ProviderNamespace,
   163  			AuthToken: s.cfg.NodeSecret,
   164  		},
   165  	}
   166  
   167  	var deleteResp structs.ServiceRegistrationDeleteByIDResponse
   168  
   169  	err := s.cfg.RPCFn(structs.ServiceRegistrationDeleteByIDRPCMethod, &deleteArgs, &deleteResp)
   170  	if err == nil {
   171  		return
   172  	}
   173  
   174  	// The Nomad API exposes service registration deletion to handle
   175  	// orphaned service registrations. In the event a service is removed
   176  	// accidentally that is still running, we will hit this error when we
   177  	// eventually want to remove it. We therefore want to handle this,
   178  	// while ensuring the operator can see.
   179  	if strings.Contains(err.Error(), "service registration not found") {
   180  		s.log.Info("attempted to delete non-existent service registration",
   181  			"service_id", id, "namespace", workload.ProviderNamespace)
   182  		return
   183  	}
   184  
   185  	// Log the error as there is nothing left to do, so the operator can see it
   186  	// and identify any problems.
   187  	s.log.Error("failed to delete service registration",
   188  		"error", err, "service_id", id, "namespace", workload.ProviderNamespace)
   189  }
   190  
   191  func (s *ServiceRegistrationHandler) UpdateWorkload(old, new *serviceregistration.WorkloadServices) error {
   192  
   193  	// Overwrite the workload with the deduplicated versions.
   194  	old, new = s.dedupUpdatedWorkload(old, new)
   195  
   196  	// Use the register error as an update protection and only ever deregister
   197  	// when this has completed successfully. In the event of an error, we can
   198  	// return this to the caller stack without modifying state in a weird half
   199  	// manner.
   200  	if len(new.Services) > 0 {
   201  		if err := s.RegisterWorkload(new); err != nil {
   202  			return err
   203  		}
   204  	}
   205  
   206  	if len(old.Services) > 0 {
   207  		s.RemoveWorkload(old)
   208  	}
   209  
   210  	return nil
   211  }
   212  
   213  // dedupUpdatedWorkload works through the request old and new workload to
   214  // return a deduplicated set of services.
   215  //
   216  // This is within its own function to make testing easier.
   217  func (s *ServiceRegistrationHandler) dedupUpdatedWorkload(
   218  	oldWork, newWork *serviceregistration.WorkloadServices) (
   219  	*serviceregistration.WorkloadServices, *serviceregistration.WorkloadServices) {
   220  
   221  	// Create copies of the old and new workload services. These specifically
   222  	// ignore the services array so this can be populated as the function
   223  	// decides what is needed.
   224  	oldCopy := oldWork.Copy()
   225  	oldCopy.Services = make([]*structs.Service, 0)
   226  
   227  	newCopy := newWork.Copy()
   228  	newCopy.Services = make([]*structs.Service, 0)
   229  
   230  	// Generate and populate a mapping of the new service registration IDs.
   231  	newIDs := make(map[string]*structs.Service, len(newWork.Services))
   232  
   233  	for _, s := range newWork.Services {
   234  		newIDs[serviceregistration.MakeAllocServiceID(newWork.AllocInfo.AllocID, newWork.Name(), s)] = s
   235  	}
   236  
   237  	// Iterate through the old services in order to identify whether they can
   238  	// be modified solely via upsert, or whether they need to be deleted.
   239  	for _, oldService := range oldWork.Services {
   240  
   241  		// Generate the service ID of the old service. If this is not found
   242  		// within the new mapping then we need to remove it.
   243  		oldID := serviceregistration.MakeAllocServiceID(oldWork.AllocInfo.AllocID, oldWork.Name(), oldService)
   244  		newSvc, ok := newIDs[oldID]
   245  		if !ok {
   246  			oldCopy.Services = append(oldCopy.Services, oldService)
   247  			continue
   248  		}
   249  
   250  		// Add the new service into the array for upserting and remove its
   251  		// entry for the map. Doing it here is efficient as we are already
   252  		// inside a loop.
   253  		//
   254  		// There isn't much point in hashing the old/new services as we would
   255  		// still need to ensure the service has previously been registered
   256  		// before discarding it from future RPC calls. The Nomad state handles
   257  		// performing the diff gracefully, therefore this will still be a
   258  		// single RPC.
   259  		newCopy.Services = append(newCopy.Services, newSvc)
   260  		delete(newIDs, oldID)
   261  	}
   262  
   263  	// Iterate the remaining new IDs to add them to the registration array. It
   264  	// catches any that didn't get added via the previous loop.
   265  	for _, newSvc := range newIDs {
   266  		newCopy.Services = append(newCopy.Services, newSvc)
   267  	}
   268  
   269  	return oldCopy, newCopy
   270  }
   271  
   272  // AllocRegistrations is currently a noop implementation as the Nomad provider
   273  // does not support health check which is the sole subsystem caller of this
   274  // function.
   275  func (s *ServiceRegistrationHandler) AllocRegistrations(_ string) (*serviceregistration.AllocRegistration, error) {
   276  	return nil, nil
   277  }
   278  
   279  // UpdateTTL is currently a noop implementation as the Nomad provider does not
   280  // support health check which is the sole subsystem caller of this function.
   281  func (s *ServiceRegistrationHandler) UpdateTTL(_, _, _, _ string) error {
   282  	return nil
   283  }
   284  
   285  // Shutdown is used to initiate shutdown of the handler. This is specifically
   286  // used to exit any routines running retry functions without leaving them
   287  // orphaned.
   288  func (s *ServiceRegistrationHandler) Shutdown() { close(s.shutDownCh) }
   289  
   290  // generateNomadServiceRegistration is a helper to build the Nomad specific
   291  // registration object on a per-service basis.
   292  func (s *ServiceRegistrationHandler) generateNomadServiceRegistration(
   293  	serviceSpec *structs.Service, workload *serviceregistration.WorkloadServices) (*structs.ServiceRegistration, error) {
   294  
   295  	// Service address modes default to auto.
   296  	addrMode := serviceSpec.AddressMode
   297  	if addrMode == "" {
   298  		addrMode = structs.AddressModeAuto
   299  	}
   300  
   301  	// Determine the address to advertise based on the mode.
   302  	ip, port, err := serviceregistration.GetAddress(
   303  		serviceSpec.Address, addrMode, serviceSpec.PortLabel, workload.Networks,
   304  		workload.DriverNetwork, workload.Ports, workload.NetworkStatus)
   305  	if err != nil {
   306  		return nil, fmt.Errorf("unable to get address for service %q: %v", serviceSpec.Name, err)
   307  	}
   308  
   309  	// Build the tags to use for this registration which is a result of whether
   310  	// this is a canary, or not.
   311  	var tags []string
   312  
   313  	if workload.Canary && len(serviceSpec.CanaryTags) > 0 {
   314  		tags = make([]string, len(serviceSpec.CanaryTags))
   315  		copy(tags, serviceSpec.CanaryTags)
   316  	} else {
   317  		tags = make([]string, len(serviceSpec.Tags))
   318  		copy(tags, serviceSpec.Tags)
   319  	}
   320  
   321  	return &structs.ServiceRegistration{
   322  		ID:          serviceregistration.MakeAllocServiceID(workload.AllocInfo.AllocID, workload.Name(), serviceSpec),
   323  		ServiceName: serviceSpec.Name,
   324  		NodeID:      s.cfg.NodeID,
   325  		JobID:       workload.AllocInfo.JobID,
   326  		AllocID:     workload.AllocInfo.AllocID,
   327  		Namespace:   workload.ProviderNamespace,
   328  		Datacenter:  s.cfg.Datacenter,
   329  		Tags:        tags,
   330  		Address:     ip,
   331  		Port:        port,
   332  	}, nil
   333  }