github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/client/allocrunner/checks_hook.go (about)

     1  package allocrunner
     2  
     3  import (
     4  	"context"
     5  	"sync"
     6  	"time"
     7  
     8  	"github.com/hashicorp/go-hclog"
     9  	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
    10  	"github.com/hashicorp/nomad/client/serviceregistration/checks"
    11  	"github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore"
    12  	"github.com/hashicorp/nomad/helper"
    13  	"github.com/hashicorp/nomad/nomad/structs"
    14  )
    15  
const (
	// checksHookName is the name of this hook as it appears in logs.
	checksHookName = "checks_hook"
)
    20  
// observers maintains a map from check_id -> observer for a particular check. Each
// observer in the map must share the same context.
type observers map[structs.CheckID]*observer

// An observer is used to execute a particular check on its interval and update the
// check store with those results.
type observer struct {
	ctx        context.Context    // canceled via cancel to end the check loop
	cancel     context.CancelFunc // invoked by stop(); also interrupts an in-flight check
	checker    checks.Checker     // executes the actual check query
	checkStore checkstore.Shim    // client state store where results are recorded

	qc      *checks.QueryContext  // static query metadata for this check
	check   *structs.ServiceCheck // the check definition (a private copy)
	allocID string                // ID of the allocation that owns the check
}
    37  
    38  // start checking our check on its interval
    39  func (o *observer) start() {
    40  	// compromise between immediate (too early) and waiting full interval (slow)
    41  	firstWait := o.check.Interval / 2
    42  
    43  	timer, cancel := helper.NewSafeTimer(firstWait)
    44  	defer cancel()
    45  
    46  	for {
    47  		select {
    48  
    49  		// exit the observer
    50  		case <-o.ctx.Done():
    51  			return
    52  
    53  		// time to execute the check
    54  		case <-timer.C:
    55  			query := checks.GetCheckQuery(o.check)
    56  			result := o.checker.Do(o.ctx, o.qc, query)
    57  
    58  			// and put the results into the store (already logged)
    59  			_ = o.checkStore.Set(o.allocID, result)
    60  
    61  			// setup timer for next interval
    62  			timer.Reset(o.check.Interval)
    63  		}
    64  	}
    65  }
    66  
// stop checking our check - canceling the context will also interrupt an
// in-progress execution of the check.
func (o *observer) stop() {
	o.cancel()
}
    71  
// checksHook manages checks of Nomad service registrations, at both the group and
// task level, by storing / removing them from the Client state store.
//
// Does not manage Consul service checks; see groupServiceHook instead.
type checksHook struct {
	logger  hclog.Logger          // hook-scoped logger
	network structs.NetworkStatus // provides network status for check query contexts
	shim    checkstore.Shim       // client state store for check results
	checker checks.Checker        // executes check queries
	allocID string                // ID of the allocation this hook belongs to

	// fields that get re-initialized on allocation update
	lock      sync.RWMutex
	ctx       context.Context     // parent context shared by all observers
	stop      func()              // cancels ctx, stopping every observer
	observers observers           // one observer per active check
	alloc     *structs.Allocation // the current version of the allocation
}
    90  
    91  func newChecksHook(
    92  	logger hclog.Logger,
    93  	alloc *structs.Allocation,
    94  	shim checkstore.Shim,
    95  	network structs.NetworkStatus,
    96  ) *checksHook {
    97  	h := &checksHook{
    98  		logger:  logger.Named(checksHookName),
    99  		allocID: alloc.ID,
   100  		alloc:   alloc,
   101  		shim:    shim,
   102  		network: network,
   103  		checker: checks.New(logger),
   104  	}
   105  	h.initialize(alloc)
   106  	return h
   107  }
   108  
   109  // initialize the dynamic fields of checksHook, which is to say setup all the
   110  // observers and query context things associated with the alloc.
   111  //
   112  // Should be called during initial setup only.
   113  func (h *checksHook) initialize(alloc *structs.Allocation) {
   114  	h.lock.Lock()
   115  	defer h.lock.Unlock()
   116  
   117  	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
   118  	if tg == nil {
   119  		return
   120  	}
   121  
   122  	// fresh context and stop function for this allocation
   123  	h.ctx, h.stop = context.WithCancel(context.Background())
   124  
   125  	// fresh set of observers
   126  	h.observers = make(observers)
   127  
   128  	// set the initial alloc
   129  	h.alloc = alloc
   130  }
   131  
   132  // observe will create the observer for each service in services.
   133  // services must use only nomad service provider.
   134  //
   135  // Caller must hold h.lock.
   136  func (h *checksHook) observe(alloc *structs.Allocation, services []*structs.Service) {
   137  	var ports structs.AllocatedPorts
   138  	var networks structs.Networks
   139  	if alloc.AllocatedResources != nil {
   140  		ports = alloc.AllocatedResources.Shared.Ports
   141  		networks = alloc.AllocatedResources.Shared.Networks
   142  	}
   143  
   144  	for _, service := range services {
   145  		for _, check := range service.Checks {
   146  
   147  			// remember the initialization time
   148  			now := time.Now().UTC().Unix()
   149  
   150  			// create the deterministic check id for this check
   151  			id := structs.NomadCheckID(alloc.ID, alloc.TaskGroup, check)
   152  
   153  			// an observer for this check already exists
   154  			if _, exists := h.observers[id]; exists {
   155  				continue
   156  			}
   157  
   158  			ctx, cancel := context.WithCancel(h.ctx)
   159  
   160  			// create the observer for this check
   161  			h.observers[id] = &observer{
   162  				ctx:        ctx,
   163  				cancel:     cancel,
   164  				check:      check.Copy(),
   165  				checkStore: h.shim,
   166  				checker:    h.checker,
   167  				allocID:    h.allocID,
   168  				qc: &checks.QueryContext{
   169  					ID:               id,
   170  					CustomAddress:    service.Address,
   171  					ServicePortLabel: service.PortLabel,
   172  					Ports:            ports,
   173  					Networks:         networks,
   174  					NetworkStatus:    h.network,
   175  					Group:            alloc.Name,
   176  					Task:             service.TaskName,
   177  					Service:          service.Name,
   178  					Check:            check.Name,
   179  				},
   180  			}
   181  
   182  			// insert a pending result into state store for each check
   183  			result := checks.Stub(id, structs.GetCheckMode(check), now, alloc.Name, service.TaskName, service.Name, check.Name)
   184  			if err := h.shim.Set(h.allocID, result); err != nil {
   185  				h.logger.Error("failed to set initial check status", "id", h.allocID, "error", err)
   186  				continue
   187  			}
   188  
   189  			// start the observer
   190  			go h.observers[id].start()
   191  		}
   192  	}
   193  }
   194  
// Name returns the name of this hook as it appears in logs.
func (h *checksHook) Name() string {
	return checksHookName
}
   198  
   199  func (h *checksHook) Prerun() error {
   200  	h.lock.Lock()
   201  	defer h.lock.Unlock()
   202  
   203  	group := h.alloc.Job.LookupTaskGroup(h.alloc.TaskGroup)
   204  	if group == nil {
   205  		return nil
   206  	}
   207  
   208  	// create and start observers of nomad service checks in alloc
   209  	h.observe(h.alloc, group.NomadServices())
   210  
   211  	return nil
   212  }
   213  
   214  func (h *checksHook) Update(request *interfaces.RunnerUpdateRequest) error {
   215  	h.lock.Lock()
   216  	defer h.lock.Unlock()
   217  
   218  	group := request.Alloc.Job.LookupTaskGroup(request.Alloc.TaskGroup)
   219  	if group == nil {
   220  		return nil
   221  	}
   222  
   223  	// get all group and task level services using nomad provider
   224  	services := group.NomadServices()
   225  
   226  	// create a set of the updated set of checks
   227  	next := make([]structs.CheckID, 0, len(h.observers))
   228  	for _, service := range services {
   229  		for _, check := range service.Checks {
   230  			next = append(next, structs.NomadCheckID(
   231  				request.Alloc.ID,
   232  				request.Alloc.TaskGroup,
   233  				check,
   234  			))
   235  		}
   236  	}
   237  
   238  	// stop the observers of the checks we are removing
   239  	remove := h.shim.Difference(request.Alloc.ID, next)
   240  	for _, id := range remove {
   241  		h.observers[id].stop()
   242  		delete(h.observers, id)
   243  	}
   244  
   245  	// remove checks that are no longer part of the allocation
   246  	if err := h.shim.Remove(request.Alloc.ID, remove); err != nil {
   247  		return err
   248  	}
   249  
   250  	// remember this new alloc
   251  	h.alloc = request.Alloc
   252  
   253  	// ensure we are observing new checks (idempotent)
   254  	h.observe(request.Alloc, services)
   255  
   256  	return nil
   257  }
   258  
   259  func (h *checksHook) PreKill() {
   260  	h.lock.Lock()
   261  	defer h.lock.Unlock()
   262  
   263  	// terminate our hook context, which threads down to all observers
   264  	h.stop()
   265  
   266  	// purge all checks for this allocation from the client state store
   267  	if err := h.shim.Purge(h.allocID); err != nil {
   268  		h.logger.Error("failed to purge check results", "alloc_id", h.allocID, "error", err)
   269  	}
   270  }