github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/client/allocrunner/taskrunner/script_check_hook.go

github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/client/allocrunner/taskrunner/script_check_hook.go (about)

     1  package taskrunner
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"sync"
     7  	"time"
     8  
     9  	"github.com/hashicorp/consul/api"
    10  	log "github.com/hashicorp/go-hclog"
    11  	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
    12  	tinterfaces "github.com/hashicorp/nomad/client/allocrunner/taskrunner/interfaces"
    13  	"github.com/hashicorp/nomad/client/consul"
    14  	"github.com/hashicorp/nomad/client/taskenv"
    15  	agentconsul "github.com/hashicorp/nomad/command/agent/consul"
    16  	"github.com/hashicorp/nomad/nomad/structs"
    17  )
    18  
    19  var _ interfaces.TaskPoststartHook = &scriptCheckHook{}
    20  var _ interfaces.TaskUpdateHook = &scriptCheckHook{}
    21  var _ interfaces.TaskStopHook = &scriptCheckHook{}
    22  
    23  // default max amount of time to wait for all scripts on shutdown.
    24  const defaultShutdownWait = time.Minute
    25  
    26  type scriptCheckHookConfig struct {
    27  	alloc        *structs.Allocation
    28  	task         *structs.Task
    29  	consul       consul.ConsulServiceAPI
    30  	logger       log.Logger
    31  	shutdownWait time.Duration
    32  }
    33  
    34  // scriptCheckHook implements a task runner hook for running script
    35  // checks in the context of a task
    36  type scriptCheckHook struct {
    37  	consul       consul.ConsulServiceAPI
    38  	alloc        *structs.Allocation
    39  	task         *structs.Task
    40  	logger       log.Logger
    41  	shutdownWait time.Duration // max time to wait for scripts to shutdown
    42  	shutdownCh   chan struct{} // closed when all scripts should shutdown
    43  
    44  	// The following fields can be changed by Update()
    45  	driverExec tinterfaces.ScriptExecutor
    46  	taskEnv    *taskenv.TaskEnv
    47  
    48  	// These maintain state and are populated by Poststart() or Update()
    49  	scripts        map[string]*scriptCheck
    50  	runningScripts map[string]*taskletHandle
    51  
    52  	// Since Update() may be called concurrently with any other hook all
    53  	// hook methods must be fully serialized
    54  	mu sync.Mutex
    55  }
    56  
    57  // newScriptCheckHook returns a hook without any scriptChecks.
    58  // They will get created only once their task environment is ready
    59  // in Poststart() or Update()
    60  func newScriptCheckHook(c scriptCheckHookConfig) *scriptCheckHook {
    61  	h := &scriptCheckHook{
    62  		consul:         c.consul,
    63  		alloc:          c.alloc,
    64  		task:           c.task,
    65  		scripts:        make(map[string]*scriptCheck),
    66  		runningScripts: make(map[string]*taskletHandle),
    67  		shutdownWait:   defaultShutdownWait,
    68  		shutdownCh:     make(chan struct{}),
    69  	}
    70  
    71  	if c.shutdownWait != 0 {
    72  		h.shutdownWait = c.shutdownWait // override for testing
    73  	}
    74  	h.logger = c.logger.Named(h.Name())
    75  	return h
    76  }
    77  
    78  func (h *scriptCheckHook) Name() string {
    79  	return "script_checks"
    80  }
    81  
    82  // Prestart implements interfaces.TaskPrestartHook. It stores the
    83  // initial structs.Task
    84  func (h *scriptCheckHook) Prestart(ctx context.Context, req *interfaces.TaskPrestartRequest, _ *interfaces.TaskPrestartResponse) error {
    85  	h.mu.Lock()
    86  	defer h.mu.Unlock()
    87  	h.task = req.Task
    88  	return nil
    89  }
    90  
    91  // PostStart implements interfaces.TaskPoststartHook. It creates new
    92  // script checks with the current task context (driver and env), and
    93  // starts up the scripts.
    94  func (h *scriptCheckHook) Poststart(ctx context.Context, req *interfaces.TaskPoststartRequest, _ *interfaces.TaskPoststartResponse) error {
    95  	h.mu.Lock()
    96  	defer h.mu.Unlock()
    97  
    98  	if req.DriverExec == nil {
    99  		h.logger.Debug("driver doesn't support script checks")
   100  		return nil
   101  	}
   102  	h.driverExec = req.DriverExec
   103  	h.taskEnv = req.TaskEnv
   104  
   105  	return h.upsertChecks()
   106  }
   107  
   108  // Updated implements interfaces.TaskUpdateHook. It creates new
   109  // script checks with the current task context (driver and env and possibly
   110  // new structs.Task), and starts up the scripts.
   111  func (h *scriptCheckHook) Update(ctx context.Context, req *interfaces.TaskUpdateRequest, _ *interfaces.TaskUpdateResponse) error {
   112  	h.mu.Lock()
   113  	defer h.mu.Unlock()
   114  
   115  	task := req.Alloc.LookupTask(h.task.Name)
   116  	if task == nil {
   117  		return fmt.Errorf("task %q not found in updated alloc", h.task.Name)
   118  	}
   119  	h.alloc = req.Alloc
   120  	h.task = task
   121  	h.taskEnv = req.TaskEnv
   122  
   123  	return h.upsertChecks()
   124  }
   125  
   126  func (h *scriptCheckHook) upsertChecks() error {
   127  	// Create new script checks struct with new task context
   128  	oldScriptChecks := h.scripts
   129  	h.scripts = h.newScriptChecks()
   130  
   131  	// Run new or replacement scripts
   132  	for id, script := range h.scripts {
   133  		// If it's already running, cancel and replace
   134  		if oldScript, running := h.runningScripts[id]; running {
   135  			oldScript.cancel()
   136  		}
   137  		// Start and store the handle
   138  		h.runningScripts[id] = script.run()
   139  	}
   140  
   141  	// Cancel scripts we no longer want
   142  	for id := range oldScriptChecks {
   143  		if _, ok := h.scripts[id]; !ok {
   144  			if oldScript, running := h.runningScripts[id]; running {
   145  				oldScript.cancel()
   146  			}
   147  		}
   148  	}
   149  	return nil
   150  }
   151  
   152  // Stop implements interfaces.TaskStopHook and blocks waiting for running
   153  // scripts to finish (or for the shutdownWait timeout to expire).
   154  func (h *scriptCheckHook) Stop(ctx context.Context, req *interfaces.TaskStopRequest, resp *interfaces.TaskStopResponse) error {
   155  	h.mu.Lock()
   156  	defer h.mu.Unlock()
   157  	close(h.shutdownCh)
   158  	deadline := time.After(h.shutdownWait)
   159  	err := fmt.Errorf("timed out waiting for script checks to exit")
   160  	for _, script := range h.runningScripts {
   161  		select {
   162  		case <-script.wait():
   163  		case <-ctx.Done():
   164  			// the caller is passing the background context, so
   165  			// we should never really see this outside of testing
   166  		case <-deadline:
   167  			// at this point the Consul client has been cleaned
   168  			// up so we don't want to hang onto this.
   169  			return err
   170  		}
   171  	}
   172  	return nil
   173  }
   174  
   175  func (h *scriptCheckHook) newScriptChecks() map[string]*scriptCheck {
   176  	scriptChecks := make(map[string]*scriptCheck)
   177  	interpolatedTaskServices := taskenv.InterpolateServices(h.taskEnv, h.task.Services)
   178  	for _, service := range interpolatedTaskServices {
   179  		for _, check := range service.Checks {
   180  			if check.Type != structs.ServiceCheckScript {
   181  				continue
   182  			}
   183  			serviceID := agentconsul.MakeAllocServiceID(
   184  				h.alloc.ID, h.task.Name, service)
   185  			sc := newScriptCheck(&scriptCheckConfig{
   186  				allocID:    h.alloc.ID,
   187  				taskName:   h.task.Name,
   188  				check:      check,
   189  				serviceID:  serviceID,
   190  				agent:      h.consul,
   191  				driverExec: h.driverExec,
   192  				taskEnv:    h.taskEnv,
   193  				logger:     h.logger,
   194  				shutdownCh: h.shutdownCh,
   195  			})
   196  			if sc != nil {
   197  				scriptChecks[sc.id] = sc
   198  			}
   199  		}
   200  	}
   201  
   202  	// Walk back through the task group to see if there are script checks
   203  	// associated with the task. If so, we'll create scriptCheck tasklets
   204  	// for them. The group-level service and any check restart behaviors it
   205  	// needs are entirely encapsulated within the group service hook which
   206  	// watches Consul for status changes.
   207  	//
   208  	// The script check is associated with a group task if the service.task or
   209  	// service.check.task matches the task name. The service.check.task takes
   210  	// precedence.
   211  	tg := h.alloc.Job.LookupTaskGroup(h.alloc.TaskGroup)
   212  	interpolatedGroupServices := taskenv.InterpolateServices(h.taskEnv, tg.Services)
   213  	for _, service := range interpolatedGroupServices {
   214  		for _, check := range service.Checks {
   215  			if check.Type != structs.ServiceCheckScript {
   216  				continue
   217  			}
   218  			if !h.associated(h.task.Name, service.TaskName, check.TaskName) {
   219  				continue
   220  			}
   221  			groupTaskName := "group-" + tg.Name
   222  			serviceID := agentconsul.MakeAllocServiceID(
   223  				h.alloc.ID, groupTaskName, service)
   224  			sc := newScriptCheck(&scriptCheckConfig{
   225  				allocID:    h.alloc.ID,
   226  				taskName:   groupTaskName,
   227  				check:      check,
   228  				serviceID:  serviceID,
   229  				agent:      h.consul,
   230  				driverExec: h.driverExec,
   231  				taskEnv:    h.taskEnv,
   232  				logger:     h.logger,
   233  				shutdownCh: h.shutdownCh,
   234  				isGroup:    true,
   235  			})
   236  			if sc != nil {
   237  				scriptChecks[sc.id] = sc
   238  			}
   239  		}
   240  	}
   241  	return scriptChecks
   242  }
   243  
   244  // associated returns true if the script check is associated with the task. This
   245  // would be the case if the check.task is the same as task, or if the service.task
   246  // is the same as the task _and_ check.task is not configured (i.e. the check
   247  // inherits the task of the service).
   248  func (*scriptCheckHook) associated(task, serviceTask, checkTask string) bool {
   249  	if checkTask == task {
   250  		return true
   251  	}
   252  	if serviceTask == task && checkTask == "" {
   253  		return true
   254  	}
   255  	return false
   256  }
   257  
   258  // heartbeater is the subset of consul agent functionality needed by script
   259  // checks to heartbeat
   260  type heartbeater interface {
   261  	UpdateTTL(id, output, status string) error
   262  }
   263  
   264  // scriptCheck runs script checks via a interfaces.ScriptExecutor and updates the
   265  // appropriate check's TTL when the script succeeds.
   266  type scriptCheck struct {
   267  	id          string
   268  	agent       heartbeater
   269  	check       *structs.ServiceCheck
   270  	lastCheckOk bool // true if the last check was ok; otherwise false
   271  	tasklet
   272  }
   273  
   274  // scriptCheckConfig is a parameter struct for newScriptCheck
   275  type scriptCheckConfig struct {
   276  	allocID    string
   277  	taskName   string
   278  	serviceID  string
   279  	check      *structs.ServiceCheck
   280  	agent      heartbeater
   281  	driverExec tinterfaces.ScriptExecutor
   282  	taskEnv    *taskenv.TaskEnv
   283  	logger     log.Logger
   284  	shutdownCh chan struct{}
   285  	isGroup    bool
   286  }
   287  
   288  // newScriptCheck constructs a scriptCheck. we're only going to
   289  // configure the immutable fields of scriptCheck here, with the
   290  // rest being configured during the Poststart hook so that we have
   291  // the rest of the task execution environment
   292  func newScriptCheck(config *scriptCheckConfig) *scriptCheck {
   293  
   294  	// Guard against not having a valid taskEnv. This can be the case if the
   295  	// PreKilling or Exited hook is run before Poststart.
   296  	if config.taskEnv == nil || config.driverExec == nil {
   297  		return nil
   298  	}
   299  
   300  	orig := config.check
   301  	sc := &scriptCheck{
   302  		agent:       config.agent,
   303  		check:       config.check.Copy(),
   304  		lastCheckOk: true, // start logging on first failure
   305  	}
   306  
   307  	// we can't use the promoted fields of tasklet in the struct literal
   308  	sc.Command = config.taskEnv.ReplaceEnv(config.check.Command)
   309  	sc.Args = config.taskEnv.ParseAndReplace(config.check.Args)
   310  	sc.Interval = config.check.Interval
   311  	sc.Timeout = config.check.Timeout
   312  	sc.exec = config.driverExec
   313  	sc.callback = newScriptCheckCallback(sc)
   314  	sc.logger = config.logger
   315  	sc.shutdownCh = config.shutdownCh
   316  	sc.check.Command = sc.Command
   317  	sc.check.Args = sc.Args
   318  
   319  	if config.isGroup {
   320  		// group services don't have access to a task environment
   321  		// at creation, so their checks get registered before the
   322  		// check can be interpolated here. if we don't use the
   323  		// original checkID, they can't be updated.
   324  		sc.id = agentconsul.MakeCheckID(config.serviceID, orig)
   325  	} else {
   326  		sc.id = agentconsul.MakeCheckID(config.serviceID, sc.check)
   327  	}
   328  	return sc
   329  }
   330  
   331  // Copy does a *shallow* copy of script checks.
   332  func (sc *scriptCheck) Copy() *scriptCheck {
   333  	newSc := sc
   334  	return newSc
   335  }
   336  
   337  // closes over the script check and returns the taskletCallback for
   338  // when the script check executes.
   339  func newScriptCheckCallback(s *scriptCheck) taskletCallback {
   340  
   341  	return func(ctx context.Context, params execResult) {
   342  		output := params.output
   343  		code := params.code
   344  		err := params.err
   345  
   346  		state := api.HealthCritical
   347  		switch code {
   348  		case 0:
   349  			state = api.HealthPassing
   350  		case 1:
   351  			state = api.HealthWarning
   352  		}
   353  
   354  		var outputMsg string
   355  		if err != nil {
   356  			state = api.HealthCritical
   357  			outputMsg = err.Error()
   358  		} else {
   359  			outputMsg = string(output)
   360  		}
   361  
   362  		// heartbeat the check to Consul
   363  		err = s.updateTTL(ctx, outputMsg, state)
   364  		select {
   365  		case <-ctx.Done():
   366  			// check has been removed; don't report errors
   367  			return
   368  		default:
   369  		}
   370  
   371  		if err != nil {
   372  			if s.lastCheckOk {
   373  				s.lastCheckOk = false
   374  				s.logger.Warn("updating check failed", "error", err)
   375  			} else {
   376  				s.logger.Debug("updating check still failing", "error", err)
   377  			}
   378  
   379  		} else if !s.lastCheckOk {
   380  			// Succeeded for the first time or after failing; log
   381  			s.lastCheckOk = true
   382  			s.logger.Info("updating check succeeded")
   383  		}
   384  	}
   385  }
   386  
   387  const (
   388  	updateTTLBackoffBaseline = 1 * time.Second
   389  	updateTTLBackoffLimit    = 3 * time.Second
   390  )
   391  
   392  // updateTTL updates the state to Consul, performing an exponential backoff
   393  // in the case where the check isn't registered in Consul to avoid a race between
   394  // service registration and the first check.
   395  func (s *scriptCheck) updateTTL(ctx context.Context, msg, state string) error {
   396  	for attempts := 0; ; attempts++ {
   397  		err := s.agent.UpdateTTL(s.id, msg, state)
   398  		if err == nil {
   399  			return nil
   400  		}
   401  
   402  		// Handle the retry case
   403  		backoff := (1 << (2 * uint64(attempts))) * updateTTLBackoffBaseline
   404  		if backoff > updateTTLBackoffLimit {
   405  			return err
   406  		}
   407  
   408  		// Wait till retrying
   409  		select {
   410  		case <-ctx.Done():
   411  			return err
   412  		case <-time.After(backoff):
   413  		}
   414  	}
   415  }