github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/client/allocrunner/taskrunner/script_check_hook.go (about)

     1  package taskrunner
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"sync"
     7  	"time"
     8  
     9  	"github.com/hashicorp/consul/api"
    10  	log "github.com/hashicorp/go-hclog"
    11  	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
    12  	tinterfaces "github.com/hashicorp/nomad/client/allocrunner/taskrunner/interfaces"
    13  	"github.com/hashicorp/nomad/client/consul"
    14  	"github.com/hashicorp/nomad/client/taskenv"
    15  	agentconsul "github.com/hashicorp/nomad/command/agent/consul"
    16  	"github.com/hashicorp/nomad/nomad/structs"
    17  )
    18  
    19  var _ interfaces.TaskPoststartHook = &scriptCheckHook{}
    20  var _ interfaces.TaskUpdateHook = &scriptCheckHook{}
    21  var _ interfaces.TaskStopHook = &scriptCheckHook{}
    22  
    23  // default max amount of time to wait for all scripts on shutdown.
    24  const defaultShutdownWait = time.Minute
    25  
    26  type scriptCheckHookConfig struct {
    27  	alloc        *structs.Allocation
    28  	task         *structs.Task
    29  	consul       consul.ConsulServiceAPI
    30  	logger       log.Logger
    31  	shutdownWait time.Duration
    32  }
    33  
    34  // scriptCheckHook implements a task runner hook for running script
    35  // checks in the context of a task
    36  type scriptCheckHook struct {
    37  	consul       consul.ConsulServiceAPI
    38  	alloc        *structs.Allocation
    39  	task         *structs.Task
    40  	logger       log.Logger
    41  	shutdownWait time.Duration // max time to wait for scripts to shutdown
    42  	shutdownCh   chan struct{} // closed when all scripts should shutdown
    43  
    44  	// The following fields can be changed by Update()
    45  	driverExec tinterfaces.ScriptExecutor
    46  	taskEnv    *taskenv.TaskEnv
    47  
    48  	// These maintain state and are populated by Poststart() or Update()
    49  	scripts        map[string]*scriptCheck
    50  	runningScripts map[string]*taskletHandle
    51  
    52  	// Since Update() may be called concurrently with any other hook all
    53  	// hook methods must be fully serialized
    54  	mu sync.Mutex
    55  }
    56  
    57  // newScriptCheckHook returns a hook without any scriptChecks.
    58  // They will get created only once their task environment is ready
    59  // in Poststart() or Update()
    60  func newScriptCheckHook(c scriptCheckHookConfig) *scriptCheckHook {
    61  	h := &scriptCheckHook{
    62  		consul:         c.consul,
    63  		alloc:          c.alloc,
    64  		task:           c.task,
    65  		scripts:        make(map[string]*scriptCheck),
    66  		runningScripts: make(map[string]*taskletHandle),
    67  		shutdownWait:   defaultShutdownWait,
    68  		shutdownCh:     make(chan struct{}),
    69  	}
    70  
    71  	if c.shutdownWait != 0 {
    72  		h.shutdownWait = c.shutdownWait // override for testing
    73  	}
    74  	h.logger = c.logger.Named(h.Name())
    75  	return h
    76  }
    77  
    78  func (h *scriptCheckHook) Name() string {
    79  	return "script_checks"
    80  }
    81  
    82  // Prestart implements interfaces.TaskPrestartHook. It stores the
    83  // initial structs.Task
    84  func (h *scriptCheckHook) Prestart(ctx context.Context, req *interfaces.TaskPrestartRequest, _ *interfaces.TaskPrestartResponse) error {
    85  	h.mu.Lock()
    86  	defer h.mu.Unlock()
    87  	h.task = req.Task
    88  	return nil
    89  }
    90  
    91  // PostStart implements interfaces.TaskPoststartHook. It creates new
    92  // script checks with the current task context (driver and env), and
    93  // starts up the scripts.
    94  func (h *scriptCheckHook) Poststart(ctx context.Context, req *interfaces.TaskPoststartRequest, _ *interfaces.TaskPoststartResponse) error {
    95  	h.mu.Lock()
    96  	defer h.mu.Unlock()
    97  
    98  	if req.DriverExec == nil {
    99  		h.logger.Debug("driver doesn't support script checks")
   100  		return nil
   101  	}
   102  	h.driverExec = req.DriverExec
   103  	h.taskEnv = req.TaskEnv
   104  
   105  	return h.upsertChecks()
   106  }
   107  
   108  // Updated implements interfaces.TaskUpdateHook. It creates new
   109  // script checks with the current task context (driver and env and possibly
   110  // new structs.Task), and starts up the scripts.
   111  func (h *scriptCheckHook) Update(ctx context.Context, req *interfaces.TaskUpdateRequest, _ *interfaces.TaskUpdateResponse) error {
   112  	h.mu.Lock()
   113  	defer h.mu.Unlock()
   114  
   115  	task := req.Alloc.LookupTask(h.task.Name)
   116  	if task == nil {
   117  		return fmt.Errorf("task %q not found in updated alloc", h.task.Name)
   118  	}
   119  	h.alloc = req.Alloc
   120  	h.task = task
   121  	h.taskEnv = req.TaskEnv
   122  
   123  	return h.upsertChecks()
   124  }
   125  
   126  func (h *scriptCheckHook) upsertChecks() error {
   127  	// Create new script checks struct with new task context
   128  	oldScriptChecks := h.scripts
   129  	h.scripts = h.newScriptChecks()
   130  
   131  	// Run new or replacement scripts
   132  	for id, script := range h.scripts {
   133  		// If it's already running, cancel and replace
   134  		if oldScript, running := h.runningScripts[id]; running {
   135  			oldScript.cancel()
   136  		}
   137  		// Start and store the handle
   138  		h.runningScripts[id] = script.run()
   139  	}
   140  
   141  	// Cancel scripts we no longer want
   142  	for id := range oldScriptChecks {
   143  		if _, ok := h.scripts[id]; !ok {
   144  			if oldScript, running := h.runningScripts[id]; running {
   145  				oldScript.cancel()
   146  			}
   147  		}
   148  	}
   149  	return nil
   150  }
   151  
   152  // Stop implements interfaces.TaskStopHook and blocks waiting for running
   153  // scripts to finish (or for the shutdownWait timeout to expire).
   154  func (h *scriptCheckHook) Stop(ctx context.Context, req *interfaces.TaskStopRequest, resp *interfaces.TaskStopResponse) error {
   155  	h.mu.Lock()
   156  	defer h.mu.Unlock()
   157  	close(h.shutdownCh)
   158  	deadline := time.After(h.shutdownWait)
   159  	err := fmt.Errorf("timed out waiting for script checks to exit")
   160  	for _, script := range h.runningScripts {
   161  		select {
   162  		case <-script.wait():
   163  		case <-ctx.Done():
   164  			// the caller is passing the background context, so
   165  			// we should never really see this outside of testing
   166  		case <-deadline:
   167  			// at this point the Consul client has been cleaned
   168  			// up so we don't want to hang onto this.
   169  			return err
   170  		}
   171  	}
   172  	return nil
   173  }
   174  
   175  func (h *scriptCheckHook) newScriptChecks() map[string]*scriptCheck {
   176  	scriptChecks := make(map[string]*scriptCheck)
   177  	interpolatedTaskServices := taskenv.InterpolateServices(h.taskEnv, h.task.Services)
   178  	for _, service := range interpolatedTaskServices {
   179  		for _, check := range service.Checks {
   180  			if check.Type != structs.ServiceCheckScript {
   181  				continue
   182  			}
   183  			serviceID := agentconsul.MakeAllocServiceID(
   184  				h.alloc.ID, h.task.Name, service)
   185  			sc := newScriptCheck(&scriptCheckConfig{
   186  				allocID:    h.alloc.ID,
   187  				taskName:   h.task.Name,
   188  				check:      check,
   189  				serviceID:  serviceID,
   190  				agent:      h.consul,
   191  				driverExec: h.driverExec,
   192  				taskEnv:    h.taskEnv,
   193  				logger:     h.logger,
   194  				shutdownCh: h.shutdownCh,
   195  			})
   196  			if sc != nil {
   197  				scriptChecks[sc.id] = sc
   198  			}
   199  		}
   200  	}
   201  
   202  	// Walk back through the task group to see if there are script checks
   203  	// associated with the task. If so, we'll create scriptCheck tasklets
   204  	// for them. The group-level service and any check restart behaviors it
   205  	// needs are entirely encapsulated within the group service hook which
   206  	// watches Consul for status changes.
   207  	tg := h.alloc.Job.LookupTaskGroup(h.alloc.TaskGroup)
   208  	interpolatedGroupServices := taskenv.InterpolateServices(h.taskEnv, tg.Services)
   209  	for _, service := range interpolatedGroupServices {
   210  		for _, check := range service.Checks {
   211  			if check.Type != structs.ServiceCheckScript {
   212  				continue
   213  			}
   214  			if check.TaskName != h.task.Name {
   215  				continue
   216  			}
   217  			groupTaskName := "group-" + tg.Name
   218  			serviceID := agentconsul.MakeAllocServiceID(
   219  				h.alloc.ID, groupTaskName, service)
   220  			sc := newScriptCheck(&scriptCheckConfig{
   221  				allocID:    h.alloc.ID,
   222  				taskName:   groupTaskName,
   223  				check:      check,
   224  				serviceID:  serviceID,
   225  				agent:      h.consul,
   226  				driverExec: h.driverExec,
   227  				taskEnv:    h.taskEnv,
   228  				logger:     h.logger,
   229  				shutdownCh: h.shutdownCh,
   230  				isGroup:    true,
   231  			})
   232  			if sc != nil {
   233  				scriptChecks[sc.id] = sc
   234  			}
   235  		}
   236  	}
   237  	return scriptChecks
   238  }
   239  
   240  // heartbeater is the subset of consul agent functionality needed by script
   241  // checks to heartbeat
   242  type heartbeater interface {
   243  	UpdateTTL(id, output, status string) error
   244  }
   245  
   246  // scriptCheck runs script checks via a interfaces.ScriptExecutor and updates the
   247  // appropriate check's TTL when the script succeeds.
   248  type scriptCheck struct {
   249  	id          string
   250  	agent       heartbeater
   251  	check       *structs.ServiceCheck
   252  	lastCheckOk bool // true if the last check was ok; otherwise false
   253  	tasklet
   254  }
   255  
   256  // scriptCheckConfig is a parameter struct for newScriptCheck
   257  type scriptCheckConfig struct {
   258  	allocID    string
   259  	taskName   string
   260  	serviceID  string
   261  	check      *structs.ServiceCheck
   262  	agent      heartbeater
   263  	driverExec tinterfaces.ScriptExecutor
   264  	taskEnv    *taskenv.TaskEnv
   265  	logger     log.Logger
   266  	shutdownCh chan struct{}
   267  	isGroup    bool
   268  }
   269  
   270  // newScriptCheck constructs a scriptCheck. we're only going to
   271  // configure the immutable fields of scriptCheck here, with the
   272  // rest being configured during the Poststart hook so that we have
   273  // the rest of the task execution environment
   274  func newScriptCheck(config *scriptCheckConfig) *scriptCheck {
   275  
   276  	// Guard against not having a valid taskEnv. This can be the case if the
   277  	// PreKilling or Exited hook is run before Poststart.
   278  	if config.taskEnv == nil || config.driverExec == nil {
   279  		return nil
   280  	}
   281  
   282  	orig := config.check
   283  	sc := &scriptCheck{
   284  		agent:       config.agent,
   285  		check:       config.check.Copy(),
   286  		lastCheckOk: true, // start logging on first failure
   287  	}
   288  
   289  	// we can't use the promoted fields of tasklet in the struct literal
   290  	sc.Command = config.taskEnv.ReplaceEnv(config.check.Command)
   291  	sc.Args = config.taskEnv.ParseAndReplace(config.check.Args)
   292  	sc.Interval = config.check.Interval
   293  	sc.Timeout = config.check.Timeout
   294  	sc.exec = config.driverExec
   295  	sc.callback = newScriptCheckCallback(sc)
   296  	sc.logger = config.logger
   297  	sc.shutdownCh = config.shutdownCh
   298  	sc.check.Command = sc.Command
   299  	sc.check.Args = sc.Args
   300  
   301  	if config.isGroup {
   302  		// group services don't have access to a task environment
   303  		// at creation, so their checks get registered before the
   304  		// check can be interpolated here. if we don't use the
   305  		// original checkID, they can't be updated.
   306  		sc.id = agentconsul.MakeCheckID(config.serviceID, orig)
   307  	} else {
   308  		sc.id = agentconsul.MakeCheckID(config.serviceID, sc.check)
   309  	}
   310  	return sc
   311  }
   312  
   313  // Copy does a *shallow* copy of script checks.
   314  func (sc *scriptCheck) Copy() *scriptCheck {
   315  	newSc := sc
   316  	return newSc
   317  }
   318  
   319  // closes over the script check and returns the taskletCallback for
   320  // when the script check executes.
   321  func newScriptCheckCallback(s *scriptCheck) taskletCallback {
   322  
   323  	return func(ctx context.Context, params execResult) {
   324  		output := params.output
   325  		code := params.code
   326  		err := params.err
   327  
   328  		state := api.HealthCritical
   329  		switch code {
   330  		case 0:
   331  			state = api.HealthPassing
   332  		case 1:
   333  			state = api.HealthWarning
   334  		}
   335  
   336  		var outputMsg string
   337  		if err != nil {
   338  			state = api.HealthCritical
   339  			outputMsg = err.Error()
   340  		} else {
   341  			outputMsg = string(output)
   342  		}
   343  
   344  		// heartbeat the check to Consul
   345  		err = s.updateTTL(ctx, outputMsg, state)
   346  		select {
   347  		case <-ctx.Done():
   348  			// check has been removed; don't report errors
   349  			return
   350  		default:
   351  		}
   352  
   353  		if err != nil {
   354  			if s.lastCheckOk {
   355  				s.lastCheckOk = false
   356  				s.logger.Warn("updating check failed", "error", err)
   357  			} else {
   358  				s.logger.Debug("updating check still failing", "error", err)
   359  			}
   360  
   361  		} else if !s.lastCheckOk {
   362  			// Succeeded for the first time or after failing; log
   363  			s.lastCheckOk = true
   364  			s.logger.Info("updating check succeeded")
   365  		}
   366  	}
   367  }
   368  
   369  const (
   370  	updateTTLBackoffBaseline = 1 * time.Second
   371  	updateTTLBackoffLimit    = 3 * time.Second
   372  )
   373  
   374  // updateTTL updates the state to Consul, performing an expontential backoff
   375  // in the case where the check isn't registered in Consul to avoid a race between
   376  // service registration and the first check.
   377  func (s *scriptCheck) updateTTL(ctx context.Context, msg, state string) error {
   378  	for attempts := 0; ; attempts++ {
   379  		err := s.agent.UpdateTTL(s.id, msg, state)
   380  		if err == nil {
   381  			return nil
   382  		}
   383  
   384  		// Handle the retry case
   385  		backoff := (1 << (2 * uint64(attempts))) * updateTTLBackoffBaseline
   386  		if backoff > updateTTLBackoffLimit {
   387  			return err
   388  		}
   389  
   390  		// Wait till retrying
   391  		select {
   392  		case <-ctx.Done():
   393  			return err
   394  		case <-time.After(backoff):
   395  		}
   396  	}
   397  }