github.com/smithx10/nomad@v0.9.1-rc1/command/agent/consul/script.go (about)

     1  package consul
     2  
     3  import (
     4  	"context"
     5  	"time"
     6  
     7  	metrics "github.com/armon/go-metrics"
     8  	log "github.com/hashicorp/go-hclog"
     9  
    10  	"github.com/hashicorp/consul/api"
    11  	"github.com/hashicorp/nomad/client/allocrunner/taskrunner/interfaces"
    12  	"github.com/hashicorp/nomad/nomad/structs"
    13  )
    14  
    15  // heartbeater is the subset of consul agent functionality needed by script
    16  // checks to heartbeat
    17  type heartbeater interface {
    18  	UpdateTTL(id, output, status string) error
    19  }
    20  
    21  // contextExec allows canceling a ScriptExecutor with a context.
    22  type contextExec struct {
    23  	// pctx is the parent context. A subcontext will be created with Exec's
    24  	// timeout.
    25  	pctx context.Context
    26  
    27  	// exec to be wrapped in a context
    28  	exec interfaces.ScriptExecutor
    29  }
    30  
    31  func newContextExec(ctx context.Context, exec interfaces.ScriptExecutor) *contextExec {
    32  	return &contextExec{
    33  		pctx: ctx,
    34  		exec: exec,
    35  	}
    36  }
    37  
    38  type execResult struct {
    39  	buf  []byte
    40  	code int
    41  	err  error
    42  }
    43  
    44  // Exec a command until the timeout expires, the context is canceled, or the
    45  // underlying Exec returns.
    46  func (c *contextExec) Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error) {
    47  	resCh := make(chan execResult, 1)
    48  
    49  	// Don't trust the underlying implementation to obey timeout
    50  	ctx, cancel := context.WithTimeout(c.pctx, timeout)
    51  	defer cancel()
    52  
    53  	go func() {
    54  		output, code, err := c.exec.Exec(timeout, cmd, args)
    55  		select {
    56  		case resCh <- execResult{output, code, err}:
    57  		case <-ctx.Done():
    58  		}
    59  	}()
    60  
    61  	select {
    62  	case res := <-resCh:
    63  		return res.buf, res.code, res.err
    64  	case <-ctx.Done():
    65  		return nil, 0, ctx.Err()
    66  	}
    67  }
    68  
    69  // scriptHandle is returned by scriptCheck.run by cancelling a scriptCheck and
    70  // waiting for it to shutdown.
    71  type scriptHandle struct {
    72  	// cancel the script
    73  	cancel func()
    74  	exitCh chan struct{}
    75  }
    76  
    77  // wait returns a chan that's closed when the script exits
    78  func (s *scriptHandle) wait() <-chan struct{} {
    79  	return s.exitCh
    80  }
    81  
    82  // scriptCheck runs script checks via a ScriptExecutor and updates the
    83  // appropriate check's TTL when the script succeeds.
    84  type scriptCheck struct {
    85  	allocID  string
    86  	taskName string
    87  
    88  	id    string
    89  	check *structs.ServiceCheck
    90  	exec  interfaces.ScriptExecutor
    91  	agent heartbeater
    92  
    93  	// lastCheckOk is true if the last check was ok; otherwise false
    94  	lastCheckOk bool
    95  
    96  	logger     log.Logger
    97  	shutdownCh <-chan struct{}
    98  }
    99  
   100  // newScriptCheck creates a new scriptCheck. run() should be called once the
   101  // initial check is registered with Consul.
   102  func newScriptCheck(allocID, taskName, checkID string, check *structs.ServiceCheck,
   103  	exec interfaces.ScriptExecutor, agent heartbeater, logger log.Logger,
   104  	shutdownCh <-chan struct{}) *scriptCheck {
   105  
   106  	logger = logger.ResetNamed("consul.checks").With("task", taskName, "alloc_id", allocID, "check", check.Name)
   107  	return &scriptCheck{
   108  		allocID:     allocID,
   109  		taskName:    taskName,
   110  		id:          checkID,
   111  		check:       check,
   112  		exec:        exec,
   113  		agent:       agent,
   114  		lastCheckOk: true, // start logging on first failure
   115  		logger:      logger,
   116  		shutdownCh:  shutdownCh,
   117  	}
   118  }
   119  
   120  // run this script check and return its cancel func. If the shutdownCh is
   121  // closed the check will be run once more before exiting.
   122  func (s *scriptCheck) run() *scriptHandle {
   123  	ctx, cancel := context.WithCancel(context.Background())
   124  	exitCh := make(chan struct{})
   125  
   126  	// Wrap the original ScriptExecutor in one that obeys context
   127  	// cancelation.
   128  	ctxExec := newContextExec(ctx, s.exec)
   129  
   130  	go func() {
   131  		defer close(exitCh)
   132  		timer := time.NewTimer(0)
   133  		defer timer.Stop()
   134  		for {
   135  			// Block until check is removed, Nomad is shutting
   136  			// down, or the check interval is up
   137  			select {
   138  			case <-ctx.Done():
   139  				// check has been removed
   140  				return
   141  			case <-s.shutdownCh:
   142  				// unblock but don't exit until after we heartbeat once more
   143  			case <-timer.C:
   144  				timer.Reset(s.check.Interval)
   145  			}
   146  			metrics.IncrCounter([]string{"client", "consul", "script_runs"}, 1)
   147  
   148  			// Execute check script with timeout
   149  			output, code, err := ctxExec.Exec(s.check.Timeout, s.check.Command, s.check.Args)
   150  			switch err {
   151  			case context.Canceled:
   152  				// check removed during execution; exit
   153  				return
   154  			case context.DeadlineExceeded:
   155  				metrics.IncrCounter([]string{"client", "consul", "script_timeouts"}, 1)
   156  				// If no error was returned, set one to make sure the task goes critical
   157  				if err == nil {
   158  					err = context.DeadlineExceeded
   159  				}
   160  
   161  				// Log deadline exceeded every time as it's a
   162  				// distinct issue from checks returning
   163  				// failures
   164  				s.logger.Warn("check timed out", "timeout", s.check.Timeout)
   165  			}
   166  
   167  			state := api.HealthCritical
   168  			switch code {
   169  			case 0:
   170  				state = api.HealthPassing
   171  			case 1:
   172  				state = api.HealthWarning
   173  			}
   174  
   175  			var outputMsg string
   176  			if err != nil {
   177  				state = api.HealthCritical
   178  				outputMsg = err.Error()
   179  			} else {
   180  				outputMsg = string(output)
   181  			}
   182  
   183  			// Actually heartbeat the check
   184  			err = s.agent.UpdateTTL(s.id, outputMsg, state)
   185  			select {
   186  			case <-ctx.Done():
   187  				// check has been removed; don't report errors
   188  				return
   189  			default:
   190  			}
   191  
   192  			if err != nil {
   193  				if s.lastCheckOk {
   194  					s.lastCheckOk = false
   195  					s.logger.Warn("updating check failed", "error", err)
   196  				} else {
   197  					s.logger.Debug("updating check still failing", "error", err)
   198  				}
   199  
   200  			} else if !s.lastCheckOk {
   201  				// Succeeded for the first time or after failing; log
   202  				s.lastCheckOk = true
   203  				s.logger.Info("updating check succeeded")
   204  			}
   205  
   206  			select {
   207  			case <-s.shutdownCh:
   208  				// We've been told to exit and just heartbeated so exit
   209  				return
   210  			default:
   211  			}
   212  		}
   213  	}()
   214  	return &scriptHandle{cancel: cancel, exitCh: exitCh}
   215  }