github.com/nir0s/nomad@v0.8.7-rc1/command/agent/consul/script.go (about)

     1  package consul
     2  
     3  import (
     4  	"context"
     5  	"log"
     6  	"time"
     7  
     8  	metrics "github.com/armon/go-metrics"
     9  	"github.com/hashicorp/consul/api"
    10  	"github.com/hashicorp/nomad/client/driver"
    11  	"github.com/hashicorp/nomad/nomad/structs"
    12  )
    13  
// heartbeater is the subset of consul agent functionality needed by script
// checks to heartbeat
type heartbeater interface {
	// UpdateTTL reports the given status and output for the TTL check
	// identified by id. A non-nil error means the update was not applied.
	UpdateTTL(id, output, status string) error
}
    19  
    20  // scriptHandle is returned by scriptCheck.run by cancelling a scriptCheck and
    21  // waiting for it to shutdown.
    22  type scriptHandle struct {
    23  	// cancel the script
    24  	cancel func()
    25  	exitCh chan struct{}
    26  }
    27  
    28  // wait returns a chan that's closed when the script exits
    29  func (s *scriptHandle) wait() <-chan struct{} {
    30  	return s.exitCh
    31  }
    32  
// scriptCheck runs script checks via a ScriptExecutor and updates the
// appropriate check's TTL with the script's result: exit code 0 is reported
// as passing, 1 as warning, and anything else (or an execution error or
// timeout) as critical.
type scriptCheck struct {
	// allocID and taskName identify the owner of the check in log messages
	allocID  string
	taskName string

	// id is the check ID passed to the agent when heartbeating. check
	// supplies the command, args, interval, and timeout. exec runs the
	// command and agent receives the resulting TTL update.
	id    string
	check *structs.ServiceCheck
	exec  driver.ScriptExecutor
	agent heartbeater

	// lastCheckOk is true if the most recent TTL update sent to the agent
	// succeeded; otherwise false. It is used to log only transitions at
	// WARN/INFO level and repeated failures at DEBUG.
	lastCheckOk bool

	// logger receives check timeout and TTL update messages. When
	// shutdownCh is closed the check runs once more and then exits.
	logger     *log.Logger
	shutdownCh <-chan struct{}
}
    50  
    51  // newScriptCheck creates a new scriptCheck. run() should be called once the
    52  // initial check is registered with Consul.
    53  func newScriptCheck(allocID, taskName, checkID string, check *structs.ServiceCheck,
    54  	exec driver.ScriptExecutor, agent heartbeater, logger *log.Logger,
    55  	shutdownCh <-chan struct{}) *scriptCheck {
    56  
    57  	return &scriptCheck{
    58  		allocID:     allocID,
    59  		taskName:    taskName,
    60  		id:          checkID,
    61  		check:       check,
    62  		exec:        exec,
    63  		agent:       agent,
    64  		lastCheckOk: true, // start logging on first failure
    65  		logger:      logger,
    66  		shutdownCh:  shutdownCh,
    67  	}
    68  }
    69  
    70  // run this script check and return its cancel func. If the shutdownCh is
    71  // closed the check will be run once more before exiting.
    72  func (s *scriptCheck) run() *scriptHandle {
    73  	ctx, cancel := context.WithCancel(context.Background())
    74  	exitCh := make(chan struct{})
    75  	go func() {
    76  		defer close(exitCh)
    77  		timer := time.NewTimer(0)
    78  		defer timer.Stop()
    79  		for {
    80  			// Block until check is removed, Nomad is shutting
    81  			// down, or the check interval is up
    82  			select {
    83  			case <-ctx.Done():
    84  				// check has been removed
    85  				return
    86  			case <-s.shutdownCh:
    87  				// unblock but don't exit until after we heartbeat once more
    88  			case <-timer.C:
    89  				timer.Reset(s.check.Interval)
    90  			}
    91  			metrics.IncrCounter([]string{"client", "consul", "script_runs"}, 1)
    92  
    93  			// Execute check script with timeout
    94  			execctx, cancel := context.WithTimeout(ctx, s.check.Timeout)
    95  			output, code, err := s.exec.Exec(execctx, s.check.Command, s.check.Args)
    96  			switch execctx.Err() {
    97  			case context.Canceled:
    98  				// check removed during execution; exit
    99  				cancel()
   100  				return
   101  			case context.DeadlineExceeded:
   102  				metrics.IncrCounter([]string{"client", "consul", "script_timeouts"}, 1)
   103  				// If no error was returned, set one to make sure the task goes critical
   104  				if err == nil {
   105  					err = context.DeadlineExceeded
   106  				}
   107  
   108  				// Log deadline exceeded every time as it's a
   109  				// distinct issue from checks returning
   110  				// failures
   111  				s.logger.Printf("[WARN] consul.checks: check %q for task %q alloc %q timed out (%s)",
   112  					s.check.Name, s.taskName, s.allocID, s.check.Timeout)
   113  			}
   114  
   115  			// cleanup context
   116  			cancel()
   117  
   118  			state := api.HealthCritical
   119  			switch code {
   120  			case 0:
   121  				state = api.HealthPassing
   122  			case 1:
   123  				state = api.HealthWarning
   124  			}
   125  
   126  			var outputMsg string
   127  			if err != nil {
   128  				state = api.HealthCritical
   129  				outputMsg = err.Error()
   130  			} else {
   131  				outputMsg = string(output)
   132  			}
   133  
   134  			// Actually heartbeat the check
   135  			err = s.agent.UpdateTTL(s.id, outputMsg, state)
   136  			select {
   137  			case <-ctx.Done():
   138  				// check has been removed; don't report errors
   139  				return
   140  			default:
   141  			}
   142  
   143  			if err != nil {
   144  				if s.lastCheckOk {
   145  					s.lastCheckOk = false
   146  					s.logger.Printf("[WARN] consul.checks: update for task %q alloc %q check %q failed: %v",
   147  						s.taskName, s.allocID, s.check.Name, err)
   148  				} else {
   149  					s.logger.Printf("[DEBUG] consul.checks: update for task %q alloc %q check %q still failing: %v",
   150  						s.taskName, s.allocID, s.check.Name, err)
   151  				}
   152  
   153  			} else if !s.lastCheckOk {
   154  				// Succeeded for the first time or after failing; log
   155  				s.lastCheckOk = true
   156  				s.logger.Printf("[INFO] consul.checks: update for task %q alloc %q check %q succeeded",
   157  					s.taskName, s.allocID, s.check.Name)
   158  			}
   159  
   160  			select {
   161  			case <-s.shutdownCh:
   162  				// We've been told to exit and just heartbeated so exit
   163  				return
   164  			default:
   165  			}
   166  		}
   167  	}()
   168  	return &scriptHandle{cancel: cancel, exitCh: exitCh}
   169  }