github.com/outbrain/consul@v1.4.5/agent/checks/alias.go

package checks

import (
	"fmt"
	"sync"
	"time"

	"github.com/hashicorp/consul/agent/structs"
	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/consul/types"
)

// Constants related to alias check backoff.
const (
	checkAliasBackoffMin     = 3               // 3 attempts before backing off
	checkAliasBackoffMaxWait = 1 * time.Minute // maximum backoff wait time
)

// CheckAlias is a check type that aliases the health of another service
// instance or node. If the aliased service has any critical health checks,
// then this check is critical. If it has no critical checks but has warning
// checks, then this check is warning, and if it has only passing checks,
// then this check is passing.
type CheckAlias struct {
	Node      string // Node name of the service. If empty, assumed to be this node.
	ServiceID string // ID (not name) of the service to alias

	CheckID types.CheckID               // ID of this check
	RPC     RPC                         // Used to query remote server if necessary
	RPCReq  structs.NodeSpecificRequest // Base request
	Notify  AliasNotifier               // For updating the check state

	stop     bool
	stopCh   chan struct{}
	stopLock sync.Mutex
}
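
// Illustrative usage sketch: an agent would construct and run an alias check
// roughly as below, where notify is any AliasNotifier (e.g. the agent local
// state) and rpc is an RPC implementation; the concrete values shown are
// hypothetical.
//
//	chk := &CheckAlias{
//		Node:      "web-node-1",
//		ServiceID: "web",
//		CheckID:   types.CheckID("alias-web"),
//		RPC:       rpc,
//		Notify:    notify,
//	}
//	chk.Start()
//	defer chk.Stop()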

// AliasNotifier is a CheckNotifier specifically for the Alias check.
// This requires additional methods that are satisfied by the agent
// local state.
type AliasNotifier interface {
	CheckNotifier

	AddAliasCheck(types.CheckID, string, chan<- struct{}) error
	RemoveAliasCheck(types.CheckID, string)
	Checks() map[types.CheckID]*structs.HealthCheck
}

// Start is used to start the check. It runs until Stop() is called.
func (c *CheckAlias) Start() {
	c.stopLock.Lock()
	defer c.stopLock.Unlock()
	c.stop = false
	c.stopCh = make(chan struct{})
	go c.run(c.stopCh)
}

// Stop is used to stop the check.
func (c *CheckAlias) Stop() {
	c.stopLock.Lock()
	defer c.stopLock.Unlock()
	if !c.stop {
		c.stop = true
		close(c.stopCh)
	}
}

// run is invoked in a goroutine until Stop() is called.
func (c *CheckAlias) run(stopCh chan struct{}) {
	// If we have a specific node set, then use a blocking query
	if c.Node != "" {
		c.runQuery(stopCh)
		return
	}

	// Use the local state to match the service.
	c.runLocal(stopCh)
}

func (c *CheckAlias) runLocal(stopCh chan struct{}) {
	// Very important this is buffered as 1 so that we do not lose any
	// queued updates. This only has to be exactly 1 since the existence
	// of any update triggers us to load the full health check state.
	notifyCh := make(chan struct{}, 1)
	c.Notify.AddAliasCheck(c.CheckID, c.ServiceID, notifyCh)
	defer c.Notify.RemoveAliasCheck(c.CheckID, c.ServiceID)

	updateStatus := func() {
		checks := c.Notify.Checks()
		checksList := make([]*structs.HealthCheck, 0, len(checks))
		for _, chk := range checks {
			checksList = append(checksList, chk)
		}
		c.processChecks(checksList)
	}

	// Immediately run to get the current state of the target service
	updateStatus()

	for {
		select {
		case <-notifyCh:
			updateStatus()
		case <-stopCh:
			return
		}
	}
}

func (c *CheckAlias) runQuery(stopCh chan struct{}) {
	args := c.RPCReq
	args.Node = c.Node
	args.AllowStale = true
	args.MaxQueryTime = 1 * time.Minute

	var attempt uint
	for {
		// Check if we're stopped. We fall through and block otherwise,
		// which has a maximum time set above so we'll always check for
		// stop within a reasonable amount of time.
		select {
		case <-stopCh:
			return
		default:
		}

		// Backoff if we have to
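		// (for example, the 4th consecutive failure waits 2s before the next
		// attempt, the 5th waits 4s, and so on, doubling until capped at
		// checkAliasBackoffMaxWait).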
		if attempt > checkAliasBackoffMin {
			shift := attempt - checkAliasBackoffMin
			if shift > 31 {
				shift = 31 // so we don't overflow to 0
			}
			waitTime := (1 << shift) * time.Second
			if waitTime > checkAliasBackoffMaxWait {
				waitTime = checkAliasBackoffMaxWait
			}
			time.Sleep(waitTime)
		}

		// Get the current health checks for the specified node.
		//
		// NOTE(mitchellh): This currently returns ALL health checks for
		// a node even though we also have the service ID. This can be
		// optimized if we introduce a new RPC endpoint to filter both,
		// but for blocking queries it isn't much more efficient since the
		// checks index is global to the cluster.
		var out structs.IndexedHealthChecks
		if err := c.RPC.RPC("Health.NodeChecks", &args, &out); err != nil {
			attempt++
			if attempt > 1 {
				c.Notify.UpdateCheck(c.CheckID, api.HealthCritical,
					fmt.Sprintf("Failure checking aliased node or service: %s", err))
			}

			continue
		}

		attempt = 0 // Reset the attempts so we don't back off on the next request

		// Set our index for the next request
		args.MinQueryIndex = out.Index

		// We want to ensure that we're always blocking on subsequent requests
		// to avoid hot loops. Index 1 is always safe since the min raft index
		// is at least 5. Note this shouldn't happen but protecting against this
		// case is safer than a 100% CPU loop.
		if args.MinQueryIndex < 1 {
			args.MinQueryIndex = 1
		}

		c.processChecks(out.HealthChecks)
	}
}

// processChecks is a common helper for taking a set of health checks and
// using them to update our alias. This is abstracted since the checks can
// come from both the remote server as well as local state.
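// For example (illustrative): a passing node check plus a matching warning
// service check yields a warning alias, while any matching critical check
// makes the alias critical regardless of the others.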
func (c *CheckAlias) processChecks(checks []*structs.HealthCheck) {
	health := api.HealthPassing
	msg := "No checks found."
	for _, chk := range checks {
		if c.Node != "" && chk.Node != c.Node {
			continue
		}

		// We allow ServiceID == "" so that we also check node checks
		if chk.ServiceID != "" && chk.ServiceID != c.ServiceID {
			continue
		}

		if chk.Status == api.HealthCritical || chk.Status == api.HealthWarning {
			health = chk.Status
			msg = fmt.Sprintf("Aliased check %q failing: %s", chk.Name, chk.Output)

			// Critical checks exit the for loop immediately since we
			// know that this is the health state. Warnings do not since
			// there may still be a critical check.
			if chk.Status == api.HealthCritical {
				break
			}
		}

		// Only report all-passing while nothing has downgraded the health, so
		// a warning message from an earlier check is not overwritten.
		if health == api.HealthPassing {
			msg = "All checks passing."
		}
	}

	// Update our check value
	c.Notify.UpdateCheck(c.CheckID, health, msg)
}