github.com/outbrain/consul@v1.4.5/agent/checks/alias.go

package checks

import (
	"fmt"
	"sync"
	"time"

	"github.com/hashicorp/consul/agent/structs"
	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/consul/types"
)

// Constants related to alias check backoff.
const (
	checkAliasBackoffMin     = 3               // 3 attempts before backing off
	checkAliasBackoffMaxWait = 1 * time.Minute // maximum backoff wait time
)

// CheckAlias is a check type that aliases the health of another service
// instance or node. If the aliased service has any critical health checks,
// then this check is critical. If the service has no critical but some
// warning checks, then this check is warning, and if the service has only
// passing checks, then this check is passing.
type CheckAlias struct {
	Node      string // Node name of the service. If empty, assumed to be this node.
	ServiceID string // ID (not name) of the service to alias

	CheckID types.CheckID               // ID of this check
	RPC     RPC                         // Used to query remote server if necessary
	RPCReq  structs.NodeSpecificRequest // Base request
	Notify  AliasNotifier               // For updating the check state

	stop     bool
	stopCh   chan struct{}
	stopLock sync.Mutex
}

// AliasNotifier is a CheckNotifier specifically for the Alias check.
// This requires additional methods that are satisfied by the agent
// local state.
type AliasNotifier interface {
	CheckNotifier

	AddAliasCheck(types.CheckID, string, chan<- struct{}) error
	RemoveAliasCheck(types.CheckID, string)
	Checks() map[types.CheckID]*structs.HealthCheck
}

// Start is used to start the check. It runs until Stop() is called.
func (c *CheckAlias) Start() {
	c.stopLock.Lock()
	defer c.stopLock.Unlock()
	c.stop = false
	c.stopCh = make(chan struct{})
	go c.run(c.stopCh)
}

// Stop is used to stop the check.
func (c *CheckAlias) Stop() {
	c.stopLock.Lock()
	defer c.stopLock.Unlock()
	if !c.stop {
		c.stop = true
		close(c.stopCh)
	}
}

// run is invoked in a goroutine until Stop() is called.
func (c *CheckAlias) run(stopCh chan struct{}) {
	// If we have a specific node set, then use a blocking query
	if c.Node != "" {
		c.runQuery(stopCh)
		return
	}

	// Use the local state to match the service.
	c.runLocal(stopCh)
}

func (c *CheckAlias) runLocal(stopCh chan struct{}) {
	// Very important this is buffered as 1 so that we do not lose any
	// queued updates. This only has to be exactly 1 since the existence
	// of any update triggers us to load the full health check state.
	notifyCh := make(chan struct{}, 1)
	c.Notify.AddAliasCheck(c.CheckID, c.ServiceID, notifyCh)
	defer c.Notify.RemoveAliasCheck(c.CheckID, c.ServiceID)

	updateStatus := func() {
		checks := c.Notify.Checks()
		checksList := make([]*structs.HealthCheck, 0, len(checks))
		for _, chk := range checks {
			checksList = append(checksList, chk)
		}
		c.processChecks(checksList)
	}

	// Immediately run to get the current state of the target service
	updateStatus()

	for {
		select {
		case <-notifyCh:
			updateStatus()
		case <-stopCh:
			return
		}
	}
}

func (c *CheckAlias) runQuery(stopCh chan struct{}) {
	args := c.RPCReq
	args.Node = c.Node
	args.AllowStale = true
	args.MaxQueryTime = 1 * time.Minute

	var attempt uint
	for {
		// Check if we're stopped.
		// We fall through and block otherwise, which has a maximum time
		// set above so we'll always check for stop within a reasonable
		// amount of time.
		select {
		case <-stopCh:
			return
		default:
		}

		// Backoff if we have to
		if attempt > checkAliasBackoffMin {
			shift := attempt - checkAliasBackoffMin
			if shift > 31 {
				shift = 31 // so we don't overflow to 0
			}
			waitTime := (1 << shift) * time.Second
			if waitTime > checkAliasBackoffMaxWait {
				waitTime = checkAliasBackoffMaxWait
			}
			time.Sleep(waitTime)
		}

		// Get the current health checks for the specified node.
		//
		// NOTE(mitchellh): This currently returns ALL health checks for
		// a node even though we also have the service ID. This can be
		// optimized if we introduce a new RPC endpoint to filter both,
		// but for blocking queries this isn't much more efficient since
		// the checks index is global to the cluster.
		var out structs.IndexedHealthChecks
		if err := c.RPC.RPC("Health.NodeChecks", &args, &out); err != nil {
			attempt++
			if attempt > 1 {
				c.Notify.UpdateCheck(c.CheckID, api.HealthCritical,
					fmt.Sprintf("Failure checking aliased node or service: %s", err))
			}

			continue
		}

		attempt = 0 // Reset the attempts so we don't back off on the next request

		// Set our index for the next request
		args.MinQueryIndex = out.Index

		// We want to ensure that we're always blocking on subsequent requests
		// to avoid hot loops. Index 1 is always safe since the min raft index
		// is at least 5. Note this shouldn't happen but protecting against this
		// case is safer than a 100% CPU loop.
		if args.MinQueryIndex < 1 {
			args.MinQueryIndex = 1
		}

		c.processChecks(out.HealthChecks)
	}
}

// processChecks is a common helper for taking a set of health checks and
// using them to update our alias. This is abstracted since the checks can
// come from both the remote server as well as local state.
func (c *CheckAlias) processChecks(checks []*structs.HealthCheck) {
	health := api.HealthPassing
	msg := "No checks found."
	for _, chk := range checks {
		if c.Node != "" && chk.Node != c.Node {
			continue
		}

		// We allow ServiceID == "" so that we also check node checks
		if chk.ServiceID != "" && chk.ServiceID != c.ServiceID {
			continue
		}

		if chk.Status == api.HealthCritical || chk.Status == api.HealthWarning {
			health = chk.Status
			msg = fmt.Sprintf("Aliased check %q failing: %s", chk.Name, chk.Output)

			// Critical checks exit the for loop immediately since we
			// know that this is the health state. Warnings do not since
			// there may still be a critical check.
			if chk.Status == api.HealthCritical {
				break
			}
		}

		msg = "All checks passing."
	}

	// Update our check value
	c.Notify.UpdateCheck(c.CheckID, health, msg)
}
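
// newLocalAliasCheckExample is an illustrative sketch and is not part of the
// upstream file: it shows one way an alias check for a local service instance
// might be wired up using the types defined above. The notify and rpcClient
// parameters stand in for the agent's local state (which satisfies
// AliasNotifier) and a server RPC client; the function name, the "web"
// service ID, and the check ID are hypothetical.
func newLocalAliasCheckExample(notify AliasNotifier, rpcClient RPC) *CheckAlias {
	chk := &CheckAlias{
		// Node is left empty, so the check watches the local agent state
		// (runLocal) instead of issuing blocking queries to the servers.
		ServiceID: "web",
		CheckID:   types.CheckID("alias-web"),
		RPC:       rpcClient, // only used on the remote-query path (Node != "")
		Notify:    notify,
	}

	// Start launches the run goroutine; the caller is responsible for
	// calling Stop() when the alias check should be removed.
	chk.Start()
	return chk
}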