github.com/looshlee/cilium@v1.6.12/pkg/status/status.go (about) 1 // Copyright 2018 Authors of Cilium 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package status 16 17 import ( 18 "context" 19 "fmt" 20 "time" 21 22 "github.com/sirupsen/logrus" 23 24 "github.com/cilium/cilium/pkg/defaults" 25 "github.com/cilium/cilium/pkg/lock" 26 "github.com/cilium/cilium/pkg/logging" 27 "github.com/cilium/cilium/pkg/logging/logfields" 28 ) 29 30 const ( 31 subsystem = "status" 32 ) 33 34 var ( 35 log = logging.DefaultLogger.WithField(logfields.LogSubsys, subsystem) 36 ) 37 38 // Status is passed to a probe when its state changes 39 type Status struct { 40 // Data is non-nil when the probe has completed successfully. Data is 41 // set to the value returned by Probe() 42 Data interface{} 43 44 // Err is non-nil if either the probe file or the Failure or Warning 45 // threshold has been reached 46 Err error 47 48 // StaleWarning is true once the WarningThreshold has been reached 49 StaleWarning bool 50 } 51 52 // Probe is run by the collector at a particular interval between invocations 53 type Probe struct { 54 Name string 55 56 Probe func(ctx context.Context) (interface{}, error) 57 58 // OnStatusUpdate is called whenever the status of the probe changes 59 OnStatusUpdate func(status Status) 60 61 // Interval allows to specify a probe specific interval that can be 62 // mutated based on whether the probe is failing or based on external 63 // factors such as current cluster size 64 Interval func(failures int) time.Duration 65 66 // consecutiveFailures is the number of consecutive failures in the 67 // probe becoming stale or failing. It is managed by 68 // updateProbeStatus() 69 consecutiveFailures int 70 } 71 72 // Collector concurrently runs probes used to check status of various subsystems 73 type Collector struct { 74 lock.RWMutex // protects staleProbes and probeStartTime 75 config Config 76 stop chan struct{} 77 staleProbes map[string]struct{} 78 probeStartTime map[string]time.Time 79 } 80 81 // Config is the collector configuration 82 type Config struct { 83 WarningThreshold time.Duration 84 FailureThreshold time.Duration 85 Interval time.Duration 86 } 87 88 // NewCollector creates a collector and starts the given probes. 89 // 90 // Each probe runs in a separate goroutine. 91 func NewCollector(probes []Probe, config Config) *Collector { 92 c := &Collector{ 93 config: config, 94 stop: make(chan struct{}), 95 staleProbes: make(map[string]struct{}), 96 probeStartTime: make(map[string]time.Time), 97 } 98 99 if c.config.Interval == time.Duration(0) { 100 c.config.Interval = defaults.StatusCollectorInterval 101 } 102 103 if c.config.FailureThreshold == time.Duration(0) { 104 c.config.FailureThreshold = defaults.StatusCollectorFailureThreshold 105 } 106 107 if c.config.WarningThreshold == time.Duration(0) { 108 c.config.WarningThreshold = defaults.StatusCollectorWarningThreshold 109 } 110 111 for i := range probes { 112 c.spawnProbe(&probes[i]) 113 } 114 115 return c 116 } 117 118 // Close exits all probes and shuts down the collector 119 // TODO(brb): call it when daemon exits (after GH#6248). 120 func (c *Collector) Close() { 121 close(c.stop) 122 } 123 124 // GetStaleProbes returns a map of stale probes which key is a probe name and 125 // value is a time when the last instance of the probe has been started. 126 // 127 // A probe is declared stale if it hasn't returned in FailureThreshold. 128 func (c *Collector) GetStaleProbes() map[string]time.Time { 129 c.RLock() 130 defer c.RUnlock() 131 132 probes := make(map[string]time.Time) 133 134 for p := range c.staleProbes { 135 probes[p] = c.probeStartTime[p] 136 } 137 138 return probes 139 } 140 141 // spawnProbe starts a goroutine which invokes the probe at the particular interval. 142 func (c *Collector) spawnProbe(p *Probe) { 143 go func() { 144 for { 145 c.runProbe(p) 146 147 interval := c.config.Interval 148 if p.Interval != nil { 149 interval = p.Interval(p.consecutiveFailures) 150 } 151 152 select { 153 case <-c.stop: 154 // collector is closed, stop looping 155 return 156 case <-time.After(interval): 157 // keep looping 158 } 159 } 160 }() 161 } 162 163 // runProbe runs the given probe, and returns either after the probe has returned 164 // or after the collector has been closed. 165 func (c *Collector) runProbe(p *Probe) { 166 var ( 167 statusData interface{} 168 err error 169 warningThreshold = time.After(c.config.WarningThreshold) 170 hardTimeout = false 171 probeReturned = make(chan struct{}, 1) 172 ctx, cancel = context.WithTimeout(context.Background(), c.config.FailureThreshold) 173 ctxTimeout = make(chan struct{}, 1) 174 ) 175 176 c.Lock() 177 c.probeStartTime[p.Name] = time.Now() 178 c.Unlock() 179 180 go func() { 181 statusData, err = p.Probe(ctx) 182 close(probeReturned) 183 }() 184 185 go func() { 186 // Once ctx.Done() has been closed, we notify the polling loop by 187 // sending to the ctxTimeout channel. We cannot just close the channel, 188 // because otherwise the loop will always enter the "<-ctxTimeout" case. 189 <-ctx.Done() 190 ctxTimeout <- struct{}{} 191 }() 192 193 // This is a loop so that, when we hit a FailureThreshold, we still do 194 // not return until the probe returns. This is to ensure the same probe 195 // does not run again while it is blocked. 196 for { 197 select { 198 case <-c.stop: 199 // Collector was closed. The probe will complete in the background 200 // and won't be restarted again. 201 cancel() 202 return 203 204 case <-warningThreshold: 205 // Just warn and continue waiting for probe 206 log.WithField(logfields.Probe, p.Name). 207 Warnf("No response from probe within %v seconds", 208 c.config.WarningThreshold.Seconds()) 209 210 case <-probeReturned: 211 // The probe completed and we can return from runProbe 212 switch { 213 case hardTimeout: 214 // FailureThreshold was already reached. Keep the failure error 215 // message 216 case err != nil: 217 c.updateProbeStatus(p, nil, false, err) 218 default: 219 c.updateProbeStatus(p, statusData, false, nil) 220 } 221 222 cancel() 223 return 224 225 case <-ctxTimeout: 226 // We have timed out. Report a status and mark that we timed out so we 227 // do not emit status later. 228 staleErr := fmt.Errorf("no response from %s probe within %v seconds", 229 p.Name, c.config.FailureThreshold.Seconds()) 230 c.updateProbeStatus(p, nil, true, staleErr) 231 hardTimeout = true 232 } 233 } 234 } 235 236 func (c *Collector) updateProbeStatus(p *Probe, data interface{}, stale bool, err error) { 237 // Update stale status of the probe 238 c.Lock() 239 startTime := c.probeStartTime[p.Name] 240 if stale { 241 c.staleProbes[p.Name] = struct{}{} 242 p.consecutiveFailures++ 243 } else { 244 delete(c.staleProbes, p.Name) 245 if err == nil { 246 p.consecutiveFailures = 0 247 } else { 248 p.consecutiveFailures++ 249 } 250 } 251 c.Unlock() 252 253 if stale { 254 log.WithFields(logrus.Fields{ 255 logfields.StartTime: startTime, 256 logfields.Probe: p.Name, 257 }).Warn("Timeout while waiting probe") 258 } 259 260 // Notify the probe about status update 261 p.OnStatusUpdate(Status{Err: err, Data: data, StaleWarning: stale}) 262 }