github.com/cilium/cilium@v1.16.2/pkg/status/status.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright Authors of Cilium 3 4 package status 5 6 import ( 7 "compress/gzip" 8 "context" 9 "fmt" 10 "os" 11 "runtime/pprof" 12 "sync/atomic" 13 14 "github.com/sirupsen/logrus" 15 16 "github.com/cilium/cilium/pkg/defaults" 17 "github.com/cilium/cilium/pkg/inctimer" 18 "github.com/cilium/cilium/pkg/lock" 19 "github.com/cilium/cilium/pkg/logging" 20 "github.com/cilium/cilium/pkg/logging/logfields" 21 "github.com/cilium/cilium/pkg/time" 22 ) 23 24 const ( 25 subsystem = "status" 26 ) 27 28 var ( 29 log = logging.DefaultLogger.WithField(logfields.LogSubsys, subsystem) 30 ) 31 32 // Status is passed to a probe when its state changes 33 type Status struct { 34 // Data is non-nil when the probe has completed successfully. Data is 35 // set to the value returned by Probe() 36 Data interface{} 37 38 // Err is non-nil if either the probe file or the Failure or Warning 39 // threshold has been reached 40 Err error 41 42 // StaleWarning is true once the WarningThreshold has been reached 43 StaleWarning bool 44 } 45 46 // Probe is run by the collector at a particular interval between invocations 47 type Probe struct { 48 Name string 49 50 Probe func(ctx context.Context) (interface{}, error) 51 52 // OnStatusUpdate is called whenever the status of the probe changes 53 OnStatusUpdate func(status Status) 54 55 // Interval allows to specify a probe specific interval that can be 56 // mutated based on whether the probe is failing or based on external 57 // factors such as current cluster size 58 Interval func(failures int) time.Duration 59 60 // consecutiveFailures is the number of consecutive failures in the 61 // probe becoming stale or failing. It is managed by 62 // updateProbeStatus() 63 consecutiveFailures int 64 } 65 66 // Collector concurrently runs probes used to check status of various subsystems 67 type Collector struct { 68 lock.RWMutex // protects staleProbes and probeStartTime 69 config Config 70 stop chan struct{} 71 staleProbes map[string]struct{} 72 probeStartTime map[string]time.Time 73 74 // lastStackdumpTime is the last time we dumped stack; only do it 75 // every 5 minutes so we don't waste resources. 76 lastStackdumpTime atomic.Int64 77 } 78 79 // Config is the collector configuration 80 type Config struct { 81 WarningThreshold time.Duration 82 FailureThreshold time.Duration 83 Interval time.Duration 84 StackdumpPath string 85 } 86 87 // NewCollector creates a collector and starts the given probes. 88 // 89 // Each probe runs in a separate goroutine. 90 func NewCollector(probes []Probe, config Config) *Collector { 91 c := &Collector{ 92 config: config, 93 stop: make(chan struct{}), 94 staleProbes: make(map[string]struct{}), 95 probeStartTime: make(map[string]time.Time), 96 } 97 98 if c.config.Interval == time.Duration(0) { 99 c.config.Interval = defaults.StatusCollectorInterval 100 } 101 102 if c.config.FailureThreshold == time.Duration(0) { 103 c.config.FailureThreshold = defaults.StatusCollectorFailureThreshold 104 } 105 106 if c.config.WarningThreshold == time.Duration(0) { 107 c.config.WarningThreshold = defaults.StatusCollectorWarningThreshold 108 } 109 110 for i := range probes { 111 c.spawnProbe(&probes[i]) 112 } 113 114 return c 115 } 116 117 // Close exits all probes and shuts down the collector 118 func (c *Collector) Close() { 119 close(c.stop) 120 } 121 122 // GetStaleProbes returns a map of stale probes which key is a probe name and 123 // value is a time when the last instance of the probe has been started. 124 // 125 // A probe is declared stale if it hasn't returned in FailureThreshold. 126 func (c *Collector) GetStaleProbes() map[string]time.Time { 127 c.RLock() 128 defer c.RUnlock() 129 130 probes := make(map[string]time.Time, len(c.staleProbes)) 131 132 for p := range c.staleProbes { 133 probes[p] = c.probeStartTime[p] 134 } 135 136 return probes 137 } 138 139 // spawnProbe starts a goroutine which invokes the probe at the particular interval. 140 func (c *Collector) spawnProbe(p *Probe) { 141 go func() { 142 timer, stopTimer := inctimer.New() 143 defer stopTimer() 144 for { 145 c.runProbe(p) 146 147 interval := c.config.Interval 148 if p.Interval != nil { 149 interval = p.Interval(p.consecutiveFailures) 150 } 151 select { 152 case <-c.stop: 153 // collector is closed, stop looping 154 return 155 case <-timer.After(interval): 156 // keep looping 157 } 158 } 159 }() 160 } 161 162 // runProbe runs the given probe, and returns either after the probe has returned 163 // or after the collector has been closed. 164 func (c *Collector) runProbe(p *Probe) { 165 var ( 166 statusData interface{} 167 err error 168 warningThreshold = time.After(c.config.WarningThreshold) 169 hardTimeout = false 170 probeReturned = make(chan struct{}, 1) 171 ctx, cancel = context.WithTimeout(context.Background(), c.config.FailureThreshold) 172 ctxTimeout = make(chan struct{}, 1) 173 ) 174 175 c.Lock() 176 c.probeStartTime[p.Name] = time.Now() 177 c.Unlock() 178 179 go func() { 180 statusData, err = p.Probe(ctx) 181 close(probeReturned) 182 }() 183 184 go func() { 185 // Once ctx.Done() has been closed, we notify the polling loop by 186 // sending to the ctxTimeout channel. We cannot just close the channel, 187 // because otherwise the loop will always enter the "<-ctxTimeout" case. 188 <-ctx.Done() 189 ctxTimeout <- struct{}{} 190 }() 191 192 // This is a loop so that, when we hit a FailureThreshold, we still do 193 // not return until the probe returns. This is to ensure the same probe 194 // does not run again while it is blocked. 195 for { 196 select { 197 case <-c.stop: 198 // Collector was closed. The probe will complete in the background 199 // and won't be restarted again. 200 cancel() 201 return 202 203 case <-warningThreshold: 204 // Just warn and continue waiting for probe 205 log.WithField(logfields.Probe, p.Name). 206 Warnf("No response from probe within %v seconds", 207 c.config.WarningThreshold.Seconds()) 208 209 case <-probeReturned: 210 // The probe completed and we can return from runProbe 211 switch { 212 case hardTimeout: 213 // FailureThreshold was already reached. Keep the failure error 214 // message 215 case err != nil: 216 c.updateProbeStatus(p, nil, false, err) 217 default: 218 c.updateProbeStatus(p, statusData, false, nil) 219 } 220 221 cancel() 222 return 223 224 case <-ctxTimeout: 225 // We have timed out. Report a status and mark that we timed out so we 226 // do not emit status later. 227 staleErr := fmt.Errorf("no response from %s probe within %v seconds", 228 p.Name, c.config.FailureThreshold.Seconds()) 229 c.updateProbeStatus(p, nil, true, staleErr) 230 hardTimeout = true 231 } 232 } 233 } 234 235 func (c *Collector) updateProbeStatus(p *Probe, data interface{}, stale bool, err error) { 236 // Update stale status of the probe 237 c.Lock() 238 startTime := c.probeStartTime[p.Name] 239 if stale { 240 c.staleProbes[p.Name] = struct{}{} 241 p.consecutiveFailures++ 242 } else { 243 delete(c.staleProbes, p.Name) 244 if err == nil { 245 p.consecutiveFailures = 0 246 } else { 247 p.consecutiveFailures++ 248 } 249 } 250 c.Unlock() 251 252 if stale { 253 log.WithFields(logrus.Fields{ 254 logfields.StartTime: startTime, 255 logfields.Probe: p.Name, 256 }).Warn("Timeout while waiting probe") 257 258 // We just had a probe time out. This is commonly caused by a deadlock. 259 // So, capture a stack dump to aid in debugging. 260 go c.maybeDumpStack() 261 } 262 263 // Notify the probe about status update 264 p.OnStatusUpdate(Status{Err: err, Data: data, StaleWarning: stale}) 265 } 266 267 // maybeDumpStack dumps the goroutine stack to a file on disk (usually in /run/cilium/state) 268 // if one hasn't been written in the past 5 minutes. 269 // This is triggered if a collector is stale, which can be caused by deadlocks. 270 func (c *Collector) maybeDumpStack() { 271 if c.config.StackdumpPath == "" { 272 return 273 } 274 275 now := time.Now().Unix() 276 before := c.lastStackdumpTime.Load() 277 if now-before < 5*60 { 278 return 279 } 280 swapped := c.lastStackdumpTime.CompareAndSwap(before, now) 281 if !swapped { 282 return 283 } 284 285 profile := pprof.Lookup("goroutine") 286 if profile == nil { 287 return 288 } 289 290 out, err := os.Create(c.config.StackdumpPath) 291 if err != nil { 292 log.WithError(err).WithField("path", c.config.StackdumpPath).Warn("Failed to write stack dump") 293 } 294 defer out.Close() 295 gzout := gzip.NewWriter(out) 296 defer gzout.Close() 297 298 profile.WriteTo(gzout, 2) // 2: print same stack format as panic 299 log.Infof("Wrote stack dump to %s", c.config.StackdumpPath) 300 }