// Copyright 2018 Authors of Cilium
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package controller

import (
	"context"
	"errors"
	"fmt"
	"time"

	"github.com/cilium/cilium/api/v1/models"
	"github.com/cilium/cilium/pkg/lock"
	"github.com/cilium/cilium/pkg/metrics"
	"github.com/cilium/cilium/pkg/option"

	"github.com/go-openapi/strfmt"
	"github.com/sirupsen/logrus"
)

// Metric label values used when recording controller run results in
// recordSuccess and recordError.
const (
	success = "success"
	failure = "failure"
)

// ControllerFunc is a function that the controller runs. This type is used for
// DoFunc and StopFunc.
type ControllerFunc func(ctx context.Context) error

// ExitReason is a returnable type from DoFunc that causes the
// controller to exit. This reason is recorded in the controller's status. The
// controller is not removed from any manager.
// Construct one with NewExitReason("a reason")
type ExitReason struct {
	// This is constructed in this odd way because the type assertion in
	// runController didn't work otherwise.
	error
}

// NewExitReason returns a new ExitReason wrapping the given reason string.
func NewExitReason(reason string) ExitReason {
	return ExitReason{errors.New(reason)}
}

// ControllerParams contains all parameters of a controller
type ControllerParams struct {
	// DoFunc is the function that will be run until it succeeds and/or
	// using the interval RunInterval if not 0.
	// An unset DoFunc is an error and will be logged as one.
	DoFunc ControllerFunc

	// StopFunc is called when the controller stops. It is intended to run any
	// clean-up tasks for the controller (e.g. deallocate/release resources)
	// It is guaranteed that DoFunc is called at least once before StopFunc is
	// called.
	// An unset StopFunc is not an error (and will be a no-op)
	// Note: Since this occurs on controller exit, error counts and tracking may
	// not be checked after StopFunc is run.
	StopFunc ControllerFunc

	// If set to any other value than 0, will cause DoFunc to be run in the
	// specified interval. The interval starts from when the DoFunc has
	// returned last
	RunInterval time.Duration

	// ErrorRetryBaseDuration is the initial time to wait to run DoFunc
	// again on return of an error. On each consecutive error, this value
	// is multiplied by the number of consecutive errors to provide a
	// linear back off. The default is 1s.
	ErrorRetryBaseDuration time.Duration

	// NoErrorRetry when set to true, disables retries on errors
	NoErrorRetry bool
}

// undefinedDoFunc is used when no DoFunc is set. controller.DoFunc is set to this
// when the controller is incorrectly initialised.
func undefinedDoFunc(name string) error {
	return fmt.Errorf("controller %s DoFunc is nil", name)
}

// NoopFunc is a no-op placeholder for DoFunc & StopFunc.
// It is automatically used when StopFunc is undefined, and can be used as a
// DoFunc stub when the controller should only run StopFunc.
func NoopFunc(ctx context.Context) error {
	return nil
}

// Controller is a simple pattern that allows to perform the following
// tasks:
//   - Run an operation in the background and retry until it succeeds
//   - Perform a regular sync operation in the background
//
// A controller has configurable retry intervals and will collect statistics
// on number of successful runs, number of failures, last error message,
// and last error timestamp.
//
// Controllers have a name and are tied to a Manager. The manager is typically
// bound to higher level objects such as endpoint. These higher level objects
// can then run multiple controllers to perform async tasks such as:
//   - Annotating k8s resources with values
//   - Synchronizing an object with the kvstore
//   - Any other async operation to may fail and require retries
//
// Embedding the Manager into higher level resources allows to bind controllers
// to the lifetime of that object. Controllers also have a UUID to allow
// correlating all log messages of a controller instance.
//
// Guidelines to writing controllers:
//   - Make sure that the task the controller performs is done in an atomic
//     fashion, e.g. if a controller modifies a resource in multiple steps, an
//     intermediate manipulation operation failing should not leave behind
//     an inconsistent state. This can typically be achieved by locking the
//     resource and rolling back or by using transactions.
//   - Controllers typically act on behalf of a higher level object such as an
//     endpoint. The controller must ensure that the higher level object is
//     properly locked when accessing any fields.
//   - Controllers run asynchronously in the background, it is the
//     responsibility of the controller to be aware of the lifecycle of the
//     owning higher level object.
// This is typically achieved by removing all controllers when the
// owner dies. It is the responsibility of the owner to either lock the owner
// in a way that will delay destruction throughout the controller run or to
// check for the destruction throughout the run.
type Controller struct {
	// mutex protects the parameter and statistics fields below; the getters
	// take it for reading and runController takes it for writing.
	mutex lock.RWMutex

	name   string
	params ControllerParams

	// Run statistics, updated by recordSuccess/recordError.
	successCount      int           // number of successful DoFunc runs
	lastSuccessStamp  time.Time     // time of the most recent success
	failureCount      int           // total number of failed DoFunc runs
	consecutiveErrors int           // failures since the last success
	lastError         error         // error of the most recent failed run (nil after success)
	lastErrorStamp    time.Time     // time of the most recent failure
	lastDuration      time.Duration // duration of the last DoFunc execution

	// uuid allows correlating all log messages of this controller instance.
	uuid string

	// stop is closed to request controller shutdown; update is signaled when
	// the parameters have been updated.
	stop   chan struct{}
	update chan struct{}

	// ctxDoFunc is passed to DoFunc on each run; cancelDoFunc cancels it on
	// stop.
	ctxDoFunc    context.Context
	cancelDoFunc context.CancelFunc

	// terminated is closed after the controller has been terminated
	terminated chan struct{}
}

// GetSuccessCount returns the number of successful controller runs
func (c *Controller) GetSuccessCount() int {
	c.mutex.RLock()
	defer c.mutex.RUnlock()

	return c.successCount
}

// GetFailureCount returns the number of failed controller runs
func (c *Controller) GetFailureCount() int {
	c.mutex.RLock()
	defer c.mutex.RUnlock()

	return c.failureCount
}

// GetLastError returns the last error returned
func (c *Controller) GetLastError() error {
	c.mutex.RLock()
	defer c.mutex.RUnlock()

	return c.lastError
}

// GetLastErrorTimestamp returns the timestamp of the last error returned
func (c *Controller) GetLastErrorTimestamp() time.Time {
	c.mutex.RLock()
	defer c.mutex.RUnlock()

	return c.lastErrorStamp
}

// runController is the main loop of a controller goroutine. It repeatedly
// invokes DoFunc, records success/failure statistics, computes the next
// run/retry interval, and reacts to stop and update signals. It returns only
// after the stop channel is closed, at which point StopFunc is run once and
// the terminated channel is closed.
func (c *Controller) runController() {
	errorRetries := 1

	c.mutex.RLock()
	params := c.params
	c.mutex.RUnlock()
	runFunc := true
	// Default wake-up interval while idle (no DoFunc scheduled); the loop
	// still wakes periodically so the select below cannot block forever.
	interval := 10 * time.Minute

	for {
		var err error
		if runFunc {
			interval = params.RunInterval

			start := time.Now()
			err = params.DoFunc(c.ctxDoFunc)
			duration := time.Since(start)

			c.mutex.Lock()
			c.lastDuration = duration
			c.getLogger().Debug("Controller func execution time: ", c.lastDuration)

			if err != nil {
				switch err := err.(type) {
				case ExitReason:
					// This is actually not an error case, but it causes an exit
					c.recordSuccess()
					c.lastError = err // This will be shown in the controller status

					// Don't exit the goroutine, since that only happens when the
					// controller is explicitly stopped. Instead, just wait for
					// the next update.
					c.getLogger().Debug("Controller run succeeded; waiting for next controller update or stop")
					runFunc = false
					interval = 10 * time.Minute

				default:
					c.getLogger().WithField(fieldConsecutiveErrors, errorRetries).
						WithError(err).Debug("Controller run failed")
					c.recordError(err)

					// Linear back off: retry count times the base duration
					// (1s when ErrorRetryBaseDuration is unset).
					if !params.NoErrorRetry {
						if params.ErrorRetryBaseDuration != time.Duration(0) {
							interval = time.Duration(errorRetries) * params.ErrorRetryBaseDuration
						} else {
							interval = time.Duration(errorRetries) * time.Second
						}

						errorRetries++
					}
				}
			} else {
				c.recordSuccess()

				// reset error retries after successful attempt
				errorRetries = 1

				// If no run interval is specified, no further updates
				// are required.
				if interval == time.Duration(0) {
					// Don't exit the goroutine, since that only happens when the
					// controller is explicitly stopped. Instead, just wait for
					// the next update.
					c.getLogger().Debug("Controller run succeeded; waiting for next controller update or stop")
					runFunc = false
					interval = 10 * time.Minute
				}
			}

			c.mutex.Unlock()
		}

		select {
		case <-c.stop:
			goto shutdown

		case <-c.update:
			// If we receive a signal on both channels c.stop and c.update,
			// golang will pick either c.stop or c.update randomly.
			// This select will make sure we don't execute the controller
			// while we are shutting down.
			select {
			case <-c.stop:
				goto shutdown
			default:
			}
			// Pick up any changes to the parameters in case the controller has
			// been updated.
			c.mutex.RLock()
			params = c.params
			c.mutex.RUnlock()
			runFunc = true

		case <-time.After(interval):
		}

	}

shutdown:
	c.getLogger().Debug("Shutting down controller")

	// StopFunc failures are recorded in the statistics, but note that this
	// happens after shutdown has begun, so callers may not observe them.
	if err := params.StopFunc(context.TODO()); err != nil {
		c.mutex.Lock()
		c.recordError(err)
		c.mutex.Unlock()
		c.getLogger().WithField(fieldConsecutiveErrors, errorRetries).
			WithError(err).Warn("Error on Controller stop")
	}

	close(c.terminated)
}

// updateParamsLocked sets the specified controller's parameters.
//
// If the RunInterval exceeds ControllerMaxInterval, it will be capped.
func (c *Controller) updateParamsLocked(params ControllerParams) {
	c.params = params

	maxInterval := time.Duration(option.Config.MaxControllerInterval) * time.Second
	if maxInterval > 0 && params.RunInterval > maxInterval {
		c.getLogger().Infof("Limiting interval to %s", maxInterval)
		c.params.RunInterval = maxInterval
	}
}

// stopController cancels the DoFunc context (if one was set up) and closes
// the stop and update channels, causing runController to exit.
// NOTE(review): closing the channels is not idempotent — presumably the
// Manager guarantees this is called at most once per controller; verify
// against the caller.
func (c *Controller) stopController() {
	if c.cancelDoFunc != nil {
		c.cancelDoFunc()
	}

	close(c.stop)
	close(c.update)
}
323 func (c *Controller) getLogger() *logrus.Entry { 324 return log.WithFields(logrus.Fields{ 325 fieldControllerName: c.name, 326 fieldUUID: c.uuid, 327 }) 328 } 329 330 // GetStatusModel returns a models.ControllerStatus representing the 331 // controller's configuration & status 332 func (c *Controller) GetStatusModel() *models.ControllerStatus { 333 c.mutex.RLock() 334 defer c.mutex.RUnlock() 335 336 status := &models.ControllerStatus{ 337 Name: c.name, 338 UUID: strfmt.UUID(c.uuid), 339 Configuration: &models.ControllerStatusConfiguration{ 340 ErrorRetry: !c.params.NoErrorRetry, 341 ErrorRetryBase: strfmt.Duration(c.params.ErrorRetryBaseDuration), 342 Interval: strfmt.Duration(c.params.RunInterval), 343 }, 344 Status: &models.ControllerStatusStatus{ 345 SuccessCount: int64(c.successCount), 346 LastSuccessTimestamp: strfmt.DateTime(c.lastSuccessStamp), 347 FailureCount: int64(c.failureCount), 348 LastFailureTimestamp: strfmt.DateTime(c.lastErrorStamp), 349 ConsecutiveFailureCount: int64(c.consecutiveErrors), 350 }, 351 } 352 353 if c.lastError != nil { 354 status.Status.LastFailureMsg = c.lastError.Error() 355 } 356 357 return status 358 } 359 360 // recordError updates all statistic collection variables on error 361 // c.mutex must be held. 362 func (c *Controller) recordError(err error) { 363 c.lastError = err 364 c.lastErrorStamp = time.Now() 365 c.failureCount++ 366 c.consecutiveErrors++ 367 metrics.ControllerRuns.WithLabelValues(failure).Inc() 368 metrics.ControllerRunsDuration.WithLabelValues(failure).Observe(c.lastDuration.Seconds()) 369 } 370 371 // recordSuccess updates all statistic collection variables on success 372 // c.mutex must be held. 373 func (c *Controller) recordSuccess() { 374 c.lastError = nil 375 c.lastSuccessStamp = time.Now() 376 c.successCount++ 377 c.consecutiveErrors = 0 378 379 metrics.ControllerRuns.WithLabelValues(success).Inc() 380 metrics.ControllerRunsDuration.WithLabelValues(success).Observe(c.lastDuration.Seconds()) 381 }