github.com/fafucoder/cilium@v1.6.11/pkg/controller/controller.go (about)

     1  // Copyright 2018 Authors of Cilium
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package controller
    16  
    17  import (
    18  	"context"
    19  	"errors"
    20  	"fmt"
    21  	"time"
    22  
    23  	"github.com/cilium/cilium/api/v1/models"
    24  	"github.com/cilium/cilium/pkg/lock"
    25  	"github.com/cilium/cilium/pkg/metrics"
    26  	"github.com/cilium/cilium/pkg/option"
    27  
    28  	"github.com/go-openapi/strfmt"
    29  	"github.com/sirupsen/logrus"
    30  )
    31  
    32  const (
    33  	success = "success"
    34  	failure = "failure"
    35  )
    36  
    37  // ControllerFunc is a function that the controller runs. This type is used for
    38  // DoFunc and StopFunc.
    39  type ControllerFunc func(ctx context.Context) error
    40  
    41  // ExitReason is a returnable type from DoFunc that causes the
    42  // controller to exit. This reason is recorded in the controller's status. The
    43  // controller is not removed from any manager.
    44  // Construct one with NewExitReason("a reason")
    45  type ExitReason struct {
    46  	// This is constucted in this odd way because the type assertion in
    47  	// runController didn't work otherwise.
    48  	error
    49  }
    50  
    51  // NewExitReason returns a new ExitReason
    52  func NewExitReason(reason string) ExitReason {
    53  	return ExitReason{errors.New(reason)}
    54  }
    55  
    56  // ControllerParams contains all parameters of a controller
    57  type ControllerParams struct {
    58  	// DoFunc is the function that will be run until it succeeds and/or
    59  	// using the interval RunInterval if not 0.
    60  	// An unset DoFunc is an error and will be logged as one.
    61  	DoFunc ControllerFunc
    62  
    63  	// StopFunc is called when the controller stops. It is intended to run any
    64  	// clean-up tasks for the controller (e.g. deallocate/release resources)
    65  	// It is guaranteed that DoFunc is called at least once before StopFunc is
    66  	// called.
    67  	// An unset StopFunc is not an error (and will be a no-op)
    68  	// Note: Since this occurs on controller exit, error counts and tracking may
    69  	// not be checked after StopFunc is run.
    70  	StopFunc ControllerFunc
    71  
    72  	// If set to any other value than 0, will cause DoFunc to be run in the
    73  	// specified interval. The interval starts from when the DoFunc has
    74  	// returned last
    75  	RunInterval time.Duration
    76  
    77  	// ErrorRetryBaseDuration is the initial time to wait to run DoFunc
    78  	// again on return of an error. On each consecutive error, this value
    79  	// is multiplied by the number of consecutive errors to provide a
    80  	// constant back off. The default is 1s.
    81  	ErrorRetryBaseDuration time.Duration
    82  
    83  	// NoErrorRetry when set to true, disabled retries on errors
    84  	NoErrorRetry bool
    85  }
    86  
    87  // undefinedDoFunc is used when no DoFunc is set. controller.DoFunc is set to this
    88  // when the controller is incorrectly initialised.
    89  func undefinedDoFunc(name string) error {
    90  	return fmt.Errorf("controller %s DoFunc is nil", name)
    91  }
    92  
    93  // NoopFunc is a no-op placeholder for DoFunc & StopFunc.
    94  // It is automatically used when StopFunc is undefined, and can be used as a
    95  // DoFunc stub when the controller should only run StopFunc.
    96  func NoopFunc(ctx context.Context) error {
    97  	return nil
    98  }
    99  
   100  // Controller is a simple pattern that allows to perform the following
   101  // tasks:
   102  //   - Run an operation in the background and retry until it succeeds
   103  //   - Perform a regular sync operation in the background
   104  //
   105  // A controller has configurable retry intervals and will collect statistics
   106  // on number of successful runs, number of failures, last error message,
   107  // and last error timestamp.
   108  //
   109  // Controllers have a name and are tied to a Manager. The manager is typically
   110  // bound to higher level objects such as endpoint. These higher level objects
   111  // can then run multiple controllers to perform async tasks such as:
   112  //  - Annotating k8s resources with values
   113  //  - Synchronizing an object with the kvstore
   114  //  - Any other async operation to may fail and require retries
   115  //
   116  // Embedding the Manager into higher level resources allows to bind controllers
   117  // to the lifetime of that object. Controllers also have a UUID to allow
   118  // correlating all log messages of a controller instance.
   119  //
   120  // Guidelines to writing controllers:
   121  // * Make sure that the task the controller performs is done in an atomic
   122  //   fashion, e.g. if a controller modifies a resource in multiple steps, an
   123  //   intermediate manipulation operation failing should not leave behind
   124  //   an inconsistent state. This can typically be achieved by locking the
   125  //   resource and rolling back or by using transactions.
   126  // * Controllers typically act on behalf of a higher level object such as an
   127  //   endpoint. The controller must ensure that the higher level object is
   128  //   properly locked when accessing any fields.
   129  // * Controllers run asynchronously in the background, it is the responsibility
   130  //   of the controller to be aware of the lifecycle of the owning higher level
   131  //   object. This is typically achieved by removing all controllers when the
   132  //   owner dies. It is the responsibility of the owner to either lock the owner
   133  //   in a way that will delay destruction throughout the controller run or to
   134  //   check for the destruction throughout the run.
   135  type Controller struct {
   136  	mutex             lock.RWMutex
   137  	name              string
   138  	params            ControllerParams
   139  	successCount      int
   140  	lastSuccessStamp  time.Time
   141  	failureCount      int
   142  	consecutiveErrors int
   143  	lastError         error
   144  	lastErrorStamp    time.Time
   145  	lastDuration      time.Duration
   146  	uuid              string
   147  	stop              chan struct{}
   148  	update            chan struct{}
   149  	ctxDoFunc         context.Context
   150  	cancelDoFunc      context.CancelFunc
   151  
   152  	// terminated is closed after the controller has been terminated
   153  	terminated chan struct{}
   154  }
   155  
   156  // GetSuccessCount returns the number of successful controller runs
   157  func (c *Controller) GetSuccessCount() int {
   158  	c.mutex.RLock()
   159  	defer c.mutex.RUnlock()
   160  
   161  	return c.successCount
   162  }
   163  
   164  // GetFailureCount returns the number of failed controller runs
   165  func (c *Controller) GetFailureCount() int {
   166  	c.mutex.RLock()
   167  	defer c.mutex.RUnlock()
   168  
   169  	return c.failureCount
   170  }
   171  
   172  // GetLastError returns the last error returned
   173  func (c *Controller) GetLastError() error {
   174  	c.mutex.RLock()
   175  	defer c.mutex.RUnlock()
   176  
   177  	return c.lastError
   178  }
   179  
   180  // GetLastErrorTimestamp returns the last error returned
   181  func (c *Controller) GetLastErrorTimestamp() time.Time {
   182  	c.mutex.RLock()
   183  	defer c.mutex.RUnlock()
   184  
   185  	return c.lastErrorStamp
   186  }
   187  
   188  func (c *Controller) runController() {
   189  	errorRetries := 1
   190  
   191  	c.mutex.RLock()
   192  	params := c.params
   193  	c.mutex.RUnlock()
   194  	runFunc := true
   195  	interval := 10 * time.Minute
   196  
   197  	for {
   198  		var err error
   199  		if runFunc {
   200  			interval = params.RunInterval
   201  
   202  			start := time.Now()
   203  			err = params.DoFunc(c.ctxDoFunc)
   204  			duration := time.Since(start)
   205  
   206  			c.mutex.Lock()
   207  			c.lastDuration = duration
   208  			c.getLogger().Debug("Controller func execution time: ", c.lastDuration)
   209  
   210  			if err != nil {
   211  				switch err := err.(type) {
   212  				case ExitReason:
   213  					// This is actually not an error case, but it causes an exit
   214  					c.recordSuccess()
   215  					c.lastError = err // This will be shown in the controller status
   216  
   217  					// Don't exit the goroutine, since that only happens when the
   218  					// controller is explicitly stopped. Instead, just wait for
   219  					// the next update.
   220  					c.getLogger().Debug("Controller run succeeded; waiting for next controller update or stop")
   221  					runFunc = false
   222  					interval = 10 * time.Minute
   223  
   224  				default:
   225  					c.getLogger().WithField(fieldConsecutiveErrors, errorRetries).
   226  						WithError(err).Debug("Controller run failed")
   227  					c.recordError(err)
   228  
   229  					if !params.NoErrorRetry {
   230  						if params.ErrorRetryBaseDuration != time.Duration(0) {
   231  							interval = time.Duration(errorRetries) * params.ErrorRetryBaseDuration
   232  						} else {
   233  							interval = time.Duration(errorRetries) * time.Second
   234  						}
   235  
   236  						errorRetries++
   237  					}
   238  				}
   239  			} else {
   240  				c.recordSuccess()
   241  
   242  				// reset error retries after successful attempt
   243  				errorRetries = 1
   244  
   245  				// If no run interval is specified, no further updates
   246  				// are required.
   247  				if interval == time.Duration(0) {
   248  					// Don't exit the goroutine, since that only happens when the
   249  					// controller is explicitly stopped. Instead, just wait for
   250  					// the next update.
   251  					c.getLogger().Debug("Controller run succeeded; waiting for next controller update or stop")
   252  					runFunc = false
   253  					interval = 10 * time.Minute
   254  				}
   255  			}
   256  
   257  			c.mutex.Unlock()
   258  		}
   259  
   260  		select {
   261  		case <-c.stop:
   262  			goto shutdown
   263  
   264  		case <-c.update:
   265  			// If we receive a signal on both channels c.stop and c.update,
   266  			// golang will pick either c.stop or c.update randomly.
   267  			// This select will make sure we don't execute the controller
   268  			// while we are shutting down.
   269  			select {
   270  			case <-c.stop:
   271  				goto shutdown
   272  			default:
   273  			}
   274  			// Pick up any changes to the parameters in case the controller has
   275  			// been updated.
   276  			c.mutex.RLock()
   277  			params = c.params
   278  			c.mutex.RUnlock()
   279  			runFunc = true
   280  
   281  		case <-time.After(interval):
   282  		}
   283  
   284  	}
   285  
   286  shutdown:
   287  	c.getLogger().Debug("Shutting down controller")
   288  
   289  	if err := params.StopFunc(context.TODO()); err != nil {
   290  		c.mutex.Lock()
   291  		c.recordError(err)
   292  		c.mutex.Unlock()
   293  		c.getLogger().WithField(fieldConsecutiveErrors, errorRetries).
   294  			WithError(err).Warn("Error on Controller stop")
   295  	}
   296  
   297  	close(c.terminated)
   298  }
   299  
   300  // updateParamsLocked sets the specified controller's parameters.
   301  //
   302  // If the RunInterval exceeds ControllerMaxInterval, it will be capped.
   303  func (c *Controller) updateParamsLocked(params ControllerParams) {
   304  	c.params = params
   305  
   306  	maxInterval := time.Duration(option.Config.MaxControllerInterval) * time.Second
   307  	if maxInterval > 0 && params.RunInterval > maxInterval {
   308  		c.getLogger().Infof("Limiting interval to %s", maxInterval)
   309  		c.params.RunInterval = maxInterval
   310  	}
   311  }
   312  
   313  func (c *Controller) stopController() {
   314  	if c.cancelDoFunc != nil {
   315  		c.cancelDoFunc()
   316  	}
   317  
   318  	close(c.stop)
   319  	close(c.update)
   320  }
   321  
   322  // logger returns a logrus object with controllerName and UUID fields.
   323  func (c *Controller) getLogger() *logrus.Entry {
   324  	return log.WithFields(logrus.Fields{
   325  		fieldControllerName: c.name,
   326  		fieldUUID:           c.uuid,
   327  	})
   328  }
   329  
   330  // GetStatusModel returns a models.ControllerStatus representing the
   331  // controller's configuration & status
   332  func (c *Controller) GetStatusModel() *models.ControllerStatus {
   333  	c.mutex.RLock()
   334  	defer c.mutex.RUnlock()
   335  
   336  	status := &models.ControllerStatus{
   337  		Name: c.name,
   338  		UUID: strfmt.UUID(c.uuid),
   339  		Configuration: &models.ControllerStatusConfiguration{
   340  			ErrorRetry:     !c.params.NoErrorRetry,
   341  			ErrorRetryBase: strfmt.Duration(c.params.ErrorRetryBaseDuration),
   342  			Interval:       strfmt.Duration(c.params.RunInterval),
   343  		},
   344  		Status: &models.ControllerStatusStatus{
   345  			SuccessCount:            int64(c.successCount),
   346  			LastSuccessTimestamp:    strfmt.DateTime(c.lastSuccessStamp),
   347  			FailureCount:            int64(c.failureCount),
   348  			LastFailureTimestamp:    strfmt.DateTime(c.lastErrorStamp),
   349  			ConsecutiveFailureCount: int64(c.consecutiveErrors),
   350  		},
   351  	}
   352  
   353  	if c.lastError != nil {
   354  		status.Status.LastFailureMsg = c.lastError.Error()
   355  	}
   356  
   357  	return status
   358  }
   359  
   360  // recordError updates all statistic collection variables on error
   361  // c.mutex must be held.
   362  func (c *Controller) recordError(err error) {
   363  	c.lastError = err
   364  	c.lastErrorStamp = time.Now()
   365  	c.failureCount++
   366  	c.consecutiveErrors++
   367  	metrics.ControllerRuns.WithLabelValues(failure).Inc()
   368  	metrics.ControllerRunsDuration.WithLabelValues(failure).Observe(c.lastDuration.Seconds())
   369  }
   370  
   371  // recordSuccess updates all statistic collection variables on success
   372  // c.mutex must be held.
   373  func (c *Controller) recordSuccess() {
   374  	c.lastError = nil
   375  	c.lastSuccessStamp = time.Now()
   376  	c.successCount++
   377  	c.consecutiveErrors = 0
   378  
   379  	metrics.ControllerRuns.WithLabelValues(success).Inc()
   380  	metrics.ControllerRunsDuration.WithLabelValues(success).Observe(c.lastDuration.Seconds())
   381  }