github.com/looshlee/cilium@v1.6.12/daemon/status.go

github.com/looshlee/cilium@v1.6.12/daemon/status.go (about)

     1  // Copyright 2016-2019 Authors of Cilium
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package main
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"math/rand"
    21  	"time"
    22  
    23  	"github.com/cilium/cilium/api/v1/models"
    24  	. "github.com/cilium/cilium/api/v1/server/restapi/daemon"
    25  	"github.com/cilium/cilium/pkg/backoff"
    26  	"github.com/cilium/cilium/pkg/controller"
    27  	"github.com/cilium/cilium/pkg/datapath"
    28  	"github.com/cilium/cilium/pkg/k8s"
    29  	k8smetrics "github.com/cilium/cilium/pkg/k8s/metrics"
    30  	"github.com/cilium/cilium/pkg/kvstore"
    31  	"github.com/cilium/cilium/pkg/lock"
    32  	"github.com/cilium/cilium/pkg/node"
    33  	"github.com/cilium/cilium/pkg/option"
    34  	"github.com/cilium/cilium/pkg/status"
    35  	"github.com/cilium/cilium/pkg/workloads"
    36  
    37  	"github.com/go-openapi/runtime/middleware"
    38  	"github.com/go-openapi/strfmt"
    39  	versionapi "k8s.io/apimachinery/pkg/version"
    40  )
    41  
    42  const (
    43  	// k8sVersionCheckInterval is the interval in which the Kubernetes
    44  	// version is verified even if connectivity is given
    45  	k8sVersionCheckInterval = 15 * time.Minute
    46  
    47  	// k8sMinimumEventHearbeat is the time interval in which any received
    48  	// event will be considered proof that the apiserver connectivity is
    49  	// healthy
    50  	k8sMinimumEventHearbeat = time.Minute
    51  )
    52  
    53  func init() {
    54  	rand.Seed(time.Now().UnixNano())
    55  }
    56  
    57  type k8sVersion struct {
    58  	version          string
    59  	lastVersionCheck time.Time
    60  	lock             lock.Mutex
    61  }
    62  
    63  func (k *k8sVersion) cachedVersion() (string, bool) {
    64  	k.lock.Lock()
    65  	defer k.lock.Unlock()
    66  
    67  	if time.Since(k8smetrics.LastInteraction.Time()) > k8sMinimumEventHearbeat {
    68  		return "", false
    69  	}
    70  
    71  	if k.version == "" || time.Since(k.lastVersionCheck) > k8sVersionCheckInterval {
    72  		return "", false
    73  	}
    74  
    75  	return k.version, true
    76  }
    77  
    78  func (k *k8sVersion) update(version *versionapi.Info) string {
    79  	k.lock.Lock()
    80  	defer k.lock.Unlock()
    81  
    82  	k.version = fmt.Sprintf("%s.%s (%s) [%s]", version.Major, version.Minor, version.GitVersion, version.Platform)
    83  	k.lastVersionCheck = time.Now()
    84  	return k.version
    85  }
    86  
    87  var k8sVersionCache k8sVersion
    88  
    89  func (d *Daemon) getK8sStatus() *models.K8sStatus {
    90  	if !k8s.IsEnabled() {
    91  		return &models.K8sStatus{State: models.StatusStateDisabled}
    92  	}
    93  
    94  	version, valid := k8sVersionCache.cachedVersion()
    95  	if !valid {
    96  		k8sVersion, err := k8s.Client().Discovery().ServerVersion()
    97  		if err != nil {
    98  			return &models.K8sStatus{State: models.StatusStateFailure, Msg: err.Error()}
    99  		}
   100  
   101  		version = k8sVersionCache.update(k8sVersion)
   102  	}
   103  
   104  	k8sStatus := &models.K8sStatus{
   105  		State:          models.StatusStateOk,
   106  		Msg:            version,
   107  		K8sAPIVersions: d.k8sAPIGroups.getGroups(),
   108  	}
   109  
   110  	return k8sStatus
   111  }
   112  
   113  type getHealthz struct {
   114  	daemon *Daemon
   115  }
   116  
   117  func NewGetHealthzHandler(d *Daemon) GetHealthzHandler {
   118  	return &getHealthz{daemon: d}
   119  }
   120  
   121  func checkLocks(d *Daemon) {
   122  	// Try to acquire a couple of global locks to have the status API fail
   123  	// in case of a deadlock on these locks
   124  
   125  	option.Config.ConfigPatchMutex.Lock()
   126  	option.Config.ConfigPatchMutex.Unlock()
   127  }
   128  
   129  func (d *Daemon) getNodeStatus() *models.ClusterStatus {
   130  	clusterStatus := models.ClusterStatus{
   131  		Self: d.nodeDiscovery.LocalNode.Fullname(),
   132  	}
   133  	for _, node := range d.nodeDiscovery.Manager.GetNodes() {
   134  		clusterStatus.Nodes = append(clusterStatus.Nodes, node.GetModel())
   135  	}
   136  	return &clusterStatus
   137  }
   138  
   139  func (h *getHealthz) Handle(params GetHealthzParams) middleware.Responder {
   140  	brief := params.Brief != nil && *params.Brief
   141  	sr := h.daemon.getStatus(brief)
   142  
   143  	return NewGetHealthzOK().WithPayload(&sr)
   144  }
   145  
   146  type getNodes struct {
   147  	d *Daemon
   148  	// mutex to protect the clients map against concurrent access
   149  	lock.RWMutex
   150  	// clients maps a client ID to a clusterNodesClient
   151  	clients map[int64]*clusterNodesClient
   152  }
   153  
   154  func NewGetClusterNodesHandler(d *Daemon) GetClusterNodesHandler {
   155  	return &getNodes{
   156  		d:       d,
   157  		clients: map[int64]*clusterNodesClient{},
   158  	}
   159  }
   160  
   161  // clientGCTimeout is the time for which the clients are kept. After timeout
   162  // is reached, clients will be cleaned up.
   163  const clientGCTimeout = 15 * time.Minute
   164  
   165  type clusterNodesClient struct {
   166  	// mutex to protect the client against concurrent access
   167  	lock.RWMutex
   168  	lastSync time.Time
   169  	*models.ClusterNodeStatus
   170  }
   171  
   172  func (c *clusterNodesClient) NodeAdd(newNode node.Node) error {
   173  	c.Lock()
   174  	c.NodesAdded = append(c.NodesAdded, newNode.GetModel())
   175  	c.Unlock()
   176  	return nil
   177  }
   178  
   179  func (c *clusterNodesClient) NodeUpdate(oldNode, newNode node.Node) error {
   180  	c.Lock()
   181  	c.NodesAdded = append(c.NodesAdded, newNode.GetModel())
   182  	c.NodesRemoved = append(c.NodesRemoved, oldNode.GetModel())
   183  	c.Unlock()
   184  	return nil
   185  }
   186  
   187  func (c *clusterNodesClient) NodeDelete(node node.Node) error {
   188  	c.Lock()
   189  	// If the node was added/updated and removed before the clusterNodesClient
   190  	// was aware of it then we can safely remove it from the list of added
   191  	// nodes and not set it in the list of removed nodes.
   192  	found := -1
   193  	for i, added := range c.NodesAdded {
   194  		if added.Name == node.Fullname() {
   195  			found = i
   196  		}
   197  	}
   198  	if found != -1 {
   199  		c.NodesAdded = append(c.NodesAdded[:found], c.NodesAdded[found+1:]...)
   200  	} else {
   201  		c.NodesRemoved = append(c.NodesRemoved, node.GetModel())
   202  	}
   203  	c.Unlock()
   204  	return nil
   205  }
   206  
   207  func (c *clusterNodesClient) NodeValidateImplementation(node node.Node) error {
   208  	// no-op
   209  	return nil
   210  }
   211  
   212  func (c *clusterNodesClient) NodeConfigurationChanged(config datapath.LocalNodeConfiguration) error {
   213  	// no-op
   214  	return nil
   215  }
   216  
   217  func (h *getNodes) cleanupClients() {
   218  	past := time.Now().Add(-clientGCTimeout)
   219  	for k, v := range h.clients {
   220  		if v.lastSync.Before(past) {
   221  			h.d.nodeDiscovery.Manager.Unsubscribe(v)
   222  			delete(h.clients, k)
   223  		}
   224  	}
   225  }
   226  
   227  func (h *getNodes) Handle(params GetClusterNodesParams) middleware.Responder {
   228  	var cns *models.ClusterNodeStatus
   229  	// If ClientID is not set then we send all nodes, otherwise we will store
   230  	// the client ID in the list of clients and we subscribe this new client
   231  	// to the list of clients.
   232  	if params.ClientID == nil {
   233  		ns := h.d.getNodeStatus()
   234  		cns = &models.ClusterNodeStatus{
   235  			Self:       ns.Self,
   236  			NodesAdded: ns.Nodes,
   237  		}
   238  		return NewGetClusterNodesOK().WithPayload(cns)
   239  	}
   240  
   241  	h.Lock()
   242  	defer h.Unlock()
   243  
   244  	var clientID int64
   245  	c, exists := h.clients[*params.ClientID]
   246  	if exists {
   247  		clientID = *params.ClientID
   248  	} else {
   249  		clientID = rand.Int63()
   250  		// make sure we haven't allocated an existing client ID nor the
   251  		// randomizer has allocated ID 0, if we have then we will return
   252  		// clientID 0.
   253  		_, exists := h.clients[clientID]
   254  		if exists || clientID == 0 {
   255  			ns := h.d.getNodeStatus()
   256  			cns = &models.ClusterNodeStatus{
   257  				ClientID:   0,
   258  				Self:       ns.Self,
   259  				NodesAdded: ns.Nodes,
   260  			}
   261  			return NewGetClusterNodesOK().WithPayload(cns)
   262  		}
   263  		c = &clusterNodesClient{
   264  			lastSync: time.Now(),
   265  			ClusterNodeStatus: &models.ClusterNodeStatus{
   266  				ClientID: clientID,
   267  				Self:     h.d.nodeDiscovery.LocalNode.Fullname(),
   268  			},
   269  		}
   270  		h.d.nodeDiscovery.Manager.Subscribe(c)
   271  
   272  		// Clean up other clients before adding a new one
   273  		h.cleanupClients()
   274  		h.clients[clientID] = c
   275  	}
   276  	c.Lock()
   277  	// Copy the ClusterNodeStatus to the response
   278  	cns = c.ClusterNodeStatus
   279  	// Store a new ClusterNodeStatus to reset the list of nodes
   280  	// added / removed.
   281  	c.ClusterNodeStatus = &models.ClusterNodeStatus{
   282  		ClientID: clientID,
   283  		Self:     h.d.nodeDiscovery.LocalNode.Fullname(),
   284  	}
   285  	c.lastSync = time.Now()
   286  	c.Unlock()
   287  
   288  	return NewGetClusterNodesOK().WithPayload(cns)
   289  }
   290  
   291  // getStatus returns the daemon status. If brief is provided a minimal version
   292  // of the StatusResponse is provided.
   293  func (d *Daemon) getStatus(brief bool) models.StatusResponse {
   294  	staleProbes := d.statusCollector.GetStaleProbes()
   295  	stale := make(map[string]strfmt.DateTime, len(staleProbes))
   296  	for probe, startTime := range staleProbes {
   297  		stale[probe] = strfmt.DateTime(startTime)
   298  	}
   299  
   300  	d.statusCollectMutex.RLock()
   301  	defer d.statusCollectMutex.RUnlock()
   302  
   303  	var sr models.StatusResponse
   304  	if brief {
   305  		csCopy := new(models.ClusterStatus)
   306  		if d.statusResponse.Cluster != nil && d.statusResponse.Cluster.CiliumHealth != nil {
   307  			in, out := &d.statusResponse.Cluster.CiliumHealth, &csCopy.CiliumHealth
   308  			*out = new(models.Status)
   309  			**out = **in
   310  		}
   311  		var minimalControllers models.ControllerStatuses
   312  		if d.statusResponse.Controllers != nil {
   313  			for _, c := range d.statusResponse.Controllers {
   314  				if c.Status == nil {
   315  					continue
   316  				}
   317  				// With brief, the client should only care if a single controller
   318  				// is failing and its status so we don't need to continuing
   319  				// checking for failure messages for the remaining controllers.
   320  				if c.Status.LastFailureMsg != "" {
   321  					minimalControllers = append(minimalControllers, c.DeepCopy())
   322  					break
   323  				}
   324  			}
   325  		}
   326  		sr = models.StatusResponse{
   327  			Cluster:     csCopy,
   328  			Controllers: minimalControllers,
   329  		}
   330  	} else {
   331  		// d.statusResponse contains references, so we do a deep copy to be able to
   332  		// safely use sr after the method has returned
   333  		sr = *d.statusResponse.DeepCopy()
   334  	}
   335  
   336  	sr.Stale = stale
   337  
   338  	switch {
   339  	case len(sr.Stale) > 0:
   340  		sr.Cilium = &models.Status{
   341  			State: models.StatusStateWarning,
   342  			Msg:   "Stale status data",
   343  		}
   344  	case d.statusResponse.Kvstore != nil && d.statusResponse.Kvstore.State != models.StatusStateOk:
   345  		sr.Cilium = &models.Status{
   346  			State: d.statusResponse.Kvstore.State,
   347  			Msg:   "Kvstore service is not ready",
   348  		}
   349  	case d.statusResponse.ContainerRuntime != nil && d.statusResponse.ContainerRuntime.State != models.StatusStateOk:
   350  		msg := "Container runtime is not ready"
   351  		if d.statusResponse.ContainerRuntime.State == models.StatusStateDisabled {
   352  			msg = "Container runtime is disabled"
   353  		}
   354  		sr.Cilium = &models.Status{
   355  			State: d.statusResponse.ContainerRuntime.State,
   356  			Msg:   msg,
   357  		}
   358  	case k8s.IsEnabled() && d.statusResponse.Kubernetes != nil && d.statusResponse.Kubernetes.State != models.StatusStateOk:
   359  		sr.Cilium = &models.Status{
   360  			State: d.statusResponse.Kubernetes.State,
   361  			Msg:   "Kubernetes service is not ready",
   362  		}
   363  	default:
   364  		sr.Cilium = &models.Status{State: models.StatusStateOk, Msg: "OK"}
   365  	}
   366  
   367  	return sr
   368  }
   369  
   370  func (d *Daemon) startStatusCollector() {
   371  	probes := []status.Probe{
   372  		{
   373  			Name: "check-locks",
   374  			Probe: func(ctx context.Context) (interface{}, error) {
   375  				// Try to acquire a couple of global locks to have the status API fail
   376  				// in case of a deadlock on these locks
   377  				option.Config.ConfigPatchMutex.Lock()
   378  				option.Config.ConfigPatchMutex.Unlock()
   379  				return nil, nil
   380  			},
   381  			OnStatusUpdate: func(status status.Status) {
   382  				d.statusCollectMutex.Lock()
   383  				defer d.statusCollectMutex.Unlock()
   384  				// FIXME we have no field for the lock status
   385  			},
   386  		},
   387  		{
   388  			Name: "kvstore",
   389  			Probe: func(ctx context.Context) (interface{}, error) {
   390  				if option.Config.KVStore == "" {
   391  					return models.StatusStateDisabled, nil
   392  				} else {
   393  					return kvstore.Client().Status()
   394  				}
   395  			},
   396  			OnStatusUpdate: func(status status.Status) {
   397  				var msg string
   398  				state := models.StatusStateOk
   399  				info, ok := status.Data.(string)
   400  
   401  				switch {
   402  				case ok && status.Err != nil:
   403  					state = models.StatusStateFailure
   404  					msg = fmt.Sprintf("Err: %s - %s", status.Err, info)
   405  				case status.Err != nil:
   406  					state = models.StatusStateFailure
   407  					msg = fmt.Sprintf("Err: %s", status.Err)
   408  				case ok:
   409  					msg = fmt.Sprintf("%s", info)
   410  				}
   411  
   412  				d.statusCollectMutex.Lock()
   413  				defer d.statusCollectMutex.Unlock()
   414  
   415  				d.statusResponse.Kvstore = &models.Status{
   416  					State: state,
   417  					Msg:   msg,
   418  				}
   419  			},
   420  		},
   421  		{
   422  			Name: "container-runtime",
   423  			Probe: func(ctx context.Context) (interface{}, error) {
   424  				return workloads.Status(), nil
   425  			},
   426  			OnStatusUpdate: func(status status.Status) {
   427  				d.statusCollectMutex.Lock()
   428  				defer d.statusCollectMutex.Unlock()
   429  
   430  				if status.Err != nil {
   431  					d.statusResponse.ContainerRuntime = &models.Status{
   432  						State: models.StatusStateFailure,
   433  						Msg:   status.Err.Error(),
   434  					}
   435  					return
   436  				}
   437  
   438  				if s, ok := status.Data.(*models.Status); ok {
   439  					d.statusResponse.ContainerRuntime = s
   440  				}
   441  			},
   442  		},
   443  		{
   444  			Name: "kubernetes",
   445  			Interval: func(failures int) time.Duration {
   446  				if failures > 0 {
   447  					// While failing, we want an initial
   448  					// quick retry with exponential backoff
   449  					// to avoid continuous load on the
   450  					// apiserver
   451  					return backoff.CalculateDuration(5*time.Second, 2*time.Minute, 2.0, false, failures)
   452  				}
   453  
   454  				// The base interval is dependant on the
   455  				// cluster size. One status interval does not
   456  				// automatically translate to an apiserver
   457  				// interaction as any regular apiserver
   458  				// interaction is also used as an indication of
   459  				// successful connectivity so we can continue
   460  				// to be fairly aggressive.
   461  				//
   462  				// 1     |    7s
   463  				// 2     |   12s
   464  				// 4     |   15s
   465  				// 64    |   42s
   466  				// 512   | 1m02s
   467  				// 2048  | 1m15s
   468  				// 8192  | 1m30s
   469  				// 16384 | 1m32s
   470  				return d.nodeDiscovery.Manager.ClusterSizeDependantInterval(10 * time.Second)
   471  			},
   472  			Probe: func(ctx context.Context) (interface{}, error) {
   473  				return d.getK8sStatus(), nil
   474  			},
   475  			OnStatusUpdate: func(status status.Status) {
   476  				d.statusCollectMutex.Lock()
   477  				defer d.statusCollectMutex.Unlock()
   478  
   479  				if status.Err != nil {
   480  					d.statusResponse.Kubernetes = &models.K8sStatus{
   481  						State: models.StatusStateFailure,
   482  						Msg:   status.Err.Error(),
   483  					}
   484  					return
   485  				}
   486  				if s, ok := status.Data.(*models.K8sStatus); ok {
   487  					d.statusResponse.Kubernetes = s
   488  				}
   489  			},
   490  		},
   491  		{
   492  			Name: "ipam",
   493  			Probe: func(ctx context.Context) (interface{}, error) {
   494  				return d.DumpIPAM(), nil
   495  			},
   496  			OnStatusUpdate: func(status status.Status) {
   497  				d.statusCollectMutex.Lock()
   498  				defer d.statusCollectMutex.Unlock()
   499  
   500  				// IPAMStatus has no way to show errors
   501  				if status.Err == nil {
   502  					if s, ok := status.Data.(*models.IPAMStatus); ok {
   503  						d.statusResponse.IPAM = s
   504  					}
   505  				}
   506  			},
   507  		},
   508  		{
   509  			Name: "node-monitor",
   510  			Probe: func(ctx context.Context) (interface{}, error) {
   511  				return d.monitorAgent.State(), nil
   512  			},
   513  			OnStatusUpdate: func(status status.Status) {
   514  				d.statusCollectMutex.Lock()
   515  				defer d.statusCollectMutex.Unlock()
   516  
   517  				// NodeMonitor has no way to show errors
   518  				if status.Err == nil {
   519  					if s, ok := status.Data.(*models.MonitorStatus); ok {
   520  						d.statusResponse.NodeMonitor = s
   521  					}
   522  				}
   523  			},
   524  		},
   525  		{
   526  			Name: "cluster",
   527  			Probe: func(ctx context.Context) (interface{}, error) {
   528  				clusterStatus := &models.ClusterStatus{
   529  					Self: d.nodeDiscovery.LocalNode.Fullname(),
   530  				}
   531  				return clusterStatus, nil
   532  			},
   533  			OnStatusUpdate: func(status status.Status) {
   534  				d.statusCollectMutex.Lock()
   535  				defer d.statusCollectMutex.Unlock()
   536  
   537  				// ClusterStatus has no way to report errors
   538  				if status.Err == nil {
   539  					if s, ok := status.Data.(*models.ClusterStatus); ok {
   540  						if d.statusResponse.Cluster != nil {
   541  							// NB: CiliumHealth is set concurrently by the
   542  							// "cilium-health" probe, so do not override it
   543  							s.CiliumHealth = d.statusResponse.Cluster.CiliumHealth
   544  						}
   545  						d.statusResponse.Cluster = s
   546  					}
   547  				}
   548  			},
   549  		},
   550  		{
   551  			Name: "cilium-health",
   552  			Probe: func(ctx context.Context) (interface{}, error) {
   553  				if d.ciliumHealth == nil {
   554  					return nil, nil
   555  				}
   556  				return d.ciliumHealth.GetStatus(), nil
   557  			},
   558  			OnStatusUpdate: func(status status.Status) {
   559  				if d.ciliumHealth == nil {
   560  					return
   561  				}
   562  
   563  				d.statusCollectMutex.Lock()
   564  				defer d.statusCollectMutex.Unlock()
   565  
   566  				if d.statusResponse.Cluster == nil {
   567  					d.statusResponse.Cluster = &models.ClusterStatus{}
   568  				}
   569  				if status.Err != nil {
   570  					d.statusResponse.Cluster.CiliumHealth = &models.Status{
   571  						State: models.StatusStateFailure,
   572  						Msg:   status.Err.Error(),
   573  					}
   574  					return
   575  				}
   576  				if s, ok := status.Data.(*models.Status); ok {
   577  					d.statusResponse.Cluster.CiliumHealth = s
   578  				}
   579  			},
   580  		},
   581  		{
   582  			Name: "l7-proxy",
   583  			Probe: func(ctx context.Context) (interface{}, error) {
   584  				if d.l7Proxy == nil {
   585  					return nil, nil
   586  				}
   587  				return d.l7Proxy.GetStatusModel(), nil
   588  			},
   589  			OnStatusUpdate: func(status status.Status) {
   590  				d.statusCollectMutex.Lock()
   591  				defer d.statusCollectMutex.Unlock()
   592  
   593  				// ProxyStatus has no way to report errors
   594  				if status.Err == nil {
   595  					if s, ok := status.Data.(*models.ProxyStatus); ok {
   596  						d.statusResponse.Proxy = s
   597  					}
   598  				}
   599  			},
   600  		},
   601  		{
   602  			Name: "controllers",
   603  			Probe: func(ctx context.Context) (interface{}, error) {
   604  				return controller.GetGlobalStatus(), nil
   605  			},
   606  			OnStatusUpdate: func(status status.Status) {
   607  				d.statusCollectMutex.Lock()
   608  				defer d.statusCollectMutex.Unlock()
   609  
   610  				// ControllerStatuses has no way to report errors
   611  				if status.Err == nil {
   612  					if s, ok := status.Data.(models.ControllerStatuses); ok {
   613  						d.statusResponse.Controllers = s
   614  					}
   615  				}
   616  			},
   617  		},
   618  	}
   619  
   620  	d.statusCollector = status.NewCollector(probes, status.Config{})
   621  
   622  	return
   623  }