github.com/looshlee/cilium@v1.6.12/daemon/status.go (about) 1 // Copyright 2016-2019 Authors of Cilium 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package main 16 17 import ( 18 "context" 19 "fmt" 20 "math/rand" 21 "time" 22 23 "github.com/cilium/cilium/api/v1/models" 24 . "github.com/cilium/cilium/api/v1/server/restapi/daemon" 25 "github.com/cilium/cilium/pkg/backoff" 26 "github.com/cilium/cilium/pkg/controller" 27 "github.com/cilium/cilium/pkg/datapath" 28 "github.com/cilium/cilium/pkg/k8s" 29 k8smetrics "github.com/cilium/cilium/pkg/k8s/metrics" 30 "github.com/cilium/cilium/pkg/kvstore" 31 "github.com/cilium/cilium/pkg/lock" 32 "github.com/cilium/cilium/pkg/node" 33 "github.com/cilium/cilium/pkg/option" 34 "github.com/cilium/cilium/pkg/status" 35 "github.com/cilium/cilium/pkg/workloads" 36 37 "github.com/go-openapi/runtime/middleware" 38 "github.com/go-openapi/strfmt" 39 versionapi "k8s.io/apimachinery/pkg/version" 40 ) 41 42 const ( 43 // k8sVersionCheckInterval is the interval in which the Kubernetes 44 // version is verified even if connectivity is given 45 k8sVersionCheckInterval = 15 * time.Minute 46 47 // k8sMinimumEventHearbeat is the time interval in which any received 48 // event will be considered proof that the apiserver connectivity is 49 // healthty 50 k8sMinimumEventHearbeat = time.Minute 51 ) 52 53 func init() { 54 rand.Seed(time.Now().UnixNano()) 55 } 56 57 type k8sVersion struct { 58 version string 59 lastVersionCheck time.Time 60 lock lock.Mutex 61 } 62 63 func (k *k8sVersion) cachedVersion() (string, bool) { 64 k.lock.Lock() 65 defer k.lock.Unlock() 66 67 if time.Since(k8smetrics.LastInteraction.Time()) > k8sMinimumEventHearbeat { 68 return "", false 69 } 70 71 if k.version == "" || time.Since(k.lastVersionCheck) > k8sVersionCheckInterval { 72 return "", false 73 } 74 75 return k.version, true 76 } 77 78 func (k *k8sVersion) update(version *versionapi.Info) string { 79 k.lock.Lock() 80 defer k.lock.Unlock() 81 82 k.version = fmt.Sprintf("%s.%s (%s) [%s]", version.Major, version.Minor, version.GitVersion, version.Platform) 83 k.lastVersionCheck = time.Now() 84 return k.version 85 } 86 87 var k8sVersionCache k8sVersion 88 89 func (d *Daemon) getK8sStatus() *models.K8sStatus { 90 if !k8s.IsEnabled() { 91 return &models.K8sStatus{State: models.StatusStateDisabled} 92 } 93 94 version, valid := k8sVersionCache.cachedVersion() 95 if !valid { 96 k8sVersion, err := k8s.Client().Discovery().ServerVersion() 97 if err != nil { 98 return &models.K8sStatus{State: models.StatusStateFailure, Msg: err.Error()} 99 } 100 101 version = k8sVersionCache.update(k8sVersion) 102 } 103 104 k8sStatus := &models.K8sStatus{ 105 State: models.StatusStateOk, 106 Msg: version, 107 K8sAPIVersions: d.k8sAPIGroups.getGroups(), 108 } 109 110 return k8sStatus 111 } 112 113 type getHealthz struct { 114 daemon *Daemon 115 } 116 117 func NewGetHealthzHandler(d *Daemon) GetHealthzHandler { 118 return &getHealthz{daemon: d} 119 } 120 121 func checkLocks(d *Daemon) { 122 // Try to acquire a couple of global locks to have the status API fail 123 // in case of a deadlock on these locks 124 125 option.Config.ConfigPatchMutex.Lock() 126 option.Config.ConfigPatchMutex.Unlock() 127 } 128 129 func (d *Daemon) getNodeStatus() *models.ClusterStatus { 130 clusterStatus := models.ClusterStatus{ 131 Self: d.nodeDiscovery.LocalNode.Fullname(), 132 } 133 for _, node := range d.nodeDiscovery.Manager.GetNodes() { 134 clusterStatus.Nodes = append(clusterStatus.Nodes, node.GetModel()) 135 } 136 return &clusterStatus 137 } 138 139 func (h *getHealthz) Handle(params GetHealthzParams) middleware.Responder { 140 brief := params.Brief != nil && *params.Brief 141 sr := h.daemon.getStatus(brief) 142 143 return NewGetHealthzOK().WithPayload(&sr) 144 } 145 146 type getNodes struct { 147 d *Daemon 148 // mutex to protect the clients map against concurrent access 149 lock.RWMutex 150 // clients maps a client ID to a clusterNodesClient 151 clients map[int64]*clusterNodesClient 152 } 153 154 func NewGetClusterNodesHandler(d *Daemon) GetClusterNodesHandler { 155 return &getNodes{ 156 d: d, 157 clients: map[int64]*clusterNodesClient{}, 158 } 159 } 160 161 // clientGCTimeout is the time for which the clients are kept. After timeout 162 // is reached, clients will be cleaned up. 163 const clientGCTimeout = 15 * time.Minute 164 165 type clusterNodesClient struct { 166 // mutex to protect the client against concurrent access 167 lock.RWMutex 168 lastSync time.Time 169 *models.ClusterNodeStatus 170 } 171 172 func (c *clusterNodesClient) NodeAdd(newNode node.Node) error { 173 c.Lock() 174 c.NodesAdded = append(c.NodesAdded, newNode.GetModel()) 175 c.Unlock() 176 return nil 177 } 178 179 func (c *clusterNodesClient) NodeUpdate(oldNode, newNode node.Node) error { 180 c.Lock() 181 c.NodesAdded = append(c.NodesAdded, newNode.GetModel()) 182 c.NodesRemoved = append(c.NodesRemoved, oldNode.GetModel()) 183 c.Unlock() 184 return nil 185 } 186 187 func (c *clusterNodesClient) NodeDelete(node node.Node) error { 188 c.Lock() 189 // If the node was added/updated and removed before the clusterNodesClient 190 // was aware of it then we can safely remove it from the list of added 191 // nodes and not set it in the list of removed nodes. 192 found := -1 193 for i, added := range c.NodesAdded { 194 if added.Name == node.Fullname() { 195 found = i 196 } 197 } 198 if found != -1 { 199 c.NodesAdded = append(c.NodesAdded[:found], c.NodesAdded[found+1:]...) 200 } else { 201 c.NodesRemoved = append(c.NodesRemoved, node.GetModel()) 202 } 203 c.Unlock() 204 return nil 205 } 206 207 func (c *clusterNodesClient) NodeValidateImplementation(node node.Node) error { 208 // no-op 209 return nil 210 } 211 212 func (c *clusterNodesClient) NodeConfigurationChanged(config datapath.LocalNodeConfiguration) error { 213 // no-op 214 return nil 215 } 216 217 func (h *getNodes) cleanupClients() { 218 past := time.Now().Add(-clientGCTimeout) 219 for k, v := range h.clients { 220 if v.lastSync.Before(past) { 221 h.d.nodeDiscovery.Manager.Unsubscribe(v) 222 delete(h.clients, k) 223 } 224 } 225 } 226 227 func (h *getNodes) Handle(params GetClusterNodesParams) middleware.Responder { 228 var cns *models.ClusterNodeStatus 229 // If ClientID is not set then we send all nodes, otherwise we will store 230 // the client ID in the list of clients and we subscribe this new client 231 // to the list of clients. 232 if params.ClientID == nil { 233 ns := h.d.getNodeStatus() 234 cns = &models.ClusterNodeStatus{ 235 Self: ns.Self, 236 NodesAdded: ns.Nodes, 237 } 238 return NewGetClusterNodesOK().WithPayload(cns) 239 } 240 241 h.Lock() 242 defer h.Unlock() 243 244 var clientID int64 245 c, exists := h.clients[*params.ClientID] 246 if exists { 247 clientID = *params.ClientID 248 } else { 249 clientID = rand.Int63() 250 // make sure we haven't allocated an existing client ID nor the 251 // randomizer has allocated ID 0, if we have then we will return 252 // clientID 0. 253 _, exists := h.clients[clientID] 254 if exists || clientID == 0 { 255 ns := h.d.getNodeStatus() 256 cns = &models.ClusterNodeStatus{ 257 ClientID: 0, 258 Self: ns.Self, 259 NodesAdded: ns.Nodes, 260 } 261 return NewGetClusterNodesOK().WithPayload(cns) 262 } 263 c = &clusterNodesClient{ 264 lastSync: time.Now(), 265 ClusterNodeStatus: &models.ClusterNodeStatus{ 266 ClientID: clientID, 267 Self: h.d.nodeDiscovery.LocalNode.Fullname(), 268 }, 269 } 270 h.d.nodeDiscovery.Manager.Subscribe(c) 271 272 // Clean up other clients before adding a new one 273 h.cleanupClients() 274 h.clients[clientID] = c 275 } 276 c.Lock() 277 // Copy the ClusterNodeStatus to the response 278 cns = c.ClusterNodeStatus 279 // Store a new ClusterNodeStatus to reset the list of nodes 280 // added / removed. 281 c.ClusterNodeStatus = &models.ClusterNodeStatus{ 282 ClientID: clientID, 283 Self: h.d.nodeDiscovery.LocalNode.Fullname(), 284 } 285 c.lastSync = time.Now() 286 c.Unlock() 287 288 return NewGetClusterNodesOK().WithPayload(cns) 289 } 290 291 // getStatus returns the daemon status. If brief is provided a minimal version 292 // of the StatusResponse is provided. 293 func (d *Daemon) getStatus(brief bool) models.StatusResponse { 294 staleProbes := d.statusCollector.GetStaleProbes() 295 stale := make(map[string]strfmt.DateTime, len(staleProbes)) 296 for probe, startTime := range staleProbes { 297 stale[probe] = strfmt.DateTime(startTime) 298 } 299 300 d.statusCollectMutex.RLock() 301 defer d.statusCollectMutex.RUnlock() 302 303 var sr models.StatusResponse 304 if brief { 305 csCopy := new(models.ClusterStatus) 306 if d.statusResponse.Cluster != nil && d.statusResponse.Cluster.CiliumHealth != nil { 307 in, out := &d.statusResponse.Cluster.CiliumHealth, &csCopy.CiliumHealth 308 *out = new(models.Status) 309 **out = **in 310 } 311 var minimalControllers models.ControllerStatuses 312 if d.statusResponse.Controllers != nil { 313 for _, c := range d.statusResponse.Controllers { 314 if c.Status == nil { 315 continue 316 } 317 // With brief, the client should only care if a single controller 318 // is failing and its status so we don't need to continuing 319 // checking for failure messages for the remaining controllers. 320 if c.Status.LastFailureMsg != "" { 321 minimalControllers = append(minimalControllers, c.DeepCopy()) 322 break 323 } 324 } 325 } 326 sr = models.StatusResponse{ 327 Cluster: csCopy, 328 Controllers: minimalControllers, 329 } 330 } else { 331 // d.statusResponse contains references, so we do a deep copy to be able to 332 // safely use sr after the method has returned 333 sr = *d.statusResponse.DeepCopy() 334 } 335 336 sr.Stale = stale 337 338 switch { 339 case len(sr.Stale) > 0: 340 sr.Cilium = &models.Status{ 341 State: models.StatusStateWarning, 342 Msg: "Stale status data", 343 } 344 case d.statusResponse.Kvstore != nil && d.statusResponse.Kvstore.State != models.StatusStateOk: 345 sr.Cilium = &models.Status{ 346 State: d.statusResponse.Kvstore.State, 347 Msg: "Kvstore service is not ready", 348 } 349 case d.statusResponse.ContainerRuntime != nil && d.statusResponse.ContainerRuntime.State != models.StatusStateOk: 350 msg := "Container runtime is not ready" 351 if d.statusResponse.ContainerRuntime.State == models.StatusStateDisabled { 352 msg = "Container runtime is disabled" 353 } 354 sr.Cilium = &models.Status{ 355 State: d.statusResponse.ContainerRuntime.State, 356 Msg: msg, 357 } 358 case k8s.IsEnabled() && d.statusResponse.Kubernetes != nil && d.statusResponse.Kubernetes.State != models.StatusStateOk: 359 sr.Cilium = &models.Status{ 360 State: d.statusResponse.Kubernetes.State, 361 Msg: "Kubernetes service is not ready", 362 } 363 default: 364 sr.Cilium = &models.Status{State: models.StatusStateOk, Msg: "OK"} 365 } 366 367 return sr 368 } 369 370 func (d *Daemon) startStatusCollector() { 371 probes := []status.Probe{ 372 { 373 Name: "check-locks", 374 Probe: func(ctx context.Context) (interface{}, error) { 375 // Try to acquire a couple of global locks to have the status API fail 376 // in case of a deadlock on these locks 377 option.Config.ConfigPatchMutex.Lock() 378 option.Config.ConfigPatchMutex.Unlock() 379 return nil, nil 380 }, 381 OnStatusUpdate: func(status status.Status) { 382 d.statusCollectMutex.Lock() 383 defer d.statusCollectMutex.Unlock() 384 // FIXME we have no field for the lock status 385 }, 386 }, 387 { 388 Name: "kvstore", 389 Probe: func(ctx context.Context) (interface{}, error) { 390 if option.Config.KVStore == "" { 391 return models.StatusStateDisabled, nil 392 } else { 393 return kvstore.Client().Status() 394 } 395 }, 396 OnStatusUpdate: func(status status.Status) { 397 var msg string 398 state := models.StatusStateOk 399 info, ok := status.Data.(string) 400 401 switch { 402 case ok && status.Err != nil: 403 state = models.StatusStateFailure 404 msg = fmt.Sprintf("Err: %s - %s", status.Err, info) 405 case status.Err != nil: 406 state = models.StatusStateFailure 407 msg = fmt.Sprintf("Err: %s", status.Err) 408 case ok: 409 msg = fmt.Sprintf("%s", info) 410 } 411 412 d.statusCollectMutex.Lock() 413 defer d.statusCollectMutex.Unlock() 414 415 d.statusResponse.Kvstore = &models.Status{ 416 State: state, 417 Msg: msg, 418 } 419 }, 420 }, 421 { 422 Name: "container-runtime", 423 Probe: func(ctx context.Context) (interface{}, error) { 424 return workloads.Status(), nil 425 }, 426 OnStatusUpdate: func(status status.Status) { 427 d.statusCollectMutex.Lock() 428 defer d.statusCollectMutex.Unlock() 429 430 if status.Err != nil { 431 d.statusResponse.ContainerRuntime = &models.Status{ 432 State: models.StatusStateFailure, 433 Msg: status.Err.Error(), 434 } 435 return 436 } 437 438 if s, ok := status.Data.(*models.Status); ok { 439 d.statusResponse.ContainerRuntime = s 440 } 441 }, 442 }, 443 { 444 Name: "kubernetes", 445 Interval: func(failures int) time.Duration { 446 if failures > 0 { 447 // While failing, we want an initial 448 // quick retry with exponential backoff 449 // to avoid continuous load on the 450 // apiserver 451 return backoff.CalculateDuration(5*time.Second, 2*time.Minute, 2.0, false, failures) 452 } 453 454 // The base interval is dependant on the 455 // cluster size. One status interval does not 456 // automatically translate to an apiserver 457 // interaction as any regular apiserver 458 // interaction is also used as an indication of 459 // successful connectivity so we can continue 460 // to be fairly aggressive. 461 // 462 // 1 | 7s 463 // 2 | 12s 464 // 4 | 15s 465 // 64 | 42s 466 // 512 | 1m02s 467 // 2048 | 1m15s 468 // 8192 | 1m30s 469 // 16384 | 1m32s 470 return d.nodeDiscovery.Manager.ClusterSizeDependantInterval(10 * time.Second) 471 }, 472 Probe: func(ctx context.Context) (interface{}, error) { 473 return d.getK8sStatus(), nil 474 }, 475 OnStatusUpdate: func(status status.Status) { 476 d.statusCollectMutex.Lock() 477 defer d.statusCollectMutex.Unlock() 478 479 if status.Err != nil { 480 d.statusResponse.Kubernetes = &models.K8sStatus{ 481 State: models.StatusStateFailure, 482 Msg: status.Err.Error(), 483 } 484 return 485 } 486 if s, ok := status.Data.(*models.K8sStatus); ok { 487 d.statusResponse.Kubernetes = s 488 } 489 }, 490 }, 491 { 492 Name: "ipam", 493 Probe: func(ctx context.Context) (interface{}, error) { 494 return d.DumpIPAM(), nil 495 }, 496 OnStatusUpdate: func(status status.Status) { 497 d.statusCollectMutex.Lock() 498 defer d.statusCollectMutex.Unlock() 499 500 // IPAMStatus has no way to show errors 501 if status.Err == nil { 502 if s, ok := status.Data.(*models.IPAMStatus); ok { 503 d.statusResponse.IPAM = s 504 } 505 } 506 }, 507 }, 508 { 509 Name: "node-monitor", 510 Probe: func(ctx context.Context) (interface{}, error) { 511 return d.monitorAgent.State(), nil 512 }, 513 OnStatusUpdate: func(status status.Status) { 514 d.statusCollectMutex.Lock() 515 defer d.statusCollectMutex.Unlock() 516 517 // NodeMonitor has no way to show errors 518 if status.Err == nil { 519 if s, ok := status.Data.(*models.MonitorStatus); ok { 520 d.statusResponse.NodeMonitor = s 521 } 522 } 523 }, 524 }, 525 { 526 Name: "cluster", 527 Probe: func(ctx context.Context) (interface{}, error) { 528 clusterStatus := &models.ClusterStatus{ 529 Self: d.nodeDiscovery.LocalNode.Fullname(), 530 } 531 return clusterStatus, nil 532 }, 533 OnStatusUpdate: func(status status.Status) { 534 d.statusCollectMutex.Lock() 535 defer d.statusCollectMutex.Unlock() 536 537 // ClusterStatus has no way to report errors 538 if status.Err == nil { 539 if s, ok := status.Data.(*models.ClusterStatus); ok { 540 if d.statusResponse.Cluster != nil { 541 // NB: CiliumHealth is set concurrently by the 542 // "cilium-health" probe, so do not override it 543 s.CiliumHealth = d.statusResponse.Cluster.CiliumHealth 544 } 545 d.statusResponse.Cluster = s 546 } 547 } 548 }, 549 }, 550 { 551 Name: "cilium-health", 552 Probe: func(ctx context.Context) (interface{}, error) { 553 if d.ciliumHealth == nil { 554 return nil, nil 555 } 556 return d.ciliumHealth.GetStatus(), nil 557 }, 558 OnStatusUpdate: func(status status.Status) { 559 if d.ciliumHealth == nil { 560 return 561 } 562 563 d.statusCollectMutex.Lock() 564 defer d.statusCollectMutex.Unlock() 565 566 if d.statusResponse.Cluster == nil { 567 d.statusResponse.Cluster = &models.ClusterStatus{} 568 } 569 if status.Err != nil { 570 d.statusResponse.Cluster.CiliumHealth = &models.Status{ 571 State: models.StatusStateFailure, 572 Msg: status.Err.Error(), 573 } 574 return 575 } 576 if s, ok := status.Data.(*models.Status); ok { 577 d.statusResponse.Cluster.CiliumHealth = s 578 } 579 }, 580 }, 581 { 582 Name: "l7-proxy", 583 Probe: func(ctx context.Context) (interface{}, error) { 584 if d.l7Proxy == nil { 585 return nil, nil 586 } 587 return d.l7Proxy.GetStatusModel(), nil 588 }, 589 OnStatusUpdate: func(status status.Status) { 590 d.statusCollectMutex.Lock() 591 defer d.statusCollectMutex.Unlock() 592 593 // ProxyStatus has no way to report errors 594 if status.Err == nil { 595 if s, ok := status.Data.(*models.ProxyStatus); ok { 596 d.statusResponse.Proxy = s 597 } 598 } 599 }, 600 }, 601 { 602 Name: "controllers", 603 Probe: func(ctx context.Context) (interface{}, error) { 604 return controller.GetGlobalStatus(), nil 605 }, 606 OnStatusUpdate: func(status status.Status) { 607 d.statusCollectMutex.Lock() 608 defer d.statusCollectMutex.Unlock() 609 610 // ControllerStatuses has no way to report errors 611 if status.Err == nil { 612 if s, ok := status.Data.(models.ControllerStatuses); ok { 613 d.statusResponse.Controllers = s 614 } 615 } 616 }, 617 }, 618 } 619 620 d.statusCollector = status.NewCollector(probes, status.Config{}) 621 622 return 623 }