github.com/imran-kn/cilium-fork@v1.6.9/daemon/state.go

// Copyright 2016-2019 Authors of Cilium
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
	"context"
	"fmt"
	"io/ioutil"
	"net"
	"os"
	"sync"
	"time"

	"github.com/cilium/cilium/pkg/controller"
	"github.com/cilium/cilium/pkg/defaults"
	"github.com/cilium/cilium/pkg/endpoint"
	"github.com/cilium/cilium/pkg/endpoint/regeneration"
	"github.com/cilium/cilium/pkg/endpointmanager"
	"github.com/cilium/cilium/pkg/identity/cache"
	"github.com/cilium/cilium/pkg/ipcache"
	"github.com/cilium/cilium/pkg/k8s"
	"github.com/cilium/cilium/pkg/labels"
	"github.com/cilium/cilium/pkg/logging/logfields"
	"github.com/cilium/cilium/pkg/maps/ctmap"
	"github.com/cilium/cilium/pkg/maps/lxcmap"
	"github.com/cilium/cilium/pkg/option"
	"github.com/cilium/cilium/pkg/policy"
	"github.com/cilium/cilium/pkg/workloads"

	"github.com/sirupsen/logrus"
	"github.com/vishvananda/netlink"
	k8serrors "k8s.io/apimachinery/pkg/api/errors"
	meta_v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
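// endpointRestoreState is the set of endpoints read back from the state
// directory on startup: restored holds the endpoints which passed validation
// and are to be regenerated, while toClean holds the endpoints which could
// not be restored and are queued for deletion.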
type endpointRestoreState struct {
	restored []*endpoint.Endpoint
	toClean  []*endpoint.Endpoint
}

// validateEndpoint attempts to determine whether the endpoint is still valid,
// i.e. that it still exists in k8s, its datapath devices are present, Cilium
// is responsible for its workload, etc.
//
// Returns true to indicate that the endpoint is valid to restore, and an
// optional error.
func (d *Daemon) validateEndpoint(ep *endpoint.Endpoint) (valid bool, err error) {
	// On each restart, the health endpoint is supposed to be recreated.
	// Hence we need to clean up the old health endpoint state
	// unconditionally.
	if ep.HasLabels(labels.LabelHealth) {
		// Ignore the health endpoint and don't report it as not
		// restored. But we need to clean up the old state files, so
		// do this now.
		healthStateDir := ep.StateDirectoryPath()
		scopedLog := log.WithFields(logrus.Fields{
			logfields.EndpointID: ep.ID,
			logfields.Path:       healthStateDir,
		})
		scopedLog.Debug("Removing old health endpoint state directory")
		if err := os.RemoveAll(healthStateDir); err != nil {
			scopedLog.Warning("Cannot clean up old health state directory")
		}
		return false, nil
	}

	if ep.K8sPodName != "" && ep.K8sNamespace != "" && k8s.IsEnabled() {
		_, err := k8s.Client().CoreV1().Pods(ep.K8sNamespace).Get(ep.K8sPodName, meta_v1.GetOptions{})
		if err != nil && k8serrors.IsNotFound(err) {
			return false, fmt.Errorf("kubernetes pod not found")
		}
	}

	if ep.HasIpvlanDataPath() {
		// FIXME: We cannot check whether the ipvlan slave netdev
		// exists, because that requires entering the container netns,
		// which is not always accessible (e.g. in the k8s case "/proc"
		// has to be bind mounted). Instead, check whether the tail
		// call map exists.
		if _, err := os.Stat(ep.BPFIpvlanMapPath()); err != nil {
			return false, fmt.Errorf("tail call map for IPvlan unavailable: %s", err)
		}
	} else if _, err := netlink.LinkByName(ep.IfName); err != nil {
		return false, fmt.Errorf("interface %s could not be found", ep.IfName)
	}

	if option.Config.WorkloadsEnabled() && !workloads.IsRunning(ep) {
		return false, fmt.Errorf("no workload could be associated with endpoint")
	}

	if !ep.DatapathConfiguration.ExternalIPAM {
		if err := d.allocateIPsLocked(ep); err != nil {
			return false, fmt.Errorf("failed to re-allocate IP of endpoint: %s", err)
		}
	}

	return true, nil
}

// restoreOldEndpoints reads the list of existing endpoints previously managed
// by Cilium when it was last run and associates them with container
// workloads. This function performs the first step in restoring the endpoint
// structure, allocating their existing IPs out of the CIDR block and then
// inserting the endpoints into the endpoints list. It needs to be followed by
// a call to regenerateRestoredEndpoints() once the endpoint builder is ready.
//
// If clean is true, endpoints which cannot be associated with a container
// workload are deleted.
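//
// A minimal sketch of the intended call sequence during daemon bootstrap
// (the exact call sites live elsewhere in this package; using
// option.Config.StateDir as the directory argument is an assumption for
// illustration):
//
//	state, err := d.restoreOldEndpoints(option.Config.StateDir, true)
//	if err != nil {
//		log.WithError(err).Error("Unable to restore existing endpoints")
//	}
//	// ... later, once the endpoint builder is ready:
//	restoreComplete := d.regenerateRestoredEndpoints(state)
//	<-restoreComplete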
func (d *Daemon) restoreOldEndpoints(dir string, clean bool) (*endpointRestoreState, error) {
	failed := 0
	state := &endpointRestoreState{
		restored: []*endpoint.Endpoint{},
		toClean:  []*endpoint.Endpoint{},
	}

	if !option.Config.RestoreState {
		log.Info("Endpoint restore is disabled, skipping restore step")
		return state, nil
	}

	log.Info("Restoring endpoints...")

	var (
		existingEndpoints map[string]*lxcmap.EndpointInfo
		err               error
	)

	if !option.Config.DryMode {
		existingEndpoints, err = lxcmap.DumpToMap()
		if err != nil {
			log.WithError(err).Warning("Unable to open endpoint map while restoring. Skipping cleanup of endpoint map on startup")
		}
	}

	dirFiles, err := ioutil.ReadDir(dir)
	if err != nil {
		return state, err
	}
	eptsID := endpoint.FilterEPDir(dirFiles)

	possibleEPs := endpoint.ReadEPsFromDirNames(d, dir, eptsID)

	if len(possibleEPs) == 0 {
		log.Info("No old endpoints found.")
		return state, nil
	}

	for _, ep := range possibleEPs {
		scopedLog := log.WithField(logfields.EndpointID, ep.ID)
		if k8s.IsEnabled() {
			scopedLog = scopedLog.WithField("k8sPodName", ep.GetK8sNamespaceAndPodNameLocked())
		}

		restore, err := d.validateEndpoint(ep)
		if err != nil {
			scopedLog.WithError(err).Warning("Unable to restore endpoint, ignoring")
			failed++
		}
		if !restore {
			if clean {
				state.toClean = append(state.toClean, ep)
			}
			continue
		}

		ep.UnconditionalLock()
		scopedLog.Debug("Restoring endpoint")
		ep.LogStatusOKLocked(endpoint.Other, "Restoring endpoint from previous cilium instance")

		if !option.Config.KeepConfig {
			ep.SetDefaultOpts(option.Config.Opts)
			alwaysEnforce := policy.GetPolicyEnabled() == option.AlwaysEnforce
			ep.SetDesiredIngressPolicyEnabledLocked(alwaysEnforce)
			ep.SetDesiredEgressPolicyEnabledLocked(alwaysEnforce)
		}

		ep.Unlock()

		ep.SkipStateClean()

		state.restored = append(state.restored, ep)

		if existingEndpoints != nil {
			delete(existingEndpoints, ep.IPv4.String())
			delete(existingEndpoints, ep.IPv6.String())
		}
	}

	log.WithFields(logrus.Fields{
		"restored": len(state.restored),
		"failed":   failed,
	}).Info("Endpoints restored")

	if existingEndpoints != nil {
		for hostIP, info := range existingEndpoints {
			if ip := net.ParseIP(hostIP); !info.IsHost() && ip != nil {
				if err := lxcmap.DeleteEntry(ip); err != nil {
					log.WithError(err).Warn("Unable to delete obsolete endpoint from BPF map")
				} else {
					log.Debugf("Removed outdated endpoint %d from endpoint map", info.LxcID)
				}
			}
		}
	}

	return state, nil
}
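// regenerateRestoredEndpoints performs the second step of the restore: it
// exposes the restored endpoints via the endpoint manager and regenerates
// each of them asynchronously, while endpoints queued for cleanup are
// deleted before the function returns. The returned channel is closed once
// every restored endpoint has reported its regeneration result.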
func (d *Daemon) regenerateRestoredEndpoints(state *endpointRestoreState) (restoreComplete chan struct{}) {
	restoreComplete = make(chan struct{}, 0)

	log.WithField("numRestored", len(state.restored)).Info("Regenerating restored endpoints")

	// Before regenerating, check whether the CT map has properties that
	// match this Cilium userspace instance. If not, it must be removed.
	ctmap.DeleteIfUpgradeNeeded(nil)

	// We need to signal when the endpoints have been regenerated, i.e.,
	// when they have finished rebuilding after being restored.
	epRegenerated := make(chan bool, len(state.restored))

	// Insert all endpoints into the endpoint list first before starting
	// the regeneration. This is required to ensure that if an individual
	// regeneration causes an identity change of an endpoint, the new
	// identity will trigger a policy recalculation of all endpoints to
	// account for the new identity during the grace period. For this
	// purpose, all endpoints being restored must already be in the
	// endpoint list.
	// Iterate in reverse so that endpoints which fail to insert can be
	// removed from state.restored by index without affecting the
	// not-yet-visited entries.
	for i := len(state.restored) - 1; i >= 0; i-- {
		ep := state.restored[i]
		// If the endpoint has the local conntrack option enabled, then
		// check whether the CT map needs upgrading (and do so).
		if ep.Options.IsEnabled(option.ConntrackLocal) {
			ctmap.DeleteIfUpgradeNeeded(ep)
		}

		// Insert into the endpoint manager so it can be regenerated
		// when calls to RegenerateAllEndpoints() are made. This must
		// be done synchronously (i.e., not in a goroutine) because
		// regenerateRestoredEndpoints must guarantee upon returning
		// that endpoints are exposed to other subsystems via
		// endpointmanager.
		if err := endpointmanager.Insert(ep); err != nil {
			log.WithError(err).Warning("Unable to restore endpoint")
			// Remove the endpoint from the slice of endpoints to
			// restore.
			state.restored = append(state.restored[:i], state.restored[i+1:]...)
		}
	}

	for _, ep := range state.restored {
		go func(ep *endpoint.Endpoint, epRegenerated chan<- bool) {
			if err := ep.RLockAlive(); err != nil {
				ep.LogDisconnectedMutexAction(err, "before filtering labels during regenerating restored endpoint")
				epRegenerated <- false
				return
			}
			scopedLog := log.WithField(logfields.EndpointID, ep.ID)
			// Filter the restored labels with the new daemon's
			// filter.
			l, _ := labels.FilterLabels(ep.OpLabels.AllLabels())
			ep.RUnlock()

			allocateCtx, cancel := context.WithTimeout(context.Background(), option.Config.KVstoreConnectivityTimeout)
			defer cancel()
			identity, _, err := cache.AllocateIdentity(allocateCtx, d, l)

			if err != nil {
				scopedLog.WithError(err).Warn("Unable to restore endpoint")
				epRegenerated <- false
				return
			}

			// Wait for initial identities and ipcache from the
			// kvstore before doing any policy calculation for
			// endpoints that don't have a fixed identity or are
			// not well known.
			if !identity.IsFixed() && !identity.IsWellKnown() {
				identityCtx, cancel := context.WithTimeout(context.Background(), option.Config.KVstoreConnectivityTimeout)
				defer cancel()

				err = cache.WaitForInitialGlobalIdentities(identityCtx)
				if err != nil {
					scopedLog.WithError(err).Warn("Failed while waiting for initial global identities")
					epRegenerated <- false
					return
				}
				if option.Config.KVStore != "" {
					ipcache.WaitForKVStoreSync()
				}
			}

			if err := ep.LockAlive(); err != nil {
				scopedLog.Warn("Endpoint to restore has been deleted")
				epRegenerated <- false
				return
			}

			ep.SetStateLocked(endpoint.StateRestoring, "Synchronizing endpoint labels with KVStore")

			if ep.SecurityIdentity != nil {
				if oldSecID := ep.SecurityIdentity.ID; identity.ID != oldSecID {
					log.WithFields(logrus.Fields{
						logfields.EndpointID:              ep.ID,
						logfields.IdentityLabels + ".old": oldSecID,
						logfields.IdentityLabels + ".new": identity.ID,
					}).Info("Security identity for endpoint is different from the security identity restored for the endpoint")

					// The identity of the endpoint being
					// restored has changed. There are two
					// main reasons for this:
					//
					// 1) Cilium has been upgraded,
					// downgraded or the configuration has
					// changed and the new version or
					// configuration causes different
					// labels to be considered security
					// relevant for this endpoint.
					//
					// Immediately using the identity may
					// cause connectivity problems if this
					// is the first endpoint in the cluster
					// to use the new identity. All other
					// nodes will not have had a chance to
					// adjust the security policies for
					// their endpoints. Hence, apply a
					// grace period to allow for the
					// update. It is not necessary to check
					// any local endpoints for potentially
					// outdated security rules; the
					// notification of the new security
					// identity will have been received and
					// will trigger the necessary
					// recalculation of all local
					// endpoints.
					//
					// 2) The identity is outdated as the
					// state in the kvstore has changed.
					// This reason would justify an
					// immediate use of the new identity,
					// but given that the current identity
					// is already in place, it is also
					// correct to continue using it for the
					// duration of a grace period.
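					// NOTE: the grace period constant
					// lives in pkg/defaults (5 seconds at
					// the time of writing; the exact
					// value is an assumption and may
					// differ between releases).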
					time.Sleep(defaults.IdentityChangeGracePeriod)
				}
			}
			// The identity of a freshly restored endpoint is
			// incomplete due to some parts of the identity not
			// being marshaled to JSON. Hence we must set the
			// identity even if it has not changed.
			ep.SetIdentity(identity, true)
			ep.Unlock()

			regenerationMetadata := &regeneration.ExternalRegenerationMetadata{
				Reason: "syncing state to host",
			}
			if buildSuccess := <-ep.Regenerate(regenerationMetadata); !buildSuccess {
				scopedLog.Warn("Failed while regenerating endpoint")
				epRegenerated <- false
				return
			}

			// NOTE: UnconditionalRLock is used here because it's
			// only used for logging an already restored endpoint.
			ep.UnconditionalRLock()
			scopedLog.WithField(logfields.IPAddr, []string{ep.IPv4.String(), ep.IPv6.String()}).Info("Restored endpoint")
			ep.RUnlock()
			epRegenerated <- true
		}(ep, epRegenerated)
	}

	var endpointCleanupCompleted sync.WaitGroup
	for _, ep := range state.toClean {
		endpointCleanupCompleted.Add(1)
		go func(ep *endpoint.Endpoint) {
			// The IP was not allocated yet, so it does not need
			// to be freed. The identity may be allocated in the
			// kvstore, but we can't release it easily, as that
			// would require blocking on kvstore connectivity,
			// which we can't do at this point. Let the lease
			// expire to release the identity.
			d.deleteEndpointQuiet(ep, endpoint.DeleteConfig{
				NoIdentityRelease: true,
				NoIPRelease:       true,
			})
			endpointCleanupCompleted.Done()
		}(ep)
	}
	endpointCleanupCompleted.Wait()

	go func() {
		regenerated, total := 0, 0
		if len(state.restored) > 0 {
			for buildSuccess := range epRegenerated {
				if buildSuccess {
					regenerated++
				}
				total++
				if total >= len(state.restored) {
					break
				}
			}
		}
		close(epRegenerated)

		log.WithFields(logrus.Fields{
			"regenerated": regenerated,
			"total":       total,
		}).Info("Finished regenerating restored endpoints")
		close(restoreComplete)
	}()

	return
}
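// allocateIPsLocked re-allocates the IPv6 and IPv4 addresses of an endpoint
// being restored so that the IPAM pool knows they are in use. If the IPv4
// allocation fails after the IPv6 allocation succeeded, the deferred release
// returns the IPv6 address to the pool.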
func (d *Daemon) allocateIPsLocked(ep *endpoint.Endpoint) error {
	var err error

	if option.Config.EnableIPv6 && ep.IPv6 != nil {
		err = d.ipam.AllocateIP(ep.IPv6.IP(), ep.HumanStringLocked()+" [restored]")
		if err != nil {
			return fmt.Errorf("unable to reallocate IPv6 address: %s", err)
		}

		defer func() {
			if err != nil {
				d.ipam.ReleaseIP(ep.IPv6.IP())
			}
		}()
	}

	if option.Config.EnableIPv4 && ep.IPv4 != nil {
		if err = d.ipam.AllocateIP(ep.IPv4.IP(), ep.HumanStringLocked()+" [restored]"); err != nil {
			return fmt.Errorf("unable to reallocate IPv4 address: %s", err)
		}
	}

	return nil
}

// initRestore kicks off the regeneration of the restored endpoints and the
// synchronization of the BPF load-balancing maps, returning a channel that
// is closed once regeneration has finished. If state restore is disabled,
// the returned channel is nil.
func (d *Daemon) initRestore(restoredEndpoints *endpointRestoreState) chan struct{} {
	bootstrapStats.restore.Start()
	var restoreComplete chan struct{}
	if option.Config.RestoreState {
		// When we regenerate restored endpoints, it is guaranteed that
		// we have received the full list of policies present at the
		// time the daemon is bootstrapped.
		restoreComplete = d.regenerateRestoredEndpoints(restoredEndpoints)
		go func() {
			<-restoreComplete
			endParallelMapMode()
		}()

		go func() {
			if k8s.IsEnabled() {
				// Start a controller which removes any
				// leftover Kubernetes services that may have
				// been deleted while Cilium was not running.
				// Once this controller succeeds, because it
				// has no RunInterval specified, it will not
				// run again unless updated elsewhere. This
				// means that if, for instance, a user manually
				// adds a service via the CLI into the BPF
				// maps, it will not be cleaned up by the
				// daemon until it restarts.
				controller.NewManager().UpdateController("sync-lb-maps-with-k8s-services",
					controller.ControllerParams{
						DoFunc: func(ctx context.Context) error {
							return d.syncLBMapsWithK8s()
						},
					},
				)
				return
			}
			if err := d.SyncLBMap(); err != nil {
				log.WithError(err).Warn("Error while recovering endpoints")
			}
		}()
	} else {
		log.Info("State restore is disabled. Existing endpoints on node are ignored")
		// We need to read all Docker containers so that we do not
		// allocate the same IP addresses, and so that these containers
		// are ignored when reading workloads.
		workloads.IgnoreRunningWorkloads()

		// No restore happened, end parallel map mode immediately.
		endParallelMapMode()
	}
	bootstrapStats.restore.End(true)

	return restoreComplete
}
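// Usage sketch (illustration only, not code from this file's callers): since
// initRestore returns a nil channel when state restore is disabled, callers
// that block until the restored endpoints have regenerated should guard
// against nil, as receiving from a nil channel blocks forever:
//
//	if restoreComplete := d.initRestore(restoredEndpoints); restoreComplete != nil {
//		<-restoreComplete
//	}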