github.com/kubearmor/cilium@v1.6.12/daemon/state.go (about)

     1  // Copyright 2016-2019 Authors of Cilium
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package main
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"io/ioutil"
    21  	"net"
    22  	"os"
    23  	"sync"
    24  	"time"
    25  
    26  	"github.com/cilium/cilium/pkg/controller"
    27  	"github.com/cilium/cilium/pkg/defaults"
    28  	"github.com/cilium/cilium/pkg/endpoint"
    29  	"github.com/cilium/cilium/pkg/endpoint/regeneration"
    30  	"github.com/cilium/cilium/pkg/endpointmanager"
    31  	"github.com/cilium/cilium/pkg/identity/cache"
    32  	"github.com/cilium/cilium/pkg/ipcache"
    33  	"github.com/cilium/cilium/pkg/k8s"
    34  	"github.com/cilium/cilium/pkg/labels"
    35  	"github.com/cilium/cilium/pkg/logging/logfields"
    36  	"github.com/cilium/cilium/pkg/maps/ctmap"
    37  	"github.com/cilium/cilium/pkg/maps/lxcmap"
    38  	"github.com/cilium/cilium/pkg/option"
    39  	"github.com/cilium/cilium/pkg/policy"
    40  	"github.com/cilium/cilium/pkg/workloads"
    41  
    42  	"github.com/sirupsen/logrus"
    43  	"github.com/vishvananda/netlink"
    44  	k8serrors "k8s.io/apimachinery/pkg/api/errors"
    45  	meta_v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    46  )
    47  
    48  type endpointRestoreState struct {
    49  	restored []*endpoint.Endpoint
    50  	toClean  []*endpoint.Endpoint
    51  }
    52  
     53  // validateEndpoint attempts to determine whether the endpoint is valid, i.e.
     54  // whether it still exists in k8s, its datapath devices are present, Cilium is
     55  // responsible for its workload, etc.
    56  //
    57  // Returns true to indicate that the endpoint is valid to restore, and an
    58  // optional error.
    59  func (d *Daemon) validateEndpoint(ep *endpoint.Endpoint) (valid bool, err error) {
    60  	// On each restart, the health endpoint is supposed to be recreated.
    61  	// Hence we need to clean health endpoint state unconditionally.
    62  	if ep.HasLabels(labels.LabelHealth) {
     63  		// Ignore the health endpoint and don't report
    64  		// it as not restored. But we need to clean up the old
    65  		// state files, so do this now.
    66  		healthStateDir := ep.StateDirectoryPath()
    67  		scopedLog := log.WithFields(logrus.Fields{
    68  			logfields.EndpointID: ep.ID,
    69  			logfields.Path:       healthStateDir,
    70  		})
    71  		scopedLog.Debug("Removing old health endpoint state directory")
    72  		if err := os.RemoveAll(healthStateDir); err != nil {
    73  			scopedLog.Warning("Cannot clean up old health state directory")
    74  		}
    75  		return false, nil
    76  	}
    77  
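         	// If the endpoint refers to a Kubernetes pod, only restore it if the
         	// pod still exists in the apiserver.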
    78  	if ep.K8sPodName != "" && ep.K8sNamespace != "" && k8s.IsEnabled() {
    79  		_, err := k8s.Client().CoreV1().Pods(ep.K8sNamespace).Get(ep.K8sPodName, meta_v1.GetOptions{})
    80  		if err != nil && k8serrors.IsNotFound(err) {
    81  			return false, fmt.Errorf("kubernetes pod not found")
    82  		}
    83  	}
    84  
    85  	if ep.HasIpvlanDataPath() {
    86  		// FIXME: We cannot check whether ipvlan slave netdev exists,
    87  		// because it requires entering container netns which is not
    88  		// always accessible (e.g. in k8s case "/proc" has to be bind
    89  		// mounted). Instead, we check whether the tail call map exists.
    90  		if _, err := os.Stat(ep.BPFIpvlanMapPath()); err != nil {
    91  			return false, fmt.Errorf("tail call map for IPvlan unavailable: %s", err)
    92  		}
    93  	} else if _, err := netlink.LinkByName(ep.IfName); err != nil {
    94  		return false, fmt.Errorf("interface %s could not be found", ep.IfName)
    95  	}
    96  
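         	// When workload integration is enabled, the endpoint is only valid if
         	// a running workload (e.g. a container) can still be associated with it.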
    97  	if option.Config.WorkloadsEnabled() && !workloads.IsRunning(ep) {
    98  		return false, fmt.Errorf("no workload could be associated with endpoint")
    99  	}
   100  
   101  	if !ep.DatapathConfiguration.ExternalIPAM {
   102  		if err := d.allocateIPsLocked(ep); err != nil {
    103  			return false, fmt.Errorf("failed to re-allocate IP of endpoint: %s", err)
   104  		}
   105  	}
   106  
   107  	return true, nil
   108  }
   109  
    110  // restoreOldEndpoints reads the list of existing endpoints previously managed
    111  // by Cilium when it was last run and associates them with container workloads.
    112  // This function performs the first step in restoring the endpoint structure:
    113  // it allocates each endpoint's existing IP out of the CIDR block and then
    114  // inserts the endpoints into the endpoints list. It needs to be followed by a
    115  // call to regenerateRestoredEndpoints() once the endpoint builder is ready.
    116  //
    117  // If clean is true, endpoints which cannot be associated with a container
    118  // workload are deleted.
   119  func (d *Daemon) restoreOldEndpoints(dir string, clean bool) (*endpointRestoreState, error) {
   120  	failed := 0
   121  	state := &endpointRestoreState{
   122  		restored: []*endpoint.Endpoint{},
   123  		toClean:  []*endpoint.Endpoint{},
   124  	}
   125  
   126  	if !option.Config.RestoreState {
   127  		log.Info("Endpoint restore is disabled, skipping restore step")
   128  		return state, nil
   129  	}
   130  
   131  	log.Info("Restoring endpoints...")
   132  
   133  	var (
   134  		existingEndpoints map[string]*lxcmap.EndpointInfo
   135  		err               error
   136  	)
   137  
   138  	if !option.Config.DryMode {
   139  		existingEndpoints, err = lxcmap.DumpToMap()
   140  		if err != nil {
   141  			log.WithError(err).Warning("Unable to open endpoint map while restoring. Skipping cleanup of endpoint map on startup")
   142  		}
   143  	}
   144  
   145  	dirFiles, err := ioutil.ReadDir(dir)
   146  	if err != nil {
   147  		return state, err
   148  	}
   149  	eptsID := endpoint.FilterEPDir(dirFiles)
   150  
   151  	possibleEPs := endpoint.ReadEPsFromDirNames(d, dir, eptsID)
   152  
   153  	if len(possibleEPs) == 0 {
   154  		log.Info("No old endpoints found.")
   155  		return state, nil
   156  	}
   157  
   158  	for _, ep := range possibleEPs {
   159  		scopedLog := log.WithField(logfields.EndpointID, ep.ID)
   160  		if k8s.IsEnabled() {
   161  			scopedLog = scopedLog.WithField("k8sPodName", ep.GetK8sNamespaceAndPodNameLocked())
   162  		}
   163  
   164  		restore, err := d.validateEndpoint(ep)
   165  		if err != nil {
    166  			scopedLog.WithError(err).Warning("Unable to restore endpoint, ignoring")
   167  			failed++
   168  		}
   169  		if !restore {
   170  			if clean {
   171  				state.toClean = append(state.toClean, ep)
   172  			}
   173  			continue
   174  		}
   175  
   176  		ep.UnconditionalLock()
   177  		scopedLog.Debug("Restoring endpoint")
   178  		ep.LogStatusOKLocked(endpoint.Other, "Restoring endpoint from previous cilium instance")
   179  
   180  		if !option.Config.KeepConfig {
   181  			ep.SetDefaultOpts(option.Config.Opts)
   182  			alwaysEnforce := policy.GetPolicyEnabled() == option.AlwaysEnforce
   183  			ep.SetDesiredIngressPolicyEnabledLocked(alwaysEnforce)
   184  			ep.SetDesiredEgressPolicyEnabledLocked(alwaysEnforce)
   185  		}
   186  
   187  		ep.Unlock()
   188  
   189  		ep.SkipStateClean()
   190  
   191  		state.restored = append(state.restored, ep)
   192  
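         		// Remove the endpoint's IPs from the dump of the BPF endpoint map
         		// so they are not treated as stale entries further below.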
   193  		if existingEndpoints != nil {
   194  			delete(existingEndpoints, ep.IPv4.String())
   195  			delete(existingEndpoints, ep.IPv6.String())
   196  		}
   197  	}
   198  
   199  	log.WithFields(logrus.Fields{
   200  		"restored": len(state.restored),
   201  		"failed":   failed,
   202  	}).Info("Endpoints restored")
   203  
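         	// Any entries remaining in the BPF endpoint map at this point do not
         	// belong to a restored endpoint and are therefore stale; remove them,
         	// skipping entries that represent the host itself.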
   204  	if existingEndpoints != nil {
   205  		for hostIP, info := range existingEndpoints {
   206  			if ip := net.ParseIP(hostIP); !info.IsHost() && ip != nil {
   207  				if err := lxcmap.DeleteEntry(ip); err != nil {
   208  					log.WithError(err).Warn("Unable to delete obsolete endpoint from BPF map")
   209  				} else {
   210  					log.Debugf("Removed outdated endpoint %d from endpoint map", info.LxcID)
   211  				}
   212  			}
   213  		}
   214  	}
   215  
   216  	return state, nil
   217  }
   218  
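         // regenerateRestoredEndpoints exposes the restored endpoints via the endpoint
         // manager, deletes the endpoints that were marked for cleanup and regenerates
         // the restored endpoints in the background. The returned channel is closed
         // once all restored endpoints have finished regenerating.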
   219  func (d *Daemon) regenerateRestoredEndpoints(state *endpointRestoreState) (restoreComplete chan struct{}) {
    220  	restoreComplete = make(chan struct{})
   221  
   222  	log.WithField("numRestored", len(state.restored)).Info("Regenerating restored endpoints")
   223  
   224  	// Before regenerating, check whether the CT map has properties that
    225  	// match this Cilium userspace instance. If not, it must be removed.
   226  	ctmap.DeleteIfUpgradeNeeded(nil)
   227  
    228  	// We need to signal when the endpoints have been regenerated, i.e. when
    229  	// they have finished rebuilding after being restored.
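         	// The channel is buffered so that every regeneration goroutine can
         	// report its result without blocking.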
   230  	epRegenerated := make(chan bool, len(state.restored))
   231  
   232  	// Insert all endpoints into the endpoint list first before starting
   233  	// the regeneration. This is required to ensure that if an individual
   234  	// regeneration causes an identity change of an endpoint, the new
   235  	// identity will trigger a policy recalculation of all endpoints to
   236  	// account for the new identity during the grace period. For this
   237  	// purpose, all endpoints being restored must already be in the
   238  	// endpoint list.
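         	// Iterate in reverse so that removing an element below does not shift
         	// the indices of the endpoints that have not been visited yet.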
   239  	for i := len(state.restored) - 1; i >= 0; i-- {
   240  		ep := state.restored[i]
   241  		// If the endpoint has local conntrack option enabled, then
   242  		// check whether the CT map needs upgrading (and do so).
   243  		if ep.Options.IsEnabled(option.ConntrackLocal) {
   244  			ctmap.DeleteIfUpgradeNeeded(ep)
   245  		}
   246  
   247  		// Insert into endpoint manager so it can be regenerated when calls to
   248  		// RegenerateAllEndpoints() are made. This must be done synchronously (i.e.,
   249  		// not in a goroutine) because regenerateRestoredEndpoints must guarantee
   250  		// upon returning that endpoints are exposed to other subsystems via
   251  		// endpointmanager.
   252  
   253  		if err := endpointmanager.Insert(ep); err != nil {
   254  			log.WithError(err).Warning("Unable to restore endpoint")
   255  			// remove endpoint from slice of endpoints to restore
   256  			state.restored = append(state.restored[:i], state.restored[i+1:]...)
   257  		}
   258  	}
   259  
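         	// Regenerate each restored endpoint in its own goroutine. Every
         	// goroutine reports success or failure on epRegenerated exactly once.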
   260  	for _, ep := range state.restored {
   261  		go func(ep *endpoint.Endpoint, epRegenerated chan<- bool) {
   262  			if err := ep.RLockAlive(); err != nil {
   263  				ep.LogDisconnectedMutexAction(err, "before filtering labels during regenerating restored endpoint")
   264  				epRegenerated <- false
   265  				return
   266  			}
   267  			scopedLog := log.WithField(logfields.EndpointID, ep.ID)
   268  			// Filter the restored labels with the new daemon's filter
   269  			l, _ := labels.FilterLabels(ep.OpLabels.AllLabels())
   270  			ep.RUnlock()
   271  
   272  			allocateCtx, cancel := context.WithTimeout(context.Background(), option.Config.KVstoreConnectivityTimeout)
   273  			defer cancel()
   274  			identity, _, err := cache.AllocateIdentity(allocateCtx, d, l)
   275  
   276  			if err != nil {
   277  				scopedLog.WithError(err).Warn("Unable to restore endpoint")
   278  				epRegenerated <- false
   279  				return
   280  			}
   281  
   282  			// Wait for initial identities and ipcache from the
   283  			// kvstore before doing any policy calculation for
    284  			// endpoints that neither have a fixed identity nor
    285  			// are well known.
   286  			if !identity.IsFixed() && !identity.IsWellKnown() {
   287  				identityCtx, cancel := context.WithTimeout(context.Background(), option.Config.KVstoreConnectivityTimeout)
   288  				defer cancel()
   289  
   290  				err = cache.WaitForInitialGlobalIdentities(identityCtx)
   291  				if err != nil {
   292  					scopedLog.WithError(err).Warn("Failed while waiting for initial global identities")
   293  					epRegenerated <- false
   294  					return
   295  				}
   296  				if option.Config.KVStore != "" {
   297  					ipcache.WaitForKVStoreSync()
   298  				}
   299  			}
   300  
   301  			if err := ep.LockAlive(); err != nil {
   302  				scopedLog.Warn("Endpoint to restore has been deleted")
   303  				epRegenerated <- false
   304  				return
   305  			}
   306  
   307  			ep.SetStateLocked(endpoint.StateRestoring, "Synchronizing endpoint labels with KVStore")
   308  
   309  			if ep.SecurityIdentity != nil {
   310  				if oldSecID := ep.SecurityIdentity.ID; identity.ID != oldSecID {
   311  					log.WithFields(logrus.Fields{
   312  						logfields.EndpointID:              ep.ID,
   313  						logfields.IdentityLabels + ".old": oldSecID,
   314  						logfields.IdentityLabels + ".new": identity.ID,
   315  					}).Info("Security identity for endpoint is different from the security identity restored for the endpoint")
   316  
   317  					// The identity of the endpoint being
   318  					// restored has changed. This can be
   319  					// caused by two main reasons:
   320  					//
   321  					// 1) Cilium has been upgraded,
   322  					// downgraded or the configuration has
   323  					// changed and the new version or
   324  					// configuration causes different
   325  					// labels to be considered security
   326  					// relevant for this endpoint.
   327  					//
   328  					// Immediately using the identity may
   329  					// cause connectivity problems if this
   330  					// is the first endpoint in the cluster
   331  					// to use the new identity. All other
   332  					// nodes will not have had a chance to
   333  					// adjust the security policies for
   334  					// their endpoints. Hence, apply a
   335  					// grace period to allow for the
   336  					// update. It is not required to check
   337  					// any local endpoints for potential
   338  					// outdated security rules, the
   339  					// notification of the new security
   340  					// identity will have been received and
   341  					// will trigger the necessary
   342  					// recalculation of all local
   343  					// endpoints.
   344  					//
   345  					// 2) The identity is outdated as the
   346  					// state in the kvstore has changed.
   347  					// This reason would justify an
   348  					// immediate use of the new identity
   349  					// but given the current identity is
   350  					// already in place, it is also correct
   351  					// to continue using it for the
   352  					// duration of a grace period.
   353  					time.Sleep(defaults.IdentityChangeGracePeriod)
   354  				}
   355  			}
   356  			// The identity of a freshly restored endpoint is incomplete due to some
   357  			// parts of the identity not being marshaled to JSON. Hence we must set
    358  			// the identity even if it has not changed.
   359  			ep.SetIdentity(identity, true)
   360  			ep.Unlock()
   361  
   362  			regenerationMetadata := &regeneration.ExternalRegenerationMetadata{
   363  				Reason: "syncing state to host",
   364  			}
   365  			if buildSuccess := <-ep.Regenerate(regenerationMetadata); !buildSuccess {
   366  				scopedLog.Warn("Failed while regenerating endpoint")
   367  				epRegenerated <- false
   368  				return
   369  			}
   370  
    371  			// NOTE: UnconditionalRLock is used here because it is only needed for logging the already restored endpoint
   372  			ep.UnconditionalRLock()
   373  			scopedLog.WithField(logfields.IPAddr, []string{ep.IPv4.String(), ep.IPv6.String()}).Info("Restored endpoint")
   374  			ep.RUnlock()
   375  			epRegenerated <- true
   376  		}(ep, epRegenerated)
   377  	}
   378  
   379  	var endpointCleanupCompleted sync.WaitGroup
   380  	for _, ep := range state.toClean {
   381  		endpointCleanupCompleted.Add(1)
   382  		go func(ep *endpoint.Endpoint) {
    383  			// The IP was not allocated yet, so it does not need to be freed.
    384  			// The identity may be allocated in the kvstore but we can't
    385  			// release it easily as that would require blocking on kvstore
    386  			// connectivity, which we can't do at this point. Let the lease
    387  			// expire to release the identity.
   388  			d.deleteEndpointQuiet(ep, endpoint.DeleteConfig{
   389  				NoIdentityRelease: true,
   390  				NoIPRelease:       true,
   391  			})
   392  			endpointCleanupCompleted.Done()
   393  		}(ep)
   394  	}
   395  	endpointCleanupCompleted.Wait()
   396  
   397  	go func() {
   398  		regenerated, total := 0, 0
   399  		if len(state.restored) > 0 {
   400  			for buildSuccess := range epRegenerated {
   401  				if buildSuccess {
   402  					regenerated++
   403  				}
   404  				total++
   405  				if total >= len(state.restored) {
   406  					break
   407  				}
   408  			}
   409  		}
   410  		close(epRegenerated)
   411  
   412  		log.WithFields(logrus.Fields{
   413  			"regenerated": regenerated,
   414  			"total":       total,
   415  		}).Info("Finished regenerating restored endpoints")
   416  		close(restoreComplete)
   417  	}()
   418  
   419  	return
   420  }
   421  
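         // allocateIPsLocked re-allocates the restored IPv6 and IPv4 addresses of the
         // endpoint from the IPAM pool. If the IPv4 allocation fails, the IPv6 address
         // allocated just before is released again so that it is not leaked.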
   422  func (d *Daemon) allocateIPsLocked(ep *endpoint.Endpoint) error {
   423  	var err error
   424  
   425  	if option.Config.EnableIPv6 && ep.IPv6 != nil {
   426  		err = d.ipam.AllocateIP(ep.IPv6.IP(), ep.HumanStringLocked()+" [restored]")
   427  		if err != nil {
   428  			return fmt.Errorf("unable to reallocate IPv6 address: %s", err)
   429  		}
   430  
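         		// If the IPv4 allocation below fails, release the IPv6 address
         		// that was just allocated.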
   431  		defer func() {
   432  			if err != nil {
   433  				d.ipam.ReleaseIP(ep.IPv6.IP())
   434  			}
   435  		}()
   436  	}
   437  
   438  	if option.Config.EnableIPv4 && ep.IPv4 != nil {
   439  		if err = d.ipam.AllocateIP(ep.IPv4.IP(), ep.HumanStringLocked()+" [restored]"); err != nil {
   440  			return fmt.Errorf("unable to reallocate IPv4 address: %s", err)
   441  		}
   442  	}
   443  
   444  	return nil
   445  }
   446  
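         // initRestore regenerates the restored endpoints and synchronizes the
         // load-balancer maps, or, if state restore is disabled, ignores existing
         // endpoints and ends parallel map mode immediately. It returns a channel
         // that is closed once all restored endpoints have been regenerated; the
         // channel is nil if state restore is disabled.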
   447  func (d *Daemon) initRestore(restoredEndpoints *endpointRestoreState) chan struct{} {
   448  	bootstrapStats.restore.Start()
   449  	var restoreComplete chan struct{}
   450  	if option.Config.RestoreState {
    451  		// When we regenerate restored endpoints, it is guaranteed that we have
   452  		// received the full list of policies present at the time the daemon
   453  		// is bootstrapped.
   454  		restoreComplete = d.regenerateRestoredEndpoints(restoredEndpoints)
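         		// End parallel map mode only once all restored endpoints have been
         		// regenerated.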
   455  		go func() {
   456  			<-restoreComplete
   457  			endParallelMapMode()
   458  		}()
   459  
   460  		go func() {
   461  			if k8s.IsEnabled() {
    462  				// Start a controller which removes any leftover Kubernetes
   463  				// services that may have been deleted while Cilium was not
   464  				// running. Once this controller succeeds, because it has no
   465  				// RunInterval specified, it will not run again unless updated
   466  				// elsewhere. This means that if, for instance, a user manually
    467  				// adds a service via the CLI into the BPF maps, it will not be
    468  				// cleaned up by the daemon until it restarts.
   469  				controller.NewManager().UpdateController("sync-lb-maps-with-k8s-services",
   470  					controller.ControllerParams{
   471  						DoFunc: func(ctx context.Context) error {
   472  							return d.syncLBMapsWithK8s()
   473  						},
   474  					},
   475  				)
   476  				return
   477  			}
   478  			if err := d.SyncLBMap(); err != nil {
    479  				log.WithError(err).Warn("Error while syncing load-balancer maps")
   480  			}
   481  		}()
   482  	} else {
   483  		log.Info("State restore is disabled. Existing endpoints on node are ignored")
    484  		// We need to read all docker containers so that we do not
    485  		// allocate the same IP addresses again and so that these
    486  		// running containers are ignored from now on.
   487  		workloads.IgnoreRunningWorkloads()
   488  
   489  		// No restore happened, end parallel map mode immediately
   490  		endParallelMapMode()
   491  	}
   492  	bootstrapStats.restore.End(true)
   493  
   494  	return restoreComplete
   495  }