github.com/cilium/cilium@v1.16.2/pkg/endpointcleanup/cleanup.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright Authors of Cilium 3 4 package endpointcleanup 5 6 import ( 7 "context" 8 "errors" 9 "fmt" 10 11 "github.com/cilium/hive/cell" 12 "github.com/cilium/hive/job" 13 "github.com/sirupsen/logrus" 14 k8serrors "k8s.io/apimachinery/pkg/api/errors" 15 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 16 apiTypes "k8s.io/apimachinery/pkg/types" 17 "k8s.io/apimachinery/pkg/util/wait" 18 19 "github.com/cilium/cilium/pkg/endpoint" 20 "github.com/cilium/cilium/pkg/endpointstate" 21 cilium_v2a1 "github.com/cilium/cilium/pkg/k8s/apis/cilium.io/v2alpha1" 22 k8sClient "github.com/cilium/cilium/pkg/k8s/client" 23 cilium_v2 "github.com/cilium/cilium/pkg/k8s/client/clientset/versioned/typed/cilium.io/v2" 24 "github.com/cilium/cilium/pkg/k8s/resource" 25 "github.com/cilium/cilium/pkg/k8s/types" 26 "github.com/cilium/cilium/pkg/logging/logfields" 27 "github.com/cilium/cilium/pkg/node" 28 "github.com/cilium/cilium/pkg/option" 29 "github.com/cilium/cilium/pkg/promise" 30 "github.com/cilium/cilium/pkg/resiliency" 31 "github.com/cilium/cilium/pkg/time" 32 ) 33 34 type localEndpointCache interface { 35 LookupCEPName(namespacedName string) *endpoint.Endpoint 36 } 37 38 type params struct { 39 cell.In 40 41 Logger logrus.FieldLogger 42 Lifecycle cell.Lifecycle 43 JobGroup job.Group 44 Health cell.Health 45 CiliumEndpoint resource.Resource[*types.CiliumEndpoint] 46 CiliumEndpointSlice resource.Resource[*cilium_v2a1.CiliumEndpointSlice] 47 Clientset k8sClient.Clientset 48 RestorerPromise promise.Promise[endpointstate.Restorer] 49 EndpointsCache localEndpointCache 50 Cfg Config 51 DaemonCfg *option.DaemonConfig 52 } 53 54 type cleanup struct { 55 log logrus.FieldLogger 56 ciliumEndpoint resource.Resource[*types.CiliumEndpoint] 57 ciliumEndpointSlice resource.Resource[*cilium_v2a1.CiliumEndpointSlice] 58 ciliumClient cilium_v2.CiliumV2Interface 59 restorerPromise promise.Promise[endpointstate.Restorer] 60 endpointsCache localEndpointCache 61 ciliumEndpointSliceEnabled bool 62 storeReleaseFn func() 63 } 64 65 func registerCleanup(p params) { 66 if !p.Clientset.IsEnabled() || !p.Cfg.EnableStaleCiliumEndpointCleanup || p.DaemonCfg.DisableCiliumEndpointCRD { 67 p.Logger.Info("Init procedure to clean up stale CiliumEndpoint disabled") 68 return 69 } 70 71 cleanup := &cleanup{ 72 log: p.Logger, 73 ciliumEndpoint: p.CiliumEndpoint, 74 ciliumEndpointSlice: p.CiliumEndpointSlice, 75 ciliumClient: p.Clientset.CiliumV2(), 76 restorerPromise: p.RestorerPromise, 77 endpointsCache: p.EndpointsCache, 78 ciliumEndpointSliceEnabled: p.DaemonCfg.EnableCiliumEndpointSlice, 79 } 80 81 p.JobGroup.Add( 82 job.OneShot("endpoint-cleanup", func(ctx context.Context, health cell.Health) error { 83 return cleanup.run(ctx) 84 }), 85 ) 86 } 87 88 func (c *cleanup) run(ctx context.Context) error { 89 defer func() { 90 if c.storeReleaseFn != nil { 91 c.storeReleaseFn() 92 } 93 }() 94 95 // Use restored endpoints to delete local CiliumEndpoints which are not in the restored endpoint cache. 96 // This will clear out any CiliumEndpoints that may be stale. 97 // Likely causes for this are Pods having their init container restarted or the node being restarted. 98 // This must wait for both K8s watcher caches to be synced and local endpoint restoration to be complete. 99 // Note: Synchronization of endpoints to their CEPs may not be complete at this point, but we only have to 100 // know what endpoints exist post-restoration in our endpointManager cache to perform cleanup. 101 restorer, err := c.restorerPromise.Await(ctx) 102 if err != nil { 103 return err 104 } 105 restorer.WaitForEndpointRestore(ctx) 106 107 var ( 108 retries int 109 bo = wait.Backoff{ 110 Duration: 500 * time.Millisecond, 111 Factor: 1, 112 Jitter: 0.1, 113 Steps: 5, 114 Cap: 0, 115 } 116 ) 117 err = wait.ExponentialBackoffWithContext(ctx, bo, func(ctx context.Context) (done bool, err error) { 118 if c.ciliumEndpointSliceEnabled { 119 err = c.cleanStaleCESs(ctx) 120 } else { 121 err = c.cleanStaleCEPs(ctx) 122 } 123 if err != nil { 124 retries++ 125 c.log.WithError(err).WithField(logfields.Attempt, retries).Error("Failed to clean up stale CEPs") 126 if resiliency.IsRetryable(err) { 127 return false, nil 128 } 129 return true, err 130 } 131 return true, nil 132 }) 133 if err != nil { 134 c.log.WithError(err).Error("Failed to clean up stale CEPs after multiple attempts") 135 } 136 return err 137 } 138 139 func (c *cleanup) cleanStaleCEPs(ctx context.Context) error { 140 var errs error 141 store, err := c.ciliumEndpoint.Store(ctx) 142 if err != nil { 143 return fmt.Errorf("failed to get CiliumEndpoint store: %w", err) 144 } 145 c.storeReleaseFn = store.Release 146 objs, err := store.ByIndex("localNode", node.GetCiliumEndpointNodeIP()) 147 if err != nil { 148 return fmt.Errorf("failed to get indexed CiliumEndpointSlice from store: %w", err) 149 } 150 for _, cep := range objs { 151 if cep.Networking.NodeIP == node.GetCiliumEndpointNodeIP() && c.endpointsCache.LookupCEPName(cep.Namespace+"/"+cep.Name) == nil { 152 if err := c.deleteCiliumEndpoint(ctx, cep.Namespace, cep.Name, &cep.ObjectMeta.UID); err != nil { 153 errs = errors.Join(errs, err) 154 } 155 } 156 } 157 return errs 158 } 159 160 func (c *cleanup) cleanStaleCESs(ctx context.Context) error { 161 var errs error 162 store, err := c.ciliumEndpointSlice.Store(ctx) 163 if err != nil { 164 return fmt.Errorf("failed to get CiliumEndpointSlice store: %w", err) 165 } 166 c.storeReleaseFn = store.Release 167 objs, err := store.ByIndex("localNode", node.GetCiliumEndpointNodeIP()) 168 if err != nil { 169 return fmt.Errorf("failed to get indexed CiliumEndpointSlice from store: %w", err) 170 } 171 for _, ces := range objs { 172 for _, cep := range ces.Endpoints { 173 if cep.Networking.NodeIP == node.GetCiliumEndpointNodeIP() && c.endpointsCache.LookupCEPName(ces.Namespace+"/"+cep.Name) == nil { 174 if err := c.deleteCiliumEndpoint(ctx, ces.Namespace, cep.Name, nil); err != nil { 175 errs = errors.Join(errs, err) 176 } 177 } 178 } 179 } 180 return errs 181 } 182 183 // deleteCiliumEndpoint safely deletes a CEP by name, if no UID is passed this will reverify that 184 // the CEP is still local before doing a delete. 185 func (c *cleanup) deleteCiliumEndpoint(ctx context.Context, cepNamespace, cepName string, cepUID *apiTypes.UID) error { 186 logwf := c.log.WithFields(logrus.Fields{ 187 logfields.CEPName: cepName, 188 logfields.K8sNamespace: cepNamespace, 189 }) 190 191 // To avoid having to store CEP UIDs in CES Endpoints array, we have to get the latest 192 // referenced CEP from apiserver to verify that it still references this node. 193 // To avoid excessive api calls, we only do this if CES is enabled and the CEP 194 // appears to be stale. 195 if cepUID == nil && c.ciliumEndpointSliceEnabled { 196 cep, err := c.ciliumClient.CiliumEndpoints(cepNamespace).Get(ctx, cepName, metav1.GetOptions{}) 197 if err != nil { 198 if k8serrors.IsNotFound(err) { 199 logwf.WithError(err).Info("CEP no longer exists, skipping staleness check") 200 return nil 201 } 202 logwf.WithError(err).Error("Failed to get possibly stale ciliumendpoints from apiserver") 203 return resiliency.Retryable(err) 204 } 205 if cep.Status.Networking.NodeIP != node.GetCiliumEndpointNodeIP() { 206 logwf.WithError(err).Debug("Stale CEP fetched apiserver no longer references this Node, skipping.") 207 return nil 208 } 209 cepUID = &cep.ObjectMeta.UID 210 } 211 // There exists a local CiliumEndpoint that is not in the endpoint manager. 212 // This function is run after completing endpoint restoration from local state and K8s cache sync. 213 // Therefore, we can delete the CiliumEndpoint as it is not referencing a Pod that is being managed. 214 // This may occur for various reasons: 215 // * Pod was restarted while Cilium was not running (likely prior to CNI conf being installed). 216 // * Local endpoint was deleted (i.e. due to reboot + temporary filesystem) and Cilium or the Pod where restarted. 217 logwf.Info("Found stale ciliumendpoint for local pod that is not being managed, deleting.") 218 if err := c.ciliumClient.CiliumEndpoints(cepNamespace).Delete(ctx, cepName, metav1.DeleteOptions{ 219 Preconditions: &metav1.Preconditions{ 220 UID: cepUID, 221 }, 222 }); err != nil { 223 if k8serrors.IsNotFound(err) { 224 // CEP not found, likely already deleted. Do not log as an error as that 225 // will fail CI runs. 226 logwf.Debug("Could not delete stale CEP") 227 return nil 228 } 229 logwf.Error("Could not delete stale CEP") 230 return resiliency.Retryable(err) 231 } 232 233 return nil 234 }