github.com/looshlee/beatles@v0.0.0-20220727174639-742810ab631c/operator/k8s_cep_gc.go

// Copyright 2016-2018 Authors of Cilium
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
	"context"
	"time"

	"github.com/cilium/cilium/pkg/controller"
	"github.com/cilium/cilium/pkg/k8s"
	"github.com/cilium/cilium/pkg/logging/logfields"

	"github.com/sirupsen/logrus"
	core_v1 "k8s.io/api/core/v1"
	k8serrors "k8s.io/apimachinery/pkg/api/errors"
	meta_v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

var (
	// ciliumEndpointGCInterval is the interval between attempts of the CEP GC
	// controller.
	// Note that only one instance of this controller should run per cluster.
	ciliumEndpointGCInterval time.Duration
)
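
// ciliumEndpointGCInterval is expected to be wired from an operator CLI flag
// during startup. A minimal sketch of that wiring, assuming the cobra/viper
// pattern used elsewhere in Cilium (the flag name below is an assumption, not
// taken from this file):
//
//	flags.Duration("cilium-endpoint-gc-interval", 30*time.Minute,
//		"GC interval for CiliumEndpoint resources")
//	ciliumEndpointGCInterval = viper.GetDuration("cilium-endpoint-gc-interval")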

// enableCiliumEndpointSyncGC starts the cluster-singleton sweeper for
// CiliumEndpoint objects whose corresponding pod is no longer running. These
// objects are created by the sync-to-k8s-ciliumendpoint controller on each
// Endpoint.
// The general steps are:
//   - list all pods and index them by namespace/name
//   - page through all CEPs, a bounded number per request
//   - delete each CEP whose corresponding pod no longer exists
// CiliumEndpoint objects have the same name as the pod they represent, which
// is what makes the pod lookup possible.
func enableCiliumEndpointSyncGC() {
	var (
		controllerName = "to-k8s-ciliumendpoint-gc"
		scopedLog      = log.WithField("controller", controllerName)
	)

	scopedLog.Info("Starting to garbage collect stale CiliumEndpoint custom resources...")

	ciliumClient := ciliumK8sClient.CiliumV2()

	// This dummy manager is needed only to register the controller in the
	// global controller list.
	controller.NewManager().UpdateController(controllerName,
		controller.ControllerParams{
			RunInterval: ciliumEndpointGCInterval,
			DoFunc: func(ctx context.Context) error {
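				// DoFunc runs once per RunInterval. Each run lists all pods a
				// single time, then pages through the CEPs and deletes any
				// orphans it finds.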
				var (
					listOpts = meta_v1.ListOptions{Limit: 10}
					loopStop = time.Now().Add(ciliumEndpointGCInterval)
				)

				pods, err := k8s.Client().CoreV1().Pods(meta_v1.NamespaceAll).List(meta_v1.ListOptions{})
				if err != nil {
					return err
				}

				podsCache := map[string]*core_v1.Pod{}
				for _, pod := range pods.Items {
					pod := pod // copy the loop variable; &pod would otherwise alias a single, reused variable
					podsCache[pod.Namespace+"/"+pod.Name] = &pod
				}

			perCEPFetch:
				for time.Now().Before(loopStop) { // Guard against no-break bugs
					time.Sleep(time.Second) // Throttle lookups in case of a busy loop

					ceps, err := ciliumClient.CiliumEndpoints(meta_v1.NamespaceAll).List(listOpts)
					switch {
					case err != nil && k8serrors.IsResourceExpired(err) && ceps.Continue != "":
						// We saw a 410 ResourceExpired error, but the server
						// handed back a continue token for the now-current
						// snapshot, so we can refetch and resume iterating
						// from there.
						// See https://github.com/kubernetes/apimachinery/blob/master/pkg/apis/meta/v1/types.go#L350-L381
						// or the docs for k8s.io/apimachinery/pkg/apis/meta/v1.ListOptions
						// vendored into this repo. An isolated sketch of this
						// pagination pattern appears at the end of this file.
						listOpts.Continue = ceps.Continue
						continue perCEPFetch

					case err != nil:
						scopedLog.WithError(err).Debug("Cannot list CEPs")
						return err
					}

					// Set up listOpts for the next iteration.
					listOpts.Continue = ceps.Continue

					// For each CEP we fetched, check whether its pod still exists.
					for _, cep := range ceps.Items {
						cepFullName := cep.Namespace + "/" + cep.Name
						if _, exists := podsCache[cepFullName]; !exists {
							// The pod is gone; garbage collect this CEP. Use a
							// per-CEP logger rather than reassigning scopedLog,
							// which would accumulate pod fields across iterations.
							cepLog := scopedLog.WithFields(logrus.Fields{
								logfields.EndpointID: cep.Status.ID,
								logfields.K8sPodName: cepFullName,
							})
							cepLog.Debug("Orphaned CiliumEndpoint is being garbage collected")
							propagationPolicy := meta_v1.DeletePropagationBackground // a local is needed because the API wants a pointer to this const string
							err := ciliumClient.CiliumEndpoints(cep.Namespace).Delete(cep.Name, &meta_v1.DeleteOptions{PropagationPolicy: &propagationPolicy})
							// Tolerate CEPs that someone else already deleted.
							if err != nil && !k8serrors.IsNotFound(err) {
								cepLog.WithError(err).Debug("Unable to delete orphaned CEP")
								return err
							}
						}
					}
					if ceps.Continue != "" {
						// There is more data; fetch the next page.
						continue perCEPFetch
					}
					break perCEPFetch // break out as a safe default to avoid spammy loops
				}
				return nil
			},
		})
}
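
// listAllCEPNames is an illustrative sketch, not called anywhere, of the
// continue-token pagination pattern used by DoFunc above, shown in isolation.
// It reuses only calls that already appear in this file: list one page at a
// time and follow ListMeta.Continue until it comes back empty.
func listAllCEPNames() ([]string, error) {
	var names []string
	opts := meta_v1.ListOptions{Limit: 10}
	for {
		ceps, err := ciliumK8sClient.CiliumV2().CiliumEndpoints(meta_v1.NamespaceAll).List(opts)
		if err != nil {
			// A production caller would handle IsResourceExpired here, as
			// DoFunc does; the sketch simply gives up.
			return nil, err
		}
		for _, cep := range ceps.Items {
			names = append(names, cep.Namespace+"/"+cep.Name)
		}
		if ceps.Continue == "" {
			return names, nil
		}
		opts.Continue = ceps.Continue
	}
}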