github.com/cilium/cilium@v1.16.2/cilium-health/launch/endpoint.go (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright Authors of Cilium
     3  
     4  package launch
     5  
     6  import (
     7  	"context"
     8  	"fmt"
     9  	"net"
    10  	"os"
    11  	"os/exec"
    12  	"path/filepath"
    13  	"strconv"
    14  
    15  	"github.com/spf13/afero"
    16  	"github.com/vishvananda/netlink"
    17  
    18  	"github.com/cilium/cilium/api/v1/models"
    19  	"github.com/cilium/cilium/pkg/datapath/connector"
    20  	"github.com/cilium/cilium/pkg/datapath/linux/bigtcp"
    21  	"github.com/cilium/cilium/pkg/datapath/linux/route"
    22  	"github.com/cilium/cilium/pkg/datapath/linux/sysctl"
    23  	datapathOption "github.com/cilium/cilium/pkg/datapath/option"
    24  	"github.com/cilium/cilium/pkg/defaults"
    25  	"github.com/cilium/cilium/pkg/endpoint"
    26  	"github.com/cilium/cilium/pkg/endpoint/regeneration"
    27  	"github.com/cilium/cilium/pkg/health/probe"
    28  	"github.com/cilium/cilium/pkg/identity/cache"
    29  	"github.com/cilium/cilium/pkg/ipam"
    30  	ipamOption "github.com/cilium/cilium/pkg/ipam/option"
    31  	"github.com/cilium/cilium/pkg/ipcache"
    32  	"github.com/cilium/cilium/pkg/labels"
    33  	"github.com/cilium/cilium/pkg/launcher"
    34  	"github.com/cilium/cilium/pkg/logging/logfields"
    35  	"github.com/cilium/cilium/pkg/metrics"
    36  	"github.com/cilium/cilium/pkg/mtu"
    37  	"github.com/cilium/cilium/pkg/netns"
    38  	"github.com/cilium/cilium/pkg/node"
    39  	"github.com/cilium/cilium/pkg/option"
    40  	"github.com/cilium/cilium/pkg/pidfile"
    41  	"github.com/cilium/cilium/pkg/policy"
    42  	"github.com/cilium/cilium/pkg/time"
    43  )
    44  
    45  const (
    46  	ciliumHealth = "cilium-health"
    47  	binaryName   = "cilium-health-responder"
    48  
    49  	// healthName is the host-side virtual device name for cilium-health EP
    50  	healthName = "lxc_health"
    51  
    52  	// legacyHealthName is the host-side cilium-health EP device name used in
    53  	// older Cilium versions. Used for removal only.
    54  	legacyHealthName = "cilium_health"
    55  
    56  	// epIfaceName is the endpoint-side link device name for cilium-health.
    57  	epIfaceName = "cilium"
    58  
    59  	// PidfilePath
    60  	PidfilePath = "health-endpoint.pid"
    61  
    62  	// LaunchTime is the expected time within which the health endpoint
    63  	// should be able to be successfully run and its BPF program attached.
    64  	LaunchTime = 30 * time.Second
    65  )
    66  
    67  func getHealthRoutes(addressing *models.NodeAddressing, mtuConfig mtu.MTU) ([]route.Route, error) {
    68  	routes := []route.Route{}
    69  
    70  	if option.Config.EnableIPv4 {
    71  		v4Routes, err := connector.IPv4Routes(addressing, mtuConfig.GetRouteMTU())
    72  		if err == nil {
    73  			routes = append(routes, v4Routes...)
    74  		} else {
    75  			log.Debugf("Couldn't get IPv4 routes for health routing")
    76  		}
    77  	}
    78  
    79  	if option.Config.EnableIPv6 {
    80  		v6Routes, err := connector.IPv6Routes(addressing, mtuConfig.GetRouteMTU())
    81  		if err != nil {
    82  			return nil, fmt.Errorf("Failed to get IPv6 routes")
    83  		}
    84  		routes = append(routes, v6Routes...)
    85  	}
    86  
    87  	return routes, nil
    88  }
    89  
    90  // configureHealthRouting is meant to be run inside the health service netns
    91  func configureHealthRouting(routes []route.Route, dev string) error {
    92  	for _, rt := range routes {
    93  		cmd := rt.ToIPCommand(dev)
    94  		if len(cmd) < 2 {
    95  			return fmt.Errorf("ip command %s not expected len!", cmd)
    96  		}
    97  		prog := cmd[0]
    98  		args := cmd[1:]
    99  		log.Debugf("Running \"%s %+v\"", prog, args)
   100  		out, err := exec.Command(prog, args...).CombinedOutput()
   101  		if err == nil && len(out) > 0 {
   102  			log.WithField("prog", prog).WithField("args", args).Warn(out)
   103  		} else if err != nil {
   104  			return fmt.Errorf("error running %q with args %q: %w", prog, args, err)
   105  		}
   106  	}
   107  	return nil
   108  }
   109  
   110  // configureHealthInterface is meant to be run inside the health service netns
   111  func configureHealthInterface(ifName string, ip4Addr, ip6Addr *net.IPNet) error {
   112  	link, err := netlink.LinkByName(ifName)
   113  	if err != nil {
   114  		return err
   115  	}
   116  
   117  	if ip6Addr == nil {
   118  		// Use the direct sysctl without reconciliation of errors since we're in a different
   119  		// network namespace and thus can't use the normal sysctl API.
   120  		sysctl := sysctl.NewDirectSysctl(afero.NewOsFs(), option.Config.ProcFs)
   121  		// Ignore the error; if IPv6 is completely disabled
   122  		// then it's okay if we can't write the sysctl.
   123  		_ = sysctl.Enable([]string{"net", "ipv6", "conf", ifName, "disable_ipv6"})
   124  	} else {
   125  		if err = netlink.AddrAdd(link, &netlink.Addr{IPNet: ip6Addr}); err != nil {
   126  			return err
   127  		}
   128  	}
   129  
   130  	if ip4Addr != nil {
   131  		if err = netlink.AddrAdd(link, &netlink.Addr{IPNet: ip4Addr}); err != nil {
   132  			return err
   133  		}
   134  	}
   135  
   136  	if err = netlink.LinkSetUp(link); err != nil {
   137  		return err
   138  	}
   139  
   140  	lo, err := netlink.LinkByName("lo")
   141  	if err != nil {
   142  		return err
   143  	}
   144  
   145  	if err = netlink.LinkSetUp(lo); err != nil {
   146  		return err
   147  	}
   148  
   149  	return nil
   150  }
   151  
   152  // Client wraps a client to a specific cilium-health endpoint instance, to
   153  // provide convenience methods such as PingEndpoint().
   154  type Client struct {
   155  	host string
   156  }
   157  
   158  // PingEndpoint attempts to make an API ping request to the local cilium-health
   159  // endpoint, and returns whether this was successful.
   160  func (c *Client) PingEndpoint() error {
   161  	return probe.GetHello(c.host)
   162  }
   163  
   164  // KillEndpoint attempts to kill any existing cilium-health endpoint if it
   165  // exists.
   166  //
   167  // This is intended to be invoked in multiple situations:
   168  //   - The health endpoint has never been run before
   169  //   - The health endpoint was run during a previous run of the Cilium agent
   170  //   - The health endpoint crashed during the current run of the Cilium agent
   171  //     and needs to be cleaned up before it is restarted.
   172  func KillEndpoint() {
   173  	path := filepath.Join(option.Config.StateDir, PidfilePath)
   174  	scopedLog := log.WithField(logfields.PIDFile, path)
   175  	scopedLog.Debug("Killing old health endpoint process")
   176  	pid, err := pidfile.Kill(path)
   177  	if err != nil {
   178  		scopedLog.WithError(err).Warning("Failed to kill cilium-health-responder")
   179  	} else if pid != 0 {
   180  		scopedLog.WithField(logfields.PID, pid).Debug("Killed endpoint process")
   181  	}
   182  }
   183  
   184  // CleanupEndpoint cleans up remaining resources associated with the health
   185  // endpoint.
   186  //
   187  // This is expected to be called after the process is killed and the endpoint
   188  // is removed from the endpointmanager.
   189  func CleanupEndpoint() {
   190  	// Removes the interfaces used for the endpoint process.
   191  	//
   192  	// Explicit removal is performed to ensure that everything referencing the network namespace
   193  	// the endpoint process is executed under is disposed, so that the network namespace itself is properly disposed.
   194  	switch option.Config.DatapathMode {
   195  	case datapathOption.DatapathModeVeth, datapathOption.DatapathModeNetkit, datapathOption.DatapathModeNetkitL2:
   196  		for _, iface := range []string{legacyHealthName, healthName} {
   197  			scopedLog := log.WithField(logfields.Interface, iface)
   198  			if link, err := netlink.LinkByName(iface); err == nil {
   199  				err = netlink.LinkDel(link)
   200  				if err != nil {
   201  					scopedLog.WithError(err).Infof("Couldn't delete cilium-health %s device",
   202  						option.Config.DatapathMode)
   203  				}
   204  			} else {
   205  				scopedLog.WithError(err).Debug("Didn't find existing device")
   206  			}
   207  		}
   208  	}
   209  }
   210  
   211  // EndpointAdder is any type which adds an endpoint to be managed by Cilium.
   212  type EndpointAdder interface {
   213  	AddEndpoint(owner regeneration.Owner, ep *endpoint.Endpoint) error
   214  }
   215  
   216  // LaunchAsEndpoint launches the cilium-health agent in a nested network
   217  // namespace and attaches it to Cilium the same way as any other endpoint, but
   218  // with special reserved labels.
   219  //
   220  // CleanupEndpoint() must be called before calling LaunchAsEndpoint() to ensure
   221  // cleanup of prior cilium-health endpoint instances.
   222  func LaunchAsEndpoint(baseCtx context.Context,
   223  	owner regeneration.Owner,
   224  	policyGetter policyRepoGetter,
   225  	ipcache *ipcache.IPCache,
   226  	mtuConfig mtu.MTU,
   227  	bigTCPConfig *bigtcp.Configuration,
   228  	epMgr EndpointAdder,
   229  	allocator cache.IdentityAllocator,
   230  	routingConfig routingConfigurer,
   231  	sysctl sysctl.Sysctl,
   232  ) (*Client, error) {
   233  
   234  	var (
   235  		cmd  = launcher.Launcher{}
   236  		info = &models.EndpointChangeRequest{
   237  			ContainerName: ciliumHealth,
   238  			State:         models.EndpointStateWaitingDashForDashIdentity.Pointer(),
   239  			Addressing:    &models.AddressPair{},
   240  		}
   241  		healthIP               net.IP
   242  		ip4Address, ip6Address *net.IPNet
   243  	)
   244  
   245  	if healthIPv6 := node.GetEndpointHealthIPv6(); healthIPv6 != nil {
   246  		info.Addressing.IPV6 = healthIPv6.String()
   247  		info.Addressing.IPV6PoolName = ipam.PoolDefault().String()
   248  		ip6Address = &net.IPNet{IP: healthIPv6, Mask: defaults.ContainerIPv6Mask}
   249  		healthIP = healthIPv6
   250  	}
   251  	if healthIPv4 := node.GetEndpointHealthIPv4(); healthIPv4 != nil {
   252  		info.Addressing.IPV4 = healthIPv4.String()
   253  		info.Addressing.IPV4PoolName = ipam.PoolDefault().String()
   254  		ip4Address = &net.IPNet{IP: healthIPv4, Mask: defaults.ContainerIPv4Mask}
   255  		healthIP = healthIPv4
   256  	}
   257  
   258  	if option.Config.EnableEndpointRoutes {
   259  		disabled := false
   260  		dpConfig := &models.EndpointDatapathConfiguration{
   261  			InstallEndpointRoute: true,
   262  			RequireEgressProg:    true,
   263  			RequireRouting:       &disabled,
   264  		}
   265  		info.DatapathConfiguration = dpConfig
   266  	}
   267  
   268  	ns, err := netns.New()
   269  	if err != nil {
   270  		return nil, fmt.Errorf("create cilium-health netns: %w", err)
   271  	}
   272  
   273  	switch option.Config.DatapathMode {
   274  	case datapathOption.DatapathModeVeth:
   275  		_, epLink, err := connector.SetupVethWithNames(healthName, epIfaceName, mtuConfig.GetDeviceMTU(),
   276  			bigTCPConfig.GetGROIPv6MaxSize(), bigTCPConfig.GetGSOIPv6MaxSize(),
   277  			bigTCPConfig.GetGROIPv4MaxSize(), bigTCPConfig.GetGSOIPv4MaxSize(),
   278  			info, sysctl)
   279  		if err != nil {
   280  			return nil, fmt.Errorf("Error while creating veth: %w", err)
   281  		}
   282  		if err = netlink.LinkSetNsFd(epLink, int(ns.FD())); err != nil {
   283  			return nil, fmt.Errorf("failed to move device %q to health namespace: %w", epIfaceName, err)
   284  		}
   285  	case datapathOption.DatapathModeNetkit, datapathOption.DatapathModeNetkitL2:
   286  		l2Mode := option.Config.DatapathMode == datapathOption.DatapathModeNetkitL2
   287  		_, epLink, err := connector.SetupNetkitWithNames(healthName, epIfaceName, mtuConfig.GetDeviceMTU(),
   288  			bigTCPConfig.GetGROIPv6MaxSize(), bigTCPConfig.GetGSOIPv6MaxSize(),
   289  			bigTCPConfig.GetGROIPv4MaxSize(), bigTCPConfig.GetGSOIPv4MaxSize(), l2Mode,
   290  			info, sysctl)
   291  		if err != nil {
   292  			return nil, fmt.Errorf("Error while creating netkit: %w", err)
   293  		}
   294  		if err = netlink.LinkSetNsFd(epLink, int(ns.FD())); err != nil {
   295  			return nil, fmt.Errorf("failed to move device %q to health namespace: %w", epIfaceName, err)
   296  		}
   297  	}
   298  
   299  	if err := ns.Do(func() error {
   300  		return configureHealthInterface(epIfaceName, ip4Address, ip6Address)
   301  	}); err != nil {
   302  		return nil, fmt.Errorf("failed configure health interface %q: %w", epIfaceName, err)
   303  	}
   304  
   305  	pidfile := filepath.Join(option.Config.StateDir, PidfilePath)
   306  	args := []string{"--listen", strconv.Itoa(option.Config.ClusterHealthPort), "--pidfile", pidfile}
   307  	cmd.SetTarget(binaryName)
   308  	cmd.SetArgs(args)
   309  	log.Debugf("Spawning health endpoint with command %q %q", binaryName, args)
   310  
   311  	// Run the health binary inside a netnamespace. Since `Do()` implicitly does
   312  	// `runtime.LockOSThread` the exec'd binary is guaranteed to inherit the
   313  	// correct netnamespace.
   314  	if err := ns.Do(func() error {
   315  		return cmd.Run()
   316  	}); err != nil {
   317  		return nil, err
   318  	}
   319  
   320  	// Create the endpoint
   321  	ep, err := endpoint.NewEndpointFromChangeModel(baseCtx, owner, policyGetter, ipcache, nil, allocator, info)
   322  	if err != nil {
   323  		return nil, fmt.Errorf("Error while creating endpoint model: %w", err)
   324  	}
   325  
   326  	// Wait until the cilium-health endpoint is running before setting up routes
   327  	deadline := time.Now().Add(1 * time.Minute)
   328  	for {
   329  		if _, err := os.Stat(pidfile); err == nil {
   330  			log.WithField("pidfile", pidfile).Debug("cilium-health agent running")
   331  			break
   332  		} else if time.Now().After(deadline) {
   333  			return nil, fmt.Errorf("Endpoint failed to run: %w", err)
   334  		} else {
   335  			time.Sleep(1 * time.Second)
   336  		}
   337  	}
   338  
   339  	// Set up the endpoint routes.
   340  	routes, err := getHealthRoutes(node.GetNodeAddressing(), mtuConfig)
   341  	if err != nil {
   342  		return nil, fmt.Errorf("Error while getting routes for containername %q: %w", info.ContainerName, err)
   343  	}
   344  
   345  	err = ns.Do(func() error {
   346  		return configureHealthRouting(routes, epIfaceName)
   347  	})
   348  	if err != nil {
   349  		return nil, fmt.Errorf("Error while configuring routes: %w", err)
   350  	}
   351  
   352  	if option.Config.IPAM == ipamOption.IPAMENI || option.Config.IPAM == ipamOption.IPAMAlibabaCloud {
   353  		// ENI mode does not support IPv6.
   354  		if err := routingConfig.Configure(
   355  			healthIP,
   356  			mtuConfig.GetDeviceMTU(),
   357  			option.Config.EgressMultiHomeIPRuleCompat,
   358  			false,
   359  		); err != nil {
   360  
   361  			return nil, fmt.Errorf("Error while configuring health endpoint rules and routes: %w", err)
   362  		}
   363  	}
   364  
   365  	if err := epMgr.AddEndpoint(owner, ep); err != nil {
   366  		return nil, fmt.Errorf("Error while adding endpoint: %w", err)
   367  	}
   368  
   369  	// Give the endpoint a security identity
   370  	ctx, cancel := context.WithTimeout(baseCtx, LaunchTime)
   371  	defer cancel()
   372  	ep.UpdateLabels(ctx, labels.LabelSourceAny, labels.LabelHealth, nil, true)
   373  
   374  	// Initialize the health client to talk to this instance.
   375  	client := &Client{host: "http://" + net.JoinHostPort(healthIP.String(), strconv.Itoa(option.Config.ClusterHealthPort))}
   376  	metrics.SubprocessStart.WithLabelValues(ciliumHealth).Inc()
   377  
   378  	return client, nil
   379  }
   380  
   381  type policyRepoGetter interface {
   382  	GetPolicyRepository() *policy.Repository
   383  }
   384  
   385  type routingConfigurer interface {
   386  	Configure(ip net.IP, mtu int, compat bool, host bool) error
   387  }