github.com/cilium/cilium@v1.16.2/cilium-health/launch/endpoint.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright Authors of Cilium 3 4 package launch 5 6 import ( 7 "context" 8 "fmt" 9 "net" 10 "os" 11 "os/exec" 12 "path/filepath" 13 "strconv" 14 15 "github.com/spf13/afero" 16 "github.com/vishvananda/netlink" 17 18 "github.com/cilium/cilium/api/v1/models" 19 "github.com/cilium/cilium/pkg/datapath/connector" 20 "github.com/cilium/cilium/pkg/datapath/linux/bigtcp" 21 "github.com/cilium/cilium/pkg/datapath/linux/route" 22 "github.com/cilium/cilium/pkg/datapath/linux/sysctl" 23 datapathOption "github.com/cilium/cilium/pkg/datapath/option" 24 "github.com/cilium/cilium/pkg/defaults" 25 "github.com/cilium/cilium/pkg/endpoint" 26 "github.com/cilium/cilium/pkg/endpoint/regeneration" 27 "github.com/cilium/cilium/pkg/health/probe" 28 "github.com/cilium/cilium/pkg/identity/cache" 29 "github.com/cilium/cilium/pkg/ipam" 30 ipamOption "github.com/cilium/cilium/pkg/ipam/option" 31 "github.com/cilium/cilium/pkg/ipcache" 32 "github.com/cilium/cilium/pkg/labels" 33 "github.com/cilium/cilium/pkg/launcher" 34 "github.com/cilium/cilium/pkg/logging/logfields" 35 "github.com/cilium/cilium/pkg/metrics" 36 "github.com/cilium/cilium/pkg/mtu" 37 "github.com/cilium/cilium/pkg/netns" 38 "github.com/cilium/cilium/pkg/node" 39 "github.com/cilium/cilium/pkg/option" 40 "github.com/cilium/cilium/pkg/pidfile" 41 "github.com/cilium/cilium/pkg/policy" 42 "github.com/cilium/cilium/pkg/time" 43 ) 44 45 const ( 46 ciliumHealth = "cilium-health" 47 binaryName = "cilium-health-responder" 48 49 // healthName is the host-side virtual device name for cilium-health EP 50 healthName = "lxc_health" 51 52 // legacyHealthName is the host-side cilium-health EP device name used in 53 // older Cilium versions. Used for removal only. 54 legacyHealthName = "cilium_health" 55 56 // epIfaceName is the endpoint-side link device name for cilium-health. 57 epIfaceName = "cilium" 58 59 // PidfilePath 60 PidfilePath = "health-endpoint.pid" 61 62 // LaunchTime is the expected time within which the health endpoint 63 // should be able to be successfully run and its BPF program attached. 64 LaunchTime = 30 * time.Second 65 ) 66 67 func getHealthRoutes(addressing *models.NodeAddressing, mtuConfig mtu.MTU) ([]route.Route, error) { 68 routes := []route.Route{} 69 70 if option.Config.EnableIPv4 { 71 v4Routes, err := connector.IPv4Routes(addressing, mtuConfig.GetRouteMTU()) 72 if err == nil { 73 routes = append(routes, v4Routes...) 74 } else { 75 log.Debugf("Couldn't get IPv4 routes for health routing") 76 } 77 } 78 79 if option.Config.EnableIPv6 { 80 v6Routes, err := connector.IPv6Routes(addressing, mtuConfig.GetRouteMTU()) 81 if err != nil { 82 return nil, fmt.Errorf("Failed to get IPv6 routes") 83 } 84 routes = append(routes, v6Routes...) 85 } 86 87 return routes, nil 88 } 89 90 // configureHealthRouting is meant to be run inside the health service netns 91 func configureHealthRouting(routes []route.Route, dev string) error { 92 for _, rt := range routes { 93 cmd := rt.ToIPCommand(dev) 94 if len(cmd) < 2 { 95 return fmt.Errorf("ip command %s not expected len!", cmd) 96 } 97 prog := cmd[0] 98 args := cmd[1:] 99 log.Debugf("Running \"%s %+v\"", prog, args) 100 out, err := exec.Command(prog, args...).CombinedOutput() 101 if err == nil && len(out) > 0 { 102 log.WithField("prog", prog).WithField("args", args).Warn(out) 103 } else if err != nil { 104 return fmt.Errorf("error running %q with args %q: %w", prog, args, err) 105 } 106 } 107 return nil 108 } 109 110 // configureHealthInterface is meant to be run inside the health service netns 111 func configureHealthInterface(ifName string, ip4Addr, ip6Addr *net.IPNet) error { 112 link, err := netlink.LinkByName(ifName) 113 if err != nil { 114 return err 115 } 116 117 if ip6Addr == nil { 118 // Use the direct sysctl without reconciliation of errors since we're in a different 119 // network namespace and thus can't use the normal sysctl API. 120 sysctl := sysctl.NewDirectSysctl(afero.NewOsFs(), option.Config.ProcFs) 121 // Ignore the error; if IPv6 is completely disabled 122 // then it's okay if we can't write the sysctl. 123 _ = sysctl.Enable([]string{"net", "ipv6", "conf", ifName, "disable_ipv6"}) 124 } else { 125 if err = netlink.AddrAdd(link, &netlink.Addr{IPNet: ip6Addr}); err != nil { 126 return err 127 } 128 } 129 130 if ip4Addr != nil { 131 if err = netlink.AddrAdd(link, &netlink.Addr{IPNet: ip4Addr}); err != nil { 132 return err 133 } 134 } 135 136 if err = netlink.LinkSetUp(link); err != nil { 137 return err 138 } 139 140 lo, err := netlink.LinkByName("lo") 141 if err != nil { 142 return err 143 } 144 145 if err = netlink.LinkSetUp(lo); err != nil { 146 return err 147 } 148 149 return nil 150 } 151 152 // Client wraps a client to a specific cilium-health endpoint instance, to 153 // provide convenience methods such as PingEndpoint(). 154 type Client struct { 155 host string 156 } 157 158 // PingEndpoint attempts to make an API ping request to the local cilium-health 159 // endpoint, and returns whether this was successful. 160 func (c *Client) PingEndpoint() error { 161 return probe.GetHello(c.host) 162 } 163 164 // KillEndpoint attempts to kill any existing cilium-health endpoint if it 165 // exists. 166 // 167 // This is intended to be invoked in multiple situations: 168 // - The health endpoint has never been run before 169 // - The health endpoint was run during a previous run of the Cilium agent 170 // - The health endpoint crashed during the current run of the Cilium agent 171 // and needs to be cleaned up before it is restarted. 172 func KillEndpoint() { 173 path := filepath.Join(option.Config.StateDir, PidfilePath) 174 scopedLog := log.WithField(logfields.PIDFile, path) 175 scopedLog.Debug("Killing old health endpoint process") 176 pid, err := pidfile.Kill(path) 177 if err != nil { 178 scopedLog.WithError(err).Warning("Failed to kill cilium-health-responder") 179 } else if pid != 0 { 180 scopedLog.WithField(logfields.PID, pid).Debug("Killed endpoint process") 181 } 182 } 183 184 // CleanupEndpoint cleans up remaining resources associated with the health 185 // endpoint. 186 // 187 // This is expected to be called after the process is killed and the endpoint 188 // is removed from the endpointmanager. 189 func CleanupEndpoint() { 190 // Removes the interfaces used for the endpoint process. 191 // 192 // Explicit removal is performed to ensure that everything referencing the network namespace 193 // the endpoint process is executed under is disposed, so that the network namespace itself is properly disposed. 194 switch option.Config.DatapathMode { 195 case datapathOption.DatapathModeVeth, datapathOption.DatapathModeNetkit, datapathOption.DatapathModeNetkitL2: 196 for _, iface := range []string{legacyHealthName, healthName} { 197 scopedLog := log.WithField(logfields.Interface, iface) 198 if link, err := netlink.LinkByName(iface); err == nil { 199 err = netlink.LinkDel(link) 200 if err != nil { 201 scopedLog.WithError(err).Infof("Couldn't delete cilium-health %s device", 202 option.Config.DatapathMode) 203 } 204 } else { 205 scopedLog.WithError(err).Debug("Didn't find existing device") 206 } 207 } 208 } 209 } 210 211 // EndpointAdder is any type which adds an endpoint to be managed by Cilium. 212 type EndpointAdder interface { 213 AddEndpoint(owner regeneration.Owner, ep *endpoint.Endpoint) error 214 } 215 216 // LaunchAsEndpoint launches the cilium-health agent in a nested network 217 // namespace and attaches it to Cilium the same way as any other endpoint, but 218 // with special reserved labels. 219 // 220 // CleanupEndpoint() must be called before calling LaunchAsEndpoint() to ensure 221 // cleanup of prior cilium-health endpoint instances. 222 func LaunchAsEndpoint(baseCtx context.Context, 223 owner regeneration.Owner, 224 policyGetter policyRepoGetter, 225 ipcache *ipcache.IPCache, 226 mtuConfig mtu.MTU, 227 bigTCPConfig *bigtcp.Configuration, 228 epMgr EndpointAdder, 229 allocator cache.IdentityAllocator, 230 routingConfig routingConfigurer, 231 sysctl sysctl.Sysctl, 232 ) (*Client, error) { 233 234 var ( 235 cmd = launcher.Launcher{} 236 info = &models.EndpointChangeRequest{ 237 ContainerName: ciliumHealth, 238 State: models.EndpointStateWaitingDashForDashIdentity.Pointer(), 239 Addressing: &models.AddressPair{}, 240 } 241 healthIP net.IP 242 ip4Address, ip6Address *net.IPNet 243 ) 244 245 if healthIPv6 := node.GetEndpointHealthIPv6(); healthIPv6 != nil { 246 info.Addressing.IPV6 = healthIPv6.String() 247 info.Addressing.IPV6PoolName = ipam.PoolDefault().String() 248 ip6Address = &net.IPNet{IP: healthIPv6, Mask: defaults.ContainerIPv6Mask} 249 healthIP = healthIPv6 250 } 251 if healthIPv4 := node.GetEndpointHealthIPv4(); healthIPv4 != nil { 252 info.Addressing.IPV4 = healthIPv4.String() 253 info.Addressing.IPV4PoolName = ipam.PoolDefault().String() 254 ip4Address = &net.IPNet{IP: healthIPv4, Mask: defaults.ContainerIPv4Mask} 255 healthIP = healthIPv4 256 } 257 258 if option.Config.EnableEndpointRoutes { 259 disabled := false 260 dpConfig := &models.EndpointDatapathConfiguration{ 261 InstallEndpointRoute: true, 262 RequireEgressProg: true, 263 RequireRouting: &disabled, 264 } 265 info.DatapathConfiguration = dpConfig 266 } 267 268 ns, err := netns.New() 269 if err != nil { 270 return nil, fmt.Errorf("create cilium-health netns: %w", err) 271 } 272 273 switch option.Config.DatapathMode { 274 case datapathOption.DatapathModeVeth: 275 _, epLink, err := connector.SetupVethWithNames(healthName, epIfaceName, mtuConfig.GetDeviceMTU(), 276 bigTCPConfig.GetGROIPv6MaxSize(), bigTCPConfig.GetGSOIPv6MaxSize(), 277 bigTCPConfig.GetGROIPv4MaxSize(), bigTCPConfig.GetGSOIPv4MaxSize(), 278 info, sysctl) 279 if err != nil { 280 return nil, fmt.Errorf("Error while creating veth: %w", err) 281 } 282 if err = netlink.LinkSetNsFd(epLink, int(ns.FD())); err != nil { 283 return nil, fmt.Errorf("failed to move device %q to health namespace: %w", epIfaceName, err) 284 } 285 case datapathOption.DatapathModeNetkit, datapathOption.DatapathModeNetkitL2: 286 l2Mode := option.Config.DatapathMode == datapathOption.DatapathModeNetkitL2 287 _, epLink, err := connector.SetupNetkitWithNames(healthName, epIfaceName, mtuConfig.GetDeviceMTU(), 288 bigTCPConfig.GetGROIPv6MaxSize(), bigTCPConfig.GetGSOIPv6MaxSize(), 289 bigTCPConfig.GetGROIPv4MaxSize(), bigTCPConfig.GetGSOIPv4MaxSize(), l2Mode, 290 info, sysctl) 291 if err != nil { 292 return nil, fmt.Errorf("Error while creating netkit: %w", err) 293 } 294 if err = netlink.LinkSetNsFd(epLink, int(ns.FD())); err != nil { 295 return nil, fmt.Errorf("failed to move device %q to health namespace: %w", epIfaceName, err) 296 } 297 } 298 299 if err := ns.Do(func() error { 300 return configureHealthInterface(epIfaceName, ip4Address, ip6Address) 301 }); err != nil { 302 return nil, fmt.Errorf("failed configure health interface %q: %w", epIfaceName, err) 303 } 304 305 pidfile := filepath.Join(option.Config.StateDir, PidfilePath) 306 args := []string{"--listen", strconv.Itoa(option.Config.ClusterHealthPort), "--pidfile", pidfile} 307 cmd.SetTarget(binaryName) 308 cmd.SetArgs(args) 309 log.Debugf("Spawning health endpoint with command %q %q", binaryName, args) 310 311 // Run the health binary inside a netnamespace. Since `Do()` implicitly does 312 // `runtime.LockOSThread` the exec'd binary is guaranteed to inherit the 313 // correct netnamespace. 314 if err := ns.Do(func() error { 315 return cmd.Run() 316 }); err != nil { 317 return nil, err 318 } 319 320 // Create the endpoint 321 ep, err := endpoint.NewEndpointFromChangeModel(baseCtx, owner, policyGetter, ipcache, nil, allocator, info) 322 if err != nil { 323 return nil, fmt.Errorf("Error while creating endpoint model: %w", err) 324 } 325 326 // Wait until the cilium-health endpoint is running before setting up routes 327 deadline := time.Now().Add(1 * time.Minute) 328 for { 329 if _, err := os.Stat(pidfile); err == nil { 330 log.WithField("pidfile", pidfile).Debug("cilium-health agent running") 331 break 332 } else if time.Now().After(deadline) { 333 return nil, fmt.Errorf("Endpoint failed to run: %w", err) 334 } else { 335 time.Sleep(1 * time.Second) 336 } 337 } 338 339 // Set up the endpoint routes. 340 routes, err := getHealthRoutes(node.GetNodeAddressing(), mtuConfig) 341 if err != nil { 342 return nil, fmt.Errorf("Error while getting routes for containername %q: %w", info.ContainerName, err) 343 } 344 345 err = ns.Do(func() error { 346 return configureHealthRouting(routes, epIfaceName) 347 }) 348 if err != nil { 349 return nil, fmt.Errorf("Error while configuring routes: %w", err) 350 } 351 352 if option.Config.IPAM == ipamOption.IPAMENI || option.Config.IPAM == ipamOption.IPAMAlibabaCloud { 353 // ENI mode does not support IPv6. 354 if err := routingConfig.Configure( 355 healthIP, 356 mtuConfig.GetDeviceMTU(), 357 option.Config.EgressMultiHomeIPRuleCompat, 358 false, 359 ); err != nil { 360 361 return nil, fmt.Errorf("Error while configuring health endpoint rules and routes: %w", err) 362 } 363 } 364 365 if err := epMgr.AddEndpoint(owner, ep); err != nil { 366 return nil, fmt.Errorf("Error while adding endpoint: %w", err) 367 } 368 369 // Give the endpoint a security identity 370 ctx, cancel := context.WithTimeout(baseCtx, LaunchTime) 371 defer cancel() 372 ep.UpdateLabels(ctx, labels.LabelSourceAny, labels.LabelHealth, nil, true) 373 374 // Initialize the health client to talk to this instance. 375 client := &Client{host: "http://" + net.JoinHostPort(healthIP.String(), strconv.Itoa(option.Config.ClusterHealthPort))} 376 metrics.SubprocessStart.WithLabelValues(ciliumHealth).Inc() 377 378 return client, nil 379 } 380 381 type policyRepoGetter interface { 382 GetPolicyRepository() *policy.Repository 383 } 384 385 type routingConfigurer interface { 386 Configure(ip net.IP, mtu int, compat bool, host bool) error 387 }