github.com/zhyoulun/cilium@v1.6.12/daemon/daemon.go

// Copyright 2016-2019 Authors of Cilium
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
	"context"
	"fmt"
	"net"
	"os"
	"runtime"
	"sync"
	"time"

	"github.com/cilium/cilium/api/v1/models"
	health "github.com/cilium/cilium/cilium-health/launch"
	"github.com/cilium/cilium/pkg/bpf"
	"github.com/cilium/cilium/pkg/clustermesh"
	"github.com/cilium/cilium/pkg/completion"
	"github.com/cilium/cilium/pkg/controller"
	"github.com/cilium/cilium/pkg/counter"
	"github.com/cilium/cilium/pkg/datapath"
	bpfIPCache "github.com/cilium/cilium/pkg/datapath/ipcache"
	"github.com/cilium/cilium/pkg/datapath/linux/ipsec"
	"github.com/cilium/cilium/pkg/datapath/loader"
	"github.com/cilium/cilium/pkg/datapath/prefilter"
	"github.com/cilium/cilium/pkg/debug"
	"github.com/cilium/cilium/pkg/defaults"
	"github.com/cilium/cilium/pkg/endpoint/connector"
	"github.com/cilium/cilium/pkg/endpoint/regeneration"
	"github.com/cilium/cilium/pkg/endpointmanager"
	"github.com/cilium/cilium/pkg/fqdn"
	"github.com/cilium/cilium/pkg/identity"
	"github.com/cilium/cilium/pkg/identity/cache"
	"github.com/cilium/cilium/pkg/identity/identitymanager"
	"github.com/cilium/cilium/pkg/ipam"
	"github.com/cilium/cilium/pkg/ipcache"
	"github.com/cilium/cilium/pkg/k8s"
	"github.com/cilium/cilium/pkg/loadbalancer"
	"github.com/cilium/cilium/pkg/lock"
	"github.com/cilium/cilium/pkg/logging"
	"github.com/cilium/cilium/pkg/logging/logfields"
	"github.com/cilium/cilium/pkg/maps/ctmap"
	"github.com/cilium/cilium/pkg/maps/eppolicymap"
	ipcachemap "github.com/cilium/cilium/pkg/maps/ipcache"
	"github.com/cilium/cilium/pkg/maps/lbmap"
	"github.com/cilium/cilium/pkg/maps/lxcmap"
	"github.com/cilium/cilium/pkg/maps/metricsmap"
	"github.com/cilium/cilium/pkg/maps/policymap"
	"github.com/cilium/cilium/pkg/maps/sockmap"
	"github.com/cilium/cilium/pkg/maps/tunnel"
	monitoragent "github.com/cilium/cilium/pkg/monitor/agent"
	monitorAPI "github.com/cilium/cilium/pkg/monitor/api"
	"github.com/cilium/cilium/pkg/mtu"
	"github.com/cilium/cilium/pkg/node"
	nodemanager "github.com/cilium/cilium/pkg/node/manager"
	nodeStore "github.com/cilium/cilium/pkg/node/store"
	"github.com/cilium/cilium/pkg/nodediscovery"
	"github.com/cilium/cilium/pkg/option"
	"github.com/cilium/cilium/pkg/policy"
	policyApi "github.com/cilium/cilium/pkg/policy/api"
	"github.com/cilium/cilium/pkg/proxy"
	"github.com/cilium/cilium/pkg/proxy/logger"
	"github.com/cilium/cilium/pkg/revert"
	"github.com/cilium/cilium/pkg/sockops"
	"github.com/cilium/cilium/pkg/source"
	"github.com/cilium/cilium/pkg/status"
	"github.com/cilium/cilium/pkg/trigger"
	"github.com/cilium/cilium/pkg/workloads"
	cnitypes "github.com/cilium/cilium/plugins/cilium-cni/types"

	"github.com/sirupsen/logrus"
	"github.com/vishvananda/netlink"
	"golang.org/x/sync/semaphore"
)

const (
	// AutoCIDR indicates that a CIDR should be allocated
	AutoCIDR = "auto"
)

const (
	initArgLib int = iota
	initArgRundir
	initArgIPv4NodeIP
	initArgIPv6NodeIP
	initArgMode
	initArgDevice
	initArgDevicePreFilter
	initArgModePreFilter
	initArgMTU
	initArgIPSec
	initArgMasquerade
	initArgEncryptInterface
	initArgHostReachableServices
	initArgHostReachableServicesUDP
	initArgCgroupRoot
	initArgBpffsRoot
	initArgNodePort
	initArgMax
)

// Daemon is the cilium daemon that is in charge of performing all necessary
// plumbing and monitoring when a LXC starts.
type Daemon struct {
	buildEndpointSem *semaphore.Weighted
	l7Proxy          *proxy.Proxy
	loadBalancer     *loadbalancer.LoadBalancer
	policy           *policy.Repository
	preFilter        *prefilter.PreFilter
	// Only used for CRI-O since it does not support events.
	workloadsEventsCh chan<- *workloads.EventMessage

	statusCollectMutex lock.RWMutex
	statusResponse     models.StatusResponse
	statusCollector    *status.Collector

	uniqueIDMU lock.Mutex
	uniqueID   map[uint64]context.CancelFunc

	monitorAgent *monitoragent.Agent
	ciliumHealth *health.CiliumHealth

	// dnsNameManager tracks which api.FQDNSelector are present in policy which
	// apply to locally running endpoints.
	dnsNameManager *fqdn.NameManager

	// dnsPoller polls DNS names and sends them to dnsNameManager
	dnsPoller *fqdn.DNSPoller

	// k8sAPIGroups is the set of k8s API groups in use. They are set up in
	// EnableK8sWatcher, and may be disabled while the agent runs.
	// This is on this object, instead of a global, because EnableK8sWatcher is
	// on Daemon.
	k8sAPIGroups k8sAPIGroupsUsed

	// Used to synchronize generation of daemon's BPF programs and endpoint BPF
	// programs.
	compilationMutex *lock.RWMutex

	// prefixLengths tracks a mapping from CIDR prefix length to the count
	// of rules that refer to that prefix length.
	prefixLengths *counter.PrefixLengthCounter

	clustermesh *clustermesh.ClusterMesh

	// k8sResourceSyncedMu protects the k8sResourceSynced map.
	k8sResourceSyncedMu lock.RWMutex

	// k8sResourceSynced maps a resource name to a channel. Once the given
	// resource name is synchronized with k8s, the channel for which that
	// resource name maps to is closed.
	k8sResourceSynced map[string]chan struct{}

	// k8sSvcCache is a cache of all Kubernetes services and endpoints
	k8sSvcCache k8s.ServiceCache

	mtuConfig     mtu.Configuration
	policyTrigger *trigger.Trigger

	// datapath is the underlying datapath implementation to use to
	// implement all aspects of an agent
	datapath datapath.Datapath

	// nodeDiscovery defines the node discovery logic of the agent
	nodeDiscovery *nodediscovery.NodeDiscovery

	// ipam is the IP address manager of the agent
	ipam *ipam.IPAM

	netConf *cnitypes.NetConf

	// iptablesManager deals with all iptables rules installed in the node
	iptablesManager rulesManager
}

// Datapath returns a reference to the datapath implementation.
func (d *Daemon) Datapath() datapath.Datapath {
	return d.datapath
}
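
// The following helper is an illustrative sketch, not part of upstream Cilium:
// it shows one way a caller could drive the return values of
// UpdateProxyRedirect below. The applyRedirect name, and the commit/undo
// discipline described in the comments (call the finalize function once the
// regeneration has committed, or the revert function if a later step fails),
// are assumptions made for this example only.
func (d *Daemon) applyRedirect(e regeneration.EndpointUpdater, l4 *policy.L4Filter, wg *completion.WaitGroup) (revert.FinalizeFunc, revert.RevertFunc, error) {
	port, err, finalize, revertFunc := d.UpdateProxyRedirect(e, l4, wg)
	if err != nil {
		return nil, nil, err
	}
	log.Debugf("proxy redirect allocated on port %d", port)
	// The caller is expected to keep both functions and invoke exactly one of
	// them once the outcome of the surrounding regeneration is known.
	return finalize, revertFunc, nil
}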

// UpdateProxyRedirect updates the redirect rules in the proxy for a particular
// endpoint using the provided L4 filter. Returns the allocated proxy port.
func (d *Daemon) UpdateProxyRedirect(e regeneration.EndpointUpdater, l4 *policy.L4Filter, proxyWaitGroup *completion.WaitGroup) (uint16, error, revert.FinalizeFunc, revert.RevertFunc) {
	if d.l7Proxy == nil {
		return 0, fmt.Errorf("can't redirect, proxy disabled"), nil, nil
	}

	port, err, finalizeFunc, revertFunc := d.l7Proxy.CreateOrUpdateRedirect(l4, e.ProxyID(l4), e, proxyWaitGroup)
	if err != nil {
		return 0, err, nil, nil
	}

	return port, nil, finalizeFunc, revertFunc
}

// RemoveProxyRedirect removes a previously installed proxy redirect for an
// endpoint
func (d *Daemon) RemoveProxyRedirect(e regeneration.EndpointInfoSource, id string, proxyWaitGroup *completion.WaitGroup) (error, revert.FinalizeFunc, revert.RevertFunc) {
	if d.l7Proxy == nil {
		return nil, nil, nil
	}

	log.WithFields(logrus.Fields{
		logfields.EndpointID: e.GetID(),
		logfields.L4PolicyID: id,
	}).Debug("Removing redirect to endpoint")
	return d.l7Proxy.RemoveRedirect(id, proxyWaitGroup)
}

// UpdateNetworkPolicy adds or updates a network policy in the set
// published to L7 proxies.
func (d *Daemon) UpdateNetworkPolicy(e regeneration.EndpointUpdater, policy *policy.L4Policy,
	proxyWaitGroup *completion.WaitGroup) (error, revert.RevertFunc) {
	if d.l7Proxy == nil {
		return fmt.Errorf("can't update network policy, proxy disabled"), nil
	}
	err, revertFunc := d.l7Proxy.UpdateNetworkPolicy(e, policy, e.GetIngressPolicyEnabledLocked(),
		e.GetEgressPolicyEnabledLocked(), proxyWaitGroup)
	return err, revert.RevertFunc(revertFunc)
}

// RemoveNetworkPolicy removes a network policy from the set published to
// L7 proxies.
func (d *Daemon) RemoveNetworkPolicy(e regeneration.EndpointInfoSource) {
	if d.l7Proxy == nil {
		return
	}
	d.l7Proxy.RemoveNetworkPolicy(e)
}

// QueueEndpointBuild waits for a "build permit" for the endpoint
// identified by 'epID'. This function blocks until the endpoint can
// start building. The returned function must then be called to
// release the "build permit" when the most resource intensive parts
// of the build are done. The returned function is idempotent, so it
// may be called more than once. Returns a nil function if the caller should NOT
// start building the endpoint. This may happen due to a build being
// queued for the endpoint already, or due to the wait for the build
// permit being canceled. The latter case happens when the endpoint is
// being deleted. Returns an error if the build permit could not be acquired.
func (d *Daemon) QueueEndpointBuild(ctx context.Context, epID uint64) (func(), error) {
	d.uniqueIDMU.Lock()
	// Skip new build requests if the endpoint is already in the queue
	// waiting. In this case the queued build will pick up any changes
	// made so far, so there is no need to queue another build now.
	if _, queued := d.uniqueID[epID]; queued {
		d.uniqueIDMU.Unlock()
		return nil, nil
	}
	// Store a cancel function to the 'uniqueID' map so that we can
	// cancel the wait when the endpoint is being deleted.
	uniqueIDCtx, cancel := context.WithCancel(ctx)
	d.uniqueID[epID] = cancel
	d.uniqueIDMU.Unlock()

	// Acquire build permit. This may block.
	err := d.buildEndpointSem.Acquire(uniqueIDCtx, 1)

	// Not queueing any more, so remove the cancel func from 'uniqueID' map.
	// The caller may still cancel the build by calling the cancel func after we
	// return it. After this point another build may be queued for this
	// endpoint.
	d.uniqueIDMU.Lock()
	delete(d.uniqueID, epID)
	d.uniqueIDMU.Unlock()

	if err != nil {
		return nil, err // Acquire failed
	}

	// Acquire succeeded, but the context was canceled after?
	if uniqueIDCtx.Err() != nil {
		d.buildEndpointSem.Release(1)
		return nil, uniqueIDCtx.Err()
	}

	// At this point the build permit has been acquired. It must
	// be released by the caller by calling the returned function
	// when the heavy lifting of the build is done.
	// Using sync.Once to make the returned function idempotent.
	var once sync.Once
	doneFunc := func() {
		once.Do(func() {
			d.buildEndpointSem.Release(1)
		})
	}
	return doneFunc, nil
}

// RemoveFromEndpointQueue removes the endpoint from the "build permit" queue,
// canceling the wait for the build permit if still waiting.
func (d *Daemon) RemoveFromEndpointQueue(epID uint64) {
	d.uniqueIDMU.Lock()
	if cancel, queued := d.uniqueID[epID]; queued && cancel != nil {
		delete(d.uniqueID, epID)
		cancel()
	}
	d.uniqueIDMU.Unlock()
}

// GetPolicyRepository returns the policy repository of the daemon
func (d *Daemon) GetPolicyRepository() *policy.Repository {
	return d.policy
}

// DebugEnabled returns if debug mode is enabled.
func (d *Daemon) DebugEnabled() bool {
	return option.Config.Opts.IsEnabled(option.Debug)
}

// GetCIDRPrefixLengths returns the sorted list of unique prefix lengths used
// by CIDR policies.
func (d *Daemon) GetCIDRPrefixLengths() (s6, s4 []int) {
	return d.prefixLengths.ToBPFData()
}

// GetOptions returns the datapath configuration options of the daemon.
func (d *Daemon) GetOptions() *option.IntOptions {
	return option.Config.Opts
}

func (d *Daemon) setHostAddresses() error {
	l, err := netlink.LinkByName(option.Config.LBInterface)
	if err != nil {
		return fmt.Errorf("unable to get network device %s: %s", option.Config.LBInterface, err)
	}

	getAddr := func(netLinkFamily int) (net.IP, error) {
		addrs, err := netlink.AddrList(l, netLinkFamily)
		if err != nil {
			return nil, fmt.Errorf("error while getting %s's addresses: %s", option.Config.LBInterface, err)
		}
		for _, possibleAddr := range addrs {
			if netlink.Scope(possibleAddr.Scope) == netlink.SCOPE_UNIVERSE {
				return possibleAddr.IP, nil
			}
		}
		return nil, nil
	}

	if option.Config.EnableIPv4 {
		hostV4Addr, err := getAddr(netlink.FAMILY_V4)
		if err != nil {
			return err
		}
		if hostV4Addr != nil {
			option.Config.HostV4Addr = hostV4Addr
			log.Infof("Using IPv4 host address: %s", option.Config.HostV4Addr)
		}
	}

	if option.Config.EnableIPv6 {
		hostV6Addr, err := getAddr(netlink.FAMILY_V6)
		if err != nil {
			return err
		}
		if hostV6Addr != nil {
			option.Config.HostV6Addr = hostV6Addr
			log.Infof("Using IPv6 host address: %s", option.Config.HostV6Addr)
		}
	}
	return nil
}

// GetCompilationLock returns the mutex responsible for synchronizing compilation
// of BPF programs.
func (d *Daemon) GetCompilationLock() *lock.RWMutex {
	return d.compilationMutex
}
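
// The following helper is an illustrative sketch, not part of upstream Cilium:
// it spells out the "build permit" protocol documented on QueueEndpointBuild
// above. The buildEndpointWithPermit name and the regenerate callback are
// hypothetical stand-ins for the real endpoint build logic.
func (d *Daemon) buildEndpointWithPermit(ctx context.Context, epID uint64, regenerate func() error) error {
	doneFunc, err := d.QueueEndpointBuild(ctx, epID)
	if err != nil {
		return err // permit could not be acquired (e.g. the wait was canceled)
	}
	if doneFunc == nil {
		return nil // a build is already queued; it will pick up our changes
	}
	// doneFunc is idempotent, so deferring it is always safe.
	defer doneFunc()
	return regenerate()
}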

// initMaps opens all BPF maps (and creates them if they do not exist). This
// must be done *before* any operations which read BPF maps, especially
// restoring endpoints and services.
func (d *Daemon) initMaps() error {
	if option.Config.DryMode {
		return nil
	}

	// Delete old proxymaps if left over from an upgrade.
	// TODO: Remove this code when Cilium 1.6 is the oldest supported release
	for _, name := range []string{"cilium_proxy4", "cilium_proxy6"} {
		path := bpf.MapPath(name)
		if _, err := os.Stat(path); err == nil {
			if err = os.RemoveAll(path); err == nil {
				log.Infof("removed legacy proxymap file %s", path)
			}
		}
	}

	if _, err := lxcmap.LXCMap.OpenOrCreate(); err != nil {
		return err
	}

	// The ipcache is shared between endpoints. Parallel mode needs to be
	// used to allow existing endpoints that have not been regenerated yet
	// to continue using the existing ipcache until the endpoint is
	// regenerated for the first time. Existing endpoints are using a
	// policy map which is potentially out of sync as local identities are
	// re-allocated on startup. Parallel mode allows to continue using the
	// old version until regeneration. Note that the old version is not
	// updated with new identities. This is fine as any new identity
	// appearing would require a regeneration of the endpoint anyway in
	// order for the endpoint to gain the privilege of communication.
	if _, err := ipcachemap.IPCache.OpenParallel(); err != nil {
		return err
	}

	if _, err := metricsmap.Metrics.OpenOrCreate(); err != nil {
		return err
	}

	if _, err := tunnel.TunnelMap.OpenOrCreate(); err != nil {
		return err
	}

	if err := openServiceMaps(); err != nil {
		log.WithError(err).Fatal("Unable to open service maps")
	}

	// Set up the list of IPCache listeners in the daemon, to be
	// used by syncEndpointsAndHostIPs()
	// xDS cache will be added later by calling AddListener(), but only if necessary.
	ipcache.IPIdentityCache.SetListeners([]ipcache.IPIdentityMappingListener{
		bpfIPCache.NewListener(d),
	})

	// Start the controller for periodic sync of the metrics map with
	// the prometheus server.
	controller.NewManager().UpdateController("metricsmap-bpf-prom-sync",
		controller.ControllerParams{
			DoFunc:      metricsmap.SyncMetricsMap,
			RunInterval: 5 * time.Second,
		})

	// Clean all lb entries
	if !option.Config.RestoreState {
		log.Debug("cleaning up all BPF LB maps")

		d.loadBalancer.BPFMapMU.Lock()
		defer d.loadBalancer.BPFMapMU.Unlock()

		if option.Config.EnableIPv6 {
			if err := lbmap.Service6MapV2.DeleteAll(); err != nil {
				return err
			}
			if err := lbmap.RRSeq6MapV2.DeleteAll(); err != nil {
				return err
			}
			if err := lbmap.Backend6Map.DeleteAll(); err != nil {
				return err
			}
		}
		if err := d.RevNATDeleteAll(); err != nil {
			return err
		}

		if option.Config.EnableIPv4 {
			if err := lbmap.Service4MapV2.DeleteAll(); err != nil {
				return err
			}
			if err := lbmap.RRSeq4MapV2.DeleteAll(); err != nil {
				return err
			}
			if err := lbmap.Backend4Map.DeleteAll(); err != nil {
				return err
			}
		}

		// If we are not restoring state, all endpoints can be
		// deleted. Entries will be re-populated.
		lxcmap.LXCMap.DeleteAll()
	}

	return nil
}

func (d *Daemon) init() error {
	globalsDir := option.Config.GetGlobalsDir()
	if err := os.MkdirAll(globalsDir, defaults.RuntimePathRights); err != nil {
		log.WithError(err).WithField(logfields.Path, globalsDir).Fatal("Could not create runtime directory")
	}

	if err := os.Chdir(option.Config.StateDir); err != nil {
		log.WithError(err).WithField(logfields.Path, option.Config.StateDir).Fatal("Could not change to runtime directory")
	}

	// Remove any old sockops and re-enable with _new_ programs if flag is set
	sockops.SockmapDisable()
	sockops.SkmsgDisable()

	if !option.Config.DryMode {
		if err := d.createNodeConfigHeaderfile(); err != nil {
			return err
		}

		if option.Config.SockopsEnable {
			disableSockops := func(err error) {
				option.Config.SockopsEnable = false
				log.WithError(err).Warn("Disabled '--sockops-enable' due to missing BPF kernel support")
			}
			eppolicymap.CreateEPPolicyMap()
			if err := sockops.SockmapEnable(); err != nil {
				disableSockops(err)
			} else if err := sockops.SkmsgEnable(); err != nil {
				disableSockops(err)
			} else {
				sockmap.SockmapCreate()
			}
		}

		if err := d.compileBase(); err != nil {
			return err
		}

		if err := d.syncEndpointsAndHostIPs(); err != nil {
			return err
		}

		// Start the controller for periodic sync. The purpose of the
		// controller is to ensure that endpoints and host IPs entries are
		// reinserted to the bpf maps if they are ever removed from them.
		controller.NewManager().UpdateController("sync-endpoints-and-host-ips",
			controller.ControllerParams{
				DoFunc: func(ctx context.Context) error {
					return d.syncEndpointsAndHostIPs()
				},
				RunInterval: time.Minute,
			})
	}

	return nil
}

// syncEndpointsAndHostIPs adds local host entries to the bpf lxcmap, as well
// as to the ipcache, if needed, and also notifies the daemon and network
// policy hosts cache if changes were made.
func (d *Daemon) syncEndpointsAndHostIPs() error {
	specialIdentities := []identity.IPIdentityPair{}

	if option.Config.EnableIPv4 {
		addrs, err := d.datapath.LocalNodeAddressing().IPv4().LocalAddresses()
		if err != nil {
			log.WithError(err).Warning("Unable to list local IPv4 addresses")
		}

		for _, ip := range addrs {
			if option.Config.IsExcludedLocalAddress(ip) {
				continue
			}

			if len(ip) > 0 {
				specialIdentities = append(specialIdentities,
					identity.IPIdentityPair{
						IP: ip,
						ID: identity.ReservedIdentityHost,
					})
			}
		}

		specialIdentities = append(specialIdentities,
			identity.IPIdentityPair{
				IP:   net.IPv4zero,
				Mask: net.CIDRMask(0, net.IPv4len*8),
				ID:   identity.ReservedIdentityWorld,
			})
	}

	if option.Config.EnableIPv6 {
		addrs, err := d.datapath.LocalNodeAddressing().IPv6().LocalAddresses()
		if err != nil {
			log.WithError(err).Warning("Unable to list local IPv6 addresses")
		}

		addrs = append(addrs, node.GetIPv6Router())
		for _, ip := range addrs {
			if option.Config.IsExcludedLocalAddress(ip) {
				continue
			}

			if len(ip) > 0 {
				specialIdentities = append(specialIdentities,
					identity.IPIdentityPair{
						IP: ip,
						ID: identity.ReservedIdentityHost,
					})
			}
		}

		specialIdentities = append(specialIdentities,
			identity.IPIdentityPair{
				IP:   net.IPv6zero,
				Mask: net.CIDRMask(0, net.IPv6len*8),
				ID:   identity.ReservedIdentityWorld,
			})
	}

	existingEndpoints, err := lxcmap.DumpToMap()
	if err != nil {
		return err
	}

	for _, ipIDPair := range specialIdentities {
		hostKey := node.GetIPsecKeyIdentity()
		isHost := ipIDPair.ID == identity.ReservedIdentityHost
		if isHost {
			added, err := lxcmap.SyncHostEntry(ipIDPair.IP)
			if err != nil {
				return fmt.Errorf("Unable to add host entry to endpoint map: %s", err)
			}
			if added {
				log.WithField(logfields.IPAddr, ipIDPair.IP).Debugf("Added local ip to endpoint map")
			}
		}

		delete(existingEndpoints, ipIDPair.IP.String())

		// Upsert will not propagate (reserved:foo->ID) mappings across the cluster,
		// and we specifically don't want to do so.
		ipcache.IPIdentityCache.Upsert(ipIDPair.PrefixString(), nil, hostKey, ipcache.Identity{
			ID:     ipIDPair.ID,
			Source: source.Local,
		})
	}

	for hostIP, info := range existingEndpoints {
		if ip := net.ParseIP(hostIP); info.IsHost() && ip != nil {
			if err := lxcmap.DeleteEntry(ip); err != nil {
				log.WithError(err).WithFields(logrus.Fields{
					logfields.IPAddr: hostIP,
				}).Warn("Unable to delete obsolete host IP from BPF map")
			} else {
				log.Debugf("Removed outdated host ip %s from endpoint map", hostIP)
			}

			ipcache.IPIdentityCache.Delete(hostIP, source.Local)
		}
	}

	return nil
}

func createIPNet(ones, bits int) *net.IPNet {
	return &net.IPNet{
		Mask: net.CIDRMask(ones, bits),
	}
}

// createPrefixLengthCounter wraps around the counter library, providing
// references to prefix lengths that will always be present.
func createPrefixLengthCounter() *counter.PrefixLengthCounter {
	prefixLengths4 := ipcachemap.IPCache.GetMaxPrefixLengths(false)
	prefixLengths6 := ipcachemap.IPCache.GetMaxPrefixLengths(true)
	counter := counter.NewPrefixLengthCounter(prefixLengths6, prefixLengths4)

	// This is a bit ugly, but there's not a great way to define an IPNet
	// without parsing strings, etc.
	defaultPrefixes := []*net.IPNet{
		// IPv4
		createIPNet(0, net.IPv4len*8),             // world
		createIPNet(net.IPv4len*8, net.IPv4len*8), // hosts

		// IPv6
		createIPNet(0, net.IPv6len*8),             // world
		createIPNet(net.IPv6len*8, net.IPv6len*8), // hosts
	}
	_, err := counter.Add(defaultPrefixes)
	if err != nil {
		log.WithError(err).Fatal("Failed to create default prefix lengths")
	}
	return counter
}

type rulesManager interface {
	RemoveRules()
	InstallRules(ifName string) error
	TransientRulesStart(ifName string) error
	TransientRulesEnd(quiet bool)
}

// NewDaemon creates and returns a new Daemon, configured from the global
// option.Config and the given datapath and iptables manager.
func NewDaemon(dp datapath.Datapath, iptablesManager rulesManager) (*Daemon, *endpointRestoreState, error) {
	var (
		err           error
		netConf       *cnitypes.NetConf
		configuredMTU = option.Config.MTU
	)

	bootstrapStats.daemonInit.Start()

	// Validate the daemon-specific global options.
	if err := option.Config.Validate(); err != nil {
		return nil, nil, fmt.Errorf("invalid daemon configuration: %s", err)
	}

	if option.Config.ReadCNIConfiguration != "" {
		netConf, err = cnitypes.ReadNetConf(option.Config.ReadCNIConfiguration)
		if err != nil {
			log.WithError(err).Fatal("Unable to read CNI configuration")
		}

		if netConf.MTU != 0 {
			configuredMTU = netConf.MTU
			log.WithField("mtu", configuredMTU).Info("Overwriting MTU based on CNI configuration")
		}
	}

	ctmap.InitMapInfo(option.Config.CTMapEntriesGlobalTCP, option.Config.CTMapEntriesGlobalAny,
		option.Config.EnableIPv4, option.Config.EnableIPv6,
	)
	policymap.InitMapInfo(option.Config.PolicyMapMaxEntries)

	if !option.Config.DryMode {
		if err := bpf.ConfigureResourceLimits(); err != nil {
			log.WithError(err).Fatal("Unable to set memory resource limits")
		}
	}

	authKeySize, err := setupIPSec()
	if err != nil {
		return nil, nil, fmt.Errorf("unable to setup encryption: %s", err)
	}

	mtuConfig := mtu.NewConfiguration(authKeySize, option.Config.EnableIPSec, option.Config.Tunnel != option.TunnelDisabled, configuredMTU)

	nodeMngr, err := nodemanager.NewManager("all", dp.Node())
	if err != nil {
		return nil, nil, err
	}

	identity.UpdateReservedIdentitiesMetrics()
	// Must be done before calling policy.NewPolicyRepository() below.
	identity.InitWellKnownIdentities()

	d := Daemon{
		loadBalancer:      loadbalancer.NewLoadBalancer(),
		k8sSvcCache:       k8s.NewServiceCache(),
		policy:            policy.NewPolicyRepository(),
		uniqueID:          map[uint64]context.CancelFunc{},
		prefixLengths:     createPrefixLengthCounter(),
		k8sResourceSynced: map[string]chan struct{}{},
		buildEndpointSem:  semaphore.NewWeighted(int64(numWorkerThreads())),
		compilationMutex:  new(lock.RWMutex),
		netConf:           netConf,
		mtuConfig:         mtuConfig,
		datapath:          dp,
		nodeDiscovery:     nodediscovery.NewNodeDiscovery(nodeMngr, mtuConfig),
		iptablesManager:   iptablesManager,
	}
	bootstrapStats.daemonInit.End(true)

	// Open or create BPF maps.
	bootstrapStats.mapsInit.Start()
	err = d.initMaps()
	bootstrapStats.mapsInit.EndError(err)
	if err != nil {
		log.WithError(err).Error("Error while opening/creating BPF maps")
		return nil, nil, err
	}

	// Read the service IDs of existing services from the BPF map and
	// reserve them. This must be done *before* connecting to the
	// Kubernetes apiserver and serving the API, to ensure that service IDs
	// do not change across restarts and that a new service cannot
	// accidentally reuse an existing service ID.
	// Also, create missing v2 services from the corresponding legacy ones.
	if option.Config.RestoreState && !option.Config.DryMode {
		bootstrapStats.restore.Start()
		restoreServices()
		bootstrapStats.restore.End(true)
	}

	t, err := trigger.NewTrigger(trigger.Parameters{
		Name:            "policy_update",
		MetricsObserver: &policyTriggerMetrics{},
		MinInterval:     option.Config.PolicyTriggerInterval,
		TriggerFunc:     d.policyUpdateTrigger,
	})
	if err != nil {
		return nil, nil, err
	}
	d.policyTrigger = t

	debug.RegisterStatusObject("k8s-service-cache", &d.k8sSvcCache)
	debug.RegisterStatusObject("ipam", d.ipam)

	bootstrapStats.k8sInit.Start()
	k8s.Configure(option.Config.K8sAPIServer, option.Config.K8sKubeConfigPath, defaults.K8sClientQPSLimit, defaults.K8sClientBurst)
	bootstrapStats.k8sInit.End(true)
	d.runK8sServiceHandler()
	policyApi.InitEntities(option.Config.ClusterName)

	bootstrapStats.workloadsInit.Start()
	workloads.Init(&d)
	bootstrapStats.workloadsInit.End(true)

	bootstrapStats.cleanup.Start()
	err = d.clearCiliumVeths()
	bootstrapStats.cleanup.EndError(err)
	if err != nil {
		log.WithError(err).Warning("Unable to clean stale endpoint interfaces")
	}

	if k8s.IsEnabled() {
		bootstrapStats.k8sInit.Start()
		if err := k8s.Init(); err != nil {
			log.WithError(err).Fatal("Unable to initialize Kubernetes subsystem")
		}

		if err := k8s.RegisterCRDs(); err != nil {
			log.WithError(err).Fatal("Unable to register CRDs")
		}

		// Kubernetes demands that the localhost can always reach local
		// pods. Therefore unless the AllowLocalhost policy is set to a
		// specific mode, always allow localhost to reach local
		// endpoints.
		if option.Config.AllowLocalhost == option.AllowLocalhostAuto {
			option.Config.AllowLocalhost = option.AllowLocalhostAlways
			log.Info("k8s mode: Allowing localhost to reach local endpoints")
		}

		bootstrapStats.k8sInit.End(true)
	}

	d.bootstrapIPAM()

	if err := d.bootstrapWorkloads(); err != nil {
		return nil, nil, err
	}

	bootstrapStats.restore.Start()
	// Restore endpoints before any IPs are allocated to avoid IP conflicts
	// later on; otherwise, an IP conflict would prevent the affected endpoint
	// from being restored.
	restoredEndpoints, err := d.restoreOldEndpoints(option.Config.StateDir, true)
	if err != nil {
		log.WithError(err).Error("Unable to restore existing endpoints")
	}
	bootstrapStats.restore.End(true)

	if err := d.allocateIPs(); err != nil {
		return nil, nil, err
	}

	// Annotation of the k8s node must happen after discovery of the
	// PodCIDR range and allocation of the health IPs.
	if k8s.IsEnabled() && option.Config.AnnotateK8sNode {
		bootstrapStats.k8sInit.Start()
		log.WithFields(logrus.Fields{
			logfields.V4Prefix:       node.GetIPv4AllocRange(),
			logfields.V6Prefix:       node.GetIPv6NodeRange(),
			logfields.V4HealthIP:     d.nodeDiscovery.LocalNode.IPv4HealthIP,
			logfields.V6HealthIP:     d.nodeDiscovery.LocalNode.IPv6HealthIP,
			logfields.V4CiliumHostIP: node.GetInternalIPv4(),
			logfields.V6CiliumHostIP: node.GetIPv6Router(),
		}).Info("Annotating k8s node")

		err := k8s.Client().AnnotateNode(node.GetName(),
			node.GetIPv4AllocRange(), node.GetIPv6NodeRange(),
			d.nodeDiscovery.LocalNode.IPv4HealthIP, d.nodeDiscovery.LocalNode.IPv6HealthIP,
			node.GetInternalIPv4(), node.GetIPv6Router())
		if err != nil {
			log.WithError(err).Warning("Cannot annotate k8s node with CIDR range")
		}
		bootstrapStats.k8sInit.End(true)
	} else if !option.Config.AnnotateK8sNode {
		log.Debug("Annotate k8s node is disabled.")
	}

	d.nodeDiscovery.StartDiscovery(node.GetName(), &d)

	// This needs to be done after the node addressing has been configured
	// as the node address is required as suffix.
	// Well-known identities have already been initialized above.
	// Ignore the channel returned by this function, as we want the global
	// identity allocator to run asynchronously.
	cache.InitIdentityAllocator(&d, k8s.CiliumClient(), nil)

	d.bootstrapClusterMesh(nodeMngr)

	bootstrapStats.bpfBase.Start()
	err = d.init()
	bootstrapStats.bpfBase.EndError(err)
	if err != nil {
		log.WithError(err).Error("Error while initializing daemon")
		return nil, restoredEndpoints, err
	}

	// We can only start the monitor agent once cilium_event has been set up.
	if option.Config.RunMonitorAgent {
		monitorAgent, err := monitoragent.NewAgent(context.TODO(), defaults.MonitorBufferPages)
		if err != nil {
			return nil, nil, err
		}
		d.monitorAgent = monitorAgent
	}
	if err := loader.RestoreTemplates(option.Config.StateDir); err != nil {
		log.WithError(err).Error("Unable to restore previous BPF templates")
	}

	// Start the watcher for endpoint IP --> identity mappings in the key-value
	// store. This needs to be done *after* init() for the daemon, because in
	// that function we populate the IPCache with the host's IP(s).
	ipcache.InitIPIdentityWatcher()
	identitymanager.Subscribe(d.policy)

	bootstrapStats.proxyStart.Start()
	// FIXME: Make the port range configurable.
	if option.Config.EnableL7Proxy {
		d.l7Proxy = proxy.StartProxySupport(10000, 20000, option.Config.RunDir,
			option.Config.AccessLog, &d, option.Config.AgentLabels, d.datapath)
	} else {
		log.Info("L7 proxies are disabled")
	}
	bootstrapStats.proxyStart.End(true)

	bootstrapStats.fqdn.Start()
	if err := fqdn.ConfigFromResolvConf(); err != nil {
		bootstrapStats.fqdn.EndError(err)
		return nil, nil, err
	}

	err = d.bootstrapFQDN(restoredEndpoints, option.Config.ToFQDNsPreCache)
	if err != nil {
		bootstrapStats.fqdn.EndError(err)
		return nil, restoredEndpoints, err
	}
	bootstrapStats.fqdn.End(true)

	return &d, restoredEndpoints, nil
}

func setupIPSec() (int, error) {
	if !option.Config.EncryptNode {
		ipsec.DeleteIPsecEncryptRoute()
	}

	if !option.Config.EnableIPSec {
		return 0, nil
	}

	authKeySize, spi, err := ipsec.LoadIPSecKeysFile(option.Config.IPSecKeyFile)
	if err != nil {
		return 0, err
	}
	if option.Config.EnableIPv6 {
		if err := ipsec.EnableIPv6Forwarding(); err != nil {
			return 0, err
		}
	}
	node.SetIPsecKeyIdentity(spi)
	return authKeySize, nil
}

func (d *Daemon) bootstrapClusterMesh(nodeMngr *nodemanager.Manager) {
	bootstrapStats.clusterMeshInit.Start()
	if path := option.Config.ClusterMeshConfig; path != "" {
		if option.Config.ClusterID == 0 {
			log.Info("Cluster-ID is not specified, skipping ClusterMesh initialization")
		} else {
			log.WithField("path", path).Info("Initializing ClusterMesh routing")
			clustermesh, err := clustermesh.NewClusterMesh(clustermesh.Configuration{
				Name:            "clustermesh",
				ConfigDirectory: path,
				NodeKeyCreator:  nodeStore.KeyCreator,
				ServiceMerger:   &d.k8sSvcCache,
				NodeManager:     nodeMngr,
			})
			if err != nil {
				log.WithError(err).Fatal("Unable to initialize ClusterMesh")
			}

			d.clustermesh = clustermesh
		}
	}
	bootstrapStats.clusterMeshInit.End(true)
}

func (d *Daemon) bootstrapWorkloads() error {
	if option.Config.WorkloadsEnabled() {
		bootstrapStats.workloadsInit.Start()
		// Workaround to keep using the value of the deprecated dockerEndpoint
		// option if it is set to something other than the default.
		defaultDockerEndpoint := workloads.GetRuntimeDefaultOpt(workloads.Docker, "endpoint")
		if defaultDockerEndpoint != option.Config.DockerEndpoint {
			option.Config.ContainerRuntimeEndpoint[string(workloads.Docker)] = option.Config.DockerEndpoint
			log.Warn(`"docker" flag is deprecated. ` +
				`Please use "--container-runtime-endpoint=docker=` + defaultDockerEndpoint + `" instead`)
		}

		opts := make(map[workloads.WorkloadRuntimeType]map[string]string)
		for rt, ep := range option.Config.ContainerRuntimeEndpoint {
			opts[workloads.WorkloadRuntimeType(rt)] = make(map[string]string)
			opts[workloads.WorkloadRuntimeType(rt)][workloads.EpOpt] = ep
		}
		if opts[workloads.Docker] == nil {
			opts[workloads.Docker] = make(map[string]string)
		}
		opts[workloads.Docker][workloads.DatapathModeOpt] = option.Config.DatapathMode

		// Workloads must be initialized after IPAM has started as it requires
		// to allocate IPs.
		if err := workloads.Setup(d.ipam, option.Config.Workloads, opts); err != nil {
			return fmt.Errorf("unable to setup workload: %s", err)
		}

		log.Infof("Container runtime options set: %s", workloads.GetRuntimeOptions())
		bootstrapStats.workloadsInit.End(true)
	}
	return nil
}

// Close shuts down a daemon
func (d *Daemon) Close() {
	if d.policyTrigger != nil {
		d.policyTrigger.Shutdown()
	}
	d.nodeDiscovery.Close()
}

func (d *Daemon) attachExistingInfraContainers() {
	m, err := workloads.Client().GetAllInfraContainersPID()
	if err != nil {
		log.WithError(err).Error("Unable to get all infra containers PIDs")
		return
	}
	log.Debugf("Containers found %+v", m)
	for containerID, pid := range m {
		epModel, err := connector.DeriveEndpointFrom(option.Config.FlannelMasterDevice, containerID, pid)
		if err != nil {
			log.WithError(err).WithField(logfields.ContainerID, containerID).
				Warning("Unable to derive endpoint from existing infra container")
			continue
		}
		log.Debugf("Adding endpoint %+v", epModel)
		ep, _, err := d.createEndpoint(context.Background(), epModel)
		if err != nil {
			log.WithError(err).WithField(logfields.ContainerID, containerID).
				Warning("Unable to attach existing infra container")
			continue
		}
		log.WithFields(logrus.Fields{
			logfields.ContainerID: epModel.ContainerID,
			logfields.EndpointID:  ep.ID,
		}).Info("Attached BPF program to existing container")
	}
}

// TriggerReloadWithoutCompile causes all BPF programs and maps to be reloaded,
// without recompiling the datapath logic for each endpoint. It first attempts
// to recompile the base programs, and if this fails returns an error. If base
// program load is successful, it subsequently triggers regeneration of all
// endpoints and returns a waitgroup that may be used by the caller to wait for
// all endpoint regeneration to complete.
//
// If an error is returned, then no regeneration was successful. If no error
// is returned, then the base programs were successfully regenerated, but
// endpoints may or may not have successfully regenerated.
func (d *Daemon) TriggerReloadWithoutCompile(reason string) (*sync.WaitGroup, error) {
	log.Debugf("BPF reload triggered from %s", reason)
	if err := d.compileBase(); err != nil {
		return nil, fmt.Errorf("Unable to recompile base programs from %s: %s", reason, err)
	}

	regenRequest := &regeneration.ExternalRegenerationMetadata{
		Reason:            reason,
		RegenerationLevel: regeneration.RegenerateWithDatapathLoad,
	}
	return endpointmanager.RegenerateAllEndpoints(regenRequest), nil
}

func changedOption(key string, value option.OptionSetting, data interface{}) {
	d := data.(*Daemon)
	if key == option.Debug {
		// Set the debug toggle (this can be a no-op)
		logging.ToggleDebugLogs(d.DebugEnabled())
		// Reflect log level change to proxies
		proxy.ChangeLogLevel(logging.GetLevel(logging.DefaultLogger))
	}
	d.policy.BumpRevision() // force policy recalculation
}
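
// The following helper is an illustrative sketch, not part of upstream Cilium:
// it shows how a caller can drive TriggerReloadWithoutCompile above and block
// until all triggered endpoint regenerations have finished. The
// reloadDatapathAndWait name is an assumption made for this example.
func (d *Daemon) reloadDatapathAndWait(reason string) error {
	wg, err := d.TriggerReloadWithoutCompile(reason)
	if err != nil {
		// Base program reload failed; no endpoint regeneration was started.
		return err
	}
	// Wait for all triggered endpoint regenerations to complete. Individual
	// endpoints may still have failed; their status is reported separately.
	wg.Wait()
	return nil
}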

// numWorkerThreads returns the number of worker threads with a minimum of 2.
func numWorkerThreads() int {
	ncpu := runtime.NumCPU()
	minWorkerThreads := 2

	if ncpu < minWorkerThreads {
		return minWorkerThreads
	}
	return ncpu
}

// SendNotification sends an agent notification to the monitor
func (d *Daemon) SendNotification(typ monitorAPI.AgentNotification, text string) error {
	if option.Config.DryMode {
		return nil
	}
	event := monitorAPI.AgentNotify{Type: typ, Text: text}
	return d.monitorAgent.SendEvent(monitorAPI.MessageTypeAgent, event)
}

// NewProxyLogRecord is invoked by the proxy accesslog on each new access log entry
func (d *Daemon) NewProxyLogRecord(l *logger.LogRecord) error {
	return d.monitorAgent.SendEvent(monitorAPI.MessageTypeAccessLog, l.LogRecord)
}

// GetNodeSuffix returns the suffix to be appended to kvstore keys of this
// agent
func (d *Daemon) GetNodeSuffix() string {
	var ip net.IP

	switch {
	case option.Config.EnableIPv4:
		ip = node.GetExternalIPv4()
	case option.Config.EnableIPv6:
		ip = node.GetIPv6()
	}

	if ip == nil {
		log.Fatal("Node IP not available yet")
	}

	return ip.String()
}

// GetNetConf returns the CNI configuration that was used to initiate the
// daemon instance. This may return nil when no configuration is available.
func (d *Daemon) GetNetConf() *cnitypes.NetConf {
	return d.netConf
}

// UpdateCiliumNodeResource implements nodediscovery.Owner to create/update the
// CiliumNode resource
func (d *Daemon) UpdateCiliumNodeResource() {
	d.nodeDiscovery.UpdateCiliumNodeResource(d)
}