github.com/cilium/cilium@v1.16.2/pkg/datapath/linux/devices_controller.go (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright Authors of Cilium
     3  
     4  //go:build linux
     5  
     6  package linux
     7  
     8  import (
     9  	"context"
    10  	"fmt"
    11  	"log/slog"
    12  	"net"
    13  	"net/netip"
    14  	"slices"
    15  	"strings"
    16  
    17  	"github.com/cilium/ebpf"
    18  	"github.com/cilium/ebpf/asm"
    19  	"github.com/cilium/hive/cell"
    20  	"github.com/cilium/statedb"
    21  	"github.com/spf13/pflag"
    22  	"github.com/vishvananda/netlink"
    23  	"github.com/vishvananda/netlink/nl"
    24  	vns "github.com/vishvananda/netns"
    25  	"golang.org/x/sys/unix"
    26  	"k8s.io/apimachinery/pkg/util/sets"
    27  
    28  	"github.com/cilium/cilium/pkg/datapath/linux/probes"
    29  	"github.com/cilium/cilium/pkg/datapath/tables"
    30  	"github.com/cilium/cilium/pkg/defaults"
    31  	"github.com/cilium/cilium/pkg/inctimer"
    32  	"github.com/cilium/cilium/pkg/ip"
    33  	"github.com/cilium/cilium/pkg/logging/logfields"
    34  	"github.com/cilium/cilium/pkg/netns"
    35  	"github.com/cilium/cilium/pkg/option"
    36  	"github.com/cilium/cilium/pkg/time"
    37  )
    38  
// DevicesControllerCell registers a controller that subscribes to network devices,
// addresses and routes via netlink and populates the device and route tables.
var DevicesControllerCell = cell.Module(
	"devices-controller",
	"Synchronizes the device and route tables with the kernel",

	// This controller owns the device and route tables. It provides
	// the Table[*Device] from a constructor here to enforce start
	// ordering and to populate the tables before there are any readers.
	// But these cells are still usable directly in tests to provide
	// the modules under test device and route test data.
	cell.ProvidePrivate(
		tables.NewDeviceTable,
		tables.NewRouteTable,
	),
	cell.Invoke(
		statedb.RegisterTable[*tables.Device],
		statedb.RegisterTable[*tables.Route],
	),

	// newDevicesController hands out the controller itself together with
	// the statedb.Table views of the device and route tables.
	cell.Provide(
		newDevicesController,
		newDeviceManager,
	),
	cell.Config(DevicesConfig{}),

	// Always construct the devices controller. We provide the
	// *devicesController for DeviceManager, but once it has been removed,
	// this can be refactored to just do an invoke to register the
	// controller jobs.
	cell.Invoke(func(*devicesController) {}),
)
    71  
    72  func (c DevicesConfig) Flags(flags *pflag.FlagSet) {
    73  	flags.StringSlice(option.Devices, []string{}, "List of devices facing cluster/external network (used for BPF NodePort, BPF masquerading and host firewall); supports '+' as wildcard in device name, e.g. 'eth+'")
    74  
    75  	flags.Bool(option.ForceDeviceDetection, false, "Forces the auto-detection of devices, even if specific devices are explicitly listed")
    76  }
    77  
var (
	// batchingDuration is the amount of time to wait for more
	// addr/route/link updates before processing the batch. It is the
	// period of the ticker that commits batches in processUpdates().
	batchingDuration = 100 * time.Millisecond

	// restartWaitDuration is the amount of time to wait after
	// a netlink failure before restarting from scratch.
	restartWaitDuration = time.Second

	// Route filter to look at all routing tables. It is passed together
	// with routeFilterMask to RouteListFiltered during initialization.
	routeFilter = netlink.Route{
		Table: unix.RT_TABLE_UNSPEC,
	}
	// routeFilterMask selects the Table field of routeFilter as the
	// filtering criterion.
	routeFilterMask = netlink.RT_FILTER_TABLE
)
    93  
// DevicesConfig is the configuration of the devices controller. It is
// registered with cell.Config and its flags are declared by Flags().
type DevicesConfig struct {
	// Devices is the user-specified devices to use. This can be
	// either concrete devices ("eth0,eth1"), or a wildcard "eth+".
	// If empty the devices are auto-detected according to rules defined
	// by isSelectedDevice().
	Devices []string
	// ForceDeviceDetection forces the auto-detection of devices,
	// even if user-specific devices are explicitly listed.
	ForceDeviceDetection bool
}
   104  
// devicesControllerParams are the dependencies of the devices controller,
// taken as the parameter struct of newDevicesController.
type devicesControllerParams struct {
	cell.In

	// Config carries the user-specified devices and the auto-detection switch.
	Config DevicesConfig
	// Log is the logger used for all controller messages.
	Log *slog.Logger
	// DB is used to open the write transactions against the two tables below.
	DB statedb.Handle
	// DeviceTable and RouteTable are the writable tables the controller populates.
	DeviceTable statedb.RWTable[*tables.Device]
	RouteTable  statedb.RWTable[*tables.Route]

	// netlinkFuncs is optional and used by tests to verify error handling behavior.
	// When nil, Start() fills it in with makeNetlinkFuncs().
	NetlinkFuncs *netlinkFuncs `optional:"true"`
}
   117  
// devicesController keeps the device and route tables in sync with the
// kernel by listing and subscribing to links, addresses and routes.
type devicesController struct {
	params devicesControllerParams
	log    *slog.Logger

	// initialized is closed by initialize() once the tables have been
	// populated for the first time. Start() blocks on it.
	initialized chan struct{}
	// filter is the user-specified device list (DevicesConfig.Devices).
	filter deviceFilter
	// enforceAutoDetection mirrors DevicesConfig.ForceDeviceDetection.
	enforceAutoDetection bool
	// l3DevSupported is set in Start() from a probe for the skb_change_head
	// helper. It stays false when the netlink functions are supplied by tests.
	l3DevSupported bool

	// deadLinkIndexes tracks the set of links that have been deleted. This is needed
	// to avoid processing route or address updates after a link delete as they may
	// arrive out of order due to the use of separate netlink sockets.
	deadLinkIndexes sets.Set[int]

	cancel context.CancelFunc // controller's context is cancelled when stopped.
}
   134  
   135  func newDevicesController(lc cell.Lifecycle, p devicesControllerParams) (*devicesController, statedb.Table[*tables.Device], statedb.Table[*tables.Route]) {
   136  	dc := &devicesController{
   137  		params:               p,
   138  		initialized:          make(chan struct{}),
   139  		filter:               deviceFilter(p.Config.Devices),
   140  		enforceAutoDetection: p.Config.ForceDeviceDetection,
   141  		log:                  p.Log,
   142  		deadLinkIndexes:      sets.New[int](),
   143  	}
   144  	lc.Append(dc)
   145  	return dc, p.DeviceTable, p.RouteTable
   146  }
   147  
   148  func (dc *devicesController) Start(startCtx cell.HookContext) error {
   149  	if dc.params.NetlinkFuncs == nil {
   150  		var err error
   151  		dc.params.NetlinkFuncs, err = makeNetlinkFuncs()
   152  		if err != nil {
   153  			return err
   154  		}
   155  
   156  		// Only probe for L3 device support when netlink isn't mocked by tests.
   157  		dc.l3DevSupported = probes.HaveProgramHelper(ebpf.SchedCLS, asm.FnSkbChangeHead) == nil
   158  	}
   159  
   160  	var ctx context.Context
   161  	ctx, dc.cancel = context.WithCancel(context.Background())
   162  
   163  	go dc.run(ctx)
   164  
   165  	// Wait until the initial population of the tables has finished
   166  	// successfully or the start has been aborted.
   167  	select {
   168  	case <-dc.initialized:
   169  	case <-startCtx.Done():
   170  		dc.cancel()
   171  	}
   172  
   173  	return nil
   174  }
   175  
   176  func (dc *devicesController) run(ctx context.Context) {
   177  	defer dc.params.NetlinkFuncs.Close()
   178  
   179  	// Run the controller in a loop and restarting on failures until stopped.
   180  	// We're doing this as netlink is an unreliable protocol that may drop
   181  	// messages if the socket buffer is filled (recvmsg returns ENOBUFS).
   182  	for ctx.Err() == nil {
   183  		dc.subscribeAndProcess(ctx)
   184  
   185  		t, stop := inctimer.New()
   186  
   187  		select {
   188  		case <-ctx.Done():
   189  			stop()
   190  			return
   191  		case <-t.After(restartWaitDuration):
   192  		}
   193  	}
   194  }
   195  
   196  func (dc *devicesController) subscribeAndProcess(ctx context.Context) {
   197  	// Wrap the controller context to allow cancelling it on failures.
   198  	ctx, cancel := context.WithCancel(ctx)
   199  	defer cancel()
   200  
   201  	// Callback for logging errors from the netlink subscriptions.
   202  	// It cancels the context to unsubscribe from netlink updates
   203  	// which stops the processing.
   204  	errorCallback := func(err error) {
   205  		if ctx.Err() != nil {
   206  			// The netlink unsubscribe can lead to errorCallback being called after
   207  			// context cancellation with a "receive called on closed socket".
   208  			// Thus ignore the error if the context was cancelled.
   209  			return
   210  		}
   211  
   212  		dc.log.Warn("Netlink error received, restarting", logfields.Error, err)
   213  
   214  		// Cancel the context to stop the subscriptions.
   215  		cancel()
   216  	}
   217  
   218  	addrUpdates := make(chan netlink.AddrUpdate)
   219  	if err := dc.params.NetlinkFuncs.AddrSubscribe(addrUpdates, ctx.Done(), errorCallback); err != nil {
   220  		dc.log.Warn("AddrSubscribe failed, restarting", logfields.Error, err)
   221  		return
   222  	}
   223  	routeUpdates := make(chan netlink.RouteUpdate)
   224  	err := dc.params.NetlinkFuncs.RouteSubscribe(routeUpdates, ctx.Done(), errorCallback)
   225  	if err != nil {
   226  		dc.log.Warn("RouteSubscribe failed, restarting", logfields.Error, err)
   227  		return
   228  	}
   229  	linkUpdates := make(chan netlink.LinkUpdate)
   230  	err = dc.params.NetlinkFuncs.LinkSubscribe(linkUpdates, ctx.Done(), errorCallback)
   231  	if err != nil {
   232  		dc.log.Warn("LinkSubscribe failed, restarting", logfields.Error, err)
   233  		return
   234  	}
   235  
   236  	// Initialize the tables by listing links, routes and addresses.
   237  	// Preferably we'd just subscribe to updates with listing enabled, but
   238  	// unfortunately netlink Go library does not mark where the initial list
   239  	// ends and updates begin.
   240  	err = dc.initialize()
   241  	if err != nil {
   242  		dc.log.Warn("Initialization failed, restarting", logfields.Error, err)
   243  		return
   244  	}
   245  
   246  	// Start processing the incremental updates until we're stopping or
   247  	// a failure is encountered.
   248  	dc.processUpdates(addrUpdates, routeUpdates, linkUpdates)
   249  }
   250  
   251  func (dc *devicesController) Stop(cell.HookContext) error {
   252  	dc.cancel()
   253  
   254  	// Unfortunately vishvananda/netlink is buggy and does not return from Recvfrom even
   255  	// though the stop channel given to AddrSubscribeWithOptions or RouteSubscribeWithOptions
   256  	// is closed. This is fixed by https://github.com/vishvananda/netlink/pull/793, which
   257  	// isn't yet merged.
   258  	// Due to this, we're currently not waiting here for run() to exit and thus leaving around
   259  	// couple goroutines until some address or route change arrive.
   260  	return nil
   261  }
   262  
   263  func (dc *devicesController) initialize() error {
   264  	// Do initial listing for each address, routes and links. We cannot use
   265  	// the 'ListExisting' option as it does not provide a mechanism to know when
   266  	// the listing is done and the updates begin. Netlink does send a NLMSG_DONE,
   267  	// but this is not exposed by the library.
   268  	batch := map[int][]any{}
   269  	links, err := dc.params.NetlinkFuncs.LinkList()
   270  	if err != nil {
   271  		return fmt.Errorf("LinkList failed: %w", err)
   272  	}
   273  	for _, link := range links {
   274  		batch[link.Attrs().Index] = append(batch[link.Attrs().Index], netlink.LinkUpdate{
   275  			Header: unix.NlMsghdr{Type: unix.RTM_NEWLINK},
   276  			Link:   link,
   277  		})
   278  	}
   279  	addrs, err := dc.params.NetlinkFuncs.AddrList(nil, netlink.FAMILY_ALL)
   280  	if err != nil {
   281  		return fmt.Errorf("AddrList failed: %w", err)
   282  	}
   283  	for _, addr := range addrs {
   284  		var ipnet net.IPNet
   285  		if addr.IPNet != nil {
   286  			ipnet = *addr.IPNet
   287  		}
   288  		batch[addr.LinkIndex] = append(batch[addr.LinkIndex], netlink.AddrUpdate{
   289  			LinkAddress: ipnet,
   290  			LinkIndex:   addr.LinkIndex,
   291  			Flags:       addr.Flags,
   292  			Scope:       addr.Scope,
   293  			PreferedLft: addr.PreferedLft,
   294  			ValidLft:    addr.ValidLft,
   295  			NewAddr:     true,
   296  		})
   297  	}
   298  	routes, err := dc.params.NetlinkFuncs.RouteListFiltered(netlink.FAMILY_ALL, &routeFilter, routeFilterMask)
   299  	if err != nil {
   300  		return fmt.Errorf("RouteList failed: %w", err)
   301  	}
   302  	for _, route := range routes {
   303  		batch[route.LinkIndex] = append(batch[route.LinkIndex], netlink.RouteUpdate{
   304  			Type:  unix.RTM_NEWROUTE,
   305  			Route: route,
   306  		})
   307  	}
   308  
   309  	txn := dc.params.DB.WriteTxn(dc.params.DeviceTable, dc.params.RouteTable)
   310  
   311  	// Flush existing data from potential prior run.
   312  	dc.params.DeviceTable.DeleteAll(txn)
   313  	dc.params.RouteTable.DeleteAll(txn)
   314  
   315  	// Process the initial batch.
   316  	dc.processBatch(txn, batch)
   317  
   318  	txn.Commit()
   319  
   320  	select {
   321  	case <-dc.initialized:
   322  	default:
   323  		close(dc.initialized)
   324  	}
   325  
   326  	return nil
   327  }
   328  
   329  func (dc *devicesController) deviceNameSet(txn statedb.ReadTxn) sets.Set[string] {
   330  	devs, _ := tables.SelectedDevices(dc.params.DeviceTable, txn)
   331  	return sets.New(tables.DeviceNames(devs)...)
   332  }
   333  
   334  func (dc *devicesController) processUpdates(
   335  	addrUpdates chan netlink.AddrUpdate,
   336  	routeUpdates chan netlink.RouteUpdate,
   337  	linkUpdates chan netlink.LinkUpdate,
   338  ) {
   339  	// Use a ticker to periodically commit the batch of updates to the device and route tables.
   340  	// We do this to reduce the number of write transactions to the state in cases like large
   341  	// routing tables and to reduce churn in other components that observe the devices by
   342  	// avoiding intermediate states (e.g. devices without addresses).
   343  	ticker := time.NewTicker(batchingDuration)
   344  	defer ticker.Stop()
   345  
   346  	batch := map[int][]any{}
   347  	appendUpdate := func(index int, u any) {
   348  		batch[index] = append(batch[index], u)
   349  	}
   350  
   351  	// Gather address, route and link updates into a batch and
   352  	// periodically commit it. We loop until all channels have
   353  	// been closed in order to release the netlink subscriptions
   354  	// when being stopped.
   355  	for addrUpdates != nil || routeUpdates != nil || linkUpdates != nil {
   356  		select {
   357  		case u, ok := <-addrUpdates:
   358  			if !ok {
   359  				addrUpdates = nil
   360  			} else {
   361  				appendUpdate(u.LinkIndex, u)
   362  			}
   363  
   364  		case r, ok := <-routeUpdates:
   365  			if !ok {
   366  				routeUpdates = nil
   367  			} else {
   368  				appendUpdate(r.LinkIndex, r)
   369  			}
   370  
   371  		case l, ok := <-linkUpdates:
   372  			if !ok {
   373  				linkUpdates = nil
   374  			} else {
   375  				appendUpdate(int(l.Index), l)
   376  			}
   377  
   378  		case <-ticker.C:
   379  			if len(batch) > 0 {
   380  				txn := dc.params.DB.WriteTxn(dc.params.DeviceTable, dc.params.RouteTable)
   381  				dc.processBatch(txn, batch)
   382  				txn.Commit()
   383  				batch = map[int][]any{}
   384  			}
   385  		}
   386  	}
   387  }
   388  
   389  func deviceAddressFromAddrUpdate(upd netlink.AddrUpdate) tables.DeviceAddress {
   390  	return tables.DeviceAddress{
   391  		Addr:      ip.MustAddrFromIP(upd.LinkAddress.IP),
   392  		Secondary: upd.Flags&unix.IFA_F_SECONDARY != 0,
   393  
   394  		// ifaddrmsg.ifa_scope is uint8, vishvananda/netlink has wrong type
   395  		Scope: tables.RouteScope(upd.Scope),
   396  	}
   397  }
   398  
   399  func populateFromLink(d *tables.Device, link netlink.Link) {
   400  	a := link.Attrs()
   401  	d.Index = a.Index
   402  	d.MTU = a.MTU
   403  	d.Name = a.Name
   404  	d.HardwareAddr = tables.HardwareAddr(a.HardwareAddr)
   405  	d.Flags = a.Flags
   406  	d.RawFlags = a.RawFlags
   407  	d.MasterIndex = a.MasterIndex
   408  	d.Type = link.Type()
   409  }
   410  
// processBatch processes a batch of address, link and route updates.
// The address and link updates are merged into a device object and upserted
// into the device table. Route updates are applied directly to the route
// table. The batch is keyed by link index and the updates of a link are
// applied in the order they appear in its slice. A change in the set of
// selected device names is logged.
func (dc *devicesController) processBatch(txn statedb.WriteTxn, batch map[int][]any) {
	// Remember the selected devices to detect a change at the end.
	before := dc.deviceNameSet(txn)
	for index, updates := range batch {
		d, _, _ := dc.params.DeviceTable.Get(txn, tables.DeviceIDIndex.Query(index))
		if d == nil {
			// Unseen device. We may receive address updates before link updates
			// and thus the only thing we know at this point is the index.
			d = &tables.Device{}
			d.Index = index
		} else {
			// Work on a copy; the object in the table is not modified in place.
			d = d.DeepCopy()
		}
		// Tracks the outcome of the last link update seen in this batch:
		// true after a delete, reset to false by a subsequent non-delete update.
		deviceDeleted := false

		// Set to true if the device was modified. This is done to avoid unnecessary
		// modifications to the device that would wake up watchers.
		deviceUpdated := false

		for _, u := range updates {
			switch u := u.(type) {
			case netlink.AddrUpdate:
				// Ignore address updates for a link that has been deleted.
				if dc.deadLinkIndexes.Has(u.LinkIndex) {
					continue
				}
				addr := deviceAddressFromAddrUpdate(u)
				i := slices.Index(d.Addrs, addr)
				if u.NewAddr {
					// Add the address unless it is already present.
					if i < 0 {
						d.Addrs = append(d.Addrs, addr)
					}
				} else if i >= 0 {
					// Address removed.
					d.Addrs = slices.Delete(d.Addrs, i, i+1)
				}
				// NOTE(review): this is set even when the address list did not
				// change (duplicate add, or delete of an unknown address).
				deviceUpdated = true
			case netlink.RouteUpdate:
				if dc.deadLinkIndexes.Has(u.LinkIndex) {
					// Ignore route updates for a device that has been removed
					// to avoid processing an out of order route create after
					// link delete (Linux won't send complete set of messages
					// of routes deleted when link is deleted).
					continue
				}
				r := tables.Route{
					Table:     tables.RouteTable(u.Table),
					LinkIndex: index,
					Scope:     uint8(u.Scope),
					Dst:       ipnetToPrefix(u.Family, u.Dst),
				}
				// Src and Gw are left as the zero Addr if the slices are not
				// valid addresses (e.g. unset); the 'ok' results are ignored.
				r.Src, _ = netip.AddrFromSlice(u.Src)
				r.Gw, _ = netip.AddrFromSlice(u.Gw)

				// Route updates of other types are ignored.
				if u.Type == unix.RTM_NEWROUTE {
					_, _, err := dc.params.RouteTable.Insert(txn, &r)
					if err != nil {
						dc.log.Warn("Failed to insert route", logfields.Error, err, "route", r)
					}
				} else if u.Type == unix.RTM_DELROUTE {
					_, _, err := dc.params.RouteTable.Delete(txn, &r)
					if err != nil {
						dc.log.Warn("Failed to delete route", logfields.Error, err, "route", r)
					}
				}
			case netlink.LinkUpdate:
				if u.Header.Type == unix.RTM_DELLINK {
					// Mark for deletion.
					dc.deadLinkIndexes.Insert(d.Index)
					deviceDeleted = true
				} else {
					// The link exists (again): stop ignoring its address and
					// route updates and refresh the link attributes.
					dc.deadLinkIndexes.Delete(d.Index)
					deviceDeleted = false
					populateFromLink(d, u.Link)
				}
				deviceUpdated = true
			}
		}

		// Recheck the viability of the device after the updates have been applied.
		// Since route changes may cause device to be selected (e.g. veth device that
		// has default route), always recheck viability if device is not selected.
		if deviceUpdated || !d.Selected {
			oldSelected := d.Selected
			oldReason := d.NotSelectedReason
			d.Selected, d.NotSelectedReason = dc.isSelectedDevice(d, txn)
			if d.Selected != oldSelected || d.NotSelectedReason != oldReason {
				deviceUpdated = true
			}
		}

		if deviceDeleted {
			// Remove the deleted device.
			dc.params.DeviceTable.Delete(txn, d)

			// Remove all routes for the device. For a deleted device netlink does not
			// send complete set of route delete messages.
			iter := dc.params.RouteTable.List(txn, tables.RouteLinkIndex.Query(d.Index))
			for r, _, ok := iter.Next(); ok; r, _, ok = iter.Next() {
				dc.params.RouteTable.Delete(txn, r)
			}
		} else if deviceUpdated {
			// Create or update the device.
			_, _, err := dc.params.DeviceTable.Insert(txn, d)
			if err != nil {
				dc.log.Warn("Failed to insert device", logfields.Error, err, logfields.Device, d)
			}
		}
	}
	// Log only when the set of selected device names has changed.
	after := dc.deviceNameSet(txn)
	if !before.Equal(after) {
		dc.log.Info("Devices changed", logfields.Devices, after.UnsortedList())
	}
}
   525  
// Interface flag masks consulted by isSelectedDevice() against the raw
// flags of a device.
const (
	// Exclude devices that have one or more of these flags set.
	// The check happens after the user filter has been consulted, so a
	// device matching the user-specified devices is not subject to it.
	excludedIfFlagsMask uint32 = unix.IFF_SLAVE | unix.IFF_LOOPBACK

	// Require these flags to be set.
	requiredIfFlagsMask uint32 = unix.IFF_UP
)
   533  
   534  // isSelectedDevice checks if the device is selected or not. We still maintain its state in
   535  // case it later becomes selected.
   536  func (dc *devicesController) isSelectedDevice(d *tables.Device, txn statedb.WriteTxn) (bool, string) {
   537  	if d.Name == "" {
   538  		// Looks like we have seen the addresses for this device before the initial link update,
   539  		// hence it has no name. Definitely not selected yet!
   540  		return false, "link not seen yet"
   541  	}
   542  
   543  	if len(d.Addrs) == 0 {
   544  		return false, "device has no addresses"
   545  	}
   546  
   547  	// Skip devices that don't have the required flags set.
   548  	if d.RawFlags&requiredIfFlagsMask == 0 {
   549  		return false, fmt.Sprintf("missing required flag (mask=0x%x, flags=0x%x)", requiredIfFlagsMask, d.RawFlags)
   550  	}
   551  
   552  	// If user specified devices or wildcards, then skip the device if it doesn't match.
   553  	// If the device does match and user not requested auto detection, then skip further checks.
   554  	// If the device does match and user requested auto detection, then continue to further checks.
   555  	if dc.filter.nonEmpty() {
   556  		if dc.filter.match(d.Name) {
   557  			return true, ""
   558  		}
   559  		if !dc.enforceAutoDetection {
   560  			return false, fmt.Sprintf("not matching user filter %v", dc.filter)
   561  		}
   562  	}
   563  
   564  	// Skip devices that have an excluded interface flag set.
   565  	if d.RawFlags&excludedIfFlagsMask != 0 {
   566  		return false, fmt.Sprintf("excluded flag set (mask=0x%x, flags=0x%x)", excludedIfFlagsMask, d.RawFlags)
   567  	}
   568  
   569  	// Ignore bridge and bonding slave devices
   570  	if d.MasterIndex != 0 {
   571  		return false, fmt.Sprintf("bridged or bonded to ifindex %d", d.MasterIndex)
   572  	}
   573  
   574  	// Ignore L3 devices if we cannot support them.
   575  	hasMacAddr := len(d.HardwareAddr) != 0
   576  	if !dc.l3DevSupported && !hasMacAddr {
   577  		return false, "L3 device, kernel too old, >= 5.8 required"
   578  	}
   579  
   580  	// Never consider devices with any of the excluded devices.
   581  	for _, p := range defaults.ExcludedDevicePrefixes {
   582  		if strings.HasPrefix(d.Name, p) {
   583  			return false, fmt.Sprintf("excluded prefix %q", p)
   584  		}
   585  	}
   586  
   587  	switch d.Type {
   588  	case "veth":
   589  		// Skip veth devices that don't have a default route (unless user has specified
   590  		// the device manually).
   591  		// This is a workaround for kubernetes-in-docker. We want to avoid
   592  		// veth devices in general as they may be leftovers from another CNI.
   593  		if !dc.filter.nonEmpty() && !tables.HasDefaultRoute(dc.params.RouteTable, txn, d.Index) {
   594  			return false, "veth without default route"
   595  		}
   596  
   597  	case "bridge", "openvswitch":
   598  		// Skip bridge devices as they're very unlikely to be used for K8s
   599  		// purposes. In the rare cases where a user wants to load datapath
   600  		// programs onto them they can override device detection with --devices.
   601  		return false, "bridge-like device, use --devices to override"
   602  	}
   603  
   604  	if !hasGlobalRoute(d.Index, dc.params.RouteTable, txn) {
   605  		return false, "no global unicast routes"
   606  	}
   607  
   608  	return true, ""
   609  }
   610  
   611  func hasGlobalRoute(devIndex int, tbl statedb.Table[*tables.Route], rxn statedb.ReadTxn) bool {
   612  	iter := tbl.List(rxn, tables.RouteLinkIndex.Query(devIndex))
   613  	hasGlobal := false
   614  	for r, _, ok := iter.Next(); ok; r, _, ok = iter.Next() {
   615  		if r.Dst.Addr().IsGlobalUnicast() {
   616  			hasGlobal = true
   617  			break
   618  		}
   619  	}
   620  
   621  	return hasGlobal
   622  }
   623  
   624  // deviceFilter implements filtering device names either by
   625  // concrete name ("eth0") or by iptables-like wildcard ("eth+").
   626  type deviceFilter []string
   627  
   628  // nonEmpty returns true if the filter has been defined
   629  // (i.e. user has specified --devices).
   630  func (lst deviceFilter) nonEmpty() bool {
   631  	return len(lst) > 0
   632  }
   633  
   634  // match checks whether the given device name passes the filter
   635  func (lst deviceFilter) match(dev string) bool {
   636  	if len(lst) == 0 {
   637  		return true
   638  	}
   639  	for _, entry := range lst {
   640  		if strings.HasSuffix(entry, "+") {
   641  			prefix := strings.TrimRight(entry, "+")
   642  			if strings.HasPrefix(dev, prefix) {
   643  				return true
   644  			}
   645  		} else if dev == entry {
   646  			return true
   647  		}
   648  	}
   649  	return false
   650  }
   651  
// netlinkFuncs wraps the netlink subscribe functions into a simpler interface to facilitate
// testing of the error handling paths.
type netlinkFuncs struct {
	// RouteSubscribe, AddrSubscribe and LinkSubscribe start a subscription
	// that delivers updates to 'ch'. They are given a 'done' channel for
	// stopping the subscription and a callback for reporting errors.
	RouteSubscribe func(ch chan<- netlink.RouteUpdate, done <-chan struct{}, errorCallback func(error)) error
	AddrSubscribe  func(ch chan<- netlink.AddrUpdate, done <-chan struct{}, errorCallback func(error)) error
	LinkSubscribe  func(ch chan<- netlink.LinkUpdate, done <-chan struct{}, errorCallback func(error)) error
	// Close is called once when the controller's run loop exits.
	Close func()
	// LinkList, AddrList and RouteListFiltered are used for the initial
	// listing of links, addresses and routes.
	LinkList          func() ([]netlink.Link, error)
	AddrList          func(link netlink.Link, family int) ([]netlink.Addr, error)
	RouteListFiltered func(family int, filter *netlink.Route, filterMask uint64) ([]netlink.Route, error)
}
   663  
   664  // makeNetlinkFuncs returns a *netlinkFuncs containing netlink accessors to the
   665  // network namespace of the calling goroutine's OS thread.
   666  func makeNetlinkFuncs() (*netlinkFuncs, error) {
   667  	netlinkHandle, err := netlink.NewHandle()
   668  	if err != nil {
   669  		return nil, fmt.Errorf("creating netlink handle: %w", err)
   670  	}
   671  
   672  	cur, err := netns.Current()
   673  	if err != nil {
   674  		return nil, fmt.Errorf("getting current netns: %w", err)
   675  	}
   676  
   677  	return &netlinkFuncs{
   678  		RouteSubscribe: func(ch chan<- netlink.RouteUpdate, done <-chan struct{}, errorCallback func(error)) error {
   679  			h := vns.NsHandle(cur.FD())
   680  			return netlink.RouteSubscribeWithOptions(ch, done,
   681  				netlink.RouteSubscribeOptions{
   682  					ListExisting:  false,
   683  					ErrorCallback: errorCallback,
   684  					Namespace:     &h,
   685  				})
   686  		},
   687  		AddrSubscribe: func(ch chan<- netlink.AddrUpdate, done <-chan struct{}, errorCallback func(error)) error {
   688  			h := vns.NsHandle(cur.FD())
   689  			return netlink.AddrSubscribeWithOptions(ch, done,
   690  				netlink.AddrSubscribeOptions{
   691  					ListExisting:  false,
   692  					ErrorCallback: errorCallback,
   693  					Namespace:     &h,
   694  				})
   695  		},
   696  		LinkSubscribe: func(ch chan<- netlink.LinkUpdate, done <-chan struct{}, errorCallback func(error)) error {
   697  			h := vns.NsHandle(cur.FD())
   698  			return netlink.LinkSubscribeWithOptions(ch, done,
   699  				netlink.LinkSubscribeOptions{
   700  					ListExisting:  false,
   701  					ErrorCallback: errorCallback,
   702  					Namespace:     &h,
   703  				})
   704  		},
   705  		Close:             netlinkHandle.Close,
   706  		LinkList:          netlinkHandle.LinkList,
   707  		AddrList:          netlinkHandle.AddrList,
   708  		RouteListFiltered: netlinkHandle.RouteListFiltered,
   709  	}, nil
   710  }
   711  
   712  func ipnetToPrefix(family int, ipn *net.IPNet) netip.Prefix {
   713  	if ipn != nil {
   714  		cidr, _ := ipn.Mask.Size()
   715  		return netip.PrefixFrom(ip.MustAddrFromIP(ipn.IP), cidr)
   716  	}
   717  	return netip.PrefixFrom(zeroAddr(family), 0)
   718  }
   719  
   720  func zeroAddr(family int) netip.Addr {
   721  	if family == nl.FAMILY_V4 {
   722  		return netip.IPv4Unspecified()
   723  	} else {
   724  		return netip.IPv6Unspecified()
   725  	}
   726  }