github.com/rish1988/moby@v25.0.2+incompatible/libnetwork/drivers/overlay/ov_network.go (about)

     1  //go:build linux
     2  
     3  package overlay
     4  
     5  import (
     6  	"context"
     7  	"errors"
     8  	"fmt"
     9  	"net"
    10  	"os"
    11  	"path/filepath"
    12  	"runtime"
    13  	"strconv"
    14  	"strings"
    15  	"sync"
    16  
    17  	"github.com/containerd/log"
    18  	"github.com/docker/docker/libnetwork/driverapi"
    19  	"github.com/docker/docker/libnetwork/drivers/overlay/overlayutils"
    20  	"github.com/docker/docker/libnetwork/netlabel"
    21  	"github.com/docker/docker/libnetwork/ns"
    22  	"github.com/docker/docker/libnetwork/osl"
    23  	"github.com/docker/docker/libnetwork/types"
    24  	"github.com/hashicorp/go-multierror"
    25  	"github.com/vishvananda/netlink"
    26  	"github.com/vishvananda/netns"
    27  	"golang.org/x/sys/unix"
    28  )
    29  
    // networkOnce makes sure populateVNITbl runs at most once; joinSandbox is
    // what triggers it.
    var networkOnce sync.Once

    // networkMu is held around the lookups and deletions performed on vniTbl
    // once it has been populated.
    var networkMu sync.Mutex

    // vniTbl records, for every VXLAN ID discovered by populateVNITbl, the path
    // of the network namespace file in which a vxlan link with that ID was found.
    var vniTbl = map[uint32]string{}
    35  
    // networkTable maps a string key to the overlay network state stored for it.
    // NOTE(review): presumably keyed by network ID, the way the driver indexes
    // its networks — confirm against the driver definition.
    type networkTable map[string]*network
    37  
    // subnet describes one IPv4 subnet of an overlay network together with the
    // state of its devices in the network sandbox.
    //
    //   - sboxInit: set by joinSandbox once initSubnetSandbox has succeeded and
    //     cleared by leaveSandbox when the sandbox is torn down.
    //   - vxlanName, brName: device names recorded by initSubnetSandbox.
    //   - vni: VXLAN ID assigned to the subnet by CreateNetwork.
    //   - initErr: error reported by joinSandbox for an already initialized
    //     subnet; the code in this file only ever resets it to nil.
    //   - subnetIP, gwIP: the IPAM pool and gateway passed to CreateNetwork.
    type subnet struct {
    	sboxInit  bool
    	vxlanName string
    	brName    string
    	vni       uint32
    	initErr   error
    	subnetIP  *net.IPNet
    	gwIP      *net.IPNet
    }
    47  
    // network is the overlay driver's in-memory state for a single network. The
    // embedded mutex is taken by joinSandbox, leaveSandbox and sandbox.
    //
    //   - id: network ID given to CreateNetwork.
    //   - sbox: sandbox created by initSandbox; reset to nil by destroySandbox.
    //   - endpoints: endpoints of the network; DeleteNetwork removes their links.
    //   - driver: the driver that created the network.
    //   - joinCnt: incremented by joinSandbox (when asked to) and decremented by
    //     leaveSandbox; the sandbox is destroyed when it drops to zero.
    //   - sboxInit: set once initSandbox has been attempted, even if it failed.
    //   - initEpoch: bumped on every initSandbox call and used in the sandbox key.
    //   - initErr: result of the last initSandbox attempt.
    //   - subnets: one entry per IPv4 pool passed to CreateNetwork.
    //   - secure: set when secureOption is present in the network options.
    //   - mtu: parsed from netlabel.DriverMTU; zero when the option is absent.
    type network struct {
    	id        string
    	sbox      *osl.Namespace
    	endpoints endpointTable
    	driver    *driver
    	joinCnt   int
    	sboxInit  bool
    	initEpoch int
    	initErr   error
    	subnets   []*subnet
    	secure    bool
    	mtu       int
    	sync.Mutex
    }
    62  
    // init pins the goroutine running package initialization to its current OS
    // thread.
    func init() {
    	// Lock main() to the initial thread to exclude the goroutines executing
    	// func setDefaultVLAN() from being scheduled onto that thread. Changes to
    	// the network namespace of the initial thread alter /proc/self/ns/net,
    	// which would break any code which (incorrectly) assumes that that file is
    	// a handle to the network namespace for the thread it is currently
    	// executing on.
    	runtime.LockOSThread()
    }
    72  
    // NetworkAllocate is not supported by this driver: it ignores its arguments
    // and always returns a nil map together with a "not implemented" error
    // built by types.NotImplementedErrorf.
    func (d *driver) NetworkAllocate(id string, option map[string]string, ipV4Data, ipV6Data []driverapi.IPAMData) (map[string]string, error) {
    	return nil, types.NotImplementedErrorf("not implemented")
    }
    76  
    // NetworkFree is not supported by this driver: it ignores id and always
    // returns a "not implemented" error built by types.NotImplementedErrorf.
    func (d *driver) NetworkFree(id string) error {
    	return types.NotImplementedErrorf("not implemented")
    }
    80  
    81  func (d *driver) CreateNetwork(id string, option map[string]interface{}, nInfo driverapi.NetworkInfo, ipV4Data, ipV6Data []driverapi.IPAMData) error {
    82  	if id == "" {
    83  		return fmt.Errorf("invalid network id")
    84  	}
    85  	if len(ipV4Data) == 0 || ipV4Data[0].Pool.String() == "0.0.0.0/0" {
    86  		return types.InvalidParameterErrorf("ipv4 pool is empty")
    87  	}
    88  
    89  	// Since we perform lazy configuration make sure we try
    90  	// configuring the driver when we enter CreateNetwork
    91  	if err := d.configure(); err != nil {
    92  		return err
    93  	}
    94  
    95  	n := &network{
    96  		id:        id,
    97  		driver:    d,
    98  		endpoints: endpointTable{},
    99  		subnets:   []*subnet{},
   100  	}
   101  
   102  	vnis := make([]uint32, 0, len(ipV4Data))
   103  	gval, ok := option[netlabel.GenericData]
   104  	if !ok {
   105  		return fmt.Errorf("option %s is missing", netlabel.GenericData)
   106  	}
   107  
   108  	optMap := gval.(map[string]string)
   109  	vnisOpt, ok := optMap[netlabel.OverlayVxlanIDList]
   110  	if !ok {
   111  		return errors.New("no VNI provided")
   112  	}
   113  	log.G(context.TODO()).Debugf("overlay: Received vxlan IDs: %s", vnisOpt)
   114  	var err error
   115  	vnis, err = overlayutils.AppendVNIList(vnis, vnisOpt)
   116  	if err != nil {
   117  		return err
   118  	}
   119  
   120  	if _, ok := optMap[secureOption]; ok {
   121  		n.secure = true
   122  	}
   123  	if val, ok := optMap[netlabel.DriverMTU]; ok {
   124  		var err error
   125  		if n.mtu, err = strconv.Atoi(val); err != nil {
   126  			return fmt.Errorf("failed to parse %v: %v", val, err)
   127  		}
   128  		if n.mtu < 0 {
   129  			return fmt.Errorf("invalid MTU value: %v", n.mtu)
   130  		}
   131  	}
   132  
   133  	if len(vnis) == 0 {
   134  		return errors.New("no VNI provided")
   135  	} else if len(vnis) < len(ipV4Data) {
   136  		return fmt.Errorf("insufficient vnis(%d) passed to overlay", len(vnis))
   137  	}
   138  
   139  	for i, ipd := range ipV4Data {
   140  		s := &subnet{
   141  			subnetIP: ipd.Pool,
   142  			gwIP:     ipd.Gateway,
   143  			vni:      vnis[i],
   144  		}
   145  
   146  		n.subnets = append(n.subnets, s)
   147  	}
   148  
   149  	d.Lock()
   150  	defer d.Unlock()
   151  	if d.networks[n.id] != nil {
   152  		return fmt.Errorf("attempt to create overlay network %v that already exists", n.id)
   153  	}
   154  
   155  	// Make sure no rule is on the way from any stale secure network
   156  	if !n.secure {
   157  		for _, vni := range vnis {
   158  			d.programMangle(vni, false)
   159  			d.programInput(vni, false)
   160  		}
   161  	}
   162  
   163  	if nInfo != nil {
   164  		if err := nInfo.TableEventRegister(ovPeerTable, driverapi.EndpointObject); err != nil {
   165  			// XXX Undo writeToStore?  No method to so.  Why?
   166  			return err
   167  		}
   168  	}
   169  
   170  	d.networks[id] = n
   171  
   172  	return nil
   173  }
   174  
   175  func (d *driver) DeleteNetwork(nid string) error {
   176  	if nid == "" {
   177  		return fmt.Errorf("invalid network id")
   178  	}
   179  
   180  	// Make sure driver resources are initialized before proceeding
   181  	if err := d.configure(); err != nil {
   182  		return err
   183  	}
   184  
   185  	d.Lock()
   186  	// Only perform a peer flush operation (if required) AFTER unlocking
   187  	// the driver lock to avoid deadlocking w/ the peerDB.
   188  	var doPeerFlush bool
   189  	defer func() {
   190  		d.Unlock()
   191  		if doPeerFlush {
   192  			d.peerFlush(nid)
   193  		}
   194  	}()
   195  
   196  	// This is similar to d.network(), but we need to keep holding the lock
   197  	// until we are done removing this network.
   198  	n := d.networks[nid]
   199  	if n == nil {
   200  		return fmt.Errorf("could not find network with id %s", nid)
   201  	}
   202  
   203  	for _, ep := range n.endpoints {
   204  		if ep.ifName != "" {
   205  			if link, err := ns.NlHandle().LinkByName(ep.ifName); err == nil {
   206  				if err := ns.NlHandle().LinkDel(link); err != nil {
   207  					log.G(context.TODO()).WithError(err).Warnf("Failed to delete interface (%s)'s link on endpoint (%s) delete", ep.ifName, ep.id)
   208  				}
   209  			}
   210  		}
   211  	}
   212  
   213  	doPeerFlush = true
   214  	delete(d.networks, nid)
   215  
   216  	if n.secure {
   217  		for _, s := range n.subnets {
   218  			if err := d.programMangle(s.vni, false); err != nil {
   219  				log.G(context.TODO()).WithFields(log.Fields{
   220  					"error":      err,
   221  					"network_id": n.id,
   222  					"subnet":     s.subnetIP,
   223  				}).Warn("Failed to clean up iptables rules during overlay network deletion")
   224  			}
   225  			if err := d.programInput(s.vni, false); err != nil {
   226  				log.G(context.TODO()).WithFields(log.Fields{
   227  					"error":      err,
   228  					"network_id": n.id,
   229  					"subnet":     s.subnetIP,
   230  				}).Warn("Failed to clean up iptables rules during overlay network deletion")
   231  			}
   232  		}
   233  	}
   234  
   235  	return nil
   236  }
   237  
   // ProgramExternalConnectivity is a no-op for this driver: it ignores its
   // arguments and always returns nil.
   func (d *driver) ProgramExternalConnectivity(nid, eid string, options map[string]interface{}) error {
   	return nil
   }
   241  
   // RevokeExternalConnectivity is a no-op for this driver: it ignores its
   // arguments and always returns nil.
   func (d *driver) RevokeExternalConnectivity(nid, eid string) error {
   	return nil
   }
   245  
   246  func (n *network) joinSandbox(s *subnet, incJoinCount bool) error {
   247  	// If there is a race between two go routines here only one will win
   248  	// the other will wait.
   249  	networkOnce.Do(populateVNITbl)
   250  
   251  	n.Lock()
   252  	// If initialization was successful then tell the peerDB to initialize the
   253  	// sandbox with all the peers previously received from networkdb. But only
   254  	// do this after unlocking the network. Otherwise we could deadlock with
   255  	// on the peerDB channel while peerDB is waiting for the network lock.
   256  	var doInitPeerDB bool
   257  	defer func() {
   258  		n.Unlock()
   259  		if doInitPeerDB {
   260  			go n.driver.initSandboxPeerDB(n.id)
   261  		}
   262  	}()
   263  
   264  	if !n.sboxInit {
   265  		n.initErr = n.initSandbox()
   266  		doInitPeerDB = n.initErr == nil
   267  		// If there was an error, we cannot recover it
   268  		n.sboxInit = true
   269  	}
   270  
   271  	if n.initErr != nil {
   272  		return fmt.Errorf("network sandbox join failed: %v", n.initErr)
   273  	}
   274  
   275  	subnetErr := s.initErr
   276  	if !s.sboxInit {
   277  		subnetErr = n.initSubnetSandbox(s)
   278  		// We can recover from these errors
   279  		if subnetErr == nil {
   280  			s.initErr = subnetErr
   281  			s.sboxInit = true
   282  		}
   283  	}
   284  	if subnetErr != nil {
   285  		return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), subnetErr)
   286  	}
   287  
   288  	if incJoinCount {
   289  		n.joinCnt++
   290  	}
   291  
   292  	return nil
   293  }
   294  
   295  func (n *network) leaveSandbox() {
   296  	n.Lock()
   297  	defer n.Unlock()
   298  	n.joinCnt--
   299  	if n.joinCnt != 0 {
   300  		return
   301  	}
   302  
   303  	n.destroySandbox()
   304  
   305  	n.sboxInit = false
   306  	n.initErr = nil
   307  	for _, s := range n.subnets {
   308  		s.sboxInit = false
   309  		s.initErr = nil
   310  	}
   311  }
   312  
   313  // to be called while holding network lock
   314  func (n *network) destroySandbox() {
   315  	if n.sbox != nil {
   316  		for _, iface := range n.sbox.Interfaces() {
   317  			if err := iface.Remove(); err != nil {
   318  				log.G(context.TODO()).Debugf("Remove interface %s failed: %v", iface.SrcName(), err)
   319  			}
   320  		}
   321  
   322  		for _, s := range n.subnets {
   323  			if s.vxlanName != "" {
   324  				err := deleteInterface(s.vxlanName)
   325  				if err != nil {
   326  					log.G(context.TODO()).Warnf("could not cleanup sandbox properly: %v", err)
   327  				}
   328  			}
   329  		}
   330  
   331  		n.sbox.Destroy()
   332  		n.sbox = nil
   333  	}
   334  }
   335  
   336  func populateVNITbl() {
   337  	filepath.WalkDir(filepath.Dir(osl.GenerateKey("walk")),
   338  		// NOTE(cpuguy83): The linter picked up on the fact that this walk function was not using this error argument
   339  		// That seems wrong... however I'm not familiar with this code or if that error matters
   340  		func(path string, _ os.DirEntry, _ error) error {
   341  			_, fname := filepath.Split(path)
   342  
   343  			if len(strings.Split(fname, "-")) <= 1 {
   344  				return nil
   345  			}
   346  
   347  			n, err := netns.GetFromPath(path)
   348  			if err != nil {
   349  				log.G(context.TODO()).Errorf("Could not open namespace path %s during vni population: %v", path, err)
   350  				return nil
   351  			}
   352  			defer n.Close()
   353  
   354  			nlh, err := netlink.NewHandleAt(n, unix.NETLINK_ROUTE)
   355  			if err != nil {
   356  				log.G(context.TODO()).Errorf("Could not open netlink handle during vni population for ns %s: %v", path, err)
   357  				return nil
   358  			}
   359  			defer nlh.Close()
   360  
   361  			err = nlh.SetSocketTimeout(soTimeout)
   362  			if err != nil {
   363  				log.G(context.TODO()).Warnf("Failed to set the timeout on the netlink handle sockets for vni table population: %v", err)
   364  			}
   365  
   366  			links, err := nlh.LinkList()
   367  			if err != nil {
   368  				log.G(context.TODO()).Errorf("Failed to list interfaces during vni population for ns %s: %v", path, err)
   369  				return nil
   370  			}
   371  
   372  			for _, l := range links {
   373  				if l.Type() == "vxlan" {
   374  					vniTbl[uint32(l.(*netlink.Vxlan).VxlanId)] = path
   375  				}
   376  			}
   377  
   378  			return nil
   379  		})
   380  }
   381  
   382  func (n *network) generateVxlanName(s *subnet) string {
   383  	id := n.id
   384  	if len(n.id) > 5 {
   385  		id = n.id[:5]
   386  	}
   387  
   388  	return fmt.Sprintf("vx-%06x-%v", s.vni, id)
   389  }
   390  
   391  func (n *network) generateBridgeName(s *subnet) string {
   392  	id := n.id
   393  	if len(n.id) > 5 {
   394  		id = n.id[:5]
   395  	}
   396  
   397  	return n.getBridgeNamePrefix(s) + "-" + id
   398  }
   399  
   // getBridgeNamePrefix returns "ov-" followed by the subnet's VNI formatted as
   // zero-padded lowercase hex with at least six digits.
   func (n *network) getBridgeNamePrefix(s *subnet) string {
   	return fmt.Sprintf("ov-%06x", s.vni)
   }
   403  
   404  func (n *network) setupSubnetSandbox(s *subnet, brName, vxlanName string) error {
   405  	// Try to find this subnet's vni is being used in some
   406  	// other namespace by looking at vniTbl that we just
   407  	// populated in the once init. If a hit is found then
   408  	// it must a stale namespace from previous
   409  	// life. Destroy it completely and reclaim resourced.
   410  	networkMu.Lock()
   411  	path, ok := vniTbl[s.vni]
   412  	networkMu.Unlock()
   413  
   414  	if ok {
   415  		deleteVxlanByVNI(path, s.vni)
   416  		if err := unix.Unmount(path, unix.MNT_FORCE); err != nil {
   417  			log.G(context.TODO()).Errorf("unmount of %s failed: %v", path, err)
   418  		}
   419  		os.Remove(path)
   420  
   421  		networkMu.Lock()
   422  		delete(vniTbl, s.vni)
   423  		networkMu.Unlock()
   424  	}
   425  
   426  	// create a bridge and vxlan device for this subnet and move it to the sandbox
   427  	sbox := n.sbox
   428  
   429  	if err := sbox.AddInterface(brName, "br", osl.WithIPv4Address(s.gwIP), osl.WithIsBridge(true)); err != nil {
   430  		return fmt.Errorf("bridge creation in sandbox failed for subnet %q: %v", s.subnetIP.String(), err)
   431  	}
   432  
   433  	v6transport, err := n.driver.isIPv6Transport()
   434  	if err != nil {
   435  		log.G(context.TODO()).WithError(err).Errorf("Assuming IPv4 transport; overlay network %s will not pass traffic if the Swarm data plane is IPv6.", n.id)
   436  	}
   437  	if err := createVxlan(vxlanName, s.vni, n.maxMTU(), v6transport); err != nil {
   438  		return err
   439  	}
   440  
   441  	if err := sbox.AddInterface(vxlanName, "vxlan", osl.WithMaster(brName)); err != nil {
   442  		// If adding vxlan device to the overlay namespace fails, remove the bridge interface we
   443  		// already added to the namespace. This allows the caller to try the setup again.
   444  		for _, iface := range sbox.Interfaces() {
   445  			if iface.SrcName() == brName {
   446  				if ierr := iface.Remove(); ierr != nil {
   447  					log.G(context.TODO()).Errorf("removing bridge failed from ov ns %v failed, %v", n.sbox.Key(), ierr)
   448  				}
   449  			}
   450  		}
   451  
   452  		// Also, delete the vxlan interface. Since a global vni id is associated
   453  		// with the vxlan interface, an orphaned vxlan interface will result in
   454  		// failure of vxlan device creation if the vni is assigned to some other
   455  		// network.
   456  		if deleteErr := deleteInterface(vxlanName); deleteErr != nil {
   457  			log.G(context.TODO()).Warnf("could not delete vxlan interface, %s, error %v, after config error, %v", vxlanName, deleteErr, err)
   458  		}
   459  		return fmt.Errorf("vxlan interface creation failed for subnet %q: %v", s.subnetIP.String(), err)
   460  	}
   461  
   462  	if err := setDefaultVLAN(sbox); err != nil {
   463  		// not a fatal error
   464  		log.G(context.TODO()).WithError(err).Error("set bridge default vlan failed")
   465  	}
   466  	return nil
   467  }
   468  
   469  func setDefaultVLAN(ns *osl.Namespace) error {
   470  	var brName string
   471  	for _, i := range ns.Interfaces() {
   472  		if i.Bridge() {
   473  			brName = i.DstName()
   474  		}
   475  	}
   476  
   477  	// IFLA_BR_VLAN_DEFAULT_PVID was added in Linux v4.4 (see torvalds/linux@0f963b7), so we can't use netlink for
   478  	// setting this until Docker drops support for CentOS/RHEL 7 (kernel 3.10, eol date: 2024-06-30).
   479  	var innerErr error
   480  	err := ns.InvokeFunc(func() {
   481  		// Contrary to what the sysfs(5) man page says, the entries of /sys/class/net
   482  		// represent the networking devices visible in the network namespace of the
   483  		// process which mounted the sysfs filesystem, irrespective of the network
   484  		// namespace of the process accessing the directory. Remount sysfs in order to
   485  		// see the network devices in sbox's network namespace, making sure the mount
   486  		// doesn't propagate back.
   487  		//
   488  		// The Linux implementation of (osl.Sandbox).InvokeFunc() runs the function in a
   489  		// dedicated goroutine. The effects of unshare(CLONE_NEWNS) on a thread cannot
   490  		// be reverted so the thread needs to be terminated once the goroutine is
   491  		// finished.
   492  		runtime.LockOSThread()
   493  		if err := unix.Unshare(unix.CLONE_NEWNS); err != nil {
   494  			innerErr = os.NewSyscallError("unshare", err)
   495  			return
   496  		}
   497  		if err := unix.Mount("", "/", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
   498  			innerErr = &os.PathError{Op: "mount", Path: "/", Err: err}
   499  			return
   500  		}
   501  		if err := unix.Mount("sysfs", "/sys", "sysfs", 0, ""); err != nil {
   502  			innerErr = &os.PathError{Op: "mount", Path: "/sys", Err: err}
   503  			return
   504  		}
   505  
   506  		path := filepath.Join("/sys/class/net", brName, "bridge/default_pvid")
   507  		data := []byte{'0', '\n'}
   508  
   509  		if err := os.WriteFile(path, data, 0o644); err != nil {
   510  			innerErr = fmt.Errorf("failed to enable default vlan on bridge %s: %w", brName, err)
   511  			return
   512  		}
   513  	})
   514  	if err != nil {
   515  		return err
   516  	}
   517  	return innerErr
   518  }
   519  
   520  // Must be called with the network lock
   521  func (n *network) initSubnetSandbox(s *subnet) error {
   522  	brName := n.generateBridgeName(s)
   523  	vxlanName := n.generateVxlanName(s)
   524  
   525  	// Program iptables rules for mandatory encryption of the secure
   526  	// network, or clean up leftover rules for a stale secure network which
   527  	// was previously assigned the same VNI.
   528  	if err := n.driver.programMangle(s.vni, n.secure); err != nil {
   529  		return err
   530  	}
   531  	if err := n.driver.programInput(s.vni, n.secure); err != nil {
   532  		if n.secure {
   533  			return multierror.Append(err, n.driver.programMangle(s.vni, false))
   534  		}
   535  	}
   536  
   537  	if err := n.setupSubnetSandbox(s, brName, vxlanName); err != nil {
   538  		return err
   539  	}
   540  
   541  	s.vxlanName = vxlanName
   542  	s.brName = brName
   543  
   544  	return nil
   545  }
   546  
   547  func (n *network) cleanupStaleSandboxes() {
   548  	filepath.WalkDir(filepath.Dir(osl.GenerateKey("walk")),
   549  		func(path string, _ os.DirEntry, _ error) error {
   550  			_, fname := filepath.Split(path)
   551  
   552  			pList := strings.Split(fname, "-")
   553  			if len(pList) <= 1 {
   554  				return nil
   555  			}
   556  
   557  			pattern := pList[1]
   558  			if strings.Contains(n.id, pattern) {
   559  				// Delete all vnis
   560  				deleteVxlanByVNI(path, 0)
   561  				unix.Unmount(path, unix.MNT_DETACH)
   562  				os.Remove(path)
   563  
   564  				// Now that we have destroyed this
   565  				// sandbox, remove all references to
   566  				// it in vniTbl so that we don't
   567  				// inadvertently destroy the sandbox
   568  				// created in this life.
   569  				networkMu.Lock()
   570  				for vni, tblPath := range vniTbl {
   571  					if tblPath == path {
   572  						delete(vniTbl, vni)
   573  					}
   574  				}
   575  				networkMu.Unlock()
   576  			}
   577  
   578  			return nil
   579  		})
   580  }
   581  
   582  func (n *network) initSandbox() error {
   583  	n.initEpoch++
   584  
   585  	// If there are any stale sandboxes related to this network
   586  	// from previous daemon life clean it up here
   587  	n.cleanupStaleSandboxes()
   588  
   589  	key := osl.GenerateKey(fmt.Sprintf("%d-", n.initEpoch) + n.id)
   590  	sbox, err := osl.NewSandbox(key, true, false)
   591  	if err != nil {
   592  		return fmt.Errorf("could not get network sandbox: %v", err)
   593  	}
   594  
   595  	// this is needed to let the peerAdd configure the sandbox
   596  	n.sbox = sbox
   597  
   598  	return nil
   599  }
   600  
   601  func (d *driver) network(nid string) *network {
   602  	d.Lock()
   603  	n := d.networks[nid]
   604  	d.Unlock()
   605  
   606  	return n
   607  }
   608  
   609  func (n *network) sandbox() *osl.Namespace {
   610  	n.Lock()
   611  	defer n.Unlock()
   612  	return n.sbox
   613  }
   614  
   615  // getSubnetforIP returns the subnet to which the given IP belongs
   616  func (n *network) getSubnetforIP(ip *net.IPNet) *subnet {
   617  	for _, s := range n.subnets {
   618  		// first check if the mask lengths are the same
   619  		i, _ := s.subnetIP.Mask.Size()
   620  		j, _ := ip.Mask.Size()
   621  		if i != j {
   622  			continue
   623  		}
   624  		if s.subnetIP.Contains(ip.IP) {
   625  			return s
   626  		}
   627  	}
   628  	return nil
   629  }