github.com/rawahars/moby@v24.0.4+incompatible/libnetwork/osl/namespace_linux.go (about)

     1  package osl
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"net"
     7  	"os"
     8  	"path/filepath"
     9  	"runtime"
    10  	"strconv"
    11  	"strings"
    12  	"sync"
    13  	"syscall"
    14  	"time"
    15  
    16  	"github.com/docker/docker/internal/unshare"
    17  	"github.com/docker/docker/libnetwork/ns"
    18  	"github.com/docker/docker/libnetwork/osl/kernel"
    19  	"github.com/docker/docker/libnetwork/types"
    20  	"github.com/sirupsen/logrus"
    21  	"github.com/vishvananda/netlink"
    22  	"github.com/vishvananda/netns"
    23  	"golang.org/x/sys/unix"
    24  )
    25  
// defaultPrefix is the initial value of prefix, the directory under which
// basePath places the "netns" directory.
const defaultPrefix = "/var/run/docker"
    27  
// init pins the goroutine running package initialization (and therefore
// main()) to the initial OS thread.
func init() {
	// Lock main() to the initial thread to exclude the goroutines spawned
	// by func (*networkNamespace) InvokeFunc() or func setIPv6() below from
	// being scheduled onto that thread. Changes to the network namespace of
	// the initial thread alter /proc/self/ns/net, which would break any
	// code which (incorrectly) assumes that that file is the network
	// namespace for the thread it is currently executing on.
	runtime.LockOSThread()
}
    37  
// Package-level state for namespace file management:
//
//   - once guards the single execution of createBasePath.
//   - garbagePathMap is the set of namespace file paths waiting to be removed
//     by removeUnusedPaths.
//   - gpmLock protects garbagePathMap (and the start-up read of
//     gpmCleanupPeriod in removeUnusedPaths).
//   - gpmWg is non-zero while a removal pass is in progress;
//     createNamespaceFile waits on it before creating a file.
//   - gpmCleanupPeriod is the interval between periodic removal passes.
//   - gpmChan carries on-demand removal requests from GC; the channel sent
//     over it is closed when the pass completes.
//   - prefix is the directory under which basePath places "netns"; it can be
//     changed with SetBasePath.
var (
	once             sync.Once
	garbagePathMap   = make(map[string]bool)
	gpmLock          sync.Mutex
	gpmWg            sync.WaitGroup
	gpmCleanupPeriod = 60 * time.Second
	gpmChan          = make(chan chan struct{})
	prefix           = defaultPrefix
)
    47  
// The networkNamespace type is the linux implementation of the Sandbox
// interface. It represents a linux network namespace, and moves an interface
// into it when called on method AddInterface or sets the gateway etc.
//
// Fields:
//
//   - path is the namespace file path the sandbox was created with; it is
//     what netns.GetFromPath opens and what Destroy unmounts.
//   - iFaces lists the interfaces recorded for this sandbox.
//   - gw and gwv6 hold the gateways recorded by Restore.
//   - staticRoutes holds the static routes recorded by Restore.
//   - neighbors is not used in this part of the file; presumably the
//     neighbor entries programmed into the sandbox (verify against the
//     neighbor code).
//   - nextIfIndex maps an interface name prefix to the next index to use for
//     that prefix.
//   - isDefault is set to !osCreate by NewSandbox.
//   - nlHandle is the netlink handle opened inside the namespace.
//   - loV6Enabled remembers the IPv6 state last applied to "lo" by
//     checkLoV6.
//   - The embedded Mutex is taken by nsPath, Restore and checkLoV6 around
//     their accesses to the fields above.
type networkNamespace struct {
	path         string
	iFaces       []*nwIface
	gw           net.IP
	gwv6         net.IP
	staticRoutes []*types.StaticRoute
	neighbors    []*neigh
	nextIfIndex  map[string]int
	isDefault    bool
	nlHandle     *netlink.Handle
	loV6Enabled  bool
	sync.Mutex
}
    64  
// SetBasePath sets the directory prefix under which the "netns" directory
// (see basePath) is located. The assignment is not synchronized.
func SetBasePath(path string) {
	prefix = path
}
    69  
// basePath returns the directory that holds the namespace files: the "netns"
// directory under the current prefix.
func basePath() string {
	return filepath.Join(prefix, "netns")
}
    73  
    74  func createBasePath() {
    75  	err := os.MkdirAll(basePath(), 0755)
    76  	if err != nil {
    77  		panic("Could not create net namespace path directory")
    78  	}
    79  
    80  	// Start the garbage collection go routine
    81  	go removeUnusedPaths()
    82  }
    83  
// removeUnusedPaths is the garbage collection loop for namespace files. It
// never returns; createBasePath starts it as a goroutine.
//
// A removal pass is triggered either by a ticker firing every
// gpmCleanupPeriod (the period is sampled once, before the loop) or by a
// request received on gpmChan. Each pass takes over every path currently in
// garbagePathMap and removes the corresponding files. For a gpmChan request
// the received channel is closed once the pass is finished.
func removeUnusedPaths() {
	gpmLock.Lock()
	period := gpmCleanupPeriod
	gpmLock.Unlock()

	ticker := time.NewTicker(period)
	for {
		var (
			gc   chan struct{}
			gcOk bool
		)

		// Wait for the next tick or an explicit request. gcOk stays false
		// for a tick, so no completion channel is closed in that case.
		select {
		case <-ticker.C:
		case gc, gcOk = <-gpmChan:
		}

		// Snapshot the pending paths and reset the map while holding the
		// lock, so the files can be removed without blocking
		// addToGarbagePaths/removeFromGarbagePaths.
		gpmLock.Lock()
		pathList := make([]string, 0, len(garbagePathMap))
		for path := range garbagePathMap {
			pathList = append(pathList, path)
		}
		garbagePathMap = make(map[string]bool)
		// Mark the pass as in progress; createNamespaceFile waits on gpmWg.
		gpmWg.Add(1)
		gpmLock.Unlock()

		for _, path := range pathList {
			// Removal errors are ignored.
			os.Remove(path)
		}

		gpmWg.Done()
		if gcOk {
			// Tell the requester that the pass has completed.
			close(gc)
		}
	}
}
   120  
   121  func addToGarbagePaths(path string) {
   122  	gpmLock.Lock()
   123  	garbagePathMap[path] = true
   124  	gpmLock.Unlock()
   125  }
   126  
   127  func removeFromGarbagePaths(path string) {
   128  	gpmLock.Lock()
   129  	delete(garbagePathMap, path)
   130  	gpmLock.Unlock()
   131  }
   132  
   133  // GC triggers garbage collection of namespace path right away
   134  // and waits for it.
   135  func GC() {
   136  	gpmLock.Lock()
   137  	if len(garbagePathMap) == 0 {
   138  		// No need for GC if map is empty
   139  		gpmLock.Unlock()
   140  		return
   141  	}
   142  	gpmLock.Unlock()
   143  
   144  	// if content exists in the garbage paths
   145  	// we can trigger GC to run, providing a
   146  	// channel to be notified on completion
   147  	waitGC := make(chan struct{})
   148  	gpmChan <- waitGC
   149  	// wait for GC completion
   150  	<-waitGC
   151  }
   152  
   153  // GenerateKey generates a sandbox key based on the passed
   154  // container id.
   155  func GenerateKey(containerID string) string {
   156  	maxLen := 12
   157  	// Read sandbox key from host for overlay
   158  	if strings.HasPrefix(containerID, "-") {
   159  		var (
   160  			index    int
   161  			indexStr string
   162  			tmpkey   string
   163  		)
   164  		dir, err := os.ReadDir(basePath())
   165  		if err != nil {
   166  			return ""
   167  		}
   168  
   169  		for _, v := range dir {
   170  			id := v.Name()
   171  			if strings.HasSuffix(id, containerID[:maxLen-1]) {
   172  				indexStr = strings.TrimSuffix(id, containerID[:maxLen-1])
   173  				tmpindex, err := strconv.Atoi(indexStr)
   174  				if err != nil {
   175  					return ""
   176  				}
   177  				if tmpindex > index {
   178  					index = tmpindex
   179  					tmpkey = id
   180  				}
   181  			}
   182  		}
   183  		containerID = tmpkey
   184  		if containerID == "" {
   185  			return ""
   186  		}
   187  	}
   188  
   189  	if len(containerID) < maxLen {
   190  		maxLen = len(containerID)
   191  	}
   192  
   193  	return basePath() + "/" + containerID[:maxLen]
   194  }
   195  
   196  // NewSandbox provides a new sandbox instance created in an os specific way
   197  // provided a key which uniquely identifies the sandbox
   198  func NewSandbox(key string, osCreate, isRestore bool) (Sandbox, error) {
   199  	if !isRestore {
   200  		err := createNetworkNamespace(key, osCreate)
   201  		if err != nil {
   202  			return nil, err
   203  		}
   204  	} else {
   205  		once.Do(createBasePath)
   206  	}
   207  
   208  	n := &networkNamespace{path: key, isDefault: !osCreate, nextIfIndex: make(map[string]int)}
   209  
   210  	sboxNs, err := netns.GetFromPath(n.path)
   211  	if err != nil {
   212  		return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err)
   213  	}
   214  	defer sboxNs.Close()
   215  
   216  	n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE)
   217  	if err != nil {
   218  		return nil, fmt.Errorf("failed to create a netlink handle: %v", err)
   219  	}
   220  
   221  	err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout)
   222  	if err != nil {
   223  		logrus.Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err)
   224  	}
   225  	// In live-restore mode, IPV6 entries are getting cleaned up due to below code
   226  	// We should retain IPV6 configurations in live-restore mode when Docker Daemon
   227  	// comes back. It should work as it is on other cases
   228  	// As starting point, disable IPv6 on all interfaces
   229  	if !isRestore && !n.isDefault {
   230  		err = setIPv6(n.path, "all", false)
   231  		if err != nil {
   232  			logrus.Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err)
   233  		}
   234  	}
   235  
   236  	if err = n.loopbackUp(); err != nil {
   237  		n.nlHandle.Close()
   238  		return nil, err
   239  	}
   240  
   241  	return n, nil
   242  }
   243  
// InterfaceOptions returns the IfaceOptionSetter for this sandbox, which is
// the networkNamespace itself.
func (n *networkNamespace) InterfaceOptions() IfaceOptionSetter {
	return n
}
   247  
// NeighborOptions returns the NeighborOptionSetter for this sandbox, which
// is the networkNamespace itself.
func (n *networkNamespace) NeighborOptions() NeighborOptionSetter {
	return n
}
   251  
   252  func mountNetworkNamespace(basePath string, lnPath string) error {
   253  	return syscall.Mount(basePath, lnPath, "bind", syscall.MS_BIND, "")
   254  }
   255  
   256  // GetSandboxForExternalKey returns sandbox object for the supplied path
   257  func GetSandboxForExternalKey(basePath string, key string) (Sandbox, error) {
   258  	if err := createNamespaceFile(key); err != nil {
   259  		return nil, err
   260  	}
   261  
   262  	if err := mountNetworkNamespace(basePath, key); err != nil {
   263  		return nil, err
   264  	}
   265  	n := &networkNamespace{path: key, nextIfIndex: make(map[string]int)}
   266  
   267  	sboxNs, err := netns.GetFromPath(n.path)
   268  	if err != nil {
   269  		return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err)
   270  	}
   271  	defer sboxNs.Close()
   272  
   273  	n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE)
   274  	if err != nil {
   275  		return nil, fmt.Errorf("failed to create a netlink handle: %v", err)
   276  	}
   277  
   278  	err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout)
   279  	if err != nil {
   280  		logrus.Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err)
   281  	}
   282  
   283  	// As starting point, disable IPv6 on all interfaces
   284  	err = setIPv6(n.path, "all", false)
   285  	if err != nil {
   286  		logrus.Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err)
   287  	}
   288  
   289  	if err = n.loopbackUp(); err != nil {
   290  		n.nlHandle.Close()
   291  		return nil, err
   292  	}
   293  
   294  	return n, nil
   295  }
   296  
   297  func createNetworkNamespace(path string, osCreate bool) error {
   298  	if err := createNamespaceFile(path); err != nil {
   299  		return err
   300  	}
   301  
   302  	do := func() error {
   303  		return mountNetworkNamespace(fmt.Sprintf("/proc/self/task/%d/ns/net", unix.Gettid()), path)
   304  	}
   305  	if osCreate {
   306  		return unshare.Go(unix.CLONE_NEWNET, do, nil)
   307  	}
   308  	return do()
   309  }
   310  
   311  func unmountNamespaceFile(path string) {
   312  	if _, err := os.Stat(path); err == nil {
   313  		if err := syscall.Unmount(path, syscall.MNT_DETACH); err != nil && !errors.Is(err, unix.EINVAL) {
   314  			logrus.WithError(err).Error("Error unmounting namespace file")
   315  		}
   316  	}
   317  }
   318  
   319  func createNamespaceFile(path string) (err error) {
   320  	var f *os.File
   321  
   322  	once.Do(createBasePath)
   323  	// Remove it from garbage collection list if present
   324  	removeFromGarbagePaths(path)
   325  
   326  	// If the path is there unmount it first
   327  	unmountNamespaceFile(path)
   328  
   329  	// wait for garbage collection to complete if it is in progress
   330  	// before trying to create the file.
   331  	gpmWg.Wait()
   332  
   333  	if f, err = os.Create(path); err == nil {
   334  		f.Close()
   335  	}
   336  
   337  	return err
   338  }
   339  
   340  func (n *networkNamespace) loopbackUp() error {
   341  	iface, err := n.nlHandle.LinkByName("lo")
   342  	if err != nil {
   343  		return err
   344  	}
   345  	return n.nlHandle.LinkSetUp(iface)
   346  }
   347  
// GetLoopbackIfaceName returns the name of the loopback interface, which is
// always "lo".
func (n *networkNamespace) GetLoopbackIfaceName() string {
	return "lo"
}
   351  
   352  func (n *networkNamespace) AddAliasIP(ifName string, ip *net.IPNet) error {
   353  	iface, err := n.nlHandle.LinkByName(ifName)
   354  	if err != nil {
   355  		return err
   356  	}
   357  	return n.nlHandle.AddrAdd(iface, &netlink.Addr{IPNet: ip})
   358  }
   359  
   360  func (n *networkNamespace) RemoveAliasIP(ifName string, ip *net.IPNet) error {
   361  	iface, err := n.nlHandle.LinkByName(ifName)
   362  	if err != nil {
   363  		return err
   364  	}
   365  	return n.nlHandle.AddrDel(iface, &netlink.Addr{IPNet: ip})
   366  }
   367  
   368  func (n *networkNamespace) DisableARPForVIP(srcName string) (Err error) {
   369  	dstName := ""
   370  	for _, i := range n.Interfaces() {
   371  		if i.SrcName() == srcName {
   372  			dstName = i.DstName()
   373  			break
   374  		}
   375  	}
   376  	if dstName == "" {
   377  		return fmt.Errorf("failed to find interface %s in sandbox", srcName)
   378  	}
   379  
   380  	err := n.InvokeFunc(func() {
   381  		path := filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_ignore")
   382  		if err := os.WriteFile(path, []byte{'1', '\n'}, 0644); err != nil {
   383  			Err = fmt.Errorf("Failed to set %s to 1: %v", path, err)
   384  			return
   385  		}
   386  		path = filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_announce")
   387  		if err := os.WriteFile(path, []byte{'2', '\n'}, 0644); err != nil {
   388  			Err = fmt.Errorf("Failed to set %s to 2: %v", path, err)
   389  			return
   390  		}
   391  	})
   392  	if err != nil {
   393  		return err
   394  	}
   395  	return
   396  }
   397  
// InvokeFunc runs f on a new goroutine whose OS thread has been switched
// into this sandbox's network namespace, and waits for f to return.
//
// An error is returned, and f is not called, if the namespace file cannot be
// opened, if the thread's current namespace cannot be determined, or if
// switching into the sandbox namespace fails. Otherwise nil is returned.
func (n *networkNamespace) InvokeFunc(f func()) error {
	path := n.nsPath()
	newNS, err := netns.GetFromPath(path)
	if err != nil {
		return fmt.Errorf("failed get network namespace %q: %w", path, err)
	}
	defer newNS.Close()

	// Buffered so the goroutine can report an early error without
	// blocking; closed (yielding a nil error) once f has run.
	done := make(chan error, 1)
	go func() {
		runtime.LockOSThread()
		// InvokeFunc() could have been called from a goroutine with
		// tampered thread state, e.g. from another InvokeFunc()
		// callback. The outer goroutine's thread state cannot be
		// trusted.
		origNS, err := netns.Get()
		if err != nil {
			runtime.UnlockOSThread()
			done <- fmt.Errorf("failed to get original network namespace: %w", err)
			return
		}
		defer origNS.Close()

		if err := netns.Set(newNS); err != nil {
			runtime.UnlockOSThread()
			done <- err
			return
		}
		// Runs after f returns and before origNS is closed (deferred calls
		// run in reverse order). The caller is released first, then the
		// thread's original namespace is restored.
		defer func() {
			close(done)
			if err := netns.Set(origNS); err != nil {
				logrus.WithError(err).Warn("failed to restore thread's network namespace")
				// Recover from the error by leaving this goroutine locked to
				// the thread. The runtime will terminate the thread and replace
				// it with a clean one when this goroutine returns.
			} else {
				runtime.UnlockOSThread()
			}
		}()
		f()
	}()
	return <-done
}
   441  
   442  func (n *networkNamespace) nsPath() string {
   443  	n.Lock()
   444  	defer n.Unlock()
   445  
   446  	return n.path
   447  }
   448  
// Info returns the Info view of this sandbox, which is the networkNamespace
// itself.
func (n *networkNamespace) Info() Info {
	return n
}
   452  
// Key returns the namespace file path the sandbox was created with. Unlike
// nsPath, it reads the field without taking the sandbox lock.
func (n *networkNamespace) Key() string {
	return n.path
}
   456  
   457  func (n *networkNamespace) Destroy() error {
   458  	if n.nlHandle != nil {
   459  		n.nlHandle.Close()
   460  	}
   461  	// Assuming no running process is executing in this network namespace,
   462  	// unmounting is sufficient to destroy it.
   463  	if err := syscall.Unmount(n.path, syscall.MNT_DETACH); err != nil {
   464  		return err
   465  	}
   466  
   467  	// Stash it into the garbage collection list
   468  	addToGarbagePaths(n.path)
   469  	return nil
   470  }
   471  
   472  // Restore restore the network namespace
   473  func (n *networkNamespace) Restore(ifsopt map[Iface][]IfaceOption, routes []*types.StaticRoute, gw net.IP, gw6 net.IP) error {
   474  	// restore interfaces
   475  	for name, opts := range ifsopt {
   476  		i := &nwIface{srcName: name.SrcName, dstName: name.DstPrefix, ns: n}
   477  		i.processInterfaceOptions(opts...)
   478  		if i.master != "" {
   479  			i.dstMaster = n.findDst(i.master, true)
   480  			if i.dstMaster == "" {
   481  				return fmt.Errorf("could not find an appropriate master %q for %q",
   482  					i.master, i.srcName)
   483  			}
   484  		}
   485  		if n.isDefault {
   486  			i.dstName = i.srcName
   487  		} else {
   488  			links, err := n.nlHandle.LinkList()
   489  			if err != nil {
   490  				return fmt.Errorf("failed to retrieve list of links in network namespace %q during restore", n.path)
   491  			}
   492  			// due to the docker network connect/disconnect, so the dstName should
   493  			// restore from the namespace
   494  			for _, link := range links {
   495  				addrs, err := n.nlHandle.AddrList(link, netlink.FAMILY_V4)
   496  				if err != nil {
   497  					return err
   498  				}
   499  				ifaceName := link.Attrs().Name
   500  				if strings.HasPrefix(ifaceName, "vxlan") {
   501  					if i.dstName == "vxlan" {
   502  						i.dstName = ifaceName
   503  						break
   504  					}
   505  				}
   506  				// find the interface name by ip
   507  				if i.address != nil {
   508  					for _, addr := range addrs {
   509  						if addr.IPNet.String() == i.address.String() {
   510  							i.dstName = ifaceName
   511  							break
   512  						}
   513  						continue
   514  					}
   515  					if i.dstName == ifaceName {
   516  						break
   517  					}
   518  				}
   519  				// This is to find the interface name of the pair in overlay sandbox
   520  				if strings.HasPrefix(ifaceName, "veth") {
   521  					if i.master != "" && i.dstName == "veth" {
   522  						i.dstName = ifaceName
   523  					}
   524  				}
   525  			}
   526  
   527  			var index int
   528  			indexStr := strings.TrimPrefix(i.dstName, name.DstPrefix)
   529  			if indexStr != "" {
   530  				index, err = strconv.Atoi(indexStr)
   531  				if err != nil {
   532  					return err
   533  				}
   534  			}
   535  			index++
   536  			n.Lock()
   537  			if index > n.nextIfIndex[name.DstPrefix] {
   538  				n.nextIfIndex[name.DstPrefix] = index
   539  			}
   540  			n.iFaces = append(n.iFaces, i)
   541  			n.Unlock()
   542  		}
   543  	}
   544  
   545  	// restore routes
   546  	for _, r := range routes {
   547  		n.Lock()
   548  		n.staticRoutes = append(n.staticRoutes, r)
   549  		n.Unlock()
   550  	}
   551  
   552  	// restore gateway
   553  	if len(gw) > 0 {
   554  		n.Lock()
   555  		n.gw = gw
   556  		n.Unlock()
   557  	}
   558  
   559  	if len(gw6) > 0 {
   560  		n.Lock()
   561  		n.gwv6 = gw6
   562  		n.Unlock()
   563  	}
   564  
   565  	return nil
   566  }
   567  
   568  // Checks whether IPv6 needs to be enabled/disabled on the loopback interface
   569  func (n *networkNamespace) checkLoV6() {
   570  	var (
   571  		enable = false
   572  		action = "disable"
   573  	)
   574  
   575  	n.Lock()
   576  	for _, iface := range n.iFaces {
   577  		if iface.AddressIPv6() != nil {
   578  			enable = true
   579  			action = "enable"
   580  			break
   581  		}
   582  	}
   583  	n.Unlock()
   584  
   585  	if n.loV6Enabled == enable {
   586  		return
   587  	}
   588  
   589  	if err := setIPv6(n.path, "lo", enable); err != nil {
   590  		logrus.Warnf("Failed to %s IPv6 on loopback interface on network namespace %q: %v", action, n.path, err)
   591  	}
   592  
   593  	n.loV6Enabled = enable
   594  }
   595  
// setIPv6 enables or disables IPv6 for interface iface inside the network
// namespace at nspath by writing '0' (enable) or '1' (disable) to
// /proc/sys/net/ipv6/conf/<iface>/disable_ipv6.
//
// The work is done on a separate goroutine that locks itself to its OS
// thread, switches that thread into the target namespace, and switches back
// afterwards. setIPv6 blocks until that goroutine is done and returns the
// error it reported, if any. If the sysctl file does not exist, a warning is
// logged and nil is returned.
//
// NOTE(review): the log and error messages mention "IPv6 forwarding"
// although the file written is disable_ipv6.
func setIPv6(nspath, iface string, enable bool) error {
	// Buffered so a single error can be sent without blocking; closing the
	// channel on exit makes the receive below yield nil on success.
	errCh := make(chan error, 1)
	go func() {
		defer close(errCh)

		namespace, err := netns.GetFromPath(nspath)
		if err != nil {
			errCh <- fmt.Errorf("failed get network namespace %q: %w", nspath, err)
			return
		}
		defer namespace.Close()

		runtime.LockOSThread()

		origNS, err := netns.Get()
		if err != nil {
			runtime.UnlockOSThread()
			errCh <- fmt.Errorf("failed to get current network namespace: %w", err)
			return
		}
		defer origNS.Close()

		if err = netns.Set(namespace); err != nil {
			runtime.UnlockOSThread()
			errCh <- fmt.Errorf("setting into container netns %q failed: %w", nspath, err)
			return
		}
		// Restore the thread's original namespace on the way out.
		defer func() {
			if err := netns.Set(origNS); err != nil {
				logrus.WithError(err).Error("libnetwork: restoring thread network namespace failed")
				// The error is only fatal for the current thread. Keep this
				// goroutine locked to the thread to make the runtime replace it
				// with a clean thread once this goroutine returns.
			} else {
				runtime.UnlockOSThread()
			}
		}()

		var (
			action = "disable"
			value  = byte('1')
			path   = fmt.Sprintf("/proc/sys/net/ipv6/conf/%s/disable_ipv6", iface)
		)

		if enable {
			action = "enable"
			value = '0'
		}

		if _, err := os.Stat(path); err != nil {
			if os.IsNotExist(err) {
				// A missing sysctl file is not treated as an error: nothing
				// is sent on errCh, so the caller receives nil.
				logrus.WithError(err).Warn("Cannot configure IPv6 forwarding on container interface. Has IPv6 been disabled in this node's kernel?")
				return
			}
			errCh <- err
			return
		}

		if err = os.WriteFile(path, []byte{value, '\n'}, 0o644); err != nil {
			errCh <- fmt.Errorf("failed to %s IPv6 forwarding for container's interface %s: %w", action, iface, err)
			return
		}
	}()
	return <-errCh
}
   661  
   662  // ApplyOSTweaks applies linux configs on the sandbox
   663  func (n *networkNamespace) ApplyOSTweaks(types []SandboxType) {
   664  	for _, t := range types {
   665  		switch t {
   666  		case SandboxTypeLoadBalancer, SandboxTypeIngress:
   667  			kernel.ApplyOSTweaks(map[string]*kernel.OSValue{
   668  				// disables any special handling on port reuse of existing IPVS connection table entries
   669  				// more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L32
   670  				"net.ipv4.vs.conn_reuse_mode": {Value: "0", CheckFn: nil},
   671  				// expires connection from the IPVS connection table when the backend is not available
   672  				// more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L133
   673  				"net.ipv4.vs.expire_nodest_conn": {Value: "1", CheckFn: nil},
   674  				// expires persistent connections to destination servers with weights set to 0
   675  				// more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L151
   676  				"net.ipv4.vs.expire_quiescent_template": {Value: "1", CheckFn: nil},
   677  			})
   678  		}
   679  	}
   680  }