github.com/tonistiigi/docker@v0.10.1-0.20240229224939-974013b0dc6a/libnetwork/osl/namespace_linux.go (about)

     1  package osl
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"net"
     8  	"os"
     9  	"path/filepath"
    10  	"runtime"
    11  	"strconv"
    12  	"strings"
    13  	"sync"
    14  	"syscall"
    15  	"time"
    16  
    17  	"github.com/containerd/log"
    18  	"github.com/docker/docker/internal/unshare"
    19  	"github.com/docker/docker/libnetwork/ns"
    20  	"github.com/docker/docker/libnetwork/osl/kernel"
    21  	"github.com/docker/docker/libnetwork/types"
    22  	"github.com/vishvananda/netlink"
    23  	"github.com/vishvananda/netlink/nl"
    24  	"github.com/vishvananda/netns"
    25  	"golang.org/x/sys/unix"
    26  )
    27  
    28  const defaultPrefix = "/var/run/docker"
    29  
    30  func init() {
    31  	// Lock main() to the initial thread to exclude the goroutines spawned
    32  	// by func (*Namespace) InvokeFunc() or func setIPv6() below from
    33  	// being scheduled onto that thread. Changes to the network namespace of
    34  	// the initial thread alter /proc/self/ns/net, which would break any
    35  	// code which (incorrectly) assumes that that file is the network
    36  	// namespace for the thread it is currently executing on.
    37  	runtime.LockOSThread()
    38  }
    39  
    40  var (
    41  	once             sync.Once
    42  	garbagePathMap   = make(map[string]bool)
    43  	gpmLock          sync.Mutex
    44  	gpmWg            sync.WaitGroup
    45  	gpmCleanupPeriod = 60 * time.Second
    46  	gpmChan          = make(chan chan struct{})
    47  	netnsBasePath    = filepath.Join(defaultPrefix, "netns")
    48  )
    49  
    50  // SetBasePath sets the base url prefix for the ns path
    51  func SetBasePath(path string) {
    52  	netnsBasePath = filepath.Join(path, "netns")
    53  }
    54  
    55  func basePath() string {
    56  	return netnsBasePath
    57  }
    58  
    59  func createBasePath() {
    60  	err := os.MkdirAll(basePath(), 0o755)
    61  	if err != nil {
    62  		panic("Could not create net namespace path directory")
    63  	}
    64  
    65  	// Start the garbage collection go routine
    66  	go removeUnusedPaths()
    67  }
    68  
    69  func removeUnusedPaths() {
    70  	gpmLock.Lock()
    71  	period := gpmCleanupPeriod
    72  	gpmLock.Unlock()
    73  
    74  	ticker := time.NewTicker(period)
    75  	for {
    76  		var (
    77  			gc   chan struct{}
    78  			gcOk bool
    79  		)
    80  
    81  		select {
    82  		case <-ticker.C:
    83  		case gc, gcOk = <-gpmChan:
    84  		}
    85  
    86  		gpmLock.Lock()
    87  		pathList := make([]string, 0, len(garbagePathMap))
    88  		for path := range garbagePathMap {
    89  			pathList = append(pathList, path)
    90  		}
    91  		garbagePathMap = make(map[string]bool)
    92  		gpmWg.Add(1)
    93  		gpmLock.Unlock()
    94  
    95  		for _, path := range pathList {
    96  			os.Remove(path)
    97  		}
    98  
    99  		gpmWg.Done()
   100  		if gcOk {
   101  			close(gc)
   102  		}
   103  	}
   104  }
   105  
   106  func addToGarbagePaths(path string) {
   107  	gpmLock.Lock()
   108  	garbagePathMap[path] = true
   109  	gpmLock.Unlock()
   110  }
   111  
   112  func removeFromGarbagePaths(path string) {
   113  	gpmLock.Lock()
   114  	delete(garbagePathMap, path)
   115  	gpmLock.Unlock()
   116  }
   117  
   118  // GC triggers garbage collection of namespace path right away
   119  // and waits for it.
   120  func GC() {
   121  	gpmLock.Lock()
   122  	if len(garbagePathMap) == 0 {
   123  		// No need for GC if map is empty
   124  		gpmLock.Unlock()
   125  		return
   126  	}
   127  	gpmLock.Unlock()
   128  
   129  	// if content exists in the garbage paths
   130  	// we can trigger GC to run, providing a
   131  	// channel to be notified on completion
   132  	waitGC := make(chan struct{})
   133  	gpmChan <- waitGC
   134  	// wait for GC completion
   135  	<-waitGC
   136  }
   137  
   138  // GenerateKey generates a sandbox key based on the passed
   139  // container id.
   140  func GenerateKey(containerID string) string {
   141  	maxLen := 12
   142  	// Read sandbox key from host for overlay
   143  	if strings.HasPrefix(containerID, "-") {
   144  		var (
   145  			index    int
   146  			indexStr string
   147  			tmpkey   string
   148  		)
   149  		dir, err := os.ReadDir(basePath())
   150  		if err != nil {
   151  			return ""
   152  		}
   153  
   154  		for _, v := range dir {
   155  			id := v.Name()
   156  			if strings.HasSuffix(id, containerID[:maxLen-1]) {
   157  				indexStr = strings.TrimSuffix(id, containerID[:maxLen-1])
   158  				tmpindex, err := strconv.Atoi(indexStr)
   159  				if err != nil {
   160  					return ""
   161  				}
   162  				if tmpindex > index {
   163  					index = tmpindex
   164  					tmpkey = id
   165  				}
   166  			}
   167  		}
   168  		containerID = tmpkey
   169  		if containerID == "" {
   170  			return ""
   171  		}
   172  	}
   173  
   174  	if len(containerID) < maxLen {
   175  		maxLen = len(containerID)
   176  	}
   177  
   178  	return basePath() + "/" + containerID[:maxLen]
   179  }
   180  
   181  // NewSandbox provides a new Namespace instance created in an os specific way
   182  // provided a key which uniquely identifies the sandbox.
   183  func NewSandbox(key string, osCreate, isRestore bool) (*Namespace, error) {
   184  	if !isRestore {
   185  		err := createNetworkNamespace(key, osCreate)
   186  		if err != nil {
   187  			return nil, err
   188  		}
   189  	} else {
   190  		once.Do(createBasePath)
   191  	}
   192  
   193  	n := &Namespace{path: key, isDefault: !osCreate, nextIfIndex: make(map[string]int)}
   194  
   195  	sboxNs, err := netns.GetFromPath(n.path)
   196  	if err != nil {
   197  		return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err)
   198  	}
   199  	defer sboxNs.Close()
   200  
   201  	n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE)
   202  	if err != nil {
   203  		return nil, fmt.Errorf("failed to create a netlink handle: %v", err)
   204  	}
   205  
   206  	err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout)
   207  	if err != nil {
   208  		log.G(context.TODO()).Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err)
   209  	}
   210  
   211  	if err = n.loopbackUp(); err != nil {
   212  		n.nlHandle.Close()
   213  		return nil, err
   214  	}
   215  
   216  	return n, nil
   217  }
   218  
   219  func mountNetworkNamespace(basePath string, lnPath string) error {
   220  	err := syscall.Mount(basePath, lnPath, "bind", syscall.MS_BIND, "")
   221  	if err != nil {
   222  		return fmt.Errorf("bind-mount %s -> %s: %w", basePath, lnPath, err)
   223  	}
   224  	return nil
   225  }
   226  
   227  // GetSandboxForExternalKey returns sandbox object for the supplied path
   228  func GetSandboxForExternalKey(basePath string, key string) (*Namespace, error) {
   229  	if err := createNamespaceFile(key); err != nil {
   230  		return nil, err
   231  	}
   232  
   233  	if err := mountNetworkNamespace(basePath, key); err != nil {
   234  		return nil, err
   235  	}
   236  	n := &Namespace{path: key, nextIfIndex: make(map[string]int)}
   237  
   238  	sboxNs, err := netns.GetFromPath(n.path)
   239  	if err != nil {
   240  		return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err)
   241  	}
   242  	defer sboxNs.Close()
   243  
   244  	n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE)
   245  	if err != nil {
   246  		return nil, fmt.Errorf("failed to create a netlink handle: %v", err)
   247  	}
   248  
   249  	err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout)
   250  	if err != nil {
   251  		log.G(context.TODO()).Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err)
   252  	}
   253  
   254  	if err = n.loopbackUp(); err != nil {
   255  		n.nlHandle.Close()
   256  		return nil, err
   257  	}
   258  
   259  	return n, nil
   260  }
   261  
   262  func createNetworkNamespace(path string, osCreate bool) error {
   263  	if err := createNamespaceFile(path); err != nil {
   264  		return err
   265  	}
   266  
   267  	do := func() error {
   268  		return mountNetworkNamespace(fmt.Sprintf("/proc/self/task/%d/ns/net", unix.Gettid()), path)
   269  	}
   270  	if osCreate {
   271  		return unshare.Go(unix.CLONE_NEWNET, do, nil)
   272  	}
   273  	return do()
   274  }
   275  
   276  func unmountNamespaceFile(path string) {
   277  	if _, err := os.Stat(path); err != nil {
   278  		// ignore when we cannot stat the path
   279  		return
   280  	}
   281  	if err := syscall.Unmount(path, syscall.MNT_DETACH); err != nil && !errors.Is(err, unix.EINVAL) {
   282  		log.G(context.TODO()).WithError(err).Error("Error unmounting namespace file")
   283  	}
   284  }
   285  
   286  func createNamespaceFile(path string) error {
   287  	once.Do(createBasePath)
   288  	// Remove it from garbage collection list if present
   289  	removeFromGarbagePaths(path)
   290  
   291  	// If the path is there unmount it first
   292  	unmountNamespaceFile(path)
   293  
   294  	// wait for garbage collection to complete if it is in progress
   295  	// before trying to create the file.
   296  	//
   297  	// TODO(aker): This garbage-collection was for a kernel bug in kernels 3.18-4.0.1: is this still needed on current kernels (and on kernel 3.10)? see https://github.com/moby/moby/pull/46315/commits/c0a6beba8e61d4019e1806d5241ba22007072ca2#r1331327103
   298  	gpmWg.Wait()
   299  
   300  	f, err := os.Create(path)
   301  	if err != nil {
   302  		return err
   303  	}
   304  	_ = f.Close()
   305  	return nil
   306  }
   307  
   308  // Namespace represents a network sandbox. It represents a Linux network
   309  // namespace, and moves an interface into it when called on method AddInterface
   310  // or sets the gateway etc. It holds a list of Interfaces, routes etc., and more
   311  // can be added dynamically.
   312  type Namespace struct {
   313  	path                string
   314  	iFaces              []*Interface
   315  	gw                  net.IP
   316  	gwv6                net.IP
   317  	staticRoutes        []*types.StaticRoute
   318  	neighbors           []*neigh
   319  	nextIfIndex         map[string]int
   320  	isDefault           bool
   321  	ipv6LoEnabledOnce   sync.Once
   322  	ipv6LoEnabledCached bool
   323  	nlHandle            *netlink.Handle
   324  	mu                  sync.Mutex
   325  }
   326  
   327  // Interfaces returns the collection of Interface previously added with the AddInterface
   328  // method. Note that this doesn't include network interfaces added in any
   329  // other way (such as the default loopback interface which is automatically
   330  // created on creation of a sandbox).
   331  func (n *Namespace) Interfaces() []*Interface {
   332  	ifaces := make([]*Interface, len(n.iFaces))
   333  	copy(ifaces, n.iFaces)
   334  	return ifaces
   335  }
   336  
   337  func (n *Namespace) loopbackUp() error {
   338  	iface, err := n.nlHandle.LinkByName("lo")
   339  	if err != nil {
   340  		return err
   341  	}
   342  	return n.nlHandle.LinkSetUp(iface)
   343  }
   344  
   345  // GetLoopbackIfaceName returns the name of the loopback interface
   346  func (n *Namespace) GetLoopbackIfaceName() string {
   347  	return "lo"
   348  }
   349  
   350  // AddAliasIP adds the passed IP address to the named interface
   351  func (n *Namespace) AddAliasIP(ifName string, ip *net.IPNet) error {
   352  	iface, err := n.nlHandle.LinkByName(ifName)
   353  	if err != nil {
   354  		return err
   355  	}
   356  	return n.nlHandle.AddrAdd(iface, &netlink.Addr{IPNet: ip})
   357  }
   358  
   359  // RemoveAliasIP removes the passed IP address from the named interface
   360  func (n *Namespace) RemoveAliasIP(ifName string, ip *net.IPNet) error {
   361  	iface, err := n.nlHandle.LinkByName(ifName)
   362  	if err != nil {
   363  		return err
   364  	}
   365  	return n.nlHandle.AddrDel(iface, &netlink.Addr{IPNet: ip})
   366  }
   367  
   368  // DisableARPForVIP disables ARP replies and requests for VIP addresses
   369  // on a particular interface.
   370  func (n *Namespace) DisableARPForVIP(srcName string) (Err error) {
   371  	dstName := ""
   372  	for _, i := range n.Interfaces() {
   373  		if i.SrcName() == srcName {
   374  			dstName = i.DstName()
   375  			break
   376  		}
   377  	}
   378  	if dstName == "" {
   379  		return fmt.Errorf("failed to find interface %s in sandbox", srcName)
   380  	}
   381  
   382  	err := n.InvokeFunc(func() {
   383  		path := filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_ignore")
   384  		if err := os.WriteFile(path, []byte{'1', '\n'}, 0o644); err != nil {
   385  			Err = fmt.Errorf("Failed to set %s to 1: %v", path, err)
   386  			return
   387  		}
   388  		path = filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_announce")
   389  		if err := os.WriteFile(path, []byte{'2', '\n'}, 0o644); err != nil {
   390  			Err = fmt.Errorf("Failed to set %s to 2: %v", path, err)
   391  			return
   392  		}
   393  	})
   394  	if err != nil {
   395  		return err
   396  	}
   397  	return
   398  }
   399  
   400  // InvokeFunc invoke a function in the network namespace.
   401  func (n *Namespace) InvokeFunc(f func()) error {
   402  	path := n.nsPath()
   403  	newNS, err := netns.GetFromPath(path)
   404  	if err != nil {
   405  		return fmt.Errorf("failed get network namespace %q: %w", path, err)
   406  	}
   407  	defer newNS.Close()
   408  
   409  	done := make(chan error, 1)
   410  	go func() {
   411  		runtime.LockOSThread()
   412  		// InvokeFunc() could have been called from a goroutine with
   413  		// tampered thread state, e.g. from another InvokeFunc()
   414  		// callback. The outer goroutine's thread state cannot be
   415  		// trusted.
   416  		origNS, err := netns.Get()
   417  		if err != nil {
   418  			runtime.UnlockOSThread()
   419  			done <- fmt.Errorf("failed to get original network namespace: %w", err)
   420  			return
   421  		}
   422  		defer origNS.Close()
   423  
   424  		if err := netns.Set(newNS); err != nil {
   425  			runtime.UnlockOSThread()
   426  			done <- err
   427  			return
   428  		}
   429  		defer func() {
   430  			close(done)
   431  			if err := netns.Set(origNS); err != nil {
   432  				log.G(context.TODO()).WithError(err).Warn("failed to restore thread's network namespace")
   433  				// Recover from the error by leaving this goroutine locked to
   434  				// the thread. The runtime will terminate the thread and replace
   435  				// it with a clean one when this goroutine returns.
   436  			} else {
   437  				runtime.UnlockOSThread()
   438  			}
   439  		}()
   440  		f()
   441  	}()
   442  	return <-done
   443  }
   444  
   445  func (n *Namespace) nsPath() string {
   446  	n.mu.Lock()
   447  	defer n.mu.Unlock()
   448  
   449  	return n.path
   450  }
   451  
   452  // Key returns the path where the network namespace is mounted.
   453  func (n *Namespace) Key() string {
   454  	return n.path
   455  }
   456  
   457  // Destroy destroys the sandbox.
   458  func (n *Namespace) Destroy() error {
   459  	if n.nlHandle != nil {
   460  		n.nlHandle.Close()
   461  	}
   462  	// Assuming no running process is executing in this network namespace,
   463  	// unmounting is sufficient to destroy it.
   464  	if err := syscall.Unmount(n.path, syscall.MNT_DETACH); err != nil {
   465  		return err
   466  	}
   467  
   468  	// Stash it into the garbage collection list
   469  	addToGarbagePaths(n.path)
   470  	return nil
   471  }
   472  
   473  // Restore restores the network namespace.
   474  func (n *Namespace) Restore(interfaces map[Iface][]IfaceOption, routes []*types.StaticRoute, gw net.IP, gw6 net.IP) error {
   475  	// restore interfaces
   476  	for iface, opts := range interfaces {
   477  		i, err := newInterface(n, iface.SrcName, iface.DstPrefix, opts...)
   478  		if err != nil {
   479  			return err
   480  		}
   481  		if n.isDefault {
   482  			i.dstName = i.srcName
   483  		} else {
   484  			links, err := n.nlHandle.LinkList()
   485  			if err != nil {
   486  				return fmt.Errorf("failed to retrieve list of links in network namespace %q during restore", n.path)
   487  			}
   488  			// due to the docker network connect/disconnect, so the dstName should
   489  			// restore from the namespace
   490  			for _, link := range links {
   491  				ifaceName := link.Attrs().Name
   492  				if i.dstName == "vxlan" && strings.HasPrefix(ifaceName, "vxlan") {
   493  					i.dstName = ifaceName
   494  					break
   495  				}
   496  				// find the interface name by ip
   497  				if i.address != nil {
   498  					addresses, err := n.nlHandle.AddrList(link, netlink.FAMILY_V4)
   499  					if err != nil {
   500  						return err
   501  					}
   502  					for _, addr := range addresses {
   503  						if addr.IPNet.String() == i.address.String() {
   504  							i.dstName = ifaceName
   505  							break
   506  						}
   507  					}
   508  					if i.dstName == ifaceName {
   509  						break
   510  					}
   511  				}
   512  				// This is to find the interface name of the pair in overlay sandbox
   513  				if i.master != "" && i.dstName == "veth" && strings.HasPrefix(ifaceName, "veth") {
   514  					i.dstName = ifaceName
   515  				}
   516  			}
   517  
   518  			var index int
   519  			if idx := strings.TrimPrefix(i.dstName, iface.DstPrefix); idx != "" {
   520  				index, err = strconv.Atoi(idx)
   521  				if err != nil {
   522  					return fmt.Errorf("failed to restore interface in network namespace %q: invalid dstName for interface: %s: %v", n.path, i.dstName, err)
   523  				}
   524  			}
   525  			index++
   526  			n.mu.Lock()
   527  			if index > n.nextIfIndex[iface.DstPrefix] {
   528  				n.nextIfIndex[iface.DstPrefix] = index
   529  			}
   530  			n.iFaces = append(n.iFaces, i)
   531  			n.mu.Unlock()
   532  		}
   533  	}
   534  
   535  	// restore routes and gateways
   536  	n.mu.Lock()
   537  	n.staticRoutes = append(n.staticRoutes, routes...)
   538  	if len(gw) > 0 {
   539  		n.gw = gw
   540  	}
   541  	if len(gw6) > 0 {
   542  		n.gwv6 = gw6
   543  	}
   544  	n.mu.Unlock()
   545  	return nil
   546  }
   547  
   548  // IPv6LoEnabled checks whether the loopback interface has an IPv6 address ('::1'
   549  // is assigned by the kernel if IPv6 is enabled).
   550  func (n *Namespace) IPv6LoEnabled() bool {
   551  	n.ipv6LoEnabledOnce.Do(func() {
   552  		// If anything goes wrong, assume no-IPv6.
   553  		iface, err := n.nlHandle.LinkByName("lo")
   554  		if err != nil {
   555  			log.G(context.TODO()).WithError(err).Warn("Unable to find 'lo' to determine IPv6 support")
   556  			return
   557  		}
   558  		addrs, err := n.nlHandle.AddrList(iface, nl.FAMILY_V6)
   559  		if err != nil {
   560  			log.G(context.TODO()).WithError(err).Warn("Unable to get 'lo' addresses to determine IPv6 support")
   561  			return
   562  		}
   563  		n.ipv6LoEnabledCached = len(addrs) > 0
   564  	})
   565  	return n.ipv6LoEnabledCached
   566  }
   567  
   568  // ApplyOSTweaks applies operating system specific knobs on the sandbox.
   569  func (n *Namespace) ApplyOSTweaks(types []SandboxType) {
   570  	for _, t := range types {
   571  		switch t {
   572  		case SandboxTypeLoadBalancer, SandboxTypeIngress:
   573  			kernel.ApplyOSTweaks(map[string]*kernel.OSValue{
   574  				// disables any special handling on port reuse of existing IPVS connection table entries
   575  				// more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L32
   576  				"net.ipv4.vs.conn_reuse_mode": {Value: "0", CheckFn: nil},
   577  				// expires connection from the IPVS connection table when the backend is not available
   578  				// more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L133
   579  				"net.ipv4.vs.expire_nodest_conn": {Value: "1", CheckFn: nil},
   580  				// expires persistent connections to destination servers with weights set to 0
   581  				// more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L151
   582  				"net.ipv4.vs.expire_quiescent_template": {Value: "1", CheckFn: nil},
   583  			})
   584  		}
   585  	}
   586  }
   587  
   588  func setIPv6(nspath, iface string, enable bool) error {
   589  	errCh := make(chan error, 1)
   590  	go func() {
   591  		defer close(errCh)
   592  
   593  		namespace, err := netns.GetFromPath(nspath)
   594  		if err != nil {
   595  			errCh <- fmt.Errorf("failed get network namespace %q: %w", nspath, err)
   596  			return
   597  		}
   598  		defer namespace.Close()
   599  
   600  		runtime.LockOSThread()
   601  
   602  		origNS, err := netns.Get()
   603  		if err != nil {
   604  			runtime.UnlockOSThread()
   605  			errCh <- fmt.Errorf("failed to get current network namespace: %w", err)
   606  			return
   607  		}
   608  		defer origNS.Close()
   609  
   610  		if err = netns.Set(namespace); err != nil {
   611  			runtime.UnlockOSThread()
   612  			errCh <- fmt.Errorf("setting into container netns %q failed: %w", nspath, err)
   613  			return
   614  		}
   615  		defer func() {
   616  			if err := netns.Set(origNS); err != nil {
   617  				log.G(context.TODO()).WithError(err).Error("libnetwork: restoring thread network namespace failed")
   618  				// The error is only fatal for the current thread. Keep this
   619  				// goroutine locked to the thread to make the runtime replace it
   620  				// with a clean thread once this goroutine returns.
   621  			} else {
   622  				runtime.UnlockOSThread()
   623  			}
   624  		}()
   625  
   626  		var (
   627  			action = "disable"
   628  			value  = byte('1')
   629  			path   = fmt.Sprintf("/proc/sys/net/ipv6/conf/%s/disable_ipv6", iface)
   630  		)
   631  
   632  		if enable {
   633  			action = "enable"
   634  			value = '0'
   635  		}
   636  
   637  		if _, err := os.Stat(path); err != nil {
   638  			if os.IsNotExist(err) {
   639  				log.G(context.TODO()).WithError(err).Warn("Cannot configure IPv6 forwarding on container interface. Has IPv6 been disabled in this node's kernel?")
   640  				return
   641  			}
   642  			errCh <- err
   643  			return
   644  		}
   645  
   646  		if err = os.WriteFile(path, []byte{value, '\n'}, 0o644); err != nil {
   647  			errCh <- fmt.Errorf("failed to %s IPv6 forwarding for container's interface %s: %w", action, iface, err)
   648  			return
   649  		}
   650  	}()
   651  	return <-errCh
   652  }