github.com/moby/docker@v26.1.3+incompatible/internal/unshare/unshare_linux.go (about)

     1  //go:build go1.10
     2  
     3  package unshare // import "github.com/docker/docker/internal/unshare"
     4  
     5  import (
     6  	"fmt"
     7  	"os"
     8  	"runtime"
     9  
    10  	"golang.org/x/sys/unix"
    11  )
    12  
// init locks the goroutine that runs package initialization (and therefore
// main) to the process's startup thread, so that no other goroutine can be
// scheduled onto that thread.
func init() {
	// The startup thread of a process is special in a few different ways.
	// Most pertinent to the discussion at hand, any per-thread kernel state
	// reflected in the /proc/[pid]/ directory for a process is taken from
	// the state of the startup thread. Same goes for /proc/self/; it shows
	// the state of the current process's startup thread, no matter which
	// thread the files are being opened from. For most programs this is a
	// distinction without a difference as the kernel state, such as the
	// mount namespace and current working directory, is shared among (and
	// kept synchronized across) all threads of a process. But things start
	// to break down once threads start unsharing and modifying parts of
	// their kernel state.
	//
	// The Go runtime schedules goroutines to execute on the startup thread,
	// same as any other. How this could be problematic is best illustrated
	// with a concrete example. Consider what happens if a call to
	// Go(unix.CLONE_NEWNS, ...) spawned a goroutine which gets scheduled
	// onto the startup thread. The thread's mount namespace will be
	// unshared and modified. The contents of the /proc/[pid]/mountinfo file
	// will then describe the mount tree of the unshared namespace, not the
	// namespace of any other thread. It will remain this way until the
	// process exits. (The startup thread is special in another way: exiting
	// it puts the process into a "non-waitable zombie" state. To avoid this
	// fate, the Go runtime parks the thread instead of exiting if a
	// goroutine returns while locked to the startup thread. More
	// information can be found in the Go runtime sources:
	// `go doc -u -src runtime.mexit`.) The github.com/moby/sys/mountinfo
	// package reads from /proc/self/mountinfo, so will read the mount tree
	// for the wrong namespace if the startup thread has had its mount
	// namespace unshared! The /proc/thread-self/ directory, introduced in
	// Linux 3.17, is one potential solution to this problem, but every
	// package which opens files in /proc/self/ would need to be updated,
	// and fallbacks to /proc/self/task/[tid]/ would be required to support
	// older kernels. Overlooking any reference to /proc/self/ would
	// manifest as stochastically-reproducible bugs, so this is far from an
	// ideal solution.
	//
	// Reading from /proc/self/ would not be a problem if we could prevent
	// the per-thread state of the startup thread from being modified
	// nondeterministically in the first place. We can accomplish this
	// simply by locking the main() function to the startup thread! Doing so
	// excludes any other goroutine from being scheduled on the thread.
	runtime.LockOSThread()
}
    57  
// reversibleSetnsFlags maps the unshare(2) flags whose effects can be fully
// reversed using setns(2). The values are the basenames of the corresponding
// /proc/self/task/[tid]/ns/ magic symlinks to use to save and restore the
// state.
//
// Go consults this map twice: a flag set is treated as reversible only when
// every bit in it is a key of this map, and the values name the namespace
// files that are opened before unsharing so the thread can be switched back
// afterwards.
var reversibleSetnsFlags = map[int]string{
	unix.CLONE_NEWCGROUP: "cgroup",
	unix.CLONE_NEWNET:    "net",
	unix.CLONE_NEWUTS:    "uts",
	unix.CLONE_NEWPID:    "pid",
	unix.CLONE_NEWTIME:   "time",

	// The following CLONE_NEW* flags are deliberately left out because
	// using them with unshare(2) implies another flag whose effect cannot
	// be reversed.
	//  - unix.CLONE_NEWIPC:  implies CLONE_SYSVMEM
	//  - unix.CLONE_NEWNS:   implies CLONE_FS
	//  - unix.CLONE_NEWUSER: implies CLONE_FS since Linux 3.9
}
    75  
    76  // Go calls the given functions in a new goroutine, locked to an OS thread,
    77  // which has had the parts of its execution state disassociated from the rest of
    78  // the current process using [unshare(2)]. It blocks until the new goroutine has
    79  // started and setupfn has returned. fn is only called if setupfn returns nil. A
    80  // nil setupfn or fn is equivalent to passing a no-op function.
    81  //
    82  // The disassociated execution state and any changes made to it are only visible
    83  // to the goroutine which the functions are called in. Any other goroutines,
    84  // including ones started from the function, will see the same execution state
    85  // as the rest of the process.
    86  //
    87  // The acceptable flags are documented in the [unshare(2)] Linux man-page.
    88  // The corresponding CLONE_* constants are defined in package [unix].
    89  //
    90  // # Warning
    91  //
    92  // This function may terminate the thread which the new goroutine executed on
    93  // after fn returns, which could cause subprocesses started with the
    94  // [syscall.SysProcAttr] Pdeathsig field set to be signaled before process
    95  // termination. Any subprocess started before this function is called may be
    96  // affected, in addition to any subprocesses started inside setupfn or fn.
    97  // There are more details at https://go.dev/issue/27505.
    98  //
    99  // [unshare(2)]: https://man7.org/linux/man-pages/man2/unshare.2.html
   100  func Go(flags int, setupfn func() error, fn func()) error {
   101  	started := make(chan error)
   102  
   103  	maskedFlags := flags
   104  	for f := range reversibleSetnsFlags {
   105  		maskedFlags &^= f
   106  	}
   107  	isReversible := maskedFlags == 0
   108  
   109  	go func() {
   110  		// Prepare to manipulate per-thread kernel state.
   111  		runtime.LockOSThread()
   112  
   113  		// Not all changes to the execution state can be reverted.
   114  		// If an irreversible change to the execution state is made, our
   115  		// only recourse is to have the tampered thread terminated by
   116  		// returning from this function while the goroutine remains
   117  		// wired to the thread. The Go runtime will terminate the thread
   118  		// and replace it with a fresh one as needed.
   119  
   120  		if isReversible {
   121  			defer func() {
   122  				if isReversible {
   123  					// All execution state has been restored without error.
   124  					// The thread is once again fungible.
   125  					runtime.UnlockOSThread()
   126  				}
   127  			}()
   128  			tid := unix.Gettid()
   129  			for f, ns := range reversibleSetnsFlags {
   130  				if flags&f != f {
   131  					continue
   132  				}
   133  				// The /proc/thread-self directory was added in Linux 3.17.
   134  				// We are not using it to maximize compatibility.
   135  				pth := fmt.Sprintf("/proc/self/task/%d/ns/%s", tid, ns)
   136  				fd, err := unix.Open(pth, unix.O_RDONLY|unix.O_CLOEXEC, 0)
   137  				if err != nil {
   138  					started <- &os.PathError{Op: "open", Path: pth, Err: err}
   139  					return
   140  				}
   141  				defer func() {
   142  					if isReversible {
   143  						if err := unix.Setns(fd, 0); err != nil {
   144  							isReversible = false
   145  						}
   146  					}
   147  					_ = unix.Close(fd)
   148  				}()
   149  			}
   150  		}
   151  
   152  		// Threads are implemented under Linux as processes which share
   153  		// a virtual memory space. Therefore in a multithreaded process
   154  		// unshare(2) disassociates parts of the calling thread's
   155  		// context from the thread it was clone(2)'d from.
   156  		if err := unix.Unshare(flags); err != nil {
   157  			started <- os.NewSyscallError("unshare", err)
   158  			return
   159  		}
   160  
   161  		if setupfn != nil {
   162  			if err := setupfn(); err != nil {
   163  				started <- err
   164  				return
   165  			}
   166  		}
   167  		close(started)
   168  
   169  		if fn != nil {
   170  			fn()
   171  		}
   172  	}()
   173  
   174  	return <-started
   175  }