github.com/rawahars/moby@v24.0.4+incompatible/internal/unshare/unshare_linux.go (about)

     1  //go:build go1.10
     2  // +build go1.10
     3  
     4  package unshare // import "github.com/docker/docker/internal/unshare"
     5  
     6  import (
     7  	"fmt"
     8  	"os"
     9  	"runtime"
    10  
    11  	"golang.org/x/sys/unix"
    12  )
    13  
// init wires the main goroutine to the process's startup thread by calling
// runtime.LockOSThread, so that no other goroutine (in particular, none of
// the goroutines spawned by Go) is ever scheduled onto that thread. The
// comment in the body explains why that matters.
func init() {
	// The startup thread of a process is special in a few different ways.
	// Most pertinent to the discussion at hand, any per-thread kernel state
	// reflected in the /proc/[pid]/ directory for a process is taken from
	// the state of the startup thread. Same goes for /proc/self/; it shows
	// the state of the current process's startup thread, no matter which
	// thread the files are being opened from. For most programs this is a
	// distinction without a difference as the kernel state, such as the
	// mount namespace and current working directory, is shared among (and
	// kept synchronized across) all threads of a process. But things start
	// to break down once threads start unsharing and modifying parts of
	// their kernel state.
	//
	// The Go runtime schedules goroutines to execute on the startup thread,
	// same as any other. How this could be problematic is best illustrated
	// with a concrete example. Consider what happens if a call to
	// Go(unix.CLONE_NEWNS, ...) spawned a goroutine which gets scheduled
	// onto the startup thread. The thread's mount namespace will be
	// unshared and modified. The contents of the /proc/[pid]/mountinfo file
	// will then describe the mount tree of the unshared namespace, not the
	// namespace of any other thread. It will remain this way until the
	// process exits. (The startup thread is special in another way: exiting
	// it puts the process into a "non-waitable zombie" state. To avoid this
	// fate, the Go runtime parks the thread instead of exiting if a
	// goroutine returns while locked to the startup thread. More
	// information can be found in the Go runtime sources:
	// `go doc -u -src runtime.mexit`.) The github.com/moby/sys/mountinfo
	// package reads from /proc/self/mountinfo, so it will read the mount
	// tree for the wrong namespace if the startup thread has had its mount
	// namespace unshared! The /proc/thread-self/ directory, introduced in
	// Linux 3.17, is one potential solution to this problem, but every
	// package which opens files in /proc/self/ would need to be updated,
	// and fallbacks to /proc/self/task/[tid]/ would be required to support
	// older kernels. Overlooking any reference to /proc/self/ would
	// manifest as stochastically-reproducible bugs, so this is far from an
	// ideal solution.
	//
	// Reading from /proc/self/ would not be a problem if we could prevent
	// the per-thread state of the startup thread from being modified
	// nondeterministically in the first place. We can accomplish this
	// simply by locking the main() function to the startup thread! Doing so
	// excludes any other goroutine from being scheduled on the thread.
	runtime.LockOSThread()
}
    58  
    59  // reversibleSetnsFlags maps the unshare(2) flags whose effects can be fully
    60  // reversed using setns(2). The values are the basenames of the corresponding
    61  // /proc/self/task/[tid]/ns/ magic symlinks to use to save and restore the
    62  // state.
    63  var reversibleSetnsFlags = map[int]string{
    64  	unix.CLONE_NEWCGROUP: "cgroup",
    65  	unix.CLONE_NEWNET:    "net",
    66  	unix.CLONE_NEWUTS:    "uts",
    67  	unix.CLONE_NEWPID:    "pid",
    68  	unix.CLONE_NEWTIME:   "time",
    69  
    70  	// The following CLONE_NEW* flags are not included because they imply
    71  	// another, irreversible flag when used with unshare(2).
    72  	//  - unix.CLONE_NEWIPC:  implies CLONE_SYSVMEM
    73  	//  - unix.CLONE_NEWNS:   implies CLONE_FS
    74  	//  - unix.CLONE_NEWUSER: implies CLONE_FS since Linux 3.9
    75  }
    76  
    77  // Go calls the given functions in a new goroutine, locked to an OS thread,
    78  // which has had the parts of its execution state disassociated from the rest of
    79  // the current process using [unshare(2)]. It blocks until the new goroutine has
    80  // started and setupfn has returned. fn is only called if setupfn returns nil. A
    81  // nil setupfn or fn is equivalent to passing a no-op function.
    82  //
    83  // The disassociated execution state and any changes made to it are only visible
    84  // to the goroutine which the functions are called in. Any other goroutines,
    85  // including ones started from the function, will see the same execution state
    86  // as the rest of the process.
    87  //
    88  // The acceptable flags are documented in the [unshare(2)] Linux man-page.
    89  // The corresponding CLONE_* constants are defined in package [unix].
    90  //
    91  // # Warning
    92  //
    93  // This function may terminate the thread which the new goroutine executed on
    94  // after fn returns, which could cause subprocesses started with the
    95  // [syscall.SysProcAttr] Pdeathsig field set to be signaled before process
    96  // termination. Any subprocess started before this function is called may be
    97  // affected, in addition to any subprocesses started inside setupfn or fn.
    98  // There are more details at https://go.dev/issue/27505.
    99  //
   100  // [unshare(2)]: https://man7.org/linux/man-pages/man2/unshare.2.html
   101  func Go(flags int, setupfn func() error, fn func()) error {
   102  	started := make(chan error)
   103  
   104  	maskedFlags := flags
   105  	for f := range reversibleSetnsFlags {
   106  		maskedFlags &^= f
   107  	}
   108  	isReversible := maskedFlags == 0
   109  
   110  	go func() {
   111  		// Prepare to manipulate per-thread kernel state.
   112  		runtime.LockOSThread()
   113  
   114  		// Not all changes to the execution state can be reverted.
   115  		// If an irreversible change to the execution state is made, our
   116  		// only recourse is to have the tampered thread terminated by
   117  		// returning from this function while the goroutine remains
   118  		// wired to the thread. The Go runtime will terminate the thread
   119  		// and replace it with a fresh one as needed.
   120  
   121  		if isReversible {
   122  			defer func() {
   123  				if isReversible {
   124  					// All execution state has been restored without error.
   125  					// The thread is once again fungible.
   126  					runtime.UnlockOSThread()
   127  				}
   128  			}()
   129  			tid := unix.Gettid()
   130  			for f, ns := range reversibleSetnsFlags {
   131  				if flags&f != f {
   132  					continue
   133  				}
   134  				// The /proc/thread-self directory was added in Linux 3.17.
   135  				// We are not using it to maximize compatibility.
   136  				pth := fmt.Sprintf("/proc/self/task/%d/ns/%s", tid, ns)
   137  				fd, err := unix.Open(pth, unix.O_RDONLY|unix.O_CLOEXEC, 0)
   138  				if err != nil {
   139  					started <- &os.PathError{Op: "open", Path: pth, Err: err}
   140  					return
   141  				}
   142  				defer func() {
   143  					if isReversible {
   144  						if err := unix.Setns(fd, 0); err != nil {
   145  							isReversible = false
   146  						}
   147  					}
   148  					_ = unix.Close(fd)
   149  				}()
   150  			}
   151  		}
   152  
   153  		// Threads are implemented under Linux as processes which share
   154  		// a virtual memory space. Therefore in a multithreaded process
   155  		// unshare(2) disassociates parts of the calling thread's
   156  		// context from the thread it was clone(2)'d from.
   157  		if err := unix.Unshare(flags); err != nil {
   158  			started <- os.NewSyscallError("unshare", err)
   159  			return
   160  		}
   161  
   162  		if setupfn != nil {
   163  			if err := setupfn(); err != nil {
   164  				started <- err
   165  				return
   166  			}
   167  		}
   168  		close(started)
   169  
   170  		if fn != nil {
   171  			fn()
   172  		}
   173  	}()
   174  
   175  	return <-started
   176  }