github.com/moby/docker@v26.1.3+incompatible/internal/unshare/unshare_linux.go (about) 1 //go:build go1.10 2 3 package unshare // import "github.com/docker/docker/internal/unshare" 4 5 import ( 6 "fmt" 7 "os" 8 "runtime" 9 10 "golang.org/x/sys/unix" 11 ) 12 13 func init() { 14 // The startup thread of a process is special in a few different ways. 15 // Most pertinent to the discussion at hand, any per-thread kernel state 16 // reflected in the /proc/[pid]/ directory for a process is taken from 17 // the state of the startup thread. Same goes for /proc/self/; it shows 18 // the state of the current process' startup thread, no matter which 19 // thread the files are being opened from. For most programs this is a 20 // distinction without a difference as the kernel state, such as the 21 // mount namespace and current working directory, is shared among (and 22 // kept synchronized across) all threads of a process. But things start 23 // to break down once threads start unsharing and modifying parts of 24 // their kernel state. 25 // 26 // The Go runtime schedules goroutines to execute on the startup thread, 27 // same as any other. How this could be problematic is best illustrated 28 // with a concrete example. Consider what happens if a call to 29 // Go(unix.CLONE_NEWNS, ...) spawned a goroutine which gets scheduled 30 // onto the startup thread. The thread's mount namespace will be 31 // unshared and modified. The contents of the /proc/[pid]/mountinfo file 32 // will then describe the mount tree of the unshared namespace, not the 33 // namespace of any other thread. It will remain this way until the 34 // process exits. (The startup thread is special in another way: exiting 35 // it puts the process into a "non-waitable zombie" state. To avoid this 36 // fate, the Go runtime parks the thread instead of exiting if a 37 // goroutine returns while locked to the startup thread. More 38 // information can be found in the Go runtime sources: 39 // `go doc -u -src runtime.mexit`.) The github.com/moby/sys/mountinfo 40 // package reads from /proc/self/mountinfo, so will read the mount tree 41 // for the wrong namespace if the startup thread has had its mount 42 // namespace unshared! The /proc/thread-self/ directory, introduced in 43 // Linux 3.17, is one potential solution to this problem, but every 44 // package which opens files in /proc/self/ would need to be updated, 45 // and fallbacks to /proc/self/task/[tid]/ would be required to support 46 // older kernels. Overlooking any reference to /proc/self/ would 47 // manifest as stochastically-reproducible bugs, so this is far from an 48 // ideal solution. 49 // 50 // Reading from /proc/self/ would not be a problem if we could prevent 51 // the per-thread state of the startup thread from being modified 52 // nondeterministically in the first place. We can accomplish this 53 // simply by locking the main() function to the startup thread! Doing so 54 // excludes any other goroutine from being scheduled on the thread. 55 runtime.LockOSThread() 56 } 57 58 // reversibleSetnsFlags maps the unshare(2) flags whose effects can be fully 59 // reversed using setns(2). The values are the basenames of the corresponding 60 // /proc/self/task/[tid]/ns/ magic symlinks to use to save and restore the 61 // state. 62 var reversibleSetnsFlags = map[int]string{ 63 unix.CLONE_NEWCGROUP: "cgroup", 64 unix.CLONE_NEWNET: "net", 65 unix.CLONE_NEWUTS: "uts", 66 unix.CLONE_NEWPID: "pid", 67 unix.CLONE_NEWTIME: "time", 68 69 // The following CLONE_NEW* flags are not included because they imply 70 // another, irreversible flag when used with unshare(2). 71 // - unix.CLONE_NEWIPC: implies CLONE_SYSVMEM 72 // - unix.CLONE_NEWNS: implies CLONE_FS 73 // - unix.CLONE_NEWUSER: implies CLONE_FS since Linux 3.9 74 } 75 76 // Go calls the given functions in a new goroutine, locked to an OS thread, 77 // which has had the parts of its execution state disassociated from the rest of 78 // the current process using [unshare(2)]. It blocks until the new goroutine has 79 // started and setupfn has returned. fn is only called if setupfn returns nil. A 80 // nil setupfn or fn is equivalent to passing a no-op function. 81 // 82 // The disassociated execution state and any changes made to it are only visible 83 // to the goroutine which the functions are called in. Any other goroutines, 84 // including ones started from the function, will see the same execution state 85 // as the rest of the process. 86 // 87 // The acceptable flags are documented in the [unshare(2)] Linux man-page. 88 // The corresponding CLONE_* constants are defined in package [unix]. 89 // 90 // # Warning 91 // 92 // This function may terminate the thread which the new goroutine executed on 93 // after fn returns, which could cause subprocesses started with the 94 // [syscall.SysProcAttr] Pdeathsig field set to be signaled before process 95 // termination. Any subprocess started before this function is called may be 96 // affected, in addition to any subprocesses started inside setupfn or fn. 97 // There are more details at https://go.dev/issue/27505. 98 // 99 // [unshare(2)]: https://man7.org/linux/man-pages/man2/unshare.2.html 100 func Go(flags int, setupfn func() error, fn func()) error { 101 started := make(chan error) 102 103 maskedFlags := flags 104 for f := range reversibleSetnsFlags { 105 maskedFlags &^= f 106 } 107 isReversible := maskedFlags == 0 108 109 go func() { 110 // Prepare to manipulate per-thread kernel state. 111 runtime.LockOSThread() 112 113 // Not all changes to the execution state can be reverted. 114 // If an irreversible change to the execution state is made, our 115 // only recourse is to have the tampered thread terminated by 116 // returning from this function while the goroutine remains 117 // wired to the thread. The Go runtime will terminate the thread 118 // and replace it with a fresh one as needed. 119 120 if isReversible { 121 defer func() { 122 if isReversible { 123 // All execution state has been restored without error. 124 // The thread is once again fungible. 125 runtime.UnlockOSThread() 126 } 127 }() 128 tid := unix.Gettid() 129 for f, ns := range reversibleSetnsFlags { 130 if flags&f != f { 131 continue 132 } 133 // The /proc/thread-self directory was added in Linux 3.17. 134 // We are not using it to maximize compatibility. 135 pth := fmt.Sprintf("/proc/self/task/%d/ns/%s", tid, ns) 136 fd, err := unix.Open(pth, unix.O_RDONLY|unix.O_CLOEXEC, 0) 137 if err != nil { 138 started <- &os.PathError{Op: "open", Path: pth, Err: err} 139 return 140 } 141 defer func() { 142 if isReversible { 143 if err := unix.Setns(fd, 0); err != nil { 144 isReversible = false 145 } 146 } 147 _ = unix.Close(fd) 148 }() 149 } 150 } 151 152 // Threads are implemented under Linux as processes which share 153 // a virtual memory space. Therefore in a multithreaded process 154 // unshare(2) disassociates parts of the calling thread's 155 // context from the thread it was clone(2)'d from. 156 if err := unix.Unshare(flags); err != nil { 157 started <- os.NewSyscallError("unshare", err) 158 return 159 } 160 161 if setupfn != nil { 162 if err := setupfn(); err != nil { 163 started <- err 164 return 165 } 166 } 167 close(started) 168 169 if fn != nil { 170 fn() 171 } 172 }() 173 174 return <-started 175 }