github.com/rawahars/moby@v24.0.4+incompatible/internal/unshare/unshare_linux.go (about) 1 //go:build go1.10 2 // +build go1.10 3 4 package unshare // import "github.com/docker/docker/internal/unshare" 5 6 import ( 7 "fmt" 8 "os" 9 "runtime" 10 11 "golang.org/x/sys/unix" 12 ) 13 14 func init() { 15 // The startup thread of a process is special in a few different ways. 16 // Most pertinent to the discussion at hand, any per-thread kernel state 17 // reflected in the /proc/[pid]/ directory for a process is taken from 18 // the state of the startup thread. Same goes for /proc/self/; it shows 19 // the state of the current process' startup thread, no matter which 20 // thread the files are being opened from. For most programs this is a 21 // distinction without a difference as the kernel state, such as the 22 // mount namespace and current working directory, is shared among (and 23 // kept synchronized across) all threads of a process. But things start 24 // to break down once threads start unsharing and modifying parts of 25 // their kernel state. 26 // 27 // The Go runtime schedules goroutines to execute on the startup thread, 28 // same as any other. How this could be problematic is best illustrated 29 // with a concrete example. Consider what happens if a call to 30 // Go(unix.CLONE_NEWNS, ...) spawned a goroutine which gets scheduled 31 // onto the startup thread. The thread's mount namespace will be 32 // unshared and modified. The contents of the /proc/[pid]/mountinfo file 33 // will then describe the mount tree of the unshared namespace, not the 34 // namespace of any other thread. It will remain this way until the 35 // process exits. (The startup thread is special in another way: exiting 36 // it puts the process into a "non-waitable zombie" state. To avoid this 37 // fate, the Go runtime parks the thread instead of exiting if a 38 // goroutine returns while locked to the startup thread. More 39 // information can be found in the Go runtime sources: 40 // `go doc -u -src runtime.mexit`.) The github.com/moby/sys/mountinfo 41 // package reads from /proc/self/mountinfo, so will read the mount tree 42 // for the wrong namespace if the startup thread has had its mount 43 // namespace unshared! The /proc/thread-self/ directory, introduced in 44 // Linux 3.17, is one potential solution to this problem, but every 45 // package which opens files in /proc/self/ would need to be updated, 46 // and fallbacks to /proc/self/task/[tid]/ would be required to support 47 // older kernels. Overlooking any reference to /proc/self/ would 48 // manifest as stochastically-reproducible bugs, so this is far from an 49 // ideal solution. 50 // 51 // Reading from /proc/self/ would not be a problem if we could prevent 52 // the per-thread state of the startup thread from being modified 53 // nondeterministically in the first place. We can accomplish this 54 // simply by locking the main() function to the startup thread! Doing so 55 // excludes any other goroutine from being scheduled on the thread. 56 runtime.LockOSThread() 57 } 58 59 // reversibleSetnsFlags maps the unshare(2) flags whose effects can be fully 60 // reversed using setns(2). The values are the basenames of the corresponding 61 // /proc/self/task/[tid]/ns/ magic symlinks to use to save and restore the 62 // state. 63 var reversibleSetnsFlags = map[int]string{ 64 unix.CLONE_NEWCGROUP: "cgroup", 65 unix.CLONE_NEWNET: "net", 66 unix.CLONE_NEWUTS: "uts", 67 unix.CLONE_NEWPID: "pid", 68 unix.CLONE_NEWTIME: "time", 69 70 // The following CLONE_NEW* flags are not included because they imply 71 // another, irreversible flag when used with unshare(2). 72 // - unix.CLONE_NEWIPC: implies CLONE_SYSVMEM 73 // - unix.CLONE_NEWNS: implies CLONE_FS 74 // - unix.CLONE_NEWUSER: implies CLONE_FS since Linux 3.9 75 } 76 77 // Go calls the given functions in a new goroutine, locked to an OS thread, 78 // which has had the parts of its execution state disassociated from the rest of 79 // the current process using [unshare(2)]. It blocks until the new goroutine has 80 // started and setupfn has returned. fn is only called if setupfn returns nil. A 81 // nil setupfn or fn is equivalent to passing a no-op function. 82 // 83 // The disassociated execution state and any changes made to it are only visible 84 // to the goroutine which the functions are called in. Any other goroutines, 85 // including ones started from the function, will see the same execution state 86 // as the rest of the process. 87 // 88 // The acceptable flags are documented in the [unshare(2)] Linux man-page. 89 // The corresponding CLONE_* constants are defined in package [unix]. 90 // 91 // # Warning 92 // 93 // This function may terminate the thread which the new goroutine executed on 94 // after fn returns, which could cause subprocesses started with the 95 // [syscall.SysProcAttr] Pdeathsig field set to be signaled before process 96 // termination. Any subprocess started before this function is called may be 97 // affected, in addition to any subprocesses started inside setupfn or fn. 98 // There are more details at https://go.dev/issue/27505. 99 // 100 // [unshare(2)]: https://man7.org/linux/man-pages/man2/unshare.2.html 101 func Go(flags int, setupfn func() error, fn func()) error { 102 started := make(chan error) 103 104 maskedFlags := flags 105 for f := range reversibleSetnsFlags { 106 maskedFlags &^= f 107 } 108 isReversible := maskedFlags == 0 109 110 go func() { 111 // Prepare to manipulate per-thread kernel state. 112 runtime.LockOSThread() 113 114 // Not all changes to the execution state can be reverted. 115 // If an irreversible change to the execution state is made, our 116 // only recourse is to have the tampered thread terminated by 117 // returning from this function while the goroutine remains 118 // wired to the thread. The Go runtime will terminate the thread 119 // and replace it with a fresh one as needed. 120 121 if isReversible { 122 defer func() { 123 if isReversible { 124 // All execution state has been restored without error. 125 // The thread is once again fungible. 126 runtime.UnlockOSThread() 127 } 128 }() 129 tid := unix.Gettid() 130 for f, ns := range reversibleSetnsFlags { 131 if flags&f != f { 132 continue 133 } 134 // The /proc/thread-self directory was added in Linux 3.17. 135 // We are not using it to maximize compatibility. 136 pth := fmt.Sprintf("/proc/self/task/%d/ns/%s", tid, ns) 137 fd, err := unix.Open(pth, unix.O_RDONLY|unix.O_CLOEXEC, 0) 138 if err != nil { 139 started <- &os.PathError{Op: "open", Path: pth, Err: err} 140 return 141 } 142 defer func() { 143 if isReversible { 144 if err := unix.Setns(fd, 0); err != nil { 145 isReversible = false 146 } 147 } 148 _ = unix.Close(fd) 149 }() 150 } 151 } 152 153 // Threads are implemented under Linux as processes which share 154 // a virtual memory space. Therefore in a multithreaded process 155 // unshare(2) disassociates parts of the calling thread's 156 // context from the thread it was clone(2)'d from. 157 if err := unix.Unshare(flags); err != nil { 158 started <- os.NewSyscallError("unshare", err) 159 return 160 } 161 162 if setupfn != nil { 163 if err := setupfn(); err != nil { 164 started <- err 165 return 166 } 167 } 168 close(started) 169 170 if fn != nil { 171 fn() 172 } 173 }() 174 175 return <-started 176 }