github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/runsc/specutils/namespace.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package specutils 16 17 import ( 18 "fmt" 19 "os" 20 "os/exec" 21 "os/signal" 22 "path/filepath" 23 "runtime" 24 "syscall" 25 26 "github.com/MerlinKodo/gvisor/pkg/log" 27 specs "github.com/opencontainers/runtime-spec/specs-go" 28 "github.com/syndtr/gocapability/capability" 29 "golang.org/x/sys/unix" 30 ) 31 32 // nsCloneFlag returns the clone flag that can be used to set a namespace of 33 // the given type. 34 func nsCloneFlag(nst specs.LinuxNamespaceType) uintptr { 35 switch nst { 36 case specs.IPCNamespace: 37 return unix.CLONE_NEWIPC 38 case specs.MountNamespace: 39 return unix.CLONE_NEWNS 40 case specs.NetworkNamespace: 41 return unix.CLONE_NEWNET 42 case specs.PIDNamespace: 43 return unix.CLONE_NEWPID 44 case specs.UTSNamespace: 45 return unix.CLONE_NEWUTS 46 case specs.UserNamespace: 47 return unix.CLONE_NEWUSER 48 case specs.CgroupNamespace: 49 return unix.CLONE_NEWCGROUP 50 default: 51 panic(fmt.Sprintf("unknown namespace %v", nst)) 52 } 53 } 54 55 // nsPath returns the path of the namespace for the current process and the 56 // given namespace. 57 func nsPath(nst specs.LinuxNamespaceType) string { 58 base := "/proc/self/ns" 59 switch nst { 60 case specs.CgroupNamespace: 61 return filepath.Join(base, "cgroup") 62 case specs.IPCNamespace: 63 return filepath.Join(base, "ipc") 64 case specs.MountNamespace: 65 return filepath.Join(base, "mnt") 66 case specs.NetworkNamespace: 67 return filepath.Join(base, "net") 68 case specs.PIDNamespace: 69 return filepath.Join(base, "pid") 70 case specs.UserNamespace: 71 return filepath.Join(base, "user") 72 case specs.UTSNamespace: 73 return filepath.Join(base, "uts") 74 default: 75 panic(fmt.Sprintf("unknown namespace %v", nst)) 76 } 77 } 78 79 // GetNS returns true and the namespace with the given type from the slice of 80 // namespaces in the spec. It returns false if the slice does not contain a 81 // namespace with the type. 82 func GetNS(nst specs.LinuxNamespaceType, s *specs.Spec) (specs.LinuxNamespace, bool) { 83 if s.Linux == nil { 84 return specs.LinuxNamespace{}, false 85 } 86 for _, ns := range s.Linux.Namespaces { 87 if ns.Type == nst { 88 return ns, true 89 } 90 } 91 return specs.LinuxNamespace{}, false 92 } 93 94 // setNS sets the namespace of the given type. It must be called with 95 // OSThreadLocked. 96 func setNS(fd, nsType uintptr) error { 97 if _, _, err := unix.RawSyscall(unix.SYS_SETNS, fd, nsType, 0); err != 0 { 98 return err 99 } 100 return nil 101 } 102 103 // ApplyNS applies the namespace on the current thread and returns a function 104 // that will restore the namespace to the original value. 105 // 106 // Preconditions: Must be called with os thread locked. 107 func ApplyNS(ns specs.LinuxNamespace) (func() error, error) { 108 log.Infof("Applying namespace %v at path %q", ns.Type, ns.Path) 109 newNS, err := os.Open(ns.Path) 110 if err != nil { 111 return nil, fmt.Errorf("error opening %q: %v", ns.Path, err) 112 } 113 defer newNS.Close() 114 115 // Store current namespace to restore back. 116 curPath := nsPath(ns.Type) 117 oldNS, err := os.Open(curPath) 118 if err != nil { 119 return nil, fmt.Errorf("error opening %q: %v", curPath, err) 120 } 121 122 // Set namespace to the one requested and setup function to restore it back. 123 flag := nsCloneFlag(ns.Type) 124 if err := setNS(newNS.Fd(), flag); err != nil { 125 oldNS.Close() 126 return nil, fmt.Errorf("error setting namespace of type %v and path %q: %v", ns.Type, ns.Path, err) 127 } 128 return func() error { 129 log.Infof("Restoring namespace %v", ns.Type) 130 defer oldNS.Close() 131 if err := setNS(oldNS.Fd(), flag); err != nil { 132 return fmt.Errorf("error restoring namespace: of type %v: %v", ns.Type, err) 133 } 134 return nil 135 }, nil 136 } 137 138 // StartInNS joins or creates the given namespaces and calls cmd.Start before 139 // restoring the namespaces to the original values. 140 func StartInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error { 141 errChan := make(chan error) 142 go func() { 143 runtime.LockOSThread() 144 defer runtime.UnlockOSThread() 145 146 rstFuncs, err := startInNS(cmd, nss) 147 errChan <- err 148 for _, rstFunc := range rstFuncs { 149 err := rstFunc() 150 if err == nil { 151 continue 152 } 153 154 // One or more namespaces have not been restored, but 155 // we can't destroy the current system thread, because 156 // a child process is execited with Pdeathsig. 157 log.Debugf("Block the current system thread due to: %s", err) 158 c := make(chan any) 159 <-c 160 } 161 }() 162 return <-errChan 163 } 164 165 func startInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) ([]func() error, error) { 166 if cmd.SysProcAttr == nil { 167 cmd.SysProcAttr = &unix.SysProcAttr{} 168 } 169 170 var deferFuncs []func() error 171 for _, ns := range nss { 172 if ns.Path == "" { 173 // No path. Just set a flag to create a new namespace. 174 cmd.SysProcAttr.Cloneflags |= nsCloneFlag(ns.Type) 175 continue 176 } 177 // Join the given namespace, and restore the current namespace 178 // before exiting. 179 restoreNS, err := ApplyNS(ns) 180 if err != nil { 181 return deferFuncs, err 182 } 183 deferFuncs = append(deferFuncs, restoreNS) 184 } 185 186 err := cmd.Start() 187 if err != nil && cmd.SysProcAttr.Cloneflags&unix.CLONE_NEWUSER != 0 { 188 err = fmt.Errorf("%v: check whether /proc/sys/user/max_user_namespaces is set too low (gvisor.dev/issue/5964)", err) 189 } 190 return deferFuncs, err 191 } 192 193 // SetUIDGIDMappings sets the given uid/gid mappings from the spec on the cmd. 194 func SetUIDGIDMappings(cmd *exec.Cmd, s *specs.Spec) { 195 if s.Linux == nil { 196 return 197 } 198 if cmd.SysProcAttr == nil { 199 cmd.SysProcAttr = &unix.SysProcAttr{} 200 } 201 for _, idMap := range s.Linux.UIDMappings { 202 log.Infof("Mapping host uid %d to container uid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size) 203 cmd.SysProcAttr.UidMappings = append(cmd.SysProcAttr.UidMappings, syscall.SysProcIDMap{ 204 ContainerID: int(idMap.ContainerID), 205 HostID: int(idMap.HostID), 206 Size: int(idMap.Size), 207 }) 208 } 209 for _, idMap := range s.Linux.GIDMappings { 210 log.Infof("Mapping host gid %d to container gid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size) 211 cmd.SysProcAttr.GidMappings = append(cmd.SysProcAttr.GidMappings, syscall.SysProcIDMap{ 212 ContainerID: int(idMap.ContainerID), 213 HostID: int(idMap.HostID), 214 Size: int(idMap.Size), 215 }) 216 } 217 } 218 219 // HasCapabilities returns true if the user has all capabilities in 'cs'. 220 func HasCapabilities(cs ...capability.Cap) bool { 221 caps, err := capability.NewPid2(os.Getpid()) 222 if err != nil { 223 return false 224 } 225 if err := caps.Load(); err != nil { 226 return false 227 } 228 for _, c := range cs { 229 if !caps.Get(capability.EFFECTIVE, c) { 230 return false 231 } 232 } 233 return true 234 } 235 236 // MaybeRunAsRoot ensures the process runs with capabilities needed to create a 237 // sandbox, e.g. CAP_SYS_ADMIN, CAP_SYS_CHROOT, etc. If capabilities are needed, 238 // it will create a new user namespace and re-execute the process as root 239 // inside the namespace with the same arguments and environment. 240 // 241 // This function returns immediately when no new capability is needed. If 242 // another process is executed, it returns straight from here with the same exit 243 // code as the child. 244 func MaybeRunAsRoot() error { 245 if HasCapabilities(capability.CAP_SYS_ADMIN, capability.CAP_SYS_CHROOT, capability.CAP_SETUID, capability.CAP_SETGID) { 246 return nil 247 } 248 249 // Current process doesn't have required capabilities, create user namespace 250 // and run as root inside the namespace to acquire capabilities. 251 log.Infof("*** Re-running as root in new user namespace ***") 252 253 cmd := exec.Command("/proc/self/exe", os.Args[1:]...) 254 255 cmd.SysProcAttr = &unix.SysProcAttr{ 256 Cloneflags: unix.CLONE_NEWUSER | unix.CLONE_NEWNS, 257 // Set current user/group as root inside the namespace. Since we may not 258 // have CAP_SETUID/CAP_SETGID, just map root to the current user/group. 259 UidMappings: []syscall.SysProcIDMap{ 260 {ContainerID: 0, HostID: os.Getuid(), Size: 1}, 261 }, 262 GidMappings: []syscall.SysProcIDMap{ 263 {ContainerID: 0, HostID: os.Getgid(), Size: 1}, 264 }, 265 Credential: &syscall.Credential{Uid: 0, Gid: 0}, 266 GidMappingsEnableSetgroups: false, 267 268 // Make sure child is killed when the parent terminates. 269 Pdeathsig: unix.SIGKILL, 270 271 // Detach from session. Otherwise, signals sent to the foreground process 272 // will also be forwarded by this process, resulting in duplicate signals. 273 Setsid: true, 274 } 275 276 cmd.Env = os.Environ() 277 cmd.Stdin = os.Stdin 278 cmd.Stdout = os.Stdout 279 cmd.Stderr = os.Stderr 280 if err := cmd.Start(); err != nil { 281 return fmt.Errorf("re-executing self: %w", err) 282 } 283 ch := make(chan os.Signal, 1) 284 signal.Notify(ch) 285 go func() { 286 for { 287 // Forward all signals to child process. 288 sig := <-ch 289 if err := cmd.Process.Signal(sig); err != nil { 290 log.Warningf("Error forwarding signal %v to child (PID %d)", sig, cmd.Process.Pid) 291 } 292 } 293 }() 294 if err := cmd.Wait(); err != nil { 295 if exit, ok := err.(*exec.ExitError); ok { 296 if ws, ok := exit.Sys().(syscall.WaitStatus); ok { 297 os.Exit(ws.ExitStatus()) 298 } 299 log.Warningf("No wait status provided, exiting with -1: %v", err) 300 os.Exit(-1) 301 } 302 return err 303 } 304 // Child completed with success. 305 os.Exit(0) 306 panic("unreachable") 307 }