gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/runsc/specutils/namespace.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package specutils
    16  
    17  import (
    18  	"fmt"
    19  	"os"
    20  	"os/exec"
    21  	"os/signal"
    22  	"path/filepath"
    23  	"runtime"
    24  	"syscall"
    25  
    26  	specs "github.com/opencontainers/runtime-spec/specs-go"
    27  	"github.com/syndtr/gocapability/capability"
    28  	"golang.org/x/sys/unix"
    29  	"gvisor.dev/gvisor/pkg/log"
    30  )
    31  
    32  // nsCloneFlag returns the clone flag that can be used to set a namespace of
    33  // the given type.
    34  func nsCloneFlag(nst specs.LinuxNamespaceType) uintptr {
    35  	switch nst {
    36  	case specs.IPCNamespace:
    37  		return unix.CLONE_NEWIPC
    38  	case specs.MountNamespace:
    39  		return unix.CLONE_NEWNS
    40  	case specs.NetworkNamespace:
    41  		return unix.CLONE_NEWNET
    42  	case specs.PIDNamespace:
    43  		return unix.CLONE_NEWPID
    44  	case specs.UTSNamespace:
    45  		return unix.CLONE_NEWUTS
    46  	case specs.UserNamespace:
    47  		return unix.CLONE_NEWUSER
    48  	case specs.CgroupNamespace:
    49  		return unix.CLONE_NEWCGROUP
    50  	default:
    51  		panic(fmt.Sprintf("unknown namespace %v", nst))
    52  	}
    53  }
    54  
    55  // nsPath returns the path of the namespace for the current process and the
    56  // given namespace.
    57  func nsPath(nst specs.LinuxNamespaceType) string {
    58  	base := "/proc/self/ns"
    59  	switch nst {
    60  	case specs.CgroupNamespace:
    61  		return filepath.Join(base, "cgroup")
    62  	case specs.IPCNamespace:
    63  		return filepath.Join(base, "ipc")
    64  	case specs.MountNamespace:
    65  		return filepath.Join(base, "mnt")
    66  	case specs.NetworkNamespace:
    67  		return filepath.Join(base, "net")
    68  	case specs.PIDNamespace:
    69  		return filepath.Join(base, "pid")
    70  	case specs.UserNamespace:
    71  		return filepath.Join(base, "user")
    72  	case specs.UTSNamespace:
    73  		return filepath.Join(base, "uts")
    74  	default:
    75  		panic(fmt.Sprintf("unknown namespace %v", nst))
    76  	}
    77  }
    78  
    79  // GetNS returns true and the namespace with the given type from the slice of
    80  // namespaces in the spec.  It returns false if the slice does not contain a
    81  // namespace with the type.
    82  func GetNS(nst specs.LinuxNamespaceType, s *specs.Spec) (specs.LinuxNamespace, bool) {
    83  	if s.Linux == nil {
    84  		return specs.LinuxNamespace{}, false
    85  	}
    86  	for _, ns := range s.Linux.Namespaces {
    87  		if ns.Type == nst {
    88  			return ns, true
    89  		}
    90  	}
    91  	return specs.LinuxNamespace{}, false
    92  }
    93  
    94  // setNS sets the namespace of the given type.  It must be called with
    95  // OSThreadLocked.
    96  func setNS(fd, nsType uintptr) error {
    97  	if _, _, err := unix.RawSyscall(unix.SYS_SETNS, fd, nsType, 0); err != 0 {
    98  		return err
    99  	}
   100  	return nil
   101  }
   102  
   103  // ApplyNS applies the namespace on the current thread and returns a function
   104  // that will restore the namespace to the original value.
   105  //
   106  // Preconditions: Must be called with os thread locked.
   107  func ApplyNS(ns specs.LinuxNamespace) (func() error, error) {
   108  	log.Infof("Applying namespace %v at path %q", ns.Type, ns.Path)
   109  	newNS, err := os.Open(ns.Path)
   110  	if err != nil {
   111  		return nil, fmt.Errorf("error opening %q: %v", ns.Path, err)
   112  	}
   113  	defer newNS.Close()
   114  
   115  	// Store current namespace to restore back.
   116  	curPath := nsPath(ns.Type)
   117  	oldNS, err := os.Open(curPath)
   118  	if err != nil {
   119  		return nil, fmt.Errorf("error opening %q: %v", curPath, err)
   120  	}
   121  
   122  	// Set namespace to the one requested and setup function to restore it back.
   123  	flag := nsCloneFlag(ns.Type)
   124  	if err := setNS(newNS.Fd(), flag); err != nil {
   125  		oldNS.Close()
   126  		return nil, fmt.Errorf("error setting namespace of type %v and path %q: %v", ns.Type, ns.Path, err)
   127  	}
   128  	return func() error {
   129  		log.Infof("Restoring namespace %v", ns.Type)
   130  		defer oldNS.Close()
   131  		if err := setNS(oldNS.Fd(), flag); err != nil {
   132  			return fmt.Errorf("error restoring namespace: of type %v: %v", ns.Type, err)
   133  		}
   134  		return nil
   135  	}, nil
   136  }
   137  
   138  // StartInNS joins or creates the given namespaces and calls cmd.Start before
   139  // restoring the namespaces to the original values.
   140  func StartInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error {
   141  	errChan := make(chan error)
   142  	go func() {
   143  		runtime.LockOSThread()
   144  		defer runtime.UnlockOSThread()
   145  
   146  		rstFuncs, err := startInNS(cmd, nss)
   147  		errChan <- err
   148  		for _, rstFunc := range rstFuncs {
   149  			err := rstFunc()
   150  			if err == nil {
   151  				continue
   152  			}
   153  
   154  			// One or more namespaces have not been restored, but
   155  			// we can't destroy the current system thread, because
   156  			// a child process is execited with Pdeathsig.
   157  			log.Debugf("Block the current system thread due to: %s", err)
   158  			c := make(chan any)
   159  			<-c
   160  		}
   161  	}()
   162  	return <-errChan
   163  }
   164  
   165  func startInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) ([]func() error, error) {
   166  	if cmd.SysProcAttr == nil {
   167  		cmd.SysProcAttr = &unix.SysProcAttr{}
   168  	}
   169  
   170  	var deferFuncs []func() error
   171  	for _, ns := range nss {
   172  		if ns.Path == "" {
   173  			// No path.  Just set a flag to create a new namespace.
   174  			cmd.SysProcAttr.Cloneflags |= nsCloneFlag(ns.Type)
   175  			continue
   176  		}
   177  		// Join the given namespace, and restore the current namespace
   178  		// before exiting.
   179  		restoreNS, err := ApplyNS(ns)
   180  		if err != nil {
   181  			return deferFuncs, err
   182  		}
   183  		deferFuncs = append(deferFuncs, restoreNS)
   184  	}
   185  
   186  	err := cmd.Start()
   187  	if err != nil && cmd.SysProcAttr.Cloneflags&unix.CLONE_NEWUSER != 0 {
   188  		err = fmt.Errorf("%v: check whether /proc/sys/user/max_user_namespaces is set too low (gvisor.dev/issue/5964)", err)
   189  	}
   190  	return deferFuncs, err
   191  }
   192  
   193  // SetUIDGIDMappings sets the given uid/gid mappings from the spec on the cmd.
   194  func SetUIDGIDMappings(cmd *exec.Cmd, s *specs.Spec) {
   195  	if s.Linux == nil {
   196  		return
   197  	}
   198  	if cmd.SysProcAttr == nil {
   199  		cmd.SysProcAttr = &unix.SysProcAttr{}
   200  	}
   201  	for _, idMap := range s.Linux.UIDMappings {
   202  		log.Infof("Mapping host uid %d to container uid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size)
   203  		cmd.SysProcAttr.UidMappings = append(cmd.SysProcAttr.UidMappings, syscall.SysProcIDMap{
   204  			ContainerID: int(idMap.ContainerID),
   205  			HostID:      int(idMap.HostID),
   206  			Size:        int(idMap.Size),
   207  		})
   208  	}
   209  	for _, idMap := range s.Linux.GIDMappings {
   210  		log.Infof("Mapping host gid %d to container gid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size)
   211  		cmd.SysProcAttr.GidMappings = append(cmd.SysProcAttr.GidMappings, syscall.SysProcIDMap{
   212  			ContainerID: int(idMap.ContainerID),
   213  			HostID:      int(idMap.HostID),
   214  			Size:        int(idMap.Size),
   215  		})
   216  	}
   217  }
   218  
   219  // HasCapabilities returns true if the user has all capabilities in 'cs'.
   220  func HasCapabilities(cs ...capability.Cap) bool {
   221  	caps, err := capability.NewPid2(os.Getpid())
   222  	if err != nil {
   223  		return false
   224  	}
   225  	if err := caps.Load(); err != nil {
   226  		return false
   227  	}
   228  	for _, c := range cs {
   229  		if !caps.Get(capability.EFFECTIVE, c) {
   230  			return false
   231  		}
   232  	}
   233  	return true
   234  }
   235  
   236  // MaybeRunAsRoot ensures the process runs with capabilities needed to create a
   237  // sandbox, e.g. CAP_SYS_ADMIN, CAP_SYS_CHROOT, etc. If capabilities are needed,
   238  // it will create a new user namespace and re-execute the process as root
   239  // inside the namespace with the same arguments and environment.
   240  //
   241  // This function returns immediately when no new capability is needed. If
   242  // another process is executed, it returns straight from here with the same exit
   243  // code as the child.
   244  func MaybeRunAsRoot() error {
   245  	if HasCapabilities(capability.CAP_SYS_ADMIN, capability.CAP_SYS_CHROOT, capability.CAP_SETUID, capability.CAP_SETGID) {
   246  		return nil
   247  	}
   248  
   249  	// Current process doesn't have required capabilities, create user namespace
   250  	// and run as root inside the namespace to acquire capabilities.
   251  	log.Infof("*** Re-running as root in new user namespace ***")
   252  
   253  	cmd := exec.Command("/proc/self/exe", os.Args[1:]...)
   254  
   255  	cmd.SysProcAttr = &unix.SysProcAttr{
   256  		Cloneflags: unix.CLONE_NEWUSER | unix.CLONE_NEWNS,
   257  		// Set current user/group as root inside the namespace. Since we may not
   258  		// have CAP_SETUID/CAP_SETGID, just map root to the current user/group.
   259  		UidMappings: []syscall.SysProcIDMap{
   260  			{ContainerID: 0, HostID: os.Getuid(), Size: 1},
   261  		},
   262  		GidMappings: []syscall.SysProcIDMap{
   263  			{ContainerID: 0, HostID: os.Getgid(), Size: 1},
   264  		},
   265  		Credential:                 &syscall.Credential{Uid: 0, Gid: 0},
   266  		GidMappingsEnableSetgroups: false,
   267  
   268  		// Make sure child is killed when the parent terminates.
   269  		Pdeathsig: unix.SIGKILL,
   270  
   271  		// Detach from session. Otherwise, signals sent to the foreground process
   272  		// will also be forwarded by this process, resulting in duplicate signals.
   273  		Setsid: true,
   274  	}
   275  
   276  	cmd.Env = os.Environ()
   277  	cmd.Stdin = os.Stdin
   278  	cmd.Stdout = os.Stdout
   279  	cmd.Stderr = os.Stderr
   280  	if err := cmd.Start(); err != nil {
   281  		return fmt.Errorf("re-executing self: %w", err)
   282  	}
   283  	ch := make(chan os.Signal, 1)
   284  	signal.Notify(ch)
   285  	go func() {
   286  		for {
   287  			// Forward all signals to child process.
   288  			sig := <-ch
   289  			if err := cmd.Process.Signal(sig); err != nil {
   290  				log.Warningf("Error forwarding signal %v to child (PID %d)", sig, cmd.Process.Pid)
   291  			}
   292  		}
   293  	}()
   294  	if err := cmd.Wait(); err != nil {
   295  		if exit, ok := err.(*exec.ExitError); ok {
   296  			if ws, ok := exit.Sys().(syscall.WaitStatus); ok {
   297  				os.Exit(ws.ExitStatus())
   298  			}
   299  			log.Warningf("No wait status provided, exiting with -1: %v", err)
   300  			os.Exit(-1)
   301  		}
   302  		return err
   303  	}
   304  	// Child completed with success.
   305  	os.Exit(0)
   306  	panic("unreachable")
   307  }