github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/runsc/specutils/namespace.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package specutils
    16  
    17  import (
    18  	"fmt"
    19  	"os"
    20  	"os/exec"
    21  	"os/signal"
    22  	"path/filepath"
    23  	"runtime"
    24  	"syscall"
    25  
    26  	specs "github.com/opencontainers/runtime-spec/specs-go"
    27  	"github.com/syndtr/gocapability/capability"
    28  	"golang.org/x/sys/unix"
    29  	"github.com/SagerNet/gvisor/pkg/log"
    30  )
    31  
    32  // nsCloneFlag returns the clone flag that can be used to set a namespace of
    33  // the given type.
    34  func nsCloneFlag(nst specs.LinuxNamespaceType) uintptr {
    35  	switch nst {
    36  	case specs.IPCNamespace:
    37  		return unix.CLONE_NEWIPC
    38  	case specs.MountNamespace:
    39  		return unix.CLONE_NEWNS
    40  	case specs.NetworkNamespace:
    41  		return unix.CLONE_NEWNET
    42  	case specs.PIDNamespace:
    43  		return unix.CLONE_NEWPID
    44  	case specs.UTSNamespace:
    45  		return unix.CLONE_NEWUTS
    46  	case specs.UserNamespace:
    47  		return unix.CLONE_NEWUSER
    48  	case specs.CgroupNamespace:
    49  		return unix.CLONE_NEWCGROUP
    50  	default:
    51  		panic(fmt.Sprintf("unknown namespace %v", nst))
    52  	}
    53  }
    54  
    55  // nsPath returns the path of the namespace for the current process and the
    56  // given namespace.
    57  func nsPath(nst specs.LinuxNamespaceType) string {
    58  	base := "/proc/self/ns"
    59  	switch nst {
    60  	case specs.CgroupNamespace:
    61  		return filepath.Join(base, "cgroup")
    62  	case specs.IPCNamespace:
    63  		return filepath.Join(base, "ipc")
    64  	case specs.MountNamespace:
    65  		return filepath.Join(base, "mnt")
    66  	case specs.NetworkNamespace:
    67  		return filepath.Join(base, "net")
    68  	case specs.PIDNamespace:
    69  		return filepath.Join(base, "pid")
    70  	case specs.UserNamespace:
    71  		return filepath.Join(base, "user")
    72  	case specs.UTSNamespace:
    73  		return filepath.Join(base, "uts")
    74  	default:
    75  		panic(fmt.Sprintf("unknown namespace %v", nst))
    76  	}
    77  }
    78  
    79  // GetNS returns true and the namespace with the given type from the slice of
    80  // namespaces in the spec.  It returns false if the slice does not contain a
    81  // namespace with the type.
    82  func GetNS(nst specs.LinuxNamespaceType, s *specs.Spec) (specs.LinuxNamespace, bool) {
    83  	if s.Linux == nil {
    84  		return specs.LinuxNamespace{}, false
    85  	}
    86  	for _, ns := range s.Linux.Namespaces {
    87  		if ns.Type == nst {
    88  			return ns, true
    89  		}
    90  	}
    91  	return specs.LinuxNamespace{}, false
    92  }
    93  
    94  // FilterNS returns a slice of namespaces from the spec with types that match
    95  // those in the `filter` slice.
    96  func FilterNS(filter []specs.LinuxNamespaceType, s *specs.Spec) []specs.LinuxNamespace {
    97  	if s.Linux == nil {
    98  		return nil
    99  	}
   100  	var out []specs.LinuxNamespace
   101  	for _, nst := range filter {
   102  		if ns, ok := GetNS(nst, s); ok {
   103  			out = append(out, ns)
   104  		}
   105  	}
   106  	return out
   107  }
   108  
   109  // setNS sets the namespace of the given type.  It must be called with
   110  // OSThreadLocked.
   111  func setNS(fd, nsType uintptr) error {
   112  	if _, _, err := unix.RawSyscall(unix.SYS_SETNS, fd, nsType, 0); err != 0 {
   113  		return err
   114  	}
   115  	return nil
   116  }
   117  
   118  // ApplyNS applies the namespace on the current thread and returns a function
   119  // that will restore the namespace to the original value.
   120  //
   121  // Preconditions: Must be called with os thread locked.
   122  func ApplyNS(ns specs.LinuxNamespace) (func(), error) {
   123  	log.Infof("Applying namespace %v at path %q", ns.Type, ns.Path)
   124  	newNS, err := os.Open(ns.Path)
   125  	if err != nil {
   126  		return nil, fmt.Errorf("error opening %q: %v", ns.Path, err)
   127  	}
   128  	defer newNS.Close()
   129  
   130  	// Store current namespace to restore back.
   131  	curPath := nsPath(ns.Type)
   132  	oldNS, err := os.Open(curPath)
   133  	if err != nil {
   134  		return nil, fmt.Errorf("error opening %q: %v", curPath, err)
   135  	}
   136  
   137  	// Set namespace to the one requested and setup function to restore it back.
   138  	flag := nsCloneFlag(ns.Type)
   139  	if err := setNS(newNS.Fd(), flag); err != nil {
   140  		oldNS.Close()
   141  		return nil, fmt.Errorf("error setting namespace of type %v and path %q: %v", ns.Type, ns.Path, err)
   142  	}
   143  	return func() {
   144  		log.Infof("Restoring namespace %v", ns.Type)
   145  		defer oldNS.Close()
   146  		if err := setNS(oldNS.Fd(), flag); err != nil {
   147  			panic(fmt.Sprintf("error restoring namespace: of type %v: %v", ns.Type, err))
   148  		}
   149  	}, nil
   150  }
   151  
   152  // StartInNS joins or creates the given namespaces and calls cmd.Start before
   153  // restoring the namespaces to the original values.
   154  func StartInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error {
   155  	// We are about to setup namespaces, which requires the os thread being
   156  	// locked so that Go doesn't change the thread out from under us.
   157  	runtime.LockOSThread()
   158  	defer runtime.UnlockOSThread()
   159  
   160  	if cmd.SysProcAttr == nil {
   161  		cmd.SysProcAttr = &unix.SysProcAttr{}
   162  	}
   163  
   164  	for _, ns := range nss {
   165  		if ns.Path == "" {
   166  			// No path.  Just set a flag to create a new namespace.
   167  			cmd.SysProcAttr.Cloneflags |= nsCloneFlag(ns.Type)
   168  			continue
   169  		}
   170  		// Join the given namespace, and restore the current namespace
   171  		// before exiting.
   172  		restoreNS, err := ApplyNS(ns)
   173  		if err != nil {
   174  			return err
   175  		}
   176  		defer restoreNS()
   177  	}
   178  
   179  	return cmd.Start()
   180  }
   181  
   182  // SetUIDGIDMappings sets the given uid/gid mappings from the spec on the cmd.
   183  func SetUIDGIDMappings(cmd *exec.Cmd, s *specs.Spec) {
   184  	if s.Linux == nil {
   185  		return
   186  	}
   187  	if cmd.SysProcAttr == nil {
   188  		cmd.SysProcAttr = &unix.SysProcAttr{}
   189  	}
   190  	for _, idMap := range s.Linux.UIDMappings {
   191  		log.Infof("Mapping host uid %d to container uid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size)
   192  		cmd.SysProcAttr.UidMappings = append(cmd.SysProcAttr.UidMappings, syscall.SysProcIDMap{
   193  			ContainerID: int(idMap.ContainerID),
   194  			HostID:      int(idMap.HostID),
   195  			Size:        int(idMap.Size),
   196  		})
   197  	}
   198  	for _, idMap := range s.Linux.GIDMappings {
   199  		log.Infof("Mapping host gid %d to container gid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size)
   200  		cmd.SysProcAttr.GidMappings = append(cmd.SysProcAttr.GidMappings, syscall.SysProcIDMap{
   201  			ContainerID: int(idMap.ContainerID),
   202  			HostID:      int(idMap.HostID),
   203  			Size:        int(idMap.Size),
   204  		})
   205  	}
   206  }
   207  
   208  // HasCapabilities returns true if the user has all capabilities in 'cs'.
   209  func HasCapabilities(cs ...capability.Cap) bool {
   210  	caps, err := capability.NewPid2(os.Getpid())
   211  	if err != nil {
   212  		return false
   213  	}
   214  	if err := caps.Load(); err != nil {
   215  		return false
   216  	}
   217  	for _, c := range cs {
   218  		if !caps.Get(capability.EFFECTIVE, c) {
   219  			return false
   220  		}
   221  	}
   222  	return true
   223  }
   224  
   225  // MaybeRunAsRoot ensures the process runs with capabilities needed to create a
   226  // sandbox, e.g. CAP_SYS_ADMIN, CAP_SYS_CHROOT, etc. If capabilities are needed,
   227  // it will create a new user namespace and re-execute the process as root
   228  // inside the namespace with the same arguments and environment.
   229  //
   230  // This function returns immediately when no new capability is needed. If
   231  // another process is executed, it returns straight from here with the same exit
   232  // code as the child.
   233  func MaybeRunAsRoot() error {
   234  	if HasCapabilities(capability.CAP_SYS_ADMIN, capability.CAP_SYS_CHROOT, capability.CAP_SETUID, capability.CAP_SETGID) {
   235  		return nil
   236  	}
   237  
   238  	// Current process doesn't have required capabilities, create user namespace
   239  	// and run as root inside the namespace to acquire capabilities.
   240  	log.Infof("*** Re-running as root in new user namespace ***")
   241  
   242  	cmd := exec.Command("/proc/self/exe", os.Args[1:]...)
   243  
   244  	cmd.SysProcAttr = &unix.SysProcAttr{
   245  		Cloneflags: unix.CLONE_NEWUSER | unix.CLONE_NEWNS,
   246  		// Set current user/group as root inside the namespace. Since we may not
   247  		// have CAP_SETUID/CAP_SETGID, just map root to the current user/group.
   248  		UidMappings: []syscall.SysProcIDMap{
   249  			{ContainerID: 0, HostID: os.Getuid(), Size: 1},
   250  		},
   251  		GidMappings: []syscall.SysProcIDMap{
   252  			{ContainerID: 0, HostID: os.Getgid(), Size: 1},
   253  		},
   254  		Credential:                 &syscall.Credential{Uid: 0, Gid: 0},
   255  		GidMappingsEnableSetgroups: false,
   256  
   257  		// Make sure child is killed when the parent terminates.
   258  		Pdeathsig: unix.SIGKILL,
   259  	}
   260  
   261  	cmd.Env = os.Environ()
   262  	cmd.Stdin = os.Stdin
   263  	cmd.Stdout = os.Stdout
   264  	cmd.Stderr = os.Stderr
   265  	if err := cmd.Start(); err != nil {
   266  		return fmt.Errorf("re-executing self: %w", err)
   267  	}
   268  	ch := make(chan os.Signal, 1)
   269  	signal.Notify(ch)
   270  	go func() {
   271  		for {
   272  			// Forward all signals to child process.
   273  			cmd.Process.Signal(<-ch)
   274  		}
   275  	}()
   276  	if err := cmd.Wait(); err != nil {
   277  		if exit, ok := err.(*exec.ExitError); ok {
   278  			if ws, ok := exit.Sys().(syscall.WaitStatus); ok {
   279  				os.Exit(ws.ExitStatus())
   280  			}
   281  			log.Warningf("No wait status provided, exiting with -1: %v", err)
   282  			os.Exit(-1)
   283  		}
   284  		return err
   285  	}
   286  	// Child completed with success.
   287  	os.Exit(0)
   288  	panic("unreachable")
   289  }