github.com/containers/podman/v4@v4.9.4/pkg/specgen/generate/oci_linux.go (about)

     1  //go:build !remote
     2  // +build !remote
     3  
     4  package generate
     5  
     6  import (
     7  	"context"
     8  	"encoding/json"
     9  	"fmt"
    10  	"path"
    11  	"strings"
    12  
    13  	"github.com/containers/common/libimage"
    14  	"github.com/containers/common/pkg/cgroups"
    15  	"github.com/containers/common/pkg/config"
    16  	"github.com/containers/podman/v4/libpod"
    17  	"github.com/containers/podman/v4/libpod/define"
    18  	"github.com/containers/podman/v4/pkg/rootless"
    19  	"github.com/containers/podman/v4/pkg/specgen"
    20  	"github.com/docker/go-units"
    21  	spec "github.com/opencontainers/runtime-spec/specs-go"
    22  	"github.com/opencontainers/runtime-tools/generate"
    23  	"github.com/sirupsen/logrus"
    24  	"golang.org/x/sys/unix"
    25  )
    26  
    27  func setProcOpts(s *specgen.SpecGenerator, g *generate.Generator) {
    28  	if s.ProcOpts == nil {
    29  		return
    30  	}
    31  	for i := range g.Config.Mounts {
    32  		if g.Config.Mounts[i].Destination == "/proc" {
    33  			g.Config.Mounts[i].Options = s.ProcOpts
    34  			return
    35  		}
    36  	}
    37  }
    38  
    39  func setDevOptsReadOnly(g *generate.Generator) {
    40  	for i := range g.Config.Mounts {
    41  		if g.Config.Mounts[i].Destination == "/dev" {
    42  			g.Config.Mounts[i].Options = append(g.Config.Mounts[i].Options, "ro")
    43  			return
    44  		}
    45  	}
    46  }
    47  
    48  // canMountSys is a best-effort heuristic to detect whether mounting a new sysfs is permitted in the container
    49  func canMountSys(isRootless, isNewUserns bool, s *specgen.SpecGenerator) bool {
    50  	if s.NetNS.IsHost() && (isRootless || isNewUserns) {
    51  		return false
    52  	}
    53  	if isNewUserns {
    54  		switch s.NetNS.NSMode {
    55  		case specgen.Slirp, specgen.Pasta, specgen.Private, specgen.NoNetwork, specgen.Bridge:
    56  			return true
    57  		default:
    58  			return false
    59  		}
    60  	}
    61  	return true
    62  }
    63  
    64  func getCgroupPermissions(unmask []string) string {
    65  	ro := "ro"
    66  	rw := "rw"
    67  	cgroup := "/sys/fs/cgroup"
    68  
    69  	cgroupv2, _ := cgroups.IsCgroup2UnifiedMode()
    70  	if !cgroupv2 {
    71  		return ro
    72  	}
    73  
    74  	if len(unmask) != 0 && unmask[0] == "ALL" {
    75  		return rw
    76  	}
    77  
    78  	for _, p := range unmask {
    79  		if path.Clean(p) == cgroup {
    80  			return rw
    81  		}
    82  	}
    83  	return ro
    84  }
    85  
    86  // SpecGenToOCI returns the base configuration for the container.
    87  func SpecGenToOCI(ctx context.Context, s *specgen.SpecGenerator, rt *libpod.Runtime, rtc *config.Config, newImage *libimage.Image, mounts []spec.Mount, pod *libpod.Pod, finalCmd []string, compatibleOptions *libpod.InfraInherit) (*spec.Spec, error) {
    88  	cgroupPerm := getCgroupPermissions(s.Unmask)
    89  
    90  	g, err := generate.New("linux")
    91  	if err != nil {
    92  		return nil, err
    93  	}
    94  	// Remove the default /dev/shm mount to ensure we overwrite it
    95  	g.RemoveMount("/dev/shm")
    96  	g.HostSpecific = true
    97  	addCgroup := true
    98  
    99  	isRootless := rootless.IsRootless()
   100  	isNewUserns := s.UserNS.IsContainer() || s.UserNS.IsPath() || s.UserNS.IsPrivate() || s.UserNS.IsPod() || s.UserNS.IsAuto()
   101  
   102  	canMountSys := canMountSys(isRootless, isNewUserns, s)
   103  
   104  	if s.Privileged && canMountSys {
   105  		cgroupPerm = "rw"
   106  		g.RemoveMount("/sys")
   107  		sysMnt := spec.Mount{
   108  			Destination: "/sys",
   109  			Type:        "sysfs",
   110  			Source:      "sysfs",
   111  			Options:     []string{"rprivate", "nosuid", "noexec", "nodev", "rw"},
   112  		}
   113  		g.AddMount(sysMnt)
   114  	}
   115  	if !canMountSys {
   116  		addCgroup = false
   117  		g.RemoveMount("/sys")
   118  		r := "ro"
   119  		if s.Privileged {
   120  			r = "rw"
   121  		}
   122  		sysMnt := spec.Mount{
   123  			Destination: "/sys",
   124  			Type:        define.TypeBind,
   125  			Source:      "/sys",
   126  			Options:     []string{"rprivate", "nosuid", "noexec", "nodev", r, "rbind"},
   127  		}
   128  		g.AddMount(sysMnt)
   129  		g.RemoveMount("/sys/fs/cgroup")
   130  
   131  		sysFsCgroupMnt := spec.Mount{
   132  			Destination: "/sys/fs/cgroup",
   133  			Type:        "cgroup",
   134  			Source:      "/sys/fs/cgroup",
   135  			Options:     []string{"rprivate", "nosuid", "noexec", "nodev", r},
   136  		}
   137  		g.AddMount(sysFsCgroupMnt)
   138  		if !s.Privileged && isRootless {
   139  			g.AddLinuxMaskedPaths("/sys/kernel")
   140  		}
   141  	}
   142  	gid5Available := true
   143  	if isRootless {
   144  		nGids, err := rootless.GetAvailableGids()
   145  		if err != nil {
   146  			return nil, err
   147  		}
   148  		gid5Available = nGids >= 5
   149  	}
   150  	// When using a different user namespace, check that the GID 5 is mapped inside
   151  	// the container.
   152  	if gid5Available && (s.IDMappings != nil && len(s.IDMappings.GIDMap) > 0) {
   153  		mappingFound := false
   154  		for _, r := range s.IDMappings.GIDMap {
   155  			if r.ContainerID <= 5 && 5 < r.ContainerID+r.Size {
   156  				mappingFound = true
   157  				break
   158  			}
   159  		}
   160  		if !mappingFound {
   161  			gid5Available = false
   162  		}
   163  	}
   164  	if !gid5Available {
   165  		// If we have no GID mappings, the gid=5 default option would fail, so drop it.
   166  		g.RemoveMount("/dev/pts")
   167  		devPts := spec.Mount{
   168  			Destination: "/dev/pts",
   169  			Type:        define.TypeDevpts,
   170  			Source:      define.TypeDevpts,
   171  			Options:     []string{"rprivate", "nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620"},
   172  		}
   173  		g.AddMount(devPts)
   174  	}
   175  
   176  	inUserNS := isRootless || isNewUserns
   177  
   178  	if inUserNS && s.IpcNS.IsHost() {
   179  		g.RemoveMount("/dev/mqueue")
   180  		devMqueue := spec.Mount{
   181  			Destination: "/dev/mqueue",
   182  			Type:        define.TypeBind, // constant ?
   183  			Source:      "/dev/mqueue",
   184  			Options:     []string{define.TypeBind, "nosuid", "noexec", "nodev"},
   185  		}
   186  		g.AddMount(devMqueue)
   187  	}
   188  	if inUserNS && s.PidNS.IsHost() {
   189  		g.RemoveMount("/proc")
   190  		procMount := spec.Mount{
   191  			Destination: "/proc",
   192  			Type:        define.TypeBind,
   193  			Source:      "/proc",
   194  			Options:     []string{"rbind", "nosuid", "noexec", "nodev"},
   195  		}
   196  		g.AddMount(procMount)
   197  	}
   198  
   199  	if addCgroup {
   200  		cgroupMnt := spec.Mount{
   201  			Destination: "/sys/fs/cgroup",
   202  			Type:        "cgroup",
   203  			Source:      "cgroup",
   204  			Options:     []string{"rprivate", "nosuid", "noexec", "nodev", "relatime", cgroupPerm},
   205  		}
   206  		g.AddMount(cgroupMnt)
   207  	}
   208  
   209  	g.Config.Linux.Personality = s.Personality
   210  
   211  	g.SetProcessCwd(s.WorkDir)
   212  
   213  	g.SetProcessArgs(finalCmd)
   214  
   215  	g.SetProcessTerminal(s.Terminal)
   216  
   217  	for key, val := range s.Annotations {
   218  		g.AddAnnotation(key, val)
   219  	}
   220  
   221  	if s.IntelRdt != nil {
   222  		if s.IntelRdt.ClosID != "" {
   223  			g.SetLinuxIntelRdtClosID(s.IntelRdt.ClosID)
   224  		}
   225  	}
   226  
   227  	if s.ResourceLimits != nil {
   228  		out, err := json.Marshal(s.ResourceLimits)
   229  		if err != nil {
   230  			return nil, err
   231  		}
   232  		err = json.Unmarshal(out, g.Config.Linux.Resources)
   233  		if err != nil {
   234  			return nil, err
   235  		}
   236  		g.Config.Linux.Resources = s.ResourceLimits
   237  	}
   238  
   239  	weightDevices, err := WeightDevices(s.WeightDevice)
   240  	if err != nil {
   241  		return nil, err
   242  	}
   243  	if len(weightDevices) > 0 {
   244  		for _, dev := range weightDevices {
   245  			g.AddLinuxResourcesBlockIOWeightDevice(dev.Major, dev.Minor, *dev.Weight)
   246  		}
   247  	}
   248  
   249  	// Devices
   250  	// set the default rule at the beginning of device configuration
   251  	if !inUserNS && !s.Privileged {
   252  		g.AddLinuxResourcesDevice(false, "", nil, nil, "rwm")
   253  	}
   254  
   255  	var userDevices []spec.LinuxDevice
   256  
   257  	if !s.Privileged {
   258  		// add default devices from containers.conf
   259  		for _, device := range rtc.Containers.Devices.Get() {
   260  			if err = DevicesFromPath(&g, device); err != nil {
   261  				return nil, err
   262  			}
   263  		}
   264  		if len(compatibleOptions.HostDeviceList) > 0 && len(s.Devices) == 0 {
   265  			userDevices = compatibleOptions.HostDeviceList
   266  		} else {
   267  			userDevices = s.Devices
   268  		}
   269  		// add default devices specified by caller
   270  		for _, device := range userDevices {
   271  			if err = DevicesFromPath(&g, device.Path); err != nil {
   272  				return nil, err
   273  			}
   274  		}
   275  	}
   276  	s.HostDeviceList = userDevices
   277  
   278  	// set the devices cgroup when not running in a user namespace
   279  	if isRootless && len(s.DeviceCgroupRule) > 0 {
   280  		return nil, fmt.Errorf("device cgroup rules are not supported in rootless mode or in a user namespace")
   281  	}
   282  	if !isRootless && !s.Privileged {
   283  		for _, dev := range s.DeviceCgroupRule {
   284  			g.AddLinuxResourcesDevice(true, dev.Type, dev.Major, dev.Minor, dev.Access)
   285  		}
   286  	}
   287  
   288  	BlockAccessToKernelFilesystems(s.Privileged, s.PidNS.IsHost(), s.Mask, s.Unmask, &g)
   289  
   290  	g.ClearProcessEnv()
   291  	for name, val := range s.Env {
   292  		g.AddProcessEnv(name, val)
   293  	}
   294  
   295  	addRlimits(s, &g)
   296  
   297  	// NAMESPACES
   298  	if err := specConfigureNamespaces(s, &g, rt, pod); err != nil {
   299  		return nil, err
   300  	}
   301  	configSpec := g.Config
   302  
   303  	if err := securityConfigureGenerator(s, &g, newImage, rtc); err != nil {
   304  		return nil, err
   305  	}
   306  
   307  	// BIND MOUNTS
   308  	configSpec.Mounts = SupersedeUserMounts(mounts, configSpec.Mounts)
   309  	// Process mounts to ensure correct options
   310  	if err := InitFSMounts(configSpec.Mounts); err != nil {
   311  		return nil, err
   312  	}
   313  
   314  	// Add annotations
   315  	if configSpec.Annotations == nil {
   316  		configSpec.Annotations = make(map[string]string)
   317  	}
   318  
   319  	if s.Remove {
   320  		configSpec.Annotations[define.InspectAnnotationAutoremove] = define.InspectResponseTrue
   321  	}
   322  
   323  	if len(s.VolumesFrom) > 0 {
   324  		configSpec.Annotations[define.InspectAnnotationVolumesFrom] = strings.Join(s.VolumesFrom, ",")
   325  	}
   326  
   327  	if s.Privileged {
   328  		configSpec.Annotations[define.InspectAnnotationPrivileged] = define.InspectResponseTrue
   329  	}
   330  
   331  	if s.Init {
   332  		configSpec.Annotations[define.InspectAnnotationInit] = define.InspectResponseTrue
   333  	}
   334  
   335  	if s.OOMScoreAdj != nil {
   336  		g.SetProcessOOMScoreAdj(*s.OOMScoreAdj)
   337  	}
   338  
   339  	setProcOpts(s, &g)
   340  	if s.ReadOnlyFilesystem && !s.ReadWriteTmpfs {
   341  		setDevOptsReadOnly(&g)
   342  	}
   343  
   344  	return configSpec, nil
   345  }
   346  
   347  func WeightDevices(wtDevices map[string]spec.LinuxWeightDevice) ([]spec.LinuxWeightDevice, error) {
   348  	devs := []spec.LinuxWeightDevice{}
   349  	for k, v := range wtDevices {
   350  		statT := unix.Stat_t{}
   351  		if err := unix.Stat(k, &statT); err != nil {
   352  			return nil, fmt.Errorf("failed to inspect '%s' in --blkio-weight-device: %w", k, err)
   353  		}
   354  		dev := new(spec.LinuxWeightDevice)
   355  		dev.Major = (int64(unix.Major(uint64(statT.Rdev)))) //nolint: unconvert
   356  		dev.Minor = (int64(unix.Minor(uint64(statT.Rdev)))) //nolint: unconvert
   357  		dev.Weight = v.Weight
   358  		devs = append(devs, *dev)
   359  	}
   360  	return devs, nil
   361  }
   362  
   363  // subNegativeOne translates Hard or soft limits of -1 to the current
   364  // processes Max limit
   365  func subNegativeOne(u spec.POSIXRlimit) spec.POSIXRlimit {
   366  	if !rootless.IsRootless() ||
   367  		(int64(u.Hard) != -1 && int64(u.Soft) != -1) {
   368  		return u
   369  	}
   370  
   371  	ul, err := units.ParseUlimit(fmt.Sprintf("%s=%d:%d", u.Type, int64(u.Soft), int64(u.Hard)))
   372  	if err != nil {
   373  		logrus.Warnf("Failed to check %s ulimit %q", u.Type, err)
   374  		return u
   375  	}
   376  	rl, err := ul.GetRlimit()
   377  	if err != nil {
   378  		logrus.Warnf("Failed to check %s ulimit %q", u.Type, err)
   379  		return u
   380  	}
   381  
   382  	var rlimit unix.Rlimit
   383  
   384  	if err := unix.Getrlimit(rl.Type, &rlimit); err != nil {
   385  		logrus.Warnf("Failed to return RLIMIT_NOFILE ulimit %q", err)
   386  		return u
   387  	}
   388  	if int64(u.Hard) == -1 {
   389  		u.Hard = rlimit.Max
   390  	}
   391  	if int64(u.Soft) == -1 {
   392  		u.Soft = rlimit.Max
   393  	}
   394  	return u
   395  }