github.com/containerd/nerdctl@v1.7.7/pkg/cmd/container/run_linux.go (about)

     1  /*
     2     Copyright The containerd Authors.
     3  
     4     Licensed under the Apache License, Version 2.0 (the "License");
     5     you may not use this file except in compliance with the License.
     6     You may obtain a copy of the License at
     7  
     8         http://www.apache.org/licenses/LICENSE-2.0
     9  
    10     Unless required by applicable law or agreed to in writing, software
    11     distributed under the License is distributed on an "AS IS" BASIS,
    12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13     See the License for the specific language governing permissions and
    14     limitations under the License.
    15  */
    16  
    17  package container
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"strings"
    23  
    24  	"github.com/containerd/containerd"
    25  	"github.com/containerd/containerd/containers"
    26  	"github.com/containerd/containerd/oci"
    27  	"github.com/containerd/log"
    28  	"github.com/containerd/nerdctl/pkg/api/types"
    29  	"github.com/containerd/nerdctl/pkg/bypass4netnsutil"
    30  	"github.com/containerd/nerdctl/pkg/containerutil"
    31  	"github.com/containerd/nerdctl/pkg/idutil/containerwalker"
    32  	"github.com/containerd/nerdctl/pkg/rootlessutil"
    33  	"github.com/containerd/nerdctl/pkg/strutil"
    34  	"github.com/docker/go-units"
    35  	"github.com/moby/sys/userns"
    36  	"github.com/opencontainers/runtime-spec/specs-go"
    37  )
    38  
    39  // WithoutRunMount returns a SpecOpts that unmounts the default tmpfs on "/run"
    40  func WithoutRunMount() func(ctx context.Context, client oci.Client, c *containers.Container, s *oci.Spec) error {
    41  	return oci.WithoutRunMount
    42  }
    43  
    44  func setPlatformOptions(ctx context.Context, client *containerd.Client, id, uts string, internalLabels *internalLabels, options types.ContainerCreateOptions) ([]oci.SpecOpts, error) {
    45  	var opts []oci.SpecOpts
    46  	opts = append(opts,
    47  		oci.WithDefaultUnixDevices,
    48  		WithoutRunMount(), // unmount default tmpfs on "/run": https://github.com/containerd/nerdctl/issues/157)
    49  	)
    50  
    51  	opts = append(opts,
    52  		oci.WithMounts([]specs.Mount{
    53  			{Type: "cgroup", Source: "cgroup", Destination: "/sys/fs/cgroup", Options: []string{"ro", "nosuid", "noexec", "nodev"}},
    54  		}))
    55  
    56  	cgOpts, err := generateCgroupOpts(id, options)
    57  	if err != nil {
    58  		return nil, err
    59  	}
    60  	opts = append(opts, cgOpts...)
    61  
    62  	labelsMap, err := readKVStringsMapfFromLabel(options.Label, options.LabelFile)
    63  	if err != nil {
    64  		return nil, err
    65  	}
    66  
    67  	capOpts, err := generateCapOpts(
    68  		strutil.DedupeStrSlice(options.CapAdd),
    69  		strutil.DedupeStrSlice(options.CapDrop))
    70  	if err != nil {
    71  		return nil, err
    72  	}
    73  	opts = append(opts, capOpts...)
    74  	securityOptsMaps := strutil.ConvertKVStringsToMap(strutil.DedupeStrSlice(options.SecurityOpt))
    75  	secOpts, err := generateSecurityOpts(options.Privileged, securityOptsMaps)
    76  	if err != nil {
    77  		return nil, err
    78  	}
    79  	opts = append(opts, secOpts...)
    80  
    81  	b4nnOpts, err := bypass4netnsutil.GenerateBypass4netnsOpts(securityOptsMaps, labelsMap, id)
    82  	if err != nil {
    83  		return nil, err
    84  	}
    85  	opts = append(opts, b4nnOpts...)
    86  	if len(options.ShmSize) > 0 {
    87  		shmBytes, err := units.RAMInBytes(options.ShmSize)
    88  		if err != nil {
    89  			return nil, err
    90  		}
    91  		opts = append(opts, oci.WithDevShmSize(shmBytes/1024))
    92  	}
    93  
    94  	ulimitOpts, err := generateUlimitsOpts(options.Ulimit)
    95  	if err != nil {
    96  		return nil, err
    97  	}
    98  
    99  	// If without any ulimitOpts, we need to reset the default value from spec
   100  	// which has 1024 as file limit. Make this behavior same as containerd/cri.
   101  	if len(ulimitOpts) == 0 {
   102  		ulimitOpts = append(ulimitOpts, withRlimits(nil))
   103  	}
   104  
   105  	opts = append(opts, ulimitOpts...)
   106  	if options.Sysctl != nil {
   107  		opts = append(opts, WithSysctls(strutil.ConvertKVStringsToMap(options.Sysctl)))
   108  	}
   109  	gpuOpt, err := parseGPUOpts(options.GPUs)
   110  	if err != nil {
   111  		return nil, err
   112  	}
   113  	opts = append(opts, gpuOpt...)
   114  
   115  	if options.RDTClass != "" {
   116  		opts = append(opts, oci.WithRdt(options.RDTClass, "", ""))
   117  	}
   118  
   119  	nsOpts, err := generateNamespaceOpts(ctx, client, uts, internalLabels, options)
   120  	if err != nil {
   121  		return nil, err
   122  	}
   123  	opts = append(opts, nsOpts...)
   124  
   125  	opts, err = setOOMScoreAdj(opts, options.OomScoreAdjChanged, options.OomScoreAdj)
   126  	if err != nil {
   127  		return nil, err
   128  	}
   129  
   130  	return opts, nil
   131  }
   132  
   133  // generateNamespaceOpts help to validate the namespace options exposed via run and return the correct opts.
   134  func generateNamespaceOpts(
   135  	ctx context.Context,
   136  	client *containerd.Client,
   137  	uts string,
   138  	internalLabels *internalLabels,
   139  	options types.ContainerCreateOptions,
   140  ) ([]oci.SpecOpts, error) {
   141  	var opts []oci.SpecOpts
   142  
   143  	switch uts {
   144  	case "host":
   145  		opts = append(opts, oci.WithHostNamespace(specs.UTSNamespace))
   146  	case "":
   147  		// Default, do nothing. Every container gets its own UTS ns by default.
   148  	default:
   149  		return nil, fmt.Errorf("unknown uts value. valid value(s) are 'host', got: %q", uts)
   150  	}
   151  
   152  	switch options.IPC {
   153  	case "host":
   154  		opts = append(opts, oci.WithHostNamespace(specs.IPCNamespace))
   155  		opts = append(opts, withBindMountHostIPC)
   156  	case "private", "":
   157  		// If nothing is specified, or if private, default to normal behavior
   158  	default:
   159  		return nil, fmt.Errorf("unknown ipc value. valid values are 'private' or 'host', got: %q", options.IPC)
   160  	}
   161  
   162  	pidOpts, pidLabel, err := generatePIDOpts(ctx, client, options.Pid)
   163  	if err != nil {
   164  		return nil, err
   165  	}
   166  	internalLabels.pidContainer = pidLabel
   167  	opts = append(opts, pidOpts...)
   168  
   169  	return opts, nil
   170  }
   171  
   172  func generatePIDOpts(ctx context.Context, client *containerd.Client, pid string) ([]oci.SpecOpts, string, error) {
   173  	opts := make([]oci.SpecOpts, 0)
   174  	pid = strings.ToLower(pid)
   175  	var pidInternalLabel string
   176  
   177  	switch pid {
   178  	case "":
   179  		// do nothing
   180  	case "host":
   181  		opts = append(opts, oci.WithHostNamespace(specs.PIDNamespace))
   182  		if rootlessutil.IsRootless() {
   183  			opts = append(opts, containerutil.WithBindMountHostProcfs)
   184  		}
   185  	default: // container:<id|name>
   186  		parsed := strings.Split(pid, ":")
   187  		if len(parsed) < 2 || parsed[0] != "container" {
   188  			return nil, "", fmt.Errorf("invalid pid namespace. Set --pid=[host|container:<name|id>")
   189  		}
   190  
   191  		containerName := parsed[1]
   192  		walker := &containerwalker.ContainerWalker{
   193  			Client: client,
   194  			OnFound: func(ctx context.Context, found containerwalker.Found) error {
   195  				if found.MatchCount > 1 {
   196  					return fmt.Errorf("multiple IDs found with provided prefix: %s", found.Req)
   197  				}
   198  
   199  				o, err := containerutil.GenerateSharingPIDOpts(ctx, found.Container)
   200  				if err != nil {
   201  					return err
   202  				}
   203  				opts = append(opts, o...)
   204  				pidInternalLabel = found.Container.ID()
   205  
   206  				return nil
   207  			},
   208  		}
   209  		matchedCount, err := walker.Walk(ctx, containerName)
   210  		if err != nil {
   211  			return nil, "", err
   212  		}
   213  		if matchedCount < 1 {
   214  			return nil, "", fmt.Errorf("no such container: %s", containerName)
   215  		}
   216  	}
   217  
   218  	return opts, pidInternalLabel, nil
   219  }
   220  
   221  func setOOMScoreAdj(opts []oci.SpecOpts, oomScoreAdjChanged bool, oomScoreAdj int) ([]oci.SpecOpts, error) {
   222  	if !oomScoreAdjChanged {
   223  		return opts, nil
   224  	}
   225  	// score=0 means literally zero, not "unchanged"
   226  	if oomScoreAdj < -1000 || oomScoreAdj > 1000 {
   227  		return nil, fmt.Errorf("invalid value %d, range for oom score adj is [-1000, 1000]", oomScoreAdj)
   228  	}
   229  
   230  	if userns.RunningInUserNS() {
   231  		// > The value of /proc/<pid>/oom_score_adj may be reduced no lower than the last value set by a CAP_SYS_RESOURCE process.
   232  		// > To reduce the value any lower requires CAP_SYS_RESOURCE.
   233  		// https://github.com/torvalds/linux/blob/v6.0/Documentation/filesystems/proc.rst#31-procpidoom_adj--procpidoom_score_adj--adjust-the-oom-killer-score
   234  		//
   235  		// The minimum=100 is from `/proc/$(pgrep -u $(id -u) systemd)/oom_score_adj`
   236  		// (FIXME: find a more robust way to get the current minimum value)
   237  		const minimum = 100
   238  		if oomScoreAdj < minimum {
   239  			log.L.Warnf("Limiting oom_score_adj (%d -> %d)", oomScoreAdj, minimum)
   240  			oomScoreAdj = minimum
   241  		}
   242  	}
   243  
   244  	opts = append(opts, withOOMScoreAdj(oomScoreAdj))
   245  	return opts, nil
   246  }
   247  
   248  func withOOMScoreAdj(score int) oci.SpecOpts {
   249  	return func(_ context.Context, _ oci.Client, _ *containers.Container, s *oci.Spec) error {
   250  		s.Process.OOMScoreAdj = &score
   251  		return nil
   252  	}
   253  }