github.com/containerd/nerdctl/v2@v2.0.0-beta.5.0.20240520001846-b5758f54fa28/pkg/cmd/container/run_linux.go (about)

     1  /*
     2     Copyright The containerd Authors.
     3  
     4     Licensed under the Apache License, Version 2.0 (the "License");
     5     you may not use this file except in compliance with the License.
     6     You may obtain a copy of the License at
     7  
     8         http://www.apache.org/licenses/LICENSE-2.0
     9  
    10     Unless required by applicable law or agreed to in writing, software
    11     distributed under the License is distributed on an "AS IS" BASIS,
    12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13     See the License for the specific language governing permissions and
    14     limitations under the License.
    15  */
    16  
    17  package container
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"strings"
    23  
    24  	"github.com/containerd/containerd"
    25  	"github.com/containerd/containerd/containers"
    26  	"github.com/containerd/containerd/oci"
    27  	"github.com/containerd/containerd/pkg/userns"
    28  	"github.com/containerd/log"
    29  	"github.com/containerd/nerdctl/v2/pkg/api/types"
    30  	"github.com/containerd/nerdctl/v2/pkg/bypass4netnsutil"
    31  	"github.com/containerd/nerdctl/v2/pkg/containerutil"
    32  	"github.com/containerd/nerdctl/v2/pkg/idutil/containerwalker"
    33  	"github.com/containerd/nerdctl/v2/pkg/ipcutil"
    34  	"github.com/containerd/nerdctl/v2/pkg/rootlessutil"
    35  	"github.com/containerd/nerdctl/v2/pkg/strutil"
    36  	"github.com/opencontainers/runtime-spec/specs-go"
    37  )
    38  
    39  // WithoutRunMount returns a SpecOpts that unmounts the default tmpfs on "/run"
    40  func WithoutRunMount() func(ctx context.Context, client oci.Client, c *containers.Container, s *oci.Spec) error {
    41  	return oci.WithoutRunMount
    42  }
    43  
    44  func setPlatformOptions(ctx context.Context, client *containerd.Client, id, uts string, internalLabels *internalLabels, options types.ContainerCreateOptions) ([]oci.SpecOpts, error) {
    45  	var opts []oci.SpecOpts
    46  	opts = append(opts,
    47  		oci.WithDefaultUnixDevices,
    48  		WithoutRunMount(), // unmount default tmpfs on "/run": https://github.com/containerd/nerdctl/issues/157)
    49  	)
    50  
    51  	opts = append(opts,
    52  		oci.WithMounts([]specs.Mount{
    53  			{Type: "cgroup", Source: "cgroup", Destination: "/sys/fs/cgroup", Options: []string{"ro", "nosuid", "noexec", "nodev"}},
    54  		}))
    55  
    56  	cgOpts, err := generateCgroupOpts(id, options)
    57  	if err != nil {
    58  		return nil, err
    59  	}
    60  	opts = append(opts, cgOpts...)
    61  
    62  	annotations := strutil.ConvertKVStringsToMap(options.Annotations)
    63  
    64  	capOpts, err := generateCapOpts(
    65  		strutil.DedupeStrSlice(options.CapAdd),
    66  		strutil.DedupeStrSlice(options.CapDrop))
    67  	if err != nil {
    68  		return nil, err
    69  	}
    70  	opts = append(opts, capOpts...)
    71  	securityOptsMaps := strutil.ConvertKVStringsToMap(strutil.DedupeStrSlice(options.SecurityOpt))
    72  	secOpts, err := generateSecurityOpts(options.Privileged, securityOptsMaps)
    73  	if err != nil {
    74  		return nil, err
    75  	}
    76  	opts = append(opts, secOpts...)
    77  
    78  	b4nnOpts, err := bypass4netnsutil.GenerateBypass4netnsOpts(securityOptsMaps, annotations, id)
    79  	if err != nil {
    80  		return nil, err
    81  	}
    82  	opts = append(opts, b4nnOpts...)
    83  
    84  	ulimitOpts, err := generateUlimitsOpts(options.Ulimit)
    85  	if err != nil {
    86  		return nil, err
    87  	}
    88  
    89  	// If without any ulimitOpts, we need to reset the default value from spec
    90  	// which has 1024 as file limit. Make this behavior same as containerd/cri.
    91  	if len(ulimitOpts) == 0 {
    92  		ulimitOpts = append(ulimitOpts, withRlimits(nil))
    93  	}
    94  
    95  	opts = append(opts, ulimitOpts...)
    96  	if options.Sysctl != nil {
    97  		opts = append(opts, WithSysctls(strutil.ConvertKVStringsToMap(options.Sysctl)))
    98  	}
    99  	gpuOpt, err := parseGPUOpts(options.GPUs)
   100  	if err != nil {
   101  		return nil, err
   102  	}
   103  	opts = append(opts, gpuOpt...)
   104  
   105  	if options.RDTClass != "" {
   106  		opts = append(opts, oci.WithRdt(options.RDTClass, "", ""))
   107  	}
   108  
   109  	nsOpts, err := generateNamespaceOpts(ctx, client, uts, internalLabels, options)
   110  	if err != nil {
   111  		return nil, err
   112  	}
   113  	opts = append(opts, nsOpts...)
   114  
   115  	opts, err = setOOMScoreAdj(opts, options.OomScoreAdjChanged, options.OomScoreAdj)
   116  	if err != nil {
   117  		return nil, err
   118  	}
   119  
   120  	return opts, nil
   121  }
   122  
   123  // generateNamespaceOpts help to validate the namespace options exposed via run and return the correct opts.
   124  func generateNamespaceOpts(
   125  	ctx context.Context,
   126  	client *containerd.Client,
   127  	uts string,
   128  	internalLabels *internalLabels,
   129  	options types.ContainerCreateOptions,
   130  ) ([]oci.SpecOpts, error) {
   131  	var opts []oci.SpecOpts
   132  
   133  	switch uts {
   134  	case "host":
   135  		opts = append(opts, oci.WithHostNamespace(specs.UTSNamespace))
   136  	case "":
   137  		// Default, do nothing. Every container gets its own UTS ns by default.
   138  	default:
   139  		return nil, fmt.Errorf("unknown uts value. valid value(s) are 'host', got: %q", uts)
   140  	}
   141  
   142  	stateDir := internalLabels.stateDir
   143  	ipcOpts, ipcLabel, err := generateIPCOpts(ctx, client, options.IPC, options.ShmSize, stateDir)
   144  	if err != nil {
   145  		return nil, err
   146  	}
   147  	internalLabels.ipc = ipcLabel
   148  	opts = append(opts, ipcOpts...)
   149  
   150  	pidOpts, pidLabel, err := generatePIDOpts(ctx, client, options.Pid)
   151  	if err != nil {
   152  		return nil, err
   153  	}
   154  	internalLabels.pidContainer = pidLabel
   155  	opts = append(opts, pidOpts...)
   156  
   157  	return opts, nil
   158  }
   159  
   160  func generateIPCOpts(ctx context.Context, client *containerd.Client, ipcFlag string, shmSize string, stateDir string) ([]oci.SpecOpts, string, error) {
   161  	ipcFlag = strings.ToLower(ipcFlag)
   162  
   163  	ipc, err := ipcutil.DetectFlags(ctx, client, stateDir, ipcFlag, shmSize)
   164  	if err != nil {
   165  		return nil, "", err
   166  	}
   167  	ipcLabel, err := ipcutil.EncodeIPCLabel(ipc)
   168  	if err != nil {
   169  		return nil, "", err
   170  	}
   171  	opts, err := ipcutil.GenerateIPCOpts(ctx, ipc, client)
   172  	if err != nil {
   173  		return nil, "", err
   174  	}
   175  
   176  	return opts, ipcLabel, nil
   177  }
   178  
   179  func generatePIDOpts(ctx context.Context, client *containerd.Client, pid string) ([]oci.SpecOpts, string, error) {
   180  	opts := make([]oci.SpecOpts, 0)
   181  	pid = strings.ToLower(pid)
   182  	var pidInternalLabel string
   183  
   184  	switch pid {
   185  	case "":
   186  		// do nothing
   187  	case "host":
   188  		opts = append(opts, oci.WithHostNamespace(specs.PIDNamespace))
   189  		if rootlessutil.IsRootless() {
   190  			opts = append(opts, containerutil.WithBindMountHostProcfs)
   191  		}
   192  	default: // container:<id|name>
   193  		parsed := strings.Split(pid, ":")
   194  		if len(parsed) < 2 || parsed[0] != "container" {
   195  			return nil, "", fmt.Errorf("invalid pid namespace. Set --pid=[host|container:<name|id>")
   196  		}
   197  
   198  		containerName := parsed[1]
   199  		walker := &containerwalker.ContainerWalker{
   200  			Client: client,
   201  			OnFound: func(ctx context.Context, found containerwalker.Found) error {
   202  				if found.MatchCount > 1 {
   203  					return fmt.Errorf("multiple IDs found with provided prefix: %s", found.Req)
   204  				}
   205  
   206  				o, err := containerutil.GenerateSharingPIDOpts(ctx, found.Container)
   207  				if err != nil {
   208  					return err
   209  				}
   210  				opts = append(opts, o...)
   211  				pidInternalLabel = found.Container.ID()
   212  
   213  				return nil
   214  			},
   215  		}
   216  		matchedCount, err := walker.Walk(ctx, containerName)
   217  		if err != nil {
   218  			return nil, "", err
   219  		}
   220  		if matchedCount < 1 {
   221  			return nil, "", fmt.Errorf("no such container: %s", containerName)
   222  		}
   223  	}
   224  
   225  	return opts, pidInternalLabel, nil
   226  }
   227  
   228  func setOOMScoreAdj(opts []oci.SpecOpts, oomScoreAdjChanged bool, oomScoreAdj int) ([]oci.SpecOpts, error) {
   229  	if !oomScoreAdjChanged {
   230  		return opts, nil
   231  	}
   232  	// score=0 means literally zero, not "unchanged"
   233  	if oomScoreAdj < -1000 || oomScoreAdj > 1000 {
   234  		return nil, fmt.Errorf("invalid value %d, range for oom score adj is [-1000, 1000]", oomScoreAdj)
   235  	}
   236  
   237  	if userns.RunningInUserNS() {
   238  		// > The value of /proc/<pid>/oom_score_adj may be reduced no lower than the last value set by a CAP_SYS_RESOURCE process.
   239  		// > To reduce the value any lower requires CAP_SYS_RESOURCE.
   240  		// https://github.com/torvalds/linux/blob/v6.0/Documentation/filesystems/proc.rst#31-procpidoom_adj--procpidoom_score_adj--adjust-the-oom-killer-score
   241  		//
   242  		// The minimum=100 is from `/proc/$(pgrep -u $(id -u) systemd)/oom_score_adj`
   243  		// (FIXME: find a more robust way to get the current minimum value)
   244  		const minimum = 100
   245  		if oomScoreAdj < minimum {
   246  			log.L.Warnf("Limiting oom_score_adj (%d -> %d)", oomScoreAdj, minimum)
   247  			oomScoreAdj = minimum
   248  		}
   249  	}
   250  
   251  	opts = append(opts, withOOMScoreAdj(oomScoreAdj))
   252  	return opts, nil
   253  }
   254  
   255  func withOOMScoreAdj(score int) oci.SpecOpts {
   256  	return func(_ context.Context, _ oci.Client, _ *containers.Container, s *oci.Spec) error {
   257  		s.Process.OOMScoreAdj = &score
   258  		return nil
   259  	}
   260  }