github.com/containerd/nerdctl/v2@v2.0.0-beta.5.0.20240520001846-b5758f54fa28/pkg/cmd/container/run_linux.go (about) 1 /* 2 Copyright The containerd Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package container 18 19 import ( 20 "context" 21 "fmt" 22 "strings" 23 24 "github.com/containerd/containerd" 25 "github.com/containerd/containerd/containers" 26 "github.com/containerd/containerd/oci" 27 "github.com/containerd/containerd/pkg/userns" 28 "github.com/containerd/log" 29 "github.com/containerd/nerdctl/v2/pkg/api/types" 30 "github.com/containerd/nerdctl/v2/pkg/bypass4netnsutil" 31 "github.com/containerd/nerdctl/v2/pkg/containerutil" 32 "github.com/containerd/nerdctl/v2/pkg/idutil/containerwalker" 33 "github.com/containerd/nerdctl/v2/pkg/ipcutil" 34 "github.com/containerd/nerdctl/v2/pkg/rootlessutil" 35 "github.com/containerd/nerdctl/v2/pkg/strutil" 36 "github.com/opencontainers/runtime-spec/specs-go" 37 ) 38 39 // WithoutRunMount returns a SpecOpts that unmounts the default tmpfs on "/run" 40 func WithoutRunMount() func(ctx context.Context, client oci.Client, c *containers.Container, s *oci.Spec) error { 41 return oci.WithoutRunMount 42 } 43 44 func setPlatformOptions(ctx context.Context, client *containerd.Client, id, uts string, internalLabels *internalLabels, options types.ContainerCreateOptions) ([]oci.SpecOpts, error) { 45 var opts []oci.SpecOpts 46 opts = append(opts, 47 oci.WithDefaultUnixDevices, 48 WithoutRunMount(), // unmount default tmpfs on "/run": https://github.com/containerd/nerdctl/issues/157) 49 ) 50 51 opts = append(opts, 52 oci.WithMounts([]specs.Mount{ 53 {Type: "cgroup", Source: "cgroup", Destination: "/sys/fs/cgroup", Options: []string{"ro", "nosuid", "noexec", "nodev"}}, 54 })) 55 56 cgOpts, err := generateCgroupOpts(id, options) 57 if err != nil { 58 return nil, err 59 } 60 opts = append(opts, cgOpts...) 61 62 annotations := strutil.ConvertKVStringsToMap(options.Annotations) 63 64 capOpts, err := generateCapOpts( 65 strutil.DedupeStrSlice(options.CapAdd), 66 strutil.DedupeStrSlice(options.CapDrop)) 67 if err != nil { 68 return nil, err 69 } 70 opts = append(opts, capOpts...) 71 securityOptsMaps := strutil.ConvertKVStringsToMap(strutil.DedupeStrSlice(options.SecurityOpt)) 72 secOpts, err := generateSecurityOpts(options.Privileged, securityOptsMaps) 73 if err != nil { 74 return nil, err 75 } 76 opts = append(opts, secOpts...) 77 78 b4nnOpts, err := bypass4netnsutil.GenerateBypass4netnsOpts(securityOptsMaps, annotations, id) 79 if err != nil { 80 return nil, err 81 } 82 opts = append(opts, b4nnOpts...) 83 84 ulimitOpts, err := generateUlimitsOpts(options.Ulimit) 85 if err != nil { 86 return nil, err 87 } 88 89 // If without any ulimitOpts, we need to reset the default value from spec 90 // which has 1024 as file limit. Make this behavior same as containerd/cri. 91 if len(ulimitOpts) == 0 { 92 ulimitOpts = append(ulimitOpts, withRlimits(nil)) 93 } 94 95 opts = append(opts, ulimitOpts...) 96 if options.Sysctl != nil { 97 opts = append(opts, WithSysctls(strutil.ConvertKVStringsToMap(options.Sysctl))) 98 } 99 gpuOpt, err := parseGPUOpts(options.GPUs) 100 if err != nil { 101 return nil, err 102 } 103 opts = append(opts, gpuOpt...) 104 105 if options.RDTClass != "" { 106 opts = append(opts, oci.WithRdt(options.RDTClass, "", "")) 107 } 108 109 nsOpts, err := generateNamespaceOpts(ctx, client, uts, internalLabels, options) 110 if err != nil { 111 return nil, err 112 } 113 opts = append(opts, nsOpts...) 114 115 opts, err = setOOMScoreAdj(opts, options.OomScoreAdjChanged, options.OomScoreAdj) 116 if err != nil { 117 return nil, err 118 } 119 120 return opts, nil 121 } 122 123 // generateNamespaceOpts help to validate the namespace options exposed via run and return the correct opts. 124 func generateNamespaceOpts( 125 ctx context.Context, 126 client *containerd.Client, 127 uts string, 128 internalLabels *internalLabels, 129 options types.ContainerCreateOptions, 130 ) ([]oci.SpecOpts, error) { 131 var opts []oci.SpecOpts 132 133 switch uts { 134 case "host": 135 opts = append(opts, oci.WithHostNamespace(specs.UTSNamespace)) 136 case "": 137 // Default, do nothing. Every container gets its own UTS ns by default. 138 default: 139 return nil, fmt.Errorf("unknown uts value. valid value(s) are 'host', got: %q", uts) 140 } 141 142 stateDir := internalLabels.stateDir 143 ipcOpts, ipcLabel, err := generateIPCOpts(ctx, client, options.IPC, options.ShmSize, stateDir) 144 if err != nil { 145 return nil, err 146 } 147 internalLabels.ipc = ipcLabel 148 opts = append(opts, ipcOpts...) 149 150 pidOpts, pidLabel, err := generatePIDOpts(ctx, client, options.Pid) 151 if err != nil { 152 return nil, err 153 } 154 internalLabels.pidContainer = pidLabel 155 opts = append(opts, pidOpts...) 156 157 return opts, nil 158 } 159 160 func generateIPCOpts(ctx context.Context, client *containerd.Client, ipcFlag string, shmSize string, stateDir string) ([]oci.SpecOpts, string, error) { 161 ipcFlag = strings.ToLower(ipcFlag) 162 163 ipc, err := ipcutil.DetectFlags(ctx, client, stateDir, ipcFlag, shmSize) 164 if err != nil { 165 return nil, "", err 166 } 167 ipcLabel, err := ipcutil.EncodeIPCLabel(ipc) 168 if err != nil { 169 return nil, "", err 170 } 171 opts, err := ipcutil.GenerateIPCOpts(ctx, ipc, client) 172 if err != nil { 173 return nil, "", err 174 } 175 176 return opts, ipcLabel, nil 177 } 178 179 func generatePIDOpts(ctx context.Context, client *containerd.Client, pid string) ([]oci.SpecOpts, string, error) { 180 opts := make([]oci.SpecOpts, 0) 181 pid = strings.ToLower(pid) 182 var pidInternalLabel string 183 184 switch pid { 185 case "": 186 // do nothing 187 case "host": 188 opts = append(opts, oci.WithHostNamespace(specs.PIDNamespace)) 189 if rootlessutil.IsRootless() { 190 opts = append(opts, containerutil.WithBindMountHostProcfs) 191 } 192 default: // container:<id|name> 193 parsed := strings.Split(pid, ":") 194 if len(parsed) < 2 || parsed[0] != "container" { 195 return nil, "", fmt.Errorf("invalid pid namespace. Set --pid=[host|container:<name|id>") 196 } 197 198 containerName := parsed[1] 199 walker := &containerwalker.ContainerWalker{ 200 Client: client, 201 OnFound: func(ctx context.Context, found containerwalker.Found) error { 202 if found.MatchCount > 1 { 203 return fmt.Errorf("multiple IDs found with provided prefix: %s", found.Req) 204 } 205 206 o, err := containerutil.GenerateSharingPIDOpts(ctx, found.Container) 207 if err != nil { 208 return err 209 } 210 opts = append(opts, o...) 211 pidInternalLabel = found.Container.ID() 212 213 return nil 214 }, 215 } 216 matchedCount, err := walker.Walk(ctx, containerName) 217 if err != nil { 218 return nil, "", err 219 } 220 if matchedCount < 1 { 221 return nil, "", fmt.Errorf("no such container: %s", containerName) 222 } 223 } 224 225 return opts, pidInternalLabel, nil 226 } 227 228 func setOOMScoreAdj(opts []oci.SpecOpts, oomScoreAdjChanged bool, oomScoreAdj int) ([]oci.SpecOpts, error) { 229 if !oomScoreAdjChanged { 230 return opts, nil 231 } 232 // score=0 means literally zero, not "unchanged" 233 if oomScoreAdj < -1000 || oomScoreAdj > 1000 { 234 return nil, fmt.Errorf("invalid value %d, range for oom score adj is [-1000, 1000]", oomScoreAdj) 235 } 236 237 if userns.RunningInUserNS() { 238 // > The value of /proc/<pid>/oom_score_adj may be reduced no lower than the last value set by a CAP_SYS_RESOURCE process. 239 // > To reduce the value any lower requires CAP_SYS_RESOURCE. 240 // https://github.com/torvalds/linux/blob/v6.0/Documentation/filesystems/proc.rst#31-procpidoom_adj--procpidoom_score_adj--adjust-the-oom-killer-score 241 // 242 // The minimum=100 is from `/proc/$(pgrep -u $(id -u) systemd)/oom_score_adj` 243 // (FIXME: find a more robust way to get the current minimum value) 244 const minimum = 100 245 if oomScoreAdj < minimum { 246 log.L.Warnf("Limiting oom_score_adj (%d -> %d)", oomScoreAdj, minimum) 247 oomScoreAdj = minimum 248 } 249 } 250 251 opts = append(opts, withOOMScoreAdj(oomScoreAdj)) 252 return opts, nil 253 } 254 255 func withOOMScoreAdj(score int) oci.SpecOpts { 256 return func(_ context.Context, _ oci.Client, _ *containers.Container, s *oci.Spec) error { 257 s.Process.OOMScoreAdj = &score 258 return nil 259 } 260 }