github.com/containerd/nerdctl@v1.7.7/pkg/cmd/container/run_linux.go (about) 1 /* 2 Copyright The containerd Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package container 18 19 import ( 20 "context" 21 "fmt" 22 "strings" 23 24 "github.com/containerd/containerd" 25 "github.com/containerd/containerd/containers" 26 "github.com/containerd/containerd/oci" 27 "github.com/containerd/log" 28 "github.com/containerd/nerdctl/pkg/api/types" 29 "github.com/containerd/nerdctl/pkg/bypass4netnsutil" 30 "github.com/containerd/nerdctl/pkg/containerutil" 31 "github.com/containerd/nerdctl/pkg/idutil/containerwalker" 32 "github.com/containerd/nerdctl/pkg/rootlessutil" 33 "github.com/containerd/nerdctl/pkg/strutil" 34 "github.com/docker/go-units" 35 "github.com/moby/sys/userns" 36 "github.com/opencontainers/runtime-spec/specs-go" 37 ) 38 39 // WithoutRunMount returns a SpecOpts that unmounts the default tmpfs on "/run" 40 func WithoutRunMount() func(ctx context.Context, client oci.Client, c *containers.Container, s *oci.Spec) error { 41 return oci.WithoutRunMount 42 } 43 44 func setPlatformOptions(ctx context.Context, client *containerd.Client, id, uts string, internalLabels *internalLabels, options types.ContainerCreateOptions) ([]oci.SpecOpts, error) { 45 var opts []oci.SpecOpts 46 opts = append(opts, 47 oci.WithDefaultUnixDevices, 48 WithoutRunMount(), // unmount default tmpfs on "/run": https://github.com/containerd/nerdctl/issues/157) 49 ) 50 51 opts = append(opts, 52 oci.WithMounts([]specs.Mount{ 53 {Type: "cgroup", Source: "cgroup", Destination: "/sys/fs/cgroup", Options: []string{"ro", "nosuid", "noexec", "nodev"}}, 54 })) 55 56 cgOpts, err := generateCgroupOpts(id, options) 57 if err != nil { 58 return nil, err 59 } 60 opts = append(opts, cgOpts...) 61 62 labelsMap, err := readKVStringsMapfFromLabel(options.Label, options.LabelFile) 63 if err != nil { 64 return nil, err 65 } 66 67 capOpts, err := generateCapOpts( 68 strutil.DedupeStrSlice(options.CapAdd), 69 strutil.DedupeStrSlice(options.CapDrop)) 70 if err != nil { 71 return nil, err 72 } 73 opts = append(opts, capOpts...) 74 securityOptsMaps := strutil.ConvertKVStringsToMap(strutil.DedupeStrSlice(options.SecurityOpt)) 75 secOpts, err := generateSecurityOpts(options.Privileged, securityOptsMaps) 76 if err != nil { 77 return nil, err 78 } 79 opts = append(opts, secOpts...) 80 81 b4nnOpts, err := bypass4netnsutil.GenerateBypass4netnsOpts(securityOptsMaps, labelsMap, id) 82 if err != nil { 83 return nil, err 84 } 85 opts = append(opts, b4nnOpts...) 86 if len(options.ShmSize) > 0 { 87 shmBytes, err := units.RAMInBytes(options.ShmSize) 88 if err != nil { 89 return nil, err 90 } 91 opts = append(opts, oci.WithDevShmSize(shmBytes/1024)) 92 } 93 94 ulimitOpts, err := generateUlimitsOpts(options.Ulimit) 95 if err != nil { 96 return nil, err 97 } 98 99 // If without any ulimitOpts, we need to reset the default value from spec 100 // which has 1024 as file limit. Make this behavior same as containerd/cri. 101 if len(ulimitOpts) == 0 { 102 ulimitOpts = append(ulimitOpts, withRlimits(nil)) 103 } 104 105 opts = append(opts, ulimitOpts...) 106 if options.Sysctl != nil { 107 opts = append(opts, WithSysctls(strutil.ConvertKVStringsToMap(options.Sysctl))) 108 } 109 gpuOpt, err := parseGPUOpts(options.GPUs) 110 if err != nil { 111 return nil, err 112 } 113 opts = append(opts, gpuOpt...) 114 115 if options.RDTClass != "" { 116 opts = append(opts, oci.WithRdt(options.RDTClass, "", "")) 117 } 118 119 nsOpts, err := generateNamespaceOpts(ctx, client, uts, internalLabels, options) 120 if err != nil { 121 return nil, err 122 } 123 opts = append(opts, nsOpts...) 124 125 opts, err = setOOMScoreAdj(opts, options.OomScoreAdjChanged, options.OomScoreAdj) 126 if err != nil { 127 return nil, err 128 } 129 130 return opts, nil 131 } 132 133 // generateNamespaceOpts help to validate the namespace options exposed via run and return the correct opts. 134 func generateNamespaceOpts( 135 ctx context.Context, 136 client *containerd.Client, 137 uts string, 138 internalLabels *internalLabels, 139 options types.ContainerCreateOptions, 140 ) ([]oci.SpecOpts, error) { 141 var opts []oci.SpecOpts 142 143 switch uts { 144 case "host": 145 opts = append(opts, oci.WithHostNamespace(specs.UTSNamespace)) 146 case "": 147 // Default, do nothing. Every container gets its own UTS ns by default. 148 default: 149 return nil, fmt.Errorf("unknown uts value. valid value(s) are 'host', got: %q", uts) 150 } 151 152 switch options.IPC { 153 case "host": 154 opts = append(opts, oci.WithHostNamespace(specs.IPCNamespace)) 155 opts = append(opts, withBindMountHostIPC) 156 case "private", "": 157 // If nothing is specified, or if private, default to normal behavior 158 default: 159 return nil, fmt.Errorf("unknown ipc value. valid values are 'private' or 'host', got: %q", options.IPC) 160 } 161 162 pidOpts, pidLabel, err := generatePIDOpts(ctx, client, options.Pid) 163 if err != nil { 164 return nil, err 165 } 166 internalLabels.pidContainer = pidLabel 167 opts = append(opts, pidOpts...) 168 169 return opts, nil 170 } 171 172 func generatePIDOpts(ctx context.Context, client *containerd.Client, pid string) ([]oci.SpecOpts, string, error) { 173 opts := make([]oci.SpecOpts, 0) 174 pid = strings.ToLower(pid) 175 var pidInternalLabel string 176 177 switch pid { 178 case "": 179 // do nothing 180 case "host": 181 opts = append(opts, oci.WithHostNamespace(specs.PIDNamespace)) 182 if rootlessutil.IsRootless() { 183 opts = append(opts, containerutil.WithBindMountHostProcfs) 184 } 185 default: // container:<id|name> 186 parsed := strings.Split(pid, ":") 187 if len(parsed) < 2 || parsed[0] != "container" { 188 return nil, "", fmt.Errorf("invalid pid namespace. Set --pid=[host|container:<name|id>") 189 } 190 191 containerName := parsed[1] 192 walker := &containerwalker.ContainerWalker{ 193 Client: client, 194 OnFound: func(ctx context.Context, found containerwalker.Found) error { 195 if found.MatchCount > 1 { 196 return fmt.Errorf("multiple IDs found with provided prefix: %s", found.Req) 197 } 198 199 o, err := containerutil.GenerateSharingPIDOpts(ctx, found.Container) 200 if err != nil { 201 return err 202 } 203 opts = append(opts, o...) 204 pidInternalLabel = found.Container.ID() 205 206 return nil 207 }, 208 } 209 matchedCount, err := walker.Walk(ctx, containerName) 210 if err != nil { 211 return nil, "", err 212 } 213 if matchedCount < 1 { 214 return nil, "", fmt.Errorf("no such container: %s", containerName) 215 } 216 } 217 218 return opts, pidInternalLabel, nil 219 } 220 221 func setOOMScoreAdj(opts []oci.SpecOpts, oomScoreAdjChanged bool, oomScoreAdj int) ([]oci.SpecOpts, error) { 222 if !oomScoreAdjChanged { 223 return opts, nil 224 } 225 // score=0 means literally zero, not "unchanged" 226 if oomScoreAdj < -1000 || oomScoreAdj > 1000 { 227 return nil, fmt.Errorf("invalid value %d, range for oom score adj is [-1000, 1000]", oomScoreAdj) 228 } 229 230 if userns.RunningInUserNS() { 231 // > The value of /proc/<pid>/oom_score_adj may be reduced no lower than the last value set by a CAP_SYS_RESOURCE process. 232 // > To reduce the value any lower requires CAP_SYS_RESOURCE. 233 // https://github.com/torvalds/linux/blob/v6.0/Documentation/filesystems/proc.rst#31-procpidoom_adj--procpidoom_score_adj--adjust-the-oom-killer-score 234 // 235 // The minimum=100 is from `/proc/$(pgrep -u $(id -u) systemd)/oom_score_adj` 236 // (FIXME: find a more robust way to get the current minimum value) 237 const minimum = 100 238 if oomScoreAdj < minimum { 239 log.L.Warnf("Limiting oom_score_adj (%d -> %d)", oomScoreAdj, minimum) 240 oomScoreAdj = minimum 241 } 242 } 243 244 opts = append(opts, withOOMScoreAdj(oomScoreAdj)) 245 return opts, nil 246 } 247 248 func withOOMScoreAdj(score int) oci.SpecOpts { 249 return func(_ context.Context, _ oci.Client, _ *containers.Container, s *oci.Spec) error { 250 s.Process.OOMScoreAdj = &score 251 return nil 252 } 253 }