github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/runsc/specutils/specutils.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package specutils contains utility functions for working with OCI runtime 16 // specs. 17 package specutils 18 19 import ( 20 "encoding/json" 21 "fmt" 22 "io" 23 "io/ioutil" 24 "os" 25 "path" 26 "path/filepath" 27 "strconv" 28 "strings" 29 "time" 30 31 "github.com/cenkalti/backoff" 32 "github.com/mohae/deepcopy" 33 specs "github.com/opencontainers/runtime-spec/specs-go" 34 "golang.org/x/sys/unix" 35 "github.com/SagerNet/gvisor/pkg/abi/linux" 36 "github.com/SagerNet/gvisor/pkg/bits" 37 "github.com/SagerNet/gvisor/pkg/log" 38 "github.com/SagerNet/gvisor/pkg/sentry/kernel/auth" 39 "github.com/SagerNet/gvisor/runsc/config" 40 ) 41 42 // ExePath must point to runsc binary, which is normally the same binary. It's 43 // changed in tests that aren't linked in the same binary. 44 var ExePath = "/proc/self/exe" 45 46 // Version is the supported spec version. 47 var Version = specs.Version 48 49 // LogSpec logs the spec in a human-friendly way. 50 func LogSpec(orig *specs.Spec) { 51 if !log.IsLogging(log.Debug) { 52 return 53 } 54 55 // Strip down parts of the spec that are not interesting. 56 spec := deepcopy.Copy(orig).(*specs.Spec) 57 if spec.Process != nil { 58 spec.Process.Capabilities = nil 59 } 60 if spec.Linux != nil { 61 spec.Linux.Seccomp = nil 62 spec.Linux.MaskedPaths = nil 63 spec.Linux.ReadonlyPaths = nil 64 if spec.Linux.Resources != nil { 65 spec.Linux.Resources.Devices = nil 66 } 67 } 68 69 out, err := json.MarshalIndent(spec, "", " ") 70 if err != nil { 71 log.Debugf("Failed to marshal spec: %v", err) 72 return 73 } 74 log.Debugf("Spec:\n%s", out) 75 } 76 77 // ValidateSpec validates that the spec is compatible with runsc. 78 func ValidateSpec(spec *specs.Spec) error { 79 // Mandatory fields. 80 if spec.Process == nil { 81 return fmt.Errorf("Spec.Process must be defined: %+v", spec) 82 } 83 if len(spec.Process.Args) == 0 { 84 return fmt.Errorf("Spec.Process.Arg must be defined: %+v", spec.Process) 85 } 86 if spec.Root == nil { 87 return fmt.Errorf("Spec.Root must be defined: %+v", spec) 88 } 89 if len(spec.Root.Path) == 0 { 90 return fmt.Errorf("Spec.Root.Path must be defined: %+v", spec.Root) 91 } 92 93 // Unsupported fields. 94 if spec.Solaris != nil { 95 return fmt.Errorf("Spec.Solaris is not supported: %+v", spec) 96 } 97 if spec.Windows != nil { 98 return fmt.Errorf("Spec.Windows is not supported: %+v", spec) 99 } 100 if len(spec.Process.SelinuxLabel) != 0 { 101 return fmt.Errorf("SELinux is not supported: %s", spec.Process.SelinuxLabel) 102 } 103 104 // Docker uses AppArmor by default, so just log that it's being ignored. 105 if spec.Process.ApparmorProfile != "" { 106 log.Warningf("AppArmor profile %q is being ignored", spec.Process.ApparmorProfile) 107 } 108 109 // PR_SET_NO_NEW_PRIVS is assumed to always be set. 110 // See kernel.Task.updateCredsForExecLocked. 111 if !spec.Process.NoNewPrivileges { 112 log.Warningf("noNewPrivileges ignored. PR_SET_NO_NEW_PRIVS is assumed to always be set.") 113 } 114 115 if spec.Linux != nil && spec.Linux.RootfsPropagation != "" { 116 if err := validateRootfsPropagation(spec.Linux.RootfsPropagation); err != nil { 117 return err 118 } 119 } 120 for _, m := range spec.Mounts { 121 if err := validateMount(&m); err != nil { 122 return err 123 } 124 } 125 126 // CRI specifies whether a container should start a new sandbox, or run 127 // another container in an existing sandbox. 128 switch SpecContainerType(spec) { 129 case ContainerTypeContainer: 130 // When starting a container in an existing sandbox, the 131 // sandbox ID must be set. 132 if _, ok := SandboxID(spec); !ok { 133 return fmt.Errorf("spec has container-type of container, but no sandbox ID set") 134 } 135 case ContainerTypeUnknown: 136 return fmt.Errorf("unknown container-type") 137 default: 138 } 139 140 return nil 141 } 142 143 // absPath turns the given path into an absolute path (if it is not already 144 // absolute) by prepending the base path. 145 func absPath(base, rel string) string { 146 if filepath.IsAbs(rel) { 147 return rel 148 } 149 return filepath.Join(base, rel) 150 } 151 152 // OpenSpec opens an OCI runtime spec from the given bundle directory. 153 func OpenSpec(bundleDir string) (*os.File, error) { 154 // The spec file must be named "config.json" inside the bundle directory. 155 return os.Open(filepath.Join(bundleDir, "config.json")) 156 } 157 158 // ReadSpec reads an OCI runtime spec from the given bundle directory. 159 // ReadSpec also normalizes all potential relative paths into absolute 160 // path, e.g. spec.Root.Path, mount.Source. 161 func ReadSpec(bundleDir string, conf *config.Config) (*specs.Spec, error) { 162 specFile, err := OpenSpec(bundleDir) 163 if err != nil { 164 return nil, fmt.Errorf("error opening spec file %q: %v", filepath.Join(bundleDir, "config.json"), err) 165 } 166 defer specFile.Close() 167 return ReadSpecFromFile(bundleDir, specFile, conf) 168 } 169 170 // ReadSpecFromFile reads an OCI runtime spec from the given File, and 171 // normalizes all relative paths into absolute by prepending the bundle dir. 172 func ReadSpecFromFile(bundleDir string, specFile *os.File, conf *config.Config) (*specs.Spec, error) { 173 if _, err := specFile.Seek(0, io.SeekStart); err != nil { 174 return nil, fmt.Errorf("error seeking to beginning of file %q: %v", specFile.Name(), err) 175 } 176 specBytes, err := ioutil.ReadAll(specFile) 177 if err != nil { 178 return nil, fmt.Errorf("error reading spec from file %q: %v", specFile.Name(), err) 179 } 180 var spec specs.Spec 181 if err := json.Unmarshal(specBytes, &spec); err != nil { 182 return nil, fmt.Errorf("error unmarshaling spec from file %q: %v\n %s", specFile.Name(), err, string(specBytes)) 183 } 184 if err := ValidateSpec(&spec); err != nil { 185 return nil, err 186 } 187 // Turn any relative paths in the spec to absolute by prepending the bundleDir. 188 spec.Root.Path = absPath(bundleDir, spec.Root.Path) 189 for i := range spec.Mounts { 190 m := &spec.Mounts[i] 191 if m.Source != "" { 192 m.Source = absPath(bundleDir, m.Source) 193 } 194 } 195 196 // Override flags using annotation to allow customization per sandbox 197 // instance. 198 for annotation, val := range spec.Annotations { 199 const flagPrefix = "dev.gvisor.flag." 200 if strings.HasPrefix(annotation, flagPrefix) { 201 name := annotation[len(flagPrefix):] 202 log.Infof("Overriding flag: %s=%q", name, val) 203 if err := conf.Override(name, val); err != nil { 204 return nil, err 205 } 206 } 207 } 208 209 return &spec, nil 210 } 211 212 // ReadMounts reads mount list from a file. 213 func ReadMounts(f *os.File) ([]specs.Mount, error) { 214 bytes, err := ioutil.ReadAll(f) 215 if err != nil { 216 return nil, fmt.Errorf("error reading mounts: %v", err) 217 } 218 var mounts []specs.Mount 219 if err := json.Unmarshal(bytes, &mounts); err != nil { 220 return nil, fmt.Errorf("error unmarshaling mounts: %v\nJSON bytes:\n%s", err, string(bytes)) 221 } 222 return mounts, nil 223 } 224 225 // Capabilities takes in spec and returns a TaskCapabilities corresponding to 226 // the spec. 227 func Capabilities(enableRaw bool, specCaps *specs.LinuxCapabilities) (*auth.TaskCapabilities, error) { 228 // Strip CAP_NET_RAW from all capability sets if necessary. 229 skipSet := map[linux.Capability]struct{}{} 230 if !enableRaw { 231 skipSet[linux.CAP_NET_RAW] = struct{}{} 232 } 233 234 var caps auth.TaskCapabilities 235 if specCaps != nil { 236 var err error 237 if caps.BoundingCaps, err = capsFromNames(specCaps.Bounding, skipSet); err != nil { 238 return nil, err 239 } 240 if caps.EffectiveCaps, err = capsFromNames(specCaps.Effective, skipSet); err != nil { 241 return nil, err 242 } 243 if caps.InheritableCaps, err = capsFromNames(specCaps.Inheritable, skipSet); err != nil { 244 return nil, err 245 } 246 if caps.PermittedCaps, err = capsFromNames(specCaps.Permitted, skipSet); err != nil { 247 return nil, err 248 } 249 // TODO(github.com/SagerNet/issue/3166): Support ambient capabilities. 250 } 251 return &caps, nil 252 } 253 254 // AllCapabilities returns a LinuxCapabilities struct with all capabilities. 255 func AllCapabilities() *specs.LinuxCapabilities { 256 var names []string 257 for n := range capFromName { 258 names = append(names, n) 259 } 260 return &specs.LinuxCapabilities{ 261 Bounding: names, 262 Effective: names, 263 Inheritable: names, 264 Permitted: names, 265 Ambient: names, 266 } 267 } 268 269 // AllCapabilitiesUint64 returns a bitmask containing all capabilities set. 270 func AllCapabilitiesUint64() uint64 { 271 var rv uint64 272 for _, cap := range capFromName { 273 rv |= bits.MaskOf64(int(cap)) 274 } 275 return rv 276 } 277 278 var capFromName = map[string]linux.Capability{ 279 "CAP_CHOWN": linux.CAP_CHOWN, 280 "CAP_DAC_OVERRIDE": linux.CAP_DAC_OVERRIDE, 281 "CAP_DAC_READ_SEARCH": linux.CAP_DAC_READ_SEARCH, 282 "CAP_FOWNER": linux.CAP_FOWNER, 283 "CAP_FSETID": linux.CAP_FSETID, 284 "CAP_KILL": linux.CAP_KILL, 285 "CAP_SETGID": linux.CAP_SETGID, 286 "CAP_SETUID": linux.CAP_SETUID, 287 "CAP_SETPCAP": linux.CAP_SETPCAP, 288 "CAP_LINUX_IMMUTABLE": linux.CAP_LINUX_IMMUTABLE, 289 "CAP_NET_BIND_SERVICE": linux.CAP_NET_BIND_SERVICE, 290 "CAP_NET_BROADCAST": linux.CAP_NET_BROADCAST, 291 "CAP_NET_ADMIN": linux.CAP_NET_ADMIN, 292 "CAP_NET_RAW": linux.CAP_NET_RAW, 293 "CAP_IPC_LOCK": linux.CAP_IPC_LOCK, 294 "CAP_IPC_OWNER": linux.CAP_IPC_OWNER, 295 "CAP_SYS_MODULE": linux.CAP_SYS_MODULE, 296 "CAP_SYS_RAWIO": linux.CAP_SYS_RAWIO, 297 "CAP_SYS_CHROOT": linux.CAP_SYS_CHROOT, 298 "CAP_SYS_PTRACE": linux.CAP_SYS_PTRACE, 299 "CAP_SYS_PACCT": linux.CAP_SYS_PACCT, 300 "CAP_SYS_ADMIN": linux.CAP_SYS_ADMIN, 301 "CAP_SYS_BOOT": linux.CAP_SYS_BOOT, 302 "CAP_SYS_NICE": linux.CAP_SYS_NICE, 303 "CAP_SYS_RESOURCE": linux.CAP_SYS_RESOURCE, 304 "CAP_SYS_TIME": linux.CAP_SYS_TIME, 305 "CAP_SYS_TTY_CONFIG": linux.CAP_SYS_TTY_CONFIG, 306 "CAP_MKNOD": linux.CAP_MKNOD, 307 "CAP_LEASE": linux.CAP_LEASE, 308 "CAP_AUDIT_WRITE": linux.CAP_AUDIT_WRITE, 309 "CAP_AUDIT_CONTROL": linux.CAP_AUDIT_CONTROL, 310 "CAP_SETFCAP": linux.CAP_SETFCAP, 311 "CAP_MAC_OVERRIDE": linux.CAP_MAC_OVERRIDE, 312 "CAP_MAC_ADMIN": linux.CAP_MAC_ADMIN, 313 "CAP_SYSLOG": linux.CAP_SYSLOG, 314 "CAP_WAKE_ALARM": linux.CAP_WAKE_ALARM, 315 "CAP_BLOCK_SUSPEND": linux.CAP_BLOCK_SUSPEND, 316 "CAP_AUDIT_READ": linux.CAP_AUDIT_READ, 317 } 318 319 func capsFromNames(names []string, skipSet map[linux.Capability]struct{}) (auth.CapabilitySet, error) { 320 var caps []linux.Capability 321 for _, n := range names { 322 c, ok := capFromName[n] 323 if !ok { 324 return 0, fmt.Errorf("unknown capability %q", n) 325 } 326 // Should we skip this capabilty? 327 if _, ok := skipSet[c]; ok { 328 continue 329 } 330 caps = append(caps, c) 331 } 332 return auth.CapabilitySetOfMany(caps), nil 333 } 334 335 // Is9PMount returns true if the given mount can be mounted as an external 336 // gofer. 337 func Is9PMount(m specs.Mount, vfs2Enabled bool) bool { 338 MaybeConvertToBindMount(&m) 339 return m.Type == "bind" && m.Source != "" && IsSupportedDevMount(m, vfs2Enabled) 340 } 341 342 // MaybeConvertToBindMount converts mount type to "bind" in case any of the 343 // mount options are either "bind" or "rbind" as required by the OCI spec. 344 // 345 // "For bind mounts (when options include either bind or rbind), the type is a 346 // dummy, often "none" (not listed in /proc/filesystems)." 347 func MaybeConvertToBindMount(m *specs.Mount) { 348 if m.Type == "bind" { 349 return 350 } 351 for _, opt := range m.Options { 352 if opt == "bind" || opt == "rbind" { 353 m.Type = "bind" 354 return 355 } 356 } 357 } 358 359 // IsSupportedDevMount returns true if m.Destination does not specify a 360 // path that is hardcoded by VFS1's implementation of /dev. 361 func IsSupportedDevMount(m specs.Mount, vfs2Enabled bool) bool { 362 // VFS2 has no hardcoded files under /dev, so everything is allowed. 363 if vfs2Enabled { 364 return true 365 } 366 367 // See pkg/sentry/fs/dev/dev.go. 368 var existingDevices = []string{ 369 "/dev/fd", "/dev/stdin", "/dev/stdout", "/dev/stderr", 370 "/dev/null", "/dev/zero", "/dev/full", "/dev/random", 371 "/dev/urandom", "/dev/shm", "/dev/ptmx", 372 } 373 dst := filepath.Clean(m.Destination) 374 for _, dev := range existingDevices { 375 if dst == dev || strings.HasPrefix(dst, dev+"/") { 376 return false 377 } 378 } 379 return true 380 } 381 382 // WaitForReady waits for a process to become ready. The process is ready when 383 // the 'ready' function returns true. It continues to wait if 'ready' returns 384 // false. It returns error on timeout, if the process stops or if 'ready' fails. 385 func WaitForReady(pid int, timeout time.Duration, ready func() (bool, error)) error { 386 b := backoff.NewExponentialBackOff() 387 b.InitialInterval = 1 * time.Millisecond 388 b.MaxInterval = 1 * time.Second 389 b.MaxElapsedTime = timeout 390 391 op := func() error { 392 if ok, err := ready(); err != nil { 393 return backoff.Permanent(err) 394 } else if ok { 395 return nil 396 } 397 398 // Check if the process is still running. 399 // If the process is alive, child is 0 because of the NOHANG option. 400 // If the process has terminated, child equals the process id. 401 var ws unix.WaitStatus 402 var ru unix.Rusage 403 child, err := unix.Wait4(pid, &ws, unix.WNOHANG, &ru) 404 if err != nil { 405 return backoff.Permanent(fmt.Errorf("error waiting for process: %v", err)) 406 } else if child == pid { 407 return backoff.Permanent(fmt.Errorf("process %d has terminated", pid)) 408 } 409 return fmt.Errorf("process %d not running yet", pid) 410 } 411 return backoff.Retry(op, b) 412 } 413 414 // DebugLogFile opens a log file using 'logPattern' as location. If 'logPattern' 415 // ends with '/', it's used as a directory with default file name. 416 // 'logPattern' can contain variables that are substituted: 417 // - %TIMESTAMP%: is replaced with a timestamp using the following format: 418 // <yyyymmdd-hhmmss.uuuuuu> 419 // - %COMMAND%: is replaced with 'command' 420 // - %TEST%: is replaced with 'test' (omitted by default) 421 func DebugLogFile(logPattern, command, test string) (*os.File, error) { 422 if strings.HasSuffix(logPattern, "/") { 423 // Default format: <debug-log>/runsc.log.<yyyymmdd-hhmmss.uuuuuu>.<command> 424 logPattern += "runsc.log.%TIMESTAMP%.%COMMAND%" 425 } 426 logPattern = strings.Replace(logPattern, "%TIMESTAMP%", time.Now().Format("20060102-150405.000000"), -1) 427 logPattern = strings.Replace(logPattern, "%COMMAND%", command, -1) 428 logPattern = strings.Replace(logPattern, "%TEST%", test, -1) 429 430 dir := filepath.Dir(logPattern) 431 if err := os.MkdirAll(dir, 0775); err != nil { 432 return nil, fmt.Errorf("error creating dir %q: %v", dir, err) 433 } 434 return os.OpenFile(logPattern, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664) 435 } 436 437 // Mount creates the mount point and calls Mount with the given flags. procPath 438 // is the path to procfs. If it is "", procfs is assumed to be mounted at 439 // /proc. 440 func Mount(src, dst, typ string, flags uint32, procPath string) error { 441 // Create the mount point inside. The type must be the same as the 442 // source (file or directory). 443 var isDir bool 444 if typ == "proc" { 445 // Special case, as there is no source directory for proc mounts. 446 isDir = true 447 } else if fi, err := os.Stat(src); err != nil { 448 return fmt.Errorf("stat(%q) failed: %v", src, err) 449 } else { 450 isDir = fi.IsDir() 451 } 452 453 if isDir { 454 // Create the destination directory. 455 if err := os.MkdirAll(dst, 0777); err != nil { 456 return fmt.Errorf("mkdir(%q) failed: %v", dst, err) 457 } 458 } else { 459 // Create the parent destination directory. 460 parent := path.Dir(dst) 461 if err := os.MkdirAll(parent, 0777); err != nil { 462 return fmt.Errorf("mkdir(%q) failed: %v", parent, err) 463 } 464 // Create the destination file if it does not exist. 465 f, err := os.OpenFile(dst, unix.O_CREAT, 0777) 466 if err != nil { 467 return fmt.Errorf("open(%q) failed: %v", dst, err) 468 } 469 f.Close() 470 } 471 472 // Do the mount. 473 if err := SafeMount(src, dst, typ, uintptr(flags), "", procPath); err != nil { 474 return fmt.Errorf("mount(%q, %q, %d) failed: %v", src, dst, flags, err) 475 } 476 return nil 477 } 478 479 // ErrSymlinkMount is returned by SafeMount when the mount destination is found 480 // to be a symlink. 481 type ErrSymlinkMount struct { 482 error 483 } 484 485 // SafeMount is like unix.Mount, but will fail if dst is a symlink. procPath is 486 // the path to procfs. If it is "", procfs is assumed to be mounted at /proc. 487 func SafeMount(src, dst, fstype string, flags uintptr, data, procPath string) error { 488 // Open the destination. 489 fd, err := unix.Open(dst, unix.O_PATH|unix.O_CLOEXEC, 0) 490 if err != nil { 491 return fmt.Errorf("failed to safely mount: Open(%s, _, _): %w", dst, err) 492 } 493 defer unix.Close(fd) 494 495 // Use /proc/self/fd/ to verify that we opened the intended destination. This 496 // guards against dst being a symlink, in which case we could accidentally 497 // mount over the symlink's target. 498 if procPath == "" { 499 procPath = "/proc" 500 } 501 safePath := filepath.Join(procPath, "self/fd", strconv.Itoa(fd)) 502 target, err := os.Readlink(safePath) 503 if err != nil { 504 return fmt.Errorf("failed to safely mount: Readlink(%s): %w", safePath, err) 505 } 506 if dst != target { 507 return &ErrSymlinkMount{fmt.Errorf("failed to safely mount: expected to open %s, but found %s", dst, target)} 508 } 509 510 return unix.Mount(src, safePath, fstype, flags, data) 511 } 512 513 // ContainsStr returns true if 'str' is inside 'strs'. 514 func ContainsStr(strs []string, str string) bool { 515 for _, s := range strs { 516 if s == str { 517 return true 518 } 519 } 520 return false 521 } 522 523 // RetryEintr retries the function until an error different than EINTR is 524 // returned. 525 func RetryEintr(f func() (uintptr, uintptr, error)) (uintptr, uintptr, error) { 526 for { 527 r1, r2, err := f() 528 if err != unix.EINTR { 529 return r1, r2, err 530 } 531 } 532 } 533 534 // GetOOMScoreAdj reads the given process' oom_score_adj 535 func GetOOMScoreAdj(pid int) (int, error) { 536 data, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/oom_score_adj", pid)) 537 if err != nil { 538 return 0, err 539 } 540 return strconv.Atoi(strings.TrimSpace(string(data))) 541 } 542 543 // EnvVar looks for a varible value in the env slice assuming the following 544 // format: "NAME=VALUE". 545 func EnvVar(env []string, name string) (string, bool) { 546 prefix := name + "=" 547 for _, e := range env { 548 if strings.HasPrefix(e, prefix) { 549 return strings.TrimPrefix(e, prefix), true 550 } 551 } 552 return "", false 553 } 554 555 // ResolveEnvs transforms lists of environment variables into a single list of 556 // environment variables. If a variable is defined multiple times, the last 557 // value is used. 558 func ResolveEnvs(envs ...[]string) ([]string, error) { 559 // First create a map of variable names to values. This removes any 560 // duplicates. 561 envMap := make(map[string]string) 562 for _, env := range envs { 563 for _, str := range env { 564 parts := strings.SplitN(str, "=", 2) 565 if len(parts) != 2 { 566 return nil, fmt.Errorf("invalid variable: %s", str) 567 } 568 envMap[parts[0]] = parts[1] 569 } 570 } 571 // Reassemble envMap into a list of environment variables of the form 572 // NAME=VALUE. 573 env := make([]string, 0, len(envMap)) 574 for k, v := range envMap { 575 env = append(env, fmt.Sprintf("%s=%s", k, v)) 576 } 577 return env, nil 578 } 579 580 // FaqErrorMsg returns an error message pointing to the FAQ. 581 func FaqErrorMsg(anchor, msg string) string { 582 return fmt.Sprintf("%s; see https://github.com/SagerNet/faq#%s for more details", msg, anchor) 583 }