github.com/rkt/rkt@v1.30.1-0.20200224141603-171c416fac02/common/cgroup/v1/cgroup.go (about)

     1  // Copyright 2016 The rkt Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  //+build linux
    16  
    17  package v1
    18  
    19  import (
    20  	"bufio"
    21  	"errors"
    22  	"fmt"
    23  	"io"
    24  	"io/ioutil"
    25  	"os"
    26  	"path/filepath"
    27  	"strconv"
    28  	"strings"
    29  	"syscall"
    30  
    31  	"github.com/hashicorp/errwrap"
    32  	"github.com/rkt/rkt/pkg/fs"
    33  )
    34  
    35  // mountFsRO remounts the given mountPoint using the given flags read-only.
    36  func mountFsRO(m fs.Mounter, mountPoint string, flags uintptr) error {
    37  	flags = flags |
    38  		syscall.MS_BIND |
    39  		syscall.MS_REMOUNT |
    40  		syscall.MS_RDONLY
    41  
    42  	if err := m.Mount(mountPoint, mountPoint, "", flags, ""); err != nil {
    43  		return errwrap.Wrap(fmt.Errorf("error remounting read-only %q", mountPoint), err)
    44  	}
    45  
    46  	return nil
    47  }
    48  
    49  func parseCgroups(f io.Reader) (map[int][]string, error) {
    50  	sc := bufio.NewScanner(f)
    51  
    52  	// skip first line since it is a comment
    53  	sc.Scan()
    54  
    55  	cgroups := make(map[int][]string)
    56  	for sc.Scan() {
    57  		var controller string
    58  		var hierarchy int
    59  		var num int
    60  		var enabled int
    61  		fmt.Sscanf(sc.Text(), "%s %d %d %d", &controller, &hierarchy, &num, &enabled)
    62  
    63  		if enabled == 1 {
    64  			if _, ok := cgroups[hierarchy]; !ok {
    65  				cgroups[hierarchy] = []string{controller}
    66  			} else {
    67  				cgroups[hierarchy] = append(cgroups[hierarchy], controller)
    68  			}
    69  		}
    70  	}
    71  
    72  	if err := sc.Err(); err != nil {
    73  		return nil, err
    74  	}
    75  
    76  	return cgroups, nil
    77  }
    78  
    79  // GetEnabledCgroups returns a map with the enabled cgroup controllers grouped by
    80  // hierarchy
    81  func GetEnabledCgroups() (map[int][]string, error) {
    82  	cgroupsFile, err := os.Open("/proc/cgroups")
    83  	if err != nil {
    84  		return nil, err
    85  	}
    86  	defer cgroupsFile.Close()
    87  
    88  	cgroups, err := parseCgroups(cgroupsFile)
    89  	if err != nil {
    90  		return nil, errwrap.Wrap(errors.New("error parsing /proc/cgroups"), err)
    91  	}
    92  
    93  	return cgroups, nil
    94  }
    95  
    96  // GetControllerDirs takes a map with the enabled cgroup controllers grouped by
    97  // hierarchy and returns the directory names as they should be in
    98  // /sys/fs/cgroup
    99  func GetControllerDirs(cgroups map[int][]string) []string {
   100  	var controllers []string
   101  	for _, cs := range cgroups {
   102  		controllers = append(controllers, strings.Join(cs, ","))
   103  	}
   104  
   105  	return controllers
   106  }
   107  
   108  func getControllerSymlinks(cgroups map[int][]string) map[string]string {
   109  	symlinks := make(map[string]string)
   110  
   111  	for _, cs := range cgroups {
   112  		if len(cs) > 1 {
   113  			tgt := strings.Join(cs, ",")
   114  			for _, ln := range cs {
   115  				symlinks[ln] = tgt
   116  			}
   117  		}
   118  	}
   119  
   120  	return symlinks
   121  }
   122  
   123  func parseCgroupController(cgroupPath, controller string) ([]string, error) {
   124  	cg, err := os.Open(cgroupPath)
   125  	if err != nil {
   126  		return nil, errwrap.Wrap(errors.New("error opening /proc/self/cgroup"), err)
   127  	}
   128  	defer cg.Close()
   129  
   130  	s := bufio.NewScanner(cg)
   131  	for s.Scan() {
   132  		parts := strings.SplitN(s.Text(), ":", 3)
   133  		if len(parts) < 3 {
   134  			return nil, fmt.Errorf("error parsing /proc/self/cgroup")
   135  		}
   136  		controllerParts := strings.Split(parts[1], ",")
   137  		for _, c := range controllerParts {
   138  			if c == controller {
   139  				return parts, nil
   140  			}
   141  		}
   142  	}
   143  
   144  	return nil, fmt.Errorf("controller %q not found", controller)
   145  }
   146  
   147  // GetOwnCgroupPath returns the cgroup path of this process in controller
   148  // hierarchy
   149  func GetOwnCgroupPath(controller string) (string, error) {
   150  	parts, err := parseCgroupController("/proc/self/cgroup", controller)
   151  	if err != nil {
   152  		return "", err
   153  	}
   154  	return parts[2], nil
   155  }
   156  
   157  // GetCgroupPathByPid returns the cgroup path of the process with the given pid
   158  // and given controller.
   159  func GetCgroupPathByPid(pid int, controller string) (string, error) {
   160  	parts, err := parseCgroupController(fmt.Sprintf("/proc/%d/cgroup", pid), controller)
   161  	if err != nil {
   162  		return "", err
   163  	}
   164  	return parts[2], nil
   165  }
   166  
   167  // JoinSubcgroup makes the calling process join the subcgroup hierarchy on a
   168  // particular controller
   169  func JoinSubcgroup(controller string, subcgroup string) error {
   170  	subcgroupPath := filepath.Join("/sys/fs/cgroup", controller, subcgroup)
   171  	if err := os.MkdirAll(subcgroupPath, 0600); err != nil {
   172  		return errwrap.Wrap(fmt.Errorf("error creating %q subcgroup", subcgroup), err)
   173  	}
   174  	pidBytes := []byte(strconv.Itoa(os.Getpid()))
   175  	if err := ioutil.WriteFile(filepath.Join(subcgroupPath, "cgroup.procs"), pidBytes, 0600); err != nil {
   176  		return errwrap.Wrap(fmt.Errorf("error adding ourselves to the %q subcgroup", subcgroup), err)
   177  	}
   178  
   179  	return nil
   180  }
   181  
   182  // Ensure that the hierarchy has consistent cpu restrictions.
   183  // This may fail; since this is "fixup" code, we should ignore
   184  // the error and proceed.
   185  //
   186  // This was originally a workaround for https://github.com/rkt/rkt/issues/1210
   187  // but is actually useful to have around
   188  //
   189  // cpuSetPath should be <stage1rootfs>/sys/fs/cgroup/cpuset
   190  func fixCpusetKnobs(cpusetPath, subcgroup, knob string) error {
   191  	if err := os.MkdirAll(filepath.Join(cpusetPath, subcgroup), 0755); err != nil {
   192  		return err
   193  	}
   194  
   195  	dirs := strings.Split(subcgroup, "/")
   196  
   197  	// Loop over every entry in the hierarchy, putting in the parent's value
   198  	// unless there is one already there.
   199  	// Read from the root knob
   200  	parentFile := filepath.Join(cpusetPath, knob)
   201  	parentData, err := ioutil.ReadFile(parentFile)
   202  	if err != nil {
   203  		return errwrap.Wrapf("error reading cgroup "+parentFile, err)
   204  	}
   205  
   206  	// Loop over every directory in the subcgroup path
   207  	currDir := cpusetPath
   208  	for _, dir := range dirs {
   209  		currDir = filepath.Join(currDir, dir)
   210  
   211  		childFile := filepath.Join(currDir, knob)
   212  		childData, err := ioutil.ReadFile(childFile)
   213  		if err != nil {
   214  			return errwrap.Wrapf("error reading cgroup "+childFile, err)
   215  		}
   216  
   217  		// If there is already a value, don't write - and propagate
   218  		// this value to subsequent children
   219  		if strings.TrimSpace(string(childData)) != "" {
   220  			parentData = childData
   221  			continue
   222  		}
   223  
   224  		// Workaround: just write twice to workaround the kernel bug fixed by this commit:
   225  		// https://github.com/torvalds/linux/commit/24ee3cf89bef04e8bc23788aca4e029a3f0f06d9
   226  		if err := ioutil.WriteFile(childFile, parentData, 0644); err != nil {
   227  			return errwrap.Wrapf("error writing cgroup "+childFile, err)
   228  		}
   229  		if err := ioutil.WriteFile(childFile, parentData, 0644); err != nil {
   230  			return errwrap.Wrapf("error writing cgroup "+childFile, err)
   231  		}
   232  	}
   233  	return nil
   234  }
   235  
   236  // IsControllerMounted returns whether a controller is mounted by checking that
   237  // cgroup.procs is accessible
   238  func IsControllerMounted(c string) (bool, error) {
   239  	cgroupProcsPath := filepath.Join("/sys/fs/cgroup", c, "cgroup.procs")
   240  	if _, err := os.Stat(cgroupProcsPath); err != nil {
   241  		if !os.IsNotExist(err) {
   242  			return false, err
   243  		}
   244  		return false, nil
   245  	}
   246  
   247  	return true, nil
   248  }
   249  
   250  // CreateCgroups mounts the v1 cgroup controllers hierarchy in /sys/fs/cgroup
   251  // under root
   252  func CreateCgroups(m fs.Mounter, root string, enabledCgroups map[int][]string, mountContext string) error {
   253  	controllers := GetControllerDirs(enabledCgroups)
   254  
   255  	sys := filepath.Join(root, "/sys")
   256  	if err := os.MkdirAll(sys, 0700); err != nil {
   257  		return err
   258  	}
   259  
   260  	var sysfsFlags uintptr = syscall.MS_NOSUID |
   261  		syscall.MS_NOEXEC |
   262  		syscall.MS_NODEV
   263  
   264  	// If we're mounting the host cgroups, /sys is probably mounted so we
   265  	// ignore EBUSY
   266  	if err := m.Mount("sysfs", sys, "sysfs", sysfsFlags, ""); err != nil && err != syscall.EBUSY {
   267  		return errwrap.Wrap(fmt.Errorf("error mounting %q", sys), err)
   268  	}
   269  
   270  	cgroupTmpfs := filepath.Join(root, "/sys/fs/cgroup")
   271  	if err := os.MkdirAll(cgroupTmpfs, 0700); err != nil {
   272  		return err
   273  	}
   274  
   275  	var cgroupTmpfsFlags uintptr = syscall.MS_NOSUID |
   276  		syscall.MS_NOEXEC |
   277  		syscall.MS_NODEV |
   278  		syscall.MS_STRICTATIME
   279  
   280  	options := "mode=755"
   281  	if mountContext != "" {
   282  		options = fmt.Sprintf("mode=755,context=\"%s\"", mountContext)
   283  	}
   284  
   285  	if err := m.Mount("tmpfs", cgroupTmpfs, "tmpfs", cgroupTmpfsFlags, options); err != nil {
   286  		return errwrap.Wrap(fmt.Errorf("error mounting %q", cgroupTmpfs), err)
   287  	}
   288  
   289  	// Mount controllers
   290  	for _, c := range controllers {
   291  		cPath := filepath.Join(root, "/sys/fs/cgroup", c)
   292  		if err := os.MkdirAll(cPath, 0700); err != nil {
   293  			return err
   294  		}
   295  
   296  		var flags uintptr = syscall.MS_NOSUID |
   297  			syscall.MS_NOEXEC |
   298  			syscall.MS_NODEV
   299  
   300  		if err := m.Mount("cgroup", cPath, "cgroup", flags, c); err != nil {
   301  			return errwrap.Wrap(fmt.Errorf("error mounting %q", cPath), err)
   302  		}
   303  	}
   304  
   305  	// Create symlinks for combined controllers
   306  	symlinks := getControllerSymlinks(enabledCgroups)
   307  	for ln, tgt := range symlinks {
   308  		lnPath := filepath.Join(cgroupTmpfs, ln)
   309  		if err := os.Symlink(tgt, lnPath); err != nil {
   310  			return errwrap.Wrap(errors.New("error creating symlink"), err)
   311  		}
   312  	}
   313  
   314  	systemdControllerPath := filepath.Join(root, "/sys/fs/cgroup/systemd")
   315  	if err := os.MkdirAll(systemdControllerPath, 0700); err != nil {
   316  		return err
   317  	}
   318  
   319  	unifiedPath := filepath.Join(root, "/sys/fs/cgroup/unified")
   320  	if err := os.MkdirAll(unifiedPath, 0700); err != nil {
   321  		return err
   322  	}
   323  
   324  	// Bind-mount cgroup tmpfs filesystem read-only
   325  	return mountFsRO(m, cgroupTmpfs, cgroupTmpfsFlags)
   326  }
   327  
   328  // RemountCgroups remounts the v1 cgroup hierarchy under root.
   329  // It mounts /sys/fs/cgroup/[controller] read-only,
   330  // but leaves needed knobs in the pod's subcgroup read-write,
   331  // such that systemd inside stage1 can apply isolators to them.
   332  // It leaves /sys read-write if the given readWrite parameter is true.
   333  // When this is done, <stage1>/sys/fs/cgroup/<controller> should be RO, and
   334  // <stage1>/sys/fs/cgroup/<cotroller>/.../machine-rkt/.../system.slice should be RW
   335  func RemountCgroups(m fs.Mounter, root string, enabledCgroups map[int][]string, subcgroup string, readWrite bool) error {
   336  	controllers := GetControllerDirs(enabledCgroups)
   337  	cgroupTmpfs := filepath.Join(root, "/sys/fs/cgroup")
   338  	sysPath := filepath.Join(root, "/sys")
   339  
   340  	var flags uintptr = syscall.MS_NOSUID |
   341  		syscall.MS_NOEXEC |
   342  		syscall.MS_NODEV
   343  
   344  	// Mount RW the controllers for this pod
   345  	for _, c := range controllers {
   346  		cPath := filepath.Join(cgroupTmpfs, c)
   347  		subcgroupPath := filepath.Join(cPath, subcgroup, "system.slice")
   348  
   349  		if err := os.MkdirAll(subcgroupPath, 0755); err != nil {
   350  			return err
   351  		}
   352  		if err := m.Mount(subcgroupPath, subcgroupPath, "", syscall.MS_BIND, ""); err != nil {
   353  			return errwrap.Wrap(fmt.Errorf("error bind mounting %q", subcgroupPath), err)
   354  		}
   355  
   356  		// Workaround for https://github.com/rkt/rkt/issues/1210
   357  		// It is OK to ignore errors here.
   358  		if c == "cpuset" {
   359  			_ = fixCpusetKnobs(cPath, subcgroup, "cpuset.mems")
   360  			_ = fixCpusetKnobs(cPath, subcgroup, "cpuset.cpus")
   361  		}
   362  
   363  		// Re-mount controller read-only to prevent the container modifying host controllers
   364  		if err := mountFsRO(m, cPath, flags); err != nil {
   365  			return err
   366  		}
   367  	}
   368  
   369  	if readWrite { // leave sys r/w?
   370  		return nil
   371  	}
   372  
   373  	// Bind-mount sys filesystem read-only
   374  	return mountFsRO(m, sysPath, flags)
   375  }