github.com/blixtra/rkt@v0.8.1-0.20160204105720-ab0d1add1a43/common/cgroup/cgroup.go (about)

     1  // Copyright 2015 The rkt Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  //+build linux
    16  
    17  package cgroup
    18  
    19  import (
    20  	"bufio"
    21  	"errors"
    22  	"fmt"
    23  	"io"
    24  	"io/ioutil"
    25  	"os"
    26  	"path/filepath"
    27  	"strconv"
    28  	"strings"
    29  	"syscall"
    30  
    31  	"github.com/coreos/go-systemd/unit"
    32  	"github.com/hashicorp/errwrap"
    33  	"k8s.io/kubernetes/pkg/api/resource"
    34  )
    35  
    36  type addIsolatorFunc func(opts []*unit.UnitOption, limit *resource.Quantity) ([]*unit.UnitOption, error)
    37  
    38  var (
    39  	isolatorFuncs = map[string]addIsolatorFunc{
    40  		"cpu":    addCpuLimit,
    41  		"memory": addMemoryLimit,
    42  	}
    43  	cgroupControllerRWFiles = map[string][]string{
    44  		"memory": []string{"memory.limit_in_bytes"},
    45  		"cpu":    []string{"cpu.cfs_quota_us"},
    46  	}
    47  )
    48  
    49  func addCpuLimit(opts []*unit.UnitOption, limit *resource.Quantity) ([]*unit.UnitOption, error) {
    50  	if limit.Value() > resource.MaxMilliValue {
    51  		return nil, fmt.Errorf("cpu limit exceeds the maximum millivalue: %v", limit.String())
    52  	}
    53  	quota := strconv.Itoa(int(limit.MilliValue()/10)) + "%"
    54  	opts = append(opts, unit.NewUnitOption("Service", "CPUQuota", quota))
    55  	return opts, nil
    56  }
    57  
    58  func addMemoryLimit(opts []*unit.UnitOption, limit *resource.Quantity) ([]*unit.UnitOption, error) {
    59  	opts = append(opts, unit.NewUnitOption("Service", "MemoryLimit", strconv.Itoa(int(limit.Value()))))
    60  	return opts, nil
    61  }
    62  
    63  // MaybeAddIsolator considers the given isolator; if the type is known
    64  // (i.e. IsIsolatorSupported is true) and the limit is non-nil, the supplied
    65  // opts will be extended with an appropriate option implementing the desired
    66  // isolation.
    67  func MaybeAddIsolator(opts []*unit.UnitOption, isolator string, limit *resource.Quantity) ([]*unit.UnitOption, error) {
    68  	var err error
    69  	if limit == nil {
    70  		return opts, nil
    71  	}
    72  	if IsIsolatorSupported(isolator) {
    73  		opts, err = isolatorFuncs[isolator](opts, limit)
    74  		if err != nil {
    75  			return nil, err
    76  		}
    77  	} else {
    78  		fmt.Fprintf(os.Stderr, "warning: resource/%s isolator set but support disabled in the kernel, skipping\n", isolator)
    79  	}
    80  	return opts, nil
    81  }
    82  
    83  // IsIsolatorSupported returns whether an isolator is supported in the kernel
    84  func IsIsolatorSupported(isolator string) bool {
    85  	if files, ok := cgroupControllerRWFiles[isolator]; ok {
    86  		for _, f := range files {
    87  			isolatorPath := filepath.Join("/sys/fs/cgroup/", isolator, f)
    88  			if _, err := os.Stat(isolatorPath); os.IsNotExist(err) {
    89  				return false
    90  			}
    91  		}
    92  		return true
    93  	}
    94  	return false
    95  }
    96  
    97  func parseCgroups(f io.Reader) (map[int][]string, error) {
    98  	sc := bufio.NewScanner(f)
    99  
   100  	// skip first line since it is a comment
   101  	sc.Scan()
   102  
   103  	cgroups := make(map[int][]string)
   104  	for sc.Scan() {
   105  		var controller string
   106  		var hierarchy int
   107  		var num int
   108  		var enabled int
   109  		fmt.Sscanf(sc.Text(), "%s %d %d %d", &controller, &hierarchy, &num, &enabled)
   110  
   111  		if enabled == 1 {
   112  			if _, ok := cgroups[hierarchy]; !ok {
   113  				cgroups[hierarchy] = []string{controller}
   114  			} else {
   115  				cgroups[hierarchy] = append(cgroups[hierarchy], controller)
   116  			}
   117  		}
   118  	}
   119  
   120  	if err := sc.Err(); err != nil {
   121  		return nil, err
   122  	}
   123  
   124  	return cgroups, nil
   125  }
   126  
   127  // GetEnabledCgroups returns a map with the enabled cgroup controllers grouped by
   128  // hierarchy
   129  func GetEnabledCgroups() (map[int][]string, error) {
   130  	cgroupsFile, err := os.Open("/proc/cgroups")
   131  	if err != nil {
   132  		return nil, err
   133  	}
   134  	defer cgroupsFile.Close()
   135  
   136  	cgroups, err := parseCgroups(cgroupsFile)
   137  	if err != nil {
   138  		return nil, errwrap.Wrap(errors.New("error parsing /proc/cgroups"), err)
   139  	}
   140  
   141  	return cgroups, nil
   142  }
   143  
   144  // GetControllerDirs takes a map with the enabled cgroup controllers grouped by
   145  // hierarchy and returns the directory names as they should be in
   146  // /sys/fs/cgroup
   147  func GetControllerDirs(cgroups map[int][]string) []string {
   148  	var controllers []string
   149  	for _, cs := range cgroups {
   150  		controllers = append(controllers, strings.Join(cs, ","))
   151  	}
   152  
   153  	return controllers
   154  }
   155  
   156  func getControllerSymlinks(cgroups map[int][]string) map[string]string {
   157  	symlinks := make(map[string]string)
   158  
   159  	for _, cs := range cgroups {
   160  		if len(cs) > 1 {
   161  			tgt := strings.Join(cs, ",")
   162  			for _, ln := range cs {
   163  				symlinks[ln] = tgt
   164  			}
   165  		}
   166  	}
   167  
   168  	return symlinks
   169  }
   170  
   171  func getControllerRWFiles(controller string) []string {
   172  	parts := strings.Split(controller, ",")
   173  	for _, p := range parts {
   174  		if files, ok := cgroupControllerRWFiles[p]; ok {
   175  			// cgroup.procs always needs to be RW for allowing systemd to add
   176  			// processes to the controller
   177  			files = append(files, "cgroup.procs")
   178  			return files
   179  		}
   180  	}
   181  
   182  	return nil
   183  }
   184  
   185  func parseOwnCgroupController(controller string) ([]string, error) {
   186  	cgroupPath := "/proc/self/cgroup"
   187  	cg, err := os.Open(cgroupPath)
   188  	if err != nil {
   189  		return nil, errwrap.Wrap(errors.New("error opening /proc/self/cgroup"), err)
   190  	}
   191  	defer cg.Close()
   192  
   193  	s := bufio.NewScanner(cg)
   194  	for s.Scan() {
   195  		parts := strings.SplitN(s.Text(), ":", 3)
   196  		if len(parts) < 3 {
   197  			return nil, fmt.Errorf("error parsing /proc/self/cgroup")
   198  		}
   199  		controllerParts := strings.Split(parts[1], ",")
   200  		for _, c := range controllerParts {
   201  			if c == controller {
   202  				return parts, nil
   203  			}
   204  		}
   205  	}
   206  
   207  	return nil, fmt.Errorf("controller %q not found", controller)
   208  }
   209  
   210  // GetOwnCgroupPath returns the cgroup path of this process in controller
   211  // hierarchy
   212  func GetOwnCgroupPath(controller string) (string, error) {
   213  	parts, err := parseOwnCgroupController(controller)
   214  	if err != nil {
   215  		return "", err
   216  	}
   217  	return parts[2], nil
   218  }
   219  
   220  // JoinCgroup makes the calling process join the subcgroup hierarchy on a
   221  // particular controller
   222  func JoinSubcgroup(controller string, subcgroup string) error {
   223  	subcgroupPath := filepath.Join("/sys/fs/cgroup", controller, subcgroup)
   224  	if err := os.MkdirAll(subcgroupPath, 0600); err != nil {
   225  		return errwrap.Wrap(fmt.Errorf("error creating %q subcgroup", subcgroup), err)
   226  	}
   227  	pidBytes := []byte(strconv.Itoa(os.Getpid()))
   228  	if err := ioutil.WriteFile(filepath.Join(subcgroupPath, "cgroup.procs"), pidBytes, 0600); err != nil {
   229  		return errwrap.Wrap(fmt.Errorf("error adding ourselves to the %q subcgroup", subcgroup), err)
   230  	}
   231  
   232  	return nil
   233  }
   234  
   235  // If /system.slice does not exist in the cpuset controller, create it and
   236  // configure it.
   237  // Since this is a workaround, we ignore errors
   238  func fixCpusetKnobs(cpusetPath string) {
   239  	cgroupPathFix := filepath.Join(cpusetPath, "system.slice")
   240  	_ = os.MkdirAll(cgroupPathFix, 0755)
   241  	knobs := []string{"cpuset.mems", "cpuset.cpus"}
   242  	for _, knob := range knobs {
   243  		parentFile := filepath.Join(filepath.Dir(cgroupPathFix), knob)
   244  		childFile := filepath.Join(cgroupPathFix, knob)
   245  
   246  		data, err := ioutil.ReadFile(childFile)
   247  		if err != nil {
   248  			continue
   249  		}
   250  		// If the file is already configured, don't change it
   251  		if strings.TrimSpace(string(data)) != "" {
   252  			continue
   253  		}
   254  
   255  		data, err = ioutil.ReadFile(parentFile)
   256  		if err == nil {
   257  			// Workaround: just write twice to workaround the kernel bug fixed by this commit:
   258  			// https://github.com/torvalds/linux/commit/24ee3cf89bef04e8bc23788aca4e029a3f0f06d9
   259  			ioutil.WriteFile(childFile, data, 0644)
   260  			ioutil.WriteFile(childFile, data, 0644)
   261  		}
   262  	}
   263  }
   264  
   265  // IsControllerMounted returns whether a controller is mounted by checking that
   266  // cgroup.procs is accessible
   267  func IsControllerMounted(c string) bool {
   268  	cgroupProcsPath := filepath.Join("/sys/fs/cgroup", c, "cgroup.procs")
   269  	if _, err := os.Stat(cgroupProcsPath); err != nil {
   270  		return false
   271  	}
   272  
   273  	return true
   274  }
   275  
   276  // CreateCgroups mounts the cgroup controllers hierarchy in /sys/fs/cgroup
   277  // under root
   278  func CreateCgroups(root string, enabledCgroups map[int][]string) error {
   279  	controllers := GetControllerDirs(enabledCgroups)
   280  	var flags uintptr
   281  
   282  	sys := filepath.Join(root, "/sys")
   283  	if err := os.MkdirAll(sys, 0700); err != nil {
   284  		return err
   285  	}
   286  	flags = syscall.MS_NOSUID |
   287  		syscall.MS_NOEXEC |
   288  		syscall.MS_NODEV
   289  	// If we're mounting the host cgroups, /sys is probably mounted so we
   290  	// ignore EBUSY
   291  	if err := syscall.Mount("sysfs", sys, "sysfs", flags, ""); err != nil && err != syscall.EBUSY {
   292  		return errwrap.Wrap(fmt.Errorf("error mounting %q", sys), err)
   293  	}
   294  
   295  	cgroupTmpfs := filepath.Join(root, "/sys/fs/cgroup")
   296  	if err := os.MkdirAll(cgroupTmpfs, 0700); err != nil {
   297  		return err
   298  	}
   299  	flags = syscall.MS_NOSUID |
   300  		syscall.MS_NOEXEC |
   301  		syscall.MS_NODEV |
   302  		syscall.MS_STRICTATIME
   303  	if err := syscall.Mount("tmpfs", cgroupTmpfs, "tmpfs", flags, "mode=755"); err != nil {
   304  		return errwrap.Wrap(fmt.Errorf("error mounting %q", cgroupTmpfs), err)
   305  	}
   306  
   307  	// Mount controllers
   308  	for _, c := range controllers {
   309  		cPath := filepath.Join(root, "/sys/fs/cgroup", c)
   310  		if err := os.MkdirAll(cPath, 0700); err != nil {
   311  			return err
   312  		}
   313  
   314  		flags = syscall.MS_NOSUID |
   315  			syscall.MS_NOEXEC |
   316  			syscall.MS_NODEV
   317  		if err := syscall.Mount("cgroup", cPath, "cgroup", flags, c); err != nil {
   318  			return errwrap.Wrap(fmt.Errorf("error mounting %q", cPath), err)
   319  		}
   320  	}
   321  
   322  	// Create symlinks for combined controllers
   323  	symlinks := getControllerSymlinks(enabledCgroups)
   324  	for ln, tgt := range symlinks {
   325  		lnPath := filepath.Join(cgroupTmpfs, ln)
   326  		if err := os.Symlink(tgt, lnPath); err != nil {
   327  			return errwrap.Wrap(errors.New("error creating symlink"), err)
   328  		}
   329  	}
   330  
   331  	systemdControllerPath := filepath.Join(root, "/sys/fs/cgroup/systemd")
   332  	if err := os.MkdirAll(systemdControllerPath, 0700); err != nil {
   333  		return err
   334  	}
   335  
   336  	// Bind-mount cgroup tmpfs filesystem read-only
   337  	flags = syscall.MS_BIND |
   338  		syscall.MS_REMOUNT |
   339  		syscall.MS_NOSUID |
   340  		syscall.MS_NOEXEC |
   341  		syscall.MS_NODEV |
   342  		syscall.MS_RDONLY
   343  	if err := syscall.Mount(cgroupTmpfs, cgroupTmpfs, "", flags, ""); err != nil {
   344  		return errwrap.Wrap(fmt.Errorf("error remounting RO %q", cgroupTmpfs), err)
   345  	}
   346  
   347  	return nil
   348  }
   349  
   350  // RemountCgroupsRO remounts the cgroup hierarchy under root read-only, leaving
   351  // the needed knobs in the subcgroup for each app read-write so the systemd
   352  // inside stage1 can apply isolators to them
   353  func RemountCgroupsRO(root string, enabledCgroups map[int][]string, subcgroup string, serviceNames []string) error {
   354  	controllers := GetControllerDirs(enabledCgroups)
   355  	cgroupTmpfs := filepath.Join(root, "/sys/fs/cgroup")
   356  	sysPath := filepath.Join(root, "/sys")
   357  
   358  	var flags uintptr
   359  
   360  	// Mount RW knobs we need to make the enabled isolators work
   361  	for _, c := range controllers {
   362  		cPath := filepath.Join(cgroupTmpfs, c)
   363  		subcgroupPath := filepath.Join(cPath, subcgroup)
   364  
   365  		// Workaround for https://github.com/coreos/rkt/issues/1210
   366  		if c == "cpuset" {
   367  			fixCpusetKnobs(cPath)
   368  		}
   369  
   370  		// Create cgroup directories and mount the files we need over
   371  		// themselves so they stay read-write
   372  		for _, serviceName := range serviceNames {
   373  			appCgroup := filepath.Join(subcgroupPath, serviceName)
   374  			if err := os.MkdirAll(appCgroup, 0755); err != nil {
   375  				return err
   376  			}
   377  			for _, f := range getControllerRWFiles(c) {
   378  				cgroupFilePath := filepath.Join(appCgroup, f)
   379  				// the file may not be there if kernel doesn't support the
   380  				// feature, skip it in that case
   381  				if _, err := os.Stat(cgroupFilePath); os.IsNotExist(err) {
   382  					continue
   383  				}
   384  				if err := syscall.Mount(cgroupFilePath, cgroupFilePath, "", syscall.MS_BIND, ""); err != nil {
   385  					return errwrap.Wrap(fmt.Errorf("error bind mounting %q", cgroupFilePath), err)
   386  				}
   387  			}
   388  		}
   389  
   390  		// Re-mount controller read-only to prevent the container modifying host controllers
   391  		flags = syscall.MS_BIND |
   392  			syscall.MS_REMOUNT |
   393  			syscall.MS_NOSUID |
   394  			syscall.MS_NOEXEC |
   395  			syscall.MS_NODEV |
   396  			syscall.MS_RDONLY
   397  		if err := syscall.Mount(cPath, cPath, "", flags, ""); err != nil {
   398  			return errwrap.Wrap(fmt.Errorf("error remounting RO %q", cPath), err)
   399  		}
   400  	}
   401  
   402  	// Bind-mount sys filesystem read-only
   403  	flags = syscall.MS_BIND |
   404  		syscall.MS_REMOUNT |
   405  		syscall.MS_NOSUID |
   406  		syscall.MS_NOEXEC |
   407  		syscall.MS_NODEV |
   408  		syscall.MS_RDONLY
   409  	if err := syscall.Mount(sysPath, sysPath, "", flags, ""); err != nil {
   410  		return errwrap.Wrap(fmt.Errorf("error remounting RO %q", sysPath), err)
   411  	}
   412  
   413  	return nil
   414  }