github.com/stackdocker/rkt@v0.10.1-0.20151109095037-1aa827478248/common/cgroup/cgroup.go (about)

     1  // Copyright 2015 The rkt Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  //+build linux
    16  
    17  package cgroup
    18  
    19  import (
    20  	"bufio"
    21  	"fmt"
    22  	"io"
    23  	"io/ioutil"
    24  	"os"
    25  	"path/filepath"
    26  	"strconv"
    27  	"strings"
    28  	"syscall"
    29  
    30  	"github.com/coreos/rkt/Godeps/_workspace/src/github.com/coreos/go-systemd/unit"
    31  	"github.com/coreos/rkt/Godeps/_workspace/src/k8s.io/kubernetes/pkg/api/resource"
    32  )
    33  
    34  type addIsolatorFunc func(opts []*unit.UnitOption, limit *resource.Quantity) ([]*unit.UnitOption, error)
    35  
    36  var (
    37  	isolatorFuncs = map[string]addIsolatorFunc{
    38  		"cpu":    addCpuLimit,
    39  		"memory": addMemoryLimit,
    40  	}
    41  	cgroupControllerRWFiles = map[string][]string{
    42  		"memory": []string{"memory.limit_in_bytes"},
    43  		"cpu":    []string{"cpu.cfs_quota_us"},
    44  	}
    45  )
    46  
    47  func addCpuLimit(opts []*unit.UnitOption, limit *resource.Quantity) ([]*unit.UnitOption, error) {
    48  	if limit.Value() > resource.MaxMilliValue {
    49  		return nil, fmt.Errorf("cpu limit exceeds the maximum millivalue: %v", limit.String())
    50  	}
    51  	quota := strconv.Itoa(int(limit.MilliValue()/10)) + "%"
    52  	opts = append(opts, unit.NewUnitOption("Service", "CPUQuota", quota))
    53  	return opts, nil
    54  }
    55  
    56  func addMemoryLimit(opts []*unit.UnitOption, limit *resource.Quantity) ([]*unit.UnitOption, error) {
    57  	opts = append(opts, unit.NewUnitOption("Service", "MemoryLimit", strconv.Itoa(int(limit.Value()))))
    58  	return opts, nil
    59  }
    60  
    61  // MaybeAddIsolator considers the given isolator; if the type is known
    62  // (i.e. IsIsolatorSupported is true) and the limit is non-nil, the supplied
    63  // opts will be extended with an appropriate option implementing the desired
    64  // isolation.
    65  func MaybeAddIsolator(opts []*unit.UnitOption, isolator string, limit *resource.Quantity) ([]*unit.UnitOption, error) {
    66  	var err error
    67  	if limit == nil {
    68  		return opts, nil
    69  	}
    70  	if IsIsolatorSupported(isolator) {
    71  		opts, err = isolatorFuncs[isolator](opts, limit)
    72  		if err != nil {
    73  			return nil, err
    74  		}
    75  	} else {
    76  		fmt.Fprintf(os.Stderr, "warning: resource/%s isolator set but support disabled in the kernel, skipping\n", isolator)
    77  	}
    78  	return opts, nil
    79  }
    80  
    81  // IsIsolatorSupported returns whether an isolator is supported in the kernel
    82  func IsIsolatorSupported(isolator string) bool {
    83  	if files, ok := cgroupControllerRWFiles[isolator]; ok {
    84  		for _, f := range files {
    85  			isolatorPath := filepath.Join("/sys/fs/cgroup/", isolator, f)
    86  			if _, err := os.Stat(isolatorPath); os.IsNotExist(err) {
    87  				return false
    88  			}
    89  		}
    90  		return true
    91  	}
    92  	return false
    93  }
    94  
    95  func parseCgroups(f io.Reader) (map[int][]string, error) {
    96  	sc := bufio.NewScanner(f)
    97  
    98  	// skip first line since it is a comment
    99  	sc.Scan()
   100  
   101  	cgroups := make(map[int][]string)
   102  	for sc.Scan() {
   103  		var controller string
   104  		var hierarchy int
   105  		var num int
   106  		var enabled int
   107  		fmt.Sscanf(sc.Text(), "%s %d %d %d", &controller, &hierarchy, &num, &enabled)
   108  
   109  		if enabled == 1 {
   110  			if _, ok := cgroups[hierarchy]; !ok {
   111  				cgroups[hierarchy] = []string{controller}
   112  			} else {
   113  				cgroups[hierarchy] = append(cgroups[hierarchy], controller)
   114  			}
   115  		}
   116  	}
   117  
   118  	if err := sc.Err(); err != nil {
   119  		return nil, err
   120  	}
   121  
   122  	return cgroups, nil
   123  }
   124  
   125  // GetEnabledCgroups returns a map with the enabled cgroup controllers grouped by
   126  // hierarchy
   127  func GetEnabledCgroups() (map[int][]string, error) {
   128  	cgroupsFile, err := os.Open("/proc/cgroups")
   129  	if err != nil {
   130  		return nil, err
   131  	}
   132  	defer cgroupsFile.Close()
   133  
   134  	cgroups, err := parseCgroups(cgroupsFile)
   135  	if err != nil {
   136  		return nil, fmt.Errorf("error parsing /proc/cgroups: %v", err)
   137  	}
   138  
   139  	return cgroups, nil
   140  }
   141  
   142  // GetControllerDirs takes a map with the enabled cgroup controllers grouped by
   143  // hierarchy and returns the directory names as they should be in
   144  // /sys/fs/cgroup
   145  func GetControllerDirs(cgroups map[int][]string) []string {
   146  	var controllers []string
   147  	for _, cs := range cgroups {
   148  		controllers = append(controllers, strings.Join(cs, ","))
   149  	}
   150  
   151  	return controllers
   152  }
   153  
   154  func getControllerSymlinks(cgroups map[int][]string) map[string]string {
   155  	symlinks := make(map[string]string)
   156  
   157  	for _, cs := range cgroups {
   158  		if len(cs) > 1 {
   159  			tgt := strings.Join(cs, ",")
   160  			for _, ln := range cs {
   161  				symlinks[ln] = tgt
   162  			}
   163  		}
   164  	}
   165  
   166  	return symlinks
   167  }
   168  
   169  func getControllerRWFiles(controller string) []string {
   170  	parts := strings.Split(controller, ",")
   171  	for _, p := range parts {
   172  		if files, ok := cgroupControllerRWFiles[p]; ok {
   173  			// cgroup.procs always needs to be RW for allowing systemd to add
   174  			// processes to the controller
   175  			files = append(files, "cgroup.procs")
   176  			return files
   177  		}
   178  	}
   179  
   180  	return nil
   181  }
   182  
   183  func parseOwnCgroupController(controller string) ([]string, error) {
   184  	cgroupPath := "/proc/self/cgroup"
   185  	cg, err := os.Open(cgroupPath)
   186  	if err != nil {
   187  		return nil, fmt.Errorf("error opening /proc/self/cgroup: %v", err)
   188  	}
   189  	defer cg.Close()
   190  
   191  	s := bufio.NewScanner(cg)
   192  	for s.Scan() {
   193  		parts := strings.SplitN(s.Text(), ":", 3)
   194  		if len(parts) < 3 {
   195  			return nil, fmt.Errorf("error parsing /proc/self/cgroup")
   196  		}
   197  		controllerParts := strings.Split(parts[1], ",")
   198  		for _, c := range controllerParts {
   199  			if c == controller {
   200  				return parts, nil
   201  			}
   202  		}
   203  	}
   204  
   205  	return nil, fmt.Errorf("controller %q not found", controller)
   206  }
   207  
   208  // GetOwnCgroupPath returns the cgroup path of this process in controller
   209  // hierarchy
   210  func GetOwnCgroupPath(controller string) (string, error) {
   211  	parts, err := parseOwnCgroupController(controller)
   212  	if err != nil {
   213  		return "", err
   214  	}
   215  	return parts[2], nil
   216  }
   217  
   218  // JoinCgroup makes the calling process join the subcgroup hierarchy on a
   219  // particular controller
   220  func JoinSubcgroup(controller string, subcgroup string) error {
   221  	subcgroupPath := filepath.Join("/sys/fs/cgroup", controller, subcgroup)
   222  	if err := os.MkdirAll(subcgroupPath, 0600); err != nil {
   223  		return fmt.Errorf("error creating %q subcgroup: %v", subcgroup, err)
   224  	}
   225  	pidBytes := []byte(strconv.Itoa(os.Getpid()))
   226  	if err := ioutil.WriteFile(filepath.Join(subcgroupPath, "cgroup.procs"), pidBytes, 0600); err != nil {
   227  		return fmt.Errorf("error adding ourselves to the %q subcgroup: %v", subcgroup, err)
   228  	}
   229  
   230  	return nil
   231  }
   232  
   233  // If /system.slice does not exist in the cpuset controller, create it and
   234  // configure it.
   235  // Since this is a workaround, we ignore errors
   236  func fixCpusetKnobs(cpusetPath string) {
   237  	cgroupPathFix := filepath.Join(cpusetPath, "system.slice")
   238  	_ = os.MkdirAll(cgroupPathFix, 0755)
   239  	knobs := []string{"cpuset.mems", "cpuset.cpus"}
   240  	for _, knob := range knobs {
   241  		parentFile := filepath.Join(filepath.Dir(cgroupPathFix), knob)
   242  		childFile := filepath.Join(cgroupPathFix, knob)
   243  
   244  		data, err := ioutil.ReadFile(childFile)
   245  		if err != nil {
   246  			continue
   247  		}
   248  		// If the file is already configured, don't change it
   249  		if strings.TrimSpace(string(data)) != "" {
   250  			continue
   251  		}
   252  
   253  		data, err = ioutil.ReadFile(parentFile)
   254  		if err == nil {
   255  			// Workaround: just write twice to workaround the kernel bug fixed by this commit:
   256  			// https://github.com/torvalds/linux/commit/24ee3cf89bef04e8bc23788aca4e029a3f0f06d9
   257  			ioutil.WriteFile(childFile, data, 0644)
   258  			ioutil.WriteFile(childFile, data, 0644)
   259  		}
   260  	}
   261  }
   262  
   263  // IsControllerMounted returns whether a controller is mounted by checking that
   264  // cgroup.procs is accessible
   265  func IsControllerMounted(c string) bool {
   266  	cgroupProcsPath := filepath.Join("/sys/fs/cgroup", c, "cgroup.procs")
   267  	if _, err := os.Stat(cgroupProcsPath); err != nil {
   268  		return false
   269  	}
   270  
   271  	return true
   272  }
   273  
   274  // CreateCgroups mounts the cgroup controllers hierarchy in /sys/fs/cgroup
   275  // under root
   276  func CreateCgroups(root string, enabledCgroups map[int][]string) error {
   277  	controllers := GetControllerDirs(enabledCgroups)
   278  	var flags uintptr
   279  
   280  	sys := filepath.Join(root, "/sys")
   281  	if err := os.MkdirAll(sys, 0700); err != nil {
   282  		return err
   283  	}
   284  	flags = syscall.MS_NOSUID |
   285  		syscall.MS_NOEXEC |
   286  		syscall.MS_NODEV
   287  	// If we're mounting the host cgroups, /sys is probably mounted so we
   288  	// ignore EBUSY
   289  	if err := syscall.Mount("sysfs", sys, "sysfs", flags, ""); err != nil && err != syscall.EBUSY {
   290  		return fmt.Errorf("error mounting %q: %v", sys, err)
   291  	}
   292  
   293  	cgroupTmpfs := filepath.Join(root, "/sys/fs/cgroup")
   294  	if err := os.MkdirAll(cgroupTmpfs, 0700); err != nil {
   295  		return err
   296  	}
   297  	flags = syscall.MS_NOSUID |
   298  		syscall.MS_NOEXEC |
   299  		syscall.MS_NODEV |
   300  		syscall.MS_STRICTATIME
   301  	if err := syscall.Mount("tmpfs", cgroupTmpfs, "tmpfs", flags, "mode=755"); err != nil {
   302  		return fmt.Errorf("error mounting %q: %v", cgroupTmpfs, err)
   303  	}
   304  
   305  	// Mount controllers
   306  	for _, c := range controllers {
   307  		cPath := filepath.Join(root, "/sys/fs/cgroup", c)
   308  		if err := os.MkdirAll(cPath, 0700); err != nil {
   309  			return err
   310  		}
   311  
   312  		flags = syscall.MS_NOSUID |
   313  			syscall.MS_NOEXEC |
   314  			syscall.MS_NODEV
   315  		if err := syscall.Mount("cgroup", cPath, "cgroup", flags, c); err != nil {
   316  			return fmt.Errorf("error mounting %q: %v", cPath, err)
   317  		}
   318  	}
   319  
   320  	// Create symlinks for combined controllers
   321  	symlinks := getControllerSymlinks(enabledCgroups)
   322  	for ln, tgt := range symlinks {
   323  		lnPath := filepath.Join(cgroupTmpfs, ln)
   324  		if err := os.Symlink(tgt, lnPath); err != nil {
   325  			return fmt.Errorf("error creating symlink: %v", err)
   326  		}
   327  	}
   328  
   329  	systemdControllerPath := filepath.Join(root, "/sys/fs/cgroup/systemd")
   330  	if err := os.MkdirAll(systemdControllerPath, 0700); err != nil {
   331  		return err
   332  	}
   333  
   334  	// Bind-mount cgroup tmpfs filesystem read-only
   335  	flags = syscall.MS_BIND |
   336  		syscall.MS_REMOUNT |
   337  		syscall.MS_NOSUID |
   338  		syscall.MS_NOEXEC |
   339  		syscall.MS_NODEV |
   340  		syscall.MS_RDONLY
   341  	if err := syscall.Mount(cgroupTmpfs, cgroupTmpfs, "", flags, ""); err != nil {
   342  		return fmt.Errorf("error remounting RO %q: %v", cgroupTmpfs, err)
   343  	}
   344  
   345  	return nil
   346  }
   347  
   348  // RemountCgroupsRO remounts the cgroup hierarchy under root read-only, leaving
   349  // the needed knobs in the subcgroup for each app read-write so the systemd
   350  // inside stage1 can apply isolators to them
   351  func RemountCgroupsRO(root string, enabledCgroups map[int][]string, subcgroup string, serviceNames []string) error {
   352  	controllers := GetControllerDirs(enabledCgroups)
   353  	cgroupTmpfs := filepath.Join(root, "/sys/fs/cgroup")
   354  	sysPath := filepath.Join(root, "/sys")
   355  
   356  	var flags uintptr
   357  
   358  	// Mount RW knobs we need to make the enabled isolators work
   359  	for _, c := range controllers {
   360  		cPath := filepath.Join(cgroupTmpfs, c)
   361  		subcgroupPath := filepath.Join(cPath, subcgroup)
   362  
   363  		// Workaround for https://github.com/coreos/rkt/issues/1210
   364  		if c == "cpuset" {
   365  			fixCpusetKnobs(cPath)
   366  		}
   367  
   368  		// Create cgroup directories and mount the files we need over
   369  		// themselves so they stay read-write
   370  		for _, serviceName := range serviceNames {
   371  			appCgroup := filepath.Join(subcgroupPath, serviceName)
   372  			if err := os.MkdirAll(appCgroup, 0755); err != nil {
   373  				return err
   374  			}
   375  			for _, f := range getControllerRWFiles(c) {
   376  				cgroupFilePath := filepath.Join(appCgroup, f)
   377  				// the file may not be there if kernel doesn't support the
   378  				// feature, skip it in that case
   379  				if _, err := os.Stat(cgroupFilePath); os.IsNotExist(err) {
   380  					continue
   381  				}
   382  				if err := syscall.Mount(cgroupFilePath, cgroupFilePath, "", syscall.MS_BIND, ""); err != nil {
   383  					return fmt.Errorf("error bind mounting %q: %v", cgroupFilePath, err)
   384  				}
   385  			}
   386  		}
   387  
   388  		// Re-mount controller read-only to prevent the container modifying host controllers
   389  		flags = syscall.MS_BIND |
   390  			syscall.MS_REMOUNT |
   391  			syscall.MS_NOSUID |
   392  			syscall.MS_NOEXEC |
   393  			syscall.MS_NODEV |
   394  			syscall.MS_RDONLY
   395  		if err := syscall.Mount(cPath, cPath, "", flags, ""); err != nil {
   396  			return fmt.Errorf("error remounting RO %q: %v", cPath, err)
   397  		}
   398  	}
   399  
   400  	// Bind-mount sys filesystem read-only
   401  	flags = syscall.MS_BIND |
   402  		syscall.MS_REMOUNT |
   403  		syscall.MS_NOSUID |
   404  		syscall.MS_NOEXEC |
   405  		syscall.MS_NODEV |
   406  		syscall.MS_RDONLY
   407  	if err := syscall.Mount(sysPath, sysPath, "", flags, ""); err != nil {
   408  		return fmt.Errorf("error remounting RO %q: %v", sysPath, err)
   409  	}
   410  
   411  	return nil
   412  }