github.com/blixtra/rkt@v0.8.1-0.20160204105720-ab0d1add1a43/common/cgroup/cgroup.go (about) 1 // Copyright 2015 The rkt Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 //+build linux 16 17 package cgroup 18 19 import ( 20 "bufio" 21 "errors" 22 "fmt" 23 "io" 24 "io/ioutil" 25 "os" 26 "path/filepath" 27 "strconv" 28 "strings" 29 "syscall" 30 31 "github.com/coreos/go-systemd/unit" 32 "github.com/hashicorp/errwrap" 33 "k8s.io/kubernetes/pkg/api/resource" 34 ) 35 36 type addIsolatorFunc func(opts []*unit.UnitOption, limit *resource.Quantity) ([]*unit.UnitOption, error) 37 38 var ( 39 isolatorFuncs = map[string]addIsolatorFunc{ 40 "cpu": addCpuLimit, 41 "memory": addMemoryLimit, 42 } 43 cgroupControllerRWFiles = map[string][]string{ 44 "memory": []string{"memory.limit_in_bytes"}, 45 "cpu": []string{"cpu.cfs_quota_us"}, 46 } 47 ) 48 49 func addCpuLimit(opts []*unit.UnitOption, limit *resource.Quantity) ([]*unit.UnitOption, error) { 50 if limit.Value() > resource.MaxMilliValue { 51 return nil, fmt.Errorf("cpu limit exceeds the maximum millivalue: %v", limit.String()) 52 } 53 quota := strconv.Itoa(int(limit.MilliValue()/10)) + "%" 54 opts = append(opts, unit.NewUnitOption("Service", "CPUQuota", quota)) 55 return opts, nil 56 } 57 58 func addMemoryLimit(opts []*unit.UnitOption, limit *resource.Quantity) ([]*unit.UnitOption, error) { 59 opts = append(opts, unit.NewUnitOption("Service", "MemoryLimit", strconv.Itoa(int(limit.Value())))) 60 return opts, nil 61 } 62 63 // MaybeAddIsolator considers the given isolator; if the type is known 64 // (i.e. IsIsolatorSupported is true) and the limit is non-nil, the supplied 65 // opts will be extended with an appropriate option implementing the desired 66 // isolation. 67 func MaybeAddIsolator(opts []*unit.UnitOption, isolator string, limit *resource.Quantity) ([]*unit.UnitOption, error) { 68 var err error 69 if limit == nil { 70 return opts, nil 71 } 72 if IsIsolatorSupported(isolator) { 73 opts, err = isolatorFuncs[isolator](opts, limit) 74 if err != nil { 75 return nil, err 76 } 77 } else { 78 fmt.Fprintf(os.Stderr, "warning: resource/%s isolator set but support disabled in the kernel, skipping\n", isolator) 79 } 80 return opts, nil 81 } 82 83 // IsIsolatorSupported returns whether an isolator is supported in the kernel 84 func IsIsolatorSupported(isolator string) bool { 85 if files, ok := cgroupControllerRWFiles[isolator]; ok { 86 for _, f := range files { 87 isolatorPath := filepath.Join("/sys/fs/cgroup/", isolator, f) 88 if _, err := os.Stat(isolatorPath); os.IsNotExist(err) { 89 return false 90 } 91 } 92 return true 93 } 94 return false 95 } 96 97 func parseCgroups(f io.Reader) (map[int][]string, error) { 98 sc := bufio.NewScanner(f) 99 100 // skip first line since it is a comment 101 sc.Scan() 102 103 cgroups := make(map[int][]string) 104 for sc.Scan() { 105 var controller string 106 var hierarchy int 107 var num int 108 var enabled int 109 fmt.Sscanf(sc.Text(), "%s %d %d %d", &controller, &hierarchy, &num, &enabled) 110 111 if enabled == 1 { 112 if _, ok := cgroups[hierarchy]; !ok { 113 cgroups[hierarchy] = []string{controller} 114 } else { 115 cgroups[hierarchy] = append(cgroups[hierarchy], controller) 116 } 117 } 118 } 119 120 if err := sc.Err(); err != nil { 121 return nil, err 122 } 123 124 return cgroups, nil 125 } 126 127 // GetEnabledCgroups returns a map with the enabled cgroup controllers grouped by 128 // hierarchy 129 func GetEnabledCgroups() (map[int][]string, error) { 130 cgroupsFile, err := os.Open("/proc/cgroups") 131 if err != nil { 132 return nil, err 133 } 134 defer cgroupsFile.Close() 135 136 cgroups, err := parseCgroups(cgroupsFile) 137 if err != nil { 138 return nil, errwrap.Wrap(errors.New("error parsing /proc/cgroups"), err) 139 } 140 141 return cgroups, nil 142 } 143 144 // GetControllerDirs takes a map with the enabled cgroup controllers grouped by 145 // hierarchy and returns the directory names as they should be in 146 // /sys/fs/cgroup 147 func GetControllerDirs(cgroups map[int][]string) []string { 148 var controllers []string 149 for _, cs := range cgroups { 150 controllers = append(controllers, strings.Join(cs, ",")) 151 } 152 153 return controllers 154 } 155 156 func getControllerSymlinks(cgroups map[int][]string) map[string]string { 157 symlinks := make(map[string]string) 158 159 for _, cs := range cgroups { 160 if len(cs) > 1 { 161 tgt := strings.Join(cs, ",") 162 for _, ln := range cs { 163 symlinks[ln] = tgt 164 } 165 } 166 } 167 168 return symlinks 169 } 170 171 func getControllerRWFiles(controller string) []string { 172 parts := strings.Split(controller, ",") 173 for _, p := range parts { 174 if files, ok := cgroupControllerRWFiles[p]; ok { 175 // cgroup.procs always needs to be RW for allowing systemd to add 176 // processes to the controller 177 files = append(files, "cgroup.procs") 178 return files 179 } 180 } 181 182 return nil 183 } 184 185 func parseOwnCgroupController(controller string) ([]string, error) { 186 cgroupPath := "/proc/self/cgroup" 187 cg, err := os.Open(cgroupPath) 188 if err != nil { 189 return nil, errwrap.Wrap(errors.New("error opening /proc/self/cgroup"), err) 190 } 191 defer cg.Close() 192 193 s := bufio.NewScanner(cg) 194 for s.Scan() { 195 parts := strings.SplitN(s.Text(), ":", 3) 196 if len(parts) < 3 { 197 return nil, fmt.Errorf("error parsing /proc/self/cgroup") 198 } 199 controllerParts := strings.Split(parts[1], ",") 200 for _, c := range controllerParts { 201 if c == controller { 202 return parts, nil 203 } 204 } 205 } 206 207 return nil, fmt.Errorf("controller %q not found", controller) 208 } 209 210 // GetOwnCgroupPath returns the cgroup path of this process in controller 211 // hierarchy 212 func GetOwnCgroupPath(controller string) (string, error) { 213 parts, err := parseOwnCgroupController(controller) 214 if err != nil { 215 return "", err 216 } 217 return parts[2], nil 218 } 219 220 // JoinCgroup makes the calling process join the subcgroup hierarchy on a 221 // particular controller 222 func JoinSubcgroup(controller string, subcgroup string) error { 223 subcgroupPath := filepath.Join("/sys/fs/cgroup", controller, subcgroup) 224 if err := os.MkdirAll(subcgroupPath, 0600); err != nil { 225 return errwrap.Wrap(fmt.Errorf("error creating %q subcgroup", subcgroup), err) 226 } 227 pidBytes := []byte(strconv.Itoa(os.Getpid())) 228 if err := ioutil.WriteFile(filepath.Join(subcgroupPath, "cgroup.procs"), pidBytes, 0600); err != nil { 229 return errwrap.Wrap(fmt.Errorf("error adding ourselves to the %q subcgroup", subcgroup), err) 230 } 231 232 return nil 233 } 234 235 // If /system.slice does not exist in the cpuset controller, create it and 236 // configure it. 237 // Since this is a workaround, we ignore errors 238 func fixCpusetKnobs(cpusetPath string) { 239 cgroupPathFix := filepath.Join(cpusetPath, "system.slice") 240 _ = os.MkdirAll(cgroupPathFix, 0755) 241 knobs := []string{"cpuset.mems", "cpuset.cpus"} 242 for _, knob := range knobs { 243 parentFile := filepath.Join(filepath.Dir(cgroupPathFix), knob) 244 childFile := filepath.Join(cgroupPathFix, knob) 245 246 data, err := ioutil.ReadFile(childFile) 247 if err != nil { 248 continue 249 } 250 // If the file is already configured, don't change it 251 if strings.TrimSpace(string(data)) != "" { 252 continue 253 } 254 255 data, err = ioutil.ReadFile(parentFile) 256 if err == nil { 257 // Workaround: just write twice to workaround the kernel bug fixed by this commit: 258 // https://github.com/torvalds/linux/commit/24ee3cf89bef04e8bc23788aca4e029a3f0f06d9 259 ioutil.WriteFile(childFile, data, 0644) 260 ioutil.WriteFile(childFile, data, 0644) 261 } 262 } 263 } 264 265 // IsControllerMounted returns whether a controller is mounted by checking that 266 // cgroup.procs is accessible 267 func IsControllerMounted(c string) bool { 268 cgroupProcsPath := filepath.Join("/sys/fs/cgroup", c, "cgroup.procs") 269 if _, err := os.Stat(cgroupProcsPath); err != nil { 270 return false 271 } 272 273 return true 274 } 275 276 // CreateCgroups mounts the cgroup controllers hierarchy in /sys/fs/cgroup 277 // under root 278 func CreateCgroups(root string, enabledCgroups map[int][]string) error { 279 controllers := GetControllerDirs(enabledCgroups) 280 var flags uintptr 281 282 sys := filepath.Join(root, "/sys") 283 if err := os.MkdirAll(sys, 0700); err != nil { 284 return err 285 } 286 flags = syscall.MS_NOSUID | 287 syscall.MS_NOEXEC | 288 syscall.MS_NODEV 289 // If we're mounting the host cgroups, /sys is probably mounted so we 290 // ignore EBUSY 291 if err := syscall.Mount("sysfs", sys, "sysfs", flags, ""); err != nil && err != syscall.EBUSY { 292 return errwrap.Wrap(fmt.Errorf("error mounting %q", sys), err) 293 } 294 295 cgroupTmpfs := filepath.Join(root, "/sys/fs/cgroup") 296 if err := os.MkdirAll(cgroupTmpfs, 0700); err != nil { 297 return err 298 } 299 flags = syscall.MS_NOSUID | 300 syscall.MS_NOEXEC | 301 syscall.MS_NODEV | 302 syscall.MS_STRICTATIME 303 if err := syscall.Mount("tmpfs", cgroupTmpfs, "tmpfs", flags, "mode=755"); err != nil { 304 return errwrap.Wrap(fmt.Errorf("error mounting %q", cgroupTmpfs), err) 305 } 306 307 // Mount controllers 308 for _, c := range controllers { 309 cPath := filepath.Join(root, "/sys/fs/cgroup", c) 310 if err := os.MkdirAll(cPath, 0700); err != nil { 311 return err 312 } 313 314 flags = syscall.MS_NOSUID | 315 syscall.MS_NOEXEC | 316 syscall.MS_NODEV 317 if err := syscall.Mount("cgroup", cPath, "cgroup", flags, c); err != nil { 318 return errwrap.Wrap(fmt.Errorf("error mounting %q", cPath), err) 319 } 320 } 321 322 // Create symlinks for combined controllers 323 symlinks := getControllerSymlinks(enabledCgroups) 324 for ln, tgt := range symlinks { 325 lnPath := filepath.Join(cgroupTmpfs, ln) 326 if err := os.Symlink(tgt, lnPath); err != nil { 327 return errwrap.Wrap(errors.New("error creating symlink"), err) 328 } 329 } 330 331 systemdControllerPath := filepath.Join(root, "/sys/fs/cgroup/systemd") 332 if err := os.MkdirAll(systemdControllerPath, 0700); err != nil { 333 return err 334 } 335 336 // Bind-mount cgroup tmpfs filesystem read-only 337 flags = syscall.MS_BIND | 338 syscall.MS_REMOUNT | 339 syscall.MS_NOSUID | 340 syscall.MS_NOEXEC | 341 syscall.MS_NODEV | 342 syscall.MS_RDONLY 343 if err := syscall.Mount(cgroupTmpfs, cgroupTmpfs, "", flags, ""); err != nil { 344 return errwrap.Wrap(fmt.Errorf("error remounting RO %q", cgroupTmpfs), err) 345 } 346 347 return nil 348 } 349 350 // RemountCgroupsRO remounts the cgroup hierarchy under root read-only, leaving 351 // the needed knobs in the subcgroup for each app read-write so the systemd 352 // inside stage1 can apply isolators to them 353 func RemountCgroupsRO(root string, enabledCgroups map[int][]string, subcgroup string, serviceNames []string) error { 354 controllers := GetControllerDirs(enabledCgroups) 355 cgroupTmpfs := filepath.Join(root, "/sys/fs/cgroup") 356 sysPath := filepath.Join(root, "/sys") 357 358 var flags uintptr 359 360 // Mount RW knobs we need to make the enabled isolators work 361 for _, c := range controllers { 362 cPath := filepath.Join(cgroupTmpfs, c) 363 subcgroupPath := filepath.Join(cPath, subcgroup) 364 365 // Workaround for https://github.com/coreos/rkt/issues/1210 366 if c == "cpuset" { 367 fixCpusetKnobs(cPath) 368 } 369 370 // Create cgroup directories and mount the files we need over 371 // themselves so they stay read-write 372 for _, serviceName := range serviceNames { 373 appCgroup := filepath.Join(subcgroupPath, serviceName) 374 if err := os.MkdirAll(appCgroup, 0755); err != nil { 375 return err 376 } 377 for _, f := range getControllerRWFiles(c) { 378 cgroupFilePath := filepath.Join(appCgroup, f) 379 // the file may not be there if kernel doesn't support the 380 // feature, skip it in that case 381 if _, err := os.Stat(cgroupFilePath); os.IsNotExist(err) { 382 continue 383 } 384 if err := syscall.Mount(cgroupFilePath, cgroupFilePath, "", syscall.MS_BIND, ""); err != nil { 385 return errwrap.Wrap(fmt.Errorf("error bind mounting %q", cgroupFilePath), err) 386 } 387 } 388 } 389 390 // Re-mount controller read-only to prevent the container modifying host controllers 391 flags = syscall.MS_BIND | 392 syscall.MS_REMOUNT | 393 syscall.MS_NOSUID | 394 syscall.MS_NOEXEC | 395 syscall.MS_NODEV | 396 syscall.MS_RDONLY 397 if err := syscall.Mount(cPath, cPath, "", flags, ""); err != nil { 398 return errwrap.Wrap(fmt.Errorf("error remounting RO %q", cPath), err) 399 } 400 } 401 402 // Bind-mount sys filesystem read-only 403 flags = syscall.MS_BIND | 404 syscall.MS_REMOUNT | 405 syscall.MS_NOSUID | 406 syscall.MS_NOEXEC | 407 syscall.MS_NODEV | 408 syscall.MS_RDONLY 409 if err := syscall.Mount(sysPath, sysPath, "", flags, ""); err != nil { 410 return errwrap.Wrap(fmt.Errorf("error remounting RO %q", sysPath), err) 411 } 412 413 return nil 414 }