github.com/stackdocker/rkt@v0.10.1-0.20151109095037-1aa827478248/common/cgroup/cgroup.go (about) 1 // Copyright 2015 The rkt Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 //+build linux 16 17 package cgroup 18 19 import ( 20 "bufio" 21 "fmt" 22 "io" 23 "io/ioutil" 24 "os" 25 "path/filepath" 26 "strconv" 27 "strings" 28 "syscall" 29 30 "github.com/coreos/rkt/Godeps/_workspace/src/github.com/coreos/go-systemd/unit" 31 "github.com/coreos/rkt/Godeps/_workspace/src/k8s.io/kubernetes/pkg/api/resource" 32 ) 33 34 type addIsolatorFunc func(opts []*unit.UnitOption, limit *resource.Quantity) ([]*unit.UnitOption, error) 35 36 var ( 37 isolatorFuncs = map[string]addIsolatorFunc{ 38 "cpu": addCpuLimit, 39 "memory": addMemoryLimit, 40 } 41 cgroupControllerRWFiles = map[string][]string{ 42 "memory": []string{"memory.limit_in_bytes"}, 43 "cpu": []string{"cpu.cfs_quota_us"}, 44 } 45 ) 46 47 func addCpuLimit(opts []*unit.UnitOption, limit *resource.Quantity) ([]*unit.UnitOption, error) { 48 if limit.Value() > resource.MaxMilliValue { 49 return nil, fmt.Errorf("cpu limit exceeds the maximum millivalue: %v", limit.String()) 50 } 51 quota := strconv.Itoa(int(limit.MilliValue()/10)) + "%" 52 opts = append(opts, unit.NewUnitOption("Service", "CPUQuota", quota)) 53 return opts, nil 54 } 55 56 func addMemoryLimit(opts []*unit.UnitOption, limit *resource.Quantity) ([]*unit.UnitOption, error) { 57 opts = append(opts, unit.NewUnitOption("Service", "MemoryLimit", strconv.Itoa(int(limit.Value())))) 58 return opts, nil 59 } 60 61 // MaybeAddIsolator considers the given isolator; if the type is known 62 // (i.e. IsIsolatorSupported is true) and the limit is non-nil, the supplied 63 // opts will be extended with an appropriate option implementing the desired 64 // isolation. 65 func MaybeAddIsolator(opts []*unit.UnitOption, isolator string, limit *resource.Quantity) ([]*unit.UnitOption, error) { 66 var err error 67 if limit == nil { 68 return opts, nil 69 } 70 if IsIsolatorSupported(isolator) { 71 opts, err = isolatorFuncs[isolator](opts, limit) 72 if err != nil { 73 return nil, err 74 } 75 } else { 76 fmt.Fprintf(os.Stderr, "warning: resource/%s isolator set but support disabled in the kernel, skipping\n", isolator) 77 } 78 return opts, nil 79 } 80 81 // IsIsolatorSupported returns whether an isolator is supported in the kernel 82 func IsIsolatorSupported(isolator string) bool { 83 if files, ok := cgroupControllerRWFiles[isolator]; ok { 84 for _, f := range files { 85 isolatorPath := filepath.Join("/sys/fs/cgroup/", isolator, f) 86 if _, err := os.Stat(isolatorPath); os.IsNotExist(err) { 87 return false 88 } 89 } 90 return true 91 } 92 return false 93 } 94 95 func parseCgroups(f io.Reader) (map[int][]string, error) { 96 sc := bufio.NewScanner(f) 97 98 // skip first line since it is a comment 99 sc.Scan() 100 101 cgroups := make(map[int][]string) 102 for sc.Scan() { 103 var controller string 104 var hierarchy int 105 var num int 106 var enabled int 107 fmt.Sscanf(sc.Text(), "%s %d %d %d", &controller, &hierarchy, &num, &enabled) 108 109 if enabled == 1 { 110 if _, ok := cgroups[hierarchy]; !ok { 111 cgroups[hierarchy] = []string{controller} 112 } else { 113 cgroups[hierarchy] = append(cgroups[hierarchy], controller) 114 } 115 } 116 } 117 118 if err := sc.Err(); err != nil { 119 return nil, err 120 } 121 122 return cgroups, nil 123 } 124 125 // GetEnabledCgroups returns a map with the enabled cgroup controllers grouped by 126 // hierarchy 127 func GetEnabledCgroups() (map[int][]string, error) { 128 cgroupsFile, err := os.Open("/proc/cgroups") 129 if err != nil { 130 return nil, err 131 } 132 defer cgroupsFile.Close() 133 134 cgroups, err := parseCgroups(cgroupsFile) 135 if err != nil { 136 return nil, fmt.Errorf("error parsing /proc/cgroups: %v", err) 137 } 138 139 return cgroups, nil 140 } 141 142 // GetControllerDirs takes a map with the enabled cgroup controllers grouped by 143 // hierarchy and returns the directory names as they should be in 144 // /sys/fs/cgroup 145 func GetControllerDirs(cgroups map[int][]string) []string { 146 var controllers []string 147 for _, cs := range cgroups { 148 controllers = append(controllers, strings.Join(cs, ",")) 149 } 150 151 return controllers 152 } 153 154 func getControllerSymlinks(cgroups map[int][]string) map[string]string { 155 symlinks := make(map[string]string) 156 157 for _, cs := range cgroups { 158 if len(cs) > 1 { 159 tgt := strings.Join(cs, ",") 160 for _, ln := range cs { 161 symlinks[ln] = tgt 162 } 163 } 164 } 165 166 return symlinks 167 } 168 169 func getControllerRWFiles(controller string) []string { 170 parts := strings.Split(controller, ",") 171 for _, p := range parts { 172 if files, ok := cgroupControllerRWFiles[p]; ok { 173 // cgroup.procs always needs to be RW for allowing systemd to add 174 // processes to the controller 175 files = append(files, "cgroup.procs") 176 return files 177 } 178 } 179 180 return nil 181 } 182 183 func parseOwnCgroupController(controller string) ([]string, error) { 184 cgroupPath := "/proc/self/cgroup" 185 cg, err := os.Open(cgroupPath) 186 if err != nil { 187 return nil, fmt.Errorf("error opening /proc/self/cgroup: %v", err) 188 } 189 defer cg.Close() 190 191 s := bufio.NewScanner(cg) 192 for s.Scan() { 193 parts := strings.SplitN(s.Text(), ":", 3) 194 if len(parts) < 3 { 195 return nil, fmt.Errorf("error parsing /proc/self/cgroup") 196 } 197 controllerParts := strings.Split(parts[1], ",") 198 for _, c := range controllerParts { 199 if c == controller { 200 return parts, nil 201 } 202 } 203 } 204 205 return nil, fmt.Errorf("controller %q not found", controller) 206 } 207 208 // GetOwnCgroupPath returns the cgroup path of this process in controller 209 // hierarchy 210 func GetOwnCgroupPath(controller string) (string, error) { 211 parts, err := parseOwnCgroupController(controller) 212 if err != nil { 213 return "", err 214 } 215 return parts[2], nil 216 } 217 218 // JoinCgroup makes the calling process join the subcgroup hierarchy on a 219 // particular controller 220 func JoinSubcgroup(controller string, subcgroup string) error { 221 subcgroupPath := filepath.Join("/sys/fs/cgroup", controller, subcgroup) 222 if err := os.MkdirAll(subcgroupPath, 0600); err != nil { 223 return fmt.Errorf("error creating %q subcgroup: %v", subcgroup, err) 224 } 225 pidBytes := []byte(strconv.Itoa(os.Getpid())) 226 if err := ioutil.WriteFile(filepath.Join(subcgroupPath, "cgroup.procs"), pidBytes, 0600); err != nil { 227 return fmt.Errorf("error adding ourselves to the %q subcgroup: %v", subcgroup, err) 228 } 229 230 return nil 231 } 232 233 // If /system.slice does not exist in the cpuset controller, create it and 234 // configure it. 235 // Since this is a workaround, we ignore errors 236 func fixCpusetKnobs(cpusetPath string) { 237 cgroupPathFix := filepath.Join(cpusetPath, "system.slice") 238 _ = os.MkdirAll(cgroupPathFix, 0755) 239 knobs := []string{"cpuset.mems", "cpuset.cpus"} 240 for _, knob := range knobs { 241 parentFile := filepath.Join(filepath.Dir(cgroupPathFix), knob) 242 childFile := filepath.Join(cgroupPathFix, knob) 243 244 data, err := ioutil.ReadFile(childFile) 245 if err != nil { 246 continue 247 } 248 // If the file is already configured, don't change it 249 if strings.TrimSpace(string(data)) != "" { 250 continue 251 } 252 253 data, err = ioutil.ReadFile(parentFile) 254 if err == nil { 255 // Workaround: just write twice to workaround the kernel bug fixed by this commit: 256 // https://github.com/torvalds/linux/commit/24ee3cf89bef04e8bc23788aca4e029a3f0f06d9 257 ioutil.WriteFile(childFile, data, 0644) 258 ioutil.WriteFile(childFile, data, 0644) 259 } 260 } 261 } 262 263 // IsControllerMounted returns whether a controller is mounted by checking that 264 // cgroup.procs is accessible 265 func IsControllerMounted(c string) bool { 266 cgroupProcsPath := filepath.Join("/sys/fs/cgroup", c, "cgroup.procs") 267 if _, err := os.Stat(cgroupProcsPath); err != nil { 268 return false 269 } 270 271 return true 272 } 273 274 // CreateCgroups mounts the cgroup controllers hierarchy in /sys/fs/cgroup 275 // under root 276 func CreateCgroups(root string, enabledCgroups map[int][]string) error { 277 controllers := GetControllerDirs(enabledCgroups) 278 var flags uintptr 279 280 sys := filepath.Join(root, "/sys") 281 if err := os.MkdirAll(sys, 0700); err != nil { 282 return err 283 } 284 flags = syscall.MS_NOSUID | 285 syscall.MS_NOEXEC | 286 syscall.MS_NODEV 287 // If we're mounting the host cgroups, /sys is probably mounted so we 288 // ignore EBUSY 289 if err := syscall.Mount("sysfs", sys, "sysfs", flags, ""); err != nil && err != syscall.EBUSY { 290 return fmt.Errorf("error mounting %q: %v", sys, err) 291 } 292 293 cgroupTmpfs := filepath.Join(root, "/sys/fs/cgroup") 294 if err := os.MkdirAll(cgroupTmpfs, 0700); err != nil { 295 return err 296 } 297 flags = syscall.MS_NOSUID | 298 syscall.MS_NOEXEC | 299 syscall.MS_NODEV | 300 syscall.MS_STRICTATIME 301 if err := syscall.Mount("tmpfs", cgroupTmpfs, "tmpfs", flags, "mode=755"); err != nil { 302 return fmt.Errorf("error mounting %q: %v", cgroupTmpfs, err) 303 } 304 305 // Mount controllers 306 for _, c := range controllers { 307 cPath := filepath.Join(root, "/sys/fs/cgroup", c) 308 if err := os.MkdirAll(cPath, 0700); err != nil { 309 return err 310 } 311 312 flags = syscall.MS_NOSUID | 313 syscall.MS_NOEXEC | 314 syscall.MS_NODEV 315 if err := syscall.Mount("cgroup", cPath, "cgroup", flags, c); err != nil { 316 return fmt.Errorf("error mounting %q: %v", cPath, err) 317 } 318 } 319 320 // Create symlinks for combined controllers 321 symlinks := getControllerSymlinks(enabledCgroups) 322 for ln, tgt := range symlinks { 323 lnPath := filepath.Join(cgroupTmpfs, ln) 324 if err := os.Symlink(tgt, lnPath); err != nil { 325 return fmt.Errorf("error creating symlink: %v", err) 326 } 327 } 328 329 systemdControllerPath := filepath.Join(root, "/sys/fs/cgroup/systemd") 330 if err := os.MkdirAll(systemdControllerPath, 0700); err != nil { 331 return err 332 } 333 334 // Bind-mount cgroup tmpfs filesystem read-only 335 flags = syscall.MS_BIND | 336 syscall.MS_REMOUNT | 337 syscall.MS_NOSUID | 338 syscall.MS_NOEXEC | 339 syscall.MS_NODEV | 340 syscall.MS_RDONLY 341 if err := syscall.Mount(cgroupTmpfs, cgroupTmpfs, "", flags, ""); err != nil { 342 return fmt.Errorf("error remounting RO %q: %v", cgroupTmpfs, err) 343 } 344 345 return nil 346 } 347 348 // RemountCgroupsRO remounts the cgroup hierarchy under root read-only, leaving 349 // the needed knobs in the subcgroup for each app read-write so the systemd 350 // inside stage1 can apply isolators to them 351 func RemountCgroupsRO(root string, enabledCgroups map[int][]string, subcgroup string, serviceNames []string) error { 352 controllers := GetControllerDirs(enabledCgroups) 353 cgroupTmpfs := filepath.Join(root, "/sys/fs/cgroup") 354 sysPath := filepath.Join(root, "/sys") 355 356 var flags uintptr 357 358 // Mount RW knobs we need to make the enabled isolators work 359 for _, c := range controllers { 360 cPath := filepath.Join(cgroupTmpfs, c) 361 subcgroupPath := filepath.Join(cPath, subcgroup) 362 363 // Workaround for https://github.com/coreos/rkt/issues/1210 364 if c == "cpuset" { 365 fixCpusetKnobs(cPath) 366 } 367 368 // Create cgroup directories and mount the files we need over 369 // themselves so they stay read-write 370 for _, serviceName := range serviceNames { 371 appCgroup := filepath.Join(subcgroupPath, serviceName) 372 if err := os.MkdirAll(appCgroup, 0755); err != nil { 373 return err 374 } 375 for _, f := range getControllerRWFiles(c) { 376 cgroupFilePath := filepath.Join(appCgroup, f) 377 // the file may not be there if kernel doesn't support the 378 // feature, skip it in that case 379 if _, err := os.Stat(cgroupFilePath); os.IsNotExist(err) { 380 continue 381 } 382 if err := syscall.Mount(cgroupFilePath, cgroupFilePath, "", syscall.MS_BIND, ""); err != nil { 383 return fmt.Errorf("error bind mounting %q: %v", cgroupFilePath, err) 384 } 385 } 386 } 387 388 // Re-mount controller read-only to prevent the container modifying host controllers 389 flags = syscall.MS_BIND | 390 syscall.MS_REMOUNT | 391 syscall.MS_NOSUID | 392 syscall.MS_NOEXEC | 393 syscall.MS_NODEV | 394 syscall.MS_RDONLY 395 if err := syscall.Mount(cPath, cPath, "", flags, ""); err != nil { 396 return fmt.Errorf("error remounting RO %q: %v", cPath, err) 397 } 398 } 399 400 // Bind-mount sys filesystem read-only 401 flags = syscall.MS_BIND | 402 syscall.MS_REMOUNT | 403 syscall.MS_NOSUID | 404 syscall.MS_NOEXEC | 405 syscall.MS_NODEV | 406 syscall.MS_RDONLY 407 if err := syscall.Mount(sysPath, sysPath, "", flags, ""); err != nil { 408 return fmt.Errorf("error remounting RO %q: %v", sysPath, err) 409 } 410 411 return nil 412 }