// Source: github.com/rkt/rkt@v1.30.1-0.20200224141603-171c416fac02/common/cgroup/v1/cgroup.go

// Copyright 2016 The rkt Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//+build linux

package v1

import (
	"bufio"
	"errors"
	"fmt"
	"io"
	"io/ioutil"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"syscall"

	"github.com/hashicorp/errwrap"
	"github.com/rkt/rkt/pkg/fs"
)

// mountFsRO remounts the given mountPoint using the given flags read-only.
36 func mountFsRO(m fs.Mounter, mountPoint string, flags uintptr) error { 37 flags = flags | 38 syscall.MS_BIND | 39 syscall.MS_REMOUNT | 40 syscall.MS_RDONLY 41 42 if err := m.Mount(mountPoint, mountPoint, "", flags, ""); err != nil { 43 return errwrap.Wrap(fmt.Errorf("error remounting read-only %q", mountPoint), err) 44 } 45 46 return nil 47 } 48 49 func parseCgroups(f io.Reader) (map[int][]string, error) { 50 sc := bufio.NewScanner(f) 51 52 // skip first line since it is a comment 53 sc.Scan() 54 55 cgroups := make(map[int][]string) 56 for sc.Scan() { 57 var controller string 58 var hierarchy int 59 var num int 60 var enabled int 61 fmt.Sscanf(sc.Text(), "%s %d %d %d", &controller, &hierarchy, &num, &enabled) 62 63 if enabled == 1 { 64 if _, ok := cgroups[hierarchy]; !ok { 65 cgroups[hierarchy] = []string{controller} 66 } else { 67 cgroups[hierarchy] = append(cgroups[hierarchy], controller) 68 } 69 } 70 } 71 72 if err := sc.Err(); err != nil { 73 return nil, err 74 } 75 76 return cgroups, nil 77 } 78 79 // GetEnabledCgroups returns a map with the enabled cgroup controllers grouped by 80 // hierarchy 81 func GetEnabledCgroups() (map[int][]string, error) { 82 cgroupsFile, err := os.Open("/proc/cgroups") 83 if err != nil { 84 return nil, err 85 } 86 defer cgroupsFile.Close() 87 88 cgroups, err := parseCgroups(cgroupsFile) 89 if err != nil { 90 return nil, errwrap.Wrap(errors.New("error parsing /proc/cgroups"), err) 91 } 92 93 return cgroups, nil 94 } 95 96 // GetControllerDirs takes a map with the enabled cgroup controllers grouped by 97 // hierarchy and returns the directory names as they should be in 98 // /sys/fs/cgroup 99 func GetControllerDirs(cgroups map[int][]string) []string { 100 var controllers []string 101 for _, cs := range cgroups { 102 controllers = append(controllers, strings.Join(cs, ",")) 103 } 104 105 return controllers 106 } 107 108 func getControllerSymlinks(cgroups map[int][]string) map[string]string { 109 symlinks := make(map[string]string) 
110 111 for _, cs := range cgroups { 112 if len(cs) > 1 { 113 tgt := strings.Join(cs, ",") 114 for _, ln := range cs { 115 symlinks[ln] = tgt 116 } 117 } 118 } 119 120 return symlinks 121 } 122 123 func parseCgroupController(cgroupPath, controller string) ([]string, error) { 124 cg, err := os.Open(cgroupPath) 125 if err != nil { 126 return nil, errwrap.Wrap(errors.New("error opening /proc/self/cgroup"), err) 127 } 128 defer cg.Close() 129 130 s := bufio.NewScanner(cg) 131 for s.Scan() { 132 parts := strings.SplitN(s.Text(), ":", 3) 133 if len(parts) < 3 { 134 return nil, fmt.Errorf("error parsing /proc/self/cgroup") 135 } 136 controllerParts := strings.Split(parts[1], ",") 137 for _, c := range controllerParts { 138 if c == controller { 139 return parts, nil 140 } 141 } 142 } 143 144 return nil, fmt.Errorf("controller %q not found", controller) 145 } 146 147 // GetOwnCgroupPath returns the cgroup path of this process in controller 148 // hierarchy 149 func GetOwnCgroupPath(controller string) (string, error) { 150 parts, err := parseCgroupController("/proc/self/cgroup", controller) 151 if err != nil { 152 return "", err 153 } 154 return parts[2], nil 155 } 156 157 // GetCgroupPathByPid returns the cgroup path of the process with the given pid 158 // and given controller. 
159 func GetCgroupPathByPid(pid int, controller string) (string, error) { 160 parts, err := parseCgroupController(fmt.Sprintf("/proc/%d/cgroup", pid), controller) 161 if err != nil { 162 return "", err 163 } 164 return parts[2], nil 165 } 166 167 // JoinSubcgroup makes the calling process join the subcgroup hierarchy on a 168 // particular controller 169 func JoinSubcgroup(controller string, subcgroup string) error { 170 subcgroupPath := filepath.Join("/sys/fs/cgroup", controller, subcgroup) 171 if err := os.MkdirAll(subcgroupPath, 0600); err != nil { 172 return errwrap.Wrap(fmt.Errorf("error creating %q subcgroup", subcgroup), err) 173 } 174 pidBytes := []byte(strconv.Itoa(os.Getpid())) 175 if err := ioutil.WriteFile(filepath.Join(subcgroupPath, "cgroup.procs"), pidBytes, 0600); err != nil { 176 return errwrap.Wrap(fmt.Errorf("error adding ourselves to the %q subcgroup", subcgroup), err) 177 } 178 179 return nil 180 } 181 182 // Ensure that the hierarchy has consistent cpu restrictions. 183 // This may fail; since this is "fixup" code, we should ignore 184 // the error and proceed. 185 // 186 // This was originally a workaround for https://github.com/rkt/rkt/issues/1210 187 // but is actually useful to have around 188 // 189 // cpuSetPath should be <stage1rootfs>/sys/fs/cgroup/cpuset 190 func fixCpusetKnobs(cpusetPath, subcgroup, knob string) error { 191 if err := os.MkdirAll(filepath.Join(cpusetPath, subcgroup), 0755); err != nil { 192 return err 193 } 194 195 dirs := strings.Split(subcgroup, "/") 196 197 // Loop over every entry in the hierarchy, putting in the parent's value 198 // unless there is one already there. 
199 // Read from the root knob 200 parentFile := filepath.Join(cpusetPath, knob) 201 parentData, err := ioutil.ReadFile(parentFile) 202 if err != nil { 203 return errwrap.Wrapf("error reading cgroup "+parentFile, err) 204 } 205 206 // Loop over every directory in the subcgroup path 207 currDir := cpusetPath 208 for _, dir := range dirs { 209 currDir = filepath.Join(currDir, dir) 210 211 childFile := filepath.Join(currDir, knob) 212 childData, err := ioutil.ReadFile(childFile) 213 if err != nil { 214 return errwrap.Wrapf("error reading cgroup "+childFile, err) 215 } 216 217 // If there is already a value, don't write - and propagate 218 // this value to subsequent children 219 if strings.TrimSpace(string(childData)) != "" { 220 parentData = childData 221 continue 222 } 223 224 // Workaround: just write twice to workaround the kernel bug fixed by this commit: 225 // https://github.com/torvalds/linux/commit/24ee3cf89bef04e8bc23788aca4e029a3f0f06d9 226 if err := ioutil.WriteFile(childFile, parentData, 0644); err != nil { 227 return errwrap.Wrapf("error writing cgroup "+childFile, err) 228 } 229 if err := ioutil.WriteFile(childFile, parentData, 0644); err != nil { 230 return errwrap.Wrapf("error writing cgroup "+childFile, err) 231 } 232 } 233 return nil 234 } 235 236 // IsControllerMounted returns whether a controller is mounted by checking that 237 // cgroup.procs is accessible 238 func IsControllerMounted(c string) (bool, error) { 239 cgroupProcsPath := filepath.Join("/sys/fs/cgroup", c, "cgroup.procs") 240 if _, err := os.Stat(cgroupProcsPath); err != nil { 241 if !os.IsNotExist(err) { 242 return false, err 243 } 244 return false, nil 245 } 246 247 return true, nil 248 } 249 250 // CreateCgroups mounts the v1 cgroup controllers hierarchy in /sys/fs/cgroup 251 // under root 252 func CreateCgroups(m fs.Mounter, root string, enabledCgroups map[int][]string, mountContext string) error { 253 controllers := GetControllerDirs(enabledCgroups) 254 255 sys := 
filepath.Join(root, "/sys") 256 if err := os.MkdirAll(sys, 0700); err != nil { 257 return err 258 } 259 260 var sysfsFlags uintptr = syscall.MS_NOSUID | 261 syscall.MS_NOEXEC | 262 syscall.MS_NODEV 263 264 // If we're mounting the host cgroups, /sys is probably mounted so we 265 // ignore EBUSY 266 if err := m.Mount("sysfs", sys, "sysfs", sysfsFlags, ""); err != nil && err != syscall.EBUSY { 267 return errwrap.Wrap(fmt.Errorf("error mounting %q", sys), err) 268 } 269 270 cgroupTmpfs := filepath.Join(root, "/sys/fs/cgroup") 271 if err := os.MkdirAll(cgroupTmpfs, 0700); err != nil { 272 return err 273 } 274 275 var cgroupTmpfsFlags uintptr = syscall.MS_NOSUID | 276 syscall.MS_NOEXEC | 277 syscall.MS_NODEV | 278 syscall.MS_STRICTATIME 279 280 options := "mode=755" 281 if mountContext != "" { 282 options = fmt.Sprintf("mode=755,context=\"%s\"", mountContext) 283 } 284 285 if err := m.Mount("tmpfs", cgroupTmpfs, "tmpfs", cgroupTmpfsFlags, options); err != nil { 286 return errwrap.Wrap(fmt.Errorf("error mounting %q", cgroupTmpfs), err) 287 } 288 289 // Mount controllers 290 for _, c := range controllers { 291 cPath := filepath.Join(root, "/sys/fs/cgroup", c) 292 if err := os.MkdirAll(cPath, 0700); err != nil { 293 return err 294 } 295 296 var flags uintptr = syscall.MS_NOSUID | 297 syscall.MS_NOEXEC | 298 syscall.MS_NODEV 299 300 if err := m.Mount("cgroup", cPath, "cgroup", flags, c); err != nil { 301 return errwrap.Wrap(fmt.Errorf("error mounting %q", cPath), err) 302 } 303 } 304 305 // Create symlinks for combined controllers 306 symlinks := getControllerSymlinks(enabledCgroups) 307 for ln, tgt := range symlinks { 308 lnPath := filepath.Join(cgroupTmpfs, ln) 309 if err := os.Symlink(tgt, lnPath); err != nil { 310 return errwrap.Wrap(errors.New("error creating symlink"), err) 311 } 312 } 313 314 systemdControllerPath := filepath.Join(root, "/sys/fs/cgroup/systemd") 315 if err := os.MkdirAll(systemdControllerPath, 0700); err != nil { 316 return err 317 } 318 319 
unifiedPath := filepath.Join(root, "/sys/fs/cgroup/unified") 320 if err := os.MkdirAll(unifiedPath, 0700); err != nil { 321 return err 322 } 323 324 // Bind-mount cgroup tmpfs filesystem read-only 325 return mountFsRO(m, cgroupTmpfs, cgroupTmpfsFlags) 326 } 327 328 // RemountCgroups remounts the v1 cgroup hierarchy under root. 329 // It mounts /sys/fs/cgroup/[controller] read-only, 330 // but leaves needed knobs in the pod's subcgroup read-write, 331 // such that systemd inside stage1 can apply isolators to them. 332 // It leaves /sys read-write if the given readWrite parameter is true. 333 // When this is done, <stage1>/sys/fs/cgroup/<controller> should be RO, and 334 // <stage1>/sys/fs/cgroup/<cotroller>/.../machine-rkt/.../system.slice should be RW 335 func RemountCgroups(m fs.Mounter, root string, enabledCgroups map[int][]string, subcgroup string, readWrite bool) error { 336 controllers := GetControllerDirs(enabledCgroups) 337 cgroupTmpfs := filepath.Join(root, "/sys/fs/cgroup") 338 sysPath := filepath.Join(root, "/sys") 339 340 var flags uintptr = syscall.MS_NOSUID | 341 syscall.MS_NOEXEC | 342 syscall.MS_NODEV 343 344 // Mount RW the controllers for this pod 345 for _, c := range controllers { 346 cPath := filepath.Join(cgroupTmpfs, c) 347 subcgroupPath := filepath.Join(cPath, subcgroup, "system.slice") 348 349 if err := os.MkdirAll(subcgroupPath, 0755); err != nil { 350 return err 351 } 352 if err := m.Mount(subcgroupPath, subcgroupPath, "", syscall.MS_BIND, ""); err != nil { 353 return errwrap.Wrap(fmt.Errorf("error bind mounting %q", subcgroupPath), err) 354 } 355 356 // Workaround for https://github.com/rkt/rkt/issues/1210 357 // It is OK to ignore errors here. 
358 if c == "cpuset" { 359 _ = fixCpusetKnobs(cPath, subcgroup, "cpuset.mems") 360 _ = fixCpusetKnobs(cPath, subcgroup, "cpuset.cpus") 361 } 362 363 // Re-mount controller read-only to prevent the container modifying host controllers 364 if err := mountFsRO(m, cPath, flags); err != nil { 365 return err 366 } 367 } 368 369 if readWrite { // leave sys r/w? 370 return nil 371 } 372 373 // Bind-mount sys filesystem read-only 374 return mountFsRO(m, sysPath, flags) 375 }