github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go (about) 1 // Copyright 2021 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package cgroupfs implements cgroupfs. 16 // 17 // A cgroup is a collection of tasks on the system, organized into a tree-like 18 // structure similar to a filesystem directory tree. In fact, each cgroup is 19 // represented by a directory on cgroupfs, and is manipulated through control 20 // files in the directory. 21 // 22 // All cgroups on a system are organized into hierarchies. Hierarchies are a 23 // distinct tree of cgroups, with a common set of controllers. One or more 24 // cgroupfs mounts may point to each hierarchy. These mounts provide a common 25 // view into the same tree of cgroups. 26 // 27 // A controller (also known as a "resource controller", or a cgroup "subsystem") 28 // determines the behaviour of each cgroup. 29 // 30 // In addition to cgroupfs, the kernel has a cgroup registry that tracks 31 // system-wide state related to cgroups such as active hierarchies and the 32 // controllers associated with them. 33 // 34 // Since cgroupfs doesn't allow hardlinks, there is a unique mapping between 35 // cgroupfs dentries and inodes. Thus, cgroupfs inodes don't need to be ref 36 // counted and exist until they're unlinked once or the FS is destroyed. 
//
// # Synchronization
//
// Cgroup hierarchy creation and destruction is protected by the
// kernel.CgroupRegistry.mu. Once created, a hierarchy's set of controllers, the
// filesystem associated with it, and the root cgroup for the hierarchy are
// immutable.
//
// Membership of tasks within cgroups is protected by
// cgroupfs.filesystem.tasksMu. Tasks also maintain a set of all cgroups they're
// in, and this list is protected by Task.mu.
//
// Lock order:
//
//	kernel.CgroupRegistry.mu
//		kernfs.filesystem.mu
//			kernel.TaskSet.mu
//				kernel.Task.mu
//					cgroupfs.filesystem.tasksMu
//						cgroupfs.dir.OrderedChildren.mu
package cgroupfs

import (
	"bytes"
	"fmt"
	"sort"
	"strconv"
	"strings"

	"github.com/metacubex/gvisor/pkg/abi/linux"
	"github.com/metacubex/gvisor/pkg/atomicbitops"
	"github.com/metacubex/gvisor/pkg/context"
	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
	"github.com/metacubex/gvisor/pkg/fspath"
	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/kernfs"
	"github.com/metacubex/gvisor/pkg/sentry/kernel"
	"github.com/metacubex/gvisor/pkg/sentry/kernel/auth"
	"github.com/metacubex/gvisor/pkg/sentry/vfs"
	"github.com/metacubex/gvisor/pkg/usermem"
)

const (
	// Name is the default filesystem name.
	Name = "cgroup"

	// readonlyFileMode is the mode used for control files created by
	// newControllerFile.
	readonlyFileMode = linux.FileMode(0444)
	// writableFileMode is the mode used for control files created by
	// newControllerWritableFile and newStubControllerFile.
	writableFileMode = linux.FileMode(0644)
	// defaultDirMode is the mode of the root cgroup directory, and of the
	// initial cgroup path components unless InitialCgroup.SetMode is set.
	defaultDirMode = linux.FileMode(0555) | linux.ModeDirectory

	// defaultMaxCachedDentries is the dentry cache limit used when the
	// "dentry_cache_limit" mount option is not given.
	defaultMaxCachedDentries = uint64(1000)
)

// allControllers lists every controller type that GetFilesystem knows how to
// instantiate. It is the controller set used when "all" is requested, or when
// no controller is named and "none" isn't specified.
var allControllers = []kernel.CgroupControllerType{
	kernel.CgroupControllerCPU,
	kernel.CgroupControllerCPUAcct,
	kernel.CgroupControllerCPUSet,
	kernel.CgroupControllerDevices,
	kernel.CgroupControllerJob,
	kernel.CgroupControllerMemory,
	kernel.CgroupControllerPIDs,
}

// SupportedMountOptions is the set of supported mount options for cgroupfs.
99 var SupportedMountOptions = []string{"all", "cpu", "cpuacct", "cpuset", "devices", "job", "memory", "pids"} 100 101 // FilesystemType implements vfs.FilesystemType. 102 // 103 // +stateify savable 104 type FilesystemType struct{} 105 106 // InitialCgroup specifies properties of the cgroup for the init task. 107 // 108 // +stateify savable 109 type InitialCgroup struct { 110 // Path is an absolute path relative to the root of a cgroupfs filesystem 111 // that indicates where to place the init task. An empty string indicates 112 // the root of the filesystem. 113 Path string 114 115 // SetOwner indicates the UID and GID fields contain valid values. If true, 116 // Both UID and GID must be provided. 117 SetOwner bool 118 // UID of the initial cgroup path components, excluding the root cgroup. 119 UID auth.KUID 120 // GID of the initial cgroup path components, excluding the root cgroup. 121 GID auth.KGID 122 123 // SetMode indicates the Mode field contains a valid value. 124 SetMode bool 125 // Mode of the initial cgroup path components, excluding the root cgroup. 126 Mode linux.FileMode 127 } 128 129 // InternalData contains internal data passed in to the cgroupfs mount via 130 // vfs.GetFilesystemOptions.InternalData. 131 // 132 // +stateify savable 133 type InternalData struct { 134 DefaultControlValues map[string]int64 135 InitialCgroup InitialCgroup 136 } 137 138 // filesystem implements vfs.FilesystemImpl and kernel.cgroupFS. 139 // 140 // +stateify savable 141 type filesystem struct { 142 kernfs.Filesystem 143 devMinor uint32 144 145 // hierarchyID is the id the cgroup registry assigns to this hierarchy. Has 146 // the value kernel.InvalidCgroupHierarchyID until the FS is fully 147 // initialized. 148 // 149 // hierarchyID is immutable after initialization. 150 hierarchyID uint32 151 152 // hierarchyName is the name for a named hierarchy. May be empty if the 153 // 'name=' mount option was not used when the hierarchy was created. 
154 // 155 // Immutable after initialization. 156 hierarchyName string 157 158 // controllers and kcontrollers are both the list of controllers attached to 159 // this cgroupfs. Both lists are the same set of controllers, but typecast 160 // to different interfaces for convenience. Both must stay in sync, and are 161 // immutable. 162 controllers []controller 163 kcontrollers []kernel.CgroupController 164 165 numCgroups atomicbitops.Uint64 // Protected by atomic ops. 166 167 root *kernfs.Dentry 168 // effectiveRoot is the initial cgroup new tasks are created in. Unless 169 // overwritten by internal mount options, root == effectiveRoot. If 170 // effectiveRoot != root, an extra reference is held on effectiveRoot for 171 // the lifetime of the filesystem. 172 effectiveRoot *kernfs.Dentry 173 174 // tasksMu serializes task membership changes across all cgroups within a 175 // filesystem. 176 tasksMu taskRWMutex `state:"nosave"` 177 } 178 179 // InitializeHierarchyID implements kernel.cgroupFS.InitializeHierarchyID. 180 func (fs *filesystem) InitializeHierarchyID(hid uint32) { 181 fs.hierarchyID = hid 182 } 183 184 // RootCgroup implements kernel.cgroupFS.RootCgroup. 185 func (fs *filesystem) RootCgroup() kernel.Cgroup { 186 return kernel.Cgroup{ 187 Dentry: fs.root, 188 CgroupImpl: fs.root.Inode().(kernel.CgroupImpl), 189 } 190 } 191 192 // Name implements vfs.FilesystemType.Name. 193 func (FilesystemType) Name() string { 194 return Name 195 } 196 197 // Release implements vfs.FilesystemType.Release. 198 func (FilesystemType) Release(ctx context.Context) {} 199 200 // GetFilesystem implements vfs.FilesystemType.GetFilesystem. 
201 func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { 202 devMinor, err := vfsObj.GetAnonBlockDevMinor() 203 if err != nil { 204 return nil, nil, err 205 } 206 207 mopts := vfs.GenericParseMountOptions(opts.Data) 208 maxCachedDentries := defaultMaxCachedDentries 209 if str, ok := mopts["dentry_cache_limit"]; ok { 210 delete(mopts, "dentry_cache_limit") 211 maxCachedDentries, err = strconv.ParseUint(str, 10, 64) 212 if err != nil { 213 ctx.Warningf("sys.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str) 214 return nil, nil, linuxerr.EINVAL 215 } 216 } 217 218 var wantControllers []kernel.CgroupControllerType 219 if _, ok := mopts["cpu"]; ok { 220 delete(mopts, "cpu") 221 wantControllers = append(wantControllers, kernel.CgroupControllerCPU) 222 } 223 if _, ok := mopts["cpuacct"]; ok { 224 delete(mopts, "cpuacct") 225 wantControllers = append(wantControllers, kernel.CgroupControllerCPUAcct) 226 } 227 if _, ok := mopts["cpuset"]; ok { 228 delete(mopts, "cpuset") 229 wantControllers = append(wantControllers, kernel.CgroupControllerCPUSet) 230 } 231 if _, ok := mopts["devices"]; ok { 232 delete(mopts, "devices") 233 wantControllers = append(wantControllers, kernel.CgroupControllerDevices) 234 } 235 if _, ok := mopts["job"]; ok { 236 delete(mopts, "job") 237 wantControllers = append(wantControllers, kernel.CgroupControllerJob) 238 } 239 if _, ok := mopts["memory"]; ok { 240 delete(mopts, "memory") 241 wantControllers = append(wantControllers, kernel.CgroupControllerMemory) 242 } 243 if _, ok := mopts["pids"]; ok { 244 delete(mopts, "pids") 245 wantControllers = append(wantControllers, kernel.CgroupControllerPIDs) 246 } 247 if _, ok := mopts["all"]; ok { 248 if len(wantControllers) > 0 { 249 ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: other controllers specified with all: %v", 
wantControllers) 250 return nil, nil, linuxerr.EINVAL 251 } 252 253 delete(mopts, "all") 254 wantControllers = allControllers 255 } 256 257 var name string 258 var ok bool 259 if name, ok = mopts["name"]; ok { 260 delete(mopts, "name") 261 } 262 263 var none bool 264 if _, ok = mopts["none"]; ok { 265 none = true 266 delete(mopts, "none") 267 } 268 269 if !none && len(wantControllers) == 0 { 270 // Specifying no controllers implies all controllers, unless "none" was 271 // explicitly requested. 272 wantControllers = allControllers 273 } 274 275 // Some combinations of "none", "all", "name=" and explicit controllers are 276 // not allowed. See Linux, kernel/cgroup.c:parse_cgroupfs_options(). 277 278 // All empty hierarchies must have a name. 279 if len(wantControllers) == 0 && name == "" { 280 ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: empty hierarchy with no name") 281 return nil, nil, linuxerr.EINVAL 282 } 283 284 // Can't have "none" and some controllers. 285 if none && len(wantControllers) != 0 { 286 ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: 'none' specified with controllers: %v", wantControllers) 287 return nil, nil, linuxerr.EINVAL 288 } 289 290 if len(mopts) != 0 { 291 ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: unknown options: %v", mopts) 292 return nil, nil, linuxerr.EINVAL 293 } 294 295 k := kernel.KernelFromContext(ctx) 296 r := k.CgroupRegistry() 297 298 // "It is not possible to mount the same controller against multiple 299 // cgroup hierarchies. For example, it is not possible to mount both 300 // the cpu and cpuacct controllers against one hierarchy, and to mount 301 // the cpu controller alone against another hierarchy." - man cgroups(7) 302 // 303 // Is there a hierarchy available with all the controllers we want? If so, 304 // this mount is a view into the same hierarchy. 305 // 306 // Note: we're guaranteed to have at least one requested controller, since 307 // no explicit controller name implies all controllers. 
308 vfsfs, err := r.FindHierarchy(name, wantControllers) 309 if err != nil { 310 return nil, nil, err 311 } 312 if vfsfs != nil { 313 fs := vfsfs.Impl().(*filesystem) 314 ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: mounting new view to hierarchy %v", fs.hierarchyID) 315 fs.root.IncRef() 316 if fs.effectiveRoot != fs.root { 317 fs.effectiveRoot.IncRef() 318 } 319 return vfsfs, fs.root.VFSDentry(), nil 320 } 321 322 // No existing hierarchy with the exactly controllers found. Make a new 323 // one. Note that it's possible this mount creation is unsatisfiable, if one 324 // or more of the requested controllers are already on existing 325 // hierarchies. We'll find out about such collisions when we try to register 326 // the new hierarchy later. 327 fs := &filesystem{ 328 devMinor: devMinor, 329 hierarchyName: name, 330 } 331 fs.MaxCachedDentries = maxCachedDentries 332 fs.VFSFilesystem().Init(vfsObj, &fsType, fs) 333 334 var defaults map[string]int64 335 if opts.InternalData != nil { 336 defaults = opts.InternalData.(*InternalData).DefaultControlValues 337 ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: default control values: %v", defaults) 338 } 339 340 for _, ty := range wantControllers { 341 var c controller 342 switch ty { 343 case kernel.CgroupControllerCPU: 344 c = newCPUController(fs, defaults) 345 case kernel.CgroupControllerCPUAcct: 346 c = newCPUAcctController(fs) 347 case kernel.CgroupControllerCPUSet: 348 c = newCPUSetController(k, fs) 349 case kernel.CgroupControllerDevices: 350 c = newDevicesController(fs) 351 case kernel.CgroupControllerJob: 352 c = newJobController(fs) 353 case kernel.CgroupControllerMemory: 354 c = newMemoryController(fs, defaults) 355 case kernel.CgroupControllerPIDs: 356 c = newRootPIDsController(fs) 357 default: 358 panic(fmt.Sprintf("Unreachable: unknown cgroup controller %q", ty)) 359 } 360 fs.controllers = append(fs.controllers, c) 361 } 362 363 if len(defaults) != 0 { 364 // Internal data is always provided at sentry 
startup and unused values 365 // indicate a problem with the sandbox config. Fail fast. 366 panic(fmt.Sprintf("cgroupfs.FilesystemType.GetFilesystem: unknown internal mount data: %v", defaults)) 367 } 368 369 // Controllers usually appear in alphabetical order when displayed. Sort it 370 // here now, so it never needs to be sorted elsewhere. 371 sort.Slice(fs.controllers, func(i, j int) bool { return fs.controllers[i].Type() < fs.controllers[j].Type() }) 372 fs.kcontrollers = make([]kernel.CgroupController, 0, len(fs.controllers)) 373 for _, c := range fs.controllers { 374 fs.kcontrollers = append(fs.kcontrollers, c) 375 } 376 377 root := fs.newCgroupInode(ctx, creds, nil, defaultDirMode) 378 var rootD kernfs.Dentry 379 rootD.InitRoot(&fs.Filesystem, root) 380 fs.root = &rootD 381 fs.effectiveRoot = fs.root 382 383 if err := fs.prepareInitialCgroup(ctx, vfsObj, opts); err != nil { 384 ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: failed to prepare initial cgroup: %v", err) 385 rootD.DecRef(ctx) 386 fs.VFSFilesystem().DecRef(ctx) 387 return nil, nil, err 388 } 389 390 // Register controllers. The registry may be modified concurrently, so if we 391 // get an error, we raced with someone else who registered the same 392 // controllers first. 393 if err := r.Register(name, fs.kcontrollers, fs); err != nil { 394 ctx.Infof("cgroupfs.FilesystemType.GetFilesystem: failed to register new hierarchy with controllers %v: %v", wantControllers, err) 395 rootD.DecRef(ctx) 396 fs.VFSFilesystem().DecRef(ctx) 397 return nil, nil, linuxerr.EBUSY 398 } 399 400 // Move all existing tasks to the root of the new hierarchy. 401 k.PopulateNewCgroupHierarchy(fs.effectiveRootCgroup()) 402 403 return fs.VFSFilesystem(), rootD.VFSDentry(), nil 404 } 405 406 // prepareInitialCgroup creates the initial cgroup according to opts. An initial 407 // cgroup is optional, and if not specified, this function is a no-op. 
408 func (fs *filesystem) prepareInitialCgroup(ctx context.Context, vfsObj *vfs.VirtualFilesystem, opts vfs.GetFilesystemOptions) error { 409 if opts.InternalData == nil { 410 return nil 411 } 412 idata := opts.InternalData.(*InternalData) 413 414 initPathStr := idata.InitialCgroup.Path 415 if initPathStr == "" { 416 return nil 417 } 418 ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup path: %v", initPathStr) 419 initPath := fspath.Parse(initPathStr) 420 if !initPath.Absolute { 421 ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup path invalid: %+v", initPath) 422 return linuxerr.EINVAL 423 } 424 if !initPath.HasComponents() { 425 // Explicit "/" as initial cgroup, nothing to do. 426 return nil 427 } 428 429 ownerCreds := auth.CredentialsFromContext(ctx).Fork() 430 if idata.InitialCgroup.SetOwner { 431 ownerCreds.EffectiveKUID = idata.InitialCgroup.UID 432 ownerCreds.EffectiveKGID = idata.InitialCgroup.GID 433 } 434 mode := defaultDirMode 435 if idata.InitialCgroup.SetMode { 436 mode = idata.InitialCgroup.Mode 437 } 438 439 // Have initial cgroup target, create the tree. 440 cgDir := fs.root.Inode().(*cgroupInode) 441 for pit := initPath.Begin; pit.Ok(); pit = pit.Next() { 442 cgDirI, err := cgDir.newDirWithOwner(ctx, ownerCreds, pit.String(), vfs.MkdirOptions{Mode: mode}) 443 if err != nil { 444 return err 445 } 446 cgDir = cgDirI.(*cgroupInode) 447 } 448 449 // Walk to target dentry. 450 initDentry, err := fs.root.WalkDentryTree(ctx, vfsObj, initPath) 451 if err != nil { 452 ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup dentry not found: %v", err) 453 return linuxerr.ENOENT 454 } 455 fs.effectiveRoot = initDentry // Reference from WalkDentryTree transferred here. 
456 return nil 457 } 458 459 func (fs *filesystem) effectiveRootCgroup() kernel.Cgroup { 460 return kernel.Cgroup{ 461 Dentry: fs.effectiveRoot, 462 CgroupImpl: fs.effectiveRoot.Inode().(kernel.CgroupImpl), 463 } 464 } 465 466 // Release implements vfs.FilesystemImpl.Release. 467 func (fs *filesystem) Release(ctx context.Context) { 468 k := kernel.KernelFromContext(ctx) 469 r := k.CgroupRegistry() 470 471 if fs.hierarchyID != kernel.InvalidCgroupHierarchyID { 472 k.ReleaseCgroupHierarchy(fs.hierarchyID) 473 r.Unregister(fs.hierarchyID) 474 } 475 476 if fs.root != fs.effectiveRoot { 477 fs.effectiveRoot.DecRef(ctx) 478 } 479 480 fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) 481 fs.Filesystem.Release(ctx) 482 } 483 484 // MountOptions implements vfs.FilesystemImpl.MountOptions. 485 func (fs *filesystem) MountOptions() string { 486 var cnames []string 487 for _, c := range fs.controllers { 488 cnames = append(cnames, string(c.Type())) 489 } 490 return strings.Join(cnames, ",") 491 } 492 493 // +stateify savable 494 type implStatFS struct{} 495 496 // StatFS implements kernfs.Inode.StatFS. 497 func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { 498 return vfs.GenericStatFS(linux.CGROUP_SUPER_MAGIC), nil 499 } 500 501 // dir implements kernfs.Inode for a generic cgroup resource controller 502 // directory. Specific controllers extend this to add their own functionality. 503 // 504 // +stateify savable 505 type dir struct { 506 kernfs.InodeAlwaysValid 507 kernfs.InodeAttrs 508 kernfs.InodeDirectoryNoNewChildren 509 kernfs.InodeNoopRefCount 510 kernfs.InodeNotAnonymous 511 kernfs.InodeNotSymlink 512 kernfs.InodeWatches 513 kernfs.OrderedChildren 514 implStatFS 515 516 locks vfs.FileLocks 517 518 fs *filesystem // Immutable. 519 cgi *cgroupInode // Immutable. 520 } 521 522 // Keep implements kernfs.Inode.Keep. 
523 func (*dir) Keep() bool { 524 return true 525 } 526 527 // SetStat implements kernfs.Inode.SetStat. 528 func (d *dir) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { 529 return d.InodeAttrs.SetStat(ctx, fs, creds, opts) 530 } 531 532 // Open implements kernfs.Inode.Open. 533 func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { 534 opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | 535 linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK | linux.O_NOCTTY 536 fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{ 537 SeekEnd: kernfs.SeekEndStaticEntries, 538 }) 539 if err != nil { 540 return nil, err 541 } 542 return fd.VFSFileDescription(), nil 543 } 544 545 // NewDir implements kernfs.Inode.NewDir. 546 func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) { 547 return d.newDirWithOwner(ctx, auth.CredentialsFromContext(ctx), name, opts) 548 } 549 550 func (d *dir) newDirWithOwner(ctx context.Context, ownerCreds *auth.Credentials, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) { 551 // "Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable." 552 // -- Linux, kernel/cgroup.c:cgroup_mkdir(). 553 if strings.Contains(name, "\n") { 554 return nil, linuxerr.EINVAL 555 } 556 mode := opts.Mode.Permissions() | linux.ModeDirectory 557 return d.OrderedChildren.Inserter(name, func() kernfs.Inode { 558 d.IncLinks(1) 559 return d.fs.newCgroupInode(ctx, ownerCreds, d.cgi, mode) 560 }) 561 } 562 563 // Rename implements kernfs.Inode.Rename. Cgroupfs only allows renaming of 564 // cgroup directories, and the rename may only change the name within the same 565 // parent. See linux, kernel/cgroup.c:cgroup_rename(). 
566 func (d *dir) Rename(ctx context.Context, oldname, newname string, child, dst kernfs.Inode) error { 567 if _, ok := child.(*cgroupInode); !ok { 568 // Not a cgroup directory. Control files are backed by different types. 569 return linuxerr.ENOTDIR 570 } 571 572 dstCGInode, ok := dst.(*cgroupInode) 573 if !ok { 574 // Not a cgroup inode, so definitely can't be *this* inode. 575 return linuxerr.EIO 576 } 577 // Note: We're intentionally comparing addresses, since two different dirs 578 // could plausibly be identical in memory, but would occupy different 579 // locations in memory. 580 if d != &dstCGInode.dir { 581 // Destination dir is a different cgroup inode. Cross directory renames 582 // aren't allowed. 583 return linuxerr.EIO 584 } 585 586 // Rename moves oldname to newname within d. Proceed. 587 return d.OrderedChildren.Rename(ctx, oldname, newname, child, dst) 588 } 589 590 // Unlink implements kernfs.Inode.Unlink. Cgroupfs disallows unlink, as the only 591 // files in the filesystem are control files, which can't be deleted. 592 func (d *dir) Unlink(ctx context.Context, name string, child kernfs.Inode) error { 593 return linuxerr.EPERM 594 } 595 596 // hasChildrenLocked returns whether the cgroup dir contains any objects that 597 // prevent it from being deleted. 598 func (d *dir) hasChildrenLocked() bool { 599 // Subdirs take a link on the parent, so checks if there are any direct 600 // children cgroups. Exclude the dir's self link and the link from ".". 601 if d.InodeAttrs.Links()-2 > 0 { 602 return true 603 } 604 return len(d.cgi.ts) > 0 605 } 606 607 // HasChildren implements kernfs.Inode.HasChildren. 608 // 609 // The empty check for a cgroupfs directory is unlike a regular directory since 610 // a cgroupfs directory will always have control files. A cgroupfs directory can 611 // be deleted if cgroup contains no tasks and has no sub-cgroups. 
612 func (d *dir) HasChildren() bool { 613 d.fs.tasksMu.RLock() 614 defer d.fs.tasksMu.RUnlock() 615 return d.hasChildrenLocked() 616 } 617 618 // RmDir implements kernfs.Inode.RmDir. 619 func (d *dir) RmDir(ctx context.Context, name string, child kernfs.Inode) error { 620 // Unlike a normal directory, we need to recheck if d is empty again, since 621 // vfs/kernfs can't stop tasks from entering or leaving the cgroup. 622 d.fs.tasksMu.RLock() 623 defer d.fs.tasksMu.RUnlock() 624 625 cgi, ok := child.(*cgroupInode) 626 if !ok { 627 return linuxerr.ENOTDIR 628 } 629 if cgi.dir.hasChildrenLocked() { 630 return linuxerr.ENOTEMPTY 631 } 632 633 // Disallow deletion of the effective root cgroup. 634 if cgi == d.fs.effectiveRoot.Inode().(*cgroupInode) { 635 ctx.Warningf("Cannot delete initial cgroup for new tasks %q", d.fs.effectiveRoot.FSLocalPath()) 636 return linuxerr.EBUSY 637 } 638 639 err := d.OrderedChildren.RmDir(ctx, name, child) 640 if err == nil { 641 d.InodeAttrs.DecLinks() 642 } 643 return err 644 } 645 646 func (d *dir) forEachChildDir(fn func(*dir)) { 647 d.OrderedChildren.ForEachChild(func(_ string, i kernfs.Inode) { 648 if childI, ok := i.(*cgroupInode); ok { 649 fn(&childI.dir) 650 } 651 }) 652 } 653 654 // controllerFileImpl represents common cgroupfs-specific operations for control 655 // files. 656 type controllerFileImpl interface { 657 // Source extracts the underlying DynamicBytesFile for a control file. 658 Source() *kernfs.DynamicBytesFile 659 // AllowBackgroundAccess indicates whether a control file can be accessed 660 // from a background (i.e. non-task) context. Some control files cannot be 661 // meaningfully accessed from a non-task context because accessing them 662 // either have side effects on the calling context (ex: task migration 663 // across cgroups), or they refer to data which must be interpreted within 664 // the calling context (ex: when referring to a pid, in which pid 665 // namespace?). 
666 // 667 // Currently, all writable control files that allow access from a background 668 // process can handle a nil FD, since a background write doesn't explicitly 669 // open the control file. This is enforced through the 670 // writableControllerFileImpl. 671 AllowBackgroundAccess() bool 672 } 673 674 // writableControllerFileImpl represents common cgroupfs-specific operations for 675 // a writable control file. 676 type writableControllerFileImpl interface { 677 controllerFileImpl 678 // WriteBackground writes data to a control file from a background 679 // context. This means the write isn't performed through and FD may be 680 // performed from a background context. 681 // 682 // Control files that support this should also return true for 683 // controllerFileImpl.AllowBackgroundAccess(). 684 WriteBackground(ctx context.Context, src usermem.IOSequence) (int64, error) 685 } 686 687 // controllerFile represents a generic control file that appears within a cgroup 688 // directory. 689 // 690 // +stateify savable 691 type controllerFile struct { 692 kernfs.DynamicBytesFile 693 implStatFS 694 695 allowBackgroundAccess bool 696 } 697 698 var _ controllerFileImpl = (*controllerFile)(nil) 699 700 // Source implements controllerFileImpl.Source. 701 func (f *controllerFile) Source() *kernfs.DynamicBytesFile { 702 return &f.DynamicBytesFile 703 } 704 705 // AllowBackgroundAccess implements controllerFileImpl.AllowBackgroundAccess. 706 func (f *controllerFile) AllowBackgroundAccess() bool { 707 return f.allowBackgroundAccess 708 } 709 710 // SetStat implements kernfs.Inode.SetStat. 
711 func (f *controllerFile) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { 712 return f.InodeAttrs.SetStat(ctx, fs, creds, opts) 713 } 714 715 func (fs *filesystem) newControllerFile(ctx context.Context, creds *auth.Credentials, data vfs.DynamicBytesSource, allowBackgroundAccess bool) kernfs.Inode { 716 f := &controllerFile{ 717 allowBackgroundAccess: allowBackgroundAccess, 718 } 719 f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), data, readonlyFileMode) 720 return f 721 } 722 723 func (fs *filesystem) newControllerWritableFile(ctx context.Context, creds *auth.Credentials, data vfs.WritableDynamicBytesSource, allowBackgroundAccess bool) kernfs.Inode { 724 f := &controllerFile{ 725 allowBackgroundAccess: allowBackgroundAccess, 726 } 727 f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), data, writableFileMode) 728 return f 729 } 730 731 // staticControllerFile represents a generic control file that appears within a 732 // cgroup directory which always returns the same data when read. 733 // staticControllerFiles are not writable. 734 // 735 // +stateify savable 736 type staticControllerFile struct { 737 kernfs.DynamicBytesFile 738 vfs.StaticData 739 } 740 741 var _ controllerFileImpl = (*staticControllerFile)(nil) 742 743 // Source implements controllerFileImpl.Source. 744 func (f *staticControllerFile) Source() *kernfs.DynamicBytesFile { 745 return &f.DynamicBytesFile 746 } 747 748 // AllowBackgroundAccess implements controllerFileImpl.AllowBackgroundAccess. 749 func (f *staticControllerFile) AllowBackgroundAccess() bool { 750 return true 751 } 752 753 // SetStat implements kernfs.Inode.SetStat. 
754 func (f *staticControllerFile) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { 755 return f.InodeAttrs.SetStat(ctx, fs, creds, opts) 756 } 757 758 // Note: We let the caller provide the mode so that static files may be used to 759 // fake both readable and writable control files. However, static files are 760 // effectively readonly, as attempting to write to them will return EIO 761 // regardless of the mode. 762 func (fs *filesystem) newStaticControllerFile(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, data string) kernfs.Inode { 763 f := &staticControllerFile{StaticData: vfs.StaticData{Data: data}} 764 f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), f, mode) 765 return f 766 } 767 768 // stubControllerFile is a writable control file that remembers the control 769 // value written to it. 770 // 771 // +stateify savable 772 type stubControllerFile struct { 773 controllerFile 774 775 // data is accessed through atomic ops. 776 data *atomicbitops.Int64 777 } 778 779 var _ controllerFileImpl = (*stubControllerFile)(nil) 780 781 // Generate implements vfs.DynamicBytesSource.Generate. 782 func (f *stubControllerFile) Generate(ctx context.Context, buf *bytes.Buffer) error { 783 fmt.Fprintf(buf, "%d\n", f.data.Load()) 784 return nil 785 } 786 787 // Write implements vfs.WritableDynamicBytesSource.Write. 788 func (f *stubControllerFile) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { 789 return f.WriteBackground(ctx, src) 790 } 791 792 // WriteBackground implements writableControllerFileImpl.WriteBackground. 
793 func (f *stubControllerFile) WriteBackground(ctx context.Context, src usermem.IOSequence) (int64, error) { 794 val, n, err := parseInt64FromString(ctx, src) 795 if err != nil { 796 return 0, err 797 } 798 f.data.Store(val) 799 return n, nil 800 } 801 802 // newStubControllerFile creates a new stub controller file that loads and 803 // stores a control value from data. 804 func (fs *filesystem) newStubControllerFile(ctx context.Context, creds *auth.Credentials, data *atomicbitops.Int64, allowBackgroundAccess bool) kernfs.Inode { 805 f := &stubControllerFile{ 806 controllerFile: controllerFile{ 807 allowBackgroundAccess: allowBackgroundAccess, 808 }, 809 data: data, 810 } 811 f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), f, writableFileMode) 812 return f 813 }