// Source: github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go

// Copyright 2021 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package cgroupfs implements cgroupfs.
//
// A cgroup is a collection of tasks on the system, organized into a tree-like
// structure similar to a filesystem directory tree. In fact, each cgroup is
// represented by a directory on cgroupfs, and is manipulated through control
// files in the directory.
//
// All cgroups on a system are organized into hierarchies. Each hierarchy is a
// distinct tree of cgroups, with a common set of controllers. One or more
// cgroupfs mounts may point to each hierarchy. These mounts provide a common
// view into the same tree of cgroups.
//
// A controller (also known as a "resource controller", or a cgroup "subsystem")
// determines the behaviour of each cgroup.
//
// In addition to cgroupfs, the kernel has a cgroup registry that tracks
// system-wide state related to cgroups such as active hierarchies and the
// controllers associated with them.
//
// Since cgroupfs doesn't allow hardlinks, there is a unique mapping between
// cgroupfs dentries and inodes. Thus, cgroupfs inodes don't need to be ref
// counted and exist until they're unlinked once or the FS is destroyed.
37 // 38 // # Synchronization 39 // 40 // Cgroup hierarchy creation and destruction is protected by the 41 // kernel.CgroupRegistry.mu. Once created, a hierarchy's set of controllers, the 42 // filesystem associated with it, and the root cgroup for the hierarchy are 43 // immutable. 44 // 45 // Membership of tasks within cgroups is protected by 46 // cgroupfs.filesystem.tasksMu. Tasks also maintain a set of all cgroups they're 47 // in, and this list is protected by Task.mu. 48 // 49 // Lock order: 50 // 51 // kernel.CgroupRegistry.mu 52 // kernfs.filesystem.mu 53 // kernel.TaskSet.mu 54 // kernel.Task.mu 55 // cgroupfs.filesystem.tasksMu. 56 // cgroupfs.dir.OrderedChildren.mu 57 package cgroupfs 58 59 import ( 60 "bytes" 61 "fmt" 62 "sort" 63 "strconv" 64 "strings" 65 66 "github.com/nicocha30/gvisor-ligolo/pkg/abi/linux" 67 "github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops" 68 "github.com/nicocha30/gvisor-ligolo/pkg/context" 69 "github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr" 70 "github.com/nicocha30/gvisor-ligolo/pkg/fspath" 71 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/kernfs" 72 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel" 73 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth" 74 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs" 75 "github.com/nicocha30/gvisor-ligolo/pkg/usermem" 76 ) 77 78 const ( 79 // Name is the default filesystem name. 80 Name = "cgroup" 81 readonlyFileMode = linux.FileMode(0444) 82 writableFileMode = linux.FileMode(0644) 83 defaultDirMode = linux.FileMode(0555) | linux.ModeDirectory 84 85 defaultMaxCachedDentries = uint64(1000) 86 ) 87 88 var allControllers = []kernel.CgroupControllerType{ 89 kernel.CgroupControllerCPU, 90 kernel.CgroupControllerCPUAcct, 91 kernel.CgroupControllerCPUSet, 92 kernel.CgroupControllerJob, 93 kernel.CgroupControllerMemory, 94 kernel.CgroupControllerPIDs, 95 } 96 97 // SupportedMountOptions is the set of supported mount options for cgroupfs. 
98 var SupportedMountOptions = []string{"all", "cpu", "cpuacct", "cpuset", "job", "memory", "pids"} 99 100 // FilesystemType implements vfs.FilesystemType. 101 // 102 // +stateify savable 103 type FilesystemType struct{} 104 105 // InitialCgroup specifies properties of the cgroup for the init task. 106 // 107 // +stateify savable 108 type InitialCgroup struct { 109 // Path is an absolute path relative to the root of a cgroupfs filesystem 110 // that indicates where to place the init task. An empty string indicates 111 // the root of the filesystem. 112 Path string 113 114 // SetOwner indicates the UID and GID fields contain valid values. If true, 115 // Both UID and GID must be provided. 116 SetOwner bool 117 // UID of the initial cgroup path components, excluding the root cgroup. 118 UID auth.KUID 119 // GID of the initial cgroup path components, excluding the root cgroup. 120 GID auth.KGID 121 122 // SetMode indicates the Mode field contains a valid value. 123 SetMode bool 124 // Mode of the initial cgroup path components, excluding the root cgroup. 125 Mode linux.FileMode 126 } 127 128 // InternalData contains internal data passed in to the cgroupfs mount via 129 // vfs.GetFilesystemOptions.InternalData. 130 // 131 // +stateify savable 132 type InternalData struct { 133 DefaultControlValues map[string]int64 134 InitialCgroup InitialCgroup 135 } 136 137 // filesystem implements vfs.FilesystemImpl and kernel.cgroupFS. 138 // 139 // +stateify savable 140 type filesystem struct { 141 kernfs.Filesystem 142 devMinor uint32 143 144 // hierarchyID is the id the cgroup registry assigns to this hierarchy. Has 145 // the value kernel.InvalidCgroupHierarchyID until the FS is fully 146 // initialized. 147 // 148 // hierarchyID is immutable after initialization. 149 hierarchyID uint32 150 151 // hierarchyName is the name for a named hierarchy. May be empty if the 152 // 'name=' mount option was not used when the hierarchy was created. 
153 // 154 // Immutable after initialization. 155 hierarchyName string 156 157 // controllers and kcontrollers are both the list of controllers attached to 158 // this cgroupfs. Both lists are the same set of controllers, but typecast 159 // to different interfaces for convenience. Both must stay in sync, and are 160 // immutable. 161 controllers []controller 162 kcontrollers []kernel.CgroupController 163 164 numCgroups atomicbitops.Uint64 // Protected by atomic ops. 165 166 root *kernfs.Dentry 167 // effectiveRoot is the initial cgroup new tasks are created in. Unless 168 // overwritten by internal mount options, root == effectiveRoot. If 169 // effectiveRoot != root, an extra reference is held on effectiveRoot for 170 // the lifetime of the filesystem. 171 effectiveRoot *kernfs.Dentry 172 173 // tasksMu serializes task membership changes across all cgroups within a 174 // filesystem. 175 tasksMu taskRWMutex `state:"nosave"` 176 } 177 178 // InitializeHierarchyID implements kernel.cgroupFS.InitializeHierarchyID. 179 func (fs *filesystem) InitializeHierarchyID(hid uint32) { 180 fs.hierarchyID = hid 181 } 182 183 // RootCgroup implements kernel.cgroupFS.RootCgroup. 184 func (fs *filesystem) RootCgroup() kernel.Cgroup { 185 return kernel.Cgroup{ 186 Dentry: fs.root, 187 CgroupImpl: fs.root.Inode().(kernel.CgroupImpl), 188 } 189 } 190 191 // Name implements vfs.FilesystemType.Name. 192 func (FilesystemType) Name() string { 193 return Name 194 } 195 196 // Release implements vfs.FilesystemType.Release. 197 func (FilesystemType) Release(ctx context.Context) {} 198 199 // GetFilesystem implements vfs.FilesystemType.GetFilesystem. 
200 func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { 201 devMinor, err := vfsObj.GetAnonBlockDevMinor() 202 if err != nil { 203 return nil, nil, err 204 } 205 206 mopts := vfs.GenericParseMountOptions(opts.Data) 207 maxCachedDentries := defaultMaxCachedDentries 208 if str, ok := mopts["dentry_cache_limit"]; ok { 209 delete(mopts, "dentry_cache_limit") 210 maxCachedDentries, err = strconv.ParseUint(str, 10, 64) 211 if err != nil { 212 ctx.Warningf("sys.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str) 213 return nil, nil, linuxerr.EINVAL 214 } 215 } 216 217 var wantControllers []kernel.CgroupControllerType 218 if _, ok := mopts["cpu"]; ok { 219 delete(mopts, "cpu") 220 wantControllers = append(wantControllers, kernel.CgroupControllerCPU) 221 } 222 if _, ok := mopts["cpuacct"]; ok { 223 delete(mopts, "cpuacct") 224 wantControllers = append(wantControllers, kernel.CgroupControllerCPUAcct) 225 } 226 if _, ok := mopts["cpuset"]; ok { 227 delete(mopts, "cpuset") 228 wantControllers = append(wantControllers, kernel.CgroupControllerCPUSet) 229 } 230 if _, ok := mopts["job"]; ok { 231 delete(mopts, "job") 232 wantControllers = append(wantControllers, kernel.CgroupControllerJob) 233 } 234 if _, ok := mopts["memory"]; ok { 235 delete(mopts, "memory") 236 wantControllers = append(wantControllers, kernel.CgroupControllerMemory) 237 } 238 if _, ok := mopts["pids"]; ok { 239 delete(mopts, "pids") 240 wantControllers = append(wantControllers, kernel.CgroupControllerPIDs) 241 } 242 if _, ok := mopts["all"]; ok { 243 if len(wantControllers) > 0 { 244 ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: other controllers specified with all: %v", wantControllers) 245 return nil, nil, linuxerr.EINVAL 246 } 247 248 delete(mopts, "all") 249 wantControllers = allControllers 250 } 251 252 var name 
string 253 var ok bool 254 if name, ok = mopts["name"]; ok { 255 delete(mopts, "name") 256 } 257 258 var none bool 259 if _, ok = mopts["none"]; ok { 260 none = true 261 delete(mopts, "none") 262 } 263 264 if !none && len(wantControllers) == 0 { 265 // Specifying no controllers implies all controllers, unless "none" was 266 // explicitly requested. 267 wantControllers = allControllers 268 } 269 270 // Some combinations of "none", "all", "name=" and explicit controllers are 271 // not allowed. See Linux, kernel/cgroup.c:parse_cgroupfs_options(). 272 273 // All empty hierarchies must have a name. 274 if len(wantControllers) == 0 && name == "" { 275 ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: empty hierarchy with no name") 276 return nil, nil, linuxerr.EINVAL 277 } 278 279 // Can't have "none" and some controllers. 280 if none && len(wantControllers) != 0 { 281 ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: 'none' specified with controllers: %v", wantControllers) 282 return nil, nil, linuxerr.EINVAL 283 } 284 285 if len(mopts) != 0 { 286 ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: unknown options: %v", mopts) 287 return nil, nil, linuxerr.EINVAL 288 } 289 290 k := kernel.KernelFromContext(ctx) 291 r := k.CgroupRegistry() 292 293 // "It is not possible to mount the same controller against multiple 294 // cgroup hierarchies. For example, it is not possible to mount both 295 // the cpu and cpuacct controllers against one hierarchy, and to mount 296 // the cpu controller alone against another hierarchy." - man cgroups(7) 297 // 298 // Is there a hierarchy available with all the controllers we want? If so, 299 // this mount is a view into the same hierarchy. 300 // 301 // Note: we're guaranteed to have at least one requested controller, since 302 // no explicit controller name implies all controllers. 
303 vfsfs, err := r.FindHierarchy(name, wantControllers) 304 if err != nil { 305 return nil, nil, err 306 } 307 if vfsfs != nil { 308 fs := vfsfs.Impl().(*filesystem) 309 ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: mounting new view to hierarchy %v", fs.hierarchyID) 310 fs.root.IncRef() 311 if fs.effectiveRoot != fs.root { 312 fs.effectiveRoot.IncRef() 313 } 314 return vfsfs, fs.root.VFSDentry(), nil 315 } 316 317 // No existing hierarchy with the exactly controllers found. Make a new 318 // one. Note that it's possible this mount creation is unsatisfiable, if one 319 // or more of the requested controllers are already on existing 320 // hierarchies. We'll find out about such collisions when we try to register 321 // the new hierarchy later. 322 fs := &filesystem{ 323 devMinor: devMinor, 324 hierarchyName: name, 325 } 326 fs.MaxCachedDentries = maxCachedDentries 327 fs.VFSFilesystem().Init(vfsObj, &fsType, fs) 328 329 var defaults map[string]int64 330 if opts.InternalData != nil { 331 defaults = opts.InternalData.(*InternalData).DefaultControlValues 332 ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: default control values: %v", defaults) 333 } 334 335 for _, ty := range wantControllers { 336 var c controller 337 switch ty { 338 case kernel.CgroupControllerCPU: 339 c = newCPUController(fs, defaults) 340 case kernel.CgroupControllerCPUAcct: 341 c = newCPUAcctController(fs) 342 case kernel.CgroupControllerCPUSet: 343 c = newCPUSetController(k, fs) 344 case kernel.CgroupControllerJob: 345 c = newJobController(fs) 346 case kernel.CgroupControllerMemory: 347 c = newMemoryController(fs, defaults) 348 case kernel.CgroupControllerPIDs: 349 c = newRootPIDsController(fs) 350 default: 351 panic(fmt.Sprintf("Unreachable: unknown cgroup controller %q", ty)) 352 } 353 fs.controllers = append(fs.controllers, c) 354 } 355 356 if len(defaults) != 0 { 357 // Internal data is always provided at sentry startup and unused values 358 // indicate a problem with the sandbox 
config. Fail fast. 359 panic(fmt.Sprintf("cgroupfs.FilesystemType.GetFilesystem: unknown internal mount data: %v", defaults)) 360 } 361 362 // Controllers usually appear in alphabetical order when displayed. Sort it 363 // here now, so it never needs to be sorted elsewhere. 364 sort.Slice(fs.controllers, func(i, j int) bool { return fs.controllers[i].Type() < fs.controllers[j].Type() }) 365 fs.kcontrollers = make([]kernel.CgroupController, 0, len(fs.controllers)) 366 for _, c := range fs.controllers { 367 fs.kcontrollers = append(fs.kcontrollers, c) 368 } 369 370 root := fs.newCgroupInode(ctx, creds, nil, defaultDirMode) 371 var rootD kernfs.Dentry 372 rootD.InitRoot(&fs.Filesystem, root) 373 fs.root = &rootD 374 fs.effectiveRoot = fs.root 375 376 if err := fs.prepareInitialCgroup(ctx, vfsObj, opts); err != nil { 377 ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: failed to prepare initial cgroup: %v", err) 378 rootD.DecRef(ctx) 379 fs.VFSFilesystem().DecRef(ctx) 380 return nil, nil, err 381 } 382 383 // Register controllers. The registry may be modified concurrently, so if we 384 // get an error, we raced with someone else who registered the same 385 // controllers first. 386 if err := r.Register(name, fs.kcontrollers, fs); err != nil { 387 ctx.Infof("cgroupfs.FilesystemType.GetFilesystem: failed to register new hierarchy with controllers %v: %v", wantControllers, err) 388 rootD.DecRef(ctx) 389 fs.VFSFilesystem().DecRef(ctx) 390 return nil, nil, linuxerr.EBUSY 391 } 392 393 // Move all existing tasks to the root of the new hierarchy. 394 k.PopulateNewCgroupHierarchy(fs.effectiveRootCgroup()) 395 396 return fs.VFSFilesystem(), rootD.VFSDentry(), nil 397 } 398 399 // prepareInitialCgroup creates the initial cgroup according to opts. An initial 400 // cgroup is optional, and if not specified, this function is a no-op. 
401 func (fs *filesystem) prepareInitialCgroup(ctx context.Context, vfsObj *vfs.VirtualFilesystem, opts vfs.GetFilesystemOptions) error { 402 if opts.InternalData == nil { 403 return nil 404 } 405 idata := opts.InternalData.(*InternalData) 406 407 initPathStr := idata.InitialCgroup.Path 408 if initPathStr == "" { 409 return nil 410 } 411 ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup path: %v", initPathStr) 412 initPath := fspath.Parse(initPathStr) 413 if !initPath.Absolute { 414 ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup path invalid: %+v", initPath) 415 return linuxerr.EINVAL 416 } 417 if !initPath.HasComponents() { 418 // Explicit "/" as initial cgroup, nothing to do. 419 return nil 420 } 421 422 ownerCreds := auth.CredentialsFromContext(ctx).Fork() 423 if idata.InitialCgroup.SetOwner { 424 ownerCreds.EffectiveKUID = idata.InitialCgroup.UID 425 ownerCreds.EffectiveKGID = idata.InitialCgroup.GID 426 } 427 mode := defaultDirMode 428 if idata.InitialCgroup.SetMode { 429 mode = idata.InitialCgroup.Mode 430 } 431 432 // Have initial cgroup target, create the tree. 433 cgDir := fs.root.Inode().(*cgroupInode) 434 for pit := initPath.Begin; pit.Ok(); pit = pit.Next() { 435 cgDirI, err := cgDir.newDirWithOwner(ctx, ownerCreds, pit.String(), vfs.MkdirOptions{Mode: mode}) 436 if err != nil { 437 return err 438 } 439 cgDir = cgDirI.(*cgroupInode) 440 } 441 442 // Walk to target dentry. 443 initDentry, err := fs.root.WalkDentryTree(ctx, vfsObj, initPath) 444 if err != nil { 445 ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup dentry not found: %v", err) 446 return linuxerr.ENOENT 447 } 448 fs.effectiveRoot = initDentry // Reference from WalkDentryTree transferred here. 
449 return nil 450 } 451 452 func (fs *filesystem) effectiveRootCgroup() kernel.Cgroup { 453 return kernel.Cgroup{ 454 Dentry: fs.effectiveRoot, 455 CgroupImpl: fs.effectiveRoot.Inode().(kernel.CgroupImpl), 456 } 457 } 458 459 // Release implements vfs.FilesystemImpl.Release. 460 func (fs *filesystem) Release(ctx context.Context) { 461 k := kernel.KernelFromContext(ctx) 462 r := k.CgroupRegistry() 463 464 if fs.hierarchyID != kernel.InvalidCgroupHierarchyID { 465 k.ReleaseCgroupHierarchy(fs.hierarchyID) 466 r.Unregister(fs.hierarchyID) 467 } 468 469 if fs.root != fs.effectiveRoot { 470 fs.effectiveRoot.DecRef(ctx) 471 } 472 473 fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) 474 fs.Filesystem.Release(ctx) 475 } 476 477 // MountOptions implements vfs.FilesystemImpl.MountOptions. 478 func (fs *filesystem) MountOptions() string { 479 var cnames []string 480 for _, c := range fs.controllers { 481 cnames = append(cnames, string(c.Type())) 482 } 483 return strings.Join(cnames, ",") 484 } 485 486 // +stateify savable 487 type implStatFS struct{} 488 489 // StatFS implements kernfs.Inode.StatFS. 490 func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { 491 return vfs.GenericStatFS(linux.CGROUP_SUPER_MAGIC), nil 492 } 493 494 // dir implements kernfs.Inode for a generic cgroup resource controller 495 // directory. Specific controllers extend this to add their own functionality. 496 // 497 // +stateify savable 498 type dir struct { 499 kernfs.InodeAlwaysValid 500 kernfs.InodeAttrs 501 kernfs.InodeDirectoryNoNewChildren 502 kernfs.InodeNoopRefCount 503 kernfs.InodeNotAnonymous 504 kernfs.InodeNotSymlink 505 kernfs.InodeWatches 506 kernfs.OrderedChildren 507 implStatFS 508 509 locks vfs.FileLocks 510 511 fs *filesystem // Immutable. 512 cgi *cgroupInode // Immutable. 513 } 514 515 // Keep implements kernfs.Inode.Keep. 
516 func (*dir) Keep() bool { 517 return true 518 } 519 520 // SetStat implements kernfs.Inode.SetStat. 521 func (d *dir) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { 522 return d.InodeAttrs.SetStat(ctx, fs, creds, opts) 523 } 524 525 // Open implements kernfs.Inode.Open. 526 func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { 527 opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | 528 linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK | linux.O_NOCTTY 529 fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{ 530 SeekEnd: kernfs.SeekEndStaticEntries, 531 }) 532 if err != nil { 533 return nil, err 534 } 535 return fd.VFSFileDescription(), nil 536 } 537 538 // NewDir implements kernfs.Inode.NewDir. 539 func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) { 540 return d.newDirWithOwner(ctx, auth.CredentialsFromContext(ctx), name, opts) 541 } 542 543 func (d *dir) newDirWithOwner(ctx context.Context, ownerCreds *auth.Credentials, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) { 544 // "Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable." 545 // -- Linux, kernel/cgroup.c:cgroup_mkdir(). 546 if strings.Contains(name, "\n") { 547 return nil, linuxerr.EINVAL 548 } 549 mode := opts.Mode.Permissions() | linux.ModeDirectory 550 return d.OrderedChildren.Inserter(name, func() kernfs.Inode { 551 d.IncLinks(1) 552 return d.fs.newCgroupInode(ctx, ownerCreds, d.cgi, mode) 553 }) 554 } 555 556 // Rename implements kernfs.Inode.Rename. Cgroupfs only allows renaming of 557 // cgroup directories, and the rename may only change the name within the same 558 // parent. See linux, kernel/cgroup.c:cgroup_rename(). 
559 func (d *dir) Rename(ctx context.Context, oldname, newname string, child, dst kernfs.Inode) error { 560 if _, ok := child.(*cgroupInode); !ok { 561 // Not a cgroup directory. Control files are backed by different types. 562 return linuxerr.ENOTDIR 563 } 564 565 dstCGInode, ok := dst.(*cgroupInode) 566 if !ok { 567 // Not a cgroup inode, so definitely can't be *this* inode. 568 return linuxerr.EIO 569 } 570 // Note: We're intentionally comparing addresses, since two different dirs 571 // could plausibly be identical in memory, but would occupy different 572 // locations in memory. 573 if d != &dstCGInode.dir { 574 // Destination dir is a different cgroup inode. Cross directory renames 575 // aren't allowed. 576 return linuxerr.EIO 577 } 578 579 // Rename moves oldname to newname within d. Proceed. 580 return d.OrderedChildren.Rename(ctx, oldname, newname, child, dst) 581 } 582 583 // Unlink implements kernfs.Inode.Unlink. Cgroupfs disallows unlink, as the only 584 // files in the filesystem are control files, which can't be deleted. 585 func (d *dir) Unlink(ctx context.Context, name string, child kernfs.Inode) error { 586 return linuxerr.EPERM 587 } 588 589 // hasChildrenLocked returns whether the cgroup dir contains any objects that 590 // prevent it from being deleted. 591 func (d *dir) hasChildrenLocked() bool { 592 // Subdirs take a link on the parent, so checks if there are any direct 593 // children cgroups. Exclude the dir's self link and the link from ".". 594 if d.InodeAttrs.Links()-2 > 0 { 595 return true 596 } 597 return len(d.cgi.ts) > 0 598 } 599 600 // HasChildren implements kernfs.Inode.HasChildren. 601 // 602 // The empty check for a cgroupfs directory is unlike a regular directory since 603 // a cgroupfs directory will always have control files. A cgroupfs directory can 604 // be deleted if cgroup contains no tasks and has no sub-cgroups. 
605 func (d *dir) HasChildren() bool { 606 d.fs.tasksMu.RLock() 607 defer d.fs.tasksMu.RUnlock() 608 return d.hasChildrenLocked() 609 } 610 611 // RmDir implements kernfs.Inode.RmDir. 612 func (d *dir) RmDir(ctx context.Context, name string, child kernfs.Inode) error { 613 // Unlike a normal directory, we need to recheck if d is empty again, since 614 // vfs/kernfs can't stop tasks from entering or leaving the cgroup. 615 d.fs.tasksMu.RLock() 616 defer d.fs.tasksMu.RUnlock() 617 618 cgi, ok := child.(*cgroupInode) 619 if !ok { 620 return linuxerr.ENOTDIR 621 } 622 if cgi.dir.hasChildrenLocked() { 623 return linuxerr.ENOTEMPTY 624 } 625 626 // Disallow deletion of the effective root cgroup. 627 if cgi == d.fs.effectiveRoot.Inode().(*cgroupInode) { 628 ctx.Warningf("Cannot delete initial cgroup for new tasks %q", d.fs.effectiveRoot.FSLocalPath()) 629 return linuxerr.EBUSY 630 } 631 632 err := d.OrderedChildren.RmDir(ctx, name, child) 633 if err == nil { 634 d.InodeAttrs.DecLinks() 635 } 636 return err 637 } 638 639 func (d *dir) forEachChildDir(fn func(*dir)) { 640 d.OrderedChildren.ForEachChild(func(_ string, i kernfs.Inode) { 641 if childI, ok := i.(*cgroupInode); ok { 642 fn(&childI.dir) 643 } 644 }) 645 } 646 647 // controllerFileImpl represents common cgroupfs-specific operations for control 648 // files. 649 type controllerFileImpl interface { 650 // Source extracts the underlying DynamicBytesFile for a control file. 651 Source() *kernfs.DynamicBytesFile 652 // AllowBackgroundAccess indicates whether a control file can be accessed 653 // from a background (i.e. non-task) context. Some control files cannot be 654 // meaningfully accessed from a non-task context because accessing them 655 // either have side effects on the calling context (ex: task migration 656 // across cgroups), or they refer to data which must be interpreted within 657 // the calling context (ex: when referring to a pid, in which pid 658 // namespace?). 
659 // 660 // Currently, all writable control files that allow access from a background 661 // process can handle a nil FD, since a background write doesn't explicitly 662 // open the control file. This is enforced through the 663 // writableControllerFileImpl. 664 AllowBackgroundAccess() bool 665 } 666 667 // writableControllerFileImpl represents common cgroupfs-specific operations for 668 // a writable control file. 669 type writableControllerFileImpl interface { 670 controllerFileImpl 671 // WriteBackground writes data to a control file from a background 672 // context. This means the write isn't performed through and FD may be 673 // performed from a background context. 674 // 675 // Control files that support this should also return true for 676 // controllerFileImpl.AllowBackgroundAccess(). 677 WriteBackground(ctx context.Context, src usermem.IOSequence) (int64, error) 678 } 679 680 // controllerFile represents a generic control file that appears within a cgroup 681 // directory. 682 // 683 // +stateify savable 684 type controllerFile struct { 685 kernfs.DynamicBytesFile 686 implStatFS 687 688 allowBackgroundAccess bool 689 } 690 691 var _ controllerFileImpl = (*controllerFile)(nil) 692 693 // Source implements controllerFileImpl.Source. 694 func (f *controllerFile) Source() *kernfs.DynamicBytesFile { 695 return &f.DynamicBytesFile 696 } 697 698 // AllowBackgroundAccess implements controllerFileImpl.AllowBackgroundAccess. 699 func (f *controllerFile) AllowBackgroundAccess() bool { 700 return f.allowBackgroundAccess 701 } 702 703 // SetStat implements kernfs.Inode.SetStat. 
704 func (f *controllerFile) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { 705 return f.InodeAttrs.SetStat(ctx, fs, creds, opts) 706 } 707 708 func (fs *filesystem) newControllerFile(ctx context.Context, creds *auth.Credentials, data vfs.DynamicBytesSource, allowBackgroundAccess bool) kernfs.Inode { 709 f := &controllerFile{ 710 allowBackgroundAccess: allowBackgroundAccess, 711 } 712 f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), data, readonlyFileMode) 713 return f 714 } 715 716 func (fs *filesystem) newControllerWritableFile(ctx context.Context, creds *auth.Credentials, data vfs.WritableDynamicBytesSource, allowBackgroundAccess bool) kernfs.Inode { 717 f := &controllerFile{ 718 allowBackgroundAccess: allowBackgroundAccess, 719 } 720 f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), data, writableFileMode) 721 return f 722 } 723 724 // staticControllerFile represents a generic control file that appears within a 725 // cgroup directory which always returns the same data when read. 726 // staticControllerFiles are not writable. 727 // 728 // +stateify savable 729 type staticControllerFile struct { 730 kernfs.DynamicBytesFile 731 vfs.StaticData 732 } 733 734 var _ controllerFileImpl = (*staticControllerFile)(nil) 735 736 // Source implements controllerFileImpl.Source. 737 func (f *staticControllerFile) Source() *kernfs.DynamicBytesFile { 738 return &f.DynamicBytesFile 739 } 740 741 // AllowBackgroundAccess implements controllerFileImpl.AllowBackgroundAccess. 742 func (f *staticControllerFile) AllowBackgroundAccess() bool { 743 return true 744 } 745 746 // SetStat implements kernfs.Inode.SetStat. 
747 func (f *staticControllerFile) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { 748 return f.InodeAttrs.SetStat(ctx, fs, creds, opts) 749 } 750 751 // Note: We let the caller provide the mode so that static files may be used to 752 // fake both readable and writable control files. However, static files are 753 // effectively readonly, as attempting to write to them will return EIO 754 // regardless of the mode. 755 func (fs *filesystem) newStaticControllerFile(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, data string) kernfs.Inode { 756 f := &staticControllerFile{StaticData: vfs.StaticData{Data: data}} 757 f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), f, mode) 758 return f 759 } 760 761 // stubControllerFile is a writable control file that remembers the control 762 // value written to it. 763 // 764 // +stateify savable 765 type stubControllerFile struct { 766 controllerFile 767 768 // data is accessed through atomic ops. 769 data *atomicbitops.Int64 770 } 771 772 var _ controllerFileImpl = (*stubControllerFile)(nil) 773 774 // Generate implements vfs.DynamicBytesSource.Generate. 775 func (f *stubControllerFile) Generate(ctx context.Context, buf *bytes.Buffer) error { 776 fmt.Fprintf(buf, "%d\n", f.data.Load()) 777 return nil 778 } 779 780 // Write implements vfs.WritableDynamicBytesSource.Write. 781 func (f *stubControllerFile) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { 782 return f.WriteBackground(ctx, src) 783 } 784 785 // WriteBackground implements writableControllerFileImpl.WriteBackground. 
786 func (f *stubControllerFile) WriteBackground(ctx context.Context, src usermem.IOSequence) (int64, error) { 787 val, n, err := parseInt64FromString(ctx, src) 788 if err != nil { 789 return 0, err 790 } 791 f.data.Store(val) 792 return n, nil 793 } 794 795 // newStubControllerFile creates a new stub controller file that loads and 796 // stores a control value from data. 797 func (fs *filesystem) newStubControllerFile(ctx context.Context, creds *auth.Credentials, data *atomicbitops.Int64, allowBackgroundAccess bool) kernfs.Inode { 798 f := &stubControllerFile{ 799 controllerFile: controllerFile{ 800 allowBackgroundAccess: allowBackgroundAccess, 801 }, 802 data: data, 803 } 804 f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), f, writableFileMode) 805 return f 806 }