github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go (about)

     1  // Copyright 2021 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package cgroupfs implements cgroupfs.
    16  //
    17  // A cgroup is a collection of tasks on the system, organized into a tree-like
    18  // structure similar to a filesystem directory tree. In fact, each cgroup is
    19  // represented by a directory on cgroupfs, and is manipulated through control
    20  // files in the directory.
    21  //
    22  // All cgroups on a system are organized into hierarchies. Each hierarchy is
    23  // a distinct tree of cgroups, with a common set of controllers. One or more
    24  // cgroupfs mounts may point to each hierarchy. These mounts provide a common
    25  // view into the same tree of cgroups.
    26  //
    27  // A controller (also known as a "resource controller", or a cgroup "subsystem")
    28  // determines the behaviour of each cgroup.
    29  //
    30  // In addition to cgroupfs, the kernel has a cgroup registry that tracks
    31  // system-wide state related to cgroups such as active hierarchies and the
    32  // controllers associated with them.
    33  //
    34  // Since cgroupfs doesn't allow hardlinks, there is a unique mapping between
    35  // cgroupfs dentries and inodes. Thus, cgroupfs inodes don't need to be ref
    36  // counted and exist until they're unlinked once or the FS is destroyed.
    37  //
    38  // # Synchronization
    39  //
    40  // Cgroup hierarchy creation and destruction is protected by the
    41  // kernel.CgroupRegistry.mu. Once created, a hierarchy's set of controllers, the
    42  // filesystem associated with it, and the root cgroup for the hierarchy are
    43  // immutable.
    44  //
    45  // Membership of tasks within cgroups is protected by
    46  // cgroupfs.filesystem.tasksMu. Tasks also maintain a set of all cgroups they're
    47  // in, and this list is protected by Task.mu.
    48  //
    49  // Lock order:
    50  //
    51  //	kernel.CgroupRegistry.mu
    52  //		kernfs.filesystem.mu
    53  //		kernel.TaskSet.mu
    54  //	  	kernel.Task.mu
    55  //	    	cgroupfs.filesystem.tasksMu.
    56  //	      	cgroupfs.dir.OrderedChildren.mu
    57  package cgroupfs
    58  
    59  import (
    60  	"bytes"
    61  	"fmt"
    62  	"sort"
    63  	"strconv"
    64  	"strings"
    65  
    66  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    67  	"github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops"
    68  	"github.com/nicocha30/gvisor-ligolo/pkg/context"
    69  	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
    70  	"github.com/nicocha30/gvisor-ligolo/pkg/fspath"
    71  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/kernfs"
    72  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel"
    73  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth"
    74  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs"
    75  	"github.com/nicocha30/gvisor-ligolo/pkg/usermem"
    76  )
    77  
    78  const (
    79  	// Name is the default filesystem name.
    80  	Name             = "cgroup"
    81  	readonlyFileMode = linux.FileMode(0444)
    82  	writableFileMode = linux.FileMode(0644)
    83  	defaultDirMode   = linux.FileMode(0555) | linux.ModeDirectory
    84  
    85  	defaultMaxCachedDentries = uint64(1000)
    86  )
    87  
// allControllers lists every controller type GetFilesystem knows how to
// construct. It is the controller set used when the "all" mount option is
// given, or when no controller is named and "none" was not requested.
//
// The slice is assigned directly (not copied) to callers' locals, so it must
// not be mutated.
var allControllers = []kernel.CgroupControllerType{
	kernel.CgroupControllerCPU,
	kernel.CgroupControllerCPUAcct,
	kernel.CgroupControllerCPUSet,
	kernel.CgroupControllerJob,
	kernel.CgroupControllerMemory,
	kernel.CgroupControllerPIDs,
}
    96  
// SupportedMountOptions is the set of supported mount options for cgroupfs.
//
// NOTE(review): GetFilesystem additionally parses "name", "none" and
// "dentry_cache_limit", which are not listed here — confirm whether users of
// this list expect those to be included.
var SupportedMountOptions = []string{"all", "cpu", "cpuacct", "cpuset", "job", "memory", "pids"}
    99  
// FilesystemType implements vfs.FilesystemType.
//
// It carries no state; its methods in this file all use value receivers.
//
// +stateify savable
type FilesystemType struct{}
   104  
// InitialCgroup specifies properties of the cgroup for the init task.
//
// It is supplied to the mount through InternalData and consumed by
// filesystem.prepareInitialCgroup.
//
// +stateify savable
type InitialCgroup struct {
	// Path is an absolute path relative to the root of a cgroupfs filesystem
	// that indicates where to place the init task. An empty string indicates
	// the root of the filesystem.
	Path string

	// SetOwner indicates the UID and GID fields contain valid values. If true,
	// both UID and GID must be provided.
	SetOwner bool
	// UID of the initial cgroup path components, excluding the root cgroup.
	UID auth.KUID
	// GID of the initial cgroup path components, excluding the root cgroup.
	GID auth.KGID

	// SetMode indicates the Mode field contains a valid value. If false,
	// defaultDirMode is used for the path components.
	SetMode bool
	// Mode of the initial cgroup path components, excluding the root cgroup.
	Mode linux.FileMode
}
   127  
// InternalData contains internal data passed in to the cgroupfs mount via
// vfs.GetFilesystemOptions.InternalData.
//
// +stateify savable
type InternalData struct {
	// DefaultControlValues is handed to the cpu and memory controllers when a
	// new hierarchy is built. GetFilesystem panics if any entries remain in
	// the map after all controllers are constructed (presumably controllers
	// delete the entries they consume — confirm in the controller
	// constructors).
	DefaultControlValues map[string]int64
	// InitialCgroup describes the cgroup that becomes the filesystem's
	// effective root, if its Path names something other than "/".
	InitialCgroup InitialCgroup
}
   136  
// filesystem implements vfs.FilesystemImpl and kernel.cgroupFS.
//
// +stateify savable
type filesystem struct {
	kernfs.Filesystem
	// devMinor is the anonymous block device minor number allocated for this
	// filesystem. It is used as the device minor of every inode created here
	// and is returned to the VirtualFilesystem in Release.
	devMinor uint32

	// hierarchyID is the id the cgroup registry assigns to this hierarchy. Has
	// the value kernel.InvalidCgroupHierarchyID until the FS is fully
	// initialized.
	//
	// hierarchyID is immutable after initialization.
	hierarchyID uint32

	// hierarchyName is the name for a named hierarchy. May be empty if the
	// 'name=' mount option was not used when the hierarchy was created.
	//
	// Immutable after initialization.
	hierarchyName string

	// controllers and kcontrollers are both the list of controllers attached to
	// this cgroupfs. Both lists are the same set of controllers, but typecast
	// to different interfaces for convenience. Both must stay in sync, and are
	// immutable.
	controllers  []controller
	kcontrollers []kernel.CgroupController

	numCgroups atomicbitops.Uint64 // Protected by atomic ops.

	// root is the dentry of the root cgroup directory of this hierarchy.
	root *kernfs.Dentry
	// effectiveRoot is the initial cgroup new tasks are created in. Unless
	// overwritten by internal mount options, root == effectiveRoot. If
	// effectiveRoot != root, an extra reference is held on effectiveRoot for
	// the lifetime of the filesystem.
	effectiveRoot *kernfs.Dentry

	// tasksMu serializes task membership changes across all cgroups within a
	// filesystem.
	tasksMu taskRWMutex `state:"nosave"`
}
   177  
// InitializeHierarchyID implements kernel.cgroupFS.InitializeHierarchyID.
//
// It records hid as this filesystem's hierarchy ID. No synchronization is
// performed here.
func (fs *filesystem) InitializeHierarchyID(hid uint32) {
	fs.hierarchyID = hid
}
   182  
   183  // RootCgroup implements kernel.cgroupFS.RootCgroup.
   184  func (fs *filesystem) RootCgroup() kernel.Cgroup {
   185  	return kernel.Cgroup{
   186  		Dentry:     fs.root,
   187  		CgroupImpl: fs.root.Inode().(kernel.CgroupImpl),
   188  	}
   189  }
   190  
// Name implements vfs.FilesystemType.Name.
//
// It returns the package-level Name constant.
func (FilesystemType) Name() string {
	return Name
}
   195  
// Release implements vfs.FilesystemType.Release. It is a no-op.
func (FilesystemType) Release(ctx context.Context) {}
   198  
   199  // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
   200  func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
   201  	devMinor, err := vfsObj.GetAnonBlockDevMinor()
   202  	if err != nil {
   203  		return nil, nil, err
   204  	}
   205  
   206  	mopts := vfs.GenericParseMountOptions(opts.Data)
   207  	maxCachedDentries := defaultMaxCachedDentries
   208  	if str, ok := mopts["dentry_cache_limit"]; ok {
   209  		delete(mopts, "dentry_cache_limit")
   210  		maxCachedDentries, err = strconv.ParseUint(str, 10, 64)
   211  		if err != nil {
   212  			ctx.Warningf("sys.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str)
   213  			return nil, nil, linuxerr.EINVAL
   214  		}
   215  	}
   216  
   217  	var wantControllers []kernel.CgroupControllerType
   218  	if _, ok := mopts["cpu"]; ok {
   219  		delete(mopts, "cpu")
   220  		wantControllers = append(wantControllers, kernel.CgroupControllerCPU)
   221  	}
   222  	if _, ok := mopts["cpuacct"]; ok {
   223  		delete(mopts, "cpuacct")
   224  		wantControllers = append(wantControllers, kernel.CgroupControllerCPUAcct)
   225  	}
   226  	if _, ok := mopts["cpuset"]; ok {
   227  		delete(mopts, "cpuset")
   228  		wantControllers = append(wantControllers, kernel.CgroupControllerCPUSet)
   229  	}
   230  	if _, ok := mopts["job"]; ok {
   231  		delete(mopts, "job")
   232  		wantControllers = append(wantControllers, kernel.CgroupControllerJob)
   233  	}
   234  	if _, ok := mopts["memory"]; ok {
   235  		delete(mopts, "memory")
   236  		wantControllers = append(wantControllers, kernel.CgroupControllerMemory)
   237  	}
   238  	if _, ok := mopts["pids"]; ok {
   239  		delete(mopts, "pids")
   240  		wantControllers = append(wantControllers, kernel.CgroupControllerPIDs)
   241  	}
   242  	if _, ok := mopts["all"]; ok {
   243  		if len(wantControllers) > 0 {
   244  			ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: other controllers specified with all: %v", wantControllers)
   245  			return nil, nil, linuxerr.EINVAL
   246  		}
   247  
   248  		delete(mopts, "all")
   249  		wantControllers = allControllers
   250  	}
   251  
   252  	var name string
   253  	var ok bool
   254  	if name, ok = mopts["name"]; ok {
   255  		delete(mopts, "name")
   256  	}
   257  
   258  	var none bool
   259  	if _, ok = mopts["none"]; ok {
   260  		none = true
   261  		delete(mopts, "none")
   262  	}
   263  
   264  	if !none && len(wantControllers) == 0 {
   265  		// Specifying no controllers implies all controllers, unless "none" was
   266  		// explicitly requested.
   267  		wantControllers = allControllers
   268  	}
   269  
   270  	// Some combinations of "none", "all", "name=" and explicit controllers are
   271  	// not allowed. See Linux, kernel/cgroup.c:parse_cgroupfs_options().
   272  
   273  	// All empty hierarchies must have a name.
   274  	if len(wantControllers) == 0 && name == "" {
   275  		ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: empty hierarchy with no name")
   276  		return nil, nil, linuxerr.EINVAL
   277  	}
   278  
   279  	// Can't have "none" and some controllers.
   280  	if none && len(wantControllers) != 0 {
   281  		ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: 'none' specified with controllers: %v", wantControllers)
   282  		return nil, nil, linuxerr.EINVAL
   283  	}
   284  
   285  	if len(mopts) != 0 {
   286  		ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: unknown options: %v", mopts)
   287  		return nil, nil, linuxerr.EINVAL
   288  	}
   289  
   290  	k := kernel.KernelFromContext(ctx)
   291  	r := k.CgroupRegistry()
   292  
   293  	// "It is not possible to mount the same controller against multiple
   294  	// cgroup hierarchies. For example, it is not possible to mount both
   295  	// the cpu and cpuacct controllers against one hierarchy, and to mount
   296  	// the cpu controller alone against another hierarchy." - man cgroups(7)
   297  	//
   298  	// Is there a hierarchy available with all the controllers we want? If so,
   299  	// this mount is a view into the same hierarchy.
   300  	//
   301  	// Note: we're guaranteed to have at least one requested controller, since
   302  	// no explicit controller name implies all controllers.
   303  	vfsfs, err := r.FindHierarchy(name, wantControllers)
   304  	if err != nil {
   305  		return nil, nil, err
   306  	}
   307  	if vfsfs != nil {
   308  		fs := vfsfs.Impl().(*filesystem)
   309  		ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: mounting new view to hierarchy %v", fs.hierarchyID)
   310  		fs.root.IncRef()
   311  		if fs.effectiveRoot != fs.root {
   312  			fs.effectiveRoot.IncRef()
   313  		}
   314  		return vfsfs, fs.root.VFSDentry(), nil
   315  	}
   316  
   317  	// No existing hierarchy with the exactly controllers found. Make a new
   318  	// one. Note that it's possible this mount creation is unsatisfiable, if one
   319  	// or more of the requested controllers are already on existing
   320  	// hierarchies. We'll find out about such collisions when we try to register
   321  	// the new hierarchy later.
   322  	fs := &filesystem{
   323  		devMinor:      devMinor,
   324  		hierarchyName: name,
   325  	}
   326  	fs.MaxCachedDentries = maxCachedDentries
   327  	fs.VFSFilesystem().Init(vfsObj, &fsType, fs)
   328  
   329  	var defaults map[string]int64
   330  	if opts.InternalData != nil {
   331  		defaults = opts.InternalData.(*InternalData).DefaultControlValues
   332  		ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: default control values: %v", defaults)
   333  	}
   334  
   335  	for _, ty := range wantControllers {
   336  		var c controller
   337  		switch ty {
   338  		case kernel.CgroupControllerCPU:
   339  			c = newCPUController(fs, defaults)
   340  		case kernel.CgroupControllerCPUAcct:
   341  			c = newCPUAcctController(fs)
   342  		case kernel.CgroupControllerCPUSet:
   343  			c = newCPUSetController(k, fs)
   344  		case kernel.CgroupControllerJob:
   345  			c = newJobController(fs)
   346  		case kernel.CgroupControllerMemory:
   347  			c = newMemoryController(fs, defaults)
   348  		case kernel.CgroupControllerPIDs:
   349  			c = newRootPIDsController(fs)
   350  		default:
   351  			panic(fmt.Sprintf("Unreachable: unknown cgroup controller %q", ty))
   352  		}
   353  		fs.controllers = append(fs.controllers, c)
   354  	}
   355  
   356  	if len(defaults) != 0 {
   357  		// Internal data is always provided at sentry startup and unused values
   358  		// indicate a problem with the sandbox config. Fail fast.
   359  		panic(fmt.Sprintf("cgroupfs.FilesystemType.GetFilesystem: unknown internal mount data: %v", defaults))
   360  	}
   361  
   362  	// Controllers usually appear in alphabetical order when displayed. Sort it
   363  	// here now, so it never needs to be sorted elsewhere.
   364  	sort.Slice(fs.controllers, func(i, j int) bool { return fs.controllers[i].Type() < fs.controllers[j].Type() })
   365  	fs.kcontrollers = make([]kernel.CgroupController, 0, len(fs.controllers))
   366  	for _, c := range fs.controllers {
   367  		fs.kcontrollers = append(fs.kcontrollers, c)
   368  	}
   369  
   370  	root := fs.newCgroupInode(ctx, creds, nil, defaultDirMode)
   371  	var rootD kernfs.Dentry
   372  	rootD.InitRoot(&fs.Filesystem, root)
   373  	fs.root = &rootD
   374  	fs.effectiveRoot = fs.root
   375  
   376  	if err := fs.prepareInitialCgroup(ctx, vfsObj, opts); err != nil {
   377  		ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: failed to prepare initial cgroup: %v", err)
   378  		rootD.DecRef(ctx)
   379  		fs.VFSFilesystem().DecRef(ctx)
   380  		return nil, nil, err
   381  	}
   382  
   383  	// Register controllers. The registry may be modified concurrently, so if we
   384  	// get an error, we raced with someone else who registered the same
   385  	// controllers first.
   386  	if err := r.Register(name, fs.kcontrollers, fs); err != nil {
   387  		ctx.Infof("cgroupfs.FilesystemType.GetFilesystem: failed to register new hierarchy with controllers %v: %v", wantControllers, err)
   388  		rootD.DecRef(ctx)
   389  		fs.VFSFilesystem().DecRef(ctx)
   390  		return nil, nil, linuxerr.EBUSY
   391  	}
   392  
   393  	// Move all existing tasks to the root of the new hierarchy.
   394  	k.PopulateNewCgroupHierarchy(fs.effectiveRootCgroup())
   395  
   396  	return fs.VFSFilesystem(), rootD.VFSDentry(), nil
   397  }
   398  
   399  // prepareInitialCgroup creates the initial cgroup according to opts. An initial
   400  // cgroup is optional, and if not specified, this function is a no-op.
   401  func (fs *filesystem) prepareInitialCgroup(ctx context.Context, vfsObj *vfs.VirtualFilesystem, opts vfs.GetFilesystemOptions) error {
   402  	if opts.InternalData == nil {
   403  		return nil
   404  	}
   405  	idata := opts.InternalData.(*InternalData)
   406  
   407  	initPathStr := idata.InitialCgroup.Path
   408  	if initPathStr == "" {
   409  		return nil
   410  	}
   411  	ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup path: %v", initPathStr)
   412  	initPath := fspath.Parse(initPathStr)
   413  	if !initPath.Absolute {
   414  		ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup path invalid: %+v", initPath)
   415  		return linuxerr.EINVAL
   416  	}
   417  	if !initPath.HasComponents() {
   418  		// Explicit "/" as initial cgroup, nothing to do.
   419  		return nil
   420  	}
   421  
   422  	ownerCreds := auth.CredentialsFromContext(ctx).Fork()
   423  	if idata.InitialCgroup.SetOwner {
   424  		ownerCreds.EffectiveKUID = idata.InitialCgroup.UID
   425  		ownerCreds.EffectiveKGID = idata.InitialCgroup.GID
   426  	}
   427  	mode := defaultDirMode
   428  	if idata.InitialCgroup.SetMode {
   429  		mode = idata.InitialCgroup.Mode
   430  	}
   431  
   432  	// Have initial cgroup target, create the tree.
   433  	cgDir := fs.root.Inode().(*cgroupInode)
   434  	for pit := initPath.Begin; pit.Ok(); pit = pit.Next() {
   435  		cgDirI, err := cgDir.newDirWithOwner(ctx, ownerCreds, pit.String(), vfs.MkdirOptions{Mode: mode})
   436  		if err != nil {
   437  			return err
   438  		}
   439  		cgDir = cgDirI.(*cgroupInode)
   440  	}
   441  
   442  	// Walk to target dentry.
   443  	initDentry, err := fs.root.WalkDentryTree(ctx, vfsObj, initPath)
   444  	if err != nil {
   445  		ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup dentry not found: %v", err)
   446  		return linuxerr.ENOENT
   447  	}
   448  	fs.effectiveRoot = initDentry // Reference from WalkDentryTree transferred here.
   449  	return nil
   450  }
   451  
   452  func (fs *filesystem) effectiveRootCgroup() kernel.Cgroup {
   453  	return kernel.Cgroup{
   454  		Dentry:     fs.effectiveRoot,
   455  		CgroupImpl: fs.effectiveRoot.Inode().(kernel.CgroupImpl),
   456  	}
   457  }
   458  
   459  // Release implements vfs.FilesystemImpl.Release.
   460  func (fs *filesystem) Release(ctx context.Context) {
   461  	k := kernel.KernelFromContext(ctx)
   462  	r := k.CgroupRegistry()
   463  
   464  	if fs.hierarchyID != kernel.InvalidCgroupHierarchyID {
   465  		k.ReleaseCgroupHierarchy(fs.hierarchyID)
   466  		r.Unregister(fs.hierarchyID)
   467  	}
   468  
   469  	if fs.root != fs.effectiveRoot {
   470  		fs.effectiveRoot.DecRef(ctx)
   471  	}
   472  
   473  	fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
   474  	fs.Filesystem.Release(ctx)
   475  }
   476  
   477  // MountOptions implements vfs.FilesystemImpl.MountOptions.
   478  func (fs *filesystem) MountOptions() string {
   479  	var cnames []string
   480  	for _, c := range fs.controllers {
   481  		cnames = append(cnames, string(c.Type()))
   482  	}
   483  	return strings.Join(cnames, ",")
   484  }
   485  
// implStatFS provides the StatFS implementation for cgroupfs inodes; it is
// embedded by dir and controllerFile.
//
// +stateify savable
type implStatFS struct{}
   488  
   489  // StatFS implements kernfs.Inode.StatFS.
   490  func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
   491  	return vfs.GenericStatFS(linux.CGROUP_SUPER_MAGIC), nil
   492  }
   493  
// dir implements kernfs.Inode for a generic cgroup resource controller
// directory. Specific controllers extend this to add their own functionality.
//
// Child entries are tracked by the embedded kernfs.OrderedChildren.
//
// +stateify savable
type dir struct {
	kernfs.InodeAlwaysValid
	kernfs.InodeAttrs
	kernfs.InodeDirectoryNoNewChildren
	kernfs.InodeNoopRefCount
	kernfs.InodeNotAnonymous
	kernfs.InodeNotSymlink
	kernfs.InodeWatches
	kernfs.OrderedChildren
	implStatFS

	// locks is passed to the directory FDs created by Open.
	locks vfs.FileLocks

	// fs is the filesystem this directory belongs to.
	fs *filesystem // Immutable.
	// cgi is passed as the parent when child cgroup inodes are created;
	// presumably it is the cgroupInode that embeds this dir — confirm at the
	// cgroupInode definition.
	cgi *cgroupInode // Immutable.
}
   514  
// Keep implements kernfs.Inode.Keep. It always returns true.
func (*dir) Keep() bool {
	return true
}
   519  
// SetStat implements kernfs.Inode.SetStat.
//
// It forwards to the embedded kernfs.InodeAttrs implementation, selected
// explicitly by name.
func (d *dir) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
	return d.InodeAttrs.SetStat(ctx, fs, creds, opts)
}
   524  
   525  // Open implements kernfs.Inode.Open.
   526  func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
   527  	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC |
   528  		linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK | linux.O_NOCTTY
   529  	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{
   530  		SeekEnd: kernfs.SeekEndStaticEntries,
   531  	})
   532  	if err != nil {
   533  		return nil, err
   534  	}
   535  	return fd.VFSFileDescription(), nil
   536  }
   537  
   538  // NewDir implements kernfs.Inode.NewDir.
   539  func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) {
   540  	return d.newDirWithOwner(ctx, auth.CredentialsFromContext(ctx), name, opts)
   541  }
   542  
   543  func (d *dir) newDirWithOwner(ctx context.Context, ownerCreds *auth.Credentials, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) {
   544  	// "Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable."
   545  	//   -- Linux, kernel/cgroup.c:cgroup_mkdir().
   546  	if strings.Contains(name, "\n") {
   547  		return nil, linuxerr.EINVAL
   548  	}
   549  	mode := opts.Mode.Permissions() | linux.ModeDirectory
   550  	return d.OrderedChildren.Inserter(name, func() kernfs.Inode {
   551  		d.IncLinks(1)
   552  		return d.fs.newCgroupInode(ctx, ownerCreds, d.cgi, mode)
   553  	})
   554  }
   555  
   556  // Rename implements kernfs.Inode.Rename. Cgroupfs only allows renaming of
   557  // cgroup directories, and the rename may only change the name within the same
   558  // parent. See linux, kernel/cgroup.c:cgroup_rename().
   559  func (d *dir) Rename(ctx context.Context, oldname, newname string, child, dst kernfs.Inode) error {
   560  	if _, ok := child.(*cgroupInode); !ok {
   561  		// Not a cgroup directory. Control files are backed by different types.
   562  		return linuxerr.ENOTDIR
   563  	}
   564  
   565  	dstCGInode, ok := dst.(*cgroupInode)
   566  	if !ok {
   567  		// Not a cgroup inode, so definitely can't be *this* inode.
   568  		return linuxerr.EIO
   569  	}
   570  	// Note: We're intentionally comparing addresses, since two different dirs
   571  	// could plausibly be identical in memory, but would occupy different
   572  	// locations in memory.
   573  	if d != &dstCGInode.dir {
   574  		// Destination dir is a different cgroup inode. Cross directory renames
   575  		// aren't allowed.
   576  		return linuxerr.EIO
   577  	}
   578  
   579  	// Rename moves oldname to newname within d. Proceed.
   580  	return d.OrderedChildren.Rename(ctx, oldname, newname, child, dst)
   581  }
   582  
// Unlink implements kernfs.Inode.Unlink. Cgroupfs disallows unlink, as the only
// files in the filesystem are control files, which can't be deleted.
//
// It always returns EPERM, regardless of name or child.
func (d *dir) Unlink(ctx context.Context, name string, child kernfs.Inode) error {
	return linuxerr.EPERM
}
   588  
   589  // hasChildrenLocked returns whether the cgroup dir contains any objects that
   590  // prevent it from being deleted.
   591  func (d *dir) hasChildrenLocked() bool {
   592  	// Subdirs take a link on the parent, so checks if there are any direct
   593  	// children cgroups. Exclude the dir's self link and the link from ".".
   594  	if d.InodeAttrs.Links()-2 > 0 {
   595  		return true
   596  	}
   597  	return len(d.cgi.ts) > 0
   598  }
   599  
   600  // HasChildren implements kernfs.Inode.HasChildren.
   601  //
   602  // The empty check for a cgroupfs directory is unlike a regular directory since
   603  // a cgroupfs directory will always have control files. A cgroupfs directory can
   604  // be deleted if cgroup contains no tasks and has no sub-cgroups.
   605  func (d *dir) HasChildren() bool {
   606  	d.fs.tasksMu.RLock()
   607  	defer d.fs.tasksMu.RUnlock()
   608  	return d.hasChildrenLocked()
   609  }
   610  
   611  // RmDir implements kernfs.Inode.RmDir.
   612  func (d *dir) RmDir(ctx context.Context, name string, child kernfs.Inode) error {
   613  	// Unlike a normal directory, we need to recheck if d is empty again, since
   614  	// vfs/kernfs can't stop tasks from entering or leaving the cgroup.
   615  	d.fs.tasksMu.RLock()
   616  	defer d.fs.tasksMu.RUnlock()
   617  
   618  	cgi, ok := child.(*cgroupInode)
   619  	if !ok {
   620  		return linuxerr.ENOTDIR
   621  	}
   622  	if cgi.dir.hasChildrenLocked() {
   623  		return linuxerr.ENOTEMPTY
   624  	}
   625  
   626  	// Disallow deletion of the effective root cgroup.
   627  	if cgi == d.fs.effectiveRoot.Inode().(*cgroupInode) {
   628  		ctx.Warningf("Cannot delete initial cgroup for new tasks %q", d.fs.effectiveRoot.FSLocalPath())
   629  		return linuxerr.EBUSY
   630  	}
   631  
   632  	err := d.OrderedChildren.RmDir(ctx, name, child)
   633  	if err == nil {
   634  		d.InodeAttrs.DecLinks()
   635  	}
   636  	return err
   637  }
   638  
   639  func (d *dir) forEachChildDir(fn func(*dir)) {
   640  	d.OrderedChildren.ForEachChild(func(_ string, i kernfs.Inode) {
   641  		if childI, ok := i.(*cgroupInode); ok {
   642  			fn(&childI.dir)
   643  		}
   644  	})
   645  }
   646  
// controllerFileImpl represents common cgroupfs-specific operations for control
// files.
type controllerFileImpl interface {
	// Source extracts the underlying DynamicBytesFile for a control file.
	Source() *kernfs.DynamicBytesFile
	// AllowBackgroundAccess indicates whether a control file can be accessed
	// from a background (i.e. non-task) context. Some control files cannot be
	// meaningfully accessed from a non-task context because accessing them
	// either has side effects on the calling context (ex: task migration
	// across cgroups), or refers to data which must be interpreted within
	// the calling context (ex: when referring to a pid, in which pid
	// namespace?).
	//
	// Currently, all writable control files that allow access from a background
	// process can handle a nil FD, since a background write doesn't explicitly
	// open the control file. This is enforced through the
	// writableControllerFileImpl.
	AllowBackgroundAccess() bool
}
   666  
// writableControllerFileImpl represents common cgroupfs-specific operations for
// a writable control file.
type writableControllerFileImpl interface {
	controllerFileImpl
	// WriteBackground writes data to a control file from a background
	// context. This means the write isn't performed through an FD, and may be
	// performed from a background context.
	//
	// Control files that support this should also return true for
	// controllerFileImpl.AllowBackgroundAccess().
	WriteBackground(ctx context.Context, src usermem.IOSequence) (int64, error)
}
   679  
// controllerFile represents a generic control file that appears within a cgroup
// directory.
//
// +stateify savable
type controllerFile struct {
	kernfs.DynamicBytesFile
	implStatFS

	// allowBackgroundAccess is the value reported by AllowBackgroundAccess.
	// It is set when the file is constructed.
	allowBackgroundAccess bool
}

// Compile-time check that *controllerFile satisfies controllerFileImpl.
var _ controllerFileImpl = (*controllerFile)(nil)
   692  
// Source implements controllerFileImpl.Source.
//
// It returns a pointer to the embedded kernfs.DynamicBytesFile.
func (f *controllerFile) Source() *kernfs.DynamicBytesFile {
	return &f.DynamicBytesFile
}
   697  
// AllowBackgroundAccess implements controllerFileImpl.AllowBackgroundAccess.
//
// It reports the allowBackgroundAccess value stored on the file.
func (f *controllerFile) AllowBackgroundAccess() bool {
	return f.allowBackgroundAccess
}
   702  
// SetStat implements kernfs.Inode.SetStat.
//
// It forwards to the InodeAttrs implementation reachable through the embedded
// file.
func (f *controllerFile) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
	return f.InodeAttrs.SetStat(ctx, fs, creds, opts)
}
   707  
   708  func (fs *filesystem) newControllerFile(ctx context.Context, creds *auth.Credentials, data vfs.DynamicBytesSource, allowBackgroundAccess bool) kernfs.Inode {
   709  	f := &controllerFile{
   710  		allowBackgroundAccess: allowBackgroundAccess,
   711  	}
   712  	f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), data, readonlyFileMode)
   713  	return f
   714  }
   715  
   716  func (fs *filesystem) newControllerWritableFile(ctx context.Context, creds *auth.Credentials, data vfs.WritableDynamicBytesSource, allowBackgroundAccess bool) kernfs.Inode {
   717  	f := &controllerFile{
   718  		allowBackgroundAccess: allowBackgroundAccess,
   719  	}
   720  	f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), data, writableFileMode)
   721  	return f
   722  }
   723  
// staticControllerFile represents a generic control file that appears within a
// cgroup directory which always returns the same data when read.
// staticControllerFiles are not writable.
//
// The file serves as its own data source: newStaticControllerFile passes the
// file itself to Init, with the contents held in the embedded vfs.StaticData.
//
// +stateify savable
type staticControllerFile struct {
	kernfs.DynamicBytesFile
	vfs.StaticData
}

// Compile-time check that *staticControllerFile satisfies controllerFileImpl.
var _ controllerFileImpl = (*staticControllerFile)(nil)
   735  
// Source implements controllerFileImpl.Source.
//
// It returns a pointer to the embedded kernfs.DynamicBytesFile.
func (f *staticControllerFile) Source() *kernfs.DynamicBytesFile {
	return &f.DynamicBytesFile
}
   740  
// AllowBackgroundAccess implements controllerFileImpl.AllowBackgroundAccess.
//
// Static control files always allow background access.
func (f *staticControllerFile) AllowBackgroundAccess() bool {
	return true
}
   745  
// SetStat implements kernfs.Inode.SetStat.
//
// It forwards to the InodeAttrs implementation reachable through the embedded
// file.
func (f *staticControllerFile) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
	return f.InodeAttrs.SetStat(ctx, fs, creds, opts)
}
   750  
   751  // Note: We let the caller provide the mode so that static files may be used to
   752  // fake both readable and writable control files. However, static files are
   753  // effectively readonly, as attempting to write to them will return EIO
   754  // regardless of the mode.
   755  func (fs *filesystem) newStaticControllerFile(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, data string) kernfs.Inode {
   756  	f := &staticControllerFile{StaticData: vfs.StaticData{Data: data}}
   757  	f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), f, mode)
   758  	return f
   759  }
   760  
// stubControllerFile is a writable control file that remembers the control
// value written to it.
//
// Reads render the stored value in decimal followed by a newline; writes
// parse an integer and store it.
//
// +stateify savable
type stubControllerFile struct {
	controllerFile

	// data is accessed through atomic ops. The pointer is supplied by the
	// caller of newStubControllerFile.
	data *atomicbitops.Int64
}

// Compile-time check that *stubControllerFile satisfies controllerFileImpl.
var _ controllerFileImpl = (*stubControllerFile)(nil)
   773  
   774  // Generate implements vfs.DynamicBytesSource.Generate.
   775  func (f *stubControllerFile) Generate(ctx context.Context, buf *bytes.Buffer) error {
   776  	fmt.Fprintf(buf, "%d\n", f.data.Load())
   777  	return nil
   778  }
   779  
// Write implements vfs.WritableDynamicBytesSource.Write.
//
// The file description and offset are ignored; the write is handled exactly
// like a background write.
func (f *stubControllerFile) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) {
	return f.WriteBackground(ctx, src)
}
   784  
   785  // WriteBackground implements writableControllerFileImpl.WriteBackground.
   786  func (f *stubControllerFile) WriteBackground(ctx context.Context, src usermem.IOSequence) (int64, error) {
   787  	val, n, err := parseInt64FromString(ctx, src)
   788  	if err != nil {
   789  		return 0, err
   790  	}
   791  	f.data.Store(val)
   792  	return n, nil
   793  }
   794  
   795  // newStubControllerFile creates a new stub controller file that loads and
   796  // stores a control value from data.
   797  func (fs *filesystem) newStubControllerFile(ctx context.Context, creds *auth.Credentials, data *atomicbitops.Int64, allowBackgroundAccess bool) kernfs.Inode {
   798  	f := &stubControllerFile{
   799  		controllerFile: controllerFile{
   800  			allowBackgroundAccess: allowBackgroundAccess,
   801  		},
   802  		data: data,
   803  	}
   804  	f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), f, writableFileMode)
   805  	return f
   806  }