gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go

gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go (about)

     1  // Copyright 2021 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package cgroupfs implements cgroupfs.
    16  //
    17  // A cgroup is a collection of tasks on the system, organized into a tree-like
    18  // structure similar to a filesystem directory tree. In fact, each cgroup is
    19  // represented by a directory on cgroupfs, and is manipulated through control
    20  // files in the directory.
    21  //
    22  // All cgroups on a system are organized into hierarchies. Hierarchies are a
    23  // distinct tree of cgroups, with a common set of controllers. One or more
    24  // cgroupfs mounts may point to each hierarchy. These mounts provide a common
    25  // view into the same tree of cgroups.
    26  //
    27  // A controller (also known as a "resource controller", or a cgroup "subsystem")
    28  // determines the behaviour of each cgroup.
    29  //
    30  // In addition to cgroupfs, the kernel has a cgroup registry that tracks
    31  // system-wide state related to cgroups such as active hierarchies and the
    32  // controllers associated with them.
    33  //
    34  // Since cgroupfs doesn't allow hardlinks, there is a unique mapping between
    35  // cgroupfs dentries and inodes. Thus, cgroupfs inodes don't need to be ref
    36  // counted and exist until they're unlinked once or the FS is destroyed.
    37  //
    38  // # Synchronization
    39  //
    40  // Cgroup hierarchy creation and destruction is protected by the
    41  // kernel.CgroupRegistry.mu. Once created, a hierarchy's set of controllers, the
    42  // filesystem associated with it, and the root cgroup for the hierarchy are
    43  // immutable.
    44  //
    45  // Membership of tasks within cgroups is protected by
    46  // cgroupfs.filesystem.tasksMu. Tasks also maintain a set of all cgroups they're
    47  // in, and this list is protected by Task.mu.
    48  //
// Lock order:
//
//	kernel.CgroupRegistry.mu
//		kernfs.filesystem.mu
//		kernel.TaskSet.mu
//			kernel.Task.mu
//				cgroupfs.filesystem.tasksMu
//					cgroupfs.dir.OrderedChildren.mu
    57  package cgroupfs
    58  
    59  import (
    60  	"bytes"
    61  	"fmt"
    62  	"sort"
    63  	"strconv"
    64  	"strings"
    65  
    66  	"gvisor.dev/gvisor/pkg/abi/linux"
    67  	"gvisor.dev/gvisor/pkg/atomicbitops"
    68  	"gvisor.dev/gvisor/pkg/context"
    69  	"gvisor.dev/gvisor/pkg/errors/linuxerr"
    70  	"gvisor.dev/gvisor/pkg/fspath"
    71  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
    72  	"gvisor.dev/gvisor/pkg/sentry/kernel"
    73  	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
    74  	"gvisor.dev/gvisor/pkg/sentry/vfs"
    75  	"gvisor.dev/gvisor/pkg/usermem"
    76  )
    77  
// Filesystem name, the default modes given to cgroupfs inodes, and the default
// dentry cache size.
const (
	// Name is the default filesystem name.
	Name             = "cgroup"
	readonlyFileMode = linux.FileMode(0444)
	writableFileMode = linux.FileMode(0644)
	defaultDirMode   = linux.FileMode(0555) | linux.ModeDirectory

	// defaultMaxCachedDentries is the dentry cache limit used by GetFilesystem
	// when the "dentry_cache_limit" mount option is not given.
	defaultMaxCachedDentries = uint64(1000)
)
    87  
// allControllers lists every controller type GetFilesystem knows how to
// instantiate. GetFilesystem uses it as the requested controller set when the
// "all" mount option is given, or when no controller is named and "none" was
// not requested.
var allControllers = []kernel.CgroupControllerType{
	kernel.CgroupControllerCPU,
	kernel.CgroupControllerCPUAcct,
	kernel.CgroupControllerCPUSet,
	kernel.CgroupControllerDevices,
	kernel.CgroupControllerJob,
	kernel.CgroupControllerMemory,
	kernel.CgroupControllerPIDs,
}
    97  
// SupportedMountOptions is the set of supported mount options for cgroupfs.
//
// Every entry other than "all" selects a single controller. GetFilesystem
// additionally parses the "name", "none" and "dentry_cache_limit" options,
// which are not listed here.
var SupportedMountOptions = []string{"all", "cpu", "cpuacct", "cpuset", "devices", "job", "memory", "pids"}
   100  
// FilesystemType implements vfs.FilesystemType.
//
// It carries no state; all per-mount state lives in the filesystem created or
// looked up by GetFilesystem.
//
// +stateify savable
type FilesystemType struct{}
   105  
// InitialCgroup specifies properties of the cgroup for the init task.
//
// It is consumed by prepareInitialCgroup, which creates every component of
// Path below the root cgroup with the ownership and mode described here.
//
// +stateify savable
type InitialCgroup struct {
	// Path is an absolute path relative to the root of a cgroupfs filesystem
	// that indicates where to place the init task. An empty string indicates
	// the root of the filesystem.
	Path string

	// SetOwner indicates the UID and GID fields contain valid values. If true,
	// both UID and GID must be provided.
	SetOwner bool
	// UID of the initial cgroup path components, excluding the root cgroup.
	UID auth.KUID
	// GID of the initial cgroup path components, excluding the root cgroup.
	GID auth.KGID

	// SetMode indicates the Mode field contains a valid value. When false, the
	// path components are created with defaultDirMode.
	SetMode bool
	// Mode of the initial cgroup path components, excluding the root cgroup.
	Mode linux.FileMode
}
   128  
// InternalData contains internal data passed in to the cgroupfs mount via
// vfs.GetFilesystemOptions.InternalData.
//
// DefaultControlValues is handed to the controllers that accept defaults when
// a new hierarchy is created; GetFilesystem panics if any entry is left in the
// map once all requested controllers have been constructed. InitialCgroup
// describes the cgroup that becomes the filesystem's effective root.
//
// +stateify savable
type InternalData struct {
	DefaultControlValues map[string]int64
	InitialCgroup        InitialCgroup
}
   137  
// filesystem implements vfs.FilesystemImpl and kernel.cgroupFS.
//
// One filesystem backs one cgroup hierarchy. GetFilesystem returns the same
// filesystem again when a mount requests a hierarchy that is already
// registered.
//
// +stateify savable
type filesystem struct {
	kernfs.Filesystem
	// devMinor is the anonymous block device minor number obtained in
	// GetFilesystem. It is used as the device minor of every inode created by
	// this filesystem and is returned to the VFS in Release.
	devMinor uint32

	// hierarchyID is the id the cgroup registry assigns to this hierarchy. Has
	// the value kernel.InvalidCgroupHierarchyID until the FS is fully
	// initialized.
	//
	// hierarchyID is immutable after initialization.
	hierarchyID uint32

	// hierarchyName is the name for a named hierarchy. May be empty if the
	// 'name=' mount option was not used when the hierarchy was created.
	//
	// Immutable after initialization.
	hierarchyName string

	// controllers and kcontrollers are both the list of controllers attached to
	// this cgroupfs. Both lists are the same set of controllers, but typecast
	// to different interfaces for convenience. Both must stay in sync, and are
	// immutable.
	controllers  []controller
	kcontrollers []kernel.CgroupController

	numCgroups atomicbitops.Uint64 // Protected by atomic ops.

	// root is the dentry of the hierarchy's root cgroup, as returned by
	// RootCgroup.
	root *kernfs.Dentry
	// effectiveRoot is the initial cgroup new tasks are created in. Unless
	// overwritten by internal mount options, root == effectiveRoot. If
	// effectiveRoot != root, an extra reference is held on effectiveRoot for
	// the lifetime of the filesystem.
	effectiveRoot *kernfs.Dentry

	// tasksMu serializes task membership changes across all cgroups within a
	// filesystem.
	tasksMu taskRWMutex `state:"nosave"`
}
   178  
// InitializeHierarchyID implements kernel.cgroupFS.InitializeHierarchyID.
//
// It records hid as this filesystem's hierarchy ID. Release only unregisters
// the hierarchy when the stored ID differs from
// kernel.InvalidCgroupHierarchyID.
func (fs *filesystem) InitializeHierarchyID(hid uint32) {
	fs.hierarchyID = hid
}
   183  
   184  // RootCgroup implements kernel.cgroupFS.RootCgroup.
   185  func (fs *filesystem) RootCgroup() kernel.Cgroup {
   186  	return kernel.Cgroup{
   187  		Dentry:     fs.root,
   188  		CgroupImpl: fs.root.Inode().(kernel.CgroupImpl),
   189  	}
   190  }
   191  
// Name implements vfs.FilesystemType.Name. It returns the package-level Name
// constant.
func (FilesystemType) Name() string {
	return Name
}
   196  
// Release implements vfs.FilesystemType.Release. FilesystemType holds no
// resources, so this is a no-op.
func (FilesystemType) Release(ctx context.Context) {}
   199  
   200  // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
   201  func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
   202  	devMinor, err := vfsObj.GetAnonBlockDevMinor()
   203  	if err != nil {
   204  		return nil, nil, err
   205  	}
   206  
   207  	mopts := vfs.GenericParseMountOptions(opts.Data)
   208  	maxCachedDentries := defaultMaxCachedDentries
   209  	if str, ok := mopts["dentry_cache_limit"]; ok {
   210  		delete(mopts, "dentry_cache_limit")
   211  		maxCachedDentries, err = strconv.ParseUint(str, 10, 64)
   212  		if err != nil {
   213  			ctx.Warningf("sys.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str)
   214  			return nil, nil, linuxerr.EINVAL
   215  		}
   216  	}
   217  
   218  	var wantControllers []kernel.CgroupControllerType
   219  	if _, ok := mopts["cpu"]; ok {
   220  		delete(mopts, "cpu")
   221  		wantControllers = append(wantControllers, kernel.CgroupControllerCPU)
   222  	}
   223  	if _, ok := mopts["cpuacct"]; ok {
   224  		delete(mopts, "cpuacct")
   225  		wantControllers = append(wantControllers, kernel.CgroupControllerCPUAcct)
   226  	}
   227  	if _, ok := mopts["cpuset"]; ok {
   228  		delete(mopts, "cpuset")
   229  		wantControllers = append(wantControllers, kernel.CgroupControllerCPUSet)
   230  	}
   231  	if _, ok := mopts["devices"]; ok {
   232  		delete(mopts, "devices")
   233  		wantControllers = append(wantControllers, kernel.CgroupControllerDevices)
   234  	}
   235  	if _, ok := mopts["job"]; ok {
   236  		delete(mopts, "job")
   237  		wantControllers = append(wantControllers, kernel.CgroupControllerJob)
   238  	}
   239  	if _, ok := mopts["memory"]; ok {
   240  		delete(mopts, "memory")
   241  		wantControllers = append(wantControllers, kernel.CgroupControllerMemory)
   242  	}
   243  	if _, ok := mopts["pids"]; ok {
   244  		delete(mopts, "pids")
   245  		wantControllers = append(wantControllers, kernel.CgroupControllerPIDs)
   246  	}
   247  	if _, ok := mopts["all"]; ok {
   248  		if len(wantControllers) > 0 {
   249  			ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: other controllers specified with all: %v", wantControllers)
   250  			return nil, nil, linuxerr.EINVAL
   251  		}
   252  
   253  		delete(mopts, "all")
   254  		wantControllers = allControllers
   255  	}
   256  
   257  	var name string
   258  	var ok bool
   259  	if name, ok = mopts["name"]; ok {
   260  		delete(mopts, "name")
   261  	}
   262  
   263  	var none bool
   264  	if _, ok = mopts["none"]; ok {
   265  		none = true
   266  		delete(mopts, "none")
   267  	}
   268  
   269  	if !none && len(wantControllers) == 0 {
   270  		// Specifying no controllers implies all controllers, unless "none" was
   271  		// explicitly requested.
   272  		wantControllers = allControllers
   273  	}
   274  
   275  	// Some combinations of "none", "all", "name=" and explicit controllers are
   276  	// not allowed. See Linux, kernel/cgroup.c:parse_cgroupfs_options().
   277  
   278  	// All empty hierarchies must have a name.
   279  	if len(wantControllers) == 0 && name == "" {
   280  		ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: empty hierarchy with no name")
   281  		return nil, nil, linuxerr.EINVAL
   282  	}
   283  
   284  	// Can't have "none" and some controllers.
   285  	if none && len(wantControllers) != 0 {
   286  		ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: 'none' specified with controllers: %v", wantControllers)
   287  		return nil, nil, linuxerr.EINVAL
   288  	}
   289  
   290  	if len(mopts) != 0 {
   291  		ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: unknown options: %v", mopts)
   292  		return nil, nil, linuxerr.EINVAL
   293  	}
   294  
   295  	k := kernel.KernelFromContext(ctx)
   296  	r := k.CgroupRegistry()
   297  
   298  	// "It is not possible to mount the same controller against multiple
   299  	// cgroup hierarchies. For example, it is not possible to mount both
   300  	// the cpu and cpuacct controllers against one hierarchy, and to mount
   301  	// the cpu controller alone against another hierarchy." - man cgroups(7)
   302  	//
   303  	// Is there a hierarchy available with all the controllers we want? If so,
   304  	// this mount is a view into the same hierarchy.
   305  	//
   306  	// Note: we're guaranteed to have at least one requested controller, since
   307  	// no explicit controller name implies all controllers.
   308  	vfsfs, err := r.FindHierarchy(name, wantControllers)
   309  	if err != nil {
   310  		return nil, nil, err
   311  	}
   312  	if vfsfs != nil {
   313  		fs := vfsfs.Impl().(*filesystem)
   314  		ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: mounting new view to hierarchy %v", fs.hierarchyID)
   315  		fs.root.IncRef()
   316  		if fs.effectiveRoot != fs.root {
   317  			fs.effectiveRoot.IncRef()
   318  		}
   319  		return vfsfs, fs.root.VFSDentry(), nil
   320  	}
   321  
   322  	// No existing hierarchy with the exactly controllers found. Make a new
   323  	// one. Note that it's possible this mount creation is unsatisfiable, if one
   324  	// or more of the requested controllers are already on existing
   325  	// hierarchies. We'll find out about such collisions when we try to register
   326  	// the new hierarchy later.
   327  	fs := &filesystem{
   328  		devMinor:      devMinor,
   329  		hierarchyName: name,
   330  	}
   331  	fs.MaxCachedDentries = maxCachedDentries
   332  	fs.VFSFilesystem().Init(vfsObj, &fsType, fs)
   333  
   334  	var defaults map[string]int64
   335  	if opts.InternalData != nil {
   336  		defaults = opts.InternalData.(*InternalData).DefaultControlValues
   337  		ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: default control values: %v", defaults)
   338  	}
   339  
   340  	for _, ty := range wantControllers {
   341  		var c controller
   342  		switch ty {
   343  		case kernel.CgroupControllerCPU:
   344  			c = newCPUController(fs, defaults)
   345  		case kernel.CgroupControllerCPUAcct:
   346  			c = newCPUAcctController(fs)
   347  		case kernel.CgroupControllerCPUSet:
   348  			c = newCPUSetController(k, fs)
   349  		case kernel.CgroupControllerDevices:
   350  			c = newDevicesController(fs)
   351  		case kernel.CgroupControllerJob:
   352  			c = newJobController(fs)
   353  		case kernel.CgroupControllerMemory:
   354  			c = newMemoryController(fs, defaults)
   355  		case kernel.CgroupControllerPIDs:
   356  			c = newRootPIDsController(fs)
   357  		default:
   358  			panic(fmt.Sprintf("Unreachable: unknown cgroup controller %q", ty))
   359  		}
   360  		fs.controllers = append(fs.controllers, c)
   361  	}
   362  
   363  	if len(defaults) != 0 {
   364  		// Internal data is always provided at sentry startup and unused values
   365  		// indicate a problem with the sandbox config. Fail fast.
   366  		panic(fmt.Sprintf("cgroupfs.FilesystemType.GetFilesystem: unknown internal mount data: %v", defaults))
   367  	}
   368  
   369  	// Controllers usually appear in alphabetical order when displayed. Sort it
   370  	// here now, so it never needs to be sorted elsewhere.
   371  	sort.Slice(fs.controllers, func(i, j int) bool { return fs.controllers[i].Type() < fs.controllers[j].Type() })
   372  	fs.kcontrollers = make([]kernel.CgroupController, 0, len(fs.controllers))
   373  	for _, c := range fs.controllers {
   374  		fs.kcontrollers = append(fs.kcontrollers, c)
   375  	}
   376  
   377  	root := fs.newCgroupInode(ctx, creds, nil, defaultDirMode)
   378  	var rootD kernfs.Dentry
   379  	rootD.InitRoot(&fs.Filesystem, root)
   380  	fs.root = &rootD
   381  	fs.effectiveRoot = fs.root
   382  
   383  	if err := fs.prepareInitialCgroup(ctx, vfsObj, opts); err != nil {
   384  		ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: failed to prepare initial cgroup: %v", err)
   385  		rootD.DecRef(ctx)
   386  		fs.VFSFilesystem().DecRef(ctx)
   387  		return nil, nil, err
   388  	}
   389  
   390  	// Register controllers. The registry may be modified concurrently, so if we
   391  	// get an error, we raced with someone else who registered the same
   392  	// controllers first.
   393  	if err := r.Register(name, fs.kcontrollers, fs); err != nil {
   394  		ctx.Infof("cgroupfs.FilesystemType.GetFilesystem: failed to register new hierarchy with controllers %v: %v", wantControllers, err)
   395  		rootD.DecRef(ctx)
   396  		fs.VFSFilesystem().DecRef(ctx)
   397  		return nil, nil, linuxerr.EBUSY
   398  	}
   399  
   400  	// Move all existing tasks to the root of the new hierarchy.
   401  	k.PopulateNewCgroupHierarchy(fs.effectiveRootCgroup())
   402  
   403  	return fs.VFSFilesystem(), rootD.VFSDentry(), nil
   404  }
   405  
   406  // prepareInitialCgroup creates the initial cgroup according to opts. An initial
   407  // cgroup is optional, and if not specified, this function is a no-op.
   408  func (fs *filesystem) prepareInitialCgroup(ctx context.Context, vfsObj *vfs.VirtualFilesystem, opts vfs.GetFilesystemOptions) error {
   409  	if opts.InternalData == nil {
   410  		return nil
   411  	}
   412  	idata := opts.InternalData.(*InternalData)
   413  
   414  	initPathStr := idata.InitialCgroup.Path
   415  	if initPathStr == "" {
   416  		return nil
   417  	}
   418  	ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup path: %v", initPathStr)
   419  	initPath := fspath.Parse(initPathStr)
   420  	if !initPath.Absolute {
   421  		ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup path invalid: %+v", initPath)
   422  		return linuxerr.EINVAL
   423  	}
   424  	if !initPath.HasComponents() {
   425  		// Explicit "/" as initial cgroup, nothing to do.
   426  		return nil
   427  	}
   428  
   429  	ownerCreds := auth.CredentialsFromContext(ctx).Fork()
   430  	if idata.InitialCgroup.SetOwner {
   431  		ownerCreds.EffectiveKUID = idata.InitialCgroup.UID
   432  		ownerCreds.EffectiveKGID = idata.InitialCgroup.GID
   433  	}
   434  	mode := defaultDirMode
   435  	if idata.InitialCgroup.SetMode {
   436  		mode = idata.InitialCgroup.Mode
   437  	}
   438  
   439  	// Have initial cgroup target, create the tree.
   440  	cgDir := fs.root.Inode().(*cgroupInode)
   441  	for pit := initPath.Begin; pit.Ok(); pit = pit.Next() {
   442  		cgDirI, err := cgDir.newDirWithOwner(ctx, ownerCreds, pit.String(), vfs.MkdirOptions{Mode: mode})
   443  		if err != nil {
   444  			return err
   445  		}
   446  		cgDir = cgDirI.(*cgroupInode)
   447  	}
   448  
   449  	// Walk to target dentry.
   450  	initDentry, err := fs.root.WalkDentryTree(ctx, vfsObj, initPath)
   451  	if err != nil {
   452  		ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup dentry not found: %v", err)
   453  		return linuxerr.ENOENT
   454  	}
   455  	fs.effectiveRoot = initDentry // Reference from WalkDentryTree transferred here.
   456  	return nil
   457  }
   458  
   459  func (fs *filesystem) effectiveRootCgroup() kernel.Cgroup {
   460  	return kernel.Cgroup{
   461  		Dentry:     fs.effectiveRoot,
   462  		CgroupImpl: fs.effectiveRoot.Inode().(kernel.CgroupImpl),
   463  	}
   464  }
   465  
   466  // Release implements vfs.FilesystemImpl.Release.
   467  func (fs *filesystem) Release(ctx context.Context) {
   468  	k := kernel.KernelFromContext(ctx)
   469  	r := k.CgroupRegistry()
   470  
   471  	if fs.hierarchyID != kernel.InvalidCgroupHierarchyID {
   472  		k.ReleaseCgroupHierarchy(fs.hierarchyID)
   473  		r.Unregister(fs.hierarchyID)
   474  	}
   475  
   476  	if fs.root != fs.effectiveRoot {
   477  		fs.effectiveRoot.DecRef(ctx)
   478  	}
   479  
   480  	fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
   481  	fs.Filesystem.Release(ctx)
   482  }
   483  
   484  // MountOptions implements vfs.FilesystemImpl.MountOptions.
   485  func (fs *filesystem) MountOptions() string {
   486  	var cnames []string
   487  	for _, c := range fs.controllers {
   488  		cnames = append(cnames, string(c.Type()))
   489  	}
   490  	return strings.Join(cnames, ",")
   491  }
   492  
// implStatFS provides the StatFS method shared by cgroupfs inodes; it is
// embedded by dir and controllerFile.
//
// +stateify savable
type implStatFS struct{}
   495  
// StatFS implements kernfs.Inode.StatFS. It reports generic filesystem
// statistics tagged with the cgroup superblock magic number.
func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
	return vfs.GenericStatFS(linux.CGROUP_SUPER_MAGIC), nil
}
   500  
// dir implements kernfs.Inode for a generic cgroup resource controller
// directory. Specific controllers extend this to add their own functionality.
//
// +stateify savable
type dir struct {
	kernfs.InodeAlwaysValid
	kernfs.InodeAttrs
	kernfs.InodeDirectoryNoNewChildren
	kernfs.InodeNoopRefCount
	kernfs.InodeNotAnonymous
	kernfs.InodeNotSymlink
	kernfs.InodeWatches
	kernfs.OrderedChildren
	implStatFS

	// locks is handed to the directory file descriptions created by Open.
	locks vfs.FileLocks

	// fs is the filesystem this directory belongs to. cgi is the cgroup inode
	// this dir is part of; it is passed as the parent when child cgroups are
	// created and holds the task set consulted by hasChildrenLocked.
	fs  *filesystem  // Immutable.
	cgi *cgroupInode // Immutable.
}
   521  
// Keep implements kernfs.Inode.Keep. It always returns true for cgroup
// directories.
func (*dir) Keep() bool {
	return true
}
   526  
// SetStat implements kernfs.Inode.SetStat by delegating to the embedded
// kernfs.InodeAttrs.
func (d *dir) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
	return d.InodeAttrs.SetStat(ctx, fs, creds, opts)
}
   531  
   532  // Open implements kernfs.Inode.Open.
   533  func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
   534  	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC |
   535  		linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK | linux.O_NOCTTY
   536  	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{
   537  		SeekEnd: kernfs.SeekEndStaticEntries,
   538  	})
   539  	if err != nil {
   540  		return nil, err
   541  	}
   542  	return fd.VFSFileDescription(), nil
   543  }
   544  
   545  // NewDir implements kernfs.Inode.NewDir.
   546  func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) {
   547  	return d.newDirWithOwner(ctx, auth.CredentialsFromContext(ctx), name, opts)
   548  }
   549  
   550  func (d *dir) newDirWithOwner(ctx context.Context, ownerCreds *auth.Credentials, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) {
   551  	// "Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable."
   552  	//   -- Linux, kernel/cgroup.c:cgroup_mkdir().
   553  	if strings.Contains(name, "\n") {
   554  		return nil, linuxerr.EINVAL
   555  	}
   556  	mode := opts.Mode.Permissions() | linux.ModeDirectory
   557  	return d.OrderedChildren.Inserter(name, func() kernfs.Inode {
   558  		d.IncLinks(1)
   559  		return d.fs.newCgroupInode(ctx, ownerCreds, d.cgi, mode)
   560  	})
   561  }
   562  
   563  // Rename implements kernfs.Inode.Rename. Cgroupfs only allows renaming of
   564  // cgroup directories, and the rename may only change the name within the same
   565  // parent. See linux, kernel/cgroup.c:cgroup_rename().
   566  func (d *dir) Rename(ctx context.Context, oldname, newname string, child, dst kernfs.Inode) error {
   567  	if _, ok := child.(*cgroupInode); !ok {
   568  		// Not a cgroup directory. Control files are backed by different types.
   569  		return linuxerr.ENOTDIR
   570  	}
   571  
   572  	dstCGInode, ok := dst.(*cgroupInode)
   573  	if !ok {
   574  		// Not a cgroup inode, so definitely can't be *this* inode.
   575  		return linuxerr.EIO
   576  	}
   577  	// Note: We're intentionally comparing addresses, since two different dirs
   578  	// could plausibly be identical in memory, but would occupy different
   579  	// locations in memory.
   580  	if d != &dstCGInode.dir {
   581  		// Destination dir is a different cgroup inode. Cross directory renames
   582  		// aren't allowed.
   583  		return linuxerr.EIO
   584  	}
   585  
   586  	// Rename moves oldname to newname within d. Proceed.
   587  	return d.OrderedChildren.Rename(ctx, oldname, newname, child, dst)
   588  }
   589  
// Unlink implements kernfs.Inode.Unlink. Cgroupfs disallows unlink, as the only
// files in the filesystem are control files, which can't be deleted. It always
// returns EPERM.
func (d *dir) Unlink(ctx context.Context, name string, child kernfs.Inode) error {
	return linuxerr.EPERM
}
   595  
   596  // hasChildrenLocked returns whether the cgroup dir contains any objects that
   597  // prevent it from being deleted.
   598  func (d *dir) hasChildrenLocked() bool {
   599  	// Subdirs take a link on the parent, so checks if there are any direct
   600  	// children cgroups. Exclude the dir's self link and the link from ".".
   601  	if d.InodeAttrs.Links()-2 > 0 {
   602  		return true
   603  	}
   604  	return len(d.cgi.ts) > 0
   605  }
   606  
   607  // HasChildren implements kernfs.Inode.HasChildren.
   608  //
   609  // The empty check for a cgroupfs directory is unlike a regular directory since
   610  // a cgroupfs directory will always have control files. A cgroupfs directory can
   611  // be deleted if cgroup contains no tasks and has no sub-cgroups.
   612  func (d *dir) HasChildren() bool {
   613  	d.fs.tasksMu.RLock()
   614  	defer d.fs.tasksMu.RUnlock()
   615  	return d.hasChildrenLocked()
   616  }
   617  
   618  // RmDir implements kernfs.Inode.RmDir.
   619  func (d *dir) RmDir(ctx context.Context, name string, child kernfs.Inode) error {
   620  	// Unlike a normal directory, we need to recheck if d is empty again, since
   621  	// vfs/kernfs can't stop tasks from entering or leaving the cgroup.
   622  	d.fs.tasksMu.RLock()
   623  	defer d.fs.tasksMu.RUnlock()
   624  
   625  	cgi, ok := child.(*cgroupInode)
   626  	if !ok {
   627  		return linuxerr.ENOTDIR
   628  	}
   629  	if cgi.dir.hasChildrenLocked() {
   630  		return linuxerr.ENOTEMPTY
   631  	}
   632  
   633  	// Disallow deletion of the effective root cgroup.
   634  	if cgi == d.fs.effectiveRoot.Inode().(*cgroupInode) {
   635  		ctx.Warningf("Cannot delete initial cgroup for new tasks %q", d.fs.effectiveRoot.FSLocalPath())
   636  		return linuxerr.EBUSY
   637  	}
   638  
   639  	err := d.OrderedChildren.RmDir(ctx, name, child)
   640  	if err == nil {
   641  		d.InodeAttrs.DecLinks()
   642  	}
   643  	return err
   644  }
   645  
   646  func (d *dir) forEachChildDir(fn func(*dir)) {
   647  	d.OrderedChildren.ForEachChild(func(_ string, i kernfs.Inode) {
   648  		if childI, ok := i.(*cgroupInode); ok {
   649  			fn(&childI.dir)
   650  		}
   651  	})
   652  }
   653  
// controllerFileImpl represents common cgroupfs-specific operations for control
// files.
type controllerFileImpl interface {
	// Source extracts the underlying DynamicBytesFile for a control file.
	Source() *kernfs.DynamicBytesFile
	// AllowBackgroundAccess indicates whether a control file can be accessed
	// from a background (i.e. non-task) context. Some control files cannot be
	// meaningfully accessed from a non-task context because accessing them
	// either has side effects on the calling context (ex: task migration
	// across cgroups), or they refer to data which must be interpreted within
	// the calling context (ex: when referring to a pid, in which pid
	// namespace?).
	//
	// Currently, all writable control files that allow access from a background
	// process can handle a nil FD, since a background write doesn't explicitly
	// open the control file. This is enforced through the
	// writableControllerFileImpl.
	AllowBackgroundAccess() bool
}
   673  
// writableControllerFileImpl represents common cgroupfs-specific operations for
// a writable control file.
type writableControllerFileImpl interface {
	controllerFileImpl
	// WriteBackground writes data to a control file from a background
	// context. This means the write isn't performed through an FD, and may be
	// issued from a non-task context.
	//
	// Control files that support this should also return true for
	// controllerFileImpl.AllowBackgroundAccess().
	WriteBackground(ctx context.Context, src usermem.IOSequence) (int64, error)
}
   686  
// controllerFile represents a generic control file that appears within a cgroup
// directory.
//
// +stateify savable
type controllerFile struct {
	kernfs.DynamicBytesFile
	implStatFS

	// allowBackgroundAccess is the value reported by AllowBackgroundAccess. It
	// is set when the file is created.
	allowBackgroundAccess bool
}

// Compile-time check that *controllerFile satisfies controllerFileImpl.
var _ controllerFileImpl = (*controllerFile)(nil)
   699  
// Source implements controllerFileImpl.Source. It returns the embedded
// DynamicBytesFile.
func (f *controllerFile) Source() *kernfs.DynamicBytesFile {
	return &f.DynamicBytesFile
}
   704  
// AllowBackgroundAccess implements controllerFileImpl.AllowBackgroundAccess.
// It reports the value the file was created with.
func (f *controllerFile) AllowBackgroundAccess() bool {
	return f.allowBackgroundAccess
}
   709  
// SetStat implements kernfs.Inode.SetStat by delegating to the InodeAttrs
// promoted through the embedded DynamicBytesFile.
func (f *controllerFile) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
	return f.InodeAttrs.SetStat(ctx, fs, creds, opts)
}
   714  
   715  func (fs *filesystem) newControllerFile(ctx context.Context, creds *auth.Credentials, data vfs.DynamicBytesSource, allowBackgroundAccess bool) kernfs.Inode {
   716  	f := &controllerFile{
   717  		allowBackgroundAccess: allowBackgroundAccess,
   718  	}
   719  	f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), data, readonlyFileMode)
   720  	return f
   721  }
   722  
   723  func (fs *filesystem) newControllerWritableFile(ctx context.Context, creds *auth.Credentials, data vfs.WritableDynamicBytesSource, allowBackgroundAccess bool) kernfs.Inode {
   724  	f := &controllerFile{
   725  		allowBackgroundAccess: allowBackgroundAccess,
   726  	}
   727  	f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), data, writableFileMode)
   728  	return f
   729  }
   730  
// staticControllerFile represents a generic control file that appears within a
// cgroup directory which always returns the same data when read.
// staticControllerFiles are not writable.
//
// The embedded vfs.StaticData holds the contents and is passed as the data
// source for the embedded DynamicBytesFile when the file is created.
//
// +stateify savable
type staticControllerFile struct {
	kernfs.DynamicBytesFile
	vfs.StaticData
}

// Compile-time check that *staticControllerFile satisfies controllerFileImpl.
var _ controllerFileImpl = (*staticControllerFile)(nil)
   742  
// Source implements controllerFileImpl.Source. It returns the embedded
// DynamicBytesFile.
func (f *staticControllerFile) Source() *kernfs.DynamicBytesFile {
	return &f.DynamicBytesFile
}
   747  
// AllowBackgroundAccess implements controllerFileImpl.AllowBackgroundAccess.
// Static control files always allow background access.
func (f *staticControllerFile) AllowBackgroundAccess() bool {
	return true
}
   752  
// SetStat implements kernfs.Inode.SetStat by delegating to the InodeAttrs
// promoted through the embedded DynamicBytesFile.
func (f *staticControllerFile) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
	return f.InodeAttrs.SetStat(ctx, fs, creds, opts)
}
   757  
   758  // Note: We let the caller provide the mode so that static files may be used to
   759  // fake both readable and writable control files. However, static files are
   760  // effectively readonly, as attempting to write to them will return EIO
   761  // regardless of the mode.
   762  func (fs *filesystem) newStaticControllerFile(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, data string) kernfs.Inode {
   763  	f := &staticControllerFile{StaticData: vfs.StaticData{Data: data}}
   764  	f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), f, mode)
   765  	return f
   766  }
   767  
// stubControllerFile is a writable control file that remembers the control
// value written to it.
//
// +stateify savable
type stubControllerFile struct {
	controllerFile

	// data is accessed through atomic ops. It points at storage supplied by
	// the caller of newStubControllerFile; Generate loads from it and
	// WriteBackground stores to it.
	data *atomicbitops.Int64
}

// Compile-time check that *stubControllerFile satisfies controllerFileImpl.
var _ controllerFileImpl = (*stubControllerFile)(nil)
   780  
   781  // Generate implements vfs.DynamicBytesSource.Generate.
   782  func (f *stubControllerFile) Generate(ctx context.Context, buf *bytes.Buffer) error {
   783  	fmt.Fprintf(buf, "%d\n", f.data.Load())
   784  	return nil
   785  }
   786  
// Write implements vfs.WritableDynamicBytesSource.Write.
//
// The file description and offset are ignored; the write is handled exactly
// like a background write.
func (f *stubControllerFile) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) {
	return f.WriteBackground(ctx, src)
}
   791  
   792  // WriteBackground implements writableControllerFileImpl.WriteBackground.
   793  func (f *stubControllerFile) WriteBackground(ctx context.Context, src usermem.IOSequence) (int64, error) {
   794  	val, n, err := parseInt64FromString(ctx, src)
   795  	if err != nil {
   796  		return 0, err
   797  	}
   798  	f.data.Store(val)
   799  	return n, nil
   800  }
   801  
   802  // newStubControllerFile creates a new stub controller file that loads and
   803  // stores a control value from data.
   804  func (fs *filesystem) newStubControllerFile(ctx context.Context, creds *auth.Credentials, data *atomicbitops.Int64, allowBackgroundAccess bool) kernfs.Inode {
   805  	f := &stubControllerFile{
   806  		controllerFile: controllerFile{
   807  			allowBackgroundAccess: allowBackgroundAccess,
   808  		},
   809  		data: data,
   810  	}
   811  	f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), f, writableFileMode)
   812  	return f
   813  }