github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go (about)

     1  // Copyright 2021 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package cgroupfs implements cgroupfs.
    16  //
    17  // A cgroup is a collection of tasks on the system, organized into a tree-like
    18  // structure similar to a filesystem directory tree. In fact, each cgroup is
    19  // represented by a directory on cgroupfs, and is manipulated through control
    20  // files in the directory.
    21  //
// All cgroups on a system are organized into hierarchies. Each hierarchy is a
// distinct tree of cgroups with a common set of controllers. One or more
// cgroupfs mounts may point to each hierarchy. These mounts provide a common
// view into the same tree of cgroups.
    26  //
    27  // A controller (also known as a "resource controller", or a cgroup "subsystem")
    28  // determines the behaviour of each cgroup.
    29  //
    30  // In addition to cgroupfs, the kernel has a cgroup registry that tracks
    31  // system-wide state related to cgroups such as active hierarchies and the
    32  // controllers associated with them.
    33  //
    34  // Since cgroupfs doesn't allow hardlinks, there is a unique mapping between
    35  // cgroupfs dentries and inodes.
    36  //
    37  // # Synchronization
    38  //
    39  // Cgroup hierarchy creation and destruction is protected by the
    40  // kernel.CgroupRegistry.mu. Once created, a hierarchy's set of controllers, the
    41  // filesystem associated with it, and the root cgroup for the hierarchy are
    42  // immutable.
    43  //
// Membership of tasks within cgroups is protected by
// cgroupfs.filesystem.tasksMu. Tasks also maintain a set of all cgroups they're
// in, and this set is protected by Task.mu.
    47  //
    48  // Lock order:
    49  //
    50  // kernel.CgroupRegistry.mu
    51  //   cgroupfs.filesystem.mu
    52  //     kernel.TaskSet.mu
    53  //       kernel.Task.mu
    54  //         cgroupfs.filesystem.tasksMu.
    55  package cgroupfs
    56  
    57  import (
    58  	"fmt"
    59  	"sort"
    60  	"strconv"
    61  	"strings"
    62  
    63  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    64  	"github.com/SagerNet/gvisor/pkg/context"
    65  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    66  	"github.com/SagerNet/gvisor/pkg/sentry/fsimpl/kernfs"
    67  	"github.com/SagerNet/gvisor/pkg/sentry/kernel"
    68  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
    69  	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
    70  	"github.com/SagerNet/gvisor/pkg/sync"
    71  )
    72  
// Filesystem-wide constants: the filesystem type name, the permission bits
// given to read-only and writable control files, and the dentry cache size
// used when the "dentry_cache_limit" mount option is not supplied.
const (
	// Name is the default filesystem name.
	Name                     = "cgroup"
	readonlyFileMode         = linux.FileMode(0444)
	writableFileMode         = linux.FileMode(0644)
	defaultMaxCachedDentries = uint64(1000)
)
    80  
// Controller types known to cgroupfs. Each name doubles as the mount option
// that GetFilesystem accepts to request that controller.
const (
	controllerCPU     = kernel.CgroupControllerType("cpu")
	controllerCPUAcct = kernel.CgroupControllerType("cpuacct")
	controllerCPUSet  = kernel.CgroupControllerType("cpuset")
	controllerJob     = kernel.CgroupControllerType("job")
	controllerMemory  = kernel.CgroupControllerType("memory")
)
    88  
// allControllers lists every controller type cgroupfs can instantiate. It is
// the set GetFilesystem uses when the "all" mount option is given, or when no
// controller is named explicitly.
var allControllers = []kernel.CgroupControllerType{
	controllerCPU,
	controllerCPUAcct,
	controllerCPUSet,
	controllerJob,
	controllerMemory,
}
    96  
// SupportedMountOptions is the set of supported mount options for cgroupfs.
//
// NOTE(review): GetFilesystem also accepts "dentry_cache_limit", which is not
// listed here; confirm with callers whether that omission is intentional.
var SupportedMountOptions = []string{"all", "cpu", "cpuacct", "cpuset", "job", "memory"}
    99  
// FilesystemType implements vfs.FilesystemType.
//
// The type carries no state of its own; all per-mount state lives in the
// filesystem value that GetFilesystem creates or looks up.
//
// +stateify savable
type FilesystemType struct{}
   104  
// InternalData contains internal data passed in to the cgroupfs mount via
// vfs.GetFilesystemOptions.InternalData.
//
// +stateify savable
type InternalData struct {
	// DefaultControlValues is handed to the cpu and memory controller
	// constructors by GetFilesystem. GetFilesystem panics if the map is
	// non-empty once all requested controllers have been constructed, so
	// presumably each controller removes the entries it consumes (confirm
	// against the controller implementations).
	DefaultControlValues map[string]int64
}
   112  
// filesystem implements vfs.FilesystemImpl and kernel.cgroupFS.
//
// +stateify savable
type filesystem struct {
	kernfs.Filesystem

	// devMinor is the anonymous block device minor obtained from
	// vfs.VirtualFilesystem.GetAnonBlockDevMinor in GetFilesystem. It is used
	// as the device minor of every control file inode and is handed back in
	// Release.
	devMinor uint32

	// hierarchyID is the id the cgroup registry assigns to this hierarchy. Has
	// the value kernel.InvalidCgroupHierarchyID until the FS is fully
	// initialized.
	//
	// hierarchyID is immutable after initialization.
	hierarchyID uint32

	// controllers and kcontrollers are both the list of controllers attached to
	// this cgroupfs. Both lists are the same set of controllers, but typecast
	// to different interfaces for convenience. Both must stay in sync, and are
	// immutable.
	controllers  []controller
	kcontrollers []kernel.CgroupController

	numCgroups uint64 // Protected by atomic ops.

	// root is the dentry of the hierarchy's root cgroup directory. It is set
	// once in GetFilesystem when the hierarchy is created.
	root *kernfs.Dentry

	// tasksMu serializes task membership changes across all cgroups within a
	// filesystem.
	tasksMu sync.RWMutex `state:"nosave"`
}
   142  
// InitializeHierarchyID implements kernel.cgroupFS.InitializeHierarchyID.
//
// It records hid as this filesystem's hierarchy ID. No locking is performed
// here.
func (fs *filesystem) InitializeHierarchyID(hid uint32) {
	fs.hierarchyID = hid
}
   147  
// Name implements vfs.FilesystemType.Name.
//
// It returns the package-level Name constant ("cgroup").
func (FilesystemType) Name() string {
	return Name
}
   152  
// Release implements vfs.FilesystemType.Release.
//
// FilesystemType holds no resources, so this is a no-op.
func (FilesystemType) Release(ctx context.Context) {}
   155  
   156  // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
   157  func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
   158  	devMinor, err := vfsObj.GetAnonBlockDevMinor()
   159  	if err != nil {
   160  		return nil, nil, err
   161  	}
   162  
   163  	mopts := vfs.GenericParseMountOptions(opts.Data)
   164  	maxCachedDentries := defaultMaxCachedDentries
   165  	if str, ok := mopts["dentry_cache_limit"]; ok {
   166  		delete(mopts, "dentry_cache_limit")
   167  		maxCachedDentries, err = strconv.ParseUint(str, 10, 64)
   168  		if err != nil {
   169  			ctx.Warningf("sys.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str)
   170  			return nil, nil, linuxerr.EINVAL
   171  		}
   172  	}
   173  
   174  	var wantControllers []kernel.CgroupControllerType
   175  	if _, ok := mopts["cpu"]; ok {
   176  		delete(mopts, "cpu")
   177  		wantControllers = append(wantControllers, controllerCPU)
   178  	}
   179  	if _, ok := mopts["cpuacct"]; ok {
   180  		delete(mopts, "cpuacct")
   181  		wantControllers = append(wantControllers, controllerCPUAcct)
   182  	}
   183  	if _, ok := mopts["cpuset"]; ok {
   184  		delete(mopts, "cpuset")
   185  		wantControllers = append(wantControllers, controllerCPUSet)
   186  	}
   187  	if _, ok := mopts["job"]; ok {
   188  		delete(mopts, "job")
   189  		wantControllers = append(wantControllers, controllerJob)
   190  	}
   191  	if _, ok := mopts["memory"]; ok {
   192  		delete(mopts, "memory")
   193  		wantControllers = append(wantControllers, controllerMemory)
   194  	}
   195  	if _, ok := mopts["all"]; ok {
   196  		if len(wantControllers) > 0 {
   197  			ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: other controllers specified with all: %v", wantControllers)
   198  			return nil, nil, linuxerr.EINVAL
   199  		}
   200  
   201  		delete(mopts, "all")
   202  		wantControllers = allControllers
   203  	}
   204  
   205  	if len(wantControllers) == 0 {
   206  		// Specifying no controllers implies all controllers.
   207  		wantControllers = allControllers
   208  	}
   209  
   210  	if len(mopts) != 0 {
   211  		ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: unknown options: %v", mopts)
   212  		return nil, nil, linuxerr.EINVAL
   213  	}
   214  
   215  	k := kernel.KernelFromContext(ctx)
   216  	r := k.CgroupRegistry()
   217  
   218  	// "It is not possible to mount the same controller against multiple
   219  	// cgroup hierarchies. For example, it is not possible to mount both
   220  	// the cpu and cpuacct controllers against one hierarchy, and to mount
   221  	// the cpu controller alone against another hierarchy." - man cgroups(7)
   222  	//
   223  	// Is there a hierarchy available with all the controllers we want? If so,
   224  	// this mount is a view into the same hierarchy.
   225  	//
   226  	// Note: we're guaranteed to have at least one requested controller, since
   227  	// no explicit controller name implies all controllers.
   228  	if vfsfs := r.FindHierarchy(wantControllers); vfsfs != nil {
   229  		fs := vfsfs.Impl().(*filesystem)
   230  		ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: mounting new view to hierarchy %v", fs.hierarchyID)
   231  		fs.root.IncRef()
   232  		return vfsfs, fs.root.VFSDentry(), nil
   233  	}
   234  
   235  	// No existing hierarchy with the exactly controllers found. Make a new
   236  	// one. Note that it's possible this mount creation is unsatisfiable, if one
   237  	// or more of the requested controllers are already on existing
   238  	// hierarchies. We'll find out about such collisions when we try to register
   239  	// the new hierarchy later.
   240  	fs := &filesystem{
   241  		devMinor: devMinor,
   242  	}
   243  	fs.MaxCachedDentries = maxCachedDentries
   244  	fs.VFSFilesystem().Init(vfsObj, &fsType, fs)
   245  
   246  	var defaults map[string]int64
   247  	if opts.InternalData != nil {
   248  		ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: default control values: %v", defaults)
   249  		defaults = opts.InternalData.(*InternalData).DefaultControlValues
   250  	}
   251  
   252  	for _, ty := range wantControllers {
   253  		var c controller
   254  		switch ty {
   255  		case controllerCPU:
   256  			c = newCPUController(fs, defaults)
   257  		case controllerCPUAcct:
   258  			c = newCPUAcctController(fs)
   259  		case controllerCPUSet:
   260  			c = newCPUSetController(fs)
   261  		case controllerJob:
   262  			c = newJobController(fs)
   263  		case controllerMemory:
   264  			c = newMemoryController(fs, defaults)
   265  		default:
   266  			panic(fmt.Sprintf("Unreachable: unknown cgroup controller %q", ty))
   267  		}
   268  		fs.controllers = append(fs.controllers, c)
   269  	}
   270  
   271  	if len(defaults) != 0 {
   272  		// Internal data is always provided at sentry startup and unused values
   273  		// indicate a problem with the sandbox config. Fail fast.
   274  		panic(fmt.Sprintf("cgroupfs.FilesystemType.GetFilesystem: unknown internal mount data: %v", defaults))
   275  	}
   276  
   277  	// Controllers usually appear in alphabetical order when displayed. Sort it
   278  	// here now, so it never needs to be sorted elsewhere.
   279  	sort.Slice(fs.controllers, func(i, j int) bool { return fs.controllers[i].Type() < fs.controllers[j].Type() })
   280  	fs.kcontrollers = make([]kernel.CgroupController, 0, len(fs.controllers))
   281  	for _, c := range fs.controllers {
   282  		fs.kcontrollers = append(fs.kcontrollers, c)
   283  	}
   284  
   285  	root := fs.newCgroupInode(ctx, creds)
   286  	var rootD kernfs.Dentry
   287  	rootD.InitRoot(&fs.Filesystem, root)
   288  	fs.root = &rootD
   289  
   290  	// Register controllers. The registry may be modified concurrently, so if we
   291  	// get an error, we raced with someone else who registered the same
   292  	// controllers first.
   293  	if err := r.Register(fs.kcontrollers, fs); err != nil {
   294  		ctx.Infof("cgroupfs.FilesystemType.GetFilesystem: failed to register new hierarchy with controllers %v: %v", wantControllers, err)
   295  		rootD.DecRef(ctx)
   296  		fs.VFSFilesystem().DecRef(ctx)
   297  		return nil, nil, linuxerr.EBUSY
   298  	}
   299  
   300  	// Move all existing tasks to the root of the new hierarchy.
   301  	k.PopulateNewCgroupHierarchy(fs.rootCgroup())
   302  
   303  	return fs.VFSFilesystem(), rootD.VFSDentry(), nil
   304  }
   305  
   306  func (fs *filesystem) rootCgroup() kernel.Cgroup {
   307  	return kernel.Cgroup{
   308  		Dentry:     fs.root,
   309  		CgroupImpl: fs.root.Inode().(kernel.CgroupImpl),
   310  	}
   311  }
   312  
   313  // Release implements vfs.FilesystemImpl.Release.
   314  func (fs *filesystem) Release(ctx context.Context) {
   315  	k := kernel.KernelFromContext(ctx)
   316  	r := k.CgroupRegistry()
   317  
   318  	if fs.hierarchyID != kernel.InvalidCgroupHierarchyID {
   319  		k.ReleaseCgroupHierarchy(fs.hierarchyID)
   320  		r.Unregister(fs.hierarchyID)
   321  	}
   322  
   323  	fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
   324  	fs.Filesystem.Release(ctx)
   325  }
   326  
   327  // MountOptions implements vfs.FilesystemImpl.MountOptions.
   328  func (fs *filesystem) MountOptions() string {
   329  	var cnames []string
   330  	for _, c := range fs.controllers {
   331  		cnames = append(cnames, string(c.Type()))
   332  	}
   333  	return strings.Join(cnames, ",")
   334  }
   335  
// implStatFS provides the StatFS method for cgroupfs inodes; it is meant to be
// embedded (see dir).
//
// +stateify savable
type implStatFS struct{}
   338  
// StatFS implements kernfs.Inode.StatFS.
//
// It returns the generic statfs result tagged with CGROUP_SUPER_MAGIC and
// never fails.
func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
	return vfs.GenericStatFS(linux.CGROUP_SUPER_MAGIC), nil
}
   343  
// dir implements kernfs.Inode for a generic cgroup resource controller
// directory. Specific controllers extend this to add their own functionality.
//
// Most of the kernfs.Inode behaviour comes from the embedded kernfs helper
// types; dir itself supplies Keep, SetStat, Open, DecRef and StatFS.
//
// +stateify savable
type dir struct {
	// dirRefs is the reference count for the directory; see DecRef.
	dirRefs
	kernfs.InodeAlwaysValid
	kernfs.InodeAttrs
	kernfs.InodeNotSymlink
	kernfs.InodeDirectoryNoNewChildren // TODO(b/183137098): Implement mkdir.
	kernfs.OrderedChildren
	implStatFS

	// locks is handed to the directory file descriptions created in Open.
	locks vfs.FileLocks
}
   359  
// Keep implements kernfs.Inode.Keep.
//
// It always returns true. NOTE(review): presumably this tells kernfs to
// retain cgroup directory dentries rather than evict them; confirm against
// the kernfs.Inode documentation.
func (*dir) Keep() bool {
	return true
}
   364  
// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
//
// All arguments are ignored; the call always fails with EPERM.
func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
	return linuxerr.EPERM
}
   369  
   370  // Open implements kernfs.Inode.Open.
   371  func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
   372  	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{
   373  		SeekEnd: kernfs.SeekEndStaticEntries,
   374  	})
   375  	if err != nil {
   376  		return nil, err
   377  	}
   378  	return fd.VFSFileDescription(), nil
   379  }
   380  
   381  // DecRef implements kernfs.Inode.DecRef.
   382  func (d *dir) DecRef(ctx context.Context) {
   383  	d.dirRefs.DecRef(func() { d.Destroy(ctx) })
   384  }
   385  
// StatFS implements kernfs.Inode.StatFS.
//
// It returns the generic statfs result tagged with CGROUP_SUPER_MAGIC and
// never fails. This is the same result the embedded implStatFS produces; the
// explicit method on dir takes precedence over the embedded one.
func (d *dir) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) {
	return vfs.GenericStatFS(linux.CGROUP_SUPER_MAGIC), nil
}
   390  
// controllerFile represents a generic control file that appears within a cgroup
// directory.
//
// It is a thin wrapper around kernfs.DynamicBytesFile; instances are created
// by newControllerFile and newControllerWritableFile.
//
// +stateify savable
type controllerFile struct {
	kernfs.DynamicBytesFile
}
   398  
   399  func (fs *filesystem) newControllerFile(ctx context.Context, creds *auth.Credentials, data vfs.DynamicBytesSource) kernfs.Inode {
   400  	f := &controllerFile{}
   401  	f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), data, readonlyFileMode)
   402  	return f
   403  }
   404  
   405  func (fs *filesystem) newControllerWritableFile(ctx context.Context, creds *auth.Credentials, data vfs.WritableDynamicBytesSource) kernfs.Inode {
   406  	f := &controllerFile{}
   407  	f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), data, writableFileMode)
   408  	return f
   409  }
   410  
// staticControllerFile represents a generic control file that appears within a
// cgroup directory which always returns the same data when read.
// staticControllerFiles are not writable.
//
// The embedded vfs.StaticData holds the file contents; newStaticControllerFile
// passes the file itself to Init as the data source.
//
// +stateify savable
type staticControllerFile struct {
	kernfs.DynamicBytesFile
	vfs.StaticData
}
   420  
   421  // Note: We let the caller provide the mode so that static files may be used to
   422  // fake both readable and writable control files. However, static files are
   423  // effectively readonly, as attempting to write to them will return EIO
   424  // regardless of the mode.
   425  func (fs *filesystem) newStaticControllerFile(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, data string) kernfs.Inode {
   426  	f := &staticControllerFile{StaticData: vfs.StaticData{Data: data}}
   427  	f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), f, mode)
   428  	return f
   429  }