github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/fsimpl/cgroupfs/base.go (about)

     1  // Copyright 2021 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package cgroupfs
    16  
    17  import (
    18  	"bytes"
    19  	"fmt"
    20  	"sort"
    21  	"strconv"
    22  	"strings"
    23  
    24  	"github.com/metacubex/gvisor/pkg/abi/linux"
    25  	"github.com/metacubex/gvisor/pkg/context"
    26  	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
    27  	"github.com/metacubex/gvisor/pkg/hostarch"
    28  	"github.com/metacubex/gvisor/pkg/log"
    29  	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/kernfs"
    30  	"github.com/metacubex/gvisor/pkg/sentry/kernel"
    31  	"github.com/metacubex/gvisor/pkg/sentry/kernel/auth"
    32  	"github.com/metacubex/gvisor/pkg/sentry/vfs"
    33  	"github.com/metacubex/gvisor/pkg/usermem"
    34  )
    35  
// controllerCommon implements kernel.CgroupController.
//
// Must call init before use.
//
// +stateify savable
type controllerCommon struct {
	// ty is the type of this controller. It is set by init, or copied from
	// the parent controller by cloneFromParent.
	ty kernel.CgroupControllerType
	// fs is the cgroupfs filesystem backing this controller. It is set by
	// init, or copied from the parent controller by cloneFromParent.
	fs *filesystem
	// parent is the parent controller if any. Immutable.
	//
	// Note that we don't have to update this on renames, since cgroup
	// directories can't be moved to a different parent directory.
	parent controller
}
    50  
    51  func (c *controllerCommon) init(ty kernel.CgroupControllerType, fs *filesystem) {
    52  	c.ty = ty
    53  	c.fs = fs
    54  }
    55  
    56  func (c *controllerCommon) cloneFromParent(parent controller) {
    57  	c.ty = parent.Type()
    58  	c.fs = parent.Filesystem()
    59  	c.parent = parent
    60  }
    61  
// Filesystem implements controller.Filesystem.
//
// It returns the filesystem stored by init or cloneFromParent.
func (c *controllerCommon) Filesystem() *filesystem {
	return c.fs
}
    66  
    67  // Type implements kernel.CgroupController.Type.
    68  func (c *controllerCommon) Type() kernel.CgroupControllerType {
    69  	return kernel.CgroupControllerType(c.ty)
    70  }
    71  
    72  // HierarchyID implements kernel.CgroupController.HierarchyID.
    73  func (c *controllerCommon) HierarchyID() uint32 {
    74  	return c.fs.hierarchyID
    75  }
    76  
    77  // NumCgroups implements kernel.CgroupController.NumCgroups.
    78  func (c *controllerCommon) NumCgroups() uint64 {
    79  	return c.fs.numCgroups.Load()
    80  }
    81  
// Enabled implements kernel.CgroupController.Enabled.
//
// Controllers are currently always enabled, so this unconditionally returns
// true.
func (c *controllerCommon) Enabled() bool {
	return true
}
    88  
    89  // EffectiveRootCgroup implements kernel.CgroupController.EffectiveRootCgroup.
    90  func (c *controllerCommon) EffectiveRootCgroup() kernel.Cgroup {
    91  	return c.fs.effectiveRootCgroup()
    92  }
    93  
// controller is an interface for common functionality related to all cgroups.
// It is an extension of the public cgroup interface, containing cgroup
// functionality private to cgroupfs.
type controller interface {
	kernel.CgroupController

	// Filesystem returns the cgroupfs filesystem backing this controller.
	Filesystem() *filesystem

	// Clone creates a new controller based on the internal state of this
	// controller. This is used to initialize a sub-cgroup based on the state of
	// the parent.
	Clone() controller

	// AddControlFiles should extend the contents map with inodes representing
	// control files defined by this controller. c is the cgroup the control
	// files are being created for.
	AddControlFiles(ctx context.Context, creds *auth.Credentials, c *cgroupInode, contents map[string]kernfs.Inode)

	// Enter is called when a task initially moves into a cgroup. This is
	// distinct from migration because the task isn't migrating away from a
	// cgroup. Enter is called when a task is created and joins its initial
	// cgroup, or when cgroupfs is mounted and existing tasks are moved into
	// cgroups.
	Enter(t *kernel.Task)

	// Leave is called when a task leaves a cgroup. This is distinct from
	// migration because the task isn't migrating to another cgroup. Leave is
	// called when a task exits.
	Leave(t *kernel.Task)

	// PrepareMigrate signals the controller that a migration is about to
	// happen. The controller should check for any conditions that would prevent
	// the migration. If PrepareMigrate succeeds, the controller must
	// unconditionally either accept the migration via CommitMigrate, or roll it
	// back via AbortMigrate.
	//
	// src is the controller of the same type in the cgroup the task is
	// migrating away from.
	//
	// Postcondition: If PrepareMigrate returns nil, caller must resolve the
	// migration by calling either CommitMigrate or AbortMigrate.
	PrepareMigrate(t *kernel.Task, src controller) error

	// CommitMigrate completes an in-flight migration.
	//
	// Precondition: Caller must call a corresponding PrepareMigrate.
	CommitMigrate(t *kernel.Task, src controller)

	// AbortMigrate cancels an in-flight migration.
	//
	// Precondition: Caller must call a corresponding PrepareMigrate.
	AbortMigrate(t *kernel.Task, src controller)

	// Charge charges a controller for a particular resource. The implementation
	// should panic if passed a resource type it does not control.
	Charge(t *kernel.Task, d *kernfs.Dentry, res kernel.CgroupResourceType, value int64) error
}
   148  
// cgroupInode implements kernel.CgroupImpl and kernfs.Inode.
//
// +stateify savable
type cgroupInode struct {
	// dir is the embedded directory inode. newCgroupInode points its cgi
	// field back at this cgroupInode.
	dir

	// id is the id of this cgroup.
	id uint32

	// controllers is the set of controllers for this cgroup. This is used to
	// store controller-specific state per cgroup. The set of controllers should
	// match the controllers for this hierarchy as tracked by the filesystem
	// object. Immutable.
	controllers map[kernel.CgroupControllerType]controller

	// ts is the list of tasks in this cgroup. The kernel is responsible for
	// removing tasks from this list before they're destroyed, so any tasks on
	// this list are always valid.
	//
	// ts, and cgroup membership in general is protected by fs.tasksMu.
	ts map[*kernel.Task]struct{}
}

// Compile-time check that *cgroupInode implements kernel.CgroupImpl.
var _ kernel.CgroupImpl = (*cgroupInode)(nil)
   173  
   174  func (fs *filesystem) newCgroupInode(ctx context.Context, creds *auth.Credentials, parent *cgroupInode, mode linux.FileMode) kernfs.Inode {
   175  	c := &cgroupInode{
   176  		dir:         dir{fs: fs},
   177  		ts:          make(map[*kernel.Task]struct{}),
   178  		controllers: make(map[kernel.CgroupControllerType]controller),
   179  	}
   180  	c.dir.cgi = c
   181  
   182  	k := kernel.KernelFromContext(ctx)
   183  	r := k.CgroupRegistry()
   184  	// Assign id for the cgroup.
   185  	cid, err := r.NextCgroupID()
   186  	if err != nil {
   187  		log.Warningf("cgroupfs newCgroupInode: Failed to assign id to the cgroup: %v", err)
   188  	}
   189  	c.id = cid
   190  	r.AddCgroup(c)
   191  
   192  	contents := make(map[string]kernfs.Inode)
   193  	contents["cgroup.procs"] = fs.newControllerWritableFile(ctx, creds, &cgroupProcsData{c}, false)
   194  	contents["tasks"] = fs.newControllerWritableFile(ctx, creds, &tasksData{c}, false)
   195  
   196  	if parent != nil {
   197  		for ty, ctl := range parent.controllers {
   198  			new := ctl.Clone()
   199  			c.controllers[ty] = new
   200  			new.AddControlFiles(ctx, creds, c, contents)
   201  		}
   202  	} else {
   203  		for _, ctl := range fs.controllers {
   204  			// Uniqueness of controllers enforced by the filesystem on
   205  			// creation. The root cgroup uses the controllers directly from the
   206  			// filesystem.
   207  			c.controllers[ctl.Type()] = ctl
   208  			ctl.AddControlFiles(ctx, creds, c, contents)
   209  		}
   210  	}
   211  
   212  	c.dir.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), mode)
   213  	c.dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true})
   214  	c.dir.IncLinks(c.dir.OrderedChildren.Populate(contents))
   215  
   216  	fs.numCgroups.Add(1)
   217  
   218  	return c
   219  }
   220  
   221  // HierarchyID implements kernel.CgroupImpl.HierarchyID.
   222  func (c *cgroupInode) HierarchyID() uint32 {
   223  	return c.fs.hierarchyID
   224  }
   225  
   226  // Name implements kernel.CgroupImpl.Name.
   227  func (c *cgroupInode) Name() string {
   228  	return c.fs.hierarchyName
   229  }
   230  
   231  // Controllers implements kernel.CgroupImpl.Controllers.
   232  func (c *cgroupInode) Controllers() []kernel.CgroupController {
   233  	return c.fs.kcontrollers
   234  }
   235  
   236  // tasks returns a snapshot of the tasks inside the cgroup.
   237  func (c *cgroupInode) tasks() []*kernel.Task {
   238  	c.fs.tasksMu.RLock()
   239  	defer c.fs.tasksMu.RUnlock()
   240  
   241  	ts := make([]*kernel.Task, 0, len(c.ts))
   242  	for t := range c.ts {
   243  		ts = append(ts, t)
   244  	}
   245  	return ts
   246  }
   247  
   248  // Enter implements kernel.CgroupImpl.Enter.
   249  func (c *cgroupInode) Enter(t *kernel.Task) {
   250  	c.fs.tasksMu.Lock()
   251  	defer c.fs.tasksMu.Unlock()
   252  
   253  	c.ts[t] = struct{}{}
   254  	for _, ctl := range c.controllers {
   255  		ctl.Enter(t)
   256  	}
   257  }
   258  
   259  // Leave implements kernel.CgroupImpl.Leave.
   260  func (c *cgroupInode) Leave(t *kernel.Task) {
   261  	c.fs.tasksMu.Lock()
   262  	defer c.fs.tasksMu.Unlock()
   263  
   264  	for _, ctl := range c.controllers {
   265  		ctl.Leave(t)
   266  	}
   267  	delete(c.ts, t)
   268  }
   269  
   270  // PrepareMigrate implements kernel.CgroupImpl.PrepareMigrate.
   271  func (c *cgroupInode) PrepareMigrate(t *kernel.Task, src *kernel.Cgroup) error {
   272  	prepared := make([]controller, 0, len(c.controllers))
   273  	rollback := func() {
   274  		for _, p := range prepared {
   275  			c.controllers[p.Type()].AbortMigrate(t, p)
   276  		}
   277  	}
   278  
   279  	for srcType, srcCtl := range src.CgroupImpl.(*cgroupInode).controllers {
   280  		ctl := c.controllers[srcType]
   281  		if err := ctl.PrepareMigrate(t, srcCtl); err != nil {
   282  			rollback()
   283  			return err
   284  		}
   285  		prepared = append(prepared, srcCtl)
   286  	}
   287  	return nil
   288  }
   289  
   290  // CommitMigrate implements kernel.CgroupImpl.CommitMigrate.
   291  func (c *cgroupInode) CommitMigrate(t *kernel.Task, src *kernel.Cgroup) {
   292  	c.fs.tasksMu.Lock()
   293  	defer c.fs.tasksMu.Unlock()
   294  
   295  	for srcType, srcCtl := range src.CgroupImpl.(*cgroupInode).controllers {
   296  		c.controllers[srcType].CommitMigrate(t, srcCtl)
   297  	}
   298  
   299  	srcI := src.CgroupImpl.(*cgroupInode)
   300  	delete(srcI.ts, t)
   301  	c.ts[t] = struct{}{}
   302  }
   303  
   304  // AbortMigrate implements kernel.CgroupImpl.AbortMigrate.
   305  func (c *cgroupInode) AbortMigrate(t *kernel.Task, src *kernel.Cgroup) {
   306  	for srcType, srcCtl := range src.CgroupImpl.(*cgroupInode).controllers {
   307  		c.controllers[srcType].AbortMigrate(t, srcCtl)
   308  	}
   309  }
   310  
   311  // CgroupFromControlFileFD returns a cgroup object given a control file FD for the cgroup.
   312  func (c *cgroupInode) CgroupFromControlFileFD(fd *vfs.FileDescription) kernel.Cgroup {
   313  	controlFileDentry := fd.Dentry().Impl().(*kernfs.Dentry)
   314  	// The returned parent dentry remains valid without holding locks because in
   315  	// cgroupfs, the parent directory relationship of a control file is
   316  	// effectively immutable. Control files cannot be unlinked, renamed or
   317  	// destroyed independently from their parent directory.
   318  	parentD := controlFileDentry.Parent()
   319  	return kernel.Cgroup{
   320  		Dentry:     parentD,
   321  		CgroupImpl: c,
   322  	}
   323  }
   324  
   325  // Charge implements kernel.CgroupImpl.Charge.
   326  //
   327  // Charge notifies a matching controller of a change in resource usage. Due to
   328  // the uniqueness of controllers, at most one controller will match. If no
   329  // matching controller is present in this directory, the call silently
   330  // succeeds. The caller should call Charge on all hierarchies to ensure any
   331  // matching controller across the entire system is charged.
   332  func (c *cgroupInode) Charge(t *kernel.Task, d *kernfs.Dentry, ctlType kernel.CgroupControllerType, res kernel.CgroupResourceType, value int64) error {
   333  	c.fs.tasksMu.RLock()
   334  	defer c.fs.tasksMu.RUnlock()
   335  	if ctl, ok := c.controllers[ctlType]; ok {
   336  		return ctl.Charge(t, d, res, value)
   337  	}
   338  	return nil
   339  }
   340  
   341  // ReadControl implements kernel.CgroupImpl.ReadControl.
   342  func (c *cgroupInode) ReadControl(ctx context.Context, name string) (string, error) {
   343  	cfi, err := c.Lookup(ctx, name)
   344  	if err != nil {
   345  		return "", fmt.Errorf("no such control file")
   346  	}
   347  	cbf, ok := cfi.(controllerFileImpl)
   348  	if !ok {
   349  		return "", fmt.Errorf("no such control file")
   350  	}
   351  	if !cbf.AllowBackgroundAccess() {
   352  		return "", fmt.Errorf("this control may not be accessed from a background context")
   353  	}
   354  
   355  	var buf bytes.Buffer
   356  	err = cbf.Source().Data().Generate(ctx, &buf)
   357  	return buf.String(), err
   358  }
   359  
   360  // WriteControl implements kernel.CgroupImpl.WriteControl.
   361  func (c *cgroupInode) WriteControl(ctx context.Context, name string, value string) error {
   362  	cfi, err := c.Lookup(ctx, name)
   363  	if err != nil {
   364  		return fmt.Errorf("no such control file")
   365  	}
   366  	// Do the more general cast first so we can give a meaningful error message when
   367  	// the control file exists, but isn't accessible (either due to being
   368  	// unwritable, or not being available from a background context).
   369  	cbf, ok := cfi.(controllerFileImpl)
   370  	if !ok {
   371  		return fmt.Errorf("no such control file")
   372  	}
   373  	if !cbf.AllowBackgroundAccess() {
   374  		return fmt.Errorf("this control may not be accessed from a background context")
   375  	}
   376  	wcbf, ok := cfi.(writableControllerFileImpl)
   377  	if !ok {
   378  		return fmt.Errorf("control file not writable")
   379  	}
   380  
   381  	ioSeq := usermem.BytesIOSequence([]byte(value))
   382  	n, err := wcbf.WriteBackground(ctx, ioSeq)
   383  	if err != nil {
   384  		return err
   385  	}
   386  	if n != int64(len(value)) {
   387  		return fmt.Errorf("short write")
   388  	}
   389  
   390  	return nil
   391  }
   392  
// ID implements kernel.CgroupImpl.ID.
//
// It returns the id assigned to this cgroup in newCgroupInode.
func (c *cgroupInode) ID() uint32 {
	return c.id
}
   397  
   398  func sortTIDs(tids []kernel.ThreadID) {
   399  	sort.Slice(tids, func(i, j int) bool { return tids[i] < tids[j] })
   400  }
   401  
// cgroupProcsData is the data source of the "cgroup.procs" file that
// newCgroupInode creates in every cgroup directory. It embeds the cgroup the
// file belongs to.
//
// +stateify savable
type cgroupProcsData struct {
	*cgroupInode
}
   406  
   407  // Generate implements vfs.DynamicBytesSource.Generate.
   408  func (d *cgroupProcsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   409  	t := kernel.TaskFromContext(ctx)
   410  	currPidns := t.ThreadGroup().PIDNamespace()
   411  
   412  	pgids := make(map[kernel.ThreadID]struct{})
   413  
   414  	for _, task := range d.tasks() {
   415  		// Map dedups pgid, since iterating over all tasks produces multiple
   416  		// entries for the group leaders.
   417  		if pgid := currPidns.IDOfThreadGroup(task.ThreadGroup()); pgid != 0 {
   418  			pgids[pgid] = struct{}{}
   419  		}
   420  	}
   421  
   422  	pgidList := make([]kernel.ThreadID, 0, len(pgids))
   423  	for pgid := range pgids {
   424  		pgidList = append(pgidList, pgid)
   425  	}
   426  	sortTIDs(pgidList)
   427  
   428  	for _, pgid := range pgidList {
   429  		fmt.Fprintf(buf, "%d\n", pgid)
   430  	}
   431  
   432  	return nil
   433  }
   434  
   435  // Write implements vfs.WritableDynamicBytesSource.Write.
   436  func (d *cgroupProcsData) Write(ctx context.Context, fd *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) {
   437  	tgid, n, err := parseInt64FromString(ctx, src)
   438  	if err != nil {
   439  		return n, err
   440  	}
   441  
   442  	t := kernel.TaskFromContext(ctx)
   443  	currPidns := t.ThreadGroup().PIDNamespace()
   444  	var targetTG *kernel.ThreadGroup
   445  	if tgid != 0 {
   446  		targetTG = currPidns.ThreadGroupWithID(kernel.ThreadID(tgid))
   447  	} else {
   448  		targetTG = t.ThreadGroup()
   449  	}
   450  
   451  	if targetTG == nil {
   452  		return 0, linuxerr.EINVAL
   453  	}
   454  	return n, targetTG.MigrateCgroup(d.CgroupFromControlFileFD(fd))
   455  }
   456  
// tasksData is the data source of the "tasks" file that newCgroupInode
// creates in every cgroup directory. It embeds the cgroup the file belongs to.
//
// +stateify savable
type tasksData struct {
	*cgroupInode
}
   461  
   462  // Generate implements vfs.DynamicBytesSource.Generate.
   463  func (d *tasksData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   464  	t := kernel.TaskFromContext(ctx)
   465  	currPidns := t.ThreadGroup().PIDNamespace()
   466  
   467  	var pids []kernel.ThreadID
   468  
   469  	for _, task := range d.tasks() {
   470  		if pid := currPidns.IDOfTask(task); pid != 0 {
   471  			pids = append(pids, pid)
   472  		}
   473  	}
   474  	sortTIDs(pids)
   475  
   476  	for _, pid := range pids {
   477  		fmt.Fprintf(buf, "%d\n", pid)
   478  	}
   479  
   480  	return nil
   481  }
   482  
   483  // Write implements vfs.WritableDynamicBytesSource.Write.
   484  func (d *tasksData) Write(ctx context.Context, fd *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) {
   485  	tid, n, err := parseInt64FromString(ctx, src)
   486  	if err != nil {
   487  		return n, err
   488  	}
   489  
   490  	t := kernel.TaskFromContext(ctx)
   491  	currPidns := t.ThreadGroup().PIDNamespace()
   492  	var targetTask *kernel.Task
   493  	if tid != 0 {
   494  		targetTask = currPidns.TaskWithID(kernel.ThreadID(tid))
   495  	} else {
   496  		targetTask = t
   497  	}
   498  	if targetTask == nil {
   499  		return 0, linuxerr.EINVAL
   500  	}
   501  	return n, targetTask.MigrateCgroup(d.CgroupFromControlFileFD(fd))
   502  }
   503  
   504  // parseInt64FromString interprets src as string encoding a int64 value, and
   505  // returns the parsed value.
   506  func parseInt64FromString(ctx context.Context, src usermem.IOSequence) (val, len int64, err error) {
   507  	const maxInt64StrLen = 20 // i.e. len(fmt.Sprintf("%d", math.MinInt64)) == 20
   508  
   509  	buf := copyScratchBufferFromContext(ctx, maxInt64StrLen)
   510  	n, err := src.CopyIn(ctx, buf)
   511  	if err != nil {
   512  		return 0, int64(n), err
   513  	}
   514  	str := strings.TrimSpace(string(buf[:n]))
   515  
   516  	val, err = strconv.ParseInt(str, 10, 64)
   517  	if err != nil {
   518  		// Note: This also handles zero-len writes if offset is beyond the end
   519  		// of src, or src is empty.
   520  		ctx.Debugf("cgroupfs.parseInt64FromString: failed to parse %q: %v", str, err)
   521  		return 0, int64(n), linuxerr.EINVAL
   522  	}
   523  
   524  	return val, int64(n), nil
   525  }
   526  
   527  // copyScratchBufferFromContext returns a scratch buffer of the given size. It
   528  // tries to use the task's copy scratch buffer if we're on a task context,
   529  // otherwise it allocates a new buffer.
   530  func copyScratchBufferFromContext(ctx context.Context, size int) []byte {
   531  	t := kernel.TaskFromContext(ctx)
   532  	if t != nil {
   533  		return t.CopyScratchBuffer(hostarch.PageSize)
   534  	}
   535  	// Not on task context.
   536  	return make([]byte, hostarch.PageSize)
   537  }
   538  
// controllerStateless partially implements controller. It stubs the
// task-membership and migration methods with noops for a stateless
// controller.
type controllerStateless struct{}

// Enter implements controller.Enter. It is a noop.
func (*controllerStateless) Enter(t *kernel.Task) {}

// Leave implements controller.Leave. It is a noop.
func (*controllerStateless) Leave(t *kernel.Task) {}

// PrepareMigrate implements controller.PrepareMigrate. It never rejects a
// migration and always returns nil.
func (*controllerStateless) PrepareMigrate(t *kernel.Task, src controller) error {
	return nil
}

// CommitMigrate implements controller.CommitMigrate. It is a noop.
func (*controllerStateless) CommitMigrate(t *kernel.Task, src controller) {}

// AbortMigrate implements controller.AbortMigrate. It is a noop.
func (*controllerStateless) AbortMigrate(t *kernel.Task, src controller) {}
   559  
   560  // controllerNoResource partially implements controller. It stubs out the Charge
   561  // method for controllers that don't track resource usage through the charge
   562  // mechanism.
   563  type controllerNoResource struct{}
   564  
   565  // Charge implements controller.Charge.
   566  func (*controllerNoResource) Charge(t *kernel.Task, d *kernfs.Dentry, res kernel.CgroupResourceType, value int64) error {
   567  	panic(fmt.Sprintf("cgroupfs: Attempted to charge a controller with unknown resource %v for value %v", res, value))
   568  }