gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/kernel/task_cgroup.go (about)

     1  // Copyright 2021 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	"bytes"
    19  	"fmt"
    20  	"sort"
    21  	"strings"
    22  
    23  	"gvisor.dev/gvisor/pkg/errors/linuxerr"
    24  	"gvisor.dev/gvisor/pkg/log"
    25  )
    26  
    27  // EnterInitialCgroups moves t into an initial set of cgroups.
    28  // If initCgroups is not nil, the new task will be placed in the specified cgroups.
    29  // Otherwise, if parent is not nil, the new task will be placed in the parent's cgroups.
    30  // If neither is specified, the new task will be in the root cgroups.
    31  //
    32  // This is analogous to Linux's kernel/cgroup/cgroup.c:cgroup_css_set_fork().
    33  //
    34  // Precondition: t isn't in any cgroups yet, t.cgroups is empty.
    35  func (t *Task) EnterInitialCgroups(parent *Task, initCgroups map[Cgroup]struct{}) {
    36  	var inherit map[Cgroup]struct{}
    37  	if initCgroups != nil {
    38  		inherit = initCgroups
    39  	} else if parent != nil {
    40  		parent.mu.Lock()
    41  		defer parent.mu.Unlock()
    42  		inherit = parent.cgroups
    43  	}
    44  	joinSet := t.k.cgroupRegistry.computeInitialGroups(inherit)
    45  
    46  	t.mu.NestedLock(taskLockChild)
    47  	defer t.mu.NestedUnlock(taskLockChild)
    48  	// Transfer ownership of joinSet refs to the task's cgset.
    49  	t.cgroups = joinSet
    50  	for c := range t.cgroups {
    51  		// Since t isn't in any cgroup yet, we can skip the check against
    52  		// existing cgroups.
    53  		c.Enter(t)
    54  		t.SetMemCgIDFromCgroup(c)
    55  	}
    56  }
    57  
// SetMemCgID sets the given memory cgroup id to the task.
//
// The id is written with t.memCgID.Store; no task lock is taken here.
func (t *Task) SetMemCgID(memCgID uint32) {
	t.memCgID.Store(memCgID)
}
    62  
    63  // SetMemCgIDFromCgroup sets the id of the given memory cgroup to the task.
    64  func (t *Task) SetMemCgIDFromCgroup(cg Cgroup) {
    65  	for _, ctl := range cg.Controllers() {
    66  		if ctl.Type() == CgroupControllerMemory {
    67  			t.SetMemCgID(cg.ID())
    68  			return
    69  		}
    70  	}
    71  }
    72  
    73  // ResetMemCgIDFromCgroup sets the memory cgroup id to zero, if the task has
    74  // a memory cgroup.
    75  func (t *Task) ResetMemCgIDFromCgroup(cg Cgroup) {
    76  	for _, ctl := range cg.Controllers() {
    77  		if ctl.Type() == CgroupControllerMemory {
    78  			t.SetMemCgID(0)
    79  			return
    80  		}
    81  	}
    82  }
    83  
    84  // EnterCgroup moves t into c.
    85  func (t *Task) EnterCgroup(c Cgroup) error {
    86  	newControllers := make(map[CgroupControllerType]struct{})
    87  	for _, ctl := range c.Controllers() {
    88  		newControllers[ctl.Type()] = struct{}{}
    89  	}
    90  
    91  	t.mu.Lock()
    92  	defer t.mu.Unlock()
    93  
    94  	for oldCG := range t.cgroups {
    95  		if oldCG.HierarchyID() == c.HierarchyID() {
    96  			log.Warningf("Cannot enter new cgroup %v due to conflicting controllers. Try migrate instead?", c)
    97  			return linuxerr.EBUSY
    98  		}
    99  	}
   100  
   101  	// No migration required.
   102  	t.enterCgroupLocked(c)
   103  
   104  	return nil
   105  }
   106  
// enterCgroupLocked adds c to t's cgroup set: it takes a reference on c,
// records c in t.cgroups, notifies c via c.Enter(t), and updates the task's
// memory cgroup id if c carries the memory controller.
//
// It does not check whether t is already a member of c or of another cgroup
// on the same hierarchy; callers are responsible for that.
//
// +checklocks:t.mu
func (t *Task) enterCgroupLocked(c Cgroup) {
	// The reference taken here is the one owned by the t.cgroups entry.
	c.IncRef()
	t.cgroups[c] = struct{}{}
	c.Enter(t)
	t.SetMemCgIDFromCgroup(c)
}
   114  
   115  // +checklocks:t.mu
   116  func (t *Task) enterCgroupIfNotYetLocked(c Cgroup) {
   117  	if _, ok := t.cgroups[c]; ok {
   118  		return
   119  	}
   120  	t.enterCgroupLocked(c)
   121  }
   122  
// LeaveCgroups removes t out from all its cgroups.
//
// The task's cgroup set is detached and every cgroup in it is told that t has
// left, all while holding both the TaskSet mutex (for writing) and t.mu. The
// task's memory cgroup id is reset to zero. The references the set held are
// dropped after both locks have been released.
func (t *Task) LeaveCgroups() {
	t.tg.pidns.owner.mu.Lock() // Prevent migration.
	t.mu.Lock()
	// Detach the set so that t no longer reports any membership; cgs keeps
	// the entries (and their references) alive for the cleanup below.
	cgs := t.cgroups
	t.cgroups = nil
	for c := range cgs {
		c.Leave(t)
	}
	t.SetMemCgID(0)
	t.mu.Unlock()
	t.tg.pidns.owner.mu.Unlock()

	// NOTE(review): references are released outside of both locks,
	// presumably because dropping a last reference may do work that must not
	// run under them — confirm against Cgroup.decRef.
	for c := range cgs {
		c.decRef()
	}
}
   140  
   141  // +checklocks:t.mu
   142  func (t *Task) findCgroupWithMatchingHierarchyLocked(other Cgroup) (Cgroup, bool) {
   143  	for c := range t.cgroups {
   144  		if c.HierarchyID() != other.HierarchyID() {
   145  			continue
   146  		}
   147  		return c, true
   148  	}
   149  	return Cgroup{}, false
   150  }
   151  
   152  // CgroupPrepareMigrate starts a cgroup migration for this task to dst. The
   153  // migration must be completed through the returned context.
   154  func (t *Task) CgroupPrepareMigrate(dst Cgroup) (*CgroupMigrationContext, error) {
   155  	t.mu.Lock()
   156  	defer t.mu.Unlock()
   157  	src, found := t.findCgroupWithMatchingHierarchyLocked(dst)
   158  	if !found {
   159  		log.Warningf("Cannot migrate to cgroup %v since task %v not currently in target hierarchy %v", dst, t, dst.HierarchyID())
   160  		return nil, linuxerr.EINVAL
   161  	}
   162  	if err := dst.PrepareMigrate(t, &src); err != nil {
   163  		return nil, err
   164  	}
   165  	return &CgroupMigrationContext{
   166  		src: src,
   167  		dst: dst,
   168  		t:   t,
   169  	}, nil
   170  }
   171  
   172  // MigrateCgroup migrates all tasks in tg to the dst cgroup. Either all tasks
   173  // are migrated, or none are. Atomicity of migrations wrt cgroup membership
   174  // (i.e. a task can't switch cgroups mid-migration due to another migration) is
   175  // guaranteed because migrations are serialized by TaskSet.mu.
   176  func (tg *ThreadGroup) MigrateCgroup(dst Cgroup) error {
   177  	tg.pidns.owner.mu.RLock()
   178  	defer tg.pidns.owner.mu.RUnlock()
   179  
   180  	var ctxs []*CgroupMigrationContext
   181  
   182  	// Prepare migrations. On partial failure, abort.
   183  	for t := tg.tasks.Front(); t != nil; t = t.Next() {
   184  		ctx, err := t.CgroupPrepareMigrate(dst)
   185  		if err != nil {
   186  			// Rollback.
   187  			for _, ctx := range ctxs {
   188  				ctx.Abort()
   189  			}
   190  			return err
   191  		}
   192  		ctxs = append(ctxs, ctx)
   193  	}
   194  
   195  	// All migrations are now guaranteed to succeed.
   196  
   197  	for _, ctx := range ctxs {
   198  		ctx.Commit()
   199  	}
   200  
   201  	return nil
   202  }
   203  
   204  // MigrateCgroup migrates this task to the dst cgroup.
   205  func (t *Task) MigrateCgroup(dst Cgroup) error {
   206  	t.tg.pidns.owner.mu.RLock()
   207  	defer t.tg.pidns.owner.mu.RUnlock()
   208  
   209  	ctx, err := t.CgroupPrepareMigrate(dst)
   210  	if err != nil {
   211  		return err
   212  	}
   213  	ctx.Commit()
   214  	return nil
   215  }
   216  
// TaskCgroupEntry represents a line in /proc/<pid>/cgroup, and is used to
// format a cgroup for display.
type TaskCgroupEntry struct {
	// HierarchyID is the ID of the hierarchy the cgroup belongs to.
	HierarchyID uint32 `json:"hierarchy_id"`
	// Controllers is a comma-separated list: an optional "name=<name>"
	// element for a named hierarchy, followed by the controller types
	// attached to the hierarchy.
	Controllers string `json:"controllers,omitempty"`
	// Path is the cgroup's path as reported by Cgroup.Path.
	Path        string `json:"path,omitempty"`
}
   224  
   225  // GetCgroupEntries generates the contents of /proc/<pid>/cgroup as
   226  // a TaskCgroupEntry array.
   227  func (t *Task) GetCgroupEntries() []TaskCgroupEntry {
   228  	t.mu.Lock()
   229  	defer t.mu.Unlock()
   230  
   231  	cgEntries := make([]TaskCgroupEntry, 0, len(t.cgroups))
   232  	for c := range t.cgroups {
   233  		ctls := c.Controllers()
   234  		ctlNames := make([]string, 0, len(ctls))
   235  
   236  		// We're guaranteed to have a valid name, a non-empty controller list,
   237  		// or both.
   238  
   239  		// Explicit hierarchy name, if any.
   240  		if name := c.Name(); name != "" {
   241  			ctlNames = append(ctlNames, fmt.Sprintf("name=%s", name))
   242  		}
   243  
   244  		// Controllers attached to this hierarchy, if any.
   245  		for _, ctl := range ctls {
   246  			ctlNames = append(ctlNames, string(ctl.Type()))
   247  		}
   248  
   249  		cgEntries = append(cgEntries, TaskCgroupEntry{
   250  			HierarchyID: c.HierarchyID(),
   251  			Controllers: strings.Join(ctlNames, ","),
   252  			Path:        c.Path(),
   253  		})
   254  	}
   255  
   256  	sort.Slice(cgEntries, func(i, j int) bool { return cgEntries[i].HierarchyID > cgEntries[j].HierarchyID })
   257  	return cgEntries
   258  }
   259  
   260  // GenerateProcTaskCgroup writes the contents of /proc/<pid>/cgroup for t to buf.
   261  func (t *Task) GenerateProcTaskCgroup(buf *bytes.Buffer) {
   262  	cgEntries := t.GetCgroupEntries()
   263  	for _, cgE := range cgEntries {
   264  		fmt.Fprintf(buf, "%d:%s:%s\n", cgE.HierarchyID, cgE.Controllers, cgE.Path)
   265  	}
   266  }
   267  
   268  // +checklocks:t.mu
   269  func (t *Task) chargeLocked(target *Task, ctl CgroupControllerType, res CgroupResourceType, value int64) (bool, Cgroup, error) {
   270  	// Due to the uniqueness of controllers on hierarchies, at most one cgroup
   271  	// in t.cgroups will match.
   272  	for c := range t.cgroups {
   273  		err := c.Charge(target, c.Dentry, ctl, res, value)
   274  		if err == nil {
   275  			c.IncRef()
   276  		}
   277  		return err == nil, c, err
   278  	}
   279  	return false, Cgroup{}, nil
   280  }
   281  
   282  // ChargeFor charges t's cgroup on behalf of some other task. Returns
   283  // the cgroup that's charged if any. Returned cgroup has an extra ref
   284  // that's transferred to the caller.
   285  func (t *Task) ChargeFor(other *Task, ctl CgroupControllerType, res CgroupResourceType, value int64) (bool, Cgroup, error) {
   286  	t.mu.Lock()
   287  	defer t.mu.Unlock()
   288  	return t.chargeLocked(other, ctl, res, value)
   289  }