// Copyright 2021 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cgroupfs

import (
	"bytes"
	"fmt"
	"sort"
	"strconv"
	"strings"

	"github.com/metacubex/gvisor/pkg/abi/linux"
	"github.com/metacubex/gvisor/pkg/context"
	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
	"github.com/metacubex/gvisor/pkg/hostarch"
	"github.com/metacubex/gvisor/pkg/log"
	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/kernfs"
	"github.com/metacubex/gvisor/pkg/sentry/kernel"
	"github.com/metacubex/gvisor/pkg/sentry/kernel/auth"
	"github.com/metacubex/gvisor/pkg/sentry/vfs"
	"github.com/metacubex/gvisor/pkg/usermem"
)

// controllerCommon implements kernel.CgroupController.
//
// Must call init before use.
//
// +stateify savable
type controllerCommon struct {
	// ty is the controller type (e.g. cpu, memory). Set once by init or
	// cloneFromParent.
	ty kernel.CgroupControllerType
	// fs is the cgroupfs filesystem this controller belongs to. Set once by
	// init or cloneFromParent.
	fs *filesystem
	// parent is the parent controller if any. Immutable.
	//
	// Note that we don't have to update this on renames, since cgroup
	// directories can't be moved to a different parent directory.
	parent controller
}

// init initializes a controller attached directly to a hierarchy root, with
// no parent controller.
func (c *controllerCommon) init(ty kernel.CgroupControllerType, fs *filesystem) {
	c.ty = ty
	c.fs = fs
}

// cloneFromParent initializes a child cgroup's controller from its parent,
// inheriting the type and filesystem and recording the parent itself.
func (c *controllerCommon) cloneFromParent(parent controller) {
	c.ty = parent.Type()
	c.fs = parent.Filesystem()
	c.parent = parent
}

// Filesystem implements controller.Filesystem.
func (c *controllerCommon) Filesystem() *filesystem {
	return c.fs
}

// Type implements kernel.CgroupController.Type.
func (c *controllerCommon) Type() kernel.CgroupControllerType {
	return kernel.CgroupControllerType(c.ty)
}

// HierarchyID implements kernel.CgroupController.HierarchyID.
func (c *controllerCommon) HierarchyID() uint32 {
	return c.fs.hierarchyID
}

// NumCgroups implements kernel.CgroupController.NumCgroups.
func (c *controllerCommon) NumCgroups() uint64 {
	return c.fs.numCgroups.Load()
}

// Enabled implements kernel.CgroupController.Enabled.
//
// Controllers are currently always enabled.
func (c *controllerCommon) Enabled() bool {
	return true
}

// EffectiveRootCgroup implements kernel.CgroupController.EffectiveRootCgroup.
func (c *controllerCommon) EffectiveRootCgroup() kernel.Cgroup {
	return c.fs.effectiveRootCgroup()
}

// controller is an interface for common functionality related to all cgroups.
// It is an extension of the public cgroup interface, containing cgroup
// functionality private to cgroupfs.
type controller interface {
	kernel.CgroupController

	// Filesystem returns the cgroupfs filesystem backing this controller.
	Filesystem() *filesystem

	// Clone creates a new controller based on the internal state of this
	// controller. This is used to initialize a sub-cgroup based on the state of
	// the parent.
	Clone() controller

	// AddControlFiles should extend the contents map with inodes representing
	// control files defined by this controller.
	AddControlFiles(ctx context.Context, creds *auth.Credentials, c *cgroupInode, contents map[string]kernfs.Inode)

	// Enter is called when a task initially moves into a cgroup. This is
	// distinct from migration because the task isn't migrating away from a
	// cgroup. Enter is called when a task is created and joins its initial
	// cgroup, or when cgroupfs is mounted and existing tasks are moved into
	// cgroups.
	Enter(t *kernel.Task)

	// Leave is called when a task leaves a cgroup. This is distinct from
	// migration because the task isn't migrating to another cgroup. Leave is
	// called when a task exits.
	Leave(t *kernel.Task)

	// PrepareMigrate signals the controller that a migration is about to
	// happen. The controller should check for any conditions that would prevent
	// the migration. If PrepareMigrate succeeds, the controller must
	// unconditionally either accept the migration via CommitMigrate, or roll it
	// back via AbortMigrate.
	//
	// Postcondition: If PrepareMigrate returns nil, caller must resolve the
	// migration by calling either CommitMigrate or AbortMigrate.
	PrepareMigrate(t *kernel.Task, src controller) error

	// CommitMigrate completes an in-flight migration.
	//
	// Precondition: Caller must call a corresponding PrepareMigrate.
	CommitMigrate(t *kernel.Task, src controller)

	// AbortMigrate cancels an in-flight migration.
	//
	// Precondition: Caller must call a corresponding PrepareMigrate.
	AbortMigrate(t *kernel.Task, src controller)

	// Charge charges a controller for a particular resource. The implementation
	// should panic if passed a resource type they do not control.
	Charge(t *kernel.Task, d *kernfs.Dentry, res kernel.CgroupResourceType, value int64) error
}

// cgroupInode implements kernel.CgroupImpl and kernfs.Inode.
//
// +stateify savable
type cgroupInode struct {
	dir

	// id is the id of this cgroup.
	id uint32

	// controllers is the set of controllers for this cgroup. This is used to
	// store controller-specific state per cgroup. The set of controllers should
	// match the controllers for this hierarchy as tracked by the filesystem
	// object. Immutable.
	controllers map[kernel.CgroupControllerType]controller

	// ts is the list of tasks in this cgroup. The kernel is responsible for
	// removing tasks from this list before they're destroyed, so any tasks on
	// this list are always valid.
	//
	// ts, and cgroup membership in general is protected by fs.tasksMu.
	ts map[*kernel.Task]struct{}
}

var _ kernel.CgroupImpl = (*cgroupInode)(nil)

// newCgroupInode creates a new cgroup directory inode. If parent is non-nil
// the new cgroup clones its controllers from the parent; otherwise this is
// the hierarchy root and the filesystem's controllers are used directly.
func (fs *filesystem) newCgroupInode(ctx context.Context, creds *auth.Credentials, parent *cgroupInode, mode linux.FileMode) kernfs.Inode {
	c := &cgroupInode{
		dir:         dir{fs: fs},
		ts:          make(map[*kernel.Task]struct{}),
		controllers: make(map[kernel.CgroupControllerType]controller),
	}
	c.dir.cgi = c

	k := kernel.KernelFromContext(ctx)
	r := k.CgroupRegistry()
	// Assign id for the cgroup.
	cid, err := r.NextCgroupID()
	if err != nil {
		// NOTE(review): on failure we only log and still register the cgroup
		// below with a zero id — confirm this best-effort behavior is
		// intended.
		log.Warningf("cgroupfs newCgroupInode: Failed to assign id to the cgroup: %v", err)
	}
	c.id = cid
	r.AddCgroup(c)

	// Every cgroup directory carries these two control files, plus whatever
	// control files each controller contributes below.
	contents := make(map[string]kernfs.Inode)
	contents["cgroup.procs"] = fs.newControllerWritableFile(ctx, creds, &cgroupProcsData{c}, false)
	contents["tasks"] = fs.newControllerWritableFile(ctx, creds, &tasksData{c}, false)

	if parent != nil {
		// Child cgroup: each controller is cloned from the parent's so it can
		// carry per-cgroup state.
		for ty, ctl := range parent.controllers {
			new := ctl.Clone()
			c.controllers[ty] = new
			new.AddControlFiles(ctx, creds, c, contents)
		}
	} else {
		for _, ctl := range fs.controllers {
			// Uniqueness of controllers enforced by the filesystem on
			// creation. The root cgroup uses the controllers directly from the
			// filesystem.
			c.controllers[ctl.Type()] = ctl
			ctl.AddControlFiles(ctx, creds, c, contents)
		}
	}

	c.dir.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), mode)
	c.dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true})
	c.dir.IncLinks(c.dir.OrderedChildren.Populate(contents))

	fs.numCgroups.Add(1)

	return c
}

// HierarchyID implements kernel.CgroupImpl.HierarchyID.
func (c *cgroupInode) HierarchyID() uint32 {
	return c.fs.hierarchyID
}

// Name implements kernel.CgroupImpl.Name.
func (c *cgroupInode) Name() string {
	return c.fs.hierarchyName
}

// Controllers implements kernel.CgroupImpl.Controllers.
func (c *cgroupInode) Controllers() []kernel.CgroupController {
	return c.fs.kcontrollers
}

// tasks returns a snapshot of the tasks inside the cgroup.
func (c *cgroupInode) tasks() []*kernel.Task {
	c.fs.tasksMu.RLock()
	defer c.fs.tasksMu.RUnlock()

	ts := make([]*kernel.Task, 0, len(c.ts))
	for t := range c.ts {
		ts = append(ts, t)
	}
	return ts
}

// Enter implements kernel.CgroupImpl.Enter.
func (c *cgroupInode) Enter(t *kernel.Task) {
	c.fs.tasksMu.Lock()
	defer c.fs.tasksMu.Unlock()

	c.ts[t] = struct{}{}
	for _, ctl := range c.controllers {
		ctl.Enter(t)
	}
}

// Leave implements kernel.CgroupImpl.Leave.
func (c *cgroupInode) Leave(t *kernel.Task) {
	c.fs.tasksMu.Lock()
	defer c.fs.tasksMu.Unlock()

	for _, ctl := range c.controllers {
		ctl.Leave(t)
	}
	delete(c.ts, t)
}

// PrepareMigrate implements kernel.CgroupImpl.PrepareMigrate.
func (c *cgroupInode) PrepareMigrate(t *kernel.Task, src *kernel.Cgroup) error {
	// prepared tracks the source controllers whose PrepareMigrate succeeded,
	// so a failure part-way through can abort exactly those and no others.
	prepared := make([]controller, 0, len(c.controllers))
	rollback := func() {
		for _, p := range prepared {
			c.controllers[p.Type()].AbortMigrate(t, p)
		}
	}

	for srcType, srcCtl := range src.CgroupImpl.(*cgroupInode).controllers {
		ctl := c.controllers[srcType]
		if err := ctl.PrepareMigrate(t, srcCtl); err != nil {
			rollback()
			return err
		}
		prepared = append(prepared, srcCtl)
	}
	return nil
}

// CommitMigrate implements kernel.CgroupImpl.CommitMigrate.
func (c *cgroupInode) CommitMigrate(t *kernel.Task, src *kernel.Cgroup) {
	c.fs.tasksMu.Lock()
	defer c.fs.tasksMu.Unlock()

	for srcType, srcCtl := range src.CgroupImpl.(*cgroupInode).controllers {
		c.controllers[srcType].CommitMigrate(t, srcCtl)
	}

	// Move the task's membership from the source cgroup's task set to this
	// cgroup's, under fs.tasksMu.
	srcI := src.CgroupImpl.(*cgroupInode)
	delete(srcI.ts, t)
	c.ts[t] = struct{}{}
}

// AbortMigrate implements kernel.CgroupImpl.AbortMigrate.
func (c *cgroupInode) AbortMigrate(t *kernel.Task, src *kernel.Cgroup) {
	for srcType, srcCtl := range src.CgroupImpl.(*cgroupInode).controllers {
		c.controllers[srcType].AbortMigrate(t, srcCtl)
	}
}

// CgroupFromControlFileFD returns a cgroup object given a control file FD for the cgroup.
func (c *cgroupInode) CgroupFromControlFileFD(fd *vfs.FileDescription) kernel.Cgroup {
	controlFileDentry := fd.Dentry().Impl().(*kernfs.Dentry)
	// The returned parent dentry remains valid without holding locks because in
	// cgroupfs, the parent directory relationship of a control file is
	// effectively immutable. Control files cannot be unlinked, renamed or
	// destroyed independently from their parent directory.
	parentD := controlFileDentry.Parent()
	return kernel.Cgroup{
		Dentry:     parentD,
		CgroupImpl: c,
	}
}

// Charge implements kernel.CgroupImpl.Charge.
326 // 327 // Charge notifies a matching controller of a change in resource usage. Due to 328 // the uniqueness of controllers, at most one controller will match. If no 329 // matching controller is present in this directory, the call silently 330 // succeeds. The caller should call Charge on all hierarchies to ensure any 331 // matching controller across the entire system is charged. 332 func (c *cgroupInode) Charge(t *kernel.Task, d *kernfs.Dentry, ctlType kernel.CgroupControllerType, res kernel.CgroupResourceType, value int64) error { 333 c.fs.tasksMu.RLock() 334 defer c.fs.tasksMu.RUnlock() 335 if ctl, ok := c.controllers[ctlType]; ok { 336 return ctl.Charge(t, d, res, value) 337 } 338 return nil 339 } 340 341 // ReadControl implements kernel.CgroupImpl.ReadControl. 342 func (c *cgroupInode) ReadControl(ctx context.Context, name string) (string, error) { 343 cfi, err := c.Lookup(ctx, name) 344 if err != nil { 345 return "", fmt.Errorf("no such control file") 346 } 347 cbf, ok := cfi.(controllerFileImpl) 348 if !ok { 349 return "", fmt.Errorf("no such control file") 350 } 351 if !cbf.AllowBackgroundAccess() { 352 return "", fmt.Errorf("this control may not be accessed from a background context") 353 } 354 355 var buf bytes.Buffer 356 err = cbf.Source().Data().Generate(ctx, &buf) 357 return buf.String(), err 358 } 359 360 // WriteControl implements kernel.CgroupImpl.WriteControl. 361 func (c *cgroupInode) WriteControl(ctx context.Context, name string, value string) error { 362 cfi, err := c.Lookup(ctx, name) 363 if err != nil { 364 return fmt.Errorf("no such control file") 365 } 366 // Do the more general cast first so we can give a meaningful error message when 367 // the control file exists, but isn't accessible (either due to being 368 // unwritable, or not being available from a background context). 
369 cbf, ok := cfi.(controllerFileImpl) 370 if !ok { 371 return fmt.Errorf("no such control file") 372 } 373 if !cbf.AllowBackgroundAccess() { 374 return fmt.Errorf("this control may not be accessed from a background context") 375 } 376 wcbf, ok := cfi.(writableControllerFileImpl) 377 if !ok { 378 return fmt.Errorf("control file not writable") 379 } 380 381 ioSeq := usermem.BytesIOSequence([]byte(value)) 382 n, err := wcbf.WriteBackground(ctx, ioSeq) 383 if err != nil { 384 return err 385 } 386 if n != int64(len(value)) { 387 return fmt.Errorf("short write") 388 } 389 390 return nil 391 } 392 393 // ID implements kernel.CgroupImpl.ID. 394 func (c *cgroupInode) ID() uint32 { 395 return c.id 396 } 397 398 func sortTIDs(tids []kernel.ThreadID) { 399 sort.Slice(tids, func(i, j int) bool { return tids[i] < tids[j] }) 400 } 401 402 // +stateify savable 403 type cgroupProcsData struct { 404 *cgroupInode 405 } 406 407 // Generate implements vfs.DynamicBytesSource.Generate. 408 func (d *cgroupProcsData) Generate(ctx context.Context, buf *bytes.Buffer) error { 409 t := kernel.TaskFromContext(ctx) 410 currPidns := t.ThreadGroup().PIDNamespace() 411 412 pgids := make(map[kernel.ThreadID]struct{}) 413 414 for _, task := range d.tasks() { 415 // Map dedups pgid, since iterating over all tasks produces multiple 416 // entries for the group leaders. 417 if pgid := currPidns.IDOfThreadGroup(task.ThreadGroup()); pgid != 0 { 418 pgids[pgid] = struct{}{} 419 } 420 } 421 422 pgidList := make([]kernel.ThreadID, 0, len(pgids)) 423 for pgid := range pgids { 424 pgidList = append(pgidList, pgid) 425 } 426 sortTIDs(pgidList) 427 428 for _, pgid := range pgidList { 429 fmt.Fprintf(buf, "%d\n", pgid) 430 } 431 432 return nil 433 } 434 435 // Write implements vfs.WritableDynamicBytesSource.Write. 
436 func (d *cgroupProcsData) Write(ctx context.Context, fd *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { 437 tgid, n, err := parseInt64FromString(ctx, src) 438 if err != nil { 439 return n, err 440 } 441 442 t := kernel.TaskFromContext(ctx) 443 currPidns := t.ThreadGroup().PIDNamespace() 444 var targetTG *kernel.ThreadGroup 445 if tgid != 0 { 446 targetTG = currPidns.ThreadGroupWithID(kernel.ThreadID(tgid)) 447 } else { 448 targetTG = t.ThreadGroup() 449 } 450 451 if targetTG == nil { 452 return 0, linuxerr.EINVAL 453 } 454 return n, targetTG.MigrateCgroup(d.CgroupFromControlFileFD(fd)) 455 } 456 457 // +stateify savable 458 type tasksData struct { 459 *cgroupInode 460 } 461 462 // Generate implements vfs.DynamicBytesSource.Generate. 463 func (d *tasksData) Generate(ctx context.Context, buf *bytes.Buffer) error { 464 t := kernel.TaskFromContext(ctx) 465 currPidns := t.ThreadGroup().PIDNamespace() 466 467 var pids []kernel.ThreadID 468 469 for _, task := range d.tasks() { 470 if pid := currPidns.IDOfTask(task); pid != 0 { 471 pids = append(pids, pid) 472 } 473 } 474 sortTIDs(pids) 475 476 for _, pid := range pids { 477 fmt.Fprintf(buf, "%d\n", pid) 478 } 479 480 return nil 481 } 482 483 // Write implements vfs.WritableDynamicBytesSource.Write. 
484 func (d *tasksData) Write(ctx context.Context, fd *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { 485 tid, n, err := parseInt64FromString(ctx, src) 486 if err != nil { 487 return n, err 488 } 489 490 t := kernel.TaskFromContext(ctx) 491 currPidns := t.ThreadGroup().PIDNamespace() 492 var targetTask *kernel.Task 493 if tid != 0 { 494 targetTask = currPidns.TaskWithID(kernel.ThreadID(tid)) 495 } else { 496 targetTask = t 497 } 498 if targetTask == nil { 499 return 0, linuxerr.EINVAL 500 } 501 return n, targetTask.MigrateCgroup(d.CgroupFromControlFileFD(fd)) 502 } 503 504 // parseInt64FromString interprets src as string encoding a int64 value, and 505 // returns the parsed value. 506 func parseInt64FromString(ctx context.Context, src usermem.IOSequence) (val, len int64, err error) { 507 const maxInt64StrLen = 20 // i.e. len(fmt.Sprintf("%d", math.MinInt64)) == 20 508 509 buf := copyScratchBufferFromContext(ctx, maxInt64StrLen) 510 n, err := src.CopyIn(ctx, buf) 511 if err != nil { 512 return 0, int64(n), err 513 } 514 str := strings.TrimSpace(string(buf[:n])) 515 516 val, err = strconv.ParseInt(str, 10, 64) 517 if err != nil { 518 // Note: This also handles zero-len writes if offset is beyond the end 519 // of src, or src is empty. 520 ctx.Debugf("cgroupfs.parseInt64FromString: failed to parse %q: %v", str, err) 521 return 0, int64(n), linuxerr.EINVAL 522 } 523 524 return val, int64(n), nil 525 } 526 527 // copyScratchBufferFromContext returns a scratch buffer of the given size. It 528 // tries to use the task's copy scratch buffer if we're on a task context, 529 // otherwise it allocates a new buffer. 530 func copyScratchBufferFromContext(ctx context.Context, size int) []byte { 531 t := kernel.TaskFromContext(ctx) 532 if t != nil { 533 return t.CopyScratchBuffer(hostarch.PageSize) 534 } 535 // Not on task context. 536 return make([]byte, hostarch.PageSize) 537 } 538 539 // controllerStateless partially implements controller. 
It stubs the migration 540 // methods with noops for a stateless controller. 541 type controllerStateless struct{} 542 543 // Enter implements controller.Enter. 544 func (*controllerStateless) Enter(t *kernel.Task) {} 545 546 // Leave implements controller.Leave. 547 func (*controllerStateless) Leave(t *kernel.Task) {} 548 549 // PrepareMigrate implements controller.PrepareMigrate. 550 func (*controllerStateless) PrepareMigrate(t *kernel.Task, src controller) error { 551 return nil 552 } 553 554 // CommitMigrate implements controller.CommitMigrate. 555 func (*controllerStateless) CommitMigrate(t *kernel.Task, src controller) {} 556 557 // AbortMigrate implements controller.AbortMigrate. 558 func (*controllerStateless) AbortMigrate(t *kernel.Task, src controller) {} 559 560 // controllerNoResource partially implements controller. It stubs out the Charge 561 // method for controllers that don't track resource usage through the charge 562 // mechanism. 563 type controllerNoResource struct{} 564 565 // Charge implements controller.Charge. 566 func (*controllerNoResource) Charge(t *kernel.Task, d *kernfs.Dentry, res kernel.CgroupResourceType, value int64) error { 567 panic(fmt.Sprintf("cgroupfs: Attempted to charge a controller with unknown resource %v for value %v", res, value)) 568 }