github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/kernel/cgroup.go (about) 1 // Copyright 2021 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package kernel 16 17 import ( 18 "bytes" 19 "fmt" 20 "sort" 21 22 "github.com/metacubex/gvisor/pkg/atomicbitops" 23 "github.com/metacubex/gvisor/pkg/context" 24 "github.com/metacubex/gvisor/pkg/errors/linuxerr" 25 "github.com/metacubex/gvisor/pkg/fspath" 26 "github.com/metacubex/gvisor/pkg/log" 27 "github.com/metacubex/gvisor/pkg/sentry/fsimpl/kernfs" 28 "github.com/metacubex/gvisor/pkg/sentry/vfs" 29 ) 30 31 // InvalidCgroupHierarchyID indicates an uninitialized hierarchy ID. 32 const InvalidCgroupHierarchyID uint32 = 0 33 34 // InvalidCgroupID indicates an uninitialized cgroup ID. 35 const InvalidCgroupID uint32 = 0 36 37 // CgroupControllerType is the name of a cgroup controller. 38 type CgroupControllerType string 39 40 // Available cgroup controllers. 41 const ( 42 CgroupControllerCPU = CgroupControllerType("cpu") 43 CgroupControllerCPUAcct = CgroupControllerType("cpuacct") 44 CgroupControllerCPUSet = CgroupControllerType("cpuset") 45 CgroupControllerDevices = CgroupControllerType("devices") 46 CgroupControllerJob = CgroupControllerType("job") 47 CgroupControllerMemory = CgroupControllerType("memory") 48 CgroupControllerPIDs = CgroupControllerType("pids") 49 ) 50 51 // CgroupCtrls is the list of cgroup controllers. 52 var CgroupCtrls = []CgroupControllerType{"cpu", "cpuacct", "cpuset", "devices", "job", "memory", "pids"} 53 54 // ParseCgroupController parses a string as a CgroupControllerType. 55 func ParseCgroupController(val string) (CgroupControllerType, error) { 56 switch val { 57 case "cpu": 58 return CgroupControllerCPU, nil 59 case "cpuacct": 60 return CgroupControllerCPUAcct, nil 61 case "cpuset": 62 return CgroupControllerCPUSet, nil 63 case "devices": 64 return CgroupControllerDevices, nil 65 case "job": 66 return CgroupControllerJob, nil 67 case "memory": 68 return CgroupControllerMemory, nil 69 case "pids": 70 return CgroupControllerPIDs, nil 71 default: 72 return "", fmt.Errorf("no such cgroup controller") 73 } 74 } 75 76 // CgroupResourceType represents a resource type tracked by a particular 77 // controller. 78 type CgroupResourceType int 79 80 // Resources for the cpuacct controller. 81 const ( 82 // CgroupResourcePID represents a charge for pids.current. 83 CgroupResourcePID CgroupResourceType = iota 84 ) 85 86 // CgroupController is the common interface to cgroup controllers available to 87 // the entire sentry. The controllers themselves are defined by cgroupfs. 88 // 89 // Callers of this interface are often unable access synchronization needed to 90 // ensure returned values remain valid. Some of values returned from this 91 // interface are thus snapshots in time, and may become stale. This is ok for 92 // many callers like procfs. 93 type CgroupController interface { 94 // Returns the type of this cgroup controller (ex "memory", "cpu"). Returned 95 // value is valid for the lifetime of the controller. 96 Type() CgroupControllerType 97 98 // Hierarchy returns the ID of the hierarchy this cgroup controller is 99 // attached to. Returned value is valid for the lifetime of the controller. 100 HierarchyID() uint32 101 102 // EffectiveRootCgroup returns the effective root cgroup for this 103 // controller. This is either the actual root of the underlying cgroupfs 104 // filesystem, or the override root configured at sandbox startup. Returned 105 // value is valid for the lifetime of the controller. 106 EffectiveRootCgroup() Cgroup 107 108 // NumCgroups returns the number of cgroups managed by this controller. 109 // Returned value is a snapshot in time. 110 NumCgroups() uint64 111 112 // Enabled returns whether this controller is enabled. Returned value is a 113 // snapshot in time. 114 Enabled() bool 115 } 116 117 // Cgroup represents a named pointer to a cgroup in cgroupfs. When a task enters 118 // a cgroup, it holds a reference on the underlying dentry pointing to the 119 // cgroup. 120 // 121 // +stateify savable 122 type Cgroup struct { 123 *kernfs.Dentry 124 CgroupImpl 125 } 126 127 // decRef drops a reference on the cgroup. This must happen outside a Task.mu 128 // critical section. 129 func (c *Cgroup) decRef() { 130 c.Dentry.DecRef(context.Background()) 131 } 132 133 // Path returns the absolute path of c, relative to its hierarchy root. 134 func (c *Cgroup) Path() string { 135 return c.FSLocalPath() 136 } 137 138 // Walk returns the cgroup at p, starting from c. 139 func (c *Cgroup) Walk(ctx context.Context, vfsObj *vfs.VirtualFilesystem, p fspath.Path) (Cgroup, error) { 140 d, err := c.Dentry.WalkDentryTree(ctx, vfsObj, p) 141 if err != nil { 142 return Cgroup{}, err 143 } 144 return Cgroup{ 145 Dentry: d, 146 CgroupImpl: d.Inode().(CgroupImpl), 147 }, nil 148 } 149 150 // CgroupMigrationContext represents an in-flight cgroup migration for 151 // a single task. 152 type CgroupMigrationContext struct { 153 src Cgroup 154 dst Cgroup 155 t *Task 156 } 157 158 // Abort cancels a migration. 159 func (ctx *CgroupMigrationContext) Abort() { 160 ctx.dst.AbortMigrate(ctx.t, &ctx.src) 161 } 162 163 // Commit completes a migration. 164 func (ctx *CgroupMigrationContext) Commit() { 165 ctx.dst.CommitMigrate(ctx.t, &ctx.src) 166 167 ctx.t.mu.Lock() 168 delete(ctx.t.cgroups, ctx.src) 169 ctx.src.DecRef(ctx.t) 170 ctx.dst.IncRef() 171 ctx.t.cgroups[ctx.dst] = struct{}{} 172 ctx.t.mu.Unlock() 173 } 174 175 // CgroupImpl is the common interface to cgroups. 176 type CgroupImpl interface { 177 // Controllers lists the controller associated with this cgroup. 178 Controllers() []CgroupController 179 180 // HierarchyID returns the id of the hierarchy that contains this cgroup. 181 HierarchyID() uint32 182 183 // Name returns the name for this cgroup, if any. If no name was provided 184 // when the hierarchy was created, returns "". 185 Name() string 186 187 // Enter moves t into this cgroup. 188 Enter(t *Task) 189 190 // Leave moves t out of this cgroup. 191 Leave(t *Task) 192 193 // PrepareMigrate initiates a migration of t from src to this cgroup. See 194 // cgroupfs.controller.PrepareMigrate. 195 PrepareMigrate(t *Task, src *Cgroup) error 196 197 // CommitMigrate completes an in-flight migration. See 198 // cgroupfs.controller.CommitMigrate. 199 CommitMigrate(t *Task, src *Cgroup) 200 201 // AbortMigrate cancels an in-flight migration. See 202 // cgroupfs.controller.AbortMigrate. 203 AbortMigrate(t *Task, src *Cgroup) 204 205 // Charge charges a controller in this cgroup for a particular resource. key 206 // must match a valid resource for the specified controller type. 207 // 208 // The implementer should silently succeed if no matching controllers are 209 // found. 210 // 211 // The underlying implementation will panic if passed an incompatible 212 // resource type for a given controller. 213 // 214 // See cgroupfs.controller.Charge. 215 Charge(t *Task, d *kernfs.Dentry, ctl CgroupControllerType, res CgroupResourceType, value int64) error 216 217 // ReadControlFromBackground allows a background context to read a cgroup's 218 // control values. 219 ReadControl(ctx context.Context, name string) (string, error) 220 221 // WriteControl allows a background context to write a cgroup's control 222 // values. 223 WriteControl(ctx context.Context, name string, val string) error 224 225 // ID returns the id of this cgroup. 226 ID() uint32 227 } 228 229 // hierarchy represents a cgroupfs filesystem instance, with a unique set of 230 // controllers attached to it. Multiple cgroupfs mounts may reference the same 231 // hierarchy. 232 // 233 // +stateify savable 234 type hierarchy struct { 235 id uint32 236 name string 237 // These are a subset of the controllers in CgroupRegistry.controllers, 238 // grouped here by hierarchy for convenient lookup. 239 controllers map[CgroupControllerType]CgroupController 240 // fs is not owned by hierarchy. The FS is responsible for unregistering the 241 // hierarchy on destruction, which removes this association. 242 fs *vfs.Filesystem 243 } 244 245 func (h *hierarchy) match(ctypes []CgroupControllerType) bool { 246 if len(ctypes) != len(h.controllers) { 247 return false 248 } 249 for _, ty := range ctypes { 250 if _, ok := h.controllers[ty]; !ok { 251 return false 252 } 253 } 254 return true 255 } 256 257 // cgroupFS is the public interface to cgroupfs. This lets the kernel package 258 // refer to cgroupfs.filesystem methods without directly depending on the 259 // cgroupfs package, which would lead to a circular dependency. 260 type cgroupFS interface { 261 // Returns the vfs.Filesystem for the cgroupfs. 262 VFSFilesystem() *vfs.Filesystem 263 264 // InitializeHierarchyID sets the hierarchy ID for this filesystem during 265 // filesystem creation. May only be called before the filesystem is visible 266 // to the vfs layer. 267 InitializeHierarchyID(hid uint32) 268 269 // RootCgroup returns the root cgroup of this instance. This returns the 270 // actual root, and ignores any overrides setting an effective root. 271 RootCgroup() Cgroup 272 } 273 274 // CgroupRegistry tracks the active set of cgroup controllers on the system. 275 // 276 // +stateify savable 277 type CgroupRegistry struct { 278 // lastHierarchyID is the id of the last allocated cgroup hierarchy. Valid 279 // ids are from 1 to math.MaxUint32. 280 // 281 lastHierarchyID atomicbitops.Uint32 282 283 // lastCgroupID is the id of the last allocated cgroup. Valid ids are 284 // from 1 to math.MaxUint32. 285 // 286 lastCgroupID atomicbitops.Uint32 287 288 mu cgroupMutex `state:"nosave"` 289 290 // controllers is the set of currently known cgroup controllers on the 291 // system. 292 // 293 // +checklocks:mu 294 controllers map[CgroupControllerType]CgroupController 295 296 // hierarchies is the active set of cgroup hierarchies. This contains all 297 // hierarchies on the system. 298 // 299 // +checklocks:mu 300 hierarchies map[uint32]hierarchy 301 302 // hierarchiesByName is a map of named hierarchies. Only named hierarchies 303 // are tracked on this map. 304 // 305 // +checklocks:mu 306 hierarchiesByName map[string]hierarchy 307 308 // cgroups is the active set of cgroups. This contains all the cgroups 309 // on the system. 310 // 311 // +checklocks:mu 312 cgroups map[uint32]CgroupImpl 313 } 314 315 func newCgroupRegistry() *CgroupRegistry { 316 return &CgroupRegistry{ 317 controllers: make(map[CgroupControllerType]CgroupController), 318 hierarchies: make(map[uint32]hierarchy), 319 hierarchiesByName: make(map[string]hierarchy), 320 cgroups: make(map[uint32]CgroupImpl), 321 } 322 } 323 324 // nextHierarchyID returns a newly allocated, unique hierarchy ID. 325 func (r *CgroupRegistry) nextHierarchyID() (uint32, error) { 326 if hid := r.lastHierarchyID.Add(1); hid != 0 { 327 return hid, nil 328 } 329 return InvalidCgroupHierarchyID, fmt.Errorf("cgroup hierarchy ID overflow") 330 } 331 332 // FindHierarchy returns a cgroup filesystem containing exactly the set of 333 // controllers named in ctypes, and optionally the name specified in name if it 334 // isn't empty. If no such FS is found, FindHierarchy return nil. FindHierarchy 335 // takes a reference on the returned FS, which is transferred to the caller. 336 func (r *CgroupRegistry) FindHierarchy(name string, ctypes []CgroupControllerType) (*vfs.Filesystem, error) { 337 r.mu.Lock() 338 defer r.mu.Unlock() 339 340 // If we have a hierarchy name, lookup by name. 341 if name != "" { 342 h, ok := r.hierarchiesByName[name] 343 if !ok { 344 // Name not found. 345 return nil, nil 346 } 347 348 if h.match(ctypes) { 349 if !h.fs.TryIncRef() { 350 // May be racing with filesystem destruction, see below. 351 r.unregisterLocked(h.id) 352 return nil, nil 353 } 354 return h.fs, nil 355 } 356 357 // Name matched, but controllers didn't. Fail per linux 358 // kernel/cgroup.c:cgroup_mount(). 359 log.Debugf("cgroupfs: Registry lookup for name=%s controllers=%v failed; named matched but controllers didn't (have controllers=%v)", name, ctypes, h.controllers) 360 return nil, linuxerr.EBUSY 361 } 362 363 for _, h := range r.hierarchies { 364 if h.match(ctypes) { 365 if !h.fs.TryIncRef() { 366 // Racing with filesystem destruction, namely h.fs.Release. 367 // Since we hold r.mu, we know the hierarchy hasn't been 368 // unregistered yet, but its associated filesystem is tearing 369 // down. 370 // 371 // If we simply indicate the hierarchy wasn't found without 372 // cleaning up the registry, the caller can race with the 373 // unregister and find itself temporarily unable to create a new 374 // hierarchy with a subset of the relevant controllers. 375 // 376 // To keep the result of FindHierarchy consistent with the 377 // uniqueness of controllers enforced by Register, drop the 378 // dying hierarchy now. The eventual unregister by the FS 379 // teardown will become a no-op. 380 r.unregisterLocked(h.id) 381 return nil, nil 382 } 383 return h.fs, nil 384 } 385 } 386 387 return nil, nil 388 } 389 390 // FindCgroup locates a cgroup with the given parameters. 391 // 392 // A cgroup is considered a match even if it contains other controllers on the 393 // same hierarchy. 394 func (r *CgroupRegistry) FindCgroup(ctx context.Context, ctype CgroupControllerType, path string) (Cgroup, error) { 395 p := fspath.Parse(path) 396 if !p.Absolute { 397 return Cgroup{}, fmt.Errorf("path must be absolute") 398 } 399 k := KernelFromContext(ctx) 400 vfsfs, err := r.FindHierarchy("", []CgroupControllerType{ctype}) 401 if err != nil { 402 return Cgroup{}, err 403 } 404 if vfsfs == nil { 405 return Cgroup{}, fmt.Errorf("controller not active") 406 } 407 defer vfsfs.DecRef(ctx) 408 409 rootCG := vfsfs.Impl().(cgroupFS).RootCgroup() 410 411 if !p.HasComponents() { 412 // Explicit root '/'. 413 return rootCG, nil 414 } 415 416 return rootCG.Walk(ctx, k.VFS(), p) 417 } 418 419 // Register registers the provided set of controllers with the registry as a new 420 // hierarchy. If any controller is already registered, the function returns an 421 // error without modifying the registry. Register sets the hierarchy ID for the 422 // filesystem on success. 423 func (r *CgroupRegistry) Register(name string, cs []CgroupController, fs cgroupFS) error { 424 r.mu.Lock() 425 defer r.mu.Unlock() 426 427 if name == "" && len(cs) == 0 { 428 return fmt.Errorf("can't register hierarchy with both no controllers and no name") 429 } 430 431 for _, c := range cs { 432 if _, ok := r.controllers[c.Type()]; ok { 433 return fmt.Errorf("controllers may only be mounted on a single hierarchy") 434 } 435 } 436 437 if _, ok := r.hierarchiesByName[name]; name != "" && ok { 438 return fmt.Errorf("hierarchy named %q already exists", name) 439 } 440 441 hid, err := r.nextHierarchyID() 442 if err != nil { 443 return err 444 } 445 446 // Must not fail below here, once we publish the hierarchy ID. 447 448 fs.InitializeHierarchyID(hid) 449 450 h := hierarchy{ 451 id: hid, 452 name: name, 453 controllers: make(map[CgroupControllerType]CgroupController), 454 fs: fs.VFSFilesystem(), 455 } 456 for _, c := range cs { 457 n := c.Type() 458 r.controllers[n] = c 459 h.controllers[n] = c 460 } 461 r.hierarchies[hid] = h 462 if name != "" { 463 r.hierarchiesByName[name] = h 464 } 465 return nil 466 } 467 468 // Unregister removes a previously registered hierarchy from the registry. If no 469 // such hierarchy is registered, Unregister is a no-op. 470 func (r *CgroupRegistry) Unregister(hid uint32) { 471 r.mu.Lock() 472 r.unregisterLocked(hid) 473 r.mu.Unlock() 474 } 475 476 // Precondition: Caller must hold r.mu. 477 // +checklocks:r.mu 478 func (r *CgroupRegistry) unregisterLocked(hid uint32) { 479 if h, ok := r.hierarchies[hid]; ok { 480 for name := range h.controllers { 481 delete(r.controllers, name) 482 } 483 delete(r.hierarchies, hid) 484 } 485 } 486 487 // computeInitialGroups takes a reference on each of the returned cgroups. The 488 // caller takes ownership of this returned reference. 489 func (r *CgroupRegistry) computeInitialGroups(inherit map[Cgroup]struct{}) map[Cgroup]struct{} { 490 r.mu.Lock() 491 defer r.mu.Unlock() 492 493 ctlSet := make(map[CgroupControllerType]CgroupController) 494 cgset := make(map[Cgroup]struct{}) 495 496 // Remember controllers from the inherited cgroups set... 497 for cg := range inherit { 498 cg.IncRef() // Ref transferred to caller. 499 for _, ctl := range cg.Controllers() { 500 ctlSet[ctl.Type()] = ctl 501 cgset[cg] = struct{}{} 502 } 503 } 504 505 // ... and add the root cgroups of all the missing controllers. 506 for name, ctl := range r.controllers { 507 if _, ok := ctlSet[name]; !ok { 508 cg := ctl.EffectiveRootCgroup() 509 // Multiple controllers may share the same hierarchy, so may have 510 // the same root cgroup. Grab a single ref per hierarchy root. 511 if _, ok := cgset[cg]; ok { 512 continue 513 } 514 cg.IncRef() // Ref transferred to caller. 515 cgset[cg] = struct{}{} 516 } 517 } 518 return cgset 519 } 520 521 // GenerateProcCgroups writes the contents of /proc/cgroups to buf. 522 func (r *CgroupRegistry) GenerateProcCgroups(buf *bytes.Buffer) { 523 r.mu.Lock() 524 entries := make([]string, 0, len(r.controllers)) 525 for _, c := range r.controllers { 526 en := 0 527 if c.Enabled() { 528 en = 1 529 } 530 entries = append(entries, fmt.Sprintf("%s\t%d\t%d\t%d\n", c.Type(), c.HierarchyID(), c.NumCgroups(), en)) 531 } 532 r.mu.Unlock() 533 534 sort.Strings(entries) 535 fmt.Fprint(buf, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n") 536 for _, e := range entries { 537 fmt.Fprint(buf, e) 538 } 539 } 540 541 // NextCgroupID returns a newly allocated, unique cgroup ID. 542 func (r *CgroupRegistry) NextCgroupID() (uint32, error) { 543 if cid := r.lastCgroupID.Add(1); cid != 0 { 544 return cid, nil 545 } 546 return InvalidCgroupID, fmt.Errorf("cgroup ID overflow") 547 } 548 549 // AddCgroup adds the ID and cgroup in the map. 550 func (r *CgroupRegistry) AddCgroup(cg CgroupImpl) { 551 r.mu.Lock() 552 r.cgroups[cg.ID()] = cg 553 r.mu.Unlock() 554 } 555 556 // GetCgroup returns the cgroup associated with the cgroup ID. 557 func (r *CgroupRegistry) GetCgroup(cid uint32) (CgroupImpl, error) { 558 r.mu.Lock() 559 defer r.mu.Unlock() 560 cg, ok := r.cgroups[cid] 561 if !ok { 562 return nil, fmt.Errorf("cgroup with ID %d does not exist", cid) 563 } 564 return cg, nil 565 }