github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/kernel/cgroup.go (about) 1 // Copyright 2021 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package kernel 16 17 import ( 18 "bytes" 19 "fmt" 20 "sort" 21 22 "github.com/MerlinKodo/gvisor/pkg/atomicbitops" 23 "github.com/MerlinKodo/gvisor/pkg/context" 24 "github.com/MerlinKodo/gvisor/pkg/errors/linuxerr" 25 "github.com/MerlinKodo/gvisor/pkg/fspath" 26 "github.com/MerlinKodo/gvisor/pkg/log" 27 "github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/kernfs" 28 "github.com/MerlinKodo/gvisor/pkg/sentry/vfs" 29 ) 30 31 // InvalidCgroupHierarchyID indicates an uninitialized hierarchy ID. 32 const InvalidCgroupHierarchyID uint32 = 0 33 34 // InvalidCgroupID indicates an uninitialized cgroup ID. 35 const InvalidCgroupID uint32 = 0 36 37 // CgroupControllerType is the name of a cgroup controller. 38 type CgroupControllerType string 39 40 // Available cgroup controllers. 41 const ( 42 CgroupControllerCPU = CgroupControllerType("cpu") 43 CgroupControllerCPUAcct = CgroupControllerType("cpuacct") 44 CgroupControllerCPUSet = CgroupControllerType("cpuset") 45 CgroupControllerDevices = CgroupControllerType("devices") 46 CgroupControllerJob = CgroupControllerType("job") 47 CgroupControllerMemory = CgroupControllerType("memory") 48 CgroupControllerPIDs = CgroupControllerType("pids") 49 ) 50 51 // ParseCgroupController parses a string as a CgroupControllerType. 52 func ParseCgroupController(val string) (CgroupControllerType, error) { 53 switch val { 54 case "cpu": 55 return CgroupControllerCPU, nil 56 case "cpuacct": 57 return CgroupControllerCPUAcct, nil 58 case "cpuset": 59 return CgroupControllerCPUSet, nil 60 case "devices": 61 return CgroupControllerDevices, nil 62 case "job": 63 return CgroupControllerJob, nil 64 case "memory": 65 return CgroupControllerMemory, nil 66 case "pids": 67 return CgroupControllerPIDs, nil 68 default: 69 return "", fmt.Errorf("no such cgroup controller") 70 } 71 } 72 73 // CgroupResourceType represents a resource type tracked by a particular 74 // controller. 75 type CgroupResourceType int 76 77 // Resources for the cpuacct controller. 78 const ( 79 // CgroupResourcePID represents a charge for pids.current. 80 CgroupResourcePID CgroupResourceType = iota 81 ) 82 83 // CgroupController is the common interface to cgroup controllers available to 84 // the entire sentry. The controllers themselves are defined by cgroupfs. 85 // 86 // Callers of this interface are often unable access synchronization needed to 87 // ensure returned values remain valid. Some of values returned from this 88 // interface are thus snapshots in time, and may become stale. This is ok for 89 // many callers like procfs. 90 type CgroupController interface { 91 // Returns the type of this cgroup controller (ex "memory", "cpu"). Returned 92 // value is valid for the lifetime of the controller. 93 Type() CgroupControllerType 94 95 // Hierarchy returns the ID of the hierarchy this cgroup controller is 96 // attached to. Returned value is valid for the lifetime of the controller. 97 HierarchyID() uint32 98 99 // EffectiveRootCgroup returns the effective root cgroup for this 100 // controller. This is either the actual root of the underlying cgroupfs 101 // filesystem, or the override root configured at sandbox startup. Returned 102 // value is valid for the lifetime of the controller. 103 EffectiveRootCgroup() Cgroup 104 105 // NumCgroups returns the number of cgroups managed by this controller. 106 // Returned value is a snapshot in time. 107 NumCgroups() uint64 108 109 // Enabled returns whether this controller is enabled. Returned value is a 110 // snapshot in time. 111 Enabled() bool 112 } 113 114 // Cgroup represents a named pointer to a cgroup in cgroupfs. When a task enters 115 // a cgroup, it holds a reference on the underlying dentry pointing to the 116 // cgroup. 117 // 118 // +stateify savable 119 type Cgroup struct { 120 *kernfs.Dentry 121 CgroupImpl 122 } 123 124 // decRef drops a reference on the cgroup. This must happen outside a Task.mu 125 // critical section. 126 func (c *Cgroup) decRef() { 127 c.Dentry.DecRef(context.Background()) 128 } 129 130 // Path returns the absolute path of c, relative to its hierarchy root. 131 func (c *Cgroup) Path() string { 132 return c.FSLocalPath() 133 } 134 135 // Walk returns the cgroup at p, starting from c. 136 func (c *Cgroup) Walk(ctx context.Context, vfsObj *vfs.VirtualFilesystem, p fspath.Path) (Cgroup, error) { 137 d, err := c.Dentry.WalkDentryTree(ctx, vfsObj, p) 138 if err != nil { 139 return Cgroup{}, err 140 } 141 return Cgroup{ 142 Dentry: d, 143 CgroupImpl: d.Inode().(CgroupImpl), 144 }, nil 145 } 146 147 // CgroupMigrationContext represents an in-flight cgroup migration for 148 // a single task. 149 type CgroupMigrationContext struct { 150 src Cgroup 151 dst Cgroup 152 t *Task 153 } 154 155 // Abort cancels a migration. 156 func (ctx *CgroupMigrationContext) Abort() { 157 ctx.dst.AbortMigrate(ctx.t, &ctx.src) 158 } 159 160 // Commit completes a migration. 161 func (ctx *CgroupMigrationContext) Commit() { 162 ctx.dst.CommitMigrate(ctx.t, &ctx.src) 163 164 ctx.t.mu.Lock() 165 delete(ctx.t.cgroups, ctx.src) 166 ctx.src.DecRef(ctx.t) 167 ctx.dst.IncRef() 168 ctx.t.cgroups[ctx.dst] = struct{}{} 169 ctx.t.mu.Unlock() 170 } 171 172 // CgroupImpl is the common interface to cgroups. 173 type CgroupImpl interface { 174 // Controllers lists the controller associated with this cgroup. 175 Controllers() []CgroupController 176 177 // HierarchyID returns the id of the hierarchy that contains this cgroup. 178 HierarchyID() uint32 179 180 // Name returns the name for this cgroup, if any. If no name was provided 181 // when the hierarchy was created, returns "". 182 Name() string 183 184 // Enter moves t into this cgroup. 185 Enter(t *Task) 186 187 // Leave moves t out of this cgroup. 188 Leave(t *Task) 189 190 // PrepareMigrate initiates a migration of t from src to this cgroup. See 191 // cgroupfs.controller.PrepareMigrate. 192 PrepareMigrate(t *Task, src *Cgroup) error 193 194 // CommitMigrate completes an in-flight migration. See 195 // cgroupfs.controller.CommitMigrate. 196 CommitMigrate(t *Task, src *Cgroup) 197 198 // AbortMigrate cancels an in-flight migration. See 199 // cgroupfs.controller.AbortMigrate. 200 AbortMigrate(t *Task, src *Cgroup) 201 202 // Charge charges a controller in this cgroup for a particular resource. key 203 // must match a valid resource for the specified controller type. 204 // 205 // The implementer should silently succeed if no matching controllers are 206 // found. 207 // 208 // The underlying implementaion will panic if passed an incompatible 209 // resource type for a given controller. 210 // 211 // See cgroupfs.controller.Charge. 212 Charge(t *Task, d *kernfs.Dentry, ctl CgroupControllerType, res CgroupResourceType, value int64) error 213 214 // ReadControlFromBackground allows a background context to read a cgroup's 215 // control values. 216 ReadControl(ctx context.Context, name string) (string, error) 217 218 // WriteControl allows a background context to write a cgroup's control 219 // values. 220 WriteControl(ctx context.Context, name string, val string) error 221 222 // ID returns the id of this cgroup. 223 ID() uint32 224 } 225 226 // hierarchy represents a cgroupfs filesystem instance, with a unique set of 227 // controllers attached to it. Multiple cgroupfs mounts may reference the same 228 // hierarchy. 229 // 230 // +stateify savable 231 type hierarchy struct { 232 id uint32 233 name string 234 // These are a subset of the controllers in CgroupRegistry.controllers, 235 // grouped here by hierarchy for conveninent lookup. 236 controllers map[CgroupControllerType]CgroupController 237 // fs is not owned by hierarchy. The FS is responsible for unregistering the 238 // hierarchy on destruction, which removes this association. 239 fs *vfs.Filesystem 240 } 241 242 func (h *hierarchy) match(ctypes []CgroupControllerType) bool { 243 if len(ctypes) != len(h.controllers) { 244 return false 245 } 246 for _, ty := range ctypes { 247 if _, ok := h.controllers[ty]; !ok { 248 return false 249 } 250 } 251 return true 252 } 253 254 // cgroupFS is the public interface to cgroupfs. This lets the kernel package 255 // refer to cgroupfs.filesystem methods without directly depending on the 256 // cgroupfs package, which would lead to a circular dependency. 257 type cgroupFS interface { 258 // Returns the vfs.Filesystem for the cgroupfs. 259 VFSFilesystem() *vfs.Filesystem 260 261 // InitializeHierarchyID sets the hierarchy ID for this filesystem during 262 // filesystem creation. May only be called before the filesystem is visible 263 // to the vfs layer. 264 InitializeHierarchyID(hid uint32) 265 266 // RootCgroup returns the root cgroup of this instance. This returns the 267 // actual root, and ignores any overrides setting an effective root. 268 RootCgroup() Cgroup 269 } 270 271 // CgroupRegistry tracks the active set of cgroup controllers on the system. 272 // 273 // +stateify savable 274 type CgroupRegistry struct { 275 // lastHierarchyID is the id of the last allocated cgroup hierarchy. Valid 276 // ids are from 1 to math.MaxUint32. 277 // 278 lastHierarchyID atomicbitops.Uint32 279 280 // lastCgroupID is the id of the last allocated cgroup. Valid ids are 281 // from 1 to math.MaxUint32. 282 // 283 lastCgroupID atomicbitops.Uint32 284 285 mu cgroupMutex `state:"nosave"` 286 287 // controllers is the set of currently known cgroup controllers on the 288 // system. 289 // 290 // +checklocks:mu 291 controllers map[CgroupControllerType]CgroupController 292 293 // hierarchies is the active set of cgroup hierarchies. This contains all 294 // hierarchies on the system. 295 // 296 // +checklocks:mu 297 hierarchies map[uint32]hierarchy 298 299 // hierarchiesByName is a map of named hierarchies. Only named hierarchies 300 // are tracked on this map. 301 // 302 // +checklocks:mu 303 hierarchiesByName map[string]hierarchy 304 305 // cgroups is the active set of cgroups. This contains all the cgroups 306 // on the system. 307 // 308 // +checklocks:mu 309 cgroups map[uint32]CgroupImpl 310 } 311 312 func newCgroupRegistry() *CgroupRegistry { 313 return &CgroupRegistry{ 314 controllers: make(map[CgroupControllerType]CgroupController), 315 hierarchies: make(map[uint32]hierarchy), 316 hierarchiesByName: make(map[string]hierarchy), 317 cgroups: make(map[uint32]CgroupImpl), 318 } 319 } 320 321 // nextHierarchyID returns a newly allocated, unique hierarchy ID. 322 func (r *CgroupRegistry) nextHierarchyID() (uint32, error) { 323 if hid := r.lastHierarchyID.Add(1); hid != 0 { 324 return hid, nil 325 } 326 return InvalidCgroupHierarchyID, fmt.Errorf("cgroup hierarchy ID overflow") 327 } 328 329 // FindHierarchy returns a cgroup filesystem containing exactly the set of 330 // controllers named in ctypes, and optionally the name specified in name if it 331 // isn't empty. If no such FS is found, FindHierarchy return nil. FindHierarchy 332 // takes a reference on the returned FS, which is transferred to the caller. 333 func (r *CgroupRegistry) FindHierarchy(name string, ctypes []CgroupControllerType) (*vfs.Filesystem, error) { 334 r.mu.Lock() 335 defer r.mu.Unlock() 336 337 // If we have a hierarchy name, lookup by name. 338 if name != "" { 339 h, ok := r.hierarchiesByName[name] 340 if !ok { 341 // Name not found. 342 return nil, nil 343 } 344 345 if h.match(ctypes) { 346 if !h.fs.TryIncRef() { 347 // May be racing with filesystem destruction, see below. 348 r.unregisterLocked(h.id) 349 return nil, nil 350 } 351 return h.fs, nil 352 } 353 354 // Name matched, but controllers didn't. Fail per linux 355 // kernel/cgroup.c:cgroup_mount(). 356 log.Debugf("cgroupfs: Registry lookup for name=%s controllers=%v failed; named matched but controllers didn't (have controllers=%v)", name, ctypes, h.controllers) 357 return nil, linuxerr.EBUSY 358 } 359 360 for _, h := range r.hierarchies { 361 if h.match(ctypes) { 362 if !h.fs.TryIncRef() { 363 // Racing with filesystem destruction, namely h.fs.Release. 364 // Since we hold r.mu, we know the hierarchy hasn't been 365 // unregistered yet, but its associated filesystem is tearing 366 // down. 367 // 368 // If we simply indicate the hierarchy wasn't found without 369 // cleaning up the registry, the caller can race with the 370 // unregister and find itself temporarily unable to create a new 371 // hierarchy with a subset of the relevant controllers. 372 // 373 // To keep the result of FindHierarchy consistent with the 374 // uniqueness of controllers enforced by Register, drop the 375 // dying hierarchy now. The eventual unregister by the FS 376 // teardown will become a no-op. 377 r.unregisterLocked(h.id) 378 return nil, nil 379 } 380 return h.fs, nil 381 } 382 } 383 384 return nil, nil 385 } 386 387 // FindCgroup locates a cgroup with the given parameters. 388 // 389 // A cgroup is considered a match even if it contains other controllers on the 390 // same hierarchy. 391 func (r *CgroupRegistry) FindCgroup(ctx context.Context, ctype CgroupControllerType, path string) (Cgroup, error) { 392 p := fspath.Parse(path) 393 if !p.Absolute { 394 return Cgroup{}, fmt.Errorf("path must be absolute") 395 } 396 k := KernelFromContext(ctx) 397 vfsfs, err := r.FindHierarchy("", []CgroupControllerType{ctype}) 398 if err != nil { 399 return Cgroup{}, err 400 } 401 if vfsfs == nil { 402 return Cgroup{}, fmt.Errorf("controller not active") 403 } 404 405 rootCG := vfsfs.Impl().(cgroupFS).RootCgroup() 406 407 if !p.HasComponents() { 408 // Explicit root '/'. 409 return rootCG, nil 410 } 411 412 return rootCG.Walk(ctx, k.VFS(), p) 413 } 414 415 // Register registers the provided set of controllers with the registry as a new 416 // hierarchy. If any controller is already registered, the function returns an 417 // error without modifying the registry. Register sets the hierarchy ID for the 418 // filesystem on success. 419 func (r *CgroupRegistry) Register(name string, cs []CgroupController, fs cgroupFS) error { 420 r.mu.Lock() 421 defer r.mu.Unlock() 422 423 if name == "" && len(cs) == 0 { 424 return fmt.Errorf("can't register hierarchy with both no controllers and no name") 425 } 426 427 for _, c := range cs { 428 if _, ok := r.controllers[c.Type()]; ok { 429 return fmt.Errorf("controllers may only be mounted on a single hierarchy") 430 } 431 } 432 433 if _, ok := r.hierarchiesByName[name]; name != "" && ok { 434 return fmt.Errorf("hierarchy named %q already exists", name) 435 } 436 437 hid, err := r.nextHierarchyID() 438 if err != nil { 439 return err 440 } 441 442 // Must not fail below here, once we publish the hierarchy ID. 443 444 fs.InitializeHierarchyID(hid) 445 446 h := hierarchy{ 447 id: hid, 448 name: name, 449 controllers: make(map[CgroupControllerType]CgroupController), 450 fs: fs.VFSFilesystem(), 451 } 452 for _, c := range cs { 453 n := c.Type() 454 r.controllers[n] = c 455 h.controllers[n] = c 456 } 457 r.hierarchies[hid] = h 458 if name != "" { 459 r.hierarchiesByName[name] = h 460 } 461 return nil 462 } 463 464 // Unregister removes a previously registered hierarchy from the registry. If no 465 // such hierarchy is registered, Unregister is a no-op. 466 func (r *CgroupRegistry) Unregister(hid uint32) { 467 r.mu.Lock() 468 r.unregisterLocked(hid) 469 r.mu.Unlock() 470 } 471 472 // Precondition: Caller must hold r.mu. 473 // +checklocks:r.mu 474 func (r *CgroupRegistry) unregisterLocked(hid uint32) { 475 if h, ok := r.hierarchies[hid]; ok { 476 for name := range h.controllers { 477 delete(r.controllers, name) 478 } 479 delete(r.hierarchies, hid) 480 } 481 } 482 483 // computeInitialGroups takes a reference on each of the returned cgroups. The 484 // caller takes ownership of this returned reference. 485 func (r *CgroupRegistry) computeInitialGroups(inherit map[Cgroup]struct{}) map[Cgroup]struct{} { 486 r.mu.Lock() 487 defer r.mu.Unlock() 488 489 ctlSet := make(map[CgroupControllerType]CgroupController) 490 cgset := make(map[Cgroup]struct{}) 491 492 // Remember controllers from the inherited cgroups set... 493 for cg := range inherit { 494 cg.IncRef() // Ref transferred to caller. 495 for _, ctl := range cg.Controllers() { 496 ctlSet[ctl.Type()] = ctl 497 cgset[cg] = struct{}{} 498 } 499 } 500 501 // ... and add the root cgroups of all the missing controllers. 502 for name, ctl := range r.controllers { 503 if _, ok := ctlSet[name]; !ok { 504 cg := ctl.EffectiveRootCgroup() 505 // Multiple controllers may share the same hierarchy, so may have 506 // the same root cgroup. Grab a single ref per hierarchy root. 507 if _, ok := cgset[cg]; ok { 508 continue 509 } 510 cg.IncRef() // Ref transferred to caller. 511 cgset[cg] = struct{}{} 512 } 513 } 514 return cgset 515 } 516 517 // GenerateProcCgroups writes the contents of /proc/cgroups to buf. 518 func (r *CgroupRegistry) GenerateProcCgroups(buf *bytes.Buffer) { 519 r.mu.Lock() 520 entries := make([]string, 0, len(r.controllers)) 521 for _, c := range r.controllers { 522 en := 0 523 if c.Enabled() { 524 en = 1 525 } 526 entries = append(entries, fmt.Sprintf("%s\t%d\t%d\t%d\n", c.Type(), c.HierarchyID(), c.NumCgroups(), en)) 527 } 528 r.mu.Unlock() 529 530 sort.Strings(entries) 531 fmt.Fprint(buf, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n") 532 for _, e := range entries { 533 fmt.Fprint(buf, e) 534 } 535 } 536 537 // NextCgroupID returns a newly allocated, unique cgroup ID. 538 func (r *CgroupRegistry) NextCgroupID() (uint32, error) { 539 if cid := r.lastCgroupID.Add(1); cid != 0 { 540 return cid, nil 541 } 542 return InvalidCgroupID, fmt.Errorf("cgroup ID overflow") 543 } 544 545 // AddCgroup adds the ID and cgroup in the map. 546 func (r *CgroupRegistry) AddCgroup(cg CgroupImpl) { 547 r.mu.Lock() 548 r.cgroups[cg.ID()] = cg 549 r.mu.Unlock() 550 } 551 552 // GetCgroup returns the cgroup associated with the cgroup ID. 553 func (r *CgroupRegistry) GetCgroup(cid uint32) (CgroupImpl, error) { 554 r.mu.Lock() 555 defer r.mu.Unlock() 556 cg, ok := r.cgroups[cid] 557 if !ok { 558 return nil, fmt.Errorf("cgroup with ID %d does not exist", cid) 559 } 560 return cg, nil 561 }