github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/client/lib/cgutil/cpuset_manager_v2.go (about) 1 //go:build linux 2 3 package cgutil 4 5 import ( 6 "context" 7 "fmt" 8 "os" 9 "path/filepath" 10 "strings" 11 "sync" 12 "time" 13 14 "github.com/hashicorp/go-hclog" 15 "github.com/hashicorp/go-set" 16 "github.com/hashicorp/nomad/helper" 17 "github.com/hashicorp/nomad/lib/cpuset" 18 "github.com/hashicorp/nomad/nomad/structs" 19 "github.com/opencontainers/runc/libcontainer/cgroups" 20 "github.com/opencontainers/runc/libcontainer/cgroups/fs2" 21 "github.com/opencontainers/runc/libcontainer/configs" 22 ) 23 24 const ( 25 // CreationPID is a special PID in libcontainer used to denote a cgroup 26 // should be created, but with no process added. 27 // 28 // https://github.com/opencontainers/runc/blob/v1.0.3/libcontainer/cgroups/utils.go#L372 29 CreationPID = -1 30 31 // DefaultCgroupParentV2 is the name of Nomad's default parent cgroup, under which 32 // all other cgroups are managed. This can be changed with client configuration 33 // in case for e.g. Nomad tasks should be further constrained by an externally 34 // configured systemd cgroup. 35 DefaultCgroupParentV2 = "nomad.slice" 36 ) 37 38 // nothing is used for treating a map like a set with no values 39 type nothing struct{} 40 41 // present indicates something exists 42 var present = nothing{} 43 44 type cpusetManagerV2 struct { 45 logger hclog.Logger 46 47 parent string // relative to cgroup root (e.g. "nomad.slice") 48 parentAbs string // absolute path (e.g. "/sys/fs/cgroup/nomad.slice") 49 initial cpuset.CPUSet // set of initial cores (never changes) 50 51 lock sync.Mutex // hold this when managing pool / sharing / isolating 52 pool cpuset.CPUSet // pool of cores being shared among all tasks 53 sharing map[identity]nothing // sharing tasks using cores only from the pool 54 isolating map[identity]cpuset.CPUSet // isolating tasks using cores from the pool + reserved cores 55 } 56 57 func NewCpusetManagerV2(parent string, reservable []uint16, logger hclog.Logger) CpusetManager { 58 if err := minimumRootControllers(); err != nil { 59 logger.Error("failed to enabled minimum set of cgroup controllers; disabling cpuset management", "error", err) 60 return new(NoopCpusetManager) 61 } 62 63 parentAbs := filepath.Join(CgroupRoot, parent) 64 if err := os.MkdirAll(parentAbs, 0o755); err != nil { 65 logger.Error("failed to ensure nomad parent cgroup exists; disabling cpuset management", "error", err) 66 return new(NoopCpusetManager) 67 } 68 69 if len(reservable) == 0 { 70 // read from group 71 if cpus, err := GetCPUsFromCgroup(parent); err != nil { 72 logger.Error("failed to lookup cpus from parent cgroup; disabling cpuset management", "error", err) 73 return new(NoopCpusetManager) 74 } else { 75 reservable = cpus 76 } 77 } 78 79 return &cpusetManagerV2{ 80 initial: cpuset.New(reservable...), 81 parent: parent, 82 parentAbs: parentAbs, 83 logger: logger, 84 sharing: make(map[identity]nothing), 85 isolating: make(map[identity]cpuset.CPUSet), 86 } 87 } 88 89 // minimumControllers sets the minimum set of required controllers on the 90 // /sys/fs/cgroup/cgroup.subtree_control file - ensuring [cpuset, cpu, io, memory, pids] 91 // are enabled. 92 func minimumRootControllers() error { 93 e := new(editor) 94 s, err := e.read("cgroup.subtree_control") 95 if err != nil { 96 return err 97 } 98 99 required := set.From[string]([]string{"cpuset", "cpu", "io", "memory", "pids"}) 100 enabled := set.From[string](strings.Fields(s)) 101 needed := required.Difference(enabled) 102 103 if needed.Size() == 0 { 104 return nil // already sufficient 105 } 106 107 sb := new(strings.Builder) 108 for _, controller := range needed.List() { 109 sb.WriteString("+" + controller + " ") 110 } 111 112 activation := strings.TrimSpace(sb.String()) 113 return e.write("cgroup.subtree_control", activation) 114 } 115 116 func (c *cpusetManagerV2) Init() { 117 c.logger.Debug("initializing with", "cores", c.initial) 118 } 119 120 func (c *cpusetManagerV2) AddAlloc(alloc *structs.Allocation) { 121 if alloc == nil || alloc.AllocatedResources == nil { 122 return 123 } 124 c.logger.Trace("add allocation", "name", alloc.Name, "id", alloc.ID) 125 126 // grab write lock while we recompute and apply changes 127 c.lock.Lock() 128 defer c.lock.Unlock() 129 130 // first update our tracking of isolating and sharing tasks 131 for task, resources := range alloc.AllocatedResources.Tasks { 132 id := makeID(alloc.ID, task) 133 if len(resources.Cpu.ReservedCores) > 0 { 134 c.isolating[id] = cpuset.New(resources.Cpu.ReservedCores...) 135 } else { 136 c.sharing[id] = present 137 } 138 } 139 140 // recompute the available sharable cpu cores 141 c.recalculate() 142 143 // now write out the entire cgroups space 144 c.reconcile() 145 146 // no need to cleanup on adds, we did not remove a task 147 } 148 149 func (c *cpusetManagerV2) RemoveAlloc(allocID string) { 150 c.logger.Trace("remove allocation", "id", allocID) 151 152 // grab write lock while we recompute and apply changes. 153 c.lock.Lock() 154 defer c.lock.Unlock() 155 156 // remove tasks of allocID from the sharing set 157 for id := range c.sharing { 158 if strings.HasPrefix(string(id), allocID) { 159 delete(c.sharing, id) 160 } 161 } 162 163 // remove tasks of allocID from the isolating set 164 for id := range c.isolating { 165 if strings.HasPrefix(string(id), allocID) { 166 delete(c.isolating, id) 167 } 168 } 169 170 // recompute available sharable cpu cores 171 c.recalculate() 172 173 // now write out the entire cgroups space 174 c.reconcile() 175 176 // now remove any tasks no longer running 177 c.cleanup() 178 } 179 180 func (c *cpusetManagerV2) CgroupPathFor(allocID, task string) CgroupPathGetter { 181 // The CgroupPathFor implementation must block until cgroup for allocID.task 182 // exists [and can accept a PID]. 183 return func(ctx context.Context) (string, error) { 184 ticks, cancel := helper.NewSafeTimer(100 * time.Millisecond) 185 defer cancel() 186 187 for { 188 path := c.pathOf(makeID(allocID, task)) 189 mgr, err := fs2.NewManager(nil, path) 190 if err != nil { 191 return "", err 192 } 193 194 if mgr.Exists() { 195 return path, nil 196 } 197 198 select { 199 case <-ctx.Done(): 200 return "", ctx.Err() 201 case <-ticks.C: 202 continue 203 } 204 } 205 } 206 } 207 208 // recalculate the number of cores sharable by non-isolating tasks (and isolating tasks) 209 // 210 // must be called while holding c.lock 211 func (c *cpusetManagerV2) recalculate() { 212 remaining := c.initial.Copy() 213 for _, set := range c.isolating { 214 remaining = remaining.Difference(set) 215 } 216 c.pool = remaining 217 } 218 219 // reconcile will actually write the cpuset values for all tracked tasks. 220 // 221 // must be called while holding c.lock 222 func (c *cpusetManagerV2) reconcile() { 223 for id := range c.sharing { 224 c.write(id, c.pool) 225 } 226 227 for id, set := range c.isolating { 228 c.write(id, c.pool.Union(set)) 229 } 230 } 231 232 // cleanup will remove any cgroups for allocations no longer being tracked 233 // 234 // must be called while holding c.lock 235 func (c *cpusetManagerV2) cleanup() { 236 // create a map to lookup ids we know about 237 size := len(c.sharing) + len(c.isolating) 238 ids := make(map[identity]nothing, size) 239 for id := range c.sharing { 240 ids[id] = present 241 } 242 for id := range c.isolating { 243 ids[id] = present 244 } 245 246 if err := filepath.WalkDir(c.parentAbs, func(path string, entry os.DirEntry, err error) error { 247 // a cgroup is a directory 248 if !entry.IsDir() { 249 return nil 250 } 251 252 dir := filepath.Dir(path) 253 base := filepath.Base(path) 254 255 // only manage scopes directly under nomad.slice 256 if dir != c.parentAbs || !strings.HasSuffix(base, ".scope") { 257 return nil 258 } 259 260 // only remove the scope if we do not track it 261 id := identity(strings.TrimSuffix(base, ".scope")) 262 _, exists := ids[id] 263 if !exists { 264 c.remove(path) 265 } 266 267 return nil 268 }); err != nil { 269 c.logger.Error("failed to cleanup cgroup", "error", err) 270 } 271 } 272 273 // pathOf returns the absolute path to a task with identity id. 274 func (c *cpusetManagerV2) pathOf(id identity) string { 275 return filepath.Join(c.parentAbs, makeScope(id)) 276 } 277 278 // remove does the actual fs delete of the cgroup 279 // 280 // We avoid removing a cgroup if it still contains a PID, as the cpuset manager 281 // may be initially empty on a Nomad client restart. 282 func (c *cpusetManagerV2) remove(path string) { 283 mgr, err := fs2.NewManager(nil, path) 284 if err != nil { 285 c.logger.Warn("failed to create manager", "path", path, "error", err) 286 return 287 } 288 289 // get the list of pids managed by this scope (should be 0 or 1) 290 pids, _ := mgr.GetPids() 291 292 // do not destroy the scope if a PID is still present 293 // this is a normal condition when an agent restarts with running tasks 294 // and the v2 manager is still rebuilding its tracked tasks 295 if len(pids) > 0 { 296 return 297 } 298 299 // remove the cgroup 300 if err3 := mgr.Destroy(); err3 != nil { 301 c.logger.Warn("failed to cleanup cgroup", "path", path, "error", err) 302 return 303 } 304 } 305 306 // write does the actual write of cpuset set for cgroup id 307 func (c *cpusetManagerV2) write(id identity, set cpuset.CPUSet) { 308 path := c.pathOf(id) 309 310 // make a manager for the cgroup 311 m, err := fs2.NewManager(new(configs.Cgroup), path) 312 if err != nil { 313 c.logger.Error("failed to manage cgroup", "path", path, "error", err) 314 return 315 } 316 317 // create the cgroup 318 if err = m.Apply(CreationPID); err != nil { 319 c.logger.Error("failed to apply cgroup", "path", path, "error", err) 320 return 321 } 322 323 // set the cpuset value for the cgroup 324 if err = m.Set(&configs.Resources{ 325 CpusetCpus: set.String(), 326 }); err != nil { 327 c.logger.Error("failed to set cgroup", "path", path, "error", err) 328 return 329 } 330 } 331 332 // fromRoot returns the joined filepath of group on the CgroupRoot 333 func fromRoot(group string) string { 334 return filepath.Join(CgroupRoot, group) 335 } 336 337 // getCPUsFromCgroupV2 retrieves the effective cpuset for the group, which must 338 // be directly under the cgroup root (i.e. the parent, like nomad.slice). 339 func getCPUsFromCgroupV2(group string) ([]uint16, error) { 340 path := fromRoot(group) 341 effective, err := cgroups.ReadFile(path, "cpuset.cpus.effective") 342 if err != nil { 343 return nil, err 344 } 345 set, err := cpuset.Parse(effective) 346 if err != nil { 347 return nil, err 348 } 349 return set.ToSlice(), nil 350 } 351 352 // identity is the "<allocID>.<taskName>" string that uniquely identifies an 353 // individual instance of a task within the flat cgroup namespace 354 type identity string 355 356 func makeID(allocID, task string) identity { 357 return identity(fmt.Sprintf("%s.%s", allocID, task)) 358 } 359 360 func makeScope(id identity) string { 361 return string(id) + ".scope" 362 }