github.com/zhuohuang-hust/src-cbuild@v0.0.0-20230105071821-c7aab3e7c840/mergeCode/runc/libcontainer/cgroups/systemd/apply_systemd.go (about) 1 // +build linux 2 3 package systemd 4 5 import ( 6 "errors" 7 "fmt" 8 "io/ioutil" 9 "os" 10 "path/filepath" 11 "strings" 12 "sync" 13 "time" 14 15 systemdDbus "github.com/coreos/go-systemd/dbus" 16 systemdUtil "github.com/coreos/go-systemd/util" 17 "github.com/godbus/dbus" 18 "github.com/opencontainers/runc/libcontainer/cgroups" 19 "github.com/opencontainers/runc/libcontainer/cgroups/fs" 20 "github.com/opencontainers/runc/libcontainer/configs" 21 ) 22 23 type Manager struct { 24 mu sync.Mutex 25 Cgroups *configs.Cgroup 26 Paths map[string]string 27 } 28 29 type subsystem interface { 30 // Name returns the name of the subsystem. 31 Name() string 32 // Returns the stats, as 'stats', corresponding to the cgroup under 'path'. 33 GetStats(path string, stats *cgroups.Stats) error 34 // Set the cgroup represented by cgroup. 35 Set(path string, cgroup *configs.Cgroup) error 36 } 37 38 var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") 39 40 type subsystemSet []subsystem 41 42 func (s subsystemSet) Get(name string) (subsystem, error) { 43 for _, ss := range s { 44 if ss.Name() == name { 45 return ss, nil 46 } 47 } 48 return nil, errSubsystemDoesNotExist 49 } 50 51 var subsystems = subsystemSet{ 52 &fs.CpusetGroup{}, 53 &fs.DevicesGroup{}, 54 &fs.MemoryGroup{}, 55 &fs.CpuGroup{}, 56 &fs.CpuacctGroup{}, 57 &fs.PidsGroup{}, 58 &fs.BlkioGroup{}, 59 &fs.HugetlbGroup{}, 60 &fs.PerfEventGroup{}, 61 &fs.FreezerGroup{}, 62 &fs.NetPrioGroup{}, 63 &fs.NetClsGroup{}, 64 &fs.NameGroup{GroupName: "name=systemd"}, 65 } 66 67 const ( 68 testScopeWait = 4 69 testSliceWait = 4 70 ) 71 72 var ( 73 connLock sync.Mutex 74 theConn *systemdDbus.Conn 75 hasStartTransientUnit bool 76 hasStartTransientSliceUnit bool 77 hasTransientDefaultDependencies bool 78 hasDelegate bool 79 ) 80 81 func newProp(name string, units interface{}) systemdDbus.Property { 82 return systemdDbus.Property{ 83 Name: name, 84 Value: dbus.MakeVariant(units), 85 } 86 } 87 88 func UseSystemd() bool { 89 if !systemdUtil.IsRunningSystemd() { 90 return false 91 } 92 93 connLock.Lock() 94 defer connLock.Unlock() 95 96 if theConn == nil { 97 var err error 98 theConn, err = systemdDbus.New() 99 if err != nil { 100 return false 101 } 102 103 // Assume we have StartTransientUnit 104 hasStartTransientUnit = true 105 106 // But if we get UnknownMethod error we don't 107 if _, err := theConn.StartTransientUnit("test.scope", "invalid", nil, nil); err != nil { 108 if dbusError, ok := err.(dbus.Error); ok { 109 if dbusError.Name == "org.freedesktop.DBus.Error.UnknownMethod" { 110 hasStartTransientUnit = false 111 return hasStartTransientUnit 112 } 113 } 114 } 115 116 // Ensure the scope name we use doesn't exist. Use the Pid to 117 // avoid collisions between multiple libcontainer users on a 118 // single host. 119 scope := fmt.Sprintf("libcontainer-%d-systemd-test-default-dependencies.scope", os.Getpid()) 120 testScopeExists := true 121 for i := 0; i <= testScopeWait; i++ { 122 if _, err := theConn.StopUnit(scope, "replace", nil); err != nil { 123 if dbusError, ok := err.(dbus.Error); ok { 124 if strings.Contains(dbusError.Name, "org.freedesktop.systemd1.NoSuchUnit") { 125 testScopeExists = false 126 break 127 } 128 } 129 } 130 time.Sleep(time.Millisecond) 131 } 132 133 // Bail out if we can't kill this scope without testing for DefaultDependencies 134 if testScopeExists { 135 return hasStartTransientUnit 136 } 137 138 // Assume StartTransientUnit on a scope allows DefaultDependencies 139 hasTransientDefaultDependencies = true 140 ddf := newProp("DefaultDependencies", false) 141 if _, err := theConn.StartTransientUnit(scope, "replace", []systemdDbus.Property{ddf}, nil); err != nil { 142 if dbusError, ok := err.(dbus.Error); ok { 143 if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") { 144 hasTransientDefaultDependencies = false 145 } 146 } 147 } 148 149 // Not critical because of the stop unit logic above. 150 theConn.StopUnit(scope, "replace", nil) 151 152 // Assume StartTransientUnit on a scope allows Delegate 153 hasDelegate = true 154 dl := newProp("Delegate", true) 155 if _, err := theConn.StartTransientUnit(scope, "replace", []systemdDbus.Property{dl}, nil); err != nil { 156 if dbusError, ok := err.(dbus.Error); ok { 157 if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") { 158 hasDelegate = false 159 } 160 } 161 } 162 163 // Assume we have the ability to start a transient unit as a slice 164 // This was broken until systemd v229, but has been back-ported on RHEL environments >= 219 165 // For details, see: https://bugzilla.redhat.com/show_bug.cgi?id=1370299 166 hasStartTransientSliceUnit = true 167 168 // To ensure simple clean-up, we create a slice off the root with no hierarchy 169 slice := fmt.Sprintf("libcontainer_%d_systemd_test_default.slice", os.Getpid()) 170 if _, err := theConn.StartTransientUnit(slice, "replace", nil, nil); err != nil { 171 if _, ok := err.(dbus.Error); ok { 172 hasStartTransientSliceUnit = false 173 } 174 } 175 176 for i := 0; i <= testSliceWait; i++ { 177 if _, err := theConn.StopUnit(slice, "replace", nil); err != nil { 178 if dbusError, ok := err.(dbus.Error); ok { 179 if strings.Contains(dbusError.Name, "org.freedesktop.systemd1.NoSuchUnit") { 180 hasStartTransientSliceUnit = false 181 break 182 } 183 } 184 } else { 185 break 186 } 187 time.Sleep(time.Millisecond) 188 } 189 190 // Not critical because of the stop unit logic above. 191 theConn.StopUnit(scope, "replace", nil) 192 theConn.StopUnit(slice, "replace", nil) 193 } 194 return hasStartTransientUnit 195 } 196 197 func (m *Manager) Apply(pid int) error { 198 var ( 199 c = m.Cgroups 200 unitName = getUnitName(c) 201 slice = "system.slice" 202 properties []systemdDbus.Property 203 ) 204 205 if c.Paths != nil { 206 paths := make(map[string]string) 207 for name, path := range c.Paths { 208 _, err := getSubsystemPath(m.Cgroups, name) 209 if err != nil { 210 // Don't fail if a cgroup hierarchy was not found, just skip this subsystem 211 if cgroups.IsNotFound(err) { 212 continue 213 } 214 return err 215 } 216 paths[name] = path 217 } 218 m.Paths = paths 219 return cgroups.EnterPid(m.Paths, pid) 220 } 221 222 if c.Parent != "" { 223 slice = c.Parent 224 } 225 226 properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name)) 227 228 // if we create a slice, the parent is defined via a Wants= 229 if strings.HasSuffix(unitName, ".slice") { 230 // This was broken until systemd v229, but has been back-ported on RHEL environments >= 219 231 if !hasStartTransientSliceUnit { 232 return fmt.Errorf("systemd version does not support ability to start a slice as transient unit") 233 } 234 properties = append(properties, systemdDbus.PropWants(slice)) 235 } else { 236 // otherwise, we use Slice= 237 properties = append(properties, systemdDbus.PropSlice(slice)) 238 } 239 240 // only add pid if its valid, -1 is used w/ general slice creation. 241 if pid != -1 { 242 properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) 243 } 244 245 if hasDelegate { 246 // This is only supported on systemd versions 218 and above. 247 properties = append(properties, newProp("Delegate", true)) 248 } 249 250 // Always enable accounting, this gets us the same behaviour as the fs implementation, 251 // plus the kernel has some problems with joining the memory cgroup at a later time. 252 properties = append(properties, 253 newProp("MemoryAccounting", true), 254 newProp("CPUAccounting", true), 255 newProp("BlockIOAccounting", true)) 256 257 if hasTransientDefaultDependencies { 258 properties = append(properties, 259 newProp("DefaultDependencies", false)) 260 } 261 262 if c.Resources.Memory != 0 { 263 properties = append(properties, 264 newProp("MemoryLimit", uint64(c.Resources.Memory))) 265 } 266 267 if c.Resources.CpuShares != 0 { 268 properties = append(properties, 269 newProp("CPUShares", uint64(c.Resources.CpuShares))) 270 } 271 272 if c.Resources.BlkioWeight != 0 { 273 properties = append(properties, 274 newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight))) 275 } 276 277 // We have to set kernel memory here, as we can't change it once 278 // processes have been attached to the cgroup. 279 if c.Resources.KernelMemory != 0 { 280 if err := setKernelMemory(c); err != nil { 281 return err 282 } 283 } 284 285 if _, err := theConn.StartTransientUnit(unitName, "replace", properties, nil); err != nil && !isUnitExists(err) { 286 return err 287 } 288 289 if err := joinCgroups(c, pid); err != nil { 290 return err 291 } 292 293 paths := make(map[string]string) 294 for _, s := range subsystems { 295 subsystemPath, err := getSubsystemPath(m.Cgroups, s.Name()) 296 if err != nil { 297 // Don't fail if a cgroup hierarchy was not found, just skip this subsystem 298 if cgroups.IsNotFound(err) { 299 continue 300 } 301 return err 302 } 303 paths[s.Name()] = subsystemPath 304 } 305 m.Paths = paths 306 return nil 307 } 308 309 func (m *Manager) Destroy() error { 310 if m.Cgroups.Paths != nil { 311 return nil 312 } 313 m.mu.Lock() 314 defer m.mu.Unlock() 315 theConn.StopUnit(getUnitName(m.Cgroups), "replace", nil) 316 if err := cgroups.RemovePaths(m.Paths); err != nil { 317 return err 318 } 319 m.Paths = make(map[string]string) 320 return nil 321 } 322 323 func (m *Manager) GetPaths() map[string]string { 324 m.mu.Lock() 325 paths := m.Paths 326 m.mu.Unlock() 327 return paths 328 } 329 330 func writeFile(dir, file, data string) error { 331 // Normally dir should not be empty, one case is that cgroup subsystem 332 // is not mounted, we will get empty dir, and we want it fail here. 333 if dir == "" { 334 return fmt.Errorf("no such directory for %s", file) 335 } 336 return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700) 337 } 338 339 func join(c *configs.Cgroup, subsystem string, pid int) (string, error) { 340 path, err := getSubsystemPath(c, subsystem) 341 if err != nil { 342 return "", err 343 } 344 if err := os.MkdirAll(path, 0755); err != nil { 345 return "", err 346 } 347 if err := cgroups.WriteCgroupProc(path, pid); err != nil { 348 return "", err 349 } 350 return path, nil 351 } 352 353 func joinCgroups(c *configs.Cgroup, pid int) error { 354 for _, sys := range subsystems { 355 name := sys.Name() 356 switch name { 357 case "name=systemd": 358 // let systemd handle this 359 break 360 case "cpuset": 361 path, err := getSubsystemPath(c, name) 362 if err != nil && !cgroups.IsNotFound(err) { 363 return err 364 } 365 s := &fs.CpusetGroup{} 366 if err := s.ApplyDir(path, c, pid); err != nil { 367 return err 368 } 369 break 370 default: 371 _, err := join(c, name, pid) 372 if err != nil { 373 // Even if it's `not found` error, we'll return err 374 // because devices cgroup is hard requirement for 375 // container security. 376 if name == "devices" { 377 return err 378 } 379 // For other subsystems, omit the `not found` error 380 // because they are optional. 381 if !cgroups.IsNotFound(err) { 382 return err 383 } 384 } 385 } 386 } 387 388 return nil 389 } 390 391 // systemd represents slice hierarchy using `-`, so we need to follow suit when 392 // generating the path of slice. Essentially, test-a-b.slice becomes 393 // test.slice/test-a.slice/test-a-b.slice. 394 func ExpandSlice(slice string) (string, error) { 395 suffix := ".slice" 396 // Name has to end with ".slice", but can't be just ".slice". 397 if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) { 398 return "", fmt.Errorf("invalid slice name: %s", slice) 399 } 400 401 // Path-separators are not allowed. 402 if strings.Contains(slice, "/") { 403 return "", fmt.Errorf("invalid slice name: %s", slice) 404 } 405 406 var path, prefix string 407 sliceName := strings.TrimSuffix(slice, suffix) 408 // if input was -.slice, we should just return root now 409 if sliceName == "-" { 410 return "/", nil 411 } 412 for _, component := range strings.Split(sliceName, "-") { 413 // test--a.slice isn't permitted, nor is -test.slice. 414 if component == "" { 415 return "", fmt.Errorf("invalid slice name: %s", slice) 416 } 417 418 // Append the component to the path and to the prefix. 419 path += prefix + component + suffix + "/" 420 prefix += component + "-" 421 } 422 423 return path, nil 424 } 425 426 func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) { 427 mountpoint, err := cgroups.FindCgroupMountpoint(subsystem) 428 if err != nil { 429 return "", err 430 } 431 432 initPath, err := cgroups.GetInitCgroupDir(subsystem) 433 if err != nil { 434 return "", err 435 } 436 // if pid 1 is systemd 226 or later, it will be in init.scope, not the root 437 initPath = strings.TrimSuffix(filepath.Clean(initPath), "init.scope") 438 439 slice := "system.slice" 440 if c.Parent != "" { 441 slice = c.Parent 442 } 443 444 slice, err = ExpandSlice(slice) 445 if err != nil { 446 return "", err 447 } 448 449 return filepath.Join(mountpoint, initPath, slice, getUnitName(c)), nil 450 } 451 452 func (m *Manager) Freeze(state configs.FreezerState) error { 453 path, err := getSubsystemPath(m.Cgroups, "freezer") 454 if err != nil { 455 return err 456 } 457 prevState := m.Cgroups.Resources.Freezer 458 m.Cgroups.Resources.Freezer = state 459 freezer, err := subsystems.Get("freezer") 460 if err != nil { 461 return err 462 } 463 err = freezer.Set(path, m.Cgroups) 464 if err != nil { 465 m.Cgroups.Resources.Freezer = prevState 466 return err 467 } 468 return nil 469 } 470 471 func (m *Manager) GetPids() ([]int, error) { 472 path, err := getSubsystemPath(m.Cgroups, "devices") 473 if err != nil { 474 return nil, err 475 } 476 return cgroups.GetPids(path) 477 } 478 479 func (m *Manager) GetAllPids() ([]int, error) { 480 path, err := getSubsystemPath(m.Cgroups, "devices") 481 if err != nil { 482 return nil, err 483 } 484 return cgroups.GetAllPids(path) 485 } 486 487 func (m *Manager) GetStats() (*cgroups.Stats, error) { 488 m.mu.Lock() 489 defer m.mu.Unlock() 490 stats := cgroups.NewStats() 491 for name, path := range m.Paths { 492 sys, err := subsystems.Get(name) 493 if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) { 494 continue 495 } 496 if err := sys.GetStats(path, stats); err != nil { 497 return nil, err 498 } 499 } 500 501 return stats, nil 502 } 503 504 func (m *Manager) Set(container *configs.Config) error { 505 // If Paths are set, then we are just joining cgroups paths 506 // and there is no need to set any values. 507 if m.Cgroups.Paths != nil { 508 return nil 509 } 510 for _, sys := range subsystems { 511 // Get the subsystem path, but don't error out for not found cgroups. 512 path, err := getSubsystemPath(container.Cgroups, sys.Name()) 513 if err != nil && !cgroups.IsNotFound(err) { 514 return err 515 } 516 517 if err := sys.Set(path, container.Cgroups); err != nil { 518 return err 519 } 520 } 521 522 if m.Paths["cpu"] != "" { 523 if err := fs.CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil { 524 return err 525 } 526 } 527 return nil 528 } 529 530 func getUnitName(c *configs.Cgroup) string { 531 // by default, we create a scope unless the user explicitly asks for a slice. 532 if !strings.HasSuffix(c.Name, ".slice") { 533 return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name) 534 } 535 return c.Name 536 } 537 538 func setKernelMemory(c *configs.Cgroup) error { 539 path, err := getSubsystemPath(c, "memory") 540 if err != nil && !cgroups.IsNotFound(err) { 541 return err 542 } 543 544 if err := os.MkdirAll(path, 0755); err != nil { 545 return err 546 } 547 return fs.EnableKernelMemoryAccounting(path) 548 } 549 550 // isUnitExists returns true if the error is that a systemd unit already exists. 551 func isUnitExists(err error) bool { 552 if err != nil { 553 if dbusError, ok := err.(dbus.Error); ok { 554 return strings.Contains(dbusError.Name, "org.freedesktop.systemd1.UnitExists") 555 } 556 } 557 return false 558 }