github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/runsc/cgroup/systemd.go (about) 1 // Copyright 2022 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // https://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package cgroup 16 17 import ( 18 "context" 19 "errors" 20 "fmt" 21 "os" 22 "path/filepath" 23 "regexp" 24 "strconv" 25 "strings" 26 "sync" 27 "time" 28 29 systemdDbus "github.com/coreos/go-systemd/v22/dbus" 30 dbus "github.com/godbus/dbus/v5" 31 specs "github.com/opencontainers/runtime-spec/specs-go" 32 "github.com/nicocha30/gvisor-ligolo/pkg/cleanup" 33 "github.com/nicocha30/gvisor-ligolo/pkg/log" 34 ) 35 36 var ( 37 // ErrBadResourceSpec indicates that a cgroupSystemd function was 38 // passed a specs.LinuxResources object that is impossible or illegal 39 // to process. 40 ErrBadResourceSpec = errors.New("misconfigured resource spec") 41 // ErrInvalidSlice indicates that the slice name passed via cgroup.Path is 42 // invalid. 43 ErrInvalidSlice = errors.New("invalid slice name") 44 ) 45 46 // cgroupSystemd represents a cgroupv2 managed by systemd. 47 type cgroupSystemd struct { 48 cgroupV2 49 // Name is the name of the of the systemd scope that controls the cgroups. 50 Name string 51 // Parent is the encapsulating slice. 52 Parent string 53 // ScopePrefix is the prefix for the scope name. 54 ScopePrefix string 55 56 properties []systemdDbus.Property 57 dbusConn *systemdDbus.Conn 58 } 59 60 func newCgroupV2Systemd(cgv2 *cgroupV2) (*cgroupSystemd, error) { 61 if !isRunningSystemd() { 62 return nil, fmt.Errorf("systemd not running on host") 63 } 64 ctx := context.Background() 65 cg := &cgroupSystemd{cgroupV2: *cgv2} 66 // Parse the path from expected "slice:prefix:name" 67 // for e.g. "system.slice:docker:1234" 68 parts := strings.Split(cg.Path, ":") 69 if len(parts) != 3 { 70 return nil, fmt.Errorf("expected cgroupsPath to be of format \"slice:prefix:name\" for systemd cgroups, got %q instead", cg.Path) 71 } 72 cg.Parent = parts[0] 73 cg.ScopePrefix = parts[1] 74 cg.Name = parts[2] 75 if err := validSlice(cg.Parent); err != nil { 76 return nil, fmt.Errorf("%w: %v", ErrInvalidGroupPath, err) 77 } 78 // Rewrite Path so that it is compatible with cgroupv2 methods. 79 cg.Path = filepath.Join(expandSlice(cg.Parent), cg.unitName()) 80 conn, err := systemdDbus.NewWithContext(ctx) 81 if err != nil { 82 return nil, err 83 } 84 var version int 85 if version, err = systemdVersion(conn); err != nil { 86 return nil, fmt.Errorf("error parsing systemd version: %v", err) 87 } 88 if version < 244 { 89 return nil, fmt.Errorf("systemd version %d not supported, please upgrade to at least 244", version) 90 } 91 cg.dbusConn = conn 92 return cg, err 93 } 94 95 // Install configures the properties for a scope unit but does not start the 96 // unit. 97 func (c *cgroupSystemd) Install(res *specs.LinuxResources) error { 98 log.Debugf("Installing systemd cgroup resource controller under %v", c.Parent) 99 c.properties = append(c.properties, systemdDbus.PropSlice(c.Parent)) 100 c.properties = append(c.properties, systemdDbus.PropDescription("Secure container "+c.Name)) 101 pid := os.Getpid() 102 c.properties = append(c.properties, systemdDbus.PropPids(uint32(pid))) 103 // We always want proper accounting for the container for reporting resource 104 // usage. 105 c.addProp("MemoryAccounting", true) 106 c.addProp("CPUAccounting", true) 107 c.addProp("TasksAccounting", true) 108 c.addProp("IOAccounting", true) 109 // Delegate must be true so that the container can manage its own cgroups. 110 c.addProp("Delegate", true) 111 // For compatibility with runc. 112 c.addProp("DefaultDependencies", false) 113 114 for controllerName, ctrlr := range controllers2 { 115 // First check if our controller is found in the system. 116 found := false 117 for _, knownController := range c.Controllers { 118 if controllerName == knownController { 119 found = true 120 } 121 } 122 if found { 123 props, err := ctrlr.generateProperties(res) 124 if err != nil { 125 return err 126 } 127 c.properties = append(c.properties, props...) 128 continue 129 } 130 if ctrlr.optional() { 131 if err := ctrlr.skip(res); err != nil { 132 return err 133 } 134 } else { 135 return fmt.Errorf("mandatory cgroup controller %q is missing for %q", controllerName, c.Path) 136 } 137 } 138 return nil 139 } 140 141 func (c *cgroupSystemd) unitName() string { 142 return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name) 143 } 144 145 // MakePath builds a path to the given controller. 146 func (c *cgroupSystemd) MakePath(string) string { 147 fullSlicePath := expandSlice(c.Parent) 148 path := filepath.Join(c.Mountpoint, fullSlicePath, c.unitName()) 149 return path 150 } 151 152 // Join implements Cgroup.Join. 153 func (c *cgroupSystemd) Join() (func(), error) { 154 log.Debugf("Joining systemd cgroup %v", c.unitName()) 155 timeout := 30 * time.Second 156 ctx := context.Background() 157 // Clean up partially created cgroups on error. Errors during cleanup itself 158 // are ignored. 159 clean := cleanup.Make(func() { _ = c.Uninstall() }) 160 defer clean.Clean() 161 162 conn, err := systemdDbus.NewWithContext(ctx) 163 if err != nil { 164 return nil, err 165 } 166 c.dbusConn = conn 167 unitName := c.unitName() 168 statusChan := make(chan string) 169 timedCtx, cancel := context.WithTimeout(ctx, timeout) 170 defer cancel() 171 if _, err := c.dbusConn.StartTransientUnitContext(timedCtx, unitName, "replace", c.properties, statusChan); err == nil { 172 s := <-statusChan 173 close(statusChan) 174 switch s { 175 case "done": 176 // All cases that are not "done" according to the dbus package. 177 case "cancelled", "timeout", "failed", "dependency", "skipped": 178 c.dbusConn.ResetFailedUnitContext(ctx, unitName) 179 return nil, fmt.Errorf("error creating systemd unit `%s`: got %s", unitName, s) 180 default: 181 c.dbusConn.ResetFailedUnitContext(ctx, unitName) 182 return nil, fmt.Errorf("unknown job completion status %q", s) 183 } 184 } else if unitAlreadyExists(err) { 185 return clean.Release(), nil 186 } else { 187 return nil, fmt.Errorf("systemd error: %v", err) 188 } 189 if _, err = c.createCgroupPaths(); err != nil { 190 return nil, err 191 } 192 return clean.Release(), nil 193 } 194 195 // unitAlreadyExists returns true if the error is that a systemd unit already 196 // exists. 197 func unitAlreadyExists(err error) bool { 198 if err != nil { 199 var derr dbus.Error 200 if errors.As(err, &derr) { 201 return strings.Contains(derr.Name, "org.freedesktop.systemd1.UnitExists") 202 } 203 } 204 return false 205 } 206 207 // systemd represents slice hierarchy using `-`, so we need to follow suit when 208 // generating the path of slice. Essentially, test-a-b.slice becomes 209 // /test.slice/test-a.slice/test-a-b.slice. 210 func expandSlice(slice string) string { 211 var path, prefix string 212 suffix := ".slice" 213 sliceName := strings.TrimSuffix(slice, suffix) 214 // If input was -.slice, we should just return root now. 215 if sliceName == "-" { 216 return "/" 217 } 218 for _, component := range strings.Split(sliceName, "-") { 219 // Append the component to the path and to the prefix. 220 path += "/" + prefix + component + suffix 221 prefix += component + "-" 222 } 223 return path 224 } 225 226 func validSlice(slice string) error { 227 suffix := ".slice" 228 // Name has to end with ".slice", but can't be just ".slice". 229 if slice == suffix || !strings.HasSuffix(slice, suffix) { 230 return fmt.Errorf("%w: %s", ErrInvalidSlice, slice) 231 } 232 233 // Path-separators are not allowed. 234 if strings.Contains(slice, "/") { 235 return fmt.Errorf("%w: %s", ErrInvalidSlice, slice) 236 } 237 238 sliceName := strings.TrimSuffix(slice, suffix) 239 // If input was -.slice, we should just return root now. 240 if sliceName == "-" { 241 return nil 242 } 243 for _, component := range strings.Split(sliceName, "-") { 244 // test--a.slice isn't permitted, nor is -test.slice. 245 if component == "" { 246 return fmt.Errorf("%w: %s", ErrInvalidSlice, slice) 247 } 248 } 249 return nil 250 } 251 252 var systemdCheck struct { 253 once sync.Once 254 cache bool 255 } 256 257 func isRunningSystemd() bool { 258 systemdCheck.once.Do(func() { 259 fi, err := os.Lstat("/run/systemd/system") 260 systemdCheck.cache = err == nil && fi.IsDir() 261 }) 262 return systemdCheck.cache 263 } 264 265 func systemdVersion(conn *systemdDbus.Conn) (int, error) { 266 vStr, err := conn.GetManagerProperty("Version") 267 if err != nil { 268 return -1, errors.New("unable to get systemd version") 269 } 270 // vStr should be of the form: 271 // "v245.4-1.fc32", "245", "v245-1.fc32", "245-1.fc32" (without quotes). 272 // The result for all of the above should be 245. 273 // Thus, we unconditionally remove the "v" prefix 274 // and then match on the first integer we can grab. 275 re := regexp.MustCompile(`v?([0-9]+)`) 276 matches := re.FindStringSubmatch(vStr) 277 if len(matches) < 2 { 278 return -1, fmt.Errorf("can't parse version %q: incorrect number of matches %d", vStr, len(matches)) 279 } 280 version, err := strconv.Atoi(matches[1]) 281 if err != nil { 282 return -1, fmt.Errorf("%w: can't parse version %q", err, vStr) 283 } 284 return version, nil 285 } 286 287 func addIOProps(props []systemdDbus.Property, name string, devs []specs.LinuxThrottleDevice) []systemdDbus.Property { 288 for _, dev := range devs { 289 val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, dev.Rate) 290 props = append(props, newProp(name, val)) 291 } 292 return props 293 } 294 295 func (c *cgroupSystemd) addProp(name string, value any) { 296 if value == nil { 297 return 298 } 299 c.properties = append(c.properties, newProp(name, value)) 300 } 301 302 func newProp(name string, units any) systemdDbus.Property { 303 return systemdDbus.Property{ 304 Name: name, 305 Value: dbus.MakeVariant(units), 306 } 307 } 308 309 // CreateMockSystemdCgroup returns a mock Cgroup configured for systemd. This 310 // is useful for testing. 311 func CreateMockSystemdCgroup() Cgroup { 312 return &cgroupSystemd{ 313 Name: "test", 314 ScopePrefix: "runsc", 315 Parent: "system.slice", 316 cgroupV2: cgroupV2{ 317 Mountpoint: "/sys/fs/cgroup", 318 Path: "/a/random/path", 319 }, 320 } 321 }