github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/cgroups/devices/devices_emulator.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 /* 3 * Copyright (C) 2020 Aleksa Sarai <cyphar@cyphar.com> 4 * Copyright (C) 2020 SUSE LLC 5 * 6 * Licensed under the Apache License, Version 2.0 (the "License"); 7 * you may not use this file except in compliance with the License. 8 * You may obtain a copy of the License at 9 * 10 * http://www.apache.org/licenses/LICENSE-2.0 11 * 12 * Unless required by applicable law or agreed to in writing, software 13 * distributed under the License is distributed on an "AS IS" BASIS, 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 * See the License for the specific language governing permissions and 16 * limitations under the License. 17 */ 18 19 package devices 20 21 import ( 22 "bufio" 23 "fmt" 24 "io" 25 "sort" 26 "strconv" 27 "strings" 28 29 "github.com/opencontainers/runc/libcontainer/devices" 30 ) 31 32 // deviceMeta is a Rule without the Allow or Permissions fields, and no 33 // wildcard-type support. It's effectively the "match" portion of a metadata 34 // rule, for the purposes of our emulation. 35 type deviceMeta struct { 36 node devices.Type 37 major int64 38 minor int64 39 } 40 41 // deviceRule is effectively the tuple (deviceMeta, Permissions). 42 type deviceRule struct { 43 meta deviceMeta 44 perms devices.Permissions 45 } 46 47 // deviceRules is a mapping of device metadata rules to the associated 48 // permissions in the ruleset. 49 type deviceRules map[deviceMeta]devices.Permissions 50 51 func (r deviceRules) orderedEntries() []deviceRule { 52 var rules []deviceRule 53 for meta, perms := range r { 54 rules = append(rules, deviceRule{meta: meta, perms: perms}) 55 } 56 sort.Slice(rules, func(i, j int) bool { 57 // Sort by (major, minor, type). 58 a, b := rules[i].meta, rules[j].meta 59 return a.major < b.major || 60 (a.major == b.major && a.minor < b.minor) || 61 (a.major == b.major && a.minor == b.minor && a.node < b.node) 62 }) 63 return rules 64 } 65 66 type emulator struct { 67 defaultAllow bool 68 rules deviceRules 69 } 70 71 func (e *emulator) IsBlacklist() bool { 72 return e.defaultAllow 73 } 74 75 func (e *emulator) IsAllowAll() bool { 76 return e.IsBlacklist() && len(e.rules) == 0 77 } 78 79 func parseLine(line string) (*deviceRule, error) { 80 // Input: node major:minor perms. 81 fields := strings.FieldsFunc(line, func(r rune) bool { 82 return r == ' ' || r == ':' 83 }) 84 if len(fields) != 4 { 85 return nil, fmt.Errorf("malformed devices.list rule %s", line) 86 } 87 88 var ( 89 rule deviceRule 90 node = fields[0] 91 major = fields[1] 92 minor = fields[2] 93 perms = fields[3] 94 ) 95 96 // Parse the node type. 97 switch node { 98 case "a": 99 // Super-special case -- "a" always means every device with every 100 // access mode. In fact, for devices.list this actually indicates that 101 // the cgroup is in black-list mode. 102 // TODO: Double-check that the entire file is "a *:* rwm". 103 return nil, nil 104 case "b": 105 rule.meta.node = devices.BlockDevice 106 case "c": 107 rule.meta.node = devices.CharDevice 108 default: 109 return nil, fmt.Errorf("unknown device type %q", node) 110 } 111 112 // Parse the major number. 113 if major == "*" { 114 rule.meta.major = devices.Wildcard 115 } else { 116 val, err := strconv.ParseUint(major, 10, 32) 117 if err != nil { 118 return nil, fmt.Errorf("invalid major number: %w", err) 119 } 120 rule.meta.major = int64(val) 121 } 122 123 // Parse the minor number. 124 if minor == "*" { 125 rule.meta.minor = devices.Wildcard 126 } else { 127 val, err := strconv.ParseUint(minor, 10, 32) 128 if err != nil { 129 return nil, fmt.Errorf("invalid minor number: %w", err) 130 } 131 rule.meta.minor = int64(val) 132 } 133 134 // Parse the access permissions. 135 rule.perms = devices.Permissions(perms) 136 if !rule.perms.IsValid() || rule.perms.IsEmpty() { 137 return nil, fmt.Errorf("parse access mode: contained unknown modes or is empty: %q", perms) 138 } 139 return &rule, nil 140 } 141 142 func (e *emulator) addRule(rule deviceRule) error { //nolint:unparam 143 if e.rules == nil { 144 e.rules = make(map[deviceMeta]devices.Permissions) 145 } 146 147 // Merge with any pre-existing permissions. 148 oldPerms := e.rules[rule.meta] 149 newPerms := rule.perms.Union(oldPerms) 150 e.rules[rule.meta] = newPerms 151 return nil 152 } 153 154 func (e *emulator) rmRule(rule deviceRule) error { 155 // Give an error if any of the permissions requested to be removed are 156 // present in a partially-matching wildcard rule, because such rules will 157 // be ignored by cgroupv1. 158 // 159 // This is a diversion from cgroupv1, but is necessary to avoid leading 160 // users into a false sense of security. cgroupv1 will silently(!) ignore 161 // requests to remove partial exceptions, but we really shouldn't do that. 162 // 163 // It may seem like we could just "split" wildcard rules which hit this 164 // issue, but unfortunately there are 2^32 possible major and minor 165 // numbers, which would exhaust kernel memory quickly if we did this. Not 166 // to mention it'd be really slow (the kernel side is implemented as a 167 // linked-list of exceptions). 168 for _, partialMeta := range []deviceMeta{ 169 {node: rule.meta.node, major: devices.Wildcard, minor: rule.meta.minor}, 170 {node: rule.meta.node, major: rule.meta.major, minor: devices.Wildcard}, 171 {node: rule.meta.node, major: devices.Wildcard, minor: devices.Wildcard}, 172 } { 173 // This wildcard rule is equivalent to the requested rule, so skip it. 174 if rule.meta == partialMeta { 175 continue 176 } 177 // Only give an error if the set of permissions overlap. 178 partialPerms := e.rules[partialMeta] 179 if !partialPerms.Intersection(rule.perms).IsEmpty() { 180 return fmt.Errorf("requested rule [%v %v] not supported by devices cgroupv1 (cannot punch hole in existing wildcard rule [%v %v])", rule.meta, rule.perms, partialMeta, partialPerms) 181 } 182 } 183 184 // Subtract all of the permissions listed from the full match rule. If the 185 // rule didn't exist, all of this is a no-op. 186 newPerms := e.rules[rule.meta].Difference(rule.perms) 187 if newPerms.IsEmpty() { 188 delete(e.rules, rule.meta) 189 } else { 190 e.rules[rule.meta] = newPerms 191 } 192 // TODO: The actual cgroup code doesn't care if an exception didn't exist 193 // during removal, so not erroring out here is /accurate/ but quite 194 // worrying. Maybe we should do additional validation, but again we 195 // have to worry about backwards-compatibility. 196 return nil 197 } 198 199 func (e *emulator) allow(rule *deviceRule) error { 200 // This cgroup is configured as a black-list. Reset the entire emulator, 201 // and put is into black-list mode. 202 if rule == nil || rule.meta.node == devices.WildcardDevice { 203 *e = emulator{ 204 defaultAllow: true, 205 rules: nil, 206 } 207 return nil 208 } 209 210 var err error 211 if e.defaultAllow { 212 err = wrapErr(e.rmRule(*rule), "unable to remove 'deny' exception") 213 } else { 214 err = wrapErr(e.addRule(*rule), "unable to add 'allow' exception") 215 } 216 return err 217 } 218 219 func (e *emulator) deny(rule *deviceRule) error { 220 // This cgroup is configured as a white-list. Reset the entire emulator, 221 // and put is into white-list mode. 222 if rule == nil || rule.meta.node == devices.WildcardDevice { 223 *e = emulator{ 224 defaultAllow: false, 225 rules: nil, 226 } 227 return nil 228 } 229 230 var err error 231 if e.defaultAllow { 232 err = wrapErr(e.addRule(*rule), "unable to add 'deny' exception") 233 } else { 234 err = wrapErr(e.rmRule(*rule), "unable to remove 'allow' exception") 235 } 236 return err 237 } 238 239 func (e *emulator) Apply(rule devices.Rule) error { 240 if !rule.Type.CanCgroup() { 241 return fmt.Errorf("cannot add rule [%#v] with non-cgroup type %q", rule, rule.Type) 242 } 243 244 innerRule := &deviceRule{ 245 meta: deviceMeta{ 246 node: rule.Type, 247 major: rule.Major, 248 minor: rule.Minor, 249 }, 250 perms: rule.Permissions, 251 } 252 if innerRule.meta.node == devices.WildcardDevice { 253 innerRule = nil 254 } 255 256 if rule.Allow { 257 return e.allow(innerRule) 258 } 259 260 return e.deny(innerRule) 261 } 262 263 // emulatorFromList takes a reader to a "devices.list"-like source, and returns 264 // a new Emulator that represents the state of the devices cgroup. Note that 265 // black-list devices cgroups cannot be fully reconstructed, due to limitations 266 // in the devices cgroup API. Instead, such cgroups are always treated as 267 // "allow all" cgroups. 268 func emulatorFromList(list io.Reader) (*emulator, error) { 269 // Normally cgroups are in black-list mode by default, but the way we 270 // figure out the current mode is whether or not devices.list has an 271 // allow-all rule. So we default to a white-list, and the existence of an 272 // "a *:* rwm" entry will tell us otherwise. 273 e := &emulator{ 274 defaultAllow: false, 275 } 276 277 // Parse the "devices.list". 278 s := bufio.NewScanner(list) 279 for s.Scan() { 280 line := s.Text() 281 deviceRule, err := parseLine(line) 282 if err != nil { 283 return nil, fmt.Errorf("error parsing line %q: %w", line, err) 284 } 285 // "devices.list" is an allow list. Note that this means that in 286 // black-list mode, we have no idea what rules are in play. As a 287 // result, we need to be very careful in Transition(). 288 if err := e.allow(deviceRule); err != nil { 289 return nil, fmt.Errorf("error adding devices.list rule: %w", err) 290 } 291 } 292 if err := s.Err(); err != nil { 293 return nil, fmt.Errorf("error reading devices.list lines: %w", err) 294 } 295 return e, nil 296 } 297 298 // Transition calculates what is the minimally-disruptive set of rules need to 299 // be applied to a devices cgroup in order to transition to the given target. 300 // This means that any already-existing rules will not be applied, and 301 // disruptive rules (like denying all device access) will only be applied if 302 // necessary. 303 // 304 // This function is the sole reason for all of Emulator -- to allow us 305 // to figure out how to update a containers' cgroups without causing spurious 306 // device errors (if possible). 307 func (source *emulator) Transition(target *emulator) ([]*devices.Rule, error) { //nolint:revive // Ignore receiver-naming warning. 308 var transitionRules []*devices.Rule 309 oldRules := source.rules 310 311 // If the default policy doesn't match, we need to include a "disruptive" 312 // rule (either allow-all or deny-all) in order to switch the cgroup to the 313 // correct default policy. 314 // 315 // However, due to a limitation in "devices.list" we cannot be sure what 316 // deny rules are in place in a black-list cgroup. Thus if the source is a 317 // black-list we also have to include a disruptive rule. 318 if source.IsBlacklist() || source.defaultAllow != target.defaultAllow { 319 transitionRules = append(transitionRules, &devices.Rule{ 320 Type: 'a', 321 Major: -1, 322 Minor: -1, 323 Permissions: devices.Permissions("rwm"), 324 Allow: target.defaultAllow, 325 }) 326 // The old rules are only relevant if we aren't starting out with a 327 // disruptive rule. 328 oldRules = nil 329 } 330 331 // NOTE: We traverse through the rules in a sorted order so we always write 332 // the same set of rules (this is to aid testing). 333 334 // First, we create inverse rules for any old rules not in the new set. 335 // This includes partial-inverse rules for specific permissions. This is a 336 // no-op if we added a disruptive rule, since oldRules will be empty. 337 for _, rule := range oldRules.orderedEntries() { 338 meta, oldPerms := rule.meta, rule.perms 339 newPerms := target.rules[meta] 340 droppedPerms := oldPerms.Difference(newPerms) 341 if !droppedPerms.IsEmpty() { 342 transitionRules = append(transitionRules, &devices.Rule{ 343 Type: meta.node, 344 Major: meta.major, 345 Minor: meta.minor, 346 Permissions: droppedPerms, 347 Allow: target.defaultAllow, 348 }) 349 } 350 } 351 352 // Add any additional rules which weren't in the old set. We happen to 353 // filter out rules which are present in both sets, though this isn't 354 // strictly necessary. 355 for _, rule := range target.rules.orderedEntries() { 356 meta, newPerms := rule.meta, rule.perms 357 oldPerms := oldRules[meta] 358 gainedPerms := newPerms.Difference(oldPerms) 359 if !gainedPerms.IsEmpty() { 360 transitionRules = append(transitionRules, &devices.Rule{ 361 Type: meta.node, 362 Major: meta.major, 363 Minor: meta.minor, 364 Permissions: gainedPerms, 365 Allow: !target.defaultAllow, 366 }) 367 } 368 } 369 return transitionRules, nil 370 } 371 372 // Rules returns the minimum set of rules necessary to convert a *deny-all* 373 // cgroup to the emulated filter state (note that this is not the same as a 374 // default cgroupv1 cgroup -- which is allow-all). This is effectively just a 375 // wrapper around Transition() with the source emulator being an empty cgroup. 376 func (e *emulator) Rules() ([]*devices.Rule, error) { 377 defaultCgroup := &emulator{defaultAllow: false} 378 return defaultCgroup.Transition(e) 379 } 380 381 func wrapErr(err error, text string) error { 382 if err == nil { 383 return nil 384 } 385 return fmt.Errorf(text+": %w", err) 386 }