k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/kubelet/userns/userns_manager.go (about) 1 /* 2 Copyright 2022 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package userns 18 19 import ( 20 "encoding/json" 21 "fmt" 22 "os" 23 "path/filepath" 24 "sync" 25 26 v1 "k8s.io/api/core/v1" 27 "k8s.io/apimachinery/pkg/types" 28 "k8s.io/apimachinery/pkg/util/sets" 29 utilfeature "k8s.io/apiserver/pkg/util/feature" 30 runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" 31 "k8s.io/klog/v2" 32 "k8s.io/kubernetes/pkg/features" 33 kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" 34 utilstore "k8s.io/kubernetes/pkg/kubelet/util/store" 35 "k8s.io/kubernetes/pkg/registry/core/service/allocator" 36 utilfs "k8s.io/kubernetes/pkg/util/filesystem" 37 ) 38 39 // length for the user namespace to create (65536). 40 const userNsLength = (1 << 16) 41 42 // Create a new map when we removed enough pods to avoid memory leaks 43 // since Go maps never free memory. 44 const mapReInitializeThreshold = 1000 45 46 type userNsPodsManager interface { 47 HandlerSupportsUserNamespaces(runtimeHandler string) (bool, error) 48 GetPodDir(podUID types.UID) string 49 ListPodsFromDisk() ([]types.UID, error) 50 GetKubeletMappings() (uint32, uint32, error) 51 GetMaxPods() int 52 } 53 54 type UsernsManager struct { 55 used *allocator.AllocationBitmap 56 usedBy map[types.UID]uint32 // Map pod.UID to range used 57 removed int 58 59 off int 60 len int 61 62 kl userNsPodsManager 63 // This protects all members except for kl.anager 64 lock sync.Mutex 65 } 66 67 // UserNamespace holds the configuration for the user namespace. 68 type userNamespace struct { 69 // UIDs mappings for the user namespace. 70 UIDMappings []idMapping `json:"uidMappings"` 71 // GIDs mappings for the user namespace. 72 GIDMappings []idMapping `json:"gidMappings"` 73 } 74 75 // Pod user namespace mapping 76 type idMapping struct { 77 // Required. 78 HostId uint32 `json:"hostId"` 79 // Required. 80 ContainerId uint32 `json:"containerId"` 81 // Required. 82 Length uint32 `json:"length"` 83 } 84 85 // mappingsFile is the file where the user namespace mappings are persisted. 86 const mappingsFile = "userns" 87 88 // writeMappingsToFile writes the specified user namespace configuration to the pod 89 // directory. 90 func (m *UsernsManager) writeMappingsToFile(pod types.UID, userNs userNamespace) error { 91 dir := m.kl.GetPodDir(pod) 92 93 data, err := json.Marshal(userNs) 94 if err != nil { 95 return err 96 } 97 98 fstore, err := utilstore.NewFileStore(dir, &utilfs.DefaultFs{}) 99 if err != nil { 100 return fmt.Errorf("create user namespace store: %w", err) 101 } 102 if err := fstore.Write(mappingsFile, data); err != nil { 103 return err 104 } 105 106 // We need to fsync the parent dir so the file is guaranteed to be there. 107 // fstore guarantees an atomic write, we need durability too. 108 parentDir, err := os.Open(dir) 109 if err != nil { 110 return err 111 } 112 113 if err = parentDir.Sync(); err != nil { 114 // Ignore return here, there is already an error reported. 115 parentDir.Close() 116 return err 117 } 118 119 return parentDir.Close() 120 } 121 122 // readMappingsFromFile reads the user namespace configuration from the pod directory. 123 func (m *UsernsManager) readMappingsFromFile(pod types.UID) ([]byte, error) { 124 dir := m.kl.GetPodDir(pod) 125 fstore, err := utilstore.NewFileStore(dir, &utilfs.DefaultFs{}) 126 if err != nil { 127 return nil, fmt.Errorf("create user namespace store: %w", err) 128 } 129 return fstore.Read(mappingsFile) 130 } 131 132 func MakeUserNsManager(kl userNsPodsManager) (*UsernsManager, error) { 133 kubeletMappingID, kubeletMappingLen, err := kl.GetKubeletMappings() 134 if err != nil { 135 return nil, err 136 } 137 138 if kubeletMappingID%userNsLength != 0 { 139 return nil, fmt.Errorf("kubelet user assigned ID %v is not a multiple of %v", kubeletMappingID, userNsLength) 140 } 141 if kubeletMappingID < userNsLength { 142 // We don't allow to map 0, as security is circumvented. 143 return nil, fmt.Errorf("kubelet user assigned ID %v must be greater or equal to %v", kubeletMappingID, userNsLength) 144 } 145 if kubeletMappingLen%userNsLength != 0 { 146 return nil, fmt.Errorf("kubelet user assigned IDs length %v is not a multiple of %v", kubeletMappingLen, userNsLength) 147 } 148 if kubeletMappingLen/userNsLength < uint32(kl.GetMaxPods()) { 149 return nil, fmt.Errorf("kubelet user assigned IDs are not enough to support %v pods", kl.GetMaxPods()) 150 } 151 off := int(kubeletMappingID / userNsLength) 152 len := int(kubeletMappingLen / userNsLength) 153 154 m := UsernsManager{ 155 used: allocator.NewAllocationMap(len, "user namespaces"), 156 usedBy: make(map[types.UID]uint32), 157 kl: kl, 158 off: off, 159 len: len, 160 } 161 162 // do not bother reading the list of pods if user namespaces are not enabled. 163 if !utilfeature.DefaultFeatureGate.Enabled(features.UserNamespacesSupport) { 164 return &m, nil 165 } 166 167 found, err := kl.ListPodsFromDisk() 168 if err != nil { 169 if os.IsNotExist(err) { 170 return &m, nil 171 } 172 return nil, fmt.Errorf("read pods from disk: %w", err) 173 174 } 175 for _, podUID := range found { 176 klog.V(5).InfoS("reading pod from disk for user namespace", "podUID", podUID) 177 if err := m.recordPodMappings(podUID); err != nil { 178 return nil, fmt.Errorf("record pod mappings: %w", err) 179 } 180 } 181 182 return &m, nil 183 } 184 185 // recordPodMappings registers the range used for the user namespace if the 186 // usernsConfFile exists in the pod directory. 187 func (m *UsernsManager) recordPodMappings(pod types.UID) error { 188 content, err := m.readMappingsFromFile(pod) 189 if err != nil && err != utilstore.ErrKeyNotFound { 190 return err 191 } 192 193 // If no content, it means the pod doesn't have userns. Nothing else to do 194 if len(content) == 0 { 195 return nil 196 } 197 198 _, err = m.parseUserNsFileAndRecord(pod, content) 199 return err 200 } 201 202 // isSet checks if the specified index is already set. 203 func (m *UsernsManager) isSet(v uint32) bool { 204 index := int(v/userNsLength) - m.off 205 if index < 0 || index >= m.len { 206 return true 207 } 208 return m.used.Has(index) 209 } 210 211 // allocateOne finds a free user namespace and allocate it to the specified pod. 212 // The first return value is the first ID in the user namespace, the second returns 213 // the length for the user namespace range. 214 func (m *UsernsManager) allocateOne(pod types.UID) (firstID uint32, length uint32, err error) { 215 firstZero, found, err := m.used.AllocateNext() 216 if err != nil { 217 return 0, 0, err 218 } 219 if !found { 220 return 0, 0, fmt.Errorf("could not find an empty slot to allocate a user namespace") 221 } 222 223 klog.V(5).InfoS("new pod user namespace allocation", "podUID", pod) 224 225 firstID = uint32((firstZero + m.off) * userNsLength) 226 m.usedBy[pod] = firstID 227 return firstID, userNsLength, nil 228 } 229 230 // record stores the user namespace [from; from+length] to the specified pod. 231 func (m *UsernsManager) record(pod types.UID, from, length uint32) (err error) { 232 if length != userNsLength { 233 return fmt.Errorf("wrong user namespace length %v", length) 234 } 235 if from%userNsLength != 0 { 236 return fmt.Errorf("wrong user namespace offset specified %v", from) 237 } 238 prevFrom, found := m.usedBy[pod] 239 if found && prevFrom != from { 240 return fmt.Errorf("different user namespace range already used by pod %q", pod) 241 } 242 index := int(from/userNsLength) - m.off 243 if index < 0 || index >= m.len { 244 return fmt.Errorf("id %v is out of range", from) 245 } 246 // if the pod wasn't found then verify the range is free. 247 if !found && m.used.Has(index) { 248 return fmt.Errorf("range picked for pod %q already taken", pod) 249 } 250 // The pod is already registered, nothing to do. 251 if found && prevFrom == from { 252 return nil 253 } 254 255 klog.V(5).InfoS("new pod user namespace allocation", "podUID", pod) 256 257 // "from" is a ID (UID/GID), set the corresponding userns of size 258 // userNsLength in the bit-array. 259 m.used.Allocate(index) 260 m.usedBy[pod] = from 261 return nil 262 } 263 264 // Release releases the user namespace allocated to the specified pod. 265 func (m *UsernsManager) Release(podUID types.UID) { 266 if !utilfeature.DefaultFeatureGate.Enabled(features.UserNamespacesSupport) { 267 return 268 } 269 270 m.lock.Lock() 271 defer m.lock.Unlock() 272 273 m.releaseWithLock(podUID) 274 } 275 276 // podAllocated returns true if the pod is allocated, false otherwise. 277 func (m *UsernsManager) podAllocated(podUID types.UID) bool { 278 if !utilfeature.DefaultFeatureGate.Enabled(features.UserNamespacesSupport) { 279 return false 280 } 281 282 m.lock.Lock() 283 defer m.lock.Unlock() 284 285 _, ok := m.usedBy[podUID] 286 return ok 287 } 288 289 func (m *UsernsManager) releaseWithLock(pod types.UID) { 290 v, ok := m.usedBy[pod] 291 if !ok { 292 klog.V(5).InfoS("pod user namespace allocation not present", "podUID", pod) 293 return 294 } 295 delete(m.usedBy, pod) 296 297 klog.V(5).InfoS("releasing pod user namespace allocation", "podUID", pod) 298 m.removed++ 299 300 _ = os.Remove(filepath.Join(m.kl.GetPodDir(pod), mappingsFile)) 301 302 if m.removed%mapReInitializeThreshold == 0 { 303 n := make(map[types.UID]uint32) 304 for k, v := range m.usedBy { 305 n[k] = v 306 } 307 m.usedBy = n 308 m.removed = 0 309 } 310 _ = m.used.Release(int(v/userNsLength) - m.off) 311 } 312 313 func (m *UsernsManager) parseUserNsFileAndRecord(pod types.UID, content []byte) (userNs userNamespace, err error) { 314 if err = json.Unmarshal([]byte(content), &userNs); err != nil { 315 err = fmt.Errorf("invalid user namespace mappings file: %w", err) 316 return 317 } 318 319 if len(userNs.UIDMappings) != 1 { 320 err = fmt.Errorf("invalid user namespace configuration: no more than one mapping allowed.") 321 return 322 } 323 324 if len(userNs.UIDMappings) != len(userNs.GIDMappings) { 325 err = fmt.Errorf("invalid user namespace configuration: GID and UID mappings should be identical.") 326 return 327 } 328 329 if userNs.UIDMappings[0] != userNs.GIDMappings[0] { 330 err = fmt.Errorf("invalid user namespace configuration: GID and UID mapping should be identical") 331 return 332 } 333 334 // We don't produce configs without root mapped and some runtimes assume it is mapped. 335 // Validate the file has something we produced and can digest. 336 if userNs.UIDMappings[0].ContainerId != 0 { 337 err = fmt.Errorf("invalid user namespace configuration: UID 0 must be mapped") 338 return 339 } 340 341 if userNs.GIDMappings[0].ContainerId != 0 { 342 err = fmt.Errorf("invalid user namespace configuration: GID 0 must be mapped") 343 return 344 } 345 346 hostId := userNs.UIDMappings[0].HostId 347 length := userNs.UIDMappings[0].Length 348 349 err = m.record(pod, hostId, length) 350 return 351 } 352 353 func (m *UsernsManager) createUserNs(pod *v1.Pod) (userNs userNamespace, err error) { 354 firstID, length, err := m.allocateOne(pod.UID) 355 if err != nil { 356 return 357 } 358 359 defer func() { 360 if err != nil { 361 m.releaseWithLock(pod.UID) 362 } 363 }() 364 365 userNs = userNamespace{ 366 UIDMappings: []idMapping{ 367 { 368 ContainerId: 0, 369 HostId: firstID, 370 Length: length, 371 }, 372 }, 373 GIDMappings: []idMapping{ 374 { 375 ContainerId: 0, 376 HostId: firstID, 377 Length: length, 378 }, 379 }, 380 } 381 382 return userNs, m.writeMappingsToFile(pod.UID, userNs) 383 } 384 385 // GetOrCreateUserNamespaceMappings returns the configuration for the sandbox user namespace 386 func (m *UsernsManager) GetOrCreateUserNamespaceMappings(pod *v1.Pod, runtimeHandler string) (*runtimeapi.UserNamespace, error) { 387 featureEnabled := utilfeature.DefaultFeatureGate.Enabled(features.UserNamespacesSupport) 388 389 if pod == nil || pod.Spec.HostUsers == nil { 390 // if the feature is enabled, specify to use the node mode... 391 if featureEnabled { 392 return &runtimeapi.UserNamespace{ 393 Mode: runtimeapi.NamespaceMode_NODE, 394 }, nil 395 } 396 // ...otherwise don't even specify it 397 return nil, nil 398 } 399 // pod.Spec.HostUsers is set to true/false 400 if !featureEnabled { 401 return nil, fmt.Errorf("the feature gate %q is disabled: can't set spec.HostUsers", features.UserNamespacesSupport) 402 } 403 if *pod.Spec.HostUsers { 404 return &runtimeapi.UserNamespace{ 405 Mode: runtimeapi.NamespaceMode_NODE, 406 }, nil 407 } 408 409 // From here onwards, hostUsers=false and the feature gate is enabled. 410 411 // if the pod requested a user namespace and the runtime doesn't support user namespaces then return an error. 412 if handlerSupportsUserns, err := m.kl.HandlerSupportsUserNamespaces(runtimeHandler); err != nil { 413 return nil, err 414 } else if !handlerSupportsUserns { 415 return nil, fmt.Errorf("RuntimeClass handler %q does not support user namespaces", runtimeHandler) 416 } 417 418 m.lock.Lock() 419 defer m.lock.Unlock() 420 421 content, err := m.readMappingsFromFile(pod.UID) 422 if err != nil && err != utilstore.ErrKeyNotFound { 423 return nil, err 424 } 425 426 var userNs userNamespace 427 if string(content) != "" { 428 userNs, err = m.parseUserNsFileAndRecord(pod.UID, content) 429 if err != nil { 430 return nil, err 431 } 432 } else { 433 userNs, err = m.createUserNs(pod) 434 if err != nil { 435 return nil, err 436 } 437 } 438 439 var uids []*runtimeapi.IDMapping 440 var gids []*runtimeapi.IDMapping 441 442 for _, u := range userNs.UIDMappings { 443 uids = append(uids, &runtimeapi.IDMapping{ 444 HostId: u.HostId, 445 ContainerId: u.ContainerId, 446 Length: u.Length, 447 }) 448 } 449 for _, g := range userNs.GIDMappings { 450 gids = append(gids, &runtimeapi.IDMapping{ 451 HostId: g.HostId, 452 ContainerId: g.ContainerId, 453 Length: g.Length, 454 }) 455 } 456 457 return &runtimeapi.UserNamespace{ 458 Mode: runtimeapi.NamespaceMode_POD, 459 Uids: uids, 460 Gids: gids, 461 }, nil 462 } 463 464 // CleanupOrphanedPodUsernsAllocations reconciliates the state of user namespace 465 // allocations with the pods actually running. It frees any user namespace 466 // allocation for orphaned pods. 467 func (m *UsernsManager) CleanupOrphanedPodUsernsAllocations(pods []*v1.Pod, runningPods []*kubecontainer.Pod) error { 468 if !utilfeature.DefaultFeatureGate.Enabled(features.UserNamespacesSupport) { 469 return nil 470 } 471 472 m.lock.Lock() 473 defer m.lock.Unlock() 474 475 allPods := sets.NewString() 476 for _, pod := range pods { 477 allPods.Insert(string(pod.UID)) 478 } 479 for _, pod := range runningPods { 480 allPods.Insert(string(pod.ID)) 481 } 482 483 allFound := sets.NewString() 484 found, err := m.kl.ListPodsFromDisk() 485 if err != nil { 486 return err 487 } 488 489 for _, podUID := range found { 490 allFound.Insert(string(podUID)) 491 } 492 493 // Lets remove all the pods "found" that are not known. 494 for _, podUID := range found { 495 if allPods.Has(string(podUID)) { 496 continue 497 } 498 499 klog.V(5).InfoS("Clean up orphaned pod user namespace possible allocation", "podUID", podUID) 500 m.releaseWithLock(podUID) 501 } 502 503 // Lets remove any existing allocation for a pod that is not "found". 504 for podUID := range m.usedBy { 505 if allFound.Has(string(podUID)) { 506 continue 507 } 508 509 klog.V(5).InfoS("Clean up orphaned pod user namespace possible allocation", "podUID", podUID) 510 m.releaseWithLock(podUID) 511 } 512 513 return nil 514 }