k8s.io/kubernetes@v1.29.3/pkg/kubelet/userns/userns_manager.go

/*
Copyright 2022 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package userns

import (
	"encoding/json"
	"fmt"
	"math"
	"os"
	"path/filepath"
	"sync"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/sets"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/features"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	utilstore "k8s.io/kubernetes/pkg/kubelet/util/store"
	"k8s.io/kubernetes/pkg/registry/core/service/allocator"
	utilfs "k8s.io/kubernetes/pkg/util/filesystem"
)

// Length of the ID range for each user namespace to create (65536).
const userNsLength = (1 << 16)

// Limit the total number of pods using userns on this node to this value.
// This is an alpha limitation that will probably be lifted later.
const maxPods = 1024

// Create a new map when we have removed enough pods to avoid memory leaks,
// since Go maps never free memory.
const mapReInitializeThreshold = 1000

type userNsPodsManager interface {
	GetPodDir(podUID types.UID) string
	ListPodsFromDisk() ([]types.UID, error)
}

type UsernsManager struct {
	used         *allocator.AllocationBitmap
	usedBy       map[types.UID]uint32 // Map pod.UID to range used
	removed      int
	numAllocated int
	kl           userNsPodsManager
	// This protects all members except for kl.
	lock sync.Mutex
}

// userNamespace holds the configuration for the user namespace.
type userNamespace struct {
	// UID mappings for the user namespace.
	UIDMappings []idMapping `json:"uidMappings"`
	// GID mappings for the user namespace.
	GIDMappings []idMapping `json:"gidMappings"`
}

// idMapping is a single pod user namespace mapping.
type idMapping struct {
	// Required.
	HostId uint32 `json:"hostId"`
	// Required.
	ContainerId uint32 `json:"containerId"`
	// Required.
	Length uint32 `json:"length"`
}

// mappingsFile is the file where the user namespace mappings are persisted.
const mappingsFile = "userns"

// writeMappingsToFile writes the specified user namespace configuration to the pod
// directory.
func (m *UsernsManager) writeMappingsToFile(pod types.UID, userNs userNamespace) error {
	dir := m.kl.GetPodDir(pod)

	data, err := json.Marshal(userNs)
	if err != nil {
		return err
	}

	fstore, err := utilstore.NewFileStore(dir, &utilfs.DefaultFs{})
	if err != nil {
		return err
	}
	if err := fstore.Write(mappingsFile, data); err != nil {
		return err
	}

	// We need to fsync the parent dir so the file is guaranteed to be there.
	// fstore guarantees an atomic write; we need durability too.
	parentDir, err := os.Open(dir)
	if err != nil {
		return err
	}

	if err = parentDir.Sync(); err != nil {
		// Ignore the Close error here; an error is already being reported.
		parentDir.Close()
		return err
	}

	return parentDir.Close()
}
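// For reference, the JSON persisted by writeMappingsToFile has the shape
// below. This is a sketch derived from the userNamespace/idMapping json tags;
// the hostId value is illustrative (the first range a pod can get, since
// block 0 is reserved for the host), not prescriptive:
//
//	{
//	  "uidMappings": [{"hostId": 65536, "containerId": 0, "length": 65536}],
//	  "gidMappings": [{"hostId": 65536, "containerId": 0, "length": 65536}]
//	}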
// readMappingsFromFile reads the user namespace configuration from the pod directory.
func (m *UsernsManager) readMappingsFromFile(pod types.UID) ([]byte, error) {
	dir := m.kl.GetPodDir(pod)
	fstore, err := utilstore.NewFileStore(dir, &utilfs.DefaultFs{})
	if err != nil {
		return nil, err
	}
	return fstore.Read(mappingsFile)
}

func MakeUserNsManager(kl userNsPodsManager) (*UsernsManager, error) {
	m := UsernsManager{
		// Create a bitArray for all the UID space (2^32).
		// As a byproduct, no index param to the bitArray can be out of bounds (index is uint32).
		used:   allocator.NewAllocationMap((math.MaxUint32+1)/userNsLength, "user namespaces"),
		usedBy: make(map[types.UID]uint32),
		kl:     kl,
	}
	// First block is reserved for the host.
	if _, err := m.used.Allocate(0); err != nil {
		return nil, err
	}

	// Do not bother reading the list of pods if user namespaces are not enabled.
	if !utilfeature.DefaultFeatureGate.Enabled(features.UserNamespacesSupport) {
		return &m, nil
	}

	found, err := kl.ListPodsFromDisk()
	if err != nil {
		if os.IsNotExist(err) {
			return &m, nil
		}
		return nil, fmt.Errorf("user namespace manager can't read pods from disk: %w", err)
	}
	for _, podUID := range found {
		klog.V(5).InfoS("reading pod from disk for user namespace", "podUID", podUID)
		if err := m.recordPodMappings(podUID); err != nil {
			return nil, err
		}
	}

	return &m, nil
}

// recordPodMappings registers the range used for the user namespace if the
// mappings file exists in the pod directory.
func (m *UsernsManager) recordPodMappings(pod types.UID) error {
	content, err := m.readMappingsFromFile(pod)
	if err != nil && err != utilstore.ErrKeyNotFound {
		return err
	}

	// If there is no content, the pod doesn't have a userns. Nothing else to do.
	if len(content) == 0 {
		return nil
	}

	_, err = m.parseUserNsFileAndRecord(pod, content)
	return err
}

// isSet checks if the specified index is already set.
func (m *UsernsManager) isSet(v uint32) bool {
	index := int(v / userNsLength)
	return m.used.Has(index)
}

// allocateOne finds a free user namespace and allocates it to the specified pod.
// The first return value is the first ID in the user namespace, the second is
// the length of the user namespace range.
func (m *UsernsManager) allocateOne(pod types.UID) (firstID uint32, length uint32, err error) {
	if m.numAllocated >= maxPods {
		return 0, 0, fmt.Errorf("limit on count of pods with user namespaces exceeded (limit is %v, current pods with userns: %v)", maxPods, m.numAllocated)
	}
	m.numAllocated++
	defer func() {
		if err != nil {
			m.numAllocated--
		}
	}()

	firstZero, found, err := m.used.AllocateNext()
	if err != nil {
		return 0, 0, err
	}
	if !found {
		return 0, 0, fmt.Errorf("could not find an empty slot to allocate a user namespace")
	}

	klog.V(5).InfoS("new pod user namespace allocation", "podUID", pod)

	firstID = uint32(firstZero * userNsLength)
	m.usedBy[pod] = firstID
	return firstID, userNsLength, nil
}
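// Illustration of the bitmap arithmetic used by allocateOne and isSet (a
// sketch, not part of the manager's API): bit i of the allocation bitmap
// covers host IDs [i*userNsLength, (i+1)*userNsLength), so converting between
// a bit index and the first ID of its range is:
//
//	firstID := uint32(index * userNsLength) // e.g. index 2 -> 131072
//	index := int(id / userNsLength)         // e.g. id 131077 -> 2
//
// Bit 0 (host IDs 0-65535) is reserved in MakeUserNsManager, so the first pod
// allocation typically starts at 65536.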
// record stores the user namespace range [from, from+length) for the specified pod.
func (m *UsernsManager) record(pod types.UID, from, length uint32) (err error) {
	if length != userNsLength {
		return fmt.Errorf("wrong user namespace length %v", length)
	}
	if from%userNsLength != 0 {
		return fmt.Errorf("wrong user namespace offset specified %v", from)
	}
	prevFrom, found := m.usedBy[pod]
	if found && prevFrom != from {
		return fmt.Errorf("different user namespace range already used by pod %q", pod)
	}
	index := int(from / userNsLength)
	// If the pod wasn't found, verify the range is free.
	if !found && m.used.Has(index) {
		return fmt.Errorf("range picked for pod %q already taken", pod)
	}
	// The pod is already registered, nothing to do.
	if found && prevFrom == from {
		return nil
	}
	if m.numAllocated >= maxPods {
		return fmt.Errorf("limit on count of pods with user namespaces exceeded (limit is %v, current pods with userns: %v)", maxPods, m.numAllocated)
	}
	m.numAllocated++
	defer func() {
		if err != nil {
			m.numAllocated--
		}
	}()

	klog.V(5).InfoS("new pod user namespace allocation", "podUID", pod)

	// "from" is an ID (UID/GID); set the corresponding userns of size
	// userNsLength in the bit-array.
	m.used.Allocate(index)
	m.usedBy[pod] = from
	return nil
}

// Release releases the user namespace allocated to the specified pod.
func (m *UsernsManager) Release(podUID types.UID) {
	if !utilfeature.DefaultFeatureGate.Enabled(features.UserNamespacesSupport) {
		return
	}

	m.lock.Lock()
	defer m.lock.Unlock()

	m.releaseWithLock(podUID)
}

// podAllocated returns true if the pod has a user namespace allocated, false otherwise.
func (m *UsernsManager) podAllocated(podUID types.UID) bool {
	if !utilfeature.DefaultFeatureGate.Enabled(features.UserNamespacesSupport) {
		return false
	}

	m.lock.Lock()
	defer m.lock.Unlock()

	_, ok := m.usedBy[podUID]
	return ok
}

func (m *UsernsManager) releaseWithLock(pod types.UID) {
	v, ok := m.usedBy[pod]
	if !ok {
		klog.V(5).InfoS("pod user namespace allocation not present", "podUID", pod)
		return
	}
	delete(m.usedBy, pod)

	klog.V(5).InfoS("releasing pod user namespace allocation", "podUID", pod)
	m.numAllocated--
	m.removed++

	_ = os.Remove(filepath.Join(m.kl.GetPodDir(pod), mappingsFile))

	if m.removed%mapReInitializeThreshold == 0 {
		n := make(map[types.UID]uint32)
		for k, v := range m.usedBy {
			n[k] = v
		}
		m.usedBy = n
		m.removed = 0
	}
	m.used.Release(int(v / userNsLength))
}
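// Note on the map re-initialization in releaseWithLock: deleting keys from a
// Go map does not shrink its backing storage, so a map that once tracked many
// pods keeps that memory around. Copying the live entries into a fresh map
// every mapReInitializeThreshold removals bounds the growth. A minimal
// standalone sketch of the idiom (hypothetical variables):
//
//	delete(usedBy, podUID)     // backing storage is not returned to the runtime
//	fresh := make(map[types.UID]uint32, len(usedBy))
//	for k, v := range usedBy { // only live entries are copied
//		fresh[k] = v
//	}
//	usedBy = fresh             // the old backing storage can now be GC'd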
func (m *UsernsManager) parseUserNsFileAndRecord(pod types.UID, content []byte) (userNs userNamespace, err error) {
	if err = json.Unmarshal(content, &userNs); err != nil {
		err = fmt.Errorf("can't parse file: %w", err)
		return
	}

	if len(userNs.UIDMappings) != 1 {
		err = fmt.Errorf("invalid user namespace configuration: no more than one mapping allowed")
		return
	}

	if len(userNs.UIDMappings) != len(userNs.GIDMappings) {
		err = fmt.Errorf("invalid user namespace configuration: GID and UID mappings should be identical")
		return
	}

	if userNs.UIDMappings[0] != userNs.GIDMappings[0] {
		err = fmt.Errorf("invalid user namespace configuration: GID and UID mapping should be identical")
		return
	}

	// We don't produce configs without root mapped, and some runtimes assume it is mapped.
	// Validate that the file has something we produced and can digest.
	if userNs.UIDMappings[0].ContainerId != 0 {
		err = fmt.Errorf("invalid user namespace configuration: UID 0 must be mapped")
		return
	}

	if userNs.GIDMappings[0].ContainerId != 0 {
		err = fmt.Errorf("invalid user namespace configuration: GID 0 must be mapped")
		return
	}

	hostId := userNs.UIDMappings[0].HostId
	length := userNs.UIDMappings[0].Length

	err = m.record(pod, hostId, length)
	return
}

func (m *UsernsManager) createUserNs(pod *v1.Pod) (userNs userNamespace, err error) {
	firstID, length, err := m.allocateOne(pod.UID)
	if err != nil {
		return
	}

	defer func() {
		if err != nil {
			m.releaseWithLock(pod.UID)
		}
	}()

	userNs = userNamespace{
		UIDMappings: []idMapping{
			{
				ContainerId: 0,
				HostId:      firstID,
				Length:      length,
			},
		},
		GIDMappings: []idMapping{
			{
				ContainerId: 0,
				HostId:      firstID,
				Length:      length,
			},
		},
	}

	return userNs, m.writeMappingsToFile(pod.UID, userNs)
}

// GetOrCreateUserNamespaceMappings returns the configuration for the sandbox user namespace.
func (m *UsernsManager) GetOrCreateUserNamespaceMappings(pod *v1.Pod) (*runtimeapi.UserNamespace, error) {
	if !utilfeature.DefaultFeatureGate.Enabled(features.UserNamespacesSupport) {
		return nil, nil
	}

	m.lock.Lock()
	defer m.lock.Unlock()

	if pod.Spec.HostUsers == nil || *pod.Spec.HostUsers {
		return &runtimeapi.UserNamespace{
			Mode: runtimeapi.NamespaceMode_NODE,
		}, nil
	}

	content, err := m.readMappingsFromFile(pod.UID)
	if err != nil && err != utilstore.ErrKeyNotFound {
		return nil, err
	}

	var userNs userNamespace
	if len(content) != 0 {
		userNs, err = m.parseUserNsFileAndRecord(pod.UID, content)
		if err != nil {
			return nil, err
		}
	} else {
		userNs, err = m.createUserNs(pod)
		if err != nil {
			return nil, err
		}
	}

	var uids []*runtimeapi.IDMapping
	var gids []*runtimeapi.IDMapping

	for _, u := range userNs.UIDMappings {
		uids = append(uids, &runtimeapi.IDMapping{
			HostId:      u.HostId,
			ContainerId: u.ContainerId,
			Length:      u.Length,
		})
	}
	for _, g := range userNs.GIDMappings {
		gids = append(gids, &runtimeapi.IDMapping{
			HostId:      g.HostId,
			ContainerId: g.ContainerId,
			Length:      g.Length,
		})
	}

	return &runtimeapi.UserNamespace{
		Mode: runtimeapi.NamespaceMode_POD,
		Uids: uids,
		Gids: gids,
	}, nil
}
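// A minimal caller-side sketch of GetOrCreateUserNamespaceMappings (variable
// names are hypothetical; in the kubelet the result feeds the CRI sandbox
// Linux config):
//
//	userNs, err := mgr.GetOrCreateUserNamespaceMappings(pod)
//	if err != nil {
//		return err
//	}
//	// userNs is nil when the feature gate is off. Otherwise Mode is
//	// NamespaceMode_NODE for pods with hostUsers true (or unset), and
//	// NamespaceMode_POD with a single 64Ki UID/GID mapping otherwise.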
// CleanupOrphanedPodUsernsAllocations reconciles the state of user namespace
// allocations with the pods actually running. It frees any user namespace
// allocation for orphaned pods.
func (m *UsernsManager) CleanupOrphanedPodUsernsAllocations(pods []*v1.Pod, runningPods []*kubecontainer.Pod) error {
	if !utilfeature.DefaultFeatureGate.Enabled(features.UserNamespacesSupport) {
		return nil
	}

	m.lock.Lock()
	defer m.lock.Unlock()

	allPods := sets.NewString()
	for _, pod := range pods {
		allPods.Insert(string(pod.UID))
	}
	for _, pod := range runningPods {
		allPods.Insert(string(pod.ID))
	}

	allFound := sets.NewString()
	found, err := m.kl.ListPodsFromDisk()
	if err != nil {
		return err
	}

	for _, podUID := range found {
		allFound.Insert(string(podUID))
	}

	// Let's remove all the pods "found" on disk that are not known to the kubelet.
	for _, podUID := range found {
		if allPods.Has(string(podUID)) {
			continue
		}

		klog.V(5).InfoS("Clean up orphaned pod user namespace possible allocation", "podUID", podUID)
		m.releaseWithLock(podUID)
	}

	// Let's remove any existing allocation for a pod that is not "found" on disk.
	for podUID := range m.usedBy {
		if allFound.Has(string(podUID)) {
			continue
		}

		klog.V(5).InfoS("Clean up orphaned pod user namespace possible allocation", "podUID", podUID)
		m.releaseWithLock(podUID)
	}

	return nil
}
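// A minimal end-to-end lifecycle sketch tying the pieces together
// (podsManager is a hypothetical userNsPodsManager implementation; error
// handling elided):
//
//	m, _ := MakeUserNsManager(podsManager)                // reloads mappings from disk
//	userNs, _ := m.GetOrCreateUserNamespaceMappings(pod)  // allocate + persist
//	_ = userNs                                            // passed to the CRI runtime
//	m.Release(pod.UID)                                    // free range, delete userns file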