k8s.io/kubernetes@v1.29.3/pkg/kubelet/userns/userns_manager.go (about)

     1  /*
     2  Copyright 2022 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package userns
    18  
    19  import (
    20  	"encoding/json"
    21  	"fmt"
    22  	"math"
    23  	"os"
    24  	"path/filepath"
    25  	"sync"
    26  
    27  	v1 "k8s.io/api/core/v1"
    28  	"k8s.io/apimachinery/pkg/types"
    29  	"k8s.io/apimachinery/pkg/util/sets"
    30  	utilfeature "k8s.io/apiserver/pkg/util/feature"
    31  	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
    32  	"k8s.io/klog/v2"
    33  	"k8s.io/kubernetes/pkg/features"
    34  	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
    35  	utilstore "k8s.io/kubernetes/pkg/kubelet/util/store"
    36  	"k8s.io/kubernetes/pkg/registry/core/service/allocator"
    37  	utilfs "k8s.io/kubernetes/pkg/util/filesystem"
    38  )
    39  
    40  // length for the user namespace to create (65536).
    41  const userNsLength = (1 << 16)
    42  
    43  // Limit the total number of pods using userns in this node to this value.
    44  // This is an alpha limitation that will probably be lifted later.
    45  const maxPods = 1024
    46  
    47  // Create a new map when we removed enough pods to avoid memory leaks
    48  // since Go maps never free memory.
    49  const mapReInitializeThreshold = 1000
    50  
// userNsPodsManager is the dependency the user namespace manager uses to
// locate pod directories and to enumerate the pods present on disk.
type userNsPodsManager interface {
	// GetPodDir returns the directory of the pod with the given UID.
	GetPodDir(podUID types.UID) string
	// ListPodsFromDisk returns the UIDs of the pods found on disk.
	ListPodsFromDisk() ([]types.UID, error)
}
    55  
// UsernsManager keeps track of the host ID ranges assigned to pods that use
// a user namespace.
//
// Fields:
//   - used: bitmap of taken ranges, indexed by host ID divided by userNsLength.
//   - usedBy: first host ID of the range assigned to each pod.
//   - removed: number of releases since usedBy was last re-created.
//   - numAllocated: number of pods that currently hold a range.
//   - kl: provider of pod directories and of the list of pods on disk.
type UsernsManager struct {
	used         *allocator.AllocationBitmap
	usedBy       map[types.UID]uint32 // Map pod.UID to range used
	removed      int
	numAllocated int
	kl           userNsPodsManager
	// lock protects all members except for kl.
	lock sync.Mutex
}
    65  
// userNamespace holds the configuration for the user namespace of a pod.
// It is the structure serialized as JSON into the pod's mappings file.
type userNamespace struct {
	// UIDs mappings for the user namespace.
	UIDMappings []idMapping `json:"uidMappings"`
	// GIDs mappings for the user namespace.
	GIDMappings []idMapping `json:"gidMappings"`
}
    73  
// idMapping is a single pod user namespace mapping: a contiguous range of
// IDs inside the container mapped to a contiguous range of IDs on the host.
type idMapping struct {
	// HostId is the first ID of the range on the host. Required.
	HostId uint32 `json:"hostId"`
	// ContainerId is the first ID of the range inside the container. Required.
	ContainerId uint32 `json:"containerId"`
	// Length is the number of IDs in the range. Required.
	Length uint32 `json:"length"`
}
    83  
// mappingsFile is the name of the file, inside the pod directory, where the
// user namespace mappings are persisted.
const mappingsFile = "userns"
    86  
    87  // writeMappingsToFile writes the specified user namespace configuration to the pod
    88  // directory.
    89  func (m *UsernsManager) writeMappingsToFile(pod types.UID, userNs userNamespace) error {
    90  	dir := m.kl.GetPodDir(pod)
    91  
    92  	data, err := json.Marshal(userNs)
    93  	if err != nil {
    94  		return err
    95  	}
    96  
    97  	fstore, err := utilstore.NewFileStore(dir, &utilfs.DefaultFs{})
    98  	if err != nil {
    99  		return err
   100  	}
   101  	if err := fstore.Write(mappingsFile, data); err != nil {
   102  		return err
   103  	}
   104  
   105  	// We need to fsync the parent dir so the file is guaranteed to be there.
   106  	// fstore guarantees an atomic write, we need durability too.
   107  	parentDir, err := os.Open(dir)
   108  	if err != nil {
   109  		return err
   110  	}
   111  
   112  	if err = parentDir.Sync(); err != nil {
   113  		// Ignore return here, there is already an error reported.
   114  		parentDir.Close()
   115  		return err
   116  	}
   117  
   118  	return parentDir.Close()
   119  }
   120  
   121  // readMappingsFromFile reads the user namespace configuration from the pod directory.
   122  func (m *UsernsManager) readMappingsFromFile(pod types.UID) ([]byte, error) {
   123  	dir := m.kl.GetPodDir(pod)
   124  	fstore, err := utilstore.NewFileStore(dir, &utilfs.DefaultFs{})
   125  	if err != nil {
   126  		return nil, err
   127  	}
   128  	return fstore.Read(mappingsFile)
   129  }
   130  
   131  func MakeUserNsManager(kl userNsPodsManager) (*UsernsManager, error) {
   132  	m := UsernsManager{
   133  		// Create a bitArray for all the UID space (2^32).
   134  		// As a by product of that, no index param to bitArray can be out of bounds (index is uint32).
   135  		used:   allocator.NewAllocationMap((math.MaxUint32+1)/userNsLength, "user namespaces"),
   136  		usedBy: make(map[types.UID]uint32),
   137  		kl:     kl,
   138  	}
   139  	// First block is reserved for the host.
   140  	if _, err := m.used.Allocate(0); err != nil {
   141  		return nil, err
   142  	}
   143  
   144  	// do not bother reading the list of pods if user namespaces are not enabled.
   145  	if !utilfeature.DefaultFeatureGate.Enabled(features.UserNamespacesSupport) {
   146  		return &m, nil
   147  	}
   148  
   149  	found, err := kl.ListPodsFromDisk()
   150  	if err != nil {
   151  		if os.IsNotExist(err) {
   152  			return &m, nil
   153  		}
   154  		return nil, fmt.Errorf("user namespace manager can't read pods from disk: %w", err)
   155  
   156  	}
   157  	for _, podUID := range found {
   158  		klog.V(5).InfoS("reading pod from disk for user namespace", "podUID", podUID)
   159  		if err := m.recordPodMappings(podUID); err != nil {
   160  			return nil, err
   161  		}
   162  	}
   163  
   164  	return &m, nil
   165  }
   166  
   167  // recordPodMappings registers the range used for the user namespace if the
   168  // usernsConfFile exists in the pod directory.
   169  func (m *UsernsManager) recordPodMappings(pod types.UID) error {
   170  	content, err := m.readMappingsFromFile(pod)
   171  	if err != nil && err != utilstore.ErrKeyNotFound {
   172  		return err
   173  	}
   174  
   175  	// If no content, it means the pod doesn't have userns. Nothing else to do
   176  	if len(content) == 0 {
   177  		return nil
   178  	}
   179  
   180  	_, err = m.parseUserNsFileAndRecord(pod, content)
   181  	return err
   182  }
   183  
   184  // isSet checks if the specified index is already set.
   185  func (m *UsernsManager) isSet(v uint32) bool {
   186  	index := int(v / userNsLength)
   187  	return m.used.Has(index)
   188  }
   189  
   190  // allocateOne finds a free user namespace and allocate it to the specified pod.
   191  // The first return value is the first ID in the user namespace, the second returns
   192  // the length for the user namespace range.
   193  func (m *UsernsManager) allocateOne(pod types.UID) (firstID uint32, length uint32, err error) {
   194  	if m.numAllocated >= maxPods {
   195  		return 0, 0, fmt.Errorf("limit on count of pods with user namespaces exceeded (limit is %v, current pods with userns: %v)", maxPods, m.numAllocated)
   196  	}
   197  	m.numAllocated++
   198  	defer func() {
   199  		if err != nil {
   200  			m.numAllocated--
   201  		}
   202  	}()
   203  
   204  	firstZero, found, err := m.used.AllocateNext()
   205  	if err != nil {
   206  		return 0, 0, err
   207  	}
   208  	if !found {
   209  		return 0, 0, fmt.Errorf("could not find an empty slot to allocate a user namespace")
   210  	}
   211  
   212  	klog.V(5).InfoS("new pod user namespace allocation", "podUID", pod)
   213  
   214  	firstID = uint32(firstZero * userNsLength)
   215  	m.usedBy[pod] = firstID
   216  	return firstID, userNsLength, nil
   217  }
   218  
   219  // record stores the user namespace [from; from+length] to the specified pod.
   220  func (m *UsernsManager) record(pod types.UID, from, length uint32) (err error) {
   221  	if length != userNsLength {
   222  		return fmt.Errorf("wrong user namespace length %v", length)
   223  	}
   224  	if from%userNsLength != 0 {
   225  		return fmt.Errorf("wrong user namespace offset specified %v", from)
   226  	}
   227  	prevFrom, found := m.usedBy[pod]
   228  	if found && prevFrom != from {
   229  		return fmt.Errorf("different user namespace range already used by pod %q", pod)
   230  	}
   231  	index := int(from / userNsLength)
   232  	// if the pod wasn't found then verify the range is free.
   233  	if !found && m.used.Has(index) {
   234  		return fmt.Errorf("range picked for pod %q already taken", pod)
   235  	}
   236  	// The pod is already registered, nothing to do.
   237  	if found && prevFrom == from {
   238  		return nil
   239  	}
   240  	if m.numAllocated >= maxPods {
   241  		return fmt.Errorf("limit on count of pods with user namespaces exceeded (limit is %v, current pods with userns: %v)", maxPods, m.numAllocated)
   242  	}
   243  	m.numAllocated++
   244  	defer func() {
   245  		if err != nil {
   246  			m.numAllocated--
   247  		}
   248  	}()
   249  
   250  	klog.V(5).InfoS("new pod user namespace allocation", "podUID", pod)
   251  
   252  	// "from" is a ID (UID/GID), set the corresponding userns of size
   253  	// userNsLength in the bit-array.
   254  	m.used.Allocate(index)
   255  	m.usedBy[pod] = from
   256  	return nil
   257  }
   258  
   259  // Release releases the user namespace allocated to the specified pod.
   260  func (m *UsernsManager) Release(podUID types.UID) {
   261  	if !utilfeature.DefaultFeatureGate.Enabled(features.UserNamespacesSupport) {
   262  		return
   263  	}
   264  
   265  	m.lock.Lock()
   266  	defer m.lock.Unlock()
   267  
   268  	m.releaseWithLock(podUID)
   269  }
   270  
   271  // podAllocated returns true if the pod is allocated, false otherwise.
   272  func (m *UsernsManager) podAllocated(podUID types.UID) bool {
   273  	if !utilfeature.DefaultFeatureGate.Enabled(features.UserNamespacesSupport) {
   274  		return false
   275  	}
   276  
   277  	m.lock.Lock()
   278  	defer m.lock.Unlock()
   279  
   280  	_, ok := m.usedBy[podUID]
   281  	return ok
   282  }
   283  
   284  func (m *UsernsManager) releaseWithLock(pod types.UID) {
   285  	v, ok := m.usedBy[pod]
   286  	if !ok {
   287  		klog.V(5).InfoS("pod user namespace allocation not present", "podUID", pod)
   288  		return
   289  	}
   290  	delete(m.usedBy, pod)
   291  
   292  	klog.V(5).InfoS("releasing pod user namespace allocation", "podUID", pod)
   293  	m.numAllocated--
   294  	m.removed++
   295  
   296  	_ = os.Remove(filepath.Join(m.kl.GetPodDir(pod), mappingsFile))
   297  
   298  	if m.removed%mapReInitializeThreshold == 0 {
   299  		n := make(map[types.UID]uint32)
   300  		for k, v := range m.usedBy {
   301  			n[k] = v
   302  		}
   303  		m.usedBy = n
   304  		m.removed = 0
   305  	}
   306  	m.used.Release(int(v / userNsLength))
   307  }
   308  
   309  func (m *UsernsManager) parseUserNsFileAndRecord(pod types.UID, content []byte) (userNs userNamespace, err error) {
   310  	if err = json.Unmarshal([]byte(content), &userNs); err != nil {
   311  		err = fmt.Errorf("can't parse file: %w", err)
   312  		return
   313  	}
   314  
   315  	if len(userNs.UIDMappings) != 1 {
   316  		err = fmt.Errorf("invalid user namespace configuration: no more than one mapping allowed.")
   317  		return
   318  	}
   319  
   320  	if len(userNs.UIDMappings) != len(userNs.GIDMappings) {
   321  		err = fmt.Errorf("invalid user namespace configuration: GID and UID mappings should be identical.")
   322  		return
   323  	}
   324  
   325  	if userNs.UIDMappings[0] != userNs.GIDMappings[0] {
   326  		err = fmt.Errorf("invalid user namespace configuration: GID and UID mapping should be identical")
   327  		return
   328  	}
   329  
   330  	// We don't produce configs without root mapped and some runtimes assume it is mapped.
   331  	// Validate the file has something we produced and can digest.
   332  	if userNs.UIDMappings[0].ContainerId != 0 {
   333  		err = fmt.Errorf("invalid user namespace configuration: UID 0 must be mapped")
   334  		return
   335  	}
   336  
   337  	if userNs.GIDMappings[0].ContainerId != 0 {
   338  		err = fmt.Errorf("invalid user namespace configuration: GID 0 must be mapped")
   339  		return
   340  	}
   341  
   342  	hostId := userNs.UIDMappings[0].HostId
   343  	length := userNs.UIDMappings[0].Length
   344  
   345  	err = m.record(pod, hostId, length)
   346  	return
   347  }
   348  
   349  func (m *UsernsManager) createUserNs(pod *v1.Pod) (userNs userNamespace, err error) {
   350  	firstID, length, err := m.allocateOne(pod.UID)
   351  	if err != nil {
   352  		return
   353  	}
   354  
   355  	defer func() {
   356  		if err != nil {
   357  			m.releaseWithLock(pod.UID)
   358  		}
   359  	}()
   360  
   361  	userNs = userNamespace{
   362  		UIDMappings: []idMapping{
   363  			{
   364  				ContainerId: 0,
   365  				HostId:      firstID,
   366  				Length:      length,
   367  			},
   368  		},
   369  		GIDMappings: []idMapping{
   370  			{
   371  				ContainerId: 0,
   372  				HostId:      firstID,
   373  				Length:      length,
   374  			},
   375  		},
   376  	}
   377  
   378  	return userNs, m.writeMappingsToFile(pod.UID, userNs)
   379  }
   380  
   381  // GetOrCreateUserNamespaceMappings returns the configuration for the sandbox user namespace
   382  func (m *UsernsManager) GetOrCreateUserNamespaceMappings(pod *v1.Pod) (*runtimeapi.UserNamespace, error) {
   383  	if !utilfeature.DefaultFeatureGate.Enabled(features.UserNamespacesSupport) {
   384  		return nil, nil
   385  	}
   386  
   387  	m.lock.Lock()
   388  	defer m.lock.Unlock()
   389  
   390  	if pod.Spec.HostUsers == nil || *pod.Spec.HostUsers {
   391  		return &runtimeapi.UserNamespace{
   392  			Mode: runtimeapi.NamespaceMode_NODE,
   393  		}, nil
   394  	}
   395  
   396  	content, err := m.readMappingsFromFile(pod.UID)
   397  	if err != nil && err != utilstore.ErrKeyNotFound {
   398  		return nil, err
   399  	}
   400  
   401  	var userNs userNamespace
   402  	if string(content) != "" {
   403  		userNs, err = m.parseUserNsFileAndRecord(pod.UID, content)
   404  		if err != nil {
   405  			return nil, err
   406  		}
   407  	} else {
   408  		userNs, err = m.createUserNs(pod)
   409  		if err != nil {
   410  			return nil, err
   411  		}
   412  	}
   413  
   414  	var uids []*runtimeapi.IDMapping
   415  	var gids []*runtimeapi.IDMapping
   416  
   417  	for _, u := range userNs.UIDMappings {
   418  		uids = append(uids, &runtimeapi.IDMapping{
   419  			HostId:      u.HostId,
   420  			ContainerId: u.ContainerId,
   421  			Length:      u.Length,
   422  		})
   423  	}
   424  	for _, g := range userNs.GIDMappings {
   425  		gids = append(gids, &runtimeapi.IDMapping{
   426  			HostId:      g.HostId,
   427  			ContainerId: g.ContainerId,
   428  			Length:      g.Length,
   429  		})
   430  	}
   431  
   432  	return &runtimeapi.UserNamespace{
   433  		Mode: runtimeapi.NamespaceMode_POD,
   434  		Uids: uids,
   435  		Gids: gids,
   436  	}, nil
   437  }
   438  
   439  // CleanupOrphanedPodUsernsAllocations reconciliates the state of user namespace
   440  // allocations with the pods actually running. It frees any user namespace
   441  // allocation for orphaned pods.
   442  func (m *UsernsManager) CleanupOrphanedPodUsernsAllocations(pods []*v1.Pod, runningPods []*kubecontainer.Pod) error {
   443  	if !utilfeature.DefaultFeatureGate.Enabled(features.UserNamespacesSupport) {
   444  		return nil
   445  	}
   446  
   447  	m.lock.Lock()
   448  	defer m.lock.Unlock()
   449  
   450  	allPods := sets.NewString()
   451  	for _, pod := range pods {
   452  		allPods.Insert(string(pod.UID))
   453  	}
   454  	for _, pod := range runningPods {
   455  		allPods.Insert(string(pod.ID))
   456  	}
   457  
   458  	allFound := sets.NewString()
   459  	found, err := m.kl.ListPodsFromDisk()
   460  	if err != nil {
   461  		return err
   462  	}
   463  
   464  	for _, podUID := range found {
   465  		allFound.Insert(string(podUID))
   466  	}
   467  
   468  	// Lets remove all the pods "found" that are not known.
   469  	for _, podUID := range found {
   470  		if allPods.Has(string(podUID)) {
   471  			continue
   472  		}
   473  
   474  		klog.V(5).InfoS("Clean up orphaned pod user namespace possible allocation", "podUID", podUID)
   475  		m.releaseWithLock(podUID)
   476  	}
   477  
   478  	// Lets remove any existing allocation for a pod that is not "found".
   479  	for podUID := range m.usedBy {
   480  		if allFound.Has(string(podUID)) {
   481  			continue
   482  		}
   483  
   484  		klog.V(5).InfoS("Clean up orphaned pod user namespace possible allocation", "podUID", podUID)
   485  		m.releaseWithLock(podUID)
   486  	}
   487  
   488  	return nil
   489  }