k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/kubelet/userns/userns_manager.go (about)

     1  /*
     2  Copyright 2022 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package userns
    18  
import (
	"encoding/json"
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"sync"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/sets"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/features"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	utilstore "k8s.io/kubernetes/pkg/kubelet/util/store"
	"k8s.io/kubernetes/pkg/registry/core/service/allocator"
	utilfs "k8s.io/kubernetes/pkg/util/filesystem"
)
    38  
// userNsLength is the number of IDs in each pod user namespace range (65536).
// Every range handed out by the manager has exactly this length and starts at
// a multiple of it.
const userNsLength = (1 << 16)
    41  
// mapReInitializeThreshold is the number of released pods after which the
// usedBy map is copied into a freshly created map. Go maps never free memory,
// so the map is recreated once enough pods were removed to avoid leaking it.
const mapReInitializeThreshold = 1000
    45  
// userNsPodsManager groups the operations the user namespace manager needs
// from its owner (stored in the kl field of UsernsManager).
type userNsPodsManager interface {
	// HandlerSupportsUserNamespaces reports whether the given runtime handler
	// supports user namespaces.
	HandlerSupportsUserNamespaces(runtimeHandler string) (bool, error)
	// GetPodDir returns the directory of the pod with the given UID. The
	// mappings file of the pod is stored inside it.
	GetPodDir(podUID types.UID) string
	// ListPodsFromDisk returns the UIDs of the pods found on disk.
	ListPodsFromDisk() ([]types.UID, error)
	// GetKubeletMappings returns the first host ID and the number of host IDs
	// that can be used for pod user namespaces, in that order.
	GetKubeletMappings() (uint32, uint32, error)
	// GetMaxPods returns the number of pods the ID range must be able to
	// accommodate; the manager refuses to start with a smaller range.
	GetMaxPods() int
}
    53  
// UsernsManager hands out host ID ranges of userNsLength IDs to pods that use
// user namespaces and keeps track of which pod owns which range.
type UsernsManager struct {
	// used has one slot per range of userNsLength IDs; a set slot means the
	// range is allocated.
	used *allocator.AllocationBitmap
	// usedBy maps a pod UID to the first host ID of the range it uses.
	usedBy map[types.UID]uint32 // Map pod.UID to range used
	// removed counts released pods since usedBy was last re-created.
	removed int

	// off is the index (in units of userNsLength) of the first managed range.
	off int
	// len is the number of managed ranges.
	len int

	kl userNsPodsManager
	// This protects all members except for kl.
	lock sync.Mutex
}
    66  
// userNamespace holds the configuration for the user namespace of a pod. It
// is the structure serialized as JSON into the mappings file.
type userNamespace struct {
	// UIDs mappings for the user namespace.
	UIDMappings []idMapping `json:"uidMappings"`
	// GIDs mappings for the user namespace.
	GIDMappings []idMapping `json:"gidMappings"`
}
    74  
// idMapping is a single pod user namespace mapping: Length consecutive IDs
// starting at ContainerId inside the namespace correspond to the IDs starting
// at HostId on the host.
type idMapping struct {
	// Required. First ID of the range on the host.
	HostId uint32 `json:"hostId"`
	// Required. First ID of the range inside the user namespace.
	ContainerId uint32 `json:"containerId"`
	// Required. Number of IDs in the range.
	Length uint32 `json:"length"`
}
    84  
// mappingsFile is the name of the file, inside the pod directory, where the
// user namespace mappings are persisted.
const mappingsFile = "userns"
    87  
    88  // writeMappingsToFile writes the specified user namespace configuration to the pod
    89  // directory.
    90  func (m *UsernsManager) writeMappingsToFile(pod types.UID, userNs userNamespace) error {
    91  	dir := m.kl.GetPodDir(pod)
    92  
    93  	data, err := json.Marshal(userNs)
    94  	if err != nil {
    95  		return err
    96  	}
    97  
    98  	fstore, err := utilstore.NewFileStore(dir, &utilfs.DefaultFs{})
    99  	if err != nil {
   100  		return fmt.Errorf("create user namespace store: %w", err)
   101  	}
   102  	if err := fstore.Write(mappingsFile, data); err != nil {
   103  		return err
   104  	}
   105  
   106  	// We need to fsync the parent dir so the file is guaranteed to be there.
   107  	// fstore guarantees an atomic write, we need durability too.
   108  	parentDir, err := os.Open(dir)
   109  	if err != nil {
   110  		return err
   111  	}
   112  
   113  	if err = parentDir.Sync(); err != nil {
   114  		// Ignore return here, there is already an error reported.
   115  		parentDir.Close()
   116  		return err
   117  	}
   118  
   119  	return parentDir.Close()
   120  }
   121  
   122  // readMappingsFromFile reads the user namespace configuration from the pod directory.
   123  func (m *UsernsManager) readMappingsFromFile(pod types.UID) ([]byte, error) {
   124  	dir := m.kl.GetPodDir(pod)
   125  	fstore, err := utilstore.NewFileStore(dir, &utilfs.DefaultFs{})
   126  	if err != nil {
   127  		return nil, fmt.Errorf("create user namespace store: %w", err)
   128  	}
   129  	return fstore.Read(mappingsFile)
   130  }
   131  
   132  func MakeUserNsManager(kl userNsPodsManager) (*UsernsManager, error) {
   133  	kubeletMappingID, kubeletMappingLen, err := kl.GetKubeletMappings()
   134  	if err != nil {
   135  		return nil, err
   136  	}
   137  
   138  	if kubeletMappingID%userNsLength != 0 {
   139  		return nil, fmt.Errorf("kubelet user assigned ID %v is not a multiple of %v", kubeletMappingID, userNsLength)
   140  	}
   141  	if kubeletMappingID < userNsLength {
   142  		// We don't allow to map 0, as security is circumvented.
   143  		return nil, fmt.Errorf("kubelet user assigned ID %v must be greater or equal to %v", kubeletMappingID, userNsLength)
   144  	}
   145  	if kubeletMappingLen%userNsLength != 0 {
   146  		return nil, fmt.Errorf("kubelet user assigned IDs length %v is not a multiple of %v", kubeletMappingLen, userNsLength)
   147  	}
   148  	if kubeletMappingLen/userNsLength < uint32(kl.GetMaxPods()) {
   149  		return nil, fmt.Errorf("kubelet user assigned IDs are not enough to support %v pods", kl.GetMaxPods())
   150  	}
   151  	off := int(kubeletMappingID / userNsLength)
   152  	len := int(kubeletMappingLen / userNsLength)
   153  
   154  	m := UsernsManager{
   155  		used:   allocator.NewAllocationMap(len, "user namespaces"),
   156  		usedBy: make(map[types.UID]uint32),
   157  		kl:     kl,
   158  		off:    off,
   159  		len:    len,
   160  	}
   161  
   162  	// do not bother reading the list of pods if user namespaces are not enabled.
   163  	if !utilfeature.DefaultFeatureGate.Enabled(features.UserNamespacesSupport) {
   164  		return &m, nil
   165  	}
   166  
   167  	found, err := kl.ListPodsFromDisk()
   168  	if err != nil {
   169  		if os.IsNotExist(err) {
   170  			return &m, nil
   171  		}
   172  		return nil, fmt.Errorf("read pods from disk: %w", err)
   173  
   174  	}
   175  	for _, podUID := range found {
   176  		klog.V(5).InfoS("reading pod from disk for user namespace", "podUID", podUID)
   177  		if err := m.recordPodMappings(podUID); err != nil {
   178  			return nil, fmt.Errorf("record pod mappings: %w", err)
   179  		}
   180  	}
   181  
   182  	return &m, nil
   183  }
   184  
   185  // recordPodMappings registers the range used for the user namespace if the
   186  // usernsConfFile exists in the pod directory.
   187  func (m *UsernsManager) recordPodMappings(pod types.UID) error {
   188  	content, err := m.readMappingsFromFile(pod)
   189  	if err != nil && err != utilstore.ErrKeyNotFound {
   190  		return err
   191  	}
   192  
   193  	// If no content, it means the pod doesn't have userns. Nothing else to do
   194  	if len(content) == 0 {
   195  		return nil
   196  	}
   197  
   198  	_, err = m.parseUserNsFileAndRecord(pod, content)
   199  	return err
   200  }
   201  
   202  // isSet checks if the specified index is already set.
   203  func (m *UsernsManager) isSet(v uint32) bool {
   204  	index := int(v/userNsLength) - m.off
   205  	if index < 0 || index >= m.len {
   206  		return true
   207  	}
   208  	return m.used.Has(index)
   209  }
   210  
   211  // allocateOne finds a free user namespace and allocate it to the specified pod.
   212  // The first return value is the first ID in the user namespace, the second returns
   213  // the length for the user namespace range.
   214  func (m *UsernsManager) allocateOne(pod types.UID) (firstID uint32, length uint32, err error) {
   215  	firstZero, found, err := m.used.AllocateNext()
   216  	if err != nil {
   217  		return 0, 0, err
   218  	}
   219  	if !found {
   220  		return 0, 0, fmt.Errorf("could not find an empty slot to allocate a user namespace")
   221  	}
   222  
   223  	klog.V(5).InfoS("new pod user namespace allocation", "podUID", pod)
   224  
   225  	firstID = uint32((firstZero + m.off) * userNsLength)
   226  	m.usedBy[pod] = firstID
   227  	return firstID, userNsLength, nil
   228  }
   229  
   230  // record stores the user namespace [from; from+length] to the specified pod.
   231  func (m *UsernsManager) record(pod types.UID, from, length uint32) (err error) {
   232  	if length != userNsLength {
   233  		return fmt.Errorf("wrong user namespace length %v", length)
   234  	}
   235  	if from%userNsLength != 0 {
   236  		return fmt.Errorf("wrong user namespace offset specified %v", from)
   237  	}
   238  	prevFrom, found := m.usedBy[pod]
   239  	if found && prevFrom != from {
   240  		return fmt.Errorf("different user namespace range already used by pod %q", pod)
   241  	}
   242  	index := int(from/userNsLength) - m.off
   243  	if index < 0 || index >= m.len {
   244  		return fmt.Errorf("id %v is out of range", from)
   245  	}
   246  	// if the pod wasn't found then verify the range is free.
   247  	if !found && m.used.Has(index) {
   248  		return fmt.Errorf("range picked for pod %q already taken", pod)
   249  	}
   250  	// The pod is already registered, nothing to do.
   251  	if found && prevFrom == from {
   252  		return nil
   253  	}
   254  
   255  	klog.V(5).InfoS("new pod user namespace allocation", "podUID", pod)
   256  
   257  	// "from" is a ID (UID/GID), set the corresponding userns of size
   258  	// userNsLength in the bit-array.
   259  	m.used.Allocate(index)
   260  	m.usedBy[pod] = from
   261  	return nil
   262  }
   263  
   264  // Release releases the user namespace allocated to the specified pod.
   265  func (m *UsernsManager) Release(podUID types.UID) {
   266  	if !utilfeature.DefaultFeatureGate.Enabled(features.UserNamespacesSupport) {
   267  		return
   268  	}
   269  
   270  	m.lock.Lock()
   271  	defer m.lock.Unlock()
   272  
   273  	m.releaseWithLock(podUID)
   274  }
   275  
   276  // podAllocated returns true if the pod is allocated, false otherwise.
   277  func (m *UsernsManager) podAllocated(podUID types.UID) bool {
   278  	if !utilfeature.DefaultFeatureGate.Enabled(features.UserNamespacesSupport) {
   279  		return false
   280  	}
   281  
   282  	m.lock.Lock()
   283  	defer m.lock.Unlock()
   284  
   285  	_, ok := m.usedBy[podUID]
   286  	return ok
   287  }
   288  
   289  func (m *UsernsManager) releaseWithLock(pod types.UID) {
   290  	v, ok := m.usedBy[pod]
   291  	if !ok {
   292  		klog.V(5).InfoS("pod user namespace allocation not present", "podUID", pod)
   293  		return
   294  	}
   295  	delete(m.usedBy, pod)
   296  
   297  	klog.V(5).InfoS("releasing pod user namespace allocation", "podUID", pod)
   298  	m.removed++
   299  
   300  	_ = os.Remove(filepath.Join(m.kl.GetPodDir(pod), mappingsFile))
   301  
   302  	if m.removed%mapReInitializeThreshold == 0 {
   303  		n := make(map[types.UID]uint32)
   304  		for k, v := range m.usedBy {
   305  			n[k] = v
   306  		}
   307  		m.usedBy = n
   308  		m.removed = 0
   309  	}
   310  	_ = m.used.Release(int(v/userNsLength) - m.off)
   311  }
   312  
   313  func (m *UsernsManager) parseUserNsFileAndRecord(pod types.UID, content []byte) (userNs userNamespace, err error) {
   314  	if err = json.Unmarshal([]byte(content), &userNs); err != nil {
   315  		err = fmt.Errorf("invalid user namespace mappings file: %w", err)
   316  		return
   317  	}
   318  
   319  	if len(userNs.UIDMappings) != 1 {
   320  		err = fmt.Errorf("invalid user namespace configuration: no more than one mapping allowed.")
   321  		return
   322  	}
   323  
   324  	if len(userNs.UIDMappings) != len(userNs.GIDMappings) {
   325  		err = fmt.Errorf("invalid user namespace configuration: GID and UID mappings should be identical.")
   326  		return
   327  	}
   328  
   329  	if userNs.UIDMappings[0] != userNs.GIDMappings[0] {
   330  		err = fmt.Errorf("invalid user namespace configuration: GID and UID mapping should be identical")
   331  		return
   332  	}
   333  
   334  	// We don't produce configs without root mapped and some runtimes assume it is mapped.
   335  	// Validate the file has something we produced and can digest.
   336  	if userNs.UIDMappings[0].ContainerId != 0 {
   337  		err = fmt.Errorf("invalid user namespace configuration: UID 0 must be mapped")
   338  		return
   339  	}
   340  
   341  	if userNs.GIDMappings[0].ContainerId != 0 {
   342  		err = fmt.Errorf("invalid user namespace configuration: GID 0 must be mapped")
   343  		return
   344  	}
   345  
   346  	hostId := userNs.UIDMappings[0].HostId
   347  	length := userNs.UIDMappings[0].Length
   348  
   349  	err = m.record(pod, hostId, length)
   350  	return
   351  }
   352  
   353  func (m *UsernsManager) createUserNs(pod *v1.Pod) (userNs userNamespace, err error) {
   354  	firstID, length, err := m.allocateOne(pod.UID)
   355  	if err != nil {
   356  		return
   357  	}
   358  
   359  	defer func() {
   360  		if err != nil {
   361  			m.releaseWithLock(pod.UID)
   362  		}
   363  	}()
   364  
   365  	userNs = userNamespace{
   366  		UIDMappings: []idMapping{
   367  			{
   368  				ContainerId: 0,
   369  				HostId:      firstID,
   370  				Length:      length,
   371  			},
   372  		},
   373  		GIDMappings: []idMapping{
   374  			{
   375  				ContainerId: 0,
   376  				HostId:      firstID,
   377  				Length:      length,
   378  			},
   379  		},
   380  	}
   381  
   382  	return userNs, m.writeMappingsToFile(pod.UID, userNs)
   383  }
   384  
   385  // GetOrCreateUserNamespaceMappings returns the configuration for the sandbox user namespace
   386  func (m *UsernsManager) GetOrCreateUserNamespaceMappings(pod *v1.Pod, runtimeHandler string) (*runtimeapi.UserNamespace, error) {
   387  	featureEnabled := utilfeature.DefaultFeatureGate.Enabled(features.UserNamespacesSupport)
   388  
   389  	if pod == nil || pod.Spec.HostUsers == nil {
   390  		// if the feature is enabled, specify to use the node mode...
   391  		if featureEnabled {
   392  			return &runtimeapi.UserNamespace{
   393  				Mode: runtimeapi.NamespaceMode_NODE,
   394  			}, nil
   395  		}
   396  		// ...otherwise don't even specify it
   397  		return nil, nil
   398  	}
   399  	// pod.Spec.HostUsers is set to true/false
   400  	if !featureEnabled {
   401  		return nil, fmt.Errorf("the feature gate %q is disabled: can't set spec.HostUsers", features.UserNamespacesSupport)
   402  	}
   403  	if *pod.Spec.HostUsers {
   404  		return &runtimeapi.UserNamespace{
   405  			Mode: runtimeapi.NamespaceMode_NODE,
   406  		}, nil
   407  	}
   408  
   409  	// From here onwards, hostUsers=false and the feature gate is enabled.
   410  
   411  	// if the pod requested a user namespace and the runtime doesn't support user namespaces then return an error.
   412  	if handlerSupportsUserns, err := m.kl.HandlerSupportsUserNamespaces(runtimeHandler); err != nil {
   413  		return nil, err
   414  	} else if !handlerSupportsUserns {
   415  		return nil, fmt.Errorf("RuntimeClass handler %q does not support user namespaces", runtimeHandler)
   416  	}
   417  
   418  	m.lock.Lock()
   419  	defer m.lock.Unlock()
   420  
   421  	content, err := m.readMappingsFromFile(pod.UID)
   422  	if err != nil && err != utilstore.ErrKeyNotFound {
   423  		return nil, err
   424  	}
   425  
   426  	var userNs userNamespace
   427  	if string(content) != "" {
   428  		userNs, err = m.parseUserNsFileAndRecord(pod.UID, content)
   429  		if err != nil {
   430  			return nil, err
   431  		}
   432  	} else {
   433  		userNs, err = m.createUserNs(pod)
   434  		if err != nil {
   435  			return nil, err
   436  		}
   437  	}
   438  
   439  	var uids []*runtimeapi.IDMapping
   440  	var gids []*runtimeapi.IDMapping
   441  
   442  	for _, u := range userNs.UIDMappings {
   443  		uids = append(uids, &runtimeapi.IDMapping{
   444  			HostId:      u.HostId,
   445  			ContainerId: u.ContainerId,
   446  			Length:      u.Length,
   447  		})
   448  	}
   449  	for _, g := range userNs.GIDMappings {
   450  		gids = append(gids, &runtimeapi.IDMapping{
   451  			HostId:      g.HostId,
   452  			ContainerId: g.ContainerId,
   453  			Length:      g.Length,
   454  		})
   455  	}
   456  
   457  	return &runtimeapi.UserNamespace{
   458  		Mode: runtimeapi.NamespaceMode_POD,
   459  		Uids: uids,
   460  		Gids: gids,
   461  	}, nil
   462  }
   463  
   464  // CleanupOrphanedPodUsernsAllocations reconciliates the state of user namespace
   465  // allocations with the pods actually running. It frees any user namespace
   466  // allocation for orphaned pods.
   467  func (m *UsernsManager) CleanupOrphanedPodUsernsAllocations(pods []*v1.Pod, runningPods []*kubecontainer.Pod) error {
   468  	if !utilfeature.DefaultFeatureGate.Enabled(features.UserNamespacesSupport) {
   469  		return nil
   470  	}
   471  
   472  	m.lock.Lock()
   473  	defer m.lock.Unlock()
   474  
   475  	allPods := sets.NewString()
   476  	for _, pod := range pods {
   477  		allPods.Insert(string(pod.UID))
   478  	}
   479  	for _, pod := range runningPods {
   480  		allPods.Insert(string(pod.ID))
   481  	}
   482  
   483  	allFound := sets.NewString()
   484  	found, err := m.kl.ListPodsFromDisk()
   485  	if err != nil {
   486  		return err
   487  	}
   488  
   489  	for _, podUID := range found {
   490  		allFound.Insert(string(podUID))
   491  	}
   492  
   493  	// Lets remove all the pods "found" that are not known.
   494  	for _, podUID := range found {
   495  		if allPods.Has(string(podUID)) {
   496  			continue
   497  		}
   498  
   499  		klog.V(5).InfoS("Clean up orphaned pod user namespace possible allocation", "podUID", podUID)
   500  		m.releaseWithLock(podUID)
   501  	}
   502  
   503  	// Lets remove any existing allocation for a pod that is not "found".
   504  	for podUID := range m.usedBy {
   505  		if allFound.Has(string(podUID)) {
   506  			continue
   507  		}
   508  
   509  		klog.V(5).InfoS("Clean up orphaned pod user namespace possible allocation", "podUID", podUID)
   510  		m.releaseWithLock(podUID)
   511  	}
   512  
   513  	return nil
   514  }