github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/shim/utils/volumes.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     https://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package utils
    16  
    17  import (
    18  	"fmt"
    19  	"path/filepath"
    20  	"strings"
    21  
    22  	specs "github.com/opencontainers/runtime-spec/specs-go"
    23  	"github.com/metacubex/gvisor/runsc/specutils"
    24  )
    25  
const (
	// volumeKeyPrefix is the prefix shared by every volume mount annotation
	// key, for example dev.gvisor.spec.mount.NAME.type. isVolumeKey uses it to
	// recognize volume annotations and volumeSourceKey uses it to build them.
	volumeKeyPrefix = "dev.gvisor.spec.mount."

	// devshmName is the volume name used for /dev/shm. Pick a name that is
	// unlikely to be used.
	devshmName = "gvisorinternaldevshm"

	// emptyDirVolumesDir is the directory inside kubeletPodsDir/{uid}/volumes/
	// that hosts all the EmptyDir volumes used by the pod.
	emptyDirVolumesDir = "kubernetes.io~empty-dir"
)
    37  
// kubeletPodsDir is the root directory under which volumePath and
// isVolumePath look for pod volumes.
//
// The directory structure for volumes is as follows:
// /var/lib/kubelet/pods/{uid}/volumes/{type}/{name} where `uid` is the pod
// UID, `type` is the volume type and `name` is the volume name.
//
// NOTE(review): declared as a variable rather than a constant, presumably so
// that tests can redirect it to a scratch directory — confirm.
var kubeletPodsDir = "/var/lib/kubelet/pods"
    42  
    43  // volumeName gets volume name from volume annotation key, example:
    44  //
    45  //	dev.gvisor.spec.mount.NAME.share
    46  func volumeName(k string) string {
    47  	return strings.SplitN(strings.TrimPrefix(k, volumeKeyPrefix), ".", 2)[0]
    48  }
    49  
    50  // volumeFieldName gets volume field name from volume annotation key, example:
    51  //
    52  //	`type` is the field of dev.gvisor.spec.mount.NAME.type
    53  func volumeFieldName(k string) string {
    54  	parts := strings.Split(strings.TrimPrefix(k, volumeKeyPrefix), ".")
    55  	return parts[len(parts)-1]
    56  }
    57  
    58  // podUID gets pod UID from the pod log path.
    59  func podUID(s *specs.Spec) (string, error) {
    60  	sandboxLogDir := s.Annotations[sandboxLogDirAnnotation]
    61  	if sandboxLogDir == "" {
    62  		return "", fmt.Errorf("no sandbox log path annotation")
    63  	}
    64  	fields := strings.Split(filepath.Base(sandboxLogDir), "_")
    65  	switch len(fields) {
    66  	case 1: // This is the old CRI logging path.
    67  		return fields[0], nil
    68  	case 3: // This is the new CRI logging path.
    69  		return fields[2], nil
    70  	}
    71  	return "", fmt.Errorf("unexpected sandbox log path %q", sandboxLogDir)
    72  }
    73  
    74  // isVolumeKey checks whether an annotation key is for volume.
    75  func isVolumeKey(k string) bool {
    76  	return strings.HasPrefix(k, volumeKeyPrefix)
    77  }
    78  
    79  // volumeSourceKey constructs the annotation key for volume source.
    80  func volumeSourceKey(volume string) string {
    81  	return volumeKeyPrefix + volume + ".source"
    82  }
    83  
    84  // volumePath searches the volume path in the kubelet pod directory.
    85  func volumePath(volume, uid string) (string, error) {
    86  	// TODO: Support subpath when gvisor supports pod volume bind mount.
    87  	volumeSearchPath := fmt.Sprintf("%s/%s/volumes/*/%s", kubeletPodsDir, uid, volume)
    88  	dirs, err := filepath.Glob(volumeSearchPath)
    89  	if err != nil {
    90  		return "", err
    91  	}
    92  	if len(dirs) != 1 {
    93  		return "", fmt.Errorf("unexpected matched volume list %v", dirs)
    94  	}
    95  	return dirs[0], nil
    96  }
    97  
    98  // isVolumePath checks whether a string is the volume path.
    99  func isVolumePath(volume, path string) (bool, error) {
   100  	// TODO: Support subpath when gvisor supports pod volume bind mount.
   101  	volumeSearchPath := fmt.Sprintf("%s/*/volumes/*/%s", kubeletPodsDir, volume)
   102  	return filepath.Match(volumeSearchPath, path)
   103  }
   104  
   105  // UpdateVolumeAnnotations add necessary OCI annotations for gvisor
   106  // volume optimization. Returns true if the spec was modified.
   107  //
   108  // Note about EmptyDir handling:
   109  // The admission controller sets mount annotations for EmptyDir as follows:
   110  // - For EmptyDir volumes with medium=Memory, the "type" field is set to tmpfs.
   111  // - For EmptyDir volumes with medium="", the "type" field is set to bind.
   112  //
   113  // The container spec has EmptyDir mount points as bind mounts. This method
   114  // modifies the spec as follows:
   115  // - The "type" mount annotation for all EmptyDirs is changed to tmpfs.
   116  // - The mount type in spec.Mounts[i].Type is changed as follows:
   117  //   - For EmptyDir volumes with medium=Memory, we change it to tmpfs.
   118  //   - For EmptyDir volumes with medium="", we leave it as a bind mount.
   119  //   - (Essentially we set it to what the admission controller said.)
   120  //
   121  // runsc should use these two setting to infer EmptyDir medium:
   122  //   - tmpfs annotation type + tmpfs mount type = memory-backed EmptyDir
   123  //   - tmpfs annotation type + bind mount type = disk-backed EmptyDir
   124  func UpdateVolumeAnnotations(s *specs.Spec) (bool, error) {
   125  	var uid string
   126  	if IsSandbox(s) {
   127  		var err error
   128  		uid, err = podUID(s)
   129  		if err != nil {
   130  			// Skip if we can't get pod UID, because this doesn't work
   131  			// for containerd 1.1.
   132  			return false, nil
   133  		}
   134  	}
   135  	updated := false
   136  	for k, v := range s.Annotations {
   137  		if !isVolumeKey(k) {
   138  			continue
   139  		}
   140  		if volumeFieldName(k) != "type" {
   141  			continue
   142  		}
   143  		volume := volumeName(k)
   144  		if uid != "" {
   145  			// This is the root (first) container. Mount annotations are only
   146  			// consumed from this container's spec. So fix mount annotations by:
   147  			// 1. Adding source annotation.
   148  			// 2. Fixing type annotation.
   149  			path, err := volumePath(volume, uid)
   150  			if err != nil {
   151  				return false, fmt.Errorf("get volume path for %q: %w", volume, err)
   152  			}
   153  			s.Annotations[volumeSourceKey(volume)] = path
   154  			if strings.Contains(path, emptyDirVolumesDir) {
   155  				s.Annotations[k] = "tmpfs" // See note about EmptyDir.
   156  			}
   157  			updated = true
   158  		} else {
   159  			// This is a sub-container. Mount annotations are ignored. So no need to
   160  			// bother fixing those.
   161  			for i := range s.Mounts {
   162  				// An error is returned for sandbox if source annotation is not
   163  				// successfully applied, so it is guaranteed that the source annotation
   164  				// for sandbox has already been successfully applied at this point.
   165  				//
   166  				// The volume name is unique inside a pod, so matching without podUID
   167  				// is fine here.
   168  				//
   169  				// TODO: Pass podUID down to shim for containers to do more accurate
   170  				// matching.
   171  				if yes, _ := isVolumePath(volume, s.Mounts[i].Source); yes {
   172  					// Container mount type must match the mount type specified by
   173  					// admission controller. See note about EmptyDir.
   174  					specutils.ChangeMountType(&s.Mounts[i], v)
   175  					updated = true
   176  				}
   177  			}
   178  		}
   179  	}
   180  
   181  	if ok, err := configureShm(s); err != nil {
   182  		return false, err
   183  	} else if ok {
   184  		updated = true
   185  	}
   186  
   187  	return updated, nil
   188  }
   189  
   190  // configureShm sets up annotations to mount /dev/shm as a pod shared tmpfs
   191  // mount inside containers.
   192  //
   193  // Pods are configured to mount /dev/shm to a common path in the host, so it's
   194  // shared among containers in the same pod. In gVisor, /dev/shm must be
   195  // converted to a tmpfs mount inside the sandbox, otherwise shm_open(3) doesn't
   196  // use it (see where_is_shmfs() in glibc). Mount annotation hints are used to
   197  // instruct runsc to mount the same tmpfs volume in all containers inside the
   198  // pod.
   199  func configureShm(s *specs.Spec) (bool, error) {
   200  	const (
   201  		shmPath    = "/dev/shm"
   202  		devshmType = "tmpfs"
   203  	)
   204  
   205  	// Some containers contain a duplicate mount entry for /dev/shm using tmpfs.
   206  	// If this is detected, remove the extraneous entry to ensure the correct one
   207  	// is used.
   208  	duplicate := -1
   209  	for i, m := range s.Mounts {
   210  		if m.Destination == shmPath && m.Type == devshmType {
   211  			duplicate = i
   212  			break
   213  		}
   214  	}
   215  
   216  	updated := false
   217  	for i := range s.Mounts {
   218  		m := &s.Mounts[i]
   219  		if m.Destination == shmPath && m.Type == "bind" {
   220  			if IsSandbox(s) {
   221  				s.Annotations[volumeKeyPrefix+devshmName+".source"] = m.Source
   222  				s.Annotations[volumeKeyPrefix+devshmName+".type"] = devshmType
   223  				s.Annotations[volumeKeyPrefix+devshmName+".share"] = "pod"
   224  				// Given that we don't have visibility into mount options for all
   225  				// containers, assume broad access for the master mount (it's tmpfs
   226  				// inside the sandbox anyways) and apply options to subcontainers as
   227  				// they bind mount individually.
   228  				s.Annotations[volumeKeyPrefix+devshmName+".options"] = "rw"
   229  			}
   230  
   231  			specutils.ChangeMountType(m, devshmType)
   232  			updated = true
   233  
   234  			// Remove the duplicate entry now that we found the shared /dev/shm mount.
   235  			if duplicate >= 0 {
   236  				s.Mounts = append(s.Mounts[:duplicate], s.Mounts[duplicate+1:]...)
   237  			}
   238  			break
   239  		}
   240  	}
   241  	return updated, nil
   242  }