github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/shim/utils/volumes.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     https://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package utils
    16  
    17  import (
    18  	"fmt"
    19  	"path/filepath"
    20  	"strings"
    21  
    22  	specs "github.com/opencontainers/runtime-spec/specs-go"
    23  )
    24  
    25  const (
    26  	volumeKeyPrefix = "dev.gvisor.spec.mount."
    27  
    28  	// devshmName is the volume name used for /dev/shm. Pick a name that is
    29  	// unlikely to be used.
    30  	devshmName = "gvisorinternaldevshm"
    31  
    32  	// emptyDirVolumesDir is the directory inside kubeletPodsDir/{uid}/volumes/
    33  	// that hosts all the EmptyDir volumes used by the pod.
    34  	emptyDirVolumesDir = "kubernetes.io~empty-dir"
    35  )
    36  
    37  // The directory structure for volumes is as follows:
    38  // /var/lib/kubelet/pods/{uid}/volumes/{type} where `uid` is the pod UID and
    39  // `type` is the volume type.
    40  var kubeletPodsDir = "/var/lib/kubelet/pods"
    41  
    42  // volumeName gets volume name from volume annotation key, example:
    43  //
    44  //	dev.gvisor.spec.mount.NAME.share
    45  func volumeName(k string) string {
    46  	return strings.SplitN(strings.TrimPrefix(k, volumeKeyPrefix), ".", 2)[0]
    47  }
    48  
    49  // volumeFieldName gets volume field name from volume annotation key, example:
    50  //
    51  //	`type` is the field of dev.gvisor.spec.mount.NAME.type
    52  func volumeFieldName(k string) string {
    53  	parts := strings.Split(strings.TrimPrefix(k, volumeKeyPrefix), ".")
    54  	return parts[len(parts)-1]
    55  }
    56  
    57  // podUID gets pod UID from the pod log path.
    58  func podUID(s *specs.Spec) (string, error) {
    59  	sandboxLogDir := s.Annotations[sandboxLogDirAnnotation]
    60  	if sandboxLogDir == "" {
    61  		return "", fmt.Errorf("no sandbox log path annotation")
    62  	}
    63  	fields := strings.Split(filepath.Base(sandboxLogDir), "_")
    64  	switch len(fields) {
    65  	case 1: // This is the old CRI logging path.
    66  		return fields[0], nil
    67  	case 3: // This is the new CRI logging path.
    68  		return fields[2], nil
    69  	}
    70  	return "", fmt.Errorf("unexpected sandbox log path %q", sandboxLogDir)
    71  }
    72  
    73  // isVolumeKey checks whether an annotation key is for volume.
    74  func isVolumeKey(k string) bool {
    75  	return strings.HasPrefix(k, volumeKeyPrefix)
    76  }
    77  
    78  // volumeSourceKey constructs the annotation key for volume source.
    79  func volumeSourceKey(volume string) string {
    80  	return volumeKeyPrefix + volume + ".source"
    81  }
    82  
    83  // volumeLifecycleKey constructs the annotation key for volume lifecycle.
    84  func volumeLifecycleKey(volume string) string {
    85  	return volumeKeyPrefix + volume + ".lifecycle"
    86  }
    87  
    88  // volumePath searches the volume path in the kubelet pod directory.
    89  func volumePath(volume, uid string) (string, error) {
    90  	// TODO: Support subpath when gvisor supports pod volume bind mount.
    91  	volumeSearchPath := fmt.Sprintf("%s/%s/volumes/*/%s", kubeletPodsDir, uid, volume)
    92  	dirs, err := filepath.Glob(volumeSearchPath)
    93  	if err != nil {
    94  		return "", err
    95  	}
    96  	if len(dirs) != 1 {
    97  		return "", fmt.Errorf("unexpected matched volume list %v", dirs)
    98  	}
    99  	return dirs[0], nil
   100  }
   101  
   102  // isVolumePath checks whether a string is the volume path.
   103  func isVolumePath(volume, path string) (bool, error) {
   104  	// TODO: Support subpath when gvisor supports pod volume bind mount.
   105  	volumeSearchPath := fmt.Sprintf("%s/*/volumes/*/%s", kubeletPodsDir, volume)
   106  	return filepath.Match(volumeSearchPath, path)
   107  }
   108  
   109  // UpdateVolumeAnnotations add necessary OCI annotations for gvisor
   110  // volume optimization. Returns true if the spec was modified.
   111  func UpdateVolumeAnnotations(s *specs.Spec) (bool, error) {
   112  	var uid string
   113  	if IsSandbox(s) {
   114  		var err error
   115  		uid, err = podUID(s)
   116  		if err != nil {
   117  			// Skip if we can't get pod UID, because this doesn't work
   118  			// for containerd 1.1.
   119  			return false, nil
   120  		}
   121  	}
   122  	updated := false
   123  	for k, v := range s.Annotations {
   124  		if !isVolumeKey(k) {
   125  			continue
   126  		}
   127  		if volumeFieldName(k) != "type" {
   128  			continue
   129  		}
   130  		volume := volumeName(k)
   131  		if uid != "" {
   132  			// This is a sandbox. Add source and lifecycle annotations for volumes.
   133  			path, err := volumePath(volume, uid)
   134  			if err != nil {
   135  				return false, fmt.Errorf("get volume path for %q: %w", volume, err)
   136  			}
   137  			s.Annotations[volumeSourceKey(volume)] = path
   138  			// TODO(b/142076984): Remove the lifecycle setting logic after it has
   139  			// been adopted in GKE admission plugin.
   140  			lifecycleKey := volumeLifecycleKey(volume)
   141  			if _, ok := s.Annotations[lifecycleKey]; !ok {
   142  				// Only set lifecycle annotation if not already set.
   143  				if strings.Contains(path, emptyDirVolumesDir) {
   144  					// Emptydir is created and destroyed with the pod.
   145  					s.Annotations[lifecycleKey] = "pod"
   146  				}
   147  			}
   148  			updated = true
   149  		} else {
   150  			// This is a container.
   151  			for i := range s.Mounts {
   152  				// An error is returned for sandbox if source annotation is not
   153  				// successfully applied, so it is guaranteed that the source annotation
   154  				// for sandbox has already been successfully applied at this point.
   155  				//
   156  				// The volume name is unique inside a pod, so matching without podUID
   157  				// is fine here.
   158  				//
   159  				// TODO: Pass podUID down to shim for containers to do more accurate
   160  				// matching.
   161  				if yes, _ := isVolumePath(volume, s.Mounts[i].Source); yes {
   162  					// Container mount type must match the sandbox's mount type.
   163  					changeMountType(&s.Mounts[i], v)
   164  					updated = true
   165  				}
   166  			}
   167  		}
   168  	}
   169  
   170  	if ok, err := configureShm(s); err != nil {
   171  		return false, err
   172  	} else if ok {
   173  		updated = true
   174  	}
   175  
   176  	return updated, nil
   177  }
   178  
   179  // configureShm sets up annotations to mount /dev/shm as a pod shared tmpfs
   180  // mount inside containers.
   181  //
   182  // Pods are configured to mount /dev/shm to a common path in the host, so it's
   183  // shared among containers in the same pod. In gVisor, /dev/shm must be
   184  // converted to a tmpfs mount inside the sandbox, otherwise shm_open(3) doesn't
   185  // use it (see where_is_shmfs() in glibc). Mount annotation hints are used to
   186  // instruct runsc to mount the same tmpfs volume in all containers inside the
   187  // pod.
   188  func configureShm(s *specs.Spec) (bool, error) {
   189  	const (
   190  		shmPath    = "/dev/shm"
   191  		devshmType = "tmpfs"
   192  	)
   193  
   194  	// Some containers contain a duplicate mount entry for /dev/shm using tmpfs.
   195  	// If this is detected, remove the extraneous entry to ensure the correct one
   196  	// is used.
   197  	duplicate := -1
   198  	for i, m := range s.Mounts {
   199  		if m.Destination == shmPath && m.Type == devshmType {
   200  			duplicate = i
   201  			break
   202  		}
   203  	}
   204  
   205  	updated := false
   206  	for i := range s.Mounts {
   207  		m := &s.Mounts[i]
   208  		if m.Destination == shmPath && m.Type == "bind" {
   209  			if IsSandbox(s) {
   210  				s.Annotations[volumeKeyPrefix+devshmName+".source"] = m.Source
   211  				s.Annotations[volumeKeyPrefix+devshmName+".type"] = devshmType
   212  				s.Annotations[volumeKeyPrefix+devshmName+".share"] = "pod"
   213  				s.Annotations[volumeKeyPrefix+devshmName+".lifecycle"] = "pod"
   214  				// Given that we don't have visibility into mount options for all
   215  				// containers, assume broad access for the master mount (it's tmpfs
   216  				// inside the sandbox anyways) and apply options to subcontainers as
   217  				// they bind mount individually.
   218  				s.Annotations[volumeKeyPrefix+devshmName+".options"] = "rw"
   219  			}
   220  
   221  			changeMountType(m, devshmType)
   222  			updated = true
   223  
   224  			// Remove the duplicate entry now that we found the shared /dev/shm mount.
   225  			if duplicate >= 0 {
   226  				s.Mounts = append(s.Mounts[:duplicate], s.Mounts[duplicate+1:]...)
   227  			}
   228  			break
   229  		}
   230  	}
   231  	return updated, nil
   232  }
   233  
   234  func changeMountType(m *specs.Mount, newType string) {
   235  	m.Type = newType
   236  
   237  	// OCI spec allows bind mounts to be specified in options only. So if new type
   238  	// is not bind, remove bind/rbind from options.
   239  	//
   240  	// "For bind mounts (when options include either bind or rbind), the type is
   241  	// a dummy, often "none" (not listed in /proc/filesystems)."
   242  	if newType != "bind" {
   243  		newOpts := make([]string, 0, len(m.Options))
   244  		for _, opt := range m.Options {
   245  			if opt != "rbind" && opt != "bind" {
   246  				newOpts = append(newOpts, opt)
   247  			}
   248  		}
   249  		m.Options = newOpts
   250  	}
   251  }