github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/shim/utils/volumes.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // https://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package utils 16 17 import ( 18 "fmt" 19 "path/filepath" 20 "strings" 21 22 specs "github.com/opencontainers/runtime-spec/specs-go" 23 "github.com/metacubex/gvisor/runsc/specutils" 24 ) 25 26 const ( 27 volumeKeyPrefix = "dev.gvisor.spec.mount." 28 29 // devshmName is the volume name used for /dev/shm. Pick a name that is 30 // unlikely to be used. 31 devshmName = "gvisorinternaldevshm" 32 33 // emptyDirVolumesDir is the directory inside kubeletPodsDir/{uid}/volumes/ 34 // that hosts all the EmptyDir volumes used by the pod. 35 emptyDirVolumesDir = "kubernetes.io~empty-dir" 36 ) 37 38 // The directory structure for volumes is as follows: 39 // /var/lib/kubelet/pods/{uid}/volumes/{type} where `uid` is the pod UID and 40 // `type` is the volume type. 41 var kubeletPodsDir = "/var/lib/kubelet/pods" 42 43 // volumeName gets volume name from volume annotation key, example: 44 // 45 // dev.gvisor.spec.mount.NAME.share 46 func volumeName(k string) string { 47 return strings.SplitN(strings.TrimPrefix(k, volumeKeyPrefix), ".", 2)[0] 48 } 49 50 // volumeFieldName gets volume field name from volume annotation key, example: 51 // 52 // `type` is the field of dev.gvisor.spec.mount.NAME.type 53 func volumeFieldName(k string) string { 54 parts := strings.Split(strings.TrimPrefix(k, volumeKeyPrefix), ".") 55 return parts[len(parts)-1] 56 } 57 58 // podUID gets pod UID from the pod log path. 59 func podUID(s *specs.Spec) (string, error) { 60 sandboxLogDir := s.Annotations[sandboxLogDirAnnotation] 61 if sandboxLogDir == "" { 62 return "", fmt.Errorf("no sandbox log path annotation") 63 } 64 fields := strings.Split(filepath.Base(sandboxLogDir), "_") 65 switch len(fields) { 66 case 1: // This is the old CRI logging path. 67 return fields[0], nil 68 case 3: // This is the new CRI logging path. 69 return fields[2], nil 70 } 71 return "", fmt.Errorf("unexpected sandbox log path %q", sandboxLogDir) 72 } 73 74 // isVolumeKey checks whether an annotation key is for volume. 75 func isVolumeKey(k string) bool { 76 return strings.HasPrefix(k, volumeKeyPrefix) 77 } 78 79 // volumeSourceKey constructs the annotation key for volume source. 80 func volumeSourceKey(volume string) string { 81 return volumeKeyPrefix + volume + ".source" 82 } 83 84 // volumePath searches the volume path in the kubelet pod directory. 85 func volumePath(volume, uid string) (string, error) { 86 // TODO: Support subpath when gvisor supports pod volume bind mount. 87 volumeSearchPath := fmt.Sprintf("%s/%s/volumes/*/%s", kubeletPodsDir, uid, volume) 88 dirs, err := filepath.Glob(volumeSearchPath) 89 if err != nil { 90 return "", err 91 } 92 if len(dirs) != 1 { 93 return "", fmt.Errorf("unexpected matched volume list %v", dirs) 94 } 95 return dirs[0], nil 96 } 97 98 // isVolumePath checks whether a string is the volume path. 99 func isVolumePath(volume, path string) (bool, error) { 100 // TODO: Support subpath when gvisor supports pod volume bind mount. 101 volumeSearchPath := fmt.Sprintf("%s/*/volumes/*/%s", kubeletPodsDir, volume) 102 return filepath.Match(volumeSearchPath, path) 103 } 104 105 // UpdateVolumeAnnotations add necessary OCI annotations for gvisor 106 // volume optimization. Returns true if the spec was modified. 107 // 108 // Note about EmptyDir handling: 109 // The admission controller sets mount annotations for EmptyDir as follows: 110 // - For EmptyDir volumes with medium=Memory, the "type" field is set to tmpfs. 111 // - For EmptyDir volumes with medium="", the "type" field is set to bind. 112 // 113 // The container spec has EmptyDir mount points as bind mounts. This method 114 // modifies the spec as follows: 115 // - The "type" mount annotation for all EmptyDirs is changed to tmpfs. 116 // - The mount type in spec.Mounts[i].Type is changed as follows: 117 // - For EmptyDir volumes with medium=Memory, we change it to tmpfs. 118 // - For EmptyDir volumes with medium="", we leave it as a bind mount. 119 // - (Essentially we set it to what the admission controller said.) 120 // 121 // runsc should use these two setting to infer EmptyDir medium: 122 // - tmpfs annotation type + tmpfs mount type = memory-backed EmptyDir 123 // - tmpfs annotation type + bind mount type = disk-backed EmptyDir 124 func UpdateVolumeAnnotations(s *specs.Spec) (bool, error) { 125 var uid string 126 if IsSandbox(s) { 127 var err error 128 uid, err = podUID(s) 129 if err != nil { 130 // Skip if we can't get pod UID, because this doesn't work 131 // for containerd 1.1. 132 return false, nil 133 } 134 } 135 updated := false 136 for k, v := range s.Annotations { 137 if !isVolumeKey(k) { 138 continue 139 } 140 if volumeFieldName(k) != "type" { 141 continue 142 } 143 volume := volumeName(k) 144 if uid != "" { 145 // This is the root (first) container. Mount annotations are only 146 // consumed from this container's spec. So fix mount annotations by: 147 // 1. Adding source annotation. 148 // 2. Fixing type annotation. 149 path, err := volumePath(volume, uid) 150 if err != nil { 151 return false, fmt.Errorf("get volume path for %q: %w", volume, err) 152 } 153 s.Annotations[volumeSourceKey(volume)] = path 154 if strings.Contains(path, emptyDirVolumesDir) { 155 s.Annotations[k] = "tmpfs" // See note about EmptyDir. 156 } 157 updated = true 158 } else { 159 // This is a sub-container. Mount annotations are ignored. So no need to 160 // bother fixing those. 161 for i := range s.Mounts { 162 // An error is returned for sandbox if source annotation is not 163 // successfully applied, so it is guaranteed that the source annotation 164 // for sandbox has already been successfully applied at this point. 165 // 166 // The volume name is unique inside a pod, so matching without podUID 167 // is fine here. 168 // 169 // TODO: Pass podUID down to shim for containers to do more accurate 170 // matching. 171 if yes, _ := isVolumePath(volume, s.Mounts[i].Source); yes { 172 // Container mount type must match the mount type specified by 173 // admission controller. See note about EmptyDir. 174 specutils.ChangeMountType(&s.Mounts[i], v) 175 updated = true 176 } 177 } 178 } 179 } 180 181 if ok, err := configureShm(s); err != nil { 182 return false, err 183 } else if ok { 184 updated = true 185 } 186 187 return updated, nil 188 } 189 190 // configureShm sets up annotations to mount /dev/shm as a pod shared tmpfs 191 // mount inside containers. 192 // 193 // Pods are configured to mount /dev/shm to a common path in the host, so it's 194 // shared among containers in the same pod. In gVisor, /dev/shm must be 195 // converted to a tmpfs mount inside the sandbox, otherwise shm_open(3) doesn't 196 // use it (see where_is_shmfs() in glibc). Mount annotation hints are used to 197 // instruct runsc to mount the same tmpfs volume in all containers inside the 198 // pod. 199 func configureShm(s *specs.Spec) (bool, error) { 200 const ( 201 shmPath = "/dev/shm" 202 devshmType = "tmpfs" 203 ) 204 205 // Some containers contain a duplicate mount entry for /dev/shm using tmpfs. 206 // If this is detected, remove the extraneous entry to ensure the correct one 207 // is used. 208 duplicate := -1 209 for i, m := range s.Mounts { 210 if m.Destination == shmPath && m.Type == devshmType { 211 duplicate = i 212 break 213 } 214 } 215 216 updated := false 217 for i := range s.Mounts { 218 m := &s.Mounts[i] 219 if m.Destination == shmPath && m.Type == "bind" { 220 if IsSandbox(s) { 221 s.Annotations[volumeKeyPrefix+devshmName+".source"] = m.Source 222 s.Annotations[volumeKeyPrefix+devshmName+".type"] = devshmType 223 s.Annotations[volumeKeyPrefix+devshmName+".share"] = "pod" 224 // Given that we don't have visibility into mount options for all 225 // containers, assume broad access for the master mount (it's tmpfs 226 // inside the sandbox anyways) and apply options to subcontainers as 227 // they bind mount individually. 228 s.Annotations[volumeKeyPrefix+devshmName+".options"] = "rw" 229 } 230 231 specutils.ChangeMountType(m, devshmType) 232 updated = true 233 234 // Remove the duplicate entry now that we found the shared /dev/shm mount. 235 if duplicate >= 0 { 236 s.Mounts = append(s.Mounts[:duplicate], s.Mounts[duplicate+1:]...) 237 } 238 break 239 } 240 } 241 return updated, nil 242 }