// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package utils

import (
	"fmt"
	"path/filepath"
	"strings"

	specs "github.com/opencontainers/runtime-spec/specs-go"
)

const (
	// volumeKeyPrefix is the prefix shared by all volume annotation keys,
	// which have the form dev.gvisor.spec.mount.NAME.FIELD.
	volumeKeyPrefix = "dev.gvisor.spec.mount."

	// devshmName is the volume name used for /dev/shm. Pick a name that is
	// unlikely to be used.
	devshmName = "gvisorinternaldevshm"

	// emptyDirVolumesDir is the directory inside kubeletPodsDir/{uid}/volumes/
	// that hosts all the EmptyDir volumes used by the pod.
	emptyDirVolumesDir = "kubernetes.io~empty-dir"
)

// The directory structure for volumes is as follows:
// /var/lib/kubelet/pods/{uid}/volumes/{type} where `uid` is the pod UID and
// `type` is the volume type.
40 var kubeletPodsDir = "/var/lib/kubelet/pods" 41 42 // volumeName gets volume name from volume annotation key, example: 43 // 44 // dev.gvisor.spec.mount.NAME.share 45 func volumeName(k string) string { 46 return strings.SplitN(strings.TrimPrefix(k, volumeKeyPrefix), ".", 2)[0] 47 } 48 49 // volumeFieldName gets volume field name from volume annotation key, example: 50 // 51 // `type` is the field of dev.gvisor.spec.mount.NAME.type 52 func volumeFieldName(k string) string { 53 parts := strings.Split(strings.TrimPrefix(k, volumeKeyPrefix), ".") 54 return parts[len(parts)-1] 55 } 56 57 // podUID gets pod UID from the pod log path. 58 func podUID(s *specs.Spec) (string, error) { 59 sandboxLogDir := s.Annotations[sandboxLogDirAnnotation] 60 if sandboxLogDir == "" { 61 return "", fmt.Errorf("no sandbox log path annotation") 62 } 63 fields := strings.Split(filepath.Base(sandboxLogDir), "_") 64 switch len(fields) { 65 case 1: // This is the old CRI logging path. 66 return fields[0], nil 67 case 3: // This is the new CRI logging path. 68 return fields[2], nil 69 } 70 return "", fmt.Errorf("unexpected sandbox log path %q", sandboxLogDir) 71 } 72 73 // isVolumeKey checks whether an annotation key is for volume. 74 func isVolumeKey(k string) bool { 75 return strings.HasPrefix(k, volumeKeyPrefix) 76 } 77 78 // volumeSourceKey constructs the annotation key for volume source. 79 func volumeSourceKey(volume string) string { 80 return volumeKeyPrefix + volume + ".source" 81 } 82 83 // volumeLifecycleKey constructs the annotation key for volume lifecycle. 84 func volumeLifecycleKey(volume string) string { 85 return volumeKeyPrefix + volume + ".lifecycle" 86 } 87 88 // volumePath searches the volume path in the kubelet pod directory. 89 func volumePath(volume, uid string) (string, error) { 90 // TODO: Support subpath when gvisor supports pod volume bind mount. 
91 volumeSearchPath := fmt.Sprintf("%s/%s/volumes/*/%s", kubeletPodsDir, uid, volume) 92 dirs, err := filepath.Glob(volumeSearchPath) 93 if err != nil { 94 return "", err 95 } 96 if len(dirs) != 1 { 97 return "", fmt.Errorf("unexpected matched volume list %v", dirs) 98 } 99 return dirs[0], nil 100 } 101 102 // isVolumePath checks whether a string is the volume path. 103 func isVolumePath(volume, path string) (bool, error) { 104 // TODO: Support subpath when gvisor supports pod volume bind mount. 105 volumeSearchPath := fmt.Sprintf("%s/*/volumes/*/%s", kubeletPodsDir, volume) 106 return filepath.Match(volumeSearchPath, path) 107 } 108 109 // UpdateVolumeAnnotations add necessary OCI annotations for gvisor 110 // volume optimization. Returns true if the spec was modified. 111 func UpdateVolumeAnnotations(s *specs.Spec) (bool, error) { 112 var uid string 113 if IsSandbox(s) { 114 var err error 115 uid, err = podUID(s) 116 if err != nil { 117 // Skip if we can't get pod UID, because this doesn't work 118 // for containerd 1.1. 119 return false, nil 120 } 121 } 122 updated := false 123 for k, v := range s.Annotations { 124 if !isVolumeKey(k) { 125 continue 126 } 127 if volumeFieldName(k) != "type" { 128 continue 129 } 130 volume := volumeName(k) 131 if uid != "" { 132 // This is a sandbox. Add source and lifecycle annotations for volumes. 133 path, err := volumePath(volume, uid) 134 if err != nil { 135 return false, fmt.Errorf("get volume path for %q: %w", volume, err) 136 } 137 s.Annotations[volumeSourceKey(volume)] = path 138 // TODO(b/142076984): Remove the lifecycle setting logic after it has 139 // been adopted in GKE admission plugin. 140 lifecycleKey := volumeLifecycleKey(volume) 141 if _, ok := s.Annotations[lifecycleKey]; !ok { 142 // Only set lifecycle annotation if not already set. 143 if strings.Contains(path, emptyDirVolumesDir) { 144 // Emptydir is created and destroyed with the pod. 
145 s.Annotations[lifecycleKey] = "pod" 146 } 147 } 148 updated = true 149 } else { 150 // This is a container. 151 for i := range s.Mounts { 152 // An error is returned for sandbox if source annotation is not 153 // successfully applied, so it is guaranteed that the source annotation 154 // for sandbox has already been successfully applied at this point. 155 // 156 // The volume name is unique inside a pod, so matching without podUID 157 // is fine here. 158 // 159 // TODO: Pass podUID down to shim for containers to do more accurate 160 // matching. 161 if yes, _ := isVolumePath(volume, s.Mounts[i].Source); yes { 162 // Container mount type must match the sandbox's mount type. 163 changeMountType(&s.Mounts[i], v) 164 updated = true 165 } 166 } 167 } 168 } 169 170 if ok, err := configureShm(s); err != nil { 171 return false, err 172 } else if ok { 173 updated = true 174 } 175 176 return updated, nil 177 } 178 179 // configureShm sets up annotations to mount /dev/shm as a pod shared tmpfs 180 // mount inside containers. 181 // 182 // Pods are configured to mount /dev/shm to a common path in the host, so it's 183 // shared among containers in the same pod. In gVisor, /dev/shm must be 184 // converted to a tmpfs mount inside the sandbox, otherwise shm_open(3) doesn't 185 // use it (see where_is_shmfs() in glibc). Mount annotation hints are used to 186 // instruct runsc to mount the same tmpfs volume in all containers inside the 187 // pod. 188 func configureShm(s *specs.Spec) (bool, error) { 189 const ( 190 shmPath = "/dev/shm" 191 devshmType = "tmpfs" 192 ) 193 194 // Some containers contain a duplicate mount entry for /dev/shm using tmpfs. 195 // If this is detected, remove the extraneous entry to ensure the correct one 196 // is used. 
197 duplicate := -1 198 for i, m := range s.Mounts { 199 if m.Destination == shmPath && m.Type == devshmType { 200 duplicate = i 201 break 202 } 203 } 204 205 updated := false 206 for i := range s.Mounts { 207 m := &s.Mounts[i] 208 if m.Destination == shmPath && m.Type == "bind" { 209 if IsSandbox(s) { 210 s.Annotations[volumeKeyPrefix+devshmName+".source"] = m.Source 211 s.Annotations[volumeKeyPrefix+devshmName+".type"] = devshmType 212 s.Annotations[volumeKeyPrefix+devshmName+".share"] = "pod" 213 s.Annotations[volumeKeyPrefix+devshmName+".lifecycle"] = "pod" 214 // Given that we don't have visibility into mount options for all 215 // containers, assume broad access for the master mount (it's tmpfs 216 // inside the sandbox anyways) and apply options to subcontainers as 217 // they bind mount individually. 218 s.Annotations[volumeKeyPrefix+devshmName+".options"] = "rw" 219 } 220 221 changeMountType(m, devshmType) 222 updated = true 223 224 // Remove the duplicate entry now that we found the shared /dev/shm mount. 225 if duplicate >= 0 { 226 s.Mounts = append(s.Mounts[:duplicate], s.Mounts[duplicate+1:]...) 227 } 228 break 229 } 230 } 231 return updated, nil 232 } 233 234 func changeMountType(m *specs.Mount, newType string) { 235 m.Type = newType 236 237 // OCI spec allows bind mounts to be specified in options only. So if new type 238 // is not bind, remove bind/rbind from options. 239 // 240 // "For bind mounts (when options include either bind or rbind), the type is 241 // a dummy, often "none" (not listed in /proc/filesystems)." 242 if newType != "bind" { 243 newOpts := make([]string, 0, len(m.Options)) 244 for _, opt := range m.Options { 245 if opt != "rbind" && opt != "bind" { 246 newOpts = append(newOpts, opt) 247 } 248 } 249 m.Options = newOpts 250 } 251 }