istio.io/istio@v0.0.0-20240520182934-d79c90f27776/cni/pkg/nodeagent/podcgroupns.go (about) 1 // Copyright Istio Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package nodeagent 16 17 import ( 18 "bufio" 19 "bytes" 20 "fmt" 21 "io" 22 "io/fs" 23 "path" 24 "regexp" 25 "strings" 26 "unicode" 27 28 corev1 "k8s.io/api/core/v1" 29 "k8s.io/apimachinery/pkg/types" 30 31 "istio.io/istio/pkg/maps" 32 "istio.io/istio/pkg/util/sets" 33 ) 34 35 type PodToNetns map[string]WorkloadInfo 36 37 func (p PodToNetns) Close() { 38 for _, wl := range p { 39 wl.Netns.Close() 40 } 41 } 42 43 type PodNetnsFinder interface { 44 FindNetnsForPods(filter map[types.UID]*corev1.Pod) (PodToNetns, error) 45 } 46 47 type PodNetnsProcFinder struct { 48 proc fs.FS 49 } 50 51 func NewPodNetnsProcFinder(proc fs.FS) *PodNetnsProcFinder { 52 return &PodNetnsProcFinder{proc: proc} 53 } 54 55 func isNotNumber(r rune) bool { 56 return r < '0' || r > '9' 57 } 58 59 func (p *PodNetnsProcFinder) FindNetnsForPods(pods map[types.UID]*corev1.Pod) (PodToNetns, error) { 60 /* 61 for each process, find its netns inode, 62 if we already seen the inode, skip it 63 if we haven't seen the inode, check the process cgroup and see if we 64 can extract a pod uid from it. 65 if we can, open the netns, and save a map of uid->netns-fd 66 */ 67 68 podUIDNetns := make(PodToNetns) 69 netnsObserved := sets.New[uint64]() 70 71 entries, err := fs.ReadDir(p.proc, ".") 72 if err != nil { 73 return nil, err 74 } 75 76 desiredUIDs := sets.New(maps.Keys(pods)...) 77 for _, entry := range entries { 78 // we can't break here because we need to close all the netns we opened 79 // plus we want to return whatever we can to the user. 80 res, err := p.processEntry(p.proc, netnsObserved, desiredUIDs, entry) 81 if err != nil { 82 log.Debugf("error processing entry: %s %v", entry.Name(), err) 83 continue 84 } 85 if res == nil { 86 continue 87 } 88 pod := pods[res.uid] 89 netns := &NetnsWithFd{ 90 netns: res.netns, 91 fd: res.netnsfd, 92 inode: res.inode, 93 } 94 workload := WorkloadInfo{ 95 Workload: podToWorkload(pod), 96 Netns: netns, 97 } 98 podUIDNetns[string(res.uid)] = workload 99 100 } 101 return podUIDNetns, nil 102 } 103 104 type PodNetnsEntry struct { 105 uid types.UID 106 netns fs.File 107 netnsfd uintptr 108 inode uint64 109 } 110 111 func (p *PodNetnsProcFinder) processEntry(proc fs.FS, netnsObserved sets.Set[uint64], filter sets.Set[types.UID], entry fs.DirEntry) (*PodNetnsEntry, error) { 112 if !isProcess(entry) { 113 return nil, nil 114 } 115 116 netnsName := path.Join(entry.Name(), "ns", "net") 117 fi, err := fs.Stat(proc, netnsName) 118 if err != nil { 119 return nil, err 120 } 121 122 inode, err := GetInode(fi) 123 if err != nil { 124 return nil, err 125 } 126 if _, ok := netnsObserved[inode]; ok { 127 log.Debugf("netns: %d already processed. skipping", inode) 128 return nil, nil 129 } 130 131 cgroup, err := proc.Open(path.Join(entry.Name(), "cgroup")) 132 if err != nil { 133 return nil, nil 134 } 135 defer cgroup.Close() 136 137 var cgroupData bytes.Buffer 138 _, err = io.Copy(&cgroupData, cgroup) 139 if err != nil { 140 return nil, nil 141 } 142 143 uid, _, err := GetPodUIDAndContainerID(cgroupData) 144 if err != nil { 145 return nil, err 146 } 147 if filter != nil && !filter.Contains(uid) { 148 return nil, nil 149 } 150 151 netns, err := proc.Open(netnsName) 152 if err != nil { 153 return nil, err 154 } 155 fd, err := GetFd(netns) 156 if err != nil { 157 netns.Close() 158 return nil, err 159 } 160 netnsObserved[inode] = struct{}{} 161 log.Debugf("found pod to netns: %s %d", uid, inode) 162 163 return &PodNetnsEntry{ 164 uid: uid, 165 netns: netns, 166 netnsfd: fd, 167 inode: inode, 168 }, nil 169 } 170 171 func isProcess(entry fs.DirEntry) bool { 172 // check if it is a directory 173 if !entry.IsDir() { 174 return false 175 } 176 177 // check if it is a number 178 if strings.IndexFunc(entry.Name(), isNotNumber) != -1 { 179 return false 180 } 181 return true 182 } 183 184 func GetFd(f fs.File) (uintptr, error) { 185 if fdable, ok := f.(interface{ Fd() uintptr }); ok { 186 return fdable.Fd(), nil 187 } 188 189 return 0, fmt.Errorf("unable to get fd") 190 } 191 192 /// mostly copy pasted from spire below: 193 194 // regexes listed here have to exclusively match a cgroup path 195 // the regexes must include two named groups "poduid" and "containerid" 196 // if the regex needs to exclude certain substrings, the "mustnotmatch" group can be used 197 // nolint: lll 198 var cgroupREs = []*regexp.Regexp{ 199 // the regex used to parse out the pod UID and container ID from a 200 // cgroup name. It assumes that any ".scope" suffix has been trimmed off 201 // beforehand. CAUTION: we used to verify that the pod and container id were 202 // descendants of a kubepods directory, however, as of Kubernetes 1.21, cgroups 203 // namespaces are in use and therefore we can no longer discern if that is the 204 // case from within SPIRE agent container (since the container itself is 205 // namespaced). As such, the regex has been relaxed to simply find the pod UID 206 // followed by the container ID with allowances for arbitrary punctuation, and 207 // container runtime prefixes, etc. 208 regexp.MustCompile(`` + 209 // "pod"-prefixed Pod UID (with punctuation separated groups) followed by punctuation 210 `[[:punct:]]pod(?P<poduid>[[:xdigit:]]{8}[[:punct:]]?[[:xdigit:]]{4}[[:punct:]]?[[:xdigit:]]{4}[[:punct:]]?[[:xdigit:]]{4}[[:punct:]]?[[:xdigit:]]{12})[[:punct:]]` + 211 // zero or more punctuation separated "segments" (e.g. "docker-") 212 `(?:[[:^punct:]]+[[:punct:]])*` + 213 // non-punctuation end of string, i.e., the container ID 214 `(?P<containerid>[[:^punct:]]+)$`), 215 216 // This regex applies for container runtimes, that won't put the PodUID into 217 // the cgroup name. 218 // Currently only cri-o in combination with kubeedge is known for this abnormally. 219 regexp.MustCompile(`` + 220 // intentionally empty poduid group 221 `(?P<poduid>)` + 222 // mustnotmatch group: cgroup path must not include a poduid 223 `(?P<mustnotmatch>pod[[:xdigit:]]{8}[[:punct:]]?[[:xdigit:]]{4}[[:punct:]]?[[:xdigit:]]{4}[[:punct:]]?[[:xdigit:]]{4}[[:punct:]]?[[:xdigit:]]{12}[[:punct:]])?` + 224 // /crio- 225 `(?:[[:^punct:]]*/*)*crio[[:punct:]]` + 226 // non-punctuation end of string, i.e., the container ID 227 `(?P<containerid>[[:^punct:]]+)$`), 228 } 229 230 func reSubMatchMap(r *regexp.Regexp, str string) map[string]string { 231 match := r.FindStringSubmatch(str) 232 if match == nil { 233 return nil 234 } 235 subMatchMap := make(map[string]string) 236 for i, name := range r.SubexpNames() { 237 if i != 0 { 238 subMatchMap[name] = match[i] 239 } 240 } 241 return subMatchMap 242 } 243 244 func isValidCGroupPathMatches(matches map[string]string) bool { 245 if matches == nil { 246 return false 247 } 248 if matches["mustnotmatch"] != "" { 249 return false 250 } 251 return true 252 } 253 254 // nolint: lll 255 func getPodUIDAndContainerIDFromCGroupPath(cgroupPath string) (types.UID, string, bool) { 256 // We are only interested in kube pods entries, for example: 257 // - /kubepods/burstable/pod2c48913c-b29f-11e7-9350-020968147796/9bca8d63d5fa610783847915bcff0ecac1273e5b4bed3f6fa1b07350e0135961 258 // - /docker/8d461fa5765781bcf5f7eb192f101bc3103d4b932e26236f43feecfa20664f96/kubepods/besteffort/poddaa5c7ee-3484-4533-af39-3591564fd03e/aff34703e5e1f89443e9a1bffcc80f43f74d4808a2dd22c8f88c08547b323934 259 // - /kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod2c48913c-b29f-11e7-9350-020968147796.slice/docker-9bca8d63d5fa610783847915bcff0ecac1273e5b4bed3f6fa1b07350e0135961.scope 260 // - /kubepods-besteffort-pod72f7f152_440c_66ac_9084_e0fc1d8a910c.slice:cri-containerd:b2a102854b4969b2ce98dc329c86b4fb2b06e4ad2cc8da9d8a7578c9cd2004a2" 261 // - /../../pod2c48913c-b29f-11e7-9350-020968147796/9bca8d63d5fa610783847915bcff0ecac1273e5b4bed3f6fa1b07350e0135961 262 // - 0::/../crio-45490e76e0878aaa4d9808f7d2eefba37f093c3efbba9838b6d8ab804d9bd814.scope 263 // First trim off any .scope suffix. This allows for a cleaner regex since 264 // we don't have to muck with greediness. TrimSuffix is no-copy so this 265 // is cheap. 266 cgroupPath = strings.TrimSuffix(cgroupPath, ".scope") 267 268 var matchResults map[string]string 269 for _, regex := range cgroupREs { 270 matches := reSubMatchMap(regex, cgroupPath) 271 if isValidCGroupPathMatches(matches) { 272 if matchResults != nil { 273 return "", "", false 274 } 275 matchResults = matches 276 } 277 } 278 279 if matchResults != nil { 280 var podUID types.UID 281 if matchResults["poduid"] != "" { 282 podUID = canonicalizePodUID(matchResults["poduid"]) 283 } 284 return podUID, matchResults["containerid"], true 285 } 286 return "", "", false 287 } 288 289 // canonicalizePodUID converts a Pod UID, as represented in a cgroup path, into 290 // a canonical form. Practically this means that we convert any punctuation to 291 // dashes, which is how the UID is represented within Kubernetes. 292 func canonicalizePodUID(uid string) types.UID { 293 return types.UID(strings.Map(func(r rune) rune { 294 if unicode.IsPunct(r) { 295 r = '-' 296 } 297 return r 298 }, uid)) 299 } 300 301 // Cgroup represents a linux cgroup. 302 type Cgroup struct { 303 HierarchyID string 304 ControllerList string 305 GroupPath string 306 } 307 308 // GetCGroups returns a slice of cgroups for pid using fs for filesystem calls. 309 // 310 // The expected cgroup format is "hierarchy-ID:controller-list:cgroup-path", and 311 // this function will return an error if every cgroup does not meet that format. 312 // 313 // For more information, see: 314 // - http://man7.org/linux/man-pages/man7/cgroups.7.html 315 // - https://www.kernel.org/doc/Documentation/cgroup-v2.txt 316 func GetCgroups(procCgroupData bytes.Buffer) ([]Cgroup, error) { 317 reader := bytes.NewReader(procCgroupData.Bytes()) 318 var cgroups []Cgroup 319 scanner := bufio.NewScanner(reader) 320 321 for scanner.Scan() { 322 token := scanner.Text() 323 substrings := strings.SplitN(token, ":", 3) 324 if len(substrings) < 3 { 325 return nil, fmt.Errorf("cgroup entry contains %v colons, but expected at least 2 colons: %q", len(substrings), token) 326 } 327 cgroups = append(cgroups, Cgroup{ 328 HierarchyID: substrings[0], 329 ControllerList: substrings[1], 330 GroupPath: substrings[2], 331 }) 332 } 333 334 if err := scanner.Err(); err != nil { 335 return nil, err 336 } 337 338 return cgroups, nil 339 } 340 341 func GetPodUIDAndContainerID(procCgroupData bytes.Buffer) (types.UID, string, error) { 342 cgroups, err := GetCgroups(procCgroupData) 343 if err != nil { 344 return "", "", fmt.Errorf("unable to obtain cgroups: %v", err) 345 } 346 347 return getPodUIDAndContainerIDFromCGroups(cgroups) 348 } 349 350 func getPodUIDAndContainerIDFromCGroups(cgroups []Cgroup) (types.UID, string, error) { 351 var podUID types.UID 352 var containerID string 353 for _, cgroup := range cgroups { 354 candidatePodUID, candidateContainerID, ok := getPodUIDAndContainerIDFromCGroupPath(cgroup.GroupPath) 355 switch { 356 case !ok: 357 // Cgroup did not contain a container ID. 358 continue 359 case containerID == "": 360 // This is the first container ID found so far. 361 podUID = candidatePodUID 362 containerID = candidateContainerID 363 case containerID != candidateContainerID: 364 // More than one container ID found in the cgroups. 365 return "", "", fmt.Errorf("multiple container IDs found in cgroups (%s, %s)", 366 containerID, candidateContainerID) 367 case podUID != candidatePodUID: 368 // More than one pod UID found in the cgroups. 369 return "", "", fmt.Errorf("multiple pod UIDs found in cgroups (%s, %s)", 370 podUID, candidatePodUID) 371 } 372 } 373 374 return podUID, containerID, nil 375 }