istio.io/istio@v0.0.0-20240520182934-d79c90f27776/cni/pkg/nodeagent/podcgroupns.go (about)

     1  // Copyright Istio Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package nodeagent
    16  
    17  import (
    18  	"bufio"
    19  	"bytes"
    20  	"fmt"
    21  	"io"
    22  	"io/fs"
    23  	"path"
    24  	"regexp"
    25  	"strings"
    26  	"unicode"
    27  
    28  	corev1 "k8s.io/api/core/v1"
    29  	"k8s.io/apimachinery/pkg/types"
    30  
    31  	"istio.io/istio/pkg/maps"
    32  	"istio.io/istio/pkg/util/sets"
    33  )
    34  
    35  type PodToNetns map[string]WorkloadInfo
    36  
    37  func (p PodToNetns) Close() {
    38  	for _, wl := range p {
    39  		wl.Netns.Close()
    40  	}
    41  }
    42  
        // PodNetnsFinder is implemented by types that can locate the network
        // namespaces of a set of pods.
    43  type PodNetnsFinder interface {
        	// FindNetnsForPods takes a map of pods keyed by pod UID and returns a
        	// PodToNetns for them, or an error. The implementation in this file,
        	// PodNetnsProcFinder, only returns entries whose pod UID is a key of
        	// filter and keys the result by the UID's string form.
    44  	FindNetnsForPods(filter map[types.UID]*corev1.Pod) (PodToNetns, error)
    45  }
    46  
    47  type PodNetnsProcFinder struct {
    48  	proc fs.FS
    49  }
    50  
    51  func NewPodNetnsProcFinder(proc fs.FS) *PodNetnsProcFinder {
    52  	return &PodNetnsProcFinder{proc: proc}
    53  }
    54  
    55  func isNotNumber(r rune) bool {
    56  	return r < '0' || r > '9'
    57  }
    58  
    59  func (p *PodNetnsProcFinder) FindNetnsForPods(pods map[types.UID]*corev1.Pod) (PodToNetns, error) {
    60  	/*
    61  		for each process, find its netns inode,
    62  		if we already seen the inode, skip it
    63  		if we haven't seen the inode, check the process cgroup and see if we
    64  		can extract a pod uid from it.
    65  		if we can, open the netns, and save a map of uid->netns-fd
    66  	*/
    67  
    68  	podUIDNetns := make(PodToNetns)
    69  	netnsObserved := sets.New[uint64]()
    70  
    71  	entries, err := fs.ReadDir(p.proc, ".")
    72  	if err != nil {
    73  		return nil, err
    74  	}
    75  
    76  	desiredUIDs := sets.New(maps.Keys(pods)...)
    77  	for _, entry := range entries {
    78  		// we can't break here because we need to close all the netns we opened
    79  		// plus we want to return whatever we can to the user.
    80  		res, err := p.processEntry(p.proc, netnsObserved, desiredUIDs, entry)
    81  		if err != nil {
    82  			log.Debugf("error processing entry: %s %v", entry.Name(), err)
    83  			continue
    84  		}
    85  		if res == nil {
    86  			continue
    87  		}
    88  		pod := pods[res.uid]
    89  		netns := &NetnsWithFd{
    90  			netns: res.netns,
    91  			fd:    res.netnsfd,
    92  			inode: res.inode,
    93  		}
    94  		workload := WorkloadInfo{
    95  			Workload: podToWorkload(pod),
    96  			Netns:    netns,
    97  		}
    98  		podUIDNetns[string(res.uid)] = workload
    99  
   100  	}
   101  	return podUIDNetns, nil
   102  }
   103  
        // PodNetnsEntry is what processEntry returns for a process whose cgroup
        // yielded a pod UID accepted by the filter.
   104  type PodNetnsEntry struct {
        	// uid is the pod UID parsed from the process's cgroup file.
   105  	uid     types.UID
        	// netns is the open handle to the process's "ns/net" file.
   106  	netns   fs.File
        	// netnsfd is the descriptor obtained from netns through GetFd.
   107  	netnsfd uintptr
        	// inode is the value GetInode returned for the "ns/net" file info.
   108  	inode   uint64
   109  }
   110  
   111  func (p *PodNetnsProcFinder) processEntry(proc fs.FS, netnsObserved sets.Set[uint64], filter sets.Set[types.UID], entry fs.DirEntry) (*PodNetnsEntry, error) {
   112  	if !isProcess(entry) {
   113  		return nil, nil
   114  	}
   115  
   116  	netnsName := path.Join(entry.Name(), "ns", "net")
   117  	fi, err := fs.Stat(proc, netnsName)
   118  	if err != nil {
   119  		return nil, err
   120  	}
   121  
   122  	inode, err := GetInode(fi)
   123  	if err != nil {
   124  		return nil, err
   125  	}
   126  	if _, ok := netnsObserved[inode]; ok {
   127  		log.Debugf("netns: %d already processed. skipping", inode)
   128  		return nil, nil
   129  	}
   130  
   131  	cgroup, err := proc.Open(path.Join(entry.Name(), "cgroup"))
   132  	if err != nil {
   133  		return nil, nil
   134  	}
   135  	defer cgroup.Close()
   136  
   137  	var cgroupData bytes.Buffer
   138  	_, err = io.Copy(&cgroupData, cgroup)
   139  	if err != nil {
   140  		return nil, nil
   141  	}
   142  
   143  	uid, _, err := GetPodUIDAndContainerID(cgroupData)
   144  	if err != nil {
   145  		return nil, err
   146  	}
   147  	if filter != nil && !filter.Contains(uid) {
   148  		return nil, nil
   149  	}
   150  
   151  	netns, err := proc.Open(netnsName)
   152  	if err != nil {
   153  		return nil, err
   154  	}
   155  	fd, err := GetFd(netns)
   156  	if err != nil {
   157  		netns.Close()
   158  		return nil, err
   159  	}
   160  	netnsObserved[inode] = struct{}{}
   161  	log.Debugf("found pod to netns: %s %d", uid, inode)
   162  
   163  	return &PodNetnsEntry{
   164  		uid:     uid,
   165  		netns:   netns,
   166  		netnsfd: fd,
   167  		inode:   inode,
   168  	}, nil
   169  }
   170  
   171  func isProcess(entry fs.DirEntry) bool {
   172  	// check if it is a directory
   173  	if !entry.IsDir() {
   174  		return false
   175  	}
   176  
   177  	// check if it is a number
   178  	if strings.IndexFunc(entry.Name(), isNotNumber) != -1 {
   179  		return false
   180  	}
   181  	return true
   182  }
   183  
   184  func GetFd(f fs.File) (uintptr, error) {
   185  	if fdable, ok := f.(interface{ Fd() uintptr }); ok {
   186  		return fdable.Fd(), nil
   187  	}
   188  
   189  	return 0, fmt.Errorf("unable to get fd")
   190  }
   191  
   192  // The code below is mostly copied from SPIRE.
   193  
   194  // cgroupREs holds the regexes used to parse a cgroup path. The regexes have to
   195  // match exclusively: getPodUIDAndContainerIDFromCGroupPath rejects a path for
   196  // which more than one regex yields a valid match.
        // Each regex must include the two named groups "poduid" and "containerid".
        // If a regex needs to exclude certain substrings, it can use the optional
        // "mustnotmatch" group: a match whose "mustnotmatch" group is non-empty is
        // discarded by isValidCGroupPathMatches.
   197  // nolint: lll
   198  var cgroupREs = []*regexp.Regexp{
   199  	// the regex used to parse out the pod UID and container ID from a
   200  	// cgroup name. It assumes that any ".scope" suffix has been trimmed off
   201  	// beforehand.  CAUTION: we used to verify that the pod and container id were
   202  	// descendants of a kubepods directory, however, as of Kubernetes 1.21, cgroup
   203  	// namespaces are in use and therefore we can no longer discern if that is the
   204  	// case from within SPIRE agent container (since the container itself is
   205  	// namespaced). As such, the regex has been relaxed to simply find the pod UID
   206  	// followed by the container ID with allowances for arbitrary punctuation, and
   207  	// container runtime prefixes, etc.
   208  	regexp.MustCompile(`` +
   209  		// "pod"-prefixed Pod UID (with punctuation separated groups) followed by punctuation
   210  		`[[:punct:]]pod(?P<poduid>[[:xdigit:]]{8}[[:punct:]]?[[:xdigit:]]{4}[[:punct:]]?[[:xdigit:]]{4}[[:punct:]]?[[:xdigit:]]{4}[[:punct:]]?[[:xdigit:]]{12})[[:punct:]]` +
   211  		// zero or more punctuation separated "segments" (e.g. "docker-")
   212  		`(?:[[:^punct:]]+[[:punct:]])*` +
   213  		// non-punctuation end of string, i.e., the container ID
   214  		`(?P<containerid>[[:^punct:]]+)$`),
   215  
   216  	// This regex applies to container runtimes that do not put the pod UID into
   217  	// the cgroup name.
   218  	// Currently only cri-o in combination with kubeedge is known for this abnormality.
   219  	regexp.MustCompile(`` +
   220  		// intentionally empty poduid group
   221  		`(?P<poduid>)` +
   222  		// mustnotmatch group: cgroup path must not include a poduid
   223  		`(?P<mustnotmatch>pod[[:xdigit:]]{8}[[:punct:]]?[[:xdigit:]]{4}[[:punct:]]?[[:xdigit:]]{4}[[:punct:]]?[[:xdigit:]]{4}[[:punct:]]?[[:xdigit:]]{12}[[:punct:]])?` +
   224  		// /crio-
   225  		`(?:[[:^punct:]]*/*)*crio[[:punct:]]` +
   226  		// non-punctuation end of string, i.e., the container ID
   227  		`(?P<containerid>[[:^punct:]]+)$`),
   228  }
   229  
   230  func reSubMatchMap(r *regexp.Regexp, str string) map[string]string {
   231  	match := r.FindStringSubmatch(str)
   232  	if match == nil {
   233  		return nil
   234  	}
   235  	subMatchMap := make(map[string]string)
   236  	for i, name := range r.SubexpNames() {
   237  		if i != 0 {
   238  			subMatchMap[name] = match[i]
   239  		}
   240  	}
   241  	return subMatchMap
   242  }
   243  
   244  func isValidCGroupPathMatches(matches map[string]string) bool {
   245  	if matches == nil {
   246  		return false
   247  	}
   248  	if matches["mustnotmatch"] != "" {
   249  		return false
   250  	}
   251  	return true
   252  }
   253  
   254  // nolint: lll
   255  func getPodUIDAndContainerIDFromCGroupPath(cgroupPath string) (types.UID, string, bool) {
   256  	// We are only interested in kube pods entries, for example:
   257  	// - /kubepods/burstable/pod2c48913c-b29f-11e7-9350-020968147796/9bca8d63d5fa610783847915bcff0ecac1273e5b4bed3f6fa1b07350e0135961
   258  	// - /docker/8d461fa5765781bcf5f7eb192f101bc3103d4b932e26236f43feecfa20664f96/kubepods/besteffort/poddaa5c7ee-3484-4533-af39-3591564fd03e/aff34703e5e1f89443e9a1bffcc80f43f74d4808a2dd22c8f88c08547b323934
   259  	// - /kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod2c48913c-b29f-11e7-9350-020968147796.slice/docker-9bca8d63d5fa610783847915bcff0ecac1273e5b4bed3f6fa1b07350e0135961.scope
   260  	// - /kubepods-besteffort-pod72f7f152_440c_66ac_9084_e0fc1d8a910c.slice:cri-containerd:b2a102854b4969b2ce98dc329c86b4fb2b06e4ad2cc8da9d8a7578c9cd2004a2"
   261  	// - /../../pod2c48913c-b29f-11e7-9350-020968147796/9bca8d63d5fa610783847915bcff0ecac1273e5b4bed3f6fa1b07350e0135961
   262  	// - 0::/../crio-45490e76e0878aaa4d9808f7d2eefba37f093c3efbba9838b6d8ab804d9bd814.scope
   263  	// First trim off any .scope suffix. This allows for a cleaner regex since
   264  	// we don't have to muck with greediness. TrimSuffix is no-copy so this
   265  	// is cheap.
   266  	cgroupPath = strings.TrimSuffix(cgroupPath, ".scope")
   267  
   268  	var matchResults map[string]string
   269  	for _, regex := range cgroupREs {
   270  		matches := reSubMatchMap(regex, cgroupPath)
   271  		if isValidCGroupPathMatches(matches) {
   272  			if matchResults != nil {
   273  				return "", "", false
   274  			}
   275  			matchResults = matches
   276  		}
   277  	}
   278  
   279  	if matchResults != nil {
   280  		var podUID types.UID
   281  		if matchResults["poduid"] != "" {
   282  			podUID = canonicalizePodUID(matchResults["poduid"])
   283  		}
   284  		return podUID, matchResults["containerid"], true
   285  	}
   286  	return "", "", false
   287  }
   288  
   289  // canonicalizePodUID converts a Pod UID, as represented in a cgroup path, into
   290  // a canonical form. Practically this means that we convert any punctuation to
   291  // dashes, which is how the UID is represented within Kubernetes.
   292  func canonicalizePodUID(uid string) types.UID {
   293  	return types.UID(strings.Map(func(r rune) rune {
   294  		if unicode.IsPunct(r) {
   295  			r = '-'
   296  		}
   297  		return r
   298  	}, uid))
   299  }
   300  
   301  // Cgroup represents a linux cgroup.
   302  type Cgroup struct {
   303  	HierarchyID    string
   304  	ControllerList string
   305  	GroupPath      string
   306  }
   307  
   308  // GetCGroups returns a slice of cgroups for pid using fs for filesystem calls.
   309  //
   310  // The expected cgroup format is "hierarchy-ID:controller-list:cgroup-path", and
   311  // this function will return an error if every cgroup does not meet that format.
   312  //
   313  // For more information, see:
   314  //   - http://man7.org/linux/man-pages/man7/cgroups.7.html
   315  //   - https://www.kernel.org/doc/Documentation/cgroup-v2.txt
   316  func GetCgroups(procCgroupData bytes.Buffer) ([]Cgroup, error) {
   317  	reader := bytes.NewReader(procCgroupData.Bytes())
   318  	var cgroups []Cgroup
   319  	scanner := bufio.NewScanner(reader)
   320  
   321  	for scanner.Scan() {
   322  		token := scanner.Text()
   323  		substrings := strings.SplitN(token, ":", 3)
   324  		if len(substrings) < 3 {
   325  			return nil, fmt.Errorf("cgroup entry contains %v colons, but expected at least 2 colons: %q", len(substrings), token)
   326  		}
   327  		cgroups = append(cgroups, Cgroup{
   328  			HierarchyID:    substrings[0],
   329  			ControllerList: substrings[1],
   330  			GroupPath:      substrings[2],
   331  		})
   332  	}
   333  
   334  	if err := scanner.Err(); err != nil {
   335  		return nil, err
   336  	}
   337  
   338  	return cgroups, nil
   339  }
   340  
   341  func GetPodUIDAndContainerID(procCgroupData bytes.Buffer) (types.UID, string, error) {
   342  	cgroups, err := GetCgroups(procCgroupData)
   343  	if err != nil {
   344  		return "", "", fmt.Errorf("unable to obtain cgroups: %v", err)
   345  	}
   346  
   347  	return getPodUIDAndContainerIDFromCGroups(cgroups)
   348  }
   349  
   350  func getPodUIDAndContainerIDFromCGroups(cgroups []Cgroup) (types.UID, string, error) {
   351  	var podUID types.UID
   352  	var containerID string
   353  	for _, cgroup := range cgroups {
   354  		candidatePodUID, candidateContainerID, ok := getPodUIDAndContainerIDFromCGroupPath(cgroup.GroupPath)
   355  		switch {
   356  		case !ok:
   357  			// Cgroup did not contain a container ID.
   358  			continue
   359  		case containerID == "":
   360  			// This is the first container ID found so far.
   361  			podUID = candidatePodUID
   362  			containerID = candidateContainerID
   363  		case containerID != candidateContainerID:
   364  			// More than one container ID found in the cgroups.
   365  			return "", "", fmt.Errorf("multiple container IDs found in cgroups (%s, %s)",
   366  				containerID, candidateContainerID)
   367  		case podUID != candidatePodUID:
   368  			// More than one pod UID found in the cgroups.
   369  			return "", "", fmt.Errorf("multiple pod UIDs found in cgroups (%s, %s)",
   370  				podUID, candidatePodUID)
   371  		}
   372  	}
   373  
   374  	return podUID, containerID, nil
   375  }