k8s.io/kubernetes@v1.29.3/test/e2e/framework/debug/resource_usage_gatherer.go

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package debug

import (
	"bufio"
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"math"
	"regexp"
	"sort"
	"strconv"
	"strings"
	"sync"
	"text/tabwriter"
	"time"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	clientset "k8s.io/client-go/kubernetes"
	kubeletstatsv1alpha1 "k8s.io/kubelet/pkg/apis/stats/v1alpha1"

	"k8s.io/kubernetes/test/e2e/framework"
	e2essh "k8s.io/kubernetes/test/e2e/framework/ssh"
)

// ResourceConstraint specifies the CPU and memory limits that measured
// container usage is checked against.
type ResourceConstraint struct {
	CPUConstraint    float64
	MemoryConstraint uint64
}

// SingleContainerSummary holds the usage summary of a single container.
type SingleContainerSummary struct {
	Name string
	CPU  float64
	Mem  uint64
}

// ContainerResourceUsage is a structure for gathering container resource usage.
type ContainerResourceUsage struct {
	Name                    string
	Timestamp               time.Time
	CPUUsageInCores         float64
	MemoryUsageInBytes      uint64
	MemoryWorkingSetInBytes uint64
	MemoryRSSInBytes        uint64
	// The interval used to calculate CPUUsageInCores.
	CPUInterval time.Duration
}

// ResourceUsagePerContainer is a map of container name to ContainerResourceUsage.
type ResourceUsagePerContainer map[string]*ContainerResourceUsage

// ResourceUsageSummary holds a resource usage summary, keyed by the percentile
// as a string. We can't have int here, as JSON does not accept integer keys.
type ResourceUsageSummary map[string][]SingleContainerSummary
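// For illustration, a summary that maps the 90th percentile to a single
// container entry would marshal to JSON roughly as follows (values invented):
//
//	{"90": [{"Name": "coredns-5d78c9869d-xyz12/coredns", "CPU": 0.012, "Mem": 16777216}]}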

// PrintHumanReadable prints the resource usage summary in a human-readable format.
func (s *ResourceUsageSummary) PrintHumanReadable() string {
	buf := &bytes.Buffer{}
	w := tabwriter.NewWriter(buf, 1, 0, 1, ' ', 0)
	for perc, summaries := range *s {
		buf.WriteString(fmt.Sprintf("%v percentile:\n", perc))
		fmt.Fprintf(w, "container\tcpu(cores)\tmemory(MB)\n")
		for _, summary := range summaries {
			fmt.Fprintf(w, "%q\t%.3f\t%.2f\n", summary.Name, summary.CPU, float64(summary.Mem)/(1024*1024))
		}
		w.Flush()
	}
	return buf.String()
}
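
// A sketch of what the human-readable output above looks like for one
// percentile (illustrative values only):
//
//	90 percentile:
//	container                          cpu(cores) memory(MB)
//	"coredns-5d78c9869d-xyz12/coredns" 0.012      16.00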

// PrintJSON prints the resource usage summary in JSON.
func (s *ResourceUsageSummary) PrintJSON() string {
	return framework.PrettyPrintJSON(*s)
}

// SummaryKind returns the kind of this summary: "ResourceUsageSummary".
func (s *ResourceUsageSummary) SummaryKind() string {
	return "ResourceUsageSummary"
}

type uint64arr []uint64

func (a uint64arr) Len() int           { return len(a) }
func (a uint64arr) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
func (a uint64arr) Less(i, j int) bool { return a[i] < a[j] }

type usageDataPerContainer struct {
	cpuData        []float64
	memUseData     []uint64
	memWorkSetData []uint64
}

// computePercentiles returns, for each requested percentile, the per-container
// usage at that percentile across the time series (nearest-rank method).
func computePercentiles(timeSeries []ResourceUsagePerContainer, percentilesToCompute []int) map[int]ResourceUsagePerContainer {
	if len(timeSeries) == 0 {
		return make(map[int]ResourceUsagePerContainer)
	}
	dataMap := make(map[string]*usageDataPerContainer)
	for i := range timeSeries {
		for name, data := range timeSeries[i] {
			if dataMap[name] == nil {
				dataMap[name] = &usageDataPerContainer{
					cpuData:        make([]float64, 0, len(timeSeries)),
					memUseData:     make([]uint64, 0, len(timeSeries)),
					memWorkSetData: make([]uint64, 0, len(timeSeries)),
				}
			}
			dataMap[name].cpuData = append(dataMap[name].cpuData, data.CPUUsageInCores)
			dataMap[name].memUseData = append(dataMap[name].memUseData, data.MemoryUsageInBytes)
			dataMap[name].memWorkSetData = append(dataMap[name].memWorkSetData, data.MemoryWorkingSetInBytes)
		}
	}
	for _, v := range dataMap {
		sort.Float64s(v.cpuData)
		sort.Sort(uint64arr(v.memUseData))
		sort.Sort(uint64arr(v.memWorkSetData))
	}

	result := make(map[int]ResourceUsagePerContainer)
	for _, perc := range percentilesToCompute {
		data := make(ResourceUsagePerContainer)
		for k, v := range dataMap {
			percentileIndex := int(math.Ceil(float64(len(v.cpuData)*perc)/100)) - 1
			data[k] = &ContainerResourceUsage{
				Name:                    k,
				CPUUsageInCores:         v.cpuData[percentileIndex],
				MemoryUsageInBytes:      v.memUseData[percentileIndex],
				MemoryWorkingSetInBytes: v.memWorkSetData[percentileIndex],
			}
		}
		result[perc] = data
	}
	return result
}
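
// Worked example of the nearest-rank index used above (plain arithmetic, not
// from the source): with len(v.cpuData) == 10 samples and perc == 90,
// percentileIndex = ceil(10*90/100) - 1 = 8, i.e. the 9th-smallest sample;
// with perc == 100 the index is 9, the per-container maximum.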

// leftMergeData copies left and, for each percentile present in both maps,
// merges right's per-container entries into it (right wins on key conflicts).
// Percentiles present only in right are ignored.
func leftMergeData(left, right map[int]ResourceUsagePerContainer) map[int]ResourceUsagePerContainer {
	result := make(map[int]ResourceUsagePerContainer)
	for percentile, data := range left {
		result[percentile] = data
		if _, ok := right[percentile]; !ok {
			continue
		}
		for k, v := range right[percentile] {
			result[percentile][k] = v
		}
	}
	return result
}
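
// For instance, with hypothetical usages uA..uD:
//
//	left  = {50: {"a/a": uA}, 90: {"a/a": uB}}
//	right = {90: {"b/b": uC}, 99: {"b/b": uD}}
//
// leftMergeData(left, right) yields {50: {"a/a": uA}, 90: {"a/a": uB, "b/b": uC}};
// right's 99th-percentile data is dropped because left has no such key.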

type resourceGatherWorker struct {
	c                           clientset.Interface
	nodeName                    string
	wg                          *sync.WaitGroup
	containerIDs                []string
	stopCh                      chan struct{}
	dataSeries                  []ResourceUsagePerContainer
	finished                    bool
	inKubemark                  bool
	resourceDataGatheringPeriod time.Duration
	probeDuration               time.Duration
	printVerboseLogs            bool
}

func (w *resourceGatherWorker) singleProbe(ctx context.Context) {
	data := make(ResourceUsagePerContainer)
	if w.inKubemark {
		kubemarkData := getKubemarkMasterComponentsResourceUsage(ctx)
		if kubemarkData == nil {
			return
		}
		for k, v := range kubemarkData {
			data[k] = &ContainerResourceUsage{
				Name:                    v.Name,
				MemoryWorkingSetInBytes: v.MemoryWorkingSetInBytes,
				CPUUsageInCores:         v.CPUUsageInCores,
			}
		}
	} else {
		nodeUsage, err := getOneTimeResourceUsageOnNode(w.c, w.nodeName, w.probeDuration, func() []string { return w.containerIDs })
		if err != nil {
			framework.Logf("Error while reading data from %v: %v", w.nodeName, err)
			return
		}
		for k, v := range nodeUsage {
			data[k] = v
			if w.printVerboseLogs {
				framework.Logf("Got container %v usage on node %v. CPUUsageInCores: %v, MemoryUsageInBytes: %v, MemoryWorkingSetInBytes: %v", k, w.nodeName, v.CPUUsageInCores, v.MemoryUsageInBytes, v.MemoryWorkingSetInBytes)
			}
		}
	}
	w.dataSeries = append(w.dataSeries, data)
}

// getOneTimeResourceUsageOnNode queries the node's /stats/summary endpoint
// and returns the resource usage of all containerNames for the past
// cpuInterval.
// The acceptable range of the interval is 2s~120s. Be warned that as the
// interval (and #containers) increases, the size of kubelet's response
// could be significant. E.g., the 60s interval stats for ~20 containers is
// ~1.5MB. Don't hammer the node with frequent, heavy requests.
//
// cadvisor records cumulative cpu usage in nanoseconds, so we need two
// stats points to compute the cpu usage over the interval. Assuming cadvisor
// polls every second, we'd need to get N stats points for an N-second interval.
// Note that this is an approximation and may not be accurate, hence we also
// write the actual interval used for the calculation (based on the timestamps
// of the stats points) in ContainerResourceUsage.CPUInterval.
//
// containerNames is a function returning the collection of container names
// the user is interested in.
func getOneTimeResourceUsageOnNode(
	c clientset.Interface,
	nodeName string,
	cpuInterval time.Duration,
	containerNames func() []string,
) (ResourceUsagePerContainer, error) {
	const (
		// cadvisor records stats about every second.
		cadvisorStatsPollingIntervalInSeconds float64 = 1.0
		// cadvisor caches up to 2 minutes of stats (configured by kubelet).
		maxNumStatsToRequest int = 120
	)

	numStats := int(float64(cpuInterval.Seconds()) / cadvisorStatsPollingIntervalInSeconds)
	if numStats < 2 || numStats > maxNumStatsToRequest {
		return nil, fmt.Errorf("numStats needs to be >= 2 and <= %d", maxNumStatsToRequest)
	}
	// Get information of all containers on the node.
	summary, err := getStatsSummary(c, nodeName)
	if err != nil {
		return nil, err
	}

	f := func(name string, newStats *kubeletstatsv1alpha1.ContainerStats) *ContainerResourceUsage {
		if newStats == nil || newStats.CPU == nil || newStats.Memory == nil {
			return nil
		}
		return &ContainerResourceUsage{
			Name:                    name,
			Timestamp:               newStats.StartTime.Time,
			CPUUsageInCores:         float64(removeUint64Ptr(newStats.CPU.UsageNanoCores)) / 1000000000,
			MemoryUsageInBytes:      removeUint64Ptr(newStats.Memory.UsageBytes),
			MemoryWorkingSetInBytes: removeUint64Ptr(newStats.Memory.WorkingSetBytes),
			MemoryRSSInBytes:        removeUint64Ptr(newStats.Memory.RSSBytes),
			CPUInterval:             0,
		}
	}
	// Process container infos that are relevant to us.
	containers := containerNames()
	usageMap := make(ResourceUsagePerContainer, len(containers))
	for _, pod := range summary.Pods {
		for _, container := range pod.Containers {
			isInteresting := false
			for _, interestingContainerName := range containers {
				if container.Name == interestingContainerName {
					isInteresting = true
					break
				}
			}
			if !isInteresting {
				continue
			}
			if usage := f(pod.PodRef.Name+"/"+container.Name, &container); usage != nil {
				usageMap[pod.PodRef.Name+"/"+container.Name] = usage
			}
		}
	}
	return usageMap, nil
}
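
// A minimal usage sketch (node and container names are hypothetical, error
// handling elided):
//
//	usage, _ := getOneTimeResourceUsageOnNode(c, "node-1", 30*time.Second,
//		func() []string { return []string{"kube-proxy"} })
//	for name, u := range usage { // keys look like "kube-proxy-abc12/kube-proxy"
//		fmt.Printf("%s: %.3f cores, %d bytes working set\n",
//			name, u.CPUUsageInCores, u.MemoryWorkingSetInBytes)
//	}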

// getStatsSummary contacts kubelet for the container information.
func getStatsSummary(c clientset.Interface, nodeName string) (*kubeletstatsv1alpha1.Summary, error) {
	ctx, cancel := context.WithTimeout(context.Background(), framework.SingleCallTimeout)
	defer cancel()

	data, err := c.CoreV1().RESTClient().Get().
		Resource("nodes").
		SubResource("proxy").
		Name(fmt.Sprintf("%v:%v", nodeName, framework.KubeletPort)).
		Suffix("stats/summary").
		Do(ctx).Raw()

	if err != nil {
		return nil, err
	}

	summary := kubeletstatsv1alpha1.Summary{}
	err = json.Unmarshal(data, &summary)
	if err != nil {
		return nil, err
	}
	return &summary, nil
}
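
// The request built above goes through the API server's node-proxy
// subresource; for a node named "node-1" it is equivalent to a GET of the
// following path (shown for illustration, assuming framework.KubeletPort is
// the default kubelet port 10250):
//
//	/api/v1/nodes/node-1:10250/proxy/stats/summary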

func removeUint64Ptr(ptr *uint64) uint64 {
	if ptr == nil {
		return 0
	}
	return *ptr
}

// gather probes the node once per resourceDataGatheringPeriod, after an
// initial staggering sleep, until the context is cancelled or stopCh is closed.
func (w *resourceGatherWorker) gather(ctx context.Context, initialSleep time.Duration) {
	defer utilruntime.HandleCrash()
	defer w.wg.Done()
	defer framework.Logf("Closing worker for %v", w.nodeName)
	defer func() { w.finished = true }()
	select {
	case <-time.After(initialSleep):
		w.singleProbe(ctx)
		for {
			select {
			case <-time.After(w.resourceDataGatheringPeriod):
				w.singleProbe(ctx)
			case <-ctx.Done():
				return
			case <-w.stopCh:
				return
			}
		}
	case <-ctx.Done():
		return
	case <-w.stopCh:
		return
	}
}

// ContainerResourceGatherer gathers container resource usage from a set of nodes.
type ContainerResourceGatherer struct {
	client       clientset.Interface
	stopCh       chan struct{}
	workers      []resourceGatherWorker
	workerWg     sync.WaitGroup
	containerIDs []string
	options      ResourceGathererOptions
}

// ResourceGathererOptions holds options for resource gathering.
type ResourceGathererOptions struct {
	InKubemark                  bool
	Nodes                       NodesSet
	ResourceDataGatheringPeriod time.Duration
	ProbeDuration               time.Duration
	PrintVerboseLogs            bool
}

// NodesSet selects the set of nodes whose containers are gathered.
type NodesSet int

const (
	// AllNodes means all containers on all nodes.
	AllNodes NodesSet = 0
	// MasterNodes means all containers on Master nodes only.
	MasterNodes NodesSet = 1
	// MasterAndDNSNodes means all containers on Master nodes and DNS containers on other nodes.
	MasterAndDNSNodes NodesSet = 2
)

// nodeHasControlPlanePods returns true if the specified node has control plane pods
// (kube-scheduler and/or kube-controller-manager).
func nodeHasControlPlanePods(ctx context.Context, c clientset.Interface, nodeName string) (bool, error) {
	regKubeScheduler := regexp.MustCompile("kube-scheduler-.*")
	regKubeControllerManager := regexp.MustCompile("kube-controller-manager-.*")

	podList, err := c.CoreV1().Pods(metav1.NamespaceSystem).List(ctx, metav1.ListOptions{
		FieldSelector: fields.OneTermEqualSelector("spec.nodeName", nodeName).String(),
	})
	if err != nil {
		return false, err
	}
	if len(podList.Items) < 1 {
		framework.Logf("Can't find any pods in namespace %s to grab metrics from", metav1.NamespaceSystem)
	}
	for _, pod := range podList.Items {
		if regKubeScheduler.MatchString(pod.Name) || regKubeControllerManager.MatchString(pod.Name) {
			return true, nil
		}
	}
	return false, nil
}

// NewResourceUsageGatherer returns a new ContainerResourceGatherer.
func NewResourceUsageGatherer(ctx context.Context, c clientset.Interface, options ResourceGathererOptions, pods *v1.PodList) (*ContainerResourceGatherer, error) {
	g := ContainerResourceGatherer{
		client:       c,
		stopCh:       make(chan struct{}),
		containerIDs: make([]string, 0),
		options:      options,
	}

	if options.InKubemark {
		g.workerWg.Add(1)
		g.workers = append(g.workers, resourceGatherWorker{
			inKubemark:                  true,
			stopCh:                      g.stopCh,
			wg:                          &g.workerWg,
			finished:                    false,
			resourceDataGatheringPeriod: options.ResourceDataGatheringPeriod,
			probeDuration:               options.ProbeDuration,
			printVerboseLogs:            options.PrintVerboseLogs,
		})
		return &g, nil
	}

	// Tracks kube-system pods if no valid PodList is passed in.
	var err error
	if pods == nil {
		pods, err = c.CoreV1().Pods("kube-system").List(ctx, metav1.ListOptions{})
		if err != nil {
			framework.Logf("Error while listing Pods: %v", err)
			return nil, err
		}
	}
	dnsNodes := make(map[string]bool)
	for _, pod := range pods.Items {
		if options.Nodes == MasterNodes {
			isControlPlane, err := nodeHasControlPlanePods(ctx, c, pod.Spec.NodeName)
			if err != nil {
				return nil, err
			}
			if !isControlPlane {
				continue
			}
		}
		if options.Nodes == MasterAndDNSNodes {
			isControlPlane, err := nodeHasControlPlanePods(ctx, c, pod.Spec.NodeName)
			if err != nil {
				return nil, err
			}
			if !isControlPlane && pod.Labels["k8s-app"] != "kube-dns" {
				continue
			}
		}
		for _, container := range pod.Status.InitContainerStatuses {
			g.containerIDs = append(g.containerIDs, container.Name)
		}
		for _, container := range pod.Status.ContainerStatuses {
			g.containerIDs = append(g.containerIDs, container.Name)
		}
		if options.Nodes == MasterAndDNSNodes {
			dnsNodes[pod.Spec.NodeName] = true
		}
	}
	nodeList, err := c.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
	if err != nil {
		framework.Logf("Error while listing Nodes: %v", err)
		return nil, err
	}

	for _, node := range nodeList.Items {
		isControlPlane, err := nodeHasControlPlanePods(ctx, c, node.Name)
		if err != nil {
			return nil, err
		}
		if options.Nodes == AllNodes || isControlPlane || dnsNodes[node.Name] {
			g.workerWg.Add(1)
			g.workers = append(g.workers, resourceGatherWorker{
				c:                           c,
				nodeName:                    node.Name,
				wg:                          &g.workerWg,
				containerIDs:                g.containerIDs,
				stopCh:                      g.stopCh,
				finished:                    false,
				inKubemark:                  false,
				resourceDataGatheringPeriod: options.ResourceDataGatheringPeriod,
				probeDuration:               options.ProbeDuration,
				printVerboseLogs:            options.PrintVerboseLogs,
			})
			if options.Nodes == MasterNodes {
				break
			}
		}
	}
	return &g, nil
}
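
// A minimal end-to-end sketch of the gatherer's lifecycle (client, durations,
// and constraint values are illustrative; error handling elided):
//
//	opts := ResourceGathererOptions{
//		Nodes:                       AllNodes,
//		ResourceDataGatheringPeriod: 10 * time.Second,
//		ProbeDuration:               15 * time.Second,
//	}
//	gatherer, _ := NewResourceUsageGatherer(ctx, client, opts, nil)
//	go gatherer.StartGatheringData(ctx) // blocks, so run in a goroutine
//	// ... run the workload under test ...
//	summary, err := gatherer.StopAndSummarize([]int{50, 90, 99}, map[string]ResourceConstraint{
//		"kube-proxy": {CPUConstraint: 0.1, MemoryConstraint: 100 * 1024 * 1024},
//	})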

// StartGatheringData starts a stat gathering worker for each node to track,
// staggering their start times across the gathering period, and blocks until
// StopAndSummarize is called.
func (g *ContainerResourceGatherer) StartGatheringData(ctx context.Context) {
	if len(g.workers) == 0 {
		return
	}
	delayPeriod := g.options.ResourceDataGatheringPeriod / time.Duration(len(g.workers))
	delay := time.Duration(0)
	for i := range g.workers {
		go g.workers[i].gather(ctx, delay)
		delay += delayPeriod
	}
	g.workerWg.Wait()
}
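
// For example (simple arithmetic): with 4 workers and a 10s
// ResourceDataGatheringPeriod, gather is started with initial sleeps of 0s,
// 2.5s, 5s, and 7.5s, spreading the probes evenly across each period instead
// of hitting every kubelet at once.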

// StopAndSummarize stops stat gathering workers, processes the collected stats,
// generates a resource summary for the passed-in percentiles, and returns the summary.
// It returns an error if the resource usage at any percentile is beyond the
// specified resource constraints.
func (g *ContainerResourceGatherer) StopAndSummarize(percentiles []int, constraints map[string]ResourceConstraint) (*ResourceUsageSummary, error) {
	close(g.stopCh)
	framework.Logf("Closed stop channel. Waiting for %v workers", len(g.workers))
	finished := make(chan struct{}, 1)
	go func() {
		g.workerWg.Wait()
		finished <- struct{}{}
	}()
	select {
	case <-finished:
		framework.Logf("Waitgroup finished.")
	case <-time.After(2 * time.Minute):
		unfinished := make([]string, 0)
		for i := range g.workers {
			if !g.workers[i].finished {
				unfinished = append(unfinished, g.workers[i].nodeName)
			}
		}
		framework.Logf("Timed out while waiting for waitgroup, some workers failed to finish: %v", unfinished)
	}

	if len(percentiles) == 0 {
		framework.Logf("Warning! Empty percentile list for StopAndSummarize.")
		return &ResourceUsageSummary{}, fmt.Errorf("failed to get any resource usage data")
	}
	// Workers have been stopped; gather the data stored in them.
	data := make(map[int]ResourceUsagePerContainer)
	for i := range g.workers {
		if g.workers[i].finished {
			stats := computePercentiles(g.workers[i].dataSeries, percentiles)
			data = leftMergeData(stats, data)
		}
	}

	sortedKeys := []string{}
	for name := range data[percentiles[0]] {
		sortedKeys = append(sortedKeys, name)
	}
	sort.Strings(sortedKeys)
	violatedConstraints := make([]string, 0)
	summary := make(ResourceUsageSummary)
	for _, perc := range percentiles {
		for _, name := range sortedKeys {
			usage := data[perc][name]
			summary[strconv.Itoa(perc)] = append(summary[strconv.Itoa(perc)], SingleContainerSummary{
				Name: name,
				CPU:  usage.CPUUsageInCores,
				Mem:  usage.MemoryWorkingSetInBytes,
			})

			// Verify the 99th percentile of resource usage against the constraints.
			if perc != 99 {
				continue
			}
			// Name has the form: <pod_name>/<container_name>
			containerName := strings.Split(name, "/")[1]
			constraint, ok := constraints[containerName]
			if !ok {
				continue
			}
			if usage.CPUUsageInCores > constraint.CPUConstraint {
				violatedConstraints = append(
					violatedConstraints,
					fmt.Sprintf("Container %v is using %v/%v CPU",
						name,
						usage.CPUUsageInCores,
						constraint.CPUConstraint,
					),
				)
			}
			if usage.MemoryWorkingSetInBytes > constraint.MemoryConstraint {
				violatedConstraints = append(
					violatedConstraints,
					fmt.Sprintf("Container %v is using %v/%v MB of memory",
						name,
						float64(usage.MemoryWorkingSetInBytes)/(1024*1024),
						float64(constraint.MemoryConstraint)/(1024*1024),
					),
				)
			}
		}
	}
	if len(violatedConstraints) > 0 {
		return &summary, fmt.Errorf("%s", strings.Join(violatedConstraints, "\n"))
	}
	return &summary, nil
}
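
// When constraints are violated, the returned error joins one line per
// violation; with invented numbers it reads like:
//
//	Container kube-proxy-abc12/kube-proxy is using 0.15/0.1 CPU
//	Container kube-proxy-abc12/kube-proxy is using 120/100 MB of memory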

// kubemarkResourceUsage tracks the resource usage of a single kubemark master component.
type kubemarkResourceUsage struct {
	Name                    string
	MemoryWorkingSetInBytes uint64
	CPUUsageInCores         float64
}

// getMasterUsageByPrefix returns the CPU and RSS columns of `ps` output for
// master processes whose command line matches prefix.
func getMasterUsageByPrefix(ctx context.Context, prefix string) (string, error) {
	sshResult, err := e2essh.SSH(ctx, fmt.Sprintf("ps ax -o %%cpu,rss,command | tail -n +2 | grep %v | sed 's/\\s+/ /g'", prefix), framework.APIAddress()+":22", framework.TestContext.Provider)
	if err != nil {
		return "", err
	}
	return sshResult.Stdout, nil
}
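
// Each line of the returned output has the form (values invented):
//
//	 2.5 123456 /usr/local/bin/kube-apiserver --flag=value ...
//
// i.e. %CPU, RSS in KiB, then the command line; the parsers below rely on
// this column layout.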

// getKubemarkMasterComponentsResourceUsage returns the CPU and memory usage of
// the kubemark master components, keyed in pod_name/container_name format.
func getKubemarkMasterComponentsResourceUsage(ctx context.Context) map[string]*kubemarkResourceUsage {
	result := make(map[string]*kubemarkResourceUsage)
	// Get kubernetes component resource usage.
	sshResult, err := getMasterUsageByPrefix(ctx, "kube")
	if err != nil {
		framework.Logf("Error when trying to SSH to master machine. Skipping probe. %v", err)
		return nil
	}
	scanner := bufio.NewScanner(strings.NewReader(sshResult))
	for scanner.Scan() {
		var cpu float64
		var mem uint64
		var name string
		fmt.Sscanf(strings.TrimSpace(scanner.Text()), "%f %d /usr/local/bin/kube-%s", &cpu, &mem, &name)
		if name != "" {
			// Gatherer expects pod_name/container_name format.
			fullName := name + "/" + name
			result[fullName] = &kubemarkResourceUsage{Name: fullName, MemoryWorkingSetInBytes: mem * 1024, CPUUsageInCores: cpu / 100}
		}
	}
	// Get etcd resource usage.
	sshResult, err = getMasterUsageByPrefix(ctx, "bin/etcd")
	if err != nil {
		framework.Logf("Error when trying to SSH to master machine. Skipping probe. %v", err)
		return nil
	}
	scanner = bufio.NewScanner(strings.NewReader(sshResult))
	for scanner.Scan() {
		var cpu float64
		var mem uint64
		var etcdKind string
		fmt.Sscanf(strings.TrimSpace(scanner.Text()), "%f %d /bin/sh -c /usr/local/bin/etcd", &cpu, &mem)
		dataDirStart := strings.Index(scanner.Text(), "--data-dir")
		if dataDirStart < 0 {
			continue
		}
		fmt.Sscanf(scanner.Text()[dataDirStart:], "--data-dir=/var/%s", &etcdKind)
		if etcdKind != "" {
			// Gatherer expects pod_name/container_name format.
			fullName := "etcd/" + etcdKind
			result[fullName] = &kubemarkResourceUsage{Name: fullName, MemoryWorkingSetInBytes: mem * 1024, CPUUsageInCores: cpu / 100}
		}
	}
	return result
}