k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/util/gatherers/container_resource_gatherer.go

/*
Copyright 2018 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package gatherers

import (
	"context"
	"fmt"
	"sort"
	"strconv"
	"sync"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/sets"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/klog/v2"
	"k8s.io/perf-tests/clusterloader2/pkg/measurement/util"
	"k8s.io/perf-tests/clusterloader2/pkg/provider"
	pkgutil "k8s.io/perf-tests/clusterloader2/pkg/util"
)

// NodesSet is a flag defining the node set range.
type NodesSet int

const (
	// AllNodes - all containers on all nodes.
	AllNodes NodesSet = 0
	// MasterAndNonDaemons - all containers on Master nodes and non-daemons on other nodes.
	MasterAndNonDaemons NodesSet = 1
)

// ResourceUsageSummary represents a summary of resource usage per container.
type ResourceUsageSummary map[string][]util.SingleContainerSummary

// Get returns the collection of SingleContainerSummaries for the given percentile.
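// The percentile keys are the integer percentiles rendered as strings by
// StopAndSummarize (via strconv.Itoa), so a summary built for percentiles
// 50 and 99 is read back with Get("50") and Get("99"). A minimal sketch of
// consuming one percentile, assuming a summary has already been produced:
//
//	for _, c := range summary.Get("99") {
//		fmt.Printf("%s: %v cores, %v bytes\n", c.Name, c.CPU, c.Mem)
//	}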
func (r *ResourceUsageSummary) Get(perc string) []util.SingleContainerSummary {
	return (*r)[perc]
}

// ContainerResourceGatherer gathers resource metrics from containers.
type ContainerResourceGatherer struct {
	client       clientset.Interface
	isRunning    bool
	stopCh       chan struct{}
	workers      []resourceGatherWorker
	workerWg     sync.WaitGroup
	containerIDs []string
	options      ResourceGathererOptions
}

// ResourceGathererOptions specifies options for ContainerResourceGatherer.
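// InKubemark replaces the per-node workers with a single worker that is
// pointed at the given host and port; Nodes restricts which nodes are
// tracked; the two gathering periods control how often workers poll, with
// master nodes using MasterResourceDataGatheringPeriod.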
type ResourceGathererOptions struct {
	InKubemark                        bool
	Nodes                             NodesSet
	ResourceDataGatheringPeriod       time.Duration
	MasterResourceDataGatheringPeriod time.Duration
}

func isDaemonPod(pod *corev1.Pod) bool {
	controller := metav1.GetControllerOf(pod)
	if controller == nil {
		// If controller is unset, assume it's not a daemon pod.
		return false
	}
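	// DaemonSet pods count as daemons, as do mirror (static) pods, which are
	// reported with the Node object as their controller.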
	return controller.Kind == "DaemonSet" || controller.Kind == "Node"
}

// NewResourceUsageGatherer creates a new instance of ContainerResourceGatherer.
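//
// A typical call sequence, sketched under the assumption that a client,
// provider, and options have already been constructed, is:
//
//	g, err := NewResourceUsageGatherer(c, host, port, provider, options, namespace)
//	if err != nil {
//		return err
//	}
//	go g.StartGatheringData() // blocks until StopAndSummarize is called
//	// ... run the measured workload ...
//	summary, err := g.StopAndSummarize([]int{50, 90, 99})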
func NewResourceUsageGatherer(c clientset.Interface, host string, port int, provider provider.Provider, options ResourceGathererOptions, namespace string) (*ContainerResourceGatherer, error) {
	g := ContainerResourceGatherer{
		client:       c,
		isRunning:    true,
		stopCh:       make(chan struct{}),
		containerIDs: make([]string, 0),
		options:      options,
	}

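	// Two gathering modes: a kubemark run uses a single worker pointed at the
	// given host and port, while a regular run creates one worker per node
	// selected below.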
	if options.InKubemark {
		g.workerWg.Add(1)
		g.workers = append(g.workers, resourceGatherWorker{
			inKubemark:                  true,
			stopCh:                      g.stopCh,
			wg:                          &g.workerWg,
			finished:                    false,
			resourceDataGatheringPeriod: options.ResourceDataGatheringPeriod,
			host:                        host,
			port:                        port,
			provider:                    provider,
		})
	} else {
		pods, err := c.CoreV1().Pods(namespace).List(context.TODO(), metav1.ListOptions{})
		if err != nil {
			return nil, fmt.Errorf("listing pods error: %v", err)
		}

		nodeList, err := c.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
		if err != nil {
			return nil, fmt.Errorf("listing nodes error: %v", err)
		}

		masterNodes := sets.NewString()
		for _, node := range nodeList.Items {
			if pkgutil.LegacyIsMasterNode(&node) || pkgutil.IsControlPlaneNode(&node) {
				masterNodes.Insert(node.Name)
			}
		}

		nodesToConsider := make(map[string]bool)
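		// Record every container name to track. For MasterAndNonDaemons, daemon
		// pods running outside the master nodes are skipped, and only nodes that
		// still host at least one tracked pod are marked for gathering.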
		for _, pod := range pods.Items {
			if (options.Nodes == MasterAndNonDaemons) && !masterNodes.Has(pod.Spec.NodeName) && isDaemonPod(&pod) {
				continue
			}
			for _, container := range pod.Status.InitContainerStatuses {
				g.containerIDs = append(g.containerIDs, container.Name)
			}
			for _, container := range pod.Status.ContainerStatuses {
				g.containerIDs = append(g.containerIDs, container.Name)
			}
			if options.Nodes == MasterAndNonDaemons {
				nodesToConsider[pod.Spec.NodeName] = true
			}
		}

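		// Start one gathering worker per node of interest. Master nodes poll on
		// MasterResourceDataGatheringPeriod; all other nodes poll on
		// ResourceDataGatheringPeriod.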
		for _, node := range nodeList.Items {
			if options.Nodes == AllNodes || masterNodes.Has(node.Name) || nodesToConsider[node.Name] {
				g.workerWg.Add(1)
				resourceDataGatheringPeriod := options.ResourceDataGatheringPeriod
				if masterNodes.Has(node.Name) {
					resourceDataGatheringPeriod = options.MasterResourceDataGatheringPeriod
				}
				g.workers = append(g.workers, resourceGatherWorker{
					c:                           c,
					nodeName:                    node.Name,
					wg:                          &g.workerWg,
					containerIDs:                g.containerIDs,
					stopCh:                      g.stopCh,
					finished:                    false,
					inKubemark:                  false,
					resourceDataGatheringPeriod: resourceDataGatheringPeriod,
					port:                        port,
				})
			}
		}
	}
	return &g, nil
}

// StartGatheringData starts a stat-gathering worker for each node to track,
// and blocks until StopAndSummarize is called.
func (g *ContainerResourceGatherer) StartGatheringData() {
	if len(g.workers) == 0 {
		return
	}
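	// Stagger worker start times evenly across one gathering period so that
	// the workers do not all scrape at the same moment.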
	delayPeriod := g.options.ResourceDataGatheringPeriod / time.Duration(len(g.workers))
	delay := time.Duration(0)
	for i := range g.workers {
		go g.workers[i].gather(delay)
		delay += delayPeriod
	}
	g.workerWg.Wait()
}

// StopAndSummarize stops the stat-gathering workers, processes the collected stats,
// generates a resource summary for the passed-in percentiles, and returns the summary.
func (g *ContainerResourceGatherer) StopAndSummarize(percentiles []int) (*ResourceUsageSummary, error) {
	g.stop()
	klog.V(2).Infof("Closed stop channel. Waiting for %v workers", len(g.workers))
	finished := make(chan struct{})
	go func() {
		g.workerWg.Wait()
		finished <- struct{}{}
	}()
	select {
	case <-finished:
		klog.V(2).Infof("Waitgroup finished.")
	case <-time.After(2 * time.Minute):
		unfinished := make([]string, 0)
		for i := range g.workers {
			if !g.workers[i].finished {
				unfinished = append(unfinished, g.workers[i].nodeName)
			}
		}
		klog.V(1).Infof("Timed out while waiting for waitgroup, some workers failed to finish: %v", unfinished)
	}
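	// Only workers that finished cleanly contribute data below; workers that are
	// still running after the timeout are skipped.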

	if len(percentiles) == 0 {
		klog.Warningf("Empty percentile list for StopAndSummarize.")
		return &ResourceUsageSummary{}, fmt.Errorf("failed to get any resource usage data")
	}
	// Workers have been stopped; gather the data stored in them.
	data := make(map[int]util.ResourceUsagePerContainer)
	for i := range g.workers {
		if g.workers[i].finished {
			stats := util.ComputePercentiles(g.workers[i].dataSeries, percentiles)
			data = util.LeftMergeData(stats, data)
		}
	}

	sortedKeys := []string{}
	for name := range data[percentiles[0]] {
		sortedKeys = append(sortedKeys, name)
	}
	sort.Strings(sortedKeys)
	summary := make(ResourceUsageSummary)
	for _, perc := range percentiles {
		for _, name := range sortedKeys {
			usage := data[perc][name]
			summary[strconv.Itoa(perc)] = append(summary[strconv.Itoa(perc)], util.SingleContainerSummary{
				Name: name,
				CPU:  usage.CPUUsageInCores,
				Mem:  usage.MemoryWorkingSetInBytes,
			})
		}
	}
	return &summary, nil
}

// Dispose disposes of the container resource gatherer.
func (g *ContainerResourceGatherer) Dispose() {
	g.stop()
}

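// stop closes the stop channel at most once; repeated calls (e.g. Dispose after
// StopAndSummarize) are no-ops.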
func (g *ContainerResourceGatherer) stop() {
	if g.isRunning {
		g.isRunning = false
		close(g.stopCh)
	}
}