k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/util/gatherers/container_resource_gatherer.go

/*
Copyright 2018 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package gatherers

import (
	"context"
	"fmt"
	"sort"
	"strconv"
	"sync"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/sets"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/klog/v2"
	"k8s.io/perf-tests/clusterloader2/pkg/measurement/util"
	"k8s.io/perf-tests/clusterloader2/pkg/provider"
	pkgutil "k8s.io/perf-tests/clusterloader2/pkg/util"
)

// NodesSet is a flag defining the node set range.
type NodesSet int

const (
	// AllNodes - all containers on all nodes.
	AllNodes NodesSet = 0
	// MasterAndNonDaemons - all containers on master nodes and non-daemons on other nodes.
	MasterAndNonDaemons NodesSet = 1
)

// ResourceUsageSummary represents summary of resource usage per container.
type ResourceUsageSummary map[string][]util.SingleContainerSummary

// Get returns collection of SingleContainerSummaries for given percentile.
func (r *ResourceUsageSummary) Get(perc string) []util.SingleContainerSummary {
	return (*r)[perc]
}

// ContainerResourceGatherer gathers resource metrics from containers.
type ContainerResourceGatherer struct {
	client       clientset.Interface
	isRunning    bool
	stopCh       chan struct{}
	workers      []resourceGatherWorker
	workerWg     sync.WaitGroup
	containerIDs []string
	options      ResourceGathererOptions
}

// ResourceGathererOptions specifies options for ContainerResourceGatherer.
type ResourceGathererOptions struct {
	InKubemark                        bool
	Nodes                             NodesSet
	ResourceDataGatheringPeriod       time.Duration
	MasterResourceDataGatheringPeriod time.Duration
}

func isDaemonPod(pod *corev1.Pod) bool {
	controller := metav1.GetControllerOf(pod)
	if controller == nil {
		// If the controller is unset, assume it's not a daemon pod.
		return false
	}
	return controller.Kind == "DaemonSet" || controller.Kind == "Node"
}

// NewResourceUsageGatherer creates a new instance of ContainerResourceGatherer.
func NewResourceUsageGatherer(c clientset.Interface, host string, port int, provider provider.Provider, options ResourceGathererOptions, namespace string) (*ContainerResourceGatherer, error) {
	g := ContainerResourceGatherer{
		client:       c,
		isRunning:    true,
		stopCh:       make(chan struct{}),
		containerIDs: make([]string, 0),
		options:      options,
	}

	if options.InKubemark {
		g.workerWg.Add(1)
		g.workers = append(g.workers, resourceGatherWorker{
			inKubemark:                  true,
			stopCh:                      g.stopCh,
			wg:                          &g.workerWg,
			finished:                    false,
			resourceDataGatheringPeriod: options.ResourceDataGatheringPeriod,
			host:                        host,
			port:                        port,
			provider:                    provider,
		})
	} else {
		pods, err := c.CoreV1().Pods(namespace).List(context.TODO(), metav1.ListOptions{})
		if err != nil {
			return nil, fmt.Errorf("listing pods error: %v", err)
		}

		nodeList, err := c.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
		if err != nil {
			return nil, fmt.Errorf("listing nodes error: %v", err)
		}

		masterNodes := sets.NewString()
		for _, node := range nodeList.Items {
			if pkgutil.LegacyIsMasterNode(&node) || pkgutil.IsControlPlaneNode(&node) {
				masterNodes.Insert(node.Name)
			}
		}

		nodesToConsider := make(map[string]bool)
		for _, pod := range pods.Items {
			// For MasterAndNonDaemons, skip daemon pods running on non-master nodes.
			if (options.Nodes == MasterAndNonDaemons) && !masterNodes.Has(pod.Spec.NodeName) && isDaemonPod(&pod) {
				continue
			}
			for _, container := range pod.Status.InitContainerStatuses {
				g.containerIDs = append(g.containerIDs, container.Name)
			}
			for _, container := range pod.Status.ContainerStatuses {
				g.containerIDs = append(g.containerIDs, container.Name)
			}
			if options.Nodes == MasterAndNonDaemons {
				nodesToConsider[pod.Spec.NodeName] = true
			}
		}

		for _, node := range nodeList.Items {
			if options.Nodes == AllNodes || masterNodes.Has(node.Name) || nodesToConsider[node.Name] {
				g.workerWg.Add(1)
				resourceDataGatheringPeriod := options.ResourceDataGatheringPeriod
				if masterNodes.Has(node.Name) {
					resourceDataGatheringPeriod = options.MasterResourceDataGatheringPeriod
				}
				g.workers = append(g.workers, resourceGatherWorker{
					c:                           c,
					nodeName:                    node.Name,
					wg:                          &g.workerWg,
					containerIDs:                g.containerIDs,
					stopCh:                      g.stopCh,
					finished:                    false,
					inKubemark:                  false,
					resourceDataGatheringPeriod: resourceDataGatheringPeriod,
					port:                        port,
				})
			}
		}
	}
	return &g, nil
}

// StartGatheringData starts a stat-gathering worker for each node to track and
// blocks until StopAndSummarize is called.
func (g *ContainerResourceGatherer) StartGatheringData() {
	if len(g.workers) == 0 {
		return
	}
	delayPeriod := g.options.ResourceDataGatheringPeriod / time.Duration(len(g.workers))
	delay := time.Duration(0)
	for i := range g.workers {
		go g.workers[i].gather(delay)
		delay += delayPeriod
	}
	g.workerWg.Wait()
}

// StopAndSummarize stops the stat-gathering workers, processes the collected stats,
// generates a resource summary for the passed-in percentiles, and returns the summary.
func (g *ContainerResourceGatherer) StopAndSummarize(percentiles []int) (*ResourceUsageSummary, error) {
	g.stop()
	klog.V(2).Infof("Closed stop channel. Waiting for %v workers", len(g.workers))
	finished := make(chan struct{})
	go func() {
		g.workerWg.Wait()
		finished <- struct{}{}
	}()
	select {
	case <-finished:
		klog.V(2).Infof("Waitgroup finished.")
	case <-time.After(2 * time.Minute):
		unfinished := make([]string, 0)
		for i := range g.workers {
			if !g.workers[i].finished {
				unfinished = append(unfinished, g.workers[i].nodeName)
			}
		}
		klog.V(1).Infof("Timed out while waiting for waitgroup, some workers failed to finish: %v", unfinished)
	}

	if len(percentiles) == 0 {
		klog.Warningf("Empty percentile list provided to StopAndSummarize.")
		return &ResourceUsageSummary{}, fmt.Errorf("failed to get any resource usage data")
	}

	// Workers have been stopped; gather the data stored in them.
	data := make(map[int]util.ResourceUsagePerContainer)
	for i := range g.workers {
		if g.workers[i].finished {
			stats := util.ComputePercentiles(g.workers[i].dataSeries, percentiles)
			data = util.LeftMergeData(stats, data)
		}
	}

	sortedKeys := []string{}
	for name := range data[percentiles[0]] {
		sortedKeys = append(sortedKeys, name)
	}
	sort.Strings(sortedKeys)
	summary := make(ResourceUsageSummary)
	for _, perc := range percentiles {
		for _, name := range sortedKeys {
			usage := data[perc][name]
			summary[strconv.Itoa(perc)] = append(summary[strconv.Itoa(perc)], util.SingleContainerSummary{
				Name: name,
				CPU:  usage.CPUUsageInCores,
				Mem:  usage.MemoryWorkingSetInBytes,
			})
		}
	}
	return &summary, nil
}

// Dispose disposes the container resource gatherer.
func (g *ContainerResourceGatherer) Dispose() {
	g.stop()
}

func (g *ContainerResourceGatherer) stop() {
	if g.isRunning {
		g.isRunning = false
		close(g.stopCh)
	}
}
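For orientation, below is a minimal usage sketch that is not part of the original file. It shows one way the gatherer could be driven from measurement code: construct it, launch StartGatheringData on its own goroutine (it blocks until the stop channel is closed), run the measured workload, then stop and summarize a few percentiles. It assumes the same package and imports as the file above plus a caller-supplied clientset, API server host/port, and provider.Provider; the helper name runGatherer, the gathering periods, and the "kube-system" namespace are illustrative assumptions, not values taken from the source.

// runGatherer is a hypothetical helper (not part of this package's API) showing
// one possible call sequence for ContainerResourceGatherer.
func runGatherer(c clientset.Interface, host string, port int, p provider.Provider, runWorkload func()) error {
	options := ResourceGathererOptions{
		Nodes:                             MasterAndNonDaemons,
		ResourceDataGatheringPeriod:       60 * time.Second, // assumed sampling period; tune per test
		MasterResourceDataGatheringPeriod: 10 * time.Second, // assumed period; masters sampled more often
	}
	// "kube-system" is an assumed namespace whose pods define the tracked containers.
	g, err := NewResourceUsageGatherer(c, host, port, p, options, "kube-system")
	if err != nil {
		return err
	}
	// StartGatheringData blocks until the stop channel is closed, so run it asynchronously.
	go g.StartGatheringData()
	defer g.Dispose() // harmless after StopAndSummarize; stop() closes the channel only once

	runWorkload() // caller-supplied workload to measure

	summary, err := g.StopAndSummarize([]int{50, 90, 99})
	if err != nil {
		return err
	}
	for _, cs := range summary.Get("99") {
		klog.Infof("99th percentile for %s: %v cores, %v bytes working set", cs.Name, cs.CPU, cs.Mem)
	}
	return nil
}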