k8s.io/kubernetes@v1.29.3/test/e2e/framework/debug/log_size_monitoring.go

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package debug

import (
	"bytes"
	"context"
	"fmt"
	"strconv"
	"strings"
	"sync"
	"text/tabwriter"
	"time"

	clientset "k8s.io/client-go/kubernetes"

	"k8s.io/kubernetes/test/e2e/framework"
	e2essh "k8s.io/kubernetes/test/e2e/framework/ssh"
)

const (
	// Minimal period between polling log sizes from components.
	pollingPeriod            = 60 * time.Second
	workersNo                = 5
	kubeletLogsPath          = "/var/log/kubelet.log"
	kubeProxyLogsPath        = "/var/log/kube-proxy.log"
	kubeAddonsLogsPath       = "/var/log/kube-addons.log"
	kubeMasterAddonsLogsPath = "/var/log/kube-master-addons.log"
	apiServerLogsPath        = "/var/log/kube-apiserver.log"
	controllersLogsPath      = "/var/log/kube-controller-manager.log"
	schedulerLogsPath        = "/var/log/kube-scheduler.log"
)

var (
	nodeLogsToCheck   = []string{kubeletLogsPath, kubeProxyLogsPath}
	masterLogsToCheck = []string{kubeletLogsPath, kubeAddonsLogsPath, kubeMasterAddonsLogsPath,
		apiServerLogsPath, controllersLogsPath, schedulerLogsPath}
)

// TimestampedSize contains a size together with a time of measurement.
type TimestampedSize struct {
	timestamp time.Time
	size      int
}

// LogSizeGatherer is a worker that grabs a WorkItem from the channel and does the assigned work.
type LogSizeGatherer struct {
	data        *LogsSizeData
	wg          *sync.WaitGroup
	workChannel chan WorkItem
}

// LogsSizeVerifier gathers data about log file sizes from master and node machines.
// It oversees <workersNo> workers that do the gathering.
type LogsSizeVerifier struct {
	client clientset.Interface
	// data stores LogSizeData grouped per IP and log_path
	data          *LogsSizeData
	masterAddress string
	nodeAddresses []string
	wg            sync.WaitGroup
	workChannel   chan WorkItem
	workers       []*LogSizeGatherer
}

// SingleLogSummary is a structure for handling the average generation rate and the number of probes.
type SingleLogSummary struct {
	AverageGenerationRate int
	NumberOfProbes        int
}

// LogSizeDataTimeseries is a map of timestamped size measurements:
// node -> log file -> series of timestamped sizes.
type LogSizeDataTimeseries map[string]map[string][]TimestampedSize

// LogsSizeDataSummary is a map of log summaries:
// node -> file -> data
type LogsSizeDataSummary map[string]map[string]SingleLogSummary

// PrintHumanReadable returns a human-readable string of the log size data summary.
// TODO: make sure that we don't need locking here
func (s *LogsSizeDataSummary) PrintHumanReadable() string {
	buf := &bytes.Buffer{}
	w := tabwriter.NewWriter(buf, 1, 0, 1, ' ', 0)
	fmt.Fprintf(w, "host\tlog_file\taverage_rate (B/s)\tnumber_of_probes\n")
	for k, v := range *s {
		fmt.Fprintf(w, "%v\t\t\t\n", k)
		for path, data := range v {
			fmt.Fprintf(w, "\t%v\t%v\t%v\n", path, data.AverageGenerationRate, data.NumberOfProbes)
		}
	}
	w.Flush()
	return buf.String()
}

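// The tabwriter output above renders one row per host followed by one row per
// log file on that host. A minimal sketch of the shape, with made-up values
// (not from a real run):
//
//	host         log_file                average_rate (B/s) number_of_probes
//	10.0.0.1:22
//	             /var/log/kubelet.log    126                42
//	             /var/log/kube-proxy.log 53                 42
//
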
// PrintJSON returns the summary of log size data in JSON format.
func (s *LogsSizeDataSummary) PrintJSON() string {
	return framework.PrettyPrintJSON(*s)
}

// SummaryKind returns the kind of the log size data summary.
func (s *LogsSizeDataSummary) SummaryKind() string {
	return "LogSizeSummary"
}

// LogsSizeData holds the timeseries of log size data, guarded by a lock.
type LogsSizeData struct {
	data LogSizeDataTimeseries
	lock sync.Mutex
}

// WorkItem is a command for a worker. It contains the IP of the machine from which we
// want to gather data and the paths to all files we're interested in.
type WorkItem struct {
	ip                string
	paths             []string
	backoffMultiplier int
}

func prepareData(masterAddress string, nodeAddresses []string) *LogsSizeData {
	data := make(LogSizeDataTimeseries)
	// Build a fresh slice instead of appending to nodeAddresses, so the caller's
	// slice can never be mutated through a shared backing array.
	ips := make([]string, 0, len(nodeAddresses)+1)
	ips = append(ips, nodeAddresses...)
	ips = append(ips, masterAddress)
	for _, ip := range ips {
		data[ip] = make(map[string][]TimestampedSize)
	}
	return &LogsSizeData{
		data: data,
		lock: sync.Mutex{},
	}
}

func (d *LogsSizeData) addNewData(ip, path string, timestamp time.Time, size int) {
	d.lock.Lock()
	defer d.lock.Unlock()
	d.data[ip][path] = append(
		d.data[ip][path],
		TimestampedSize{
			timestamp: timestamp,
			size:      size,
		},
	)
}

// NewLogsVerifier creates a new LogsSizeVerifier, which stops when the passed context is cancelled.
func NewLogsVerifier(ctx context.Context, c clientset.Interface) *LogsSizeVerifier {
	nodeAddresses, err := e2essh.NodeSSHHosts(ctx, c)
	framework.ExpectNoError(err)
	instanceAddress := framework.APIAddress() + ":22"

	workChannel := make(chan WorkItem, len(nodeAddresses)+1)
	workers := make([]*LogSizeGatherer, workersNo)

	verifier := &LogsSizeVerifier{
		client:        c,
		data:          prepareData(instanceAddress, nodeAddresses),
		masterAddress: instanceAddress,
		nodeAddresses: nodeAddresses,
		wg:            sync.WaitGroup{},
		workChannel:   workChannel,
		workers:       workers,
	}
	verifier.wg.Add(workersNo)
	for i := 0; i < workersNo; i++ {
		workers[i] = &LogSizeGatherer{
			data:        verifier.data,
			wg:          &verifier.wg,
			workChannel: workChannel,
		}
	}
	return verifier
}

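// Typical usage, as a minimal sketch: `client` is an assumed clientset.Interface
// supplied by the surrounding e2e test harness, not something defined in this
// file. Create the verifier, run it in the background, and read the summary
// after cancelling the context.
//
//	ctx, cancel := context.WithCancel(context.Background())
//	verifier := NewLogsVerifier(ctx, client)
//	go verifier.Run(ctx)
//	// ... run the test workload ...
//	cancel()
//	framework.Logf("%s", verifier.GetSummary().PrintHumanReadable())
//
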
// GetSummary returns a summary (average generation rate and number of probes) of the data gathered by LogsSizeVerifier
func (s *LogsSizeVerifier) GetSummary() *LogsSizeDataSummary {
	result := make(LogsSizeDataSummary)
	for k, v := range s.data.data {
		result[k] = make(map[string]SingleLogSummary)
		for path, data := range v {
			if len(data) > 1 {
				last := data[len(data)-1]
				first := data[0]
				elapsed := int(last.timestamp.Sub(first.timestamp) / time.Second)
				if elapsed == 0 {
					// Probes less than a second apart would divide by zero; skip them.
					continue
				}
				rate := (last.size - first.size) / elapsed
				result[k][path] = SingleLogSummary{
					AverageGenerationRate: rate,
					NumberOfProbes:        len(data),
				}
			}
		}
	}
	return &result
}

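// Worked example of the rate computation above, with made-up numbers: if the
// first probe saw /var/log/kubelet.log at 1,000 bytes and the last probe,
// 120 seconds later, saw 13,000 bytes, the average generation rate is
// (13000 - 1000) / 120 = 100 B/s, regardless of how many probes lie in between.
//
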
// Run starts log size gathering. It starts a goroutine for every worker and then blocks until the context is cancelled.
func (s *LogsSizeVerifier) Run(ctx context.Context) {
	s.workChannel <- WorkItem{
		ip:                s.masterAddress,
		paths:             masterLogsToCheck,
		backoffMultiplier: 1,
	}
	for _, node := range s.nodeAddresses {
		s.workChannel <- WorkItem{
			ip:                node,
			paths:             nodeLogsToCheck,
			backoffMultiplier: 1,
		}
	}
	for _, worker := range s.workers {
		go worker.Run(ctx)
	}
	<-ctx.Done()
	s.wg.Wait()
}

// Run starts log size gathering.
func (g *LogSizeGatherer) Run(ctx context.Context) {
	for g.Work(ctx) {
	}
}

func (g *LogSizeGatherer) pushWorkItem(ctx context.Context, workItem WorkItem) {
	select {
	case <-time.After(time.Duration(workItem.backoffMultiplier) * pollingPeriod):
		g.workChannel <- workItem
	case <-ctx.Done():
		return
	}
}

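// The backoff schedule implied by backoffMultiplier: a healthy work item is
// re-queued after pollingPeriod (60s). On each SSH failure the multiplier
// doubles (1, 2, 4, ..., 128), stretching the delay to 120s, 240s, and so on;
// once the multiplier has reached 128 (eight consecutive failures) the worker
// gives up on that machine entirely.
//
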
// Work does a single unit of work: tries to take out a WorkItem from the queue, ssh-es into a given machine,
// gathers data, writes it to the shared <data> map, and creates a goroutine which re-inserts the work item into
// the queue with a <pollingPeriod> delay. Returns false if the worker should exit.
func (g *LogSizeGatherer) Work(ctx context.Context) bool {
	var workItem WorkItem
	select {
	case <-ctx.Done():
		g.wg.Done()
		return false
	case workItem = <-g.workChannel:
	}
	sshResult, err := e2essh.SSH(
		ctx,
		fmt.Sprintf("ls -l %v | awk '{print $9, $5}' | tr '\n' ' '", strings.Join(workItem.paths, " ")),
		workItem.ip,
		framework.TestContext.Provider,
	)
	if err != nil {
		framework.Logf("Error while trying to SSH to %v, skipping probe. Error: %v", workItem.ip, err)
		// In case of repeated errors give up.
		if workItem.backoffMultiplier >= 128 {
			framework.Logf("Failed to SSH to node %v multiple times in a row. Giving up.", workItem.ip)
			g.wg.Done()
			return false
		}
		workItem.backoffMultiplier *= 2
		go g.pushWorkItem(ctx, workItem)
		return true
	}
	workItem.backoffMultiplier = 1
	results := strings.Split(sshResult.Stdout, " ")

	now := time.Now()
	for i := 0; i+1 < len(results); i += 2 {
		path := results[i]
		size, err := strconv.Atoi(results[i+1])
		if err != nil {
			framework.Logf("Error during conversion to int: %v, skipping data. Error: %v", results[i+1], err)
			continue
		}
		g.data.addNewData(workItem.ip, path, now, size)
	}
	go g.pushWorkItem(ctx, workItem)
	return true
}
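
// The SSH command in Work prints "name size" pairs separated by single spaces.
// For illustration, with made-up sizes, stdout for a node might look like:
//
//	/var/log/kubelet.log 86016 /var/log/kube-proxy.log 4096
//
// The loop splits on spaces and consumes the fields pairwise: results[0] is a
// path, results[1] its size in bytes, and so on. The trailing empty element
// produced by the final space from `tr` is ignored because the loop requires
// both i and i+1 to be valid indices.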