github.com/kubewharf/katalyst-core@v0.5.3/pkg/custom-metric/collector/prometheus/collector_promethes.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package prometheus
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"net"
    23  	"net/http"
    24  	"path"
    25  	"sync"
    26  	"time"
    27  
    28  	"go.uber.org/atomic"
    29  	v1 "k8s.io/api/core/v1"
    30  	"k8s.io/apimachinery/pkg/api/errors"
    31  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    32  	"k8s.io/apimachinery/pkg/labels"
    33  	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    34  	"k8s.io/apimachinery/pkg/util/wait"
    35  	"k8s.io/client-go/informers"
    36  	corelisters "k8s.io/client-go/listers/core/v1"
    37  	"k8s.io/client-go/tools/cache"
    38  	"k8s.io/client-go/util/workqueue"
    39  	"k8s.io/klog/v2"
    40  
    41  	katalystbase "github.com/kubewharf/katalyst-core/cmd/base"
    42  	"github.com/kubewharf/katalyst-core/pkg/config/metric"
    43  	"github.com/kubewharf/katalyst-core/pkg/custom-metric/collector"
    44  	"github.com/kubewharf/katalyst-core/pkg/custom-metric/store"
    45  	"github.com/kubewharf/katalyst-core/pkg/custom-metric/store/data"
    46  	"github.com/kubewharf/katalyst-core/pkg/metrics"
    47  	"github.com/kubewharf/katalyst-core/pkg/util/general"
    48  	"github.com/kubewharf/katalyst-core/pkg/util/native"
    49  )
    50  
// MetricCollectorNamePrometheus is the name reported by the prometheus-based
// metric collector (see Name).
const MetricCollectorNamePrometheus = "prometheus-collector"

const (
	// names of the metrics the collector emits about its own work.
	metricNamePromCollectorSyncCosts = "kcmas_collector_sync_costs"

	metricNamePromCollectorScrapeReqCount  = "kcmas_collector_scrape_req_cnt"
	metricNamePromCollectorScrapeItemCount = "kcmas_collector_scrape_item_cnt"
	metricNamePromCollectorScrapeLatency   = "kcmas_collector_scrape_latency"

	metricNamePromCollectorStoreReqCount  = "kcmas_collector_store_req_cnt"
	metricNamePromCollectorStoreItemCount = "kcmas_collector_store_item_cnt"
	metricNamePromCollectorStoreLatency   = "kcmas_collector_store_latency"

	// file names looked up under the credential directory by extractCredential.
	fileNameUsername = "username"
	fileNamePassword = "password"
)
    67  
// prometheusCollector implements MetricCollector using self-defined parser functionality
// for prometheus formatted contents, and sends them to store with standard formats.
// todo: if we restart, we may lose some metrics since the collecting logic is interrupted,
// and we need to consider a more reliable way to handle this.
type prometheusCollector struct {
	ctx         context.Context
	collectConf *metric.CollectorConfiguration
	genericConf *metric.GenericMetricConfiguration

	// client, username and password are handed to every ScrapeManager
	// created in addRequest.
	client   *http.Client
	username string
	password string

	emitter     metrics.MetricEmitter
	metricStore store.MetricStore

	// pods and nodes use separate informer factories because each of them
	// is built with its own label selector.
	podFactory  informers.SharedInformerFactory
	nodeFactory informers.SharedInformerFactory

	podLister  corelisters.PodLister
	nodeLister corelisters.NodeLister

	// syncedFunc holds the informers' HasSynced functions awaited in Start.
	// syncSuccess is set to true by Start once they have all synced; until
	// then checkTargetPod reports every pod as not matched.
	// NOTE(review): syncSuccess is written and read without holding the
	// mutex — confirm whether event handlers can run concurrently with Start.
	syncedFunc  []cache.InformerSynced
	syncSuccess bool

	// the embedded Mutex guards scrapes.
	// scrapes maps pod identifier (namespace/name) to its ScrapeManager,
	// and the ScrapeManager will use port as unique keys.
	sync.Mutex
	scrapes map[string]*ScrapeManager
}

// compile-time check that prometheusCollector implements collector.MetricCollector.
var _ collector.MetricCollector = &prometheusCollector{}
   100  
   101  func NewPrometheusCollector(ctx context.Context, baseCtx *katalystbase.GenericContext, genericConf *metric.GenericMetricConfiguration,
   102  	collectConf *metric.CollectorConfiguration, metricStore store.MetricStore,
   103  ) (collector.MetricCollector, error) {
   104  	client, err := newPrometheusClient()
   105  	if err != nil {
   106  		return nil, fmt.Errorf("creating HTTP client failed: %v", err)
   107  	}
   108  
   109  	username, password := extractCredential(collectConf.CredentialPath)
   110  
   111  	// since collector will define its own pod/node label selectors, so we will construct informer separately
   112  	klog.Infof("enabled with pod selector: %v, node selector: %v", collectConf.PodSelector.String(), collectConf.NodeSelector.String())
   113  	podFactory := informers.NewSharedInformerFactoryWithOptions(baseCtx.Client.KubeClient, time.Hour*24,
   114  		informers.WithTweakListOptions(func(options *metav1.ListOptions) {
   115  			options.LabelSelector = collectConf.PodSelector.String()
   116  		}))
   117  	podInformer := podFactory.Core().V1().Pods()
   118  
   119  	nodeFactory := informers.NewSharedInformerFactoryWithOptions(baseCtx.Client.KubeClient, time.Hour*24,
   120  		informers.WithTweakListOptions(func(options *metav1.ListOptions) {
   121  			options.LabelSelector = collectConf.NodeSelector.String()
   122  		}))
   123  	nodeInformer := nodeFactory.Core().V1().Nodes()
   124  
   125  	p := &prometheusCollector{
   126  		ctx:         ctx,
   127  		genericConf: genericConf,
   128  		collectConf: collectConf,
   129  		podFactory:  podFactory,
   130  		nodeFactory: nodeFactory,
   131  		podLister:   podInformer.Lister(),
   132  		nodeLister:  nodeInformer.Lister(),
   133  		syncedFunc: []cache.InformerSynced{
   134  			podInformer.Informer().HasSynced,
   135  			nodeInformer.Informer().HasSynced,
   136  		},
   137  		client:      client,
   138  		username:    username,
   139  		password:    password,
   140  		emitter:     baseCtx.EmitterPool.GetDefaultMetricsEmitter().WithTags("prom_collector"),
   141  		scrapes:     make(map[string]*ScrapeManager),
   142  		syncSuccess: false,
   143  		metricStore: metricStore,
   144  	}
   145  
   146  	podInformer.Informer().AddEventHandler(cache.FilteringResourceEventHandler{
   147  		FilterFunc: func(obj interface{}) bool {
   148  			switch t := obj.(type) {
   149  			case *v1.Pod:
   150  				return p.collectConf.PodSelector.Matches(labels.Set(t.Labels))
   151  			case cache.DeletedFinalStateUnknown:
   152  				if pod, ok := t.Obj.(*v1.Pod); ok {
   153  					return p.collectConf.PodSelector.Matches(labels.Set(pod.Labels))
   154  				}
   155  				utilruntime.HandleError(fmt.Errorf("unable to convert object %T to *v1.Pod", obj))
   156  				return false
   157  			default:
   158  				utilruntime.HandleError(fmt.Errorf("unable to handle object: %T", obj))
   159  				return false
   160  			}
   161  		},
   162  		Handler: cache.ResourceEventHandlerFuncs{
   163  			AddFunc:    p.addPod,
   164  			UpdateFunc: p.updatePod,
   165  			DeleteFunc: p.deletePod,
   166  		},
   167  	})
   168  
   169  	podFactory.Start(ctx.Done())
   170  	nodeFactory.Start(ctx.Done())
   171  
   172  	return p, nil
   173  }
   174  
   175  func (p *prometheusCollector) Name() string { return MetricCollectorNamePrometheus }
   176  
   177  func (p *prometheusCollector) Start() error {
   178  	p.podFactory.Start(p.ctx.Done())
   179  	p.nodeFactory.Start(p.ctx.Done())
   180  	klog.Info("starting scrape prometheus to collect contents")
   181  	if !cache.WaitForCacheSync(p.ctx.Done(), p.syncedFunc...) {
   182  		return fmt.Errorf("unable to scrape caches for %s", MetricCollectorNamePrometheus)
   183  	}
   184  	klog.Info("started scrape prometheus to collect contents")
   185  	p.syncSuccess = true
   186  
   187  	go wait.Until(p.sync, p.collectConf.SyncInterval, p.ctx.Done())
   188  	go wait.Until(p.reviseRequest, time.Minute*5, p.ctx.Done())
   189  	return nil
   190  }
   191  
// Stop is a no-op and always returns nil; the loops launched by Start are
// bound to the collector's context instead.
func (p *prometheusCollector) Stop() error {
	return nil
}
   195  
   196  func (p *prometheusCollector) addPod(obj interface{}) {
   197  	pod, ok := obj.(*v1.Pod)
   198  	if !ok {
   199  		klog.ErrorS(nil, "Cannot convert to *v1.Pod", "obj", obj)
   200  		return
   201  	}
   202  
   203  	if p.checkTargetPod(pod) {
   204  		klog.Info("pod %v added with target scraping", pod.Name)
   205  		p.addRequest(pod)
   206  	}
   207  }
   208  
   209  func (p *prometheusCollector) updatePod(oldObj, newObj interface{}) {
   210  	oldPod, ok := oldObj.(*v1.Pod)
   211  	if !ok {
   212  		klog.ErrorS(nil, "Cannot convert to *v1.Pod", "obj", oldObj)
   213  		return
   214  	}
   215  	oldMatch := p.checkTargetPod(oldPod)
   216  
   217  	newPod, ok := newObj.(*v1.Pod)
   218  	if !ok {
   219  		klog.ErrorS(nil, "Cannot convert to *v1.Pod", "obj", newObj)
   220  		return
   221  	}
   222  	newMatch := p.checkTargetPod(newPod)
   223  
   224  	if !oldMatch && newMatch {
   225  		klog.Infof("pod %v updated with target scraping", newPod.Name)
   226  		p.addRequest(newPod)
   227  	}
   228  }
   229  
   230  func (p *prometheusCollector) deletePod(obj interface{}) {
   231  	pod, ok := obj.(*v1.Pod)
   232  	if !ok {
   233  		klog.ErrorS(nil, "Cannot convert to *v1.Pod", "obj", obj)
   234  		return
   235  	}
   236  
   237  	// regardless whether current pod can match up with the logic
   238  	p.removeRequest(pod)
   239  }
   240  
   241  // checkTargetPod checks whether the given pod is targeted
   242  // for metric scrapping logic.
   243  func (p *prometheusCollector) checkTargetPod(pod *v1.Pod) bool {
   244  	// if local cache hasn't been synced successfully, just return not matched
   245  	if !p.syncSuccess {
   246  		return false
   247  	}
   248  
   249  	if pod == nil || pod.Spec.NodeName == "" {
   250  		return false
   251  	}
   252  
   253  	node, err := p.nodeLister.Get(pod.Spec.NodeName)
   254  	if err != nil {
   255  		klog.Errorf("get node %v failed: %v", pod.Spec.NodeName, err)
   256  		return false
   257  	}
   258  
   259  	klog.V(6).Infof("check for pod %v: %v, %v, %v",
   260  		pod.Name, native.PodIsReady(pod), p.collectConf.PodSelector.Matches(labels.Set(pod.Labels)), p.checkTargetNode(node))
   261  
   262  	return native.PodIsReady(pod) && p.collectConf.PodSelector.Matches(labels.Set(pod.Labels)) && p.checkTargetNode(node)
   263  }
   264  
   265  // checkTargetNode checks whether the given node is targeted
   266  // for metric scrapping logic.
   267  func (p *prometheusCollector) checkTargetNode(node *v1.Node) bool {
   268  	return node != nil && native.NodeReady(node) && p.collectConf.NodeSelector.Matches(labels.Set(node.Labels))
   269  }
   270  
   271  // reviseRequest is used to maintain requests based on current status
   272  func (p *prometheusCollector) reviseRequest() {
   273  	klog.Info("revise requests for requests")
   274  	candidatePods, err := p.podLister.List(p.collectConf.PodSelector)
   275  	if err != nil {
   276  		klog.Errorf("failed to list pods: %v", err)
   277  		return
   278  	}
   279  
   280  	for _, pod := range candidatePods {
   281  		if p.checkTargetPod(pod) {
   282  			p.addRequest(pod)
   283  		}
   284  	}
   285  	p.clearRequests()
   286  }
   287  
   288  // addRequest constructs http.Request based on pod info
   289  func (p *prometheusCollector) addRequest(pod *v1.Pod) {
   290  	if pod == nil {
   291  		return
   292  	}
   293  
   294  	key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(pod)
   295  	if err != nil {
   296  		klog.Errorf("couldn't get key for pod %#v: %v", pod, err)
   297  		return
   298  	}
   299  
   300  	p.Lock()
   301  	defer p.Unlock()
   302  	if _, ok := p.scrapes[key]; ok {
   303  		return
   304  	}
   305  
   306  	port, ok := native.ParseHostPortForPod(pod, native.ContainerMetricPortName)
   307  	if !ok {
   308  		klog.Errorf("get pod %v port failed", key)
   309  		return
   310  	}
   311  
   312  	hostIPs, ok := native.GetPodHostIPs(pod)
   313  	if !ok {
   314  		klog.Errorf("get pod %v hostIPs failed", key)
   315  		return
   316  	}
   317  
   318  	var targetURL string
   319  	for _, hostIP := range hostIPs {
   320  		url := fmt.Sprintf("[%s]:%d", hostIP, port)
   321  		if conn, err := net.DialTimeout("tcp", url, time.Second*5); err == nil {
   322  			if conn != nil {
   323  				_ = conn.Close()
   324  			}
   325  			klog.Infof("successfully dial for pod %v with url %v", key, url)
   326  			targetURL = fmt.Sprintf(httpMetricURL, hostIP, port)
   327  			break
   328  		} else {
   329  			klog.Errorf("pod %v dial %v failed: %v", key, url, err)
   330  		}
   331  	}
   332  	if len(targetURL) == 0 {
   333  		klog.Errorf("pod %v has no valid url", key)
   334  		return
   335  	}
   336  	klog.Infof("add requests for pod %v with url %v", key, targetURL)
   337  
   338  	// todo all ScrapeManager will share the same http connection now,
   339  	//  reconsider whether it's reasonable in production
   340  	s, err := NewScrapeManager(p.ctx, p.genericConf.OutOfDataPeriod, p.client, pod.Spec.NodeName, targetURL,
   341  		p.emitter, p.username, p.password)
   342  	if err != nil {
   343  		klog.Errorf("failed to new http.Request: %v", err)
   344  		return
   345  	}
   346  	s.Start(p.collectConf.SyncInterval)
   347  	p.scrapes[key] = s
   348  }
   349  
   350  // addRequest delete http.Request for the given pod
   351  func (p *prometheusCollector) removeRequest(pod *v1.Pod) {
   352  	p.Lock()
   353  	defer p.Unlock()
   354  
   355  	key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(pod)
   356  	if err != nil {
   357  		klog.Errorf("couldn't get key for pod %#v: %v", pod, err)
   358  		return
   359  	}
   360  
   361  	if _, ok := p.scrapes[key]; ok {
   362  		klog.Infof("remove requests for pod %v", pod.Name)
   363  		p.scrapes[key].Stop()
   364  		delete(p.scrapes, key)
   365  	}
   366  }
   367  
   368  // addRequest delete http.Request for the given pod
   369  func (p *prometheusCollector) clearRequests() {
   370  	p.Lock()
   371  	defer p.Unlock()
   372  
   373  	for key := range p.scrapes {
   374  		namespace, name, err := cache.SplitMetaNamespaceKey(key)
   375  		if err != nil {
   376  			klog.Errorf("failed to split namespace and name from key %s", key)
   377  			continue
   378  		}
   379  
   380  		if _, err := p.podLister.Pods(namespace).Get(name); err != nil {
   381  			if errors.IsNotFound(err) {
   382  				p.scrapes[key].Stop()
   383  				delete(p.scrapes, key)
   384  			} else {
   385  				klog.Errorf("failed to get pod %v/%v: %s", namespace, name, err)
   386  			}
   387  		}
   388  	}
   389  	_ = p.emitter.StoreInt64(metricNamePromCollectorScrapeReqCount, int64(len(p.scrapes)), metrics.MetricTypeNameRaw, []metrics.MetricTag{
   390  		{Key: "type", Val: "total"},
   391  	}...)
   392  }
   393  
   394  // sync syncs buffered data from each ScrapeManager, and put them into store
   395  func (p *prometheusCollector) sync() {
   396  	var scrapeManagers []*ScrapeManager
   397  	p.Lock()
   398  	for _, s := range p.scrapes {
   399  		scrapeManagers = append(scrapeManagers, s)
   400  	}
   401  	p.Unlock()
   402  
   403  	syncStart := time.Now()
   404  	defer func() {
   405  		costs := time.Since(syncStart)
   406  		klog.Infof("prom collector handled with total %v requests, cost %s", len(scrapeManagers), costs.String())
   407  		_ = p.emitter.StoreInt64(metricNamePromCollectorSyncCosts, costs.Microseconds(), metrics.MetricTypeNameRaw)
   408  	}()
   409  
   410  	var (
   411  		successReqs = atomic.NewInt64(0)
   412  		failedReqs  = atomic.NewInt64(0)
   413  	)
   414  	handler := func(d []*data.MetricSeries, tags ...metrics.MetricTag) error {
   415  		storeStart := time.Now()
   416  		defer func() {
   417  			_ = p.emitter.StoreInt64(metricNamePromCollectorStoreLatency, time.Since(storeStart).Microseconds(), metrics.MetricTypeNameRaw, tags...)
   418  		}()
   419  
   420  		if err := p.metricStore.InsertMetric(d); err != nil {
   421  			failedReqs.Inc()
   422  			return err
   423  		}
   424  
   425  		successReqs.Inc()
   426  		return nil
   427  	}
   428  	scrape := func(i int) {
   429  		scrapeManagers[i].HandleMetric(handler)
   430  	}
   431  	workqueue.ParallelizeUntil(p.ctx, general.Max(32, len(scrapeManagers)/64), len(scrapeManagers), scrape)
   432  
   433  	klog.Infof("prom collector handle %v succeeded requests, %v failed requests", successReqs.Load(), failedReqs.Load())
   434  	_ = p.emitter.StoreInt64(metricNamePromCollectorStoreReqCount, successReqs.Load(), metrics.MetricTypeNameCount, []metrics.MetricTag{
   435  		{Key: "type", Val: "succeeded"},
   436  	}...)
   437  	_ = p.emitter.StoreInt64(metricNamePromCollectorStoreReqCount, failedReqs.Load(), metrics.MetricTypeNameCount, []metrics.MetricTag{
   438  		{Key: "type", Val: "failed"},
   439  	}...)
   440  }
   441  
   442  // extractCredential get username and password from the credential directory
   443  func extractCredential(credentialDir string) (string, string) {
   444  	usernameFilePath := path.Join(credentialDir, fileNameUsername)
   445  	username, usernameErr := extractCredentialFile(usernameFilePath)
   446  	if usernameErr != nil {
   447  		general.Warningf("get username failed, err:%v", usernameErr)
   448  		return "", ""
   449  	}
   450  
   451  	passwordFilePath := path.Join(credentialDir, fileNamePassword)
   452  	password, passwordErr := extractCredentialFile(passwordFilePath)
   453  	if passwordErr != nil {
   454  		general.Warningf("get password failed, err:%v", passwordErr)
   455  		return "", ""
   456  	}
   457  
   458  	return username, password
   459  }
   460  
   461  func extractCredentialFile(filePath string) (string, error) {
   462  	FileExists := general.IsPathExists(filePath)
   463  	if !FileExists {
   464  		return "", fmt.Errorf("file %v does not exist", filePath)
   465  	}
   466  
   467  	lines, err := general.ReadFileIntoLines(filePath)
   468  	if err != nil {
   469  		return "", fmt.Errorf("read username file failed, err:%v", err)
   470  	}
   471  	if len(lines) != 1 {
   472  		return "", fmt.Errorf("username is more than 1 line which is unexpected")
   473  	}
   474  	return lines[0], nil
   475  }