github.com/kubewharf/katalyst-core@v0.5.3/pkg/custom-metric/collector/prometheus/scrape.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package prometheus
    18  
import (
	"bytes"
	"compress/gzip"
	"context"
	"fmt"
	"io"
	"net/http"
	"sort"
	"strconv"
	"sync"
	"time"

	"github.com/alecthomas/units"
	"github.com/cespare/xxhash"
	dto "github.com/prometheus/client_model/go"
	"github.com/prometheus/common/config"
	"github.com/prometheus/common/expfmt"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/klog/v2"

	"github.com/kubewharf/katalyst-core/pkg/custom-metric/store/data"
	"github.com/kubewharf/katalyst-core/pkg/metrics"
)
    41  
    42  // those variables define the http-related configurations for
    43  var (
    44  	httpMetricURL    = "http://%v:%v/custom_metric"
    45  	httpAcceptHeader = "application/openmetrics-text;version=1.0.0,application/openmetrics-text;version=0.0.1;q=0.75,text/plain;version=0.0.4;q=0.5,*/*;q=0.1"
    46  	httpUserAgent    = "katalyst/v1alpha1"
    47  
    48  	httpBodyLimit    = int64(10 * units.MiB)
    49  	httpBodyExceeded = fmt.Errorf("body size limit exceeded")
    50  )
    51  
    52  // ScrapeManager is responsible for scraping logic through http requests
    53  // and each endpoint will have one manager instance for efficiency.
    54  type ScrapeManager struct {
    55  	ctx    context.Context
    56  	cancel context.CancelFunc
    57  
    58  	// lastScrapeSize is used to initialize the buffer size using historical length
    59  	sync.Mutex
    60  	outOfDataPeriod time.Duration
    61  	storedSeriesMap map[uint64]*data.MetricSeries
    62  
    63  	node string
    64  	url  string
    65  
    66  	req        *http.Request
    67  	client     *http.Client
    68  	emitter    metrics.MetricEmitter
    69  	metricTags []metrics.MetricTag
    70  }
    71  
    72  func NewScrapeManager(ctx context.Context, outOfDataPeriod time.Duration, client *http.Client, node, url string, emitter metrics.MetricEmitter, username, password string) (*ScrapeManager, error) {
    73  	req, err := http.NewRequest("GET", url, nil)
    74  	if err != nil {
    75  		return nil, err
    76  	}
    77  
    78  	req.Header.Add("Accept", httpAcceptHeader)
    79  	req.Header.Add("Accept-Encoding", "gzip")
    80  	req.Header.Set("User-Agent", httpUserAgent)
    81  	req.Header.Set("X-Prometheus-Scrape-Timeout-Seconds", strconv.FormatFloat(60, 'f', -1, 64))
    82  	req.SetBasicAuth(username, password)
    83  
    84  	sCtx, cancel := context.WithCancel(ctx)
    85  	return &ScrapeManager{
    86  		ctx:     sCtx,
    87  		cancel:  cancel,
    88  		req:     req,
    89  		client:  client,
    90  		node:    node,
    91  		url:     url,
    92  		emitter: emitter,
    93  		metricTags: []metrics.MetricTag{
    94  			{Key: "node", Val: node},
    95  		},
    96  
    97  		outOfDataPeriod: outOfDataPeriod,
    98  		storedSeriesMap: make(map[uint64]*data.MetricSeries),
    99  	}, nil
   100  }
   101  
   102  func (s *ScrapeManager) Start(duration time.Duration) {
   103  	klog.Infof("start scrape manger with url: %v", s.url)
   104  	go wait.Until(func() { s.scrape() }, duration, s.ctx.Done())
   105  	go wait.Until(func() { s.gc() }, time.Second*10, s.ctx.Done())
   106  }
   107  
   108  func (s *ScrapeManager) Stop() {
   109  	klog.Infof("stop scrape manger with url: %v", s.url)
   110  	s.cancel()
   111  }
   112  
   113  // HandleMetric handles the in-cached metric, clears those metric if handle successes
   114  // keep them in memory otherwise
   115  func (s *ScrapeManager) HandleMetric(f func(d []*data.MetricSeries, tags ...metrics.MetricTag) error) {
   116  	s.Lock()
   117  	defer s.Unlock()
   118  
   119  	if len(s.storedSeriesMap) == 0 {
   120  		return
   121  	}
   122  
   123  	var totalMetricDataCount int64
   124  	storedSeriesList := make([]*data.MetricSeries, 0, len(s.storedSeriesMap))
   125  	for _, series := range s.storedSeriesMap {
   126  		storedSeriesList = append(storedSeriesList, series)
   127  		totalMetricDataCount += int64(len(series.Series))
   128  	}
   129  
   130  	if err := f(storedSeriesList, s.metricTags...); err != nil {
   131  		klog.Errorf("failed to scrape [%v] total metric series: %v, total metric data count: %v, err: %v",
   132  			s.url, len(s.storedSeriesMap), totalMetricDataCount, err)
   133  		return
   134  	}
   135  
   136  	_ = s.emitter.StoreInt64(metricNamePromCollectorStoreItemCount, totalMetricDataCount, metrics.MetricTypeNameCount, s.metricTags...)
   137  	klog.V(6).Infof("success scrape [%v] total metric series: %v, total metric data count: %v",
   138  		s.url, len(s.storedSeriesMap), totalMetricDataCount)
   139  	s.storedSeriesMap = make(map[uint64]*data.MetricSeries)
   140  }
   141  
   142  func (s *ScrapeManager) gc() {
   143  	s.Lock()
   144  	defer s.Unlock()
   145  
   146  	expiredTime := time.Now().Add(-1 * s.outOfDataPeriod).UnixMilli()
   147  	for hash, seriesMap := range s.storedSeriesMap {
   148  		var updatedSeries []*data.MetricData
   149  
   150  		for _, series := range seriesMap.Series {
   151  			if series.Timestamp > expiredTime {
   152  				updatedSeries = append(updatedSeries, series)
   153  			}
   154  		}
   155  
   156  		if len(updatedSeries) == 0 {
   157  			delete(s.storedSeriesMap, hash)
   158  		} else {
   159  			s.storedSeriesMap[hash].Series = updatedSeries
   160  		}
   161  	}
   162  }
   163  
   164  // scrape periodically scrape metric info from prometheus service, and then puts in the given store.
   165  func (s *ScrapeManager) scrape() {
   166  	var (
   167  		start                = time.Now()
   168  		err                  error
   169  		mf                   map[string]*dto.MetricFamily
   170  		totalMetricDataCount int64
   171  	)
   172  	defer func() {
   173  		tags := append(s.metricTags,
   174  			metrics.MetricTag{Key: "success", Val: fmt.Sprintf("%v", err == nil)},
   175  		)
   176  		_ = s.emitter.StoreInt64(metricNamePromCollectorScrapeLatency, time.Since(start).Microseconds(), metrics.MetricTypeNameRaw, tags...)
   177  		_ = s.emitter.StoreInt64(metricNamePromCollectorScrapeItemCount, totalMetricDataCount, metrics.MetricTypeNameCount, s.metricTags...)
   178  	}()
   179  
   180  	buf := bytes.NewBuffer([]byte{})
   181  	err = s.fetch(s.ctx, buf)
   182  	if err != nil {
   183  		klog.Errorf("fetch contents %v failed: %v", s.url, err)
   184  		return
   185  	}
   186  
   187  	klog.V(6).Infof("node %v parseContents size %v", s.node, len(buf.Bytes()))
   188  	mf, err = parseContents(buf)
   189  	if err != nil {
   190  		klog.Errorf("node %v parseContents contents failed: %v", s.node, err)
   191  		return
   192  	}
   193  	klog.V(6).Infof("node %v parseContents contents successfully", s.node)
   194  
   195  	s.Lock()
   196  	defer s.Unlock()
   197  	// we only cares about metric with valid contents and types
   198  	for _, v := range mf {
   199  		if v == nil || v.Name == nil || len(v.Metric) == 0 || v.Type == nil || *v.Type != dto.MetricType_GAUGE {
   200  			continue
   201  		}
   202  
   203  		for _, m := range v.Metric {
   204  			if m == nil || m.Gauge == nil || m.Gauge.Value == nil {
   205  				continue
   206  			}
   207  
   208  			labels := parseLabels(m)
   209  
   210  			timestamp, ok := parseTimestamp(labels, m)
   211  			if !ok {
   212  				continue
   213  			}
   214  
   215  			// calculating hash does not need to consider timestamp
   216  			delete(labels, string(data.CustomMetricLabelKeyTimestamp))
   217  			hash := calculateHash(*v.Name, labels, m)
   218  			if _, ok := s.storedSeriesMap[hash]; ok {
   219  				continue
   220  			}
   221  
   222  			if _, ok := s.storedSeriesMap[hash]; !ok {
   223  				s.storedSeriesMap[hash] = &data.MetricSeries{
   224  					Name:   *v.Name,
   225  					Labels: labels,
   226  					Series: []*data.MetricData{},
   227  				}
   228  			}
   229  
   230  			totalMetricDataCount++
   231  			s.storedSeriesMap[hash].Series = append(s.storedSeriesMap[hash].Series, &data.MetricData{
   232  				Data:      *m.Gauge.Value,
   233  				Timestamp: timestamp,
   234  			})
   235  
   236  		}
   237  	}
   238  }
   239  
   240  // fetch gets contents from prometheus http service.
   241  func (s *ScrapeManager) fetch(ctx context.Context, w io.Writer) error {
   242  	resp, err := s.client.Do(s.req.WithContext(ctx))
   243  	if err != nil {
   244  		return err
   245  	}
   246  
   247  	defer func() {
   248  		_, _ = io.Copy(io.Discard, resp.Body)
   249  		_ = resp.Body.Close()
   250  	}()
   251  
   252  	if resp.StatusCode != http.StatusOK {
   253  		return fmt.Errorf("server returned HTTP status %s", resp.Status)
   254  	}
   255  
   256  	klog.V(6).Infof("url: %v content type: %v", s.url, resp.Header.Get("Content-Encoding"))
   257  	if resp.Header.Get("Content-Encoding") != "gzip" {
   258  		n, err := io.Copy(w, io.LimitReader(resp.Body, httpBodyLimit))
   259  		if err != nil {
   260  			return err
   261  		}
   262  		if n >= httpBodyLimit {
   263  			return httpBodyExceeded
   264  		}
   265  		return nil
   266  	}
   267  
   268  	klog.V(6).Infof("use gzip to parse url: %v", s.url)
   269  	gzipR, err := gzip.NewReader(resp.Body)
   270  	if err != nil {
   271  		return fmt.Errorf("failed to init gzipR: %v", err)
   272  	}
   273  
   274  	_ = gzipR.Close()
   275  	n, err := io.Copy(w, io.LimitReader(gzipR, httpBodyLimit))
   276  	if err != nil {
   277  		return err
   278  	}
   279  	if n >= httpBodyLimit {
   280  		return httpBodyExceeded
   281  	}
   282  
   283  	return nil
   284  }
   285  
   286  // parseContents analyzes the contents scraped from prometheus http service.
   287  func parseContents(r io.Reader) (map[string]*dto.MetricFamily, error) {
   288  	var parser expfmt.TextParser
   289  	mf, err := parser.TextToMetricFamilies(r)
   290  	if err != nil {
   291  		return nil, err
   292  	}
   293  
   294  	return mf, nil
   295  }
   296  
   297  // calculateHash makes sure that we won't store duplicated metric contents
   298  func calculateHash(name string, labels map[string]string, metric *dto.Metric) uint64 {
   299  	b := make([]byte, 0, 1024)
   300  	b = append(b, name...)
   301  
   302  	for k, v := range labels {
   303  		b = append(b, '\xff')
   304  		b = append(b, k...)
   305  		b = append(b, '\xff')
   306  		b = append(b, v...)
   307  	}
   308  
   309  	if metric.TimestampMs != nil {
   310  		b = append(b, '\xff')
   311  		b = append(b, fmt.Sprintf("%v", *metric.TimestampMs)...)
   312  	}
   313  
   314  	return xxhash.Sum64(b)
   315  }
   316  
   317  // parseLabels returns labels in key-value formats
   318  func parseLabels(metric *dto.Metric) map[string]string {
   319  	res := make(map[string]string)
   320  	if metric.Label != nil {
   321  		for _, v := range metric.Label {
   322  			if v != nil && v.Name != nil && v.Value != nil {
   323  				res[*v.Name] = *v.Value
   324  			}
   325  		}
   326  	}
   327  	return res
   328  }
   329  
   330  // parseTimestamp is an adaptive logic for openTelemetry since its
   331  // default prometheus exporter doesn't enable the ability of timestamp
   332  // like the standard format. but the TimestampMs fields is always prior
   333  // to label-parsed results.
   334  func parseTimestamp(labels map[string]string, metric *dto.Metric) (int64, bool) {
   335  	if metric.TimestampMs != nil {
   336  		return *metric.TimestampMs, true
   337  	}
   338  
   339  	if ts, ok := labels[fmt.Sprintf("%s", data.CustomMetricLabelKeyTimestamp)]; ok {
   340  		i, err := strconv.ParseInt(ts, 10, 64)
   341  		if err != nil {
   342  			klog.Errorf("invalid ts %s for custom metric", ts)
   343  			return 0, false
   344  		}
   345  		return i, true
   346  	}
   347  	return 0, false
   348  }
   349  
   350  func newPrometheusClient() (*http.Client, error) {
   351  	return config.NewClientFromConfig(config.HTTPClientConfig{
   352  		FollowRedirects: true,
   353  	}, "prometheus-collector")
   354  }