github.com/kubewharf/katalyst-core@v0.5.3/pkg/custom-metric/collector/prometheus/scrape.go

/*
Copyright 2022 The Katalyst Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package prometheus

import (
	"bytes"
	"compress/gzip"
	"context"
	"fmt"
	"io"
	"net/http"
	"sort"
	"strconv"
	"sync"
	"time"

	"github.com/alecthomas/units"
	"github.com/cespare/xxhash"
	dto "github.com/prometheus/client_model/go"
	"github.com/prometheus/common/config"
	"github.com/prometheus/common/expfmt"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/klog/v2"

	"github.com/kubewharf/katalyst-core/pkg/custom-metric/store/data"
	"github.com/kubewharf/katalyst-core/pkg/metrics"
)

// these variables define the http-related configurations for scraping requests
var (
	httpMetricURL    = "http://%v:%v/custom_metric"
	httpAcceptHeader = "application/openmetrics-text;version=1.0.0,application/openmetrics-text;version=0.0.1;q=0.75,text/plain;version=0.0.4;q=0.5,*/*;q=0.1"
	httpUserAgent    = "katalyst/v1alpha1"

	httpBodyLimit    = int64(10 * units.MiB)
	httpBodyExceeded = fmt.Errorf("body size limit exceeded")
)

// ScrapeManager is responsible for scraping logic through http requests,
// and each endpoint will have one manager instance for efficiency.
type ScrapeManager struct {
	ctx    context.Context
	cancel context.CancelFunc

	// the embedded mutex protects storedSeriesMap, which caches scraped
	// series until they are drained by HandleMetric or expired by gc
	sync.Mutex
	outOfDataPeriod time.Duration
	storedSeriesMap map[uint64]*data.MetricSeries

	node string
	url  string

	req        *http.Request
	client     *http.Client
	emitter    metrics.MetricEmitter
	metricTags []metrics.MetricTag
}

func NewScrapeManager(ctx context.Context, outOfDataPeriod time.Duration, client *http.Client, node, url string, emitter metrics.MetricEmitter, username, password string) (*ScrapeManager, error) {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}

	req.Header.Add("Accept", httpAcceptHeader)
	req.Header.Add("Accept-Encoding", "gzip")
	req.Header.Set("User-Agent", httpUserAgent)
	// the scrape-timeout hint advertised to the target is fixed at 60 seconds
	req.Header.Set("X-Prometheus-Scrape-Timeout-Seconds", strconv.FormatFloat(60, 'f', -1, 64))
	req.SetBasicAuth(username, password)

	sCtx, cancel := context.WithCancel(ctx)
	return &ScrapeManager{
		ctx:     sCtx,
		cancel:  cancel,
		req:     req,
		client:  client,
		node:    node,
		url:     url,
		emitter: emitter,
		metricTags: []metrics.MetricTag{
			{Key: "node", Val: node},
		},

		outOfDataPeriod: outOfDataPeriod,
		storedSeriesMap: make(map[uint64]*data.MetricSeries),
	}, nil
}

func (s *ScrapeManager) Start(duration time.Duration) {
	klog.Infof("start scrape manager with url: %v", s.url)
	go wait.Until(func() { s.scrape() }, duration, s.ctx.Done())
	go wait.Until(func() { s.gc() }, time.Second*10, s.ctx.Done())
}

func (s *ScrapeManager) Stop() {
	klog.Infof("stop scrape manager with url: %v", s.url)
	s.cancel()
}

// HandleMetric handles the cached metrics, clearing them if the handler
// succeeds and keeping them in memory otherwise.
func (s *ScrapeManager) HandleMetric(f func(d []*data.MetricSeries, tags ...metrics.MetricTag) error) {
	s.Lock()
	defer s.Unlock()

	if len(s.storedSeriesMap) == 0 {
		return
	}

	var totalMetricDataCount int64
	storedSeriesList := make([]*data.MetricSeries, 0, len(s.storedSeriesMap))
	for _, series := range s.storedSeriesMap {
		storedSeriesList = append(storedSeriesList, series)
		totalMetricDataCount += int64(len(series.Series))
	}

	if err := f(storedSeriesList, s.metricTags...); err != nil {
		klog.Errorf("failed to handle [%v] total metric series: %v, total metric data count: %v, err: %v",
			s.url, len(s.storedSeriesMap), totalMetricDataCount, err)
		return
	}

	_ = s.emitter.StoreInt64(metricNamePromCollectorStoreItemCount, totalMetricDataCount, metrics.MetricTypeNameCount, s.metricTags...)
	klog.V(6).Infof("successfully handled [%v] total metric series: %v, total metric data count: %v",
		s.url, len(s.storedSeriesMap), totalMetricDataCount)
	s.storedSeriesMap = make(map[uint64]*data.MetricSeries)
}
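// The sketch below is illustrative and not part of the original file: it shows
// how a caller might wire up a ScrapeManager and drain its cache through
// HandleMetric. The endpoint address, credentials, node name, and the logging
// callback are hypothetical placeholders.
func exampleScrapeManagerUsage(ctx context.Context, emitter metrics.MetricEmitter) error {
	client, err := newPrometheusClient()
	if err != nil {
		return err
	}

	// hypothetical endpoint; httpMetricURL expects a host and a port
	url := fmt.Sprintf(httpMetricURL, "127.0.0.1", 9100)
	manager, err := NewScrapeManager(ctx, 3*time.Minute, client, "node-1", url, emitter, "user", "pass")
	if err != nil {
		return err
	}

	// scrape every 30 seconds until the context is cancelled
	manager.Start(time.Second * 30)
	defer manager.Stop()

	// drain the cached series once; a real caller would do this periodically.
	// returning nil from the callback clears the in-memory cache
	manager.HandleMetric(func(d []*data.MetricSeries, tags ...metrics.MetricTag) error {
		for _, series := range d {
			klog.Infof("series %v carries %v data points", series.Name, len(series.Series))
		}
		return nil
	})
	return nil
}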
func (s *ScrapeManager) gc() {
	s.Lock()
	defer s.Unlock()

	expiredTime := time.Now().Add(-1 * s.outOfDataPeriod).UnixMilli()
	for hash, seriesMap := range s.storedSeriesMap {
		var updatedSeries []*data.MetricData

		for _, series := range seriesMap.Series {
			if series.Timestamp > expiredTime {
				updatedSeries = append(updatedSeries, series)
			}
		}

		if len(updatedSeries) == 0 {
			delete(s.storedSeriesMap, hash)
		} else {
			s.storedSeriesMap[hash].Series = updatedSeries
		}
	}
}

// scrape periodically scrapes metric info from the prometheus service and
// caches it in storedSeriesMap until HandleMetric drains it.
func (s *ScrapeManager) scrape() {
	var (
		start                = time.Now()
		err                  error
		mf                   map[string]*dto.MetricFamily
		totalMetricDataCount int64
	)
	defer func() {
		tags := append(s.metricTags,
			metrics.MetricTag{Key: "success", Val: fmt.Sprintf("%v", err == nil)},
		)
		_ = s.emitter.StoreInt64(metricNamePromCollectorScrapeLatency, time.Since(start).Microseconds(), metrics.MetricTypeNameRaw, tags...)
		_ = s.emitter.StoreInt64(metricNamePromCollectorScrapeItemCount, totalMetricDataCount, metrics.MetricTypeNameCount, s.metricTags...)
	}()

	buf := bytes.NewBuffer([]byte{})
	err = s.fetch(s.ctx, buf)
	if err != nil {
		klog.Errorf("fetch contents %v failed: %v", s.url, err)
		return
	}

	klog.V(6).Infof("node %v scraped content size: %v", s.node, len(buf.Bytes()))
	mf, err = parseContents(buf)
	if err != nil {
		klog.Errorf("node %v parse contents failed: %v", s.node, err)
		return
	}
	klog.V(6).Infof("node %v parsed contents successfully", s.node)

	s.Lock()
	defer s.Unlock()
	// we only care about metrics with valid contents and types
	for _, v := range mf {
		if v == nil || v.Name == nil || len(v.Metric) == 0 || v.Type == nil || *v.Type != dto.MetricType_GAUGE {
			continue
		}

		for _, m := range v.Metric {
			if m == nil || m.Gauge == nil || m.Gauge.Value == nil {
				continue
			}

			labels := parseLabels(m)

			timestamp, ok := parseTimestamp(labels, m)
			if !ok {
				continue
			}

			// the timestamp label should not participate in hash calculation
			delete(labels, string(data.CustomMetricLabelKeyTimestamp))
			hash := calculateHash(*v.Name, labels, m)
			// skip samples that are already cached in the current window
			if _, ok := s.storedSeriesMap[hash]; ok {
				continue
			}

			s.storedSeriesMap[hash] = &data.MetricSeries{
				Name:   *v.Name,
				Labels: labels,
				Series: []*data.MetricData{},
			}

			totalMetricDataCount++
			s.storedSeriesMap[hash].Series = append(s.storedSeriesMap[hash].Series, &data.MetricData{
				Data:      *m.Gauge.Value,
				Timestamp: timestamp,
			})
		}
	}
}
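// Illustrative sketch, not part of the original file: two samples with the
// same name, label set, and TimestampMs hash identically, which is how scrape
// skips samples that are already cached; because calculateHash walks label
// keys in sorted order, the result is stable across calls. The metric name,
// labels, and timestamp below are hypothetical.
func exampleDedupByHash() {
	ts := int64(1700000000000)
	labels := map[string]string{"pod": "demo", "container": "main"}
	m := &dto.Metric{TimestampMs: &ts}

	first := calculateHash("demo_cpu_usage", labels, m)
	second := calculateHash("demo_cpu_usage", labels, m)
	klog.Infof("hashes equal: %v", first == second) // always true
}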
// fetch gets contents from the prometheus http service.
func (s *ScrapeManager) fetch(ctx context.Context, w io.Writer) error {
	resp, err := s.client.Do(s.req.WithContext(ctx))
	if err != nil {
		return err
	}

	defer func() {
		_, _ = io.Copy(io.Discard, resp.Body)
		_ = resp.Body.Close()
	}()

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("server returned HTTP status %s", resp.Status)
	}

	klog.V(6).Infof("url: %v content encoding: %v", s.url, resp.Header.Get("Content-Encoding"))
	if resp.Header.Get("Content-Encoding") != "gzip" {
		n, err := io.Copy(w, io.LimitReader(resp.Body, httpBodyLimit))
		if err != nil {
			return err
		}
		if n >= httpBodyLimit {
			return httpBodyExceeded
		}
		return nil
	}

	klog.V(6).Infof("use gzip to parse url: %v", s.url)
	gzipR, err := gzip.NewReader(resp.Body)
	if err != nil {
		return fmt.Errorf("failed to init gzipR: %v", err)
	}

	// close the gzip reader only after the body has been copied
	defer func() { _ = gzipR.Close() }()
	n, err := io.Copy(w, io.LimitReader(gzipR, httpBodyLimit))
	if err != nil {
		return err
	}
	if n >= httpBodyLimit {
		return httpBodyExceeded
	}

	return nil
}

// parseContents analyzes the contents scraped from the prometheus http service.
func parseContents(r io.Reader) (map[string]*dto.MetricFamily, error) {
	var parser expfmt.TextParser
	mf, err := parser.TextToMetricFamilies(r)
	if err != nil {
		return nil, err
	}

	return mf, nil
}

// calculateHash makes sure that we won't store duplicated metric contents;
// label keys are hashed in sorted order so that the same label set always
// yields the same hash regardless of map iteration order
func calculateHash(name string, labels map[string]string, metric *dto.Metric) uint64 {
	b := make([]byte, 0, 1024)
	b = append(b, name...)

	keys := make([]string, 0, len(labels))
	for k := range labels {
		keys = append(keys, k)
	}
	sort.Strings(keys)
	for _, k := range keys {
		b = append(b, '\xff')
		b = append(b, k...)
		b = append(b, '\xff')
		b = append(b, labels[k]...)
	}

	if metric.TimestampMs != nil {
		b = append(b, '\xff')
		b = append(b, fmt.Sprintf("%v", *metric.TimestampMs)...)
	}

	return xxhash.Sum64(b)
}

// parseLabels returns labels in key-value format
func parseLabels(metric *dto.Metric) map[string]string {
	res := make(map[string]string)
	if metric.Label != nil {
		for _, v := range metric.Label {
			if v != nil && v.Name != nil && v.Value != nil {
				res[*v.Name] = *v.Value
			}
		}
	}
	return res
}

// parseTimestamp adapts to openTelemetry, whose default prometheus exporter
// doesn't emit timestamps in the standard exposition format; the TimestampMs
// field always takes precedence over the label-parsed result.
func parseTimestamp(labels map[string]string, metric *dto.Metric) (int64, bool) {
	if metric.TimestampMs != nil {
		return *metric.TimestampMs, true
	}

	if ts, ok := labels[string(data.CustomMetricLabelKeyTimestamp)]; ok {
		i, err := strconv.ParseInt(ts, 10, 64)
		if err != nil {
			klog.Errorf("invalid ts %s for custom metric", ts)
			return 0, false
		}
		return i, true
	}
	return 0, false
}

func newPrometheusClient() (*http.Client, error) {
	return config.NewClientFromConfig(config.HTTPClientConfig{
		FollowRedirects: true,
	}, "prometheus-collector")
}
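// Illustrative sketch, not part of the original file: it feeds a hypothetical
// plain-text exposition payload through parseContents, then recovers labels
// and the timestamp the same way scrape does. The sample assumes the value of
// data.CustomMetricLabelKeyTimestamp is a valid prometheus label name, since
// the timestamp is carried in that label and parseTimestamp falls back to it
// when TimestampMs is absent.
func exampleParsePipeline() error {
	payload := fmt.Sprintf("# TYPE demo_metric gauge\ndemo_metric{pod=\"demo\",%s=\"1700000000000\"} 1.5\n",
		string(data.CustomMetricLabelKeyTimestamp))

	mf, err := parseContents(bytes.NewBufferString(payload))
	if err != nil {
		return err
	}

	for name, family := range mf {
		for _, m := range family.Metric {
			labels := parseLabels(m)
			if ts, ok := parseTimestamp(labels, m); ok {
				klog.Infof("metric %v has timestamp %v with labels %v", name, ts, labels)
			}
		}
	}
	return nil
}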