github.com/kubewharf/katalyst-core@v0.5.3/pkg/custom-metric/collector/prometheus/collector_promethes.go (about) 1 /* 2 Copyright 2022 The Katalyst Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package prometheus 18 19 import ( 20 "context" 21 "fmt" 22 "net" 23 "net/http" 24 "path" 25 "sync" 26 "time" 27 28 "go.uber.org/atomic" 29 v1 "k8s.io/api/core/v1" 30 "k8s.io/apimachinery/pkg/api/errors" 31 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 32 "k8s.io/apimachinery/pkg/labels" 33 utilruntime "k8s.io/apimachinery/pkg/util/runtime" 34 "k8s.io/apimachinery/pkg/util/wait" 35 "k8s.io/client-go/informers" 36 corelisters "k8s.io/client-go/listers/core/v1" 37 "k8s.io/client-go/tools/cache" 38 "k8s.io/client-go/util/workqueue" 39 "k8s.io/klog/v2" 40 41 katalystbase "github.com/kubewharf/katalyst-core/cmd/base" 42 "github.com/kubewharf/katalyst-core/pkg/config/metric" 43 "github.com/kubewharf/katalyst-core/pkg/custom-metric/collector" 44 "github.com/kubewharf/katalyst-core/pkg/custom-metric/store" 45 "github.com/kubewharf/katalyst-core/pkg/custom-metric/store/data" 46 "github.com/kubewharf/katalyst-core/pkg/metrics" 47 "github.com/kubewharf/katalyst-core/pkg/util/general" 48 "github.com/kubewharf/katalyst-core/pkg/util/native" 49 ) 50 51 const MetricCollectorNamePrometheus = "prometheus-collector" 52 53 const ( 54 metricNamePromCollectorSyncCosts = "kcmas_collector_sync_costs" 55 56 metricNamePromCollectorScrapeReqCount = "kcmas_collector_scrape_req_cnt" 57 metricNamePromCollectorScrapeItemCount = "kcmas_collector_scrape_item_cnt" 58 metricNamePromCollectorScrapeLatency = "kcmas_collector_scrape_latency" 59 60 metricNamePromCollectorStoreReqCount = "kcmas_collector_store_req_cnt" 61 metricNamePromCollectorStoreItemCount = "kcmas_collector_store_item_cnt" 62 metricNamePromCollectorStoreLatency = "kcmas_collector_store_latency" 63 64 fileNameUsername = "username" 65 fileNamePassword = "password" 66 ) 67 68 // prometheusCollector implements MetricCollector using self-defined parser functionality 69 // for prometheus formatted contents, and sends to store will standard formats. 70 // todo: if we restarts, we may lose some metric since the collecting logic interrupts, 71 // and we need to consider a more reliable way to handle this. 72 type prometheusCollector struct { 73 ctx context.Context 74 collectConf *metric.CollectorConfiguration 75 genericConf *metric.GenericMetricConfiguration 76 77 client *http.Client 78 username string 79 password string 80 81 emitter metrics.MetricEmitter 82 metricStore store.MetricStore 83 84 podFactory informers.SharedInformerFactory 85 nodeFactory informers.SharedInformerFactory 86 87 podLister corelisters.PodLister 88 nodeLister corelisters.NodeLister 89 90 syncedFunc []cache.InformerSynced 91 syncSuccess bool 92 93 // scrapes maps pod identifier (namespace/name) to its scrapManager, 94 // and the scrapManager will use port as unique keys. 95 sync.Mutex 96 scrapes map[string]*ScrapeManager 97 } 98 99 var _ collector.MetricCollector = &prometheusCollector{} 100 101 func NewPrometheusCollector(ctx context.Context, baseCtx *katalystbase.GenericContext, genericConf *metric.GenericMetricConfiguration, 102 collectConf *metric.CollectorConfiguration, metricStore store.MetricStore, 103 ) (collector.MetricCollector, error) { 104 client, err := newPrometheusClient() 105 if err != nil { 106 return nil, fmt.Errorf("creating HTTP client failed: %v", err) 107 } 108 109 username, password := extractCredential(collectConf.CredentialPath) 110 111 // since collector will define its own pod/node label selectors, so we will construct informer separately 112 klog.Infof("enabled with pod selector: %v, node selector: %v", collectConf.PodSelector.String(), collectConf.NodeSelector.String()) 113 podFactory := informers.NewSharedInformerFactoryWithOptions(baseCtx.Client.KubeClient, time.Hour*24, 114 informers.WithTweakListOptions(func(options *metav1.ListOptions) { 115 options.LabelSelector = collectConf.PodSelector.String() 116 })) 117 podInformer := podFactory.Core().V1().Pods() 118 119 nodeFactory := informers.NewSharedInformerFactoryWithOptions(baseCtx.Client.KubeClient, time.Hour*24, 120 informers.WithTweakListOptions(func(options *metav1.ListOptions) { 121 options.LabelSelector = collectConf.NodeSelector.String() 122 })) 123 nodeInformer := nodeFactory.Core().V1().Nodes() 124 125 p := &prometheusCollector{ 126 ctx: ctx, 127 genericConf: genericConf, 128 collectConf: collectConf, 129 podFactory: podFactory, 130 nodeFactory: nodeFactory, 131 podLister: podInformer.Lister(), 132 nodeLister: nodeInformer.Lister(), 133 syncedFunc: []cache.InformerSynced{ 134 podInformer.Informer().HasSynced, 135 nodeInformer.Informer().HasSynced, 136 }, 137 client: client, 138 username: username, 139 password: password, 140 emitter: baseCtx.EmitterPool.GetDefaultMetricsEmitter().WithTags("prom_collector"), 141 scrapes: make(map[string]*ScrapeManager), 142 syncSuccess: false, 143 metricStore: metricStore, 144 } 145 146 podInformer.Informer().AddEventHandler(cache.FilteringResourceEventHandler{ 147 FilterFunc: func(obj interface{}) bool { 148 switch t := obj.(type) { 149 case *v1.Pod: 150 return p.collectConf.PodSelector.Matches(labels.Set(t.Labels)) 151 case cache.DeletedFinalStateUnknown: 152 if pod, ok := t.Obj.(*v1.Pod); ok { 153 return p.collectConf.PodSelector.Matches(labels.Set(pod.Labels)) 154 } 155 utilruntime.HandleError(fmt.Errorf("unable to convert object %T to *v1.Pod", obj)) 156 return false 157 default: 158 utilruntime.HandleError(fmt.Errorf("unable to handle object: %T", obj)) 159 return false 160 } 161 }, 162 Handler: cache.ResourceEventHandlerFuncs{ 163 AddFunc: p.addPod, 164 UpdateFunc: p.updatePod, 165 DeleteFunc: p.deletePod, 166 }, 167 }) 168 169 podFactory.Start(ctx.Done()) 170 nodeFactory.Start(ctx.Done()) 171 172 return p, nil 173 } 174 175 func (p *prometheusCollector) Name() string { return MetricCollectorNamePrometheus } 176 177 func (p *prometheusCollector) Start() error { 178 p.podFactory.Start(p.ctx.Done()) 179 p.nodeFactory.Start(p.ctx.Done()) 180 klog.Info("starting scrape prometheus to collect contents") 181 if !cache.WaitForCacheSync(p.ctx.Done(), p.syncedFunc...) { 182 return fmt.Errorf("unable to scrape caches for %s", MetricCollectorNamePrometheus) 183 } 184 klog.Info("started scrape prometheus to collect contents") 185 p.syncSuccess = true 186 187 go wait.Until(p.sync, p.collectConf.SyncInterval, p.ctx.Done()) 188 go wait.Until(p.reviseRequest, time.Minute*5, p.ctx.Done()) 189 return nil 190 } 191 192 func (p *prometheusCollector) Stop() error { 193 return nil 194 } 195 196 func (p *prometheusCollector) addPod(obj interface{}) { 197 pod, ok := obj.(*v1.Pod) 198 if !ok { 199 klog.ErrorS(nil, "Cannot convert to *v1.Pod", "obj", obj) 200 return 201 } 202 203 if p.checkTargetPod(pod) { 204 klog.Info("pod %v added with target scraping", pod.Name) 205 p.addRequest(pod) 206 } 207 } 208 209 func (p *prometheusCollector) updatePod(oldObj, newObj interface{}) { 210 oldPod, ok := oldObj.(*v1.Pod) 211 if !ok { 212 klog.ErrorS(nil, "Cannot convert to *v1.Pod", "obj", oldObj) 213 return 214 } 215 oldMatch := p.checkTargetPod(oldPod) 216 217 newPod, ok := newObj.(*v1.Pod) 218 if !ok { 219 klog.ErrorS(nil, "Cannot convert to *v1.Pod", "obj", newObj) 220 return 221 } 222 newMatch := p.checkTargetPod(newPod) 223 224 if !oldMatch && newMatch { 225 klog.Infof("pod %v updated with target scraping", newPod.Name) 226 p.addRequest(newPod) 227 } 228 } 229 230 func (p *prometheusCollector) deletePod(obj interface{}) { 231 pod, ok := obj.(*v1.Pod) 232 if !ok { 233 klog.ErrorS(nil, "Cannot convert to *v1.Pod", "obj", obj) 234 return 235 } 236 237 // regardless whether current pod can match up with the logic 238 p.removeRequest(pod) 239 } 240 241 // checkTargetPod checks whether the given pod is targeted 242 // for metric scrapping logic. 243 func (p *prometheusCollector) checkTargetPod(pod *v1.Pod) bool { 244 // if local cache hasn't been synced successfully, just return not matched 245 if !p.syncSuccess { 246 return false 247 } 248 249 if pod == nil || pod.Spec.NodeName == "" { 250 return false 251 } 252 253 node, err := p.nodeLister.Get(pod.Spec.NodeName) 254 if err != nil { 255 klog.Errorf("get node %v failed: %v", pod.Spec.NodeName, err) 256 return false 257 } 258 259 klog.V(6).Infof("check for pod %v: %v, %v, %v", 260 pod.Name, native.PodIsReady(pod), p.collectConf.PodSelector.Matches(labels.Set(pod.Labels)), p.checkTargetNode(node)) 261 262 return native.PodIsReady(pod) && p.collectConf.PodSelector.Matches(labels.Set(pod.Labels)) && p.checkTargetNode(node) 263 } 264 265 // checkTargetNode checks whether the given node is targeted 266 // for metric scrapping logic. 267 func (p *prometheusCollector) checkTargetNode(node *v1.Node) bool { 268 return node != nil && native.NodeReady(node) && p.collectConf.NodeSelector.Matches(labels.Set(node.Labels)) 269 } 270 271 // reviseRequest is used to maintain requests based on current status 272 func (p *prometheusCollector) reviseRequest() { 273 klog.Info("revise requests for requests") 274 candidatePods, err := p.podLister.List(p.collectConf.PodSelector) 275 if err != nil { 276 klog.Errorf("failed to list pods: %v", err) 277 return 278 } 279 280 for _, pod := range candidatePods { 281 if p.checkTargetPod(pod) { 282 p.addRequest(pod) 283 } 284 } 285 p.clearRequests() 286 } 287 288 // addRequest constructs http.Request based on pod info 289 func (p *prometheusCollector) addRequest(pod *v1.Pod) { 290 if pod == nil { 291 return 292 } 293 294 key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(pod) 295 if err != nil { 296 klog.Errorf("couldn't get key for pod %#v: %v", pod, err) 297 return 298 } 299 300 p.Lock() 301 defer p.Unlock() 302 if _, ok := p.scrapes[key]; ok { 303 return 304 } 305 306 port, ok := native.ParseHostPortForPod(pod, native.ContainerMetricPortName) 307 if !ok { 308 klog.Errorf("get pod %v port failed", key) 309 return 310 } 311 312 hostIPs, ok := native.GetPodHostIPs(pod) 313 if !ok { 314 klog.Errorf("get pod %v hostIPs failed", key) 315 return 316 } 317 318 var targetURL string 319 for _, hostIP := range hostIPs { 320 url := fmt.Sprintf("[%s]:%d", hostIP, port) 321 if conn, err := net.DialTimeout("tcp", url, time.Second*5); err == nil { 322 if conn != nil { 323 _ = conn.Close() 324 } 325 klog.Infof("successfully dial for pod %v with url %v", key, url) 326 targetURL = fmt.Sprintf(httpMetricURL, hostIP, port) 327 break 328 } else { 329 klog.Errorf("pod %v dial %v failed: %v", key, url, err) 330 } 331 } 332 if len(targetURL) == 0 { 333 klog.Errorf("pod %v has no valid url", key) 334 return 335 } 336 klog.Infof("add requests for pod %v with url %v", key, targetURL) 337 338 // todo all ScrapeManager will share the same http connection now, 339 // reconsider whether it's reasonable in production 340 s, err := NewScrapeManager(p.ctx, p.genericConf.OutOfDataPeriod, p.client, pod.Spec.NodeName, targetURL, 341 p.emitter, p.username, p.password) 342 if err != nil { 343 klog.Errorf("failed to new http.Request: %v", err) 344 return 345 } 346 s.Start(p.collectConf.SyncInterval) 347 p.scrapes[key] = s 348 } 349 350 // addRequest delete http.Request for the given pod 351 func (p *prometheusCollector) removeRequest(pod *v1.Pod) { 352 p.Lock() 353 defer p.Unlock() 354 355 key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(pod) 356 if err != nil { 357 klog.Errorf("couldn't get key for pod %#v: %v", pod, err) 358 return 359 } 360 361 if _, ok := p.scrapes[key]; ok { 362 klog.Infof("remove requests for pod %v", pod.Name) 363 p.scrapes[key].Stop() 364 delete(p.scrapes, key) 365 } 366 } 367 368 // addRequest delete http.Request for the given pod 369 func (p *prometheusCollector) clearRequests() { 370 p.Lock() 371 defer p.Unlock() 372 373 for key := range p.scrapes { 374 namespace, name, err := cache.SplitMetaNamespaceKey(key) 375 if err != nil { 376 klog.Errorf("failed to split namespace and name from key %s", key) 377 continue 378 } 379 380 if _, err := p.podLister.Pods(namespace).Get(name); err != nil { 381 if errors.IsNotFound(err) { 382 p.scrapes[key].Stop() 383 delete(p.scrapes, key) 384 } else { 385 klog.Errorf("failed to get pod %v/%v: %s", namespace, name, err) 386 } 387 } 388 } 389 _ = p.emitter.StoreInt64(metricNamePromCollectorScrapeReqCount, int64(len(p.scrapes)), metrics.MetricTypeNameRaw, []metrics.MetricTag{ 390 {Key: "type", Val: "total"}, 391 }...) 392 } 393 394 // sync syncs buffered data from each ScrapeManager, and put them into store 395 func (p *prometheusCollector) sync() { 396 var scrapeManagers []*ScrapeManager 397 p.Lock() 398 for _, s := range p.scrapes { 399 scrapeManagers = append(scrapeManagers, s) 400 } 401 p.Unlock() 402 403 syncStart := time.Now() 404 defer func() { 405 costs := time.Since(syncStart) 406 klog.Infof("prom collector handled with total %v requests, cost %s", len(scrapeManagers), costs.String()) 407 _ = p.emitter.StoreInt64(metricNamePromCollectorSyncCosts, costs.Microseconds(), metrics.MetricTypeNameRaw) 408 }() 409 410 var ( 411 successReqs = atomic.NewInt64(0) 412 failedReqs = atomic.NewInt64(0) 413 ) 414 handler := func(d []*data.MetricSeries, tags ...metrics.MetricTag) error { 415 storeStart := time.Now() 416 defer func() { 417 _ = p.emitter.StoreInt64(metricNamePromCollectorStoreLatency, time.Since(storeStart).Microseconds(), metrics.MetricTypeNameRaw, tags...) 418 }() 419 420 if err := p.metricStore.InsertMetric(d); err != nil { 421 failedReqs.Inc() 422 return err 423 } 424 425 successReqs.Inc() 426 return nil 427 } 428 scrape := func(i int) { 429 scrapeManagers[i].HandleMetric(handler) 430 } 431 workqueue.ParallelizeUntil(p.ctx, general.Max(32, len(scrapeManagers)/64), len(scrapeManagers), scrape) 432 433 klog.Infof("prom collector handle %v succeeded requests, %v failed requests", successReqs.Load(), failedReqs.Load()) 434 _ = p.emitter.StoreInt64(metricNamePromCollectorStoreReqCount, successReqs.Load(), metrics.MetricTypeNameCount, []metrics.MetricTag{ 435 {Key: "type", Val: "succeeded"}, 436 }...) 437 _ = p.emitter.StoreInt64(metricNamePromCollectorStoreReqCount, failedReqs.Load(), metrics.MetricTypeNameCount, []metrics.MetricTag{ 438 {Key: "type", Val: "failed"}, 439 }...) 440 } 441 442 // extractCredential get username and password from the credential directory 443 func extractCredential(credentialDir string) (string, string) { 444 usernameFilePath := path.Join(credentialDir, fileNameUsername) 445 username, usernameErr := extractCredentialFile(usernameFilePath) 446 if usernameErr != nil { 447 general.Warningf("get username failed, err:%v", usernameErr) 448 return "", "" 449 } 450 451 passwordFilePath := path.Join(credentialDir, fileNamePassword) 452 password, passwordErr := extractCredentialFile(passwordFilePath) 453 if passwordErr != nil { 454 general.Warningf("get password failed, err:%v", passwordErr) 455 return "", "" 456 } 457 458 return username, password 459 } 460 461 func extractCredentialFile(filePath string) (string, error) { 462 FileExists := general.IsPathExists(filePath) 463 if !FileExists { 464 return "", fmt.Errorf("file %v does not exist", filePath) 465 } 466 467 lines, err := general.ReadFileIntoLines(filePath) 468 if err != nil { 469 return "", fmt.Errorf("read username file failed, err:%v", err) 470 } 471 if len(lines) != 1 { 472 return "", fmt.Errorf("username is more than 1 line which is unexpected") 473 } 474 return lines[0], nil 475 }