github.com/pf-qiu/concourse/v6@v6.7.3-0.20201207032516-1f455d73275f/atc/metric/emitter/prometheus.go (about) 1 package emitter 2 3 import ( 4 "fmt" 5 "net" 6 "net/http" 7 "sort" 8 "strings" 9 "sync" 10 "time" 11 12 "code.cloudfoundry.org/lager" 13 "github.com/pf-qiu/concourse/v6/atc/db" 14 "github.com/pf-qiu/concourse/v6/atc/metric" 15 16 "github.com/prometheus/client_golang/prometheus" 17 "github.com/prometheus/client_golang/prometheus/promhttp" 18 ) 19 20 type PrometheusEmitter struct { 21 jobsScheduled prometheus.Counter 22 jobsScheduling prometheus.Gauge 23 24 buildsStarted prometheus.Counter 25 buildsRunning prometheus.Gauge 26 27 concurrentRequestsLimitHit *prometheus.CounterVec 28 concurrentRequests *prometheus.GaugeVec 29 30 tasksWaiting *prometheus.GaugeVec 31 tasksWaitingDuration *prometheus.HistogramVec 32 33 buildDurationsVec *prometheus.HistogramVec 34 buildsAborted prometheus.Counter 35 buildsErrored prometheus.Counter 36 buildsFailed prometheus.Counter 37 buildsFinished prometheus.Counter 38 buildsFinishedVec *prometheus.CounterVec 39 buildsSucceeded prometheus.Counter 40 41 dbConnections *prometheus.GaugeVec 42 dbQueriesTotal prometheus.Counter 43 44 errorLogs *prometheus.CounterVec 45 46 httpRequestsDuration *prometheus.HistogramVec 47 48 locksHeld *prometheus.GaugeVec 49 50 checksFinished *prometheus.CounterVec 51 checksQueueSize prometheus.Gauge 52 checksStarted prometheus.Counter 53 checksEnqueued prometheus.Counter 54 55 volumesStreamed prometheus.Counter 56 57 workerContainers *prometheus.GaugeVec 58 workerUnknownContainers *prometheus.GaugeVec 59 workerVolumes *prometheus.GaugeVec 60 workerUnknownVolumes *prometheus.GaugeVec 61 workerTasks *prometheus.GaugeVec 62 workersRegistered *prometheus.GaugeVec 63 64 workerContainersLabels map[string]map[string]prometheus.Labels 65 workerVolumesLabels map[string]map[string]prometheus.Labels 66 workerTasksLabels map[string]map[string]prometheus.Labels 67 workerLastSeen map[string]time.Time 68 mu sync.Mutex 69 } 70 71 type PrometheusConfig struct { 72 BindIP string `long:"prometheus-bind-ip" description:"IP to listen on to expose Prometheus metrics."` 73 BindPort string `long:"prometheus-bind-port" description:"Port to listen on to expose Prometheus metrics."` 74 } 75 76 // The most natural data type to hold the labels is a set because each worker can have multiple but 77 // unique sets of labels. A set in Go is represented by a map[T]struct{}. Unfortunately, we cannot 78 // put prometheus.Labels inside a map[prometheus.Labels]struct{} because prometheus.Labels are not 79 // hashable. To work around this, we compute a string from the labels and use this as the keys of 80 // the map. 81 func serializeLabels(labels *prometheus.Labels) string { 82 var ( 83 key string 84 names []string 85 ) 86 for _, v := range *labels { 87 names = append(names, v) 88 } 89 sort.Strings(names) 90 key = strings.Join(names, "_") 91 92 return key 93 } 94 95 func init() { 96 metric.Metrics.RegisterEmitter(&PrometheusConfig{}) 97 } 98 99 func (config *PrometheusConfig) Description() string { return "Prometheus" } 100 func (config *PrometheusConfig) IsConfigured() bool { 101 return config.BindPort != "" && config.BindIP != "" 102 } 103 func (config *PrometheusConfig) bind() string { 104 return fmt.Sprintf("%s:%s", config.BindIP, config.BindPort) 105 } 106 107 func (config *PrometheusConfig) NewEmitter() (metric.Emitter, error) { 108 // error log metrics 109 errorLogs := prometheus.NewCounterVec( 110 prometheus.CounterOpts{ 111 Namespace: "concourse", 112 Subsystem: "error", 113 Name: "logs", 114 Help: "Number of error logged", 115 }, []string{"message"}, 116 ) 117 prometheus.MustRegister(errorLogs) 118 119 // lock metrics 120 locksHeld := prometheus.NewGaugeVec(prometheus.GaugeOpts{ 121 Namespace: "concourse", 122 Subsystem: "locks", 123 Name: "held", 124 Help: "Database locks held", 125 }, []string{"type"}) 126 prometheus.MustRegister(locksHeld) 127 128 // job metrics 129 jobsScheduled := prometheus.NewCounter(prometheus.CounterOpts{ 130 Namespace: "concourse", 131 Subsystem: "jobs", 132 Name: "scheduled_total", 133 Help: "Total number of Concourse jobs scheduled.", 134 }) 135 prometheus.MustRegister(jobsScheduled) 136 137 jobsScheduling := prometheus.NewGauge(prometheus.GaugeOpts{ 138 Namespace: "concourse", 139 Subsystem: "jobs", 140 Name: "scheduling", 141 Help: "Number of Concourse jobs currently being scheduled.", 142 }) 143 prometheus.MustRegister(jobsScheduling) 144 145 // build metrics 146 buildsStarted := prometheus.NewCounter(prometheus.CounterOpts{ 147 Namespace: "concourse", 148 Subsystem: "builds", 149 Name: "started_total", 150 Help: "Total number of Concourse builds started.", 151 }) 152 prometheus.MustRegister(buildsStarted) 153 154 buildsRunning := prometheus.NewGauge(prometheus.GaugeOpts{ 155 Namespace: "concourse", 156 Subsystem: "builds", 157 Name: "running", 158 Help: "Number of Concourse builds currently running.", 159 }) 160 prometheus.MustRegister(buildsRunning) 161 162 concurrentRequestsLimitHit := prometheus.NewCounterVec(prometheus.CounterOpts{ 163 Namespace: "concourse", 164 Subsystem: "concurrent_requests", 165 Name: "limit_hit_total", 166 Help: "Total number of requests rejected because the server was already serving too many concurrent requests.", 167 }, []string{"action"}) 168 prometheus.MustRegister(concurrentRequestsLimitHit) 169 170 concurrentRequests := prometheus.NewGaugeVec(prometheus.GaugeOpts{ 171 Namespace: "concourse", 172 Name: "concurrent_requests", 173 Help: "Number of concurrent requests being served by endpoints that have a specified limit of concurrent requests.", 174 }, []string{"action"}) 175 prometheus.MustRegister(concurrentRequests) 176 177 tasksWaiting := prometheus.NewGaugeVec(prometheus.GaugeOpts{ 178 Namespace: "concourse", 179 Subsystem: "tasks", 180 Name: "waiting", 181 Help: "Number of Concourse tasks currently waiting.", 182 }, []string{"teamId", "workerTags", "platform"}) 183 prometheus.MustRegister(tasksWaiting) 184 185 tasksWaitingDuration := prometheus.NewHistogramVec(prometheus.HistogramOpts{ 186 Namespace: "concourse", 187 Subsystem: "tasks", 188 Name: "wait_duration", 189 Help: "Elapsed time waiting for execution", 190 Buckets: []float64{1, 15, 30, 60, 120, 180, 240, 300, 600, 1200}, 191 }, []string{"teamId", "workerTags", "platform"}) 192 prometheus.MustRegister(tasksWaitingDuration) 193 194 buildsFinished := prometheus.NewCounter(prometheus.CounterOpts{ 195 Namespace: "concourse", 196 Subsystem: "builds", 197 Name: "finished_total", 198 Help: "Total number of Concourse builds finished.", 199 }) 200 prometheus.MustRegister(buildsFinished) 201 202 buildsSucceeded := prometheus.NewCounter(prometheus.CounterOpts{ 203 Namespace: "concourse", 204 Subsystem: "builds", 205 Name: "succeeded_total", 206 Help: "Total number of Concourse builds succeeded.", 207 }) 208 prometheus.MustRegister(buildsSucceeded) 209 210 buildsErrored := prometheus.NewCounter(prometheus.CounterOpts{ 211 Namespace: "concourse", 212 Subsystem: "builds", 213 Name: "errored_total", 214 Help: "Total number of Concourse builds errored.", 215 }) 216 prometheus.MustRegister(buildsErrored) 217 218 buildsFailed := prometheus.NewCounter(prometheus.CounterOpts{ 219 Namespace: "concourse", 220 Subsystem: "builds", 221 Name: "failed_total", 222 Help: "Total number of Concourse builds failed.", 223 }) 224 prometheus.MustRegister(buildsFailed) 225 226 buildsAborted := prometheus.NewCounter(prometheus.CounterOpts{ 227 Namespace: "concourse", 228 Subsystem: "builds", 229 Name: "aborted_total", 230 Help: "Total number of Concourse builds aborted.", 231 }) 232 prometheus.MustRegister(buildsAborted) 233 234 buildsFinishedVec := prometheus.NewCounterVec( 235 prometheus.CounterOpts{ 236 Namespace: "concourse", 237 Subsystem: "builds", 238 Name: "finished", 239 Help: "Count of builds finished across various dimensions.", 240 }, 241 []string{"team", "pipeline", "job", "status"}, 242 ) 243 prometheus.MustRegister(buildsFinishedVec) 244 245 buildDurationsVec := prometheus.NewHistogramVec( 246 prometheus.HistogramOpts{ 247 Namespace: "concourse", 248 Subsystem: "builds", 249 Name: "duration_seconds", 250 Help: "Build time in seconds", 251 Buckets: []float64{1, 60, 180, 300, 600, 900, 1200, 1800, 2700, 3600, 7200, 18000, 36000}, 252 }, 253 []string{"team", "pipeline", "job"}, 254 ) 255 prometheus.MustRegister(buildDurationsVec) 256 257 // worker metrics 258 workerContainers := prometheus.NewGaugeVec( 259 prometheus.GaugeOpts{ 260 Namespace: "concourse", 261 Subsystem: "workers", 262 Name: "containers", 263 Help: "Number of containers per worker", 264 }, 265 []string{"worker", "platform", "team", "tags"}, 266 ) 267 prometheus.MustRegister(workerContainers) 268 269 workerUnknownContainers := prometheus.NewGaugeVec( 270 prometheus.GaugeOpts{ 271 Namespace: "concourse", 272 Subsystem: "workers", 273 Name: "unknown_containers", 274 Help: "Number of unknown containers found on worker", 275 }, 276 []string{"worker"}, 277 ) 278 prometheus.MustRegister(workerUnknownContainers) 279 280 workerVolumes := prometheus.NewGaugeVec( 281 prometheus.GaugeOpts{ 282 Namespace: "concourse", 283 Subsystem: "workers", 284 Name: "volumes", 285 Help: "Number of volumes per worker", 286 }, 287 []string{"worker", "platform", "team", "tags"}, 288 ) 289 prometheus.MustRegister(workerVolumes) 290 291 workerUnknownVolumes := prometheus.NewGaugeVec( 292 prometheus.GaugeOpts{ 293 Namespace: "concourse", 294 Subsystem: "workers", 295 Name: "unknown_volumes", 296 Help: "Number of unknown volumes found on worker", 297 }, 298 []string{"worker"}, 299 ) 300 prometheus.MustRegister(workerUnknownVolumes) 301 302 workerTasks := prometheus.NewGaugeVec( 303 prometheus.GaugeOpts{ 304 Namespace: "concourse", 305 Subsystem: "workers", 306 Name: "tasks", 307 Help: "Number of active tasks per worker", 308 }, 309 []string{"worker", "platform"}, 310 ) 311 prometheus.MustRegister(workerTasks) 312 313 workersRegistered := prometheus.NewGaugeVec( 314 prometheus.GaugeOpts{ 315 Namespace: "concourse", 316 Subsystem: "workers", 317 Name: "registered", 318 Help: "Number of workers per state as seen by the database", 319 }, 320 []string{"state"}, 321 ) 322 prometheus.MustRegister(workersRegistered) 323 324 // http metrics 325 httpRequestsDuration := prometheus.NewHistogramVec( 326 prometheus.HistogramOpts{ 327 Namespace: "concourse", 328 Subsystem: "http_responses", 329 Name: "duration_seconds", 330 Help: "Response time in seconds", 331 }, 332 []string{"method", "route", "status"}, 333 ) 334 prometheus.MustRegister(httpRequestsDuration) 335 336 dbQueriesTotal := prometheus.NewCounter(prometheus.CounterOpts{ 337 Namespace: "concourse", 338 Subsystem: "db", 339 Name: "queries_total", 340 Help: "Total number of database Concourse database queries", 341 }) 342 prometheus.MustRegister(dbQueriesTotal) 343 344 dbConnections := prometheus.NewGaugeVec( 345 prometheus.GaugeOpts{ 346 Namespace: "concourse", 347 Subsystem: "db", 348 Name: "connections", 349 Help: "Current number of concourse database connections", 350 }, 351 []string{"dbname"}, 352 ) 353 prometheus.MustRegister(dbConnections) 354 355 checksFinished := prometheus.NewCounterVec( 356 prometheus.CounterOpts{ 357 Namespace: "concourse", 358 Subsystem: "lidar", 359 Name: "checks_finished_total", 360 Help: "Total number of checks finished", 361 }, 362 []string{"status"}, 363 ) 364 prometheus.MustRegister(checksFinished) 365 366 checksQueueSize := prometheus.NewGauge( 367 prometheus.GaugeOpts{ 368 Namespace: "concourse", 369 Subsystem: "lidar", 370 Name: "check_queue_size", 371 Help: "The size of the checks queue", 372 }, 373 ) 374 prometheus.MustRegister(checksQueueSize) 375 376 checksStarted := prometheus.NewCounter( 377 prometheus.CounterOpts{ 378 Namespace: "concourse", 379 Subsystem: "lidar", 380 Name: "checks_started_total", 381 Help: "Total number of checks started", 382 }, 383 ) 384 prometheus.MustRegister(checksStarted) 385 386 checksEnqueued := prometheus.NewCounter( 387 prometheus.CounterOpts{ 388 Namespace: "concourse", 389 Subsystem: "lidar", 390 Name: "checks_enqueued_total", 391 Help: "Total number of checks enqueued", 392 }, 393 ) 394 prometheus.MustRegister(checksEnqueued) 395 396 volumesStreamed := prometheus.NewCounter( 397 prometheus.CounterOpts{ 398 Namespace: "concourse", 399 Subsystem: "volumes", 400 Name: "volumes_streamed", 401 Help: "Total number of volumes streamed from one worker to the other", 402 }, 403 ) 404 prometheus.MustRegister(volumesStreamed) 405 406 listener, err := net.Listen("tcp", config.bind()) 407 if err != nil { 408 return nil, err 409 } 410 411 go http.Serve(listener, promhttp.Handler()) 412 413 emitter := &PrometheusEmitter{ 414 jobsScheduled: jobsScheduled, 415 jobsScheduling: jobsScheduling, 416 417 buildsStarted: buildsStarted, 418 buildsRunning: buildsRunning, 419 420 concurrentRequestsLimitHit: concurrentRequestsLimitHit, 421 concurrentRequests: concurrentRequests, 422 423 tasksWaiting: tasksWaiting, 424 tasksWaitingDuration: tasksWaitingDuration, 425 426 buildDurationsVec: buildDurationsVec, 427 buildsAborted: buildsAborted, 428 buildsErrored: buildsErrored, 429 buildsFailed: buildsFailed, 430 buildsFinished: buildsFinished, 431 buildsFinishedVec: buildsFinishedVec, 432 buildsSucceeded: buildsSucceeded, 433 434 dbConnections: dbConnections, 435 dbQueriesTotal: dbQueriesTotal, 436 437 errorLogs: errorLogs, 438 439 httpRequestsDuration: httpRequestsDuration, 440 441 locksHeld: locksHeld, 442 443 checksFinished: checksFinished, 444 checksQueueSize: checksQueueSize, 445 checksStarted: checksStarted, 446 checksEnqueued: checksEnqueued, 447 448 workerContainers: workerContainers, 449 workersRegistered: workersRegistered, 450 workerContainersLabels: map[string]map[string]prometheus.Labels{}, 451 workerVolumesLabels: map[string]map[string]prometheus.Labels{}, 452 workerTasksLabels: map[string]map[string]prometheus.Labels{}, 453 workerLastSeen: map[string]time.Time{}, 454 workerVolumes: workerVolumes, 455 workerTasks: workerTasks, 456 workerUnknownContainers: workerUnknownContainers, 457 workerUnknownVolumes: workerUnknownVolumes, 458 459 volumesStreamed: volumesStreamed, 460 } 461 go emitter.periodicMetricGC() 462 463 return emitter, nil 464 } 465 466 // Emit processes incoming metrics. 467 // In order to provide idiomatic Prometheus metrics, we'll have to convert the various 468 // Event types (differentiated by the less-than-ideal string Name field) into different 469 // Prometheus metrics. 470 func (emitter *PrometheusEmitter) Emit(logger lager.Logger, event metric.Event) { 471 472 //update last seen counters, used to gc stale timeseries 473 emitter.updateLastSeen(event) 474 475 switch event.Name { 476 case "error log": 477 emitter.errorLogsMetric(logger, event) 478 case "lock held": 479 emitter.lock(logger, event) 480 case "jobs scheduled": 481 emitter.jobsScheduled.Add(event.Value) 482 case "jobs scheduling": 483 emitter.jobsScheduling.Set(event.Value) 484 case "builds started": 485 emitter.buildsStarted.Add(event.Value) 486 case "builds running": 487 emitter.buildsRunning.Set(event.Value) 488 case "concurrent requests limit hit": 489 emitter.concurrentRequestsLimitHit.WithLabelValues(event.Attributes["action"]).Add(event.Value) 490 case "concurrent requests": 491 emitter.concurrentRequests. 492 WithLabelValues(event.Attributes["action"]).Set(event.Value) 493 case "tasks waiting": 494 emitter.tasksWaiting. 495 WithLabelValues( 496 event.Attributes["teamId"], 497 event.Attributes["workerTags"], 498 event.Attributes["platform"], 499 ).Set(event.Value) 500 case "tasks waiting duration": 501 emitter.tasksWaitingDuration. 502 WithLabelValues( 503 event.Attributes["teamId"], 504 event.Attributes["workerTags"], 505 event.Attributes["platform"], 506 ).Observe(event.Value) 507 case "build finished": 508 emitter.buildFinishedMetrics(logger, event) 509 case "worker containers": 510 emitter.workerContainersMetric(logger, event) 511 case "worker volumes": 512 emitter.workerVolumesMetric(logger, event) 513 case "worker unknown containers": 514 emitter.workerUnknownContainersMetric(logger, event) 515 case "worker unknown volumes": 516 emitter.workerUnknownVolumesMetric(logger, event) 517 case "worker tasks": 518 emitter.workerTasksMetric(logger, event) 519 case "worker state": 520 emitter.workersRegisteredMetric(logger, event) 521 case "http response time": 522 emitter.httpResponseTimeMetrics(logger, event) 523 case "database queries": 524 emitter.databaseMetrics(logger, event) 525 case "database connections": 526 emitter.databaseMetrics(logger, event) 527 case "checks finished": 528 emitter.checksFinished.WithLabelValues(event.Attributes["status"]).Add(event.Value) 529 case "checks started": 530 emitter.checksStarted.Add(event.Value) 531 case "checks enqueued": 532 emitter.checksEnqueued.Add(event.Value) 533 case "checks queue size": 534 emitter.checksQueueSize.Set(event.Value) 535 case "volumes streamed": 536 emitter.volumesStreamed.Add(event.Value) 537 default: 538 // unless we have a specific metric, we do nothing 539 } 540 } 541 542 func (emitter *PrometheusEmitter) lock(logger lager.Logger, event metric.Event) { 543 lockType, exists := event.Attributes["type"] 544 if !exists { 545 logger.Error("failed-to-find-type-in-event", fmt.Errorf("expected type to exist in event.Attributes")) 546 return 547 } 548 549 if event.Value == 1 { 550 emitter.locksHeld.WithLabelValues(lockType).Inc() 551 } else { 552 emitter.locksHeld.WithLabelValues(lockType).Dec() 553 } 554 } 555 556 func (emitter *PrometheusEmitter) errorLogsMetric(logger lager.Logger, event metric.Event) { 557 message, exists := event.Attributes["message"] 558 if !exists { 559 logger.Error("failed-to-find-message-in-event", 560 fmt.Errorf("expected team_name to exist in event.Attributes")) 561 return 562 } 563 564 emitter.errorLogs.WithLabelValues(message).Inc() 565 } 566 567 func (emitter *PrometheusEmitter) buildFinishedMetrics(logger lager.Logger, event metric.Event) { 568 // concourse_builds_finished_total 569 emitter.buildsFinished.Inc() 570 571 // concourse_builds_finished 572 team, exists := event.Attributes["team_name"] 573 if !exists { 574 logger.Error("failed-to-find-team-name-in-event", fmt.Errorf("expected team_name to exist in event.Attributes")) 575 return 576 } 577 578 pipeline, exists := event.Attributes["pipeline"] 579 if !exists { 580 logger.Error("failed-to-find-pipeline-in-event", fmt.Errorf("expected pipeline to exist in event.Attributes")) 581 return 582 } 583 584 job, exists := event.Attributes["job"] 585 if !exists { 586 logger.Error("failed-to-find-job-in-event", fmt.Errorf("expected job to exist in event.Attributes")) 587 return 588 } 589 590 buildStatus, exists := event.Attributes["build_status"] 591 if !exists { 592 logger.Error("failed-to-find-build_status-in-event", fmt.Errorf("expected build_status to exist in event.Attributes")) 593 return 594 } 595 emitter.buildsFinishedVec.WithLabelValues(team, pipeline, job, buildStatus).Inc() 596 597 // concourse_builds_(aborted|succeeded|failed|errored)_total 598 switch buildStatus { 599 case string(db.BuildStatusAborted): 600 // concourse_builds_aborted_total 601 emitter.buildsAborted.Inc() 602 case string(db.BuildStatusSucceeded): 603 // concourse_builds_succeeded_total 604 emitter.buildsSucceeded.Inc() 605 case string(db.BuildStatusFailed): 606 // concourse_builds_failed_total 607 emitter.buildsFailed.Inc() 608 case string(db.BuildStatusErrored): 609 // concourse_builds_errored_total 610 emitter.buildsErrored.Inc() 611 } 612 613 // seconds are the standard prometheus base unit for time 614 duration := event.Value / 1000 615 emitter.buildDurationsVec.WithLabelValues(team, pipeline, job).Observe(duration) 616 } 617 618 func (emitter *PrometheusEmitter) workerContainersMetric(logger lager.Logger, event metric.Event) { 619 worker, exists := event.Attributes["worker"] 620 if !exists { 621 logger.Error("failed-to-find-worker-in-event", fmt.Errorf("expected worker to exist in event.Attributes")) 622 return 623 } 624 platform, exists := event.Attributes["platform"] 625 if !exists || platform == "" { 626 logger.Error("failed-to-find-platform-in-event", fmt.Errorf("expected platform to exist in event.Attributes")) 627 return 628 } 629 team, exists := event.Attributes["team_name"] 630 if !exists { 631 logger.Error("failed-to-find-team-name-in-event", fmt.Errorf("expected team_name to exist in event.Attributes")) 632 return 633 } 634 tags, _ := event.Attributes["tags"] 635 636 labels := prometheus.Labels{ 637 "worker": worker, 638 "platform": platform, 639 "team": team, 640 "tags": tags, 641 } 642 key := serializeLabels(&labels) 643 if emitter.workerContainersLabels[worker] == nil { 644 emitter.workerContainersLabels[worker] = make(map[string]prometheus.Labels) 645 } 646 emitter.workerContainersLabels[worker][key] = labels 647 emitter.workerContainers.With(emitter.workerContainersLabels[worker][key]).Set(event.Value) 648 } 649 650 func (emitter *PrometheusEmitter) workersRegisteredMetric(logger lager.Logger, event metric.Event) { 651 state, exists := event.Attributes["state"] 652 if !exists { 653 logger.Error("failed-to-find-state-in-event", fmt.Errorf("expected state to exist in event.Attributes")) 654 return 655 } 656 657 emitter.workersRegistered.WithLabelValues(state).Set(event.Value) 658 } 659 660 func (emitter *PrometheusEmitter) workerUnknownContainersMetric(logger lager.Logger, event metric.Event) { 661 worker, exists := event.Attributes["worker"] 662 if !exists { 663 logger.Error("failed-to-find-worker-in-event", fmt.Errorf("expected worker to exist in event.Attributes")) 664 return 665 } 666 667 labels := prometheus.Labels{ 668 "worker": worker, 669 } 670 671 key := serializeLabels(&labels) 672 if emitter.workerContainersLabels[worker] == nil { 673 emitter.workerContainersLabels[worker] = make(map[string]prometheus.Labels) 674 } 675 emitter.workerContainersLabels[worker][key] = labels 676 emitter.workerUnknownContainers.With(emitter.workerContainersLabels[worker][key]).Set(event.Value) 677 } 678 679 func (emitter *PrometheusEmitter) workerVolumesMetric(logger lager.Logger, event metric.Event) { 680 worker, exists := event.Attributes["worker"] 681 if !exists { 682 logger.Error("failed-to-find-worker-in-event", fmt.Errorf("expected worker to exist in event.Attributes")) 683 return 684 } 685 platform, exists := event.Attributes["platform"] 686 if !exists || platform == "" { 687 logger.Error("failed-to-find-platform-in-event", fmt.Errorf("expected platform to exist in event.Attributes")) 688 return 689 } 690 team, exists := event.Attributes["team_name"] 691 if !exists { 692 logger.Error("failed-to-find-team-name-in-event", fmt.Errorf("expected team_name to exist in event.Attributes")) 693 return 694 } 695 tags, _ := event.Attributes["tags"] 696 697 labels := prometheus.Labels{ 698 "worker": worker, 699 "platform": platform, 700 "team": team, 701 "tags": tags, 702 } 703 key := serializeLabels(&labels) 704 if emitter.workerVolumesLabels[worker] == nil { 705 emitter.workerVolumesLabels[worker] = make(map[string]prometheus.Labels) 706 } 707 emitter.workerVolumesLabels[worker][key] = labels 708 emitter.workerVolumes.With(emitter.workerVolumesLabels[worker][key]).Set(event.Value) 709 } 710 711 func (emitter *PrometheusEmitter) workerUnknownVolumesMetric(logger lager.Logger, event metric.Event) { 712 worker, exists := event.Attributes["worker"] 713 if !exists { 714 logger.Error("failed-to-find-worker-in-event", fmt.Errorf("expected worker to exist in event.Attributes")) 715 return 716 } 717 718 labels := prometheus.Labels{ 719 "worker": worker, 720 } 721 722 key := serializeLabels(&labels) 723 if emitter.workerVolumesLabels[worker] == nil { 724 emitter.workerVolumesLabels[worker] = make(map[string]prometheus.Labels) 725 } 726 emitter.workerVolumesLabels[worker][key] = labels 727 emitter.workerUnknownVolumes.With(emitter.workerVolumesLabels[worker][key]).Set(event.Value) 728 } 729 730 func (emitter *PrometheusEmitter) workerTasksMetric(logger lager.Logger, event metric.Event) { 731 worker, exists := event.Attributes["worker"] 732 if !exists { 733 logger.Error("failed-to-find-worker-in-event", fmt.Errorf("expected worker to exist in event.Attributes")) 734 return 735 } 736 platform, exists := event.Attributes["platform"] 737 if !exists || platform == "" { 738 logger.Error("failed-to-find-platform-in-event", fmt.Errorf("expected platform to exist in event.Attributes")) 739 return 740 } 741 742 labels := prometheus.Labels{ 743 "worker": worker, 744 "platform": platform, 745 } 746 key := serializeLabels(&labels) 747 if emitter.workerTasksLabels[worker] == nil { 748 emitter.workerTasksLabels[worker] = make(map[string]prometheus.Labels) 749 } 750 emitter.workerTasksLabels[worker][key] = labels 751 emitter.workerTasks.With(emitter.workerTasksLabels[worker][key]).Set(event.Value) 752 } 753 754 func (emitter *PrometheusEmitter) httpResponseTimeMetrics(logger lager.Logger, event metric.Event) { 755 route, exists := event.Attributes["route"] 756 if !exists { 757 logger.Error("failed-to-find-route-in-event", fmt.Errorf("expected method to exist in event.Attributes")) 758 return 759 } 760 761 method, exists := event.Attributes["method"] 762 if !exists { 763 logger.Error("failed-to-find-method-in-event", fmt.Errorf("expected method to exist in event.Attributes")) 764 return 765 } 766 767 status, exists := event.Attributes["status"] 768 if !exists { 769 logger.Error("failed-to-find-status-in-event", fmt.Errorf("expected status to exist in event.Attributes")) 770 return 771 } 772 773 emitter.httpRequestsDuration.WithLabelValues(method, route, status).Observe(event.Value / 1000) 774 } 775 776 func (emitter *PrometheusEmitter) databaseMetrics(logger lager.Logger, event metric.Event) { 777 switch event.Name { 778 case "database queries": 779 emitter.dbQueriesTotal.Add(event.Value) 780 case "database connections": 781 connectionName, exists := event.Attributes["ConnectionName"] 782 if !exists { 783 logger.Error("failed-to-connection-name-in-event", fmt.Errorf("expected ConnectionName to exist in event.Attributes")) 784 return 785 } 786 emitter.dbConnections.WithLabelValues(connectionName).Set(event.Value) 787 default: 788 } 789 790 } 791 792 // updateLastSeen tracks for each worker when it last received a metric event. 793 func (emitter *PrometheusEmitter) updateLastSeen(event metric.Event) { 794 emitter.mu.Lock() 795 defer emitter.mu.Unlock() 796 if worker, exists := event.Attributes["worker"]; exists { 797 emitter.workerLastSeen[worker] = time.Now() 798 } 799 } 800 801 //periodically remove stale metrics for workers 802 func (emitter *PrometheusEmitter) periodicMetricGC() { 803 for { 804 emitter.mu.Lock() 805 now := time.Now() 806 for worker, lastSeen := range emitter.workerLastSeen { 807 if now.Sub(lastSeen) > 5*time.Minute { 808 DoGarbageCollection(emitter, worker) 809 delete(emitter.workerLastSeen, worker) 810 } 811 } 812 emitter.mu.Unlock() 813 time.Sleep(60 * time.Second) 814 } 815 } 816 817 // DoGarbageCollection retrieves and deletes stale metrics by their labels. 818 func DoGarbageCollection(emitter PrometheusGarbageCollectable, worker string) { 819 for _, labels := range emitter.WorkerContainersLabels()[worker] { 820 emitter.WorkerContainers().Delete(labels) 821 } 822 823 for _, labels := range emitter.WorkerVolumesLabels()[worker] { 824 emitter.WorkerVolumes().Delete(labels) 825 } 826 827 for _, labels := range emitter.WorkerTasksLabels()[worker] { 828 emitter.WorkerTasks().Delete(labels) 829 } 830 831 delete(emitter.WorkerContainersLabels(), worker) 832 delete(emitter.WorkerVolumesLabels(), worker) 833 delete(emitter.WorkerTasksLabels(), worker) 834 } 835 836 //go:generate counterfeiter . PrometheusGarbageCollectable 837 type PrometheusGarbageCollectable interface { 838 WorkerContainers() *prometheus.GaugeVec 839 WorkerVolumes() *prometheus.GaugeVec 840 WorkerTasks() *prometheus.GaugeVec 841 842 WorkerContainersLabels() map[string]map[string]prometheus.Labels 843 WorkerVolumesLabels() map[string]map[string]prometheus.Labels 844 WorkerTasksLabels() map[string]map[string]prometheus.Labels 845 } 846 847 func (emitter *PrometheusEmitter) WorkerContainers() *prometheus.GaugeVec { 848 return emitter.workerContainers 849 } 850 851 func (emitter *PrometheusEmitter) WorkerVolumes() *prometheus.GaugeVec { 852 return emitter.workerVolumes 853 } 854 855 func (emitter *PrometheusEmitter) WorkerTasks() *prometheus.GaugeVec { 856 return emitter.workerTasks 857 } 858 859 func (emitter *PrometheusEmitter) WorkerContainersLabels() map[string]map[string]prometheus.Labels { 860 return emitter.workerContainersLabels 861 } 862 863 func (emitter *PrometheusEmitter) WorkerVolumesLabels() map[string]map[string]prometheus.Labels { 864 return emitter.workerVolumesLabels 865 } 866 867 func (emitter *PrometheusEmitter) WorkerTasksLabels() map[string]map[string]prometheus.Labels { 868 return emitter.workerTasksLabels 869 }