github.com/kyma-project/kyma-environment-broker@v0.0.1/internal/metrics/operations_db_collector.go (about) 1 package metrics 2 3 import ( 4 "context" 5 "fmt" 6 "time" 7 8 "github.com/kyma-project/kyma-environment-broker/internal" 9 "github.com/pivotal-cf/brokerapi/v8/domain" 10 "github.com/prometheus/client_golang/prometheus" 11 "github.com/prometheus/client_golang/prometheus/promauto" 12 "github.com/sirupsen/logrus" 13 ) 14 15 // Retention is the default time and date for obtaining operations by the database query 16 // For performance reasons, it is not possible to query entire operations database table, 17 // so instead KEB queries the database for last 14 days worth of data and then for deltas 18 // during the ellapsed time 19 var Retention = 14 * 24 * time.Hour 20 var PollingInterval = 30 * time.Second 21 22 type operationsGetter interface { 23 ListOperationsInTimeRange(from, to time.Time) ([]internal.Operation, error) 24 } 25 26 type opsMetricService struct { 27 logger logrus.FieldLogger 28 operations *prometheus.GaugeVec 29 lastUpdate time.Time 30 db operationsGetter 31 cache map[string]internal.Operation 32 } 33 34 // StartOpsMetricService creates service for exposing prometheus metrics for operations. 35 // 36 // This is intended as a replacement for OperationResultCollector to address shortcomings 37 // of the initial implementation - lack of consistency and non-aggregatable metric desing. 38 // The underlying data is fetched asynchronously from the KEB SQL database to provide 39 // consistency and the operation result state is exposed as a label instead of a value to 40 // enable common gauge aggregation. 41 42 // compass_keb_operation_result 43 44 func StartOpsMetricService(ctx context.Context, db operationsGetter, logger logrus.FieldLogger) { 45 svc := &opsMetricService{ 46 db: db, 47 lastUpdate: time.Now().Add(-Retention), 48 logger: logger, 49 cache: make(map[string]internal.Operation), 50 operations: promauto.NewGaugeVec(prometheus.GaugeOpts{ 51 Namespace: prometheusNamespace, 52 Subsystem: prometheusSubsystem, 53 Name: "operation_result", 54 Help: "Results of operations", 55 }, []string{"operation_id", "instance_id", "global_account_id", "plan_id", "type", "state", "error_category", "error_reason"}), 56 } 57 go svc.run(ctx) 58 } 59 60 func (s *opsMetricService) setOperation(op internal.Operation, val float64) { 61 labels := make(map[string]string) 62 labels["operation_id"] = op.ID 63 labels["instance_id"] = op.InstanceID 64 labels["global_account_id"] = op.GlobalAccountID 65 labels["plan_id"] = op.Plan 66 labels["type"] = string(op.Type) 67 labels["state"] = string(op.State) 68 labels["error_category"] = string(op.LastError.Component()) 69 labels["error_reason"] = string(op.LastError.Reason()) 70 s.operations.With(labels).Set(val) 71 } 72 73 func (s *opsMetricService) updateOperation(op internal.Operation) { 74 oldOp, found := s.cache[op.ID] 75 if found { 76 s.setOperation(oldOp, 0) 77 } 78 s.setOperation(op, 1) 79 if op.State == domain.Failed || op.State == domain.Succeeded { 80 delete(s.cache, op.ID) 81 } else { 82 s.cache[op.ID] = op 83 } 84 } 85 86 func (s *opsMetricService) updateMetrics() (err error) { 87 defer func() { 88 if r := recover(); r != nil { 89 // it's not desirable to panic metrics goroutine, instead it should return and log the error 90 err = fmt.Errorf("panic recovered: %v", r) 91 } 92 }() 93 now := time.Now() 94 operations, err := s.db.ListOperationsInTimeRange(s.lastUpdate, now) 95 if err != nil { 96 return fmt.Errorf("failed to list operations: %v", err) 97 } 98 s.logger.Infof("updating operations metrics for: %v operations", len(operations)) 99 for _, op := range operations { 100 s.updateOperation(op) 101 } 102 s.lastUpdate = now 103 return nil 104 } 105 106 func (s *opsMetricService) run(ctx context.Context) { 107 if err := s.updateMetrics(); err != nil { 108 s.logger.Error("failed to update operations metrics", err) 109 } 110 ticker := time.NewTicker(PollingInterval) 111 for { 112 select { 113 case <-ticker.C: 114 if err := s.updateMetrics(); err != nil { 115 s.logger.Error("failed to update operations metrics", err) 116 } 117 case <-ctx.Done(): 118 return 119 } 120 } 121 }