github.com/kyma-project/kyma-environment-broker@v0.0.1/internal/metrics/operations_db_collector.go (about)

     1  package metrics
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"time"
     7  
     8  	"github.com/kyma-project/kyma-environment-broker/internal"
     9  	"github.com/pivotal-cf/brokerapi/v8/domain"
    10  	"github.com/prometheus/client_golang/prometheus"
    11  	"github.com/prometheus/client_golang/prometheus/promauto"
    12  	"github.com/sirupsen/logrus"
    13  )
    14  
    15  // Retention is the default time and date for obtaining operations by the database query
    16  // For performance reasons, it is not possible to query entire operations database table,
    17  // so instead KEB queries the database for last 14 days worth of data and then for deltas
    18  // during the ellapsed time
    19  var Retention = 14 * 24 * time.Hour
    20  var PollingInterval = 30 * time.Second
    21  
// operationsGetter is the storage dependency of the metrics service: it returns
// the operations that fall into the given time range.
// NOTE(review): whether the range bounds are inclusive, and which timestamp of
// an operation is matched, depends on the implementation - confirm there.
type operationsGetter interface {
	ListOperationsInTimeRange(from, to time.Time) ([]internal.Operation, error)
}
    25  
// opsMetricService periodically reads operations from the database and mirrors
// their state into a Prometheus gauge vector.
type opsMetricService struct {
	// logger receives progress and error messages from the polling loop.
	logger     logrus.FieldLogger
	// operations is the gauge vector; each label set describes one operation in
	// one particular state and carries the value 1 (current) or 0 (superseded).
	operations *prometheus.GaugeVec
	// lastUpdate is the upper bound of the last successful query and is used as
	// the lower bound of the next one.
	lastUpdate time.Time
	// db is the source of operations.
	db         operationsGetter
	// cache keeps, keyed by operation ID, the last seen version of every
	// operation that has not yet reached the Succeeded or Failed state, so that
	// its previous label set can be reset to 0 when the operation changes.
	cache      map[string]internal.Operation
}
    33  
    34  // StartOpsMetricService creates service for exposing prometheus metrics for operations.
    35  //
    36  // This is intended as a replacement for OperationResultCollector to address shortcomings
    37  // of the initial implementation - lack of consistency and non-aggregatable metric desing.
    38  // The underlying data is fetched asynchronously from the KEB SQL database to provide
    39  // consistency and the operation result state is exposed as a label instead of a value to
    40  // enable common gauge aggregation.
    41  
    42  // compass_keb_operation_result
    43  
    44  func StartOpsMetricService(ctx context.Context, db operationsGetter, logger logrus.FieldLogger) {
    45  	svc := &opsMetricService{
    46  		db:         db,
    47  		lastUpdate: time.Now().Add(-Retention),
    48  		logger:     logger,
    49  		cache:      make(map[string]internal.Operation),
    50  		operations: promauto.NewGaugeVec(prometheus.GaugeOpts{
    51  			Namespace: prometheusNamespace,
    52  			Subsystem: prometheusSubsystem,
    53  			Name:      "operation_result",
    54  			Help:      "Results of operations",
    55  		}, []string{"operation_id", "instance_id", "global_account_id", "plan_id", "type", "state", "error_category", "error_reason"}),
    56  	}
    57  	go svc.run(ctx)
    58  }
    59  
    60  func (s *opsMetricService) setOperation(op internal.Operation, val float64) {
    61  	labels := make(map[string]string)
    62  	labels["operation_id"] = op.ID
    63  	labels["instance_id"] = op.InstanceID
    64  	labels["global_account_id"] = op.GlobalAccountID
    65  	labels["plan_id"] = op.Plan
    66  	labels["type"] = string(op.Type)
    67  	labels["state"] = string(op.State)
    68  	labels["error_category"] = string(op.LastError.Component())
    69  	labels["error_reason"] = string(op.LastError.Reason())
    70  	s.operations.With(labels).Set(val)
    71  }
    72  
    73  func (s *opsMetricService) updateOperation(op internal.Operation) {
    74  	oldOp, found := s.cache[op.ID]
    75  	if found {
    76  		s.setOperation(oldOp, 0)
    77  	}
    78  	s.setOperation(op, 1)
    79  	if op.State == domain.Failed || op.State == domain.Succeeded {
    80  		delete(s.cache, op.ID)
    81  	} else {
    82  		s.cache[op.ID] = op
    83  	}
    84  }
    85  
    86  func (s *opsMetricService) updateMetrics() (err error) {
    87  	defer func() {
    88  		if r := recover(); r != nil {
    89  			// it's not desirable to panic metrics goroutine, instead it should return and log the error
    90  			err = fmt.Errorf("panic recovered: %v", r)
    91  		}
    92  	}()
    93  	now := time.Now()
    94  	operations, err := s.db.ListOperationsInTimeRange(s.lastUpdate, now)
    95  	if err != nil {
    96  		return fmt.Errorf("failed to list operations: %v", err)
    97  	}
    98  	s.logger.Infof("updating operations metrics for: %v operations", len(operations))
    99  	for _, op := range operations {
   100  		s.updateOperation(op)
   101  	}
   102  	s.lastUpdate = now
   103  	return nil
   104  }
   105  
   106  func (s *opsMetricService) run(ctx context.Context) {
   107  	if err := s.updateMetrics(); err != nil {
   108  		s.logger.Error("failed to update operations metrics", err)
   109  	}
   110  	ticker := time.NewTicker(PollingInterval)
   111  	for {
   112  		select {
   113  		case <-ticker.C:
   114  			if err := s.updateMetrics(); err != nil {
   115  				s.logger.Error("failed to update operations metrics", err)
   116  			}
   117  		case <-ctx.Done():
   118  			return
   119  		}
   120  	}
   121  }