github.com/authzed/spicedb@v1.32.1-0.20240520085336-ebda56537386/internal/telemetry/metrics.go (about)

     1  package telemetry
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"os"
     7  	"runtime"
     8  	"runtime/debug"
     9  	"strconv"
    10  	"time"
    11  
    12  	"github.com/jzelinskie/cobrautil/v2"
    13  	"github.com/prometheus/client_golang/prometheus"
    14  	dto "github.com/prometheus/client_model/go"
    15  	"golang.org/x/sync/errgroup"
    16  
    17  	log "github.com/authzed/spicedb/internal/logging"
    18  	"github.com/authzed/spicedb/internal/middleware/usagemetrics"
    19  	"github.com/authzed/spicedb/pkg/datastore"
    20  )
    21  
    22  // RegisterTelemetryCollector registers a collector for the various pieces of
    23  // data required by SpiceDB telemetry.
    24  func RegisterTelemetryCollector(datastoreEngine string, ds datastore.Datastore) (*prometheus.Registry, error) {
    25  	registry := prometheus.NewRegistry()
    26  
    27  	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
    28  	defer cancel()
    29  
    30  	nodeID, err := os.Hostname()
    31  	if err != nil {
    32  		return nil, fmt.Errorf("unable to get hostname: %w", err)
    33  	}
    34  
    35  	dbStats, err := ds.Statistics(ctx)
    36  	if err != nil {
    37  		return nil, fmt.Errorf("unable to query DB stats: %w", err)
    38  	}
    39  
    40  	clusterID := dbStats.UniqueID
    41  	buildInfo, ok := debug.ReadBuildInfo()
    42  	if !ok {
    43  		return nil, fmt.Errorf("failed to read BuildInfo")
    44  	}
    45  
    46  	if err := registry.Register(&collector{
    47  		ds: ds,
    48  		infoDesc: prometheus.NewDesc(
    49  			prometheus.BuildFQName("spicedb", "telemetry", "info"),
    50  			"Information about the SpiceDB environment.",
    51  			nil,
    52  			prometheus.Labels{
    53  				"cluster_id": clusterID,
    54  				"node_id":    nodeID,
    55  				"version":    cobrautil.VersionWithFallbacks(buildInfo),
    56  				"os":         runtime.GOOS,
    57  				"arch":       runtime.GOARCH,
    58  				"go":         buildInfo.GoVersion,
    59  				"vcpu":       strconv.Itoa(runtime.NumCPU()),
    60  				"ds_engine":  datastoreEngine,
    61  			},
    62  		),
    63  		objectDefsDesc: prometheus.NewDesc(
    64  			prometheus.BuildFQName("spicedb", "telemetry", "object_definitions_total"),
    65  			"Count of the number of objects defined by the schema.",
    66  			nil,
    67  			prometheus.Labels{
    68  				"cluster_id": clusterID,
    69  				"node_id":    nodeID,
    70  			},
    71  		),
    72  		relationshipsDesc: prometheus.NewDesc(
    73  			prometheus.BuildFQName("spicedb", "telemetry", "relationships_estimate_total"),
    74  			"Count of the estimated number of stored relationships.",
    75  			nil,
    76  			prometheus.Labels{
    77  				"cluster_id": clusterID,
    78  				"node_id":    nodeID,
    79  			},
    80  		),
    81  		dispatchedDesc: prometheus.NewDesc(
    82  			prometheus.BuildFQName("spicedb", "telemetry", "dispatches"),
    83  			"Histogram of cluster dispatches performed by the instance.",
    84  			usagemetrics.DispatchedCountLabels,
    85  			prometheus.Labels{
    86  				"cluster_id": clusterID,
    87  				"node_id":    nodeID,
    88  			},
    89  		),
    90  	}); err != nil {
    91  		return nil, fmt.Errorf("unable to register telemetry collector: %w", err)
    92  	}
    93  
    94  	return registry, nil
    95  }
    96  
    97  type collector struct {
    98  	ds                datastore.Datastore
    99  	infoDesc          *prometheus.Desc
   100  	objectDefsDesc    *prometheus.Desc
   101  	relationshipsDesc *prometheus.Desc
   102  	dispatchedDesc    *prometheus.Desc
   103  }
   104  
   105  var _ prometheus.Collector = &collector{}
   106  
   107  func (c *collector) Describe(ch chan<- *prometheus.Desc) {
   108  	ch <- c.infoDesc
   109  	ch <- c.objectDefsDesc
   110  	ch <- c.relationshipsDesc
   111  	ch <- c.dispatchedDesc
   112  }
   113  
   114  func (c *collector) Collect(ch chan<- prometheus.Metric) {
   115  	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
   116  	defer cancel()
   117  
   118  	dsStats, err := c.ds.Statistics(ctx)
   119  	if err != nil {
   120  		log.Warn().Err(err).Msg("unable to collect datastore statistics")
   121  	}
   122  
   123  	ch <- prometheus.MustNewConstMetric(c.infoDesc, prometheus.GaugeValue, 1)
   124  	ch <- prometheus.MustNewConstMetric(c.objectDefsDesc, prometheus.GaugeValue, float64(len(dsStats.ObjectTypeStatistics)))
   125  	ch <- prometheus.MustNewConstMetric(c.relationshipsDesc, prometheus.GaugeValue, float64(dsStats.EstimatedRelationshipCount))
   126  
   127  	dispatchedCountMetrics := make(chan prometheus.Metric)
   128  	g := errgroup.Group{}
   129  	g.Go(func() error {
   130  		for metric := range dispatchedCountMetrics {
   131  			var m dto.Metric
   132  			if err := metric.Write(&m); err != nil {
   133  				return fmt.Errorf("error writing metric: %w", err)
   134  			}
   135  
   136  			buckets := make(map[float64]uint64, len(m.Histogram.Bucket))
   137  			for _, bucket := range m.Histogram.Bucket {
   138  				buckets[*bucket.UpperBound] = *bucket.CumulativeCount
   139  			}
   140  
   141  			dynamicLabels := make([]string, len(usagemetrics.DispatchedCountLabels))
   142  			for i, labelName := range usagemetrics.DispatchedCountLabels {
   143  				for _, labelVal := range m.Label {
   144  					if *labelVal.Name == labelName {
   145  						dynamicLabels[i] = *labelVal.Value
   146  					}
   147  				}
   148  			}
   149  			ch <- prometheus.MustNewConstHistogram(
   150  				c.dispatchedDesc,
   151  				*m.Histogram.SampleCount,
   152  				*m.Histogram.SampleSum,
   153  				buckets,
   154  				dynamicLabels...,
   155  			)
   156  		}
   157  		return nil
   158  	})
   159  
   160  	usagemetrics.DispatchedCountHistogram.Collect(dispatchedCountMetrics)
   161  	close(dispatchedCountMetrics)
   162  
   163  	if err := g.Wait(); err != nil {
   164  		log.Error().Err(err).Msg("error collecting metrics")
   165  	}
   166  }