github.com/authzed/spicedb@v1.32.1-0.20240520085336-ebda56537386/internal/telemetry/metrics.go (about) 1 package telemetry 2 3 import ( 4 "context" 5 "fmt" 6 "os" 7 "runtime" 8 "runtime/debug" 9 "strconv" 10 "time" 11 12 "github.com/jzelinskie/cobrautil/v2" 13 "github.com/prometheus/client_golang/prometheus" 14 dto "github.com/prometheus/client_model/go" 15 "golang.org/x/sync/errgroup" 16 17 log "github.com/authzed/spicedb/internal/logging" 18 "github.com/authzed/spicedb/internal/middleware/usagemetrics" 19 "github.com/authzed/spicedb/pkg/datastore" 20 ) 21 22 // RegisterTelemetryCollector registers a collector for the various pieces of 23 // data required by SpiceDB telemetry. 24 func RegisterTelemetryCollector(datastoreEngine string, ds datastore.Datastore) (*prometheus.Registry, error) { 25 registry := prometheus.NewRegistry() 26 27 ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) 28 defer cancel() 29 30 nodeID, err := os.Hostname() 31 if err != nil { 32 return nil, fmt.Errorf("unable to get hostname: %w", err) 33 } 34 35 dbStats, err := ds.Statistics(ctx) 36 if err != nil { 37 return nil, fmt.Errorf("unable to query DB stats: %w", err) 38 } 39 40 clusterID := dbStats.UniqueID 41 buildInfo, ok := debug.ReadBuildInfo() 42 if !ok { 43 return nil, fmt.Errorf("failed to read BuildInfo") 44 } 45 46 if err := registry.Register(&collector{ 47 ds: ds, 48 infoDesc: prometheus.NewDesc( 49 prometheus.BuildFQName("spicedb", "telemetry", "info"), 50 "Information about the SpiceDB environment.", 51 nil, 52 prometheus.Labels{ 53 "cluster_id": clusterID, 54 "node_id": nodeID, 55 "version": cobrautil.VersionWithFallbacks(buildInfo), 56 "os": runtime.GOOS, 57 "arch": runtime.GOARCH, 58 "go": buildInfo.GoVersion, 59 "vcpu": strconv.Itoa(runtime.NumCPU()), 60 "ds_engine": datastoreEngine, 61 }, 62 ), 63 objectDefsDesc: prometheus.NewDesc( 64 prometheus.BuildFQName("spicedb", "telemetry", "object_definitions_total"), 65 "Count of the number of objects defined by the schema.", 66 nil, 67 prometheus.Labels{ 68 "cluster_id": clusterID, 69 "node_id": nodeID, 70 }, 71 ), 72 relationshipsDesc: prometheus.NewDesc( 73 prometheus.BuildFQName("spicedb", "telemetry", "relationships_estimate_total"), 74 "Count of the estimated number of stored relationships.", 75 nil, 76 prometheus.Labels{ 77 "cluster_id": clusterID, 78 "node_id": nodeID, 79 }, 80 ), 81 dispatchedDesc: prometheus.NewDesc( 82 prometheus.BuildFQName("spicedb", "telemetry", "dispatches"), 83 "Histogram of cluster dispatches performed by the instance.", 84 usagemetrics.DispatchedCountLabels, 85 prometheus.Labels{ 86 "cluster_id": clusterID, 87 "node_id": nodeID, 88 }, 89 ), 90 }); err != nil { 91 return nil, fmt.Errorf("unable to register telemetry collector: %w", err) 92 } 93 94 return registry, nil 95 } 96 97 type collector struct { 98 ds datastore.Datastore 99 infoDesc *prometheus.Desc 100 objectDefsDesc *prometheus.Desc 101 relationshipsDesc *prometheus.Desc 102 dispatchedDesc *prometheus.Desc 103 } 104 105 var _ prometheus.Collector = &collector{} 106 107 func (c *collector) Describe(ch chan<- *prometheus.Desc) { 108 ch <- c.infoDesc 109 ch <- c.objectDefsDesc 110 ch <- c.relationshipsDesc 111 ch <- c.dispatchedDesc 112 } 113 114 func (c *collector) Collect(ch chan<- prometheus.Metric) { 115 ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) 116 defer cancel() 117 118 dsStats, err := c.ds.Statistics(ctx) 119 if err != nil { 120 log.Warn().Err(err).Msg("unable to collect datastore statistics") 121 } 122 123 ch <- prometheus.MustNewConstMetric(c.infoDesc, prometheus.GaugeValue, 1) 124 ch <- prometheus.MustNewConstMetric(c.objectDefsDesc, prometheus.GaugeValue, float64(len(dsStats.ObjectTypeStatistics))) 125 ch <- prometheus.MustNewConstMetric(c.relationshipsDesc, prometheus.GaugeValue, float64(dsStats.EstimatedRelationshipCount)) 126 127 dispatchedCountMetrics := make(chan prometheus.Metric) 128 g := errgroup.Group{} 129 g.Go(func() error { 130 for metric := range dispatchedCountMetrics { 131 var m dto.Metric 132 if err := metric.Write(&m); err != nil { 133 return fmt.Errorf("error writing metric: %w", err) 134 } 135 136 buckets := make(map[float64]uint64, len(m.Histogram.Bucket)) 137 for _, bucket := range m.Histogram.Bucket { 138 buckets[*bucket.UpperBound] = *bucket.CumulativeCount 139 } 140 141 dynamicLabels := make([]string, len(usagemetrics.DispatchedCountLabels)) 142 for i, labelName := range usagemetrics.DispatchedCountLabels { 143 for _, labelVal := range m.Label { 144 if *labelVal.Name == labelName { 145 dynamicLabels[i] = *labelVal.Value 146 } 147 } 148 } 149 ch <- prometheus.MustNewConstHistogram( 150 c.dispatchedDesc, 151 *m.Histogram.SampleCount, 152 *m.Histogram.SampleSum, 153 buckets, 154 dynamicLabels..., 155 ) 156 } 157 return nil 158 }) 159 160 usagemetrics.DispatchedCountHistogram.Collect(dispatchedCountMetrics) 161 close(dispatchedCountMetrics) 162 163 if err := g.Wait(); err != nil { 164 log.Error().Err(err).Msg("error collecting metrics") 165 } 166 }