github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/x/metrics.go (about) 1 /* 2 * Copyright 2017-2018 Dgraph Labs, Inc. and Contributors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package x 18 19 import ( 20 "context" 21 "expvar" 22 "log" 23 "net/http" 24 "time" 25 26 "go.opencensus.io/trace" 27 28 "contrib.go.opencensus.io/exporter/jaeger" 29 oc_prom "contrib.go.opencensus.io/exporter/prometheus" 30 datadog "github.com/DataDog/opencensus-go-exporter-datadog" 31 "github.com/golang/glog" 32 "github.com/prometheus/client_golang/prometheus" 33 "github.com/spf13/viper" 34 "go.opencensus.io/stats" 35 "go.opencensus.io/stats/view" 36 "go.opencensus.io/tag" 37 ) 38 39 var ( 40 // Cumulative metrics. 41 42 // NumQueries is the total number of queries processed so far. 43 NumQueries = stats.Int64("num_queries_total", 44 "Total number of queries", stats.UnitDimensionless) 45 // NumMutations is the total number of mutations processed so far. 46 NumMutations = stats.Int64("num_mutations_total", 47 "Total number of mutations", stats.UnitDimensionless) 48 // NumEdges is the total number of edges created so far. 49 NumEdges = stats.Int64("num_edges_total", 50 "Total number of edges created", stats.UnitDimensionless) 51 // LatencyMs is the latency of the various Dgraph operations. 52 LatencyMs = stats.Float64("latency", 53 "Latency of the various methods", stats.UnitMilliseconds) 54 55 // Point-in-time metrics. 56 57 // PendingQueries records the current number of pending queries. 58 PendingQueries = stats.Int64("pending_queries_total", 59 "Number of pending queries", stats.UnitDimensionless) 60 // PendingProposals records the current number of pending RAFT proposals. 61 PendingProposals = stats.Int64("pending_proposals_total", 62 "Number of pending proposals", stats.UnitDimensionless) 63 // MemoryInUse records the current amount of used memory by Dgraph. 64 MemoryInUse = stats.Int64("memory_inuse_bytes", 65 "Amount of memory in use", stats.UnitBytes) 66 // MemoryIdle records the amount of memory held by the runtime but not in-use by Dgraph. 67 MemoryIdle = stats.Int64("memory_idle_bytes", 68 "Amount of memory in idle spans", stats.UnitBytes) 69 // MemoryProc records the amount of memory used in processes. 70 MemoryProc = stats.Int64("memory_proc_bytes", 71 "Amount of memory used in processes", stats.UnitBytes) 72 // ActiveMutations is the current number of active mutations. 73 ActiveMutations = stats.Int64("active_mutations_total", 74 "Number of active mutations", stats.UnitDimensionless) 75 // AlphaHealth status records the current health of the alphas. 76 AlphaHealth = stats.Int64("alpha_health_status", 77 "Status of the alphas", stats.UnitDimensionless) 78 // RaftAppliedIndex records the latest applied RAFT index. 79 RaftAppliedIndex = stats.Int64("raft_applied_index", 80 "Latest applied Raft index", stats.UnitDimensionless) 81 // MaxAssignedTs records the latest max assigned timestamp. 82 MaxAssignedTs = stats.Int64("max_assigned_ts", 83 "Latest max assigned timestamp", stats.UnitDimensionless) 84 85 // Conf holds the metrics config. 86 // TODO: Request statistics, latencies, 500, timeouts 87 Conf *expvar.Map 88 89 // Tag keys. 90 91 // KeyStatus is the tag key used to record the status of the server. 92 KeyStatus, _ = tag.NewKey("status") 93 // KeyMethod is the tag key used to record the method (e.g read or mutate). 94 KeyMethod, _ = tag.NewKey("method") 95 96 // Tag values. 97 98 // TagValueStatusOK is the tag value used to signal a successful operation. 99 TagValueStatusOK = "ok" 100 // TagValueStatusError is the tag value used to signal an unsuccessful operation. 101 TagValueStatusError = "error" 102 103 defaultLatencyMsDistribution = view.Distribution( 104 0, 0.01, 0.05, 0.1, 0.3, 0.6, 0.8, 1, 2, 3, 4, 5, 6, 8, 10, 13, 16, 105 20, 25, 30, 40, 50, 65, 80, 100, 130, 160, 200, 250, 300, 400, 500, 106 650, 800, 1000, 2000, 5000, 10000, 20000, 50000, 100000) 107 108 allTagKeys = []tag.Key{ 109 KeyStatus, KeyMethod, 110 } 111 112 allViews = []*view.View{ 113 { 114 Name: LatencyMs.Name(), 115 Measure: LatencyMs, 116 Description: LatencyMs.Description(), 117 Aggregation: defaultLatencyMsDistribution, 118 TagKeys: allTagKeys, 119 }, 120 { 121 Name: NumQueries.Name(), 122 Measure: NumQueries, 123 Description: NumQueries.Description(), 124 Aggregation: view.Count(), 125 TagKeys: allTagKeys, 126 }, 127 { 128 Name: NumEdges.Name(), 129 Measure: NumEdges, 130 Description: NumEdges.Description(), 131 Aggregation: view.Count(), 132 TagKeys: allTagKeys, 133 }, 134 { 135 Name: RaftAppliedIndex.Name(), 136 Measure: RaftAppliedIndex, 137 Description: RaftAppliedIndex.Description(), 138 Aggregation: view.Count(), 139 TagKeys: allTagKeys, 140 }, 141 { 142 Name: MaxAssignedTs.Name(), 143 Measure: MaxAssignedTs, 144 Description: MaxAssignedTs.Description(), 145 Aggregation: view.Count(), 146 TagKeys: allTagKeys, 147 }, 148 149 // Last value aggregations 150 { 151 Name: PendingQueries.Name(), 152 Measure: PendingQueries, 153 Description: PendingQueries.Description(), 154 Aggregation: view.LastValue(), 155 TagKeys: allTagKeys, 156 }, 157 { 158 Name: PendingProposals.Name(), 159 Measure: PendingProposals, 160 Description: PendingProposals.Description(), 161 Aggregation: view.LastValue(), 162 TagKeys: allTagKeys, 163 }, 164 { 165 Name: MemoryInUse.Name(), 166 Measure: MemoryInUse, 167 Description: MemoryInUse.Description(), 168 Aggregation: view.LastValue(), 169 TagKeys: allTagKeys, 170 }, 171 { 172 Name: MemoryIdle.Name(), 173 Measure: MemoryIdle, 174 Description: MemoryIdle.Description(), 175 Aggregation: view.LastValue(), 176 TagKeys: allTagKeys, 177 }, 178 { 179 Name: MemoryProc.Name(), 180 Measure: MemoryProc, 181 Description: MemoryProc.Description(), 182 Aggregation: view.LastValue(), 183 TagKeys: allTagKeys, 184 }, 185 { 186 Name: ActiveMutations.Name(), 187 Measure: ActiveMutations, 188 Description: ActiveMutations.Description(), 189 Aggregation: view.LastValue(), 190 TagKeys: allTagKeys, 191 }, 192 { 193 Name: AlphaHealth.Name(), 194 Measure: AlphaHealth, 195 Description: AlphaHealth.Description(), 196 Aggregation: view.LastValue(), 197 TagKeys: allTagKeys, 198 }, 199 } 200 ) 201 202 func init() { 203 Conf = expvar.NewMap("dgraph_config") 204 205 ctx := MetricsContext() 206 go func() { 207 var v string 208 ticker := time.NewTicker(5 * time.Second) 209 defer ticker.Stop() 210 for range ticker.C { 211 v = TagValueStatusOK 212 if err := HealthCheck(); err != nil { 213 v = TagValueStatusError 214 } 215 cctx, _ := tag.New(ctx, tag.Upsert(KeyStatus, v)) 216 // TODO: Do we need to set health to zero, or would this tag be sufficient to 217 // indicate if Alpha is up but HealthCheck is failing. 218 stats.Record(cctx, AlphaHealth.M(1)) 219 } 220 }() 221 222 CheckfNoTrace(view.Register(allViews...)) 223 224 pe, err := oc_prom.NewExporter(oc_prom.Options{ 225 Registry: prometheus.DefaultRegisterer.(*prometheus.Registry), 226 Namespace: "dgraph", 227 OnError: func(err error) { glog.Errorf("%v", err) }, 228 }) 229 Checkf(err, "Failed to create OpenCensus Prometheus exporter: %v", err) 230 view.RegisterExporter(pe) 231 232 http.Handle("/debug/prometheus_metrics", pe) 233 } 234 235 // MetricsContext returns a context with tags that are useful for 236 // distinguishing the state of the running system. 237 // This context will be used to derive other contexts. 238 func MetricsContext() context.Context { 239 // At the beginning add some distinguishing information 240 // to the context as tags that will be propagated when 241 // collecting metrics. 242 return context.Background() 243 } 244 245 // WithMethod returns a new updated context with the tag KeyMethod set to the given value. 246 func WithMethod(parent context.Context, method string) context.Context { 247 ctx, err := tag.New(parent, tag.Upsert(KeyMethod, method)) 248 Check(err) 249 return ctx 250 } 251 252 // SinceMs returns the time since startTime in milliseconds (as a float). 253 func SinceMs(startTime time.Time) float64 { 254 return float64(time.Since(startTime)) / 1e6 255 } 256 257 // RegisterExporters sets up the services to which metrics will be exported. 258 func RegisterExporters(conf *viper.Viper, service string) { 259 if collector := conf.GetString("jaeger.collector"); len(collector) > 0 { 260 // Port details: https://www.jaegertracing.io/docs/getting-started/ 261 // Default collectorEndpointURI := "http://localhost:14268" 262 je, err := jaeger.NewExporter(jaeger.Options{ 263 Endpoint: collector, 264 ServiceName: service, 265 }) 266 if err != nil { 267 log.Fatalf("Failed to create the Jaeger exporter: %v", err) 268 } 269 // And now finally register it as a Trace Exporter 270 trace.RegisterExporter(je) 271 } 272 273 if collector := conf.GetString("datadog.collector"); len(collector) > 0 { 274 exporter, err := datadog.NewExporter(datadog.Options{ 275 Service: service, 276 TraceAddr: collector, 277 }) 278 if err != nil { 279 log.Fatal(err) 280 } 281 282 trace.RegisterExporter(exporter) 283 284 // For demoing purposes, always sample. 285 trace.ApplyConfig(trace.Config{ 286 DefaultSampler: trace.AlwaysSample(), 287 }) 288 } 289 290 // Exclusively for stats, metrics, etc. Not for tracing. 291 // var views = append(ocgrpc.DefaultServerViews, ocgrpc.DefaultClientViews...) 292 // if err := view.Register(views...); err != nil { 293 // glog.Fatalf("Unable to register OpenCensus stats: %v", err) 294 // } 295 }