github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/x/metrics.go (about)

     1  /*
     2   * Copyright 2017-2018 Dgraph Labs, Inc. and Contributors
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package x
    18  
    19  import (
    20  	"context"
    21  	"expvar"
    22  	"log"
    23  	"net/http"
    24  	"time"
    25  
    26  	"go.opencensus.io/trace"
    27  
    28  	"contrib.go.opencensus.io/exporter/jaeger"
    29  	oc_prom "contrib.go.opencensus.io/exporter/prometheus"
    30  	datadog "github.com/DataDog/opencensus-go-exporter-datadog"
    31  	"github.com/golang/glog"
    32  	"github.com/prometheus/client_golang/prometheus"
    33  	"github.com/spf13/viper"
    34  	"go.opencensus.io/stats"
    35  	"go.opencensus.io/stats/view"
    36  	"go.opencensus.io/tag"
    37  )
    38  
    39  var (
    40  	// Cumulative metrics.
    41  
    42  	// NumQueries is the total number of queries processed so far.
    43  	NumQueries = stats.Int64("num_queries_total",
    44  		"Total number of queries", stats.UnitDimensionless)
    45  	// NumMutations is the total number of mutations processed so far.
    46  	NumMutations = stats.Int64("num_mutations_total",
    47  		"Total number of mutations", stats.UnitDimensionless)
    48  	// NumEdges is the total number of edges created so far.
    49  	NumEdges = stats.Int64("num_edges_total",
    50  		"Total number of edges created", stats.UnitDimensionless)
    51  	// LatencyMs is the latency of the various Dgraph operations.
    52  	LatencyMs = stats.Float64("latency",
    53  		"Latency of the various methods", stats.UnitMilliseconds)
    54  
    55  	// Point-in-time metrics.
    56  
    57  	// PendingQueries records the current number of pending queries.
    58  	PendingQueries = stats.Int64("pending_queries_total",
    59  		"Number of pending queries", stats.UnitDimensionless)
    60  	// PendingProposals records the current number of pending RAFT proposals.
    61  	PendingProposals = stats.Int64("pending_proposals_total",
    62  		"Number of pending proposals", stats.UnitDimensionless)
    63  	// MemoryInUse records the current amount of used memory by Dgraph.
    64  	MemoryInUse = stats.Int64("memory_inuse_bytes",
    65  		"Amount of memory in use", stats.UnitBytes)
    66  	// MemoryIdle records the amount of memory held by the runtime but not in-use by Dgraph.
    67  	MemoryIdle = stats.Int64("memory_idle_bytes",
    68  		"Amount of memory in idle spans", stats.UnitBytes)
    69  	// MemoryProc records the amount of memory used in processes.
    70  	MemoryProc = stats.Int64("memory_proc_bytes",
    71  		"Amount of memory used in processes", stats.UnitBytes)
    72  	// ActiveMutations is the current number of active mutations.
    73  	ActiveMutations = stats.Int64("active_mutations_total",
    74  		"Number of active mutations", stats.UnitDimensionless)
    75  	// AlphaHealth status records the current health of the alphas.
    76  	AlphaHealth = stats.Int64("alpha_health_status",
    77  		"Status of the alphas", stats.UnitDimensionless)
    78  	// RaftAppliedIndex records the latest applied RAFT index.
    79  	RaftAppliedIndex = stats.Int64("raft_applied_index",
    80  		"Latest applied Raft index", stats.UnitDimensionless)
    81  	// MaxAssignedTs records the latest max assigned timestamp.
    82  	MaxAssignedTs = stats.Int64("max_assigned_ts",
    83  		"Latest max assigned timestamp", stats.UnitDimensionless)
    84  
    85  	// Conf holds the metrics config.
    86  	// TODO: Request statistics, latencies, 500, timeouts
    87  	Conf *expvar.Map
    88  
    89  	// Tag keys.
    90  
    91  	// KeyStatus is the tag key used to record the status of the server.
    92  	KeyStatus, _ = tag.NewKey("status")
    93  	// KeyMethod is the tag key used to record the method (e.g read or mutate).
    94  	KeyMethod, _ = tag.NewKey("method")
    95  
    96  	// Tag values.
    97  
    98  	// TagValueStatusOK is the tag value used to signal a successful operation.
    99  	TagValueStatusOK = "ok"
   100  	// TagValueStatusError is the tag value used to signal an unsuccessful operation.
   101  	TagValueStatusError = "error"
   102  
   103  	defaultLatencyMsDistribution = view.Distribution(
   104  		0, 0.01, 0.05, 0.1, 0.3, 0.6, 0.8, 1, 2, 3, 4, 5, 6, 8, 10, 13, 16,
   105  		20, 25, 30, 40, 50, 65, 80, 100, 130, 160, 200, 250, 300, 400, 500,
   106  		650, 800, 1000, 2000, 5000, 10000, 20000, 50000, 100000)
   107  
   108  	allTagKeys = []tag.Key{
   109  		KeyStatus, KeyMethod,
   110  	}
   111  
   112  	allViews = []*view.View{
   113  		{
   114  			Name:        LatencyMs.Name(),
   115  			Measure:     LatencyMs,
   116  			Description: LatencyMs.Description(),
   117  			Aggregation: defaultLatencyMsDistribution,
   118  			TagKeys:     allTagKeys,
   119  		},
   120  		{
   121  			Name:        NumQueries.Name(),
   122  			Measure:     NumQueries,
   123  			Description: NumQueries.Description(),
   124  			Aggregation: view.Count(),
   125  			TagKeys:     allTagKeys,
   126  		},
   127  		{
   128  			Name:        NumEdges.Name(),
   129  			Measure:     NumEdges,
   130  			Description: NumEdges.Description(),
   131  			Aggregation: view.Count(),
   132  			TagKeys:     allTagKeys,
   133  		},
   134  		{
   135  			Name:        RaftAppliedIndex.Name(),
   136  			Measure:     RaftAppliedIndex,
   137  			Description: RaftAppliedIndex.Description(),
   138  			Aggregation: view.Count(),
   139  			TagKeys:     allTagKeys,
   140  		},
   141  		{
   142  			Name:        MaxAssignedTs.Name(),
   143  			Measure:     MaxAssignedTs,
   144  			Description: MaxAssignedTs.Description(),
   145  			Aggregation: view.Count(),
   146  			TagKeys:     allTagKeys,
   147  		},
   148  
   149  		// Last value aggregations
   150  		{
   151  			Name:        PendingQueries.Name(),
   152  			Measure:     PendingQueries,
   153  			Description: PendingQueries.Description(),
   154  			Aggregation: view.LastValue(),
   155  			TagKeys:     allTagKeys,
   156  		},
   157  		{
   158  			Name:        PendingProposals.Name(),
   159  			Measure:     PendingProposals,
   160  			Description: PendingProposals.Description(),
   161  			Aggregation: view.LastValue(),
   162  			TagKeys:     allTagKeys,
   163  		},
   164  		{
   165  			Name:        MemoryInUse.Name(),
   166  			Measure:     MemoryInUse,
   167  			Description: MemoryInUse.Description(),
   168  			Aggregation: view.LastValue(),
   169  			TagKeys:     allTagKeys,
   170  		},
   171  		{
   172  			Name:        MemoryIdle.Name(),
   173  			Measure:     MemoryIdle,
   174  			Description: MemoryIdle.Description(),
   175  			Aggregation: view.LastValue(),
   176  			TagKeys:     allTagKeys,
   177  		},
   178  		{
   179  			Name:        MemoryProc.Name(),
   180  			Measure:     MemoryProc,
   181  			Description: MemoryProc.Description(),
   182  			Aggregation: view.LastValue(),
   183  			TagKeys:     allTagKeys,
   184  		},
   185  		{
   186  			Name:        ActiveMutations.Name(),
   187  			Measure:     ActiveMutations,
   188  			Description: ActiveMutations.Description(),
   189  			Aggregation: view.LastValue(),
   190  			TagKeys:     allTagKeys,
   191  		},
   192  		{
   193  			Name:        AlphaHealth.Name(),
   194  			Measure:     AlphaHealth,
   195  			Description: AlphaHealth.Description(),
   196  			Aggregation: view.LastValue(),
   197  			TagKeys:     allTagKeys,
   198  		},
   199  	}
   200  )
   201  
   202  func init() {
   203  	Conf = expvar.NewMap("dgraph_config")
   204  
   205  	ctx := MetricsContext()
   206  	go func() {
   207  		var v string
   208  		ticker := time.NewTicker(5 * time.Second)
   209  		defer ticker.Stop()
   210  		for range ticker.C {
   211  			v = TagValueStatusOK
   212  			if err := HealthCheck(); err != nil {
   213  				v = TagValueStatusError
   214  			}
   215  			cctx, _ := tag.New(ctx, tag.Upsert(KeyStatus, v))
   216  			// TODO: Do we need to set health to zero, or would this tag be sufficient to
   217  			// indicate if Alpha is up but HealthCheck is failing.
   218  			stats.Record(cctx, AlphaHealth.M(1))
   219  		}
   220  	}()
   221  
   222  	CheckfNoTrace(view.Register(allViews...))
   223  
   224  	pe, err := oc_prom.NewExporter(oc_prom.Options{
   225  		Registry:  prometheus.DefaultRegisterer.(*prometheus.Registry),
   226  		Namespace: "dgraph",
   227  		OnError:   func(err error) { glog.Errorf("%v", err) },
   228  	})
   229  	Checkf(err, "Failed to create OpenCensus Prometheus exporter: %v", err)
   230  	view.RegisterExporter(pe)
   231  
   232  	http.Handle("/debug/prometheus_metrics", pe)
   233  }
   234  
   235  // MetricsContext returns a context with tags that are useful for
   236  // distinguishing the state of the running system.
   237  // This context will be used to derive other contexts.
   238  func MetricsContext() context.Context {
   239  	// At the beginning add some distinguishing information
   240  	// to the context as tags that will be propagated when
   241  	// collecting metrics.
   242  	return context.Background()
   243  }
   244  
   245  // WithMethod returns a new updated context with the tag KeyMethod set to the given value.
   246  func WithMethod(parent context.Context, method string) context.Context {
   247  	ctx, err := tag.New(parent, tag.Upsert(KeyMethod, method))
   248  	Check(err)
   249  	return ctx
   250  }
   251  
   252  // SinceMs returns the time since startTime in milliseconds (as a float).
   253  func SinceMs(startTime time.Time) float64 {
   254  	return float64(time.Since(startTime)) / 1e6
   255  }
   256  
   257  // RegisterExporters sets up the services to which metrics will be exported.
   258  func RegisterExporters(conf *viper.Viper, service string) {
   259  	if collector := conf.GetString("jaeger.collector"); len(collector) > 0 {
   260  		// Port details: https://www.jaegertracing.io/docs/getting-started/
   261  		// Default collectorEndpointURI := "http://localhost:14268"
   262  		je, err := jaeger.NewExporter(jaeger.Options{
   263  			Endpoint:    collector,
   264  			ServiceName: service,
   265  		})
   266  		if err != nil {
   267  			log.Fatalf("Failed to create the Jaeger exporter: %v", err)
   268  		}
   269  		// And now finally register it as a Trace Exporter
   270  		trace.RegisterExporter(je)
   271  	}
   272  
   273  	if collector := conf.GetString("datadog.collector"); len(collector) > 0 {
   274  		exporter, err := datadog.NewExporter(datadog.Options{
   275  			Service:   service,
   276  			TraceAddr: collector,
   277  		})
   278  		if err != nil {
   279  			log.Fatal(err)
   280  		}
   281  
   282  		trace.RegisterExporter(exporter)
   283  
   284  		// For demoing purposes, always sample.
   285  		trace.ApplyConfig(trace.Config{
   286  			DefaultSampler: trace.AlwaysSample(),
   287  		})
   288  	}
   289  
   290  	// Exclusively for stats, metrics, etc. Not for tracing.
   291  	// var views = append(ocgrpc.DefaultServerViews, ocgrpc.DefaultClientViews...)
   292  	// if err := view.Register(views...); err != nil {
   293  	// 	glog.Fatalf("Unable to register OpenCensus stats: %v", err)
   294  	// }
   295  }