github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/server/status/recorder.go

     1  // Copyright 2015 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package status
    12  
    13  import (
    14  	"bytes"
    15  	"context"
    16  	"encoding/json"
    17  	"fmt"
    18  	"io"
    19  	"math"
    20  	"os"
    21  	"runtime"
    22  	"strconv"
    23  	"sync/atomic"
    24  	"time"
    25  
    26  	"github.com/cockroachdb/cockroach/pkg/build"
    27  	"github.com/cockroachdb/cockroach/pkg/gossip"
    28  	"github.com/cockroachdb/cockroach/pkg/keys"
    29  	"github.com/cockroachdb/cockroach/pkg/kv"
    30  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
    31  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    32  	"github.com/cockroachdb/cockroach/pkg/rpc"
    33  	"github.com/cockroachdb/cockroach/pkg/server/status/statuspb"
    34  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    35  	"github.com/cockroachdb/cockroach/pkg/ts/tspb"
    36  	"github.com/cockroachdb/cockroach/pkg/util/cgroups"
    37  	"github.com/cockroachdb/cockroach/pkg/util/envutil"
    38  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    39  	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
    40  	"github.com/cockroachdb/cockroach/pkg/util/log"
    41  	"github.com/cockroachdb/cockroach/pkg/util/metric"
    42  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    43  	"github.com/cockroachdb/errors"
    44  	humanize "github.com/dustin/go-humanize"
    45  	"github.com/elastic/gosigar"
    46  )
    47  
    48  const (
    49  	// storeTimeSeriesPrefix is the common prefix for time series keys which
    50  	// record store-specific data.
    51  	storeTimeSeriesPrefix = "cr.store.%s"
    52  	// nodeTimeSeriesPrefix is the common prefix for time series keys which
    53  	// record node-specific data.
    54  	nodeTimeSeriesPrefix = "cr.node.%s"
    55  
    56  	advertiseAddrLabelKey = "advertise-addr"
    57  	httpAddrLabelKey      = "http-addr"
    58  	sqlAddrLabelKey       = "sql-addr"
    59  )
    60  
    61  type quantile struct {
    62  	suffix   string
    63  	quantile float64
    64  }
    65  
    66  var recordHistogramQuantiles = []quantile{
    67  	{"-max", 100},
    68  	{"-p99.999", 99.999},
    69  	{"-p99.99", 99.99},
    70  	{"-p99.9", 99.9},
    71  	{"-p99", 99},
    72  	{"-p90", 90},
    73  	{"-p75", 75},
    74  	{"-p50", 50},
    75  }
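
        // For illustration: when eachRecordableValue (below) encounters a histogram,
        // it records one value per entry in recordHistogramQuantiles, taken from the
        // histogram's current window. A hypothetical histogram named "exec.latency"
        // (the name is an example, not one defined in this file) would thus be
        // recorded as "exec.latency-max", "exec.latency-p99.999", ...,
        // "exec.latency-p50".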
    76  
    77  // storeMetrics is the minimum interface of the storage.Store object needed by
    78  // MetricsRecorder to provide status summaries. This is used instead of Store
    79  // directly in order to simplify testing.
    80  type storeMetrics interface {
    81  	StoreID() roachpb.StoreID
    82  	Descriptor(bool) (*roachpb.StoreDescriptor, error)
    83  	Registry() *metric.Registry
    84  }
    85  
    86  // MetricsRecorder is used to periodically record the information in a number of
    87  // metric registries.
    88  //
    89  // Two types of registries are maintained: "node-level" registries, provided by
    90  // node-level systems, and "store-level" registries which are provided by each
    91  // store hosted by the node. There are slight differences in the way these are
    92  // recorded, and they are thus kept separate.
    93  type MetricsRecorder struct {
    94  	*HealthChecker
    95  	gossip       *gossip.Gossip
    96  	nodeLiveness *kvserver.NodeLiveness
    97  	rpcContext   *rpc.Context
    98  	settings     *cluster.Settings
    99  	clock        *hlc.Clock
   100  
   101  	// Counts to help optimize slice allocation. Should only be accessed atomically.
   102  	lastDataCount        int64
   103  	lastSummaryCount     int64
   104  	lastNodeMetricCount  int64
   105  	lastStoreMetricCount int64
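        	// These counts are written back with CompareAndSwap at the end of each
        	// recording pass; they are only pre-allocation hints, so a lost race
        	// between concurrent passes merely leaves a slightly stale hint.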
   106  
   107  	// mu synchronizes the reading of node/store registries against the adding of
   108  	// nodes/stores. Consequently, almost all uses of it only need to take an
   109  	// RLock on it.
   110  	mu struct {
   111  		syncutil.RWMutex
   112  		// nodeRegistry contains, as subregistries, the multiple component-specific
   113  		// registries which are recorded as "node level" metrics.
   114  		nodeRegistry *metric.Registry
   115  		desc         roachpb.NodeDescriptor
   116  		startedAt    int64
   117  
   118  		// storeRegistries contains a registry for each store on the node. These
   119  		// are not stored as subregistries, but rather are treated as wholly
   120  		// independent.
   121  		storeRegistries map[roachpb.StoreID]*metric.Registry
   122  		stores          map[roachpb.StoreID]storeMetrics
   123  	}
   124  	// PrometheusExporter is not thread-safe even for operations that are
   125  	// logically read-only, but we don't want to block using it just because
   126  	// another goroutine is reading from the registries (i.e. using
   127  	// `mu.RLock()`), so we use a separate mutex just for prometheus.
   128  	// NOTE: promMu should always be locked BEFORE trying to lock mu.
   129  	promMu struct {
   130  		syncutil.Mutex
   131  		// prometheusExporter merges metrics into families and generates the
   132  		// prometheus text format.
   133  		prometheusExporter metric.PrometheusExporter
   134  	}
   135  	// WriteNodeStatus is a potentially long-running method (with a network
   136  	// round-trip) that requires a mutex to be safe for concurrent usage. We
   137  	// therefore give it its own mutex to avoid blocking other methods.
   138  	writeSummaryMu syncutil.Mutex
   139  }
   140  
   141  // NewMetricsRecorder initializes a new MetricsRecorder object that uses the
   142  // given clock, node liveness, RPC context, gossip, and cluster settings.
   143  func NewMetricsRecorder(
   144  	clock *hlc.Clock,
   145  	nodeLiveness *kvserver.NodeLiveness,
   146  	rpcContext *rpc.Context,
   147  	gossip *gossip.Gossip,
   148  	settings *cluster.Settings,
   149  ) *MetricsRecorder {
   150  	mr := &MetricsRecorder{
   151  		HealthChecker: NewHealthChecker(trackedMetrics),
   152  		nodeLiveness:  nodeLiveness,
   153  		rpcContext:    rpcContext,
   154  		gossip:        gossip,
   155  		settings:      settings,
   156  	}
   157  	mr.mu.storeRegistries = make(map[roachpb.StoreID]*metric.Registry)
   158  	mr.mu.stores = make(map[roachpb.StoreID]storeMetrics)
   159  	mr.promMu.prometheusExporter = metric.MakePrometheusExporter()
   160  	mr.clock = clock
   161  	return mr
   162  }
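
        // A rough sketch of the intended lifecycle, assembled from the methods in this
        // file (not a prescribed contract): the server constructs the recorder with
        // NewMetricsRecorder, calls AddNode once the node registry, descriptor, and
        // addresses are known, and AddStore for each started store. It then
        // periodically calls GetTimeSeriesData to feed the internal time series
        // system, GenerateNodeStatus/WriteNodeStatus to persist status summaries, and
        // PrintAsText (e.g. for a metrics endpoint) to expose Prometheus-format text.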
   163  
   164  // AddNode adds the Registry from an initialized node, along with its descriptor
   165  // and start time.
   166  func (mr *MetricsRecorder) AddNode(
   167  	reg *metric.Registry,
   168  	desc roachpb.NodeDescriptor,
   169  	startedAt int64,
   170  	advertiseAddr, httpAddr, sqlAddr string,
   171  ) {
   172  	mr.mu.Lock()
   173  	defer mr.mu.Unlock()
   174  	mr.mu.nodeRegistry = reg
   175  	mr.mu.desc = desc
   176  	mr.mu.startedAt = startedAt
   177  
   178  	// Create a node ID gauge metric with the advertised RPC, HTTP, and SQL addresses as labels.
   179  	metadata := metric.Metadata{
   180  		Name:        "node-id",
   181  		Help:        "node ID with labels for advertised RPC and HTTP addresses",
   182  		Measurement: "Node ID",
   183  		Unit:        metric.Unit_CONST,
   184  	}
   185  
   186  	metadata.AddLabel(advertiseAddrLabelKey, advertiseAddr)
   187  	metadata.AddLabel(httpAddrLabelKey, httpAddr)
   188  	metadata.AddLabel(sqlAddrLabelKey, sqlAddr)
   189  	nodeIDGauge := metric.NewGauge(metadata)
   190  	nodeIDGauge.Update(int64(desc.NodeID))
   191  	reg.AddMetric(nodeIDGauge)
   192  }
   193  
   194  // AddStore adds the Registry from the provided store as a store-level registry
   195  // in this recorder. A reference to the store is kept for the purpose of
   196  // gathering some additional information which is present in store status
   197  // summaries.
   198  // Stores should only be added to the recorder after they have been started.
   199  func (mr *MetricsRecorder) AddStore(store storeMetrics) {
   200  	mr.mu.Lock()
   201  	defer mr.mu.Unlock()
   202  	storeID := store.StoreID()
   203  	store.Registry().AddLabel("store", strconv.Itoa(int(storeID)))
   204  	mr.mu.storeRegistries[storeID] = store.Registry()
   205  	mr.mu.stores[storeID] = store
   206  }
   207  
   208  // MarshalJSON returns an appropriate JSON representation of the current values
   209  // of the metrics being tracked by this recorder.
   210  func (mr *MetricsRecorder) MarshalJSON() ([]byte, error) {
   211  	mr.mu.RLock()
   212  	defer mr.mu.RUnlock()
   213  	if mr.mu.nodeRegistry == nil {
   214  		// We haven't yet processed initialization information; return an empty
   215  		// JSON object.
   216  		if log.V(1) {
   217  			log.Warning(context.TODO(), "MetricsRecorder.MarshalJSON() called before NodeID allocation")
   218  		}
   219  		return []byte("{}"), nil
   220  	}
   221  	topLevel := map[string]interface{}{
   222  		fmt.Sprintf("node.%d", mr.mu.desc.NodeID): mr.mu.nodeRegistry,
   223  	}
   224  	// Add collection of stores to top level. JSON requires that keys be strings,
   225  	// so we must convert the store ID to a string.
   226  	storeLevel := make(map[string]interface{})
   227  	for id, reg := range mr.mu.storeRegistries {
   228  		storeLevel[strconv.Itoa(int(id))] = reg
   229  	}
   230  	topLevel["stores"] = storeLevel
   231  	return json.Marshal(topLevel)
   232  }
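
        // For illustration, the marshaled JSON has the following shape (registry
        // contents elided; node and store IDs are hypothetical):
        //
        //	{
        //	  "node.1": { ...node-level metrics... },
        //	  "stores": {
        //	    "1": { ...store 1 metrics... },
        //	    "2": { ...store 2 metrics... }
        //	  }
        //	}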
   233  
   234  // scrapePrometheusLocked updates the prometheusExporter's metrics snapshot.
   235  func (mr *MetricsRecorder) scrapePrometheusLocked() {
   236  	mr.scrapeIntoPrometheus(&mr.promMu.prometheusExporter)
   237  }
   238  
   239  // scrapeIntoPrometheus updates the passed-in prometheusExporter's metrics
   240  // snapshot.
   241  func (mr *MetricsRecorder) scrapeIntoPrometheus(pm *metric.PrometheusExporter) {
   242  	mr.mu.RLock()
   243  	defer mr.mu.RUnlock()
   244  	if mr.mu.nodeRegistry == nil {
   245  		// We haven't yet processed initialization information; output nothing.
   246  		if log.V(1) {
   247  			log.Warning(context.TODO(), "MetricsRecorder asked to scrape metrics before NodeID allocation")
   248  		}
        		return
   249  	}
   250  
   251  	pm.ScrapeRegistry(mr.mu.nodeRegistry)
   252  	for _, reg := range mr.mu.storeRegistries {
   253  		pm.ScrapeRegistry(reg)
   254  	}
   255  }
   256  
   257  // PrintAsText writes the current metric values as plain text to the writer.
   258  // We write metrics to a temporary buffer which is then copied to the writer.
   259  // This prevents a hanging request from holding the lock indefinitely.
   260  func (mr *MetricsRecorder) PrintAsText(w io.Writer) error {
   261  	var buf bytes.Buffer
   262  	if err := mr.lockAndPrintAsText(&buf); err != nil {
   263  		return err
   264  	}
   265  	_, err := buf.WriteTo(w)
   266  	return err
   267  }
   268  
   269  // lockAndPrintAsText grabs the recorder lock and generates the prometheus
   270  // metrics page.
   271  func (mr *MetricsRecorder) lockAndPrintAsText(w io.Writer) error {
   272  	mr.promMu.Lock()
   273  	defer mr.promMu.Unlock()
   274  	mr.scrapePrometheusLocked()
   275  	return mr.promMu.prometheusExporter.PrintAsText(w)
   276  }
   277  
   278  // ExportToGraphite sends the current metric values to a Graphite server.
   279  // It scrapes into the provided PrometheusExporter rather than
   280  // mr.promMu.prometheusExporter, avoiding races with the latter; the extra
   281  // memory allocations this entails are acceptable.
   282  func (mr *MetricsRecorder) ExportToGraphite(
   283  	ctx context.Context, endpoint string, pm *metric.PrometheusExporter,
   284  ) error {
   285  	mr.scrapeIntoPrometheus(pm)
   286  	graphiteExporter := metric.MakeGraphiteExporter(pm)
   287  	return graphiteExporter.Push(ctx, endpoint)
   288  }
   289  
   290  // GetTimeSeriesData serializes registered metrics for consumption by
   291  // CockroachDB's time series system.
   292  func (mr *MetricsRecorder) GetTimeSeriesData() []tspb.TimeSeriesData {
   293  	mr.mu.RLock()
   294  	defer mr.mu.RUnlock()
   295  
   296  	if mr.mu.nodeRegistry == nil {
   297  		// We haven't yet processed initialization information; do nothing.
   298  		if log.V(1) {
   299  			log.Warning(context.TODO(), "MetricsRecorder.GetTimeSeriesData() called before NodeID allocation")
   300  		}
   301  		return nil
   302  	}
   303  
   304  	lastDataCount := atomic.LoadInt64(&mr.lastDataCount)
   305  	data := make([]tspb.TimeSeriesData, 0, lastDataCount)
   306  
   307  	// Record time series from node-level registries.
   308  	now := mr.clock.PhysicalNow()
   309  	recorder := registryRecorder{
   310  		registry:       mr.mu.nodeRegistry,
   311  		format:         nodeTimeSeriesPrefix,
   312  		source:         strconv.FormatInt(int64(mr.mu.desc.NodeID), 10),
   313  		timestampNanos: now,
   314  	}
   315  	recorder.record(&data)
   316  
   317  	// Record time series from store-level registries.
   318  	for storeID, r := range mr.mu.storeRegistries {
   319  		storeRecorder := registryRecorder{
   320  			registry:       r,
   321  			format:         storeTimeSeriesPrefix,
   322  			source:         strconv.FormatInt(int64(storeID), 10),
   323  			timestampNanos: now,
   324  		}
   325  		storeRecorder.record(&data)
   326  	}
   327  	atomic.CompareAndSwapInt64(&mr.lastDataCount, lastDataCount, int64(len(data)))
   328  	return data
   329  }
   330  
   331  // GetMetricsMetadata returns the metadata for all metrics tracked in the node's
   332  // nodeRegistry and in one arbitrarily chosen storeRegistry (all stores expose the same metadata).
   333  func (mr *MetricsRecorder) GetMetricsMetadata() map[string]metric.Metadata {
   334  	mr.mu.Lock()
   335  	defer mr.mu.Unlock()
   336  
   337  	if mr.mu.nodeRegistry == nil {
   338  		// We haven't yet processed initialization information; do nothing.
   339  		if log.V(1) {
   340  			log.Warning(context.TODO(), "MetricsRecorder.GetMetricsMetadata() called before NodeID allocation")
   341  		}
   342  		return nil
   343  	}
   344  
   345  	metrics := make(map[string]metric.Metadata)
   346  
   347  	mr.mu.nodeRegistry.WriteMetricsMetadata(metrics)
   348  
   349  	// Get a random storeID.
   350  	var sID roachpb.StoreID
   351  
   352  	for storeID := range mr.mu.storeRegistries {
   353  		sID = storeID
   354  		break
   355  	}
   356  
   357  	// Get metric metadata from that store because all stores have the same metadata.
   358  	mr.mu.storeRegistries[sID].WriteMetricsMetadata(metrics)
   359  
   360  	return metrics
   361  }
   362  
   363  // getNetworkActivity produces a map describing network activity between this
   364  // node and every other known node: incoming throughput, outgoing throughput,
   365  // and average latency. Throughputs are reported in bytes and latencies in
   366  // nanoseconds.
   367  func (mr *MetricsRecorder) getNetworkActivity(
   368  	ctx context.Context,
   369  ) map[roachpb.NodeID]statuspb.NodeStatus_NetworkActivity {
   370  	activity := make(map[roachpb.NodeID]statuspb.NodeStatus_NetworkActivity)
   371  	if mr.nodeLiveness != nil && mr.gossip != nil {
   372  		isLiveMap := mr.nodeLiveness.GetIsLiveMap()
   373  
   374  		throughputMap := mr.rpcContext.GetStatsMap()
   375  		var currentAverages map[string]time.Duration
   376  		if mr.rpcContext.RemoteClocks != nil {
   377  			currentAverages = mr.rpcContext.RemoteClocks.AllLatencies()
   378  		}
   379  		for nodeID, entry := range isLiveMap {
   380  			address, err := mr.gossip.GetNodeIDAddress(nodeID)
   381  			if err != nil {
   382  				if entry.IsLive {
   383  					log.Warningf(ctx, "%v", err)
   384  				}
   385  				continue
   386  			}
   387  			na := statuspb.NodeStatus_NetworkActivity{}
   388  			key := address.String()
   389  			if tp, ok := throughputMap.Load(key); ok {
   390  				stats := tp.(*rpc.Stats)
   391  				na.Incoming = stats.Incoming()
   392  				na.Outgoing = stats.Outgoing()
   393  			}
   394  			if entry.IsLive {
   395  				if latency, ok := currentAverages[key]; ok {
   396  					na.Latency = latency.Nanoseconds()
   397  				}
   398  			}
   399  			activity[nodeID] = na
   400  		}
   401  	}
   402  	return activity
   403  }
   404  
   405  // GenerateNodeStatus returns a status summary message for the node. The summary
   406  // includes the recent values of metrics for both the node and all of its
   407  // component stores. When the node isn't initialized yet, nil is returned.
   408  func (mr *MetricsRecorder) GenerateNodeStatus(ctx context.Context) *statuspb.NodeStatus {
   409  	activity := mr.getNetworkActivity(ctx)
   410  
   411  	mr.mu.RLock()
   412  	defer mr.mu.RUnlock()
   413  
   414  	if mr.mu.nodeRegistry == nil {
   415  		// We haven't yet processed initialization information; do nothing.
   416  		if log.V(1) {
   417  			log.Warning(ctx, "attempt to generate status summary before NodeID allocation.")
   418  		}
   419  		return nil
   420  	}
   421  
   422  	now := mr.clock.PhysicalNow()
   423  
   424  	lastSummaryCount := atomic.LoadInt64(&mr.lastSummaryCount)
   425  	lastNodeMetricCount := atomic.LoadInt64(&mr.lastNodeMetricCount)
   426  	lastStoreMetricCount := atomic.LoadInt64(&mr.lastStoreMetricCount)
   427  
   428  	systemMemory, _, err := GetTotalMemoryWithoutLogging()
   429  	if err != nil {
   430  		log.Errorf(ctx, "could not get total system memory: %v", err)
   431  	}
   432  
   433  	// Generate a node status with no store data.
   434  	nodeStat := &statuspb.NodeStatus{
   435  		Desc:              mr.mu.desc,
   436  		BuildInfo:         build.GetInfo(),
   437  		UpdatedAt:         now,
   438  		StartedAt:         mr.mu.startedAt,
   439  		StoreStatuses:     make([]statuspb.StoreStatus, 0, lastSummaryCount),
   440  		Metrics:           make(map[string]float64, lastNodeMetricCount),
   441  		Args:              os.Args,
   442  		Env:               envutil.GetEnvVarsUsed(),
   443  		Activity:          activity,
   444  		NumCpus:           int32(runtime.NumCPU()),
   445  		TotalSystemMemory: systemMemory,
   446  	}
   447  
   448  	eachRecordableValue(mr.mu.nodeRegistry, func(name string, val float64) {
   449  		nodeStat.Metrics[name] = val
   450  	})
   451  
   452  	// Generate status summaries for stores.
   453  	for storeID, r := range mr.mu.storeRegistries {
   454  		storeMetrics := make(map[string]float64, lastStoreMetricCount)
   455  		eachRecordableValue(r, func(name string, val float64) {
   456  			storeMetrics[name] = val
   457  		})
   458  
   459  		// Gather descriptor from store.
   460  		descriptor, err := mr.mu.stores[storeID].Descriptor(false /* useCached */)
   461  		if err != nil {
   462  			log.Errorf(ctx, "Could not record status summaries: Store %d could not return descriptor, error: %s", storeID, err)
   463  			continue
   464  		}
   465  
   466  		nodeStat.StoreStatuses = append(nodeStat.StoreStatuses, statuspb.StoreStatus{
   467  			Desc:    *descriptor,
   468  			Metrics: storeMetrics,
   469  		})
   470  	}
   471  
   472  	atomic.CompareAndSwapInt64(
   473  		&mr.lastSummaryCount, lastSummaryCount, int64(len(nodeStat.StoreStatuses)))
   474  	atomic.CompareAndSwapInt64(
   475  		&mr.lastNodeMetricCount, lastNodeMetricCount, int64(len(nodeStat.Metrics)))
   476  	if len(nodeStat.StoreStatuses) > 0 {
   477  		atomic.CompareAndSwapInt64(
   478  			&mr.lastStoreMetricCount, lastStoreMetricCount, int64(len(nodeStat.StoreStatuses[0].Metrics)))
   479  	}
   480  
   481  	return nodeStat
   482  }
   483  
   484  // WriteNodeStatus writes the supplied summary to the given client.
   485  func (mr *MetricsRecorder) WriteNodeStatus(
   486  	ctx context.Context, db *kv.DB, nodeStatus statuspb.NodeStatus,
   487  ) error {
   488  	mr.writeSummaryMu.Lock()
   489  	defer mr.writeSummaryMu.Unlock()
   490  	key := keys.NodeStatusKey(nodeStatus.Desc.NodeID)
   491  	// We use PutInline to store only a single version of the node status.
   492  	// There's not much point in keeping the historical versions as we keep
   493  	// all of the constituent data as timeseries. Further, due to the size
   494  	// of the build info in the node status, writing one of these every 10s
   495  	// will generate more versions than will easily fit into a range over
   496  	// the course of a day.
   497  	if err := db.PutInline(ctx, key, &nodeStatus); err != nil {
   498  		return err
   499  	}
   500  	if log.V(2) {
   501  		statusJSON, err := json.Marshal(&nodeStatus)
   502  		if err != nil {
   503  			log.Errorf(ctx, "error marshaling nodeStatus to json: %s", err)
   504  		}
   505  		log.Infof(ctx, "node %d status: %s", nodeStatus.Desc.NodeID, statusJSON)
   506  	}
   507  	return nil
   508  }
   509  
   510  // registryRecorder is a helper class for recording time series datapoints
   511  // from a metrics Registry.
   512  type registryRecorder struct {
   513  	registry       *metric.Registry
   514  	format         string
   515  	source         string
   516  	timestampNanos int64
   517  }
   518  
   519  func extractValue(mtr interface{}) (float64, error) {
   520  	// TODO(tschottdorf|mrtracy): consider moving this switch to an interface
   521  	// implemented by the individual metric types.
   522  	switch mtr := mtr.(type) {
   523  	case float64:
   524  		return mtr, nil
   525  	case *metric.Counter:
   526  		return float64(mtr.Count()), nil
   527  	case *metric.Gauge:
   528  		return float64(mtr.Value()), nil
   529  	case *metric.Rate:
   530  		return mtr.Value(), nil
   531  	case *metric.GaugeFloat64:
   532  		return mtr.Value(), nil
   533  	default:
   534  		return 0, errors.Errorf("cannot extract value for type %T", mtr)
   535  	}
   536  }
   537  
   538  // eachRecordableValue visits each metric in the registry, calling the supplied
   539  // function once for each recordable value represented by that metric. This is
   540  // useful to expand certain metric types (such as histograms) into multiple
   541  // recordable values.
   542  func eachRecordableValue(reg *metric.Registry, fn func(string, float64)) {
   543  	reg.Each(func(name string, mtr interface{}) {
   544  		if histogram, ok := mtr.(*metric.Histogram); ok {
   545  			// TODO(mrtracy): Where should this comment go for better
   546  			// visibility?
   547  			//
   548  			// Proper support of Histograms for time series is difficult and
   549  			// likely not worth the trouble. Instead, we aggregate a windowed
   550  			// histogram at fixed quantiles. If the scraping window and the
   551  			// histogram's eviction duration are similar, this should give
   552  			// good results; if the two durations are very different, we either
   553  			// report stale results or report only the more recent data.
   554  			//
   555  			// Additionally, we can only aggregate max/min of the quantiles;
   556  			// roll-ups don't know that and so they will return mathematically
   557  			// nonsensical values, but that seems acceptable for the time
   558  			// being.
   559  			curr, _ := histogram.Windowed()
   560  			for _, pt := range recordHistogramQuantiles {
   561  				fn(name+pt.suffix, float64(curr.ValueAtQuantile(pt.quantile)))
   562  			}
   563  		} else {
   564  			val, err := extractValue(mtr)
   565  			if err != nil {
   566  				log.Warningf(context.TODO(), "%v", err)
   567  				return
   568  			}
   569  			fn(name, val)
   570  		}
   571  	})
   572  }
   573  
   574  func (rr registryRecorder) record(dest *[]tspb.TimeSeriesData) {
   575  	eachRecordableValue(rr.registry, func(name string, val float64) {
   576  		*dest = append(*dest, tspb.TimeSeriesData{
   577  			Name:   fmt.Sprintf(rr.format, name),
   578  			Source: rr.source,
   579  			Datapoints: []tspb.TimeSeriesDatapoint{
   580  				{
   581  					TimestampNanos: rr.timestampNanos,
   582  					Value:          val,
   583  				},
   584  			},
   585  		})
   586  	})
   587  }
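
        // For illustration (metric name and values are hypothetical): recording a
        // node-level gauge "foo" with value 3 on node 1 at time t produces
        //
        //	tspb.TimeSeriesData{
        //		Name:   "cr.node.foo",
        //		Source: "1",
        //		Datapoints: []tspb.TimeSeriesDatapoint{{TimestampNanos: t, Value: 3}},
        //	}
        //
        // that is, one series per metric per source, with a single datapoint per pass.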
   588  
   589  // GetTotalMemory returns the total system memory (in bytes) or, on Linux, the
   590  // cgroup memory limit when one is set and is smaller than system memory.
   591  func GetTotalMemory(ctx context.Context) (int64, error) {
   592  	memory, warning, err := GetTotalMemoryWithoutLogging()
   593  	if err != nil {
   594  		return 0, err
   595  	}
   596  	if warning != "" {
   597  		log.Infof(ctx, "%s", warning)
   598  	}
   599  	return memory, nil
   600  }
   601  
   602  // GetTotalMemoryWithoutLogging is the same as GetTotalMemory, but returns any warning
   603  // as a string instead of logging it.
   604  func GetTotalMemoryWithoutLogging() (int64, string, error) {
   605  	totalMem, err := func() (int64, error) {
   606  		mem := gosigar.Mem{}
   607  		if err := mem.Get(); err != nil {
   608  			return 0, err
   609  		}
   610  		if mem.Total > math.MaxInt64 {
   611  			return 0, fmt.Errorf("inferred memory size %s exceeds maximum supported memory size %s",
   612  				humanize.IBytes(mem.Total), humanize.Bytes(math.MaxInt64))
   613  		}
   614  		return int64(mem.Total), nil
   615  	}()
   616  	if err != nil {
   617  		return 0, "", err
   618  	}
   619  	checkTotal := func(x int64, warning string) (int64, string, error) {
   620  		if x <= 0 {
   621  			// https://github.com/elastic/gosigar/issues/72
   622  			return 0, warning, fmt.Errorf("inferred memory size %d is suspicious, considering invalid", x)
   623  		}
   624  		return x, warning, nil
   625  	}
   626  	if runtime.GOOS != "linux" {
   627  		return checkTotal(totalMem, "")
   628  	}
   629  	cgAvlMem, warning, err := cgroups.GetMemoryLimit()
   630  	if err != nil {
   631  		return checkTotal(totalMem,
   632  			fmt.Sprintf("available memory from cgroups is unsupported, using system memory %s instead: %v",
   633  				humanizeutil.IBytes(totalMem), err))
   634  	}
   635  	if cgAvlMem == 0 || (totalMem > 0 && cgAvlMem > totalMem) {
   636  		return checkTotal(totalMem,
   637  			fmt.Sprintf("available memory from cgroups (%s) is unsupported, using system memory %s instead: %s",
   638  				humanize.IBytes(uint64(cgAvlMem)), humanizeutil.IBytes(totalMem), warning))
   639  	}
   640  	return checkTotal(cgAvlMem, "")
   641  }
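
        // A minimal usage sketch (the fraction and variable name below are
        // hypothetical, not part of this package): callers typically derive memory
        // budgets from the detected total, e.g.
        //
        //	total, err := GetTotalMemory(ctx)
        //	if err != nil {
        //		return err
        //	}
        //	cacheBudget := total / 4 // leave headroom for the Go runtime and OS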