github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/server/status/recorder_test.go

github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/server/status/recorder_test.go (about)

     1  // Copyright 2015 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package status
    12  
    13  import (
    14  	"context"
    15  	"io/ioutil"
    16  	"os"
    17  	"reflect"
    18  	"runtime"
    19  	"sort"
    20  	"strconv"
    21  	"sync"
    22  	"testing"
    23  	"time"
    24  
    25  	"github.com/cockroachdb/cockroach/pkg/build"
    26  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    27  	"github.com/cockroachdb/cockroach/pkg/server/status/statuspb"
    28  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    29  	"github.com/cockroachdb/cockroach/pkg/ts/tspb"
    30  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    31  	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
    32  	"github.com/cockroachdb/cockroach/pkg/util/metric"
    33  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    34  	"github.com/kr/pretty"
    35  )
    36  
    37  // byTimeAndName is a slice of tspb.TimeSeriesData.
    38  type byTimeAndName []tspb.TimeSeriesData
    39  
    40  // implement sort.Interface for byTimeAndName
    41  func (a byTimeAndName) Len() int      { return len(a) }
    42  func (a byTimeAndName) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
    43  func (a byTimeAndName) Less(i, j int) bool {
    44  	if a[i].Name != a[j].Name {
    45  		return a[i].Name < a[j].Name
    46  	}
    47  	if a[i].Datapoints[0].TimestampNanos != a[j].Datapoints[0].TimestampNanos {
    48  		return a[i].Datapoints[0].TimestampNanos < a[j].Datapoints[0].TimestampNanos
    49  	}
    50  	return a[i].Source < a[j].Source
    51  }
    52  
    53  var _ sort.Interface = byTimeAndName{}
    54  
    55  // byStoreID is a slice of roachpb.StoreID.
    56  type byStoreID []roachpb.StoreID
    57  
    58  // implement sort.Interface for byStoreID
    59  func (a byStoreID) Len() int      { return len(a) }
    60  func (a byStoreID) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
    61  func (a byStoreID) Less(i, j int) bool {
    62  	return a[i] < a[j]
    63  }
    64  
    65  var _ sort.Interface = byStoreID{}
    66  
    67  // byStoreDescID is a slice of storage.StoreStatus
    68  type byStoreDescID []statuspb.StoreStatus
    69  
    70  // implement sort.Interface for byStoreDescID.
    71  func (a byStoreDescID) Len() int      { return len(a) }
    72  func (a byStoreDescID) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
    73  func (a byStoreDescID) Less(i, j int) bool {
    74  	return a[i].Desc.StoreID < a[j].Desc.StoreID
    75  }
    76  
    77  var _ sort.Interface = byStoreDescID{}
    78  
    79  // fakeStore implements only the methods of store needed by MetricsRecorder to
    80  // interact with stores.
    81  type fakeStore struct {
    82  	storeID  roachpb.StoreID
    83  	desc     roachpb.StoreDescriptor
    84  	registry *metric.Registry
    85  }
    86  
    87  func (fs fakeStore) StoreID() roachpb.StoreID {
    88  	return fs.storeID
    89  }
    90  
    91  func (fs fakeStore) Descriptor(_ bool) (*roachpb.StoreDescriptor, error) {
    92  	return &fs.desc, nil
    93  }
    94  
    95  func (fs fakeStore) Registry() *metric.Registry {
    96  	return fs.registry
    97  }
    98  
    99  // TestMetricsRecorder verifies that the metrics recorder properly formats the
   100  // statistics from various registries, both for Time Series and for Status
   101  // Summaries.
   102  func TestMetricsRecorder(t *testing.T) {
   103  	defer leaktest.AfterTest(t)()
   104  
   105  	// ========================================
   106  	// Construct a series of fake descriptors for use in test.
   107  	// ========================================
   108  	nodeDesc := roachpb.NodeDescriptor{
   109  		NodeID: roachpb.NodeID(1),
   110  	}
   111  	storeDesc1 := roachpb.StoreDescriptor{
   112  		StoreID: roachpb.StoreID(1),
   113  		Capacity: roachpb.StoreCapacity{
   114  			Capacity:  100,
   115  			Available: 50,
   116  			Used:      50,
   117  		},
   118  	}
   119  	storeDesc2 := roachpb.StoreDescriptor{
   120  		StoreID: roachpb.StoreID(2),
   121  		Capacity: roachpb.StoreCapacity{
   122  			Capacity:  200,
   123  			Available: 75,
   124  			Used:      125,
   125  		},
   126  	}
   127  
   128  	// ========================================
   129  	// Create registries and add them to the recorder (two node-level, two
   130  	// store-level).
   131  	// ========================================
   132  	reg1 := metric.NewRegistry()
   133  	store1 := fakeStore{
   134  		storeID:  roachpb.StoreID(1),
   135  		desc:     storeDesc1,
   136  		registry: metric.NewRegistry(),
   137  	}
   138  	store2 := fakeStore{
   139  		storeID:  roachpb.StoreID(2),
   140  		desc:     storeDesc2,
   141  		registry: metric.NewRegistry(),
   142  	}
   143  	manual := hlc.NewManualClock(100)
   144  	st := cluster.MakeTestingClusterSettings()
   145  	recorder := NewMetricsRecorder(hlc.NewClock(manual.UnixNano, time.Nanosecond), nil, nil, nil, st)
   146  	recorder.AddStore(store1)
   147  	recorder.AddStore(store2)
   148  	recorder.AddNode(reg1, nodeDesc, 50, "foo:26257", "foo:26258", "foo:5432")
   149  
   150  	// Ensure the metric system's view of time does not advance during this test
   151  	// as the test expects time to not advance too far which would age the actual
   152  	// data (e.g. in histogram's) unexpectedly.
   153  	defer metric.TestingSetNow(func() time.Time {
   154  		return timeutil.Unix(0, manual.UnixNano())
   155  	})()
   156  
   157  	// ========================================
   158  	// Generate Metrics Data & Expected Results
   159  	// ========================================
   160  
   161  	// Flatten the four registries into an array for ease of use.
   162  	regList := []struct {
   163  		reg    *metric.Registry
   164  		prefix string
   165  		source int64
   166  		isNode bool
   167  	}{
   168  		{
   169  			reg:    reg1,
   170  			prefix: "one.",
   171  			source: 1,
   172  			isNode: true,
   173  		},
   174  		{
   175  			reg:    reg1,
   176  			prefix: "two.",
   177  			source: 1,
   178  			isNode: true,
   179  		},
   180  		{
   181  			reg:    store1.registry,
   182  			prefix: "",
   183  			source: int64(store1.storeID),
   184  			isNode: false,
   185  		},
   186  		{
   187  			reg:    store2.registry,
   188  			prefix: "",
   189  			source: int64(store2.storeID),
   190  			isNode: false,
   191  		},
   192  	}
   193  
   194  	// Every registry will have a copy of the following metrics.
   195  	metricNames := []struct {
   196  		name string
   197  		typ  string
   198  		val  int64
   199  	}{
   200  		{"testGauge", "gauge", 20},
   201  		{"testGaugeFloat64", "floatgauge", 20},
   202  		{"testCounter", "counter", 5},
   203  		{"testHistogram", "histogram", 10},
   204  		{"testLatency", "latency", 10},
   205  
   206  		// Stats needed for store summaries.
   207  		{"ranges", "counter", 1},
   208  		{"replicas.leaders", "gauge", 1},
   209  		{"replicas.leaseholders", "gauge", 1},
   210  		{"ranges", "gauge", 1},
   211  		{"ranges.unavailable", "gauge", 1},
   212  		{"ranges.underreplicated", "gauge", 1},
   213  	}
   214  
   215  	// Add the metrics to each registry and set their values. At the same time,
   216  	// generate expected time series results and status summary metric values.
   217  	var expected []tspb.TimeSeriesData
   218  	expectedNodeSummaryMetrics := make(map[string]float64)
   219  	expectedStoreSummaryMetrics := make(map[string]float64)
   220  
   221  	// addExpected generates expected data for a single metric data point.
   222  	addExpected := func(prefix, name string, source, time, val int64, isNode bool) {
   223  		// Generate time series data.
   224  		tsPrefix := "cr.node."
   225  		if !isNode {
   226  			tsPrefix = "cr.store."
   227  		}
   228  		expect := tspb.TimeSeriesData{
   229  			Name:   tsPrefix + prefix + name,
   230  			Source: strconv.FormatInt(source, 10),
   231  			Datapoints: []tspb.TimeSeriesDatapoint{
   232  				{
   233  					TimestampNanos: time,
   234  					Value:          float64(val),
   235  				},
   236  			},
   237  		}
   238  		expected = append(expected, expect)
   239  
   240  		// Generate status summary data.
   241  		if isNode {
   242  			expectedNodeSummaryMetrics[prefix+name] = float64(val)
   243  		} else {
   244  			// This can overwrite the previous value, but this is expected as
   245  			// all stores in our tests have identical values; when comparing
   246  			// status summaries, the same map is used as expected data for all
   247  			// stores.
   248  			expectedStoreSummaryMetrics[prefix+name] = float64(val)
   249  		}
   250  	}
   251  
   252  	// Add metric for node ID.
   253  	g := metric.NewGauge(metric.Metadata{Name: "node-id"})
   254  	g.Update(int64(nodeDesc.NodeID))
   255  	addExpected("", "node-id", 1, 100, g.Value(), true)
   256  
   257  	for _, reg := range regList {
   258  		for _, data := range metricNames {
   259  			switch data.typ {
   260  			case "gauge":
   261  				g := metric.NewGauge(metric.Metadata{Name: reg.prefix + data.name})
   262  				reg.reg.AddMetric(g)
   263  				g.Update(data.val)
   264  				addExpected(reg.prefix, data.name, reg.source, 100, data.val, reg.isNode)
   265  			case "floatgauge":
   266  				g := metric.NewGaugeFloat64(metric.Metadata{Name: reg.prefix + data.name})
   267  				reg.reg.AddMetric(g)
   268  				g.Update(float64(data.val))
   269  				addExpected(reg.prefix, data.name, reg.source, 100, data.val, reg.isNode)
   270  			case "counter":
   271  				c := metric.NewCounter(metric.Metadata{Name: reg.prefix + data.name})
   272  				reg.reg.AddMetric(c)
   273  				c.Inc((data.val))
   274  				addExpected(reg.prefix, data.name, reg.source, 100, data.val, reg.isNode)
   275  			case "histogram":
   276  				h := metric.NewHistogram(metric.Metadata{Name: reg.prefix + data.name}, time.Second, 1000, 2)
   277  				reg.reg.AddMetric(h)
   278  				h.RecordValue(data.val)
   279  				for _, q := range recordHistogramQuantiles {
   280  					addExpected(reg.prefix, data.name+q.suffix, reg.source, 100, data.val, reg.isNode)
   281  				}
   282  			case "latency":
   283  				l := metric.NewLatency(metric.Metadata{Name: reg.prefix + data.name}, time.Hour)
   284  				reg.reg.AddMetric(l)
   285  				l.RecordValue(data.val)
   286  				// Latency is simply three histograms (at different resolution
   287  				// time scales).
   288  				for _, q := range recordHistogramQuantiles {
   289  					addExpected(reg.prefix, data.name+q.suffix, reg.source, 100, data.val, reg.isNode)
   290  				}
   291  			default:
   292  				t.Fatalf("unexpected: %+v", data)
   293  			}
   294  		}
   295  	}
   296  
   297  	// ========================================
   298  	// Verify time series data
   299  	// ========================================
   300  	actual := recorder.GetTimeSeriesData()
   301  
   302  	// Actual comparison is simple: sort the resulting arrays by time and name,
   303  	// and use reflect.DeepEqual.
   304  	sort.Sort(byTimeAndName(actual))
   305  	sort.Sort(byTimeAndName(expected))
   306  	if a, e := actual, expected; !reflect.DeepEqual(a, e) {
   307  		t.Errorf("recorder did not yield expected time series collection; diff:\n %v", pretty.Diff(e, a))
   308  	}
   309  
   310  	totalMemory, err := GetTotalMemory(context.Background())
   311  	if err != nil {
   312  		t.Error("couldn't get total memory", err)
   313  	}
   314  
   315  	// ========================================
   316  	// Verify node summary generation
   317  	// ========================================
   318  	expectedNodeSummary := &statuspb.NodeStatus{
   319  		Desc:      nodeDesc,
   320  		BuildInfo: build.GetInfo(),
   321  		StartedAt: 50,
   322  		UpdatedAt: 100,
   323  		Metrics:   expectedNodeSummaryMetrics,
   324  		StoreStatuses: []statuspb.StoreStatus{
   325  			{
   326  				Desc:    storeDesc1,
   327  				Metrics: expectedStoreSummaryMetrics,
   328  			},
   329  			{
   330  				Desc:    storeDesc2,
   331  				Metrics: expectedStoreSummaryMetrics,
   332  			},
   333  		},
   334  		TotalSystemMemory: totalMemory,
   335  		NumCpus:           int32(runtime.NumCPU()),
   336  	}
   337  
   338  	// Make sure there is at least one environment variable that will be
   339  	// reported.
   340  	if err := os.Setenv("GOGC", "100"); err != nil {
   341  		t.Fatal(err)
   342  	}
   343  
   344  	nodeSummary := recorder.GenerateNodeStatus(context.Background())
   345  	if nodeSummary == nil {
   346  		t.Fatalf("recorder did not return nodeSummary")
   347  	}
   348  	if len(nodeSummary.Args) == 0 {
   349  		t.Fatalf("expected args to be present")
   350  	}
   351  	if len(nodeSummary.Env) == 0 {
   352  		t.Fatalf("expected env to be present")
   353  	}
   354  	nodeSummary.Args = nil
   355  	nodeSummary.Env = nil
   356  	nodeSummary.Activity = nil
   357  	nodeSummary.Latencies = nil
   358  
   359  	sort.Sort(byStoreDescID(nodeSummary.StoreStatuses))
   360  	if a, e := nodeSummary, expectedNodeSummary; !reflect.DeepEqual(a, e) {
   361  		t.Errorf("recorder did not produce expected NodeSummary; diff:\n %s", pretty.Diff(e, a))
   362  	}
   363  
   364  	// Make sure that all methods other than GenerateNodeStatus can operate in
   365  	// parallel with each other (i.e. even if recorder.mu is RLocked).
   366  	recorder.mu.RLock()
   367  	var wg sync.WaitGroup
   368  	for i := 0; i < 100; i++ {
   369  		wg.Add(1)
   370  		go func() {
   371  			if _, err := recorder.MarshalJSON(); err != nil {
   372  				t.Error(err)
   373  			}
   374  			_ = recorder.PrintAsText(ioutil.Discard)
   375  			_ = recorder.GetTimeSeriesData()
   376  			wg.Done()
   377  		}()
   378  	}
   379  	wg.Wait()
   380  	recorder.mu.RUnlock()
   381  }