github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/server/status/recorder_test.go (about) 1 // Copyright 2015 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package status 12 13 import ( 14 "context" 15 "io/ioutil" 16 "os" 17 "reflect" 18 "runtime" 19 "sort" 20 "strconv" 21 "sync" 22 "testing" 23 "time" 24 25 "github.com/cockroachdb/cockroach/pkg/build" 26 "github.com/cockroachdb/cockroach/pkg/roachpb" 27 "github.com/cockroachdb/cockroach/pkg/server/status/statuspb" 28 "github.com/cockroachdb/cockroach/pkg/settings/cluster" 29 "github.com/cockroachdb/cockroach/pkg/ts/tspb" 30 "github.com/cockroachdb/cockroach/pkg/util/hlc" 31 "github.com/cockroachdb/cockroach/pkg/util/leaktest" 32 "github.com/cockroachdb/cockroach/pkg/util/metric" 33 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 34 "github.com/kr/pretty" 35 ) 36 37 // byTimeAndName is a slice of tspb.TimeSeriesData. 38 type byTimeAndName []tspb.TimeSeriesData 39 40 // implement sort.Interface for byTimeAndName 41 func (a byTimeAndName) Len() int { return len(a) } 42 func (a byTimeAndName) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 43 func (a byTimeAndName) Less(i, j int) bool { 44 if a[i].Name != a[j].Name { 45 return a[i].Name < a[j].Name 46 } 47 if a[i].Datapoints[0].TimestampNanos != a[j].Datapoints[0].TimestampNanos { 48 return a[i].Datapoints[0].TimestampNanos < a[j].Datapoints[0].TimestampNanos 49 } 50 return a[i].Source < a[j].Source 51 } 52 53 var _ sort.Interface = byTimeAndName{} 54 55 // byStoreID is a slice of roachpb.StoreID. 56 type byStoreID []roachpb.StoreID 57 58 // implement sort.Interface for byStoreID 59 func (a byStoreID) Len() int { return len(a) } 60 func (a byStoreID) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 61 func (a byStoreID) Less(i, j int) bool { 62 return a[i] < a[j] 63 } 64 65 var _ sort.Interface = byStoreID{} 66 67 // byStoreDescID is a slice of storage.StoreStatus 68 type byStoreDescID []statuspb.StoreStatus 69 70 // implement sort.Interface for byStoreDescID. 71 func (a byStoreDescID) Len() int { return len(a) } 72 func (a byStoreDescID) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 73 func (a byStoreDescID) Less(i, j int) bool { 74 return a[i].Desc.StoreID < a[j].Desc.StoreID 75 } 76 77 var _ sort.Interface = byStoreDescID{} 78 79 // fakeStore implements only the methods of store needed by MetricsRecorder to 80 // interact with stores. 81 type fakeStore struct { 82 storeID roachpb.StoreID 83 desc roachpb.StoreDescriptor 84 registry *metric.Registry 85 } 86 87 func (fs fakeStore) StoreID() roachpb.StoreID { 88 return fs.storeID 89 } 90 91 func (fs fakeStore) Descriptor(_ bool) (*roachpb.StoreDescriptor, error) { 92 return &fs.desc, nil 93 } 94 95 func (fs fakeStore) Registry() *metric.Registry { 96 return fs.registry 97 } 98 99 // TestMetricsRecorder verifies that the metrics recorder properly formats the 100 // statistics from various registries, both for Time Series and for Status 101 // Summaries. 102 func TestMetricsRecorder(t *testing.T) { 103 defer leaktest.AfterTest(t)() 104 105 // ======================================== 106 // Construct a series of fake descriptors for use in test. 107 // ======================================== 108 nodeDesc := roachpb.NodeDescriptor{ 109 NodeID: roachpb.NodeID(1), 110 } 111 storeDesc1 := roachpb.StoreDescriptor{ 112 StoreID: roachpb.StoreID(1), 113 Capacity: roachpb.StoreCapacity{ 114 Capacity: 100, 115 Available: 50, 116 Used: 50, 117 }, 118 } 119 storeDesc2 := roachpb.StoreDescriptor{ 120 StoreID: roachpb.StoreID(2), 121 Capacity: roachpb.StoreCapacity{ 122 Capacity: 200, 123 Available: 75, 124 Used: 125, 125 }, 126 } 127 128 // ======================================== 129 // Create registries and add them to the recorder (two node-level, two 130 // store-level). 131 // ======================================== 132 reg1 := metric.NewRegistry() 133 store1 := fakeStore{ 134 storeID: roachpb.StoreID(1), 135 desc: storeDesc1, 136 registry: metric.NewRegistry(), 137 } 138 store2 := fakeStore{ 139 storeID: roachpb.StoreID(2), 140 desc: storeDesc2, 141 registry: metric.NewRegistry(), 142 } 143 manual := hlc.NewManualClock(100) 144 st := cluster.MakeTestingClusterSettings() 145 recorder := NewMetricsRecorder(hlc.NewClock(manual.UnixNano, time.Nanosecond), nil, nil, nil, st) 146 recorder.AddStore(store1) 147 recorder.AddStore(store2) 148 recorder.AddNode(reg1, nodeDesc, 50, "foo:26257", "foo:26258", "foo:5432") 149 150 // Ensure the metric system's view of time does not advance during this test 151 // as the test expects time to not advance too far which would age the actual 152 // data (e.g. in histogram's) unexpectedly. 153 defer metric.TestingSetNow(func() time.Time { 154 return timeutil.Unix(0, manual.UnixNano()) 155 })() 156 157 // ======================================== 158 // Generate Metrics Data & Expected Results 159 // ======================================== 160 161 // Flatten the four registries into an array for ease of use. 162 regList := []struct { 163 reg *metric.Registry 164 prefix string 165 source int64 166 isNode bool 167 }{ 168 { 169 reg: reg1, 170 prefix: "one.", 171 source: 1, 172 isNode: true, 173 }, 174 { 175 reg: reg1, 176 prefix: "two.", 177 source: 1, 178 isNode: true, 179 }, 180 { 181 reg: store1.registry, 182 prefix: "", 183 source: int64(store1.storeID), 184 isNode: false, 185 }, 186 { 187 reg: store2.registry, 188 prefix: "", 189 source: int64(store2.storeID), 190 isNode: false, 191 }, 192 } 193 194 // Every registry will have a copy of the following metrics. 195 metricNames := []struct { 196 name string 197 typ string 198 val int64 199 }{ 200 {"testGauge", "gauge", 20}, 201 {"testGaugeFloat64", "floatgauge", 20}, 202 {"testCounter", "counter", 5}, 203 {"testHistogram", "histogram", 10}, 204 {"testLatency", "latency", 10}, 205 206 // Stats needed for store summaries. 207 {"ranges", "counter", 1}, 208 {"replicas.leaders", "gauge", 1}, 209 {"replicas.leaseholders", "gauge", 1}, 210 {"ranges", "gauge", 1}, 211 {"ranges.unavailable", "gauge", 1}, 212 {"ranges.underreplicated", "gauge", 1}, 213 } 214 215 // Add the metrics to each registry and set their values. At the same time, 216 // generate expected time series results and status summary metric values. 217 var expected []tspb.TimeSeriesData 218 expectedNodeSummaryMetrics := make(map[string]float64) 219 expectedStoreSummaryMetrics := make(map[string]float64) 220 221 // addExpected generates expected data for a single metric data point. 222 addExpected := func(prefix, name string, source, time, val int64, isNode bool) { 223 // Generate time series data. 224 tsPrefix := "cr.node." 225 if !isNode { 226 tsPrefix = "cr.store." 227 } 228 expect := tspb.TimeSeriesData{ 229 Name: tsPrefix + prefix + name, 230 Source: strconv.FormatInt(source, 10), 231 Datapoints: []tspb.TimeSeriesDatapoint{ 232 { 233 TimestampNanos: time, 234 Value: float64(val), 235 }, 236 }, 237 } 238 expected = append(expected, expect) 239 240 // Generate status summary data. 241 if isNode { 242 expectedNodeSummaryMetrics[prefix+name] = float64(val) 243 } else { 244 // This can overwrite the previous value, but this is expected as 245 // all stores in our tests have identical values; when comparing 246 // status summaries, the same map is used as expected data for all 247 // stores. 248 expectedStoreSummaryMetrics[prefix+name] = float64(val) 249 } 250 } 251 252 // Add metric for node ID. 253 g := metric.NewGauge(metric.Metadata{Name: "node-id"}) 254 g.Update(int64(nodeDesc.NodeID)) 255 addExpected("", "node-id", 1, 100, g.Value(), true) 256 257 for _, reg := range regList { 258 for _, data := range metricNames { 259 switch data.typ { 260 case "gauge": 261 g := metric.NewGauge(metric.Metadata{Name: reg.prefix + data.name}) 262 reg.reg.AddMetric(g) 263 g.Update(data.val) 264 addExpected(reg.prefix, data.name, reg.source, 100, data.val, reg.isNode) 265 case "floatgauge": 266 g := metric.NewGaugeFloat64(metric.Metadata{Name: reg.prefix + data.name}) 267 reg.reg.AddMetric(g) 268 g.Update(float64(data.val)) 269 addExpected(reg.prefix, data.name, reg.source, 100, data.val, reg.isNode) 270 case "counter": 271 c := metric.NewCounter(metric.Metadata{Name: reg.prefix + data.name}) 272 reg.reg.AddMetric(c) 273 c.Inc((data.val)) 274 addExpected(reg.prefix, data.name, reg.source, 100, data.val, reg.isNode) 275 case "histogram": 276 h := metric.NewHistogram(metric.Metadata{Name: reg.prefix + data.name}, time.Second, 1000, 2) 277 reg.reg.AddMetric(h) 278 h.RecordValue(data.val) 279 for _, q := range recordHistogramQuantiles { 280 addExpected(reg.prefix, data.name+q.suffix, reg.source, 100, data.val, reg.isNode) 281 } 282 case "latency": 283 l := metric.NewLatency(metric.Metadata{Name: reg.prefix + data.name}, time.Hour) 284 reg.reg.AddMetric(l) 285 l.RecordValue(data.val) 286 // Latency is simply three histograms (at different resolution 287 // time scales). 288 for _, q := range recordHistogramQuantiles { 289 addExpected(reg.prefix, data.name+q.suffix, reg.source, 100, data.val, reg.isNode) 290 } 291 default: 292 t.Fatalf("unexpected: %+v", data) 293 } 294 } 295 } 296 297 // ======================================== 298 // Verify time series data 299 // ======================================== 300 actual := recorder.GetTimeSeriesData() 301 302 // Actual comparison is simple: sort the resulting arrays by time and name, 303 // and use reflect.DeepEqual. 304 sort.Sort(byTimeAndName(actual)) 305 sort.Sort(byTimeAndName(expected)) 306 if a, e := actual, expected; !reflect.DeepEqual(a, e) { 307 t.Errorf("recorder did not yield expected time series collection; diff:\n %v", pretty.Diff(e, a)) 308 } 309 310 totalMemory, err := GetTotalMemory(context.Background()) 311 if err != nil { 312 t.Error("couldn't get total memory", err) 313 } 314 315 // ======================================== 316 // Verify node summary generation 317 // ======================================== 318 expectedNodeSummary := &statuspb.NodeStatus{ 319 Desc: nodeDesc, 320 BuildInfo: build.GetInfo(), 321 StartedAt: 50, 322 UpdatedAt: 100, 323 Metrics: expectedNodeSummaryMetrics, 324 StoreStatuses: []statuspb.StoreStatus{ 325 { 326 Desc: storeDesc1, 327 Metrics: expectedStoreSummaryMetrics, 328 }, 329 { 330 Desc: storeDesc2, 331 Metrics: expectedStoreSummaryMetrics, 332 }, 333 }, 334 TotalSystemMemory: totalMemory, 335 NumCpus: int32(runtime.NumCPU()), 336 } 337 338 // Make sure there is at least one environment variable that will be 339 // reported. 340 if err := os.Setenv("GOGC", "100"); err != nil { 341 t.Fatal(err) 342 } 343 344 nodeSummary := recorder.GenerateNodeStatus(context.Background()) 345 if nodeSummary == nil { 346 t.Fatalf("recorder did not return nodeSummary") 347 } 348 if len(nodeSummary.Args) == 0 { 349 t.Fatalf("expected args to be present") 350 } 351 if len(nodeSummary.Env) == 0 { 352 t.Fatalf("expected env to be present") 353 } 354 nodeSummary.Args = nil 355 nodeSummary.Env = nil 356 nodeSummary.Activity = nil 357 nodeSummary.Latencies = nil 358 359 sort.Sort(byStoreDescID(nodeSummary.StoreStatuses)) 360 if a, e := nodeSummary, expectedNodeSummary; !reflect.DeepEqual(a, e) { 361 t.Errorf("recorder did not produce expected NodeSummary; diff:\n %s", pretty.Diff(e, a)) 362 } 363 364 // Make sure that all methods other than GenerateNodeStatus can operate in 365 // parallel with each other (i.e. even if recorder.mu is RLocked). 366 recorder.mu.RLock() 367 var wg sync.WaitGroup 368 for i := 0; i < 100; i++ { 369 wg.Add(1) 370 go func() { 371 if _, err := recorder.MarshalJSON(); err != nil { 372 t.Error(err) 373 } 374 _ = recorder.PrintAsText(ioutil.Discard) 375 _ = recorder.GetTimeSeriesData() 376 wg.Done() 377 }() 378 } 379 wg.Wait() 380 recorder.mu.RUnlock() 381 }