github.com/grafana/pyroscope@v1.18.0/pkg/test/integration/microservices_test.go (about) 1 package integration 2 3 import ( 4 "context" 5 "errors" 6 "fmt" 7 "sort" 8 "strings" 9 "testing" 10 "time" 11 12 "connectrpc.com/connect" 13 "github.com/stretchr/testify/assert" 14 "github.com/stretchr/testify/require" 15 "golang.org/x/sync/errgroup" 16 17 profilev1 "github.com/grafana/pyroscope/api/gen/proto/go/google/v1" 18 pushv1 "github.com/grafana/pyroscope/api/gen/proto/go/push/v1" 19 "github.com/grafana/pyroscope/api/gen/proto/go/push/v1/pushv1connect" 20 querierv1 "github.com/grafana/pyroscope/api/gen/proto/go/querier/v1" 21 "github.com/grafana/pyroscope/api/gen/proto/go/querier/v1/querierv1connect" 22 typesv1 "github.com/grafana/pyroscope/api/gen/proto/go/types/v1" 23 "github.com/grafana/pyroscope/pkg/metastore/raftnode/raftnodepb" 24 "github.com/grafana/pyroscope/pkg/pprof/testhelper" 25 "github.com/grafana/pyroscope/pkg/tenant" 26 "github.com/grafana/pyroscope/pkg/test/integration/cluster" 27 ) 28 29 // TestMicroServicesIntegration tests the integration of the microservices in a 30 // way similar to how they are actually run in the scalable/high availability setup. 31 // 32 // After the cluster is fully started, it pushes profiles for a number of services 33 // and then queries the series, label names and label values. It then stops some 34 // of the services and runs the same queries again to check if the cluster is still 35 // able to respond to queries. 
36 func TestMicroServicesIntegrationV1(t *testing.T) { 37 c := cluster.NewMicroServiceCluster() 38 ctx := context.Background() 39 40 require.NoError(t, c.Prepare(ctx)) 41 for _, comp := range c.Components { 42 t.Log(comp.String()) 43 } 44 45 // start returns as soon the cluster is ready 46 require.NoError(t, c.Start(ctx)) 47 t.Log("Cluster ready") 48 defer func() { 49 waitStopped := c.Stop() 50 require.NoError(t, waitStopped(ctx)) 51 }() 52 53 tc := newTestCtx(c) 54 t.Run("PushProfiles", func(t *testing.T) { 55 tc.pushProfiles(ctx, t) 56 }) 57 58 t.Run("HealthyCluster", func(t *testing.T) { 59 tc.runQueryTest(ctx, t) 60 }) 61 62 componentsToStop := map[string]struct{}{"store-gateway": {}, "ingester": {}} 63 g, gctx := errgroup.WithContext(ctx) 64 for _, comp := range c.Components { 65 if _, ok := componentsToStop[comp.Target]; ok { 66 t.Logf("Stopping %s", comp.Target) 67 awaitStop := comp.Stop() 68 delete(componentsToStop, comp.Target) 69 g.Go(func() error { 70 return awaitStop(gctx) 71 }) 72 } 73 } 74 // wait for services being stopped 75 require.NoError(t, g.Wait()) 76 77 t.Run("DegradedCluster", func(t *testing.T) { 78 tc.runQueryTest(ctx, t) 79 }) 80 81 } 82 83 func TestMicroServicesIntegrationV2(t *testing.T) { 84 c := cluster.NewMicroServiceCluster(cluster.WithV2()) 85 ctx := context.Background() 86 87 require.NoError(t, c.Prepare(ctx)) 88 for _, comp := range c.Components { 89 t.Log(comp.String()) 90 } 91 92 // start returns as soon the cluster is ready 93 require.NoError(t, c.Start(ctx)) 94 t.Log("Cluster ready") 95 defer func() { 96 waitStopped := c.Stop() 97 require.NoError(t, waitStopped(ctx)) 98 }() 99 100 tc := newTestCtx(c) 101 t.Run("PushProfiles", func(t *testing.T) { 102 tc.pushProfiles(ctx, t) 103 }) 104 105 // ingest some more data to compact the rest of the data we care about 106 // TODO: This shouldn't be necessary see https://github.com/grafana/pyroscope/issues/4193. 
107 pushCtx, pushCancel := context.WithCancel(ctx) 108 g, gctx := errgroup.WithContext(pushCtx) 109 g.SetLimit(4) 110 for i := 0; i < 200; i++ { 111 g.Go(func() error { 112 p, err := testhelper.NewProfileBuilder(tc.now.UnixNano()). 113 CPUProfile(). 114 ForStacktraceString("foo", "bar", "baz").AddSamples(1). 115 MarshalVT() 116 require.NoError(t, err) 117 118 pctx := tenant.InjectTenantID(gctx, fmt.Sprintf("dummy-tenant-%d", i)) 119 _, err = tc.pusher.Push(pctx, connect.NewRequest(&pushv1.PushRequest{ 120 Series: []*pushv1.RawProfileSeries{{ 121 Labels: []*typesv1.LabelPair{ 122 {Name: "service_name", Value: fmt.Sprintf("dummy-service/%d", i)}, 123 {Name: "__name__", Value: "process_cpu"}, 124 }, 125 Samples: []*pushv1.RawSample{{RawProfile: p}}, 126 }}, 127 })) 128 return err 129 }) 130 } 131 defer func() { 132 pushCancel() 133 err := g.Wait() 134 if !errors.Is(err, context.Canceled) { 135 require.NoError(t, g.Wait()) 136 } 137 }() 138 139 // await compaction so tenant wide index is available 140 require.Eventually(t, func() bool { 141 jobs, err := c.CompactionJobsFinished(ctx) 142 return err == nil && jobs > 0 143 }, time.Minute, time.Second) 144 t.Log("Compaction worker finished") 145 146 // await until all tenants have all expected labelValues available 147 // TODO: This shouldn't be necessary see https://github.com/grafana/pyroscope/issues/4193. 
148 require.Eventually(t, func() bool { 149 for tenantID := range tc.perTenantData { 150 ctx := tenant.InjectTenantID(ctx, tenantID) 151 resp, err := tc.querier.LabelValues(ctx, connect.NewRequest(&typesv1.LabelValuesRequest{ 152 Start: tc.now.Add(-time.Hour).UnixMilli(), 153 End: tc.now.Add(time.Hour).UnixMilli(), 154 Name: "service_name", 155 })) 156 if err != nil { 157 return false 158 } 159 if len(resp.Msg.Names) != tc.perTenantData[tenantID].serviceCount { 160 return false 161 } 162 } 163 return true 164 }, time.Minute, time.Second) 165 t.Log("All tenants have all expected labelValues available") 166 167 tc.runQueryTest(ctx, t) 168 169 } 170 171 // TestMetastoreAutoJoin tests that a new metastore node can join an existing cluster 172 // using the auto-join feature without requiring bootstrap configuration. 173 func TestMetastoreAutoJoin(t *testing.T) { 174 c := cluster.NewMicroServiceCluster(cluster.WithV2()) 175 ctx := context.Background() 176 177 require.NoError(t, c.Prepare(ctx)) 178 for _, comp := range c.Components { 179 t.Log(comp.String()) 180 } 181 182 require.NoError(t, c.Start(ctx)) 183 defer func() { 184 waitStopped := c.Stop() 185 require.NoError(t, waitStopped(ctx)) 186 }() 187 188 client, err := c.GetMetastoreRaftNodeClient() 189 require.NoError(t, err) 190 nodeInfo, err := client.NodeInfo(ctx, &raftnodepb.NodeInfoRequest{}) 191 require.NoError(t, err) 192 require.Equal(t, 3, len(nodeInfo.Node.Peers), "initial cluster should have 3 peers") 193 194 err = c.AddMetastoreWithAutoJoin(ctx) 195 require.NoError(t, err) 196 197 require.Eventually(t, func() bool { 198 nodeInfo, err := client.NodeInfo(ctx, &raftnodepb.NodeInfoRequest{}) 199 if err != nil { 200 t.Logf("Failed to get node info: %v", err) 201 return false 202 } 203 t.Logf("Current peer count: %d", len(nodeInfo.Node.Peers)) 204 return len(nodeInfo.Node.Peers) == 4 205 }, 30*time.Second, 1*time.Second, "new metastore should join cluster") 206 } 207 208 func newTestCtx(x interface { 209 
PushClient() pushv1connect.PusherServiceClient 210 QueryClient() querierv1connect.QuerierServiceClient 211 }) *testCtx { 212 return &testCtx{ 213 now: time.Now().Truncate(time.Second), 214 perTenantData: map[string]tenantParams{ 215 "tenant-a": { 216 serviceCount: 100, 217 samples: 5, 218 }, 219 "tenant-b": { 220 serviceCount: 1, 221 samples: 1, 222 }, 223 "tenant-not-existing": {}, 224 }, 225 querier: x.QueryClient(), 226 pusher: x.PushClient(), 227 } 228 } 229 230 type tenantParams struct { 231 serviceCount int 232 samples int 233 } 234 235 type testCtx struct { 236 now time.Time 237 238 perTenantData map[string]tenantParams 239 querier querierv1connect.QuerierServiceClient 240 pusher pushv1connect.PusherServiceClient 241 } 242 243 func (tc *testCtx) pushProfiles(ctx context.Context, t *testing.T) { 244 g, gctx := errgroup.WithContext(ctx) 245 246 g.SetLimit(20) 247 for tenantID, params := range tc.perTenantData { 248 gctx := tenant.InjectTenantID(gctx, tenantID) 249 for i := 0; i < params.serviceCount; i++ { 250 var i = i 251 g.Go(func() error { 252 serviceName := fmt.Sprintf("%s/test-service-%d", tenantID, i) 253 builder := testhelper.NewProfileBuilder(int64(1)). 254 CPUProfile(). 
255 WithLabels( 256 "job", "test", 257 "service_name", serviceName, 258 ) 259 builder.ForStacktraceString("foo", "bar", "baz").AddSamples(1) 260 for j := 0; j < params.samples; j++ { 261 builder.TimeNanos = tc.now.Add(time.Duration(j) * 5 * time.Second).UnixNano() 262 if (i+j)%3 == 0 { 263 builder.ForStacktraceString("foo", "bar", "boz").AddSamples(3) 264 } 265 } 266 267 rawProfile, err := builder.MarshalVT() 268 require.NoError(t, err) 269 270 _, err = tc.pusher.Push(gctx, connect.NewRequest(&pushv1.PushRequest{ 271 Series: []*pushv1.RawProfileSeries{{ 272 Labels: builder.Labels, 273 Samples: []*pushv1.RawSample{{RawProfile: rawProfile}}, 274 }}, 275 })) 276 return err 277 }) 278 } 279 } 280 require.NoError(t, g.Wait()) 281 282 } 283 284 func (tc *testCtx) runQueryTest(ctx context.Context, t *testing.T) { 285 isV2 := strings.HasSuffix(t.Name(), "V2") 286 t.Run("QuerySeries", func(t *testing.T) { 287 for tenantID, params := range tc.perTenantData { 288 t.Run(tenantID, func(t *testing.T) { 289 ctx := tenant.InjectTenantID(ctx, tenantID) 290 resp, err := tc.querier.Series(ctx, connect.NewRequest(&querierv1.SeriesRequest{ 291 Start: tc.now.Add(-time.Hour).UnixMilli(), 292 End: tc.now.Add(time.Hour).UnixMilli(), 293 LabelNames: []string{"__profile_type__", "service_name"}, 294 })) 295 require.NoError(t, err) 296 require.Len(t, resp.Msg.LabelsSet, params.serviceCount) 297 298 // no services to check 299 if params.serviceCount == 0 { 300 return 301 } 302 303 expectedValues := make([]*typesv1.Labels, params.serviceCount) 304 for i := 0; i < params.serviceCount; i++ { 305 // check if the service name is in the response 306 expectedValues[i] = &typesv1.Labels{ 307 Labels: []*typesv1.LabelPair{ 308 { 309 Name: "__profile_type__", 310 Value: "process_cpu:cpu:nanoseconds:cpu:nanoseconds", 311 }, 312 { 313 Name: "service_name", 314 Value: fmt.Sprintf("%s/test-service-%d", tenantID, i), 315 }, 316 }, 317 } 318 } 319 320 // sort the response by service name 321 
sort.Slice(resp.Msg.LabelsSet, func(i, j int) bool { 322 return resp.Msg.LabelsSet[i].Labels[1].Value < resp.Msg.LabelsSet[j].Labels[1].Value 323 }) 324 sort.Slice(expectedValues, func(i, j int) bool { 325 return expectedValues[i].Labels[1].Value < expectedValues[j].Labels[1].Value 326 }) 327 assert.Equal(t, expectedValues, resp.Msg.LabelsSet) 328 }) 329 } 330 }) 331 t.Run("QueryLabelNames", func(t *testing.T) { 332 for tenantID, params := range tc.perTenantData { 333 t.Run(tenantID, func(t *testing.T) { 334 ctx := tenant.InjectTenantID(ctx, tenantID) 335 resp, err := tc.querier.LabelNames(ctx, connect.NewRequest(&typesv1.LabelNamesRequest{ 336 Start: tc.now.Add(-time.Hour).UnixMilli(), 337 End: tc.now.Add(time.Hour).UnixMilli(), 338 })) 339 require.NoError(t, err) 340 341 // no services, no label names 342 if params.serviceCount == 0 { 343 assert.Len(t, resp.Msg.Names, 0) 344 return 345 } 346 347 assert.Equal(t, []string{ 348 "__name__", 349 "__period_type__", 350 "__period_unit__", 351 "__profile_type__", 352 "__service_name__", 353 "__type__", 354 "__unit__", 355 "job", 356 "service_name", 357 }, resp.Msg.Names) 358 }) 359 } 360 }) 361 362 validateProfileTypes := func(t *testing.T, serviceCount int, resp *querierv1.ProfileTypesResponse) { 363 // no services, no label names 364 if serviceCount == 0 { 365 assert.Len(t, resp.ProfileTypes, 0) 366 return 367 } 368 369 profileTypes := make([]string, 0, len(resp.ProfileTypes)) 370 for _, pt := range resp.ProfileTypes { 371 profileTypes = append(profileTypes, pt.ID) 372 } 373 assert.Equal(t, []string{ 374 "process_cpu:cpu:nanoseconds:cpu:nanoseconds", 375 }, profileTypes) 376 } 377 378 t.Run("QueryProfileTypesWithTimeRange", func(t *testing.T) { 379 for tenantID, params := range tc.perTenantData { 380 t.Run(tenantID, func(t *testing.T) { 381 ctx := tenant.InjectTenantID(ctx, tenantID) 382 383 // Query profile types with time range 384 resp, err := tc.querier.ProfileTypes(ctx, 
connect.NewRequest(&querierv1.ProfileTypesRequest{ 385 Start: tc.now.Add(-time.Hour).UnixMilli(), 386 End: tc.now.Add(time.Hour).UnixMilli(), 387 })) 388 require.NoError(t, err) 389 390 validateProfileTypes(t, params.serviceCount, resp.Msg) 391 }) 392 } 393 }) 394 395 // Note: Some ProfileTypes API clients rely on the ablility to call it without start/end. 396 // See https://github.com/grafana/grafana/issues/110211 397 t.Run("QueryProfileTypesWithoutTimeRange", func(t *testing.T) { 398 for tenantID, params := range tc.perTenantData { 399 t.Run(tenantID, func(t *testing.T) { 400 ctx := tenant.InjectTenantID(ctx, tenantID) 401 402 // Query profile types with time range 403 resp, err := tc.querier.ProfileTypes(ctx, connect.NewRequest(&querierv1.ProfileTypesRequest{})) 404 require.NoError(t, err) 405 406 validateProfileTypes(t, params.serviceCount, resp.Msg) 407 }) 408 } 409 }) 410 411 t.Run("QueryLabelValues", func(t *testing.T) { 412 for tenantID, params := range tc.perTenantData { 413 t.Run(tenantID, func(t *testing.T) { 414 ctx := tenant.InjectTenantID(ctx, tenantID) 415 resp, err := tc.querier.LabelValues(ctx, connect.NewRequest(&typesv1.LabelValuesRequest{ 416 Start: tc.now.Add(-time.Hour).UnixMilli(), 417 End: tc.now.Add(time.Hour).UnixMilli(), 418 Name: "service_name", 419 })) 420 require.NoError(t, err) 421 422 // no services, no label values 423 if params.serviceCount == 0 { 424 assert.Len(t, resp.Msg.Names, 0) 425 return 426 } 427 428 expectedValues := make([]string, params.serviceCount) 429 for i := 0; i < params.serviceCount; i++ { 430 // check if the service name is in the response 431 expectedValues[i] = fmt.Sprintf("%s/test-service-%d", tenantID, i) 432 } 433 sort.Strings(expectedValues) 434 assert.Equal(t, expectedValues, resp.Msg.Names) 435 }) 436 } 437 }) 438 439 t.Run("QuerySelectMergeProfile", func(t *testing.T) { 440 for tenantID, params := range tc.perTenantData { 441 t.Run(tenantID, func(t *testing.T) { 442 ctx := tenant.InjectTenantID(ctx, 
tenantID) 443 req := &querierv1.SelectMergeProfileRequest{ 444 ProfileTypeID: "process_cpu:cpu:nanoseconds:cpu:nanoseconds", 445 LabelSelector: "{}", 446 Start: tc.now.Add(-time.Hour).UnixMilli(), 447 End: tc.now.Add(time.Hour).UnixMilli(), 448 } 449 resp, err := tc.querier.SelectMergeProfile(ctx, connect.NewRequest(req)) 450 require.NoError(t, err) 451 452 // no services, no samples profile 453 if params.serviceCount == 0 { 454 return 455 } 456 457 // TODO: Experimental storage layer v2 doesn't support DurationNanos yet 458 // https://github.com/grafana/pyroscope/issues/4192 459 if !isV2 { 460 assert.Equal(t, int64(7200000000000), resp.Msg.DurationNanos, "DurationNanos") 461 } 462 463 assert.Equal(t, req.End*1e6, resp.Msg.TimeNanos, "TimeNanos") 464 465 assert.Equal(t, 466 []*profilev1.ValueType{ 467 {Type: 6, Unit: 5}, 468 }, resp.Msg.SampleType, "SampleType", 469 ) 470 471 // boz samples 472 bozSamples := 0 473 for i := 0; i < params.serviceCount; i++ { 474 for j := 0; j < params.samples; j++ { 475 if (i+j)%3 == 0 { 476 bozSamples += 3 477 } 478 } 479 } 480 481 assert.Equal(t, 482 []*profilev1.Sample{ 483 {LocationId: []uint64{1, 2, 3}, Value: []int64{int64(params.serviceCount)}}, 484 {LocationId: []uint64{1, 2, 4}, Value: []int64{int64(bozSamples)}}, 485 }, resp.Msg.Sample, "Samples", 486 ) 487 assert.Equal(t, 488 []*profilev1.Mapping{ 489 {Id: 1, HasFunctions: true}, 490 }, resp.Msg.Mapping, "Mappings", 491 ) 492 assert.Equal(t, 493 []*profilev1.Location{ 494 {Id: 1, MappingId: 1, Line: []*profilev1.Line{{FunctionId: 1}}}, 495 {Id: 2, MappingId: 1, Line: []*profilev1.Line{{FunctionId: 2}}}, 496 {Id: 3, MappingId: 1, Line: []*profilev1.Line{{FunctionId: 3}}}, 497 {Id: 4, MappingId: 1, Line: []*profilev1.Line{{FunctionId: 4}}}, 498 }, resp.Msg.Location, "Locations", 499 ) 500 assert.Equal(t, 501 []*profilev1.Function{ 502 {Id: 1, Name: 1}, 503 {Id: 2, Name: 2}, 504 {Id: 3, Name: 3}, 505 {Id: 4, Name: 4}, 506 }, resp.Msg.Function, "Functions", 507 ) 508 
assert.Equal(t, 509 []string{"", "foo", "bar", "baz", "boz", "nanoseconds", "cpu"}, 510 resp.Msg.StringTable, 511 ) 512 assert.Equal(t, 513 &profilev1.ValueType{Type: 6, Unit: 5}, 514 resp.Msg.PeriodType, 515 ) 516 }) 517 } 518 }) 519 }