github.com/grafana/pyroscope@v1.18.0/pkg/test/integration/microservices_test.go (about)

     1  package integration
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"sort"
     8  	"strings"
     9  	"testing"
    10  	"time"
    11  
    12  	"connectrpc.com/connect"
    13  	"github.com/stretchr/testify/assert"
    14  	"github.com/stretchr/testify/require"
    15  	"golang.org/x/sync/errgroup"
    16  
    17  	profilev1 "github.com/grafana/pyroscope/api/gen/proto/go/google/v1"
    18  	pushv1 "github.com/grafana/pyroscope/api/gen/proto/go/push/v1"
    19  	"github.com/grafana/pyroscope/api/gen/proto/go/push/v1/pushv1connect"
    20  	querierv1 "github.com/grafana/pyroscope/api/gen/proto/go/querier/v1"
    21  	"github.com/grafana/pyroscope/api/gen/proto/go/querier/v1/querierv1connect"
    22  	typesv1 "github.com/grafana/pyroscope/api/gen/proto/go/types/v1"
    23  	"github.com/grafana/pyroscope/pkg/metastore/raftnode/raftnodepb"
    24  	"github.com/grafana/pyroscope/pkg/pprof/testhelper"
    25  	"github.com/grafana/pyroscope/pkg/tenant"
    26  	"github.com/grafana/pyroscope/pkg/test/integration/cluster"
    27  )
    28  
    29  // TestMicroServicesIntegration tests the integration of the microservices in a
    30  // similar to is actually run in the scalable/high availability setup.
    31  //
    32  // After the cluster is fully started, it pushes profiles for a number of services
    33  // and then queries the series, label names and label values. It then stops some
    34  // of the services and runs the same queries again to check if the cluster is still
    35  // able to respond to queries.
    36  func TestMicroServicesIntegrationV1(t *testing.T) {
    37  	c := cluster.NewMicroServiceCluster()
    38  	ctx := context.Background()
    39  
    40  	require.NoError(t, c.Prepare(ctx))
    41  	for _, comp := range c.Components {
    42  		t.Log(comp.String())
    43  	}
    44  
    45  	// start returns as soon the cluster is ready
    46  	require.NoError(t, c.Start(ctx))
    47  	t.Log("Cluster ready")
    48  	defer func() {
    49  		waitStopped := c.Stop()
    50  		require.NoError(t, waitStopped(ctx))
    51  	}()
    52  
    53  	tc := newTestCtx(c)
    54  	t.Run("PushProfiles", func(t *testing.T) {
    55  		tc.pushProfiles(ctx, t)
    56  	})
    57  
    58  	t.Run("HealthyCluster", func(t *testing.T) {
    59  		tc.runQueryTest(ctx, t)
    60  	})
    61  
    62  	componentsToStop := map[string]struct{}{"store-gateway": {}, "ingester": {}}
    63  	g, gctx := errgroup.WithContext(ctx)
    64  	for _, comp := range c.Components {
    65  		if _, ok := componentsToStop[comp.Target]; ok {
    66  			t.Logf("Stopping %s", comp.Target)
    67  			awaitStop := comp.Stop()
    68  			delete(componentsToStop, comp.Target)
    69  			g.Go(func() error {
    70  				return awaitStop(gctx)
    71  			})
    72  		}
    73  	}
    74  	// wait for services being stopped
    75  	require.NoError(t, g.Wait())
    76  
    77  	t.Run("DegradedCluster", func(t *testing.T) {
    78  		tc.runQueryTest(ctx, t)
    79  	})
    80  
    81  }
    82  
    83  func TestMicroServicesIntegrationV2(t *testing.T) {
    84  	c := cluster.NewMicroServiceCluster(cluster.WithV2())
    85  	ctx := context.Background()
    86  
    87  	require.NoError(t, c.Prepare(ctx))
    88  	for _, comp := range c.Components {
    89  		t.Log(comp.String())
    90  	}
    91  
    92  	// start returns as soon the cluster is ready
    93  	require.NoError(t, c.Start(ctx))
    94  	t.Log("Cluster ready")
    95  	defer func() {
    96  		waitStopped := c.Stop()
    97  		require.NoError(t, waitStopped(ctx))
    98  	}()
    99  
   100  	tc := newTestCtx(c)
   101  	t.Run("PushProfiles", func(t *testing.T) {
   102  		tc.pushProfiles(ctx, t)
   103  	})
   104  
   105  	// ingest some more data to compact the rest of the data we care about
   106  	// TODO: This shouldn't be necessary see https://github.com/grafana/pyroscope/issues/4193.
   107  	pushCtx, pushCancel := context.WithCancel(ctx)
   108  	g, gctx := errgroup.WithContext(pushCtx)
   109  	g.SetLimit(4)
   110  	for i := 0; i < 200; i++ {
   111  		g.Go(func() error {
   112  			p, err := testhelper.NewProfileBuilder(tc.now.UnixNano()).
   113  				CPUProfile().
   114  				ForStacktraceString("foo", "bar", "baz").AddSamples(1).
   115  				MarshalVT()
   116  			require.NoError(t, err)
   117  
   118  			pctx := tenant.InjectTenantID(gctx, fmt.Sprintf("dummy-tenant-%d", i))
   119  			_, err = tc.pusher.Push(pctx, connect.NewRequest(&pushv1.PushRequest{
   120  				Series: []*pushv1.RawProfileSeries{{
   121  					Labels: []*typesv1.LabelPair{
   122  						{Name: "service_name", Value: fmt.Sprintf("dummy-service/%d", i)},
   123  						{Name: "__name__", Value: "process_cpu"},
   124  					},
   125  					Samples: []*pushv1.RawSample{{RawProfile: p}},
   126  				}},
   127  			}))
   128  			return err
   129  		})
   130  	}
   131  	defer func() {
   132  		pushCancel()
   133  		err := g.Wait()
   134  		if !errors.Is(err, context.Canceled) {
   135  			require.NoError(t, g.Wait())
   136  		}
   137  	}()
   138  
   139  	// await compaction so tenant wide index is available
   140  	require.Eventually(t, func() bool {
   141  		jobs, err := c.CompactionJobsFinished(ctx)
   142  		return err == nil && jobs > 0
   143  	}, time.Minute, time.Second)
   144  	t.Log("Compaction worker finished")
   145  
   146  	// await until all tenants have all expected labelValues available
   147  	// TODO: This shouldn't be necessary see https://github.com/grafana/pyroscope/issues/4193.
   148  	require.Eventually(t, func() bool {
   149  		for tenantID := range tc.perTenantData {
   150  			ctx := tenant.InjectTenantID(ctx, tenantID)
   151  			resp, err := tc.querier.LabelValues(ctx, connect.NewRequest(&typesv1.LabelValuesRequest{
   152  				Start: tc.now.Add(-time.Hour).UnixMilli(),
   153  				End:   tc.now.Add(time.Hour).UnixMilli(),
   154  				Name:  "service_name",
   155  			}))
   156  			if err != nil {
   157  				return false
   158  			}
   159  			if len(resp.Msg.Names) != tc.perTenantData[tenantID].serviceCount {
   160  				return false
   161  			}
   162  		}
   163  		return true
   164  	}, time.Minute, time.Second)
   165  	t.Log("All tenants have all expected labelValues available")
   166  
   167  	tc.runQueryTest(ctx, t)
   168  
   169  }
   170  
   171  // TestMetastoreAutoJoin tests that a new metastore node can join an existing cluster
   172  // using the auto-join feature without requiring bootstrap configuration.
   173  func TestMetastoreAutoJoin(t *testing.T) {
   174  	c := cluster.NewMicroServiceCluster(cluster.WithV2())
   175  	ctx := context.Background()
   176  
   177  	require.NoError(t, c.Prepare(ctx))
   178  	for _, comp := range c.Components {
   179  		t.Log(comp.String())
   180  	}
   181  
   182  	require.NoError(t, c.Start(ctx))
   183  	defer func() {
   184  		waitStopped := c.Stop()
   185  		require.NoError(t, waitStopped(ctx))
   186  	}()
   187  
   188  	client, err := c.GetMetastoreRaftNodeClient()
   189  	require.NoError(t, err)
   190  	nodeInfo, err := client.NodeInfo(ctx, &raftnodepb.NodeInfoRequest{})
   191  	require.NoError(t, err)
   192  	require.Equal(t, 3, len(nodeInfo.Node.Peers), "initial cluster should have 3 peers")
   193  
   194  	err = c.AddMetastoreWithAutoJoin(ctx)
   195  	require.NoError(t, err)
   196  
   197  	require.Eventually(t, func() bool {
   198  		nodeInfo, err := client.NodeInfo(ctx, &raftnodepb.NodeInfoRequest{})
   199  		if err != nil {
   200  			t.Logf("Failed to get node info: %v", err)
   201  			return false
   202  		}
   203  		t.Logf("Current peer count: %d", len(nodeInfo.Node.Peers))
   204  		return len(nodeInfo.Node.Peers) == 4
   205  	}, 30*time.Second, 1*time.Second, "new metastore should join cluster")
   206  }
   207  
   208  func newTestCtx(x interface {
   209  	PushClient() pushv1connect.PusherServiceClient
   210  	QueryClient() querierv1connect.QuerierServiceClient
   211  }) *testCtx {
   212  	return &testCtx{
   213  		now: time.Now().Truncate(time.Second),
   214  		perTenantData: map[string]tenantParams{
   215  			"tenant-a": {
   216  				serviceCount: 100,
   217  				samples:      5,
   218  			},
   219  			"tenant-b": {
   220  				serviceCount: 1,
   221  				samples:      1,
   222  			},
   223  			"tenant-not-existing": {},
   224  		},
   225  		querier: x.QueryClient(),
   226  		pusher:  x.PushClient(),
   227  	}
   228  }
   229  
// tenantParams describes the synthetic data expected for one tenant.
type tenantParams struct {
	serviceCount int // number of distinct service_name series pushed for the tenant
	samples      int // number of sample iterations per service in pushProfiles
}
   234  
// testCtx holds the state shared between the push and query phases of the
// integration tests.
type testCtx struct {
	// now is the reference time; push timestamps and query windows are all
	// derived from it.
	now time.Time

	perTenantData map[string]tenantParams
	querier       querierv1connect.QuerierServiceClient
	pusher        pushv1connect.PusherServiceClient
}
   242  
   243  func (tc *testCtx) pushProfiles(ctx context.Context, t *testing.T) {
   244  	g, gctx := errgroup.WithContext(ctx)
   245  
   246  	g.SetLimit(20)
   247  	for tenantID, params := range tc.perTenantData {
   248  		gctx := tenant.InjectTenantID(gctx, tenantID)
   249  		for i := 0; i < params.serviceCount; i++ {
   250  			var i = i
   251  			g.Go(func() error {
   252  				serviceName := fmt.Sprintf("%s/test-service-%d", tenantID, i)
   253  				builder := testhelper.NewProfileBuilder(int64(1)).
   254  					CPUProfile().
   255  					WithLabels(
   256  						"job", "test",
   257  						"service_name", serviceName,
   258  					)
   259  				builder.ForStacktraceString("foo", "bar", "baz").AddSamples(1)
   260  				for j := 0; j < params.samples; j++ {
   261  					builder.TimeNanos = tc.now.Add(time.Duration(j) * 5 * time.Second).UnixNano()
   262  					if (i+j)%3 == 0 {
   263  						builder.ForStacktraceString("foo", "bar", "boz").AddSamples(3)
   264  					}
   265  				}
   266  
   267  				rawProfile, err := builder.MarshalVT()
   268  				require.NoError(t, err)
   269  
   270  				_, err = tc.pusher.Push(gctx, connect.NewRequest(&pushv1.PushRequest{
   271  					Series: []*pushv1.RawProfileSeries{{
   272  						Labels:  builder.Labels,
   273  						Samples: []*pushv1.RawSample{{RawProfile: rawProfile}},
   274  					}},
   275  				}))
   276  				return err
   277  			})
   278  		}
   279  	}
   280  	require.NoError(t, g.Wait())
   281  
   282  }
   283  
// runQueryTest exercises the read path for every tenant in tc.perTenantData:
// Series, LabelNames, ProfileTypes (with and without a time range),
// LabelValues and SelectMergeProfile. All expected values mirror exactly what
// pushProfiles ingested, so the two methods must be kept in sync.
func (tc *testCtx) runQueryTest(ctx context.Context, t *testing.T) {
	// The experimental v2 storage layer is detected via the test name suffix
	// (e.g. TestMicroServicesIntegrationV2); some assertions are skipped for it.
	isV2 := strings.HasSuffix(t.Name(), "V2")
	t.Run("QuerySeries", func(t *testing.T) {
		for tenantID, params := range tc.perTenantData {
			t.Run(tenantID, func(t *testing.T) {
				ctx := tenant.InjectTenantID(ctx, tenantID)
				resp, err := tc.querier.Series(ctx, connect.NewRequest(&querierv1.SeriesRequest{
					Start:      tc.now.Add(-time.Hour).UnixMilli(),
					End:        tc.now.Add(time.Hour).UnixMilli(),
					LabelNames: []string{"__profile_type__", "service_name"},
				}))
				require.NoError(t, err)
				require.Len(t, resp.Msg.LabelsSet, params.serviceCount)

				// no services to check
				if params.serviceCount == 0 {
					return
				}

				// Build the expected label sets: one per pushed service, all
				// with the CPU profile type.
				expectedValues := make([]*typesv1.Labels, params.serviceCount)
				for i := 0; i < params.serviceCount; i++ {
					// check if the service name is in the response
					expectedValues[i] = &typesv1.Labels{
						Labels: []*typesv1.LabelPair{
							{
								Name:  "__profile_type__",
								Value: "process_cpu:cpu:nanoseconds:cpu:nanoseconds",
							},
							{
								Name:  "service_name",
								Value: fmt.Sprintf("%s/test-service-%d", tenantID, i),
							},
						},
					}
				}

				// sort the response by service name (Labels[1] is service_name
				// in both slices) so the comparison is order-independent
				sort.Slice(resp.Msg.LabelsSet, func(i, j int) bool {
					return resp.Msg.LabelsSet[i].Labels[1].Value < resp.Msg.LabelsSet[j].Labels[1].Value
				})
				sort.Slice(expectedValues, func(i, j int) bool {
					return expectedValues[i].Labels[1].Value < expectedValues[j].Labels[1].Value
				})
				assert.Equal(t, expectedValues, resp.Msg.LabelsSet)
			})
		}
	})
	t.Run("QueryLabelNames", func(t *testing.T) {
		for tenantID, params := range tc.perTenantData {
			t.Run(tenantID, func(t *testing.T) {
				ctx := tenant.InjectTenantID(ctx, tenantID)
				resp, err := tc.querier.LabelNames(ctx, connect.NewRequest(&typesv1.LabelNamesRequest{
					Start: tc.now.Add(-time.Hour).UnixMilli(),
					End:   tc.now.Add(time.Hour).UnixMilli(),
				}))
				require.NoError(t, err)

				// no services, no label names
				if params.serviceCount == 0 {
					assert.Len(t, resp.Msg.Names, 0)
					return
				}

				// The complete, sorted set of label names resulting from the
				// profiles pushed by pushProfiles.
				assert.Equal(t, []string{
					"__name__",
					"__period_type__",
					"__period_unit__",
					"__profile_type__",
					"__service_name__",
					"__type__",
					"__unit__",
					"job",
					"service_name",
				}, resp.Msg.Names)
			})
		}
	})

	// validateProfileTypes asserts that exactly the CPU profile type is
	// reported for tenants with data, and none for empty tenants. Shared by
	// the with/without time range sub-tests below.
	validateProfileTypes := func(t *testing.T, serviceCount int, resp *querierv1.ProfileTypesResponse) {
		// no services, no profile types
		if serviceCount == 0 {
			assert.Len(t, resp.ProfileTypes, 0)
			return
		}

		profileTypes := make([]string, 0, len(resp.ProfileTypes))
		for _, pt := range resp.ProfileTypes {
			profileTypes = append(profileTypes, pt.ID)
		}
		assert.Equal(t, []string{
			"process_cpu:cpu:nanoseconds:cpu:nanoseconds",
		}, profileTypes)
	}

	t.Run("QueryProfileTypesWithTimeRange", func(t *testing.T) {
		for tenantID, params := range tc.perTenantData {
			t.Run(tenantID, func(t *testing.T) {
				ctx := tenant.InjectTenantID(ctx, tenantID)

				// Query profile types with time range
				resp, err := tc.querier.ProfileTypes(ctx, connect.NewRequest(&querierv1.ProfileTypesRequest{
					Start: tc.now.Add(-time.Hour).UnixMilli(),
					End:   tc.now.Add(time.Hour).UnixMilli(),
				}))
				require.NoError(t, err)

				validateProfileTypes(t, params.serviceCount, resp.Msg)
			})
		}
	})

	// Note: Some ProfileTypes API clients rely on the ability to call it without start/end.
	// See https://github.com/grafana/grafana/issues/110211
	t.Run("QueryProfileTypesWithoutTimeRange", func(t *testing.T) {
		for tenantID, params := range tc.perTenantData {
			t.Run(tenantID, func(t *testing.T) {
				ctx := tenant.InjectTenantID(ctx, tenantID)

				// Query profile types without a time range
				resp, err := tc.querier.ProfileTypes(ctx, connect.NewRequest(&querierv1.ProfileTypesRequest{}))
				require.NoError(t, err)

				validateProfileTypes(t, params.serviceCount, resp.Msg)
			})
		}
	})

	t.Run("QueryLabelValues", func(t *testing.T) {
		for tenantID, params := range tc.perTenantData {
			t.Run(tenantID, func(t *testing.T) {
				ctx := tenant.InjectTenantID(ctx, tenantID)
				resp, err := tc.querier.LabelValues(ctx, connect.NewRequest(&typesv1.LabelValuesRequest{
					Start: tc.now.Add(-time.Hour).UnixMilli(),
					End:   tc.now.Add(time.Hour).UnixMilli(),
					Name:  "service_name",
				}))
				require.NoError(t, err)

				// no services, no label values
				if params.serviceCount == 0 {
					assert.Len(t, resp.Msg.Names, 0)
					return
				}

				expectedValues := make([]string, params.serviceCount)
				for i := 0; i < params.serviceCount; i++ {
					// check if the service name is in the response
					expectedValues[i] = fmt.Sprintf("%s/test-service-%d", tenantID, i)
				}
				// Sort expectations lexicographically; the response is
				// expected to already be sorted.
				sort.Strings(expectedValues)
				assert.Equal(t, expectedValues, resp.Msg.Names)
			})
		}
	})

	t.Run("QuerySelectMergeProfile", func(t *testing.T) {
		for tenantID, params := range tc.perTenantData {
			t.Run(tenantID, func(t *testing.T) {
				ctx := tenant.InjectTenantID(ctx, tenantID)
				// Merge all CPU profiles of the tenant over the full window.
				req := &querierv1.SelectMergeProfileRequest{
					ProfileTypeID: "process_cpu:cpu:nanoseconds:cpu:nanoseconds",
					LabelSelector: "{}",
					Start:         tc.now.Add(-time.Hour).UnixMilli(),
					End:           tc.now.Add(time.Hour).UnixMilli(),
				}
				resp, err := tc.querier.SelectMergeProfile(ctx, connect.NewRequest(req))
				require.NoError(t, err)

				// no services, no samples profile
				if params.serviceCount == 0 {
					return
				}

				// The 2h query window in nanoseconds.
				// TODO: Experimental storage layer v2 doesn't support DurationNanos yet
				// https://github.com/grafana/pyroscope/issues/4192
				if !isV2 {
					assert.Equal(t, int64(7200000000000), resp.Msg.DurationNanos, "DurationNanos")
				}

				// End is in milliseconds; *1e6 converts to nanoseconds.
				assert.Equal(t, req.End*1e6, resp.Msg.TimeNanos, "TimeNanos")

				// Indices refer to StringTable below: 6 = "cpu", 5 = "nanoseconds".
				assert.Equal(t,
					[]*profilev1.ValueType{
						{Type: 6, Unit: 5},
					}, resp.Msg.SampleType, "SampleType",
				)

				// boz samples: replays the (i+j)%3 == 0 condition used by
				// pushProfiles, 3 samples per match.
				bozSamples := 0
				for i := 0; i < params.serviceCount; i++ {
					for j := 0; j < params.samples; j++ {
						if (i+j)%3 == 0 {
							bozSamples += 3
						}
					}
				}

				// Two merged stacks: foo>bar>baz (1 per service) and
				// foo>bar>boz (bozSamples total).
				assert.Equal(t,
					[]*profilev1.Sample{
						{LocationId: []uint64{1, 2, 3}, Value: []int64{int64(params.serviceCount)}},
						{LocationId: []uint64{1, 2, 4}, Value: []int64{int64(bozSamples)}},
					}, resp.Msg.Sample, "Samples",
				)
				assert.Equal(t,
					[]*profilev1.Mapping{
						{Id: 1, HasFunctions: true},
					}, resp.Msg.Mapping, "Mappings",
				)
				// One location per function: foo, bar, baz, boz.
				assert.Equal(t,
					[]*profilev1.Location{
						{Id: 1, MappingId: 1, Line: []*profilev1.Line{{FunctionId: 1}}},
						{Id: 2, MappingId: 1, Line: []*profilev1.Line{{FunctionId: 2}}},
						{Id: 3, MappingId: 1, Line: []*profilev1.Line{{FunctionId: 3}}},
						{Id: 4, MappingId: 1, Line: []*profilev1.Line{{FunctionId: 4}}},
					}, resp.Msg.Location, "Locations",
				)
				// Function names are indices into StringTable (1="foo", ...).
				assert.Equal(t,
					[]*profilev1.Function{
						{Id: 1, Name: 1},
						{Id: 2, Name: 2},
						{Id: 3, Name: 3},
						{Id: 4, Name: 4},
					}, resp.Msg.Function, "Functions",
				)
				assert.Equal(t,
					[]string{"", "foo", "bar", "baz", "boz", "nanoseconds", "cpu"},
					resp.Msg.StringTable,
				)
				assert.Equal(t,
					&profilev1.ValueType{Type: 6, Unit: 5},
					resp.Msg.PeriodType,
				)
			})
		}
	})
}