github.com/thanos-io/thanos@v0.32.5/test/e2e/compatibility_test.go (about)

     1  // Copyright (c) The Thanos Authors.
     2  // Licensed under the Apache License 2.0.
     3  
     4  package e2e_test
     5  
     6  import (
     7  	"bytes"
     8  	"fmt"
     9  	"io"
    10  	"net/http"
    11  	"net/url"
    12  	"os"
    13  	"path/filepath"
    14  	"testing"
    15  	"time"
    16  
    17  	"github.com/efficientgo/e2e"
    18  	e2edb "github.com/efficientgo/e2e/db"
    19  	e2emon "github.com/efficientgo/e2e/monitoring"
    20  	"github.com/efficientgo/e2e/monitoring/promconfig"
    21  	sdconfig "github.com/efficientgo/e2e/monitoring/promconfig/discovery/config"
    22  	"github.com/efficientgo/e2e/monitoring/promconfig/discovery/targetgroup"
    23  	e2eobs "github.com/efficientgo/e2e/observable"
    24  	common_cfg "github.com/prometheus/common/config"
    25  	"github.com/prometheus/common/model"
    26  	"github.com/prometheus/prometheus/config"
    27  
    28  	"github.com/efficientgo/core/testutil"
    29  	"github.com/thanos-io/thanos/pkg/alert"
    30  	"github.com/thanos-io/thanos/pkg/httpconfig"
    31  	"github.com/thanos-io/thanos/pkg/queryfrontend"
    32  	"github.com/thanos-io/thanos/pkg/store"
    33  	"github.com/thanos-io/thanos/test/e2e/e2ethanos"
    34  )
    35  
    36  // TestPromQLCompliance tests PromQL compatibility against https://github.com/prometheus/compliance/tree/main/promql.
    37  // NOTE: This requires dockerization of compliance framework: https://github.com/prometheus/compliance/pull/46
    38  // Test requires at least ~11m, so run this with `-test.timeout 9999m`.
    39  func TestPromQLCompliance(t *testing.T) {
    40  	testPromQLCompliance(t, false, store.EagerRetrieval)
    41  }
    42  
    43  // TestPromQLComplianceWithLazy tests PromQL compatibility against https://github.com/prometheus/compliance/tree/main/promql.
    44  // NOTE: This requires dockerization of compliance framework: https://github.com/prometheus/compliance/pull/46
    45  // Test requires at least ~11m, so run this with `-test.timeout 9999m`.
    46  // This uses lazy evaluation to test out how it works in comparison to eager.
    47  func TestPromQLComplianceWithLazy(t *testing.T) {
    48  	testPromQLCompliance(t, false, store.LazyRetrieval)
    49  }
    50  
    51  // TestPromQLComplianceWithQueryFrontend tests PromQL compatibility with query frontend with sharding enabled.
    52  func TestPromQLComplianceWithShardingQueryFrontend(t *testing.T) {
    53  	testPromQLCompliance(t, true, store.EagerRetrieval)
    54  }
    55  
    56  func testPromQLCompliance(t *testing.T, queryFrontend bool, retrievalStrategy store.RetrievalStrategy) {
    57  	t.Skip("This is interactive test, it requires time to build up (scrape) the data. The data is also obtain from remote promlab servers.")
    58  
    59  	e, err := e2e.NewDockerEnvironment("compatibility")
    60  	testutil.Ok(t, err)
    61  	t.Cleanup(e.Close)
    62  
    63  	// Start receive + Querier.
    64  	receiverRunnable := e2ethanos.NewReceiveBuilder(e, "receive").WithIngestionEnabled().Init()
    65  	queryReceive := e2edb.NewThanosQuerier(e, "query_receive", []string{receiverRunnable.InternalEndpoint("grpc")})
    66  	testutil.Ok(t, e2e.StartAndWaitReady(receiverRunnable, queryReceive))
    67  
    68  	rwURL, err := url.Parse(e2ethanos.RemoteWriteEndpoint(receiverRunnable.InternalEndpoint("remote-write")))
    69  	testutil.Ok(t, err)
    70  	// Start reference Prometheus.
    71  	prom := e2edb.NewPrometheus(e, "prom")
    72  	testutil.Ok(t, prom.SetConfig(promconfig.Config{
    73  		GlobalConfig: promconfig.GlobalConfig{
    74  			EvaluationInterval: model.Duration(5 * time.Second),
    75  			ScrapeInterval:     model.Duration(5 * time.Second),
    76  			ExternalLabels: map[model.LabelName]model.LabelValue{
    77  				"prometheus": "1",
    78  			},
    79  		},
    80  		RemoteWriteConfigs: []*promconfig.RemoteWriteConfig{
    81  			{
    82  				URL: &common_cfg.URL{URL: rwURL},
    83  			},
    84  		},
    85  		ScrapeConfigs: []*promconfig.ScrapeConfig{
    86  			{
    87  				JobName: "demo",
    88  				ServiceDiscoveryConfig: sdconfig.ServiceDiscoveryConfig{
    89  					StaticConfigs: []*targetgroup.Group{
    90  						{
    91  							Source: "demo.promlabs.com:10000",
    92  						},
    93  						{
    94  							Source: "demo.promlabs.com:10001",
    95  						},
    96  						{
    97  							Source: "demo.promlabs.com:10002",
    98  						},
    99  					},
   100  				},
   101  			},
   102  		},
   103  	}))
   104  	testutil.Ok(t, e2e.StartAndWaitReady(prom))
   105  
   106  	// Start sidecar + Querier
   107  	sidecar := e2edb.NewThanosSidecar(e, "sidecar", prom, e2edb.WithImage("thanos"))
   108  	extraOpts := []e2edb.Option{e2edb.WithImage("thanos"), e2edb.WithFlagOverride(map[string]string{"--grpc.proxy-strategy": string(retrievalStrategy)})}
   109  	querySidecar := e2edb.NewThanosQuerier(e, "query_sidecar", []string{sidecar.InternalEndpoint("grpc")}, extraOpts...)
   110  	testutil.Ok(t, e2e.StartAndWaitReady(sidecar, querySidecar))
   111  
   112  	// Start noop promql-compliance-tester. See https://github.com/prometheus/compliance/tree/main/promql on how to build local docker image.
   113  	compliance := e.Runnable("promql-compliance-tester").Init(e2e.StartOptions{
   114  		Image:   "promql-compliance-tester:latest",
   115  		Command: e2e.NewCommandWithoutEntrypoint("tail", "-f", "/dev/null"),
   116  	})
   117  	testutil.Ok(t, e2e.StartAndWaitReady(compliance))
   118  
   119  	// Wait 10 minutes for Prometheus to scrape relevant data.
   120  	time.Sleep(10 * time.Minute)
   121  
   122  	t.Run("receive", func(t *testing.T) {
   123  		queryTargetRunnable := queryReceive
   124  		if queryFrontend {
   125  			qf := newQueryFrontendRunnable(e, "query_frontend_receive", queryReceive.InternalEndpoint("http"))
   126  			testutil.Ok(t, e2e.StartAndWaitReady(qf))
   127  			queryTargetRunnable = qf
   128  		}
   129  
   130  		testutil.Ok(t, os.WriteFile(filepath.Join(compliance.Dir(), "receive.yaml"),
   131  			[]byte(promQLCompatConfig(prom, queryTargetRunnable, []string{"prometheus", "receive", "tenant_id"})), os.ModePerm))
   132  
   133  		testutil.Ok(t, compliance.Exec(e2e.NewCommand(
   134  			"/promql-compliance-tester",
   135  			"-config-file", filepath.Join(compliance.InternalDir(), "receive.yaml"),
   136  			"-config-file", "/promql-test-queries.yml",
   137  		)))
   138  	})
   139  	t.Run("sidecar", func(t *testing.T) {
   140  		queryTargetRunnable := querySidecar
   141  		if queryFrontend {
   142  			qf := newQueryFrontendRunnable(e, "query_frontend_sidecar", queryReceive.InternalEndpoint("http"))
   143  			testutil.Ok(t, e2e.StartAndWaitReady(qf))
   144  			queryTargetRunnable = qf
   145  		}
   146  
   147  		testutil.Ok(t, os.WriteFile(filepath.Join(compliance.Dir(), "sidecar.yaml"),
   148  			[]byte(promQLCompatConfig(prom, queryTargetRunnable, []string{"prometheus"})), os.ModePerm))
   149  
   150  		testutil.Ok(t, compliance.Exec(e2e.NewCommand(
   151  			"/promql-compliance-tester",
   152  			"-config-file", filepath.Join(compliance.InternalDir(), "sidecar.yaml"),
   153  			"-config-file", "/promql-test-queries.yml",
   154  		)))
   155  	})
   156  }
   157  
   158  // nolint (it's still used in skipped test).
   159  func promQLCompatConfig(reference *e2emon.Prometheus, target e2e.Runnable, dropLabels []string) string {
   160  	return `reference_target_config:
   161    query_url: 'http://` + reference.InternalEndpoint("http") + `'
   162  
   163  test_target_config:
   164    query_url: 'http://` + target.InternalEndpoint("http") + `'
   165  
   166  query_tweaks:
   167    - note: 'Thanos requires adding "external_labels" to distinguish Prometheus servers, leading to extra labels in query results that need to be stripped before comparing results.'
   168      no_bug: true
   169      drop_result_labels:
   170  ` + func() (ret string) {
   171  		for _, l := range dropLabels {
   172  			ret += `      - ` + l + "\n"
   173  		}
   174  		return ret
   175  	}()
   176  }
   177  
// TestAlertCompliance tests Alert compatibility against https://github.com/prometheus/compliance/blob/main/alert_generator.
// NOTE: This requires a dockerization of compliance framework: https://github.com/prometheus/compliance/pull/46
//
// It has two subtests, "stateful ruler" (ruler initialised with InitTSDB) and "stateless ruler"
// (ruler initialised with InitStateless and a remote-write config pointing at the receiver).
// Each subtest creates its own Docker environment, copies the rules file out of the compliance
// container into the ruler, reloads the ruler and then runs the compliance tester binary.
// The test is skipped unconditionally; it is meant to be run interactively.
func TestAlertCompliance(t *testing.T) {
	t.Skip("This is an interactive test, using https://github.com/prometheus/compliance/tree/main/alert_generator. This tool is not optimized for CI runs (e.g. it infinitely retries, takes 38 minutes)")

	t.Run("stateful ruler", func(t *testing.T) {
		e, err := e2e.NewDockerEnvironment("alert-compat")
		testutil.Ok(t, err)
		t.Cleanup(e.Close)

		// Start receive + Querier.
		receive := e2ethanos.NewReceiveBuilder(e, "receive").WithIngestionEnabled().Init()
		rwEndpoint := e2ethanos.RemoteWriteEndpoint(receive.InternalEndpoint("remote-write"))
		// The querier is only initialised further down (it needs the ruler's gRPC endpoint),
		// but the builder's endpoint is already used for the ruler's query config.
		querierBuilder := e2ethanos.NewQuerierBuilder(e, "query")

		// The compliance tester container is used as the ruler's Alertmanager endpoint (port 8080).
		compliance := e.Runnable("alert_generator_compliance_tester").WithPorts(map[string]int{"http": 8080}).Init(e2e.StartOptions{
			Image:   "alert_generator_compliance_tester:latest",
			Command: e2e.NewCommandRunUntilStop(),
		})

		rFuture := e2ethanos.NewRulerBuilder(e, "1")
		ruler := rFuture.WithAlertManagerConfig([]alert.AlertmanagerConfig{
			{
				EndpointsConfig: httpconfig.EndpointsConfig{
					StaticAddresses: []string{compliance.InternalEndpoint("http")},
					Scheme:          "http",
				},
				Timeout:    amTimeout,
				APIVersion: alert.APIv1,
			},
		}).
			// Use default resend delay and eval interval, as the compliance spec requires this.
			WithResendDelay("1m").
			WithEvalInterval("1m").
			WithReplicaLabel("").
			InitTSDB(filepath.Join(rFuture.InternalDir(), "rules"), []httpconfig.Config{
				{
					EndpointsConfig: httpconfig.EndpointsConfig{
						StaticAddresses: []string{
							querierBuilder.InternalEndpoint("http"),
						},
						Scheme: "http",
					},
				},
			})

		query := querierBuilder.
			WithStoreAddresses(receive.InternalEndpoint("grpc"), ruler.InternalEndpoint("grpc")).
			// We deduplicate by this, since alert compatibility tool requires clean metric without labels
			// attached by receivers.
			WithReplicaLabels("receive", "tenant_id").
			Init()
		testutil.Ok(t, e2e.StartAndWaitReady(receive, query, ruler, compliance))

		// Pull rules.yaml:
		{
			// Read the rules file from the compliance container and place it in the ruler's
			// "rules" directory on the host side.
			var stdout bytes.Buffer
			testutil.Ok(t, compliance.Exec(e2e.NewCommand("cat", "/rules.yaml"), e2e.WithExecOptionStdout(&stdout)))
			testutil.Ok(t, os.MkdirAll(filepath.Join(ruler.Dir(), "rules"), os.ModePerm))
			testutil.Ok(t, os.WriteFile(filepath.Join(ruler.Dir(), "rules", "rules.yaml"), stdout.Bytes(), os.ModePerm))

			// Reload ruler.
			resp, err := http.Post("http://"+ruler.Endpoint("http")+"/-/reload", "", nil)
			testutil.Ok(t, err)
			// NOTE(review): this defer fires when the subtest function returns, not at the end
			// of this bare block, so the body stays open while the compliance tester runs.
			defer func() {
				_, _ = io.Copy(io.Discard, resp.Body)
				_ = resp.Body.Close()
			}()
			testutil.Equals(t, http.StatusOK, resp.StatusCode)
		}
		// Rules and alerts API is served by the ruler itself in the stateful setup.
		alertCompatCfg := alertCompatConfig(rwEndpoint, query.InternalEndpoint("http"), ruler.InternalEndpoint("http"))
		testutil.Ok(t, os.WriteFile(filepath.Join(compliance.Dir(), "test-thanos.yaml"), []byte(alertCompatCfg), os.ModePerm))

		// Print the generated config to make interactive debugging easier.
		fmt.Println(alertCompatCfg)

		testutil.Ok(t, compliance.Exec(e2e.NewCommand(
			"/alert_generator_compliance_tester", "-config-file", filepath.Join(compliance.InternalDir(), "test-thanos.yaml")),
		))
	})

	t.Run("stateless ruler", func(t *testing.T) {
		e, err := e2e.NewDockerEnvironment("alert-compat")
		testutil.Ok(t, err)
		t.Cleanup(e.Close)

		// Start receive + Querier.
		receive := e2ethanos.NewReceiveBuilder(e, "receive").WithIngestionEnabled().Init()
		rwEndpoint := e2ethanos.RemoteWriteEndpoint(receive.InternalEndpoint("remote-write"))
		rwURL := urlParse(t, rwEndpoint)
		rFuture := e2ethanos.NewRulerBuilder(e, "1")
		// Unlike the stateful subtest, the querier only gets the receiver as a store address.
		query := e2ethanos.NewQuerierBuilder(e, "query").
			WithStoreAddresses(receive.InternalEndpoint("grpc")).
			// We deduplicate by this, since alert compatibility tool requires clean metric without labels
			// attached by receivers.
			WithReplicaLabels("receive", "tenant_id").
			Init()

		// The compliance tester container is used as the ruler's Alertmanager endpoint (port 8080).
		compliance := e.Runnable("alert_generator_compliance_tester").WithPorts(map[string]int{"http": 8080}).Init(e2e.StartOptions{
			Image:   "alert_generator_compliance_tester:latest",
			Command: e2e.NewCommandRunUntilStop(),
		})

		ruler := rFuture.WithAlertManagerConfig([]alert.AlertmanagerConfig{
			{
				EndpointsConfig: httpconfig.EndpointsConfig{
					StaticAddresses: []string{compliance.InternalEndpoint("http")},
					Scheme:          "http",
				},
				Timeout:    amTimeout,
				APIVersion: alert.APIv1,
			},
		}).
			// Use default resend delay and eval interval, as the compliance spec requires this.
			WithResendDelay("1m").
			WithEvalInterval("1m").
			WithReplicaLabel("").
			WithRestoreIgnoredLabels("tenant_id").
			InitStateless(filepath.Join(rFuture.InternalDir(), "rules"), []httpconfig.Config{
				{
					EndpointsConfig: httpconfig.EndpointsConfig{
						StaticAddresses: []string{
							query.InternalEndpoint("http"),
						},
						Scheme: "http",
					},
				},
			}, []*config.RemoteWriteConfig{
				// The stateless ruler remote-writes to the receiver.
				{URL: &common_cfg.URL{URL: rwURL}, Name: "thanos-receiver"},
			})

		testutil.Ok(t, e2e.StartAndWaitReady(receive, query, ruler, compliance))

		// Pull rules.yaml:
		{
			// Read the rules file from the compliance container and place it in the ruler's
			// "rules" directory on the host side.
			var stdout bytes.Buffer
			testutil.Ok(t, compliance.Exec(e2e.NewCommand("cat", "/rules.yaml"), e2e.WithExecOptionStdout(&stdout)))
			testutil.Ok(t, os.MkdirAll(filepath.Join(ruler.Dir(), "rules"), os.ModePerm))
			testutil.Ok(t, os.WriteFile(filepath.Join(ruler.Dir(), "rules", "rules.yaml"), stdout.Bytes(), os.ModePerm))

			// Reload ruler.
			resp, err := http.Post("http://"+ruler.Endpoint("http")+"/-/reload", "", nil)
			testutil.Ok(t, err)
			// NOTE(review): this defer fires when the subtest function returns, not at the end
			// of this bare block, so the body stays open while the compliance tester runs.
			defer func() {
				_, _ = io.Copy(io.Discard, resp.Body)
				_ = resp.Body.Close()
			}()
			testutil.Equals(t, http.StatusOK, resp.StatusCode)
		}
		// NOTE(review): the rules/alerts API URL points at the querier here, while the stateful
		// subtest uses the ruler's endpoint — confirm this difference is intentional.
		alertCompatCfg := alertCompatConfig(rwEndpoint, query.InternalEndpoint("http"), query.InternalEndpoint("http"))
		testutil.Ok(t, os.WriteFile(filepath.Join(compliance.Dir(), "test-thanos.yaml"), []byte(alertCompatCfg), os.ModePerm))

		// Print the generated config to make interactive debugging easier.
		fmt.Println(alertCompatCfg)

		testutil.Ok(t, compliance.Exec(e2e.NewCommand(
			"/alert_generator_compliance_tester", "-config-file", filepath.Join(compliance.InternalDir(), "test-thanos.yaml")),
		))
	})
}
   336  
   337  // nolint (it's still used in skipped test).
   338  func alertCompatConfig(remoteWriteURL, queryURL, rulesURL string) string {
   339  	return fmt.Sprintf(`settings:
   340    remote_write_url: '%s'
   341    query_base_url: 'http://%s'
   342    rules_and_alerts_api_base_url: 'http://%s'
   343    alert_reception_server_port: 8080
   344    alert_message_parser: default
   345  `, remoteWriteURL, queryURL, rulesURL)
   346  }
   347  
   348  func newQueryFrontendRunnable(e e2e.Environment, name, downstreamURL string) *e2eobs.Observable {
   349  	inMemoryCacheConfig := queryfrontend.CacheProviderConfig{
   350  		Type: queryfrontend.INMEMORY,
   351  		Config: queryfrontend.InMemoryResponseCacheConfig{
   352  			MaxSizeItems: 1000,
   353  			Validity:     time.Hour,
   354  		},
   355  	}
   356  	config := queryfrontend.Config{
   357  		QueryRangeConfig: queryfrontend.QueryRangeConfig{
   358  			AlignRangeWithStep: false,
   359  		},
   360  		NumShards: 3,
   361  	}
   362  	return e2ethanos.NewQueryFrontend(e, name, downstreamURL, config, inMemoryCacheConfig)
   363  }