github.com/thanos-io/thanos@v0.32.5/test/e2e/receive_test.go (about) 1 // Copyright (c) The Thanos Authors. 2 // Licensed under the Apache License 2.0. 3 4 package e2e_test 5 6 import ( 7 "context" 8 "fmt" 9 "log" 10 "net/http" 11 "net/http/httputil" 12 "testing" 13 "time" 14 15 "github.com/efficientgo/core/backoff" 16 "github.com/efficientgo/e2e" 17 e2edb "github.com/efficientgo/e2e/db" 18 e2emon "github.com/efficientgo/e2e/monitoring" 19 "github.com/prometheus/common/model" 20 "github.com/prometheus/prometheus/model/relabel" 21 22 "github.com/efficientgo/core/testutil" 23 24 "github.com/thanos-io/thanos/pkg/promclient" 25 "github.com/thanos-io/thanos/pkg/receive" 26 "github.com/thanos-io/thanos/test/e2e/e2ethanos" 27 ) 28 29 type DebugTransport struct{} 30 31 func (DebugTransport) RoundTrip(r *http.Request) (*http.Response, error) { 32 _, err := httputil.DumpRequestOut(r, false) 33 if err != nil { 34 return nil, err 35 } 36 return http.DefaultTransport.RoundTrip(r) 37 } 38 39 func ErrorHandler(_ http.ResponseWriter, _ *http.Request, err error) { 40 log.Print("Response from receiver") 41 log.Print(err) 42 } 43 44 func TestReceive(t *testing.T) { 45 t.Parallel() 46 47 t.Run("single_ingestor", func(t *testing.T) { 48 /* 49 The single_ingestor suite represents the simplest possible configuration of Thanos Receive. 50 ┌──────────┐ 51 │ Prom │ 52 └────┬─────┘ 53 │ 54 ┌────▼─────┐ 55 │ Ingestor │ 56 └────┬─────┘ 57 │ 58 ┌────▼─────┐ 59 │ Query │ 60 └──────────┘ 61 NB: Made with asciiflow.com - you can copy & paste the above there to modify. 62 */ 63 64 t.Parallel() 65 e, err := e2e.NewDockerEnvironment("single-ingestor") 66 testutil.Ok(t, err) 67 t.Cleanup(e2ethanos.CleanScenario(t, e)) 68 69 // Setup Router Ingestor. 70 i := e2ethanos.NewReceiveBuilder(e, "ingestor").WithIngestionEnabled().Init() 71 testutil.Ok(t, e2e.StartAndWaitReady(i)) 72 73 // Setup Prometheus 74 prom := e2ethanos.NewPrometheus(e, "1", e2ethanos.DefaultPromConfig("prom1", 0, e2ethanos.RemoteWriteEndpoint(i.InternalEndpoint("remote-write")), "", e2ethanos.LocalPrometheusTarget), "", e2ethanos.DefaultPrometheusImage()) 75 testutil.Ok(t, e2e.StartAndWaitReady(prom)) 76 77 q := e2ethanos.NewQuerierBuilder(e, "1", i.InternalEndpoint("grpc")).Init() 78 testutil.Ok(t, e2e.StartAndWaitReady(q)) 79 80 ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute) 81 t.Cleanup(cancel) 82 83 testutil.Ok(t, q.WaitSumMetricsWithOptions(e2emon.Equals(1), []string{"thanos_store_nodes_grpc_connections"}, e2emon.WaitMissingMetrics())) 84 85 // We expect the data from each Prometheus instance to be replicated twice across our ingesting instances 86 queryAndAssertSeries(t, ctx, q.Endpoint("http"), e2ethanos.QueryUpWithoutInstance, time.Now, promclient.QueryOptions{ 87 Deduplicate: false, 88 }, []model.Metric{ 89 { 90 "job": "myself", 91 "prometheus": "prom1", 92 "receive": "receive-ingestor", 93 "replica": "0", 94 "tenant_id": "default-tenant", 95 }, 96 }) 97 }) 98 99 t.Run("ha_ingestor_with_ha_prom", func(t *testing.T) { 100 /* 101 The ha_ingestor_with_ha_prom suite represents a configuration of a 102 naive HA Thanos Receive with HA Prometheus. This is used to exercise 103 deduplication with external and "internal" TSDB block labels 104 105 ┌──────┐ ┌──────┬──────┐ 106 │ Prom │ │ │ Prom │ 107 └─┬────┴──┼────┐ └─────┬┘ 108 │ │ │ │ 109 │ │ │ │ 110 ┌─▼───────▼┐ ┌─▼───────▼┐ 111 │ Ingestor │ │ Ingestor │ 112 └───────┬──┘ └──┬───────┘ 113 │ │ 114 ┌▼───────▼┐ 115 │ Query │ 116 └─────────┘ 117 NB: Made with asciiflow.com - you can copy & paste the above there to modify. 118 */ 119 t.Parallel() 120 e, err := e2e.NewDockerEnvironment("haingest-haprom") 121 testutil.Ok(t, err) 122 t.Cleanup(e2ethanos.CleanScenario(t, e)) 123 124 // Setup Receives 125 r1 := e2ethanos.NewReceiveBuilder(e, "1").WithIngestionEnabled().Init() 126 r2 := e2ethanos.NewReceiveBuilder(e, "2").WithIngestionEnabled().Init() 127 r3 := e2ethanos.NewReceiveBuilder(e, "3").WithIngestionEnabled().Init() 128 129 testutil.Ok(t, e2e.StartAndWaitReady(r1, r2, r3)) 130 131 // These static metrics help reproduce issue https://github.com/thanos-io/thanos/issues/6257 in Receive. 132 metrics := []byte(` 133 # HELP test_metric A test metric 134 # TYPE test_metric counter 135 test_metric{a="1", b="1"} 1 136 test_metric{a="1", b="2"} 1 137 test_metric{a="2", b="1"} 1 138 test_metric{a="2", b="2"} 1`) 139 static := e2emon.NewStaticMetricsServer(e, "static", metrics) 140 testutil.Ok(t, e2e.StartAndWaitReady(static)) 141 142 // Setup Prometheus 143 prom := e2ethanos.NewPrometheus(e, "1", e2ethanos.DefaultPromConfig( 144 "prom1", 145 0, 146 e2ethanos.RemoteWriteEndpoints(r1.InternalEndpoint("remote-write"), r2.InternalEndpoint("remote-write")), 147 "", 148 e2ethanos.LocalPrometheusTarget, 149 ), "", e2ethanos.DefaultPrometheusImage()) 150 prom2 := e2ethanos.NewPrometheus(e, "2", e2ethanos.DefaultPromConfig( 151 "prom1", 152 1, 153 e2ethanos.RemoteWriteEndpoints(r1.InternalEndpoint("remote-write"), r2.InternalEndpoint("remote-write")), 154 "", 155 e2ethanos.LocalPrometheusTarget, 156 ), "", e2ethanos.DefaultPrometheusImage()) 157 promStatic := e2ethanos.NewPrometheus(e, "3", e2ethanos.DefaultPromConfig( 158 "prom-static", 159 0, 160 e2ethanos.RemoteWriteEndpoints(r3.InternalEndpoint("remote-write")), 161 "", 162 static.InternalEndpoint("http"), 163 ), "", e2ethanos.DefaultPrometheusImage()) 164 testutil.Ok(t, e2e.StartAndWaitReady(prom, prom2, promStatic)) 165 166 // The "replica" label is added to the Prometheus instances via the e2ethanos.DefaultPromConfig. It is a block label. 167 // The "receive" label is added to the Receive instances via the e2ethanos.NewReceiveBuilder. It is an external label. 168 // Here we setup 3 queriers, one for each possible combination of replica label: internal, external, and both. 169 q1 := e2ethanos.NewQuerierBuilder(e, "1", r1.InternalEndpoint("grpc"), r2.InternalEndpoint("grpc")). 170 WithReplicaLabels("replica", "receive"). 171 Init() 172 q2 := e2ethanos.NewQuerierBuilder(e, "2", r1.InternalEndpoint("grpc"), r2.InternalEndpoint("grpc")). 173 WithReplicaLabels("replica"). 174 Init() 175 q3 := e2ethanos.NewQuerierBuilder(e, "3", r1.InternalEndpoint("grpc"), r2.InternalEndpoint("grpc")). 176 WithReplicaLabels("receive"). 177 Init() 178 qStatic := e2ethanos.NewQuerierBuilder(e, "static", r3.InternalEndpoint("grpc")). 179 WithReplicaLabels("a"). 180 Init() 181 testutil.Ok(t, e2e.StartAndWaitReady(q1, q2, q3, qStatic)) 182 183 ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute) 184 t.Cleanup(cancel) 185 186 testutil.Ok(t, q1.WaitSumMetricsWithOptions(e2emon.Equals(2), []string{"thanos_store_nodes_grpc_connections"}, e2emon.WaitMissingMetrics())) 187 188 // We expect the data from each Prometheus instance to be replicated 4 times across our ingesting instances. 189 // So 2 results when using both replica labels, one for each Prometheus instance. 190 queryAndAssertSeries(t, ctx, q1.Endpoint("http"), e2ethanos.QueryUpWithoutInstance, time.Now, promclient.QueryOptions{ 191 Deduplicate: true, 192 }, []model.Metric{ 193 { 194 "job": "myself", 195 "prometheus": "prom1", 196 "tenant_id": "default-tenant", 197 }, 198 }) 199 200 // We expect 2 results when using only the "replica" label, which is an internal label when coming from 201 // Prometheus via remote write. 202 queryAndAssertSeries(t, ctx, q2.Endpoint("http"), e2ethanos.QueryUpWithoutInstance, time.Now, promclient.QueryOptions{ 203 Deduplicate: true, 204 }, []model.Metric{ 205 { 206 "job": "myself", 207 "prometheus": "prom1", 208 "receive": "receive-1", 209 "tenant_id": "default-tenant", 210 }, 211 { 212 "job": "myself", 213 "prometheus": "prom1", 214 "receive": "receive-2", 215 "tenant_id": "default-tenant", 216 }, 217 }) 218 219 // We expect 2 results when using only the "receive" label, which is an external label added by the Receives. 220 queryAndAssertSeries(t, ctx, q3.Endpoint("http"), e2ethanos.QueryUpWithoutInstance, time.Now, promclient.QueryOptions{ 221 Deduplicate: true, 222 }, []model.Metric{ 223 { 224 "job": "myself", 225 "prometheus": "prom1", 226 "replica": "0", 227 "tenant_id": "default-tenant", 228 }, 229 { 230 "job": "myself", 231 "prometheus": "prom1", 232 "replica": "1", 233 "tenant_id": "default-tenant", 234 }, 235 }) 236 237 // This is a regression test for the bug outlined in https://github.com/thanos-io/thanos/issues/6257. 238 instantQuery(t, ctx, qStatic.Endpoint("http"), func() string { 239 return "test_metric" 240 }, time.Now, promclient.QueryOptions{ 241 Deduplicate: true, 242 }, 2) 243 }) 244 245 t.Run("router_replication", func(t *testing.T) { 246 /* 247 The router_replication suite configures separate routing and ingesting components. 248 It verifies that data ingested from Prometheus instances through the router is successfully replicated twice 249 across the ingestors. 250 251 ┌───────┐ ┌───────┐ ┌───────┐ 252 │ │ │ │ │ │ 253 │ Prom1 │ │ Prom2 │ │ Prom3 │ 254 │ │ │ │ │ │ 255 └───┬───┘ └───┬───┘ └──┬────┘ 256 │ ┌───▼────┐ │ 257 └───────────► ◄────────┘ 258 │ Router │ 259 ┌───────────┤ ├──────────┐ 260 │ └───┬────┘ │ 261 ┌─────▼─────┐ ┌─────▼─────┐ ┌─────▼─────┐ 262 │ │ │ │ │ │ 263 │ Ingestor1 │ │ Ingestor2 │ │ Ingestor3 │ 264 │ │ │ │ │ │ 265 └─────┬─────┘ └─────┬─────┘ └─────┬─────┘ 266 │ ┌───▼───┐ │ 267 │ │ │ │ 268 └───────────► Query ◄───────────┘ 269 │ │ 270 └───────┘ 271 272 NB: Made with asciiflow.com - you can copy & paste the above there to modify. 273 */ 274 275 t.Parallel() 276 e, err := e2e.NewDockerEnvironment("routerReplica") 277 testutil.Ok(t, err) 278 t.Cleanup(e2ethanos.CleanScenario(t, e)) 279 280 // Setup 3 ingestors. 281 i1 := e2ethanos.NewReceiveBuilder(e, "i1").WithIngestionEnabled().Init() 282 i2 := e2ethanos.NewReceiveBuilder(e, "i2").WithIngestionEnabled().Init() 283 i3 := e2ethanos.NewReceiveBuilder(e, "i3").WithIngestionEnabled().Init() 284 285 h := receive.HashringConfig{ 286 Endpoints: []receive.Endpoint{ 287 {Address: i1.InternalEndpoint("grpc")}, 288 {Address: i2.InternalEndpoint("grpc")}, 289 {Address: i3.InternalEndpoint("grpc")}, 290 }, 291 } 292 293 // Setup 1 distributor with double replication 294 r1 := e2ethanos.NewReceiveBuilder(e, "r1").WithRouting(2, h).Init() 295 testutil.Ok(t, e2e.StartAndWaitReady(i1, i2, i3, r1)) 296 297 prom1 := e2ethanos.NewPrometheus(e, "1", e2ethanos.DefaultPromConfig("prom1", 0, e2ethanos.RemoteWriteEndpoint(r1.InternalEndpoint("remote-write")), "", e2ethanos.LocalPrometheusTarget), "", e2ethanos.DefaultPrometheusImage()) 298 prom2 := e2ethanos.NewPrometheus(e, "2", e2ethanos.DefaultPromConfig("prom2", 0, e2ethanos.RemoteWriteEndpoint(r1.InternalEndpoint("remote-write")), "", e2ethanos.LocalPrometheusTarget), "", e2ethanos.DefaultPrometheusImage()) 299 prom3 := e2ethanos.NewPrometheus(e, "3", e2ethanos.DefaultPromConfig("prom3", 0, e2ethanos.RemoteWriteEndpoint(r1.InternalEndpoint("remote-write")), "", e2ethanos.LocalPrometheusTarget), "", e2ethanos.DefaultPrometheusImage()) 300 testutil.Ok(t, e2e.StartAndWaitReady(prom1, prom2, prom3)) 301 302 q := e2ethanos.NewQuerierBuilder(e, "1", i1.InternalEndpoint("grpc"), i2.InternalEndpoint("grpc"), i3.InternalEndpoint("grpc")).Init() 303 testutil.Ok(t, e2e.StartAndWaitReady(q)) 304 305 ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute) 306 t.Cleanup(cancel) 307 308 testutil.Ok(t, q.WaitSumMetricsWithOptions(e2emon.Equals(3), []string{"thanos_store_nodes_grpc_connections"}, e2emon.WaitMissingMetrics())) 309 310 expectedReplicationFactor := 2.0 311 312 queryAndAssert(t, ctx, q.Endpoint("http"), func() string { return "count(up) by (prometheus)" }, time.Now, promclient.QueryOptions{ 313 Deduplicate: false, 314 }, model.Vector{ 315 &model.Sample{ 316 Metric: model.Metric{ 317 "prometheus": "prom1", 318 }, 319 Value: model.SampleValue(expectedReplicationFactor), 320 }, 321 &model.Sample{ 322 Metric: model.Metric{ 323 "prometheus": "prom2", 324 }, 325 Value: model.SampleValue(expectedReplicationFactor), 326 }, 327 &model.Sample{ 328 Metric: model.Metric{ 329 "prometheus": "prom3", 330 }, 331 Value: model.SampleValue(expectedReplicationFactor), 332 }, 333 }) 334 }) 335 336 t.Run("routing_tree", func(t *testing.T) { 337 /* 338 The routing_tree suite configures a valid and plausible, but non-trivial topology of receiver components. 339 Crucially, the first router routes to both a routing component, and a receiving component. This demonstrates 340 Receiver's ability to handle arbitrary depth receiving trees. 341 342 Router1 is configured to duplicate data twice, once to Ingestor1, and once to Router2, 343 Router2 is also configured to duplicate data twice, once to Ingestor2, and once to Ingestor3. 344 345 ┌───────┐ ┌───────┐ 346 │ │ │ │ 347 │ Prom1 ├──┐ ┌──┤ Prom2 │ 348 │ │ │ │ │ │ 349 └───────┘ │ │ └───────┘ 350 ┌──▼───▼──┐ 351 │ │ 352 │ Router1 │ 353 ┌────┤ ├───────┐ 354 │ └─────────┘ │ 355 ┌───▼─────┐ ┌─────▼─────┐ 356 │ │ │ │ 357 │ Router2 │ │ Ingestor1 │ 358 ┌───┤ ├───┐ │ │ 359 │ └─────────┘ │ └─────┬─────┘ 360 ┌─────▼─────┐ ┌────▼──────┐ │ 361 │ │ │ │ │ 362 │ Ingestor2 │ │ Ingestor3 │ │ 363 │ │ │ │ │ 364 └─────┬─────┘ └─────┬─────┘ │ 365 │ ┌────▼────┐ │ 366 │ │ │ │ 367 └─────────────► Query ◄──────┘ 368 │ │ 369 └─────────┘ 370 371 NB: Made with asciiflow.com - you can copy & paste the above there to modify. 372 */ 373 374 t.Parallel() 375 e, err := e2e.NewDockerEnvironment("routing-tree") 376 testutil.Ok(t, err) 377 t.Cleanup(e2ethanos.CleanScenario(t, e)) 378 379 // Setup ingestors. 380 i1 := e2ethanos.NewReceiveBuilder(e, "i1").WithIngestionEnabled().Init() 381 i2 := e2ethanos.NewReceiveBuilder(e, "i2").WithIngestionEnabled().Init() 382 i3 := e2ethanos.NewReceiveBuilder(e, "i3").WithIngestionEnabled().Init() 383 384 // Setup distributors 385 r2 := e2ethanos.NewReceiveBuilder(e, "r2").WithRouting(2, receive.HashringConfig{ 386 Endpoints: []receive.Endpoint{ 387 {Address: i2.InternalEndpoint("grpc")}, 388 {Address: i3.InternalEndpoint("grpc")}, 389 }, 390 }).Init() 391 r1 := e2ethanos.NewReceiveBuilder(e, "r1").WithRouting(2, receive.HashringConfig{ 392 Endpoints: []receive.Endpoint{ 393 {Address: i1.InternalEndpoint("grpc")}, 394 {Address: r2.InternalEndpoint("grpc")}, 395 }, 396 }).Init() 397 testutil.Ok(t, e2e.StartAndWaitReady(i1, i2, i3, r1, r2)) 398 399 // Setup Prometheus. 400 prom1 := e2ethanos.NewPrometheus(e, "1", e2ethanos.DefaultPromConfig("prom1", 0, e2ethanos.RemoteWriteEndpoint(r1.InternalEndpoint("remote-write")), "", e2ethanos.LocalPrometheusTarget), "", e2ethanos.DefaultPrometheusImage()) 401 prom2 := e2ethanos.NewPrometheus(e, "2", e2ethanos.DefaultPromConfig("prom2", 0, e2ethanos.RemoteWriteEndpoint(r1.InternalEndpoint("remote-write")), "", e2ethanos.LocalPrometheusTarget), "", e2ethanos.DefaultPrometheusImage()) 402 testutil.Ok(t, e2e.StartAndWaitReady(prom1, prom2)) 403 404 //Setup Querier 405 q := e2ethanos.NewQuerierBuilder(e, "1", i1.InternalEndpoint("grpc"), i2.InternalEndpoint("grpc"), i3.InternalEndpoint("grpc")).Init() 406 testutil.Ok(t, e2e.StartAndWaitReady(q)) 407 408 ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute) 409 t.Cleanup(cancel) 410 411 testutil.Ok(t, q.WaitSumMetricsWithOptions(e2emon.Equals(3), []string{"thanos_store_nodes_grpc_connections"}, e2emon.WaitMissingMetrics())) 412 413 expectedReplicationFactor := 3.0 414 415 queryAndAssert(t, ctx, q.Endpoint("http"), func() string { return "count(up) by (prometheus)" }, time.Now, promclient.QueryOptions{ 416 Deduplicate: false, 417 }, model.Vector{ 418 &model.Sample{ 419 Metric: model.Metric{ 420 "prometheus": "prom1", 421 }, 422 Value: model.SampleValue(expectedReplicationFactor), 423 }, 424 &model.Sample{ 425 Metric: model.Metric{ 426 "prometheus": "prom2", 427 }, 428 Value: model.SampleValue(expectedReplicationFactor), 429 }, 430 }) 431 }) 432 433 t.Run("hashring", func(t *testing.T) { 434 /* 435 The hashring suite creates three receivers, each with a Prometheus 436 remote-writing data to it. However, due to the hashing of the labels, 437 the time series from the Prometheus is forwarded to a different 438 receiver in the hashring than the one handling the request. 439 The querier queries all the receivers and the test verifies 440 the time series are forwarded to the correct receive node. 441 442 ┌───────┐ 443 │ │ 444 │ Prom2 │ 445 │ │ 446 └───┬───┘ 447 │ 448 │ 449 ┌────────┐ ┌─────▼─────┐ ┌───────┐ 450 │ │ │ │ │ │ 451 │ Prom1 │ │ Router │ │ Prom3 │ 452 │ │ │ Ingestor2 │ │ │ 453 └───┬────┘ │ │ └───┬───┘ 454 │ └──▲──┬──▲──┘ │ 455 │ │ │ │ │ 456 ┌────▼──────┐ │ │ │ ┌────▼──────┐ 457 │ ◄───────┘ │ └───────► │ 458 │ Router │ │ │ Router │ 459 │ Ingestor1 ◄──────────┼──────────► Ingestor3 │ 460 │ │ │ │ │ 461 └─────┬─────┘ │ └────┬──────┘ 462 │ │ │ 463 │ ┌───▼───┐ │ 464 │ │ │ │ 465 └────────────► Query ◄───────────┘ 466 │ │ 467 └───────┘ 468 */ 469 t.Parallel() 470 471 e, err := e2e.NewDockerEnvironment("hashring") 472 testutil.Ok(t, err) 473 t.Cleanup(e2ethanos.CleanScenario(t, e)) 474 475 r1 := e2ethanos.NewReceiveBuilder(e, "1").WithIngestionEnabled() 476 r2 := e2ethanos.NewReceiveBuilder(e, "2").WithIngestionEnabled() 477 r3 := e2ethanos.NewReceiveBuilder(e, "3").WithIngestionEnabled() 478 479 h := receive.HashringConfig{ 480 Endpoints: []receive.Endpoint{ 481 {Address: r1.InternalEndpoint("grpc")}, 482 {Address: r2.InternalEndpoint("grpc")}, 483 {Address: r3.InternalEndpoint("grpc")}, 484 }, 485 } 486 487 // Create with hashring config watcher. 488 r1Runnable := r1.WithRouting(1, h).Init() 489 r2Runnable := r2.WithRouting(1, h).Init() 490 r3Runnable := r3.WithRouting(1, h).Init() 491 testutil.Ok(t, e2e.StartAndWaitReady(r1Runnable, r2Runnable, r3Runnable)) 492 493 prom1 := e2ethanos.NewPrometheus(e, "1", e2ethanos.DefaultPromConfig("prom1", 0, e2ethanos.RemoteWriteEndpoint(r1.InternalEndpoint("remote-write")), "", e2ethanos.LocalPrometheusTarget), "", e2ethanos.DefaultPrometheusImage()) 494 prom2 := e2ethanos.NewPrometheus(e, "2", e2ethanos.DefaultPromConfig("prom2", 0, e2ethanos.RemoteWriteEndpoint(r2.InternalEndpoint("remote-write")), "", e2ethanos.LocalPrometheusTarget), "", e2ethanos.DefaultPrometheusImage()) 495 prom3 := e2ethanos.NewPrometheus(e, "3", e2ethanos.DefaultPromConfig("prom3", 0, e2ethanos.RemoteWriteEndpoint(r3.InternalEndpoint("remote-write")), "", e2ethanos.LocalPrometheusTarget), "", e2ethanos.DefaultPrometheusImage()) 496 testutil.Ok(t, err) 497 testutil.Ok(t, e2e.StartAndWaitReady(prom1, prom2, prom3)) 498 499 q := e2ethanos.NewQuerierBuilder(e, "1", r1.InternalEndpoint("grpc"), r2.InternalEndpoint("grpc"), r3.InternalEndpoint("grpc")).Init() 500 testutil.Ok(t, err) 501 testutil.Ok(t, e2e.StartAndWaitReady(q)) 502 503 ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute) 504 t.Cleanup(cancel) 505 506 testutil.Ok(t, q.WaitSumMetricsWithOptions(e2emon.Equals(3), []string{"thanos_store_nodes_grpc_connections"}, e2emon.WaitMissingMetrics())) 507 508 queryAndAssertSeries(t, ctx, q.Endpoint("http"), e2ethanos.QueryUpWithoutInstance, time.Now, promclient.QueryOptions{ 509 Deduplicate: false, 510 }, []model.Metric{ 511 { 512 "job": "myself", 513 "prometheus": "prom1", 514 "receive": "receive-2", 515 "replica": "0", 516 "tenant_id": "default-tenant", 517 }, 518 { 519 "job": "myself", 520 "prometheus": "prom2", 521 "receive": "receive-1", 522 "replica": "0", 523 "tenant_id": "default-tenant", 524 }, 525 { 526 "job": "myself", 527 "prometheus": "prom3", 528 "receive": "receive-2", 529 "replica": "0", 530 "tenant_id": "default-tenant", 531 }, 532 }) 533 }) 534 535 t.Run("replication", func(t *testing.T) { 536 t.Parallel() 537 538 e, err := e2e.NewDockerEnvironment("replication") 539 testutil.Ok(t, err) 540 t.Cleanup(e2ethanos.CleanScenario(t, e)) 541 542 // The replication suite creates three receivers but only one 543 // receives Prometheus remote-written data. The querier queries all 544 // receivers and the test verifies that the time series are 545 // replicated to all of the nodes. 546 547 r1 := e2ethanos.NewReceiveBuilder(e, "1").WithIngestionEnabled() 548 r2 := e2ethanos.NewReceiveBuilder(e, "2").WithIngestionEnabled() 549 r3 := e2ethanos.NewReceiveBuilder(e, "3").WithIngestionEnabled() 550 551 h := receive.HashringConfig{ 552 Endpoints: []receive.Endpoint{ 553 {Address: r1.InternalEndpoint("grpc")}, 554 {Address: r2.InternalEndpoint("grpc")}, 555 {Address: r3.InternalEndpoint("grpc")}, 556 }, 557 } 558 559 // Create with hashring config. 560 r1Runnable := r1.WithRouting(3, h).Init() 561 r2Runnable := r2.WithRouting(3, h).Init() 562 r3Runnable := r3.WithRouting(3, h).Init() 563 testutil.Ok(t, e2e.StartAndWaitReady(r1Runnable, r2Runnable, r3Runnable)) 564 565 prom1 := e2ethanos.NewPrometheus(e, "1", e2ethanos.DefaultPromConfig("prom1", 0, e2ethanos.RemoteWriteEndpoint(r1.InternalEndpoint("remote-write")), "", e2ethanos.LocalPrometheusTarget), "", e2ethanos.DefaultPrometheusImage()) 566 testutil.Ok(t, e2e.StartAndWaitReady(prom1)) 567 568 q := e2ethanos.NewQuerierBuilder(e, "1", r1.InternalEndpoint("grpc"), r2.InternalEndpoint("grpc"), r3.InternalEndpoint("grpc")).Init() 569 testutil.Ok(t, e2e.StartAndWaitReady(q)) 570 571 ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute) 572 t.Cleanup(cancel) 573 574 testutil.Ok(t, q.WaitSumMetricsWithOptions(e2emon.Equals(3), []string{"thanos_store_nodes_grpc_connections"}, e2emon.WaitMissingMetrics())) 575 576 queryAndAssertSeries(t, ctx, q.Endpoint("http"), e2ethanos.QueryUpWithoutInstance, time.Now, promclient.QueryOptions{ 577 Deduplicate: false, 578 }, []model.Metric{ 579 { 580 "job": "myself", 581 "prometheus": "prom1", 582 "receive": "receive-1", 583 "replica": "0", 584 "tenant_id": "default-tenant", 585 }, 586 { 587 "job": "myself", 588 "prometheus": "prom1", 589 "receive": "receive-2", 590 "replica": "0", 591 "tenant_id": "default-tenant", 592 }, 593 { 594 "job": "myself", 595 "prometheus": "prom1", 596 "receive": "receive-3", 597 "replica": "0", 598 "tenant_id": "default-tenant", 599 }, 600 }) 601 }) 602 603 t.Run("replication_with_outage", func(t *testing.T) { 604 t.Parallel() 605 606 e, err := e2e.NewDockerEnvironment("outage") 607 testutil.Ok(t, err) 608 t.Cleanup(e2ethanos.CleanScenario(t, e)) 609 610 // The replication suite creates a three-node hashring but one of the 611 // receivers is dead. In this case, replication should still 612 // succeed and the time series should be replicated to the other nodes. 613 614 r1 := e2ethanos.NewReceiveBuilder(e, "1").WithIngestionEnabled() 615 r2 := e2ethanos.NewReceiveBuilder(e, "2").WithIngestionEnabled() 616 r3 := e2ethanos.NewReceiveBuilder(e, "3").WithIngestionEnabled() 617 618 h := receive.HashringConfig{ 619 Endpoints: []receive.Endpoint{ 620 {Address: r1.InternalEndpoint("grpc")}, 621 {Address: r2.InternalEndpoint("grpc")}, 622 {Address: r3.InternalEndpoint("grpc")}, 623 }, 624 } 625 626 // Create with hashring config. 627 r1Runnable := r1.WithRouting(3, h).Init() 628 r2Runnable := r2.WithRouting(3, h).Init() 629 testutil.Ok(t, e2e.StartAndWaitReady(r1Runnable, r2Runnable)) 630 631 prom1 := e2ethanos.NewPrometheus(e, "1", e2ethanos.DefaultPromConfig("prom1", 0, e2ethanos.RemoteWriteEndpoint(r1.InternalEndpoint("remote-write")), "", e2ethanos.LocalPrometheusTarget), "", e2ethanos.DefaultPrometheusImage()) 632 testutil.Ok(t, e2e.StartAndWaitReady(prom1)) 633 634 q := e2ethanos.NewQuerierBuilder(e, "1", r1.InternalEndpoint("grpc"), r2.InternalEndpoint("grpc")).Init() 635 testutil.Ok(t, e2e.StartAndWaitReady(q)) 636 637 ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute) 638 t.Cleanup(cancel) 639 640 testutil.Ok(t, q.WaitSumMetricsWithOptions(e2emon.Equals(2), []string{"thanos_store_nodes_grpc_connections"}, e2emon.WaitMissingMetrics())) 641 642 queryAndAssertSeries(t, ctx, q.Endpoint("http"), e2ethanos.QueryUpWithoutInstance, time.Now, promclient.QueryOptions{ 643 Deduplicate: false, 644 }, []model.Metric{ 645 { 646 "job": "myself", 647 "prometheus": "prom1", 648 "receive": "receive-1", 649 "replica": "0", 650 "tenant_id": "default-tenant", 651 }, 652 { 653 "job": "myself", 654 "prometheus": "prom1", 655 "receive": "receive-2", 656 "replica": "0", 657 "tenant_id": "default-tenant", 658 }, 659 }) 660 }) 661 662 t.Run("multitenancy", func(t *testing.T) { 663 t.Parallel() 664 665 e, err := e2e.NewDockerEnvironment("multitenancy") 666 testutil.Ok(t, err) 667 t.Cleanup(e2ethanos.CleanScenario(t, e)) 668 669 r1 := e2ethanos.NewReceiveBuilder(e, "1").WithIngestionEnabled() 670 671 h := receive.HashringConfig{ 672 Endpoints: []receive.Endpoint{ 673 {Address: r1.InternalEndpoint("grpc")}, 674 }, 675 } 676 677 // Create with hashring config. 678 r1Runnable := r1.WithRouting(1, h).Init() 679 testutil.Ok(t, e2e.StartAndWaitReady(r1Runnable)) 680 681 rp1 := e2ethanos.NewReverseProxy(e, "1", "tenant-1", "http://"+r1.InternalEndpoint("remote-write")) 682 rp2 := e2ethanos.NewReverseProxy(e, "2", "tenant-2", "http://"+r1.InternalEndpoint("remote-write")) 683 testutil.Ok(t, e2e.StartAndWaitReady(rp1, rp2)) 684 685 prom1 := e2ethanos.NewPrometheus(e, "1", e2ethanos.DefaultPromConfig("prom1", 0, "http://"+rp1.InternalEndpoint("http")+"/api/v1/receive", "", e2ethanos.LocalPrometheusTarget), "", e2ethanos.DefaultPrometheusImage()) 686 prom2 := e2ethanos.NewPrometheus(e, "2", e2ethanos.DefaultPromConfig("prom2", 0, "http://"+rp2.InternalEndpoint("http")+"/api/v1/receive", "", e2ethanos.LocalPrometheusTarget), "", e2ethanos.DefaultPrometheusImage()) 687 testutil.Ok(t, e2e.StartAndWaitReady(prom1, prom2)) 688 689 q := e2ethanos.NewQuerierBuilder(e, "1", r1.InternalEndpoint("grpc")).Init() 690 testutil.Ok(t, e2e.StartAndWaitReady(q)) 691 ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute) 692 t.Cleanup(cancel) 693 694 testutil.Ok(t, q.WaitSumMetricsWithOptions(e2emon.Equals(1), []string{"thanos_store_nodes_grpc_connections"}, e2emon.WaitMissingMetrics())) 695 queryAndAssertSeries(t, ctx, q.Endpoint("http"), e2ethanos.QueryUpWithoutInstance, time.Now, promclient.QueryOptions{ 696 Deduplicate: false, 697 }, []model.Metric{ 698 { 699 "job": "myself", 700 "prometheus": "prom1", 701 "receive": "receive-1", 702 "replica": "0", 703 "tenant_id": "tenant-1", 704 }, 705 { 706 "job": "myself", 707 "prometheus": "prom2", 708 "receive": "receive-1", 709 "replica": "0", 710 "tenant_id": "tenant-2", 711 }, 712 }) 713 }) 714 715 t.Run("relabel", func(t *testing.T) { 716 t.Parallel() 717 e, err := e2e.NewDockerEnvironment("receive-relabel") 718 testutil.Ok(t, err) 719 t.Cleanup(e2ethanos.CleanScenario(t, e)) 720 721 // Setup Router Ingestor. 722 i := e2ethanos.NewReceiveBuilder(e, "ingestor"). 723 WithIngestionEnabled(). 724 WithRelabelConfigs([]*relabel.Config{ 725 { 726 Action: relabel.LabelDrop, 727 Regex: relabel.MustNewRegexp("prometheus"), 728 }, 729 }).Init() 730 731 testutil.Ok(t, e2e.StartAndWaitReady(i)) 732 733 // Setup Prometheus 734 prom := e2ethanos.NewPrometheus(e, "1", e2ethanos.DefaultPromConfig("prom1", 0, e2ethanos.RemoteWriteEndpoint(i.InternalEndpoint("remote-write")), "", e2ethanos.LocalPrometheusTarget), "", e2ethanos.DefaultPrometheusImage()) 735 testutil.Ok(t, e2e.StartAndWaitReady(prom)) 736 737 q := e2ethanos.NewQuerierBuilder(e, "1", i.InternalEndpoint("grpc")).Init() 738 testutil.Ok(t, e2e.StartAndWaitReady(q)) 739 740 ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute) 741 t.Cleanup(cancel) 742 743 testutil.Ok(t, q.WaitSumMetricsWithOptions(e2emon.Equals(1), []string{"thanos_store_nodes_grpc_connections"}, e2emon.WaitMissingMetrics())) 744 // Label `prometheus` should be dropped. 745 queryAndAssertSeries(t, ctx, q.Endpoint("http"), e2ethanos.QueryUpWithoutInstance, time.Now, promclient.QueryOptions{ 746 Deduplicate: false, 747 }, []model.Metric{ 748 { 749 "job": "myself", 750 "receive": "receive-ingestor", 751 "replica": "0", 752 "tenant_id": "default-tenant", 753 }, 754 }) 755 }) 756 757 t.Run("multitenant_active_series_limiting", func(t *testing.T) { 758 /* 759 The multitenant_active_series_limiting suite configures a hashring with 760 two avalanche writers and dedicated meta-monitoring. 761 762 ┌──────────┐ ┌──────────┐ 763 │ │ │ │ 764 │Avalanche │ │Avalanche │ 765 │ │ │ │ 766 │ │ │ │ 767 └──────────┴──────────┐ ┌──────────┴──────────┘ 768 │ │ 769 ┌─▼─────▼──┐ 770 │ │ 771 │Router ├────────────────► Meta-monitoring 772 │Ingestor │ 773 │ │ 774 └──▲─┬──▲──┘ 775 │ │ │ 776 ┌──────────┐ │ │ │ ┌──────────┐ 777 │ │ │ │ │ │ │ 778 │Router ◄───────┘ │ └────────►Router │ 779 │Ingestor │ │ │Ingestor │ 780 │ ◄─────────┼───────────► │ 781 └────┬─────┘ │ └────┬─────┘ 782 │ │ │ 783 │ ┌────▼─────┐ │ 784 │ │ │ │ 785 └──────────► Query ◄──────────┘ 786 │ │ 787 │ │ 788 └──────────┘ 789 790 NB: Made with asciiflow.com - you can copy & paste the above there to modify. 791 */ 792 793 t.Parallel() 794 e, err := e2e.NewDockerEnvironment("active-series") 795 testutil.Ok(t, err) 796 t.Cleanup(e2ethanos.CleanScenario(t, e)) 797 798 // This can be treated as the meta-monitoring service. 799 meta, err := e2emon.Start(e) 800 testutil.Ok(t, err) 801 802 // Setup 3 RouterIngestors with a limit of 10 active series. 803 ingestor1 := e2ethanos.NewReceiveBuilder(e, "i1").WithIngestionEnabled() 804 ingestor2 := e2ethanos.NewReceiveBuilder(e, "i2").WithIngestionEnabled() 805 ingestor3 := e2ethanos.NewReceiveBuilder(e, "i3").WithIngestionEnabled() 806 807 h := receive.HashringConfig{ 808 Endpoints: []receive.Endpoint{ 809 {Address: ingestor1.InternalEndpoint("grpc")}, 810 {Address: ingestor2.InternalEndpoint("grpc")}, 811 {Address: ingestor3.InternalEndpoint("grpc")}, 812 }, 813 } 814 815 tenantsLimits := receive.TenantsWriteLimitsConfig{ 816 "unlimited-tenant": receive.NewEmptyWriteLimitConfig().SetHeadSeriesLimit(0), 817 } 818 819 i1Runnable := ingestor1.WithRouting(1, h).WithValidationEnabled(10, "http://"+meta.GetMonitoringRunnable().InternalEndpoint(e2edb.AccessPortName), tenantsLimits).Init() 820 i2Runnable := ingestor2.WithRouting(1, h).WithValidationEnabled(10, "http://"+meta.GetMonitoringRunnable().InternalEndpoint(e2edb.AccessPortName), tenantsLimits).Init() 821 i3Runnable := ingestor3.WithRouting(1, h).WithValidationEnabled(10, "http://"+meta.GetMonitoringRunnable().InternalEndpoint(e2edb.AccessPortName), tenantsLimits).Init() 822 823 testutil.Ok(t, e2e.StartAndWaitReady(i1Runnable, i2Runnable, i3Runnable)) 824 825 querier := e2ethanos.NewQuerierBuilder(e, "1", ingestor1.InternalEndpoint("grpc"), ingestor2.InternalEndpoint("grpc"), ingestor3.InternalEndpoint("grpc")).Init() 826 testutil.Ok(t, e2e.StartAndWaitReady(querier)) 827 828 testutil.Ok(t, querier.WaitSumMetricsWithOptions(e2emon.Equals(3), []string{"thanos_store_nodes_grpc_connections"}, e2emon.WaitMissingMetrics())) 829 830 // We run three avalanches, one tenant which exceeds the limit, one tenant which remains under it, and one for the unlimited tenant. 831 832 // Avalanche in this configuration, would send 5 requests each with 10 new timeseries. 833 // One request always fails due to TSDB not being ready for new tenant. 834 // So without limiting we end up with 40 timeseries and 40 samples. 835 avalanche1 := e2ethanos.NewAvalanche(e, "avalanche-1", 836 e2ethanos.AvalancheOptions{ 837 MetricCount: "10", 838 SeriesCount: "1", 839 MetricInterval: "30", 840 SeriesInterval: "3600", 841 ValueInterval: "3600", 842 843 RemoteURL: e2ethanos.RemoteWriteEndpoint(ingestor1.InternalEndpoint("remote-write")), 844 RemoteWriteInterval: "30s", 845 RemoteBatchSize: "10", 846 RemoteRequestCount: "5", 847 848 TenantID: "exceed-tenant", 849 }) 850 851 // Avalanche in this configuration, would send 5 requests each with 5 of the same timeseries. 852 // One request always fails due to TSDB not being ready for new tenant. 853 // So we end up with 5 timeseries, 20 samples. 854 avalanche2 := e2ethanos.NewAvalanche(e, "avalanche-2", 855 e2ethanos.AvalancheOptions{ 856 MetricCount: "5", 857 SeriesCount: "1", 858 MetricInterval: "3600", 859 SeriesInterval: "3600", 860 ValueInterval: "3600", 861 862 RemoteURL: e2ethanos.RemoteWriteEndpoint(ingestor1.InternalEndpoint("remote-write")), 863 RemoteWriteInterval: "30s", 864 RemoteBatchSize: "5", 865 RemoteRequestCount: "5", 866 867 TenantID: "under-tenant", 868 }) 869 870 // Avalanche in this configuration, would send 5 requests each with 10 new timeseries. 871 // One request always fails due to TSDB not being ready for new tenant. 872 // So without limiting we end up with 40 timeseries and 40 samples. 873 avalanche3 := e2ethanos.NewAvalanche(e, "avalanche-3", 874 e2ethanos.AvalancheOptions{ 875 MetricCount: "10", 876 SeriesCount: "1", 877 MetricInterval: "30", 878 SeriesInterval: "3600", 879 ValueInterval: "3600", 880 881 RemoteURL: e2ethanos.RemoteWriteEndpoint(ingestor1.InternalEndpoint("remote-write")), 882 RemoteWriteInterval: "30s", 883 RemoteBatchSize: "10", 884 RemoteRequestCount: "5", 885 886 TenantID: "unlimited-tenant", 887 }) 888 889 testutil.Ok(t, e2e.StartAndWaitReady(avalanche1, avalanche2, avalanche3)) 890 891 // Here, 3/5 requests are failed due to limiting, as one request fails due to TSDB readiness and we ingest one initial request. 892 // 3 limited requests belong to the exceed-tenant. 893 testutil.Ok(t, i1Runnable.WaitSumMetricsWithOptions(e2emon.Equals(3), []string{"thanos_receive_head_series_limited_requests_total"}, e2emon.WithWaitBackoff(&backoff.Config{Min: 1 * time.Second, Max: 10 * time.Minute, MaxRetries: 200}), e2emon.WaitMissingMetrics())) 894 895 ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute) 896 t.Cleanup(cancel) 897 898 ingestor1Name := e.Name() + "-" + ingestor1.Name() 899 // Here for exceed-tenant we go above limit by 10, which results in 0 value. 900 queryWaitAndAssert(t, ctx, meta.GetMonitoringRunnable().Endpoint(e2edb.AccessPortName), func() string { 901 return fmt.Sprintf("sum(prometheus_tsdb_head_series{tenant=\"exceed-tenant\"}) - on() thanos_receive_head_series_limit{instance=\"%s:8080\", job=\"receive-i1\", tenant=\"\"}", ingestor1Name) 902 }, time.Now, promclient.QueryOptions{ 903 Deduplicate: true, 904 }, model.Vector{ 905 &model.Sample{ 906 Metric: model.Metric{}, 907 Value: model.SampleValue(0), 908 }, 909 }) 910 911 // For under-tenant we stay at -5, as we have only pushed 5 series. 912 queryWaitAndAssert(t, ctx, meta.GetMonitoringRunnable().Endpoint(e2edb.AccessPortName), func() string { 913 return fmt.Sprintf("sum(prometheus_tsdb_head_series{tenant=\"under-tenant\"}) - on() thanos_receive_head_series_limit{instance=\"%s:8080\", job=\"receive-i1\", tenant=\"\"}", ingestor1Name) 914 }, time.Now, promclient.QueryOptions{ 915 Deduplicate: true, 916 }, model.Vector{ 917 &model.Sample{ 918 Metric: model.Metric{}, 919 Value: model.SampleValue(-5), 920 }, 921 }) 922 923 // Query meta-monitoring solution to assert that only 10 timeseries have been ingested for exceed-tenant. 924 queryWaitAndAssert(t, ctx, meta.GetMonitoringRunnable().Endpoint(e2edb.AccessPortName), func() string { return "sum(prometheus_tsdb_head_series{tenant=\"exceed-tenant\"})" }, time.Now, promclient.QueryOptions{ 925 Deduplicate: true, 926 }, model.Vector{ 927 &model.Sample{ 928 Metric: model.Metric{}, 929 Value: model.SampleValue(10), 930 }, 931 }) 932 933 // Query meta-monitoring solution to assert that only 5 timeseries have been ingested for under-tenant. 934 queryWaitAndAssert(t, ctx, meta.GetMonitoringRunnable().Endpoint(e2edb.AccessPortName), func() string { return "sum(prometheus_tsdb_head_series{tenant=\"under-tenant\"})" }, time.Now, promclient.QueryOptions{ 935 Deduplicate: true, 936 }, model.Vector{ 937 &model.Sample{ 938 Metric: model.Metric{}, 939 Value: model.SampleValue(5), 940 }, 941 }) 942 943 // Query meta-monitoring solution to assert that we have ingested some number of timeseries. 944 // Avalanche sometimes misses some requests due to TSDB readiness etc. In this case, as the 945 // limit is set to `0` we just want to make sure some timeseries are ingested. 946 queryWaitAndAssert(t, ctx, meta.GetMonitoringRunnable().Endpoint(e2edb.AccessPortName), func() string { return "sum(prometheus_tsdb_head_series{tenant=\"unlimited-tenant\"}) >=bool 10" }, time.Now, promclient.QueryOptions{ 947 Deduplicate: true, 948 }, model.Vector{ 949 &model.Sample{ 950 Metric: model.Metric{}, 951 Value: model.SampleValue(1), 952 }, 953 }) 954 955 // Query meta-monitoring solution to assert that 3 requests were limited for exceed-tenant and none for under-tenant. 956 queryWaitAndAssert(t, ctx, meta.GetMonitoringRunnable().Endpoint(e2edb.AccessPortName), func() string { return "thanos_receive_head_series_limited_requests_total" }, time.Now, promclient.QueryOptions{ 957 Deduplicate: true, 958 }, model.Vector{ 959 &model.Sample{ 960 Metric: model.Metric{ 961 "__name__": "thanos_receive_head_series_limited_requests_total", 962 "instance": model.LabelValue(fmt.Sprintf("%s:8080", ingestor1Name)), 963 "job": "receive-i1", 964 "tenant": "exceed-tenant", 965 }, 966 Value: model.SampleValue(3), 967 }, 968 }) 969 }) 970 }