github.com/sequix/cortex@v1.1.6/pkg/chunk/aws/metrics_autoscaling_test.go

package aws

import (
	"context"
	"fmt"
	"testing"
	"time"

	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/api"
	promV1 "github.com/prometheus/client_golang/api/prometheus/v1"
	"github.com/prometheus/common/model"

	"github.com/sequix/cortex/pkg/chunk"
)

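// TestTableManagerMetricsAutoScaling drives the table manager through a sequence of
// scaling decisions, feeding it queue-length and write-throttle metrics via a mocked
// Prometheus API and checking the write capacities it provisions.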
func TestTableManagerMetricsAutoScaling(t *testing.T) {
	dynamoDB := newMockDynamoDB(0, 0)
	mockProm := mockPrometheus{}

	client := dynamoTableClient{
		DynamoDB: dynamoDB,
		autoscale: &metricsData{
			promAPI: &mockProm,
			cfg: MetricsAutoScalingConfig{
				TargetQueueLen: 100000,
				ScaleUpFactor:  1.2,
			},
			tableLastUpdated: make(map[string]time.Time),
		},
	}

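	// Write-scaling fixtures: chunk tables get a fifth of the index tables' capacity
	// range, and inactive tables keep a minimum write capacity of 5.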
	indexWriteScale := fixtureWriteScale()
	chunkWriteScale := fixtureWriteScale()
	chunkWriteScale.MaxCapacity /= 5
	chunkWriteScale.MinCapacity /= 5
	inactiveWriteScale := fixtureWriteScale()
	inactiveWriteScale.MinCapacity = 5

	// Set up table-manager config
	cfg := chunk.SchemaConfig{
		Configs: []chunk.PeriodConfig{
			{
				IndexType: "aws-dynamo",
				IndexTables: chunk.PeriodicTableConfig{
					Prefix: "a",
				},
			},
			{
				IndexType:   "aws-dynamo",
				IndexTables: fixturePeriodicTableConfig(tablePrefix),
				ChunkTables: fixturePeriodicTableConfig(chunkTablePrefix),
			},
		},
	}
	tbm := chunk.TableManagerConfig{
		CreationGracePeriod: gracePeriod,
		IndexTables:         fixtureProvisionConfig(2, indexWriteScale, inactiveWriteScale),
		ChunkTables:         fixtureProvisionConfig(2, chunkWriteScale, inactiveWriteScale),
	}

	tableManager, err := chunk.NewTableManager(tbm, cfg, maxChunkAge, client, nil)
	if err != nil {
		t.Fatal(err)
	}

	// Create tables
	startTime := time.Unix(0, 0).Add(maxChunkAge).Add(gracePeriod)

	test(t, client, tableManager, "Create tables",
		startTime,
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, read, write, read, write)...),
	)

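	// Each scenario below primes the mock with three queue-length samples and
	// per-table write-throttle rates (index/chunk pairs), then checks the write
	// capacities the table manager provisions in response.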
	mockProm.SetResponseForWrites(0, 100000, 100000, []int{0, 0}, []int{100, 20})
	test(t, client, tableManager, "Queues but no throttling",
		startTime.Add(time.Minute*10),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, read, write, read, write)...), // - remain flat
	)

	mockProm.SetResponseForWrites(0, 120000, 100000, []int{100, 200}, []int{100, 20})
	test(t, client, tableManager, "Shrinking queues",
		startTime.Add(time.Minute*20),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, read, write, read, write)...), // - remain flat
	)

	mockProm.SetResponseForWrites(0, 120000, 200000, []int{100, 0}, []int{100, 20})
	test(t, client, tableManager, "Building queues",
		startTime.Add(time.Minute*30),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, read, 240, read, write)...), // - scale up index table
	)

	mockProm.SetResponseForWrites(0, 5000000, 5000000, []int{1, 0}, []int{100, 20})
	test(t, client, tableManager, "Large queues, small throttling",
		startTime.Add(time.Minute*40),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, read, 250, read, write)...), // - scale up index table
	)

	mockProm.SetResponseForWrites(0, 0, 0, []int{0, 0}, []int{120, 40})
	test(t, client, tableManager, "No queues no throttling",
		startTime.Add(time.Minute*100),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, read, 150, read, 50)...), // - scale down both tables
	)

	mockProm.SetResponseForWrites(0, 0, 0, []int{0, 0}, []int{50, 10})
	test(t, client, tableManager, "in cooldown period",
		startTime.Add(time.Minute*101),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, read, 150, read, 50)...), // - no change; in cooldown period
	)

	mockProm.SetResponseForWrites(0, 0, 0, []int{0, 0}, []int{90, 10})
	test(t, client, tableManager, "No queues no throttling, cooldown over",
		startTime.Add(time.Minute*200),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, read, 112, read, 20)...), // - scale down both again
	)

	mockProm.SetResponseForWrites(0, 0, 0, []int{0, 0}, []int{50, 10})
	test(t, client, tableManager, "de minimis change",
		startTime.Add(time.Minute*220),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, read, 112, read, 20)...), // - should see no change
	)

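	// A full tablePeriod later: table 0 goes inactive and table 1 is created with
	// the default active-table capacities.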
	mockProm.SetResponseForWrites(0, 0, 0, []int{30, 30, 30, 30}, []int{50, 10, 100, 20})
	test(t, client, tableManager, "Next week",
		startTime.Add(tablePeriod),
		// Nothing much happening - expect table 0 write rates to stay as-is and table 1 to be created with defaults
		append(append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, inactiveRead, 112, inactiveRead, 20)...),
			staticTable(1, read, write, read, write)...),
	)

	// No throttling on last week's index table, still some on chunk table
	mockProm.SetResponseForWrites(0, 0, 0, []int{0, 30, 30, 30}, []int{10, 2, 100, 20})
	test(t, client, tableManager, "Next week plus a bit",
		startTime.Add(tablePeriod).Add(time.Minute*10),
		append(append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, inactiveRead, 12, inactiveRead, 20)...), // Scale back last week's index table
			staticTable(1, read, write, read, write)...),
	)

	// No throttling on last week's tables but some queueing
	mockProm.SetResponseForWrites(20000, 20000, 20000, []int{0, 0, 1, 1}, []int{0, 0, 100, 20})
	test(t, client, tableManager, "Next week plus a bit more",
		startTime.Add(tablePeriod).Add(time.Minute*20),
		append(append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, inactiveRead, 12, inactiveRead, 20)...), // no scaling back
			staticTable(1, read, write, read, write)...),
	)

	mockProm.SetResponseForWrites(120000, 130000, 140000, []int{0, 0, 1, 0}, []int{0, 0, 100, 20})
	test(t, client, tableManager, "next week, queues building, throttling on index table",
		startTime.Add(tablePeriod).Add(time.Minute*30),
		append(append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, inactiveRead, 12, inactiveRead, 20)...), // no scaling back
			staticTable(1, read, 240, read, write)...), // scale up index table
	)

	mockProm.SetResponseForWrites(140000, 130000, 120000, []int{0, 0, 1, 0}, []int{0, 0, 100, 20})
	test(t, client, tableManager, "next week, queues shrinking, throttling on index table",
		startTime.Add(tablePeriod).Add(time.Minute*40),
		append(append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, inactiveRead, 5, inactiveRead, 5)...), // scale right back
			staticTable(1, read, 240, read, 25)...), // scale chunk table to usage/80%
	)
}

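// TestTableManagerMetricsReadAutoScaling is the read-capacity counterpart: it feeds
// per-table read-usage and read-error metrics via the mocked Prometheus API and
// checks the read capacities the table manager provisions.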
func TestTableManagerMetricsReadAutoScaling(t *testing.T) {
	dynamoDB := newMockDynamoDB(0, 0)
	mockProm := mockPrometheus{}

	client := dynamoTableClient{
		DynamoDB: dynamoDB,
		autoscale: &metricsData{
			promAPI: &mockProm,
			cfg: MetricsAutoScalingConfig{
				TargetQueueLen: 100000,
				ScaleUpFactor:  1.2,
			},
			tableLastUpdated:     make(map[string]time.Time),
			tableReadLastUpdated: make(map[string]time.Time),
		},
	}

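	// Read-scaling fixtures: active index and chunk tables share the same settings,
	// and inactive tables keep a minimum read capacity of 5.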
	indexReadScale := fixtureReadScale()
	chunkReadScale := fixtureReadScale()
	inactiveReadScale := fixtureReadScale()
	inactiveReadScale.MinCapacity = 5

	// Set up table-manager config
	cfg := chunk.SchemaConfig{
		Configs: []chunk.PeriodConfig{
			{
				IndexType: "aws-dynamo",
				IndexTables: chunk.PeriodicTableConfig{
					Prefix: "a",
				},
			},
			{
				IndexType:   "aws-dynamo",
				IndexTables: fixturePeriodicTableConfig(tablePrefix),
				ChunkTables: fixturePeriodicTableConfig(chunkTablePrefix),
			},
		},
	}
	tbm := chunk.TableManagerConfig{
		CreationGracePeriod: gracePeriod,
		IndexTables:         fixtureReadProvisionConfig(indexReadScale, inactiveReadScale),
		ChunkTables:         fixtureReadProvisionConfig(chunkReadScale, inactiveReadScale),
	}

	tableManager, err := chunk.NewTableManager(tbm, cfg, maxChunkAge, client, nil)
	if err != nil {
		t.Fatal(err)
	}

	// Create tables
	startTime := time.Unix(0, 0).Add(maxChunkAge).Add(gracePeriod)

	test(t, client, tableManager, "Create tables",
		startTime,
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, read, write, read, write)...),
	)

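	// Each scenario below primes the mock with per-table read-usage and read-error
	// (throttle) rates as index/chunk pairs, then checks the read capacities the
	// table manager provisions in response.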
	mockProm.SetResponseForReads([][]int{{0, 0}}, [][]int{{0, 0}})
	test(t, client, tableManager, "No Query Usage",
		startTime.Add(time.Minute*10),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, 1, write, 1, write)...), // - no usage; scale read down to minimum on both
	)

	mockProm.SetResponseForReads([][]int{{10, 10}}, [][]int{{0, 0}})
	test(t, client, tableManager, "Query Usage but no errors",
		startTime.Add(time.Minute*20),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, 201, write, 201, write)...), // - less than 10% of max ... scale read on both
	)

	mockProm.SetResponseForReads([][]int{{11, 11}}, [][]int{{20, 0}})
	test(t, client, tableManager, "Query Usage and throttling on index",
		startTime.Add(time.Minute*30),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, 401, write, 201, write)...), // - scale up index table read
	)

	mockProm.SetResponseForReads([][]int{{12, 12}}, [][]int{{20, 20}})
	test(t, client, tableManager, "Query Usage and throttling on index plus chunk",
		startTime.Add(time.Minute*40),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, 601, write, 401, write)...), // - scale up index more and scale chunk a step
	)

	mockProm.SetResponseForReads([][]int{{13, 13}}, [][]int{{200, 200}})
	test(t, client, tableManager, "in cooldown period",
		startTime.Add(time.Minute*41),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, 601, write, 401, write)...), // - no change; in cooldown period
	)

	mockProm.SetResponseForReads([][]int{{13, 13}}, [][]int{{0, 0}})
	test(t, client, tableManager, "Sustained Query Usage",
		startTime.Add(time.Minute*100),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, 601, write, 401, write)...), // - errors have stopped, but usage continues so no scaling
	)

	mockProm.SetResponseForReads([][]int{{0, 0}}, [][]int{{0, 0}})
	test(t, client, tableManager, "Query Usage has ended",
		startTime.Add(time.Minute*200),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, 1, write, 1, write)...), // - scale down to minimum... no usage at all
	)
}

// mockPrometheus returns pre-canned results to Prometheus range queries. It embeds
// promV1.API so that only QueryRange needs a real implementation here.
type mockPrometheus struct {
	promV1.API
	rangeValues []model.Value
}

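// SetResponseForWrites queues the responses for a pass driven by write metrics:
// a queue-length series (q0, q1, q2), one write-throttle matrix per rates slice,
// and zero-valued read-usage and read-error matrices as padding so the caller
// consumes the expected number of QueryRange results. Each rates slice holds
// index/chunk pairs, one pair per table.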
func (m *mockPrometheus) SetResponseForWrites(q0, q1, q2 model.SampleValue, throttleRates ...[]int) {
	// Mock metrics from Prometheus
	m.rangeValues = []model.Value{
		// Queue lengths
		model.Matrix{
			&model.SampleStream{Values: []model.SamplePair{
				{Timestamp: 0, Value: q0},
				{Timestamp: 15000, Value: q1},
				{Timestamp: 30000, Value: q2},
			}},
		},
	}
	for _, rates := range throttleRates {
		throttleMatrix := model.Matrix{}
		for i := 0; i < len(rates)/2; i++ {
			throttleMatrix = append(throttleMatrix,
				&model.SampleStream{
					Metric: model.Metric{"table": model.LabelValue(fmt.Sprintf("%s%d", tablePrefix, i))},
					Values: []model.SamplePair{{Timestamp: 30000, Value: model.SampleValue(rates[i*2])}},
				},
				&model.SampleStream{
					Metric: model.Metric{"table": model.LabelValue(fmt.Sprintf("%s%d", chunkTablePrefix, i))},
					Values: []model.SamplePair{{Timestamp: 30000, Value: model.SampleValue(rates[i*2+1])}},
				})
		}
		m.rangeValues = append(m.rangeValues, throttleMatrix)
	}
	// stub response for usage queries (not used in write tests)
	for _, rates := range throttleRates {
		readUsageMatrix := model.Matrix{}
		for i := 0; i < len(rates)/2; i++ {
			readUsageMatrix = append(readUsageMatrix,
				&model.SampleStream{
					Metric: model.Metric{"table": model.LabelValue(fmt.Sprintf("%s%d", tablePrefix, i))},
					Values: []model.SamplePair{{Timestamp: 30000, Value: 0}},
				},
				&model.SampleStream{
					Metric: model.Metric{"table": model.LabelValue(fmt.Sprintf("%s%d", chunkTablePrefix, i))},
					Values: []model.SamplePair{{Timestamp: 30000, Value: 0}},
				})
		}
		m.rangeValues = append(m.rangeValues, readUsageMatrix)
	}
	// stub response for usage error queries (not used in write tests)
	for _, rates := range throttleRates {
		readErrorMatrix := model.Matrix{}
		for i := 0; i < len(rates)/2; i++ {
			readErrorMatrix = append(readErrorMatrix,
				&model.SampleStream{
					Metric: model.Metric{"table": model.LabelValue(fmt.Sprintf("%s%d", tablePrefix, i))},
					Values: []model.SamplePair{{Timestamp: 30000, Value: 0}},
				},
				&model.SampleStream{
					Metric: model.Metric{"table": model.LabelValue(fmt.Sprintf("%s%d", chunkTablePrefix, i))},
					Values: []model.SamplePair{{Timestamp: 30000, Value: 0}},
				})
		}
		m.rangeValues = append(m.rangeValues, readErrorMatrix)
	}
}

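// SetResponseForReads queues the responses for a pass driven by read metrics: a zero
// queue-length series and zero-valued write-error and write-usage matrices as padding,
// followed by read-usage and read-error matrices built from the supplied index/chunk
// pairs, one pair per table.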
func (m *mockPrometheus) SetResponseForReads(usageRates [][]int, errorRates [][]int) {
	// Mock metrics from Prometheus. The write-path metrics aren't used in read tests,
	// but must be filled in so the code under test receives the expected number of
	// Prometheus results.
	m.rangeValues = []model.Value{
		// Queue lengths (not used)
		model.Matrix{
			&model.SampleStream{Values: []model.SamplePair{{Timestamp: 0, Value: 0},
				{Timestamp: 15000, Value: 0},
				{Timestamp: 30000, Value: 0}}},
		},
	}
	// Write error rates: not used in a read test; filler for the expected number of Prometheus responses
	for _, rates := range errorRates {
		errorMatrix := model.Matrix{}
		for i := 0; i < len(rates)/2; i++ {
			errorMatrix = append(errorMatrix,
				&model.SampleStream{
					Metric: model.Metric{"table": model.LabelValue(fmt.Sprintf("%s%d", tablePrefix, i))},
					Values: []model.SamplePair{{Timestamp: 30000, Value: model.SampleValue(0)}},
				},
				&model.SampleStream{
					Metric: model.Metric{"table": model.LabelValue(fmt.Sprintf("%s%d", chunkTablePrefix, i))},
					Values: []model.SamplePair{{Timestamp: 30000, Value: model.SampleValue(0)}},
				})
		}
		m.rangeValues = append(m.rangeValues, errorMatrix)
	}
	// Write usage rates: not used in a read test; filler for the expected number of Prometheus responses
	for _, rates := range errorRates {
		writeUsageMatrix := model.Matrix{}
		for i := 0; i < len(rates)/2; i++ {
			writeUsageMatrix = append(writeUsageMatrix,
				&model.SampleStream{
					Metric: model.Metric{"table": model.LabelValue(fmt.Sprintf("%s%d", tablePrefix, i))},
					Values: []model.SamplePair{{Timestamp: 30000, Value: model.SampleValue(0)}},
				},
				&model.SampleStream{
					Metric: model.Metric{"table": model.LabelValue(fmt.Sprintf("%s%d", chunkTablePrefix, i))},
					Values: []model.SamplePair{{Timestamp: 30000, Value: model.SampleValue(0)}},
				})
		}
		m.rangeValues = append(m.rangeValues, writeUsageMatrix)
	}
	// Read usage metrics, per table
	for _, rates := range usageRates {
		readUsageMatrix := model.Matrix{}
		for i := 0; i < len(rates)/2; i++ {
			readUsageMatrix = append(readUsageMatrix,
				&model.SampleStream{
					Metric: model.Metric{"table": model.LabelValue(fmt.Sprintf("%s%d", tablePrefix, i))},
					Values: []model.SamplePair{{Timestamp: 30000, Value: model.SampleValue(rates[i*2])}},
				},
				&model.SampleStream{
					Metric: model.Metric{"table": model.LabelValue(fmt.Sprintf("%s%d", chunkTablePrefix, i))},
					Values: []model.SamplePair{{Timestamp: 30000, Value: model.SampleValue(rates[i*2+1])}},
				})
		}
		m.rangeValues = append(m.rangeValues, readUsageMatrix)
	}
	// Errors from read throttling, per table
	for _, rates := range errorRates {
		readErrorMatrix := model.Matrix{}
		for i := 0; i < len(rates)/2; i++ {
			readErrorMatrix = append(readErrorMatrix,
				&model.SampleStream{
					Metric: model.Metric{"table": model.LabelValue(fmt.Sprintf("%s%d", tablePrefix, i))},
					Values: []model.SamplePair{{Timestamp: 30000, Value: model.SampleValue(rates[i*2])}},
				},
				&model.SampleStream{
					Metric: model.Metric{"table": model.LabelValue(fmt.Sprintf("%s%d", chunkTablePrefix, i))},
					Values: []model.SamplePair{{Timestamp: 30000, Value: model.SampleValue(rates[i*2+1])}},
				})
		}
		m.rangeValues = append(m.rangeValues, readErrorMatrix)
	}
}

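// QueryRange ignores the query and range and pops the next pre-canned value,
// returning an error once the canned responses are exhausted.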
func (m *mockPrometheus) QueryRange(ctx context.Context, query string, r promV1.Range) (model.Value, api.Warnings, error) {
	if len(m.rangeValues) == 0 {
		return nil, nil, errors.New("mockPrometheus.QueryRange: out of values")
	}
	// Take the first value and move the slice up
	ret := m.rangeValues[0]
	m.rangeValues = m.rangeValues[1:]
	return ret, nil, nil
}