package aws

import (
	"context"
	"fmt"
	"testing"
	"time"

	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/api"
	promV1 "github.com/prometheus/client_golang/api/prometheus/v1"
	"github.com/prometheus/common/model"

	"github.com/sequix/cortex/pkg/chunk"
)

// TestTableManagerMetricsAutoScaling drives the table manager through a
// sequence of simulated time steps, feeding canned Prometheus metrics
// (queue lengths and per-table write-throttle rates) into the
// metrics-based autoscaler, and asserts the resulting provisioned write
// capacity for each table after every step.
//
// NOTE(review): the expected capacities (240, 250, 150, 112, 20, 12, 5, 25)
// encode the scale-up factor (1.2), the usage/80% scale-down rule and the
// minimum capacities from the fixtures — presumably derived from the
// production autoscaler algorithm; confirm against metrics_autoscaling.go
// before changing any of them.
func TestTableManagerMetricsAutoScaling(t *testing.T) {
	dynamoDB := newMockDynamoDB(0, 0)
	mockProm := mockPrometheus{}

	client := dynamoTableClient{
		DynamoDB: dynamoDB,
		autoscale: &metricsData{
			promAPI: &mockProm,
			cfg: MetricsAutoScalingConfig{
				// Scale up when the ingester queue exceeds 100k; grow by 20% each step.
				TargetQueueLen: 100000,
				ScaleUpFactor:  1.2,
			},
			tableLastUpdated: make(map[string]time.Time),
		},
	}

	// Chunk tables get 1/5th of the index tables' write capacity range;
	// inactive (previous-period) tables may shrink down to 5 units.
	indexWriteScale := fixtureWriteScale()
	chunkWriteScale := fixtureWriteScale()
	chunkWriteScale.MaxCapacity /= 5
	chunkWriteScale.MinCapacity /= 5
	inactiveWriteScale := fixtureWriteScale()
	inactiveWriteScale.MinCapacity = 5

	// Set up table-manager config
	cfg := chunk.SchemaConfig{
		Configs: []chunk.PeriodConfig{
			{
				IndexType: "aws-dynamo",
				IndexTables: chunk.PeriodicTableConfig{
					Prefix: "a",
				},
			},
			{
				IndexType:   "aws-dynamo",
				IndexTables: fixturePeriodicTableConfig(tablePrefix),
				ChunkTables: fixturePeriodicTableConfig(chunkTablePrefix),
			},
		},
	}
	tbm := chunk.TableManagerConfig{
		CreationGracePeriod: gracePeriod,
		IndexTables:         fixtureProvisionConfig(2, indexWriteScale, inactiveWriteScale),
		ChunkTables:         fixtureProvisionConfig(2, chunkWriteScale, inactiveWriteScale),
	}

	tableManager, err := chunk.NewTableManager(tbm, cfg, maxChunkAge, client, nil)
	if err != nil {
		t.Fatal(err)
	}

	// Create tables
	startTime := time.Unix(0, 0).Add(maxChunkAge).Add(gracePeriod)

	test(t, client, tableManager, "Create tables",
		startTime,
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, read, write, read, write)...),
	)

	// Queue at target but not growing, and no throttling: no scaling.
	mockProm.SetResponseForWrites(0, 100000, 100000, []int{0, 0}, []int{100, 20})
	test(t, client, tableManager, "Queues but no throttling",
		startTime.Add(time.Minute*10),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, read, write, read, write)...), // - remain flat
	)

	// Queue over target but shrinking (120k -> 100k): no scale-up even with throttling.
	mockProm.SetResponseForWrites(0, 120000, 100000, []int{100, 200}, []int{100, 20})
	test(t, client, tableManager, "Shrinking queues",
		startTime.Add(time.Minute*20),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, read, write, read, write)...), // - remain flat
	)

	// Queue growing past target with index-table throttling: scale index writes up.
	mockProm.SetResponseForWrites(0, 120000, 200000, []int{100, 0}, []int{100, 20})
	test(t, client, tableManager, "Building queues",
		startTime.Add(time.Minute*30),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, read, 240, read, write)...), // - scale up index table
	)

	// Huge queue with even minimal throttling still triggers an index scale-up.
	mockProm.SetResponseForWrites(0, 5000000, 5000000, []int{1, 0}, []int{100, 20})
	test(t, client, tableManager, "Large queues small throtttling",
		startTime.Add(time.Minute*40),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, read, 250, read, write)...), // - scale up index table
	)

	mockProm.SetResponseForWrites(0, 0, 0, []int{0, 0}, []int{120, 40})
	test(t, client, tableManager, "No queues no throttling",
		startTime.Add(time.Minute*100),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, read, 150, read, 50)...), // - scale down both tables
	)

	// Only one minute after the previous change: inside the cooldown window.
	mockProm.SetResponseForWrites(0, 0, 0, []int{0, 0}, []int{50, 10})
	test(t, client, tableManager, "in cooldown period",
		startTime.Add(time.Minute*101),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, read, 150, read, 50)...), // - no change; in cooldown period
	)

	mockProm.SetResponseForWrites(0, 0, 0, []int{0, 0}, []int{90, 10})
	test(t, client, tableManager, "No queues no throttling",
		startTime.Add(time.Minute*200),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, read, 112, read, 20)...), // - scale down both again
	)

	// Usage has dropped a bit more, but not enough to justify another update.
	mockProm.SetResponseForWrites(0, 0, 0, []int{0, 0}, []int{50, 10})
	test(t, client, tableManager, "de minimis change",
		startTime.Add(time.Minute*220),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, read, 112, read, 20)...), // - should see no change
	)

	// From here on there are two table pairs, so the rate slices carry four
	// entries: {index0, chunk0, index1, chunk1}.
	mockProm.SetResponseForWrites(0, 0, 0, []int{30, 30, 30, 30}, []int{50, 10, 100, 20})
	test(t, client, tableManager, "Next week",
		startTime.Add(tablePeriod),
		// Nothing much happening - expect table 0 write rates to stay as-is and table 1 to be created with defaults
		append(append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, inactiveRead, 112, inactiveRead, 20)...),
			staticTable(1, read, write, read, write)...),
	)

	// No throttling on last week's index table, still some on chunk table
	mockProm.SetResponseForWrites(0, 0, 0, []int{0, 30, 30, 30}, []int{10, 2, 100, 20})
	test(t, client, tableManager, "Next week plus a bit",
		startTime.Add(tablePeriod).Add(time.Minute*10),
		append(append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, inactiveRead, 12, inactiveRead, 20)...), // Scale back last week's index table
			staticTable(1, read, write, read, write)...),
	)

	// No throttling on last week's tables but some queueing
	mockProm.SetResponseForWrites(20000, 20000, 20000, []int{0, 0, 1, 1}, []int{0, 0, 100, 20})
	test(t, client, tableManager, "Next week plus a bit",
		startTime.Add(tablePeriod).Add(time.Minute*20),
		append(append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, inactiveRead, 12, inactiveRead, 20)...), // no scaling back
			staticTable(1, read, write, read, write)...),
	)

	mockProm.SetResponseForWrites(120000, 130000, 140000, []int{0, 0, 1, 0}, []int{0, 0, 100, 20})
	test(t, client, tableManager, "next week, queues building, throttling on index table",
		startTime.Add(tablePeriod).Add(time.Minute*30),
		append(append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, inactiveRead, 12, inactiveRead, 20)...), // no scaling back
			staticTable(1, read, 240, read, write)...), // scale up index table
	)

	mockProm.SetResponseForWrites(140000, 130000, 120000, []int{0, 0, 1, 0}, []int{0, 0, 100, 20})
	test(t, client, tableManager, "next week, queues shrinking, throttling on index table",
		startTime.Add(tablePeriod).Add(time.Minute*40),
		append(append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, inactiveRead, 5, inactiveRead, 5)...), // scale right back
			staticTable(1, read, 240, read, 25)...), // scale chunk table to usage/80%
	)
}

// TestTableManagerMetricsReadAutoScaling is the read-capacity counterpart
// of the test above: it feeds canned read-usage and read-error (throttle)
// rates into the autoscaler and asserts the provisioned read capacity of
// the index and chunk tables after each simulated time step.
//
// NOTE(review): the expected read capacities step through 1, 201, 401, 601 —
// presumably the autoscaler moves in fixed 200-unit steps above a floor of 1;
// confirm against the production scaling code before editing.
func TestTableManagerMetricsReadAutoScaling(t *testing.T) {
	dynamoDB := newMockDynamoDB(0, 0)
	mockProm := mockPrometheus{}

	client := dynamoTableClient{
		DynamoDB: dynamoDB,
		autoscale: &metricsData{
			promAPI: &mockProm,
			cfg: MetricsAutoScalingConfig{
				TargetQueueLen: 100000,
				ScaleUpFactor:  1.2,
			},
			tableLastUpdated: make(map[string]time.Time),
			// Read scaling keeps its own cooldown bookkeeping, separate from writes.
			tableReadLastUpdated: make(map[string]time.Time),
		},
	}

	indexReadScale := fixtureReadScale()
	chunkReadScale := fixtureReadScale()
	inactiveReadScale := fixtureReadScale()
	inactiveReadScale.MinCapacity = 5

	// Set up table-manager config
	cfg := chunk.SchemaConfig{
		Configs: []chunk.PeriodConfig{
			{
				IndexType: "aws-dynamo",
				IndexTables: chunk.PeriodicTableConfig{
					Prefix: "a",
				},
			},
			{
				IndexType:   "aws-dynamo",
				IndexTables: fixturePeriodicTableConfig(tablePrefix),
				ChunkTables: fixturePeriodicTableConfig(chunkTablePrefix),
			},
		},
	}
	tbm := chunk.TableManagerConfig{
		CreationGracePeriod: gracePeriod,
		IndexTables:         fixtureReadProvisionConfig(indexReadScale, inactiveReadScale),
		ChunkTables:         fixtureReadProvisionConfig(chunkReadScale, inactiveReadScale),
	}

	tableManager, err := chunk.NewTableManager(tbm, cfg, maxChunkAge, client, nil)
	if err != nil {
		t.Fatal(err)
	}

	// Create tables
	startTime := time.Unix(0, 0).Add(maxChunkAge).Add(gracePeriod)

	test(t, client, tableManager, "Create tables",
		startTime,
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, read, write, read, write)...),
	)

	// Zero usage: read capacity drops to the floor of 1 on both tables.
	mockProm.SetResponseForReads([][]int{{0, 0}}, [][]int{{0, 0}})
	test(t, client, tableManager, "No Query Usage",
		startTime.Add(time.Minute*10),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, 1, write, 1, write)...), // - remain flat
	)

	mockProm.SetResponseForReads([][]int{{10, 10}}, [][]int{{0, 0}})
	test(t, client, tableManager, "Query Usage but no errors",
		startTime.Add(time.Minute*20),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, 201, write, 201, write)...), // - less than 10% of max ... scale read on both
	)

	// Throttle errors on the index table only ({20, 0}): only index steps up.
	mockProm.SetResponseForReads([][]int{{11, 11}}, [][]int{{20, 0}})
	test(t, client, tableManager, "Query Usage and throttling on index",
		startTime.Add(time.Minute*30),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, 401, write, 201, write)...), // - scale up index table read
	)

	mockProm.SetResponseForReads([][]int{{12, 12}}, [][]int{{20, 20}})
	test(t, client, tableManager, "Query Usage and throttling on index plus chunk",
		startTime.Add(time.Minute*40),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, 601, write, 401, write)...), // - scale up index more and scale chunk a step
	)

	// One minute later, even heavy throttling does nothing: cooldown window.
	mockProm.SetResponseForReads([][]int{{13, 13}}, [][]int{{200, 200}})
	test(t, client, tableManager, "in cooldown period",
		startTime.Add(time.Minute*41),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, 601, write, 401, write)...), // - no change; in cooldown period
	)

	mockProm.SetResponseForReads([][]int{{13, 13}}, [][]int{{0, 0}})
	test(t, client, tableManager, "Sustained Query Usage",
		startTime.Add(time.Minute*100),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, 601, write, 401, write)...), // - errors have stopped, but usage continues so no scaling
	)

	mockProm.SetResponseForReads([][]int{{0, 0}}, [][]int{{0, 0}})
	test(t, client, tableManager, "Query Usage has ended",
		startTime.Add(time.Minute*200),
		append(baseTable("a", inactiveRead, inactiveWrite),
			staticTable(0, 1, write, 1, write)...), // - scale down to minimum... no usage at all
	)

}

// Helper to return pre-canned results to Prometheus queries.
// rangeValues is a FIFO queue: each QueryRange call pops and returns the
// next value, so responses must be enqueued in exactly the order the
// autoscaler issues its queries.
type mockPrometheus struct {
	promV1.API
	rangeValues []model.Value
}

// SetResponseForWrites enqueues one round of responses for a write-scaling
// pass: a queue-length series (three samples q0, q1, q2), then per variadic
// group a write-throttle matrix keyed by table name, then zero-filled
// read-usage and read-error matrices as padding so the autoscaler's read
// queries still get a response.
//
// Each rates slice is laid out as interleaved pairs:
// {index0, chunk0, index1, chunk1, ...}.
func (m *mockPrometheus) SetResponseForWrites(q0, q1, q2 model.SampleValue, throttleRates ...[]int) {
	// Mock metrics from Prometheus
	m.rangeValues = []model.Value{
		// Queue lengths
		model.Matrix{
			&model.SampleStream{Values: []model.SamplePair{
				{Timestamp: 0, Value: q0},
				{Timestamp: 15000, Value: q1},
				{Timestamp: 30000, Value: q2},
			}},
		},
	}
	for _, rates := range throttleRates {
		throttleMatrix := model.Matrix{}
		// Even slots are index-table rates, odd slots chunk-table rates.
		for i := 0; i < len(rates)/2; i++ {
			throttleMatrix = append(throttleMatrix,
				&model.SampleStream{
					Metric: model.Metric{"table": model.LabelValue(fmt.Sprintf("%s%d", tablePrefix, i))},
					Values: []model.SamplePair{{Timestamp: 30000, Value: model.SampleValue(rates[i*2])}},
				},
				&model.SampleStream{
					Metric: model.Metric{"table": model.LabelValue(fmt.Sprintf("%s%d", chunkTablePrefix, i))},
					Values: []model.SamplePair{{Timestamp: 30000, Value: model.SampleValue(rates[i*2+1])}},
				})
		}
		m.rangeValues = append(m.rangeValues, throttleMatrix)
	}
	// stub response for usage queries (not used in write tests)
	for _, rates := range throttleRates {
		readUsageMatrix := model.Matrix{}
		for i := 0; i < len(rates)/2; i++ {

			readUsageMatrix = append(readUsageMatrix,
				&model.SampleStream{
					Metric: model.Metric{"table": model.LabelValue(fmt.Sprintf("%s%d", tablePrefix, i))},
					Values: []model.SamplePair{{Timestamp: 30000, Value: 0}},
				},
				&model.SampleStream{
					Metric: model.Metric{"table": model.LabelValue(fmt.Sprintf("%s%d", chunkTablePrefix, i))},
					Values: []model.SamplePair{{Timestamp: 30000, Value: 0}},
				})
		}
		m.rangeValues = append(m.rangeValues, readUsageMatrix)
	}
	// stub response for usage error queries (not used in write tests)
	for _, rates := range throttleRates {
		readErrorMatrix := model.Matrix{}
		for i := 0; i < len(rates)/2; i++ {

			readErrorMatrix = append(readErrorMatrix,
				&model.SampleStream{
					Metric: model.Metric{"table": model.LabelValue(fmt.Sprintf("%s%d", tablePrefix, i))},
					Values: []model.SamplePair{{Timestamp: 30000, Value: 0}},
				},
				&model.SampleStream{
					Metric: model.Metric{"table": model.LabelValue(fmt.Sprintf("%s%d", chunkTablePrefix, i))},
					Values: []model.SamplePair{{Timestamp: 30000, Value: 0}},
				})
		}
		m.rangeValues = append(m.rangeValues, readErrorMatrix)
	}
}

// SetResponseForReads enqueues one round of responses for a read-scaling
// pass. The write-oriented queries still run first, so it pushes a zeroed
// queue-length series and two zero-filled write matrices as filler, then
// the real read-usage and read-error matrices built from usageRates and
// errorRates (same interleaved {index, chunk, ...} layout as for writes).
func (m *mockPrometheus) SetResponseForReads(usageRates [][]int, errorRates [][]int) {
	// Mock metrics from Prometheus. In Read tests, these aren't used but must be
	// filled out in a basic way for the underlying functions to get the right amount of prometheus results
	m.rangeValues = []model.Value{
		// Queue lengths ( not used)
		model.Matrix{
			&model.SampleStream{Values: []model.SamplePair{{Timestamp: 0, Value: 0},
				{Timestamp: 15000, Value: 0},
				{Timestamp: 30000, Value: 0}}},
		},
	}
	// Error rates, for writes so not used in a read test. Here as a filler for the expected number of prom responses
	for _, rates := range errorRates {
		errorMatrix := model.Matrix{}
		for i := 0; i < len(rates)/2; i++ {
			errorMatrix = append(errorMatrix,
				&model.SampleStream{
					Metric: model.Metric{"table": model.LabelValue(fmt.Sprintf("%s%d", tablePrefix, i))},
					Values: []model.SamplePair{{Timestamp: 30000, Value: model.SampleValue(0)}},
				},
				&model.SampleStream{
					Metric: model.Metric{"table": model.LabelValue(fmt.Sprintf("%s%d", chunkTablePrefix, i))},
					Values: []model.SamplePair{{Timestamp: 30000, Value: model.SampleValue(0)}},
				})
		}
		m.rangeValues = append(m.rangeValues, errorMatrix)
	}
	// usage rates, for writes so not used in a read test. Here as a filler for the expected number of prom responses
	// NOTE(review): ranges over errorRates (not usageRates) — only the group
	// count matters here, since every value is zero filler.
	for _, rates := range errorRates {
		errorMatrix := model.Matrix{}
		for i := 0; i < len(rates)/2; i++ {
			errorMatrix = append(errorMatrix,
				&model.SampleStream{
					Metric: model.Metric{"table": model.LabelValue(fmt.Sprintf("%s%d", tablePrefix, i))},
					Values: []model.SamplePair{{Timestamp: 30000, Value: model.SampleValue(0)}},
				},
				&model.SampleStream{
					Metric: model.Metric{"table": model.LabelValue(fmt.Sprintf("%s%d", chunkTablePrefix, i))},
					Values: []model.SamplePair{{Timestamp: 30000, Value: model.SampleValue(0)}},
				})
		}
		m.rangeValues = append(m.rangeValues, errorMatrix)
	}
	// read usage metrics per table.
	for _, rates := range usageRates {
		readUsageMatrix := model.Matrix{}
		for i := 0; i < len(rates)/2; i++ {

			readUsageMatrix = append(readUsageMatrix,
				&model.SampleStream{
					Metric: model.Metric{"table": model.LabelValue(fmt.Sprintf("%s%d", tablePrefix, i))},
					Values: []model.SamplePair{{Timestamp: 30000, Value: model.SampleValue(rates[i*2])}},
				},
				&model.SampleStream{
					Metric: model.Metric{"table": model.LabelValue(fmt.Sprintf("%s%d", chunkTablePrefix, i))},
					Values: []model.SamplePair{{Timestamp: 30000, Value: model.SampleValue(rates[i*2+1])}},
				})
		}
		m.rangeValues = append(m.rangeValues, readUsageMatrix)
	}
	// errors from read throttling, per table
	for _, rates := range errorRates {
		readErrorMatrix := model.Matrix{}
		for i := 0; i < len(rates)/2; i++ {

			readErrorMatrix = append(readErrorMatrix,
				&model.SampleStream{
					Metric: model.Metric{"table": model.LabelValue(fmt.Sprintf("%s%d", tablePrefix, i))},
					Values: []model.SamplePair{{Timestamp: 30000, Value: model.SampleValue(rates[i*2])}},
				},
				&model.SampleStream{
					Metric: model.Metric{"table": model.LabelValue(fmt.Sprintf("%s%d", chunkTablePrefix, i))},
					Values: []model.SamplePair{{Timestamp: 30000, Value: model.SampleValue(rates[i*2+1])}},
				})
		}
		m.rangeValues = append(m.rangeValues, readErrorMatrix)
	}
}

// QueryRange implements promV1.API. It ignores the query string and time
// range and simply returns the next pre-canned value in FIFO order,
// erroring once the queue is exhausted (which flags a test that issues
// more queries than responses were enqueued for).
func (m *mockPrometheus) QueryRange(ctx context.Context, query string, r promV1.Range) (model.Value, api.Warnings, error) {
	if len(m.rangeValues) == 0 {
		return nil, nil, errors.New("mockPrometheus.QueryRange: out of values")
	}
	// Take the first value and move the slice up
	ret := m.rangeValues[0]
	m.rangeValues = m.rangeValues[1:]
	return ret, nil, nil
}