github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/storage/chunk/client/aws/metrics_autoscaling.go (about) 1 package aws 2 3 import ( 4 "context" 5 "flag" 6 "fmt" 7 "time" 8 9 "github.com/go-kit/log/level" 10 "github.com/pkg/errors" 11 promApi "github.com/prometheus/client_golang/api" 12 promV1 "github.com/prometheus/client_golang/api/prometheus/v1" 13 "github.com/prometheus/common/model" 14 "github.com/weaveworks/common/mtime" 15 16 "github.com/grafana/loki/pkg/storage/config" 17 util_log "github.com/grafana/loki/pkg/util/log" 18 ) 19 20 const ( 21 cachePromDataFor = 30 * time.Second 22 queueObservationPeriod = 2 * time.Minute 23 targetScaledown = 0.1 // consider scaling down if queue smaller than this times target 24 targetMax = 10 // always scale up if queue bigger than this times target 25 throttleFractionScaledown = 0.1 26 minUsageForScaledown = 100 // only scale down if usage is > this DynamoDB units/sec 27 28 // fetch Ingester queue length 29 // average the queue length over 2 minutes to avoid aliasing with the 1-minute flush period 30 defaultQueueLenQuery = `sum(avg_over_time(cortex_ingester_flush_queue_length{job="cortex/ingester"}[2m]))` 31 // fetch write throttle rate per DynamoDB table 32 defaultThrottleRateQuery = `sum(rate(cortex_dynamo_throttled_total{operation="DynamoDB.BatchWriteItem"}[1m])) by (table) > 0` 33 // fetch write capacity usage per DynamoDB table 34 // use the rate over 15 minutes so we take a broad average 35 defaultUsageQuery = `sum(rate(cortex_dynamo_consumed_capacity_total{operation="DynamoDB.BatchWriteItem"}[15m])) by (table) > 0` 36 // use the read rate over 1hr so we take a broad average 37 defaultReadUsageQuery = `sum(rate(cortex_dynamo_consumed_capacity_total{operation="DynamoDB.QueryPages"}[1h])) by (table) > 0` 38 // fetch read error rate per DynamoDB table 39 defaultReadErrorQuery = `sum(increase(cortex_dynamo_failures_total{operation="DynamoDB.QueryPages",error="ProvisionedThroughputExceededException"}[1m])) by (table) > 0` 40 ) 41 42 // MetricsAutoScalingConfig holds parameters to configure how it works 43 type MetricsAutoScalingConfig struct { 44 URL string `yaml:"url"` // URL to contact Prometheus store on 45 TargetQueueLen int64 `yaml:"target_queue_length"` // Queue length above which we will scale up capacity 46 ScaleUpFactor float64 `yaml:"scale_up_factor"` // Scale up capacity by this multiple 47 MinThrottling float64 `yaml:"ignore_throttle_below"` // Ignore throttling below this level 48 QueueLengthQuery string `yaml:"queue_length_query"` // Promql query to fetch ingester queue length 49 ThrottleQuery string `yaml:"write_throttle_query"` // Promql query to fetch throttle rate per table 50 UsageQuery string `yaml:"write_usage_query"` // Promql query to fetch write capacity usage per table 51 ReadUsageQuery string `yaml:"read_usage_query"` // Promql query to fetch read usage per table 52 ReadErrorQuery string `yaml:"read_error_query"` // Promql query to fetch read errors per table 53 } 54 55 // RegisterFlags adds the flags required to config this to the given FlagSet 56 func (cfg *MetricsAutoScalingConfig) RegisterFlags(f *flag.FlagSet) { 57 f.StringVar(&cfg.URL, "metrics.url", "", "Use metrics-based autoscaling, via this query URL") 58 f.Int64Var(&cfg.TargetQueueLen, "metrics.target-queue-length", 100000, "Queue length above which we will scale up capacity") 59 f.Float64Var(&cfg.ScaleUpFactor, "metrics.scale-up-factor", 1.3, "Scale up capacity by this multiple") 60 f.Float64Var(&cfg.MinThrottling, "metrics.ignore-throttle-below", 1, "Ignore throttling below this level (rate per second)") 61 f.StringVar(&cfg.QueueLengthQuery, "metrics.queue-length-query", defaultQueueLenQuery, "query to fetch ingester queue length") 62 f.StringVar(&cfg.ThrottleQuery, "metrics.write-throttle-query", defaultThrottleRateQuery, "query to fetch throttle rates per table") 63 f.StringVar(&cfg.UsageQuery, "metrics.usage-query", defaultUsageQuery, "query to fetch write capacity usage per table") 64 f.StringVar(&cfg.ReadUsageQuery, "metrics.read-usage-query", defaultReadUsageQuery, "query to fetch read capacity usage per table") 65 f.StringVar(&cfg.ReadErrorQuery, "metrics.read-error-query", defaultReadErrorQuery, "query to fetch read errors per table") 66 } 67 68 type metricsData struct { 69 cfg MetricsAutoScalingConfig 70 promAPI promV1.API 71 promLastQuery time.Time 72 tableLastUpdated map[string]time.Time 73 tableReadLastUpdated map[string]time.Time 74 queueLengths []float64 75 throttleRates map[string]float64 76 usageRates map[string]float64 77 usageReadRates map[string]float64 78 readErrorRates map[string]float64 79 } 80 81 func newMetricsAutoScaling(cfg DynamoDBConfig) (*metricsData, error) { 82 client, err := promApi.NewClient(promApi.Config{Address: cfg.Metrics.URL}) 83 if err != nil { 84 return nil, err 85 } 86 return &metricsData{ 87 promAPI: promV1.NewAPI(client), 88 cfg: cfg.Metrics, 89 tableLastUpdated: make(map[string]time.Time), 90 tableReadLastUpdated: make(map[string]time.Time), 91 }, nil 92 } 93 94 func (m *metricsData) PostCreateTable(ctx context.Context, desc config.TableDesc) error { 95 return nil 96 } 97 98 func (m *metricsData) DescribeTable(ctx context.Context, desc *config.TableDesc) error { 99 return nil 100 } 101 102 func (m *metricsData) UpdateTable(ctx context.Context, current config.TableDesc, expected *config.TableDesc) error { 103 if err := m.update(ctx); err != nil { 104 return err 105 } 106 107 if expected.WriteScale.Enabled { 108 // default if no action is taken is to use the currently provisioned setting 109 expected.ProvisionedWrite = current.ProvisionedWrite 110 111 throttleRate := m.throttleRates[expected.Name] 112 usageRate := m.usageRates[expected.Name] 113 114 level.Info(util_log.Logger).Log("msg", "checking write metrics", "table", current.Name, "queueLengths", fmt.Sprint(m.queueLengths), "throttleRate", throttleRate, "usageRate", usageRate) 115 116 switch { 117 case throttleRate < throttleFractionScaledown*float64(current.ProvisionedWrite) && 118 m.queueLengths[2] < float64(m.cfg.TargetQueueLen)*targetScaledown: 119 // No big queue, low throttling -> scale down 120 expected.ProvisionedWrite = scaleDown(current.Name, 121 current.ProvisionedWrite, 122 expected.WriteScale.MinCapacity, 123 computeScaleDown(current.Name, m.usageRates, expected.WriteScale.TargetValue), 124 m.tableLastUpdated, 125 expected.WriteScale.InCooldown, 126 "metrics scale-down", 127 "write", 128 m.usageRates) 129 case throttleRate == 0 && 130 m.queueLengths[2] < m.queueLengths[1] && m.queueLengths[1] < m.queueLengths[0]: 131 // zero errors and falling queue -> scale down to current usage 132 expected.ProvisionedWrite = scaleDown(current.Name, 133 current.ProvisionedWrite, 134 expected.WriteScale.MinCapacity, 135 computeScaleDown(current.Name, m.usageRates, expected.WriteScale.TargetValue), 136 m.tableLastUpdated, 137 expected.WriteScale.InCooldown, 138 "zero errors scale-down", 139 "write", 140 m.usageRates) 141 case throttleRate > 0 && m.queueLengths[2] > float64(m.cfg.TargetQueueLen)*targetMax: 142 // Too big queue, some throttling -> scale up (note we don't apply MinThrottling in this case) 143 expected.ProvisionedWrite = scaleUp(current.Name, 144 current.ProvisionedWrite, 145 expected.WriteScale.MaxCapacity, 146 computeScaleUp(current.ProvisionedWrite, expected.WriteScale.MaxCapacity, m.cfg.ScaleUpFactor), 147 m.tableLastUpdated, 148 expected.WriteScale.OutCooldown, 149 "metrics max queue scale-up", 150 "write") 151 case throttleRate > m.cfg.MinThrottling && 152 m.queueLengths[2] > float64(m.cfg.TargetQueueLen) && 153 m.queueLengths[2] > m.queueLengths[1] && m.queueLengths[1] > m.queueLengths[0]: 154 // Growing queue, some throttling -> scale up 155 expected.ProvisionedWrite = scaleUp(current.Name, 156 current.ProvisionedWrite, 157 expected.WriteScale.MaxCapacity, 158 computeScaleUp(current.ProvisionedWrite, expected.WriteScale.MaxCapacity, m.cfg.ScaleUpFactor), 159 m.tableLastUpdated, 160 expected.WriteScale.OutCooldown, 161 "metrics queue growing scale-up", 162 "write") 163 } 164 } 165 166 if expected.ReadScale.Enabled { 167 // default if no action is taken is to use the currently provisioned setting 168 expected.ProvisionedRead = current.ProvisionedRead 169 readUsageRate := m.usageReadRates[expected.Name] 170 readErrorRate := m.readErrorRates[expected.Name] 171 172 level.Info(util_log.Logger).Log("msg", "checking read metrics", "table", current.Name, "errorRate", readErrorRate, "readUsageRate", readUsageRate) 173 // Read Scaling 174 switch { 175 // the table is at low/minimum capacity and it is being used -> scale up 176 case readUsageRate > 0 && current.ProvisionedRead < expected.ReadScale.MaxCapacity/10: 177 expected.ProvisionedRead = scaleUp( 178 current.Name, 179 current.ProvisionedRead, 180 expected.ReadScale.MaxCapacity, 181 computeScaleUp(current.ProvisionedRead, expected.ReadScale.MaxCapacity, m.cfg.ScaleUpFactor), 182 m.tableReadLastUpdated, expected.ReadScale.OutCooldown, 183 "table is being used. scale up", 184 "read") 185 case readErrorRate > 0 && readUsageRate > 0: 186 // Queries are causing read throttling on the table -> scale up 187 expected.ProvisionedRead = scaleUp( 188 current.Name, 189 current.ProvisionedRead, 190 expected.ReadScale.MaxCapacity, 191 computeScaleUp(current.ProvisionedRead, expected.ReadScale.MaxCapacity, m.cfg.ScaleUpFactor), 192 m.tableReadLastUpdated, expected.ReadScale.OutCooldown, 193 "table is in use and there are read throttle errors, scale up", 194 "read") 195 case readErrorRate == 0 && readUsageRate == 0: 196 // this table is not being used. -> scale down 197 expected.ProvisionedRead = scaleDown(current.Name, 198 current.ProvisionedRead, 199 expected.ReadScale.MinCapacity, 200 computeScaleDown(current.Name, m.usageReadRates, expected.ReadScale.TargetValue), 201 m.tableReadLastUpdated, 202 expected.ReadScale.InCooldown, 203 "table is not in use. scale down", "read", 204 nil) 205 } 206 } 207 208 return nil 209 } 210 211 func computeScaleUp(currentValue, maxValue int64, scaleFactor float64) int64 { 212 scaleUp := int64(float64(currentValue) * scaleFactor) 213 // Scale up minimum of 10% of max capacity, to avoid futzing around at low levels 214 minIncrement := maxValue / 10 215 if scaleUp < currentValue+minIncrement { 216 scaleUp = currentValue + minIncrement 217 } 218 return scaleUp 219 } 220 221 func computeScaleDown(currentName string, usageRates map[string]float64, targetValue float64) int64 { 222 usageRate := usageRates[currentName] 223 return int64(usageRate * 100.0 / targetValue) 224 } 225 226 func scaleDown(tableName string, currentValue, minValue int64, newValue int64, lastUpdated map[string]time.Time, coolDown int64, msg, operation string, usageRates map[string]float64) int64 { 227 if newValue < minValue { 228 newValue = minValue 229 } 230 // If we're already at or below the requested value, it's not a scale-down. 231 if newValue >= currentValue { 232 return currentValue 233 } 234 235 earliest := lastUpdated[tableName].Add(time.Duration(coolDown) * time.Second) 236 if earliest.After(mtime.Now()) { 237 level.Info(util_log.Logger).Log("msg", "deferring "+msg, "table", tableName, "till", earliest, "op", operation) 238 return currentValue 239 } 240 241 // Reject a change that is less than 20% - AWS rate-limits scale-downs so save 242 // our chances until it makes a bigger difference 243 if newValue > currentValue*4/5 { 244 level.Info(util_log.Logger).Log("msg", "rejected de minimis "+msg, "table", tableName, "current", currentValue, "proposed", newValue, "op", operation) 245 return currentValue 246 } 247 248 if usageRates != nil { 249 // Check that the ingesters seem to be doing some work - don't want to scale down 250 // if all our metrics are returning zero, or all the ingesters have crashed, etc 251 totalUsage := 0.0 252 for _, u := range usageRates { 253 totalUsage += u 254 } 255 if totalUsage < minUsageForScaledown { 256 level.Info(util_log.Logger).Log("msg", "rejected low usage "+msg, "table", tableName, "totalUsage", totalUsage, "op", operation) 257 return currentValue 258 } 259 } 260 261 level.Info(util_log.Logger).Log("msg", msg, "table", tableName, operation, newValue) 262 lastUpdated[tableName] = mtime.Now() 263 return newValue 264 } 265 266 func scaleUp(tableName string, currentValue, maxValue int64, newValue int64, lastUpdated map[string]time.Time, coolDown int64, msg, operation string) int64 { 267 if newValue > maxValue { 268 newValue = maxValue 269 } 270 earliest := lastUpdated[tableName].Add(time.Duration(coolDown) * time.Second) 271 if !earliest.After(mtime.Now()) && newValue > currentValue { 272 level.Info(util_log.Logger).Log("msg", msg, "table", tableName, operation, newValue) 273 lastUpdated[tableName] = mtime.Now() 274 return newValue 275 } 276 277 level.Info(util_log.Logger).Log("msg", "deferring "+msg, "table", tableName, "till", earliest) 278 return currentValue 279 } 280 281 func (m *metricsData) update(ctx context.Context) error { 282 if m.promLastQuery.After(mtime.Now().Add(-cachePromDataFor)) { 283 return nil 284 } 285 286 m.promLastQuery = mtime.Now() 287 qlMatrix, err := promQuery(ctx, m.promAPI, m.cfg.QueueLengthQuery, queueObservationPeriod, queueObservationPeriod/2) 288 if err != nil { 289 return err 290 } 291 if len(qlMatrix) != 1 { 292 return errors.Errorf("expected one sample stream for queue: %d", len(qlMatrix)) 293 } 294 if len(qlMatrix[0].Values) != 3 { 295 return errors.Errorf("expected three values: %d", len(qlMatrix[0].Values)) 296 } 297 m.queueLengths = make([]float64, len(qlMatrix[0].Values)) 298 for i, v := range qlMatrix[0].Values { 299 m.queueLengths[i] = float64(v.Value) 300 } 301 302 deMatrix, err := promQuery(ctx, m.promAPI, m.cfg.ThrottleQuery, 0, time.Second) 303 if err != nil { 304 return err 305 } 306 if m.throttleRates, err = extractRates(deMatrix); err != nil { 307 return err 308 } 309 310 usageMatrix, err := promQuery(ctx, m.promAPI, m.cfg.UsageQuery, 0, time.Second) 311 if err != nil { 312 return err 313 } 314 if m.usageRates, err = extractRates(usageMatrix); err != nil { 315 return err 316 } 317 318 readUsageMatrix, err := promQuery(ctx, m.promAPI, m.cfg.ReadUsageQuery, 0, time.Second) 319 if err != nil { 320 return err 321 } 322 if m.usageReadRates, err = extractRates(readUsageMatrix); err != nil { 323 return err 324 } 325 326 readErrorMatrix, err := promQuery(ctx, m.promAPI, m.cfg.ReadErrorQuery, 0, time.Second) 327 if err != nil { 328 return err 329 } 330 if m.readErrorRates, err = extractRates(readErrorMatrix); err != nil { 331 return err 332 } 333 334 return nil 335 } 336 337 func extractRates(matrix model.Matrix) (map[string]float64, error) { 338 ret := map[string]float64{} 339 for _, s := range matrix { 340 table, found := s.Metric["table"] 341 if !found { 342 continue 343 } 344 if len(s.Values) != 1 { 345 return nil, errors.Errorf("expected one sample for table %s: %d", table, len(s.Values)) 346 } 347 ret[string(table)] = float64(s.Values[0].Value) 348 } 349 return ret, nil 350 } 351 352 func promQuery(ctx context.Context, promAPI promV1.API, query string, duration, step time.Duration) (model.Matrix, error) { 353 queryRange := promV1.Range{ 354 Start: mtime.Now().Add(-duration), 355 End: mtime.Now(), 356 Step: step, 357 } 358 359 value, wrngs, err := promAPI.QueryRange(ctx, query, queryRange) 360 if err != nil { 361 return nil, err 362 } 363 if wrngs != nil { 364 level.Warn(util_log.Logger).Log( 365 "query", query, 366 "start", queryRange.Start, 367 "end", queryRange.End, 368 "step", queryRange.Step, 369 "warnings", wrngs, 370 ) 371 } 372 matrix, ok := value.(model.Matrix) 373 if !ok { 374 return nil, fmt.Errorf("Unable to convert value to matrix: %#v", value) 375 } 376 return matrix, nil 377 }