// Source: github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/compactor/compactor.go

// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package compactor

import (
	"context"
	"fmt"
	"time"

	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/storage"
	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/cockroach/pkg/util/tracing"
	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/logtags"
)

// storeCapacityFunc reports the store capacity against which the
// used-fraction and available-fraction compaction thresholds are evaluated.
type storeCapacityFunc func() (roachpb.StoreCapacity, error)

// doneCompactingFunc is a callback invoked after each successful engine
// compaction performed by the Compactor.
type doneCompactingFunc func(ctx context.Context)

// A Compactor records suggested compactions and periodically
// makes requests to the engine to reclaim storage space.
type Compactor struct {
	// st supplies the cluster settings that tune the compactor (enablement,
	// intervals, thresholds, max record age) and the tracer used while
	// processing.
	st *cluster.Settings
	// eng is the engine that stores the suggested compaction records and
	// that is asked to compact key ranges.
	eng storage.Engine
	// capFn is queried once per processing pass for the store capacity.
	capFn storeCapacityFunc
	// doneFn, if non-nil, is called after every successful compaction.
	doneFn doneCompactingFunc
	// ch wakes the processing loop; it is written to without blocking.
	ch chan struct{}
	// Metrics holds the compactor's counters and gauges.
	Metrics Metrics
}

// NewCompactor returns a compactor for the specified storage engine.
49 func NewCompactor( 50 st *cluster.Settings, eng storage.Engine, capFn storeCapacityFunc, doneFn doneCompactingFunc, 51 ) *Compactor { 52 return &Compactor{ 53 st: st, 54 eng: eng, 55 capFn: capFn, 56 doneFn: doneFn, 57 ch: make(chan struct{}, 1), 58 Metrics: makeMetrics(), 59 } 60 } 61 62 func (c *Compactor) enabled() bool { 63 return enabled.Get(&c.st.SV) 64 } 65 66 func (c *Compactor) minInterval() time.Duration { 67 return minInterval.Get(&c.st.SV) 68 } 69 70 func (c *Compactor) thresholdBytes() int64 { 71 return thresholdBytes.Get(&c.st.SV) 72 } 73 74 func (c *Compactor) thresholdBytesUsedFraction() float64 { 75 return thresholdBytesUsedFraction.Get(&c.st.SV) 76 } 77 78 func (c *Compactor) thresholdBytesAvailableFraction() float64 { 79 return thresholdBytesAvailableFraction.Get(&c.st.SV) 80 } 81 82 func (c *Compactor) maxAge() time.Duration { 83 return maxSuggestedCompactionRecordAge.Get(&c.st.SV) 84 } 85 86 // poke instructs the compactor's main loop to react to new suggestions in a 87 // timely manner. 88 func (c *Compactor) poke() { 89 select { 90 case c.ch <- struct{}{}: 91 default: 92 } 93 } 94 95 // Start launches a compaction processing goroutine and exits when the 96 // provided stopper indicates. Processing is done with a periodicity of 97 // compactionMinInterval, but only if there are compactions pending. 98 func (c *Compactor) Start(ctx context.Context, stopper *stop.Stopper) { 99 ctx = logtags.AddTag(ctx, "compactor", "") 100 101 // Wake up immediately to examine the queue and set the bytes queued metric. 102 // Note that the compactor may have received suggestions before having been 103 // started (this isn't great, but it's how it is right now). 104 c.poke() 105 106 // Run the Worker in a Task because the worker holds on to the engine and 107 // may still access it even though the stopper has allowed it to close. 
108 _ = stopper.RunTask(ctx, "compactor", func(ctx context.Context) { 109 stopper.RunWorker(ctx, func(ctx context.Context) { 110 var timer timeutil.Timer 111 defer timer.Stop() 112 113 // The above timer will either be on c.minInterval() or c.maxAge(). The 114 // former applies if we know there are new suggestions waiting to be 115 // inspected: we want to look at them soon, but also want to make sure 116 // "related" suggestions arrive before we start compacting. When no new 117 // suggestions have been made since the last inspection, the expectation 118 // is that all we have to do is clean up any previously skipped ones (at 119 // least after sufficient time has passed), and so we wait out the max age. 120 var isFast bool 121 122 for { 123 select { 124 case <-stopper.ShouldStop(): 125 return 126 127 case <-c.ch: 128 // A new suggestion was made. Examine the compaction queue, 129 // which returns the number of bytes queued. 130 if bytesQueued, err := c.examineQueue(ctx); err != nil { 131 log.Warningf(ctx, "failed check whether compaction suggestions exist: %+v", err) 132 } else if bytesQueued > 0 { 133 log.VEventf(ctx, 3, "compactor starting in %s as there are suggested compactions pending", c.minInterval()) 134 } else { 135 // Queue is empty, don't set the timer. This can happen only at startup. 136 break 137 } 138 // Set the wait timer if not already set. 139 if !isFast { 140 isFast = true 141 timer.Reset(c.minInterval()) 142 } 143 144 case <-timer.C: 145 timer.Read = true 146 ok, err := c.processSuggestions(ctx) 147 if err != nil { 148 log.Warningf(ctx, "failed processing suggested compactions: %+v", err) 149 } 150 if ok { 151 // The queue was processed, so either it's empty or contains suggestions 152 // that were skipped for now. Revisit when they are certainly expired. 153 isFast = false 154 timer.Reset(c.maxAge()) 155 break 156 } 157 // More work to do, revisit after minInterval. 
Note that basically 158 // `ok == (err == nil)` but this refactor is left for a future commit. 159 isFast = true 160 timer.Reset(c.minInterval()) 161 } 162 } 163 }) 164 }) 165 } 166 167 // aggregatedCompaction is a utility struct that holds information 168 // about aggregated suggested compactions. 169 type aggregatedCompaction struct { 170 kvserverpb.SuggestedCompaction 171 suggestions []kvserverpb.SuggestedCompaction 172 startIdx int 173 total int 174 } 175 176 func initAggregatedCompaction( 177 startIdx, total int, sc kvserverpb.SuggestedCompaction, 178 ) aggregatedCompaction { 179 return aggregatedCompaction{ 180 SuggestedCompaction: sc, 181 suggestions: []kvserverpb.SuggestedCompaction{sc}, 182 startIdx: startIdx, 183 total: total, 184 } 185 } 186 187 func (aggr aggregatedCompaction) String() string { 188 var seqFmt string 189 if len(aggr.suggestions) == 1 { 190 seqFmt = fmt.Sprintf("#%d/%d", aggr.startIdx+1, aggr.total) 191 } else { 192 seqFmt = fmt.Sprintf("#%d-%d/%d", aggr.startIdx+1, aggr.startIdx+len(aggr.suggestions), aggr.total) 193 } 194 return fmt.Sprintf("%s (%s-%s) for %s", seqFmt, aggr.StartKey, aggr.EndKey, humanizeutil.IBytes(aggr.Bytes)) 195 } 196 197 // processSuggestions considers all suggested compactions and 198 // processes contiguous or nearly contiguous aggregations if they 199 // exceed the absolute or fractional size thresholds. If suggested 200 // compactions don't meet thresholds, they're discarded if they're 201 // older than maxSuggestedCompactionRecordAge. Returns a boolean 202 // indicating whether the queue was successfully processed. 203 func (c *Compactor) processSuggestions(ctx context.Context) (bool, error) { 204 ctx, cleanup := tracing.EnsureContext(ctx, c.st.Tracer, "process suggested compactions") 205 defer cleanup() 206 207 suggestions, totalBytes, err := c.fetchSuggestions(ctx) 208 if err != nil { 209 return false, err 210 } 211 212 // Update at start of processing. 
Note that totalBytes is decremented and 213 // updated after any compactions which are processed. 214 c.Metrics.BytesQueued.Update(totalBytes) 215 216 if len(suggestions) == 0 { 217 return false, nil 218 } 219 220 log.Eventf(ctx, "considering %d suggested compaction(s)", len(suggestions)) 221 222 // Determine whether to attempt a compaction to reclaim space during 223 // this processing. The decision is based on total bytes to free up 224 // and the time since the last processing. 225 capacity, err := c.capFn() 226 if err != nil { 227 return false, err 228 } 229 230 // Get information about SSTables in the underlying RocksDB instance. 231 ssti := storage.NewSSTableInfosByLevel(c.eng.GetSSTables()) 232 233 // Update the bytes queued metric based, periodically querying the persisted 234 // suggestions so that we pick up newly added suggestions in the case where 235 // we're processing a large number of suggestions. 236 lastUpdate := timeutil.Now() 237 updateBytesQueued := func(delta int64) error { 238 totalBytes -= delta 239 if timeutil.Since(lastUpdate) >= 10*time.Second { 240 lastUpdate = timeutil.Now() 241 bytes, err := c.examineQueue(ctx) 242 if err != nil { 243 return err 244 } 245 totalBytes = bytes 246 // NB: examineQueue updates the BytesQueued metric. 247 } else { 248 c.Metrics.BytesQueued.Update(totalBytes) 249 } 250 return nil 251 } 252 253 // Iterate through suggestions, merging them into a running 254 // aggregation. Aggregates which exceed size thresholds are compacted. Small, 255 // isolated suggestions will be ignored until becoming too old, at which 256 // point they are discarded without compaction. 257 aggr := initAggregatedCompaction(0, len(suggestions), suggestions[0]) 258 for i, sc := range suggestions[1:] { 259 // Aggregate current suggestion with running aggregate if possible. If 260 // the current suggestion cannot be merged with the aggregate, process 261 // it if it meets compaction thresholds. 
262 if done := c.aggregateCompaction(ctx, ssti, &aggr, sc); done { 263 processedBytes, err := c.processCompaction(ctx, aggr, capacity) 264 if err != nil { 265 log.Errorf(ctx, "failed processing suggested compactions %+v: %+v", aggr, err) 266 } else if err := updateBytesQueued(processedBytes); err != nil { 267 log.Errorf(ctx, "failed updating bytes queued metric %+v", err) 268 } 269 // Reset aggregation to the last, un-aggregated, suggested compaction. 270 aggr = initAggregatedCompaction(i, len(suggestions), sc) 271 } 272 } 273 // Process remaining aggregated compaction. 274 processedBytes, err := c.processCompaction(ctx, aggr, capacity) 275 if err != nil { 276 return false, err 277 } 278 if err := updateBytesQueued(processedBytes); err != nil { 279 log.Errorf(ctx, "failed updating bytes queued metric %+v", err) 280 } 281 282 return true, nil 283 } 284 285 // fetchSuggestions loads the persisted suggested compactions from the store. 286 func (c *Compactor) fetchSuggestions( 287 ctx context.Context, 288 ) (suggestions []kvserverpb.SuggestedCompaction, totalBytes int64, err error) { 289 dataIter := c.eng.NewIterator(storage.IterOptions{ 290 UpperBound: roachpb.KeyMax, // refined before every seek 291 }) 292 defer dataIter.Close() 293 294 delBatch := c.eng.NewBatch() 295 defer delBatch.Close() 296 297 err = c.eng.Iterate( 298 keys.LocalStoreSuggestedCompactionsMin, 299 keys.LocalStoreSuggestedCompactionsMax, 300 func(kv storage.MVCCKeyValue) (bool, error) { 301 var sc kvserverpb.SuggestedCompaction 302 var err error 303 sc.StartKey, sc.EndKey, err = keys.DecodeStoreSuggestedCompactionKey(kv.Key.Key) 304 if err != nil { 305 return false, errors.Wrapf(err, "failed to decode suggested compaction key") 306 } 307 if err := protoutil.Unmarshal(kv.Value, &sc.Compaction); err != nil { 308 return false, err 309 } 310 311 dataIter.SetUpperBound(sc.EndKey) 312 dataIter.SeekGE(storage.MakeMVCCMetadataKey(sc.StartKey)) 313 if ok, err := dataIter.Valid(); err != nil { 314 return 
false, err 315 } else if ok && dataIter.UnsafeKey().Less(storage.MakeMVCCMetadataKey(sc.EndKey)) { 316 // The suggested compaction span has live keys remaining. This is a 317 // strong indicator that compacting this range will be significantly 318 // more expensive than we expected when the compaction was suggested, as 319 // compactions are only suggested when a ClearRange request has removed 320 // all the keys in the span. Perhaps a replica was rebalanced away then 321 // back? 322 // 323 // Since we can't guarantee that this compaction will be an easy win, 324 // purge it to avoid bogging down the compaction queue. 325 log.Infof(ctx, "purging suggested compaction for range %s - %s that contains live data", 326 sc.StartKey, sc.EndKey) 327 if err := delBatch.Clear(kv.Key); err != nil { 328 log.Fatalf(ctx, "%v", err) // should never happen on a batch 329 } 330 c.Metrics.BytesSkipped.Inc(sc.Bytes) 331 } else { 332 suggestions = append(suggestions, sc) 333 totalBytes += sc.Bytes 334 } 335 336 return false, nil // continue iteration 337 }, 338 ) 339 if err != nil { 340 return nil, 0, err 341 } 342 if err := delBatch.Commit(true); err != nil { 343 log.Warningf(ctx, "unable to delete suggested compaction records: %+v", err) 344 } 345 return suggestions, totalBytes, nil 346 } 347 348 // processCompaction sends CompactRange requests to the storage engine if the 349 // aggregated suggestion exceeds size threshold(s). Otherwise, it either skips 350 // the compaction or skips the compaction *and* deletes the suggested compaction 351 // records if they're too old (and in particular, if the compactor is disabled, 352 // deletes any suggestions handed to it). Returns the number of bytes processed 353 // (either compacted or skipped and deleted due to age). 
354 func (c *Compactor) processCompaction( 355 ctx context.Context, aggr aggregatedCompaction, capacity roachpb.StoreCapacity, 356 ) (int64, error) { 357 aboveSizeThresh := aggr.Bytes >= c.thresholdBytes() 358 aboveUsedFracThresh := func() bool { 359 thresh := c.thresholdBytesUsedFraction() 360 return thresh > 0 && aggr.Bytes >= int64(float64(capacity.LogicalBytes)*thresh) 361 }() 362 aboveAvailFracThresh := func() bool { 363 thresh := c.thresholdBytesAvailableFraction() 364 return thresh > 0 && aggr.Bytes >= int64(float64(capacity.Available)*thresh) 365 }() 366 367 shouldProcess := c.enabled() && (aboveSizeThresh || aboveUsedFracThresh || aboveAvailFracThresh) 368 if shouldProcess { 369 startTime := timeutil.Now() 370 log.Infof(ctx, 371 "processing compaction %s (reasons: size=%t used=%t avail=%t)", 372 aggr, aboveSizeThresh, aboveUsedFracThresh, aboveAvailFracThresh, 373 ) 374 375 if err := c.eng.CompactRange(aggr.StartKey, aggr.EndKey, false /* forceBottommost */); err != nil { 376 c.Metrics.CompactionFailures.Inc(1) 377 return 0, errors.Wrapf(err, "unable to compact range %+v", aggr) 378 } 379 c.Metrics.BytesCompacted.Inc(aggr.Bytes) 380 c.Metrics.CompactionSuccesses.Inc(1) 381 duration := timeutil.Since(startTime) 382 c.Metrics.CompactingNanos.Inc(int64(duration)) 383 if c.doneFn != nil { 384 c.doneFn(ctx) 385 } 386 log.Infof(ctx, "processed compaction %s in %.1fs", aggr, duration.Seconds()) 387 } else { 388 log.VEventf(ctx, 2, "skipping compaction(s) %s", aggr) 389 } 390 391 delBatch := c.eng.NewWriteOnlyBatch() 392 393 // Delete suggested compaction records if appropriate. 394 for _, sc := range aggr.suggestions { 395 age := timeutil.Since(timeutil.Unix(0, sc.SuggestedAtNanos)) 396 tooOld := age >= c.maxAge() || !c.enabled() 397 // Delete unless we didn't process and the record isn't too old. 
398 if !shouldProcess && !tooOld { 399 continue 400 } 401 if tooOld { 402 c.Metrics.BytesSkipped.Inc(sc.Bytes) 403 } 404 key := keys.StoreSuggestedCompactionKey(sc.StartKey, sc.EndKey) 405 if err := delBatch.Clear(storage.MVCCKey{Key: key}); err != nil { 406 log.Fatalf(ctx, "%v", err) // should never happen on a batch 407 } 408 } 409 410 if err := delBatch.Commit(true); err != nil { 411 log.Warningf(ctx, "unable to delete suggested compaction records: %+v", err) 412 } 413 delBatch.Close() 414 415 if shouldProcess { 416 return aggr.Bytes, nil 417 } 418 return 0, nil 419 } 420 421 // aggregateCompaction merges sc into aggr, to create a new suggested 422 // compaction, if the key spans are overlapping or near-contiguous. Note that 423 // because suggested compactions are stored sorted by their start key, 424 // sc.StartKey >= aggr.StartKey. Returns true if we couldn't add the new 425 // suggested compaction to the aggregation and are therefore done building the 426 // current aggregation and should process it. Returns false if we should 427 // continue aggregating suggested compactions. 428 func (c *Compactor) aggregateCompaction( 429 ctx context.Context, 430 ssti storage.SSTableInfosByLevel, 431 aggr *aggregatedCompaction, 432 sc kvserverpb.SuggestedCompaction, 433 ) (done bool) { 434 // Don't bother aggregating more once we reach threshold bytes. 435 if aggr.Bytes >= c.thresholdBytes() { 436 return true // suggested compation could not be aggregated 437 } 438 439 // If the key spans don't overlap, then check whether they're 440 // "nearly" contiguous. 441 if aggr.EndKey.Compare(sc.StartKey) < 0 { 442 // Aggregate if the gap between current aggregate and proposed 443 // compaction span overlaps (at most) two contiguous SSTables at 444 // the bottommost level. 
445 span := roachpb.Span{Key: aggr.EndKey, EndKey: sc.StartKey} 446 maxLevel := ssti.MaxLevelSpanOverlapsContiguousSSTables(span) 447 if maxLevel < ssti.MaxLevel() { 448 return true // suggested compaction could not be aggregated 449 } 450 } 451 452 // We can aggregate, so merge sc into aggr. 453 if aggr.EndKey.Compare(sc.EndKey) < 0 { 454 aggr.EndKey = sc.EndKey 455 } 456 aggr.Bytes += sc.Bytes 457 aggr.suggestions = append(aggr.suggestions, sc) 458 return false // aggregated successfully 459 } 460 461 // examineQueue returns the total number of bytes queued and updates the 462 // BytesQueued gauge. 463 func (c *Compactor) examineQueue(ctx context.Context) (int64, error) { 464 var totalBytes int64 465 if err := c.eng.Iterate( 466 keys.LocalStoreSuggestedCompactionsMin, 467 keys.LocalStoreSuggestedCompactionsMax, 468 func(kv storage.MVCCKeyValue) (bool, error) { 469 var c kvserverpb.Compaction 470 if err := protoutil.Unmarshal(kv.Value, &c); err != nil { 471 return false, err 472 } 473 totalBytes += c.Bytes 474 return false, nil // continue iteration 475 }, 476 ); err != nil { 477 return 0, err 478 } 479 c.Metrics.BytesQueued.Update(totalBytes) 480 return totalBytes, nil 481 } 482 483 // Suggest writes the specified compaction to persistent storage and 484 // pings the processing goroutine. 485 func (c *Compactor) Suggest(ctx context.Context, sc kvserverpb.SuggestedCompaction) { 486 log.VEventf(ctx, 2, "suggested compaction from %s - %s: %+v", sc.StartKey, sc.EndKey, sc.Compaction) 487 488 // Check whether a suggested compaction already exists for this key span. 
489 key := keys.StoreSuggestedCompactionKey(sc.StartKey, sc.EndKey) 490 var existing kvserverpb.Compaction 491 //lint:ignore SA1019 historical usage of deprecated c.eng.GetProto is OK 492 ok, _, _, err := c.eng.GetProto(storage.MVCCKey{Key: key}, &existing) 493 if err != nil { 494 log.VErrEventf(ctx, 2, "unable to record suggested compaction: %s", err) 495 return 496 } 497 498 // If there's already a suggested compaction, merge them. Note that 499 // this method is only called after clearing keys from the underlying 500 // storage engine. All such actions really do result in successively 501 // more bytes being made available for compaction, so there is no 502 // double-counting if the same range were cleared twice. 503 if ok { 504 sc.Bytes += existing.Bytes 505 } 506 507 // Store the new compaction. 508 //lint:ignore SA1019 historical usage of deprecated engine.PutProto is OK 509 if _, _, err = storage.PutProto(c.eng, storage.MVCCKey{Key: key}, &sc.Compaction); err != nil { 510 log.Warningf(ctx, "unable to record suggested compaction: %+v", err) 511 } 512 513 // Poke the compactor goroutine to reconsider compaction in light of 514 // this new suggested compaction. 515 c.poke() 516 }