github.com/grafana/pyroscope@v1.18.0/pkg/ingester/retention.go (about) 1 package ingester 2 3 import ( 4 "context" 5 "encoding/json" 6 "fmt" 7 "io/fs" 8 "os" 9 "path/filepath" 10 "sort" 11 "sync" 12 "time" 13 14 "github.com/go-kit/log" 15 "github.com/go-kit/log/level" 16 "github.com/grafana/dskit/services" 17 "github.com/oklog/ulid/v2" 18 19 "github.com/grafana/pyroscope/pkg/phlaredb" 20 "github.com/grafana/pyroscope/pkg/phlaredb/block" 21 "github.com/grafana/pyroscope/pkg/phlaredb/shipper" 22 diskutil "github.com/grafana/pyroscope/pkg/util/disk" 23 ) 24 25 const ( 26 // TODO(kolesnikovae): Unify with pkg/phlaredb. 27 phlareDBLocalPath = "local" 28 ) 29 30 // newDiskCleaner creates a service that will intermittently clean blocks from 31 // disk. 32 func newDiskCleaner(logger log.Logger, evictor blockEvictor, policy retentionPolicy, cfg phlaredb.Config) *diskCleaner { 33 dc := &diskCleaner{ 34 logger: logger, 35 policy: policy, 36 config: cfg, 37 blockManager: newFSBlockManager(cfg.DataPath, evictor, newFS()), 38 volumeChecker: diskutil.NewVolumeChecker(policy.MinFreeDisk*1024*1024*1024, policy.MinDiskAvailablePercentage), 39 stop: make(chan struct{}), 40 } 41 dc.Service = services.NewBasicService(nil, dc.running, dc.stopping) 42 43 return dc 44 } 45 46 // newFSBlockManager creates a component that can manage blocks on a file system. 47 func newFSBlockManager(root string, evictor blockEvictor, fs fileSystem) fsBlockManager { 48 return &realFSBlockManager{ 49 Root: root, 50 Evictor: evictor, 51 FS: fs, 52 } 53 } 54 55 // newFS creates a file system implementation that interacts directly with the 56 // OS file system. 57 func newFS() fileSystem { 58 return &realFS{} 59 } 60 61 func defaultRetentionPolicy() retentionPolicy { 62 return retentionPolicy{ 63 MinFreeDisk: phlaredb.DefaultMinFreeDisk, 64 MinDiskAvailablePercentage: phlaredb.DefaultMinDiskAvailablePercentage, 65 EnforcementInterval: phlaredb.DefaultRetentionPolicyEnforcementInterval, 66 Expiry: phlaredb.DefaultRetentionExpiry, 67 } 68 } 69 70 type retentionPolicy struct { 71 MinFreeDisk uint64 72 MinDiskAvailablePercentage float64 73 EnforcementInterval time.Duration 74 Expiry time.Duration 75 } 76 77 // diskCleaner monitors disk usage and cleans unused data. 78 type diskCleaner struct { 79 services.Service 80 81 logger log.Logger 82 config phlaredb.Config 83 policy retentionPolicy 84 blockManager fsBlockManager 85 volumeChecker diskutil.VolumeChecker 86 87 stop chan struct{} 88 wg sync.WaitGroup 89 } 90 91 func (dc *diskCleaner) running(ctx context.Context) error { 92 dc.wg.Add(1) 93 ticker := time.NewTicker(dc.policy.EnforcementInterval) 94 defer func() { 95 ticker.Stop() 96 dc.wg.Done() 97 }() 98 99 var deleted int 100 var bytesDeleted int 101 var hasHighDiskUtilization bool 102 for { 103 deleted = dc.DeleteUploadedBlocks(ctx) 104 level.Debug(dc.logger).Log("msg", "cleaned uploaded blocks", "count", deleted) 105 106 deleted, bytesDeleted, hasHighDiskUtilization = dc.CleanupBlocksWhenHighDiskUtilization(ctx) 107 if hasHighDiskUtilization { 108 level.Debug(dc.logger).Log( 109 "msg", "cleaned files after high disk utilization", 110 "deleted_blocks", deleted, 111 "deleted_bytes", bytesDeleted, 112 ) 113 } 114 115 select { 116 case <-ticker.C: 117 case <-ctx.Done(): 118 return nil 119 case <-dc.stop: 120 return nil 121 } 122 } 123 } 124 125 func (dc *diskCleaner) stopping(_ error) error { 126 close(dc.stop) 127 dc.wg.Wait() 128 return nil 129 } 130 131 // DeleteUploadedBlocks scans and deletes blocks on all tenants that have 132 // already been uploaded. It returns the number of blocks deleted. 133 func (dc *diskCleaner) DeleteUploadedBlocks(ctx context.Context) int { 134 tenantIDs, err := dc.blockManager.GetTenantIDs(ctx) 135 if err != nil { 136 level.Error(dc.logger).Log( 137 "msg", "failed to delete uploaded blocks, could not read tenant ids", 138 "err", err, 139 ) 140 return 0 141 } 142 143 var deleted int 144 for _, tenantID := range tenantIDs { 145 blocks, err := dc.blockManager.GetBlocksForTenant(ctx, tenantID) 146 if err != nil { 147 level.Error(dc.logger).Log( 148 "msg", "failed to delete uploaded blocks, could not get blocks for tenant", 149 "err", err, 150 "tenantID", tenantID, 151 ) 152 continue 153 } 154 155 for _, block := range blocks { 156 if !block.Uploaded || !dc.isExpired(block) { 157 continue 158 } 159 160 err = dc.blockManager.DeleteBlock(ctx, block) 161 switch { 162 case os.IsNotExist(err): 163 level.Warn(dc.logger).Log( 164 "msg", "failed to delete uploaded block, does not exist", 165 "err", err, 166 "path", block.Path, 167 ) 168 case err != nil: 169 level.Error(dc.logger).Log( 170 "msg", "failed to delete uploaded block", 171 "err", err, 172 "path", block.Path, 173 ) 174 default: 175 deleted++ 176 } 177 } 178 } 179 return deleted 180 } 181 182 // CleanupBlocksWhenHighDiskUtilization will run more aggressive disk cleaning 183 // if high disk utilization is detected by deleting blocks that have been 184 // uploaded but may not necessarily have been synced with the store gateway. It 185 // returns true if high disk utilization was detected, along with the number of 186 // files deleted and the estimated bytes recovered. If no high disk utilization 187 // was detected, false is returned. 188 func (dc *diskCleaner) CleanupBlocksWhenHighDiskUtilization(ctx context.Context) (int, int, bool) { 189 volumeStats, err := dc.volumeChecker.HasHighDiskUtilization(dc.config.DataPath) 190 if err != nil { 191 level.Error(dc.logger).Log( 192 "msg", "failed run high disk cleanup, failed to check disk utilization", 193 "err", err, 194 ) 195 return 0, 0, false 196 } 197 198 // Not in high disk utilization, nothing to do. 199 if !volumeStats.HighDiskUtilization { 200 return 0, 0, false 201 } 202 originalBytesAvailable := volumeStats.BytesAvailable 203 204 tenantIDs, err := dc.blockManager.GetTenantIDs(ctx) 205 if err != nil { 206 level.Error(dc.logger).Log( 207 "msg", "failed run high disk cleanup, could not read tenant ids", 208 "err", err, 209 ) 210 return 0, 0, true 211 } 212 213 blocks := make([]*tenantBlock, 0) 214 for _, tenantID := range tenantIDs { 215 tenantBlocks, err := dc.blockManager.GetBlocksForTenant(ctx, tenantID) 216 if err != nil { 217 level.Error(dc.logger).Log( 218 "msg", "failed to get blocks for tenant", 219 "tenantID", tenantID, 220 "err", err, 221 ) 222 223 // Keep trying to read blocks from other tenants. 224 continue 225 } 226 227 blocks = append(blocks, tenantBlocks...) 228 } 229 230 // Sort by uploaded, then age (oldest first). 231 sort.Sort(blocksByUploadAndAge(blocks)) 232 233 prevVolumeStats := &diskutil.VolumeStats{} 234 filesDeleted := 0 235 for _, block := range blocks { 236 if !dc.isExpired(block) { 237 continue 238 } 239 240 // Delete a block. 241 err = dc.blockManager.DeleteBlock(ctx, block) 242 switch { 243 case os.IsNotExist(err): 244 level.Warn(dc.logger).Log( 245 "msg", "failed to delete block, does not exist", 246 "err", err, 247 "path", block.Path, 248 ) 249 return filesDeleted, int(volumeStats.BytesAvailable - originalBytesAvailable), true 250 case err != nil: 251 level.Error(dc.logger).Log( 252 "msg", "failed run high disk cleanup, could not delete block", 253 "path", block.Path, 254 "err", err, 255 ) 256 return filesDeleted, int(volumeStats.BytesAvailable - originalBytesAvailable), true 257 default: 258 filesDeleted++ 259 } 260 261 // Recheck volume stats. 262 prevVolumeStats = volumeStats 263 volumeStats, err = dc.volumeChecker.HasHighDiskUtilization(dc.config.DataPath) 264 if err != nil { 265 level.Error(dc.logger).Log( 266 "msg", "failed to check disk utilization", 267 "err", err, 268 ) 269 break 270 } 271 272 if !volumeStats.HighDiskUtilization { 273 // No longer in high disk utilization. 274 break 275 } 276 277 if prevVolumeStats.BytesAvailable >= volumeStats.BytesAvailable { 278 // Disk utilization has not been lowered since the last block was 279 // deleted. There may be a delay in VolumeChecker reporting disk 280 // utilization. In an effort to be conservative when deleting 281 // blocks, stop the clean up now and wait for the next cycle to let 282 // VolumeChecker catch up on the current state of the disk. 283 level.Warn(dc.logger).Log("msg", "disk utilization is not lowered by deletion of a block, pausing until next cycle") 284 break 285 } 286 } 287 288 return filesDeleted, int(volumeStats.BytesAvailable - originalBytesAvailable), true 289 } 290 291 // isBlockDeletable returns true if this block can be deleted. 292 func (dc *diskCleaner) isExpired(block *tenantBlock) bool { 293 // TODO(kolesnikovae): 294 // Expiry defaults to -querier.query-store-after which should be deprecated, 295 // blocks-storage.bucket-store.ignore-blocks-within can be used instead. 296 expiryTs := time.Now().Add(-dc.policy.Expiry) 297 return ulid.Time(block.ID.Time()).Before(expiryTs) 298 } 299 300 // blocksByUploadAndAge implements sorting tenantBlock by uploaded then by age 301 // in ascending order. 302 type blocksByUploadAndAge []*tenantBlock 303 304 func (b blocksByUploadAndAge) Len() int { return len(b) } 305 func (b blocksByUploadAndAge) Swap(i, j int) { b[i], b[j] = b[j], b[i] } 306 func (b blocksByUploadAndAge) Less(i, j int) bool { 307 switch { 308 case b[i].Uploaded == b[j].Uploaded: 309 return b[i].ID.Compare(b[j].ID) < 0 310 case b[i].Uploaded: 311 return !b[j].Uploaded 312 case b[j].Uploaded: 313 fallthrough 314 default: 315 return b[i].Uploaded 316 } 317 } 318 319 // blockEvictor unloads blocks from tenant instance. 320 type blockEvictor interface { 321 // evictBlock evicts the block by its ID from the memory and 322 // invokes fn callback, regardless of if the tenant is found. 323 // The call is thread-safe: tenant can't be added or removed 324 // during the execution. 325 evictBlock(tenant string, b ulid.ULID, fn func() error) error 326 } 327 328 type fileSystem interface { 329 fs.ReadDirFS 330 RemoveAll(name string) error 331 } 332 333 type realFS struct{} 334 335 func (*realFS) Open(name string) (fs.File, error) { return os.Open(name) } 336 func (*realFS) ReadDir(name string) ([]fs.DirEntry, error) { return os.ReadDir(name) } 337 func (*realFS) RemoveAll(path string) error { return os.RemoveAll(path) } 338 339 type tenantBlock struct { 340 ID ulid.ULID 341 TenantID string 342 Path string 343 Uploaded bool 344 } 345 346 func (t *tenantBlock) String() string { 347 return t.ID.String() 348 } 349 350 type fsBlockManager interface { 351 GetTenantIDs(ctx context.Context) ([]string, error) 352 GetBlocksForTenant(ctx context.Context, tenantID string) ([]*tenantBlock, error) 353 DeleteBlock(ctx context.Context, block *tenantBlock) error 354 } 355 356 type realFSBlockManager struct { 357 Root string 358 Evictor blockEvictor 359 FS fileSystem 360 } 361 362 func (bm *realFSBlockManager) getUploadedBlockIds(tenantID string) (map[ulid.ULID]struct{}, error) { 363 localDirPath := filepath.Join(bm.Root, tenantID, phlareDBLocalPath) 364 365 shipperPath := filepath.Join(localDirPath, shipper.MetaFilename) 366 bytes, err := fs.ReadFile(bm.FS, shipperPath) 367 if err != nil { 368 if os.IsNotExist(err) { 369 return make(map[ulid.ULID]struct{}), nil 370 } 371 return nil, err 372 } 373 374 var meta shipper.Meta 375 err = json.Unmarshal(bytes, &meta) 376 if err != nil { 377 return nil, err 378 } 379 380 uploadedBlockIDs := make(map[ulid.ULID]struct{}, len(meta.Uploaded)) 381 for _, id := range meta.Uploaded { 382 uploadedBlockIDs[id] = struct{}{} 383 } 384 385 return uploadedBlockIDs, nil 386 } 387 388 func (bm *realFSBlockManager) GetTenantIDs(ctx context.Context) ([]string, error) { 389 if ctx.Err() != nil { 390 return nil, ctx.Err() 391 } 392 393 dirs, err := fs.ReadDir(bm.FS, bm.Root) 394 if err != nil { 395 return nil, err 396 } 397 398 tenantIDs := make([]string, 0) 399 for _, dir := range dirs { 400 if !bm.isTenantDir(bm.Root, dir) { 401 continue 402 } 403 404 tenantIDs = append(tenantIDs, dir.Name()) 405 } 406 return tenantIDs, nil 407 } 408 409 func (bm *realFSBlockManager) GetBlocksForTenant(ctx context.Context, tenantID string) ([]*tenantBlock, error) { 410 if ctx.Err() != nil { 411 return nil, ctx.Err() 412 } 413 414 localDirPath := filepath.Join(bm.Root, tenantID, phlareDBLocalPath) 415 blockDirs, err := fs.ReadDir(bm.FS, localDirPath) 416 if err != nil { 417 return nil, err 418 } 419 420 uploadedBlockIDs, err := bm.getUploadedBlockIds(tenantID) 421 if err != nil { 422 return nil, err 423 } 424 425 // Read blocks. 426 blocks := make([]*tenantBlock, 0) 427 for _, blockDir := range blockDirs { 428 if !blockDir.IsDir() { 429 continue 430 } 431 432 path := filepath.Join(localDirPath, blockDir.Name()) 433 blockID, ok := block.IsBlockDir(path) 434 if !ok { 435 // A malformed/invalid ULID likely means that the directory is not a 436 // valid block, ignoring. 437 continue 438 } 439 440 _, uploaded := uploadedBlockIDs[blockID] 441 blocks = append(blocks, &tenantBlock{ 442 ID: blockID, 443 TenantID: tenantID, 444 Path: path, 445 Uploaded: uploaded, 446 }) 447 } 448 return blocks, nil 449 } 450 451 func (bm *realFSBlockManager) DeleteBlock(ctx context.Context, block *tenantBlock) error { 452 if ctx.Err() != nil { 453 return ctx.Err() 454 } 455 456 return bm.Evictor.evictBlock(block.TenantID, block.ID, func() error { 457 err := bm.FS.RemoveAll(block.Path) 458 switch { 459 case os.IsNotExist(err): 460 return err 461 case err != nil: 462 return fmt.Errorf("failed to delete block: %q: %w", block.Path, err) 463 } 464 return nil 465 }) 466 } 467 468 // isTenantDir checks if a directory is a tenant directory. 469 func (bm *realFSBlockManager) isTenantDir(path string, entry fs.DirEntry) bool { 470 if !entry.IsDir() { 471 return false 472 } 473 474 subEntries, err := bm.FS.ReadDir(filepath.Join(path, entry.Name())) 475 if err != nil { 476 return false 477 } 478 479 foundLocalDir := false 480 for _, subEntry := range subEntries { 481 if !subEntry.IsDir() { 482 continue 483 } 484 485 if subEntry.Name() == phlareDBLocalPath { 486 foundLocalDir = true 487 break 488 } 489 } 490 return foundLocalDir 491 }