github.com/grafana/pyroscope@v1.18.0/pkg/ingester/retention.go (about)

     1  package ingester
     2  
     3  import (
     4  	"context"
     5  	"encoding/json"
     6  	"fmt"
     7  	"io/fs"
     8  	"os"
     9  	"path/filepath"
    10  	"sort"
    11  	"sync"
    12  	"time"
    13  
    14  	"github.com/go-kit/log"
    15  	"github.com/go-kit/log/level"
    16  	"github.com/grafana/dskit/services"
    17  	"github.com/oklog/ulid/v2"
    18  
    19  	"github.com/grafana/pyroscope/pkg/phlaredb"
    20  	"github.com/grafana/pyroscope/pkg/phlaredb/block"
    21  	"github.com/grafana/pyroscope/pkg/phlaredb/shipper"
    22  	diskutil "github.com/grafana/pyroscope/pkg/util/disk"
    23  )
    24  
const (
	// TODO(kolesnikovae): Unify with pkg/phlaredb.
	// phlareDBLocalPath is the name of the per-tenant subdirectory that
	// holds locally stored blocks (i.e. <root>/<tenant>/local/<block-ulid>).
	phlareDBLocalPath = "local"
)
    29  
    30  // newDiskCleaner creates a service that will intermittently clean blocks from
    31  // disk.
    32  func newDiskCleaner(logger log.Logger, evictor blockEvictor, policy retentionPolicy, cfg phlaredb.Config) *diskCleaner {
    33  	dc := &diskCleaner{
    34  		logger:        logger,
    35  		policy:        policy,
    36  		config:        cfg,
    37  		blockManager:  newFSBlockManager(cfg.DataPath, evictor, newFS()),
    38  		volumeChecker: diskutil.NewVolumeChecker(policy.MinFreeDisk*1024*1024*1024, policy.MinDiskAvailablePercentage),
    39  		stop:          make(chan struct{}),
    40  	}
    41  	dc.Service = services.NewBasicService(nil, dc.running, dc.stopping)
    42  
    43  	return dc
    44  }
    45  
    46  // newFSBlockManager creates a component that can manage blocks on a file system.
    47  func newFSBlockManager(root string, evictor blockEvictor, fs fileSystem) fsBlockManager {
    48  	return &realFSBlockManager{
    49  		Root:    root,
    50  		Evictor: evictor,
    51  		FS:      fs,
    52  	}
    53  }
    54  
    55  // newFS creates a file system implementation that interacts directly with the
    56  // OS file system.
    57  func newFS() fileSystem {
    58  	return &realFS{}
    59  }
    60  
    61  func defaultRetentionPolicy() retentionPolicy {
    62  	return retentionPolicy{
    63  		MinFreeDisk:                phlaredb.DefaultMinFreeDisk,
    64  		MinDiskAvailablePercentage: phlaredb.DefaultMinDiskAvailablePercentage,
    65  		EnforcementInterval:        phlaredb.DefaultRetentionPolicyEnforcementInterval,
    66  		Expiry:                     phlaredb.DefaultRetentionExpiry,
    67  	}
    68  }
    69  
// retentionPolicy describes when and how aggressively the disk cleaner
// removes blocks from local disk.
type retentionPolicy struct {
	// MinFreeDisk is the minimum free disk space, in GiB (converted to
	// bytes in newDiskCleaner before being handed to the volume checker).
	MinFreeDisk uint64
	// MinDiskAvailablePercentage is the minimum fraction of the volume
	// that must remain available.
	MinDiskAvailablePercentage float64
	// EnforcementInterval is how often the cleanup loop runs.
	EnforcementInterval time.Duration
	// Expiry is the minimum age a block must reach (per its ULID
	// timestamp) before it becomes eligible for deletion.
	Expiry time.Duration
}
    76  
// diskCleaner monitors disk usage and cleans unused data.
type diskCleaner struct {
	services.Service

	logger        log.Logger
	config        phlaredb.Config        // provides DataPath, the root of on-disk block storage
	policy        retentionPolicy        // thresholds and cadence for cleanup
	blockManager  fsBlockManager         // enumerates and deletes blocks on disk
	volumeChecker diskutil.VolumeChecker // reports disk utilization for DataPath

	stop chan struct{}  // closed by stopping() to terminate the running loop
	wg   sync.WaitGroup // lets stopping() wait for the running loop to exit
}
    90  
    91  func (dc *diskCleaner) running(ctx context.Context) error {
    92  	dc.wg.Add(1)
    93  	ticker := time.NewTicker(dc.policy.EnforcementInterval)
    94  	defer func() {
    95  		ticker.Stop()
    96  		dc.wg.Done()
    97  	}()
    98  
    99  	var deleted int
   100  	var bytesDeleted int
   101  	var hasHighDiskUtilization bool
   102  	for {
   103  		deleted = dc.DeleteUploadedBlocks(ctx)
   104  		level.Debug(dc.logger).Log("msg", "cleaned uploaded blocks", "count", deleted)
   105  
   106  		deleted, bytesDeleted, hasHighDiskUtilization = dc.CleanupBlocksWhenHighDiskUtilization(ctx)
   107  		if hasHighDiskUtilization {
   108  			level.Debug(dc.logger).Log(
   109  				"msg", "cleaned files after high disk utilization",
   110  				"deleted_blocks", deleted,
   111  				"deleted_bytes", bytesDeleted,
   112  			)
   113  		}
   114  
   115  		select {
   116  		case <-ticker.C:
   117  		case <-ctx.Done():
   118  			return nil
   119  		case <-dc.stop:
   120  			return nil
   121  		}
   122  	}
   123  }
   124  
// stopping signals the running loop to exit via the stop channel and waits
// for it to finish before returning.
func (dc *diskCleaner) stopping(_ error) error {
	close(dc.stop)
	dc.wg.Wait()
	return nil
}
   130  
   131  // DeleteUploadedBlocks scans and deletes blocks on all tenants that have
   132  // already been uploaded. It returns the number of blocks deleted.
   133  func (dc *diskCleaner) DeleteUploadedBlocks(ctx context.Context) int {
   134  	tenantIDs, err := dc.blockManager.GetTenantIDs(ctx)
   135  	if err != nil {
   136  		level.Error(dc.logger).Log(
   137  			"msg", "failed to delete uploaded blocks, could not read tenant ids",
   138  			"err", err,
   139  		)
   140  		return 0
   141  	}
   142  
   143  	var deleted int
   144  	for _, tenantID := range tenantIDs {
   145  		blocks, err := dc.blockManager.GetBlocksForTenant(ctx, tenantID)
   146  		if err != nil {
   147  			level.Error(dc.logger).Log(
   148  				"msg", "failed to delete uploaded blocks, could not get blocks for tenant",
   149  				"err", err,
   150  				"tenantID", tenantID,
   151  			)
   152  			continue
   153  		}
   154  
   155  		for _, block := range blocks {
   156  			if !block.Uploaded || !dc.isExpired(block) {
   157  				continue
   158  			}
   159  
   160  			err = dc.blockManager.DeleteBlock(ctx, block)
   161  			switch {
   162  			case os.IsNotExist(err):
   163  				level.Warn(dc.logger).Log(
   164  					"msg", "failed to delete uploaded block, does not exist",
   165  					"err", err,
   166  					"path", block.Path,
   167  				)
   168  			case err != nil:
   169  				level.Error(dc.logger).Log(
   170  					"msg", "failed to delete uploaded block",
   171  					"err", err,
   172  					"path", block.Path,
   173  				)
   174  			default:
   175  				deleted++
   176  			}
   177  		}
   178  	}
   179  	return deleted
   180  }
   181  
// CleanupBlocksWhenHighDiskUtilization will run more aggressive disk cleaning
// if high disk utilization is detected by deleting blocks that have been
// uploaded but may not necessarily have been synced with the store gateway. It
// returns true if high disk utilization was detected, along with the number of
// files deleted and the estimated bytes recovered. If no high disk utilization
// was detected, false is returned.
//
// Blocks are deleted uploaded-first, then oldest-first, and only blocks past
// the retention expiry are considered. Deletion stops as soon as utilization
// drops below the threshold, or when deleting appears to have no effect on
// the reported stats.
func (dc *diskCleaner) CleanupBlocksWhenHighDiskUtilization(ctx context.Context) (int, int, bool) {
	volumeStats, err := dc.volumeChecker.HasHighDiskUtilization(dc.config.DataPath)
	if err != nil {
		level.Error(dc.logger).Log(
			"msg", "failed run high disk cleanup, failed to check disk utilization",
			"err", err,
		)
		return 0, 0, false
	}

	// Not in high disk utilization, nothing to do.
	if !volumeStats.HighDiskUtilization {
		return 0, 0, false
	}
	// Remember the starting headroom so the total bytes recovered can be
	// estimated from the final volume stats on every return path below.
	originalBytesAvailable := volumeStats.BytesAvailable

	tenantIDs, err := dc.blockManager.GetTenantIDs(ctx)
	if err != nil {
		level.Error(dc.logger).Log(
			"msg", "failed run high disk cleanup, could not read tenant ids",
			"err", err,
		)
		return 0, 0, true
	}

	// Collect blocks across all tenants so the deletion order can be decided
	// globally rather than per tenant.
	blocks := make([]*tenantBlock, 0)
	for _, tenantID := range tenantIDs {
		tenantBlocks, err := dc.blockManager.GetBlocksForTenant(ctx, tenantID)
		if err != nil {
			level.Error(dc.logger).Log(
				"msg", "failed to get blocks for tenant",
				"tenantID", tenantID,
				"err", err,
			)

			// Keep trying to read blocks from other tenants.
			continue
		}

		blocks = append(blocks, tenantBlocks...)
	}

	// Sort by uploaded, then age (oldest first).
	sort.Sort(blocksByUploadAndAge(blocks))

	prevVolumeStats := &diskutil.VolumeStats{}
	filesDeleted := 0
	for _, block := range blocks {
		// Even under disk pressure, never delete blocks younger than the
		// retention expiry.
		if !dc.isExpired(block) {
			continue
		}

		// Delete a block.
		err = dc.blockManager.DeleteBlock(ctx, block)
		switch {
		case os.IsNotExist(err):
			// An unexpected disappearance aborts the pass; report what was
			// freed so far.
			level.Warn(dc.logger).Log(
				"msg", "failed to delete block, does not exist",
				"err", err,
				"path", block.Path,
			)
			return filesDeleted, int(volumeStats.BytesAvailable - originalBytesAvailable), true
		case err != nil:
			level.Error(dc.logger).Log(
				"msg", "failed run high disk cleanup, could not delete block",
				"path", block.Path,
				"err", err,
			)
			return filesDeleted, int(volumeStats.BytesAvailable - originalBytesAvailable), true
		default:
			filesDeleted++
		}

		// Recheck volume stats.
		prevVolumeStats = volumeStats
		volumeStats, err = dc.volumeChecker.HasHighDiskUtilization(dc.config.DataPath)
		if err != nil {
			level.Error(dc.logger).Log(
				"msg", "failed to check disk utilization",
				"err", err,
			)
			break
		}

		if !volumeStats.HighDiskUtilization {
			// No longer in high disk utilization.
			break
		}

		if prevVolumeStats.BytesAvailable >= volumeStats.BytesAvailable {
			// Disk utilization has not been lowered since the last block was
			// deleted. There may be a delay in VolumeChecker reporting disk
			// utilization. In an effort to be conservative when deleting
			// blocks, stop the clean up now and wait for the next cycle to let
			// VolumeChecker catch up on the current state of the disk.
			level.Warn(dc.logger).Log("msg", "disk utilization is not lowered by deletion of a block, pausing until next cycle")
			break
		}
	}

	return filesDeleted, int(volumeStats.BytesAvailable - originalBytesAvailable), true
}
   290  
   291  // isBlockDeletable returns true if this block can be deleted.
   292  func (dc *diskCleaner) isExpired(block *tenantBlock) bool {
   293  	// TODO(kolesnikovae):
   294  	//  Expiry defaults to -querier.query-store-after which should be deprecated,
   295  	//  blocks-storage.bucket-store.ignore-blocks-within can be used instead.
   296  	expiryTs := time.Now().Add(-dc.policy.Expiry)
   297  	return ulid.Time(block.ID.Time()).Before(expiryTs)
   298  }
   299  
   300  // blocksByUploadAndAge implements sorting tenantBlock by uploaded then by age
   301  // in ascending order.
   302  type blocksByUploadAndAge []*tenantBlock
   303  
   304  func (b blocksByUploadAndAge) Len() int      { return len(b) }
   305  func (b blocksByUploadAndAge) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
   306  func (b blocksByUploadAndAge) Less(i, j int) bool {
   307  	switch {
   308  	case b[i].Uploaded == b[j].Uploaded:
   309  		return b[i].ID.Compare(b[j].ID) < 0
   310  	case b[i].Uploaded:
   311  		return !b[j].Uploaded
   312  	case b[j].Uploaded:
   313  		fallthrough
   314  	default:
   315  		return b[i].Uploaded
   316  	}
   317  }
   318  
// blockEvictor unloads blocks from tenant instance.
type blockEvictor interface {
	// evictBlock evicts the block by its ID from the memory and
	// invokes fn callback, regardless of if the tenant is found.
	// The call is thread-safe: tenant can't be added or removed
	// during the execution.
	//
	// NOTE(review): DeleteBlock relies on fn's error (e.g. a not-exist
	// error from RemoveAll) being propagated back unchanged — confirm
	// with the implementation.
	evictBlock(tenant string, b ulid.ULID, fn func() error) error
}
   327  
// fileSystem is the file-system abstraction required by the block manager:
// read access via fs.ReadDirFS plus the ability to remove paths.
type fileSystem interface {
	fs.ReadDirFS
	// RemoveAll removes the named path and any children it contains.
	RemoveAll(name string) error
}
   332  
// realFS implements fileSystem by delegating directly to the os package.
type realFS struct{}

func (*realFS) Open(name string) (fs.File, error)          { return os.Open(name) }
func (*realFS) ReadDir(name string) ([]fs.DirEntry, error) { return os.ReadDir(name) }
func (*realFS) RemoveAll(path string) error                { return os.RemoveAll(path) }
   338  
// tenantBlock describes a single on-disk block belonging to a tenant.
type tenantBlock struct {
	ID       ulid.ULID // block ULID; its timestamp encodes the block creation time
	TenantID string    // owning tenant
	Path     string    // block directory path (<root>/<tenant>/local/<ulid>)
	Uploaded bool      // true if the shipper meta file lists this block as uploaded
}

// String returns the block's ULID in string form.
func (t *tenantBlock) String() string {
	return t.ID.String()
}
   349  
// fsBlockManager abstracts discovery and deletion of tenant blocks stored on
// a file system.
type fsBlockManager interface {
	// GetTenantIDs returns the IDs of all tenant directories found under
	// the root.
	GetTenantIDs(ctx context.Context) ([]string, error)
	// GetBlocksForTenant returns all valid blocks found for the tenant.
	GetBlocksForTenant(ctx context.Context, tenantID string) ([]*tenantBlock, error)
	// DeleteBlock evicts the block from memory and removes it from disk.
	DeleteBlock(ctx context.Context, block *tenantBlock) error
}
   355  
// realFSBlockManager is the production fsBlockManager implementation, backed
// by a fileSystem rooted at Root.
type realFSBlockManager struct {
	Root    string       // data path containing one directory per tenant
	Evictor blockEvictor // unloads a block from memory before it is deleted
	FS      fileSystem   // file-system abstraction (realFS in production)
}
   361  
   362  func (bm *realFSBlockManager) getUploadedBlockIds(tenantID string) (map[ulid.ULID]struct{}, error) {
   363  	localDirPath := filepath.Join(bm.Root, tenantID, phlareDBLocalPath)
   364  
   365  	shipperPath := filepath.Join(localDirPath, shipper.MetaFilename)
   366  	bytes, err := fs.ReadFile(bm.FS, shipperPath)
   367  	if err != nil {
   368  		if os.IsNotExist(err) {
   369  			return make(map[ulid.ULID]struct{}), nil
   370  		}
   371  		return nil, err
   372  	}
   373  
   374  	var meta shipper.Meta
   375  	err = json.Unmarshal(bytes, &meta)
   376  	if err != nil {
   377  		return nil, err
   378  	}
   379  
   380  	uploadedBlockIDs := make(map[ulid.ULID]struct{}, len(meta.Uploaded))
   381  	for _, id := range meta.Uploaded {
   382  		uploadedBlockIDs[id] = struct{}{}
   383  	}
   384  
   385  	return uploadedBlockIDs, nil
   386  }
   387  
   388  func (bm *realFSBlockManager) GetTenantIDs(ctx context.Context) ([]string, error) {
   389  	if ctx.Err() != nil {
   390  		return nil, ctx.Err()
   391  	}
   392  
   393  	dirs, err := fs.ReadDir(bm.FS, bm.Root)
   394  	if err != nil {
   395  		return nil, err
   396  	}
   397  
   398  	tenantIDs := make([]string, 0)
   399  	for _, dir := range dirs {
   400  		if !bm.isTenantDir(bm.Root, dir) {
   401  			continue
   402  		}
   403  
   404  		tenantIDs = append(tenantIDs, dir.Name())
   405  	}
   406  	return tenantIDs, nil
   407  }
   408  
   409  func (bm *realFSBlockManager) GetBlocksForTenant(ctx context.Context, tenantID string) ([]*tenantBlock, error) {
   410  	if ctx.Err() != nil {
   411  		return nil, ctx.Err()
   412  	}
   413  
   414  	localDirPath := filepath.Join(bm.Root, tenantID, phlareDBLocalPath)
   415  	blockDirs, err := fs.ReadDir(bm.FS, localDirPath)
   416  	if err != nil {
   417  		return nil, err
   418  	}
   419  
   420  	uploadedBlockIDs, err := bm.getUploadedBlockIds(tenantID)
   421  	if err != nil {
   422  		return nil, err
   423  	}
   424  
   425  	// Read blocks.
   426  	blocks := make([]*tenantBlock, 0)
   427  	for _, blockDir := range blockDirs {
   428  		if !blockDir.IsDir() {
   429  			continue
   430  		}
   431  
   432  		path := filepath.Join(localDirPath, blockDir.Name())
   433  		blockID, ok := block.IsBlockDir(path)
   434  		if !ok {
   435  			// A malformed/invalid ULID likely means that the directory is not a
   436  			// valid block, ignoring.
   437  			continue
   438  		}
   439  
   440  		_, uploaded := uploadedBlockIDs[blockID]
   441  		blocks = append(blocks, &tenantBlock{
   442  			ID:       blockID,
   443  			TenantID: tenantID,
   444  			Path:     path,
   445  			Uploaded: uploaded,
   446  		})
   447  	}
   448  	return blocks, nil
   449  }
   450  
   451  func (bm *realFSBlockManager) DeleteBlock(ctx context.Context, block *tenantBlock) error {
   452  	if ctx.Err() != nil {
   453  		return ctx.Err()
   454  	}
   455  
   456  	return bm.Evictor.evictBlock(block.TenantID, block.ID, func() error {
   457  		err := bm.FS.RemoveAll(block.Path)
   458  		switch {
   459  		case os.IsNotExist(err):
   460  			return err
   461  		case err != nil:
   462  			return fmt.Errorf("failed to delete block: %q: %w", block.Path, err)
   463  		}
   464  		return nil
   465  	})
   466  }
   467  
   468  // isTenantDir checks if a directory is a tenant directory.
   469  func (bm *realFSBlockManager) isTenantDir(path string, entry fs.DirEntry) bool {
   470  	if !entry.IsDir() {
   471  		return false
   472  	}
   473  
   474  	subEntries, err := bm.FS.ReadDir(filepath.Join(path, entry.Name()))
   475  	if err != nil {
   476  		return false
   477  	}
   478  
   479  	foundLocalDir := false
   480  	for _, subEntry := range subEntries {
   481  		if !subEntry.IsDir() {
   482  			continue
   483  		}
   484  
   485  		if subEntry.Name() == phlareDBLocalPath {
   486  			foundLocalDir = true
   487  			break
   488  		}
   489  	}
   490  	return foundLocalDir
   491  }