github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/pkg/graveler/retention/garbage_collection_manager.go (about)

     1  package retention
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"encoding/csv"
     7  	"errors"
     8  	"fmt"
     9  	"io"
    10  	"os"
    11  	"strings"
    12  	"time"
    13  
    14  	"github.com/rs/xid"
    15  	"github.com/treeverse/lakefs/pkg/block"
    16  	"github.com/treeverse/lakefs/pkg/graveler"
    17  	"google.golang.org/protobuf/proto"
    18  )
    19  
    20  const (
    21  	configFileSuffixTemplate      = "%s/retention/gc/rules/config.json"
    22  	addressesFilePrefixTemplate   = "%s/retention/gc/addresses/"
    23  	commitsFileSuffixTemplate     = "%s/retention/gc/commits/run_id=%s/commits.csv"
    24  	uncommittedPrefixTemplate     = "%s/retention/gc/uncommitted/"
    25  	uncommittedFilePrefixTemplate = uncommittedPrefixTemplate + "%s/uncommitted/"
    26  
    27  	// unixYear4000 epoch value for Saturday, January 1, 4000 12:00:00 AM. Changing this value is a breaking change as it is used to have reverse order for time based unique ID (xid).
    28  	unixYear4000 = 64060588800
    29  )
    30  
// GarbageCollectionManager stores and retrieves garbage collection data:
// the GC rules, the per-run commits CSV and the per-run uncommitted files.
//
// blockAdapter is used to resolve storage locations and to get/put the GC
// objects. refManager supplies the branch and commit iterators used when
// computing the commits to collect. committedBlockStoragePrefix is the leading
// path component of every GC object key built by this manager.
type GarbageCollectionManager struct {
	blockAdapter                block.Adapter
	refManager                  graveler.RefManager
	committedBlockStoragePrefix string
}
    36  
    37  func (m *GarbageCollectionManager) GetCommitsCSVLocation(runID string, sn graveler.StorageNamespace) (string, error) {
    38  	key := fmt.Sprintf(commitsFileSuffixTemplate, m.committedBlockStoragePrefix, runID)
    39  	qk, err := m.blockAdapter.ResolveNamespace(sn.String(), key, block.IdentifierTypeRelative)
    40  	if err != nil {
    41  		return "", err
    42  	}
    43  	return qk.Format(), nil
    44  }
    45  
    46  func (m *GarbageCollectionManager) GetAddressesLocation(sn graveler.StorageNamespace) (string, error) {
    47  	key := fmt.Sprintf(addressesFilePrefixTemplate, m.committedBlockStoragePrefix)
    48  	qk, err := m.blockAdapter.ResolveNamespace(sn.String(), key, block.IdentifierTypeRelative)
    49  	if err != nil {
    50  		return "", err
    51  	}
    52  	return qk.Format(), nil
    53  }
    54  
    55  // GetUncommittedLocation return full path to underlying storage path to store uncommitted information
    56  func (m *GarbageCollectionManager) GetUncommittedLocation(runID string, sn graveler.StorageNamespace) (string, error) {
    57  	key := fmt.Sprintf(uncommittedFilePrefixTemplate, m.committedBlockStoragePrefix, runID)
    58  	qk, err := m.blockAdapter.ResolveNamespace(sn.String(), key, block.IdentifierTypeRelative)
    59  	if err != nil {
    60  		return "", err
    61  	}
    62  	return qk.Format(), nil
    63  }
    64  
    65  func (m *GarbageCollectionManager) SaveGarbageCollectionUncommitted(ctx context.Context, repository *graveler.RepositoryRecord, filename, runID string) error {
    66  	location, err := m.GetUncommittedLocation(runID, repository.StorageNamespace)
    67  	if err != nil {
    68  		return err
    69  	}
    70  
    71  	fd, err := os.Open(filename)
    72  	if err != nil {
    73  		return err
    74  	}
    75  	stat, err := fd.Stat()
    76  	if err != nil {
    77  		return err
    78  	}
    79  
    80  	if !strings.HasSuffix(location, "/") {
    81  		location += "/"
    82  	}
    83  	location += filename
    84  	return m.blockAdapter.Put(ctx, block.ObjectPointer{
    85  		Identifier:     location,
    86  		IdentifierType: block.IdentifierTypeFull,
    87  	}, stat.Size(), fd, block.PutOpts{})
    88  }
    89  
// RepositoryCommitGetter lists the commits of a single repository.
//
// refManager performs the listing and repository is the record passed to it
// on every call.
type RepositoryCommitGetter struct {
	refManager graveler.RefManager
	repository *graveler.RepositoryRecord
}
    94  
// ListCommits returns the commit iterator that the ref manager produces for
// the getter's repository, together with any error it reports.
func (r *RepositoryCommitGetter) ListCommits(ctx context.Context) (graveler.CommitIterator, error) {
	return r.refManager.ListCommits(ctx, r.repository)
}
    98  
    99  func NewGarbageCollectionManager(blockAdapter block.Adapter, refManager graveler.RefManager, committedBlockStoragePrefix string) *GarbageCollectionManager {
   100  	return &GarbageCollectionManager{
   101  		blockAdapter:                blockAdapter,
   102  		refManager:                  refManager,
   103  		committedBlockStoragePrefix: committedBlockStoragePrefix,
   104  	}
   105  }
   106  
   107  func (m *GarbageCollectionManager) GetRules(ctx context.Context, storageNamespace graveler.StorageNamespace) (*graveler.GarbageCollectionRules, error) {
   108  	objectPointer := block.ObjectPointer{
   109  		StorageNamespace: string(storageNamespace),
   110  		Identifier:       fmt.Sprintf(configFileSuffixTemplate, m.committedBlockStoragePrefix),
   111  		IdentifierType:   block.IdentifierTypeRelative,
   112  	}
   113  	reader, err := m.blockAdapter.Get(ctx, objectPointer, -1)
   114  	if errors.Is(err, block.ErrDataNotFound) {
   115  		return nil, graveler.ErrNotFound
   116  	}
   117  	if err != nil {
   118  		return nil, err
   119  	}
   120  	defer func() {
   121  		_ = reader.Close()
   122  	}()
   123  	var rules graveler.GarbageCollectionRules
   124  	rulesBytes, err := io.ReadAll(reader)
   125  	if err != nil {
   126  		return nil, err
   127  	}
   128  	if len(rulesBytes) == 0 {
   129  		// empty file - no GC rules
   130  		return nil, graveler.ErrNotFound
   131  	}
   132  	err = proto.Unmarshal(rulesBytes, &rules)
   133  	if err != nil {
   134  		return nil, err
   135  	}
   136  	return &rules, nil
   137  }
   138  
   139  func (m *GarbageCollectionManager) SaveRules(ctx context.Context, storageNamespace graveler.StorageNamespace, rules *graveler.GarbageCollectionRules) error {
   140  	rulesBytes, err := proto.Marshal(rules)
   141  	if err != nil {
   142  		return err
   143  	}
   144  	return m.blockAdapter.Put(ctx, block.ObjectPointer{
   145  		StorageNamespace: string(storageNamespace),
   146  		Identifier:       fmt.Sprintf(configFileSuffixTemplate, m.committedBlockStoragePrefix),
   147  		IdentifierType:   block.IdentifierTypeRelative,
   148  	}, int64(len(rulesBytes)), bytes.NewReader(rulesBytes), block.PutOpts{})
   149  }
   150  
   151  func (m *GarbageCollectionManager) SaveGarbageCollectionCommits(ctx context.Context, repository *graveler.RepositoryRecord, rules *graveler.GarbageCollectionRules) (string, error) {
   152  	commitGetter := &RepositoryCommitGetter{
   153  		refManager: m.refManager,
   154  		repository: repository,
   155  	}
   156  	branchIterator, err := m.refManager.GCBranchIterator(ctx, repository)
   157  	if err != nil {
   158  		return "", err
   159  	}
   160  	defer branchIterator.Close()
   161  	// get all commits that are not the first parent of any commit:
   162  	commitIterator, err := m.refManager.GCCommitIterator(ctx, repository)
   163  	if err != nil {
   164  		return "", fmt.Errorf("create kv orderd commit iterator commits: %w", err)
   165  	}
   166  	defer commitIterator.Close()
   167  	startingPointIterator := NewGCStartingPointIterator(commitIterator, branchIterator)
   168  	defer startingPointIterator.Close()
   169  	gcCommits, err := GetGarbageCollectionCommits(ctx, startingPointIterator, commitGetter, rules)
   170  	if err != nil {
   171  		return "", fmt.Errorf("find expired commits: %w", err)
   172  	}
   173  	b := &strings.Builder{}
   174  	csvWriter := csv.NewWriter(b)
   175  	// (TODO) - remove expired column from the CSV file and from the GC logic
   176  	headers := []string{"commit_id", "expired", "metarange_id"}
   177  	if err = csvWriter.Write(headers); err != nil {
   178  		return "", err
   179  	}
   180  	for commitID, metarangeID := range gcCommits {
   181  		err := csvWriter.Write([]string{string(commitID), "false", string(metarangeID)})
   182  		if err != nil {
   183  			return "", err
   184  		}
   185  	}
   186  	csvWriter.Flush()
   187  	err = csvWriter.Error()
   188  	if err != nil {
   189  		return "", err
   190  	}
   191  	commitsStr := b.String()
   192  	runID := m.NewID()
   193  	csvLocation, err := m.GetCommitsCSVLocation(runID, repository.StorageNamespace)
   194  	if err != nil {
   195  		return "", err
   196  	}
   197  	err = m.blockAdapter.Put(ctx, block.ObjectPointer{
   198  		Identifier:     csvLocation,
   199  		IdentifierType: block.IdentifierTypeFull,
   200  	}, int64(len(commitsStr)), strings.NewReader(commitsStr), block.PutOpts{})
   201  	if err != nil {
   202  		return "", err
   203  	}
   204  	return runID, nil
   205  }
   206  
   207  func (m *GarbageCollectionManager) NewID() string {
   208  	return newDescendingID(time.Now()).String()
   209  }
   210  
   211  // TODO: Unify implementations of descending IDs
   212  func newDescendingID(tm time.Time) xid.ID {
   213  	t := time.Unix(unixYear4000-tm.Unix(), 0).UTC()
   214  	return xid.NewWithTime(t)
   215  }