github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/pkg/graveler/retention/garbage_collection_manager.go (about) 1 package retention 2 3 import ( 4 "bytes" 5 "context" 6 "encoding/csv" 7 "errors" 8 "fmt" 9 "io" 10 "os" 11 "strings" 12 "time" 13 14 "github.com/rs/xid" 15 "github.com/treeverse/lakefs/pkg/block" 16 "github.com/treeverse/lakefs/pkg/graveler" 17 "google.golang.org/protobuf/proto" 18 ) 19 20 const ( 21 configFileSuffixTemplate = "%s/retention/gc/rules/config.json" 22 addressesFilePrefixTemplate = "%s/retention/gc/addresses/" 23 commitsFileSuffixTemplate = "%s/retention/gc/commits/run_id=%s/commits.csv" 24 uncommittedPrefixTemplate = "%s/retention/gc/uncommitted/" 25 uncommittedFilePrefixTemplate = uncommittedPrefixTemplate + "%s/uncommitted/" 26 27 // unixYear4000 epoch value for Saturday, January 1, 4000 12:00:00 AM. Changing this value is a breaking change as it is used to have reverse order for time based unique ID (xid). 28 unixYear4000 = 64060588800 29 ) 30 31 type GarbageCollectionManager struct { 32 blockAdapter block.Adapter 33 refManager graveler.RefManager 34 committedBlockStoragePrefix string 35 } 36 37 func (m *GarbageCollectionManager) GetCommitsCSVLocation(runID string, sn graveler.StorageNamespace) (string, error) { 38 key := fmt.Sprintf(commitsFileSuffixTemplate, m.committedBlockStoragePrefix, runID) 39 qk, err := m.blockAdapter.ResolveNamespace(sn.String(), key, block.IdentifierTypeRelative) 40 if err != nil { 41 return "", err 42 } 43 return qk.Format(), nil 44 } 45 46 func (m *GarbageCollectionManager) GetAddressesLocation(sn graveler.StorageNamespace) (string, error) { 47 key := fmt.Sprintf(addressesFilePrefixTemplate, m.committedBlockStoragePrefix) 48 qk, err := m.blockAdapter.ResolveNamespace(sn.String(), key, block.IdentifierTypeRelative) 49 if err != nil { 50 return "", err 51 } 52 return qk.Format(), nil 53 } 54 55 // GetUncommittedLocation return full path to underlying storage path to store uncommitted information 56 func (m *GarbageCollectionManager) GetUncommittedLocation(runID string, sn graveler.StorageNamespace) (string, error) { 57 key := fmt.Sprintf(uncommittedFilePrefixTemplate, m.committedBlockStoragePrefix, runID) 58 qk, err := m.blockAdapter.ResolveNamespace(sn.String(), key, block.IdentifierTypeRelative) 59 if err != nil { 60 return "", err 61 } 62 return qk.Format(), nil 63 } 64 65 func (m *GarbageCollectionManager) SaveGarbageCollectionUncommitted(ctx context.Context, repository *graveler.RepositoryRecord, filename, runID string) error { 66 location, err := m.GetUncommittedLocation(runID, repository.StorageNamespace) 67 if err != nil { 68 return err 69 } 70 71 fd, err := os.Open(filename) 72 if err != nil { 73 return err 74 } 75 stat, err := fd.Stat() 76 if err != nil { 77 return err 78 } 79 80 if !strings.HasSuffix(location, "/") { 81 location += "/" 82 } 83 location += filename 84 return m.blockAdapter.Put(ctx, block.ObjectPointer{ 85 Identifier: location, 86 IdentifierType: block.IdentifierTypeFull, 87 }, stat.Size(), fd, block.PutOpts{}) 88 } 89 90 type RepositoryCommitGetter struct { 91 refManager graveler.RefManager 92 repository *graveler.RepositoryRecord 93 } 94 95 func (r *RepositoryCommitGetter) ListCommits(ctx context.Context) (graveler.CommitIterator, error) { 96 return r.refManager.ListCommits(ctx, r.repository) 97 } 98 99 func NewGarbageCollectionManager(blockAdapter block.Adapter, refManager graveler.RefManager, committedBlockStoragePrefix string) *GarbageCollectionManager { 100 return &GarbageCollectionManager{ 101 blockAdapter: blockAdapter, 102 refManager: refManager, 103 committedBlockStoragePrefix: committedBlockStoragePrefix, 104 } 105 } 106 107 func (m *GarbageCollectionManager) GetRules(ctx context.Context, storageNamespace graveler.StorageNamespace) (*graveler.GarbageCollectionRules, error) { 108 objectPointer := block.ObjectPointer{ 109 StorageNamespace: string(storageNamespace), 110 Identifier: fmt.Sprintf(configFileSuffixTemplate, m.committedBlockStoragePrefix), 111 IdentifierType: block.IdentifierTypeRelative, 112 } 113 reader, err := m.blockAdapter.Get(ctx, objectPointer, -1) 114 if errors.Is(err, block.ErrDataNotFound) { 115 return nil, graveler.ErrNotFound 116 } 117 if err != nil { 118 return nil, err 119 } 120 defer func() { 121 _ = reader.Close() 122 }() 123 var rules graveler.GarbageCollectionRules 124 rulesBytes, err := io.ReadAll(reader) 125 if err != nil { 126 return nil, err 127 } 128 if len(rulesBytes) == 0 { 129 // empty file - no GC rules 130 return nil, graveler.ErrNotFound 131 } 132 err = proto.Unmarshal(rulesBytes, &rules) 133 if err != nil { 134 return nil, err 135 } 136 return &rules, nil 137 } 138 139 func (m *GarbageCollectionManager) SaveRules(ctx context.Context, storageNamespace graveler.StorageNamespace, rules *graveler.GarbageCollectionRules) error { 140 rulesBytes, err := proto.Marshal(rules) 141 if err != nil { 142 return err 143 } 144 return m.blockAdapter.Put(ctx, block.ObjectPointer{ 145 StorageNamespace: string(storageNamespace), 146 Identifier: fmt.Sprintf(configFileSuffixTemplate, m.committedBlockStoragePrefix), 147 IdentifierType: block.IdentifierTypeRelative, 148 }, int64(len(rulesBytes)), bytes.NewReader(rulesBytes), block.PutOpts{}) 149 } 150 151 func (m *GarbageCollectionManager) SaveGarbageCollectionCommits(ctx context.Context, repository *graveler.RepositoryRecord, rules *graveler.GarbageCollectionRules) (string, error) { 152 commitGetter := &RepositoryCommitGetter{ 153 refManager: m.refManager, 154 repository: repository, 155 } 156 branchIterator, err := m.refManager.GCBranchIterator(ctx, repository) 157 if err != nil { 158 return "", err 159 } 160 defer branchIterator.Close() 161 // get all commits that are not the first parent of any commit: 162 commitIterator, err := m.refManager.GCCommitIterator(ctx, repository) 163 if err != nil { 164 return "", fmt.Errorf("create kv orderd commit iterator commits: %w", err) 165 } 166 defer commitIterator.Close() 167 startingPointIterator := NewGCStartingPointIterator(commitIterator, branchIterator) 168 defer startingPointIterator.Close() 169 gcCommits, err := GetGarbageCollectionCommits(ctx, startingPointIterator, commitGetter, rules) 170 if err != nil { 171 return "", fmt.Errorf("find expired commits: %w", err) 172 } 173 b := &strings.Builder{} 174 csvWriter := csv.NewWriter(b) 175 // (TODO) - remove expired column from the CSV file and from the GC logic 176 headers := []string{"commit_id", "expired", "metarange_id"} 177 if err = csvWriter.Write(headers); err != nil { 178 return "", err 179 } 180 for commitID, metarangeID := range gcCommits { 181 err := csvWriter.Write([]string{string(commitID), "false", string(metarangeID)}) 182 if err != nil { 183 return "", err 184 } 185 } 186 csvWriter.Flush() 187 err = csvWriter.Error() 188 if err != nil { 189 return "", err 190 } 191 commitsStr := b.String() 192 runID := m.NewID() 193 csvLocation, err := m.GetCommitsCSVLocation(runID, repository.StorageNamespace) 194 if err != nil { 195 return "", err 196 } 197 err = m.blockAdapter.Put(ctx, block.ObjectPointer{ 198 Identifier: csvLocation, 199 IdentifierType: block.IdentifierTypeFull, 200 }, int64(len(commitsStr)), strings.NewReader(commitsStr), block.PutOpts{}) 201 if err != nil { 202 return "", err 203 } 204 return runID, nil 205 } 206 207 func (m *GarbageCollectionManager) NewID() string { 208 return newDescendingID(time.Now()).String() 209 } 210 211 // TODO: Unify implementations of descending IDs 212 func newDescendingID(tm time.Time) xid.ID { 213 t := time.Unix(unixYear4000-tm.Unix(), 0).UTC() 214 return xid.NewWithTime(t) 215 }