github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/pkg/graveler/retention/active_commits.go (about) 1 package retention 2 3 import ( 4 "context" 5 "errors" 6 "fmt" 7 "time" 8 9 "github.com/treeverse/lakefs/pkg/graveler" 10 ) 11 12 type CommitNode struct { 13 CreationDate time.Time 14 MainParent graveler.CommitID 15 MetaRangeID graveler.MetaRangeID 16 } 17 18 func NewCommitNode(creationDate time.Time, mainParent graveler.CommitID, metaRangeID graveler.MetaRangeID) CommitNode { 19 return CommitNode{ 20 CreationDate: creationDate, 21 MainParent: mainParent, 22 MetaRangeID: metaRangeID, 23 } 24 } 25 26 var ErrCommitNotFound = errors.New("commit not found") 27 28 // GetGarbageCollectionCommits returns the sets of active commits, according to the repository's garbage collection rules. 29 // See https://github.com/treeverse/lakeFS/issues/1932 for more details. 30 // Upon completion, the given startingPointIterator is closed. 31 func GetGarbageCollectionCommits(ctx context.Context, startingPointIterator *GCStartingPointIterator, commitGetter *RepositoryCommitGetter, rules *graveler.GarbageCollectionRules) (map[graveler.CommitID]graveler.MetaRangeID, error) { 32 // From each starting point in the given startingPointIterator, it iterates through its main ancestry. 33 // All commits reached are added to the active set, until and including the first commit performed before the start of the retention period. 34 processed := make(map[graveler.CommitID]time.Time) 35 activeMap := make(map[graveler.CommitID]struct{}) 36 37 commitsIterator, err := commitGetter.ListCommits(ctx) 38 if err != nil { 39 return nil, err 40 } 41 commitsMap := make(map[graveler.CommitID]CommitNode) 42 defer commitsIterator.Close() 43 for commitsIterator.Next() { 44 commitRecord := commitsIterator.Value() 45 var mainParent graveler.CommitID 46 if len(commitRecord.Commit.Parents) > 0 { 47 // every branch retains only its main ancestry, acquired by recursively taking the first parent: 48 mainParent = commitRecord.Commit.Parents[0] 49 if commitRecord.Commit.Version < graveler.CommitVersionParentSwitch { 50 mainParent = commitRecord.Commit.Parents[len(commitRecord.Commit.Parents)-1] 51 } 52 } 53 commitsMap[commitRecord.CommitID] = NewCommitNode(commitRecord.Commit.CreationDate, mainParent, commitRecord.MetaRangeID) 54 } 55 56 now := time.Now() 57 defer startingPointIterator.Close() 58 for startingPointIterator.Next() { 59 startingPoint := startingPointIterator.Value() 60 retentionDays := int(rules.DefaultRetentionDays) 61 commitNode, ok := commitsMap[startingPoint.CommitID] 62 if !ok { 63 return nil, fmt.Errorf("%w: %s", ErrCommitNotFound, startingPoint.CommitID) 64 } 65 if startingPoint.BranchID == "" { 66 // If the current commit is NOT a branch HEAD (a dangling commit) - add a hypothetical HEAD as its child 67 commitNode = CommitNode{ 68 CreationDate: commitNode.CreationDate, 69 MainParent: startingPoint.CommitID, 70 } 71 } else { 72 // If the current commit IS a branch HEAD - fetch and retention rules for this branch and... 73 var branchRetentionDays int32 74 if branchRetentionDays, ok = rules.BranchRetentionDays[string(startingPoint.BranchID)]; ok { 75 retentionDays = int(branchRetentionDays) 76 } 77 activeMap[startingPoint.CommitID] = struct{}{} 78 } 79 // Calculate the expiration time for the current commit 80 branchExpirationThreshold := now.AddDate(0, 0, -retentionDays) 81 if startingPoint.BranchID != "" { 82 // If the current commit IS a branch's HEAD, add it to the `processed` with the calculated expiration threshold. 83 // (it will be optionally examined later on by different commit paths to get the longest expiration threshold for a given commit) 84 processed[startingPoint.CommitID] = branchExpirationThreshold 85 } 86 // Start traversing the commit's ancestors (path): 87 for commitNode.MainParent != "" { 88 nextCommitID := commitNode.MainParent 89 var previousThreshold time.Time 90 if previousThreshold, ok = processed[nextCommitID]; ok && !previousThreshold.After(branchExpirationThreshold) { 91 // If the parent commit was already processed and its threshold was longer than the current threshold, 92 // i.e. the current threshold doesn't hold for it, stop processing it because the other path decision 93 // wins 94 break 95 } 96 if commitNode.CreationDate.After(branchExpirationThreshold) { 97 // If the current commit creation time is after the threshold, then its parent is active because the 98 // definition for 'active' is either creation time is after the threshold, or the first beyond 99 // the threshold. In either way, the PARENT is active. 100 activeMap[nextCommitID] = struct{}{} 101 } 102 // Continue down the rabbit hole. 103 commitNode, ok = commitsMap[nextCommitID] 104 if !ok { 105 return nil, fmt.Errorf("%w: %s", ErrCommitNotFound, nextCommitID) 106 } 107 // Set the parent commit ID's expiration threshold as the current (this is true because this one is the 108 // longest, because we wouldn't have gotten here otherwise) 109 processed[nextCommitID] = branchExpirationThreshold 110 } 111 } 112 if startingPointIterator.Err() != nil { 113 return nil, startingPointIterator.Err() 114 } 115 return makeCommitMap(commitsMap, activeMap), nil 116 } 117 118 func makeCommitMap(commitNodes map[graveler.CommitID]CommitNode, commitSet map[graveler.CommitID]struct{}) map[graveler.CommitID]graveler.MetaRangeID { 119 res := make(map[graveler.CommitID]graveler.MetaRangeID) 120 for commitID := range commitSet { 121 res[commitID] = commitNodes[commitID].MetaRangeID 122 } 123 return res 124 }