github.com/atlassian/git-lob@v0.0.0-20150806085256-2386a5ed291a/core/prune.go (about)

     1  package core
     2  
     3  import (
     4  	"bufio"
     5  	"errors"
     6  	"fmt"
     7  	"io"
     8  	"os"
     9  	"os/exec"
    10  	"path/filepath"
    11  	"regexp"
    12  	"time"
    13  
    14  	"github.com/atlassian/git-lob/providers"
    15  	"github.com/atlassian/git-lob/util"
    16  )
    17  
    18  type PruneCallbackType int
    19  
    20  const (
    21  	// Prune is working (for spinner)
    22  	PruneWorking PruneCallbackType = iota
    23  	// Prune is retaining LOB because referenced
    24  	PruneRetainReferenced PruneCallbackType = iota
    25  	// Prune is retaining LOB because commit referencing it is within retention period
    26  	PruneRetainByDate PruneCallbackType = iota
    27  	// Prune is retaining LOB because commit is referencing it is not pushed
    28  	PruneRetainNotPushed PruneCallbackType = iota
    29  	// Prune is deleting LOB (because unreferenced or out of date range & pushed)
    30  	PruneDeleted PruneCallbackType = iota
    31  )
    32  
// PruneCallback is the function prune operations call to report what they are doing.
// t identifies the kind of event and lobsha is the SHA of the LOB concerned; the prune
// functions in this file pass "" as lobsha for PruneWorking progress notifications.
// When in dry run mode the same callbacks are made even if the actual act isn't performed (e.g. deletion)
type PruneCallback func(t PruneCallbackType, lobsha string)
    36  
// Regular expressions shared by the prune functions. Both start out nil and are
// compiled on first use rather than at package initialisation:
//   - diffLOBReferenceRegex is compiled by lobReferenceFromDiffLine and matches an added
//     ('+') "git-lob: <sha>" line in diff output, capturing the SHA
//   - lobFilenameRegex is compiled by getAllLOBSHAsInDir and matches LOB file names of the
//     form "<sha>_meta" or "<sha>_<number>", capturing the SHA
var (
	diffLOBReferenceRegex *regexp.Regexp
	lobFilenameRegex      *regexp.Regexp
)
    41  
    42  // Retrieve the full set of SHAs that currently have files locally (complete or not)
    43  func getAllLocalLOBSHAs() (util.StringSet, error) {
    44  	return getAllLOBSHAsInDir(GetLocalLOBRoot())
    45  }
    46  
    47  // Retrieve the full set of SHAs that currently have files in the shared store (complete or not)
    48  func getAllSharedLOBSHAs() (util.StringSet, error) {
    49  	return getAllLOBSHAsInDir(GetSharedLOBRoot())
    50  }
    51  
    52  func getAllLOBSHAsInDir(lobroot string) (util.StringSet, error) {
    53  
    54  	// os.File.Readdirnames is the most efficient
    55  	// os.File.Readdir retrieves extra info we don't usually need but in case other unexpected files
    56  	// end up in there (e.g. .DS_Store), we use it to identify directories
    57  	// ioutil.ReadDir and filepath.Walk do sorting which is unnecessary & inefficient
    58  
    59  	if lobFilenameRegex == nil {
    60  		lobFilenameRegex = regexp.MustCompile(`^([A-Za-z0-9]{40})_(meta|\d+)$`)
    61  	}
    62  	// Readdir returns in 'directory order' which means we may not get files for same SHA together
    63  	// so use set to find uniques
    64  	ret := util.NewStringSet()
    65  
    66  	// We only need to support a 2-folder structure here & know that all files are at the bottom level
    67  	// We always work on the local LOB folder (either only copy or hard link)
    68  	rootf, err := os.Open(lobroot)
    69  	if err != nil {
    70  		return ret, errors.New(fmt.Sprintf("Unable to open LOB root: %v\n", err))
    71  	}
    72  	defer rootf.Close()
    73  	dir1, err := rootf.Readdir(0)
    74  	if err != nil {
    75  		return ret, errors.New(fmt.Sprintf("Unable to read first level LOB dir: %v\n", err))
    76  	}
    77  	for _, dir1fi := range dir1 {
    78  		if dir1fi.IsDir() {
    79  			dir1path := filepath.Join(lobroot, dir1fi.Name())
    80  			dir1f, err := os.Open(dir1path)
    81  			if err != nil {
    82  				return ret, errors.New(fmt.Sprintf("Unable to open LOB dir: %v\n", err))
    83  			}
    84  			defer dir1f.Close()
    85  			dir2, err := dir1f.Readdir(0)
    86  			if err != nil {
    87  				return ret, errors.New(fmt.Sprintf("Unable to read second level LOB dir: %v\n", err))
    88  			}
    89  			for _, dir2fi := range dir2 {
    90  				if dir2fi.IsDir() {
    91  					dir2path := filepath.Join(dir1path, dir2fi.Name())
    92  					dir2f, err := os.Open(dir2path)
    93  					if err != nil {
    94  						return ret, errors.New(fmt.Sprintf("Unable to open LOB dir: %v\n", err))
    95  					}
    96  					defer dir2f.Close()
    97  					lobnames, err := dir2f.Readdirnames(0)
    98  					if err != nil {
    99  						return ret, errors.New(fmt.Sprintf("Unable to read innermost LOB dir: %v\n", err))
   100  					}
   101  					for _, lobname := range lobnames {
   102  						// Make sure it's really a LOB file
   103  						if match := lobFilenameRegex.FindStringSubmatch(lobname); match != nil {
   104  							// Regex pulls out the SHA
   105  							sha := match[1]
   106  							ret.Add(sha)
   107  						}
   108  					}
   109  
   110  				}
   111  			}
   112  		}
   113  
   114  	}
   115  
   116  	return ret, nil
   117  
   118  }
   119  
   120  // Determine if a line from git diff output is referencing a LOB (returns "" if not)
   121  func lobReferenceFromDiffLine(line string) string {
   122  	// Because this is a diff, it will start with +/-
   123  	// We only care about +, since - is stopping referencing a SHA
   124  	// important when it comes to purging old files
   125  	if diffLOBReferenceRegex == nil {
   126  		diffLOBReferenceRegex = regexp.MustCompile(`^\+git-lob: ([A-Za-z0-9]{40})$`)
   127  	}
   128  
   129  	if match := diffLOBReferenceRegex.FindStringSubmatch(line); match != nil {
   130  		return match[1]
   131  	}
   132  	return ""
   133  }
   134  
   135  // Delete unreferenced binary files from local store
   136  // For a file to be deleted it needs to not be referenced by any (reachable) commit
   137  // Returns a list of SHAs that were deleted (unless dryRun = true)
   138  func PruneUnreferenced(dryRun bool, callback PruneCallback) ([]string, error) {
   139  	// Purging requires full git on the command line, no way around this really
   140  	cmd := exec.Command("git", "log", "--all", "--no-color", "--oneline", "-p", "-G", SHALineRegexStr)
   141  	stdout, err := cmd.StdoutPipe()
   142  	if err != nil {
   143  		return make([]string, 0), errors.New("Unable to query git log for binary references: " + err.Error())
   144  	}
   145  	stderr, err := cmd.StderrPipe()
   146  	if err != nil {
   147  		return make([]string, 0), errors.New("Unable to open pipe: " + err.Error())
   148  	}
   149  	multi := io.MultiReader(stdout, stderr)
   150  	scanner := bufio.NewScanner(multi)
   151  	cmd.Start()
   152  	referencedSHAs := util.NewStringSet()
   153  	for scanner.Scan() {
   154  		callback(PruneWorking, "")
   155  		line := scanner.Text()
   156  		if sha := lobReferenceFromDiffLine(line); sha != "" {
   157  			if referencedSHAs.Add(sha) {
   158  				callback(PruneRetainReferenced, sha)
   159  			}
   160  		}
   161  	}
   162  	cmd.Wait()
   163  
   164  	// Must also not prune anything that's added but uncommitted
   165  	cmd = exec.Command("git", "diff", "--cached", "--no-color", "-G", SHALineRegexStr)
   166  	stdout, err = cmd.StdoutPipe()
   167  	if err != nil {
   168  		return make([]string, 0), errors.New("Unable to query git index for binary references: " + err.Error())
   169  	}
   170  	scanner = bufio.NewScanner(stdout)
   171  	cmd.Start()
   172  	for scanner.Scan() {
   173  		callback(PruneWorking, "")
   174  		line := scanner.Text()
   175  		if sha := lobReferenceFromDiffLine(line); sha != "" {
   176  			if referencedSHAs.Add(sha) {
   177  				callback(PruneRetainReferenced, sha)
   178  			}
   179  		}
   180  	}
   181  	cmd.Wait()
   182  
   183  	fileSHAs, err := getAllLocalLOBSHAs()
   184  	if err == nil {
   185  
   186  		var ret []string
   187  		for sha := range fileSHAs.Iter() {
   188  			callback(PruneWorking, "")
   189  			if !referencedSHAs.Contains(sha) {
   190  				ret = append(ret, string(sha))
   191  				callback(PruneDeleted, sha)
   192  				if !dryRun {
   193  					DeleteLOB(string(sha))
   194  				}
   195  			}
   196  		}
   197  		return ret, nil
   198  	} else {
   199  		return make([]string, 0), errors.New("Unable to get list of binary files: " + err.Error())
   200  	}
   201  
   202  }
   203  
   204  // Remove LOBs from the local store if they fall outside the range we would normally fetch for
   205  // Returns a list of SHAs that were deleted (unless dryRun = true)
   206  // Unreferenced binaries are also deleted by this
   207  func PruneOld(dryRun, safeMode bool, callback PruneCallback) ([]string, error) {
   208  	refSHAsDone := util.NewStringSet()
   209  	// Build a list to keep, then delete all else (includes deleting unreferenced)
   210  	// Can't just look at diffs (just like fetch) since LOB changed 3 years ago but still valid = recent
   211  	retainSet := util.NewStringSet()
   212  
   213  	// Add LOBs to retainSet for this commit and history
   214  	retainLOBs := func(commit string, days int, notPushedScanOnly bool, remoteName string) error {
   215  
   216  		var err error
   217  		var earliestCommit string
   218  		if notPushedScanOnly {
   219  			// We only want to include lobs from this ref if not pushed
   220  			earliestCommit = commit
   221  			// we never have to snapshot the file system because we're only interested in
   222  			// lobs which haven't been pushed. If that's all of them, then we'll eventually
   223  			// find the original addition of the lob in the history anyway
   224  		} else {
   225  			callback(PruneWorking, "")
   226  			// This ref is itself included so perform usual 'all lobs at checkout + n days history' query
   227  			var lobs []string
   228  			lobs, earliestCommit, err = GetGitAllLOBsToCheckoutAtCommitAndRecent(commit, days, []string{}, []string{})
   229  			if err != nil {
   230  				return fmt.Errorf("Error determining recent commits from %v: %v", commit, err.Error())
   231  			}
   232  			for _, l := range lobs {
   233  				if retainSet.Add(l) {
   234  					callback(PruneRetainByDate, l)
   235  				}
   236  			}
   237  		}
   238  
   239  		// earliestCommit is the earliest one which changed (replaced) a binary SHA
   240  		// and therefore the SHA we pulled out of it applied UP TO that point
   241  		// that we've included in the lobs list already
   242  		// If this commit is pushed then we're OK, if not we have to go backwards
   243  		// until we find the one that is.
   244  		// A pushed commit indicates the SHA pulled out of the *following* commit
   245  		// has been pushed:
   246  		//
   247  		// Binary A <-- --> B          B <-- --> C               C <-- --> D
   248  		// ------------|-----------|--------|-------------------------|
   249  		// Commit      1           |        2                         3
   250  		// "Retention"             R
   251  		//
   252  		// Given 3 commits (1/2/3) each changing a binary through states A/B/C/D
   253  		// 1. We retrieve state D through ls-files
   254  		// 2. We retrieve statees B and C through log --since=R, since we pick up
   255  		//    commits 2 and 3 and hence the SHAs for C and then B from the '-' side of the diff
   256  		// 3. 'Earliest commit' is 2
   257  		// 4. We then walk all commits that are at 2 or ancestors which reference LOBs
   258  		//    and are not pushed (this happens forwards from earliest up to & including 2)
   259  		//    This actually picks up the '+' sides of the diff
   260  
   261  		// This switching between using '-' and '+' lines of diff might seem odd but using
   262  		// the '-' lines is the easiest way to get required state in between commits. When
   263  		// your threshold date is in between commits you actually want the SHA from the commit
   264  		// before which changed that file, which is awkward & could be different for every file.
   265  		// Using the '-' lines eliminates that issue & also lets us just use git log --since.
   266  		// When you're looking at commits (rather than between them) you can use '+' which is easier
   267  
   268  		// WalkGitCommitLOBsToPush already finds the earliest commits that are not pushed before / on a ref
   269  		// so we use that plus a walk function
   270  		walkHistoryFunc := func(commitLOB *CommitLOBRef) (quit bool, err error) {
   271  			callback(PruneWorking, "")
   272  
   273  			// we asked to be told about the '+' side of the diff for LOBs while doing this walk,
   274  			// so that it corresponds with the push flag. Snapshots above include that already, so
   275  			// here we only deal with differences.
   276  			// We have to use the '-' diffs *between* commits (arbitrary date), but can use '+' when *on* commits
   277  			for _, l := range commitLOB.LobSHAs {
   278  				if retainSet.Add(l) {
   279  					callback(PruneRetainNotPushed, l)
   280  				}
   281  			}
   282  
   283  			return false, nil
   284  
   285  		}
   286  
   287  		// Now walk all unpushed commits referencing LOBs that are earlier than this
   288  		err = WalkGitCommitLOBsToPush(remoteName, earliestCommit, false, walkHistoryFunc)
   289  
   290  		return nil
   291  
   292  	}
   293  
   294  	// What remote(s) do we check for push? Defaults to "origin"
   295  	remoteName := util.GlobalOptions.PruneRemote
   296  
   297  	// First, include HEAD (we always want to keep that)
   298  	util.LogConsoleDebugf("\r") // to reset any progress spinner but don't want \r in log
   299  	util.LogDebugf("Retaining HEAD and %dd of history\n", util.GlobalOptions.RetentionCommitsPeriodHEAD)
   300  	headsha, _ := GitRefToFullSHA("HEAD")
   301  	err := retainLOBs(headsha, util.GlobalOptions.RetentionCommitsPeriodHEAD, false, remoteName)
   302  	if err != nil {
   303  		return []string{}, err
   304  	}
   305  	refSHAsDone.Add(headsha)
   306  
   307  	// Get all refs - we get all refs and not just recent refs like fetch, because we should
   308  	// not purge binaries in old refs if they are not pushed. However we get them in date order
   309  	// so that we don't have to check date once we cross retention-period-refs threshold
   310  	refs, err := GetGitRecentRefs(-1, true, "")
   311  	if err != nil {
   312  		return []string{}, err
   313  	}
   314  	outsideRefRetention := false
   315  	earliestRefDate := time.Now().AddDate(0, 0, -util.GlobalOptions.RetentionRefsPeriod)
   316  	for _, ref := range refs {
   317  		callback(PruneWorking, "")
   318  		// Don't duplicate work when >1 ref has the same SHA
   319  		// Most common with HEAD if not detached but also tags
   320  		if refSHAsDone.Contains(ref.CommitSHA) {
   321  			continue
   322  		}
   323  		refSHAsDone.Add(ref.CommitSHA)
   324  
   325  		notPushedScanOnly := false
   326  		// Is the ref out of the retention-period-refs window already? If so jump straight to push check
   327  		// refs are reverse date ordered so once we've found one that's outside, all following are too
   328  		if outsideRefRetention {
   329  			// previus ref being ouside ref retention manes this one is too (date ordered), save time
   330  			notPushedScanOnly = true
   331  		} else {
   332  			// check individual date
   333  			commit, err := GetGitCommitSummary(ref.CommitSHA)
   334  			if err != nil {
   335  				// We can't tell when this was last committed, so be safe & assume it's recent
   336  			} else if commit.CommitDate.Before(earliestRefDate) {
   337  				// this ref is already out of retention, so only keep if not pushed
   338  				notPushedScanOnly = true
   339  				// all subseqent refs are earlier
   340  				outsideRefRetention = true
   341  			}
   342  		}
   343  
   344  		if !notPushedScanOnly {
   345  			util.LogConsoleDebugf("\r") // to reset any progress spinner but don't want \r in log
   346  			util.LogDebugf("Retaining %v and %dd of history\n", ref.Name, util.GlobalOptions.RetentionCommitsPeriodOther)
   347  		}
   348  
   349  		// LOBs to keep for this ref
   350  		err := retainLOBs(ref.CommitSHA, util.GlobalOptions.RetentionCommitsPeriodOther, notPushedScanOnly, remoteName)
   351  		if err != nil {
   352  			return []string{}, fmt.Errorf("Error determining LOBs to keep for %v: %v", err.Error())
   353  		}
   354  
   355  	}
   356  
   357  	var provider providers.SyncProvider
   358  	safeRemote := "origin"
   359  	if safeMode {
   360  		if util.GlobalOptions.PruneRemote != "" {
   361  			safeRemote = util.GlobalOptions.PruneRemote
   362  			if safeRemote == "*" {
   363  				remotes, err := GetGitRemotes()
   364  				if err != nil {
   365  					return []string{}, fmt.Errorf("Can't determine remotes to check in safe mode for '*': %v", err.Error())
   366  				}
   367  				if len(remotes) == 0 {
   368  					return []string{}, fmt.Errorf("No remotes exist, cannot prune anything in --safe mode")
   369  				}
   370  
   371  				for _, remote := range remotes {
   372  					// default to origin if present
   373  					if remote == "origin" {
   374  						safeRemote = remote
   375  						break
   376  					}
   377  				}
   378  				// If not found, use the first one
   379  				if safeRemote == "*" {
   380  					safeRemote = remotes[0]
   381  				}
   382  			}
   383  		}
   384  		var err error
   385  		provider, err = providers.GetProviderForRemote(safeRemote)
   386  		if err != nil {
   387  			return []string{}, err
   388  		}
   389  		if err = provider.ValidateConfig(safeRemote); err != nil {
   390  			return []string{}, fmt.Errorf("Remote %v has configuration problems:\n%v", safeRemote, err)
   391  		}
   392  
   393  	}
   394  	var removedList []string
   395  	localLOBs, err := getAllLocalLOBSHAs()
   396  	if err == nil {
   397  		for sha := range localLOBs.Iter() {
   398  			callback(PruneWorking, "")
   399  			if !retainSet.Contains(sha) {
   400  				if safeMode {
   401  					// check with remote before deleting
   402  					if CheckRemoteLOBFilesForSHA(sha, provider, safeRemote) != nil {
   403  						util.LogDebugf("Would have deleted %v but it does not exist on the remote %v, so keeping", sha, safeRemote)
   404  						continue
   405  					}
   406  				}
   407  				removedList = append(removedList, string(sha))
   408  				callback(PruneDeleted, sha)
   409  				if !dryRun {
   410  					DeleteLOB(string(sha))
   411  				}
   412  			}
   413  		}
   414  	} else {
   415  		return []string{}, errors.New("Unable to get list of binary files: " + err.Error())
   416  	}
   417  	util.LogConsoleDebugf("\r") // to reset any progress spinner but don't want \r in log
   418  	util.LogDebugf("Also retained everything that hasn't been pushed to %v\n", remoteName)
   419  
   420  	return removedList, nil
   421  }
   422  
   423  // Prune the shared store of all LOBs with only 1 hard link (itself)
   424  // DeleteLOB will do this for individual LOBs we prune, but if the user
   425  // manually deletes a repo then unreferenced shared LOBs may never be cleaned up
   426  // callback is a basic function to let caller know something is happening
   427  func PruneSharedStore(dryRun bool, callback PruneCallback) ([]string, error) {
   428  	fileSHAs, err := getAllSharedLOBSHAs()
   429  	if err == nil {
   430  		ret := make([]string, 0, 10)
   431  		for sha := range fileSHAs.Iter() {
   432  			shareddir := GetSharedLOBDir(sha)
   433  			names, err := filepath.Glob(filepath.Join(shareddir, fmt.Sprintf("%v*", sha)))
   434  			if err != nil {
   435  				return make([]string, 0), errors.New(fmt.Sprintf("Unable to glob shared files for %v: %v\n", sha, err))
   436  			}
   437  			var deleted bool = false
   438  			var lastsha string
   439  			for _, n := range names {
   440  				callback(PruneWorking, "")
   441  				links, err := GetHardLinkCount(n)
   442  				if err == nil && links == 1 {
   443  					// only 1 hard link means no other repo refers to this shared LOB
   444  					// so it's safe to delete it
   445  					deleted = true
   446  					sha = filepath.Base(n)[:40]
   447  					if lastsha != sha {
   448  						callback(PruneDeleted, sha)
   449  						lastsha = sha
   450  					}
   451  					if !dryRun {
   452  						err = os.Remove(n)
   453  						if err != nil {
   454  							// don't abort for 1 failure, report & carry on
   455  							util.LogErrorf("Unable to delete file %v: %v\n", n, err)
   456  						}
   457  					}
   458  				}
   459  			}
   460  			if deleted {
   461  				ret = append(ret, string(sha))
   462  			}
   463  		}
   464  		return ret, nil
   465  	} else {
   466  		return make([]string, 0), err
   467  	}
   468  
   469  }