github.com/atlassian/git-lob@v0.0.0-20150806085256-2386a5ed291a/core/prune.go (about) 1 package core 2 3 import ( 4 "bufio" 5 "errors" 6 "fmt" 7 "io" 8 "os" 9 "os/exec" 10 "path/filepath" 11 "regexp" 12 "time" 13 14 "github.com/atlassian/git-lob/providers" 15 "github.com/atlassian/git-lob/util" 16 ) 17 18 type PruneCallbackType int 19 20 const ( 21 // Prune is working (for spinner) 22 PruneWorking PruneCallbackType = iota 23 // Prune is retaining LOB because referenced 24 PruneRetainReferenced PruneCallbackType = iota 25 // Prune is retaining LOB because commit referencing it is within retention period 26 PruneRetainByDate PruneCallbackType = iota 27 // Prune is retaining LOB because commit is referencing it is not pushed 28 PruneRetainNotPushed PruneCallbackType = iota 29 // Prune is deleting LOB (because unreferenced or out of date range & pushed) 30 PruneDeleted PruneCallbackType = iota 31 ) 32 33 // Callback when running prune, identifies what's going on 34 // When in dry run mode the same callbacks are made even if the actual act isn't performed (e.g. deletion) 35 type PruneCallback func(t PruneCallbackType, lobsha string) 36 37 var ( 38 diffLOBReferenceRegex *regexp.Regexp 39 lobFilenameRegex *regexp.Regexp 40 ) 41 42 // Retrieve the full set of SHAs that currently have files locally (complete or not) 43 func getAllLocalLOBSHAs() (util.StringSet, error) { 44 return getAllLOBSHAsInDir(GetLocalLOBRoot()) 45 } 46 47 // Retrieve the full set of SHAs that currently have files in the shared store (complete or not) 48 func getAllSharedLOBSHAs() (util.StringSet, error) { 49 return getAllLOBSHAsInDir(GetSharedLOBRoot()) 50 } 51 52 func getAllLOBSHAsInDir(lobroot string) (util.StringSet, error) { 53 54 // os.File.Readdirnames is the most efficient 55 // os.File.Readdir retrieves extra info we don't usually need but in case other unexpected files 56 // end up in there (e.g. .DS_Store), we use it to identify directories 57 // ioutil.ReadDir and filepath.Walk do sorting which is unnecessary & inefficient 58 59 if lobFilenameRegex == nil { 60 lobFilenameRegex = regexp.MustCompile(`^([A-Za-z0-9]{40})_(meta|\d+)$`) 61 } 62 // Readdir returns in 'directory order' which means we may not get files for same SHA together 63 // so use set to find uniques 64 ret := util.NewStringSet() 65 66 // We only need to support a 2-folder structure here & know that all files are at the bottom level 67 // We always work on the local LOB folder (either only copy or hard link) 68 rootf, err := os.Open(lobroot) 69 if err != nil { 70 return ret, errors.New(fmt.Sprintf("Unable to open LOB root: %v\n", err)) 71 } 72 defer rootf.Close() 73 dir1, err := rootf.Readdir(0) 74 if err != nil { 75 return ret, errors.New(fmt.Sprintf("Unable to read first level LOB dir: %v\n", err)) 76 } 77 for _, dir1fi := range dir1 { 78 if dir1fi.IsDir() { 79 dir1path := filepath.Join(lobroot, dir1fi.Name()) 80 dir1f, err := os.Open(dir1path) 81 if err != nil { 82 return ret, errors.New(fmt.Sprintf("Unable to open LOB dir: %v\n", err)) 83 } 84 defer dir1f.Close() 85 dir2, err := dir1f.Readdir(0) 86 if err != nil { 87 return ret, errors.New(fmt.Sprintf("Unable to read second level LOB dir: %v\n", err)) 88 } 89 for _, dir2fi := range dir2 { 90 if dir2fi.IsDir() { 91 dir2path := filepath.Join(dir1path, dir2fi.Name()) 92 dir2f, err := os.Open(dir2path) 93 if err != nil { 94 return ret, errors.New(fmt.Sprintf("Unable to open LOB dir: %v\n", err)) 95 } 96 defer dir2f.Close() 97 lobnames, err := dir2f.Readdirnames(0) 98 if err != nil { 99 return ret, errors.New(fmt.Sprintf("Unable to read innermost LOB dir: %v\n", err)) 100 } 101 for _, lobname := range lobnames { 102 // Make sure it's really a LOB file 103 if match := lobFilenameRegex.FindStringSubmatch(lobname); match != nil { 104 // Regex pulls out the SHA 105 sha := match[1] 106 ret.Add(sha) 107 } 108 } 109 110 } 111 } 112 } 113 114 } 115 116 return ret, nil 117 118 } 119 120 // Determine if a line from git diff output is referencing a LOB (returns "" if not) 121 func lobReferenceFromDiffLine(line string) string { 122 // Because this is a diff, it will start with +/- 123 // We only care about +, since - is stopping referencing a SHA 124 // important when it comes to purging old files 125 if diffLOBReferenceRegex == nil { 126 diffLOBReferenceRegex = regexp.MustCompile(`^\+git-lob: ([A-Za-z0-9]{40})$`) 127 } 128 129 if match := diffLOBReferenceRegex.FindStringSubmatch(line); match != nil { 130 return match[1] 131 } 132 return "" 133 } 134 135 // Delete unreferenced binary files from local store 136 // For a file to be deleted it needs to not be referenced by any (reachable) commit 137 // Returns a list of SHAs that were deleted (unless dryRun = true) 138 func PruneUnreferenced(dryRun bool, callback PruneCallback) ([]string, error) { 139 // Purging requires full git on the command line, no way around this really 140 cmd := exec.Command("git", "log", "--all", "--no-color", "--oneline", "-p", "-G", SHALineRegexStr) 141 stdout, err := cmd.StdoutPipe() 142 if err != nil { 143 return make([]string, 0), errors.New("Unable to query git log for binary references: " + err.Error()) 144 } 145 stderr, err := cmd.StderrPipe() 146 if err != nil { 147 return make([]string, 0), errors.New("Unable to open pipe: " + err.Error()) 148 } 149 multi := io.MultiReader(stdout, stderr) 150 scanner := bufio.NewScanner(multi) 151 cmd.Start() 152 referencedSHAs := util.NewStringSet() 153 for scanner.Scan() { 154 callback(PruneWorking, "") 155 line := scanner.Text() 156 if sha := lobReferenceFromDiffLine(line); sha != "" { 157 if referencedSHAs.Add(sha) { 158 callback(PruneRetainReferenced, sha) 159 } 160 } 161 } 162 cmd.Wait() 163 164 // Must also not prune anything that's added but uncommitted 165 cmd = exec.Command("git", "diff", "--cached", "--no-color", "-G", SHALineRegexStr) 166 stdout, err = cmd.StdoutPipe() 167 if err != nil { 168 return make([]string, 0), errors.New("Unable to query git index for binary references: " + err.Error()) 169 } 170 scanner = bufio.NewScanner(stdout) 171 cmd.Start() 172 for scanner.Scan() { 173 callback(PruneWorking, "") 174 line := scanner.Text() 175 if sha := lobReferenceFromDiffLine(line); sha != "" { 176 if referencedSHAs.Add(sha) { 177 callback(PruneRetainReferenced, sha) 178 } 179 } 180 } 181 cmd.Wait() 182 183 fileSHAs, err := getAllLocalLOBSHAs() 184 if err == nil { 185 186 var ret []string 187 for sha := range fileSHAs.Iter() { 188 callback(PruneWorking, "") 189 if !referencedSHAs.Contains(sha) { 190 ret = append(ret, string(sha)) 191 callback(PruneDeleted, sha) 192 if !dryRun { 193 DeleteLOB(string(sha)) 194 } 195 } 196 } 197 return ret, nil 198 } else { 199 return make([]string, 0), errors.New("Unable to get list of binary files: " + err.Error()) 200 } 201 202 } 203 204 // Remove LOBs from the local store if they fall outside the range we would normally fetch for 205 // Returns a list of SHAs that were deleted (unless dryRun = true) 206 // Unreferenced binaries are also deleted by this 207 func PruneOld(dryRun, safeMode bool, callback PruneCallback) ([]string, error) { 208 refSHAsDone := util.NewStringSet() 209 // Build a list to keep, then delete all else (includes deleting unreferenced) 210 // Can't just look at diffs (just like fetch) since LOB changed 3 years ago but still valid = recent 211 retainSet := util.NewStringSet() 212 213 // Add LOBs to retainSet for this commit and history 214 retainLOBs := func(commit string, days int, notPushedScanOnly bool, remoteName string) error { 215 216 var err error 217 var earliestCommit string 218 if notPushedScanOnly { 219 // We only want to include lobs from this ref if not pushed 220 earliestCommit = commit 221 // we never have to snapshot the file system because we're only interested in 222 // lobs which haven't been pushed. If that's all of them, then we'll eventually 223 // find the original addition of the lob in the history anyway 224 } else { 225 callback(PruneWorking, "") 226 // This ref is itself included so perform usual 'all lobs at checkout + n days history' query 227 var lobs []string 228 lobs, earliestCommit, err = GetGitAllLOBsToCheckoutAtCommitAndRecent(commit, days, []string{}, []string{}) 229 if err != nil { 230 return fmt.Errorf("Error determining recent commits from %v: %v", commit, err.Error()) 231 } 232 for _, l := range lobs { 233 if retainSet.Add(l) { 234 callback(PruneRetainByDate, l) 235 } 236 } 237 } 238 239 // earliestCommit is the earliest one which changed (replaced) a binary SHA 240 // and therefore the SHA we pulled out of it applied UP TO that point 241 // that we've included in the lobs list already 242 // If this commit is pushed then we're OK, if not we have to go backwards 243 // until we find the one that is. 244 // A pushed commit indicates the SHA pulled out of the *following* commit 245 // has been pushed: 246 // 247 // Binary A <-- --> B B <-- --> C C <-- --> D 248 // ------------|-----------|--------|-------------------------| 249 // Commit 1 | 2 3 250 // "Retention" R 251 // 252 // Given 3 commits (1/2/3) each changing a binary through states A/B/C/D 253 // 1. We retrieve state D through ls-files 254 // 2. We retrieve statees B and C through log --since=R, since we pick up 255 // commits 2 and 3 and hence the SHAs for C and then B from the '-' side of the diff 256 // 3. 'Earliest commit' is 2 257 // 4. We then walk all commits that are at 2 or ancestors which reference LOBs 258 // and are not pushed (this happens forwards from earliest up to & including 2) 259 // This actually picks up the '+' sides of the diff 260 261 // This switching between using '-' and '+' lines of diff might seem odd but using 262 // the '-' lines is the easiest way to get required state in between commits. When 263 // your threshold date is in between commits you actually want the SHA from the commit 264 // before which changed that file, which is awkward & could be different for every file. 265 // Using the '-' lines eliminates that issue & also lets us just use git log --since. 266 // When you're looking at commits (rather than between them) you can use '+' which is easier 267 268 // WalkGitCommitLOBsToPush already finds the earliest commits that are not pushed before / on a ref 269 // so we use that plus a walk function 270 walkHistoryFunc := func(commitLOB *CommitLOBRef) (quit bool, err error) { 271 callback(PruneWorking, "") 272 273 // we asked to be told about the '+' side of the diff for LOBs while doing this walk, 274 // so that it corresponds with the push flag. Snapshots above include that already, so 275 // here we only deal with differences. 276 // We have to use the '-' diffs *between* commits (arbitrary date), but can use '+' when *on* commits 277 for _, l := range commitLOB.LobSHAs { 278 if retainSet.Add(l) { 279 callback(PruneRetainNotPushed, l) 280 } 281 } 282 283 return false, nil 284 285 } 286 287 // Now walk all unpushed commits referencing LOBs that are earlier than this 288 err = WalkGitCommitLOBsToPush(remoteName, earliestCommit, false, walkHistoryFunc) 289 290 return nil 291 292 } 293 294 // What remote(s) do we check for push? Defaults to "origin" 295 remoteName := util.GlobalOptions.PruneRemote 296 297 // First, include HEAD (we always want to keep that) 298 util.LogConsoleDebugf("\r") // to reset any progress spinner but don't want \r in log 299 util.LogDebugf("Retaining HEAD and %dd of history\n", util.GlobalOptions.RetentionCommitsPeriodHEAD) 300 headsha, _ := GitRefToFullSHA("HEAD") 301 err := retainLOBs(headsha, util.GlobalOptions.RetentionCommitsPeriodHEAD, false, remoteName) 302 if err != nil { 303 return []string{}, err 304 } 305 refSHAsDone.Add(headsha) 306 307 // Get all refs - we get all refs and not just recent refs like fetch, because we should 308 // not purge binaries in old refs if they are not pushed. However we get them in date order 309 // so that we don't have to check date once we cross retention-period-refs threshold 310 refs, err := GetGitRecentRefs(-1, true, "") 311 if err != nil { 312 return []string{}, err 313 } 314 outsideRefRetention := false 315 earliestRefDate := time.Now().AddDate(0, 0, -util.GlobalOptions.RetentionRefsPeriod) 316 for _, ref := range refs { 317 callback(PruneWorking, "") 318 // Don't duplicate work when >1 ref has the same SHA 319 // Most common with HEAD if not detached but also tags 320 if refSHAsDone.Contains(ref.CommitSHA) { 321 continue 322 } 323 refSHAsDone.Add(ref.CommitSHA) 324 325 notPushedScanOnly := false 326 // Is the ref out of the retention-period-refs window already? If so jump straight to push check 327 // refs are reverse date ordered so once we've found one that's outside, all following are too 328 if outsideRefRetention { 329 // previus ref being ouside ref retention manes this one is too (date ordered), save time 330 notPushedScanOnly = true 331 } else { 332 // check individual date 333 commit, err := GetGitCommitSummary(ref.CommitSHA) 334 if err != nil { 335 // We can't tell when this was last committed, so be safe & assume it's recent 336 } else if commit.CommitDate.Before(earliestRefDate) { 337 // this ref is already out of retention, so only keep if not pushed 338 notPushedScanOnly = true 339 // all subseqent refs are earlier 340 outsideRefRetention = true 341 } 342 } 343 344 if !notPushedScanOnly { 345 util.LogConsoleDebugf("\r") // to reset any progress spinner but don't want \r in log 346 util.LogDebugf("Retaining %v and %dd of history\n", ref.Name, util.GlobalOptions.RetentionCommitsPeriodOther) 347 } 348 349 // LOBs to keep for this ref 350 err := retainLOBs(ref.CommitSHA, util.GlobalOptions.RetentionCommitsPeriodOther, notPushedScanOnly, remoteName) 351 if err != nil { 352 return []string{}, fmt.Errorf("Error determining LOBs to keep for %v: %v", err.Error()) 353 } 354 355 } 356 357 var provider providers.SyncProvider 358 safeRemote := "origin" 359 if safeMode { 360 if util.GlobalOptions.PruneRemote != "" { 361 safeRemote = util.GlobalOptions.PruneRemote 362 if safeRemote == "*" { 363 remotes, err := GetGitRemotes() 364 if err != nil { 365 return []string{}, fmt.Errorf("Can't determine remotes to check in safe mode for '*': %v", err.Error()) 366 } 367 if len(remotes) == 0 { 368 return []string{}, fmt.Errorf("No remotes exist, cannot prune anything in --safe mode") 369 } 370 371 for _, remote := range remotes { 372 // default to origin if present 373 if remote == "origin" { 374 safeRemote = remote 375 break 376 } 377 } 378 // If not found, use the first one 379 if safeRemote == "*" { 380 safeRemote = remotes[0] 381 } 382 } 383 } 384 var err error 385 provider, err = providers.GetProviderForRemote(safeRemote) 386 if err != nil { 387 return []string{}, err 388 } 389 if err = provider.ValidateConfig(safeRemote); err != nil { 390 return []string{}, fmt.Errorf("Remote %v has configuration problems:\n%v", safeRemote, err) 391 } 392 393 } 394 var removedList []string 395 localLOBs, err := getAllLocalLOBSHAs() 396 if err == nil { 397 for sha := range localLOBs.Iter() { 398 callback(PruneWorking, "") 399 if !retainSet.Contains(sha) { 400 if safeMode { 401 // check with remote before deleting 402 if CheckRemoteLOBFilesForSHA(sha, provider, safeRemote) != nil { 403 util.LogDebugf("Would have deleted %v but it does not exist on the remote %v, so keeping", sha, safeRemote) 404 continue 405 } 406 } 407 removedList = append(removedList, string(sha)) 408 callback(PruneDeleted, sha) 409 if !dryRun { 410 DeleteLOB(string(sha)) 411 } 412 } 413 } 414 } else { 415 return []string{}, errors.New("Unable to get list of binary files: " + err.Error()) 416 } 417 util.LogConsoleDebugf("\r") // to reset any progress spinner but don't want \r in log 418 util.LogDebugf("Also retained everything that hasn't been pushed to %v\n", remoteName) 419 420 return removedList, nil 421 } 422 423 // Prune the shared store of all LOBs with only 1 hard link (itself) 424 // DeleteLOB will do this for individual LOBs we prune, but if the user 425 // manually deletes a repo then unreferenced shared LOBs may never be cleaned up 426 // callback is a basic function to let caller know something is happening 427 func PruneSharedStore(dryRun bool, callback PruneCallback) ([]string, error) { 428 fileSHAs, err := getAllSharedLOBSHAs() 429 if err == nil { 430 ret := make([]string, 0, 10) 431 for sha := range fileSHAs.Iter() { 432 shareddir := GetSharedLOBDir(sha) 433 names, err := filepath.Glob(filepath.Join(shareddir, fmt.Sprintf("%v*", sha))) 434 if err != nil { 435 return make([]string, 0), errors.New(fmt.Sprintf("Unable to glob shared files for %v: %v\n", sha, err)) 436 } 437 var deleted bool = false 438 var lastsha string 439 for _, n := range names { 440 callback(PruneWorking, "") 441 links, err := GetHardLinkCount(n) 442 if err == nil && links == 1 { 443 // only 1 hard link means no other repo refers to this shared LOB 444 // so it's safe to delete it 445 deleted = true 446 sha = filepath.Base(n)[:40] 447 if lastsha != sha { 448 callback(PruneDeleted, sha) 449 lastsha = sha 450 } 451 if !dryRun { 452 err = os.Remove(n) 453 if err != nil { 454 // don't abort for 1 failure, report & carry on 455 util.LogErrorf("Unable to delete file %v: %v\n", n, err) 456 } 457 } 458 } 459 } 460 if deleted { 461 ret = append(ret, string(sha)) 462 } 463 } 464 return ret, nil 465 } else { 466 return make([]string, 0), err 467 } 468 469 }