github.com/stffabi/git-lfs@v2.3.5-0.20180214015214-8eeaa8d88902+incompatible/git/githistory/rewriter.go (about) 1 package githistory 2 3 import ( 4 "encoding/hex" 5 "fmt" 6 "io" 7 "os" 8 "path/filepath" 9 "sync" 10 11 "github.com/git-lfs/git-lfs/errors" 12 "github.com/git-lfs/git-lfs/filepathfilter" 13 "github.com/git-lfs/git-lfs/git" 14 "github.com/git-lfs/git-lfs/git/odb" 15 "github.com/git-lfs/git-lfs/tasklog" 16 ) 17 18 // Rewriter allows rewriting topologically equivalent Git histories 19 // between two revisions. 20 type Rewriter struct { 21 // mu guards entries and commits (see below) 22 mu *sync.Mutex 23 // entries is a mapping of old tree entries to new (rewritten) ones. 24 // Since TreeEntry contains a []byte (and is therefore not a key-able 25 // type), a unique TreeEntry -> string function is used for map keys. 26 entries map[string]*odb.TreeEntry 27 // commits is a mapping of old commit SHAs to new ones, where the ASCII 28 // hex encoding of the SHA1 values are used as map keys. 29 commits map[string][]byte 30 // filter is an optional value used to specify which tree entries 31 // (blobs, subtrees) are modifiable given a BlobFn. If non-nil, this 32 // filter will cull out any unmodifiable subtrees and blobs. 33 filter *filepathfilter.Filter 34 // db is the *ObjectDatabase from which blobs, commits, and trees are 35 // loaded from. 36 db *odb.ObjectDatabase 37 // l is the *tasklog.Logger to which updates are written. 38 l *tasklog.Logger 39 } 40 41 // RewriteOptions is an options type given to the Rewrite() function. 42 type RewriteOptions struct { 43 // Include is the list of refs of which commits reachable by that ref 44 // will be included. 45 Include []string 46 // Exclude is the list of refs of which commits reachable by that ref 47 // will be excluded. 48 Exclude []string 49 50 // UpdateRefs specifies whether the Rewriter should move refs from the 51 // original graph onto the migrated one. If true, the refs will be 52 // moved, and a reflog entry will be created. 53 UpdateRefs bool 54 55 // Verbose mode prints migrated objects. 56 Verbose bool 57 58 // BlobFn specifies a function to rewrite blobs. 59 // 60 // It is called once per unique, unchanged path. That is to say, if 61 // /a/foo and /a/bar contain identical contents, the BlobFn will be 62 // called twice: once for /a/foo and once for /a/bar, but no more on 63 // each blob for subsequent revisions, so long as each entry remains 64 // unchanged. 65 BlobFn BlobRewriteFn 66 // TreeCallbackFn specifies a function to rewrite trees after they have 67 // been reassembled by calling the above BlobFn on all existing tree 68 // entries. 69 TreeCallbackFn TreeCallbackFn 70 } 71 72 // blobFn returns a useable BlobRewriteFn, either the one that was given in the 73 // *RewriteOptions, or a noopBlobFn. 74 func (r *RewriteOptions) blobFn() BlobRewriteFn { 75 if r.BlobFn == nil { 76 return noopBlobFn 77 } 78 return r.BlobFn 79 } 80 81 // treeFn returns a useable TreeRewriteFn, either the one that was given in the 82 // *RewriteOptions, or a noopTreeFn. 83 func (r *RewriteOptions) treeFn() TreeCallbackFn { 84 if r.TreeCallbackFn == nil { 85 return noopTreeFn 86 } 87 return r.TreeCallbackFn 88 } 89 90 // BlobRewriteFn is a mapping function that takes a given blob and returns a 91 // new, modified blob. If it returns an error, the new blob will not be written 92 // and instead the error will be returned from the Rewrite() function. 93 // 94 // Invocations of an instance of BlobRewriteFn are not expected to store the 95 // returned blobs in the *git/odb.ObjectDatabase. 96 // 97 // The path argument is given to be an absolute path to the tree entry being 98 // rewritten, where the repository root is the root of the path given. For 99 // instance, a file "b.txt" in directory "dir" would be given as "/dir/b.txt", 100 // where as a file "a.txt" in the root would be given as "/a.txt". 101 // 102 // As above, the path separators are OS specific, and equivalent to the result 103 // of filepath.Join(...) or os.PathSeparator. 104 type BlobRewriteFn func(path string, b *odb.Blob) (*odb.Blob, error) 105 106 // TreeCallbackFn specifies a function to call before writing a re-written tree 107 // to the object database. The TreeCallbackFn can return a modified tree to be 108 // written to the object database instead of one generated from calling BlobFn 109 // on all of the tree entries. 110 // 111 // Trees returned from a TreeCallbackFn MUST have all objects referenced in the 112 // entryset already written to the object database. 113 // 114 // TreeCallbackFn can be nil, and will therefore exhibit behavior equivalent to 115 // only calling the BlobFn on existing tree entries. 116 // 117 // If the TreeCallbackFn returns an error, it will be returned from the 118 // Rewrite() invocation. 119 type TreeCallbackFn func(path string, t *odb.Tree) (*odb.Tree, error) 120 121 type rewriterOption func(*Rewriter) 122 123 var ( 124 // WithFilter is an optional argument given to the NewRewriter 125 // constructor function to limit invocations of the BlobRewriteFn to 126 // only pathspecs that match the given *filepathfilter.Filter. 127 WithFilter = func(filter *filepathfilter.Filter) rewriterOption { 128 return func(r *Rewriter) { 129 r.filter = filter 130 } 131 } 132 133 // WithLoggerto logs updates caused by the *git/githistory.Rewriter to 134 // the given io.Writer "sink". 135 WithLoggerTo = func(sink io.Writer) rewriterOption { 136 return WithLogger(tasklog.NewLogger(sink)) 137 } 138 139 // WithLogger logs updates caused by the *git/githistory.Rewriter to the 140 // be given to the provided logger, "l". 141 WithLogger = func(l *tasklog.Logger) rewriterOption { 142 return func(r *Rewriter) { 143 r.l = l 144 } 145 } 146 147 // noopBlobFn is a no-op implementation of the BlobRewriteFn. It returns 148 // the blob that it was given, and returns no error. 149 noopBlobFn = func(path string, b *odb.Blob) (*odb.Blob, error) { return b, nil } 150 // noopTreeFn is a no-op implementation of the TreeRewriteFn. It returns 151 // the tree that it was given, and returns no error. 152 noopTreeFn = func(path string, t *odb.Tree) (*odb.Tree, error) { return t, nil } 153 ) 154 155 // NewRewriter constructs a *Rewriter from the given *ObjectDatabase instance. 156 func NewRewriter(db *odb.ObjectDatabase, opts ...rewriterOption) *Rewriter { 157 rewriter := &Rewriter{ 158 mu: new(sync.Mutex), 159 entries: make(map[string]*odb.TreeEntry), 160 commits: make(map[string][]byte), 161 162 db: db, 163 } 164 165 for _, opt := range opts { 166 opt(rewriter) 167 } 168 return rewriter 169 } 170 171 // Rewrite rewrites the range of commits given by *RewriteOptions.{Left,Right} 172 // using the BlobRewriteFn to rewrite the individual blobs. 173 func (r *Rewriter) Rewrite(opt *RewriteOptions) ([]byte, error) { 174 // First, obtain a list of commits to rewrite. 175 commits, err := r.commitsToMigrate(opt) 176 if err != nil { 177 return nil, err 178 } 179 180 var perc *tasklog.PercentageTask 181 if opt.UpdateRefs { 182 perc = r.l.Percentage("migrate: Rewriting commits", uint64(len(commits))) 183 } else { 184 perc = r.l.Percentage("migrate: Examining commits", uint64(len(commits))) 185 } 186 187 var vPerc *tasklog.PercentageTask 188 if opt.Verbose { 189 vPerc = perc 190 } 191 192 // Keep track of the last commit that we rewrote. Callers often want 193 // this so that they can perform a git-update-ref(1). 194 var tip []byte 195 for _, oid := range commits { 196 // Load the original commit to access the data necessary in 197 // order to rewrite it. 198 original, err := r.db.Commit(oid) 199 if err != nil { 200 return nil, err 201 } 202 203 // Rewrite the tree given at that commit. 204 rewrittenTree, err := r.rewriteTree(oid, original.TreeID, "", opt.blobFn(), opt.treeFn(), vPerc) 205 if err != nil { 206 return nil, err 207 } 208 209 // Create a new list of parents from the original commit to 210 // point at the rewritten parents in order to create a 211 // topologically equivalent DAG. 212 // 213 // This operation is safe since we are visiting the commits in 214 // reverse topological order and therefore have seen all parents 215 // before children (in other words, r.uncacheCommit(...) will 216 // always return a value, if the prospective parent is a part of 217 // the migration). 218 rewrittenParents := make([][]byte, 0, len(original.ParentIDs)) 219 for _, originalParent := range original.ParentIDs { 220 rewrittenParent, ok := r.uncacheCommit(originalParent) 221 if !ok { 222 // If we haven't seen the parent before, this 223 // means that we're doing a partial migration 224 // and the parent that we're looking for isn't 225 // included. 226 // 227 // Use the original parent to properly link 228 // history across the migration boundary. 229 rewrittenParent = originalParent 230 } 231 232 rewrittenParents = append(rewrittenParents, rewrittenParent) 233 } 234 235 // Construct a new commit using the original header information, 236 // but the rewritten set of parents as well as root tree. 237 rewrittenCommit := &odb.Commit{ 238 Author: original.Author, 239 Committer: original.Committer, 240 ExtraHeaders: original.ExtraHeaders, 241 Message: original.Message, 242 243 ParentIDs: rewrittenParents, 244 TreeID: rewrittenTree, 245 } 246 247 var newSha []byte 248 249 if original.Equal(rewrittenCommit) { 250 newSha = make([]byte, len(oid)) 251 copy(newSha, oid) 252 } else { 253 newSha, err = r.db.WriteCommit(rewrittenCommit) 254 if err != nil { 255 return nil, err 256 } 257 } 258 259 // Cache that commit so that we can reassign children of this 260 // commit. 261 r.cacheCommit(oid, newSha) 262 263 // Increment the percentage displayed in the terminal. 264 perc.Count(1) 265 266 // Move the tip forward. 267 tip = newSha 268 } 269 270 if opt.UpdateRefs { 271 refs, err := r.refsToMigrate() 272 if err != nil { 273 return nil, errors.Wrap(err, "could not find refs to update") 274 } 275 276 root, _ := r.db.Root() 277 278 updater := &refUpdater{ 279 CacheFn: r.uncacheCommit, 280 Logger: r.l, 281 Refs: refs, 282 Root: root, 283 284 db: r.db, 285 } 286 287 if err := updater.UpdateRefs(); err != nil { 288 return nil, errors.Wrap(err, "could not update refs") 289 } 290 } 291 292 return tip, err 293 } 294 295 // rewriteTree is a recursive function which rewrites a tree given by the ID 296 // "sha" and path "path". It uses the given BlobRewriteFn to rewrite all blobs 297 // within the tree, either calling that function or recurring down into subtrees 298 // by re-assigning the SHA. 299 // 300 // Once it is done assembling the entries in a given subtree, it then calls the 301 // TreeCallbackFn, "tfn" to perform a final traversal of the subtree before 302 // saving it to the object database. 303 // 304 // It returns the new SHA of the rewritten tree, or an error if the tree was 305 // unable to be rewritten. 306 func (r *Rewriter) rewriteTree(commitOID []byte, treeOID []byte, path string, fn BlobRewriteFn, tfn TreeCallbackFn, perc *tasklog.PercentageTask) ([]byte, error) { 307 tree, err := r.db.Tree(treeOID) 308 if err != nil { 309 return nil, err 310 } 311 312 entries := make([]*odb.TreeEntry, 0, len(tree.Entries)) 313 for _, entry := range tree.Entries { 314 path := filepath.Join(path, entry.Name) 315 316 if !r.allows(entry.Type(), path) { 317 entries = append(entries, entry) 318 continue 319 } 320 321 if cached := r.uncacheEntry(entry); cached != nil { 322 entries = append(entries, cached) 323 continue 324 } 325 326 var oid []byte 327 328 switch entry.Type() { 329 case odb.BlobObjectType: 330 oid, err = r.rewriteBlob(commitOID, entry.Oid, path, fn, perc) 331 case odb.TreeObjectType: 332 oid, err = r.rewriteTree(commitOID, entry.Oid, path, fn, tfn, perc) 333 default: 334 oid = entry.Oid 335 336 } 337 if err != nil { 338 return nil, err 339 } 340 341 entries = append(entries, r.cacheEntry(entry, &odb.TreeEntry{ 342 Filemode: entry.Filemode, 343 Name: entry.Name, 344 Oid: oid, 345 })) 346 } 347 348 rewritten, err := tfn(string(os.PathSeparator)+path, &odb.Tree{Entries: entries}) 349 if err != nil { 350 return nil, err 351 } 352 353 if tree.Equal(rewritten) { 354 return treeOID, nil 355 } 356 return r.db.WriteTree(rewritten) 357 } 358 359 func (r *Rewriter) allows(typ odb.ObjectType, abs string) bool { 360 switch typ { 361 case odb.BlobObjectType: 362 return r.Filter().Allows(abs) 363 case odb.TreeObjectType: 364 return r.Filter().HasPrefix(abs) 365 case odb.CommitObjectType: 366 return true 367 default: 368 panic(fmt.Sprintf("git/githistory: unknown entry type: %s", typ)) 369 } 370 } 371 372 // rewriteBlob calls the given BlobRewriteFn "fn" on a blob given in the object 373 // database by the SHA1 "from" []byte. It writes and returns the new blob SHA, 374 // or an error if either the BlobRewriteFn returned one, or if the object could 375 // not be loaded/saved. 376 func (r *Rewriter) rewriteBlob(commitOID, from []byte, path string, fn BlobRewriteFn, perc *tasklog.PercentageTask) ([]byte, error) { 377 blob, err := r.db.Blob(from) 378 if err != nil { 379 return nil, err 380 } 381 382 b, err := fn(path, blob) 383 if err != nil { 384 return nil, err 385 } 386 387 if !blob.Equal(b) { 388 sha, err := r.db.WriteBlob(b) 389 if err != nil { 390 return nil, err 391 } 392 393 // Close the source blob, so long as it is not equal to the 394 // rewritten blob. If the two are equal, as in the check above 395 // this comment, calling r.db.WriteBlob(b) will have already 396 // closed both "b" and "blob" since they are the same. 397 // 398 // Closing an *os.File twice causes an `os.ErrInvalid` to be 399 // returned. 400 if err = blob.Close(); err != nil { 401 return nil, err 402 } 403 404 if perc != nil { 405 perc.Entry(fmt.Sprintf("migrate: commit %s: %s", hex.EncodeToString(commitOID), path)) 406 } 407 408 return sha, nil 409 } 410 411 // Close the source blob, since it is identical to the rewritten blob, 412 // but neither were written. 413 if err := blob.Close(); err != nil { 414 return nil, err 415 } 416 return from, nil 417 } 418 419 // commitsToMigrate returns an in-memory copy of a list of commits according to 420 // the output of git-rev-list(1) (given the *RewriteOptions), where each 421 // outputted commit is 20 bytes of raw SHA1. 422 // 423 // If any error was encountered, it will be returned. 424 func (r *Rewriter) commitsToMigrate(opt *RewriteOptions) ([][]byte, error) { 425 waiter := r.l.Waiter("migrate: Sorting commits") 426 defer waiter.Complete() 427 428 scanner, err := git.NewRevListScanner( 429 opt.Include, opt.Exclude, r.scannerOpts()) 430 if err != nil { 431 return nil, err 432 } 433 434 var commits [][]byte 435 for scanner.Scan() { 436 commits = append(commits, scanner.OID()) 437 } 438 439 if err = scanner.Err(); err != nil { 440 return nil, err 441 } 442 if err = scanner.Close(); err != nil { 443 return nil, err 444 } 445 return commits, nil 446 } 447 448 // refsToMigrate returns a list of references to migrate, or an error if loading 449 // those references failed. 450 func (r *Rewriter) refsToMigrate() ([]*git.Ref, error) { 451 var refs []*git.Ref 452 var err error 453 454 if root, ok := r.db.Root(); ok { 455 refs, err = git.AllRefsIn(root) 456 } else { 457 refs, err = git.AllRefs() 458 } 459 460 if err != nil { 461 return nil, err 462 } 463 464 var local []*git.Ref 465 for _, ref := range refs { 466 if ref.Type == git.RefTypeRemoteBranch || ref.Type == git.RefTypeRemoteTag { 467 continue 468 } 469 470 local = append(local, ref) 471 } 472 473 return local, nil 474 } 475 476 // scannerOpts returns a *git.ScanRefsOptions instance to be given to the 477 // *git.RevListScanner. 478 // 479 // If the database this *Rewriter is operating in a given root (not in memory) 480 // it re-assigns the working directory to be there. 481 func (r *Rewriter) scannerOpts() *git.ScanRefsOptions { 482 opts := &git.ScanRefsOptions{ 483 Mode: git.ScanRefsMode, 484 Order: git.TopoRevListOrder, 485 Reverse: true, 486 CommitsOnly: true, 487 488 SkippedRefs: make([]string, 0), 489 Mutex: new(sync.Mutex), 490 Names: make(map[string]string), 491 } 492 493 if root, ok := r.db.Root(); ok { 494 opts.WorkingDir = root 495 } 496 return opts 497 } 498 499 // Filter returns the filter used by this *Rewriter to filter subtrees, blobs 500 // (see above). 501 func (r *Rewriter) Filter() *filepathfilter.Filter { 502 return r.filter 503 } 504 505 // cacheEntry caches then given "from" entry so that it is always rewritten as 506 // a *TreeEntry equivalent to "to". 507 func (r *Rewriter) cacheEntry(from, to *odb.TreeEntry) *odb.TreeEntry { 508 r.mu.Lock() 509 defer r.mu.Unlock() 510 511 r.entries[r.entryKey(from)] = to 512 513 return to 514 } 515 516 // uncacheEntry returns a *TreeEntry that is cached from the given *TreeEntry 517 // "from". That is to say, it returns the *TreeEntry that "from" should be 518 // rewritten to, or nil if none could be found. 519 func (r *Rewriter) uncacheEntry(from *odb.TreeEntry) *odb.TreeEntry { 520 r.mu.Lock() 521 defer r.mu.Unlock() 522 523 return r.entries[r.entryKey(from)] 524 } 525 526 // entryKey returns a unique key for a given *TreeEntry "e". 527 func (r *Rewriter) entryKey(e *odb.TreeEntry) string { 528 return fmt.Sprintf("%s:%x", e.Name, e.Oid) 529 } 530 531 // cacheEntry caches then given "from" commit so that it is always rewritten as 532 // a *git/odb.Commit equivalent to "to". 533 func (r *Rewriter) cacheCommit(from, to []byte) { 534 r.mu.Lock() 535 defer r.mu.Unlock() 536 537 r.commits[hex.EncodeToString(from)] = to 538 } 539 540 // uncacheCommit returns a *git/odb.Commit that is cached from the given 541 // *git/odb.Commit "from". That is to say, it returns the *git/odb.Commit that 542 // "from" should be rewritten to and true, or nil and false if none could be 543 // found. 544 func (r *Rewriter) uncacheCommit(from []byte) ([]byte, bool) { 545 r.mu.Lock() 546 defer r.mu.Unlock() 547 548 c, ok := r.commits[hex.EncodeToString(from)] 549 return c, ok 550 }