github.com/x-oss-byte/git-lfs@v2.5.2+incompatible/git/githistory/rewriter.go (about) 1 package githistory 2 3 import ( 4 "encoding/hex" 5 "fmt" 6 "io" 7 "os" 8 "strings" 9 "sync" 10 11 "github.com/git-lfs/git-lfs/errors" 12 "github.com/git-lfs/git-lfs/filepathfilter" 13 "github.com/git-lfs/git-lfs/git" 14 "github.com/git-lfs/git-lfs/tasklog" 15 "github.com/git-lfs/gitobj" 16 ) 17 18 // Rewriter allows rewriting topologically equivalent Git histories 19 // between two revisions. 20 type Rewriter struct { 21 // mu guards entries and commits (see below) 22 mu *sync.Mutex 23 // entries is a mapping of old tree entries to new (rewritten) ones. 24 // Since TreeEntry contains a []byte (and is therefore not a key-able 25 // type), a unique TreeEntry -> string function is used for map keys. 26 entries map[string]*gitobj.TreeEntry 27 // commits is a mapping of old commit SHAs to new ones, where the ASCII 28 // hex encoding of the SHA1 values are used as map keys. 29 commits map[string][]byte 30 // filter is an optional value used to specify which tree entries 31 // (blobs, subtrees) are modifiable given a BlobFn. If non-nil, this 32 // filter will cull out any unmodifiable subtrees and blobs. 33 filter *filepathfilter.Filter 34 // db is the *ObjectDatabase from which blobs, commits, and trees are 35 // loaded from. 36 db *gitobj.ObjectDatabase 37 // l is the *tasklog.Logger to which updates are written. 38 l *tasklog.Logger 39 } 40 41 // RewriteOptions is an options type given to the Rewrite() function. 42 type RewriteOptions struct { 43 // Include is the list of refs of which commits reachable by that ref 44 // will be included. 45 Include []string 46 // Exclude is the list of refs of which commits reachable by that ref 47 // will be excluded. 48 Exclude []string 49 50 // UpdateRefs specifies whether the Rewriter should move refs from the 51 // original graph onto the migrated one. If true, the refs will be 52 // moved, and a reflog entry will be created. 53 UpdateRefs bool 54 55 // Verbose mode prints migrated objects. 56 Verbose bool 57 58 // ObjectMapFilePath is the path to the map of old sha1 to new sha1 59 // commits 60 ObjectMapFilePath string 61 62 // BlobFn specifies a function to rewrite blobs. 63 // 64 // It is called once per unique, unchanged path. That is to say, if 65 // /a/foo and /a/bar contain identical contents, the BlobFn will be 66 // called twice: once for /a/foo and once for /a/bar, but no more on 67 // each blob for subsequent revisions, so long as each entry remains 68 // unchanged. 69 BlobFn BlobRewriteFn 70 // TreePreCallbackFn specifies a function to be called before opening a 71 // tree for rewriting. It will be called on all trees throughout history 72 // in topological ordering through the tree, starting at the root. 73 TreePreCallbackFn TreePreCallbackFn 74 // TreeCallbackFn specifies a function to rewrite trees after they have 75 // been reassembled by calling the above BlobFn on all existing tree 76 // entries. 77 TreeCallbackFn TreeCallbackFn 78 } 79 80 // blobFn returns a useable BlobRewriteFn, either the one that was given in the 81 // *RewriteOptions, or a noopBlobFn. 82 func (r *RewriteOptions) blobFn() BlobRewriteFn { 83 if r.BlobFn == nil { 84 return noopBlobFn 85 } 86 return r.BlobFn 87 } 88 89 // treePreFn returns a useable TreePreCallbackFn, either the one that was given 90 // in the *RewriteOptions, or a noopTreePreFn. 91 func (r *RewriteOptions) treePreFn() TreePreCallbackFn { 92 if r.TreePreCallbackFn == nil { 93 return noopTreePreFn 94 } 95 return r.TreePreCallbackFn 96 } 97 98 // treeFn returns a useable TreeRewriteFn, either the one that was given in the 99 // *RewriteOptions, or a noopTreeFn. 100 func (r *RewriteOptions) treeFn() TreeCallbackFn { 101 if r.TreeCallbackFn == nil { 102 return noopTreeFn 103 } 104 return r.TreeCallbackFn 105 } 106 107 // BlobRewriteFn is a mapping function that takes a given blob and returns a 108 // new, modified blob. If it returns an error, the new blob will not be written 109 // and instead the error will be returned from the Rewrite() function. 110 // 111 // Invocations of an instance of BlobRewriteFn are not expected to store the 112 // returned blobs in the *git/gitobj.ObjectDatabase. 113 // 114 // The path argument is given to be an absolute path to the tree entry being 115 // rewritten, where the repository root is the root of the path given. For 116 // instance, a file "b.txt" in directory "dir" would be given as "/dir/b.txt", 117 // where as a file "a.txt" in the root would be given as "/a.txt". 118 // 119 // As above, the path separators are OS specific, and equivalent to the result 120 // of filepath.Join(...) or os.PathSeparator. 121 type BlobRewriteFn func(path string, b *gitobj.Blob) (*gitobj.Blob, error) 122 123 // TreePreCallbackFn specifies a function to call upon opening a new tree for 124 // rewriting. 125 // 126 // Unlike its sibling TreeCallbackFn, TreePreCallbackFn may not modify the given 127 // tree. 128 // 129 // TreePreCallbackFn can be nil, and will therefore exhibit behavior equivalent 130 // to only calling the BlobFn on existing tree entries. 131 // 132 // If the TreePreCallbackFn returns an error, it will be returned from the 133 // Rewrite() invocation. 134 type TreePreCallbackFn func(path string, t *gitobj.Tree) error 135 136 // TreeCallbackFn specifies a function to call before writing a re-written tree 137 // to the object database. The TreeCallbackFn can return a modified tree to be 138 // written to the object database instead of one generated from calling BlobFn 139 // on all of the tree entries. 140 // 141 // 142 // TreeCallbackFn can be nil, and will therefore exhibit behavior equivalent to 143 // only calling the BlobFn on existing tree entries. 144 // 145 // If the TreeCallbackFn returns an error, it will be returned from the 146 // Rewrite() invocation. 147 type TreeCallbackFn func(path string, t *gitobj.Tree) (*gitobj.Tree, error) 148 149 type rewriterOption func(*Rewriter) 150 151 var ( 152 // WithFilter is an optional argument given to the NewRewriter 153 // constructor function to limit invocations of the BlobRewriteFn to 154 // only pathspecs that match the given *filepathfilter.Filter. 155 WithFilter = func(filter *filepathfilter.Filter) rewriterOption { 156 return func(r *Rewriter) { 157 r.filter = filter 158 } 159 } 160 161 // WithLoggerto logs updates caused by the *git/githistory.Rewriter to 162 // the given io.Writer "sink". 163 WithLoggerTo = func(sink io.Writer) rewriterOption { 164 return WithLogger(tasklog.NewLogger(sink)) 165 } 166 167 // WithLogger logs updates caused by the *git/githistory.Rewriter to the 168 // be given to the provided logger, "l". 169 WithLogger = func(l *tasklog.Logger) rewriterOption { 170 return func(r *Rewriter) { 171 r.l = l 172 } 173 } 174 175 // noopBlobFn is a no-op implementation of the BlobRewriteFn. It returns 176 // the blob that it was given, and returns no error. 177 noopBlobFn = func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { return b, nil } 178 // noopTreePreFn is a no-op implementation of the TreePreRewriteFn. It 179 // returns the tree that it was given, and returns no error. 180 noopTreePreFn = func(path string, t *gitobj.Tree) error { return nil } 181 // noopTreeFn is a no-op implementation of the TreeRewriteFn. It returns 182 // the tree that it was given, and returns no error. 183 noopTreeFn = func(path string, t *gitobj.Tree) (*gitobj.Tree, error) { return t, nil } 184 ) 185 186 // NewRewriter constructs a *Rewriter from the given *ObjectDatabase instance. 187 func NewRewriter(db *gitobj.ObjectDatabase, opts ...rewriterOption) *Rewriter { 188 rewriter := &Rewriter{ 189 mu: new(sync.Mutex), 190 entries: make(map[string]*gitobj.TreeEntry), 191 commits: make(map[string][]byte), 192 193 db: db, 194 } 195 196 for _, opt := range opts { 197 opt(rewriter) 198 } 199 return rewriter 200 } 201 202 // Rewrite rewrites the range of commits given by *RewriteOptions.{Left,Right} 203 // using the BlobRewriteFn to rewrite the individual blobs. 204 func (r *Rewriter) Rewrite(opt *RewriteOptions) ([]byte, error) { 205 // First, obtain a list of commits to rewrite. 206 commits, err := r.commitsToMigrate(opt) 207 if err != nil { 208 return nil, err 209 } 210 211 var perc *tasklog.PercentageTask 212 if opt.UpdateRefs { 213 perc = r.l.Percentage("migrate: Rewriting commits", uint64(len(commits))) 214 } else { 215 perc = r.l.Percentage("migrate: Examining commits", uint64(len(commits))) 216 } 217 218 var vPerc *tasklog.PercentageTask 219 if opt.Verbose { 220 vPerc = perc 221 } 222 223 var objectMapFile *os.File 224 if len(opt.ObjectMapFilePath) > 0 { 225 objectMapFile, err = os.OpenFile(opt.ObjectMapFilePath, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0666) 226 if err != nil { 227 return nil, fmt.Errorf("Could not create object map file: %v", err) 228 } 229 defer objectMapFile.Close() 230 } 231 232 // Keep track of the last commit that we rewrote. Callers often want 233 // this so that they can perform a git-update-ref(1). 234 var tip []byte 235 for _, oid := range commits { 236 // Load the original commit to access the data necessary in 237 // order to rewrite it. 238 original, err := r.db.Commit(oid) 239 if err != nil { 240 return nil, err 241 } 242 243 // Rewrite the tree given at that commit. 244 rewrittenTree, err := r.rewriteTree(oid, original.TreeID, "", opt.blobFn(), opt.treePreFn(), opt.treeFn(), vPerc) 245 if err != nil { 246 return nil, err 247 } 248 249 // Create a new list of parents from the original commit to 250 // point at the rewritten parents in order to create a 251 // topologically equivalent DAG. 252 // 253 // This operation is safe since we are visiting the commits in 254 // reverse topological order and therefore have seen all parents 255 // before children (in other words, r.uncacheCommit(...) will 256 // always return a value, if the prospective parent is a part of 257 // the migration). 258 rewrittenParents := make([][]byte, 0, len(original.ParentIDs)) 259 for _, originalParent := range original.ParentIDs { 260 rewrittenParent, ok := r.uncacheCommit(originalParent) 261 if !ok { 262 // If we haven't seen the parent before, this 263 // means that we're doing a partial migration 264 // and the parent that we're looking for isn't 265 // included. 266 // 267 // Use the original parent to properly link 268 // history across the migration boundary. 269 rewrittenParent = originalParent 270 } 271 272 rewrittenParents = append(rewrittenParents, rewrittenParent) 273 } 274 275 // Construct a new commit using the original header information, 276 // but the rewritten set of parents as well as root tree. 277 rewrittenCommit := &gitobj.Commit{ 278 Author: original.Author, 279 Committer: original.Committer, 280 ExtraHeaders: original.ExtraHeaders, 281 Message: original.Message, 282 283 ParentIDs: rewrittenParents, 284 TreeID: rewrittenTree, 285 } 286 287 var newSha []byte 288 289 if original.Equal(rewrittenCommit) { 290 newSha = make([]byte, len(oid)) 291 copy(newSha, oid) 292 } else { 293 newSha, err = r.db.WriteCommit(rewrittenCommit) 294 if err != nil { 295 return nil, err 296 } 297 if objectMapFile != nil { 298 if _, err := fmt.Fprintf(objectMapFile, "%x,%x\n", oid, newSha); err != nil { 299 return nil, err 300 } 301 } 302 } 303 304 // Cache that commit so that we can reassign children of this 305 // commit. 306 r.cacheCommit(oid, newSha) 307 308 // Increment the percentage displayed in the terminal. 309 perc.Count(1) 310 311 // Move the tip forward. 312 tip = newSha 313 } 314 315 if opt.UpdateRefs { 316 refs, err := r.refsToMigrate() 317 if err != nil { 318 return nil, errors.Wrap(err, "could not find refs to update") 319 } 320 321 root, _ := r.db.Root() 322 323 updater := &refUpdater{ 324 CacheFn: r.uncacheCommit, 325 Logger: r.l, 326 Refs: refs, 327 Root: root, 328 329 db: r.db, 330 } 331 332 if err := updater.UpdateRefs(); err != nil { 333 return nil, errors.Wrap(err, "could not update refs") 334 } 335 } 336 337 return tip, err 338 } 339 340 // rewriteTree is a recursive function which rewrites a tree given by the ID 341 // "sha" and path "path". It uses the given BlobRewriteFn to rewrite all blobs 342 // within the tree, either calling that function or recurring down into subtrees 343 // by re-assigning the SHA. 344 // 345 // Once it is done assembling the entries in a given subtree, it then calls the 346 // TreeCallbackFn, "tfn" to perform a final traversal of the subtree before 347 // saving it to the object database. 348 // 349 // It returns the new SHA of the rewritten tree, or an error if the tree was 350 // unable to be rewritten. 351 func (r *Rewriter) rewriteTree(commitOID []byte, treeOID []byte, path string, 352 fn BlobRewriteFn, tpfn TreePreCallbackFn, tfn TreeCallbackFn, 353 perc *tasklog.PercentageTask) ([]byte, error) { 354 355 tree, err := r.db.Tree(treeOID) 356 if err != nil { 357 return nil, err 358 } 359 360 if err := tpfn("/"+path, tree); err != nil { 361 return nil, err 362 } 363 364 entries := make([]*gitobj.TreeEntry, 0, len(tree.Entries)) 365 for _, entry := range tree.Entries { 366 var fullpath string 367 if len(path) > 0 { 368 fullpath = strings.Join([]string{path, entry.Name}, "/") 369 } else { 370 fullpath = entry.Name 371 } 372 373 if !r.allows(entry.Type(), fullpath) { 374 entries = append(entries, copyEntry(entry)) 375 continue 376 } 377 378 // If this is a symlink, skip it 379 if entry.Filemode == 0120000 { 380 entries = append(entries, copyEntry(entry)) 381 continue 382 } 383 384 if cached := r.uncacheEntry(entry); cached != nil { 385 entries = append(entries, copyEntry(cached)) 386 continue 387 } 388 389 var oid []byte 390 391 switch entry.Type() { 392 case gitobj.BlobObjectType: 393 oid, err = r.rewriteBlob(commitOID, entry.Oid, fullpath, fn, perc) 394 case gitobj.TreeObjectType: 395 oid, err = r.rewriteTree(commitOID, entry.Oid, fullpath, fn, tpfn, tfn, perc) 396 default: 397 oid = entry.Oid 398 399 } 400 if err != nil { 401 return nil, err 402 } 403 404 entries = append(entries, r.cacheEntry(entry, &gitobj.TreeEntry{ 405 Filemode: entry.Filemode, 406 Name: entry.Name, 407 Oid: oid, 408 })) 409 } 410 411 rewritten, err := tfn("/"+path, &gitobj.Tree{Entries: entries}) 412 if err != nil { 413 return nil, err 414 } 415 416 if tree.Equal(rewritten) { 417 return treeOID, nil 418 } 419 return r.db.WriteTree(rewritten) 420 } 421 422 func copyEntry(e *gitobj.TreeEntry) *gitobj.TreeEntry { 423 if e == nil { 424 return nil 425 } 426 427 oid := make([]byte, len(e.Oid)) 428 copy(oid, e.Oid) 429 430 return &gitobj.TreeEntry{ 431 Filemode: e.Filemode, 432 Name: e.Name, 433 Oid: oid, 434 } 435 } 436 437 func (r *Rewriter) allows(typ gitobj.ObjectType, abs string) bool { 438 switch typ { 439 case gitobj.BlobObjectType: 440 return r.Filter().Allows(strings.TrimPrefix(abs, "/")) 441 case gitobj.CommitObjectType, gitobj.TreeObjectType: 442 return true 443 default: 444 panic(fmt.Sprintf("git/githistory: unknown entry type: %s", typ)) 445 } 446 } 447 448 // rewriteBlob calls the given BlobRewriteFn "fn" on a blob given in the object 449 // database by the SHA1 "from" []byte. It writes and returns the new blob SHA, 450 // or an error if either the BlobRewriteFn returned one, or if the object could 451 // not be loaded/saved. 452 func (r *Rewriter) rewriteBlob(commitOID, from []byte, path string, fn BlobRewriteFn, perc *tasklog.PercentageTask) ([]byte, error) { 453 blob, err := r.db.Blob(from) 454 if err != nil { 455 return nil, err 456 } 457 458 b, err := fn(path, blob) 459 if err != nil { 460 return nil, err 461 } 462 463 if !blob.Equal(b) { 464 sha, err := r.db.WriteBlob(b) 465 if err != nil { 466 return nil, err 467 } 468 469 // Close the source blob, so long as it is not equal to the 470 // rewritten blob. If the two are equal, as in the check above 471 // this comment, calling r.db.WriteBlob(b) will have already 472 // closed both "b" and "blob" since they are the same. 473 // 474 // Closing an *os.File twice causes an `os.ErrInvalid` to be 475 // returned. 476 if err = blob.Close(); err != nil { 477 return nil, err 478 } 479 480 if perc != nil { 481 perc.Entry(fmt.Sprintf("migrate: commit %s: %s", hex.EncodeToString(commitOID), path)) 482 } 483 484 return sha, nil 485 } 486 487 // Close the source blob, since it is identical to the rewritten blob, 488 // but neither were written. 489 if err := blob.Close(); err != nil { 490 return nil, err 491 } 492 return from, nil 493 } 494 495 // commitsToMigrate returns an in-memory copy of a list of commits according to 496 // the output of git-rev-list(1) (given the *RewriteOptions), where each 497 // outputted commit is 20 bytes of raw SHA1. 498 // 499 // If any error was encountered, it will be returned. 500 func (r *Rewriter) commitsToMigrate(opt *RewriteOptions) ([][]byte, error) { 501 waiter := r.l.Waiter("migrate: Sorting commits") 502 defer waiter.Complete() 503 504 scanner, err := git.NewRevListScanner( 505 opt.Include, opt.Exclude, r.scannerOpts()) 506 if err != nil { 507 return nil, err 508 } 509 510 var commits [][]byte 511 for scanner.Scan() { 512 commits = append(commits, scanner.OID()) 513 } 514 515 if err = scanner.Err(); err != nil { 516 return nil, err 517 } 518 if err = scanner.Close(); err != nil { 519 return nil, err 520 } 521 return commits, nil 522 } 523 524 // refsToMigrate returns a list of references to migrate, or an error if loading 525 // those references failed. 526 func (r *Rewriter) refsToMigrate() ([]*git.Ref, error) { 527 var refs []*git.Ref 528 var err error 529 530 if root, ok := r.db.Root(); ok { 531 refs, err = git.AllRefsIn(root) 532 } else { 533 refs, err = git.AllRefs() 534 } 535 536 if err != nil { 537 return nil, err 538 } 539 540 var local []*git.Ref 541 for _, ref := range refs { 542 if ref.Type == git.RefTypeRemoteBranch || ref.Type == git.RefTypeRemoteTag { 543 continue 544 } 545 546 local = append(local, ref) 547 } 548 549 return local, nil 550 } 551 552 // scannerOpts returns a *git.ScanRefsOptions instance to be given to the 553 // *git.RevListScanner. 554 // 555 // If the database this *Rewriter is operating in a given root (not in memory) 556 // it re-assigns the working directory to be there. 557 func (r *Rewriter) scannerOpts() *git.ScanRefsOptions { 558 opts := &git.ScanRefsOptions{ 559 Mode: git.ScanRefsMode, 560 Order: git.TopoRevListOrder, 561 Reverse: true, 562 CommitsOnly: true, 563 564 SkippedRefs: make([]string, 0), 565 Mutex: new(sync.Mutex), 566 Names: make(map[string]string), 567 } 568 569 if root, ok := r.db.Root(); ok { 570 opts.WorkingDir = root 571 } 572 return opts 573 } 574 575 // Filter returns the filter used by this *Rewriter to filter subtrees, blobs 576 // (see above). 577 func (r *Rewriter) Filter() *filepathfilter.Filter { 578 return r.filter 579 } 580 581 // cacheEntry caches then given "from" entry so that it is always rewritten as 582 // a *TreeEntry equivalent to "to". 583 func (r *Rewriter) cacheEntry(from, to *gitobj.TreeEntry) *gitobj.TreeEntry { 584 r.mu.Lock() 585 defer r.mu.Unlock() 586 587 r.entries[r.entryKey(from)] = to 588 589 return to 590 } 591 592 // uncacheEntry returns a *TreeEntry that is cached from the given *TreeEntry 593 // "from". That is to say, it returns the *TreeEntry that "from" should be 594 // rewritten to, or nil if none could be found. 595 func (r *Rewriter) uncacheEntry(from *gitobj.TreeEntry) *gitobj.TreeEntry { 596 r.mu.Lock() 597 defer r.mu.Unlock() 598 599 return r.entries[r.entryKey(from)] 600 } 601 602 // entryKey returns a unique key for a given *TreeEntry "e". 603 func (r *Rewriter) entryKey(e *gitobj.TreeEntry) string { 604 return fmt.Sprintf("%s:%x", e.Name, e.Oid) 605 } 606 607 // cacheEntry caches then given "from" commit so that it is always rewritten as 608 // a *git/gitobj.Commit equivalent to "to". 609 func (r *Rewriter) cacheCommit(from, to []byte) { 610 r.mu.Lock() 611 defer r.mu.Unlock() 612 613 r.commits[hex.EncodeToString(from)] = to 614 } 615 616 // uncacheCommit returns a *git/gitobj.Commit that is cached from the given 617 // *git/gitobj.Commit "from". That is to say, it returns the *git/gitobj.Commit that 618 // "from" should be rewritten to and true, or nil and false if none could be 619 // found. 620 func (r *Rewriter) uncacheCommit(from []byte) ([]byte, bool) { 621 r.mu.Lock() 622 defer r.mu.Unlock() 623 624 c, ok := r.commits[hex.EncodeToString(from)] 625 return c, ok 626 }