github.com/stffabi/git-lfs@v2.3.5-0.20180214015214-8eeaa8d88902+incompatible/git/githistory/rewriter.go (about)

     1  package githistory
     2  
     3  import (
     4  	"encoding/hex"
     5  	"fmt"
     6  	"io"
     7  	"os"
     8  	"path/filepath"
     9  	"sync"
    10  
    11  	"github.com/git-lfs/git-lfs/errors"
    12  	"github.com/git-lfs/git-lfs/filepathfilter"
    13  	"github.com/git-lfs/git-lfs/git"
    14  	"github.com/git-lfs/git-lfs/git/odb"
    15  	"github.com/git-lfs/git-lfs/tasklog"
    16  )
    17  
// Rewriter allows rewriting topologically equivalent Git histories
// between two revisions.
//
// Instances are created with NewRewriter and driven through Rewrite. While a
// rewrite runs, the Rewriter remembers which tree entries and commits it has
// already rewritten so that later commits can reuse those results.
type Rewriter struct {
	// mu guards entries and commits (see below). Every cache accessor
	// (cacheEntry, uncacheEntry, cacheCommit, uncacheCommit) takes it.
	mu *sync.Mutex
	// entries is a mapping of old tree entries to new (rewritten) ones.
	// Since TreeEntry contains a []byte (and is therefore not a key-able
	// type), a unique TreeEntry -> string function (entryKey) is used for
	// map keys.
	entries map[string]*odb.TreeEntry
	// commits is a mapping of old commit SHAs to new ones, where the ASCII
	// hex encoding of the SHA1 values are used as map keys.
	commits map[string][]byte
	// filter is an optional value used to specify which tree entries
	// (blobs, subtrees) are modifiable given a BlobFn. If non-nil, this
	// filter will cull out any unmodifiable subtrees and blobs.
	//
	// It is set through the WithFilter option and read through Filter().
	filter *filepathfilter.Filter
	// db is the *ObjectDatabase from which blobs, commits, and trees are
	// loaded, and to which rewritten objects are written.
	db *odb.ObjectDatabase
	// l is the *tasklog.Logger to which updates are written. It is set
	// through the WithLogger / WithLoggerTo options.
	l *tasklog.Logger
}
    40  
// RewriteOptions is an options type given to the Rewrite() function.
type RewriteOptions struct {
	// Include is the list of refs of which commits reachable by that ref
	// will be included. It is handed to git.NewRevListScanner unchanged.
	Include []string
	// Exclude is the list of refs of which commits reachable by that ref
	// will be excluded. It is handed to git.NewRevListScanner unchanged.
	Exclude []string

	// UpdateRefs specifies whether the Rewriter should move refs from the
	// original graph onto the migrated one. If true, the refs will be
	// moved, and a reflog entry will be created.
	//
	// It also selects the progress label used by Rewrite(): "Rewriting
	// commits" when true, "Examining commits" otherwise.
	UpdateRefs bool

	// Verbose mode prints migrated objects: when set, each blob that was
	// actually changed is reported as an entry on the progress task.
	Verbose bool

	// BlobFn specifies a function to rewrite blobs.
	//
	// It is called once per unique, unchanged path. That is to say, if
	// /a/foo and /a/bar contain identical contents, the BlobFn will be
	// called twice: once for /a/foo and once for /a/bar, but no more on
	// each blob for subsequent revisions, so long as each entry remains
	// unchanged.
	//
	// A nil BlobFn is treated as a no-op that returns every blob as-is.
	BlobFn BlobRewriteFn
	// TreeCallbackFn specifies a function to rewrite trees after they have
	// been reassembled by calling the above BlobFn on all existing tree
	// entries.
	//
	// A nil TreeCallbackFn is treated as a no-op that returns every tree
	// as-is.
	TreeCallbackFn TreeCallbackFn
}
    71  
    72  // blobFn returns a useable BlobRewriteFn, either the one that was given in the
    73  // *RewriteOptions, or a noopBlobFn.
    74  func (r *RewriteOptions) blobFn() BlobRewriteFn {
    75  	if r.BlobFn == nil {
    76  		return noopBlobFn
    77  	}
    78  	return r.BlobFn
    79  }
    80  
    81  // treeFn returns a useable TreeRewriteFn, either the one that was given in the
    82  // *RewriteOptions, or a noopTreeFn.
    83  func (r *RewriteOptions) treeFn() TreeCallbackFn {
    84  	if r.TreeCallbackFn == nil {
    85  		return noopTreeFn
    86  	}
    87  	return r.TreeCallbackFn
    88  }
    89  
// BlobRewriteFn is a mapping function that takes a given blob and returns a
// new, modified blob. If it returns an error, the new blob will not be written
// and instead the error will be returned from the Rewrite() function.
//
// Invocations of an instance of BlobRewriteFn are not expected to store the
// returned blobs in the *git/odb.ObjectDatabase; the Rewriter writes any blob
// that differs from the one it passed in.
//
// The path argument identifies the tree entry being rewritten relative to the
// repository root. For instance, a file "b.txt" in directory "dir" is
// described by the path to "dir" followed by "b.txt".
//
// NOTE(review): earlier wording promised a leading separator ("/dir/b.txt",
// "/a.txt"). As invoked from (*Rewriter).rewriteTree, the path is assembled
// with filepath.Join starting from an empty root, so it carries no leading
// separator ("dir/b.txt", "a.txt"). Confirm which form callers rely on before
// changing either side.
//
// As above, the path separators are OS specific, and equivalent to the result
// of filepath.Join(...) or os.PathSeparator.
type BlobRewriteFn func(path string, b *odb.Blob) (*odb.Blob, error)
   105  
// TreeCallbackFn specifies a function to call before writing a re-written tree
// to the object database. The TreeCallbackFn can return a modified tree to be
// written to the object database instead of one generated from calling BlobFn
// on all of the tree entries.
//
// The path argument is the location of the tree within the repository,
// prefixed with os.PathSeparator: the root tree is given as the separator
// alone, and a subtree "dir" as the separator followed by "dir".
//
// Trees returned from a TreeCallbackFn MUST have all objects referenced in the
// entryset already written to the object database.
//
// TreeCallbackFn can be nil, and will therefore exhibit behavior equivalent to
// only calling the BlobFn on existing tree entries.
//
// If the returned tree is equal to the original tree, nothing is written and
// the original tree's ID is reused.
//
// If the TreeCallbackFn returns an error, it will be returned from the
// Rewrite() invocation.
type TreeCallbackFn func(path string, t *odb.Tree) (*odb.Tree, error)
   120  
// rewriterOption is a functional option applied to a *Rewriter by NewRewriter
// after the Rewriter's required fields have been initialized.
type rewriterOption func(*Rewriter)
   122  
   123  var (
   124  	// WithFilter is an optional argument given to the NewRewriter
   125  	// constructor function to limit invocations of the BlobRewriteFn to
   126  	// only pathspecs that match the given *filepathfilter.Filter.
   127  	WithFilter = func(filter *filepathfilter.Filter) rewriterOption {
   128  		return func(r *Rewriter) {
   129  			r.filter = filter
   130  		}
   131  	}
   132  
   133  	// WithLoggerto logs updates caused by the *git/githistory.Rewriter to
   134  	// the given io.Writer "sink".
   135  	WithLoggerTo = func(sink io.Writer) rewriterOption {
   136  		return WithLogger(tasklog.NewLogger(sink))
   137  	}
   138  
   139  	// WithLogger logs updates caused by the *git/githistory.Rewriter to the
   140  	// be given to the provided logger, "l".
   141  	WithLogger = func(l *tasklog.Logger) rewriterOption {
   142  		return func(r *Rewriter) {
   143  			r.l = l
   144  		}
   145  	}
   146  
   147  	// noopBlobFn is a no-op implementation of the BlobRewriteFn. It returns
   148  	// the blob that it was given, and returns no error.
   149  	noopBlobFn = func(path string, b *odb.Blob) (*odb.Blob, error) { return b, nil }
   150  	// noopTreeFn is a no-op implementation of the TreeRewriteFn. It returns
   151  	// the tree that it was given, and returns no error.
   152  	noopTreeFn = func(path string, t *odb.Tree) (*odb.Tree, error) { return t, nil }
   153  )
   154  
   155  // NewRewriter constructs a *Rewriter from the given *ObjectDatabase instance.
   156  func NewRewriter(db *odb.ObjectDatabase, opts ...rewriterOption) *Rewriter {
   157  	rewriter := &Rewriter{
   158  		mu:      new(sync.Mutex),
   159  		entries: make(map[string]*odb.TreeEntry),
   160  		commits: make(map[string][]byte),
   161  
   162  		db: db,
   163  	}
   164  
   165  	for _, opt := range opts {
   166  		opt(rewriter)
   167  	}
   168  	return rewriter
   169  }
   170  
   171  // Rewrite rewrites the range of commits given by *RewriteOptions.{Left,Right}
   172  // using the BlobRewriteFn to rewrite the individual blobs.
   173  func (r *Rewriter) Rewrite(opt *RewriteOptions) ([]byte, error) {
   174  	// First, obtain a list of commits to rewrite.
   175  	commits, err := r.commitsToMigrate(opt)
   176  	if err != nil {
   177  		return nil, err
   178  	}
   179  
   180  	var perc *tasklog.PercentageTask
   181  	if opt.UpdateRefs {
   182  		perc = r.l.Percentage("migrate: Rewriting commits", uint64(len(commits)))
   183  	} else {
   184  		perc = r.l.Percentage("migrate: Examining commits", uint64(len(commits)))
   185  	}
   186  
   187  	var vPerc *tasklog.PercentageTask
   188  	if opt.Verbose {
   189  		vPerc = perc
   190  	}
   191  
   192  	// Keep track of the last commit that we rewrote. Callers often want
   193  	// this so that they can perform a git-update-ref(1).
   194  	var tip []byte
   195  	for _, oid := range commits {
   196  		// Load the original commit to access the data necessary in
   197  		// order to rewrite it.
   198  		original, err := r.db.Commit(oid)
   199  		if err != nil {
   200  			return nil, err
   201  		}
   202  
   203  		// Rewrite the tree given at that commit.
   204  		rewrittenTree, err := r.rewriteTree(oid, original.TreeID, "", opt.blobFn(), opt.treeFn(), vPerc)
   205  		if err != nil {
   206  			return nil, err
   207  		}
   208  
   209  		// Create a new list of parents from the original commit to
   210  		// point at the rewritten parents in order to create a
   211  		// topologically equivalent DAG.
   212  		//
   213  		// This operation is safe since we are visiting the commits in
   214  		// reverse topological order and therefore have seen all parents
   215  		// before children (in other words, r.uncacheCommit(...) will
   216  		// always return a value, if the prospective parent is a part of
   217  		// the migration).
   218  		rewrittenParents := make([][]byte, 0, len(original.ParentIDs))
   219  		for _, originalParent := range original.ParentIDs {
   220  			rewrittenParent, ok := r.uncacheCommit(originalParent)
   221  			if !ok {
   222  				// If we haven't seen the parent before, this
   223  				// means that we're doing a partial migration
   224  				// and the parent that we're looking for isn't
   225  				// included.
   226  				//
   227  				// Use the original parent to properly link
   228  				// history across the migration boundary.
   229  				rewrittenParent = originalParent
   230  			}
   231  
   232  			rewrittenParents = append(rewrittenParents, rewrittenParent)
   233  		}
   234  
   235  		// Construct a new commit using the original header information,
   236  		// but the rewritten set of parents as well as root tree.
   237  		rewrittenCommit := &odb.Commit{
   238  			Author:       original.Author,
   239  			Committer:    original.Committer,
   240  			ExtraHeaders: original.ExtraHeaders,
   241  			Message:      original.Message,
   242  
   243  			ParentIDs: rewrittenParents,
   244  			TreeID:    rewrittenTree,
   245  		}
   246  
   247  		var newSha []byte
   248  
   249  		if original.Equal(rewrittenCommit) {
   250  			newSha = make([]byte, len(oid))
   251  			copy(newSha, oid)
   252  		} else {
   253  			newSha, err = r.db.WriteCommit(rewrittenCommit)
   254  			if err != nil {
   255  				return nil, err
   256  			}
   257  		}
   258  
   259  		// Cache that commit so that we can reassign children of this
   260  		// commit.
   261  		r.cacheCommit(oid, newSha)
   262  
   263  		// Increment the percentage displayed in the terminal.
   264  		perc.Count(1)
   265  
   266  		// Move the tip forward.
   267  		tip = newSha
   268  	}
   269  
   270  	if opt.UpdateRefs {
   271  		refs, err := r.refsToMigrate()
   272  		if err != nil {
   273  			return nil, errors.Wrap(err, "could not find refs to update")
   274  		}
   275  
   276  		root, _ := r.db.Root()
   277  
   278  		updater := &refUpdater{
   279  			CacheFn: r.uncacheCommit,
   280  			Logger:  r.l,
   281  			Refs:    refs,
   282  			Root:    root,
   283  
   284  			db: r.db,
   285  		}
   286  
   287  		if err := updater.UpdateRefs(); err != nil {
   288  			return nil, errors.Wrap(err, "could not update refs")
   289  		}
   290  	}
   291  
   292  	return tip, err
   293  }
   294  
   295  // rewriteTree is a recursive function which rewrites a tree given by the ID
   296  // "sha" and path "path". It uses the given BlobRewriteFn to rewrite all blobs
   297  // within the tree, either calling that function or recurring down into subtrees
   298  // by re-assigning the SHA.
   299  //
   300  // Once it is done assembling the entries in a given subtree, it then calls the
   301  // TreeCallbackFn, "tfn" to perform a final traversal of the subtree before
   302  // saving it to the object database.
   303  //
   304  // It returns the new SHA of the rewritten tree, or an error if the tree was
   305  // unable to be rewritten.
   306  func (r *Rewriter) rewriteTree(commitOID []byte, treeOID []byte, path string, fn BlobRewriteFn, tfn TreeCallbackFn, perc *tasklog.PercentageTask) ([]byte, error) {
   307  	tree, err := r.db.Tree(treeOID)
   308  	if err != nil {
   309  		return nil, err
   310  	}
   311  
   312  	entries := make([]*odb.TreeEntry, 0, len(tree.Entries))
   313  	for _, entry := range tree.Entries {
   314  		path := filepath.Join(path, entry.Name)
   315  
   316  		if !r.allows(entry.Type(), path) {
   317  			entries = append(entries, entry)
   318  			continue
   319  		}
   320  
   321  		if cached := r.uncacheEntry(entry); cached != nil {
   322  			entries = append(entries, cached)
   323  			continue
   324  		}
   325  
   326  		var oid []byte
   327  
   328  		switch entry.Type() {
   329  		case odb.BlobObjectType:
   330  			oid, err = r.rewriteBlob(commitOID, entry.Oid, path, fn, perc)
   331  		case odb.TreeObjectType:
   332  			oid, err = r.rewriteTree(commitOID, entry.Oid, path, fn, tfn, perc)
   333  		default:
   334  			oid = entry.Oid
   335  
   336  		}
   337  		if err != nil {
   338  			return nil, err
   339  		}
   340  
   341  		entries = append(entries, r.cacheEntry(entry, &odb.TreeEntry{
   342  			Filemode: entry.Filemode,
   343  			Name:     entry.Name,
   344  			Oid:      oid,
   345  		}))
   346  	}
   347  
   348  	rewritten, err := tfn(string(os.PathSeparator)+path, &odb.Tree{Entries: entries})
   349  	if err != nil {
   350  		return nil, err
   351  	}
   352  
   353  	if tree.Equal(rewritten) {
   354  		return treeOID, nil
   355  	}
   356  	return r.db.WriteTree(rewritten)
   357  }
   358  
   359  func (r *Rewriter) allows(typ odb.ObjectType, abs string) bool {
   360  	switch typ {
   361  	case odb.BlobObjectType:
   362  		return r.Filter().Allows(abs)
   363  	case odb.TreeObjectType:
   364  		return r.Filter().HasPrefix(abs)
   365  	case odb.CommitObjectType:
   366  		return true
   367  	default:
   368  		panic(fmt.Sprintf("git/githistory: unknown entry type: %s", typ))
   369  	}
   370  }
   371  
   372  // rewriteBlob calls the given BlobRewriteFn "fn" on a blob given in the object
   373  // database by the SHA1 "from" []byte. It writes and returns the new blob SHA,
   374  // or an error if either the BlobRewriteFn returned one, or if the object could
   375  // not be loaded/saved.
   376  func (r *Rewriter) rewriteBlob(commitOID, from []byte, path string, fn BlobRewriteFn, perc *tasklog.PercentageTask) ([]byte, error) {
   377  	blob, err := r.db.Blob(from)
   378  	if err != nil {
   379  		return nil, err
   380  	}
   381  
   382  	b, err := fn(path, blob)
   383  	if err != nil {
   384  		return nil, err
   385  	}
   386  
   387  	if !blob.Equal(b) {
   388  		sha, err := r.db.WriteBlob(b)
   389  		if err != nil {
   390  			return nil, err
   391  		}
   392  
   393  		// Close the source blob, so long as it is not equal to the
   394  		// rewritten blob. If the two are equal, as in the check above
   395  		// this comment, calling r.db.WriteBlob(b) will have already
   396  		// closed both "b" and "blob" since they are the same.
   397  		//
   398  		// Closing an *os.File twice causes an `os.ErrInvalid` to be
   399  		// returned.
   400  		if err = blob.Close(); err != nil {
   401  			return nil, err
   402  		}
   403  
   404  		if perc != nil {
   405  			perc.Entry(fmt.Sprintf("migrate: commit %s: %s", hex.EncodeToString(commitOID), path))
   406  		}
   407  
   408  		return sha, nil
   409  	}
   410  
   411  	// Close the source blob, since it is identical to the rewritten blob,
   412  	// but neither were written.
   413  	if err := blob.Close(); err != nil {
   414  		return nil, err
   415  	}
   416  	return from, nil
   417  }
   418  
   419  // commitsToMigrate returns an in-memory copy of a list of commits according to
   420  // the output of git-rev-list(1) (given the *RewriteOptions), where each
   421  // outputted commit is 20 bytes of raw SHA1.
   422  //
   423  // If any error was encountered, it will be returned.
   424  func (r *Rewriter) commitsToMigrate(opt *RewriteOptions) ([][]byte, error) {
   425  	waiter := r.l.Waiter("migrate: Sorting commits")
   426  	defer waiter.Complete()
   427  
   428  	scanner, err := git.NewRevListScanner(
   429  		opt.Include, opt.Exclude, r.scannerOpts())
   430  	if err != nil {
   431  		return nil, err
   432  	}
   433  
   434  	var commits [][]byte
   435  	for scanner.Scan() {
   436  		commits = append(commits, scanner.OID())
   437  	}
   438  
   439  	if err = scanner.Err(); err != nil {
   440  		return nil, err
   441  	}
   442  	if err = scanner.Close(); err != nil {
   443  		return nil, err
   444  	}
   445  	return commits, nil
   446  }
   447  
   448  // refsToMigrate returns a list of references to migrate, or an error if loading
   449  // those references failed.
   450  func (r *Rewriter) refsToMigrate() ([]*git.Ref, error) {
   451  	var refs []*git.Ref
   452  	var err error
   453  
   454  	if root, ok := r.db.Root(); ok {
   455  		refs, err = git.AllRefsIn(root)
   456  	} else {
   457  		refs, err = git.AllRefs()
   458  	}
   459  
   460  	if err != nil {
   461  		return nil, err
   462  	}
   463  
   464  	var local []*git.Ref
   465  	for _, ref := range refs {
   466  		if ref.Type == git.RefTypeRemoteBranch || ref.Type == git.RefTypeRemoteTag {
   467  			continue
   468  		}
   469  
   470  		local = append(local, ref)
   471  	}
   472  
   473  	return local, nil
   474  }
   475  
   476  // scannerOpts returns a *git.ScanRefsOptions instance to be given to the
   477  // *git.RevListScanner.
   478  //
   479  // If the database this *Rewriter is operating in a given root (not in memory)
   480  // it re-assigns the working directory to be there.
   481  func (r *Rewriter) scannerOpts() *git.ScanRefsOptions {
   482  	opts := &git.ScanRefsOptions{
   483  		Mode:        git.ScanRefsMode,
   484  		Order:       git.TopoRevListOrder,
   485  		Reverse:     true,
   486  		CommitsOnly: true,
   487  
   488  		SkippedRefs: make([]string, 0),
   489  		Mutex:       new(sync.Mutex),
   490  		Names:       make(map[string]string),
   491  	}
   492  
   493  	if root, ok := r.db.Root(); ok {
   494  		opts.WorkingDir = root
   495  	}
   496  	return opts
   497  }
   498  
   499  // Filter returns the filter used by this *Rewriter to filter subtrees, blobs
   500  // (see above).
   501  func (r *Rewriter) Filter() *filepathfilter.Filter {
   502  	return r.filter
   503  }
   504  
   505  // cacheEntry caches then given "from" entry so that it is always rewritten as
   506  // a *TreeEntry equivalent to "to".
   507  func (r *Rewriter) cacheEntry(from, to *odb.TreeEntry) *odb.TreeEntry {
   508  	r.mu.Lock()
   509  	defer r.mu.Unlock()
   510  
   511  	r.entries[r.entryKey(from)] = to
   512  
   513  	return to
   514  }
   515  
   516  // uncacheEntry returns a *TreeEntry that is cached from the given *TreeEntry
   517  // "from". That is to say, it returns the *TreeEntry that "from" should be
   518  // rewritten to, or nil if none could be found.
   519  func (r *Rewriter) uncacheEntry(from *odb.TreeEntry) *odb.TreeEntry {
   520  	r.mu.Lock()
   521  	defer r.mu.Unlock()
   522  
   523  	return r.entries[r.entryKey(from)]
   524  }
   525  
   526  // entryKey returns a unique key for a given *TreeEntry "e".
   527  func (r *Rewriter) entryKey(e *odb.TreeEntry) string {
   528  	return fmt.Sprintf("%s:%x", e.Name, e.Oid)
   529  }
   530  
   531  // cacheEntry caches then given "from" commit so that it is always rewritten as
   532  // a *git/odb.Commit equivalent to "to".
   533  func (r *Rewriter) cacheCommit(from, to []byte) {
   534  	r.mu.Lock()
   535  	defer r.mu.Unlock()
   536  
   537  	r.commits[hex.EncodeToString(from)] = to
   538  }
   539  
   540  // uncacheCommit returns a *git/odb.Commit that is cached from the given
   541  // *git/odb.Commit "from". That is to say, it returns the *git/odb.Commit that
   542  // "from" should be rewritten to and true, or nil and false if none could be
   543  // found.
   544  func (r *Rewriter) uncacheCommit(from []byte) ([]byte, bool) {
   545  	r.mu.Lock()
   546  	defer r.mu.Unlock()
   547  
   548  	c, ok := r.commits[hex.EncodeToString(from)]
   549  	return c, ok
   550  }