golang.org/x/build@v0.0.0-20240506185731-218518f32b70/maintner/git.go (about)

     1  // Copyright 2017 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package maintner
     6  
     7  import (
     8  	"bufio"
     9  	"bytes"
    10  	"context"
    11  	"encoding/hex"
    12  	"errors"
    13  	"fmt"
    14  	"log"
    15  	"os/exec"
    16  	"sort"
    17  	"strconv"
    18  	"strings"
    19  	"time"
    20  
    21  	"golang.org/x/build/internal/envutil"
    22  	"golang.org/x/build/internal/foreach"
    23  	"golang.org/x/build/maintner/maintpb"
    24  )
    25  
    26  // GitHash is a git commit in binary form (NOT hex form).
    27  // They are currently always 20 bytes long. (for SHA-1 refs)
    28  // That may change in the future.
    29  type GitHash string
    30  
    31  func (h GitHash) String() string { return fmt.Sprintf("%x", string(h)) }
    32  
    33  // requires c.mu be held for writing
    34  func (c *Corpus) gitHashFromHexStr(s string) GitHash {
    35  	if len(s) != 40 {
    36  		panic(fmt.Sprintf("bogus git hash %q", s))
    37  	}
    38  	var buf [40]byte
    39  	copy(buf[:], s)
    40  	_, err := hex.Decode(buf[:20], buf[:]) // aliasing is safe
    41  	if err != nil {
    42  		panic(fmt.Sprintf("bogus git hash %q: %v", s, err))
    43  	}
    44  	return GitHash(c.strb(buf[:20]))
    45  }
    46  
    47  // requires c.mu be held for writing
    48  func (c *Corpus) gitHashFromHex(s []byte) GitHash {
    49  	if len(s) != 40 {
    50  		panic(fmt.Sprintf("bogus git hash %q", s))
    51  	}
    52  	var buf [20]byte
    53  	_, err := hex.Decode(buf[:], s)
    54  	if err != nil {
    55  		panic(fmt.Sprintf("bogus git hash %q: %v", s, err))
    56  	}
    57  	return GitHash(c.strb(buf[:20]))
    58  }
    59  
    60  // placeholderCommitter is a sentinel value for GitCommit.Committer to
    61  // mean that the GitCommit is a placeholder. It's used for commits we
    62  // know should exist (because they're referenced as parents) but we
    63  // haven't yet seen in the log.
    64  var placeholderCommitter = new(GitPerson)
    65  
    66  // GitCommit represents a single commit in a git repository.
    67  type GitCommit struct {
    68  	Hash       GitHash
    69  	Tree       GitHash
    70  	Parents    []*GitCommit
    71  	Author     *GitPerson
    72  	AuthorTime time.Time
    73  	Committer  *GitPerson
    74  	Reviewer   *GitPerson
    75  	CommitTime time.Time
    76  	Msg        string // Commit message subject and body
    77  	Files      []*maintpb.GitDiffTreeFile
    78  	GerritMeta *GerritMeta // non-nil if it's a Gerrit NoteDB meta commit
    79  }
    80  
    81  func (gc *GitCommit) String() string {
    82  	if gc == nil {
    83  		return "<nil *GitCommit>"
    84  	}
    85  	return fmt.Sprintf("{GitCommit %s}", gc.Hash)
    86  }
    87  
    88  // HasAncestor reports whether gc contains the provided ancestor
    89  // commit in gc's history.
    90  func (gc *GitCommit) HasAncestor(ancestor *GitCommit) bool {
    91  	return gc.hasAncestor(ancestor, make(map[*GitCommit]bool))
    92  }
    93  
    94  func (gc *GitCommit) hasAncestor(ancestor *GitCommit, checked map[*GitCommit]bool) bool {
    95  	if v, ok := checked[gc]; ok {
    96  		return v
    97  	}
    98  	checked[gc] = false
    99  	for _, pc := range gc.Parents {
   100  		if pc == nil {
   101  			panic("nil parent")
   102  		}
   103  		if pc.Committer == placeholderCommitter {
   104  			log.Printf("WARNING: hasAncestor(%q, %q) found parent %q with placeholder parent", gc.Hash, ancestor.Hash, pc.Hash)
   105  		}
   106  		if pc.Hash == ancestor.Hash || pc.hasAncestor(ancestor, checked) {
   107  			checked[gc] = true
   108  			return true
   109  		}
   110  	}
   111  	return false
   112  }
   113  
   114  // Summary returns the first line of the commit message.
   115  func (gc *GitCommit) Summary() string {
   116  	s := gc.Msg
   117  	if i := strings.IndexByte(s, '\n'); i != -1 {
   118  		s = s[:i]
   119  	}
   120  	s = strings.TrimSpace(s)
   121  	return s
   122  }
   123  
   124  // SameDiffStat reports whether gc has the same diff stat numbers as b.
   125  // If either is unknown, false is returned.
   126  func (gc *GitCommit) SameDiffStat(b *GitCommit) bool {
   127  	if len(gc.Files) != len(b.Files) {
   128  		return false
   129  	}
   130  	for i, af := range gc.Files {
   131  		bf := b.Files[i]
   132  		if af == nil || bf == nil {
   133  			return false
   134  		}
   135  		if *af != *bf {
   136  			return false
   137  		}
   138  	}
   139  	return true
   140  }
   141  
   142  // GitPerson is a person in a git commit.
   143  type GitPerson struct {
   144  	Str string // "Foo Bar <foo@bar.com>"
   145  }
   146  
   147  // Email returns the GitPerson's email address only, without the name
   148  // or angle brackets.
   149  func (p *GitPerson) Email() string {
   150  	lt := strings.IndexByte(p.Str, '<')
   151  	gt := strings.IndexByte(p.Str, '>')
   152  	if lt < 0 || gt < lt {
   153  		return ""
   154  	}
   155  	return p.Str[lt+1 : gt]
   156  }
   157  
   158  func (p *GitPerson) Name() string {
   159  	i := strings.IndexByte(p.Str, '<')
   160  	if i < 0 {
   161  		return p.Str
   162  	}
   163  	return strings.TrimSpace(p.Str[:i])
   164  }
   165  
   166  // String implements fmt.Stringer.
   167  func (p *GitPerson) String() string { return p.Str }
   168  
   169  // requires c.mu be held for writing.
   170  func (c *Corpus) enqueueCommitLocked(h GitHash) {
   171  	if _, ok := c.gitCommit[h]; ok {
   172  		return
   173  	}
   174  	if c.gitCommitTodo == nil {
   175  		c.gitCommitTodo = map[GitHash]bool{}
   176  	}
   177  	c.gitCommitTodo[h] = true
   178  }
   179  
   180  // syncGitCommits polls for git commits in a directory.
   181  func (c *Corpus) syncGitCommits(ctx context.Context, conf polledGitCommits, loop bool) error {
   182  	cmd := exec.CommandContext(ctx, "git", "show-ref", "refs/remotes/origin/master")
   183  	envutil.SetDir(cmd, conf.dir)
   184  	out, err := cmd.Output()
   185  	if err != nil {
   186  		log.Fatal(err)
   187  	}
   188  	outs := strings.TrimSpace(string(out))
   189  	if outs == "" {
   190  		return fmt.Errorf("no remote found for refs/remotes/origin/master")
   191  	}
   192  	ref := strings.Fields(outs)[0]
   193  	c.mu.Lock()
   194  	refHash := c.gitHashFromHexStr(ref)
   195  	c.enqueueCommitLocked(refHash)
   196  	c.mu.Unlock()
   197  
   198  	idle := false
   199  	for {
   200  		hash := c.gitCommitToIndex()
   201  		if hash == "" {
   202  			if !loop {
   203  				return nil
   204  			}
   205  			if !idle {
   206  				log.Printf("All git commits index for %v; idle.", conf.repo)
   207  				idle = true
   208  			}
   209  			time.Sleep(5 * time.Second)
   210  			continue
   211  		}
   212  		if err := c.indexCommit(conf, hash); err != nil {
   213  			log.Printf("Error indexing %v: %v", hash, err)
   214  			select {
   215  			case <-ctx.Done():
   216  				return ctx.Err()
   217  				// TODO: temporary vs permanent failure? reschedule? fail hard?
   218  				// For now just loop with a sleep.
   219  			case <-time.After(5 * time.Second):
   220  			}
   221  		}
   222  	}
   223  }
   224  
   225  // returns nil if no work.
   226  func (c *Corpus) gitCommitToIndex() GitHash {
   227  	c.mu.RLock()
   228  	defer c.mu.RUnlock()
   229  	for hash := range c.gitCommitTodo {
   230  		if _, ok := c.gitCommit[hash]; !ok {
   231  			return hash
   232  		}
   233  		log.Printf("Warning: git commit %v in todo map, but already known; ignoring", hash)
   234  	}
   235  	return ""
   236  }
   237  
   238  var (
   239  	nlnl           = []byte("\n\n")
   240  	parentSpace    = []byte("parent ")
   241  	authorSpace    = []byte("author ")
   242  	committerSpace = []byte("committer ")
   243  	treeSpace      = []byte("tree ")
   244  	golangHgSpace  = []byte("golang-hg ")
   245  	gpgSigSpace    = []byte("gpgsig ")
   246  	encodingSpace  = []byte("encoding ")
   247  	space          = []byte(" ")
   248  )
   249  
   250  func parseCommitFromGit(dir string, hash GitHash) (*maintpb.GitCommit, error) {
   251  	cmd := exec.Command("git", "cat-file", "commit", hash.String())
   252  	envutil.SetDir(cmd, dir)
   253  	catFile, err := cmd.Output()
   254  	if err != nil {
   255  		return nil, fmt.Errorf("git cat-file -p %v: %v", hash, err)
   256  	}
   257  	cmd = exec.Command("git", "diff-tree", "--numstat", hash.String())
   258  	envutil.SetDir(cmd, dir)
   259  	diffTreeOut, err := cmd.Output()
   260  	if err != nil {
   261  		return nil, fmt.Errorf("git diff-tree --numstat %v: %v", hash, err)
   262  	}
   263  
   264  	diffTree := &maintpb.GitDiffTree{}
   265  	bs := bufio.NewScanner(bytes.NewReader(diffTreeOut))
   266  	lineNum := 0
   267  	for bs.Scan() {
   268  		line := strings.TrimSpace(bs.Text())
   269  		lineNum++
   270  		if lineNum == 1 && line == hash.String() {
   271  			continue
   272  		}
   273  		f := strings.Fields(line)
   274  		// A line is like: <added> WS+ <deleted> WS+ <filename>
   275  		// Where <added> or <deleted> can be '-' to mean binary.
   276  		// The filename could contain spaces.
   277  		// 49      8       maintner/maintner.go
   278  		// Or:
   279  		// 49      8       some/name with spaces.txt
   280  		if len(f) < 3 {
   281  			continue
   282  		}
   283  		binary := f[0] == "-" || f[1] == "-"
   284  		added, _ := strconv.ParseInt(f[0], 10, 64)
   285  		deleted, _ := strconv.ParseInt(f[1], 10, 64)
   286  		file := strings.TrimPrefix(line, f[0])
   287  		file = strings.TrimSpace(file)
   288  		file = strings.TrimPrefix(file, f[1])
   289  		file = strings.TrimSpace(file)
   290  
   291  		diffTree.File = append(diffTree.File, &maintpb.GitDiffTreeFile{
   292  			File:    file,
   293  			Added:   added,
   294  			Deleted: deleted,
   295  			Binary:  binary,
   296  		})
   297  	}
   298  	if err := bs.Err(); err != nil {
   299  		return nil, err
   300  	}
   301  	commit := &maintpb.GitCommit{
   302  		Raw:      catFile,
   303  		DiffTree: diffTree,
   304  	}
   305  	switch len(hash) {
   306  	case 20:
   307  		commit.Sha1 = hash.String()
   308  	default:
   309  		return nil, fmt.Errorf("unsupported git hash %q", hash.String())
   310  	}
   311  	return commit, nil
   312  }
   313  
   314  func (c *Corpus) indexCommit(conf polledGitCommits, hash GitHash) error {
   315  	if conf.repo == nil {
   316  		panic("bogus config; nil repo")
   317  	}
   318  	commit, err := parseCommitFromGit(conf.dir, hash)
   319  	if err != nil {
   320  		return err
   321  	}
   322  	m := &maintpb.Mutation{
   323  		Git: &maintpb.GitMutation{
   324  			Repo:   conf.repo,
   325  			Commit: commit,
   326  		},
   327  	}
   328  	c.addMutation(m)
   329  	return nil
   330  }
   331  
   332  // c.mu is held for writing.
   333  func (c *Corpus) processGitMutation(m *maintpb.GitMutation) {
   334  	commit := m.Commit
   335  	if commit == nil {
   336  		return
   337  	}
   338  	// TODO: care about m.Repo?
   339  	c.processGitCommit(commit)
   340  }
   341  
   342  // c.mu is held for writing.
   343  func (c *Corpus) processGitCommit(commit *maintpb.GitCommit) (*GitCommit, error) {
   344  	if c.gitCommit == nil {
   345  		c.gitCommit = map[GitHash]*GitCommit{}
   346  	}
   347  	if len(commit.Sha1) != 40 {
   348  		return nil, fmt.Errorf("bogus git sha1 %q", commit.Sha1)
   349  	}
   350  	hash := c.gitHashFromHexStr(commit.Sha1)
   351  
   352  	catFile := commit.Raw
   353  	i := bytes.Index(catFile, nlnl)
   354  	if i == 0 {
   355  		return nil, fmt.Errorf("commit %v lacks double newline", hash)
   356  	}
   357  	hdr, msg := catFile[:i], catFile[i+2:]
   358  	gc := &GitCommit{
   359  		Hash:    hash,
   360  		Parents: make([]*GitCommit, 0, bytes.Count(hdr, parentSpace)),
   361  		Msg:     c.strb(msg),
   362  	}
   363  
   364  	// The commit message contains the reviewer email address. Sample commit message:
   365  	// Update patch set 1
   366  	//
   367  	// Patch Set 1: Code-Review+2
   368  	//
   369  	// Patch-set: 1
   370  	// Reviewer: Ian Lance Taylor <5206@62eb7196-b449-3ce5-99f1-c037f21e1705>
   371  	// Label: Code-Review=+2
   372  	if reviewer := lineValue(c.strb(msg), "Reviewer: "); reviewer != "" {
   373  		gc.Reviewer = &GitPerson{Str: reviewer}
   374  	}
   375  
   376  	if commit.DiffTree != nil {
   377  		gc.Files = commit.DiffTree.File
   378  	}
   379  	for _, f := range gc.Files {
   380  		f.File = c.str(f.File) // intern the string
   381  	}
   382  	sort.Slice(gc.Files, func(i, j int) bool { return gc.Files[i].File < gc.Files[j].File })
   383  	parents := 0
   384  	err := foreach.Line(hdr, func(ln []byte) error {
   385  		if bytes.HasPrefix(ln, parentSpace) {
   386  			parents++
   387  			parentHash := c.gitHashFromHex(ln[len(parentSpace):])
   388  			parent := c.gitCommit[parentHash]
   389  			if parent == nil {
   390  				// Install a placeholder to be filled in later.
   391  				parent = &GitCommit{
   392  					Hash:      parentHash,
   393  					Committer: placeholderCommitter,
   394  				}
   395  				c.gitCommit[parentHash] = parent
   396  			}
   397  			gc.Parents = append(gc.Parents, parent)
   398  			c.enqueueCommitLocked(parentHash)
   399  			return nil
   400  		}
   401  		if bytes.HasPrefix(ln, authorSpace) {
   402  			p, t, err := c.parsePerson(ln[len(authorSpace):])
   403  			if err != nil {
   404  				return fmt.Errorf("unrecognized author line %q: %v", ln, err)
   405  			}
   406  			gc.Author = p
   407  			gc.AuthorTime = t
   408  			return nil
   409  		}
   410  		if bytes.HasPrefix(ln, committerSpace) {
   411  			p, t, err := c.parsePerson(ln[len(committerSpace):])
   412  			if err != nil {
   413  				return fmt.Errorf("unrecognized committer line %q: %v", ln, err)
   414  			}
   415  			gc.Committer = p
   416  			gc.CommitTime = t
   417  			return nil
   418  		}
   419  		if bytes.HasPrefix(ln, treeSpace) {
   420  			gc.Tree = c.gitHashFromHex(ln[len(treeSpace):])
   421  			return nil
   422  		}
   423  		if bytes.HasPrefix(ln, golangHgSpace) {
   424  			if c.gitOfHg == nil {
   425  				c.gitOfHg = map[string]GitHash{}
   426  			}
   427  			c.gitOfHg[string(ln[len(golangHgSpace):])] = hash
   428  			return nil
   429  		}
   430  		if bytes.HasPrefix(ln, gpgSigSpace) || bytes.HasPrefix(ln, space) {
   431  			// Jessie Frazelle is a unique butterfly.
   432  			return nil
   433  		}
   434  		if bytes.HasPrefix(ln, encodingSpace) {
   435  			// Also ignore this. In practice this has only
   436  			// been seen to declare that a commit's
   437  			// metadata is utf-8 when the author name has
   438  			// non-ASCII.
   439  			return nil
   440  		}
   441  		log.Printf("in commit %s, unrecognized line %q", hash, ln)
   442  		return nil
   443  	})
   444  	if err != nil {
   445  		log.Printf("Unparseable commit %q: %v", hash, err)
   446  		return nil, fmt.Errorf("Unparseable commit %q: %v", hash, err)
   447  	}
   448  	if ph, ok := c.gitCommit[hash]; ok {
   449  		// Update placeholder.
   450  		*ph = *gc
   451  	} else {
   452  		c.gitCommit[hash] = gc
   453  	}
   454  	if c.gitCommitTodo != nil {
   455  		delete(c.gitCommitTodo, hash)
   456  	}
   457  	if c.verbose {
   458  		now := time.Now()
   459  		if now.After(c.lastGitCount.Add(time.Second)) {
   460  			c.lastGitCount = now
   461  			log.Printf("Num git commits = %v", len(c.gitCommit))
   462  		}
   463  	}
   464  	return gc, nil
   465  }
   466  
   467  // parsePerson parses an "author" or "committer" value from "git cat-file -p COMMIT"
   468  // The values are like:
   469  //
   470  //	Foo Bar <foobar@gmail.com> 1488624439 +0900
   471  //
   472  // c.mu must be held for writing.
   473  func (c *Corpus) parsePerson(v []byte) (*GitPerson, time.Time, error) {
   474  	v = bytes.TrimSpace(v)
   475  
   476  	lastSpace := bytes.LastIndexByte(v, ' ')
   477  	if lastSpace < 0 {
   478  		return nil, time.Time{}, errors.New("failed to match person")
   479  	}
   480  	tz := v[lastSpace+1:] // "+0800"
   481  	v = v[:lastSpace]     // now v is "Foo Bar <foobar@gmail.com> 1488624439"
   482  
   483  	lastSpace = bytes.LastIndexByte(v, ' ')
   484  	if lastSpace < 0 {
   485  		return nil, time.Time{}, errors.New("failed to match person")
   486  	}
   487  	unixTime := v[lastSpace+1:]
   488  	nameEmail := v[:lastSpace] // now v is "Foo Bar <foobar@gmail.com>"
   489  
   490  	ut, err := strconv.ParseInt(string(unixTime), 10, 64)
   491  	if err != nil {
   492  		return nil, time.Time{}, err
   493  	}
   494  	t := time.Unix(ut, 0).In(c.gitLocation(tz))
   495  
   496  	p, ok := c.gitPeople[string(nameEmail)]
   497  	if !ok {
   498  		p = &GitPerson{Str: string(nameEmail)}
   499  		if c.gitPeople == nil {
   500  			c.gitPeople = map[string]*GitPerson{}
   501  		}
   502  		c.gitPeople[p.Str] = p
   503  	}
   504  	return p, t, nil
   505  
   506  }
   507  
   508  // GitCommit returns the provided git commit, or nil if it's unknown.
   509  func (c *Corpus) GitCommit(hash string) *GitCommit {
   510  	if len(hash) != 40 {
   511  		// TODO: support prefix lookups. build a trie. But
   512  		// for now just avoid panicking in gitHashFromHexStr.
   513  		return nil
   514  	}
   515  	var buf [20]byte
   516  	_, err := decodeHexStr(buf[:], hash)
   517  	if err != nil {
   518  		return nil
   519  	}
   520  	return c.gitCommit[GitHash(buf[:])]
   521  }
   522  
   523  // v is like '[+-]hhmm'
   524  // c.mu must be held for writing.
   525  func (c *Corpus) gitLocation(v []byte) *time.Location {
   526  	if loc, ok := c.zoneCache[string(v)]; ok {
   527  		return loc
   528  	}
   529  	s := string(v)
   530  	h, _ := strconv.Atoi(s[1:3])
   531  	m, _ := strconv.Atoi(s[3:5])
   532  	east := 1
   533  	if v[0] == '-' {
   534  		east = -1
   535  	}
   536  	loc := time.FixedZone(s, east*(h*3600+m*60))
   537  	if c.zoneCache == nil {
   538  		c.zoneCache = map[string]*time.Location{}
   539  	}
   540  	c.zoneCache[s] = loc
   541  	return loc
   542  }
   543  
   544  func decodeHexStr(dst []byte, src string) (int, error) {
   545  	if len(src)%2 == 1 {
   546  		return 0, hex.ErrLength
   547  	}
   548  
   549  	for i := 0; i < len(src)/2; i++ {
   550  		a, ok := fromHexChar(src[i*2])
   551  		if !ok {
   552  			return 0, hex.InvalidByteError(src[i*2])
   553  		}
   554  		b, ok := fromHexChar(src[i*2+1])
   555  		if !ok {
   556  			return 0, hex.InvalidByteError(src[i*2+1])
   557  		}
   558  		dst[i] = (a << 4) | b
   559  	}
   560  
   561  	return len(src) / 2, nil
   562  }
   563  
   564  // fromHexChar converts a hex character into its value and a success flag.
   565  func fromHexChar(c byte) (byte, bool) {
   566  	switch {
   567  	case '0' <= c && c <= '9':
   568  		return c - '0', true
   569  	case 'a' <= c && c <= 'f':
   570  		return c - 'a' + 10, true
   571  	case 'A' <= c && c <= 'F':
   572  		return c - 'A' + 10, true
   573  	}
   574  
   575  	return 0, false
   576  }