golang.org/x/build@v0.0.0-20240506185731-218518f32b70/maintner/git.go (about) 1 // Copyright 2017 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package maintner 6 7 import ( 8 "bufio" 9 "bytes" 10 "context" 11 "encoding/hex" 12 "errors" 13 "fmt" 14 "log" 15 "os/exec" 16 "sort" 17 "strconv" 18 "strings" 19 "time" 20 21 "golang.org/x/build/internal/envutil" 22 "golang.org/x/build/internal/foreach" 23 "golang.org/x/build/maintner/maintpb" 24 ) 25 26 // GitHash is a git commit in binary form (NOT hex form). 27 // They are currently always 20 bytes long. (for SHA-1 refs) 28 // That may change in the future. 29 type GitHash string 30 31 func (h GitHash) String() string { return fmt.Sprintf("%x", string(h)) } 32 33 // requires c.mu be held for writing 34 func (c *Corpus) gitHashFromHexStr(s string) GitHash { 35 if len(s) != 40 { 36 panic(fmt.Sprintf("bogus git hash %q", s)) 37 } 38 var buf [40]byte 39 copy(buf[:], s) 40 _, err := hex.Decode(buf[:20], buf[:]) // aliasing is safe 41 if err != nil { 42 panic(fmt.Sprintf("bogus git hash %q: %v", s, err)) 43 } 44 return GitHash(c.strb(buf[:20])) 45 } 46 47 // requires c.mu be held for writing 48 func (c *Corpus) gitHashFromHex(s []byte) GitHash { 49 if len(s) != 40 { 50 panic(fmt.Sprintf("bogus git hash %q", s)) 51 } 52 var buf [20]byte 53 _, err := hex.Decode(buf[:], s) 54 if err != nil { 55 panic(fmt.Sprintf("bogus git hash %q: %v", s, err)) 56 } 57 return GitHash(c.strb(buf[:20])) 58 } 59 60 // placeholderCommitter is a sentinel value for GitCommit.Committer to 61 // mean that the GitCommit is a placeholder. It's used for commits we 62 // know should exist (because they're referenced as parents) but we 63 // haven't yet seen in the log. 64 var placeholderCommitter = new(GitPerson) 65 66 // GitCommit represents a single commit in a git repository. 67 type GitCommit struct { 68 Hash GitHash 69 Tree GitHash 70 Parents []*GitCommit 71 Author *GitPerson 72 AuthorTime time.Time 73 Committer *GitPerson 74 Reviewer *GitPerson 75 CommitTime time.Time 76 Msg string // Commit message subject and body 77 Files []*maintpb.GitDiffTreeFile 78 GerritMeta *GerritMeta // non-nil if it's a Gerrit NoteDB meta commit 79 } 80 81 func (gc *GitCommit) String() string { 82 if gc == nil { 83 return "<nil *GitCommit>" 84 } 85 return fmt.Sprintf("{GitCommit %s}", gc.Hash) 86 } 87 88 // HasAncestor reports whether gc contains the provided ancestor 89 // commit in gc's history. 90 func (gc *GitCommit) HasAncestor(ancestor *GitCommit) bool { 91 return gc.hasAncestor(ancestor, make(map[*GitCommit]bool)) 92 } 93 94 func (gc *GitCommit) hasAncestor(ancestor *GitCommit, checked map[*GitCommit]bool) bool { 95 if v, ok := checked[gc]; ok { 96 return v 97 } 98 checked[gc] = false 99 for _, pc := range gc.Parents { 100 if pc == nil { 101 panic("nil parent") 102 } 103 if pc.Committer == placeholderCommitter { 104 log.Printf("WARNING: hasAncestor(%q, %q) found parent %q with placeholder parent", gc.Hash, ancestor.Hash, pc.Hash) 105 } 106 if pc.Hash == ancestor.Hash || pc.hasAncestor(ancestor, checked) { 107 checked[gc] = true 108 return true 109 } 110 } 111 return false 112 } 113 114 // Summary returns the first line of the commit message. 115 func (gc *GitCommit) Summary() string { 116 s := gc.Msg 117 if i := strings.IndexByte(s, '\n'); i != -1 { 118 s = s[:i] 119 } 120 s = strings.TrimSpace(s) 121 return s 122 } 123 124 // SameDiffStat reports whether gc has the same diff stat numbers as b. 125 // If either is unknown, false is returned. 126 func (gc *GitCommit) SameDiffStat(b *GitCommit) bool { 127 if len(gc.Files) != len(b.Files) { 128 return false 129 } 130 for i, af := range gc.Files { 131 bf := b.Files[i] 132 if af == nil || bf == nil { 133 return false 134 } 135 if *af != *bf { 136 return false 137 } 138 } 139 return true 140 } 141 142 // GitPerson is a person in a git commit. 143 type GitPerson struct { 144 Str string // "Foo Bar <foo@bar.com>" 145 } 146 147 // Email returns the GitPerson's email address only, without the name 148 // or angle brackets. 149 func (p *GitPerson) Email() string { 150 lt := strings.IndexByte(p.Str, '<') 151 gt := strings.IndexByte(p.Str, '>') 152 if lt < 0 || gt < lt { 153 return "" 154 } 155 return p.Str[lt+1 : gt] 156 } 157 158 func (p *GitPerson) Name() string { 159 i := strings.IndexByte(p.Str, '<') 160 if i < 0 { 161 return p.Str 162 } 163 return strings.TrimSpace(p.Str[:i]) 164 } 165 166 // String implements fmt.Stringer. 167 func (p *GitPerson) String() string { return p.Str } 168 169 // requires c.mu be held for writing. 170 func (c *Corpus) enqueueCommitLocked(h GitHash) { 171 if _, ok := c.gitCommit[h]; ok { 172 return 173 } 174 if c.gitCommitTodo == nil { 175 c.gitCommitTodo = map[GitHash]bool{} 176 } 177 c.gitCommitTodo[h] = true 178 } 179 180 // syncGitCommits polls for git commits in a directory. 181 func (c *Corpus) syncGitCommits(ctx context.Context, conf polledGitCommits, loop bool) error { 182 cmd := exec.CommandContext(ctx, "git", "show-ref", "refs/remotes/origin/master") 183 envutil.SetDir(cmd, conf.dir) 184 out, err := cmd.Output() 185 if err != nil { 186 log.Fatal(err) 187 } 188 outs := strings.TrimSpace(string(out)) 189 if outs == "" { 190 return fmt.Errorf("no remote found for refs/remotes/origin/master") 191 } 192 ref := strings.Fields(outs)[0] 193 c.mu.Lock() 194 refHash := c.gitHashFromHexStr(ref) 195 c.enqueueCommitLocked(refHash) 196 c.mu.Unlock() 197 198 idle := false 199 for { 200 hash := c.gitCommitToIndex() 201 if hash == "" { 202 if !loop { 203 return nil 204 } 205 if !idle { 206 log.Printf("All git commits index for %v; idle.", conf.repo) 207 idle = true 208 } 209 time.Sleep(5 * time.Second) 210 continue 211 } 212 if err := c.indexCommit(conf, hash); err != nil { 213 log.Printf("Error indexing %v: %v", hash, err) 214 select { 215 case <-ctx.Done(): 216 return ctx.Err() 217 // TODO: temporary vs permanent failure? reschedule? fail hard? 218 // For now just loop with a sleep. 219 case <-time.After(5 * time.Second): 220 } 221 } 222 } 223 } 224 225 // returns nil if no work. 226 func (c *Corpus) gitCommitToIndex() GitHash { 227 c.mu.RLock() 228 defer c.mu.RUnlock() 229 for hash := range c.gitCommitTodo { 230 if _, ok := c.gitCommit[hash]; !ok { 231 return hash 232 } 233 log.Printf("Warning: git commit %v in todo map, but already known; ignoring", hash) 234 } 235 return "" 236 } 237 238 var ( 239 nlnl = []byte("\n\n") 240 parentSpace = []byte("parent ") 241 authorSpace = []byte("author ") 242 committerSpace = []byte("committer ") 243 treeSpace = []byte("tree ") 244 golangHgSpace = []byte("golang-hg ") 245 gpgSigSpace = []byte("gpgsig ") 246 encodingSpace = []byte("encoding ") 247 space = []byte(" ") 248 ) 249 250 func parseCommitFromGit(dir string, hash GitHash) (*maintpb.GitCommit, error) { 251 cmd := exec.Command("git", "cat-file", "commit", hash.String()) 252 envutil.SetDir(cmd, dir) 253 catFile, err := cmd.Output() 254 if err != nil { 255 return nil, fmt.Errorf("git cat-file -p %v: %v", hash, err) 256 } 257 cmd = exec.Command("git", "diff-tree", "--numstat", hash.String()) 258 envutil.SetDir(cmd, dir) 259 diffTreeOut, err := cmd.Output() 260 if err != nil { 261 return nil, fmt.Errorf("git diff-tree --numstat %v: %v", hash, err) 262 } 263 264 diffTree := &maintpb.GitDiffTree{} 265 bs := bufio.NewScanner(bytes.NewReader(diffTreeOut)) 266 lineNum := 0 267 for bs.Scan() { 268 line := strings.TrimSpace(bs.Text()) 269 lineNum++ 270 if lineNum == 1 && line == hash.String() { 271 continue 272 } 273 f := strings.Fields(line) 274 // A line is like: <added> WS+ <deleted> WS+ <filename> 275 // Where <added> or <deleted> can be '-' to mean binary. 276 // The filename could contain spaces. 277 // 49 8 maintner/maintner.go 278 // Or: 279 // 49 8 some/name with spaces.txt 280 if len(f) < 3 { 281 continue 282 } 283 binary := f[0] == "-" || f[1] == "-" 284 added, _ := strconv.ParseInt(f[0], 10, 64) 285 deleted, _ := strconv.ParseInt(f[1], 10, 64) 286 file := strings.TrimPrefix(line, f[0]) 287 file = strings.TrimSpace(file) 288 file = strings.TrimPrefix(file, f[1]) 289 file = strings.TrimSpace(file) 290 291 diffTree.File = append(diffTree.File, &maintpb.GitDiffTreeFile{ 292 File: file, 293 Added: added, 294 Deleted: deleted, 295 Binary: binary, 296 }) 297 } 298 if err := bs.Err(); err != nil { 299 return nil, err 300 } 301 commit := &maintpb.GitCommit{ 302 Raw: catFile, 303 DiffTree: diffTree, 304 } 305 switch len(hash) { 306 case 20: 307 commit.Sha1 = hash.String() 308 default: 309 return nil, fmt.Errorf("unsupported git hash %q", hash.String()) 310 } 311 return commit, nil 312 } 313 314 func (c *Corpus) indexCommit(conf polledGitCommits, hash GitHash) error { 315 if conf.repo == nil { 316 panic("bogus config; nil repo") 317 } 318 commit, err := parseCommitFromGit(conf.dir, hash) 319 if err != nil { 320 return err 321 } 322 m := &maintpb.Mutation{ 323 Git: &maintpb.GitMutation{ 324 Repo: conf.repo, 325 Commit: commit, 326 }, 327 } 328 c.addMutation(m) 329 return nil 330 } 331 332 // c.mu is held for writing. 333 func (c *Corpus) processGitMutation(m *maintpb.GitMutation) { 334 commit := m.Commit 335 if commit == nil { 336 return 337 } 338 // TODO: care about m.Repo? 339 c.processGitCommit(commit) 340 } 341 342 // c.mu is held for writing. 343 func (c *Corpus) processGitCommit(commit *maintpb.GitCommit) (*GitCommit, error) { 344 if c.gitCommit == nil { 345 c.gitCommit = map[GitHash]*GitCommit{} 346 } 347 if len(commit.Sha1) != 40 { 348 return nil, fmt.Errorf("bogus git sha1 %q", commit.Sha1) 349 } 350 hash := c.gitHashFromHexStr(commit.Sha1) 351 352 catFile := commit.Raw 353 i := bytes.Index(catFile, nlnl) 354 if i == 0 { 355 return nil, fmt.Errorf("commit %v lacks double newline", hash) 356 } 357 hdr, msg := catFile[:i], catFile[i+2:] 358 gc := &GitCommit{ 359 Hash: hash, 360 Parents: make([]*GitCommit, 0, bytes.Count(hdr, parentSpace)), 361 Msg: c.strb(msg), 362 } 363 364 // The commit message contains the reviewer email address. Sample commit message: 365 // Update patch set 1 366 // 367 // Patch Set 1: Code-Review+2 368 // 369 // Patch-set: 1 370 // Reviewer: Ian Lance Taylor <5206@62eb7196-b449-3ce5-99f1-c037f21e1705> 371 // Label: Code-Review=+2 372 if reviewer := lineValue(c.strb(msg), "Reviewer: "); reviewer != "" { 373 gc.Reviewer = &GitPerson{Str: reviewer} 374 } 375 376 if commit.DiffTree != nil { 377 gc.Files = commit.DiffTree.File 378 } 379 for _, f := range gc.Files { 380 f.File = c.str(f.File) // intern the string 381 } 382 sort.Slice(gc.Files, func(i, j int) bool { return gc.Files[i].File < gc.Files[j].File }) 383 parents := 0 384 err := foreach.Line(hdr, func(ln []byte) error { 385 if bytes.HasPrefix(ln, parentSpace) { 386 parents++ 387 parentHash := c.gitHashFromHex(ln[len(parentSpace):]) 388 parent := c.gitCommit[parentHash] 389 if parent == nil { 390 // Install a placeholder to be filled in later. 391 parent = &GitCommit{ 392 Hash: parentHash, 393 Committer: placeholderCommitter, 394 } 395 c.gitCommit[parentHash] = parent 396 } 397 gc.Parents = append(gc.Parents, parent) 398 c.enqueueCommitLocked(parentHash) 399 return nil 400 } 401 if bytes.HasPrefix(ln, authorSpace) { 402 p, t, err := c.parsePerson(ln[len(authorSpace):]) 403 if err != nil { 404 return fmt.Errorf("unrecognized author line %q: %v", ln, err) 405 } 406 gc.Author = p 407 gc.AuthorTime = t 408 return nil 409 } 410 if bytes.HasPrefix(ln, committerSpace) { 411 p, t, err := c.parsePerson(ln[len(committerSpace):]) 412 if err != nil { 413 return fmt.Errorf("unrecognized committer line %q: %v", ln, err) 414 } 415 gc.Committer = p 416 gc.CommitTime = t 417 return nil 418 } 419 if bytes.HasPrefix(ln, treeSpace) { 420 gc.Tree = c.gitHashFromHex(ln[len(treeSpace):]) 421 return nil 422 } 423 if bytes.HasPrefix(ln, golangHgSpace) { 424 if c.gitOfHg == nil { 425 c.gitOfHg = map[string]GitHash{} 426 } 427 c.gitOfHg[string(ln[len(golangHgSpace):])] = hash 428 return nil 429 } 430 if bytes.HasPrefix(ln, gpgSigSpace) || bytes.HasPrefix(ln, space) { 431 // Jessie Frazelle is a unique butterfly. 432 return nil 433 } 434 if bytes.HasPrefix(ln, encodingSpace) { 435 // Also ignore this. In practice this has only 436 // been seen to declare that a commit's 437 // metadata is utf-8 when the author name has 438 // non-ASCII. 439 return nil 440 } 441 log.Printf("in commit %s, unrecognized line %q", hash, ln) 442 return nil 443 }) 444 if err != nil { 445 log.Printf("Unparseable commit %q: %v", hash, err) 446 return nil, fmt.Errorf("Unparseable commit %q: %v", hash, err) 447 } 448 if ph, ok := c.gitCommit[hash]; ok { 449 // Update placeholder. 450 *ph = *gc 451 } else { 452 c.gitCommit[hash] = gc 453 } 454 if c.gitCommitTodo != nil { 455 delete(c.gitCommitTodo, hash) 456 } 457 if c.verbose { 458 now := time.Now() 459 if now.After(c.lastGitCount.Add(time.Second)) { 460 c.lastGitCount = now 461 log.Printf("Num git commits = %v", len(c.gitCommit)) 462 } 463 } 464 return gc, nil 465 } 466 467 // parsePerson parses an "author" or "committer" value from "git cat-file -p COMMIT" 468 // The values are like: 469 // 470 // Foo Bar <foobar@gmail.com> 1488624439 +0900 471 // 472 // c.mu must be held for writing. 473 func (c *Corpus) parsePerson(v []byte) (*GitPerson, time.Time, error) { 474 v = bytes.TrimSpace(v) 475 476 lastSpace := bytes.LastIndexByte(v, ' ') 477 if lastSpace < 0 { 478 return nil, time.Time{}, errors.New("failed to match person") 479 } 480 tz := v[lastSpace+1:] // "+0800" 481 v = v[:lastSpace] // now v is "Foo Bar <foobar@gmail.com> 1488624439" 482 483 lastSpace = bytes.LastIndexByte(v, ' ') 484 if lastSpace < 0 { 485 return nil, time.Time{}, errors.New("failed to match person") 486 } 487 unixTime := v[lastSpace+1:] 488 nameEmail := v[:lastSpace] // now v is "Foo Bar <foobar@gmail.com>" 489 490 ut, err := strconv.ParseInt(string(unixTime), 10, 64) 491 if err != nil { 492 return nil, time.Time{}, err 493 } 494 t := time.Unix(ut, 0).In(c.gitLocation(tz)) 495 496 p, ok := c.gitPeople[string(nameEmail)] 497 if !ok { 498 p = &GitPerson{Str: string(nameEmail)} 499 if c.gitPeople == nil { 500 c.gitPeople = map[string]*GitPerson{} 501 } 502 c.gitPeople[p.Str] = p 503 } 504 return p, t, nil 505 506 } 507 508 // GitCommit returns the provided git commit, or nil if it's unknown. 509 func (c *Corpus) GitCommit(hash string) *GitCommit { 510 if len(hash) != 40 { 511 // TODO: support prefix lookups. build a trie. But 512 // for now just avoid panicking in gitHashFromHexStr. 513 return nil 514 } 515 var buf [20]byte 516 _, err := decodeHexStr(buf[:], hash) 517 if err != nil { 518 return nil 519 } 520 return c.gitCommit[GitHash(buf[:])] 521 } 522 523 // v is like '[+-]hhmm' 524 // c.mu must be held for writing. 525 func (c *Corpus) gitLocation(v []byte) *time.Location { 526 if loc, ok := c.zoneCache[string(v)]; ok { 527 return loc 528 } 529 s := string(v) 530 h, _ := strconv.Atoi(s[1:3]) 531 m, _ := strconv.Atoi(s[3:5]) 532 east := 1 533 if v[0] == '-' { 534 east = -1 535 } 536 loc := time.FixedZone(s, east*(h*3600+m*60)) 537 if c.zoneCache == nil { 538 c.zoneCache = map[string]*time.Location{} 539 } 540 c.zoneCache[s] = loc 541 return loc 542 } 543 544 func decodeHexStr(dst []byte, src string) (int, error) { 545 if len(src)%2 == 1 { 546 return 0, hex.ErrLength 547 } 548 549 for i := 0; i < len(src)/2; i++ { 550 a, ok := fromHexChar(src[i*2]) 551 if !ok { 552 return 0, hex.InvalidByteError(src[i*2]) 553 } 554 b, ok := fromHexChar(src[i*2+1]) 555 if !ok { 556 return 0, hex.InvalidByteError(src[i*2+1]) 557 } 558 dst[i] = (a << 4) | b 559 } 560 561 return len(src) / 2, nil 562 } 563 564 // fromHexChar converts a hex character into its value and a success flag. 565 func fromHexChar(c byte) (byte, bool) { 566 switch { 567 case '0' <= c && c <= '9': 568 return c - '0', true 569 case 'a' <= c && c <= 'f': 570 return c - 'a' + 10, true 571 case 'A' <= c && c <= 'F': 572 return c - 'A' + 10, true 573 } 574 575 return 0, false 576 }