github.com/soypat/gitaligned@v0.3.4-0.20221228122414-e435aab44fbc/nlp.go (about) 1 package main 2 3 import ( 4 "strings" 5 6 "github.com/jdkato/prose/v2" 7 ) 8 9 var replacecommits = strings.NewReplacer(".", " ", "(", " ", ")", " ", ":", " ", ",", " ", 10 "[", " ", "]", " ", "\\", " ", `"`, " ", "'", " ", "!", " ", ";", " ", "?", " ", 11 "/", " ", "<", " ", ">", " ") 12 13 func tokenizeCommits(commits []commit) ([]prose.Token, error) { 14 var err error 15 if len(commits) == 0 { 16 panic("expected non-nil/non-zero number of commits") 17 } 18 19 var doc *prose.Document 20 var allCommits = &strings.Builder{} 21 cap := allCommits.Cap() 22 if cap < len(commits)*20 { 23 allCommits.Grow(len(commits)*20 - cap) 24 } 25 26 for i := range commits { 27 msg := replacecommits.Replace(commits[i].Message) + " . " 28 allCommits.WriteString(msg) 29 } 30 // allstr :=allCommits.String() // debugging purposes 31 32 doc, err = prose.NewDocument(allCommits.String(), 33 prose.WithExtraction(false), prose.WithSegmentation(false), prose.WithTokenization(false)) 34 return doc.Tokens(), err 35 } 36 37 // walkCommits is SLOW. This is because it processes all commit messages into one 38 // 39 func walkCommits(commits []commit, f func(*commit, []prose.Token)) error { 40 tokens, err := tokenizeCommits(commits) 41 if err != nil { 42 return err 43 } 44 atCommit := 0 45 last := -1 46 for i := range tokens { 47 if tokens[i].Tag == "." { 48 f(&commits[atCommit], tokens[last+1:i]) 49 last = i 50 atCommit++ 51 } 52 } 53 return nil 54 } 55 56 func min(a, b int) int { 57 if a < b { 58 return a 59 } 60 return b 61 } 62 63 func max(a, b int) int { 64 if a > b { 65 return a 66 } 67 return b 68 } 69 70 func spaces(n int) string { 71 const spaces32 = " " 72 if n < 32 { 73 return spaces32[:n] 74 } 75 var res string 76 for i := 0; i < n/32; i++ { 77 res += spaces32 78 } 79 return res + spaces32[:n%32] 80 }