github.com/rohankumardubey/draft-classic@v0.16.0/pkg/linguist/analyse.go (about) 1 package linguist 2 3 import ( 4 "bytes" 5 "log" 6 "math" 7 8 "github.com/Azure/draft/pkg/linguist/data" 9 "github.com/Azure/draft/pkg/linguist/tokenizer" 10 "github.com/jbrukh/bayesian" 11 ) 12 13 var classifier *bayesian.Classifier 14 var classifierInitialized = false 15 16 // Gets the baysian.Classifier which has been trained on programming language 17 // samples from github.com/github/linguist after running the generator 18 // 19 // See also cmd/generate-classifier 20 func getClassifier() *bayesian.Classifier { 21 // NOTE(tso): this could probably go into an init() function instead 22 // but this lazy loading approach works, and it's conceivable that the 23 // analyse() function might not invoked in an actual runtime anyway 24 if !classifierInitialized { 25 d, err := data.Asset("classifier") 26 if err != nil { 27 log.Panicln(err) 28 } 29 reader := bytes.NewReader(d) 30 classifier, err = bayesian.NewClassifierFromReader(reader) 31 if err != nil { 32 log.Panicln(err) 33 } 34 classifierInitialized = true 35 } 36 return classifier 37 } 38 39 // Analyse returns the name of a programming language, or the empty string if one could 40 // not be determined. 41 // 42 // Uses Naive Bayesian Classification on the file contents provided. 43 // 44 // It is recommended to use LanguageByContents() instead of this function directly. 45 // 46 // Obtain hints from LanguageHints() 47 // 48 // NOTE(tso): May yield inaccurate results 49 func Analyse(contents []byte, hints []string) (language string) { 50 document := tokenizer.Tokenize(contents) 51 classifier := getClassifier() 52 scores, idx, _ := classifier.LogScores(document) 53 54 if len(hints) == 0 { 55 return string(classifier.Classes[idx]) 56 } 57 58 langs := map[string]struct{}{} 59 for _, hint := range hints { 60 langs[hint] = struct{}{} 61 } 62 63 bestScore := math.Inf(-1) 64 bestAnswer := "" 65 66 for id, score := range scores { 67 answer := string(classifier.Classes[id]) 68 if _, ok := langs[answer]; ok { 69 if score >= bestScore { 70 bestScore = score 71 bestAnswer = answer 72 } 73 } 74 } 75 return bestAnswer 76 }