github.com/rohankumardubey/draft-classic@v0.16.0/pkg/linguist/analyse.go (about)

     1  package linguist
     2  
     3  import (
     4  	"bytes"
     5  	"log"
     6  	"math"
     7  
     8  	"github.com/Azure/draft/pkg/linguist/data"
     9  	"github.com/Azure/draft/pkg/linguist/tokenizer"
    10  	"github.com/jbrukh/bayesian"
    11  )
    12  
    13  var classifier *bayesian.Classifier
    14  var classifierInitialized = false
    15  
    16  // Gets the baysian.Classifier which has been trained on programming language
    17  // samples from github.com/github/linguist after running the generator
    18  //
    19  // See also cmd/generate-classifier
    20  func getClassifier() *bayesian.Classifier {
    21  	// NOTE(tso): this could probably go into an init() function instead
    22  	// but this lazy loading approach works, and it's conceivable that the
    23  	// analyse() function might not invoked in an actual runtime anyway
    24  	if !classifierInitialized {
    25  		d, err := data.Asset("classifier")
    26  		if err != nil {
    27  			log.Panicln(err)
    28  		}
    29  		reader := bytes.NewReader(d)
    30  		classifier, err = bayesian.NewClassifierFromReader(reader)
    31  		if err != nil {
    32  			log.Panicln(err)
    33  		}
    34  		classifierInitialized = true
    35  	}
    36  	return classifier
    37  }
    38  
    39  // Analyse returns the name of a programming language, or the empty string if one could
    40  // not be determined.
    41  //
    42  // Uses Naive Bayesian Classification on the file contents provided.
    43  //
    44  // It is recommended to use LanguageByContents() instead of this function directly.
    45  //
    46  // Obtain hints from LanguageHints()
    47  //
    48  // NOTE(tso): May yield inaccurate results
    49  func Analyse(contents []byte, hints []string) (language string) {
    50  	document := tokenizer.Tokenize(contents)
    51  	classifier := getClassifier()
    52  	scores, idx, _ := classifier.LogScores(document)
    53  
    54  	if len(hints) == 0 {
    55  		return string(classifier.Classes[idx])
    56  	}
    57  
    58  	langs := map[string]struct{}{}
    59  	for _, hint := range hints {
    60  		langs[hint] = struct{}{}
    61  	}
    62  
    63  	bestScore := math.Inf(-1)
    64  	bestAnswer := ""
    65  
    66  	for id, score := range scores {
    67  		answer := string(classifier.Classes[id])
    68  		if _, ok := langs[answer]; ok {
    69  			if score >= bestScore {
    70  				bestScore = score
    71  				bestAnswer = answer
    72  			}
    73  		}
    74  	}
    75  	return bestAnswer
    76  }