github.com/zkry/enry@v1.6.3/classifier.go (about)

     1  package enry
     2  
     3  import (
     4  	"math"
     5  	"sort"
     6  
     7  	"gopkg.in/src-d/enry.v1/internal/tokenizer"
     8  )
     9  
    10  // Classifier is the interface in charge to detect the possible languages of the given content based on a set of
    11  // candidates. Candidates is a map which can be used to assign weights to languages dynamically.
    12  type Classifier interface {
    13  	Classify(content []byte, candidates map[string]float64) (languages []string)
    14  }
    15  
    16  type classifier struct {
    17  	languagesLogProbabilities map[string]float64
    18  	tokensLogProbabilities    map[string]map[string]float64
    19  	tokensTotal               float64
    20  }
    21  
    22  type scoredLanguage struct {
    23  	language string
    24  	score    float64
    25  }
    26  
    27  // Classify returns a sorted slice of possible languages sorted by decreasing language's probability
    28  func (c *classifier) Classify(content []byte, candidates map[string]float64) []string {
    29  	if len(content) == 0 {
    30  		return nil
    31  	}
    32  
    33  	var languages map[string]float64
    34  	if len(candidates) == 0 {
    35  		languages = c.knownLangs()
    36  	} else {
    37  		languages = make(map[string]float64, len(candidates))
    38  		for candidate, weight := range candidates {
    39  			if lang, ok := GetLanguageByAlias(candidate); ok {
    40  				candidate = lang
    41  			}
    42  
    43  			languages[candidate] = weight
    44  		}
    45  	}
    46  
    47  	tokens := tokenizer.Tokenize(content)
    48  	scoredLangs := make([]*scoredLanguage, 0, len(languages))
    49  	for language := range languages {
    50  		scoredLang := &scoredLanguage{
    51  			language: language,
    52  			score:    c.tokensLogProbability(tokens, language) + c.languagesLogProbabilities[language],
    53  		}
    54  
    55  		scoredLangs = append(scoredLangs, scoredLang)
    56  	}
    57  
    58  	return sortLanguagesByScore(scoredLangs)
    59  }
    60  
    61  func sortLanguagesByScore(scoredLangs []*scoredLanguage) []string {
    62  	sort.Stable(byScore(scoredLangs))
    63  	sortedLanguages := make([]string, 0, len(scoredLangs))
    64  	for _, scoredLang := range scoredLangs {
    65  		sortedLanguages = append(sortedLanguages, scoredLang.language)
    66  	}
    67  
    68  	return sortedLanguages
    69  }
    70  
    71  func (c *classifier) knownLangs() map[string]float64 {
    72  	langs := make(map[string]float64, len(c.languagesLogProbabilities))
    73  	for lang := range c.languagesLogProbabilities {
    74  		langs[lang]++
    75  	}
    76  
    77  	return langs
    78  }
    79  
    80  func (c *classifier) tokensLogProbability(tokens []string, language string) float64 {
    81  	var sum float64
    82  	for _, token := range tokens {
    83  		sum += c.tokenProbability(token, language)
    84  	}
    85  
    86  	return sum
    87  }
    88  
    89  func (c *classifier) tokenProbability(token, language string) float64 {
    90  	tokenProb, ok := c.tokensLogProbabilities[language][token]
    91  	if !ok {
    92  		tokenProb = math.Log(1.000000 / c.tokensTotal)
    93  	}
    94  
    95  	return tokenProb
    96  }
    97  
    98  type byScore []*scoredLanguage
    99  
   100  func (b byScore) Len() int           { return len(b) }
   101  func (b byScore) Swap(i, j int)      { b[i], b[j] = b[j], b[i] }
   102  func (b byScore) Less(i, j int) bool { return b[j].score < b[i].score }