github.com/rohankumardubey/go-enry@v1.7.3/classifier.go (about)

     1  package enry
     2  
     3  import (
     4  	"math"
     5  	"sort"
     6  
     7  	"gopkg.in/src-d/enry.v1/internal/tokenizer"
     8  )
     9  
// Classifier is the interface implemented by types that detect the possible languages of the given content, choosing
// among a set of candidates. Candidates is a map which can be used to assign weights to languages dynamically; how
// (or whether) those weights influence the result is up to the implementation.
type Classifier interface {
	// Classify takes the raw content and the candidate languages and returns language names as a slice.
	Classify(content []byte, candidates map[string]float64) (languages []string)
}
    15  
// classifier holds the data the Classify method scores languages with:
//
//   - languagesLogProbabilities is keyed by language name; its value is the starting score of that language.
//   - tokensLogProbabilities is keyed by language name and then by token; its value is added to the language's
//     score for every occurrence of that token in the content.
//   - tokensTotal is the denominator of the fallback value log(1/tokensTotal) used for a token that has no entry
//     for the language being scored.
type classifier struct {
	languagesLogProbabilities map[string]float64
	tokensLogProbabilities    map[string]map[string]float64
	tokensTotal               float64
}
    21  
// scoredLanguage pairs a language name with the score computed for it, so that languages can be ordered by score.
type scoredLanguage struct {
	language string  // language name
	score    float64 // score assigned to the language; higher scores sort first
}
    26  
    27  // Classify returns a sorted slice of possible languages sorted by decreasing language's probability
    28  func (c *classifier) Classify(content []byte, candidates map[string]float64) []string {
    29  
    30  	var languages map[string]float64
    31  	if len(candidates) == 0 {
    32  		languages = c.knownLangs()
    33  	} else {
    34  		languages = make(map[string]float64, len(candidates))
    35  		for candidate, weight := range candidates {
    36  			if lang, ok := GetLanguageByAlias(candidate); ok {
    37  				candidate = lang
    38  			}
    39  
    40  			languages[candidate] = weight
    41  		}
    42  	}
    43  
    44  	empty := len(content) == 0
    45  	scoredLangs := make([]*scoredLanguage, 0, len(languages))
    46  
    47  	var tokens []string
    48  	if !empty {
    49  		tokens = tokenizer.Tokenize(content)
    50  	}
    51  
    52  	for language := range languages {
    53  		score := c.languagesLogProbabilities[language]
    54  		if !empty {
    55  			score += c.tokensLogProbability(tokens, language)
    56  		}
    57  		scoredLangs = append(scoredLangs, &scoredLanguage{
    58  			language: language,
    59  			score:    score,
    60  		})
    61  	}
    62  
    63  	return sortLanguagesByScore(scoredLangs)
    64  }
    65  
    66  func sortLanguagesByScore(scoredLangs []*scoredLanguage) []string {
    67  	sort.Stable(byScore(scoredLangs))
    68  	sortedLanguages := make([]string, 0, len(scoredLangs))
    69  	for _, scoredLang := range scoredLangs {
    70  		sortedLanguages = append(sortedLanguages, scoredLang.language)
    71  	}
    72  
    73  	return sortedLanguages
    74  }
    75  
    76  func (c *classifier) knownLangs() map[string]float64 {
    77  	langs := make(map[string]float64, len(c.languagesLogProbabilities))
    78  	for lang := range c.languagesLogProbabilities {
    79  		langs[lang]++
    80  	}
    81  
    82  	return langs
    83  }
    84  
    85  func (c *classifier) tokensLogProbability(tokens []string, language string) float64 {
    86  	var sum float64
    87  	for _, token := range tokens {
    88  		sum += c.tokenProbability(token, language)
    89  	}
    90  
    91  	return sum
    92  }
    93  
    94  func (c *classifier) tokenProbability(token, language string) float64 {
    95  	tokenProb, ok := c.tokensLogProbabilities[language][token]
    96  	if !ok {
    97  		tokenProb = math.Log(1.000000 / c.tokensTotal)
    98  	}
    99  
   100  	return tokenProb
   101  }
   102  
   103  type byScore []*scoredLanguage
   104  
   105  func (b byScore) Len() int           { return len(b) }
   106  func (b byScore) Swap(i, j int)      { b[i], b[j] = b[j], b[i] }
   107  func (b byScore) Less(i, j int) bool { return b[j].score < b[i].score }