github.com/zkry/enry@v1.6.3/classifier.go (about) 1 package enry 2 3 import ( 4 "math" 5 "sort" 6 7 "gopkg.in/src-d/enry.v1/internal/tokenizer" 8 ) 9 10 // Classifier is the interface in charge to detect the possible languages of the given content based on a set of 11 // candidates. Candidates is a map which can be used to assign weights to languages dynamically. 12 type Classifier interface { 13 Classify(content []byte, candidates map[string]float64) (languages []string) 14 } 15 16 type classifier struct { 17 languagesLogProbabilities map[string]float64 18 tokensLogProbabilities map[string]map[string]float64 19 tokensTotal float64 20 } 21 22 type scoredLanguage struct { 23 language string 24 score float64 25 } 26 27 // Classify returns a sorted slice of possible languages sorted by decreasing language's probability 28 func (c *classifier) Classify(content []byte, candidates map[string]float64) []string { 29 if len(content) == 0 { 30 return nil 31 } 32 33 var languages map[string]float64 34 if len(candidates) == 0 { 35 languages = c.knownLangs() 36 } else { 37 languages = make(map[string]float64, len(candidates)) 38 for candidate, weight := range candidates { 39 if lang, ok := GetLanguageByAlias(candidate); ok { 40 candidate = lang 41 } 42 43 languages[candidate] = weight 44 } 45 } 46 47 tokens := tokenizer.Tokenize(content) 48 scoredLangs := make([]*scoredLanguage, 0, len(languages)) 49 for language := range languages { 50 scoredLang := &scoredLanguage{ 51 language: language, 52 score: c.tokensLogProbability(tokens, language) + c.languagesLogProbabilities[language], 53 } 54 55 scoredLangs = append(scoredLangs, scoredLang) 56 } 57 58 return sortLanguagesByScore(scoredLangs) 59 } 60 61 func sortLanguagesByScore(scoredLangs []*scoredLanguage) []string { 62 sort.Stable(byScore(scoredLangs)) 63 sortedLanguages := make([]string, 0, len(scoredLangs)) 64 for _, scoredLang := range scoredLangs { 65 sortedLanguages = append(sortedLanguages, scoredLang.language) 66 } 67 68 return sortedLanguages 69 } 70 71 func (c *classifier) knownLangs() map[string]float64 { 72 langs := make(map[string]float64, len(c.languagesLogProbabilities)) 73 for lang := range c.languagesLogProbabilities { 74 langs[lang]++ 75 } 76 77 return langs 78 } 79 80 func (c *classifier) tokensLogProbability(tokens []string, language string) float64 { 81 var sum float64 82 for _, token := range tokens { 83 sum += c.tokenProbability(token, language) 84 } 85 86 return sum 87 } 88 89 func (c *classifier) tokenProbability(token, language string) float64 { 90 tokenProb, ok := c.tokensLogProbabilities[language][token] 91 if !ok { 92 tokenProb = math.Log(1.000000 / c.tokensTotal) 93 } 94 95 return tokenProb 96 } 97 98 type byScore []*scoredLanguage 99 100 func (b byScore) Len() int { return len(b) } 101 func (b byScore) Swap(i, j int) { b[i], b[j] = b[j], b[i] } 102 func (b byScore) Less(i, j int) bool { return b[j].score < b[i].score }