github.com/mcuadros/go-enry@v1.7.3/classifier.go (about) 1 package enry 2 3 import ( 4 "math" 5 "sort" 6 7 "gopkg.in/src-d/enry.v1/internal/tokenizer" 8 ) 9 10 // Classifier is the interface in charge to detect the possible languages of the given content based on a set of 11 // candidates. Candidates is a map which can be used to assign weights to languages dynamically. 12 type Classifier interface { 13 Classify(content []byte, candidates map[string]float64) (languages []string) 14 } 15 16 type classifier struct { 17 languagesLogProbabilities map[string]float64 18 tokensLogProbabilities map[string]map[string]float64 19 tokensTotal float64 20 } 21 22 type scoredLanguage struct { 23 language string 24 score float64 25 } 26 27 // Classify returns a sorted slice of possible languages sorted by decreasing language's probability 28 func (c *classifier) Classify(content []byte, candidates map[string]float64) []string { 29 30 var languages map[string]float64 31 if len(candidates) == 0 { 32 languages = c.knownLangs() 33 } else { 34 languages = make(map[string]float64, len(candidates)) 35 for candidate, weight := range candidates { 36 if lang, ok := GetLanguageByAlias(candidate); ok { 37 candidate = lang 38 } 39 40 languages[candidate] = weight 41 } 42 } 43 44 empty := len(content) == 0 45 scoredLangs := make([]*scoredLanguage, 0, len(languages)) 46 47 var tokens []string 48 if !empty { 49 tokens = tokenizer.Tokenize(content) 50 } 51 52 for language := range languages { 53 score := c.languagesLogProbabilities[language] 54 if !empty { 55 score += c.tokensLogProbability(tokens, language) 56 } 57 scoredLangs = append(scoredLangs, &scoredLanguage{ 58 language: language, 59 score: score, 60 }) 61 } 62 63 return sortLanguagesByScore(scoredLangs) 64 } 65 66 func sortLanguagesByScore(scoredLangs []*scoredLanguage) []string { 67 sort.Stable(byScore(scoredLangs)) 68 sortedLanguages := make([]string, 0, len(scoredLangs)) 69 for _, scoredLang := range scoredLangs { 70 sortedLanguages = append(sortedLanguages, scoredLang.language) 71 } 72 73 return sortedLanguages 74 } 75 76 func (c *classifier) knownLangs() map[string]float64 { 77 langs := make(map[string]float64, len(c.languagesLogProbabilities)) 78 for lang := range c.languagesLogProbabilities { 79 langs[lang]++ 80 } 81 82 return langs 83 } 84 85 func (c *classifier) tokensLogProbability(tokens []string, language string) float64 { 86 var sum float64 87 for _, token := range tokens { 88 sum += c.tokenProbability(token, language) 89 } 90 91 return sum 92 } 93 94 func (c *classifier) tokenProbability(token, language string) float64 { 95 tokenProb, ok := c.tokensLogProbabilities[language][token] 96 if !ok { 97 tokenProb = math.Log(1.000000 / c.tokensTotal) 98 } 99 100 return tokenProb 101 } 102 103 type byScore []*scoredLanguage 104 105 func (b byScore) Len() int { return len(b) } 106 func (b byScore) Swap(i, j int) { b[i], b[j] = b[j], b[i] } 107 func (b byScore) Less(i, j int) bool { return b[j].score < b[i].score }