github.com/zkry/enry@v1.6.3/internal/code-generator/generator/samplesfreq.go (about) 1 package generator 2 3 import ( 4 "bytes" 5 "fmt" 6 "io" 7 "io/ioutil" 8 "log" 9 "math" 10 "os" 11 "path/filepath" 12 "sort" 13 "strconv" 14 "text/template" 15 16 "gopkg.in/src-d/enry.v1/internal/tokenizer" 17 ) 18 19 type samplesFrequencies struct { 20 LanguageTotal int `json:"language_total,omitempty"` 21 Languages map[string]int `json:"languages,omitempty"` 22 TokensTotal int `json:"tokens_total,omitempty"` 23 Tokens map[string]map[string]int `json:"tokens,omitempty"` 24 LanguageTokens map[string]int `json:"language_tokens,omitempty"` 25 } 26 27 // Frequencies reads directories in samplesDir, retrieves information about frequencies of languages and tokens, and write 28 // the file outPath using tmplName as a template. It complies with type File signature. 29 func Frequencies(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit string) error { 30 freqs, err := getFrequencies(samplesDir) 31 if err != nil { 32 return err 33 } 34 35 buf := &bytes.Buffer{} 36 if err := executeFrequenciesTemplate(buf, freqs, tmplPath, tmplName, commit); err != nil { 37 return err 38 } 39 40 return formatedWrite(outPath, buf.Bytes()) 41 } 42 43 func getFrequencies(samplesDir string) (*samplesFrequencies, error) { 44 entries, err := ioutil.ReadDir(samplesDir) 45 if err != nil { 46 return nil, err 47 } 48 49 var languageTotal int 50 var languages = make(map[string]int) 51 var tokensTotal int 52 var tokens = make(map[string]map[string]int) 53 var languageTokens = make(map[string]int) 54 55 for _, entry := range entries { 56 if !entry.IsDir() { 57 log.Println(err) 58 continue 59 } 60 61 samples, err := getSamples(samplesDir, entry) 62 if err != nil { 63 log.Println(err) 64 } 65 66 if len(samples) == 0 { 67 continue 68 } 69 70 samplesTokens, err := getTokens(samples) 71 if err != nil { 72 log.Println(err) 73 continue 74 } 75 76 lang := entry.Name() 77 languageTotal += len(samples) 78 languages[lang] = len(samples) 79 tokensTotal += len(samplesTokens) 80 languageTokens[lang] = len(samplesTokens) 81 tokens[lang] = make(map[string]int) 82 for _, token := range samplesTokens { 83 tokens[lang][token]++ 84 } 85 } 86 87 return &samplesFrequencies{ 88 TokensTotal: tokensTotal, 89 LanguageTotal: languageTotal, 90 Tokens: tokens, 91 LanguageTokens: languageTokens, 92 Languages: languages, 93 }, nil 94 } 95 96 func getSamples(samplesDir string, langDir os.FileInfo) ([]string, error) { 97 const samplesSubDir = "filenames" 98 samples := []string{} 99 path := filepath.Join(samplesDir, langDir.Name()) 100 entries, err := ioutil.ReadDir(path) 101 if err != nil { 102 return nil, err 103 } 104 105 for _, entry := range entries { 106 if entry.Mode().IsRegular() { 107 samples = append(samples, filepath.Join(path, entry.Name())) 108 } 109 110 if entry.IsDir() && entry.Name() == samplesSubDir { 111 subSamples, err := getSubSamples(samplesDir, langDir.Name(), entry) 112 if err != nil { 113 return nil, err 114 } 115 116 samples = append(samples, subSamples...) 117 } 118 119 } 120 121 return samples, nil 122 } 123 124 func getSubSamples(samplesDir, langDir string, subLangDir os.FileInfo) ([]string, error) { 125 subSamples := []string{} 126 path := filepath.Join(samplesDir, langDir, subLangDir.Name()) 127 entries, err := ioutil.ReadDir(path) 128 if err != nil { 129 return nil, err 130 } 131 132 for _, entry := range entries { 133 if entry.Mode().IsRegular() { 134 subSamples = append(subSamples, filepath.Join(path, entry.Name())) 135 } 136 } 137 138 return subSamples, nil 139 } 140 141 func getTokens(samples []string) ([]string, error) { 142 tokens := make([]string, 0, 20) 143 var anyError error 144 for _, sample := range samples { 145 content, err := ioutil.ReadFile(sample) 146 if err != nil { 147 anyError = err 148 continue 149 } 150 151 t := tokenizer.Tokenize(content) 152 tokens = append(tokens, t...) 153 } 154 155 return tokens, anyError 156 } 157 158 func executeFrequenciesTemplate(out io.Writer, freqs *samplesFrequencies, tmplPath, tmplName, commit string) error { 159 fmap := template.FuncMap{ 160 "getCommit": func() string { return commit }, 161 "toFloat64": func(num int) string { return fmt.Sprintf("%f", float64(num)) }, 162 "orderKeys": func(m map[string]int) []string { 163 keys := make([]string, 0, len(m)) 164 for key := range m { 165 keys = append(keys, key) 166 } 167 168 sort.Strings(keys) 169 return keys 170 }, 171 "languageLogProbability": func(language string) string { 172 num := math.Log(float64(freqs.Languages[language]) / float64(freqs.LanguageTotal)) 173 return fmt.Sprintf("%f", num) 174 }, 175 "orderMapMapKeys": func(mm map[string]map[string]int) []string { 176 keys := make([]string, 0, len(mm)) 177 for key := range mm { 178 keys = append(keys, key) 179 } 180 181 sort.Strings(keys) 182 return keys 183 }, 184 "tokenLogProbability": func(language, token string) string { 185 num := math.Log(float64(freqs.Tokens[language][token]) / float64(freqs.LanguageTokens[language])) 186 return fmt.Sprintf("%f", num) 187 }, 188 "quote": strconv.Quote, 189 } 190 191 t := template.Must(template.New(tmplName).Funcs(fmap).ParseFiles(tmplPath)) 192 if err := t.Execute(out, freqs); err != nil { 193 return err 194 } 195 196 return nil 197 }