github.com/rohankumardubey/go-enry@v1.7.3/internal/code-generator/generator/samplesfreq.go (about) 1 package generator 2 3 import ( 4 "bytes" 5 "fmt" 6 "io" 7 "io/ioutil" 8 "log" 9 "math" 10 "path/filepath" 11 "sort" 12 "strconv" 13 "text/template" 14 15 "gopkg.in/src-d/enry.v1/internal/tokenizer" 16 ) 17 18 type samplesFrequencies struct { 19 LanguageTotal int `json:"language_total,omitempty"` 20 Languages map[string]int `json:"languages,omitempty"` 21 TokensTotal int `json:"tokens_total,omitempty"` 22 Tokens map[string]map[string]int `json:"tokens,omitempty"` 23 LanguageTokens map[string]int `json:"language_tokens,omitempty"` 24 } 25 26 // Frequencies reads directories in samplesDir, retrieves information about frequencies of languages and tokens, and write 27 // the file outPath using tmplName as a template. It complies with type File signature. 28 func Frequencies(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit string) error { 29 freqs, err := getFrequencies(samplesDir) 30 if err != nil { 31 return err 32 } 33 34 buf := &bytes.Buffer{} 35 if err := executeFrequenciesTemplate(buf, freqs, tmplPath, tmplName, commit); err != nil { 36 return err 37 } 38 39 return formatedWrite(outPath, buf.Bytes()) 40 } 41 42 func getFrequencies(samplesDir string) (*samplesFrequencies, error) { 43 langDirs, err := ioutil.ReadDir(samplesDir) 44 if err != nil { 45 return nil, err 46 } 47 48 var languageTotal int 49 var languages = make(map[string]int) 50 var tokensTotal int 51 var tokens = make(map[string]map[string]int) 52 var languageTokens = make(map[string]int) 53 54 for _, langDir := range langDirs { 55 if !langDir.IsDir() { 56 continue 57 } 58 59 lang := langDir.Name() 60 samples, err := readSamples(filepath.Join(samplesDir, lang)) 61 if err != nil { 62 log.Println(err) 63 } 64 65 if len(samples) == 0 { 66 continue 67 } 68 69 samplesTokens, err := getTokens(samples) 70 if err != nil { 71 log.Println(err) 72 continue 73 } 74 75 languageTotal += len(samples) 76 languages[lang] = len(samples) 77 tokensTotal += len(samplesTokens) 78 languageTokens[lang] = len(samplesTokens) 79 tokens[lang] = make(map[string]int) 80 for _, token := range samplesTokens { 81 tokens[lang][token]++ 82 } 83 } 84 85 return &samplesFrequencies{ 86 TokensTotal: tokensTotal, 87 LanguageTotal: languageTotal, 88 Tokens: tokens, 89 LanguageTokens: languageTokens, 90 Languages: languages, 91 }, nil 92 } 93 94 func readSamples(samplesLangDir string) ([]string, error) { 95 const samplesLangFilesDir = "filenames" 96 sampleFiles, err := ioutil.ReadDir(samplesLangDir) 97 if err != nil { 98 return nil, err 99 } 100 101 var samples []string 102 for _, sampleFile := range sampleFiles { 103 filename := filepath.Join(samplesLangDir, sampleFile.Name()) 104 if sampleFile.Mode().IsRegular() { 105 samples = append(samples, filename) 106 continue 107 } 108 109 if sampleFile.IsDir() && sampleFile.Name() == samplesLangFilesDir { 110 subSamples, err := readSubSamples(filename) 111 if err != nil { 112 return nil, err 113 } 114 115 samples = append(samples, subSamples...) 116 } 117 118 } 119 120 return samples, nil 121 } 122 123 func readSubSamples(path string) ([]string, error) { 124 subSamples := []string{} 125 entries, err := ioutil.ReadDir(path) 126 if err != nil { 127 return nil, err 128 } 129 130 for _, entry := range entries { 131 if entry.Mode().IsRegular() { 132 subSamples = append(subSamples, filepath.Join(path, entry.Name())) 133 } 134 } 135 136 return subSamples, nil 137 } 138 139 func getTokens(samples []string) ([]string, error) { 140 tokens := make([]string, 0, 20) 141 var anyError error 142 for _, sample := range samples { 143 content, err := ioutil.ReadFile(sample) 144 if err != nil { 145 anyError = err 146 continue 147 } 148 149 t := tokenizer.Tokenize(content) 150 tokens = append(tokens, t...) 151 } 152 153 return tokens, anyError 154 } 155 156 func executeFrequenciesTemplate(out io.Writer, freqs *samplesFrequencies, tmplPath, tmplName, commit string) error { 157 fmap := template.FuncMap{ 158 "toFloat64": func(num int) string { return fmt.Sprintf("%f", float64(num)) }, 159 "orderKeys": func(m map[string]int) []string { 160 keys := make([]string, 0, len(m)) 161 for key := range m { 162 keys = append(keys, key) 163 } 164 165 sort.Strings(keys) 166 return keys 167 }, 168 "languageLogProbability": func(language string) string { 169 num := math.Log(float64(freqs.Languages[language]) / float64(freqs.LanguageTotal)) 170 return fmt.Sprintf("%f", num) 171 }, 172 "orderMapMapKeys": func(mm map[string]map[string]int) []string { 173 keys := make([]string, 0, len(mm)) 174 for key := range mm { 175 keys = append(keys, key) 176 } 177 178 sort.Strings(keys) 179 return keys 180 }, 181 "tokenLogProbability": func(language, token string) string { 182 num := math.Log(float64(freqs.Tokens[language][token]) / float64(freqs.LanguageTokens[language])) 183 return fmt.Sprintf("%f", num) 184 }, 185 "quote": strconv.Quote, 186 } 187 return executeTemplate(out, tmplName, tmplPath, commit, fmap, freqs) 188 }