github.com/xzntrc/go-enry/v2@v2.0.0-20230215091818-766cc1d65498/internal/code-generator/generator/frequencies.go (about) 1 package generator 2 3 import ( 4 "bytes" 5 "fmt" 6 "io" 7 "io/ioutil" 8 "log" 9 "math" 10 "os" 11 "path/filepath" 12 "sort" 13 "strconv" 14 "strings" 15 "text/template" 16 17 "github.com/go-enry/go-enry/v2/internal/tokenizer" 18 ) 19 20 type samplesFrequencies struct { 21 LanguageTotal int `json:"language_total,omitempty"` 22 Languages map[string]int `json:"languages,omitempty"` 23 TokensTotal int `json:"tokens_total,omitempty"` 24 Tokens map[string]map[string]int `json:"tokens,omitempty"` 25 LanguageTokens map[string]int `json:"language_tokens,omitempty"` 26 } 27 28 // Frequencies reads directories in samplesDir, retrieves information about frequencies of languages and tokens, and write 29 // the file outPath using tmplName as a template. It complies with type File signature. 30 func Frequencies(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit string) error { 31 freqs, err := getFrequencies(samplesDir) 32 if err != nil { 33 return err 34 } 35 36 if _, ok := os.LookupEnv("ENRY_DEBUG"); ok { 37 log.Printf("Total samples: %d\n", freqs.LanguageTotal) 38 log.Printf("Total tokens: %d\n", freqs.TokensTotal) 39 40 keys := make([]string, 0, len(freqs.Languages)) 41 for k := range freqs.Languages { 42 keys = append(keys, k) 43 } 44 sort.Strings(keys) 45 46 for _, k := range keys { 47 fmt.Printf(" %s: %d\n", k, freqs.Languages[k]) 48 } 49 } 50 51 buf := &bytes.Buffer{} 52 if err := executeFrequenciesTemplate(buf, freqs, tmplPath, tmplName, commit); err != nil { 53 return err 54 } 55 56 return formatedWrite(outPath, buf.Bytes()) 57 } 58 59 func getFrequencies(samplesDir string) (*samplesFrequencies, error) { 60 langDirs, err := ioutil.ReadDir(samplesDir) 61 if err != nil { 62 return nil, err 63 } 64 65 var languageTotal int 66 var languages = make(map[string]int) 67 var tokensTotal int 68 var tokens = make(map[string]map[string]int) 69 var languageTokens = make(map[string]int) 70 71 for _, langDir := range langDirs { 72 if !langDir.IsDir() { 73 continue 74 } 75 76 lang := langDir.Name() 77 samples, err := readSamples(filepath.Join(samplesDir, lang)) 78 if err != nil { 79 log.Println(err) 80 } 81 82 if len(samples) == 0 { 83 continue 84 } 85 86 samplesTokens, err := getTokens(samples) 87 if err != nil { 88 log.Println(err) 89 continue 90 } 91 92 languageTotal += len(samples) 93 languages[lang] = len(samples) 94 tokensTotal += len(samplesTokens) 95 languageTokens[lang] = len(samplesTokens) 96 tokens[lang] = make(map[string]int) 97 for _, token := range samplesTokens { 98 tokens[lang][token]++ 99 } 100 } 101 102 return &samplesFrequencies{ 103 TokensTotal: tokensTotal, 104 LanguageTotal: languageTotal, 105 Tokens: tokens, 106 LanguageTokens: languageTokens, 107 Languages: languages, 108 }, nil 109 } 110 111 // readSamples collects ./samples/ filenames from the Linguist codebase, skipping symlinks. 112 func readSamples(samplesLangDir string) ([]string, error) { 113 const specialSubDir = "filenames" 114 var samples []string 115 116 err := filepath.Walk(samplesLangDir, func(path string, info os.FileInfo, err error) error { 117 if err != nil { 118 fmt.Printf("failure accessing a path %q: %v\n", path, err) 119 return err 120 } 121 if info.IsDir() { 122 switch info.Name() { 123 case filepath.Base(samplesLangDir): 124 return nil 125 case specialSubDir: 126 return nil 127 default: 128 return filepath.SkipDir 129 } 130 } 131 // skip git file symlinks on win and *nix 132 if isKnownSymlinkInLinguist(path) || !info.Mode().IsRegular() { 133 return nil 134 } 135 samples = append(samples, path) 136 return nil 137 }) 138 139 return samples, err 140 } 141 142 // isKnownSymlinkInLinguist checks if the file name is on the list of known symlinks. 143 // On Windows, there is no symlink support in Git [1] and those become regular text files, 144 // so we have to skip these files manually, maintaining a list here :/ 145 // 1. https://github.com/git-for-windows/git/wiki/Symbolic-Links 146 // 147 // $ find -L .linguist/samples -xtype l 148 func isKnownSymlinkInLinguist(path string) bool { 149 return strings.HasSuffix(path, filepath.Join("Ant Build System", "filenames", "build.xml")) || 150 strings.HasSuffix(path, filepath.Join("Markdown", "symlink.md")) 151 } 152 153 func getTokens(samples []string) ([]string, error) { 154 tokens := make([]string, 0, 20) 155 var anyError error 156 for _, sample := range samples { 157 content, err := ioutil.ReadFile(sample) 158 if err != nil { 159 anyError = err 160 continue 161 } 162 163 t := tokenizer.Tokenize(content) 164 tokens = append(tokens, t...) 165 } 166 167 return tokens, anyError 168 } 169 170 func executeFrequenciesTemplate(out io.Writer, freqs *samplesFrequencies, tmplPath, tmplName, commit string) error { 171 fmap := template.FuncMap{ 172 "toFloat64": func(num int) string { return fmt.Sprintf("%f", float64(num)) }, 173 "orderKeys": func(m map[string]int) []string { 174 keys := make([]string, 0, len(m)) 175 for key := range m { 176 keys = append(keys, key) 177 } 178 179 sort.Strings(keys) 180 return keys 181 }, 182 "languageLogProbability": func(language string) string { 183 num := math.Log(float64(freqs.Languages[language]) / float64(freqs.LanguageTotal)) 184 return fmt.Sprintf("%f", num) 185 }, 186 "orderMapMapKeys": func(mm map[string]map[string]int) []string { 187 keys := make([]string, 0, len(mm)) 188 for key := range mm { 189 keys = append(keys, key) 190 } 191 192 sort.Strings(keys) 193 return keys 194 }, 195 "tokenLogProbability": func(language, token string) string { 196 num := math.Log(float64(freqs.Tokens[language][token]) / float64(freqs.LanguageTokens[language])) 197 return fmt.Sprintf("%f", num) 198 }, 199 "quote": strconv.Quote, 200 } 201 return executeTemplate(out, tmplName, tmplPath, commit, fmap, freqs) 202 }