github.com/SupersunnySea/draft@v0.16.0/pkg/linguist/util.go (about) 1 package linguist 2 3 import ( 4 "bufio" 5 "bytes" 6 "log" 7 "path/filepath" 8 "regexp" 9 "strings" 10 11 "github.com/ghodss/yaml" 12 ) 13 14 var ( 15 vendorRE *regexp.Regexp 16 doxRE *regexp.Regexp 17 18 extensions = map[string][]string{} 19 filenames = map[string][]string{} 20 interpreters = map[string][]string{} 21 colors = map[string]string{} 22 23 shebangRE = regexp.MustCompile(`^#!\s*(\S+)(?:\s+(\S+))?.*`) 24 scriptVersionRE = regexp.MustCompile(`((?:\d+\.?)+)`) 25 ) 26 27 func init() { 28 var regexps []string 29 bytes := []byte(files["data/vendor.yml"]) 30 if err := yaml.Unmarshal(bytes, ®exps); err != nil { 31 log.Fatal(err) 32 return 33 } 34 vendorRE = regexp.MustCompile(strings.Join(regexps, "|")) 35 36 var moreregex []string 37 bytes = []byte(files["data/documentation.yml"]) 38 if err := yaml.Unmarshal(bytes, &moreregex); err != nil { 39 log.Fatal(err) 40 return 41 } 42 doxRE = regexp.MustCompile(strings.Join(moreregex, "|")) 43 44 type language struct { 45 Extensions []string `yaml:"extensions,omitempty"` 46 Filenames []string `yaml:"filenames,omitempty"` 47 Interpreters []string `yaml:"interpreters,omitempty"` 48 Color string `yaml:"color,omitempty"` 49 } 50 languages := map[string]*language{} 51 52 bytes = []byte(files["data/languages.yml"]) 53 if err := yaml.Unmarshal(bytes, &languages); err != nil { 54 log.Fatal(err) 55 } 56 57 for n, l := range languages { 58 for _, e := range l.Extensions { 59 extensions[e] = append(extensions[e], n) 60 } 61 for _, f := range l.Filenames { 62 filenames[f] = append(filenames[f], n) 63 } 64 for _, i := range l.Interpreters { 65 interpreters[i] = append(interpreters[i], n) 66 } 67 colors[n] = l.Color 68 } 69 } 70 71 // LanguageColor is a convenience function that returns the color associated 72 // with the language, in HTML Hex notation (e.g. 
"#123ABC") 73 // from the languages.yml file provided by https://github.com/github/linguist 74 // 75 // Returns the empty string if there is no associated color for the language. 76 func LanguageColor(language string) string { 77 if c, ok := colors[language]; ok { 78 return c 79 } 80 return "" 81 } 82 83 // LanguageByFilename attempts to determine the language of a source file based solely on 84 // common naming conventions and file extensions 85 // from the languages.yml file provided by https://github.com/github/linguist 86 // 87 // Returns the empty string in ambiguous or unrecognized cases. 88 func LanguageByFilename(filename string) string { 89 if l := filenames[filename]; len(l) == 1 { 90 return l[0] 91 } 92 ext := filepath.Ext(filename) 93 if ext != "" { 94 if l := extensions[ext]; len(l) == 1 { 95 return l[0] 96 } 97 } 98 return "" 99 } 100 101 // LanguageHints attempts to detect all possible languages of a source file based solely on 102 // common naming conventions and file extensions 103 // from the languages.yml file provided by https://github.com/github/linguist 104 // 105 // Intended to be used with LanguageByContents. 106 // 107 // May return an empty slice. 108 func LanguageHints(filename string) (hints []string) { 109 if l, ok := filenames[filename]; ok { 110 hints = append(hints, l...) 111 } 112 if ext := filepath.Ext(filename); ext != "" { 113 if l, ok := extensions[ext]; ok { 114 hints = append(hints, l...) 115 } 116 } 117 return hints 118 } 119 120 // LanguageByContents attempts to detect the language of a source file based on its 121 // contents and a slice of hints to the possible answer. 122 // 123 // Obtain hints with LanguageHints() 124 // 125 // Returns the empty string a language could not be determined. 
126 func LanguageByContents(contents []byte, hints []string) string { 127 interpreter := detectInterpreter(contents) 128 if interpreter != "" { 129 if l := interpreters[interpreter]; len(l) == 1 { 130 return l[0] 131 } 132 } 133 return Analyse(contents, hints) 134 } 135 136 func detectInterpreter(contents []byte) string { 137 scanner := bufio.NewScanner(bytes.NewReader(contents)) 138 scanner.Scan() 139 line := scanner.Text() 140 m := shebangRE.FindStringSubmatch(line) 141 if m == nil || len(m) != 3 { 142 return "" 143 } 144 base := filepath.Base(m[1]) 145 if base == "env" && m[2] != "" { 146 base = m[2] 147 } 148 // Strip suffixed version number. 149 return scriptVersionRE.ReplaceAllString(base, "") 150 } 151 152 // ShouldIgnoreFilename checks if filename should not be passed to LanguageByFilename. 153 // 154 // (this simply calls IsVendored and IsDocumentation) 155 func ShouldIgnoreFilename(filename string) bool { 156 vendored := IsVendored(filename) 157 documentation := IsDocumentation(filename) 158 return vendored || documentation 159 // return IsVendored(filename) || IsDocumentation(filename) 160 } 161 162 // ShouldIgnoreContents checks if contents should not be passed to LangugeByContents. 163 // 164 // (this simply calls IsBinary) 165 func ShouldIgnoreContents(contents []byte) bool { 166 return IsBinary(contents) 167 } 168 169 // IsVendored checks if path contains a filename commonly belonging to configuration files. 170 func IsVendored(path string) bool { 171 return vendorRE.MatchString(path) 172 } 173 174 // IsDocumentation checks if path contains a filename commonly belonging to documentation. 175 func IsDocumentation(path string) bool { 176 return doxRE.MatchString(path) 177 } 178 179 // IsBinary checks contents for known character escape codes which 180 // frequently show up in binary files but rarely (if ever) in text. 
//
// Use this check before using LanguageByContents to reduce likelihood
// of passing binary data into it which can cause inaccurate results.
func IsBinary(contents []byte) bool {
	// NOTE(tso): preliminary testing on this method of checking for binary
	// contents was promising, having fed a document consisting of all
	// utf-8 codepoints from 0000 to FFFF with satisfactory results. Thanks
	// to robpike.io/cmd/unicode:
	// ```
	// unicode -c $(seq 0 65535 | xargs printf "%04x ") | tr -d '\n' > unicode_test
	// ```
	//
	// However, the intentional presence of character escape codes to throw
	// this function off is entirely possible, as is, potentially, a binary
	// file consisting entirely of the 4 exceptions to the rule for the first
	// 512 bytes. It is also possible that more character escape codes need
	// to be added.
	//
	// Further analysis and real world testing of this is required.

	// Only the leading bytes are inspected.
	const sniffLen = 512
	head := contents
	if len(head) > sniffLen {
		head = head[:sniffLen]
	}

	for _, b := range head {
		switch {
		case b >= 32:
			// Not a control character.
		case b == 0, b == '\t', b == '\n', b == '\r':
			// The four tolerated control characters: NUL, tab, LF and CR.
		default:
			return true
		}
	}
	return false
}