github.com/dennwc/enry@v1.6.4-0.20180424151738-42391b8e105b/common.go (about) 1 package enry 2 3 import ( 4 "bufio" 5 "bytes" 6 "path/filepath" 7 "regexp" 8 "strings" 9 10 "gopkg.in/src-d/enry.v1/data" 11 ) 12 13 // OtherLanguage is used as a zero value when a function can not return a specific language. 14 const OtherLanguage = "" 15 16 // Strategy type fix the signature for the functions that can be used as a strategy. 17 type Strategy func(filename string, content []byte, candidates []string) (languages []string) 18 19 // DefaultStrategies is the strategies' sequence GetLanguage uses to detect languages. 20 var DefaultStrategies = []Strategy{ 21 GetLanguagesByModeline, 22 GetLanguagesByFilename, 23 GetLanguagesByShebang, 24 GetLanguagesByExtension, 25 GetLanguagesByContent, 26 GetLanguagesByClassifier, 27 } 28 29 var DefaultClassifier Classifier = &classifier{ 30 languagesLogProbabilities: data.LanguagesLogProbabilities, 31 tokensLogProbabilities: data.TokensLogProbabilities, 32 tokensTotal: data.TokensTotal, 33 } 34 35 // GetLanguage applies a sequence of strategies based on the given filename and content 36 // to find out the most probably language to return. 37 func GetLanguage(filename string, content []byte) (language string) { 38 languages := GetLanguages(filename, content) 39 return firstLanguage(languages) 40 } 41 42 func firstLanguage(languages []string) string { 43 if len(languages) == 0 { 44 return OtherLanguage 45 } 46 47 return languages[0] 48 } 49 50 // GetLanguageByModeline returns detected language. If there are more than one possibles languages 51 // it returns the first language by alphabetically order and safe to false. 52 func GetLanguageByModeline(content []byte) (language string, safe bool) { 53 return getLanguageByStrategy(GetLanguagesByModeline, "", content, nil) 54 } 55 56 // GetLanguageByEmacsModeline returns detected language. If there are more than one possibles languages 57 // it returns the first language by alphabetically order and safe to false. 58 func GetLanguageByEmacsModeline(content []byte) (language string, safe bool) { 59 return getLanguageByStrategy(GetLanguagesByEmacsModeline, "", content, nil) 60 } 61 62 // GetLanguageByVimModeline returns detected language. If there are more than one possibles languages 63 // it returns the first language by alphabetically order and safe to false. 64 func GetLanguageByVimModeline(content []byte) (language string, safe bool) { 65 return getLanguageByStrategy(GetLanguagesByVimModeline, "", content, nil) 66 } 67 68 // GetLanguageByFilename returns detected language. If there are more than one possibles languages 69 // it returns the first language by alphabetically order and safe to false. 70 func GetLanguageByFilename(filename string) (language string, safe bool) { 71 return getLanguageByStrategy(GetLanguagesByFilename, filename, nil, nil) 72 } 73 74 // GetLanguageByShebang returns detected language. If there are more than one possibles languages 75 // it returns the first language by alphabetically order and safe to false. 76 func GetLanguageByShebang(content []byte) (language string, safe bool) { 77 return getLanguageByStrategy(GetLanguagesByShebang, "", content, nil) 78 } 79 80 // GetLanguageByExtension returns detected language. If there are more than one possibles languages 81 // it returns the first language by alphabetically order and safe to false. 82 func GetLanguageByExtension(filename string) (language string, safe bool) { 83 return getLanguageByStrategy(GetLanguagesByExtension, filename, nil, nil) 84 } 85 86 // GetLanguageByContent returns detected language. If there are more than one possibles languages 87 // it returns the first language by alphabetically order and safe to false. 88 func GetLanguageByContent(filename string, content []byte) (language string, safe bool) { 89 return getLanguageByStrategy(GetLanguagesByContent, filename, content, nil) 90 } 91 92 // GetLanguageByClassifier returns the most probably language detected for the given content. It uses 93 // DefaultClassifier, if no candidates are provided it returns OtherLanguage. 94 func GetLanguageByClassifier(content []byte, candidates []string) (language string, safe bool) { 95 return getLanguageByStrategy(GetLanguagesByClassifier, "", content, candidates) 96 } 97 98 func getLanguageByStrategy(strategy Strategy, filename string, content []byte, candidates []string) (string, bool) { 99 languages := strategy(filename, content, candidates) 100 return getFirstLanguageAndSafe(languages) 101 } 102 103 func getFirstLanguageAndSafe(languages []string) (language string, safe bool) { 104 language = firstLanguage(languages) 105 safe = len(languages) == 1 106 return 107 } 108 109 // GetLanguageBySpecificClassifier returns the most probably language for the given content using 110 // classifier to detect language. 111 func GetLanguageBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (language string, safe bool) { 112 languages := GetLanguagesBySpecificClassifier(content, candidates, classifier) 113 return getFirstLanguageAndSafe(languages) 114 } 115 116 // GetLanguages applies a sequence of strategies based on the given filename and content 117 // to find out the most probably languages to return. 118 func GetLanguages(filename string, content []byte) []string { 119 if IsBinary(content) { 120 return nil 121 } 122 123 var languages []string 124 candidates := []string{} 125 for _, strategy := range DefaultStrategies { 126 languages = strategy(filename, content, candidates) 127 if len(languages) == 1 { 128 return languages 129 } 130 131 if len(languages) > 0 { 132 candidates = append(candidates, languages...) 133 } 134 } 135 136 return languages 137 } 138 139 // GetLanguagesByModeline returns a slice of possible languages for the given content. 140 // It complies with the signature to be a Strategy type. 141 func GetLanguagesByModeline(_ string, content []byte, candidates []string) []string { 142 headFoot := getHeaderAndFooter(content) 143 var languages []string 144 for _, getLang := range modelinesFunc { 145 languages = getLang("", headFoot, candidates) 146 if len(languages) > 0 { 147 break 148 } 149 } 150 151 return languages 152 } 153 154 var modelinesFunc = []Strategy{ 155 GetLanguagesByEmacsModeline, 156 GetLanguagesByVimModeline, 157 } 158 159 func getHeaderAndFooter(content []byte) []byte { 160 const searchScope = 5 161 162 if len(content) == 0 { 163 return content 164 } 165 166 if bytes.Count(content, []byte("\n")) < 2*searchScope { 167 return content 168 } 169 170 header := headScope(content, searchScope) 171 footer := footScope(content, searchScope) 172 headerAndFooter := make([]byte, 0, len(content[:header])+len(content[footer:])) 173 headerAndFooter = append(headerAndFooter, content[:header]...) 174 headerAndFooter = append(headerAndFooter, content[footer:]...) 175 return headerAndFooter 176 } 177 178 func headScope(content []byte, scope int) (index int) { 179 for i := 0; i < scope; i++ { 180 eol := bytes.IndexAny(content, "\n") 181 content = content[eol+1:] 182 index += eol 183 } 184 185 return index + scope - 1 186 } 187 188 func footScope(content []byte, scope int) (index int) { 189 for i := 0; i < scope; i++ { 190 index = bytes.LastIndexAny(content, "\n") 191 content = content[:index] 192 } 193 194 return index + 1 195 } 196 197 var ( 198 reEmacsModeline = regexp.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`) 199 reEmacsLang = regexp.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`) 200 reVimModeline = regexp.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`) 201 reVimLang = regexp.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`) 202 ) 203 204 // GetLanguagesByEmacsModeline returns a slice of possible languages for the given content. 205 // It complies with the signature to be a Strategy type. 206 func GetLanguagesByEmacsModeline(_ string, content []byte, _ []string) []string { 207 matched := reEmacsModeline.FindAllSubmatch(content, -1) 208 if matched == nil { 209 return nil 210 } 211 212 // only take the last matched line, discard previous lines 213 lastLineMatched := matched[len(matched)-1][1] 214 matchedAlias := reEmacsLang.FindSubmatch(lastLineMatched) 215 var alias string 216 if matchedAlias != nil { 217 alias = string(matchedAlias[1]) 218 } else { 219 alias = string(lastLineMatched) 220 } 221 222 language, ok := GetLanguageByAlias(alias) 223 if !ok { 224 return nil 225 } 226 227 return []string{language} 228 } 229 230 // GetLanguagesByVimModeline returns a slice of possible languages for the given content. 231 // It complies with the signature to be a Strategy type. 232 func GetLanguagesByVimModeline(_ string, content []byte, _ []string) []string { 233 matched := reVimModeline.FindAllSubmatch(content, -1) 234 if matched == nil { 235 return nil 236 } 237 238 // only take the last matched line, discard previous lines 239 lastLineMatched := matched[len(matched)-1][1] 240 matchedAlias := reVimLang.FindAllSubmatch(lastLineMatched, -1) 241 if matchedAlias == nil { 242 return nil 243 } 244 245 alias := string(matchedAlias[0][1]) 246 if len(matchedAlias) > 1 { 247 // cases: 248 // matchedAlias = [["syntax=ruby " "ruby"] ["ft=python " "python"] ["filetype=perl " "perl"]] returns OtherLanguage; 249 // matchedAlias = [["syntax=python " "python"] ["ft=python " "python"] ["filetype=python " "python"]] returns "Python"; 250 for _, match := range matchedAlias { 251 otherAlias := string(match[1]) 252 if otherAlias != alias { 253 return nil 254 } 255 } 256 } 257 258 language, ok := GetLanguageByAlias(alias) 259 if !ok { 260 return nil 261 } 262 263 return []string{language} 264 } 265 266 // GetLanguagesByFilename returns a slice of possible languages for the given filename. 267 // It complies with the signature to be a Strategy type. 268 func GetLanguagesByFilename(filename string, _ []byte, _ []string) []string { 269 if filename == "" { 270 return nil 271 } 272 273 return data.LanguagesByFilename[filepath.Base(filename)] 274 } 275 276 // GetLanguagesByShebang returns a slice of possible languages for the given content. 277 // It complies with the signature to be a Strategy type. 278 func GetLanguagesByShebang(_ string, content []byte, _ []string) (languages []string) { 279 interpreter := getInterpreter(content) 280 return data.LanguagesByInterpreter[interpreter] 281 } 282 283 var ( 284 shebangExecHack = regexp.MustCompile(`exec (\w+).+\$0.+\$@`) 285 pythonVersion = regexp.MustCompile(`python\d\.\d+`) 286 ) 287 288 func getInterpreter(data []byte) (interpreter string) { 289 line := getFirstLine(data) 290 if !hasShebang(line) { 291 return "" 292 } 293 294 // skip shebang 295 line = bytes.TrimSpace(line[2:]) 296 splitted := bytes.Fields(line) 297 if len(splitted) == 0 { 298 return "" 299 } 300 301 if bytes.Contains(splitted[0], []byte("env")) { 302 if len(splitted) > 1 { 303 interpreter = string(splitted[1]) 304 } 305 } else { 306 splittedPath := bytes.Split(splitted[0], []byte{'/'}) 307 interpreter = string(splittedPath[len(splittedPath)-1]) 308 } 309 310 if interpreter == "sh" { 311 interpreter = lookForMultilineExec(data) 312 } 313 314 if pythonVersion.MatchString(interpreter) { 315 interpreter = interpreter[:strings.Index(interpreter, `.`)] 316 } 317 318 return 319 } 320 321 func getFirstLine(data []byte) []byte { 322 buf := bufio.NewScanner(bytes.NewReader(data)) 323 buf.Scan() 324 line := buf.Bytes() 325 if err := buf.Err(); err != nil { 326 return nil 327 } 328 329 return line 330 } 331 332 func hasShebang(line []byte) bool { 333 const shebang = `#!` 334 prefix := []byte(shebang) 335 return bytes.HasPrefix(line, prefix) 336 } 337 338 func lookForMultilineExec(data []byte) string { 339 const magicNumOfLines = 5 340 interpreter := "sh" 341 342 buf := bufio.NewScanner(bytes.NewReader(data)) 343 for i := 0; i < magicNumOfLines && buf.Scan(); i++ { 344 line := buf.Bytes() 345 if shebangExecHack.Match(line) { 346 interpreter = shebangExecHack.FindStringSubmatch(string(line))[1] 347 break 348 } 349 } 350 351 if err := buf.Err(); err != nil { 352 return interpreter 353 } 354 355 return interpreter 356 } 357 358 // GetLanguagesByExtension returns a slice of possible languages for the given filename. 359 // It complies with the signature to be a Strategy type. 360 func GetLanguagesByExtension(filename string, _ []byte, _ []string) []string { 361 if !strings.Contains(filename, ".") { 362 return nil 363 } 364 365 filename = strings.ToLower(filename) 366 dots := getDotIndexes(filename) 367 for _, dot := range dots { 368 ext := filename[dot:] 369 languages, ok := data.LanguagesByExtension[ext] 370 if ok { 371 return languages 372 } 373 } 374 375 return nil 376 } 377 378 func getDotIndexes(filename string) []int { 379 dots := make([]int, 0, 2) 380 for i, letter := range filename { 381 if letter == rune('.') { 382 dots = append(dots, i) 383 } 384 } 385 386 return dots 387 } 388 389 // GetLanguagesByContent returns a slice of possible languages for the given content. 390 // It complies with the signature to be a Strategy type. 391 func GetLanguagesByContent(filename string, content []byte, _ []string) []string { 392 if filename == "" { 393 return nil 394 } 395 396 ext := strings.ToLower(filepath.Ext(filename)) 397 fnMatcher, ok := data.ContentMatchers[ext] 398 if !ok { 399 return nil 400 } 401 402 return fnMatcher(content) 403 } 404 405 // GetLanguagesByClassifier uses DefaultClassifier as a Classifier and returns a sorted slice of possible languages ordered by 406 // decreasing language's probability. If there are not candidates it returns nil. It complies with the signature to be a Strategy type. 407 func GetLanguagesByClassifier(filename string, content []byte, candidates []string) (languages []string) { 408 if len(candidates) == 0 { 409 return nil 410 } 411 412 return GetLanguagesBySpecificClassifier(content, candidates, DefaultClassifier) 413 } 414 415 // GetLanguagesBySpecificClassifier returns a slice of possible languages. It takes in a Classifier to be used. 416 func GetLanguagesBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (languages []string) { 417 mapCandidates := make(map[string]float64) 418 for _, candidate := range candidates { 419 mapCandidates[candidate]++ 420 } 421 422 return classifier.Classify(content, mapCandidates) 423 } 424 425 // GetLanguageExtensions returns the different extensions being used by the language. 426 func GetLanguageExtensions(language string) []string { 427 return data.ExtensionsByLanguage[language] 428 } 429 430 // Type represent language's type. Either data, programming, markup, prose, or unknown. 431 type Type int 432 433 // Type's values. 434 const ( 435 Unknown Type = iota 436 Data 437 Programming 438 Markup 439 Prose 440 ) 441 442 // GetLanguageType returns the type of the given language. 443 func GetLanguageType(language string) (langType Type) { 444 intType, ok := data.LanguagesType[language] 445 langType = Type(intType) 446 if !ok { 447 langType = Unknown 448 } 449 return langType 450 } 451 452 // GetLanguageByAlias returns either the language related to the given alias and ok set to true 453 // or Otherlanguage and ok set to false if the alias is not recognized. 454 func GetLanguageByAlias(alias string) (lang string, ok bool) { 455 a := strings.Split(alias, `,`)[0] 456 a = strings.ToLower(a) 457 lang, ok = data.LanguagesByAlias[a] 458 if !ok { 459 lang = OtherLanguage 460 } 461 462 return 463 }