gitlab.com/thomasboni/go-enry@v1.7.3/common.go (about) 1 package enry 2 3 import ( 4 "bufio" 5 "bytes" 6 "path/filepath" 7 "strings" 8 9 "gopkg.in/src-d/enry.v1/data" 10 "gopkg.in/src-d/enry.v1/regex" 11 ) 12 13 // OtherLanguage is used as a zero value when a function can not return a specific language. 14 const OtherLanguage = "" 15 16 // Strategy type fix the signature for the functions that can be used as a strategy. 17 type Strategy func(filename string, content []byte, candidates []string) (languages []string) 18 19 // DefaultStrategies is a sequence of strategies used by GetLanguage to detect languages. 20 var DefaultStrategies = []Strategy{ 21 GetLanguagesByModeline, 22 GetLanguagesByFilename, 23 GetLanguagesByShebang, 24 GetLanguagesByExtension, 25 GetLanguagesByContent, 26 GetLanguagesByClassifier, 27 } 28 29 // DefaultClassifier is a Naive Bayes classifier trained on Linguist samples. 30 var DefaultClassifier Classifier = &classifier{ 31 languagesLogProbabilities: data.LanguagesLogProbabilities, 32 tokensLogProbabilities: data.TokensLogProbabilities, 33 tokensTotal: data.TokensTotal, 34 } 35 36 // GetLanguage applies a sequence of strategies based on the given filename and content 37 // to find out the most probably language to return. 38 func GetLanguage(filename string, content []byte) (language string) { 39 languages := GetLanguages(filename, content) 40 return firstLanguage(languages) 41 } 42 43 func firstLanguage(languages []string) string { 44 for _, l := range languages { 45 if l != "" { 46 return l 47 } 48 } 49 return OtherLanguage 50 } 51 52 // GetLanguageByModeline returns detected language. If there are more than one possibles languages 53 // it returns the first language by alphabetically order and safe to false. 54 func GetLanguageByModeline(content []byte) (language string, safe bool) { 55 return getLanguageByStrategy(GetLanguagesByModeline, "", content, nil) 56 } 57 58 // GetLanguageByEmacsModeline returns detected language. If there are more than one possibles languages 59 // it returns the first language by alphabetically order and safe to false. 60 func GetLanguageByEmacsModeline(content []byte) (language string, safe bool) { 61 return getLanguageByStrategy(GetLanguagesByEmacsModeline, "", content, nil) 62 } 63 64 // GetLanguageByVimModeline returns detected language. If there are more than one possibles languages 65 // it returns the first language by alphabetically order and safe to false. 66 func GetLanguageByVimModeline(content []byte) (language string, safe bool) { 67 return getLanguageByStrategy(GetLanguagesByVimModeline, "", content, nil) 68 } 69 70 // GetLanguageByFilename returns detected language. If there are more than one possibles languages 71 // it returns the first language by alphabetically order and safe to false. 72 func GetLanguageByFilename(filename string) (language string, safe bool) { 73 return getLanguageByStrategy(GetLanguagesByFilename, filename, nil, nil) 74 } 75 76 // GetLanguageByShebang returns detected language. If there are more than one possibles languages 77 // it returns the first language by alphabetically order and safe to false. 78 func GetLanguageByShebang(content []byte) (language string, safe bool) { 79 return getLanguageByStrategy(GetLanguagesByShebang, "", content, nil) 80 } 81 82 // GetLanguageByExtension returns detected language. If there are more than one possibles languages 83 // it returns the first language by alphabetically order and safe to false. 84 func GetLanguageByExtension(filename string) (language string, safe bool) { 85 return getLanguageByStrategy(GetLanguagesByExtension, filename, nil, nil) 86 } 87 88 // GetLanguageByContent returns detected language. If there are more than one possibles languages 89 // it returns the first language by alphabetically order and safe to false. 90 func GetLanguageByContent(filename string, content []byte) (language string, safe bool) { 91 return getLanguageByStrategy(GetLanguagesByContent, filename, content, nil) 92 } 93 94 // GetLanguageByClassifier returns the most probably language detected for the given content. It uses 95 // DefaultClassifier, if no candidates are provided it returns OtherLanguage. 96 func GetLanguageByClassifier(content []byte, candidates []string) (language string, safe bool) { 97 return getLanguageByStrategy(GetLanguagesByClassifier, "", content, candidates) 98 } 99 100 func getLanguageByStrategy(strategy Strategy, filename string, content []byte, candidates []string) (string, bool) { 101 languages := strategy(filename, content, candidates) 102 return getFirstLanguageAndSafe(languages) 103 } 104 105 func getFirstLanguageAndSafe(languages []string) (language string, safe bool) { 106 language = firstLanguage(languages) 107 safe = len(languages) == 1 108 return 109 } 110 111 // GetLanguageBySpecificClassifier returns the most probably language for the given content using 112 // classifier to detect language. 113 func GetLanguageBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (language string, safe bool) { 114 languages := GetLanguagesBySpecificClassifier(content, candidates, classifier) 115 return getFirstLanguageAndSafe(languages) 116 } 117 118 // GetLanguages applies a sequence of strategies based on the given filename and content 119 // to find out the most probably languages to return. 120 // At least one of arguments should be set. If content is missing, language detection will be based on the filename. 121 // The function won't read the file, given an empty content. 122 func GetLanguages(filename string, content []byte) []string { 123 if IsBinary(content) { 124 return nil 125 } 126 127 var languages []string 128 candidates := []string{} 129 for _, strategy := range DefaultStrategies { 130 languages = strategy(filename, content, candidates) 131 if len(languages) == 1 { 132 return languages 133 } 134 135 if len(languages) > 0 { 136 candidates = append(candidates, languages...) 137 } 138 } 139 140 return languages 141 } 142 143 // GetLanguagesByModeline returns a slice of possible languages for the given content. 144 // It complies with the signature to be a Strategy type. 145 func GetLanguagesByModeline(_ string, content []byte, candidates []string) []string { 146 headFoot := getHeaderAndFooter(content) 147 var languages []string 148 for _, getLang := range modelinesFunc { 149 languages = getLang("", headFoot, candidates) 150 if len(languages) > 0 { 151 break 152 } 153 } 154 155 return languages 156 } 157 158 var modelinesFunc = []Strategy{ 159 GetLanguagesByEmacsModeline, 160 GetLanguagesByVimModeline, 161 } 162 163 func getHeaderAndFooter(content []byte) []byte { 164 const searchScope = 5 165 166 if len(content) == 0 { 167 return content 168 } 169 170 if bytes.Count(content, []byte("\n")) < 2*searchScope { 171 return content 172 } 173 174 header := headScope(content, searchScope) 175 footer := footScope(content, searchScope) 176 headerAndFooter := make([]byte, 0, len(content[:header])+len(content[footer:])) 177 headerAndFooter = append(headerAndFooter, content[:header]...) 178 headerAndFooter = append(headerAndFooter, content[footer:]...) 179 return headerAndFooter 180 } 181 182 func headScope(content []byte, scope int) (index int) { 183 for i := 0; i < scope; i++ { 184 eol := bytes.IndexAny(content, "\n") 185 content = content[eol+1:] 186 index += eol 187 } 188 189 return index + scope - 1 190 } 191 192 func footScope(content []byte, scope int) (index int) { 193 for i := 0; i < scope; i++ { 194 index = bytes.LastIndexAny(content, "\n") 195 content = content[:index] 196 } 197 198 return index + 1 199 } 200 201 var ( 202 reEmacsModeline = regex.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`) 203 reEmacsLang = regex.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`) 204 reVimModeline = regex.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`) 205 reVimLang = regex.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`) 206 ) 207 208 // GetLanguagesByEmacsModeline returns a slice of possible languages for the given content. 209 // It complies with the signature to be a Strategy type. 210 func GetLanguagesByEmacsModeline(_ string, content []byte, _ []string) []string { 211 matched := reEmacsModeline.FindAllSubmatch(content, -1) 212 if matched == nil { 213 return nil 214 } 215 216 // only take the last matched line, discard previous lines 217 lastLineMatched := matched[len(matched)-1][1] 218 matchedAlias := reEmacsLang.FindSubmatch(lastLineMatched) 219 var alias string 220 if matchedAlias != nil { 221 alias = string(matchedAlias[1]) 222 } else { 223 alias = string(lastLineMatched) 224 } 225 226 language, ok := GetLanguageByAlias(alias) 227 if !ok { 228 return nil 229 } 230 231 return []string{language} 232 } 233 234 // GetLanguagesByVimModeline returns a slice of possible languages for the given content. 235 // It complies with the signature to be a Strategy type. 236 func GetLanguagesByVimModeline(_ string, content []byte, _ []string) []string { 237 matched := reVimModeline.FindAllSubmatch(content, -1) 238 if matched == nil { 239 return nil 240 } 241 242 // only take the last matched line, discard previous lines 243 lastLineMatched := matched[len(matched)-1][1] 244 matchedAlias := reVimLang.FindAllSubmatch(lastLineMatched, -1) 245 if matchedAlias == nil { 246 return nil 247 } 248 249 alias := string(matchedAlias[0][1]) 250 if len(matchedAlias) > 1 { 251 // cases: 252 // matchedAlias = [["syntax=ruby " "ruby"] ["ft=python " "python"] ["filetype=perl " "perl"]] returns OtherLanguage; 253 // matchedAlias = [["syntax=python " "python"] ["ft=python " "python"] ["filetype=python " "python"]] returns "Python"; 254 for _, match := range matchedAlias { 255 otherAlias := string(match[1]) 256 if otherAlias != alias { 257 return nil 258 } 259 } 260 } 261 262 language, ok := GetLanguageByAlias(alias) 263 if !ok { 264 return nil 265 } 266 267 return []string{language} 268 } 269 270 // GetLanguagesByFilename returns a slice of possible languages for the given filename. 271 // It complies with the signature to be a Strategy type. 272 func GetLanguagesByFilename(filename string, _ []byte, _ []string) []string { 273 if filename == "" { 274 return nil 275 } 276 277 return data.LanguagesByFilename[filepath.Base(filename)] 278 } 279 280 // GetLanguagesByShebang returns a slice of possible languages for the given content. 281 // It complies with the signature to be a Strategy type. 282 func GetLanguagesByShebang(_ string, content []byte, _ []string) (languages []string) { 283 interpreter := getInterpreter(content) 284 return data.LanguagesByInterpreter[interpreter] 285 } 286 287 var ( 288 shebangExecHack = regex.MustCompile(`exec (\w+).+\$0.+\$@`) 289 pythonVersion = regex.MustCompile(`python\d\.\d+`) 290 ) 291 292 func getInterpreter(data []byte) (interpreter string) { 293 line := getFirstLine(data) 294 if !hasShebang(line) { 295 return "" 296 } 297 298 // skip shebang 299 line = bytes.TrimSpace(line[2:]) 300 splitted := bytes.Fields(line) 301 if len(splitted) == 0 { 302 return "" 303 } 304 305 if bytes.Contains(splitted[0], []byte("env")) { 306 if len(splitted) > 1 { 307 interpreter = string(splitted[1]) 308 } 309 } else { 310 splittedPath := bytes.Split(splitted[0], []byte{'/'}) 311 interpreter = string(splittedPath[len(splittedPath)-1]) 312 } 313 314 if interpreter == "sh" { 315 interpreter = lookForMultilineExec(data) 316 } 317 318 if pythonVersion.MatchString(interpreter) { 319 interpreter = interpreter[:strings.Index(interpreter, `.`)] 320 } 321 322 return 323 } 324 325 func getFirstLine(data []byte) []byte { 326 buf := bufio.NewScanner(bytes.NewReader(data)) 327 buf.Scan() 328 line := buf.Bytes() 329 if err := buf.Err(); err != nil { 330 return nil 331 } 332 333 return line 334 } 335 336 func hasShebang(line []byte) bool { 337 const shebang = `#!` 338 prefix := []byte(shebang) 339 return bytes.HasPrefix(line, prefix) 340 } 341 342 func lookForMultilineExec(data []byte) string { 343 const magicNumOfLines = 5 344 interpreter := "sh" 345 346 buf := bufio.NewScanner(bytes.NewReader(data)) 347 for i := 0; i < magicNumOfLines && buf.Scan(); i++ { 348 line := buf.Bytes() 349 if shebangExecHack.Match(line) { 350 interpreter = shebangExecHack.FindStringSubmatch(string(line))[1] 351 break 352 } 353 } 354 355 if err := buf.Err(); err != nil { 356 return interpreter 357 } 358 359 return interpreter 360 } 361 362 // GetLanguagesByExtension returns a slice of possible languages for the given filename. 363 // It complies with the signature to be a Strategy type. 364 func GetLanguagesByExtension(filename string, _ []byte, _ []string) []string { 365 if !strings.Contains(filename, ".") { 366 return nil 367 } 368 369 filename = strings.ToLower(filename) 370 dots := getDotIndexes(filename) 371 for _, dot := range dots { 372 ext := filename[dot:] 373 languages, ok := data.LanguagesByExtension[ext] 374 if ok { 375 return languages 376 } 377 } 378 379 return nil 380 } 381 382 func getDotIndexes(filename string) []int { 383 dots := make([]int, 0, 2) 384 for i, letter := range filename { 385 if letter == rune('.') { 386 dots = append(dots, i) 387 } 388 } 389 390 return dots 391 } 392 393 // GetLanguagesByContent returns a slice of languages for the given content. 394 // It is a Strategy that uses content-based regexp heuristics and a filename extension. 395 func GetLanguagesByContent(filename string, content []byte, _ []string) []string { 396 if filename == "" { 397 return nil 398 } 399 400 ext := strings.ToLower(filepath.Ext(filename)) 401 402 heuristic, ok := data.ContentHeuristics[ext] 403 if !ok { 404 return nil 405 } 406 407 return heuristic.Match(content) 408 } 409 410 // GetLanguagesByClassifier uses DefaultClassifier as a Classifier and returns a sorted slice of possible languages ordered by 411 // decreasing language's probability. If there are not candidates it returns nil. It complies with the signature to be a Strategy type. 412 func GetLanguagesByClassifier(filename string, content []byte, candidates []string) (languages []string) { 413 if len(candidates) == 0 { 414 return nil 415 } 416 417 return GetLanguagesBySpecificClassifier(content, candidates, DefaultClassifier) 418 } 419 420 // GetLanguagesBySpecificClassifier returns a slice of possible languages. It takes in a Classifier to be used. 421 func GetLanguagesBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (languages []string) { 422 mapCandidates := make(map[string]float64) 423 for _, candidate := range candidates { 424 mapCandidates[candidate]++ 425 } 426 427 return classifier.Classify(content, mapCandidates) 428 } 429 430 // GetLanguageExtensions returns the different extensions being used by the language. 431 func GetLanguageExtensions(language string) []string { 432 return data.ExtensionsByLanguage[language] 433 } 434 435 // Type represent language's type. Either data, programming, markup, prose, or unknown. 436 type Type int 437 438 // Type's values. 439 const ( 440 Unknown Type = iota 441 Data 442 Programming 443 Markup 444 Prose 445 ) 446 447 // GetLanguageType returns the type of the given language. 448 func GetLanguageType(language string) (langType Type) { 449 intType, ok := data.LanguagesType[language] 450 langType = Type(intType) 451 if !ok { 452 langType = Unknown 453 } 454 return langType 455 } 456 457 // GetLanguageByAlias returns either the language related to the given alias and ok set to true 458 // or Otherlanguage and ok set to false if the alias is not recognized. 459 func GetLanguageByAlias(alias string) (lang string, ok bool) { 460 lang, ok = data.LanguageByAlias(alias) 461 if !ok { 462 lang = OtherLanguage 463 } 464 465 return 466 }