gitlab.com/thomasboni/go-enry/v2@v2.8.3-0.20220418031202-30b0d7a3de98/common.go (about) 1 package enry 2 3 import ( 4 "bufio" 5 "bytes" 6 "fmt" 7 "path" 8 "path/filepath" 9 "strings" 10 11 "gitlab.com/thomasboni/go-enry/v2/data" 12 "gitlab.com/thomasboni/go-enry/v2/regex" 13 ) 14 15 // OtherLanguage is used as a zero value when a function can not return a specific language. 16 const OtherLanguage = "" 17 18 // Strategy type fix the signature for the functions that can be used as a strategy. 19 type Strategy func(filename string, content []byte, candidates []string) (languages []string) 20 21 // DefaultStrategies is a sequence of strategies used by GetLanguage to detect languages. 22 var DefaultStrategies = []Strategy{ 23 GetLanguagesByModeline, 24 GetLanguagesByFilename, 25 GetLanguagesByShebang, 26 GetLanguagesByExtension, 27 GetLanguagesByXML, 28 GetLanguagesByManpage, 29 GetLanguagesByContent, 30 GetLanguagesByClassifier, 31 } 32 33 // defaultClassifier is a Naive Bayes classifier trained on Linguist samples. 34 var defaultClassifier classifier = &naiveBayes{ 35 languagesLogProbabilities: data.LanguagesLogProbabilities, 36 tokensLogProbabilities: data.TokensLogProbabilities, 37 tokensTotal: data.TokensTotal, 38 } 39 40 // GetLanguage applies a sequence of strategies based on the given filename and content 41 // to find out the most probably language to return. 42 func GetLanguage(filename string, content []byte) (language string) { 43 languages := GetLanguages(filename, content) 44 return firstLanguage(languages) 45 } 46 47 func firstLanguage(languages []string) string { 48 for _, l := range languages { 49 if l != "" { 50 return l 51 } 52 } 53 return OtherLanguage 54 } 55 56 // GetLanguageByModeline returns detected language. If there are more than one possibles languages 57 // it returns the first language by alphabetically order and safe to false. 58 func GetLanguageByModeline(content []byte) (language string, safe bool) { 59 return getLanguageByStrategy(GetLanguagesByModeline, "", content, nil) 60 } 61 62 // GetLanguageByEmacsModeline returns detected language. If there are more than one possibles languages 63 // it returns the first language by alphabetically order and safe to false. 64 func GetLanguageByEmacsModeline(content []byte) (language string, safe bool) { 65 return getLanguageByStrategy(GetLanguagesByEmacsModeline, "", content, nil) 66 } 67 68 // GetLanguageByVimModeline returns detected language. If there are more than one possibles languages 69 // it returns the first language by alphabetically order and safe to false. 70 func GetLanguageByVimModeline(content []byte) (language string, safe bool) { 71 return getLanguageByStrategy(GetLanguagesByVimModeline, "", content, nil) 72 } 73 74 // GetLanguageByFilename returns detected language. If there are more than one possibles languages 75 // it returns the first language by alphabetically order and safe to false. 76 func GetLanguageByFilename(filename string) (language string, safe bool) { 77 return getLanguageByStrategy(GetLanguagesByFilename, filename, nil, nil) 78 } 79 80 // GetLanguageByShebang returns detected language. If there are more than one possibles languages 81 // it returns the first language by alphabetically order and safe to false. 82 func GetLanguageByShebang(content []byte) (language string, safe bool) { 83 return getLanguageByStrategy(GetLanguagesByShebang, "", content, nil) 84 } 85 86 // GetLanguageByExtension returns detected language. If there are more than one possibles languages 87 // it returns the first language by alphabetically order and safe to false. 88 func GetLanguageByExtension(filename string) (language string, safe bool) { 89 return getLanguageByStrategy(GetLanguagesByExtension, filename, nil, nil) 90 } 91 92 // GetLanguageByContent returns detected language. If there are more than one possibles languages 93 // it returns the first language by alphabetically order and safe to false. 94 func GetLanguageByContent(filename string, content []byte) (language string, safe bool) { 95 return getLanguageByStrategy(GetLanguagesByContent, filename, content, nil) 96 } 97 98 // GetLanguageByClassifier returns the most probably language detected for the given content. It uses 99 // defaultClassifier, if no candidates are provided it returns OtherLanguage. 100 func GetLanguageByClassifier(content []byte, candidates []string) (language string, safe bool) { 101 return getLanguageByStrategy(GetLanguagesByClassifier, "", content, candidates) 102 } 103 104 func getLanguageByStrategy(strategy Strategy, filename string, content []byte, candidates []string) (string, bool) { 105 languages := strategy(filename, content, candidates) 106 return getFirstLanguageAndSafe(languages) 107 } 108 109 func getFirstLanguageAndSafe(languages []string) (language string, safe bool) { 110 language = firstLanguage(languages) 111 safe = len(languages) == 1 112 return 113 } 114 115 // GetLanguages applies a sequence of strategies based on the given filename and content 116 // to find out the most probable languages to return. 117 // 118 // If it finds a strategy that produces a single result, it will be returned; 119 // otherise the last strategy that returned multiple results will be returned. 120 // If the content is binary, no results will be returned. This matches the 121 // behavior of Linguist.detect: https://github.com/github/linguist/blob/aad49acc0624c70d654a8dce447887dbbc713c7a/lib/linguist.rb#L14-L49 122 // 123 // At least one of arguments should be set. If content is missing, language detection will be based on the filename. 124 // The function won't read the file, given an empty content. 125 func GetLanguages(filename string, content []byte) []string { 126 if IsBinary(content) { 127 return nil 128 } 129 130 var languages []string 131 for _, strategy := range DefaultStrategies { 132 candidates := strategy(filename, content, languages) 133 // No candidates, continue to next strategy without updating languages 134 if len(candidates) == 0 { 135 continue 136 } 137 138 // Only one candidate match, return it 139 if len(candidates) == 1 { 140 return candidates 141 } 142 143 // Save the candidates from this strategy to pass onto to the next strategy, like Linguist 144 languages = candidates 145 } 146 147 return languages 148 } 149 150 // GetLanguagesByModeline returns a slice of possible languages for the given content. 151 // It complies with the signature to be a Strategy type. 152 func GetLanguagesByModeline(_ string, content []byte, candidates []string) []string { 153 headFoot := getHeaderAndFooter(content) 154 var languages []string 155 for _, getLang := range modelinesFunc { 156 languages = getLang("", headFoot, candidates) 157 if len(languages) > 0 { 158 break 159 } 160 } 161 162 return languages 163 } 164 165 var modelinesFunc = []Strategy{ 166 GetLanguagesByEmacsModeline, 167 GetLanguagesByVimModeline, 168 } 169 170 func getHeaderAndFooter(content []byte) []byte { 171 const searchScope = 5 172 173 if len(content) == 0 { 174 return content 175 } 176 177 if bytes.Count(content, []byte("\n")) < 2*searchScope { 178 return content 179 } 180 181 header := headScope(content, searchScope) 182 footer := footScope(content, searchScope) 183 headerAndFooter := make([]byte, 0, len(content[:header])+len(content[footer:])) 184 headerAndFooter = append(headerAndFooter, content[:header]...) 185 headerAndFooter = append(headerAndFooter, content[footer:]...) 186 return headerAndFooter 187 } 188 189 func headScope(content []byte, scope int) (index int) { 190 for i := 0; i < scope; i++ { 191 eol := bytes.IndexAny(content, "\n") 192 content = content[eol+1:] 193 index += eol 194 } 195 196 return index + scope - 1 197 } 198 199 func footScope(content []byte, scope int) (index int) { 200 for i := 0; i < scope; i++ { 201 index = bytes.LastIndexAny(content, "\n") 202 content = content[:index] 203 } 204 205 return index + 1 206 } 207 208 var ( 209 reEmacsModeline = regex.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`) 210 reEmacsLang = regex.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`) 211 reVimModeline = regex.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`) 212 reVimLang = regex.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`) 213 ) 214 215 // GetLanguagesByEmacsModeline returns a slice of possible languages for the given content. 216 // It complies with the signature to be a Strategy type. 217 func GetLanguagesByEmacsModeline(_ string, content []byte, _ []string) []string { 218 matched := reEmacsModeline.FindAllSubmatch(content, -1) 219 if matched == nil { 220 return nil 221 } 222 223 // only take the last matched line, discard previous lines 224 lastLineMatched := matched[len(matched)-1][1] 225 matchedAlias := reEmacsLang.FindSubmatch(lastLineMatched) 226 var alias string 227 if matchedAlias != nil { 228 alias = string(matchedAlias[1]) 229 } else { 230 alias = string(lastLineMatched) 231 } 232 233 language, ok := GetLanguageByAlias(alias) 234 if !ok { 235 return nil 236 } 237 238 return []string{language} 239 } 240 241 // GetLanguagesByVimModeline returns a slice of possible languages for the given content. 242 // It complies with the signature to be a Strategy type. 243 func GetLanguagesByVimModeline(_ string, content []byte, _ []string) []string { 244 matched := reVimModeline.FindAllSubmatch(content, -1) 245 if matched == nil { 246 return nil 247 } 248 249 // only take the last matched line, discard previous lines 250 lastLineMatched := matched[len(matched)-1][1] 251 matchedAlias := reVimLang.FindAllSubmatch(lastLineMatched, -1) 252 if matchedAlias == nil { 253 return nil 254 } 255 256 alias := string(matchedAlias[0][1]) 257 if len(matchedAlias) > 1 { 258 // cases: 259 // matchedAlias = [["syntax=ruby " "ruby"] ["ft=python " "python"] ["filetype=perl " "perl"]] returns OtherLanguage; 260 // matchedAlias = [["syntax=python " "python"] ["ft=python " "python"] ["filetype=python " "python"]] returns "Python"; 261 for _, match := range matchedAlias { 262 otherAlias := string(match[1]) 263 if otherAlias != alias { 264 return nil 265 } 266 } 267 } 268 269 language, ok := GetLanguageByAlias(alias) 270 if !ok { 271 return nil 272 } 273 274 return []string{language} 275 } 276 277 // GetLanguagesByFilename returns a slice of possible languages for the given filename. 278 // It complies with the signature to be a Strategy type. 279 func GetLanguagesByFilename(filename string, _ []byte, _ []string) []string { 280 if filename == "" { 281 return nil 282 } 283 284 return data.LanguagesByFilename[filepath.Base(filename)] 285 } 286 287 // GetLanguagesByShebang returns a slice of possible languages for the given content. 288 // It complies with the signature to be a Strategy type. 289 func GetLanguagesByShebang(_ string, content []byte, _ []string) (languages []string) { 290 interpreter := getInterpreter(content) 291 return data.LanguagesByInterpreter[interpreter] 292 } 293 294 var ( 295 shebangExecHack = regex.MustCompile(`exec (\w+).+\$0.+\$@`) 296 pythonVersion = regex.MustCompile(`python\d\.\d+`) 297 envOptArgs = regex.MustCompile(`-[i0uCSv]*|--\S+`) 298 envVarArgs = regex.MustCompile(`\S+=\S+`) 299 ) 300 301 func getInterpreter(data []byte) string { 302 line := getFirstLine(data) 303 if !hasShebang(line) { 304 return "" 305 } 306 307 // skip shebang 308 line = bytes.TrimSpace(line[2:]) 309 splitted := bytes.Fields(line) 310 if len(splitted) == 0 { 311 return "" 312 } 313 314 // Extract interpreter name from path. Use path.Base because 315 // shebang on Cygwin/Windows still use a forward slash 316 interpreter := path.Base(string(splitted[0])) 317 318 // #!/usr/bin/env [...] 319 if interpreter == "env" { 320 if len(splitted) == 1 { 321 // /usr/bin/env with no arguments 322 return "" 323 } 324 for len(splitted) > 2 { 325 if envOptArgs.Match(splitted[1]) || envVarArgs.Match(splitted[1]) { 326 splitted = append(splitted[:1], splitted[2:]...) 327 continue 328 } 329 break 330 } 331 interpreter = path.Base(string(splitted[1])) 332 } 333 334 if interpreter == "sh" { 335 interpreter = lookForMultilineExec(data) 336 } 337 338 if pythonVersion.MatchString(interpreter) { 339 interpreter = interpreter[:strings.Index(interpreter, `.`)] 340 } 341 342 // If osascript is called with argument -l it could be different language so do not relay on it 343 // To match linguist behaviour, see ref https://github.com/github/linguist/blob/d95bae794576ab0ef2fcb41a39eb61ea5302c5b5/lib/linguist/shebang.rb#L63 344 if interpreter == "osascript" && bytes.Contains(line, []byte("-l")) { 345 interpreter = "" 346 } 347 348 return interpreter 349 } 350 351 func getFirstLines(content []byte, count int) []byte { 352 nlpos := -1 353 for ; count > 0; count-- { 354 pos := bytes.IndexByte(content[nlpos+1:], '\n') 355 if pos < 0 { 356 return content 357 } 358 nlpos += pos + 1 359 } 360 361 return content[:nlpos] 362 } 363 364 func getFirstLine(content []byte) []byte { 365 return getFirstLines(content, 1) 366 } 367 368 func hasShebang(line []byte) bool { 369 const shebang = `#!` 370 prefix := []byte(shebang) 371 return bytes.HasPrefix(line, prefix) 372 } 373 374 func lookForMultilineExec(data []byte) string { 375 const magicNumOfLines = 5 376 interpreter := "sh" 377 378 buf := bufio.NewScanner(bytes.NewReader(data)) 379 for i := 0; i < magicNumOfLines && buf.Scan(); i++ { 380 line := buf.Bytes() 381 if shebangExecHack.Match(line) { 382 interpreter = shebangExecHack.FindStringSubmatch(string(line))[1] 383 break 384 } 385 } 386 387 if err := buf.Err(); err != nil { 388 return interpreter 389 } 390 391 return interpreter 392 } 393 394 // GetLanguagesByExtension returns a slice of possible languages for the given filename. 395 // It complies with the signature to be a Strategy type. 396 func GetLanguagesByExtension(filename string, _ []byte, _ []string) []string { 397 if !strings.Contains(filename, ".") { 398 return nil 399 } 400 401 filename = strings.ToLower(filename) 402 dots := getDotIndexes(filename) 403 for _, dot := range dots { 404 ext := filename[dot:] 405 languages, ok := data.LanguagesByExtension[ext] 406 if ok { 407 return languages 408 } 409 } 410 411 return nil 412 } 413 414 var ( 415 manpageExtension = regex.MustCompile(`\.(?:[1-9](?:[a-z_]+[a-z_0-9]*)?|0p|n|man|mdoc)(?:\.in)?$`) 416 ) 417 418 // GetLanguagesByManpage returns a slice of possible manpage languages for the given filename. 419 // It complies with the signature to be a Strategy type. 420 func GetLanguagesByManpage(filename string, _ []byte, _ []string) []string { 421 filename = strings.ToLower(filename) 422 423 // Check if matches Roff man page filenames 424 if manpageExtension.Match([]byte(filename)) { 425 return []string{ 426 "Roff Manpage", 427 "Roff", 428 } 429 } 430 431 return nil 432 } 433 434 var ( 435 xmlHeader = regex.MustCompile(`<?xml version=`) 436 ) 437 438 // GetLanguagesByXML returns a slice of possible XML language for the given filename. 439 // It complies with the signature to be a Strategy type. 440 func GetLanguagesByXML(_ string, content []byte, candidates []string) []string { 441 if len(candidates) > 0 { 442 return candidates 443 } 444 445 header := getFirstLines(content, 2) 446 447 // Check if contains XML header 448 if xmlHeader.Match(header) { 449 return []string{ 450 "XML", 451 } 452 } 453 454 return nil 455 } 456 457 func getDotIndexes(filename string) []int { 458 dots := make([]int, 0, 2) 459 for i, letter := range filename { 460 if letter == rune('.') { 461 dots = append(dots, i) 462 } 463 } 464 465 return dots 466 } 467 468 // GetLanguagesByContent returns a slice of languages for the given content. 469 // It is a Strategy that uses content-based regexp heuristics and a filename extension. 470 func GetLanguagesByContent(filename string, content []byte, _ []string) []string { 471 if filename == "" { 472 return nil 473 } 474 475 ext := strings.ToLower(filepath.Ext(filename)) 476 477 heuristic, ok := data.ContentHeuristics[ext] 478 if !ok { 479 return nil 480 } 481 482 return heuristic.Match(content) 483 } 484 485 // GetLanguagesByClassifier returns a sorted slice of possible languages ordered by 486 // decreasing language's probability. If there are not candidates it returns nil. 487 // It is a Strategy that uses a pre-trained defaultClassifier. 488 func GetLanguagesByClassifier(filename string, content []byte, candidates []string) (languages []string) { 489 if len(candidates) == 0 { 490 return nil 491 } 492 493 return getLanguagesBySpecificClassifier(content, candidates, defaultClassifier) 494 } 495 496 // getLanguagesBySpecificClassifier returns a slice of possible languages. It takes in a Classifier to be used. 497 func getLanguagesBySpecificClassifier(content []byte, candidates []string, classifier classifier) (languages []string) { 498 mapCandidates := make(map[string]float64) 499 for _, candidate := range candidates { 500 mapCandidates[candidate]++ 501 } 502 503 return classifier.classify(content, mapCandidates) 504 } 505 506 // GetLanguageExtensions returns all extensions associated with the given language. 507 func GetLanguageExtensions(language string) []string { 508 return data.ExtensionsByLanguage[language] 509 } 510 511 // GetLanguageID returns the ID for the language. IDs are assigned by GitHub. 512 // The input must be the canonical language name. Aliases are not supported. 513 // 514 // NOTE: The zero value (0) is a valid language ID, so this API mimics the Go 515 // map API. Use the second return value to check if the language was found. 516 func GetLanguageID(language string) (int, bool) { 517 id, ok := data.IDByLanguage[language] 518 return id, ok 519 } 520 521 // Type represent language's type. Either data, programming, markup, prose, or unknown. 522 type Type int 523 524 // Type's values. 525 const ( 526 Unknown Type = Type(data.TypeUnknown) 527 Data = Type(data.TypeData) 528 Programming = Type(data.TypeProgramming) 529 Markup = Type(data.TypeMarkup) 530 Prose = Type(data.TypeProse) 531 ) 532 533 // GetLanguageType returns the type of the given language. 534 func GetLanguageType(language string) (langType Type) { 535 intType, ok := data.LanguagesType[language] 536 langType = Type(intType) 537 if !ok { 538 langType = Unknown 539 } 540 return langType 541 } 542 543 // GetLanguageByAlias returns either the language related to the given alias and ok set to true 544 // or Otherlanguage and ok set to false if the alias is not recognized. 545 func GetLanguageByAlias(alias string) (lang string, ok bool) { 546 lang, ok = data.LanguageByAlias(alias) 547 if !ok { 548 lang = OtherLanguage 549 } 550 551 return 552 } 553 554 // GetLanguageGroup returns language group or empty string if language does not have group. 555 func GetLanguageGroup(language string) string { 556 if group, ok := data.LanguagesGroup[language]; ok { 557 return group 558 } 559 560 return "" 561 } 562 563 // GetLanguageInfo returns the LanguageInfo for a given language name, or an error if not found. 564 func GetLanguageInfo(language string) (data.LanguageInfo, error) { 565 id, ok := GetLanguageID(language) 566 if !ok { 567 return data.LanguageInfo{}, fmt.Errorf("language %q not found", language) 568 } 569 570 return GetLanguageInfoByID(id) 571 } 572 573 // GetLanguageInfoByID returns the LanguageInfo for a given language ID, or an error if not found. 574 func GetLanguageInfoByID(id int) (data.LanguageInfo, error) { 575 if info, ok := data.LanguageInfoByID[id]; ok { 576 return info, nil 577 } 578 579 return data.LanguageInfo{}, fmt.Errorf("language %q not found", id) 580 }