git.templeos.me/xultist/go-enry/v2@v2.0.0-20230215093429-6ef3e87f47c0/common.go (about)

     1  package enry
     2  
     3  import (
     4  	"bufio"
     5  	"bytes"
     6  	"fmt"
     7  	"path"
     8  	"path/filepath"
     9  	"strings"
    10  
    11  	"github.com/go-enry/go-enry/v2/data"
    12  	"github.com/go-enry/go-enry/v2/regex"
    13  )
    14  
// OtherLanguage is the zero value returned when a function cannot determine
// a specific language.
const OtherLanguage = ""
    17  
// Strategy fixes the signature of the functions that can be used as a
// detection strategy. A strategy receives the filename, the file content and
// the candidates produced so far, and returns its own list of languages.
type Strategy func(filename string, content []byte, candidates []string) (languages []string)
    20  
// DefaultStrategies is the sequence of strategies used by GetLanguages to
// detect languages. Order matters: GetLanguages runs them in this order and
// hands the candidates of one strategy to the next.
var DefaultStrategies = []Strategy{
	GetLanguagesByModeline,
	GetLanguagesByFilename,
	GetLanguagesByShebang,
	GetLanguagesByExtension,
	GetLanguagesByXML,
	GetLanguagesByManpage,
	GetLanguagesByContent,
	GetLanguagesByClassifier,
}
    32  
// defaultClassifier is a Naive Bayes classifier trained on Linguist samples.
// Its probability tables come from the generated data package; it is the
// classifier used by GetLanguagesByClassifier.
var defaultClassifier classifier = &naiveBayes{
	languagesLogProbabilities: data.LanguagesLogProbabilities,
	tokensLogProbabilities:    data.TokensLogProbabilities,
	tokensTotal:               data.TokensTotal,
}
    39  
    40  // GetLanguage applies a sequence of strategies based on the given filename and content
    41  // to find out the most probable language to return.
    42  func GetLanguage(filename string, content []byte) (language string) {
    43  	languages := GetLanguages(filename, content)
    44  	return firstLanguage(languages)
    45  }
    46  
    47  func firstLanguage(languages []string) string {
    48  	for _, l := range languages {
    49  		if l != "" {
    50  			return l
    51  		}
    52  	}
    53  	return OtherLanguage
    54  }
    55  
    56  // GetLanguageByModeline returns detected language. If there are more than one possibles languages
    57  // it returns the first language by alphabetically order and safe to false.
    58  func GetLanguageByModeline(content []byte) (language string, safe bool) {
    59  	return getLanguageByStrategy(GetLanguagesByModeline, "", content, nil)
    60  }
    61  
    62  // GetLanguageByEmacsModeline returns detected language. If there are more than one possibles languages
    63  // it returns the first language by alphabetically order and safe to false.
    64  func GetLanguageByEmacsModeline(content []byte) (language string, safe bool) {
    65  	return getLanguageByStrategy(GetLanguagesByEmacsModeline, "", content, nil)
    66  }
    67  
    68  // GetLanguageByVimModeline returns detected language. If there are more than one possibles languages
    69  // it returns the first language by alphabetically order and safe to false.
    70  func GetLanguageByVimModeline(content []byte) (language string, safe bool) {
    71  	return getLanguageByStrategy(GetLanguagesByVimModeline, "", content, nil)
    72  }
    73  
    74  // GetLanguageByFilename returns detected language. If there are more than one possibles languages
    75  // it returns the first language by alphabetically order and safe to false.
    76  func GetLanguageByFilename(filename string) (language string, safe bool) {
    77  	return getLanguageByStrategy(GetLanguagesByFilename, filename, nil, nil)
    78  }
    79  
    80  // GetLanguageByShebang returns detected language. If there are more than one possibles languages
    81  // it returns the first language by alphabetically order and safe to false.
    82  func GetLanguageByShebang(content []byte) (language string, safe bool) {
    83  	return getLanguageByStrategy(GetLanguagesByShebang, "", content, nil)
    84  }
    85  
    86  // GetLanguageByExtension returns detected language. If there are more than one possibles languages
    87  // it returns the first language by alphabetically order and safe to false.
    88  func GetLanguageByExtension(filename string) (language string, safe bool) {
    89  	return getLanguageByStrategy(GetLanguagesByExtension, filename, nil, nil)
    90  }
    91  
    92  // GetLanguageByContent returns detected language. If there are more than one possibles languages
    93  // it returns the first language by alphabetically order and safe to false.
    94  func GetLanguageByContent(filename string, content []byte) (language string, safe bool) {
    95  	return getLanguageByStrategy(GetLanguagesByContent, filename, content, nil)
    96  }
    97  
    98  // GetLanguageByClassifier returns the most probably language detected for the given content. It uses
    99  // defaultClassifier, if no candidates are provided it returns OtherLanguage.
   100  func GetLanguageByClassifier(content []byte, candidates []string) (language string, safe bool) {
   101  	return getLanguageByStrategy(GetLanguagesByClassifier, "", content, candidates)
   102  }
   103  
   104  func getLanguageByStrategy(strategy Strategy, filename string, content []byte, candidates []string) (string, bool) {
   105  	languages := strategy(filename, content, candidates)
   106  	return getFirstLanguageAndSafe(languages)
   107  }
   108  
   109  func getFirstLanguageAndSafe(languages []string) (language string, safe bool) {
   110  	language = firstLanguage(languages)
   111  	safe = len(languages) == 1
   112  	return
   113  }
   114  
   115  // GetLanguages applies a sequence of strategies based on the given filename and content
   116  // to find out the most probable languages to return.
   117  //
   118  // If it finds a strategy that produces a single result, it will be returned;
   119  // otherise the last strategy that returned multiple results will be returned.
   120  // If the content is binary, no results will be returned. This matches the
   121  // behavior of Linguist.detect: https://github.com/github/linguist/blob/aad49acc0624c70d654a8dce447887dbbc713c7a/lib/linguist.rb#L14-L49
   122  //
   123  // At least one of arguments should be set. If content is missing, language detection will be based on the filename.
   124  // The function won't read the file, given an empty content.
   125  func GetLanguages(filename string, content []byte) []string {
   126  	if IsBinary(content) {
   127  		return nil
   128  	}
   129  
   130  	var languages []string
   131  	for _, strategy := range DefaultStrategies {
   132  		candidates := strategy(filename, content, languages)
   133  		// No candidates, continue to next strategy without updating languages
   134  		if len(candidates) == 0 {
   135  			continue
   136  		}
   137  
   138  		// Only one candidate match, return it
   139  		if len(candidates) == 1 {
   140  			return candidates
   141  		}
   142  
   143  		// Save the candidates from this strategy to pass onto to the next strategy, like Linguist
   144  		languages = candidates
   145  	}
   146  
   147  	return languages
   148  }
   149  
   150  // GetLanguagesByModeline returns a slice of possible languages for the given content.
   151  // It complies with the signature to be a Strategy type.
   152  func GetLanguagesByModeline(_ string, content []byte, candidates []string) []string {
   153  	headFoot := getHeaderAndFooter(content)
   154  	var languages []string
   155  	for _, getLang := range modelinesFunc {
   156  		languages = getLang("", headFoot, candidates)
   157  		if len(languages) > 0 {
   158  			break
   159  		}
   160  	}
   161  
   162  	return languages
   163  }
   164  
// modelinesFunc lists the modeline strategies tried by GetLanguagesByModeline,
// in order of precedence (Emacs first, then Vim).
var modelinesFunc = []Strategy{
	GetLanguagesByEmacsModeline,
	GetLanguagesByVimModeline,
}
   169  
   170  func getHeaderAndFooter(content []byte) []byte {
   171  	const searchScope = 5
   172  
   173  	if len(content) == 0 {
   174  		return content
   175  	}
   176  
   177  	if bytes.Count(content, []byte("\n")) < 2*searchScope {
   178  		return content
   179  	}
   180  
   181  	header := headScope(content, searchScope)
   182  	footer := footScope(content, searchScope)
   183  	headerAndFooter := make([]byte, 0, len(content[:header])+len(content[footer:]))
   184  	headerAndFooter = append(headerAndFooter, content[:header]...)
   185  	headerAndFooter = append(headerAndFooter, content[footer:]...)
   186  	return headerAndFooter
   187  }
   188  
   189  func headScope(content []byte, scope int) (index int) {
   190  	for i := 0; i < scope; i++ {
   191  		eol := bytes.IndexAny(content, "\n")
   192  		content = content[eol+1:]
   193  		index += eol
   194  	}
   195  
   196  	return index + scope - 1
   197  }
   198  
   199  func footScope(content []byte, scope int) (index int) {
   200  	for i := 0; i < scope; i++ {
   201  		index = bytes.LastIndexAny(content, "\n")
   202  		content = content[:index]
   203  	}
   204  
   205  	return index + 1
   206  }
   207  
// Regular expressions used by the modeline strategies.
var (
	// reEmacsModeline captures the text between a pair of `-*-` markers on a line.
	reEmacsModeline = regex.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`)
	// reEmacsLang captures the value following a case-insensitive `mode:` key.
	reEmacsLang     = regex.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`)
	// reVimModeline captures the rest of the line after a `vi:`, `vim:`
	// (optionally with a version constraint) or `ex:` prefix.
	reVimModeline   = regex.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`)
	// reVimLang captures the value assigned to `filetype`, `ft` or `syntax`.
	reVimLang       = regex.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`)
)
   214  
   215  // GetLanguagesByEmacsModeline returns a slice of possible languages for the given content.
   216  // It complies with the signature to be a Strategy type.
   217  func GetLanguagesByEmacsModeline(_ string, content []byte, _ []string) []string {
   218  	matched := reEmacsModeline.FindAllSubmatch(content, -1)
   219  	if matched == nil {
   220  		return nil
   221  	}
   222  
   223  	// only take the last matched line, discard previous lines
   224  	lastLineMatched := matched[len(matched)-1][1]
   225  	matchedAlias := reEmacsLang.FindSubmatch(lastLineMatched)
   226  	var alias string
   227  	if matchedAlias != nil {
   228  		alias = string(matchedAlias[1])
   229  	} else {
   230  		alias = string(lastLineMatched)
   231  	}
   232  
   233  	language, ok := GetLanguageByAlias(alias)
   234  	if !ok {
   235  		return nil
   236  	}
   237  
   238  	return []string{language}
   239  }
   240  
   241  // GetLanguagesByVimModeline returns a slice of possible languages for the given content.
   242  // It complies with the signature to be a Strategy type.
   243  func GetLanguagesByVimModeline(_ string, content []byte, _ []string) []string {
   244  	matched := reVimModeline.FindAllSubmatch(content, -1)
   245  	if matched == nil {
   246  		return nil
   247  	}
   248  
   249  	// only take the last matched line, discard previous lines
   250  	lastLineMatched := matched[len(matched)-1][1]
   251  	matchedAlias := reVimLang.FindAllSubmatch(lastLineMatched, -1)
   252  	if matchedAlias == nil {
   253  		return nil
   254  	}
   255  
   256  	alias := string(matchedAlias[0][1])
   257  	if len(matchedAlias) > 1 {
   258  		// cases:
   259  		// matchedAlias = [["syntax=ruby " "ruby"] ["ft=python " "python"] ["filetype=perl " "perl"]] returns OtherLanguage;
   260  		// matchedAlias = [["syntax=python " "python"] ["ft=python " "python"] ["filetype=python " "python"]] returns "Python";
   261  		for _, match := range matchedAlias {
   262  			otherAlias := string(match[1])
   263  			if otherAlias != alias {
   264  				return nil
   265  			}
   266  		}
   267  	}
   268  
   269  	language, ok := GetLanguageByAlias(alias)
   270  	if !ok {
   271  		return nil
   272  	}
   273  
   274  	return []string{language}
   275  }
   276  
   277  // GetLanguagesByFilename returns a slice of possible languages for the given filename.
   278  // It complies with the signature to be a Strategy type.
   279  func GetLanguagesByFilename(filename string, _ []byte, _ []string) []string {
   280  	if filename == "" {
   281  		return nil
   282  	}
   283  
   284  	return data.LanguagesByFilename[filepath.Base(filename)]
   285  }
   286  
   287  // GetLanguagesByShebang returns a slice of possible languages for the given content.
   288  // It complies with the signature to be a Strategy type.
   289  func GetLanguagesByShebang(_ string, content []byte, _ []string) (languages []string) {
   290  	interpreter := getInterpreter(content)
   291  	return data.LanguagesByInterpreter[interpreter]
   292  }
   293  
   294  var (
   295  	shebangExecHack = regex.MustCompile(`exec (\w+).+\$0.+\$@`)
   296  	pythonVersion   = regex.MustCompile(`python\d\.\d+`)
   297  	envOptArgs      = regex.MustCompile(`-[i0uCSv]*|--\S+`)
   298  	envVarArgs      = regex.MustCompile(`\S+=\S+`)
   299  )
   300  
   301  func getInterpreter(data []byte) string {
   302  	line := getFirstLine(data)
   303  	if !hasShebang(line) {
   304  		return ""
   305  	}
   306  
   307  	// skip shebang
   308  	line = bytes.TrimSpace(line[2:])
   309  	splitted := bytes.Fields(line)
   310  	if len(splitted) == 0 {
   311  		return ""
   312  	}
   313  
   314  	// Extract interpreter name from path. Use path.Base because
   315  	// shebang on Cygwin/Windows still use a forward slash
   316  	interpreter := path.Base(string(splitted[0]))
   317  
   318  	// #!/usr/bin/env [...]
   319  	if interpreter == "env" {
   320  		if len(splitted) == 1 {
   321  			// /usr/bin/env with no arguments
   322  			return ""
   323  		}
   324  		for len(splitted) > 2 {
   325  			if envOptArgs.Match(splitted[1]) || envVarArgs.Match(splitted[1]) {
   326  				splitted = append(splitted[:1], splitted[2:]...)
   327  				continue
   328  			}
   329  			break
   330  		}
   331  		interpreter = path.Base(string(splitted[1]))
   332  	}
   333  
   334  	if interpreter == "sh" {
   335  		interpreter = lookForMultilineExec(data)
   336  	}
   337  
   338  	if pythonVersion.MatchString(interpreter) {
   339  		interpreter = interpreter[:strings.Index(interpreter, `.`)]
   340  	}
   341  
   342  	// If osascript is called with argument -l it could be different language so do not relay on it
   343  	// To match linguist behaviour, see ref https://github.com/github/linguist/blob/d95bae794576ab0ef2fcb41a39eb61ea5302c5b5/lib/linguist/shebang.rb#L63
   344  	if interpreter == "osascript" && bytes.Contains(line, []byte("-l")) {
   345  		interpreter = ""
   346  	}
   347  
   348  	return interpreter
   349  }
   350  
   351  func getFirstLines(content []byte, count int) []byte {
   352  	nlpos := -1
   353  	for ; count > 0; count-- {
   354  		pos := bytes.IndexByte(content[nlpos+1:], '\n')
   355  		if pos < 0 {
   356  			return content
   357  		}
   358  		nlpos += pos + 1
   359  	}
   360  
   361  	return content[:nlpos]
   362  }
   363  
   364  func getFirstLine(content []byte) []byte {
   365  	return getFirstLines(content, 1)
   366  }
   367  
   368  func hasShebang(line []byte) bool {
   369  	const shebang = `#!`
   370  	prefix := []byte(shebang)
   371  	return bytes.HasPrefix(line, prefix)
   372  }
   373  
   374  func lookForMultilineExec(data []byte) string {
   375  	const magicNumOfLines = 5
   376  	interpreter := "sh"
   377  
   378  	buf := bufio.NewScanner(bytes.NewReader(data))
   379  	for i := 0; i < magicNumOfLines && buf.Scan(); i++ {
   380  		line := buf.Bytes()
   381  		if shebangExecHack.Match(line) {
   382  			interpreter = shebangExecHack.FindStringSubmatch(string(line))[1]
   383  			break
   384  		}
   385  	}
   386  
   387  	if err := buf.Err(); err != nil {
   388  		return interpreter
   389  	}
   390  
   391  	return interpreter
   392  }
   393  
   394  // GetLanguagesByExtension returns a slice of possible languages for the given filename.
   395  // It complies with the signature to be a Strategy type.
   396  func GetLanguagesByExtension(filename string, _ []byte, _ []string) []string {
   397  	if !strings.Contains(filename, ".") {
   398  		return nil
   399  	}
   400  
   401  	filename = strings.ToLower(filename)
   402  	dots := getDotIndexes(filename)
   403  	for _, dot := range dots {
   404  		ext := filename[dot:]
   405  		languages, ok := data.LanguagesByExtension[ext]
   406  		if ok {
   407  			return languages
   408  		}
   409  	}
   410  
   411  	return nil
   412  }
   413  
var (
	// manpageExtension matches filenames ending in a man page extension:
	// a section digit 1-9 with an optional alphanumeric suffix, `0p`, `n`,
	// `man` or `mdoc`, each optionally followed by `.in`.
	manpageExtension = regex.MustCompile(`\.(?:[1-9](?:[a-z_]+[a-z_0-9]*)?|0p|n|man|mdoc)(?:\.in)?$`)
)
   417  
   418  // GetLanguagesByManpage returns a slice of possible manpage languages for the given filename.
   419  // It complies with the signature to be a Strategy type.
   420  func GetLanguagesByManpage(filename string, _ []byte, _ []string) []string {
   421  	filename = strings.ToLower(filename)
   422  
   423  	// Check if matches Roff man page filenames
   424  	if manpageExtension.Match([]byte(filename)) {
   425  		return []string{
   426  			"Roff Manpage",
   427  			"Roff",
   428  		}
   429  	}
   430  
   431  	return nil
   432  }
   433  
   434  var (
   435  	xmlHeader = regex.MustCompile(`<?xml version=`)
   436  )
   437  
   438  // GetLanguagesByXML returns a slice of possible XML language for the given filename.
   439  // It complies with the signature to be a Strategy type.
   440  func GetLanguagesByXML(_ string, content []byte, candidates []string) []string {
   441  	if len(candidates) > 0 {
   442  		return candidates
   443  	}
   444  
   445  	header := getFirstLines(content, 2)
   446  
   447  	// Check if contains XML header
   448  	if xmlHeader.Match(header) {
   449  		return []string{
   450  			"XML",
   451  		}
   452  	}
   453  
   454  	return nil
   455  }
   456  
   457  func getDotIndexes(filename string) []int {
   458  	dots := make([]int, 0, 2)
   459  	for i, letter := range filename {
   460  		if letter == rune('.') {
   461  			dots = append(dots, i)
   462  		}
   463  	}
   464  
   465  	return dots
   466  }
   467  
   468  // GetLanguagesByContent returns a slice of languages for the given content.
   469  // It is a Strategy that uses content-based regexp heuristics and a filename extension.
   470  func GetLanguagesByContent(filename string, content []byte, _ []string) []string {
   471  	if filename == "" {
   472  		return nil
   473  	}
   474  
   475  	ext := strings.ToLower(filepath.Ext(filename))
   476  
   477  	heuristic, ok := data.ContentHeuristics[ext]
   478  	if !ok {
   479  		return nil
   480  	}
   481  
   482  	return heuristic.Match(content)
   483  }
   484  
   485  // GetLanguagesByClassifier returns a sorted slice of possible languages ordered by
   486  // decreasing language's probability. If there are not candidates it returns nil.
   487  // It is a Strategy that uses a pre-trained defaultClassifier.
   488  func GetLanguagesByClassifier(filename string, content []byte, candidates []string) (languages []string) {
   489  	if len(candidates) == 0 {
   490  		return nil
   491  	}
   492  
   493  	return getLanguagesBySpecificClassifier(content, candidates, defaultClassifier)
   494  }
   495  
   496  // getLanguagesBySpecificClassifier returns a slice of possible languages. It takes in a Classifier to be used.
   497  func getLanguagesBySpecificClassifier(content []byte, candidates []string, classifier classifier) (languages []string) {
   498  	mapCandidates := make(map[string]float64)
   499  	for _, candidate := range candidates {
   500  		mapCandidates[candidate]++
   501  	}
   502  
   503  	return classifier.classify(content, mapCandidates)
   504  }
   505  
   506  // GetLanguageExtensions returns all extensions associated with the given language.
   507  func GetLanguageExtensions(language string) []string {
   508  	return data.ExtensionsByLanguage[language]
   509  }
   510  
   511  // GetLanguageType returns the type of the given language.
   512  func GetLanguageType(language string) (langType Type) {
   513  	intType, ok := data.LanguagesType[language]
   514  	langType = Type(intType)
   515  	if !ok {
   516  		langType = Unknown
   517  	}
   518  	return langType
   519  }
   520  
   521  // GetLanguageGroup returns language group or empty string if language does not have group.
   522  func GetLanguageGroup(language string) string {
   523  	if group, ok := data.LanguagesGroup[language]; ok {
   524  		return group
   525  	}
   526  
   527  	return ""
   528  }
   529  
   530  // GetLanguageByAlias returns either the language related to the given alias and ok set to true
   531  // or Otherlanguage and ok set to false if the alias is not recognized.
   532  func GetLanguageByAlias(alias string) (lang string, ok bool) {
   533  	lang, ok = data.LanguageByAlias(alias)
   534  	if !ok {
   535  		lang = OtherLanguage
   536  	}
   537  
   538  	return
   539  }
   540  
   541  // GetLanguageID returns the ID for the language. IDs are assigned by GitHub.
   542  // The input must be the canonical language name. Aliases are not supported.
   543  //
   544  // NOTE: The zero value (0) is a valid language ID, so this API mimics the Go
   545  // map API. Use the second return value to check if the language was found.
   546  func GetLanguageID(language string) (int, bool) {
   547  	id, ok := data.IDByLanguage[language]
   548  	return id, ok
   549  }
   550  
   551  // GetLanguageInfo returns the LanguageInfo for a given language name, or an error if not found.
   552  func GetLanguageInfo(language string) (data.LanguageInfo, error) {
   553  	id, ok := GetLanguageID(language)
   554  	if !ok {
   555  		return data.LanguageInfo{}, fmt.Errorf("language %q not found", language)
   556  	}
   557  
   558  	return GetLanguageInfoByID(id)
   559  }
   560  
   561  // GetLanguageInfoByID returns the LanguageInfo for a given language ID, or an error if not found.
   562  func GetLanguageInfoByID(id int) (data.LanguageInfo, error) {
   563  	if info, ok := data.LanguageInfoByID[id]; ok {
   564  		return info, nil
   565  	}
   566  
   567  	return data.LanguageInfo{}, fmt.Errorf("language %q not found", id)
   568  }