github.com/mcuadros/go-enry@v1.7.3/common.go (about)

     1  package enry
     2  
     3  import (
     4  	"bufio"
     5  	"bytes"
     6  	"path/filepath"
     7  	"strings"
     8  
     9  	"gopkg.in/src-d/enry.v1/data"
    10  	"gopkg.in/src-d/enry.v1/regex"
    11  )
    12  
// OtherLanguage is the zero value returned when a function cannot determine a specific language.
const OtherLanguage = ""
    15  
// Strategy fixes the signature of the functions that can be used as a detection strategy.
// A strategy receives the filename, the file content and the candidate languages gathered
// so far, and returns the languages it considers possible.
type Strategy func(filename string, content []byte, candidates []string) (languages []string)
    18  
// DefaultStrategies is the sequence of strategies used by GetLanguages to detect languages.
// GetLanguages runs them in the order listed here and stops at the first strategy
// that returns exactly one language.
var DefaultStrategies = []Strategy{
	GetLanguagesByModeline,
	GetLanguagesByFilename,
	GetLanguagesByShebang,
	GetLanguagesByExtension,
	GetLanguagesByContent,
	GetLanguagesByClassifier,
}
    28  
// DefaultClassifier is a Naive Bayes classifier trained on Linguist samples.
// It is built from the precomputed probability tables of the data package and is
// the classifier used by GetLanguagesByClassifier.
var DefaultClassifier Classifier = &classifier{
	languagesLogProbabilities: data.LanguagesLogProbabilities,
	tokensLogProbabilities:    data.TokensLogProbabilities,
	tokensTotal:               data.TokensTotal,
}
    35  
    36  // GetLanguage applies a sequence of strategies based on the given filename and content
    37  // to find out the most probably language to return.
    38  func GetLanguage(filename string, content []byte) (language string) {
    39  	languages := GetLanguages(filename, content)
    40  	return firstLanguage(languages)
    41  }
    42  
    43  func firstLanguage(languages []string) string {
    44  	for _, l := range languages {
    45  		if l != "" {
    46  			return l
    47  		}
    48  	}
    49  	return OtherLanguage
    50  }
    51  
    52  // GetLanguageByModeline returns detected language. If there are more than one possibles languages
    53  // it returns the first language by alphabetically order and safe to false.
    54  func GetLanguageByModeline(content []byte) (language string, safe bool) {
    55  	return getLanguageByStrategy(GetLanguagesByModeline, "", content, nil)
    56  }
    57  
    58  // GetLanguageByEmacsModeline returns detected language. If there are more than one possibles languages
    59  // it returns the first language by alphabetically order and safe to false.
    60  func GetLanguageByEmacsModeline(content []byte) (language string, safe bool) {
    61  	return getLanguageByStrategy(GetLanguagesByEmacsModeline, "", content, nil)
    62  }
    63  
    64  // GetLanguageByVimModeline returns detected language. If there are more than one possibles languages
    65  // it returns the first language by alphabetically order and safe to false.
    66  func GetLanguageByVimModeline(content []byte) (language string, safe bool) {
    67  	return getLanguageByStrategy(GetLanguagesByVimModeline, "", content, nil)
    68  }
    69  
    70  // GetLanguageByFilename returns detected language. If there are more than one possibles languages
    71  // it returns the first language by alphabetically order and safe to false.
    72  func GetLanguageByFilename(filename string) (language string, safe bool) {
    73  	return getLanguageByStrategy(GetLanguagesByFilename, filename, nil, nil)
    74  }
    75  
    76  // GetLanguageByShebang returns detected language. If there are more than one possibles languages
    77  // it returns the first language by alphabetically order and safe to false.
    78  func GetLanguageByShebang(content []byte) (language string, safe bool) {
    79  	return getLanguageByStrategy(GetLanguagesByShebang, "", content, nil)
    80  }
    81  
    82  // GetLanguageByExtension returns detected language. If there are more than one possibles languages
    83  // it returns the first language by alphabetically order and safe to false.
    84  func GetLanguageByExtension(filename string) (language string, safe bool) {
    85  	return getLanguageByStrategy(GetLanguagesByExtension, filename, nil, nil)
    86  }
    87  
    88  // GetLanguageByContent returns detected language. If there are more than one possibles languages
    89  // it returns the first language by alphabetically order and safe to false.
    90  func GetLanguageByContent(filename string, content []byte) (language string, safe bool) {
    91  	return getLanguageByStrategy(GetLanguagesByContent, filename, content, nil)
    92  }
    93  
    94  // GetLanguageByClassifier returns the most probably language detected for the given content. It uses
    95  // DefaultClassifier, if no candidates are provided it returns OtherLanguage.
    96  func GetLanguageByClassifier(content []byte, candidates []string) (language string, safe bool) {
    97  	return getLanguageByStrategy(GetLanguagesByClassifier, "", content, candidates)
    98  }
    99  
   100  func getLanguageByStrategy(strategy Strategy, filename string, content []byte, candidates []string) (string, bool) {
   101  	languages := strategy(filename, content, candidates)
   102  	return getFirstLanguageAndSafe(languages)
   103  }
   104  
   105  func getFirstLanguageAndSafe(languages []string) (language string, safe bool) {
   106  	language = firstLanguage(languages)
   107  	safe = len(languages) == 1
   108  	return
   109  }
   110  
   111  // GetLanguageBySpecificClassifier returns the most probably language for the given content using
   112  // classifier to detect language.
   113  func GetLanguageBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (language string, safe bool) {
   114  	languages := GetLanguagesBySpecificClassifier(content, candidates, classifier)
   115  	return getFirstLanguageAndSafe(languages)
   116  }
   117  
   118  // GetLanguages applies a sequence of strategies based on the given filename and content
   119  // to find out the most probably languages to return.
   120  // At least one of arguments should be set. If content is missing, language detection will be based on the filename.
   121  // The function won't read the file, given an empty content.
   122  func GetLanguages(filename string, content []byte) []string {
   123  	if IsBinary(content) {
   124  		return nil
   125  	}
   126  
   127  	var languages []string
   128  	candidates := []string{}
   129  	for _, strategy := range DefaultStrategies {
   130  		languages = strategy(filename, content, candidates)
   131  		if len(languages) == 1 {
   132  			return languages
   133  		}
   134  
   135  		if len(languages) > 0 {
   136  			candidates = append(candidates, languages...)
   137  		}
   138  	}
   139  
   140  	return languages
   141  }
   142  
   143  // GetLanguagesByModeline returns a slice of possible languages for the given content.
   144  // It complies with the signature to be a Strategy type.
   145  func GetLanguagesByModeline(_ string, content []byte, candidates []string) []string {
   146  	headFoot := getHeaderAndFooter(content)
   147  	var languages []string
   148  	for _, getLang := range modelinesFunc {
   149  		languages = getLang("", headFoot, candidates)
   150  		if len(languages) > 0 {
   151  			break
   152  		}
   153  	}
   154  
   155  	return languages
   156  }
   157  
// modelinesFunc lists the modeline strategies tried by GetLanguagesByModeline,
// in the order they are tried.
var modelinesFunc = []Strategy{
	GetLanguagesByEmacsModeline,
	GetLanguagesByVimModeline,
}
   162  
   163  func getHeaderAndFooter(content []byte) []byte {
   164  	const searchScope = 5
   165  
   166  	if len(content) == 0 {
   167  		return content
   168  	}
   169  
   170  	if bytes.Count(content, []byte("\n")) < 2*searchScope {
   171  		return content
   172  	}
   173  
   174  	header := headScope(content, searchScope)
   175  	footer := footScope(content, searchScope)
   176  	headerAndFooter := make([]byte, 0, len(content[:header])+len(content[footer:]))
   177  	headerAndFooter = append(headerAndFooter, content[:header]...)
   178  	headerAndFooter = append(headerAndFooter, content[footer:]...)
   179  	return headerAndFooter
   180  }
   181  
   182  func headScope(content []byte, scope int) (index int) {
   183  	for i := 0; i < scope; i++ {
   184  		eol := bytes.IndexAny(content, "\n")
   185  		content = content[eol+1:]
   186  		index += eol
   187  	}
   188  
   189  	return index + scope - 1
   190  }
   191  
   192  func footScope(content []byte, scope int) (index int) {
   193  	for i := 0; i < scope; i++ {
   194  		index = bytes.LastIndexAny(content, "\n")
   195  		content = content[:index]
   196  	}
   197  
   198  	return index + 1
   199  }
   200  
// Regular expressions used to recognise editor modelines.
var (
	// reEmacsModeline matches a line holding an Emacs "-*- ... -*-" section and
	// captures the text between the two markers.
	reEmacsModeline = regex.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`)
	// reEmacsLang captures the value of a "mode:" variable (case-insensitive),
	// up to the next whitespace or semicolon.
	reEmacsLang     = regex.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`)
	// reVimModeline matches a "vi:", "vim:", "vim<version>:" or "ex:" prefix and
	// captures the rest of the line after the colon.
	reVimModeline   = regex.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`)
	// reVimLang captures the value assigned to filetype, ft or syntax (case-insensitive).
	reVimLang       = regex.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`)
)
   207  
   208  // GetLanguagesByEmacsModeline returns a slice of possible languages for the given content.
   209  // It complies with the signature to be a Strategy type.
   210  func GetLanguagesByEmacsModeline(_ string, content []byte, _ []string) []string {
   211  	matched := reEmacsModeline.FindAllSubmatch(content, -1)
   212  	if matched == nil {
   213  		return nil
   214  	}
   215  
   216  	// only take the last matched line, discard previous lines
   217  	lastLineMatched := matched[len(matched)-1][1]
   218  	matchedAlias := reEmacsLang.FindSubmatch(lastLineMatched)
   219  	var alias string
   220  	if matchedAlias != nil {
   221  		alias = string(matchedAlias[1])
   222  	} else {
   223  		alias = string(lastLineMatched)
   224  	}
   225  
   226  	language, ok := GetLanguageByAlias(alias)
   227  	if !ok {
   228  		return nil
   229  	}
   230  
   231  	return []string{language}
   232  }
   233  
   234  // GetLanguagesByVimModeline returns a slice of possible languages for the given content.
   235  // It complies with the signature to be a Strategy type.
   236  func GetLanguagesByVimModeline(_ string, content []byte, _ []string) []string {
   237  	matched := reVimModeline.FindAllSubmatch(content, -1)
   238  	if matched == nil {
   239  		return nil
   240  	}
   241  
   242  	// only take the last matched line, discard previous lines
   243  	lastLineMatched := matched[len(matched)-1][1]
   244  	matchedAlias := reVimLang.FindAllSubmatch(lastLineMatched, -1)
   245  	if matchedAlias == nil {
   246  		return nil
   247  	}
   248  
   249  	alias := string(matchedAlias[0][1])
   250  	if len(matchedAlias) > 1 {
   251  		// cases:
   252  		// matchedAlias = [["syntax=ruby " "ruby"] ["ft=python " "python"] ["filetype=perl " "perl"]] returns OtherLanguage;
   253  		// matchedAlias = [["syntax=python " "python"] ["ft=python " "python"] ["filetype=python " "python"]] returns "Python";
   254  		for _, match := range matchedAlias {
   255  			otherAlias := string(match[1])
   256  			if otherAlias != alias {
   257  				return nil
   258  			}
   259  		}
   260  	}
   261  
   262  	language, ok := GetLanguageByAlias(alias)
   263  	if !ok {
   264  		return nil
   265  	}
   266  
   267  	return []string{language}
   268  }
   269  
   270  // GetLanguagesByFilename returns a slice of possible languages for the given filename.
   271  // It complies with the signature to be a Strategy type.
   272  func GetLanguagesByFilename(filename string, _ []byte, _ []string) []string {
   273  	if filename == "" {
   274  		return nil
   275  	}
   276  
   277  	return data.LanguagesByFilename[filepath.Base(filename)]
   278  }
   279  
   280  // GetLanguagesByShebang returns a slice of possible languages for the given content.
   281  // It complies with the signature to be a Strategy type.
   282  func GetLanguagesByShebang(_ string, content []byte, _ []string) (languages []string) {
   283  	interpreter := getInterpreter(content)
   284  	return data.LanguagesByInterpreter[interpreter]
   285  }
   286  
// Regular expressions used to interpret shebang lines.
var (
	// shebangExecHack matches a line of the form "exec <word> ... $0 ... $@"
	// and captures <word>.
	shebangExecHack = regex.MustCompile(`exec (\w+).+\$0.+\$@`)
	// pythonVersion matches "python" followed by a digit, a dot and more digits.
	pythonVersion   = regex.MustCompile(`python\d\.\d+`)
)
   291  
   292  func getInterpreter(data []byte) (interpreter string) {
   293  	line := getFirstLine(data)
   294  	if !hasShebang(line) {
   295  		return ""
   296  	}
   297  
   298  	// skip shebang
   299  	line = bytes.TrimSpace(line[2:])
   300  	splitted := bytes.Fields(line)
   301  	if len(splitted) == 0 {
   302  		return ""
   303  	}
   304  
   305  	if bytes.Contains(splitted[0], []byte("env")) {
   306  		if len(splitted) > 1 {
   307  			interpreter = string(splitted[1])
   308  		}
   309  	} else {
   310  		splittedPath := bytes.Split(splitted[0], []byte{'/'})
   311  		interpreter = string(splittedPath[len(splittedPath)-1])
   312  	}
   313  
   314  	if interpreter == "sh" {
   315  		interpreter = lookForMultilineExec(data)
   316  	}
   317  
   318  	if pythonVersion.MatchString(interpreter) {
   319  		interpreter = interpreter[:strings.Index(interpreter, `.`)]
   320  	}
   321  
   322  	return
   323  }
   324  
   325  func getFirstLine(data []byte) []byte {
   326  	buf := bufio.NewScanner(bytes.NewReader(data))
   327  	buf.Scan()
   328  	line := buf.Bytes()
   329  	if err := buf.Err(); err != nil {
   330  		return nil
   331  	}
   332  
   333  	return line
   334  }
   335  
   336  func hasShebang(line []byte) bool {
   337  	const shebang = `#!`
   338  	prefix := []byte(shebang)
   339  	return bytes.HasPrefix(line, prefix)
   340  }
   341  
   342  func lookForMultilineExec(data []byte) string {
   343  	const magicNumOfLines = 5
   344  	interpreter := "sh"
   345  
   346  	buf := bufio.NewScanner(bytes.NewReader(data))
   347  	for i := 0; i < magicNumOfLines && buf.Scan(); i++ {
   348  		line := buf.Bytes()
   349  		if shebangExecHack.Match(line) {
   350  			interpreter = shebangExecHack.FindStringSubmatch(string(line))[1]
   351  			break
   352  		}
   353  	}
   354  
   355  	if err := buf.Err(); err != nil {
   356  		return interpreter
   357  	}
   358  
   359  	return interpreter
   360  }
   361  
   362  // GetLanguagesByExtension returns a slice of possible languages for the given filename.
   363  // It complies with the signature to be a Strategy type.
   364  func GetLanguagesByExtension(filename string, _ []byte, _ []string) []string {
   365  	if !strings.Contains(filename, ".") {
   366  		return nil
   367  	}
   368  
   369  	filename = strings.ToLower(filename)
   370  	dots := getDotIndexes(filename)
   371  	for _, dot := range dots {
   372  		ext := filename[dot:]
   373  		languages, ok := data.LanguagesByExtension[ext]
   374  		if ok {
   375  			return languages
   376  		}
   377  	}
   378  
   379  	return nil
   380  }
   381  
   382  func getDotIndexes(filename string) []int {
   383  	dots := make([]int, 0, 2)
   384  	for i, letter := range filename {
   385  		if letter == rune('.') {
   386  			dots = append(dots, i)
   387  		}
   388  	}
   389  
   390  	return dots
   391  }
   392  
   393  // GetLanguagesByContent returns a slice of languages for the given content.
   394  // It is a Strategy that uses content-based regexp heuristics and a filename extension.
   395  func GetLanguagesByContent(filename string, content []byte, _ []string) []string {
   396  	if filename == "" {
   397  		return nil
   398  	}
   399  
   400  	ext := strings.ToLower(filepath.Ext(filename))
   401  
   402  	heuristic, ok := data.ContentHeuristics[ext]
   403  	if !ok {
   404  		return nil
   405  	}
   406  
   407  	return heuristic.Match(content)
   408  }
   409  
   410  // GetLanguagesByClassifier uses DefaultClassifier as a Classifier and returns a sorted slice of possible languages ordered by
   411  // decreasing language's probability. If there are not candidates it returns nil. It complies with the signature to be a Strategy type.
   412  func GetLanguagesByClassifier(filename string, content []byte, candidates []string) (languages []string) {
   413  	if len(candidates) == 0 {
   414  		return nil
   415  	}
   416  
   417  	return GetLanguagesBySpecificClassifier(content, candidates, DefaultClassifier)
   418  }
   419  
   420  // GetLanguagesBySpecificClassifier returns a slice of possible languages. It takes in a Classifier to be used.
   421  func GetLanguagesBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (languages []string) {
   422  	mapCandidates := make(map[string]float64)
   423  	for _, candidate := range candidates {
   424  		mapCandidates[candidate]++
   425  	}
   426  
   427  	return classifier.Classify(content, mapCandidates)
   428  }
   429  
   430  // GetLanguageExtensions returns the different extensions being used by the language.
   431  func GetLanguageExtensions(language string) []string {
   432  	return data.ExtensionsByLanguage[language]
   433  }
   434  
// Type represents the type of a language: data, programming, markup, prose, or unknown.
type Type int

// Values a Type can take.
const (
	// Unknown is the zero value; GetLanguageType reports it for languages it does not know.
	Unknown Type = iota
	Data
	Programming
	Markup
	Prose
)
   446  
   447  // GetLanguageType returns the type of the given language.
   448  func GetLanguageType(language string) (langType Type) {
   449  	intType, ok := data.LanguagesType[language]
   450  	langType = Type(intType)
   451  	if !ok {
   452  		langType = Unknown
   453  	}
   454  	return langType
   455  }
   456  
   457  // GetLanguageByAlias returns either the language related to the given alias and ok set to true
   458  // or Otherlanguage and ok set to false if the alias is not recognized.
   459  func GetLanguageByAlias(alias string) (lang string, ok bool) {
   460  	lang, ok = data.LanguageByAlias(alias)
   461  	if !ok {
   462  		lang = OtherLanguage
   463  	}
   464  
   465  	return
   466  }