github.com/dennwc/enry@v1.6.4-0.20180424151738-42391b8e105b/common.go (about)

     1  package enry
     2  
     3  import (
     4  	"bufio"
     5  	"bytes"
     6  	"path/filepath"
     7  	"regexp"
     8  	"strings"
     9  
    10  	"gopkg.in/src-d/enry.v1/data"
    11  )
    12  
// OtherLanguage is the zero value returned when a function cannot determine
// a specific language.
const OtherLanguage = ""
    15  
// Strategy is the signature every language detection strategy must satisfy.
// A strategy receives the filename, the file content and the candidate
// languages gathered so far, and returns the languages it considers possible.
type Strategy func(filename string, content []byte, candidates []string) (languages []string)
    18  
// DefaultStrategies is the ordered sequence of strategies GetLanguages runs
// to detect languages. The classifier comes last: it returns nothing unless
// earlier strategies have produced candidates for it to rank.
var DefaultStrategies = []Strategy{
	GetLanguagesByModeline,
	GetLanguagesByFilename,
	GetLanguagesByShebang,
	GetLanguagesByExtension,
	GetLanguagesByContent,
	GetLanguagesByClassifier,
}
    28  
// DefaultClassifier is the Classifier used by GetLanguagesByClassifier. It is
// built from the probability tables shipped in the data package.
var DefaultClassifier Classifier = &classifier{
	languagesLogProbabilities: data.LanguagesLogProbabilities,
	tokensLogProbabilities:    data.TokensLogProbabilities,
	tokensTotal:               data.TokensTotal,
}
    34  
    35  // GetLanguage applies a sequence of strategies based on the given filename and content
    36  // to find out the most probably language to return.
    37  func GetLanguage(filename string, content []byte) (language string) {
    38  	languages := GetLanguages(filename, content)
    39  	return firstLanguage(languages)
    40  }
    41  
    42  func firstLanguage(languages []string) string {
    43  	if len(languages) == 0 {
    44  		return OtherLanguage
    45  	}
    46  
    47  	return languages[0]
    48  }
    49  
    50  // GetLanguageByModeline returns detected language. If there are more than one possibles languages
    51  // it returns the first language by alphabetically order and safe to false.
    52  func GetLanguageByModeline(content []byte) (language string, safe bool) {
    53  	return getLanguageByStrategy(GetLanguagesByModeline, "", content, nil)
    54  }
    55  
    56  // GetLanguageByEmacsModeline returns detected language. If there are more than one possibles languages
    57  // it returns the first language by alphabetically order and safe to false.
    58  func GetLanguageByEmacsModeline(content []byte) (language string, safe bool) {
    59  	return getLanguageByStrategy(GetLanguagesByEmacsModeline, "", content, nil)
    60  }
    61  
    62  // GetLanguageByVimModeline returns detected language. If there are more than one possibles languages
    63  // it returns the first language by alphabetically order and safe to false.
    64  func GetLanguageByVimModeline(content []byte) (language string, safe bool) {
    65  	return getLanguageByStrategy(GetLanguagesByVimModeline, "", content, nil)
    66  }
    67  
    68  // GetLanguageByFilename returns detected language. If there are more than one possibles languages
    69  // it returns the first language by alphabetically order and safe to false.
    70  func GetLanguageByFilename(filename string) (language string, safe bool) {
    71  	return getLanguageByStrategy(GetLanguagesByFilename, filename, nil, nil)
    72  }
    73  
    74  // GetLanguageByShebang returns detected language. If there are more than one possibles languages
    75  // it returns the first language by alphabetically order and safe to false.
    76  func GetLanguageByShebang(content []byte) (language string, safe bool) {
    77  	return getLanguageByStrategy(GetLanguagesByShebang, "", content, nil)
    78  }
    79  
    80  // GetLanguageByExtension returns detected language. If there are more than one possibles languages
    81  // it returns the first language by alphabetically order and safe to false.
    82  func GetLanguageByExtension(filename string) (language string, safe bool) {
    83  	return getLanguageByStrategy(GetLanguagesByExtension, filename, nil, nil)
    84  }
    85  
    86  // GetLanguageByContent returns detected language. If there are more than one possibles languages
    87  // it returns the first language by alphabetically order and safe to false.
    88  func GetLanguageByContent(filename string, content []byte) (language string, safe bool) {
    89  	return getLanguageByStrategy(GetLanguagesByContent, filename, content, nil)
    90  }
    91  
    92  // GetLanguageByClassifier returns the most probably language detected for the given content. It uses
    93  // DefaultClassifier, if no candidates are provided it returns OtherLanguage.
    94  func GetLanguageByClassifier(content []byte, candidates []string) (language string, safe bool) {
    95  	return getLanguageByStrategy(GetLanguagesByClassifier, "", content, candidates)
    96  }
    97  
    98  func getLanguageByStrategy(strategy Strategy, filename string, content []byte, candidates []string) (string, bool) {
    99  	languages := strategy(filename, content, candidates)
   100  	return getFirstLanguageAndSafe(languages)
   101  }
   102  
   103  func getFirstLanguageAndSafe(languages []string) (language string, safe bool) {
   104  	language = firstLanguage(languages)
   105  	safe = len(languages) == 1
   106  	return
   107  }
   108  
   109  // GetLanguageBySpecificClassifier returns the most probably language for the given content using
   110  // classifier to detect language.
   111  func GetLanguageBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (language string, safe bool) {
   112  	languages := GetLanguagesBySpecificClassifier(content, candidates, classifier)
   113  	return getFirstLanguageAndSafe(languages)
   114  }
   115  
   116  // GetLanguages applies a sequence of strategies based on the given filename and content
   117  // to find out the most probably languages to return.
   118  func GetLanguages(filename string, content []byte) []string {
   119  	if IsBinary(content) {
   120  		return nil
   121  	}
   122  
   123  	var languages []string
   124  	candidates := []string{}
   125  	for _, strategy := range DefaultStrategies {
   126  		languages = strategy(filename, content, candidates)
   127  		if len(languages) == 1 {
   128  			return languages
   129  		}
   130  
   131  		if len(languages) > 0 {
   132  			candidates = append(candidates, languages...)
   133  		}
   134  	}
   135  
   136  	return languages
   137  }
   138  
   139  // GetLanguagesByModeline returns a slice of possible languages for the given content.
   140  // It complies with the signature to be a Strategy type.
   141  func GetLanguagesByModeline(_ string, content []byte, candidates []string) []string {
   142  	headFoot := getHeaderAndFooter(content)
   143  	var languages []string
   144  	for _, getLang := range modelinesFunc {
   145  		languages = getLang("", headFoot, candidates)
   146  		if len(languages) > 0 {
   147  			break
   148  		}
   149  	}
   150  
   151  	return languages
   152  }
   153  
// modelinesFunc lists the modeline strategies GetLanguagesByModeline tries,
// in order: Emacs first, then Vim.
var modelinesFunc = []Strategy{
	GetLanguagesByEmacsModeline,
	GetLanguagesByVimModeline,
}
   158  
   159  func getHeaderAndFooter(content []byte) []byte {
   160  	const searchScope = 5
   161  
   162  	if len(content) == 0 {
   163  		return content
   164  	}
   165  
   166  	if bytes.Count(content, []byte("\n")) < 2*searchScope {
   167  		return content
   168  	}
   169  
   170  	header := headScope(content, searchScope)
   171  	footer := footScope(content, searchScope)
   172  	headerAndFooter := make([]byte, 0, len(content[:header])+len(content[footer:]))
   173  	headerAndFooter = append(headerAndFooter, content[:header]...)
   174  	headerAndFooter = append(headerAndFooter, content[footer:]...)
   175  	return headerAndFooter
   176  }
   177  
   178  func headScope(content []byte, scope int) (index int) {
   179  	for i := 0; i < scope; i++ {
   180  		eol := bytes.IndexAny(content, "\n")
   181  		content = content[eol+1:]
   182  		index += eol
   183  	}
   184  
   185  	return index + scope - 1
   186  }
   187  
   188  func footScope(content []byte, scope int) (index int) {
   189  	for i := 0; i < scope; i++ {
   190  		index = bytes.LastIndexAny(content, "\n")
   191  		content = content[:index]
   192  	}
   193  
   194  	return index + 1
   195  }
   196  
var (
	// reEmacsModeline matches a line holding an Emacs modeline and captures
	// the text found between the two -*- markers.
	reEmacsModeline = regexp.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`)
	// reEmacsLang captures the value of the (case-insensitive) "mode:" key
	// inside an Emacs modeline.
	reEmacsLang     = regexp.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`)
	// reVimModeline matches a vi/vim/ex modeline prefix and captures the
	// rest of the line after the colon.
	reVimModeline   = regexp.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`)
	// reVimLang captures the value assigned to filetype, ft or syntax inside
	// a Vim modeline.
	reVimLang       = regexp.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`)
)
   203  
   204  // GetLanguagesByEmacsModeline returns a slice of possible languages for the given content.
   205  // It complies with the signature to be a Strategy type.
   206  func GetLanguagesByEmacsModeline(_ string, content []byte, _ []string) []string {
   207  	matched := reEmacsModeline.FindAllSubmatch(content, -1)
   208  	if matched == nil {
   209  		return nil
   210  	}
   211  
   212  	// only take the last matched line, discard previous lines
   213  	lastLineMatched := matched[len(matched)-1][1]
   214  	matchedAlias := reEmacsLang.FindSubmatch(lastLineMatched)
   215  	var alias string
   216  	if matchedAlias != nil {
   217  		alias = string(matchedAlias[1])
   218  	} else {
   219  		alias = string(lastLineMatched)
   220  	}
   221  
   222  	language, ok := GetLanguageByAlias(alias)
   223  	if !ok {
   224  		return nil
   225  	}
   226  
   227  	return []string{language}
   228  }
   229  
   230  // GetLanguagesByVimModeline returns a slice of possible languages for the given content.
   231  // It complies with the signature to be a Strategy type.
   232  func GetLanguagesByVimModeline(_ string, content []byte, _ []string) []string {
   233  	matched := reVimModeline.FindAllSubmatch(content, -1)
   234  	if matched == nil {
   235  		return nil
   236  	}
   237  
   238  	// only take the last matched line, discard previous lines
   239  	lastLineMatched := matched[len(matched)-1][1]
   240  	matchedAlias := reVimLang.FindAllSubmatch(lastLineMatched, -1)
   241  	if matchedAlias == nil {
   242  		return nil
   243  	}
   244  
   245  	alias := string(matchedAlias[0][1])
   246  	if len(matchedAlias) > 1 {
   247  		// cases:
   248  		// matchedAlias = [["syntax=ruby " "ruby"] ["ft=python " "python"] ["filetype=perl " "perl"]] returns OtherLanguage;
   249  		// matchedAlias = [["syntax=python " "python"] ["ft=python " "python"] ["filetype=python " "python"]] returns "Python";
   250  		for _, match := range matchedAlias {
   251  			otherAlias := string(match[1])
   252  			if otherAlias != alias {
   253  				return nil
   254  			}
   255  		}
   256  	}
   257  
   258  	language, ok := GetLanguageByAlias(alias)
   259  	if !ok {
   260  		return nil
   261  	}
   262  
   263  	return []string{language}
   264  }
   265  
   266  // GetLanguagesByFilename returns a slice of possible languages for the given filename.
   267  // It complies with the signature to be a Strategy type.
   268  func GetLanguagesByFilename(filename string, _ []byte, _ []string) []string {
   269  	if filename == "" {
   270  		return nil
   271  	}
   272  
   273  	return data.LanguagesByFilename[filepath.Base(filename)]
   274  }
   275  
   276  // GetLanguagesByShebang returns a slice of possible languages for the given content.
   277  // It complies with the signature to be a Strategy type.
   278  func GetLanguagesByShebang(_ string, content []byte, _ []string) (languages []string) {
   279  	interpreter := getInterpreter(content)
   280  	return data.LanguagesByInterpreter[interpreter]
   281  }
   282  
   283  var (
   284  	shebangExecHack = regexp.MustCompile(`exec (\w+).+\$0.+\$@`)
   285  	pythonVersion   = regexp.MustCompile(`python\d\.\d+`)
   286  )
   287  
   288  func getInterpreter(data []byte) (interpreter string) {
   289  	line := getFirstLine(data)
   290  	if !hasShebang(line) {
   291  		return ""
   292  	}
   293  
   294  	// skip shebang
   295  	line = bytes.TrimSpace(line[2:])
   296  	splitted := bytes.Fields(line)
   297  	if len(splitted) == 0 {
   298  		return ""
   299  	}
   300  
   301  	if bytes.Contains(splitted[0], []byte("env")) {
   302  		if len(splitted) > 1 {
   303  			interpreter = string(splitted[1])
   304  		}
   305  	} else {
   306  		splittedPath := bytes.Split(splitted[0], []byte{'/'})
   307  		interpreter = string(splittedPath[len(splittedPath)-1])
   308  	}
   309  
   310  	if interpreter == "sh" {
   311  		interpreter = lookForMultilineExec(data)
   312  	}
   313  
   314  	if pythonVersion.MatchString(interpreter) {
   315  		interpreter = interpreter[:strings.Index(interpreter, `.`)]
   316  	}
   317  
   318  	return
   319  }
   320  
   321  func getFirstLine(data []byte) []byte {
   322  	buf := bufio.NewScanner(bytes.NewReader(data))
   323  	buf.Scan()
   324  	line := buf.Bytes()
   325  	if err := buf.Err(); err != nil {
   326  		return nil
   327  	}
   328  
   329  	return line
   330  }
   331  
   332  func hasShebang(line []byte) bool {
   333  	const shebang = `#!`
   334  	prefix := []byte(shebang)
   335  	return bytes.HasPrefix(line, prefix)
   336  }
   337  
   338  func lookForMultilineExec(data []byte) string {
   339  	const magicNumOfLines = 5
   340  	interpreter := "sh"
   341  
   342  	buf := bufio.NewScanner(bytes.NewReader(data))
   343  	for i := 0; i < magicNumOfLines && buf.Scan(); i++ {
   344  		line := buf.Bytes()
   345  		if shebangExecHack.Match(line) {
   346  			interpreter = shebangExecHack.FindStringSubmatch(string(line))[1]
   347  			break
   348  		}
   349  	}
   350  
   351  	if err := buf.Err(); err != nil {
   352  		return interpreter
   353  	}
   354  
   355  	return interpreter
   356  }
   357  
   358  // GetLanguagesByExtension returns a slice of possible languages for the given filename.
   359  // It complies with the signature to be a Strategy type.
   360  func GetLanguagesByExtension(filename string, _ []byte, _ []string) []string {
   361  	if !strings.Contains(filename, ".") {
   362  		return nil
   363  	}
   364  
   365  	filename = strings.ToLower(filename)
   366  	dots := getDotIndexes(filename)
   367  	for _, dot := range dots {
   368  		ext := filename[dot:]
   369  		languages, ok := data.LanguagesByExtension[ext]
   370  		if ok {
   371  			return languages
   372  		}
   373  	}
   374  
   375  	return nil
   376  }
   377  
   378  func getDotIndexes(filename string) []int {
   379  	dots := make([]int, 0, 2)
   380  	for i, letter := range filename {
   381  		if letter == rune('.') {
   382  			dots = append(dots, i)
   383  		}
   384  	}
   385  
   386  	return dots
   387  }
   388  
   389  // GetLanguagesByContent returns a slice of possible languages for the given content.
   390  // It complies with the signature to be a Strategy type.
   391  func GetLanguagesByContent(filename string, content []byte, _ []string) []string {
   392  	if filename == "" {
   393  		return nil
   394  	}
   395  
   396  	ext := strings.ToLower(filepath.Ext(filename))
   397  	fnMatcher, ok := data.ContentMatchers[ext]
   398  	if !ok {
   399  		return nil
   400  	}
   401  
   402  	return fnMatcher(content)
   403  }
   404  
   405  // GetLanguagesByClassifier uses DefaultClassifier as a Classifier and returns a sorted slice of possible languages ordered by
   406  // decreasing language's probability. If there are not candidates it returns nil. It complies with the signature to be a Strategy type.
   407  func GetLanguagesByClassifier(filename string, content []byte, candidates []string) (languages []string) {
   408  	if len(candidates) == 0 {
   409  		return nil
   410  	}
   411  
   412  	return GetLanguagesBySpecificClassifier(content, candidates, DefaultClassifier)
   413  }
   414  
   415  // GetLanguagesBySpecificClassifier returns a slice of possible languages. It takes in a Classifier to be used.
   416  func GetLanguagesBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (languages []string) {
   417  	mapCandidates := make(map[string]float64)
   418  	for _, candidate := range candidates {
   419  		mapCandidates[candidate]++
   420  	}
   421  
   422  	return classifier.Classify(content, mapCandidates)
   423  }
   424  
   425  // GetLanguageExtensions returns the different extensions being used by the language.
   426  func GetLanguageExtensions(language string) []string {
   427  	return data.ExtensionsByLanguage[language]
   428  }
   429  
// Type represents a language's type: data, programming, markup, prose, or
// unknown.
type Type int

// Values a Type can take. Unknown is the zero value.
const (
	Unknown Type = iota
	Data
	Programming
	Markup
	Prose
)
   441  
   442  // GetLanguageType returns the type of the given language.
   443  func GetLanguageType(language string) (langType Type) {
   444  	intType, ok := data.LanguagesType[language]
   445  	langType = Type(intType)
   446  	if !ok {
   447  		langType = Unknown
   448  	}
   449  	return langType
   450  }
   451  
   452  // GetLanguageByAlias returns either the language related to the given alias and ok set to true
   453  // or Otherlanguage and ok set to false if the alias is not recognized.
   454  func GetLanguageByAlias(alias string) (lang string, ok bool) {
   455  	a := strings.Split(alias, `,`)[0]
   456  	a = strings.ToLower(a)
   457  	lang, ok = data.LanguagesByAlias[a]
   458  	if !ok {
   459  		lang = OtherLanguage
   460  	}
   461  
   462  	return
   463  }