github.com/SupersunnySea/draft@v0.16.0/pkg/linguist/util.go (about)

     1  package linguist
     2  
     3  import (
     4  	"bufio"
     5  	"bytes"
     6  	"log"
     7  	"path/filepath"
     8  	"regexp"
     9  	"strings"
    10  
    11  	"github.com/ghodss/yaml"
    12  )
    13  
    14  var (
    15  	vendorRE *regexp.Regexp
    16  	doxRE    *regexp.Regexp
    17  
    18  	extensions   = map[string][]string{}
    19  	filenames    = map[string][]string{}
    20  	interpreters = map[string][]string{}
    21  	colors       = map[string]string{}
    22  
    23  	shebangRE       = regexp.MustCompile(`^#!\s*(\S+)(?:\s+(\S+))?.*`)
    24  	scriptVersionRE = regexp.MustCompile(`((?:\d+\.?)+)`)
    25  )
    26  
    27  func init() {
    28  	var regexps []string
    29  	bytes := []byte(files["data/vendor.yml"])
    30  	if err := yaml.Unmarshal(bytes, &regexps); err != nil {
    31  		log.Fatal(err)
    32  		return
    33  	}
    34  	vendorRE = regexp.MustCompile(strings.Join(regexps, "|"))
    35  
    36  	var moreregex []string
    37  	bytes = []byte(files["data/documentation.yml"])
    38  	if err := yaml.Unmarshal(bytes, &moreregex); err != nil {
    39  		log.Fatal(err)
    40  		return
    41  	}
    42  	doxRE = regexp.MustCompile(strings.Join(moreregex, "|"))
    43  
    44  	type language struct {
    45  		Extensions   []string `yaml:"extensions,omitempty"`
    46  		Filenames    []string `yaml:"filenames,omitempty"`
    47  		Interpreters []string `yaml:"interpreters,omitempty"`
    48  		Color        string   `yaml:"color,omitempty"`
    49  	}
    50  	languages := map[string]*language{}
    51  
    52  	bytes = []byte(files["data/languages.yml"])
    53  	if err := yaml.Unmarshal(bytes, &languages); err != nil {
    54  		log.Fatal(err)
    55  	}
    56  
    57  	for n, l := range languages {
    58  		for _, e := range l.Extensions {
    59  			extensions[e] = append(extensions[e], n)
    60  		}
    61  		for _, f := range l.Filenames {
    62  			filenames[f] = append(filenames[f], n)
    63  		}
    64  		for _, i := range l.Interpreters {
    65  			interpreters[i] = append(interpreters[i], n)
    66  		}
    67  		colors[n] = l.Color
    68  	}
    69  }
    70  
    71  // LanguageColor is a convenience function that returns the color associated
    72  // with the language, in HTML Hex notation (e.g. "#123ABC")
    73  // from the languages.yml file provided by https://github.com/github/linguist
    74  //
    75  // Returns the empty string if there is no associated color for the language.
    76  func LanguageColor(language string) string {
    77  	if c, ok := colors[language]; ok {
    78  		return c
    79  	}
    80  	return ""
    81  }
    82  
    83  // LanguageByFilename attempts to determine the language of a source file based solely on
    84  // common naming conventions and file extensions
    85  // from the languages.yml file provided by https://github.com/github/linguist
    86  //
    87  // Returns the empty string in ambiguous or unrecognized cases.
    88  func LanguageByFilename(filename string) string {
    89  	if l := filenames[filename]; len(l) == 1 {
    90  		return l[0]
    91  	}
    92  	ext := filepath.Ext(filename)
    93  	if ext != "" {
    94  		if l := extensions[ext]; len(l) == 1 {
    95  			return l[0]
    96  		}
    97  	}
    98  	return ""
    99  }
   100  
   101  // LanguageHints attempts to detect all possible languages of a source file based solely on
   102  // common naming conventions and file extensions
   103  // from the languages.yml file provided by https://github.com/github/linguist
   104  //
   105  // Intended to be used with LanguageByContents.
   106  //
   107  // May return an empty slice.
   108  func LanguageHints(filename string) (hints []string) {
   109  	if l, ok := filenames[filename]; ok {
   110  		hints = append(hints, l...)
   111  	}
   112  	if ext := filepath.Ext(filename); ext != "" {
   113  		if l, ok := extensions[ext]; ok {
   114  			hints = append(hints, l...)
   115  		}
   116  	}
   117  	return hints
   118  }
   119  
   120  // LanguageByContents attempts to detect the language of a source file based on its
   121  // contents and a slice of hints to the possible answer.
   122  //
   123  // Obtain hints with LanguageHints()
   124  //
   125  // Returns the empty string a language could not be determined.
   126  func LanguageByContents(contents []byte, hints []string) string {
   127  	interpreter := detectInterpreter(contents)
   128  	if interpreter != "" {
   129  		if l := interpreters[interpreter]; len(l) == 1 {
   130  			return l[0]
   131  		}
   132  	}
   133  	return Analyse(contents, hints)
   134  }
   135  
   136  func detectInterpreter(contents []byte) string {
   137  	scanner := bufio.NewScanner(bytes.NewReader(contents))
   138  	scanner.Scan()
   139  	line := scanner.Text()
   140  	m := shebangRE.FindStringSubmatch(line)
   141  	if m == nil || len(m) != 3 {
   142  		return ""
   143  	}
   144  	base := filepath.Base(m[1])
   145  	if base == "env" && m[2] != "" {
   146  		base = m[2]
   147  	}
   148  	// Strip suffixed version number.
   149  	return scriptVersionRE.ReplaceAllString(base, "")
   150  }
   151  
   152  // ShouldIgnoreFilename checks if filename should not be passed to LanguageByFilename.
   153  //
   154  // (this simply calls IsVendored and IsDocumentation)
   155  func ShouldIgnoreFilename(filename string) bool {
   156  	vendored := IsVendored(filename)
   157  	documentation := IsDocumentation(filename)
   158  	return vendored || documentation
   159  	// return IsVendored(filename) || IsDocumentation(filename)
   160  }
   161  
// ShouldIgnoreContents reports whether contents should be skipped rather than
// passed to LanguageByContents.
//
// (this simply calls IsBinary)
func ShouldIgnoreContents(contents []byte) bool {
	return IsBinary(contents)
}
   168  
// IsVendored reports whether path matches any of the vendored-file patterns
// loaded from data/vendor.yml (compiled into vendorRE by init).
func IsVendored(path string) bool {
	return vendorRE.MatchString(path)
}
   173  
// IsDocumentation reports whether path matches any of the documentation
// patterns loaded from data/documentation.yml (compiled into doxRE by init).
func IsDocumentation(path string) bool {
	return doxRE.MatchString(path)
}
   178  
   179  // IsBinary checks contents for known character escape codes which
   180  // frequently show up in binary files but rarely (if ever) in text.
   181  //
   182  // Use this check before using LanguageFromContents to reduce likelihood
   183  // of passing binary data into it which can cause inaccurate results.
   184  func IsBinary(contents []byte) bool {
   185  	// NOTE(tso): preliminary testing on this method of checking for binary
   186  	// contents were promising, having fed a document consisting of all
   187  	// utf-8 codepoints from 0000 to FFFF with satisfactory results. Thanks
   188  	// to robpike.io/cmd/unicode:
   189  	// ```
   190  	// unicode -c $(seq 0 65535 | xargs printf "%04x ") | tr -d '\n' > unicode_test
   191  	// ```
   192  	//
   193  	// However, the intentional presence of character escape codes to throw
   194  	// this function off is entirely possible, as is, potentially, a binary
   195  	// file consisting entirely of the 4 exceptions to the rule for the first
   196  	// 512 bytes. It is also possible that more character escape codes need
   197  	// to be added.
   198  	//
   199  	// Further analysis and real world testing of this is required.
   200  	for n, b := range contents {
   201  		if n >= 512 {
   202  			break
   203  		}
   204  		if b < 32 {
   205  			switch b {
   206  			case 0:
   207  				fallthrough
   208  			case 9:
   209  				fallthrough
   210  			case 10:
   211  				fallthrough
   212  			case 13:
   213  				continue
   214  			default:
   215  				return true
   216  			}
   217  		}
   218  	}
   219  	return false
   220  }