github.com/bzz/enry@v1.6.7/internal/code-generator/generator/heuristics.go (about)

     1  package generator
     2  
     3  import (
     4  	"bufio"
     5  	"bytes"
     6  	"fmt"
     7  	"io"
     8  	"io/ioutil"
     9  	"strconv"
    10  	"strings"
    11  	"text/template"
    12  
    13  	"gopkg.in/src-d/enry.v1/regex"
    14  )
    15  
    16  // Heuristics reads from fileToParse and builds source file from tmplPath. It complies with type File signature.
    17  func Heuristics(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit string) error {
    18  	data, err := ioutil.ReadFile(fileToParse)
    19  	if err != nil {
    20  		return err
    21  	}
    22  
    23  	disambiguators, err := getDisambiguators(data)
    24  	if err != nil {
    25  		return err
    26  	}
    27  
    28  	buf := &bytes.Buffer{}
    29  	if err := executeContentTemplate(buf, disambiguators, tmplPath, tmplName, commit); err != nil {
    30  		return err
    31  	}
    32  
    33  	return formatedWrite(outPath, buf.Bytes())
    34  }
    35  
const (
	// unknownLanguage is not referenced in the visible part of this file; presumably it
	// is a placeholder language name used elsewhere in the generator — TODO confirm.
	unknownLanguage = "OtherLanguage"
	// emptyFile is the regexp emitted for a Ruby `.empty?` check (see getHeuristics);
	// convertToValidRegexp returns it unchanged.
	emptyFile       = "^$"
)
    40  
var (
	// disambLine matches a line that starts, after optional whitespace, with the word
	// "disambiguate", i.e. the opening line of a disambiguate block.
	disambLine       = regex.MustCompile(`^(\s*)disambiguate`)
	// definedRegs maps the name of a regexp variable found by lookForRegexpVariables to
	// the text assigned to it; replaceRegexpVariables reads it. It is package-level
	// mutable state filled while parsing.
	definedRegs      = make(map[string]string)
	// illegalCharacter maps characters that buildName replaces in generated names to
	// the word that substitutes them.
	illegalCharacter = map[string]string{
		"#": "Sharp",
		"+": "Plus",
		"-": "Dash",
	}
)
    50  
// disambiguator holds what was parsed from one `disambiguate` block for a single
// file extension.
type disambiguator struct {
	// Extension is the quoted extension taken from the `disambiguate` line, with the
	// surrounding quotes and commas trimmed (see parseDisambiguators).
	Extension string                `json:"extension,omitempty"`
	// Languages lists the candidate languages of the block with their heuristics.
	Languages []*languageHeuristics `json:"languages,omitempty"`
}
    55  
    56  func (d *disambiguator) setHeuristicsNames() {
    57  	for _, lang := range d.Languages {
    58  		for i, heuristic := range lang.Heuristics {
    59  			name := buildName(d.Extension, lang.Language, i)
    60  			heuristic.Name = name
    61  		}
    62  	}
    63  }
    64  
    65  func buildName(extension, language string, id int) string {
    66  	extension = strings.TrimPrefix(extension, `.`)
    67  	language = strings.Join(strings.Fields(language), ``)
    68  	name := strings.Join([]string{extension, language, "Matcher", strconv.Itoa(id)}, `_`)
    69  	for k, v := range illegalCharacter {
    70  		if strings.Contains(name, k) {
    71  			name = strings.Replace(name, k, v, -1)
    72  		}
    73  	}
    74  
    75  	return name
    76  }
    77  
// languageHeuristics couples one candidate language with the heuristics that select it.
type languageHeuristics struct {
	// Language is the language name extracted from a `Language["..."]` expression.
	Language       string       `json:"language,omitempty"`
	// Heuristics are the regexps parsed from the condition preceding the language;
	// empty when the language has no condition (see returnLanguages).
	Heuristics     []*heuristic `json:"heuristics,omitempty"`
	// LogicRelations holds the "&&" / "||" operators found in the condition, as
	// produced by getLogicRelations.
	LogicRelations []string     `json:"logic_relations,omitempty"`
}
    83  
    84  func (l *languageHeuristics) clone() (*languageHeuristics, error) {
    85  	language := l.Language
    86  	logicRels := make([]string, len(l.LogicRelations))
    87  	if copy(logicRels, l.LogicRelations) != len(l.LogicRelations) {
    88  		return nil, fmt.Errorf("error copying logic relations")
    89  	}
    90  
    91  	heuristics := make([]*heuristic, 0, len(l.Heuristics))
    92  	for _, h := range l.Heuristics {
    93  		heuristic := *h
    94  		heuristics = append(heuristics, &heuristic)
    95  	}
    96  
    97  	clone := &languageHeuristics{
    98  		Language:       language,
    99  		Heuristics:     heuristics,
   100  		LogicRelations: logicRels,
   101  	}
   102  
   103  	return clone, nil
   104  }
   105  
// heuristic is a single named regexp of a disambiguation rule.
type heuristic struct {
	// Name is assigned by disambiguator.setHeuristicsNames.
	Name   string `json:"name,omitempty"`
	// Regexp is the pattern produced by getHeuristics from one operand of a condition.
	Regexp string `json:"regexp,omitempty"`
}
   110  
   111  // A disambiguate block looks like:
   112  // disambiguate ".mod", ".extension" do |data|
   113  // 	if data.include?('<!ENTITY ') && data.include?('patata')
   114  // 		Language["XML"]
   115  // 	elsif /^\s*MODULE [\w\.]+;/i.match(data) || /^\s*END [\w\.]+;/i.match(data) || data.empty?
   116  // 		Language["Modula-2"]
   117  //	elsif (/^\s*import (scala|java)\./.match(data) || /^\s*val\s+\w+\s*=/.match(data) || /^\s*class\b/.match(data))
   118  //              Language["Scala"]
   119  //      elsif (data.include?("gap> "))
   120  //		Language["GAP"]
   121  // 	else
   122  // 		[Language["Linux Kernel Module"], Language["AMPL"]]
   123  // 	end
   124  // end
   125  func getDisambiguators(heuristics []byte) ([]*disambiguator, error) {
   126  	seenExtensions := map[string]bool{}
   127  	buf := bufio.NewScanner(bytes.NewReader(heuristics))
   128  	disambiguators := make([]*disambiguator, 0, 50)
   129  	for buf.Scan() {
   130  		line := buf.Text()
   131  		if disambLine.MatchString(line) {
   132  			d, err := parseDisambiguators(line, buf, seenExtensions)
   133  			if err != nil {
   134  				return nil, err
   135  			}
   136  
   137  			disambiguators = append(disambiguators, d...)
   138  		}
   139  
   140  		lookForRegexpVariables(line)
   141  	}
   142  
   143  	if err := buf.Err(); err != nil {
   144  		return nil, err
   145  	}
   146  
   147  	return disambiguators, nil
   148  }
   149  
   150  func lookForRegexpVariables(line string) {
   151  	if strings.Contains(line, "ObjectiveCRegex = ") {
   152  		line = strings.TrimSpace(line)
   153  		reg := strings.TrimPrefix(line, "ObjectiveCRegex = ")
   154  		definedRegs["ObjectiveCRegex"] = reg
   155  	}
   156  
   157  	if strings.Contains(line, "fortran_rx = ") {
   158  		line = strings.TrimSpace(line)
   159  		reg := strings.TrimPrefix(line, "fortran_rx = ")
   160  		definedRegs["fortran_rx"] = reg
   161  	}
   162  }
   163  
   164  func parseDisambiguators(line string, buf *bufio.Scanner, seenExtensions map[string]bool) ([]*disambiguator, error) {
   165  	disambList := make([]*disambiguator, 0, 2)
   166  	splitted := strings.Fields(line)
   167  
   168  	for _, v := range splitted {
   169  		if strings.HasPrefix(v, `"`) {
   170  			extension := strings.Trim(v, `",`)
   171  			if _, ok := seenExtensions[extension]; !ok {
   172  				d := &disambiguator{Extension: extension}
   173  				disambList = append(disambList, d)
   174  				seenExtensions[extension] = true
   175  			}
   176  		}
   177  	}
   178  
   179  	langsHeuristics, err := getLanguagesHeuristics(buf)
   180  	if err != nil {
   181  		return nil, err
   182  	}
   183  
   184  	for i, disamb := range disambList {
   185  		lh := langsHeuristics
   186  		if i != 0 {
   187  			lh = cloneLanguagesHeuristics(langsHeuristics)
   188  		}
   189  
   190  		disamb.Languages = lh
   191  		disamb.setHeuristicsNames()
   192  	}
   193  
   194  	return disambList, nil
   195  }
   196  
   197  func cloneLanguagesHeuristics(list []*languageHeuristics) []*languageHeuristics {
   198  	cloneList := make([]*languageHeuristics, 0, len(list))
   199  	for _, langHeu := range list {
   200  		clone, _ := langHeu.clone()
   201  		cloneList = append(cloneList, clone)
   202  	}
   203  
   204  	return cloneList
   205  }
   206  
   207  func getLanguagesHeuristics(buf *bufio.Scanner) ([]*languageHeuristics, error) {
   208  	langsList := make([][]string, 0, 2)
   209  	heuristicsList := make([][]*heuristic, 0, 1)
   210  	logicRelsList := make([][]string, 0, 1)
   211  
   212  	lastWasMatch := false
   213  	for buf.Scan() {
   214  		line := buf.Text()
   215  		if strings.TrimSpace(line) == "end" {
   216  			break
   217  		}
   218  
   219  		if hasRegExp(line) {
   220  			line := cleanRegExpLine(line)
   221  
   222  			logicRels := getLogicRelations(line)
   223  			heuristics := getHeuristics(line)
   224  			if lastWasMatch {
   225  				i := len(heuristicsList) - 1
   226  				heuristicsList[i] = append(heuristicsList[i], heuristics...)
   227  				i = len(logicRelsList) - 1
   228  				logicRelsList[i] = append(logicRelsList[i], logicRels...)
   229  			} else {
   230  				heuristicsList = append(heuristicsList, heuristics)
   231  				logicRelsList = append(logicRelsList, logicRels)
   232  			}
   233  
   234  			lastWasMatch = true
   235  		}
   236  
   237  		if strings.Contains(line, "Language") {
   238  			langs := getLanguages(line)
   239  			langsList = append(langsList, langs)
   240  			lastWasMatch = false
   241  		}
   242  
   243  	}
   244  
   245  	if err := buf.Err(); err != nil {
   246  		return nil, err
   247  	}
   248  
   249  	langsHeuristics := buildLanguagesHeuristics(langsList, heuristicsList, logicRelsList)
   250  	return langsHeuristics, nil
   251  }
   252  
   253  func hasRegExp(line string) bool {
   254  	return strings.Contains(line, ".match") || strings.Contains(line, ".include?") || strings.Contains(line, ".empty?")
   255  }
   256  
   257  func cleanRegExpLine(line string) string {
   258  	if strings.Contains(line, "if ") {
   259  		line = line[strings.Index(line, `if `)+3:]
   260  	}
   261  
   262  	line = strings.TrimSpace(line)
   263  	line = strings.TrimPrefix(line, `(`)
   264  	if strings.Contains(line, "))") {
   265  		line = strings.TrimSuffix(line, `)`)
   266  	}
   267  
   268  	return line
   269  }
   270  
   271  func getLogicRelations(line string) []string {
   272  	rels := make([]string, 0)
   273  	splitted := strings.Split(line, "||")
   274  	for i, v := range splitted {
   275  		if strings.Contains(v, "&&") {
   276  			rels = append(rels, "&&")
   277  		}
   278  
   279  		if i < len(splitted)-1 {
   280  			rels = append(rels, "||")
   281  		}
   282  	}
   283  
   284  	if len(rels) == 0 {
   285  		rels = nil
   286  	}
   287  
   288  	return rels
   289  }
   290  
   291  func getHeuristics(line string) []*heuristic {
   292  	splitted := splitByLogicOps(line)
   293  	heuristics := make([]*heuristic, 0, len(splitted))
   294  	for _, v := range splitted {
   295  		v = strings.TrimSpace(v)
   296  		var reg string
   297  
   298  		if strings.Contains(v, ".match") {
   299  			reg = v[:strings.Index(v, ".match")]
   300  			reg = replaceRegexpVariables(reg)
   301  		}
   302  
   303  		if strings.Contains(v, ".include?") {
   304  			reg = includeToRegExp(v)
   305  		}
   306  
   307  		if strings.Contains(v, ".empty?") {
   308  			reg = emptyFile
   309  		}
   310  
   311  		if reg != "" {
   312  			reg = convertToValidRegexp(reg)
   313  			heuristics = append(heuristics, &heuristic{Regexp: reg})
   314  		}
   315  	}
   316  
   317  	return heuristics
   318  }
   319  
   320  func splitByLogicOps(line string) []string {
   321  	splitted := make([]string, 0, 1)
   322  	splitOr := strings.Split(line, "||")
   323  	for _, v := range splitOr {
   324  		splitAnd := strings.Split(v, "&&")
   325  		splitted = append(splitted, splitAnd...)
   326  	}
   327  
   328  	return splitted
   329  }
   330  
   331  func replaceRegexpVariables(reg string) string {
   332  	repl := reg
   333  	if v, ok := definedRegs[reg]; ok {
   334  		repl = v
   335  	}
   336  
   337  	return repl
   338  }
   339  
   340  func convertToValidRegexp(reg string) string {
   341  	// example: `/^(\s*)(<Project|<Import|<Property|<?xml|xmlns)/i``
   342  	// Ruby modifier "m" matches multiple lines, recognizing newlines as normal characters, Go use flag "s" for that.
   343  	const (
   344  		caseSensitive = "i"
   345  		matchEOL      = "s"
   346  
   347  		rubyCaseSensitive = "i"
   348  		rubyMultiLine     = "m"
   349  	)
   350  
   351  	if reg == emptyFile {
   352  		return reg
   353  	}
   354  
   355  	reg = strings.TrimPrefix(reg, `/`)
   356  	flags := "(?m"
   357  	lastSlash := strings.LastIndex(reg, `/`)
   358  	if lastSlash == -1 {
   359  		return flags + ")" + reg
   360  	}
   361  
   362  	specialChars := reg[lastSlash:]
   363  	reg = reg[:lastSlash]
   364  	if lastSlash == len(reg)-1 {
   365  		return flags + ")" + reg
   366  	}
   367  
   368  	if strings.Contains(specialChars, rubyCaseSensitive) {
   369  		flags = flags + caseSensitive
   370  	}
   371  
   372  	if strings.Contains(specialChars, rubyMultiLine) {
   373  		flags = flags + matchEOL
   374  	}
   375  
   376  	return flags + ")" + reg
   377  }
   378  
   379  func includeToRegExp(include string) string {
   380  	content := include[strings.Index(include, `(`)+1 : strings.Index(include, `)`)]
   381  	content = strings.Trim(content, `"'`)
   382  	return regex.QuoteMeta(content)
   383  }
   384  
   385  func getLanguages(line string) []string {
   386  	languages := make([]string, 0)
   387  	splitted := strings.Split(line, `,`)
   388  	for _, lang := range splitted {
   389  		lang = trimLanguage(lang)
   390  		languages = append(languages, lang)
   391  	}
   392  
   393  	return languages
   394  }
   395  
   396  func trimLanguage(enclosedLang string) string {
   397  	lang := strings.TrimSpace(enclosedLang)
   398  	lang = lang[strings.Index(lang, `"`)+1:]
   399  	lang = lang[:strings.Index(lang, `"`)]
   400  	return lang
   401  }
   402  
   403  func buildLanguagesHeuristics(langsList [][]string, heuristicsList [][]*heuristic, logicRelsList [][]string) []*languageHeuristics {
   404  	langsHeuristics := make([]*languageHeuristics, 0, len(langsList))
   405  	for i, langSlice := range langsList {
   406  		var heuristics []*heuristic
   407  		if i < len(heuristicsList) {
   408  			heuristics = heuristicsList[i]
   409  		}
   410  
   411  		var rels []string
   412  		if i < len(logicRelsList) {
   413  			rels = logicRelsList[i]
   414  		}
   415  
   416  		for _, lang := range langSlice {
   417  			lh := &languageHeuristics{
   418  				Language:       lang,
   419  				Heuristics:     heuristics,
   420  				LogicRelations: rels,
   421  			}
   422  
   423  			langsHeuristics = append(langsHeuristics, lh)
   424  		}
   425  	}
   426  
   427  	return langsHeuristics
   428  }
   429  
   430  func executeContentTemplate(out io.Writer, disambiguators []*disambiguator, tmplPath, tmplName, commit string) error {
   431  	fmap := template.FuncMap{
   432  		"getAllHeuristics": getAllHeuristics,
   433  		"returnStringSlice": func(slice []string) string {
   434  			if len(slice) == 0 {
   435  				return "nil"
   436  			}
   437  
   438  			return `[]string{` + strings.Join(slice, `, `) + `}`
   439  		},
   440  		"returnLanguages": returnLanguages,
   441  		"avoidLanguage":   avoidLanguage,
   442  	}
   443  	return executeTemplate(out, tmplName, tmplPath, commit, fmap, disambiguators)
   444  }
   445  
   446  func getAllHeuristics(disambiguators []*disambiguator) []*heuristic {
   447  	heuristics := make([]*heuristic, 0)
   448  	for _, disamb := range disambiguators {
   449  		for _, lang := range disamb.Languages {
   450  			if !avoidLanguage(lang) {
   451  				heuristics = append(heuristics, lang.Heuristics...)
   452  			}
   453  		}
   454  	}
   455  
   456  	return heuristics
   457  }
   458  
   459  func avoidLanguage(lang *languageHeuristics) bool {
   460  	// necessary to avoid corner cases
   461  	for _, heuristic := range lang.Heuristics {
   462  		if containsInvalidRegexp(heuristic.Regexp) {
   463  			return true
   464  		}
   465  	}
   466  
   467  	return false
   468  }
   469  
   470  func containsInvalidRegexp(reg string) bool {
   471  	return strings.Contains(reg, `(?<`) || strings.Contains(reg, `\1`)
   472  }
   473  
   474  func returnLanguages(langsHeuristics []*languageHeuristics) []string {
   475  	langs := make([]string, 0)
   476  	for _, langHeu := range langsHeuristics {
   477  		if len(langHeu.Heuristics) == 0 {
   478  			langs = append(langs, `"`+langHeu.Language+`"`)
   479  		}
   480  	}
   481  
   482  	return langs
   483  }