github.com/dennwc/enry@v1.6.4-0.20180424151738-42391b8e105b/internal/code-generator/generator/heuristics.go (about)

     1  package generator
     2  
     3  import (
     4  	"bufio"
     5  	"bytes"
     6  	"fmt"
     7  	"io"
     8  	"io/ioutil"
     9  	"regexp"
    10  	"strconv"
    11  	"strings"
    12  	"text/template"
    13  )
    14  
    15  // Heuristics reads from fileToParse and builds source file from tmplPath. It complies with type File signature.
    16  func Heuristics(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit string) error {
    17  	data, err := ioutil.ReadFile(fileToParse)
    18  	if err != nil {
    19  		return err
    20  	}
    21  
    22  	disambiguators, err := getDisambiguators(data)
    23  	if err != nil {
    24  		return err
    25  	}
    26  
    27  	buf := &bytes.Buffer{}
    28  	if err := executeContentTemplate(buf, disambiguators, tmplPath, tmplName, commit); err != nil {
    29  		return err
    30  	}
    31  
    32  	return formatedWrite(outPath, buf.Bytes())
    33  }
    34  
const (
	// unknownLanguage is the placeholder language name "OtherLanguage".
	// NOTE(review): it is not referenced in the visible part of this file.
	unknownLanguage = "OtherLanguage"
	// emptyFile is the regular expression emitted for a Ruby `.empty?` check.
	// convertToValidRegexp returns it untouched, without adding any flags.
	emptyFile       = "^$"
)
    39  
var (
	// disambLine matches a line whose first non-blank token starts with
	// "disambiguate", i.e. the opening line of a disambiguate block.
	disambLine       = regexp.MustCompile(`^(\s*)disambiguate`)
	// definedRegs maps the name of a Ruby regexp variable to the text assigned
	// to it. It is filled by lookForRegexpVariables and read by
	// replaceRegexpVariables.
	definedRegs      = make(map[string]string)
	// illegalCharacter maps characters that buildName must not leave in a
	// generated name to the word that replaces them.
	illegalCharacter = map[string]string{
		"#": "Sharp",
		"+": "Plus",
		"-": "Dash",
	}
)
    49  
// disambiguator groups the language heuristics that apply to one file
// extension taken from a `disambiguate` line.
type disambiguator struct {
	// Extension is the extension as quoted on the disambiguate line (e.g. ".mod").
	Extension string                `json:"extension,omitempty"`
	// Languages lists the candidate languages with their heuristics, in the
	// order they appear inside the block.
	Languages []*languageHeuristics `json:"languages,omitempty"`
}
    54  
    55  func (d *disambiguator) setHeuristicsNames() {
    56  	for _, lang := range d.Languages {
    57  		for i, heuristic := range lang.Heuristics {
    58  			name := buildName(d.Extension, lang.Language, i)
    59  			heuristic.Name = name
    60  		}
    61  	}
    62  }
    63  
    64  func buildName(extension, language string, id int) string {
    65  	extension = strings.TrimPrefix(extension, `.`)
    66  	language = strings.Join(strings.Fields(language), ``)
    67  	name := strings.Join([]string{extension, language, "Matcher", strconv.Itoa(id)}, `_`)
    68  	for k, v := range illegalCharacter {
    69  		if strings.Contains(name, k) {
    70  			name = strings.Replace(name, k, v, -1)
    71  		}
    72  	}
    73  
    74  	return name
    75  }
    76  
// languageHeuristics ties one language name to the heuristics collected from
// the condition that precedes it in a disambiguate block.
type languageHeuristics struct {
	// Language is the name found between double quotes in a `Language["..."]` entry.
	Language       string       `json:"language,omitempty"`
	// Heuristics are the regular expressions extracted from the condition, in order.
	Heuristics     []*heuristic `json:"heuristics,omitempty"`
	// LogicRelations holds the "&&" / "||" tokens gathered by getLogicRelations.
	LogicRelations []string     `json:"logic_relations,omitempty"`
}
    82  
    83  func (l *languageHeuristics) clone() (*languageHeuristics, error) {
    84  	language := l.Language
    85  	logicRels := make([]string, len(l.LogicRelations))
    86  	if copy(logicRels, l.LogicRelations) != len(l.LogicRelations) {
    87  		return nil, fmt.Errorf("error copying logic relations")
    88  	}
    89  
    90  	heuristics := make([]*heuristic, 0, len(l.Heuristics))
    91  	for _, h := range l.Heuristics {
    92  		heuristic := *h
    93  		heuristics = append(heuristics, &heuristic)
    94  	}
    95  
    96  	clone := &languageHeuristics{
    97  		Language:       language,
    98  		Heuristics:     heuristics,
    99  		LogicRelations: logicRels,
   100  	}
   101  
   102  	return clone, nil
   103  }
   104  
// heuristic is a single named regular expression used to tell languages apart.
type heuristic struct {
	// Name is the identifier assigned by disambiguator.setHeuristicsNames.
	Name   string `json:"name,omitempty"`
	// Regexp is the expression text produced by getHeuristics.
	Regexp string `json:"regexp,omitempty"`
}
   109  
   110  // A disambiguate block looks like:
   111  // disambiguate ".mod", ".extension" do |data|
   112  // 	if data.include?('<!ENTITY ') && data.include?('patata')
   113  // 		Language["XML"]
   114  // 	elsif /^\s*MODULE [\w\.]+;/i.match(data) || /^\s*END [\w\.]+;/i.match(data) || data.empty?
   115  // 		Language["Modula-2"]
   116  //	elsif (/^\s*import (scala|java)\./.match(data) || /^\s*val\s+\w+\s*=/.match(data) || /^\s*class\b/.match(data))
   117  //              Language["Scala"]
   118  //      elsif (data.include?("gap> "))
   119  //		Language["GAP"]
   120  // 	else
   121  // 		[Language["Linux Kernel Module"], Language["AMPL"]]
   122  // 	end
   123  // end
   124  func getDisambiguators(heuristics []byte) ([]*disambiguator, error) {
   125  	seenExtensions := map[string]bool{}
   126  	buf := bufio.NewScanner(bytes.NewReader(heuristics))
   127  	disambiguators := make([]*disambiguator, 0, 50)
   128  	for buf.Scan() {
   129  		line := buf.Text()
   130  		if disambLine.MatchString(line) {
   131  			d, err := parseDisambiguators(line, buf, seenExtensions)
   132  			if err != nil {
   133  				return nil, err
   134  			}
   135  
   136  			disambiguators = append(disambiguators, d...)
   137  		}
   138  
   139  		lookForRegexpVariables(line)
   140  	}
   141  
   142  	if err := buf.Err(); err != nil {
   143  		return nil, err
   144  	}
   145  
   146  	return disambiguators, nil
   147  }
   148  
   149  func lookForRegexpVariables(line string) {
   150  	if strings.Contains(line, "ObjectiveCRegex = ") {
   151  		line = strings.TrimSpace(line)
   152  		reg := strings.TrimPrefix(line, "ObjectiveCRegex = ")
   153  		definedRegs["ObjectiveCRegex"] = reg
   154  	}
   155  
   156  	if strings.Contains(line, "fortran_rx = ") {
   157  		line = strings.TrimSpace(line)
   158  		reg := strings.TrimPrefix(line, "fortran_rx = ")
   159  		definedRegs["fortran_rx"] = reg
   160  	}
   161  }
   162  
   163  func parseDisambiguators(line string, buf *bufio.Scanner, seenExtensions map[string]bool) ([]*disambiguator, error) {
   164  	disambList := make([]*disambiguator, 0, 2)
   165  	splitted := strings.Fields(line)
   166  
   167  	for _, v := range splitted {
   168  		if strings.HasPrefix(v, `"`) {
   169  			extension := strings.Trim(v, `",`)
   170  			if _, ok := seenExtensions[extension]; !ok {
   171  				d := &disambiguator{Extension: extension}
   172  				disambList = append(disambList, d)
   173  				seenExtensions[extension] = true
   174  			}
   175  		}
   176  	}
   177  
   178  	langsHeuristics, err := getLanguagesHeuristics(buf)
   179  	if err != nil {
   180  		return nil, err
   181  	}
   182  
   183  	for i, disamb := range disambList {
   184  		lh := langsHeuristics
   185  		if i != 0 {
   186  			lh = cloneLanguagesHeuristics(langsHeuristics)
   187  		}
   188  
   189  		disamb.Languages = lh
   190  		disamb.setHeuristicsNames()
   191  	}
   192  
   193  	return disambList, nil
   194  }
   195  
   196  func cloneLanguagesHeuristics(list []*languageHeuristics) []*languageHeuristics {
   197  	cloneList := make([]*languageHeuristics, 0, len(list))
   198  	for _, langHeu := range list {
   199  		clone, _ := langHeu.clone()
   200  		cloneList = append(cloneList, clone)
   201  	}
   202  
   203  	return cloneList
   204  }
   205  
   206  func getLanguagesHeuristics(buf *bufio.Scanner) ([]*languageHeuristics, error) {
   207  	langsList := make([][]string, 0, 2)
   208  	heuristicsList := make([][]*heuristic, 0, 1)
   209  	logicRelsList := make([][]string, 0, 1)
   210  
   211  	lastWasMatch := false
   212  	for buf.Scan() {
   213  		line := buf.Text()
   214  		if strings.TrimSpace(line) == "end" {
   215  			break
   216  		}
   217  
   218  		if hasRegExp(line) {
   219  			line := cleanRegExpLine(line)
   220  
   221  			logicRels := getLogicRelations(line)
   222  			heuristics := getHeuristics(line)
   223  			if lastWasMatch {
   224  				i := len(heuristicsList) - 1
   225  				heuristicsList[i] = append(heuristicsList[i], heuristics...)
   226  				i = len(logicRelsList) - 1
   227  				logicRelsList[i] = append(logicRelsList[i], logicRels...)
   228  			} else {
   229  				heuristicsList = append(heuristicsList, heuristics)
   230  				logicRelsList = append(logicRelsList, logicRels)
   231  			}
   232  
   233  			lastWasMatch = true
   234  		}
   235  
   236  		if strings.Contains(line, "Language") {
   237  			langs := getLanguages(line)
   238  			langsList = append(langsList, langs)
   239  			lastWasMatch = false
   240  		}
   241  
   242  	}
   243  
   244  	if err := buf.Err(); err != nil {
   245  		return nil, err
   246  	}
   247  
   248  	langsHeuristics := buildLanguagesHeuristics(langsList, heuristicsList, logicRelsList)
   249  	return langsHeuristics, nil
   250  }
   251  
   252  func hasRegExp(line string) bool {
   253  	return strings.Contains(line, ".match") || strings.Contains(line, ".include?") || strings.Contains(line, ".empty?")
   254  }
   255  
   256  func cleanRegExpLine(line string) string {
   257  	if strings.Contains(line, "if ") {
   258  		line = line[strings.Index(line, `if `)+3:]
   259  	}
   260  
   261  	line = strings.TrimSpace(line)
   262  	line = strings.TrimPrefix(line, `(`)
   263  	if strings.Contains(line, "))") {
   264  		line = strings.TrimSuffix(line, `)`)
   265  	}
   266  
   267  	return line
   268  }
   269  
   270  func getLogicRelations(line string) []string {
   271  	rels := make([]string, 0)
   272  	splitted := strings.Split(line, "||")
   273  	for i, v := range splitted {
   274  		if strings.Contains(v, "&&") {
   275  			rels = append(rels, "&&")
   276  		}
   277  
   278  		if i < len(splitted)-1 {
   279  			rels = append(rels, "||")
   280  		}
   281  	}
   282  
   283  	if len(rels) == 0 {
   284  		rels = nil
   285  	}
   286  
   287  	return rels
   288  }
   289  
   290  func getHeuristics(line string) []*heuristic {
   291  	splitted := splitByLogicOps(line)
   292  	heuristics := make([]*heuristic, 0, len(splitted))
   293  	for _, v := range splitted {
   294  		v = strings.TrimSpace(v)
   295  		var reg string
   296  
   297  		if strings.Contains(v, ".match") {
   298  			reg = v[:strings.Index(v, ".match")]
   299  			reg = replaceRegexpVariables(reg)
   300  		}
   301  
   302  		if strings.Contains(v, ".include?") {
   303  			reg = includeToRegExp(v)
   304  		}
   305  
   306  		if strings.Contains(v, ".empty?") {
   307  			reg = emptyFile
   308  		}
   309  
   310  		if reg != "" {
   311  			reg = convertToValidRegexp(reg)
   312  			heuristics = append(heuristics, &heuristic{Regexp: reg})
   313  		}
   314  	}
   315  
   316  	return heuristics
   317  }
   318  
   319  func splitByLogicOps(line string) []string {
   320  	splitted := make([]string, 0, 1)
   321  	splitOr := strings.Split(line, "||")
   322  	for _, v := range splitOr {
   323  		splitAnd := strings.Split(v, "&&")
   324  		splitted = append(splitted, splitAnd...)
   325  	}
   326  
   327  	return splitted
   328  }
   329  
   330  func replaceRegexpVariables(reg string) string {
   331  	repl := reg
   332  	if v, ok := definedRegs[reg]; ok {
   333  		repl = v
   334  	}
   335  
   336  	return repl
   337  }
   338  
   339  func convertToValidRegexp(reg string) string {
   340  	// example: `/^(\s*)(<Project|<Import|<Property|<?xml|xmlns)/i``
   341  	// Ruby modifier "m" matches multiple lines, recognizing newlines as normal characters, Go use flag "s" for that.
   342  	const (
   343  		caseSensitive = "i"
   344  		matchEOL      = "s"
   345  
   346  		rubyCaseSensitive = "i"
   347  		rubyMultiLine     = "m"
   348  	)
   349  
   350  	if reg == emptyFile {
   351  		return reg
   352  	}
   353  
   354  	reg = strings.TrimPrefix(reg, `/`)
   355  	flags := "(?m"
   356  	lastSlash := strings.LastIndex(reg, `/`)
   357  	if lastSlash == -1 {
   358  		return flags + ")" + reg
   359  	}
   360  
   361  	specialChars := reg[lastSlash:]
   362  	reg = reg[:lastSlash]
   363  	if lastSlash == len(reg)-1 {
   364  		return flags + ")" + reg
   365  	}
   366  
   367  	if strings.Contains(specialChars, rubyCaseSensitive) {
   368  		flags = flags + caseSensitive
   369  	}
   370  
   371  	if strings.Contains(specialChars, rubyMultiLine) {
   372  		flags = flags + matchEOL
   373  	}
   374  
   375  	return flags + ")" + reg
   376  }
   377  
   378  func includeToRegExp(include string) string {
   379  	content := include[strings.Index(include, `(`)+1 : strings.Index(include, `)`)]
   380  	content = strings.Trim(content, `"'`)
   381  	return regexp.QuoteMeta(content)
   382  }
   383  
   384  func getLanguages(line string) []string {
   385  	languages := make([]string, 0)
   386  	splitted := strings.Split(line, `,`)
   387  	for _, lang := range splitted {
   388  		lang = trimLanguage(lang)
   389  		languages = append(languages, lang)
   390  	}
   391  
   392  	return languages
   393  }
   394  
   395  func trimLanguage(enclosedLang string) string {
   396  	lang := strings.TrimSpace(enclosedLang)
   397  	lang = lang[strings.Index(lang, `"`)+1:]
   398  	lang = lang[:strings.Index(lang, `"`)]
   399  	return lang
   400  }
   401  
   402  func buildLanguagesHeuristics(langsList [][]string, heuristicsList [][]*heuristic, logicRelsList [][]string) []*languageHeuristics {
   403  	langsHeuristics := make([]*languageHeuristics, 0, len(langsList))
   404  	for i, langSlice := range langsList {
   405  		var heuristics []*heuristic
   406  		if i < len(heuristicsList) {
   407  			heuristics = heuristicsList[i]
   408  		}
   409  
   410  		var rels []string
   411  		if i < len(logicRelsList) {
   412  			rels = logicRelsList[i]
   413  		}
   414  
   415  		for _, lang := range langSlice {
   416  			lh := &languageHeuristics{
   417  				Language:       lang,
   418  				Heuristics:     heuristics,
   419  				LogicRelations: rels,
   420  			}
   421  
   422  			langsHeuristics = append(langsHeuristics, lh)
   423  		}
   424  	}
   425  
   426  	return langsHeuristics
   427  }
   428  
   429  func executeContentTemplate(out io.Writer, disambiguators []*disambiguator, tmplPath, tmplName, commit string) error {
   430  	fmap := template.FuncMap{
   431  		"getCommit":        func() string { return commit },
   432  		"getAllHeuristics": getAllHeuristics,
   433  		"returnStringSlice": func(slice []string) string {
   434  			if len(slice) == 0 {
   435  				return "nil"
   436  			}
   437  
   438  			return `[]string{` + strings.Join(slice, `, `) + `}`
   439  		},
   440  		"returnLanguages": returnLanguages,
   441  		"avoidLanguage":   avoidLanguage,
   442  	}
   443  
   444  	t := template.Must(template.New(tmplName).Funcs(fmap).ParseFiles(tmplPath))
   445  	if err := t.Execute(out, disambiguators); err != nil {
   446  		return err
   447  	}
   448  
   449  	return nil
   450  }
   451  
   452  func getAllHeuristics(disambiguators []*disambiguator) []*heuristic {
   453  	heuristics := make([]*heuristic, 0)
   454  	for _, disamb := range disambiguators {
   455  		for _, lang := range disamb.Languages {
   456  			if !avoidLanguage(lang) {
   457  				heuristics = append(heuristics, lang.Heuristics...)
   458  			}
   459  		}
   460  	}
   461  
   462  	return heuristics
   463  }
   464  
   465  func avoidLanguage(lang *languageHeuristics) bool {
   466  	// necessary to avoid corner cases
   467  	for _, heuristic := range lang.Heuristics {
   468  		if containsInvalidRegexp(heuristic.Regexp) {
   469  			return true
   470  		}
   471  	}
   472  
   473  	return false
   474  }
   475  
   476  func containsInvalidRegexp(reg string) bool {
   477  	return strings.Contains(reg, `(?<`) || strings.Contains(reg, `\1`)
   478  }
   479  
   480  func returnLanguages(langsHeuristics []*languageHeuristics) []string {
   481  	langs := make([]string, 0)
   482  	for _, langHeu := range langsHeuristics {
   483  		if len(langHeu.Heuristics) == 0 {
   484  			langs = append(langs, `"`+langHeu.Language+`"`)
   485  		}
   486  	}
   487  
   488  	return langs
   489  }