github.com/xzntrc/go-enry/v2@v2.0.0-20230215091818-766cc1d65498/internal/code-generator/generator/heuristics.go (about)

     1  package generator
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"io/ioutil"
     7  	"log"
     8  	"strings"
     9  
    10  	yaml "gopkg.in/yaml.v2"
    11  )
    12  
// Regexp fragments used while converting heuristics rules.
//
// multilinePrefix is the inline flag group that convertToValidRegexp prepends
// to every pattern; in RE2 syntax "(?m)" switches on multi-line mode.
// isUnsupportedRegexpSyntax also relies on it to recognise patterns that are
// still wrapped in Ruby regexp literal slashes.
//
// orPipe is the alternation operator loadRule uses to join several
// alternative patterns into one expression.
const (
	multilinePrefix = "(?m)"
	orPipe          = "|"
)
    17  
    18  // GenHeuristics generates language identification heuristics in Go.
    19  // It is of generator.File type.
    20  func GenHeuristics(fileToParse, _, outPath, tmplPath, tmplName, commit string) error {
    21  	heuristicsYaml, err := parseYaml(fileToParse)
    22  	if err != nil {
    23  		return err
    24  	}
    25  
    26  	langPatterns, err := loadHeuristics(heuristicsYaml)
    27  	if err != nil {
    28  		return err
    29  	}
    30  
    31  	buf := &bytes.Buffer{}
    32  	err = executeTemplate(buf, tmplName, tmplPath, commit, nil, langPatterns)
    33  	if err != nil {
    34  		return err
    35  	}
    36  
    37  	return formatedWrite(outPath, buf.Bytes())
    38  }
    39  
    40  // loadHeuristics transforms parsed YAML to map[".ext"]->IR for code generation.
    41  func loadHeuristics(yaml *Heuristics) (map[string][]*LanguagePattern, error) {
    42  	patterns := make(map[string][]*LanguagePattern)
    43  	for _, disambiguation := range yaml.Disambiguations {
    44  		var rules []*LanguagePattern
    45  		for _, rule := range disambiguation.Rules {
    46  			langPattern := loadRule(yaml.NamedPatterns, rule)
    47  			if langPattern != nil {
    48  				rules = append(rules, langPattern)
    49  			}
    50  		}
    51  		// unroll to a single map
    52  		for _, ext := range disambiguation.Extensions {
    53  			if _, ok := patterns[ext]; ok {
    54  				return nil, fmt.Errorf("cannot add extension '%s', it already exists for %+v", ext, patterns[ext])
    55  			}
    56  			patterns[ext] = rules
    57  		}
    58  
    59  	}
    60  	return patterns, nil
    61  }
    62  
    63  // loadRule transforms single rule from parsed YAML to IR for code generation.
    64  // For OrPattern case, it always combines multiple patterns into a single one.
    65  func loadRule(namedPatterns map[string]StringArray, rule *Rule) *LanguagePattern {
    66  	var result *LanguagePattern
    67  	if len(rule.And) != 0 { // AndPattern
    68  		var subPatterns []*LanguagePattern
    69  		for _, r := range rule.And {
    70  			subp := loadRule(namedPatterns, r)
    71  			subPatterns = append(subPatterns, subp)
    72  		}
    73  		result = &LanguagePattern{"And", rule.Languages, "", subPatterns}
    74  	} else if len(rule.Pattern) != 0 { // OrPattern
    75  		conjunction := strings.Join(rule.Pattern, orPipe)
    76  		pattern := convertToValidRegexp(conjunction)
    77  		result = &LanguagePattern{"Or", rule.Languages, pattern, nil}
    78  	} else if rule.NegativePattern != "" { // NotPattern
    79  		pattern := convertToValidRegexp(rule.NegativePattern)
    80  		result = &LanguagePattern{"Not", rule.Languages, pattern, nil}
    81  	} else if rule.NamedPattern != "" { // Named OrPattern
    82  		conjunction := strings.Join(namedPatterns[rule.NamedPattern], orPipe)
    83  		pattern := convertToValidRegexp(conjunction)
    84  		result = &LanguagePattern{"Or", rule.Languages, pattern, nil}
    85  	} else { // AlwaysPattern
    86  		result = &LanguagePattern{"Always", rule.Languages, "", nil}
    87  	}
    88  
    89  	if isUnsupportedRegexpSyntax(result.Pattern) {
    90  		log.Printf("skipping rule: language:'%q', rule:'%q'\n", rule.Languages, result.Pattern)
    91  		return nil
    92  	}
    93  	return result
    94  }
    95  
// LanguagePattern is an IR of a parsed Rule suitable for code generation.
// Strings are used as this is to be consumed by text/template.
//
// Op is the rule kind set by loadRule: "And", "Or", "Not" or "Always".
// Langs is copied from the rule's Languages.
// Pattern is the converted regexp for "Or" and "Not" and is empty otherwise.
// Rules holds the converted sub-rules of an "And" and is nil otherwise.
//
// loadRule builds values with positional literals, so the field order must
// not be changed.
type LanguagePattern struct {
	Op      string
	Langs   []string
	Pattern string
	Rules   []*LanguagePattern
}
   104  
// Heuristics is the top-level value parseYaml unmarshals the heuristics YAML
// file into.
//
// Disambiguations holds the rule groups that loadHeuristics iterates over. It
// carries no explicit yaml tag, so its key is whatever yaml.v2 derives from
// the field name (presumably "disambiguations" — confirm against the input
// file).
// NamedPatterns maps a pattern name to its alternatives; loadRule looks a name
// up here when a rule sets named_pattern.
type Heuristics struct {
	Disambiguations []*Disambiguation
	NamedPatterns   map[string]StringArray `yaml:"named_patterns"`
}
   109  
// Disambiguation groups the rules that apply to a set of file extensions.
// loadHeuristics converts Rules once and registers the resulting list under
// every entry of Extensions; an extension may appear in only one
// Disambiguation.
type Disambiguation struct {
	Extensions []string `yaml:"extensions,flow"`
	Rules      []*Rule  `yaml:"rules"`
}
   114  
// Rule is a single heuristic as written in the YAML file.
//
// The embedded Patterns fields are inlined, so they are read from the same
// YAML mapping as the rule itself. Languages is a StringArray and therefore
// accepts either a single string or a list. And holds sub-rules; when it is
// non-empty loadRule emits an "And" pattern and does not look at the Patterns
// fields of this rule. And has no explicit yaml tag (presumably the key is
// "and" — confirm against the input file).
type Rule struct {
	Patterns  `yaml:",inline"`
	Languages StringArray `yaml:"language"`
	And       []*Rule
}
   120  
// Patterns holds the alternative ways a Rule can state its regexp.
//
// loadRule consults the fields in this order and uses the first one that is
// set: Pattern (one or more alternatives, joined into a single "Or" regexp),
// NegativePattern (a "Not" regexp), NamedPattern (the name of an entry in
// Heuristics.NamedPatterns, emitted as an "Or" regexp).
type Patterns struct {
	Pattern         StringArray `yaml:"pattern,omitempty"`
	NamedPattern    string      `yaml:"named_pattern,omitempty"`
	NegativePattern string      `yaml:"negative_pattern,omitempty"`
}
   126  
   127  // StringArray is workaround for parsing named_pattern,
   128  // wich is sometimes arry and sometimes not.
   129  // See https://github.com/go-yaml/yaml/issues/100
   130  type StringArray []string
   131  
   132  // UnmarshalYAML allows to parse element always as a []string
   133  func (sa *StringArray) UnmarshalYAML(unmarshal func(interface{}) error) error {
   134  	var multi []string
   135  	if err := unmarshal(&multi); err != nil {
   136  		var single string
   137  		if err := unmarshal(&single); err != nil {
   138  			return err
   139  		}
   140  		*sa = []string{single}
   141  	} else {
   142  		*sa = multi
   143  	}
   144  	return nil
   145  }
   146  
   147  func parseYaml(file string) (*Heuristics, error) {
   148  	data, err := ioutil.ReadFile(file)
   149  	if err != nil {
   150  		return nil, err
   151  	}
   152  
   153  	h := &Heuristics{}
   154  	if err := yaml.Unmarshal(data, &h); err != nil {
   155  		return nil, err
   156  	}
   157  
   158  	return h, nil
   159  }
   160  
   161  // isUnsupportedRegexpSyntax filters regexp syntax that is not supported by RE2.
   162  // In particular, we stumbled up on usage of next cases:
   163  // - lookbehind & lookahead
   164  // - non-backtracking subexpressions
   165  // - named & numbered capturing group/after text matching
   166  // - backreference
   167  // - possessive quantifier
   168  // For reference on supported syntax see https://github.com/google/re2/wiki/Syntax
   169  func isUnsupportedRegexpSyntax(reg string) bool {
   170  	return strings.Contains(reg, `(?<`) || strings.Contains(reg, `(?=`) || strings.Contains(reg, `(?!`) ||
   171  		strings.Contains(reg, `(?>`) || strings.Contains(reg, `\1`) || strings.Contains(reg, `*+`) ||
   172  		// See https://github.com/github/linguist/pull/4243#discussion_r246105067
   173  		(strings.HasPrefix(reg, multilinePrefix+`/`) && strings.HasSuffix(reg, `/`))
   174  }
   175  
   176  // convertToValidRegexp converts Ruby regexp syntax to RE2 equivalent.
   177  // Does not work with Ruby regexp literals.
   178  func convertToValidRegexp(rubyRegexp string) string {
   179  	return multilinePrefix + rubyRegexp
   180  }