github.com/rohankumardubey/go-enry@v1.7.3/internal/code-generator/generator/heuristics.go (about) 1 package generator 2 3 import ( 4 "bytes" 5 "fmt" 6 "io/ioutil" 7 "log" 8 "strings" 9 10 yaml "gopkg.in/yaml.v2" 11 ) 12 13 const ( 14 multilinePrefix = "(?m)" 15 orPipe = "|" 16 ) 17 18 // GenHeuristics generates language identification heuristics in Go. 19 // It is of generator.File type. 20 func GenHeuristics(fileToParse, _, outPath, tmplPath, tmplName, commit string) error { 21 heuristicsYaml, err := parseYaml(fileToParse) 22 if err != nil { 23 return err 24 } 25 26 langPatterns, err := loadHeuristics(heuristicsYaml) 27 if err != nil { 28 return err 29 } 30 31 buf := &bytes.Buffer{} 32 err = executeTemplate(buf, tmplName, tmplPath, commit, nil, langPatterns) 33 if err != nil { 34 return err 35 } 36 37 return formatedWrite(outPath, buf.Bytes()) 38 } 39 40 // loadHeuristics transforms parsed YAML to map[".ext"]->IR for code generation. 41 func loadHeuristics(yaml *Heuristics) (map[string][]*LanguagePattern, error) { 42 var patterns = make(map[string][]*LanguagePattern) 43 for _, disambiguation := range yaml.Disambiguations { 44 var rules []*LanguagePattern 45 for _, rule := range disambiguation.Rules { 46 langPattern := loadRule(yaml.NamedPatterns, rule) 47 if langPattern != nil { 48 rules = append(rules, langPattern) 49 } 50 } 51 // unroll to a single map 52 for _, ext := range disambiguation.Extensions { 53 if _, ok := patterns[ext]; ok { 54 return nil, fmt.Errorf("cannot add extension '%s', it already exists for %+v", ext, patterns[ext]) 55 } 56 patterns[ext] = rules 57 } 58 59 } 60 return patterns, nil 61 } 62 63 // loadRule transforms single rule from parsed YAML to IR for code generation. 64 // For OrPattern case, it always combines multiple patterns into a single one. 65 func loadRule(namedPatterns map[string]StringArray, rule *Rule) *LanguagePattern { 66 var result *LanguagePattern 67 if len(rule.And) != 0 { // AndPattern 68 var subPatterns []*LanguagePattern 69 for _, r := range rule.And { 70 subp := loadRule(namedPatterns, r) 71 subPatterns = append(subPatterns, subp) 72 } 73 result = &LanguagePattern{"And", rule.Languages, "", subPatterns} 74 } else if len(rule.Pattern) != 0 { // OrPattern 75 conjunction := strings.Join(rule.Pattern, orPipe) 76 pattern := convertToValidRegexp(conjunction) 77 result = &LanguagePattern{"Or", rule.Languages, pattern, nil} 78 } else if rule.NegativePattern != "" { // NotPattern 79 pattern := convertToValidRegexp(rule.NegativePattern) 80 result = &LanguagePattern{"Not", rule.Languages, pattern, nil} 81 } else if rule.NamedPattern != "" { // Named OrPattern 82 conjunction := strings.Join(namedPatterns[rule.NamedPattern], orPipe) 83 pattern := convertToValidRegexp(conjunction) 84 result = &LanguagePattern{"Or", rule.Languages, pattern, nil} 85 } else { // AlwaysPattern 86 result = &LanguagePattern{"Always", rule.Languages, "", nil} 87 } 88 89 if isUnsupportedRegexpSyntax(result.Pattern) { 90 log.Printf("skipping rule: language:'%q', rule:'%q'\n", rule.Languages, result.Pattern) 91 return nil 92 } 93 return result 94 } 95 96 // LanguagePattern is an IR of parsed Rule suitable for code generations. 97 // Strings are used as this is to be be consumed by text/template. 98 type LanguagePattern struct { 99 Op string 100 Langs []string 101 Pattern string 102 Rules []*LanguagePattern 103 } 104 105 type Heuristics struct { 106 Disambiguations []*Disambiguation 107 NamedPatterns map[string]StringArray `yaml:"named_patterns"` 108 } 109 110 type Disambiguation struct { 111 Extensions []string `yaml:"extensions,flow"` 112 Rules []*Rule `yaml:"rules"` 113 } 114 115 type Rule struct { 116 Patterns `yaml:",inline"` 117 Languages StringArray `yaml:"language"` 118 And []*Rule 119 } 120 121 type Patterns struct { 122 Pattern StringArray `yaml:"pattern,omitempty"` 123 NamedPattern string `yaml:"named_pattern,omitempty"` 124 NegativePattern string `yaml:"negative_pattern,omitempty"` 125 } 126 127 // StringArray is workaround for parsing named_pattern, 128 // wich is sometimes arry and sometimes not. 129 // See https://github.com/go-yaml/yaml/issues/100 130 type StringArray []string 131 132 // UnmarshalYAML allowes to parse element always as a []string 133 func (sa *StringArray) UnmarshalYAML(unmarshal func(interface{}) error) error { 134 var multi []string 135 if err := unmarshal(&multi); err != nil { 136 var single string 137 if err := unmarshal(&single); err != nil { 138 return err 139 } 140 *sa = []string{single} 141 } else { 142 *sa = multi 143 } 144 return nil 145 } 146 147 func parseYaml(file string) (*Heuristics, error) { 148 data, err := ioutil.ReadFile(file) 149 if err != nil { 150 return nil, err 151 } 152 153 h := &Heuristics{} 154 if err := yaml.Unmarshal(data, &h); err != nil { 155 return nil, err 156 } 157 158 return h, nil 159 } 160 161 // isUnsupportedRegexpSyntax filters regexp syntax that is not supported by RE2. 162 // In particular, we stumbled up on usage of next cases: 163 // - named & numbered capturing group/after text matching 164 // - backreference 165 // For referece on supported syntax see https://github.com/google/re2/wiki/Syntax 166 func isUnsupportedRegexpSyntax(reg string) bool { 167 return strings.Contains(reg, `(?<`) || strings.Contains(reg, `\1`) || 168 // See https://github.com/github/linguist/pull/4243#discussion_r246105067 169 (strings.HasPrefix(reg, multilinePrefix+`/`) && strings.HasSuffix(reg, `/`)) 170 } 171 172 // convertToValidRegexp converts Ruby regexp syntaxt to RE2 equivalent. 173 // Does not work with Ruby regexp literals. 174 func convertToValidRegexp(rubyRegexp string) string { 175 return multilinePrefix + rubyRegexp 176 }