github.com/zkry/enry@v1.6.3/internal/code-generator/generator/heuristics.go (about) 1 package generator 2 3 import ( 4 "bufio" 5 "bytes" 6 "fmt" 7 "io" 8 "io/ioutil" 9 "regexp" 10 "strconv" 11 "strings" 12 "text/template" 13 ) 14 15 // Heuristics reads from fileToParse and builds source file from tmplPath. It complies with type File signature. 16 func Heuristics(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit string) error { 17 data, err := ioutil.ReadFile(fileToParse) 18 if err != nil { 19 return err 20 } 21 22 disambiguators, err := getDisambiguators(data) 23 if err != nil { 24 return err 25 } 26 27 buf := &bytes.Buffer{} 28 if err := executeContentTemplate(buf, disambiguators, tmplPath, tmplName, commit); err != nil { 29 return err 30 } 31 32 return formatedWrite(outPath, buf.Bytes()) 33 } 34 35 const ( 36 unknownLanguage = "OtherLanguage" 37 emptyFile = "^$" 38 ) 39 40 var ( 41 disambLine = regexp.MustCompile(`^(\s*)disambiguate`) 42 definedRegs = make(map[string]string) 43 illegalCharacter = map[string]string{ 44 "#": "Sharp", 45 "+": "Plus", 46 "-": "Dash", 47 } 48 ) 49 50 type disambiguator struct { 51 Extension string `json:"extension,omitempty"` 52 Languages []*languageHeuristics `json:"languages,omitempty"` 53 } 54 55 func (d *disambiguator) setHeuristicsNames() { 56 for _, lang := range d.Languages { 57 for i, heuristic := range lang.Heuristics { 58 name := buildName(d.Extension, lang.Language, i) 59 heuristic.Name = name 60 } 61 } 62 } 63 64 func buildName(extension, language string, id int) string { 65 extension = strings.TrimPrefix(extension, `.`) 66 language = strings.Join(strings.Fields(language), ``) 67 name := strings.Join([]string{extension, language, "Matcher", strconv.Itoa(id)}, `_`) 68 for k, v := range illegalCharacter { 69 if strings.Contains(name, k) { 70 name = strings.Replace(name, k, v, -1) 71 } 72 } 73 74 return name 75 } 76 77 type languageHeuristics struct { 78 Language string `json:"language,omitempty"` 79 Heuristics []*heuristic `json:"heuristics,omitempty"` 80 LogicRelations []string `json:"logic_relations,omitempty"` 81 } 82 83 func (l *languageHeuristics) clone() (*languageHeuristics, error) { 84 language := l.Language 85 logicRels := make([]string, len(l.LogicRelations)) 86 if copy(logicRels, l.LogicRelations) != len(l.LogicRelations) { 87 return nil, fmt.Errorf("error copying logic relations") 88 } 89 90 heuristics := make([]*heuristic, 0, len(l.Heuristics)) 91 for _, h := range l.Heuristics { 92 heuristic := *h 93 heuristics = append(heuristics, &heuristic) 94 } 95 96 clone := &languageHeuristics{ 97 Language: language, 98 Heuristics: heuristics, 99 LogicRelations: logicRels, 100 } 101 102 return clone, nil 103 } 104 105 type heuristic struct { 106 Name string `json:"name,omitempty"` 107 Regexp string `json:"regexp,omitempty"` 108 } 109 110 // A disambiguate block looks like: 111 // disambiguate ".mod", ".extension" do |data| 112 // if data.include?('<!ENTITY ') && data.include?('patata') 113 // Language["XML"] 114 // elsif /^\s*MODULE [\w\.]+;/i.match(data) || /^\s*END [\w\.]+;/i.match(data) || data.empty? 115 // Language["Modula-2"] 116 // elsif (/^\s*import (scala|java)\./.match(data) || /^\s*val\s+\w+\s*=/.match(data) || /^\s*class\b/.match(data)) 117 // Language["Scala"] 118 // elsif (data.include?("gap> ")) 119 // Language["GAP"] 120 // else 121 // [Language["Linux Kernel Module"], Language["AMPL"]] 122 // end 123 // end 124 func getDisambiguators(heuristics []byte) ([]*disambiguator, error) { 125 seenExtensions := map[string]bool{} 126 buf := bufio.NewScanner(bytes.NewReader(heuristics)) 127 disambiguators := make([]*disambiguator, 0, 50) 128 for buf.Scan() { 129 line := buf.Text() 130 if disambLine.MatchString(line) { 131 d, err := parseDisambiguators(line, buf, seenExtensions) 132 if err != nil { 133 return nil, err 134 } 135 136 disambiguators = append(disambiguators, d...) 137 } 138 139 lookForRegexpVariables(line) 140 } 141 142 if err := buf.Err(); err != nil { 143 return nil, err 144 } 145 146 return disambiguators, nil 147 } 148 149 func lookForRegexpVariables(line string) { 150 if strings.Contains(line, "ObjectiveCRegex = ") { 151 line = strings.TrimSpace(line) 152 reg := strings.TrimPrefix(line, "ObjectiveCRegex = ") 153 definedRegs["ObjectiveCRegex"] = reg 154 } 155 156 if strings.Contains(line, "fortran_rx = ") { 157 line = strings.TrimSpace(line) 158 reg := strings.TrimPrefix(line, "fortran_rx = ") 159 definedRegs["fortran_rx"] = reg 160 } 161 } 162 163 func parseDisambiguators(line string, buf *bufio.Scanner, seenExtensions map[string]bool) ([]*disambiguator, error) { 164 disambList := make([]*disambiguator, 0, 2) 165 splitted := strings.Fields(line) 166 167 for _, v := range splitted { 168 if strings.HasPrefix(v, `"`) { 169 extension := strings.Trim(v, `",`) 170 if _, ok := seenExtensions[extension]; !ok { 171 d := &disambiguator{Extension: extension} 172 disambList = append(disambList, d) 173 seenExtensions[extension] = true 174 } 175 } 176 } 177 178 langsHeuristics, err := getLanguagesHeuristics(buf) 179 if err != nil { 180 return nil, err 181 } 182 183 for i, disamb := range disambList { 184 lh := langsHeuristics 185 if i != 0 { 186 lh = cloneLanguagesHeuristics(langsHeuristics) 187 } 188 189 disamb.Languages = lh 190 disamb.setHeuristicsNames() 191 } 192 193 return disambList, nil 194 } 195 196 func cloneLanguagesHeuristics(list []*languageHeuristics) []*languageHeuristics { 197 cloneList := make([]*languageHeuristics, 0, len(list)) 198 for _, langHeu := range list { 199 clone, _ := langHeu.clone() 200 cloneList = append(cloneList, clone) 201 } 202 203 return cloneList 204 } 205 206 func getLanguagesHeuristics(buf *bufio.Scanner) ([]*languageHeuristics, error) { 207 langsList := make([][]string, 0, 2) 208 heuristicsList := make([][]*heuristic, 0, 1) 209 logicRelsList := make([][]string, 0, 1) 210 211 lastWasMatch := false 212 for buf.Scan() { 213 line := buf.Text() 214 if strings.TrimSpace(line) == "end" { 215 break 216 } 217 218 if hasRegExp(line) { 219 line := cleanRegExpLine(line) 220 221 logicRels := getLogicRelations(line) 222 heuristics := getHeuristics(line) 223 if lastWasMatch { 224 i := len(heuristicsList) - 1 225 heuristicsList[i] = append(heuristicsList[i], heuristics...) 226 i = len(logicRelsList) - 1 227 logicRelsList[i] = append(logicRelsList[i], logicRels...) 228 } else { 229 heuristicsList = append(heuristicsList, heuristics) 230 logicRelsList = append(logicRelsList, logicRels) 231 } 232 233 lastWasMatch = true 234 } 235 236 if strings.Contains(line, "Language") { 237 langs := getLanguages(line) 238 langsList = append(langsList, langs) 239 lastWasMatch = false 240 } 241 242 } 243 244 if err := buf.Err(); err != nil { 245 return nil, err 246 } 247 248 langsHeuristics := buildLanguagesHeuristics(langsList, heuristicsList, logicRelsList) 249 return langsHeuristics, nil 250 } 251 252 func hasRegExp(line string) bool { 253 return strings.Contains(line, ".match") || strings.Contains(line, ".include?") || strings.Contains(line, ".empty?") 254 } 255 256 func cleanRegExpLine(line string) string { 257 if strings.Contains(line, "if ") { 258 line = line[strings.Index(line, `if `)+3:] 259 } 260 261 line = strings.TrimSpace(line) 262 line = strings.TrimPrefix(line, `(`) 263 if strings.Contains(line, "))") { 264 line = strings.TrimSuffix(line, `)`) 265 } 266 267 return line 268 } 269 270 func getLogicRelations(line string) []string { 271 rels := make([]string, 0) 272 splitted := strings.Split(line, "||") 273 for i, v := range splitted { 274 if strings.Contains(v, "&&") { 275 rels = append(rels, "&&") 276 } 277 278 if i < len(splitted)-1 { 279 rels = append(rels, "||") 280 } 281 } 282 283 if len(rels) == 0 { 284 rels = nil 285 } 286 287 return rels 288 } 289 290 func getHeuristics(line string) []*heuristic { 291 splitted := splitByLogicOps(line) 292 heuristics := make([]*heuristic, 0, len(splitted)) 293 for _, v := range splitted { 294 v = strings.TrimSpace(v) 295 var reg string 296 297 if strings.Contains(v, ".match") { 298 reg = v[:strings.Index(v, ".match")] 299 reg = replaceRegexpVariables(reg) 300 } 301 302 if strings.Contains(v, ".include?") { 303 reg = includeToRegExp(v) 304 } 305 306 if strings.Contains(v, ".empty?") { 307 reg = emptyFile 308 } 309 310 if reg != "" { 311 reg = convertToValidRegexp(reg) 312 heuristics = append(heuristics, &heuristic{Regexp: reg}) 313 } 314 } 315 316 return heuristics 317 } 318 319 func splitByLogicOps(line string) []string { 320 splitted := make([]string, 0, 1) 321 splitOr := strings.Split(line, "||") 322 for _, v := range splitOr { 323 splitAnd := strings.Split(v, "&&") 324 splitted = append(splitted, splitAnd...) 325 } 326 327 return splitted 328 } 329 330 func replaceRegexpVariables(reg string) string { 331 repl := reg 332 if v, ok := definedRegs[reg]; ok { 333 repl = v 334 } 335 336 return repl 337 } 338 339 func convertToValidRegexp(reg string) string { 340 // example: `/^(\s*)(<Project|<Import|<Property|<?xml|xmlns)/i`` 341 // Ruby modifier "m" matches multiple lines, recognizing newlines as normal characters, Go use flag "s" for that. 342 const ( 343 caseSensitive = "i" 344 matchEOL = "s" 345 346 rubyCaseSensitive = "i" 347 rubyMultiLine = "m" 348 ) 349 350 if reg == emptyFile { 351 return reg 352 } 353 354 reg = strings.TrimPrefix(reg, `/`) 355 flags := "(?m" 356 lastSlash := strings.LastIndex(reg, `/`) 357 if lastSlash == -1 { 358 return flags + ")" + reg 359 } 360 361 specialChars := reg[lastSlash:] 362 reg = reg[:lastSlash] 363 if lastSlash == len(reg)-1 { 364 return flags + ")" + reg 365 } 366 367 if strings.Contains(specialChars, rubyCaseSensitive) { 368 flags = flags + caseSensitive 369 } 370 371 if strings.Contains(specialChars, rubyMultiLine) { 372 flags = flags + matchEOL 373 } 374 375 return flags + ")" + reg 376 } 377 378 func includeToRegExp(include string) string { 379 content := include[strings.Index(include, `(`)+1 : strings.Index(include, `)`)] 380 content = strings.Trim(content, `"'`) 381 return regexp.QuoteMeta(content) 382 } 383 384 func getLanguages(line string) []string { 385 languages := make([]string, 0) 386 splitted := strings.Split(line, `,`) 387 for _, lang := range splitted { 388 lang = trimLanguage(lang) 389 languages = append(languages, lang) 390 } 391 392 return languages 393 } 394 395 func trimLanguage(enclosedLang string) string { 396 lang := strings.TrimSpace(enclosedLang) 397 lang = lang[strings.Index(lang, `"`)+1:] 398 lang = lang[:strings.Index(lang, `"`)] 399 return lang 400 } 401 402 func buildLanguagesHeuristics(langsList [][]string, heuristicsList [][]*heuristic, logicRelsList [][]string) []*languageHeuristics { 403 langsHeuristics := make([]*languageHeuristics, 0, len(langsList)) 404 for i, langSlice := range langsList { 405 var heuristics []*heuristic 406 if i < len(heuristicsList) { 407 heuristics = heuristicsList[i] 408 } 409 410 var rels []string 411 if i < len(logicRelsList) { 412 rels = logicRelsList[i] 413 } 414 415 for _, lang := range langSlice { 416 lh := &languageHeuristics{ 417 Language: lang, 418 Heuristics: heuristics, 419 LogicRelations: rels, 420 } 421 422 langsHeuristics = append(langsHeuristics, lh) 423 } 424 } 425 426 return langsHeuristics 427 } 428 429 func executeContentTemplate(out io.Writer, disambiguators []*disambiguator, tmplPath, tmplName, commit string) error { 430 fmap := template.FuncMap{ 431 "getCommit": func() string { return commit }, 432 "getAllHeuristics": getAllHeuristics, 433 "returnStringSlice": func(slice []string) string { 434 if len(slice) == 0 { 435 return "nil" 436 } 437 438 return `[]string{` + strings.Join(slice, `, `) + `}` 439 }, 440 "returnLanguages": returnLanguages, 441 "avoidLanguage": avoidLanguage, 442 } 443 444 t := template.Must(template.New(tmplName).Funcs(fmap).ParseFiles(tmplPath)) 445 if err := t.Execute(out, disambiguators); err != nil { 446 return err 447 } 448 449 return nil 450 } 451 452 func getAllHeuristics(disambiguators []*disambiguator) []*heuristic { 453 heuristics := make([]*heuristic, 0) 454 for _, disamb := range disambiguators { 455 for _, lang := range disamb.Languages { 456 if !avoidLanguage(lang) { 457 heuristics = append(heuristics, lang.Heuristics...) 458 } 459 } 460 } 461 462 return heuristics 463 } 464 465 func avoidLanguage(lang *languageHeuristics) bool { 466 // necessary to avoid corner cases 467 for _, heuristic := range lang.Heuristics { 468 if containsInvalidRegexp(heuristic.Regexp) { 469 return true 470 } 471 } 472 473 return false 474 } 475 476 func containsInvalidRegexp(reg string) bool { 477 return strings.Contains(reg, `(?<`) || strings.Contains(reg, `\1`) 478 } 479 480 func returnLanguages(langsHeuristics []*languageHeuristics) []string { 481 langs := make([]string, 0) 482 for _, langHeu := range langsHeuristics { 483 if len(langHeu.Heuristics) == 0 { 484 langs = append(langs, `"`+langHeu.Language+`"`) 485 } 486 } 487 488 return langs 489 }