github.com/bzz/enry@v1.6.7/internal/code-generator/generator/heuristics.go (about) 1 package generator 2 3 import ( 4 "bufio" 5 "bytes" 6 "fmt" 7 "io" 8 "io/ioutil" 9 "strconv" 10 "strings" 11 "text/template" 12 13 "gopkg.in/src-d/enry.v1/regex" 14 ) 15 16 // Heuristics reads from fileToParse and builds source file from tmplPath. It complies with type File signature. 17 func Heuristics(fileToParse, samplesDir, outPath, tmplPath, tmplName, commit string) error { 18 data, err := ioutil.ReadFile(fileToParse) 19 if err != nil { 20 return err 21 } 22 23 disambiguators, err := getDisambiguators(data) 24 if err != nil { 25 return err 26 } 27 28 buf := &bytes.Buffer{} 29 if err := executeContentTemplate(buf, disambiguators, tmplPath, tmplName, commit); err != nil { 30 return err 31 } 32 33 return formatedWrite(outPath, buf.Bytes()) 34 } 35 36 const ( 37 unknownLanguage = "OtherLanguage" 38 emptyFile = "^$" 39 ) 40 41 var ( 42 disambLine = regex.MustCompile(`^(\s*)disambiguate`) 43 definedRegs = make(map[string]string) 44 illegalCharacter = map[string]string{ 45 "#": "Sharp", 46 "+": "Plus", 47 "-": "Dash", 48 } 49 ) 50 51 type disambiguator struct { 52 Extension string `json:"extension,omitempty"` 53 Languages []*languageHeuristics `json:"languages,omitempty"` 54 } 55 56 func (d *disambiguator) setHeuristicsNames() { 57 for _, lang := range d.Languages { 58 for i, heuristic := range lang.Heuristics { 59 name := buildName(d.Extension, lang.Language, i) 60 heuristic.Name = name 61 } 62 } 63 } 64 65 func buildName(extension, language string, id int) string { 66 extension = strings.TrimPrefix(extension, `.`) 67 language = strings.Join(strings.Fields(language), ``) 68 name := strings.Join([]string{extension, language, "Matcher", strconv.Itoa(id)}, `_`) 69 for k, v := range illegalCharacter { 70 if strings.Contains(name, k) { 71 name = strings.Replace(name, k, v, -1) 72 } 73 } 74 75 return name 76 } 77 78 type languageHeuristics struct { 79 Language string `json:"language,omitempty"` 80 Heuristics []*heuristic `json:"heuristics,omitempty"` 81 LogicRelations []string `json:"logic_relations,omitempty"` 82 } 83 84 func (l *languageHeuristics) clone() (*languageHeuristics, error) { 85 language := l.Language 86 logicRels := make([]string, len(l.LogicRelations)) 87 if copy(logicRels, l.LogicRelations) != len(l.LogicRelations) { 88 return nil, fmt.Errorf("error copying logic relations") 89 } 90 91 heuristics := make([]*heuristic, 0, len(l.Heuristics)) 92 for _, h := range l.Heuristics { 93 heuristic := *h 94 heuristics = append(heuristics, &heuristic) 95 } 96 97 clone := &languageHeuristics{ 98 Language: language, 99 Heuristics: heuristics, 100 LogicRelations: logicRels, 101 } 102 103 return clone, nil 104 } 105 106 type heuristic struct { 107 Name string `json:"name,omitempty"` 108 Regexp string `json:"regexp,omitempty"` 109 } 110 111 // A disambiguate block looks like: 112 // disambiguate ".mod", ".extension" do |data| 113 // if data.include?('<!ENTITY ') && data.include?('patata') 114 // Language["XML"] 115 // elsif /^\s*MODULE [\w\.]+;/i.match(data) || /^\s*END [\w\.]+;/i.match(data) || data.empty? 116 // Language["Modula-2"] 117 // elsif (/^\s*import (scala|java)\./.match(data) || /^\s*val\s+\w+\s*=/.match(data) || /^\s*class\b/.match(data)) 118 // Language["Scala"] 119 // elsif (data.include?("gap> ")) 120 // Language["GAP"] 121 // else 122 // [Language["Linux Kernel Module"], Language["AMPL"]] 123 // end 124 // end 125 func getDisambiguators(heuristics []byte) ([]*disambiguator, error) { 126 seenExtensions := map[string]bool{} 127 buf := bufio.NewScanner(bytes.NewReader(heuristics)) 128 disambiguators := make([]*disambiguator, 0, 50) 129 for buf.Scan() { 130 line := buf.Text() 131 if disambLine.MatchString(line) { 132 d, err := parseDisambiguators(line, buf, seenExtensions) 133 if err != nil { 134 return nil, err 135 } 136 137 disambiguators = append(disambiguators, d...) 138 } 139 140 lookForRegexpVariables(line) 141 } 142 143 if err := buf.Err(); err != nil { 144 return nil, err 145 } 146 147 return disambiguators, nil 148 } 149 150 func lookForRegexpVariables(line string) { 151 if strings.Contains(line, "ObjectiveCRegex = ") { 152 line = strings.TrimSpace(line) 153 reg := strings.TrimPrefix(line, "ObjectiveCRegex = ") 154 definedRegs["ObjectiveCRegex"] = reg 155 } 156 157 if strings.Contains(line, "fortran_rx = ") { 158 line = strings.TrimSpace(line) 159 reg := strings.TrimPrefix(line, "fortran_rx = ") 160 definedRegs["fortran_rx"] = reg 161 } 162 } 163 164 func parseDisambiguators(line string, buf *bufio.Scanner, seenExtensions map[string]bool) ([]*disambiguator, error) { 165 disambList := make([]*disambiguator, 0, 2) 166 splitted := strings.Fields(line) 167 168 for _, v := range splitted { 169 if strings.HasPrefix(v, `"`) { 170 extension := strings.Trim(v, `",`) 171 if _, ok := seenExtensions[extension]; !ok { 172 d := &disambiguator{Extension: extension} 173 disambList = append(disambList, d) 174 seenExtensions[extension] = true 175 } 176 } 177 } 178 179 langsHeuristics, err := getLanguagesHeuristics(buf) 180 if err != nil { 181 return nil, err 182 } 183 184 for i, disamb := range disambList { 185 lh := langsHeuristics 186 if i != 0 { 187 lh = cloneLanguagesHeuristics(langsHeuristics) 188 } 189 190 disamb.Languages = lh 191 disamb.setHeuristicsNames() 192 } 193 194 return disambList, nil 195 } 196 197 func cloneLanguagesHeuristics(list []*languageHeuristics) []*languageHeuristics { 198 cloneList := make([]*languageHeuristics, 0, len(list)) 199 for _, langHeu := range list { 200 clone, _ := langHeu.clone() 201 cloneList = append(cloneList, clone) 202 } 203 204 return cloneList 205 } 206 207 func getLanguagesHeuristics(buf *bufio.Scanner) ([]*languageHeuristics, error) { 208 langsList := make([][]string, 0, 2) 209 heuristicsList := make([][]*heuristic, 0, 1) 210 logicRelsList := make([][]string, 0, 1) 211 212 lastWasMatch := false 213 for buf.Scan() { 214 line := buf.Text() 215 if strings.TrimSpace(line) == "end" { 216 break 217 } 218 219 if hasRegExp(line) { 220 line := cleanRegExpLine(line) 221 222 logicRels := getLogicRelations(line) 223 heuristics := getHeuristics(line) 224 if lastWasMatch { 225 i := len(heuristicsList) - 1 226 heuristicsList[i] = append(heuristicsList[i], heuristics...) 227 i = len(logicRelsList) - 1 228 logicRelsList[i] = append(logicRelsList[i], logicRels...) 229 } else { 230 heuristicsList = append(heuristicsList, heuristics) 231 logicRelsList = append(logicRelsList, logicRels) 232 } 233 234 lastWasMatch = true 235 } 236 237 if strings.Contains(line, "Language") { 238 langs := getLanguages(line) 239 langsList = append(langsList, langs) 240 lastWasMatch = false 241 } 242 243 } 244 245 if err := buf.Err(); err != nil { 246 return nil, err 247 } 248 249 langsHeuristics := buildLanguagesHeuristics(langsList, heuristicsList, logicRelsList) 250 return langsHeuristics, nil 251 } 252 253 func hasRegExp(line string) bool { 254 return strings.Contains(line, ".match") || strings.Contains(line, ".include?") || strings.Contains(line, ".empty?") 255 } 256 257 func cleanRegExpLine(line string) string { 258 if strings.Contains(line, "if ") { 259 line = line[strings.Index(line, `if `)+3:] 260 } 261 262 line = strings.TrimSpace(line) 263 line = strings.TrimPrefix(line, `(`) 264 if strings.Contains(line, "))") { 265 line = strings.TrimSuffix(line, `)`) 266 } 267 268 return line 269 } 270 271 func getLogicRelations(line string) []string { 272 rels := make([]string, 0) 273 splitted := strings.Split(line, "||") 274 for i, v := range splitted { 275 if strings.Contains(v, "&&") { 276 rels = append(rels, "&&") 277 } 278 279 if i < len(splitted)-1 { 280 rels = append(rels, "||") 281 } 282 } 283 284 if len(rels) == 0 { 285 rels = nil 286 } 287 288 return rels 289 } 290 291 func getHeuristics(line string) []*heuristic { 292 splitted := splitByLogicOps(line) 293 heuristics := make([]*heuristic, 0, len(splitted)) 294 for _, v := range splitted { 295 v = strings.TrimSpace(v) 296 var reg string 297 298 if strings.Contains(v, ".match") { 299 reg = v[:strings.Index(v, ".match")] 300 reg = replaceRegexpVariables(reg) 301 } 302 303 if strings.Contains(v, ".include?") { 304 reg = includeToRegExp(v) 305 } 306 307 if strings.Contains(v, ".empty?") { 308 reg = emptyFile 309 } 310 311 if reg != "" { 312 reg = convertToValidRegexp(reg) 313 heuristics = append(heuristics, &heuristic{Regexp: reg}) 314 } 315 } 316 317 return heuristics 318 } 319 320 func splitByLogicOps(line string) []string { 321 splitted := make([]string, 0, 1) 322 splitOr := strings.Split(line, "||") 323 for _, v := range splitOr { 324 splitAnd := strings.Split(v, "&&") 325 splitted = append(splitted, splitAnd...) 326 } 327 328 return splitted 329 } 330 331 func replaceRegexpVariables(reg string) string { 332 repl := reg 333 if v, ok := definedRegs[reg]; ok { 334 repl = v 335 } 336 337 return repl 338 } 339 340 func convertToValidRegexp(reg string) string { 341 // example: `/^(\s*)(<Project|<Import|<Property|<?xml|xmlns)/i`` 342 // Ruby modifier "m" matches multiple lines, recognizing newlines as normal characters, Go use flag "s" for that. 343 const ( 344 caseSensitive = "i" 345 matchEOL = "s" 346 347 rubyCaseSensitive = "i" 348 rubyMultiLine = "m" 349 ) 350 351 if reg == emptyFile { 352 return reg 353 } 354 355 reg = strings.TrimPrefix(reg, `/`) 356 flags := "(?m" 357 lastSlash := strings.LastIndex(reg, `/`) 358 if lastSlash == -1 { 359 return flags + ")" + reg 360 } 361 362 specialChars := reg[lastSlash:] 363 reg = reg[:lastSlash] 364 if lastSlash == len(reg)-1 { 365 return flags + ")" + reg 366 } 367 368 if strings.Contains(specialChars, rubyCaseSensitive) { 369 flags = flags + caseSensitive 370 } 371 372 if strings.Contains(specialChars, rubyMultiLine) { 373 flags = flags + matchEOL 374 } 375 376 return flags + ")" + reg 377 } 378 379 func includeToRegExp(include string) string { 380 content := include[strings.Index(include, `(`)+1 : strings.Index(include, `)`)] 381 content = strings.Trim(content, `"'`) 382 return regex.QuoteMeta(content) 383 } 384 385 func getLanguages(line string) []string { 386 languages := make([]string, 0) 387 splitted := strings.Split(line, `,`) 388 for _, lang := range splitted { 389 lang = trimLanguage(lang) 390 languages = append(languages, lang) 391 } 392 393 return languages 394 } 395 396 func trimLanguage(enclosedLang string) string { 397 lang := strings.TrimSpace(enclosedLang) 398 lang = lang[strings.Index(lang, `"`)+1:] 399 lang = lang[:strings.Index(lang, `"`)] 400 return lang 401 } 402 403 func buildLanguagesHeuristics(langsList [][]string, heuristicsList [][]*heuristic, logicRelsList [][]string) []*languageHeuristics { 404 langsHeuristics := make([]*languageHeuristics, 0, len(langsList)) 405 for i, langSlice := range langsList { 406 var heuristics []*heuristic 407 if i < len(heuristicsList) { 408 heuristics = heuristicsList[i] 409 } 410 411 var rels []string 412 if i < len(logicRelsList) { 413 rels = logicRelsList[i] 414 } 415 416 for _, lang := range langSlice { 417 lh := &languageHeuristics{ 418 Language: lang, 419 Heuristics: heuristics, 420 LogicRelations: rels, 421 } 422 423 langsHeuristics = append(langsHeuristics, lh) 424 } 425 } 426 427 return langsHeuristics 428 } 429 430 func executeContentTemplate(out io.Writer, disambiguators []*disambiguator, tmplPath, tmplName, commit string) error { 431 fmap := template.FuncMap{ 432 "getAllHeuristics": getAllHeuristics, 433 "returnStringSlice": func(slice []string) string { 434 if len(slice) == 0 { 435 return "nil" 436 } 437 438 return `[]string{` + strings.Join(slice, `, `) + `}` 439 }, 440 "returnLanguages": returnLanguages, 441 "avoidLanguage": avoidLanguage, 442 } 443 return executeTemplate(out, tmplName, tmplPath, commit, fmap, disambiguators) 444 } 445 446 func getAllHeuristics(disambiguators []*disambiguator) []*heuristic { 447 heuristics := make([]*heuristic, 0) 448 for _, disamb := range disambiguators { 449 for _, lang := range disamb.Languages { 450 if !avoidLanguage(lang) { 451 heuristics = append(heuristics, lang.Heuristics...) 452 } 453 } 454 } 455 456 return heuristics 457 } 458 459 func avoidLanguage(lang *languageHeuristics) bool { 460 // necessary to avoid corner cases 461 for _, heuristic := range lang.Heuristics { 462 if containsInvalidRegexp(heuristic.Regexp) { 463 return true 464 } 465 } 466 467 return false 468 } 469 470 func containsInvalidRegexp(reg string) bool { 471 return strings.Contains(reg, `(?<`) || strings.Contains(reg, `\1`) 472 } 473 474 func returnLanguages(langsHeuristics []*languageHeuristics) []string { 475 langs := make([]string, 0) 476 for _, langHeu := range langsHeuristics { 477 if len(langHeu.Heuristics) == 0 { 478 langs = append(langs, `"`+langHeu.Language+`"`) 479 } 480 } 481 482 return langs 483 }