github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/util/automaton/regexp.go (about) 1 package automaton 2 3 import ( 4 "bytes" 5 "fmt" 6 "strconv" 7 "strings" 8 ) 9 10 // util/automaton/RegExp.java 11 12 type Kind int 13 14 const ( 15 REGEXP_UNION = Kind(1) 16 REGEXP_CONCATENATION = Kind(2) 17 REGEXP_INTERSECTION = Kind(3) 18 REGEXP_OPTIONAL = Kind(4) 19 REGEXP_REPEAT = Kind(5) 20 REGEXP_REPEAT_MIN = Kind(6) 21 REGEXP_REPEAT_MINMAX = Kind(7) 22 REGEXP_COMPLEMENT = Kind(8) 23 REGEXP_CHAR = Kind(9) 24 REGEXP_CHAR_RANGE = Kind(10) 25 REGEXP_ANYCHAR = Kind(11) 26 REGEXP_EMPTY = Kind(12) 27 REGEXP_STRING = Kind(13) 28 REGEXP_ANYSTRING = Kind(14) 29 REGEXP_AUTOMATON = Kind(15) 30 REGEXP_INTERVAL = Kind(16) 31 ) 32 33 // Syntax flags 34 const ( 35 INTERSECTION = 0x0001 // & 36 COMPLEMENT = 0x0002 // ~ 37 EMPTY = 0x0004 // # 38 ANYSTRING = 0x0008 // @ 39 AUTOMATON = 0x0010 // <identifier> 40 INTERVAL = 0x0020 // <n-m> 41 ALL = 0xffff // enables all optional regexp syntax. 42 NONE = 0x0000 // enables no optional regexp syntax. 43 44 allow_mutation = false 45 ) 46 47 /* 48 Regular Expression extension to Automaton. 49 50 Regular expressions are built from the following abstract syntax: 51 52 regexp ::= unionexp 53 | 54 unionexp ::= interexp | unionexp (union) 55 | interexp 56 interexp ::= concatexp & interexp (intersection) [OPTIONAL] 57 | concatexp 58 concatexp ::= repeatexp concatexp (concatenation) 59 | repeatexp 60 repeatexp ::= repeatexp ? (zero or one occurrence) 61 | repeatexp * (zero or more occurrences) 62 | repeatexp + (one or more occurrences) 63 | repeatexp {n} (n occurrences) 64 | repeatexp {n,} (n or more occurrences) 65 | repeatexp {n,m} (n to m occurrences, including both) 66 | complexp 67 complexp ::= ~ complexp (complement) [OPTIONAL] 68 | charclassexp 69 charclassexp ::= [ charclasses ] (character class) 70 | [^ charclasses ] (negated character class) 71 | simpleexp 72 charclasses ::= charclass charclasses 73 | charclass 74 charclass ::= charexp - charexp (character range, including end-points) 75 | charexp 76 simpleexp ::= charexp 77 | . (any single character) 78 | # (the empty language) [OPTIONAL] 79 | @ (any string) [OPTIONAL] 80 | " <Unicode string without double-quotes> " (a string) 81 | ( ) (the empty string) 82 | ( unionexp ) (precedence override) 83 | < <identifier> > (named automaton) [OPTIONAL] 84 | <n-m> (numerical interval) [OPTIONAL] 85 charexp ::= <Unicode character> (a single non-reserved character) 86 | \ <Unicode character> (a single character) 87 88 The productions marked [OPTIONAL] are only allowed if specified by 89 the syntax flags passed to the RegExp constructor. The reserved 90 characters used in the (enabled) syntax must be escaped with 91 backslash (\) or double-quotes ("..."). (In contrast to other regexp 92 syntaxes, this is required also in character classes.) Be aware that 93 dash (-) has a special meaning in charclass expressions. An 94 identifier is a string not containing right angle bracket (>) or dash 95 (-). Numerical intervals are specified by non-negative decimal 96 integers and include both end points, and if n and m have the same 97 number of digits, then the conforming strings must have that length 98 (i.e. prefixed by 0's). 99 */ 100 type RegExp struct { 101 kind Kind 102 exp1, exp2 *RegExp 103 s string 104 c int 105 min, max, digits int 106 from, to int 107 b []rune 108 flags int 109 pos int 110 } 111 112 // Constructs new RegExp from a string. Same as RegExp(s, ALL) 113 func NewRegExp(s string) *RegExp { 114 return NewRegExpWithFlag(s, ALL) 115 } 116 117 // Constructs new RegExp from a string. 118 func NewRegExpWithFlag(s string, flags int) *RegExp { 119 ans := &RegExp{ 120 b: []rune(s), 121 flags: flags, 122 } 123 var e *RegExp 124 if len(s) == 0 { 125 e = makeStringRE("") 126 } else { 127 e = ans.parseUnionExp() 128 if ans.pos < len(ans.b) { 129 panic(fmt.Sprintf("end-of-string expected at position %v", ans.pos)) 130 } 131 } 132 ans.kind = e.kind 133 ans.exp1, ans.exp2 = e.exp1, e.exp2 134 ans.s = e.s 135 ans.c = e.c 136 ans.min, ans.max, ans.digits = e.min, e.max, e.digits 137 ans.from, ans.to = e.from, e.to 138 ans.b = nil 139 return ans 140 } 141 142 // Constructs new Automaton from this RegExp. Same as 143 // ToAutomaton(nil) (empty automaton map). 144 func (re *RegExp) ToAutomaton() *Automaton { 145 return re.toAutomaton(nil, nil) 146 } 147 148 func (re *RegExp) toAutomaton(automata map[string]*Automaton, 149 provider AutomatonProvider) *Automaton { 150 var list []*Automaton 151 var a *Automaton = nil 152 switch re.kind { 153 case REGEXP_UNION: 154 list = make([]*Automaton, 0) 155 list = re.findLeaves(re.exp1, REGEXP_UNION, list, automata, provider) 156 list = re.findLeaves(re.exp2, REGEXP_UNION, list, automata, provider) 157 a = unionN(list) 158 a = minimize(a) 159 case REGEXP_CONCATENATION: 160 list = make([]*Automaton, 0) 161 list = re.findLeaves(re.exp1, REGEXP_CONCATENATION, list, automata, provider) 162 list = re.findLeaves(re.exp2, REGEXP_CONCATENATION, list, automata, provider) 163 a = concatenateN(list) 164 a = minimize(a) 165 case REGEXP_INTERSECTION: 166 a = intersection(re.exp1.toAutomaton(automata, provider), 167 re.exp2.toAutomaton(automata, provider)) 168 a = minimize(a) 169 case REGEXP_OPTIONAL: 170 a = optional(re.exp1.toAutomaton(automata, provider)) 171 a = minimize(a) 172 case REGEXP_REPEAT: 173 a = repeat(re.exp1.toAutomaton(automata, provider)) 174 a = minimize(a) 175 case REGEXP_REPEAT_MIN: 176 a = repeatMin(re.exp1.toAutomaton(automata, provider), re.min) 177 a = minimize(a) 178 case REGEXP_REPEAT_MINMAX: 179 panic("not implemented yet") 180 case REGEXP_COMPLEMENT: 181 a = complement(re.exp1.toAutomaton(automata, provider)) 182 a = minimize(a) 183 case REGEXP_CHAR: 184 a = makeChar(re.c) 185 case REGEXP_CHAR_RANGE: 186 a = makeCharRange(re.from, re.to) 187 case REGEXP_ANYCHAR: 188 a = makeAnyChar() 189 case REGEXP_EMPTY: 190 panic("not implemented yet") 191 case REGEXP_STRING: 192 a = makeString(re.s) 193 case REGEXP_ANYSTRING: 194 panic("not implemented yet") 195 case REGEXP_AUTOMATON: 196 panic("not implemented yet") 197 case REGEXP_INTERVAL: 198 panic("not implemented yet") 199 } 200 return a 201 } 202 203 func (re *RegExp) findLeaves(exp *RegExp, kind Kind, list []*Automaton, 204 automata map[string]*Automaton, provider AutomatonProvider) []*Automaton { 205 if exp.kind == kind { 206 list = re.findLeaves(exp.exp1, kind, list, automata, provider) 207 list = re.findLeaves(exp.exp2, kind, list, automata, provider) 208 return list 209 } else { 210 return append(list, exp.toAutomaton(automata, provider)) 211 } 212 } 213 214 // Constructs string from parsed regular expression 215 func (re *RegExp) String() string { 216 var b bytes.Buffer 217 return re.toStringBuilder(&b).String() 218 } 219 220 func (re *RegExp) toStringBuilder(b *bytes.Buffer) *bytes.Buffer { 221 switch re.kind { 222 case REGEXP_UNION: 223 b.WriteRune('(') 224 re.exp1.toStringBuilder(b) 225 b.WriteRune('|') 226 re.exp2.toStringBuilder(b) 227 b.WriteRune(')') 228 case REGEXP_CONCATENATION: 229 re.exp1.toStringBuilder(b) 230 re.exp2.toStringBuilder(b) 231 case REGEXP_INTERSECTION: 232 b.WriteRune('(') 233 re.exp1.toStringBuilder(b) 234 b.WriteRune('&') 235 re.exp2.toStringBuilder(b) 236 b.WriteRune(')') 237 case REGEXP_OPTIONAL: 238 b.WriteRune('(') 239 re.exp1.toStringBuilder(b) 240 b.WriteString(")?") 241 case REGEXP_REPEAT: 242 b.WriteRune('(') 243 re.exp1.toStringBuilder(b) 244 b.WriteString(")*") 245 case REGEXP_REPEAT_MIN: 246 b.WriteRune('(') 247 re.exp1.toStringBuilder(b) 248 fmt.Fprintf(b, "){%v,}", re.min) 249 case REGEXP_REPEAT_MINMAX: 250 panic("not implemented yet3") 251 case REGEXP_COMPLEMENT: 252 b.WriteString("~(") 253 re.exp1.toStringBuilder(b) 254 b.WriteRune(')') 255 case REGEXP_CHAR: 256 b.WriteString("\\") 257 if rune(re.c) == '\r' { // edge case 258 b.WriteRune('r') 259 } else if rune(re.c) == '\t' { // edge case 260 b.WriteRune('t') 261 } else if rune(re.c) == '\n' { // edge case 262 b.WriteRune('n') 263 } else { 264 b.WriteRune(rune(re.c)) 265 } 266 case REGEXP_CHAR_RANGE: 267 panic("not implemented yet4") 268 case REGEXP_ANYCHAR: 269 b.WriteRune('.') 270 case REGEXP_EMPTY: 271 panic("not implemented yet5") 272 case REGEXP_STRING: 273 fmt.Fprintf(b, "\"%v\"", re.s) 274 case REGEXP_ANYSTRING: 275 panic("not implemented yet7") 276 case REGEXP_AUTOMATON: 277 panic("not implemented yet8") 278 case REGEXP_INTERVAL: 279 panic("not implemented yet9") 280 default: 281 panic("not supported yet10") 282 } 283 return b 284 } 285 286 func makeUnion(exp1, exp2 *RegExp) *RegExp { 287 return &RegExp{ 288 kind: REGEXP_UNION, 289 exp1: exp1, 290 exp2: exp2, 291 } 292 } 293 294 func makeConcatenation(exp1, exp2 *RegExp) *RegExp { 295 if (exp1.kind == REGEXP_CHAR || exp1.kind == REGEXP_STRING) && 296 (exp2.kind == REGEXP_CHAR || exp2.kind == REGEXP_STRING) { 297 return makeString2RE(exp1, exp2) 298 } 299 r := &RegExp{kind: REGEXP_CONCATENATION} 300 if exp1.kind == REGEXP_CONCATENATION && 301 (exp1.exp2.kind == REGEXP_CHAR || exp1.exp2.kind == REGEXP_STRING) && 302 (exp2.kind == REGEXP_CHAR || exp2.kind == REGEXP_STRING) { 303 r.exp1 = exp1.exp1 304 r.exp2 = makeString2RE(exp1.exp2, exp2) 305 } else if (exp1.kind == REGEXP_CHAR || exp1.kind == REGEXP_STRING) && 306 exp2.kind == REGEXP_CONCATENATION && 307 (exp2.exp1.kind == REGEXP_CHAR || exp2.exp1.kind == REGEXP_STRING) { 308 r.exp1 = makeString2RE(exp1, exp2.exp1) 309 r.exp2 = exp2.exp2 310 } else { 311 r.exp1 = exp1 312 r.exp2 = exp2 313 } 314 return r 315 } 316 317 func makeString2RE(exp1, exp2 *RegExp) *RegExp { 318 var b bytes.Buffer 319 if exp1.kind == REGEXP_STRING { 320 b.WriteString(exp1.s) 321 } else { 322 assert(REGEXP_CHAR == exp1.kind) 323 b.WriteRune(rune(exp1.c)) 324 } 325 if exp2.kind == REGEXP_STRING { 326 b.WriteString(exp1.s) 327 } else { 328 assert(REGEXP_CHAR == exp2.kind) 329 b.WriteRune(rune(exp2.c)) 330 } 331 return makeStringRE(b.String()) 332 } 333 334 func makeIntersection(exp1, exp2 *RegExp) *RegExp { 335 return &RegExp{ 336 kind: REGEXP_INTERSECTION, 337 exp1: exp1, 338 exp2: exp2, 339 } 340 } 341 342 func makeOptional(exp *RegExp) *RegExp { 343 return &RegExp{ 344 kind: REGEXP_OPTIONAL, 345 exp1: exp, 346 } 347 } 348 349 func makeRepeat(exp *RegExp) *RegExp { 350 return &RegExp{ 351 kind: REGEXP_REPEAT, 352 exp1: exp, 353 } 354 } 355 356 func makeRepeatMin(exp *RegExp, min int) *RegExp { 357 return &RegExp{ 358 kind: REGEXP_REPEAT_MIN, 359 exp1: exp, 360 min: min, 361 } 362 } 363 364 func makeRepeatRange(exp *RegExp, min, max int) *RegExp { 365 panic("not implemented yet") 366 } 367 368 func makeComplement(exp *RegExp) *RegExp { 369 return &RegExp{ 370 kind: REGEXP_COMPLEMENT, 371 exp1: exp, 372 } 373 } 374 375 func makeCharRE(c int) *RegExp { 376 return &RegExp{ 377 kind: REGEXP_CHAR, 378 c: c, 379 } 380 } 381 382 func makeCharRangeRE(from, to int) *RegExp { 383 assert2(from <= to, fmt.Sprintf("invalid range: from (%v) cannot be > to (%v)", from, to)) 384 return &RegExp{ 385 kind: REGEXP_CHAR_RANGE, 386 from: from, 387 to: to, 388 } 389 } 390 391 func assert(ok bool) { 392 if !ok { 393 panic("assert fail") 394 } 395 } 396 397 func assert2(ok bool, msg string, args ...interface{}) { 398 if !ok { 399 panic(fmt.Sprintf(msg, args...)) 400 } 401 } 402 403 func makeAnyCharRE() *RegExp { 404 return &RegExp{kind: REGEXP_ANYCHAR} 405 } 406 407 func makeEmptyRE() *RegExp { 408 return &RegExp{kind: REGEXP_EMPTY} 409 } 410 411 func makeStringRE(s string) *RegExp { 412 return &RegExp{kind: REGEXP_STRING, s: s} 413 } 414 415 func makeAnyStringRE() *RegExp { 416 return &RegExp{kind: REGEXP_STRING} 417 } 418 419 func (re *RegExp) peek(s string) bool { 420 return re.more() && strings.ContainsRune(s, re.b[re.pos]) 421 } 422 423 func (re *RegExp) match(c rune) bool { 424 if re.pos >= len(re.b) { 425 return false 426 } 427 if re.b[re.pos] == c { 428 re.pos++ 429 return true 430 } 431 return false 432 } 433 434 func (re *RegExp) more() bool { 435 return re.pos < len(re.b) 436 } 437 438 func (re *RegExp) next() int { 439 assert2(re.more(), "unexpected end-of-string") 440 ch := re.b[re.pos] 441 re.pos++ 442 return int(ch) // int >= rune 443 } 444 445 func (re *RegExp) check(flag int) bool { 446 return (re.flags & flag) != 0 447 } 448 449 func (re *RegExp) parseUnionExp() *RegExp { 450 e := re.parseInterExp() 451 if re.match('|') { 452 e = makeUnion(e, re.parseUnionExp()) 453 } 454 return e 455 } 456 457 func (re *RegExp) parseInterExp() *RegExp { 458 e := re.parseConcatExp() 459 if re.check(INTERSECTION) && re.match('&') { 460 e = makeIntersection(e, re.parseInterExp()) 461 } 462 return e 463 } 464 465 func (re *RegExp) parseConcatExp() *RegExp { 466 e := re.parseRepeatExp() 467 if re.more() && !re.peek(")|") && (!re.check(INTERSECTION) || !re.peek("&")) { 468 e = makeConcatenation(e, re.parseConcatExp()) 469 } 470 return e 471 } 472 473 func (re *RegExp) parseRepeatExp() *RegExp { 474 e := re.parseComplExp() 475 for re.peek("?*+{") { 476 if re.match('?') { 477 e = makeOptional(e) 478 } else if re.match('*') { 479 e = makeRepeat(e) 480 } else if re.match('+') { 481 e = makeRepeatMin(e, 1) 482 } else if re.match('{') { 483 start := re.pos 484 for re.peek("0123456789") { 485 re.next() 486 } 487 assert2(start != re.pos, fmt.Sprintf("integer expected at position %v", re.pos)) 488 n, err := strconv.Atoi(string(re.b[start:re.pos])) 489 assertNoError(err) 490 m := -1 491 if re.match(',') { 492 start = re.pos 493 for re.peek("0123456789") { 494 re.next() 495 } 496 if start != re.pos { 497 m, err = strconv.Atoi(string(re.b[start:re.pos])) 498 assertNoError(err) 499 } 500 } else { 501 m = n 502 } 503 assert2(re.match('}'), fmt.Sprintf("expected '}' at position %v", re.pos)) 504 if m == -1 { 505 e = makeRepeatMin(e, n) 506 } else { 507 e = makeRepeatRange(e, n, m) 508 } 509 } 510 } 511 return e 512 } 513 514 func assertNoError(err error) { 515 if err != nil { 516 panic(err) 517 } 518 } 519 520 func (re *RegExp) parseComplExp() *RegExp { 521 if re.check(COMPLEMENT) && re.match('~') { 522 return makeComplement(re.parseComplExp()) 523 } 524 return re.parseCharClassExp() 525 } 526 527 func (re *RegExp) parseCharClassExp() *RegExp { 528 if re.match('[') { 529 negate := re.match('^') 530 e := re.parseCharClasses() 531 if negate { 532 e = makeIntersection(makeAnyCharRE(), makeComplement(e)) 533 } 534 assert2(re.match(']'), fmt.Sprintf("expected ']' at position %v", re.pos)) 535 return e 536 } 537 return re.parseSimpleExp() 538 } 539 540 func (re *RegExp) parseCharClasses() *RegExp { 541 e := re.parseCharClass() 542 for re.more() && !re.peek("]") { 543 e = makeUnion(e, re.parseCharClass()) 544 } 545 return e 546 } 547 548 func (re *RegExp) parseCharClass() *RegExp { 549 c := re.parseCharExp() 550 if re.match('-') { 551 return makeCharRangeRE(c, re.parseCharExp()) 552 } 553 return makeCharRE(c) 554 } 555 556 func (re *RegExp) parseSimpleExp() *RegExp { 557 if re.match('.') { 558 return makeAnyCharRE() 559 } 560 if re.check(EMPTY) && re.match('#') { 561 return makeEmptyRE() 562 } 563 if re.check(ANYSTRING) && re.match('@') { 564 return makeAnyStringRE() 565 } 566 if re.match('"') { 567 start := re.pos 568 for re.more() && !re.peek("\"") { 569 re.next() 570 } 571 if !re.match('"') { 572 panic(fmt.Sprintf("expected '\"' at position %v", re.pos)) 573 } 574 return makeStringRE(string(re.b[start : re.pos-1])) 575 } 576 if re.match('(') { 577 if re.match(')') { 578 return makeStringRE("") 579 } 580 e := re.parseUnionExp() 581 if !re.match(')') { 582 panic(fmt.Sprintf("expected ')' at position %v", re.pos)) 583 } 584 return e 585 } 586 if (re.check(AUTOMATON) || re.check(INTERVAL)) && re.match('<') { 587 panic("not implemented yet") 588 } 589 return makeCharRE(re.parseCharExp()) 590 } 591 592 func (re *RegExp) parseCharExp() int { 593 re.match('\\') 594 return re.next() 595 } 596 597 // util/automaton/AutomatonProvider.java 598 599 // Automaton provider for RegExp. 600 type AutomatonProvider func(name string) *Automaton