github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/util/automaton/regexp.go

github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/util/automaton/regexp.go (about)

     1  package automaton
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"strconv"
     7  	"strings"
     8  )
     9  
    10  // util/automaton/RegExp.java
    11  
    12  type Kind int
    13  
    14  const (
    15  	REGEXP_UNION         = Kind(1)
    16  	REGEXP_CONCATENATION = Kind(2)
    17  	REGEXP_INTERSECTION  = Kind(3)
    18  	REGEXP_OPTIONAL      = Kind(4)
    19  	REGEXP_REPEAT        = Kind(5)
    20  	REGEXP_REPEAT_MIN    = Kind(6)
    21  	REGEXP_REPEAT_MINMAX = Kind(7)
    22  	REGEXP_COMPLEMENT    = Kind(8)
    23  	REGEXP_CHAR          = Kind(9)
    24  	REGEXP_CHAR_RANGE    = Kind(10)
    25  	REGEXP_ANYCHAR       = Kind(11)
    26  	REGEXP_EMPTY         = Kind(12)
    27  	REGEXP_STRING        = Kind(13)
    28  	REGEXP_ANYSTRING     = Kind(14)
    29  	REGEXP_AUTOMATON     = Kind(15)
    30  	REGEXP_INTERVAL      = Kind(16)
    31  )
    32  
    33  // Syntax flags
    34  const (
    35  	INTERSECTION = 0x0001 // &
    36  	COMPLEMENT   = 0x0002 // ~
    37  	EMPTY        = 0x0004 // #
    38  	ANYSTRING    = 0x0008 // @
    39  	AUTOMATON    = 0x0010 // <identifier>
    40  	INTERVAL     = 0x0020 // <n-m>
    41  	ALL          = 0xffff // enables all optional regexp syntax.
    42  	NONE         = 0x0000 // enables no optional regexp syntax.
    43  
    44  	allow_mutation = false
    45  )
    46  
    47  /*
    48  Regular Expression extension to Automaton.
    49  
    50  Regular expressions are built from the following abstract syntax:
    51  
    52  	regexp	::= unionexp
    53   			|
    54  	unionexp ::= interexp | unionexp 	(union)
    55  	 		| interexp
    56  	interexp ::= concatexp & interexp 	(intersection) 						[OPTIONAL]
    57   			| concatexp
    58  	concatexp ::= repeatexp concatexp	 (concatenation)
    59   			| repeatexp
    60  	repeatexp ::= repeatexp ? 			(zero or one occurrence)
    61   			| repeatexp * 				(zero or more occurrences)
    62   			| repeatexp + 				(one or more occurrences)
    63   			| repeatexp {n} 			(n occurrences)
    64   			| repeatexp {n,} 			(n or more occurrences)
    65   			| repeatexp {n,m} 			(n to m occurrences, including both)
    66   			| complexp
    67  	complexp ::= ~ complexp 			(complement) 						[OPTIONAL]
    68   			| charclassexp
    69  	charclassexp ::= [ charclasses ] 	(character class)
    70   			| [^ charclasses ] 			(negated character class)
    71   			| simpleexp
    72  	charclasses ::= charclass charclasses
    73   			| charclass
    74  	charclass ::= charexp - charexp 	(character range, including end-points)
    75   			| charexp
    76  	simpleexp ::= charexp
    77   			| . 						(any single character)
    78   			| # 						(the empty language) 				[OPTIONAL]
    79   			| @ 						(any string) 						[OPTIONAL]
    80  			| " <Unicode string without double-quotes>  " (a string)
    81   			| ( ) 						(the empty string)
    82   			| ( unionexp ) 				(precedence override)
    83   			| < <identifier> > 			(named automaton) 					[OPTIONAL]
    84  			| <n-m> 					(numerical interval) 				[OPTIONAL]
    85  	charexp ::= <Unicode character> 	(a single non-reserved character)
    86   			| \ <Unicode character>  	(a single character)
    87  
    88  The productions marked [OPTIONAL] are only allowed if specified by
    89  the syntax flags passed to the RegExp constructor. The reserved
    90  characters used in the (enabled) syntax must be escaped with
    91  backslash (\) or double-quotes ("..."). (In contrast to other regexp
    92  syntaxes, this is required also in character classes.) Be aware that
    93  dash (-) has a special meaning in charclass expressions. An
    94  identifier is a string not containing right angle bracket (>) or dash
    95  (-). Numerical intervals are specified by non-negative decimal
    96  integers and include both end points, and if n and m have the same
    97  number of digits, then the conforming strings must have that length
    98  (i.e. prefixed by 0's).
    99  */
   100  type RegExp struct {
   101  	kind             Kind
   102  	exp1, exp2       *RegExp
   103  	s                string
   104  	c                int
   105  	min, max, digits int
   106  	from, to         int
   107  	b                []rune
   108  	flags            int
   109  	pos              int
   110  }
   111  
   112  // Constructs new RegExp from a string. Same as RegExp(s, ALL)
   113  func NewRegExp(s string) *RegExp {
   114  	return NewRegExpWithFlag(s, ALL)
   115  }
   116  
   117  // Constructs new RegExp from a string.
   118  func NewRegExpWithFlag(s string, flags int) *RegExp {
   119  	ans := &RegExp{
   120  		b:     []rune(s),
   121  		flags: flags,
   122  	}
   123  	var e *RegExp
   124  	if len(s) == 0 {
   125  		e = makeStringRE("")
   126  	} else {
   127  		e = ans.parseUnionExp()
   128  		if ans.pos < len(ans.b) {
   129  			panic(fmt.Sprintf("end-of-string expected at position %v", ans.pos))
   130  		}
   131  	}
   132  	ans.kind = e.kind
   133  	ans.exp1, ans.exp2 = e.exp1, e.exp2
   134  	ans.s = e.s
   135  	ans.c = e.c
   136  	ans.min, ans.max, ans.digits = e.min, e.max, e.digits
   137  	ans.from, ans.to = e.from, e.to
   138  	ans.b = nil
   139  	return ans
   140  }
   141  
   142  // Constructs new Automaton from this RegExp. Same as
   143  // ToAutomaton(nil) (empty automaton map).
   144  func (re *RegExp) ToAutomaton() *Automaton {
   145  	return re.toAutomaton(nil, nil)
   146  }
   147  
   148  func (re *RegExp) toAutomaton(automata map[string]*Automaton,
   149  	provider AutomatonProvider) *Automaton {
   150  	var list []*Automaton
   151  	var a *Automaton = nil
   152  	switch re.kind {
   153  	case REGEXP_UNION:
   154  		list = make([]*Automaton, 0)
   155  		list = re.findLeaves(re.exp1, REGEXP_UNION, list, automata, provider)
   156  		list = re.findLeaves(re.exp2, REGEXP_UNION, list, automata, provider)
   157  		a = unionN(list)
   158  		a = minimize(a)
   159  	case REGEXP_CONCATENATION:
   160  		list = make([]*Automaton, 0)
   161  		list = re.findLeaves(re.exp1, REGEXP_CONCATENATION, list, automata, provider)
   162  		list = re.findLeaves(re.exp2, REGEXP_CONCATENATION, list, automata, provider)
   163  		a = concatenateN(list)
   164  		a = minimize(a)
   165  	case REGEXP_INTERSECTION:
   166  		a = intersection(re.exp1.toAutomaton(automata, provider),
   167  			re.exp2.toAutomaton(automata, provider))
   168  		a = minimize(a)
   169  	case REGEXP_OPTIONAL:
   170  		a = optional(re.exp1.toAutomaton(automata, provider))
   171  		a = minimize(a)
   172  	case REGEXP_REPEAT:
   173  		a = repeat(re.exp1.toAutomaton(automata, provider))
   174  		a = minimize(a)
   175  	case REGEXP_REPEAT_MIN:
   176  		a = repeatMin(re.exp1.toAutomaton(automata, provider), re.min)
   177  		a = minimize(a)
   178  	case REGEXP_REPEAT_MINMAX:
   179  		panic("not implemented yet")
   180  	case REGEXP_COMPLEMENT:
   181  		a = complement(re.exp1.toAutomaton(automata, provider))
   182  		a = minimize(a)
   183  	case REGEXP_CHAR:
   184  		a = makeChar(re.c)
   185  	case REGEXP_CHAR_RANGE:
   186  		a = makeCharRange(re.from, re.to)
   187  	case REGEXP_ANYCHAR:
   188  		a = makeAnyChar()
   189  	case REGEXP_EMPTY:
   190  		panic("not implemented yet")
   191  	case REGEXP_STRING:
   192  		a = makeString(re.s)
   193  	case REGEXP_ANYSTRING:
   194  		panic("not implemented yet")
   195  	case REGEXP_AUTOMATON:
   196  		panic("not implemented yet")
   197  	case REGEXP_INTERVAL:
   198  		panic("not implemented yet")
   199  	}
   200  	return a
   201  }
   202  
   203  func (re *RegExp) findLeaves(exp *RegExp, kind Kind, list []*Automaton,
   204  	automata map[string]*Automaton, provider AutomatonProvider) []*Automaton {
   205  	if exp.kind == kind {
   206  		list = re.findLeaves(exp.exp1, kind, list, automata, provider)
   207  		list = re.findLeaves(exp.exp2, kind, list, automata, provider)
   208  		return list
   209  	} else {
   210  		return append(list, exp.toAutomaton(automata, provider))
   211  	}
   212  }
   213  
   214  // Constructs string from parsed regular expression
   215  func (re *RegExp) String() string {
   216  	var b bytes.Buffer
   217  	return re.toStringBuilder(&b).String()
   218  }
   219  
   220  func (re *RegExp) toStringBuilder(b *bytes.Buffer) *bytes.Buffer {
   221  	switch re.kind {
   222  	case REGEXP_UNION:
   223  		b.WriteRune('(')
   224  		re.exp1.toStringBuilder(b)
   225  		b.WriteRune('|')
   226  		re.exp2.toStringBuilder(b)
   227  		b.WriteRune(')')
   228  	case REGEXP_CONCATENATION:
   229  		re.exp1.toStringBuilder(b)
   230  		re.exp2.toStringBuilder(b)
   231  	case REGEXP_INTERSECTION:
   232  		b.WriteRune('(')
   233  		re.exp1.toStringBuilder(b)
   234  		b.WriteRune('&')
   235  		re.exp2.toStringBuilder(b)
   236  		b.WriteRune(')')
   237  	case REGEXP_OPTIONAL:
   238  		b.WriteRune('(')
   239  		re.exp1.toStringBuilder(b)
   240  		b.WriteString(")?")
   241  	case REGEXP_REPEAT:
   242  		b.WriteRune('(')
   243  		re.exp1.toStringBuilder(b)
   244  		b.WriteString(")*")
   245  	case REGEXP_REPEAT_MIN:
   246  		b.WriteRune('(')
   247  		re.exp1.toStringBuilder(b)
   248  		fmt.Fprintf(b, "){%v,}", re.min)
   249  	case REGEXP_REPEAT_MINMAX:
   250  		panic("not implemented yet3")
   251  	case REGEXP_COMPLEMENT:
   252  		b.WriteString("~(")
   253  		re.exp1.toStringBuilder(b)
   254  		b.WriteRune(')')
   255  	case REGEXP_CHAR:
   256  		b.WriteString("\\")
   257  		if rune(re.c) == '\r' { // edge case
   258  			b.WriteRune('r')
   259  		} else if rune(re.c) == '\t' { // edge case
   260  			b.WriteRune('t')
   261  		} else if rune(re.c) == '\n' { // edge case
   262  			b.WriteRune('n')
   263  		} else {
   264  			b.WriteRune(rune(re.c))
   265  		}
   266  	case REGEXP_CHAR_RANGE:
   267  		panic("not implemented yet4")
   268  	case REGEXP_ANYCHAR:
   269  		b.WriteRune('.')
   270  	case REGEXP_EMPTY:
   271  		panic("not implemented yet5")
   272  	case REGEXP_STRING:
   273  		fmt.Fprintf(b, "\"%v\"", re.s)
   274  	case REGEXP_ANYSTRING:
   275  		panic("not implemented yet7")
   276  	case REGEXP_AUTOMATON:
   277  		panic("not implemented yet8")
   278  	case REGEXP_INTERVAL:
   279  		panic("not implemented yet9")
   280  	default:
   281  		panic("not supported yet10")
   282  	}
   283  	return b
   284  }
   285  
   286  func makeUnion(exp1, exp2 *RegExp) *RegExp {
   287  	return &RegExp{
   288  		kind: REGEXP_UNION,
   289  		exp1: exp1,
   290  		exp2: exp2,
   291  	}
   292  }
   293  
   294  func makeConcatenation(exp1, exp2 *RegExp) *RegExp {
   295  	if (exp1.kind == REGEXP_CHAR || exp1.kind == REGEXP_STRING) &&
   296  		(exp2.kind == REGEXP_CHAR || exp2.kind == REGEXP_STRING) {
   297  		return makeString2RE(exp1, exp2)
   298  	}
   299  	r := &RegExp{kind: REGEXP_CONCATENATION}
   300  	if exp1.kind == REGEXP_CONCATENATION &&
   301  		(exp1.exp2.kind == REGEXP_CHAR || exp1.exp2.kind == REGEXP_STRING) &&
   302  		(exp2.kind == REGEXP_CHAR || exp2.kind == REGEXP_STRING) {
   303  		r.exp1 = exp1.exp1
   304  		r.exp2 = makeString2RE(exp1.exp2, exp2)
   305  	} else if (exp1.kind == REGEXP_CHAR || exp1.kind == REGEXP_STRING) &&
   306  		exp2.kind == REGEXP_CONCATENATION &&
   307  		(exp2.exp1.kind == REGEXP_CHAR || exp2.exp1.kind == REGEXP_STRING) {
   308  		r.exp1 = makeString2RE(exp1, exp2.exp1)
   309  		r.exp2 = exp2.exp2
   310  	} else {
   311  		r.exp1 = exp1
   312  		r.exp2 = exp2
   313  	}
   314  	return r
   315  }
   316  
   317  func makeString2RE(exp1, exp2 *RegExp) *RegExp {
   318  	var b bytes.Buffer
   319  	if exp1.kind == REGEXP_STRING {
   320  		b.WriteString(exp1.s)
   321  	} else {
   322  		assert(REGEXP_CHAR == exp1.kind)
   323  		b.WriteRune(rune(exp1.c))
   324  	}
   325  	if exp2.kind == REGEXP_STRING {
   326  		b.WriteString(exp1.s)
   327  	} else {
   328  		assert(REGEXP_CHAR == exp2.kind)
   329  		b.WriteRune(rune(exp2.c))
   330  	}
   331  	return makeStringRE(b.String())
   332  }
   333  
   334  func makeIntersection(exp1, exp2 *RegExp) *RegExp {
   335  	return &RegExp{
   336  		kind: REGEXP_INTERSECTION,
   337  		exp1: exp1,
   338  		exp2: exp2,
   339  	}
   340  }
   341  
   342  func makeOptional(exp *RegExp) *RegExp {
   343  	return &RegExp{
   344  		kind: REGEXP_OPTIONAL,
   345  		exp1: exp,
   346  	}
   347  }
   348  
   349  func makeRepeat(exp *RegExp) *RegExp {
   350  	return &RegExp{
   351  		kind: REGEXP_REPEAT,
   352  		exp1: exp,
   353  	}
   354  }
   355  
   356  func makeRepeatMin(exp *RegExp, min int) *RegExp {
   357  	return &RegExp{
   358  		kind: REGEXP_REPEAT_MIN,
   359  		exp1: exp,
   360  		min:  min,
   361  	}
   362  }
   363  
// makeRepeatRange is the placeholder constructor for a repetition of exp
// bounded by min and max. It is not implemented: every call panics with
// "not implemented yet". parseRepeatExp calls it for both the "{n}" and the
// "{n,m}" forms, so parsing either form currently panics.
func makeRepeatRange(exp *RegExp, min, max int) *RegExp {
	panic("not implemented yet")
}
   367  
   368  func makeComplement(exp *RegExp) *RegExp {
   369  	return &RegExp{
   370  		kind: REGEXP_COMPLEMENT,
   371  		exp1: exp,
   372  	}
   373  }
   374  
   375  func makeCharRE(c int) *RegExp {
   376  	return &RegExp{
   377  		kind: REGEXP_CHAR,
   378  		c:    c,
   379  	}
   380  }
   381  
   382  func makeCharRangeRE(from, to int) *RegExp {
   383  	assert2(from <= to, fmt.Sprintf("invalid range: from (%v) cannot be > to (%v)", from, to))
   384  	return &RegExp{
   385  		kind: REGEXP_CHAR_RANGE,
   386  		from: from,
   387  		to:   to,
   388  	}
   389  }
   390  
   391  func assert(ok bool) {
   392  	if !ok {
   393  		panic("assert fail")
   394  	}
   395  }
   396  
   397  func assert2(ok bool, msg string, args ...interface{}) {
   398  	if !ok {
   399  		panic(fmt.Sprintf(msg, args...))
   400  	}
   401  }
   402  
   403  func makeAnyCharRE() *RegExp {
   404  	return &RegExp{kind: REGEXP_ANYCHAR}
   405  }
   406  
   407  func makeEmptyRE() *RegExp {
   408  	return &RegExp{kind: REGEXP_EMPTY}
   409  }
   410  
   411  func makeStringRE(s string) *RegExp {
   412  	return &RegExp{kind: REGEXP_STRING, s: s}
   413  }
   414  
   415  func makeAnyStringRE() *RegExp {
   416  	return &RegExp{kind: REGEXP_STRING}
   417  }
   418  
   419  func (re *RegExp) peek(s string) bool {
   420  	return re.more() && strings.ContainsRune(s, re.b[re.pos])
   421  }
   422  
   423  func (re *RegExp) match(c rune) bool {
   424  	if re.pos >= len(re.b) {
   425  		return false
   426  	}
   427  	if re.b[re.pos] == c {
   428  		re.pos++
   429  		return true
   430  	}
   431  	return false
   432  }
   433  
   434  func (re *RegExp) more() bool {
   435  	return re.pos < len(re.b)
   436  }
   437  
   438  func (re *RegExp) next() int {
   439  	assert2(re.more(), "unexpected end-of-string")
   440  	ch := re.b[re.pos]
   441  	re.pos++
   442  	return int(ch) // int >= rune
   443  }
   444  
   445  func (re *RegExp) check(flag int) bool {
   446  	return (re.flags & flag) != 0
   447  }
   448  
   449  func (re *RegExp) parseUnionExp() *RegExp {
   450  	e := re.parseInterExp()
   451  	if re.match('|') {
   452  		e = makeUnion(e, re.parseUnionExp())
   453  	}
   454  	return e
   455  }
   456  
   457  func (re *RegExp) parseInterExp() *RegExp {
   458  	e := re.parseConcatExp()
   459  	if re.check(INTERSECTION) && re.match('&') {
   460  		e = makeIntersection(e, re.parseInterExp())
   461  	}
   462  	return e
   463  }
   464  
   465  func (re *RegExp) parseConcatExp() *RegExp {
   466  	e := re.parseRepeatExp()
   467  	if re.more() && !re.peek(")|") && (!re.check(INTERSECTION) || !re.peek("&")) {
   468  		e = makeConcatenation(e, re.parseConcatExp())
   469  	}
   470  	return e
   471  }
   472  
   473  func (re *RegExp) parseRepeatExp() *RegExp {
   474  	e := re.parseComplExp()
   475  	for re.peek("?*+{") {
   476  		if re.match('?') {
   477  			e = makeOptional(e)
   478  		} else if re.match('*') {
   479  			e = makeRepeat(e)
   480  		} else if re.match('+') {
   481  			e = makeRepeatMin(e, 1)
   482  		} else if re.match('{') {
   483  			start := re.pos
   484  			for re.peek("0123456789") {
   485  				re.next()
   486  			}
   487  			assert2(start != re.pos, fmt.Sprintf("integer expected at position %v", re.pos))
   488  			n, err := strconv.Atoi(string(re.b[start:re.pos]))
   489  			assertNoError(err)
   490  			m := -1
   491  			if re.match(',') {
   492  				start = re.pos
   493  				for re.peek("0123456789") {
   494  					re.next()
   495  				}
   496  				if start != re.pos {
   497  					m, err = strconv.Atoi(string(re.b[start:re.pos]))
   498  					assertNoError(err)
   499  				}
   500  			} else {
   501  				m = n
   502  			}
   503  			assert2(re.match('}'), fmt.Sprintf("expected '}' at position %v", re.pos))
   504  			if m == -1 {
   505  				e = makeRepeatMin(e, n)
   506  			} else {
   507  				e = makeRepeatRange(e, n, m)
   508  			}
   509  		}
   510  	}
   511  	return e
   512  }
   513  
   514  func assertNoError(err error) {
   515  	if err != nil {
   516  		panic(err)
   517  	}
   518  }
   519  
   520  func (re *RegExp) parseComplExp() *RegExp {
   521  	if re.check(COMPLEMENT) && re.match('~') {
   522  		return makeComplement(re.parseComplExp())
   523  	}
   524  	return re.parseCharClassExp()
   525  }
   526  
   527  func (re *RegExp) parseCharClassExp() *RegExp {
   528  	if re.match('[') {
   529  		negate := re.match('^')
   530  		e := re.parseCharClasses()
   531  		if negate {
   532  			e = makeIntersection(makeAnyCharRE(), makeComplement(e))
   533  		}
   534  		assert2(re.match(']'), fmt.Sprintf("expected ']' at position %v", re.pos))
   535  		return e
   536  	}
   537  	return re.parseSimpleExp()
   538  }
   539  
   540  func (re *RegExp) parseCharClasses() *RegExp {
   541  	e := re.parseCharClass()
   542  	for re.more() && !re.peek("]") {
   543  		e = makeUnion(e, re.parseCharClass())
   544  	}
   545  	return e
   546  }
   547  
   548  func (re *RegExp) parseCharClass() *RegExp {
   549  	c := re.parseCharExp()
   550  	if re.match('-') {
   551  		return makeCharRangeRE(c, re.parseCharExp())
   552  	}
   553  	return makeCharRE(c)
   554  }
   555  
   556  func (re *RegExp) parseSimpleExp() *RegExp {
   557  	if re.match('.') {
   558  		return makeAnyCharRE()
   559  	}
   560  	if re.check(EMPTY) && re.match('#') {
   561  		return makeEmptyRE()
   562  	}
   563  	if re.check(ANYSTRING) && re.match('@') {
   564  		return makeAnyStringRE()
   565  	}
   566  	if re.match('"') {
   567  		start := re.pos
   568  		for re.more() && !re.peek("\"") {
   569  			re.next()
   570  		}
   571  		if !re.match('"') {
   572  			panic(fmt.Sprintf("expected '\"' at position %v", re.pos))
   573  		}
   574  		return makeStringRE(string(re.b[start : re.pos-1]))
   575  	}
   576  	if re.match('(') {
   577  		if re.match(')') {
   578  			return makeStringRE("")
   579  		}
   580  		e := re.parseUnionExp()
   581  		if !re.match(')') {
   582  			panic(fmt.Sprintf("expected ')' at position %v", re.pos))
   583  		}
   584  		return e
   585  	}
   586  	if (re.check(AUTOMATON) || re.check(INTERVAL)) && re.match('<') {
   587  		panic("not implemented yet")
   588  	}
   589  	return makeCharRE(re.parseCharExp())
   590  }
   591  
   592  func (re *RegExp) parseCharExp() int {
   593  	re.match('\\')
   594  	return re.next()
   595  }
   596  
   597  // util/automaton/AutomatonProvider.java
   598  
// AutomatonProvider is the automaton provider for RegExp: a function that
// takes a name and returns an *Automaton. In this file a provider is only
// passed through RegExp.toAutomaton and never invoked.
// NOTE(review): presumably it is meant to resolve <identifier> references
// once named automata are implemented — confirm against the Lucene original.
type AutomatonProvider func(name string) *Automaton