github.com/itchyny/rassemble-go@v0.1.1/rassemble.go (about)

     1  // Package rassemble provides a method to assemble regular expressions.
     2  package rassemble
     3  
     4  import (
     5  	"regexp/syntax"
     6  	"sort"
     7  	"unicode"
     8  	_ "unsafe"
     9  )
    10  
    11  // Join patterns to build a regexp pattern.
    12  func Join(patterns []string) (string, error) {
    13  	var sub []*syntax.Regexp
    14  	for _, pattern := range patterns {
    15  		r, err := syntax.Parse(pattern, syntax.PerlX|syntax.ClassNL)
    16  		if err != nil {
    17  			return "", err
    18  		}
    19  		sub = add(sub, breakLiterals(r))
    20  	}
    21  	return mergeSuffix(alternate(sub...)).String(), nil
    22  }
    23  
    24  func breakLiterals(r *syntax.Regexp) *syntax.Regexp {
    25  	if r.Op == syntax.OpLiteral {
    26  		if len(r.Rune) <= 1 {
    27  			return r
    28  		}
    29  		sub := make([]*syntax.Regexp, len(r.Rune))
    30  		for i := range r.Rune {
    31  			sub[i] = &syntax.Regexp{
    32  				Op: syntax.OpLiteral, Flags: r.Flags, Rune: r.Rune[i : i+1],
    33  			}
    34  		}
    35  		return concat(sub...)
    36  	}
    37  	for i, rr := range r.Sub {
    38  		r.Sub[i] = breakLiterals(rr)
    39  	}
    40  	if r.Op == syntax.OpConcat {
    41  		r = flattenConcat(r)
    42  	}
    43  	return r
    44  }
    45  
    46  func add(sub []*syntax.Regexp, r2 *syntax.Regexp) []*syntax.Regexp {
    47  	if r2.Op == syntax.OpAlternate {
    48  		for _, r2 := range r2.Sub {
    49  			sub = add(sub, r2)
    50  		}
    51  		return sub
    52  	}
    53  	for i, r1 := range sub {
    54  		if r1.Equal(r2) {
    55  			return sub
    56  		}
    57  		if r := mergePrefix(r1, r2); r != nil {
    58  			sub[i] = r
    59  			return sub
    60  		}
    61  	}
    62  	return append(sub, r2)
    63  }
    64  
    65  func mergePrefix(r1, r2 *syntax.Regexp) *syntax.Regexp {
    66  	if r1.Op > r2.Op {
    67  		r1, r2 = r2, r1
    68  	}
    69  	switch r1.Op {
    70  	case syntax.OpEmptyMatch:
    71  		switch r2.Op {
    72  		case syntax.OpLiteral, syntax.OpCharClass,
    73  			syntax.OpStar, syntax.OpPlus, syntax.OpQuest:
    74  			// (?:)|x+ => x*, etc.
    75  			return quest(r2)
    76  		}
    77  	case syntax.OpLiteral:
    78  		switch r2.Op {
    79  		case syntax.OpCharClass:
    80  			// a|[bc] => [a-c]
    81  			// (?i:a)|[bc] => [Aa-c]
    82  			return charClass(appendLiteral(r2.Rune, r1.Rune[0], r1.Flags))
    83  		case syntax.OpQuest:
    84  			if r2 := r2.Sub[0]; r2.Op == syntax.OpCharClass {
    85  				// a|[bc]? => [a-c]?
    86  				// (?i:a)|[bc]? => [Aa-c]?
    87  				return quest(charClass(appendLiteral(r2.Rune, r1.Rune[0], r1.Flags)))
    88  			}
    89  		}
    90  	case syntax.OpCharClass:
    91  		switch r2.Op {
    92  		case syntax.OpCharClass:
    93  			// [a-c]|[d-f] => [a-f]
    94  			return charClass(append(r1.Rune, r2.Rune...))
    95  		case syntax.OpQuest:
    96  			switch r2 := r2.Sub[0]; r2.Op {
    97  			case syntax.OpLiteral:
    98  				// [ab]|c? => [a-c]?
    99  				// [ab]|(?i:c)? => [Ca-c]?
   100  				return quest(charClass(appendLiteral(r1.Rune, r2.Rune[0], r2.Flags)))
   101  			case syntax.OpCharClass:
   102  				// [ab]|[cd]? => [a-d]?
   103  				return quest(charClass(append(r1.Rune, r2.Rune...)))
   104  			}
   105  		}
   106  	case syntax.OpStar, syntax.OpPlus, syntax.OpQuest:
   107  		if r1.Sub[0].Equal(r2) {
   108  			// x*|x => x*
   109  			// x+|x => x+
   110  			// x?|x => x?
   111  			return r1
   112  		}
   113  		if r1.Op < r2.Op && r2.Op <= syntax.OpQuest && r1.Sub[0].Equal(r2.Sub[0]) {
   114  			// x*|x+ => x*
   115  			// x*|x? => x*
   116  			// x+|x? => x*
   117  			return &syntax.Regexp{Op: syntax.OpStar, Sub: r1.Sub}
   118  		}
   119  	case syntax.OpConcat:
   120  		return mergePrefixConcat(r1, r2)
   121  	}
   122  	switch r2.Op {
   123  	case syntax.OpConcat:
   124  		return mergePrefixConcat(r2, r1)
   125  	case syntax.OpStar, syntax.OpPlus, syntax.OpQuest:
   126  		if r1.Equal(r2.Sub[0]) {
   127  			// x|x* => x*
   128  			// x|x? => x?
   129  			// x|x+ => x+
   130  			return r2
   131  		}
   132  	}
   133  	return nil
   134  }
   135  
   136  func mergePrefixConcat(r1, r2 *syntax.Regexp) *syntax.Regexp {
   137  	if r2.Op == syntax.OpConcat {
   138  		var i int
   139  		for ; i < len(r1.Sub) && i < len(r2.Sub); i++ {
   140  			if !r1.Sub[i].Equal(r2.Sub[i]) {
   141  				break
   142  			}
   143  		}
   144  		if i > 0 {
   145  			// x*y*z*w*|x*y*u*v* => x*y*(?:z*w*|u*v*)
   146  			return concat(
   147  				append(
   148  					append(make([]*syntax.Regexp, 0, i+1), r1.Sub[:i]...),
   149  					alternate(concat(r1.Sub[i:]...), concat(r2.Sub[i:]...)),
   150  				)...,
   151  			)
   152  		}
   153  	} else if r1.Sub[0].Equal(r2) {
   154  		// x*y*z*|x* => x*(?:y*z*)?
   155  		return concat(r2, quest(concat(r1.Sub[1:]...)))
   156  	}
   157  	return nil
   158  }
   159  
   160  func mergeSuffix(r *syntax.Regexp) *syntax.Regexp {
   161  	for i, rr := range r.Sub {
   162  		r.Sub[i] = mergeSuffix(rr)
   163  	}
   164  	switch r.Op {
   165  	case syntax.OpAlternate:
   166  		sub, k, rs, merge := r.Sub, -1, r.Rune0[:0], false
   167  		for i := 0; i < len(sub); i++ {
   168  			r1 := sub[i]
   169  			for j := i + 1; j < len(sub); j++ {
   170  				r2 := sub[j]
   171  				if r := mergeSuffixConcat(r1, r2); r != nil {
   172  					r1, j, sub = r, j-1, append(sub[:j], sub[j+1:]...)
   173  				}
   174  			}
   175  			if r1 != sub[i] {
   176  				sub[i] = mergeSuffix(r1)
   177  				continue
   178  			}
   179  			// merge literals and character classes here
   180  			// to prefer ax?|bx?|cx? over [abc]|ax|bx|cx
   181  			switch r1.Op {
   182  			case syntax.OpLiteral:
   183  				rs = appendLiteral(rs, r1.Rune[0], r1.Flags)
   184  			case syntax.OpCharClass:
   185  				rs = append(rs, r1.Rune...)
   186  			default:
   187  				continue
   188  			}
   189  			if k < 0 {
   190  				k = i
   191  			} else {
   192  				i, sub, merge = i-1, append(sub[:i], sub[i+1:]...), true
   193  			}
   194  		}
   195  		if merge {
   196  			// (?:a|b|[c-e]) => [a-e]
   197  			sub[k] = charClass(rs)
   198  		}
   199  		return alternate(sub...)
   200  	case syntax.OpQuest:
   201  		if r := r.Sub[0]; r.Op == syntax.OpAlternate {
   202  			for i, rr := range r.Sub {
   203  				if rr.Op == syntax.OpLiteral {
   204  					for _, rs := range r.Sub {
   205  						if rs.Op == syntax.OpConcat &&
   206  							rs.Sub[len(rs.Sub)-1].Op == syntax.OpQuest &&
   207  							rr.Equal(rs.Sub[len(rs.Sub)-1].Sub[0]) {
   208  							// (?:ab?|b)? => (?:ab?|b?) => a?b?
   209  							r.Sub[i] = quest(rr)
   210  							return mergeSuffix(r)
   211  						}
   212  					}
   213  				}
   214  			}
   215  		}
   216  		return r
   217  	case syntax.OpConcat:
   218  		return flattenConcat(r)
   219  	default:
   220  		return r
   221  	}
   222  }
   223  
   224  func mergeSuffixConcat(r1, r2 *syntax.Regexp) *syntax.Regexp {
   225  	if r1.Op != syntax.OpConcat {
   226  		if r2.Op != syntax.OpConcat {
   227  			return nil
   228  		}
   229  		r1, r2 = r2, r1
   230  	}
   231  	if r2.Op == syntax.OpConcat {
   232  		var i int
   233  		for ; i < len(r1.Sub) && i < len(r2.Sub); i++ {
   234  			if !r1.Sub[len(r1.Sub)-1-i].Equal(r2.Sub[len(r2.Sub)-1-i]) {
   235  				break
   236  			}
   237  		}
   238  		if i > 0 {
   239  			// x*y*z*w*|u*v*z*w* => (?:x*y*|u*v*)z*w*
   240  			return concat(
   241  				append(
   242  					[]*syntax.Regexp{
   243  						alternate(
   244  							concat(r1.Sub[:len(r1.Sub)-i]...),
   245  							concat(r2.Sub[:len(r2.Sub)-i]...),
   246  						),
   247  					},
   248  					r1.Sub[len(r1.Sub)-i:]...,
   249  				)...,
   250  			)
   251  		}
   252  	} else if r1.Sub[len(r1.Sub)-1].Equal(r2) {
   253  		// x*y*z*|z* => (?:x*y*)?z*
   254  		return concat(quest(concat(r1.Sub[:len(r1.Sub)-1]...)), r2)
   255  	}
   256  	return nil
   257  }
   258  
   259  func flattenConcat(r *syntax.Regexp) *syntax.Regexp {
   260  	n := len(r.Sub)
   261  	for _, rr := range r.Sub {
   262  		if rr.Op == syntax.OpConcat {
   263  			n += len(rr.Sub) - 1
   264  		}
   265  	}
   266  	sub := make([]*syntax.Regexp, 0, n)
   267  	for _, rr := range r.Sub {
   268  		if rr.Op == syntax.OpConcat {
   269  			sub = append(sub, rr.Sub...)
   270  		} else {
   271  			sub = append(sub, rr)
   272  		}
   273  	}
   274  	return concat(sub...)
   275  }
   276  
   277  func concat(sub ...*syntax.Regexp) *syntax.Regexp {
   278  	switch len(sub) {
   279  	case 0:
   280  		return &syntax.Regexp{Op: syntax.OpEmptyMatch}
   281  	case 1:
   282  		return sub[0]
   283  	default:
   284  		return &syntax.Regexp{Op: syntax.OpConcat, Sub: sub}
   285  	}
   286  }
   287  
   288  func alternate(sub ...*syntax.Regexp) *syntax.Regexp {
   289  	switch len(sub) {
   290  	case 1:
   291  		return sub[0]
   292  	case 2:
   293  		r1, r2 := sub[0], sub[1]
   294  		if r := mergePrefix(r1, r2); r != nil {
   295  			return r
   296  		}
   297  		if r2.Op == syntax.OpEmptyMatch {
   298  			// x*y*|(?:) => (?:x*y*)?
   299  			return quest(r1)
   300  		}
   301  		switch r1.Op {
   302  		case syntax.OpEmptyMatch:
   303  			// (?:)|x*y* => (?:x*y*)?
   304  			return quest(r2)
   305  		case syntax.OpAlternate:
   306  			// (?:x*|y*)|z* => x*|y*|z*
   307  			return alternate(add(r1.Sub, r2)...)
   308  		case syntax.OpQuest:
   309  			// x?|y* => (?:x|y*)?
   310  			return quest(alternate(r1.Sub[0], r2))
   311  		}
   312  		fallthrough
   313  	default:
   314  		return &syntax.Regexp{Op: syntax.OpAlternate, Sub: sub}
   315  	}
   316  }
   317  
   318  func quest(r *syntax.Regexp) *syntax.Regexp {
   319  	switch r.Op {
   320  	case syntax.OpQuest, syntax.OpStar:
   321  		// (?:x?)? => x?
   322  		// (?:x*)? => x*
   323  		return r
   324  	case syntax.OpPlus:
   325  		// (?:x+)? => x*
   326  		return &syntax.Regexp{Op: syntax.OpStar, Sub: r.Sub}
   327  	case syntax.OpAlternate:
   328  		for i, rr := range r.Sub {
   329  			switch rr.Op {
   330  			case syntax.OpQuest, syntax.OpStar:
   331  				// (?:x|y?|z)? => x|y?|z
   332  				// (?:x|y*|z)? => x|y*|z
   333  				return r
   334  			case syntax.OpPlus:
   335  				// (?:x|y+|z)? => x|y*|z
   336  				r.Sub[i].Op = syntax.OpStar
   337  				return r
   338  			}
   339  		}
   340  		fallthrough
   341  	default:
   342  		return &syntax.Regexp{Op: syntax.OpQuest, Sub: []*syntax.Regexp{r}}
   343  	}
   344  }
   345  
   346  type charClassSlice []rune
   347  
   348  func (rs charClassSlice) Len() int {
   349  	return len(rs) / 2
   350  }
   351  func (rs charClassSlice) Less(i, j int) bool {
   352  	return rs[i*2] < rs[j*2]
   353  }
   354  func (rs charClassSlice) Swap(i, j int) {
   355  	i, j = i*2, j*2
   356  	rs[i], rs[i+1], rs[j], rs[j+1] = rs[j], rs[j+1], rs[i], rs[i+1]
   357  }
   358  
   359  func charClass(rs []rune) *syntax.Regexp {
   360  	sort.Sort(charClassSlice(rs))
   361  	var i int
   362  	for j := 2; j < len(rs); j += 2 {
   363  		switch {
   364  		case rs[i+1] >= rs[j]:
   365  			if rs[i+1] < rs[j+1] {
   366  				// [a-dc-e] => [a-e]
   367  				rs[i+1] = rs[j+1]
   368  			}
   369  		case rs[i+1]+1 == rs[j]:
   370  			switch {
   371  			case i > 0 && rs[i-1]+1 == rs[i]:
   372  				// [abc-e] => [a-e]
   373  				i -= 2
   374  				fallthrough
   375  			case rs[i] < rs[i+1] || rs[j] < rs[j+1]:
   376  				// [a-de], [ab-e] => [a-e]
   377  				rs[i+1] = rs[j+1]
   378  				continue
   379  			}
   380  			// [ab] =/> [a-b]
   381  			fallthrough
   382  		default:
   383  			if i += 2; i != j {
   384  				rs[i], rs[i+1] = rs[j], rs[j+1]
   385  			}
   386  		}
   387  	}
   388  	rs = rs[:i+2]
   389  	if len(rs) == 2 && rs[0] == 0 && rs[1] == unicode.MaxRune {
   390  		// [^a]|a => (?s:.)
   391  		return &syntax.Regexp{Op: syntax.OpAnyChar}
   392  	}
   393  	return &syntax.Regexp{Op: syntax.OpCharClass, Rune: rs}
   394  }
   395  
   396  //go:linkname appendLiteral regexp/syntax.appendLiteral
   397  func appendLiteral([]rune, rune, syntax.Flags) []rune