github.com/euank/go@v0.0.0-20160829210321-495514729181/src/regexp/syntax/regexp.go (about)

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package syntax
     6  
     7  // Note to implementers:
     8  // In this package, re is always a *Regexp and r is always a rune.
     9  
    10  import (
    11  	"bytes"
    12  	"strconv"
    13  	"strings"
    14  	"unicode"
    15  )
    16  
    17  // A Regexp is a node in a regular expression syntax tree.
    18  type Regexp struct {
    19  	Op       Op // operator
    20  	Flags    Flags
    21  	Sub      []*Regexp  // subexpressions, if any
    22  	Sub0     [1]*Regexp // storage for short Sub
    23  	Rune     []rune     // matched runes, for OpLiteral, OpCharClass
    24  	Rune0    [2]rune    // storage for short Rune
    25  	Min, Max int        // min, max for OpRepeat
    26  	Cap      int        // capturing index, for OpCapture
    27  	Name     string     // capturing name, for OpCapture
    28  }
    29  
    30  // An Op is a single regular expression operator.
    31  type Op uint8
    32  
    33  // Operators are listed in precedence order, tightest binding to weakest.
    34  // Character class operators are listed simplest to most complex
    35  // (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
    36  
    37  const (
    38  	OpNoMatch        Op = 1 + iota // matches no strings
    39  	OpEmptyMatch                   // matches empty string
    40  	OpLiteral                      // matches Runes sequence
    41  	OpCharClass                    // matches Runes interpreted as range pair list
    42  	OpAnyCharNotNL                 // matches any character except newline
    43  	OpAnyChar                      // matches any character
    44  	OpBeginLine                    // matches empty string at beginning of line
    45  	OpEndLine                      // matches empty string at end of line
    46  	OpBeginText                    // matches empty string at beginning of text
    47  	OpEndText                      // matches empty string at end of text
    48  	OpWordBoundary                 // matches word boundary `\b`
    49  	OpNoWordBoundary               // matches word non-boundary `\B`
    50  	OpCapture                      // capturing subexpression with index Cap, optional name Name
    51  	OpStar                         // matches Sub[0] zero or more times
    52  	OpPlus                         // matches Sub[0] one or more times
    53  	OpQuest                        // matches Sub[0] zero or one times
    54  	OpRepeat                       // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
    55  	OpConcat                       // matches concatenation of Subs
    56  	OpAlternate                    // matches alternation of Subs
    57  )
    58  
    59  const opPseudo Op = 128 // where pseudo-ops start
    60  
    61  // Equal returns true if x and y have identical structure.
    62  func (x *Regexp) Equal(y *Regexp) bool {
    63  	if x == nil || y == nil {
    64  		return x == y
    65  	}
    66  	if x.Op != y.Op {
    67  		return false
    68  	}
    69  	switch x.Op {
    70  	case OpEndText:
    71  		// The parse flags remember whether this is \z or \Z.
    72  		if x.Flags&WasDollar != y.Flags&WasDollar {
    73  			return false
    74  		}
    75  
    76  	case OpLiteral, OpCharClass:
    77  		if len(x.Rune) != len(y.Rune) {
    78  			return false
    79  		}
    80  		for i, r := range x.Rune {
    81  			if r != y.Rune[i] {
    82  				return false
    83  			}
    84  		}
    85  
    86  	case OpAlternate, OpConcat:
    87  		if len(x.Sub) != len(y.Sub) {
    88  			return false
    89  		}
    90  		for i, sub := range x.Sub {
    91  			if !sub.Equal(y.Sub[i]) {
    92  				return false
    93  			}
    94  		}
    95  
    96  	case OpStar, OpPlus, OpQuest:
    97  		if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
    98  			return false
    99  		}
   100  
   101  	case OpRepeat:
   102  		if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
   103  			return false
   104  		}
   105  
   106  	case OpCapture:
   107  		if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
   108  			return false
   109  		}
   110  	}
   111  	return true
   112  }
   113  
   114  // writeRegexp writes the Perl syntax for the regular expression re to b.
   115  func writeRegexp(b *bytes.Buffer, re *Regexp) {
   116  	switch re.Op {
   117  	default:
   118  		b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
   119  	case OpNoMatch:
   120  		b.WriteString(`[^\x00-\x{10FFFF}]`)
   121  	case OpEmptyMatch:
   122  		b.WriteString(`(?:)`)
   123  	case OpLiteral:
   124  		if re.Flags&FoldCase != 0 {
   125  			b.WriteString(`(?i:`)
   126  		}
   127  		for _, r := range re.Rune {
   128  			escape(b, r, false)
   129  		}
   130  		if re.Flags&FoldCase != 0 {
   131  			b.WriteString(`)`)
   132  		}
   133  	case OpCharClass:
   134  		if len(re.Rune)%2 != 0 {
   135  			b.WriteString(`[invalid char class]`)
   136  			break
   137  		}
   138  		b.WriteRune('[')
   139  		if len(re.Rune) == 0 {
   140  			b.WriteString(`^\x00-\x{10FFFF}`)
   141  		} else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune {
   142  			// Contains 0 and MaxRune. Probably a negated class.
   143  			// Print the gaps.
   144  			b.WriteRune('^')
   145  			for i := 1; i < len(re.Rune)-1; i += 2 {
   146  				lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
   147  				escape(b, lo, lo == '-')
   148  				if lo != hi {
   149  					b.WriteRune('-')
   150  					escape(b, hi, hi == '-')
   151  				}
   152  			}
   153  		} else {
   154  			for i := 0; i < len(re.Rune); i += 2 {
   155  				lo, hi := re.Rune[i], re.Rune[i+1]
   156  				escape(b, lo, lo == '-')
   157  				if lo != hi {
   158  					b.WriteRune('-')
   159  					escape(b, hi, hi == '-')
   160  				}
   161  			}
   162  		}
   163  		b.WriteRune(']')
   164  	case OpAnyCharNotNL:
   165  		b.WriteString(`(?-s:.)`)
   166  	case OpAnyChar:
   167  		b.WriteString(`(?s:.)`)
   168  	case OpBeginLine:
   169  		b.WriteString(`(?m:^)`)
   170  	case OpEndLine:
   171  		b.WriteString(`(?m:$)`)
   172  	case OpBeginText:
   173  		b.WriteString(`\A`)
   174  	case OpEndText:
   175  		if re.Flags&WasDollar != 0 {
   176  			b.WriteString(`(?-m:$)`)
   177  		} else {
   178  			b.WriteString(`\z`)
   179  		}
   180  	case OpWordBoundary:
   181  		b.WriteString(`\b`)
   182  	case OpNoWordBoundary:
   183  		b.WriteString(`\B`)
   184  	case OpCapture:
   185  		if re.Name != "" {
   186  			b.WriteString(`(?P<`)
   187  			b.WriteString(re.Name)
   188  			b.WriteRune('>')
   189  		} else {
   190  			b.WriteRune('(')
   191  		}
   192  		if re.Sub[0].Op != OpEmptyMatch {
   193  			writeRegexp(b, re.Sub[0])
   194  		}
   195  		b.WriteRune(')')
   196  	case OpStar, OpPlus, OpQuest, OpRepeat:
   197  		if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
   198  			b.WriteString(`(?:`)
   199  			writeRegexp(b, sub)
   200  			b.WriteString(`)`)
   201  		} else {
   202  			writeRegexp(b, sub)
   203  		}
   204  		switch re.Op {
   205  		case OpStar:
   206  			b.WriteRune('*')
   207  		case OpPlus:
   208  			b.WriteRune('+')
   209  		case OpQuest:
   210  			b.WriteRune('?')
   211  		case OpRepeat:
   212  			b.WriteRune('{')
   213  			b.WriteString(strconv.Itoa(re.Min))
   214  			if re.Max != re.Min {
   215  				b.WriteRune(',')
   216  				if re.Max >= 0 {
   217  					b.WriteString(strconv.Itoa(re.Max))
   218  				}
   219  			}
   220  			b.WriteRune('}')
   221  		}
   222  		if re.Flags&NonGreedy != 0 {
   223  			b.WriteRune('?')
   224  		}
   225  	case OpConcat:
   226  		for _, sub := range re.Sub {
   227  			if sub.Op == OpAlternate {
   228  				b.WriteString(`(?:`)
   229  				writeRegexp(b, sub)
   230  				b.WriteString(`)`)
   231  			} else {
   232  				writeRegexp(b, sub)
   233  			}
   234  		}
   235  	case OpAlternate:
   236  		for i, sub := range re.Sub {
   237  			if i > 0 {
   238  				b.WriteRune('|')
   239  			}
   240  			writeRegexp(b, sub)
   241  		}
   242  	}
   243  }
   244  
   245  func (re *Regexp) String() string {
   246  	var b bytes.Buffer
   247  	writeRegexp(&b, re)
   248  	return b.String()
   249  }
   250  
   251  const meta = `\.+*?()|[]{}^$`
   252  
   253  func escape(b *bytes.Buffer, r rune, force bool) {
   254  	if unicode.IsPrint(r) {
   255  		if strings.ContainsRune(meta, r) || force {
   256  			b.WriteRune('\\')
   257  		}
   258  		b.WriteRune(r)
   259  		return
   260  	}
   261  
   262  	switch r {
   263  	case '\a':
   264  		b.WriteString(`\a`)
   265  	case '\f':
   266  		b.WriteString(`\f`)
   267  	case '\n':
   268  		b.WriteString(`\n`)
   269  	case '\r':
   270  		b.WriteString(`\r`)
   271  	case '\t':
   272  		b.WriteString(`\t`)
   273  	case '\v':
   274  		b.WriteString(`\v`)
   275  	default:
   276  		if r < 0x100 {
   277  			b.WriteString(`\x`)
   278  			s := strconv.FormatInt(int64(r), 16)
   279  			if len(s) == 1 {
   280  				b.WriteRune('0')
   281  			}
   282  			b.WriteString(s)
   283  			break
   284  		}
   285  		b.WriteString(`\x{`)
   286  		b.WriteString(strconv.FormatInt(int64(r), 16))
   287  		b.WriteString(`}`)
   288  	}
   289  }
   290  
   291  // MaxCap walks the regexp to find the maximum capture index.
   292  func (re *Regexp) MaxCap() int {
   293  	m := 0
   294  	if re.Op == OpCapture {
   295  		m = re.Cap
   296  	}
   297  	for _, sub := range re.Sub {
   298  		if n := sub.MaxCap(); m < n {
   299  			m = n
   300  		}
   301  	}
   302  	return m
   303  }
   304  
   305  // CapNames walks the regexp to find the names of capturing groups.
   306  func (re *Regexp) CapNames() []string {
   307  	names := make([]string, re.MaxCap()+1)
   308  	re.capNames(names)
   309  	return names
   310  }
   311  
   312  func (re *Regexp) capNames(names []string) {
   313  	if re.Op == OpCapture {
   314  		names[re.Cap] = re.Name
   315  	}
   316  	for _, sub := range re.Sub {
   317  		sub.capNames(names)
   318  	}
   319  }