github.com/corona10/go@v0.0.0-20180224231303-7a218942be57/src/regexp/syntax/regexp.go (about)

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package syntax
     6  
     7  // Note to implementers:
     8  // In this package, re is always a *Regexp and r is always a rune.
     9  
    10  import (
    11  	"bytes"
    12  	"strconv"
    13  	"strings"
    14  	"unicode"
    15  )
    16  
    17  // A Regexp is a node in a regular expression syntax tree.
        // Which of the fields below carry meaning depends on Op; the per-field
        // comments name the operators that use each one.
    18  type Regexp struct {
    19  	Op       Op // operator
    20  	Flags    Flags      // flag bits for this node; this file tests FoldCase, NonGreedy and WasDollar
    21  	Sub      []*Regexp  // subexpressions, if any
    22  	Sub0     [1]*Regexp // storage for short Sub
    23  	Rune     []rune     // matched runes, for OpLiteral, OpCharClass (for OpCharClass, a list of lo, hi range pairs)
    24  	Rune0    [2]rune    // storage for short Rune
    25  	Min, Max int        // min, max for OpRepeat (Max == -1 means no upper limit)
    26  	Cap      int        // capturing index, for OpCapture
    27  	Name     string     // capturing name, for OpCapture; empty when the group is unnamed
    28  }
    29  
    30  //go:generate stringer -type Op -trimprefix Op
    31  
    32  // An Op is a single regular expression operator.
    33  type Op uint8
    34  
    35  // Operators are listed in precedence order, tightest binding to weakest.
    36  // Character class operators are listed simplest to most complex
    37  // (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
        //
        // The order is significant: writeRegexp compares operators numerically
        // (sub.Op > OpCapture) to decide when an operand needs a (?:...) group.
        // Numbering starts at 1, so the zero Op is not one of these operators.
    38  
    39  const (
    40  	OpNoMatch        Op = 1 + iota // matches no strings
    41  	OpEmptyMatch                   // matches empty string
    42  	OpLiteral                      // matches the sequence of runes in Rune
    43  	OpCharClass                    // matches Rune interpreted as a list of lo, hi range pairs
    44  	OpAnyCharNotNL                 // matches any character except newline
    45  	OpAnyChar                      // matches any character
    46  	OpBeginLine                    // matches empty string at beginning of line
    47  	OpEndLine                      // matches empty string at end of line
    48  	OpBeginText                    // matches empty string at beginning of text
    49  	OpEndText                      // matches empty string at end of text
    50  	OpWordBoundary                 // matches word boundary `\b`
    51  	OpNoWordBoundary               // matches word non-boundary `\B`
    52  	OpCapture                      // capturing subexpression with index Cap, optional name Name
    53  	OpStar                         // matches Sub[0] zero or more times
    54  	OpPlus                         // matches Sub[0] one or more times
    55  	OpQuest                        // matches Sub[0] zero or one times
    56  	OpRepeat                       // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
    57  	OpConcat                       // matches concatenation of Subs
    58  	OpAlternate                    // matches alternation of Subs
    59  )
    60  
        // opPseudo marks the start of the Op values set aside for pseudo-operators;
        // every operator declared above has a smaller value. The pseudo-operators
        // themselves are not declared in this part of the file.
    61  const opPseudo Op = 128 // where pseudo-ops start
    62  
    63  // Equal returns true if x and y have identical structure.
    64  func (x *Regexp) Equal(y *Regexp) bool {
    65  	if x == nil || y == nil {
    66  		return x == y
    67  	}
    68  	if x.Op != y.Op {
    69  		return false
    70  	}
    71  	switch x.Op {
    72  	case OpEndText:
    73  		// The parse flags remember whether this is \z or \Z.
    74  		if x.Flags&WasDollar != y.Flags&WasDollar {
    75  			return false
    76  		}
    77  
    78  	case OpLiteral, OpCharClass:
    79  		if len(x.Rune) != len(y.Rune) {
    80  			return false
    81  		}
    82  		for i, r := range x.Rune {
    83  			if r != y.Rune[i] {
    84  				return false
    85  			}
    86  		}
    87  
    88  	case OpAlternate, OpConcat:
    89  		if len(x.Sub) != len(y.Sub) {
    90  			return false
    91  		}
    92  		for i, sub := range x.Sub {
    93  			if !sub.Equal(y.Sub[i]) {
    94  				return false
    95  			}
    96  		}
    97  
    98  	case OpStar, OpPlus, OpQuest:
    99  		if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
   100  			return false
   101  		}
   102  
   103  	case OpRepeat:
   104  		if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
   105  			return false
   106  		}
   107  
   108  	case OpCapture:
   109  		if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
   110  			return false
   111  		}
   112  	}
   113  	return true
   114  }
   115  
   116  // writeRegexp writes the Perl syntax for the regular expression re to b.
   117  func writeRegexp(b *bytes.Buffer, re *Regexp) {
   118  	switch re.Op {
   119  	default:
   120  		b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
   121  	case OpNoMatch:
   122  		b.WriteString(`[^\x00-\x{10FFFF}]`)
   123  	case OpEmptyMatch:
   124  		b.WriteString(`(?:)`)
   125  	case OpLiteral:
   126  		if re.Flags&FoldCase != 0 {
   127  			b.WriteString(`(?i:`)
   128  		}
   129  		for _, r := range re.Rune {
   130  			escape(b, r, false)
   131  		}
   132  		if re.Flags&FoldCase != 0 {
   133  			b.WriteString(`)`)
   134  		}
   135  	case OpCharClass:
   136  		if len(re.Rune)%2 != 0 {
   137  			b.WriteString(`[invalid char class]`)
   138  			break
   139  		}
   140  		b.WriteRune('[')
   141  		if len(re.Rune) == 0 {
   142  			b.WriteString(`^\x00-\x{10FFFF}`)
   143  		} else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune {
   144  			// Contains 0 and MaxRune. Probably a negated class.
   145  			// Print the gaps.
   146  			b.WriteRune('^')
   147  			for i := 1; i < len(re.Rune)-1; i += 2 {
   148  				lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
   149  				escape(b, lo, lo == '-')
   150  				if lo != hi {
   151  					b.WriteRune('-')
   152  					escape(b, hi, hi == '-')
   153  				}
   154  			}
   155  		} else {
   156  			for i := 0; i < len(re.Rune); i += 2 {
   157  				lo, hi := re.Rune[i], re.Rune[i+1]
   158  				escape(b, lo, lo == '-')
   159  				if lo != hi {
   160  					b.WriteRune('-')
   161  					escape(b, hi, hi == '-')
   162  				}
   163  			}
   164  		}
   165  		b.WriteRune(']')
   166  	case OpAnyCharNotNL:
   167  		b.WriteString(`(?-s:.)`)
   168  	case OpAnyChar:
   169  		b.WriteString(`(?s:.)`)
   170  	case OpBeginLine:
   171  		b.WriteString(`(?m:^)`)
   172  	case OpEndLine:
   173  		b.WriteString(`(?m:$)`)
   174  	case OpBeginText:
   175  		b.WriteString(`\A`)
   176  	case OpEndText:
   177  		if re.Flags&WasDollar != 0 {
   178  			b.WriteString(`(?-m:$)`)
   179  		} else {
   180  			b.WriteString(`\z`)
   181  		}
   182  	case OpWordBoundary:
   183  		b.WriteString(`\b`)
   184  	case OpNoWordBoundary:
   185  		b.WriteString(`\B`)
   186  	case OpCapture:
   187  		if re.Name != "" {
   188  			b.WriteString(`(?P<`)
   189  			b.WriteString(re.Name)
   190  			b.WriteRune('>')
   191  		} else {
   192  			b.WriteRune('(')
   193  		}
   194  		if re.Sub[0].Op != OpEmptyMatch {
   195  			writeRegexp(b, re.Sub[0])
   196  		}
   197  		b.WriteRune(')')
   198  	case OpStar, OpPlus, OpQuest, OpRepeat:
   199  		if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
   200  			b.WriteString(`(?:`)
   201  			writeRegexp(b, sub)
   202  			b.WriteString(`)`)
   203  		} else {
   204  			writeRegexp(b, sub)
   205  		}
   206  		switch re.Op {
   207  		case OpStar:
   208  			b.WriteRune('*')
   209  		case OpPlus:
   210  			b.WriteRune('+')
   211  		case OpQuest:
   212  			b.WriteRune('?')
   213  		case OpRepeat:
   214  			b.WriteRune('{')
   215  			b.WriteString(strconv.Itoa(re.Min))
   216  			if re.Max != re.Min {
   217  				b.WriteRune(',')
   218  				if re.Max >= 0 {
   219  					b.WriteString(strconv.Itoa(re.Max))
   220  				}
   221  			}
   222  			b.WriteRune('}')
   223  		}
   224  		if re.Flags&NonGreedy != 0 {
   225  			b.WriteRune('?')
   226  		}
   227  	case OpConcat:
   228  		for _, sub := range re.Sub {
   229  			if sub.Op == OpAlternate {
   230  				b.WriteString(`(?:`)
   231  				writeRegexp(b, sub)
   232  				b.WriteString(`)`)
   233  			} else {
   234  				writeRegexp(b, sub)
   235  			}
   236  		}
   237  	case OpAlternate:
   238  		for i, sub := range re.Sub {
   239  			if i > 0 {
   240  				b.WriteRune('|')
   241  			}
   242  			writeRegexp(b, sub)
   243  		}
   244  	}
   245  }
   246  
   247  func (re *Regexp) String() string {
   248  	var b bytes.Buffer
   249  	writeRegexp(&b, re)
   250  	return b.String()
   251  }
   252  
   253  const meta = `\.+*?()|[]{}^$`
   254  
   255  func escape(b *bytes.Buffer, r rune, force bool) {
   256  	if unicode.IsPrint(r) {
   257  		if strings.ContainsRune(meta, r) || force {
   258  			b.WriteRune('\\')
   259  		}
   260  		b.WriteRune(r)
   261  		return
   262  	}
   263  
   264  	switch r {
   265  	case '\a':
   266  		b.WriteString(`\a`)
   267  	case '\f':
   268  		b.WriteString(`\f`)
   269  	case '\n':
   270  		b.WriteString(`\n`)
   271  	case '\r':
   272  		b.WriteString(`\r`)
   273  	case '\t':
   274  		b.WriteString(`\t`)
   275  	case '\v':
   276  		b.WriteString(`\v`)
   277  	default:
   278  		if r < 0x100 {
   279  			b.WriteString(`\x`)
   280  			s := strconv.FormatInt(int64(r), 16)
   281  			if len(s) == 1 {
   282  				b.WriteRune('0')
   283  			}
   284  			b.WriteString(s)
   285  			break
   286  		}
   287  		b.WriteString(`\x{`)
   288  		b.WriteString(strconv.FormatInt(int64(r), 16))
   289  		b.WriteString(`}`)
   290  	}
   291  }
   292  
   293  // MaxCap walks the regexp to find the maximum capture index.
   294  func (re *Regexp) MaxCap() int {
   295  	m := 0
   296  	if re.Op == OpCapture {
   297  		m = re.Cap
   298  	}
   299  	for _, sub := range re.Sub {
   300  		if n := sub.MaxCap(); m < n {
   301  			m = n
   302  		}
   303  	}
   304  	return m
   305  }
   306  
   307  // CapNames walks the regexp to find the names of capturing groups.
   308  func (re *Regexp) CapNames() []string {
   309  	names := make([]string, re.MaxCap()+1)
   310  	re.capNames(names)
   311  	return names
   312  }
   313  
   314  func (re *Regexp) capNames(names []string) {
   315  	if re.Op == OpCapture {
   316  		names[re.Cap] = re.Name
   317  	}
   318  	for _, sub := range re.Sub {
   319  		sub.capNames(names)
   320  	}
   321  }