github.com/corona10/go@v0.0.0-20180224231303-7a218942be57/src/regexp/syntax/regexp.go (about) 1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package syntax 6 7 // Note to implementers: 8 // In this package, re is always a *Regexp and r is always a rune. 9 10 import ( 11 "bytes" 12 "strconv" 13 "strings" 14 "unicode" 15 ) 16 17 // A Regexp is a node in a regular expression syntax tree. 18 type Regexp struct { 19 Op Op // operator 20 Flags Flags 21 Sub []*Regexp // subexpressions, if any 22 Sub0 [1]*Regexp // storage for short Sub 23 Rune []rune // matched runes, for OpLiteral, OpCharClass 24 Rune0 [2]rune // storage for short Rune 25 Min, Max int // min, max for OpRepeat 26 Cap int // capturing index, for OpCapture 27 Name string // capturing name, for OpCapture 28 } 29 30 //go:generate stringer -type Op -trimprefix Op 31 32 // An Op is a single regular expression operator. 33 type Op uint8 34 35 // Operators are listed in precedence order, tightest binding to weakest. 36 // Character class operators are listed simplest to most complex 37 // (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar). 38 39 const ( 40 OpNoMatch Op = 1 + iota // matches no strings 41 OpEmptyMatch // matches empty string 42 OpLiteral // matches Runes sequence 43 OpCharClass // matches Runes interpreted as range pair list 44 OpAnyCharNotNL // matches any character except newline 45 OpAnyChar // matches any character 46 OpBeginLine // matches empty string at beginning of line 47 OpEndLine // matches empty string at end of line 48 OpBeginText // matches empty string at beginning of text 49 OpEndText // matches empty string at end of text 50 OpWordBoundary // matches word boundary `\b` 51 OpNoWordBoundary // matches word non-boundary `\B` 52 OpCapture // capturing subexpression with index Cap, optional name Name 53 OpStar // matches Sub[0] zero or more times 54 OpPlus // matches Sub[0] one or more times 55 OpQuest // matches Sub[0] zero or one times 56 OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit) 57 OpConcat // matches concatenation of Subs 58 OpAlternate // matches alternation of Subs 59 ) 60 61 const opPseudo Op = 128 // where pseudo-ops start 62 63 // Equal returns true if x and y have identical structure. 64 func (x *Regexp) Equal(y *Regexp) bool { 65 if x == nil || y == nil { 66 return x == y 67 } 68 if x.Op != y.Op { 69 return false 70 } 71 switch x.Op { 72 case OpEndText: 73 // The parse flags remember whether this is \z or \Z. 74 if x.Flags&WasDollar != y.Flags&WasDollar { 75 return false 76 } 77 78 case OpLiteral, OpCharClass: 79 if len(x.Rune) != len(y.Rune) { 80 return false 81 } 82 for i, r := range x.Rune { 83 if r != y.Rune[i] { 84 return false 85 } 86 } 87 88 case OpAlternate, OpConcat: 89 if len(x.Sub) != len(y.Sub) { 90 return false 91 } 92 for i, sub := range x.Sub { 93 if !sub.Equal(y.Sub[i]) { 94 return false 95 } 96 } 97 98 case OpStar, OpPlus, OpQuest: 99 if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) { 100 return false 101 } 102 103 case OpRepeat: 104 if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) { 105 return false 106 } 107 108 case OpCapture: 109 if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) { 110 return false 111 } 112 } 113 return true 114 } 115 116 // writeRegexp writes the Perl syntax for the regular expression re to b. 117 func writeRegexp(b *bytes.Buffer, re *Regexp) { 118 switch re.Op { 119 default: 120 b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">") 121 case OpNoMatch: 122 b.WriteString(`[^\x00-\x{10FFFF}]`) 123 case OpEmptyMatch: 124 b.WriteString(`(?:)`) 125 case OpLiteral: 126 if re.Flags&FoldCase != 0 { 127 b.WriteString(`(?i:`) 128 } 129 for _, r := range re.Rune { 130 escape(b, r, false) 131 } 132 if re.Flags&FoldCase != 0 { 133 b.WriteString(`)`) 134 } 135 case OpCharClass: 136 if len(re.Rune)%2 != 0 { 137 b.WriteString(`[invalid char class]`) 138 break 139 } 140 b.WriteRune('[') 141 if len(re.Rune) == 0 { 142 b.WriteString(`^\x00-\x{10FFFF}`) 143 } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune { 144 // Contains 0 and MaxRune. Probably a negated class. 145 // Print the gaps. 146 b.WriteRune('^') 147 for i := 1; i < len(re.Rune)-1; i += 2 { 148 lo, hi := re.Rune[i]+1, re.Rune[i+1]-1 149 escape(b, lo, lo == '-') 150 if lo != hi { 151 b.WriteRune('-') 152 escape(b, hi, hi == '-') 153 } 154 } 155 } else { 156 for i := 0; i < len(re.Rune); i += 2 { 157 lo, hi := re.Rune[i], re.Rune[i+1] 158 escape(b, lo, lo == '-') 159 if lo != hi { 160 b.WriteRune('-') 161 escape(b, hi, hi == '-') 162 } 163 } 164 } 165 b.WriteRune(']') 166 case OpAnyCharNotNL: 167 b.WriteString(`(?-s:.)`) 168 case OpAnyChar: 169 b.WriteString(`(?s:.)`) 170 case OpBeginLine: 171 b.WriteString(`(?m:^)`) 172 case OpEndLine: 173 b.WriteString(`(?m:$)`) 174 case OpBeginText: 175 b.WriteString(`\A`) 176 case OpEndText: 177 if re.Flags&WasDollar != 0 { 178 b.WriteString(`(?-m:$)`) 179 } else { 180 b.WriteString(`\z`) 181 } 182 case OpWordBoundary: 183 b.WriteString(`\b`) 184 case OpNoWordBoundary: 185 b.WriteString(`\B`) 186 case OpCapture: 187 if re.Name != "" { 188 b.WriteString(`(?P<`) 189 b.WriteString(re.Name) 190 b.WriteRune('>') 191 } else { 192 b.WriteRune('(') 193 } 194 if re.Sub[0].Op != OpEmptyMatch { 195 writeRegexp(b, re.Sub[0]) 196 } 197 b.WriteRune(')') 198 case OpStar, OpPlus, OpQuest, OpRepeat: 199 if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 { 200 b.WriteString(`(?:`) 201 writeRegexp(b, sub) 202 b.WriteString(`)`) 203 } else { 204 writeRegexp(b, sub) 205 } 206 switch re.Op { 207 case OpStar: 208 b.WriteRune('*') 209 case OpPlus: 210 b.WriteRune('+') 211 case OpQuest: 212 b.WriteRune('?') 213 case OpRepeat: 214 b.WriteRune('{') 215 b.WriteString(strconv.Itoa(re.Min)) 216 if re.Max != re.Min { 217 b.WriteRune(',') 218 if re.Max >= 0 { 219 b.WriteString(strconv.Itoa(re.Max)) 220 } 221 } 222 b.WriteRune('}') 223 } 224 if re.Flags&NonGreedy != 0 { 225 b.WriteRune('?') 226 } 227 case OpConcat: 228 for _, sub := range re.Sub { 229 if sub.Op == OpAlternate { 230 b.WriteString(`(?:`) 231 writeRegexp(b, sub) 232 b.WriteString(`)`) 233 } else { 234 writeRegexp(b, sub) 235 } 236 } 237 case OpAlternate: 238 for i, sub := range re.Sub { 239 if i > 0 { 240 b.WriteRune('|') 241 } 242 writeRegexp(b, sub) 243 } 244 } 245 } 246 247 func (re *Regexp) String() string { 248 var b bytes.Buffer 249 writeRegexp(&b, re) 250 return b.String() 251 } 252 253 const meta = `\.+*?()|[]{}^$` 254 255 func escape(b *bytes.Buffer, r rune, force bool) { 256 if unicode.IsPrint(r) { 257 if strings.ContainsRune(meta, r) || force { 258 b.WriteRune('\\') 259 } 260 b.WriteRune(r) 261 return 262 } 263 264 switch r { 265 case '\a': 266 b.WriteString(`\a`) 267 case '\f': 268 b.WriteString(`\f`) 269 case '\n': 270 b.WriteString(`\n`) 271 case '\r': 272 b.WriteString(`\r`) 273 case '\t': 274 b.WriteString(`\t`) 275 case '\v': 276 b.WriteString(`\v`) 277 default: 278 if r < 0x100 { 279 b.WriteString(`\x`) 280 s := strconv.FormatInt(int64(r), 16) 281 if len(s) == 1 { 282 b.WriteRune('0') 283 } 284 b.WriteString(s) 285 break 286 } 287 b.WriteString(`\x{`) 288 b.WriteString(strconv.FormatInt(int64(r), 16)) 289 b.WriteString(`}`) 290 } 291 } 292 293 // MaxCap walks the regexp to find the maximum capture index. 294 func (re *Regexp) MaxCap() int { 295 m := 0 296 if re.Op == OpCapture { 297 m = re.Cap 298 } 299 for _, sub := range re.Sub { 300 if n := sub.MaxCap(); m < n { 301 m = n 302 } 303 } 304 return m 305 } 306 307 // CapNames walks the regexp to find the names of capturing groups. 308 func (re *Regexp) CapNames() []string { 309 names := make([]string, re.MaxCap()+1) 310 re.capNames(names) 311 return names 312 } 313 314 func (re *Regexp) capNames(names []string) { 315 if re.Op == OpCapture { 316 names[re.Cap] = re.Name 317 } 318 for _, sub := range re.Sub { 319 sub.capNames(names) 320 } 321 }