github.com/ltltlt/go-source-code@v0.0.0-20190830023027-95be009773aa/regexp/syntax/regexp.go (about) 1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package syntax 6 7 // Note to implementers: 8 // In this package, re is always a *Regexp and r is always a rune. 9 10 import ( 11 "bytes" 12 "strconv" 13 "strings" 14 "unicode" 15 ) 16 17 // A Regexp is a node in a regular expression syntax tree. 18 type Regexp struct { 19 Op Op // operator 20 Flags Flags 21 Sub []*Regexp // subexpressions, if any 22 Sub0 [1]*Regexp // storage for short Sub 23 Rune []rune // matched runes, for OpLiteral, OpCharClass 24 Rune0 [2]rune // storage for short Rune 25 Min, Max int // min, max for OpRepeat 26 Cap int // capturing index, for OpCapture 27 Name string // capturing name, for OpCapture 28 } 29 30 // An Op is a single regular expression operator. 31 type Op uint8 32 33 // Operators are listed in precedence order, tightest binding to weakest. 34 // Character class operators are listed simplest to most complex 35 // (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar). 36 37 const ( 38 OpNoMatch Op = 1 + iota // matches no strings 39 OpEmptyMatch // matches empty string 40 OpLiteral // matches Runes sequence 41 OpCharClass // matches Runes interpreted as range pair list 42 OpAnyCharNotNL // matches any character except newline 43 OpAnyChar // matches any character 44 OpBeginLine // matches empty string at beginning of line 45 OpEndLine // matches empty string at end of line 46 OpBeginText // matches empty string at beginning of text 47 OpEndText // matches empty string at end of text 48 OpWordBoundary // matches word boundary `\b` 49 OpNoWordBoundary // matches word non-boundary `\B` 50 OpCapture // capturing subexpression with index Cap, optional name Name 51 OpStar // matches Sub[0] zero or more times 52 OpPlus // matches Sub[0] one or more times 53 OpQuest // matches Sub[0] zero or one times 54 OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit) 55 OpConcat // matches concatenation of Subs 56 OpAlternate // matches alternation of Subs 57 ) 58 59 const opPseudo Op = 128 // where pseudo-ops start 60 61 // Equal returns true if x and y have identical structure. 62 func (x *Regexp) Equal(y *Regexp) bool { 63 if x == nil || y == nil { 64 return x == y 65 } 66 if x.Op != y.Op { 67 return false 68 } 69 switch x.Op { 70 case OpEndText: 71 // The parse flags remember whether this is \z or \Z. 72 if x.Flags&WasDollar != y.Flags&WasDollar { 73 return false 74 } 75 76 case OpLiteral, OpCharClass: 77 if len(x.Rune) != len(y.Rune) { 78 return false 79 } 80 for i, r := range x.Rune { 81 if r != y.Rune[i] { 82 return false 83 } 84 } 85 86 case OpAlternate, OpConcat: 87 if len(x.Sub) != len(y.Sub) { 88 return false 89 } 90 for i, sub := range x.Sub { 91 if !sub.Equal(y.Sub[i]) { 92 return false 93 } 94 } 95 96 case OpStar, OpPlus, OpQuest: 97 if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) { 98 return false 99 } 100 101 case OpRepeat: 102 if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) { 103 return false 104 } 105 106 case OpCapture: 107 if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) { 108 return false 109 } 110 } 111 return true 112 } 113 114 // writeRegexp writes the Perl syntax for the regular expression re to b. 115 func writeRegexp(b *bytes.Buffer, re *Regexp) { 116 switch re.Op { 117 default: 118 b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">") 119 case OpNoMatch: 120 b.WriteString(`[^\x00-\x{10FFFF}]`) 121 case OpEmptyMatch: 122 b.WriteString(`(?:)`) 123 case OpLiteral: 124 if re.Flags&FoldCase != 0 { 125 b.WriteString(`(?i:`) 126 } 127 for _, r := range re.Rune { 128 escape(b, r, false) 129 } 130 if re.Flags&FoldCase != 0 { 131 b.WriteString(`)`) 132 } 133 case OpCharClass: 134 if len(re.Rune)%2 != 0 { 135 b.WriteString(`[invalid char class]`) 136 break 137 } 138 b.WriteRune('[') 139 if len(re.Rune) == 0 { 140 b.WriteString(`^\x00-\x{10FFFF}`) 141 } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune { 142 // Contains 0 and MaxRune. Probably a negated class. 143 // Print the gaps. 144 b.WriteRune('^') 145 for i := 1; i < len(re.Rune)-1; i += 2 { 146 lo, hi := re.Rune[i]+1, re.Rune[i+1]-1 147 escape(b, lo, lo == '-') 148 if lo != hi { 149 b.WriteRune('-') 150 escape(b, hi, hi == '-') 151 } 152 } 153 } else { 154 for i := 0; i < len(re.Rune); i += 2 { 155 lo, hi := re.Rune[i], re.Rune[i+1] 156 escape(b, lo, lo == '-') 157 if lo != hi { 158 b.WriteRune('-') 159 escape(b, hi, hi == '-') 160 } 161 } 162 } 163 b.WriteRune(']') 164 case OpAnyCharNotNL: 165 b.WriteString(`(?-s:.)`) 166 case OpAnyChar: 167 b.WriteString(`(?s:.)`) 168 case OpBeginLine: 169 b.WriteString(`(?m:^)`) 170 case OpEndLine: 171 b.WriteString(`(?m:$)`) 172 case OpBeginText: 173 b.WriteString(`\A`) 174 case OpEndText: 175 if re.Flags&WasDollar != 0 { 176 b.WriteString(`(?-m:$)`) 177 } else { 178 b.WriteString(`\z`) 179 } 180 case OpWordBoundary: 181 b.WriteString(`\b`) 182 case OpNoWordBoundary: 183 b.WriteString(`\B`) 184 case OpCapture: 185 if re.Name != "" { 186 b.WriteString(`(?P<`) 187 b.WriteString(re.Name) 188 b.WriteRune('>') 189 } else { 190 b.WriteRune('(') 191 } 192 if re.Sub[0].Op != OpEmptyMatch { 193 writeRegexp(b, re.Sub[0]) 194 } 195 b.WriteRune(')') 196 case OpStar, OpPlus, OpQuest, OpRepeat: 197 if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 { 198 b.WriteString(`(?:`) 199 writeRegexp(b, sub) 200 b.WriteString(`)`) 201 } else { 202 writeRegexp(b, sub) 203 } 204 switch re.Op { 205 case OpStar: 206 b.WriteRune('*') 207 case OpPlus: 208 b.WriteRune('+') 209 case OpQuest: 210 b.WriteRune('?') 211 case OpRepeat: 212 b.WriteRune('{') 213 b.WriteString(strconv.Itoa(re.Min)) 214 if re.Max != re.Min { 215 b.WriteRune(',') 216 if re.Max >= 0 { 217 b.WriteString(strconv.Itoa(re.Max)) 218 } 219 } 220 b.WriteRune('}') 221 } 222 if re.Flags&NonGreedy != 0 { 223 b.WriteRune('?') 224 } 225 case OpConcat: 226 for _, sub := range re.Sub { 227 if sub.Op == OpAlternate { 228 b.WriteString(`(?:`) 229 writeRegexp(b, sub) 230 b.WriteString(`)`) 231 } else { 232 writeRegexp(b, sub) 233 } 234 } 235 case OpAlternate: 236 for i, sub := range re.Sub { 237 if i > 0 { 238 b.WriteRune('|') 239 } 240 writeRegexp(b, sub) 241 } 242 } 243 } 244 245 func (re *Regexp) String() string { 246 var b bytes.Buffer 247 writeRegexp(&b, re) 248 return b.String() 249 } 250 251 const meta = `\.+*?()|[]{}^$` 252 253 func escape(b *bytes.Buffer, r rune, force bool) { 254 if unicode.IsPrint(r) { 255 if strings.ContainsRune(meta, r) || force { 256 b.WriteRune('\\') 257 } 258 b.WriteRune(r) 259 return 260 } 261 262 switch r { 263 case '\a': 264 b.WriteString(`\a`) 265 case '\f': 266 b.WriteString(`\f`) 267 case '\n': 268 b.WriteString(`\n`) 269 case '\r': 270 b.WriteString(`\r`) 271 case '\t': 272 b.WriteString(`\t`) 273 case '\v': 274 b.WriteString(`\v`) 275 default: 276 if r < 0x100 { 277 b.WriteString(`\x`) 278 s := strconv.FormatInt(int64(r), 16) 279 if len(s) == 1 { 280 b.WriteRune('0') 281 } 282 b.WriteString(s) 283 break 284 } 285 b.WriteString(`\x{`) 286 b.WriteString(strconv.FormatInt(int64(r), 16)) 287 b.WriteString(`}`) 288 } 289 } 290 291 // MaxCap walks the regexp to find the maximum capture index. 292 func (re *Regexp) MaxCap() int { 293 m := 0 294 if re.Op == OpCapture { 295 m = re.Cap 296 } 297 for _, sub := range re.Sub { 298 if n := sub.MaxCap(); m < n { 299 m = n 300 } 301 } 302 return m 303 } 304 305 // CapNames walks the regexp to find the names of capturing groups. 306 func (re *Regexp) CapNames() []string { 307 names := make([]string, re.MaxCap()+1) 308 re.capNames(names) 309 return names 310 } 311 312 func (re *Regexp) capNames(names []string) { 313 if re.Op == OpCapture { 314 names[re.Cap] = re.Name 315 } 316 for _, sub := range re.Sub { 317 sub.capNames(names) 318 } 319 }