// Listing source: github.com/geraldss/go/src@v0.0.0-20210511222824-ac7d0ebfc235/regexp/syntax/regexp.go

// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package syntax

// Note to implementers:
// In this package, re is always a *Regexp and r is always a rune.

import (
	"strconv"
	"strings"
	"unicode"
)

// A Regexp is a node in a regular expression syntax tree.
//
// Which fields are meaningful depends on Op: Rune is used by OpLiteral and
// OpCharClass, Min and Max by OpRepeat, Cap and Name by OpCapture, and Sub
// by every operator that has subexpressions.
type Regexp struct {
	Op       Op         // operator
	Flags    Flags      // flag bits; this file tests FoldCase, NonGreedy and WasDollar
	Sub      []*Regexp  // subexpressions, if any
	Sub0     [1]*Regexp // storage for short Sub
	Rune     []rune     // matched runes, for OpLiteral, OpCharClass
	Rune0    [2]rune    // storage for short Rune
	Min, Max int        // min, max for OpRepeat (Max == -1 is no limit)
	Cap      int        // capturing index, for OpCapture
	Name     string     // capturing name, for OpCapture
}

//go:generate stringer -type Op -trimprefix Op

// An Op is a single regular expression operator.
type Op uint8

// Operators are listed in precedence order, tightest binding to weakest.
// Character class operators are listed simplest to most complex
// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
37 38 const ( 39 OpNoMatch Op = 1 + iota // matches no strings 40 OpEmptyMatch // matches empty string 41 OpLiteral // matches Runes sequence 42 OpCharClass // matches Runes interpreted as range pair list 43 OpAnyCharNotNL // matches any character except newline 44 OpAnyChar // matches any character 45 OpBeginLine // matches empty string at beginning of line 46 OpEndLine // matches empty string at end of line 47 OpBeginText // matches empty string at beginning of text 48 OpEndText // matches empty string at end of text 49 OpWordBoundary // matches word boundary `\b` 50 OpNoWordBoundary // matches word non-boundary `\B` 51 OpCapture // capturing subexpression with index Cap, optional name Name 52 OpStar // matches Sub[0] zero or more times 53 OpPlus // matches Sub[0] one or more times 54 OpQuest // matches Sub[0] zero or one times 55 OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit) 56 OpConcat // matches concatenation of Subs 57 OpAlternate // matches alternation of Subs 58 ) 59 60 const opPseudo Op = 128 // where pseudo-ops start 61 62 // Equal reports whether x and y have identical structure. 63 func (x *Regexp) Equal(y *Regexp) bool { 64 if x == nil || y == nil { 65 return x == y 66 } 67 if x.Op != y.Op { 68 return false 69 } 70 switch x.Op { 71 case OpEndText: 72 // The parse flags remember whether this is \z or \Z. 
73 if x.Flags&WasDollar != y.Flags&WasDollar { 74 return false 75 } 76 77 case OpLiteral, OpCharClass: 78 if len(x.Rune) != len(y.Rune) { 79 return false 80 } 81 for i, r := range x.Rune { 82 if r != y.Rune[i] { 83 return false 84 } 85 } 86 87 case OpAlternate, OpConcat: 88 if len(x.Sub) != len(y.Sub) { 89 return false 90 } 91 for i, sub := range x.Sub { 92 if !sub.Equal(y.Sub[i]) { 93 return false 94 } 95 } 96 97 case OpStar, OpPlus, OpQuest: 98 if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) { 99 return false 100 } 101 102 case OpRepeat: 103 if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) { 104 return false 105 } 106 107 case OpCapture: 108 if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) { 109 return false 110 } 111 } 112 return true 113 } 114 115 // writeRegexp writes the Perl syntax for the regular expression re to b. 116 func writeRegexp(b *strings.Builder, re *Regexp) { 117 switch re.Op { 118 default: 119 b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">") 120 case OpNoMatch: 121 b.WriteString(`[^\x00-\x{10FFFF}]`) 122 case OpEmptyMatch: 123 b.WriteString(`(?:)`) 124 case OpLiteral: 125 if re.Flags&FoldCase != 0 { 126 b.WriteString(`(?i:`) 127 } 128 for _, r := range re.Rune { 129 escape(b, r, false) 130 } 131 if re.Flags&FoldCase != 0 { 132 b.WriteString(`)`) 133 } 134 case OpCharClass: 135 if len(re.Rune)%2 != 0 { 136 b.WriteString(`[invalid char class]`) 137 break 138 } 139 b.WriteRune('[') 140 if len(re.Rune) == 0 { 141 b.WriteString(`^\x00-\x{10FFFF}`) 142 } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 { 143 // Contains 0 and MaxRune. Probably a negated class. 144 // Print the gaps. 
145 b.WriteRune('^') 146 for i := 1; i < len(re.Rune)-1; i += 2 { 147 lo, hi := re.Rune[i]+1, re.Rune[i+1]-1 148 escape(b, lo, lo == '-') 149 if lo != hi { 150 b.WriteRune('-') 151 escape(b, hi, hi == '-') 152 } 153 } 154 } else { 155 for i := 0; i < len(re.Rune); i += 2 { 156 lo, hi := re.Rune[i], re.Rune[i+1] 157 escape(b, lo, lo == '-') 158 if lo != hi { 159 b.WriteRune('-') 160 escape(b, hi, hi == '-') 161 } 162 } 163 } 164 b.WriteRune(']') 165 case OpAnyCharNotNL: 166 b.WriteString(`(?-s:.)`) 167 case OpAnyChar: 168 b.WriteString(`(?s:.)`) 169 case OpBeginLine: 170 b.WriteString(`(?m:^)`) 171 case OpEndLine: 172 b.WriteString(`(?m:$)`) 173 case OpBeginText: 174 b.WriteString(`\A`) 175 case OpEndText: 176 if re.Flags&WasDollar != 0 { 177 b.WriteString(`(?-m:$)`) 178 } else { 179 b.WriteString(`\z`) 180 } 181 case OpWordBoundary: 182 b.WriteString(`\b`) 183 case OpNoWordBoundary: 184 b.WriteString(`\B`) 185 case OpCapture: 186 if re.Name != "" { 187 b.WriteString(`(?P<`) 188 b.WriteString(re.Name) 189 b.WriteRune('>') 190 } else { 191 b.WriteRune('(') 192 } 193 if re.Sub[0].Op != OpEmptyMatch { 194 writeRegexp(b, re.Sub[0]) 195 } 196 b.WriteRune(')') 197 case OpStar, OpPlus, OpQuest, OpRepeat: 198 if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 { 199 b.WriteString(`(?:`) 200 writeRegexp(b, sub) 201 b.WriteString(`)`) 202 } else { 203 writeRegexp(b, sub) 204 } 205 switch re.Op { 206 case OpStar: 207 b.WriteRune('*') 208 case OpPlus: 209 b.WriteRune('+') 210 case OpQuest: 211 b.WriteRune('?') 212 case OpRepeat: 213 b.WriteRune('{') 214 b.WriteString(strconv.Itoa(re.Min)) 215 if re.Max != re.Min { 216 b.WriteRune(',') 217 if re.Max >= 0 { 218 b.WriteString(strconv.Itoa(re.Max)) 219 } 220 } 221 b.WriteRune('}') 222 } 223 if re.Flags&NonGreedy != 0 { 224 b.WriteRune('?') 225 } 226 case OpConcat: 227 for _, sub := range re.Sub { 228 if sub.Op == OpAlternate { 229 b.WriteString(`(?:`) 230 writeRegexp(b, sub) 231 b.WriteString(`)`) 
232 } else { 233 writeRegexp(b, sub) 234 } 235 } 236 case OpAlternate: 237 for i, sub := range re.Sub { 238 if i > 0 { 239 b.WriteRune('|') 240 } 241 writeRegexp(b, sub) 242 } 243 } 244 } 245 246 func (re *Regexp) String() string { 247 var b strings.Builder 248 writeRegexp(&b, re) 249 return b.String() 250 } 251 252 const meta = `\.+*?()|[]{}^$` 253 254 func escape(b *strings.Builder, r rune, force bool) { 255 if unicode.IsPrint(r) { 256 if strings.ContainsRune(meta, r) || force { 257 b.WriteRune('\\') 258 } 259 b.WriteRune(r) 260 return 261 } 262 263 switch r { 264 case '\a': 265 b.WriteString(`\a`) 266 case '\f': 267 b.WriteString(`\f`) 268 case '\n': 269 b.WriteString(`\n`) 270 case '\r': 271 b.WriteString(`\r`) 272 case '\t': 273 b.WriteString(`\t`) 274 case '\v': 275 b.WriteString(`\v`) 276 default: 277 if r < 0x100 { 278 b.WriteString(`\x`) 279 s := strconv.FormatInt(int64(r), 16) 280 if len(s) == 1 { 281 b.WriteRune('0') 282 } 283 b.WriteString(s) 284 break 285 } 286 b.WriteString(`\x{`) 287 b.WriteString(strconv.FormatInt(int64(r), 16)) 288 b.WriteString(`}`) 289 } 290 } 291 292 // MaxCap walks the regexp to find the maximum capture index. 293 func (re *Regexp) MaxCap() int { 294 m := 0 295 if re.Op == OpCapture { 296 m = re.Cap 297 } 298 for _, sub := range re.Sub { 299 if n := sub.MaxCap(); m < n { 300 m = n 301 } 302 } 303 return m 304 } 305 306 // CapNames walks the regexp to find the names of capturing groups. 307 func (re *Regexp) CapNames() []string { 308 names := make([]string, re.MaxCap()+1) 309 re.capNames(names) 310 return names 311 } 312 313 func (re *Regexp) capNames(names []string) { 314 if re.Op == OpCapture { 315 names[re.Cap] = re.Name 316 } 317 for _, sub := range re.Sub { 318 sub.capNames(names) 319 } 320 }