github.com/ice-blockchain/go/src@v0.0.0-20240403114104-1564d284e521/regexp/syntax/regexp.go (about) 1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package syntax 6 7 // Note to implementers: 8 // In this package, re is always a *Regexp and r is always a rune. 9 10 import ( 11 "strconv" 12 "strings" 13 "unicode" 14 ) 15 16 // A Regexp is a node in a regular expression syntax tree. 17 type Regexp struct { 18 Op Op // operator 19 Flags Flags 20 Sub []*Regexp // subexpressions, if any 21 Sub0 [1]*Regexp // storage for short Sub 22 Rune []rune // matched runes, for OpLiteral, OpCharClass 23 Rune0 [2]rune // storage for short Rune 24 Min, Max int // min, max for OpRepeat 25 Cap int // capturing index, for OpCapture 26 Name string // capturing name, for OpCapture 27 } 28 29 //go:generate stringer -type Op -trimprefix Op 30 31 // An Op is a single regular expression operator. 32 type Op uint8 33 34 // Operators are listed in precedence order, tightest binding to weakest. 35 // Character class operators are listed simplest to most complex 36 // (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar). 37 38 const ( 39 OpNoMatch Op = 1 + iota // matches no strings 40 OpEmptyMatch // matches empty string 41 OpLiteral // matches Runes sequence 42 OpCharClass // matches Runes interpreted as range pair list 43 OpAnyCharNotNL // matches any character except newline 44 OpAnyChar // matches any character 45 OpBeginLine // matches empty string at beginning of line 46 OpEndLine // matches empty string at end of line 47 OpBeginText // matches empty string at beginning of text 48 OpEndText // matches empty string at end of text 49 OpWordBoundary // matches word boundary `\b` 50 OpNoWordBoundary // matches word non-boundary `\B` 51 OpCapture // capturing subexpression with index Cap, optional name Name 52 OpStar // matches Sub[0] zero or more times 53 OpPlus // matches Sub[0] one or more times 54 OpQuest // matches Sub[0] zero or one times 55 OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit) 56 OpConcat // matches concatenation of Subs 57 OpAlternate // matches alternation of Subs 58 ) 59 60 const opPseudo Op = 128 // where pseudo-ops start 61 62 // Equal reports whether x and y have identical structure. 63 func (x *Regexp) Equal(y *Regexp) bool { 64 if x == nil || y == nil { 65 return x == y 66 } 67 if x.Op != y.Op { 68 return false 69 } 70 switch x.Op { 71 case OpEndText: 72 // The parse flags remember whether this is \z or \Z. 73 if x.Flags&WasDollar != y.Flags&WasDollar { 74 return false 75 } 76 77 case OpLiteral, OpCharClass: 78 if len(x.Rune) != len(y.Rune) { 79 return false 80 } 81 for i, r := range x.Rune { 82 if r != y.Rune[i] { 83 return false 84 } 85 } 86 87 case OpAlternate, OpConcat: 88 if len(x.Sub) != len(y.Sub) { 89 return false 90 } 91 for i, sub := range x.Sub { 92 if !sub.Equal(y.Sub[i]) { 93 return false 94 } 95 } 96 97 case OpStar, OpPlus, OpQuest: 98 if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) { 99 return false 100 } 101 102 case OpRepeat: 103 if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) { 104 return false 105 } 106 107 case OpCapture: 108 if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) { 109 return false 110 } 111 } 112 return true 113 } 114 115 // printFlags is a bit set indicating which flags (including non-capturing parens) to print around a regexp. 116 type printFlags uint8 117 118 const ( 119 flagI printFlags = 1 << iota // (?i: 120 flagM // (?m: 121 flagS // (?s: 122 flagOff // ) 123 flagPrec // (?: ) 124 negShift = 5 // flagI<<negShift is (?-i: 125 ) 126 127 // addSpan enables the flags f around start..last, 128 // by setting flags[start] = f and flags[last] = flagOff. 129 func addSpan(start, last *Regexp, f printFlags, flags *map[*Regexp]printFlags) { 130 if *flags == nil { 131 *flags = make(map[*Regexp]printFlags) 132 } 133 (*flags)[start] = f 134 (*flags)[last] |= flagOff // maybe start==last 135 } 136 137 // calcFlags calculates the flags to print around each subexpression in re, 138 // storing that information in (*flags)[sub] for each affected subexpression. 139 // The first time an entry needs to be written to *flags, calcFlags allocates the map. 140 // calcFlags also calculates the flags that must be active or can't be active 141 // around re and returns those flags. 142 func calcFlags(re *Regexp, flags *map[*Regexp]printFlags) (must, cant printFlags) { 143 switch re.Op { 144 default: 145 return 0, 0 146 147 case OpLiteral: 148 // If literal is fold-sensitive, return (flagI, 0) or (0, flagI) 149 // according to whether (?i) is active. 150 // If literal is not fold-sensitive, return 0, 0. 151 for _, r := range re.Rune { 152 if minFold <= r && r <= maxFold && unicode.SimpleFold(r) != r { 153 if re.Flags&FoldCase != 0 { 154 return flagI, 0 155 } else { 156 return 0, flagI 157 } 158 } 159 } 160 return 0, 0 161 162 case OpCharClass: 163 // If literal is fold-sensitive, return 0, flagI - (?i) has been compiled out. 164 // If literal is not fold-sensitive, return 0, 0. 165 for i := 0; i < len(re.Rune); i += 2 { 166 lo := max(minFold, re.Rune[i]) 167 hi := min(maxFold, re.Rune[i+1]) 168 for r := lo; r <= hi; r++ { 169 for f := unicode.SimpleFold(r); f != r; f = unicode.SimpleFold(f) { 170 if !(lo <= f && f <= hi) && !inCharClass(f, re.Rune) { 171 return 0, flagI 172 } 173 } 174 } 175 } 176 return 0, 0 177 178 case OpAnyCharNotNL: // (?-s). 179 return 0, flagS 180 181 case OpAnyChar: // (?s). 182 return flagS, 0 183 184 case OpBeginLine, OpEndLine: // (?m)^ (?m)$ 185 return flagM, 0 186 187 case OpEndText: 188 if re.Flags&WasDollar != 0 { // (?-m)$ 189 return 0, flagM 190 } 191 return 0, 0 192 193 case OpCapture, OpStar, OpPlus, OpQuest, OpRepeat: 194 return calcFlags(re.Sub[0], flags) 195 196 case OpConcat, OpAlternate: 197 // Gather the must and cant for each subexpression. 198 // When we find a conflicting subexpression, insert the necessary 199 // flags around the previously identified span and start over. 200 var must, cant, allCant printFlags 201 start := 0 202 last := 0 203 did := false 204 for i, sub := range re.Sub { 205 subMust, subCant := calcFlags(sub, flags) 206 if must&subCant != 0 || subMust&cant != 0 { 207 if must != 0 { 208 addSpan(re.Sub[start], re.Sub[last], must, flags) 209 } 210 must = 0 211 cant = 0 212 start = i 213 did = true 214 } 215 must |= subMust 216 cant |= subCant 217 allCant |= subCant 218 if subMust != 0 { 219 last = i 220 } 221 if must == 0 && start == i { 222 start++ 223 } 224 } 225 if !did { 226 // No conflicts: pass the accumulated must and cant upward. 227 return must, cant 228 } 229 if must != 0 { 230 // Conflicts found; need to finish final span. 231 addSpan(re.Sub[start], re.Sub[last], must, flags) 232 } 233 return 0, allCant 234 } 235 } 236 237 // writeRegexp writes the Perl syntax for the regular expression re to b. 238 func writeRegexp(b *strings.Builder, re *Regexp, f printFlags, flags map[*Regexp]printFlags) { 239 f |= flags[re] 240 if f&flagPrec != 0 && f&^(flagOff|flagPrec) != 0 && f&flagOff != 0 { 241 // flagPrec is redundant with other flags being added and terminated 242 f &^= flagPrec 243 } 244 if f&^(flagOff|flagPrec) != 0 { 245 b.WriteString(`(?`) 246 if f&flagI != 0 { 247 b.WriteString(`i`) 248 } 249 if f&flagM != 0 { 250 b.WriteString(`m`) 251 } 252 if f&flagS != 0 { 253 b.WriteString(`s`) 254 } 255 if f&((flagM|flagS)<<negShift) != 0 { 256 b.WriteString(`-`) 257 if f&(flagM<<negShift) != 0 { 258 b.WriteString(`m`) 259 } 260 if f&(flagS<<negShift) != 0 { 261 b.WriteString(`s`) 262 } 263 } 264 b.WriteString(`:`) 265 } 266 if f&flagOff != 0 { 267 defer b.WriteString(`)`) 268 } 269 if f&flagPrec != 0 { 270 b.WriteString(`(?:`) 271 defer b.WriteString(`)`) 272 } 273 274 switch re.Op { 275 default: 276 b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">") 277 case OpNoMatch: 278 b.WriteString(`[^\x00-\x{10FFFF}]`) 279 case OpEmptyMatch: 280 b.WriteString(`(?:)`) 281 case OpLiteral: 282 for _, r := range re.Rune { 283 escape(b, r, false) 284 } 285 case OpCharClass: 286 if len(re.Rune)%2 != 0 { 287 b.WriteString(`[invalid char class]`) 288 break 289 } 290 b.WriteRune('[') 291 if len(re.Rune) == 0 { 292 b.WriteString(`^\x00-\x{10FFFF}`) 293 } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 { 294 // Contains 0 and MaxRune. Probably a negated class. 295 // Print the gaps. 296 b.WriteRune('^') 297 for i := 1; i < len(re.Rune)-1; i += 2 { 298 lo, hi := re.Rune[i]+1, re.Rune[i+1]-1 299 escape(b, lo, lo == '-') 300 if lo != hi { 301 if hi != lo+1 { 302 b.WriteRune('-') 303 } 304 escape(b, hi, hi == '-') 305 } 306 } 307 } else { 308 for i := 0; i < len(re.Rune); i += 2 { 309 lo, hi := re.Rune[i], re.Rune[i+1] 310 escape(b, lo, lo == '-') 311 if lo != hi { 312 if hi != lo+1 { 313 b.WriteRune('-') 314 } 315 escape(b, hi, hi == '-') 316 } 317 } 318 } 319 b.WriteRune(']') 320 case OpAnyCharNotNL, OpAnyChar: 321 b.WriteString(`.`) 322 case OpBeginLine: 323 b.WriteString(`^`) 324 case OpEndLine: 325 b.WriteString(`$`) 326 case OpBeginText: 327 b.WriteString(`\A`) 328 case OpEndText: 329 if re.Flags&WasDollar != 0 { 330 b.WriteString(`$`) 331 } else { 332 b.WriteString(`\z`) 333 } 334 case OpWordBoundary: 335 b.WriteString(`\b`) 336 case OpNoWordBoundary: 337 b.WriteString(`\B`) 338 case OpCapture: 339 if re.Name != "" { 340 b.WriteString(`(?P<`) 341 b.WriteString(re.Name) 342 b.WriteRune('>') 343 } else { 344 b.WriteRune('(') 345 } 346 if re.Sub[0].Op != OpEmptyMatch { 347 writeRegexp(b, re.Sub[0], flags[re.Sub[0]], flags) 348 } 349 b.WriteRune(')') 350 case OpStar, OpPlus, OpQuest, OpRepeat: 351 p := printFlags(0) 352 sub := re.Sub[0] 353 if sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 { 354 p = flagPrec 355 } 356 writeRegexp(b, sub, p, flags) 357 358 switch re.Op { 359 case OpStar: 360 b.WriteRune('*') 361 case OpPlus: 362 b.WriteRune('+') 363 case OpQuest: 364 b.WriteRune('?') 365 case OpRepeat: 366 b.WriteRune('{') 367 b.WriteString(strconv.Itoa(re.Min)) 368 if re.Max != re.Min { 369 b.WriteRune(',') 370 if re.Max >= 0 { 371 b.WriteString(strconv.Itoa(re.Max)) 372 } 373 } 374 b.WriteRune('}') 375 } 376 if re.Flags&NonGreedy != 0 { 377 b.WriteRune('?') 378 } 379 case OpConcat: 380 for _, sub := range re.Sub { 381 p := printFlags(0) 382 if sub.Op == OpAlternate { 383 p = flagPrec 384 } 385 writeRegexp(b, sub, p, flags) 386 } 387 case OpAlternate: 388 for i, sub := range re.Sub { 389 if i > 0 { 390 b.WriteRune('|') 391 } 392 writeRegexp(b, sub, 0, flags) 393 } 394 } 395 } 396 397 func (re *Regexp) String() string { 398 var b strings.Builder 399 var flags map[*Regexp]printFlags 400 must, cant := calcFlags(re, &flags) 401 must |= (cant &^ flagI) << negShift 402 if must != 0 { 403 must |= flagOff 404 } 405 writeRegexp(&b, re, must, flags) 406 return b.String() 407 } 408 409 const meta = `\.+*?()|[]{}^$` 410 411 func escape(b *strings.Builder, r rune, force bool) { 412 if unicode.IsPrint(r) { 413 if strings.ContainsRune(meta, r) || force { 414 b.WriteRune('\\') 415 } 416 b.WriteRune(r) 417 return 418 } 419 420 switch r { 421 case '\a': 422 b.WriteString(`\a`) 423 case '\f': 424 b.WriteString(`\f`) 425 case '\n': 426 b.WriteString(`\n`) 427 case '\r': 428 b.WriteString(`\r`) 429 case '\t': 430 b.WriteString(`\t`) 431 case '\v': 432 b.WriteString(`\v`) 433 default: 434 if r < 0x100 { 435 b.WriteString(`\x`) 436 s := strconv.FormatInt(int64(r), 16) 437 if len(s) == 1 { 438 b.WriteRune('0') 439 } 440 b.WriteString(s) 441 break 442 } 443 b.WriteString(`\x{`) 444 b.WriteString(strconv.FormatInt(int64(r), 16)) 445 b.WriteString(`}`) 446 } 447 } 448 449 // MaxCap walks the regexp to find the maximum capture index. 450 func (re *Regexp) MaxCap() int { 451 m := 0 452 if re.Op == OpCapture { 453 m = re.Cap 454 } 455 for _, sub := range re.Sub { 456 if n := sub.MaxCap(); m < n { 457 m = n 458 } 459 } 460 return m 461 } 462 463 // CapNames walks the regexp to find the names of capturing groups. 464 func (re *Regexp) CapNames() []string { 465 names := make([]string, re.MaxCap()+1) 466 re.capNames(names) 467 return names 468 } 469 470 func (re *Regexp) capNames(names []string) { 471 if re.Op == OpCapture { 472 names[re.Cap] = re.Name 473 } 474 for _, sub := range re.Sub { 475 sub.capNames(names) 476 } 477 }