github.com/mtsmfm/go/src@v0.0.0-20221020090648-44bdcb9f8fde/regexp/syntax/parse.go (about) 1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package syntax 6 7 import ( 8 "sort" 9 "strings" 10 "unicode" 11 "unicode/utf8" 12 ) 13 14 // An Error describes a failure to parse a regular expression 15 // and gives the offending expression. 16 type Error struct { 17 Code ErrorCode 18 Expr string 19 } 20 21 func (e *Error) Error() string { 22 return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`" 23 } 24 25 // An ErrorCode describes a failure to parse a regular expression. 26 type ErrorCode string 27 28 const ( 29 // Unexpected error 30 ErrInternalError ErrorCode = "regexp/syntax: internal error" 31 32 // Parse errors 33 ErrInvalidCharClass ErrorCode = "invalid character class" 34 ErrInvalidCharRange ErrorCode = "invalid character class range" 35 ErrInvalidEscape ErrorCode = "invalid escape sequence" 36 ErrInvalidNamedCapture ErrorCode = "invalid named capture" 37 ErrInvalidPerlOp ErrorCode = "invalid or unsupported Perl syntax" 38 ErrInvalidRepeatOp ErrorCode = "invalid nested repetition operator" 39 ErrInvalidRepeatSize ErrorCode = "invalid repeat count" 40 ErrInvalidUTF8 ErrorCode = "invalid UTF-8" 41 ErrMissingBracket ErrorCode = "missing closing ]" 42 ErrMissingParen ErrorCode = "missing closing )" 43 ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator" 44 ErrTrailingBackslash ErrorCode = "trailing backslash at end of expression" 45 ErrUnexpectedParen ErrorCode = "unexpected )" 46 ErrNestingDepth ErrorCode = "expression nests too deeply" 47 ) 48 49 func (e ErrorCode) String() string { 50 return string(e) 51 } 52 53 // Flags control the behavior of the parser and record information about regexp context. 
54 type Flags uint16 55 56 const ( 57 FoldCase Flags = 1 << iota // case-insensitive match 58 Literal // treat pattern as literal string 59 ClassNL // allow character classes like [^a-z] and [[:space:]] to match newline 60 DotNL // allow . to match newline 61 OneLine // treat ^ and $ as only matching at beginning and end of text 62 NonGreedy // make repetition operators default to non-greedy 63 PerlX // allow Perl extensions 64 UnicodeGroups // allow \p{Han}, \P{Han} for Unicode group and negation 65 WasDollar // regexp OpEndText was $, not \z 66 Simple // regexp contains no counted repetition 67 68 MatchNL = ClassNL | DotNL 69 70 Perl = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible 71 POSIX Flags = 0 // POSIX syntax 72 ) 73 74 // Pseudo-ops for parsing stack. 75 const ( 76 opLeftParen = opPseudo + iota 77 opVerticalBar 78 ) 79 80 // maxHeight is the maximum height of a regexp parse tree. 81 // It is somewhat arbitrarily chosen, but the idea is to be large enough 82 // that no one will actually hit in real use but at the same time small enough 83 // that recursion on the Regexp tree will not hit the 1GB Go stack limit. 84 // The maximum amount of stack for a single recursive frame is probably 85 // closer to 1kB, so this could potentially be raised, but it seems unlikely 86 // that people have regexps nested even this deeply. 87 // We ran a test on Google's C++ code base and turned up only 88 // a single use case with depth > 100; it had depth 128. 89 // Using depth 1000 should be plenty of margin. 90 // As an optimization, we don't even bother calculating heights 91 // until we've allocated at least maxHeight Regexp structures. 92 const maxHeight = 1000 93 94 // maxSize is the maximum size of a compiled regexp in Insts. 95 // It too is somewhat arbitrarily chosen, but the idea is to be large enough 96 // to allow significant regexps while at the same time small enough that 97 // the compiled form will not take up too much memory. 
// 128 MB is enough for 3.3 million Inst structures, which roughly
// corresponds to a 3.3 MB regexp.
const (
	maxSize  = 128 << 20 / instSize
	instSize = 5 * 8 // byte, 2 uint32, slice is 5 64-bit words
)

// maxRunes is the maximum number of runes allowed in a regexp tree
// counting the runes in all the nodes.
// Ignoring character classes p.numRunes is always less than the length of the regexp.
// Character classes can make it much larger: each \pL adds 1292 runes.
// 128 MB is enough for 32M runes, which is over 26k \pL instances.
// Note that repetitions do not make copies of the rune slices,
// so \pL{1000} is only one rune slice, not 1000.
// We could keep a cache of character classes we've seen,
// so that all the \pL we see use the same rune list,
// but that doesn't remove the problem entirely:
// consider something like [\pL01234][\pL01235][\pL01236]...[\pL^&*()].
// And because the Rune slice is exposed directly in the Regexp,
// there is not an opportunity to change the representation to allow
// partial sharing between different character classes.
// So the limit is the best we can do.
120 const ( 121 maxRunes = 128 << 20 / runeSize 122 runeSize = 4 // rune is int32 123 ) 124 125 type parser struct { 126 flags Flags // parse mode flags 127 stack []*Regexp // stack of parsed expressions 128 free *Regexp 129 numCap int // number of capturing groups seen 130 wholeRegexp string 131 tmpClass []rune // temporary char class work space 132 numRegexp int // number of regexps allocated 133 numRunes int // number of runes in char classes 134 repeats int64 // product of all repetitions seen 135 height map[*Regexp]int // regexp height, for height limit check 136 size map[*Regexp]int64 // regexp compiled size, for size limit check 137 } 138 139 func (p *parser) newRegexp(op Op) *Regexp { 140 re := p.free 141 if re != nil { 142 p.free = re.Sub0[0] 143 *re = Regexp{} 144 } else { 145 re = new(Regexp) 146 p.numRegexp++ 147 } 148 re.Op = op 149 return re 150 } 151 152 func (p *parser) reuse(re *Regexp) { 153 if p.height != nil { 154 delete(p.height, re) 155 } 156 re.Sub0[0] = p.free 157 p.free = re 158 } 159 160 func (p *parser) checkLimits(re *Regexp) { 161 if p.numRunes > maxRunes { 162 panic(ErrInternalError) 163 } 164 p.checkSize(re) 165 p.checkHeight(re) 166 } 167 168 func (p *parser) checkSize(re *Regexp) { 169 if p.size == nil { 170 // We haven't started tracking size yet. 171 // Do a relatively cheap check to see if we need to start. 172 // Maintain the product of all the repeats we've seen 173 // and don't track if the total number of regexp nodes 174 // we've seen times the repeat product is in budget. 175 if p.repeats == 0 { 176 p.repeats = 1 177 } 178 if re.Op == OpRepeat { 179 n := re.Max 180 if n == -1 { 181 n = re.Min 182 } 183 if n <= 0 { 184 n = 1 185 } 186 if int64(n) > maxSize/p.repeats { 187 p.repeats = maxSize 188 } else { 189 p.repeats *= int64(n) 190 } 191 } 192 if int64(p.numRegexp) < maxSize/p.repeats { 193 return 194 } 195 196 // We need to start tracking size. 
197 // Make the map and belatedly populate it 198 // with info about everything we've constructed so far. 199 p.size = make(map[*Regexp]int64) 200 for _, re := range p.stack { 201 p.checkSize(re) 202 } 203 } 204 205 if p.calcSize(re, true) > maxSize { 206 panic(ErrInternalError) 207 } 208 } 209 210 func (p *parser) calcSize(re *Regexp, force bool) int64 { 211 if !force { 212 if size, ok := p.size[re]; ok { 213 return size 214 } 215 } 216 217 var size int64 218 switch re.Op { 219 case OpLiteral: 220 size = int64(len(re.Rune)) 221 case OpCapture, OpStar: 222 // star can be 1+ or 2+; assume 2 pessimistically 223 size = 2 + p.calcSize(re.Sub[0], false) 224 case OpPlus, OpQuest: 225 size = 1 + p.calcSize(re.Sub[0], false) 226 case OpConcat: 227 for _, sub := range re.Sub { 228 size += p.calcSize(sub, false) 229 } 230 case OpAlternate: 231 for _, sub := range re.Sub { 232 size += p.calcSize(sub, false) 233 } 234 if len(re.Sub) > 1 { 235 size += int64(len(re.Sub)) - 1 236 } 237 case OpRepeat: 238 sub := p.calcSize(re.Sub[0], false) 239 if re.Max == -1 { 240 if re.Min == 0 { 241 size = 2 + sub // x* 242 } else { 243 size = 1 + int64(re.Min)*sub // xxx+ 244 } 245 break 246 } 247 // x{2,5} = xx(x(x(x)?)?)? 
248 size = int64(re.Max)*sub + int64(re.Max-re.Min) 249 } 250 251 if size < 1 { 252 size = 1 253 } 254 p.size[re] = size 255 return size 256 } 257 258 func (p *parser) checkHeight(re *Regexp) { 259 if p.numRegexp < maxHeight { 260 return 261 } 262 if p.height == nil { 263 p.height = make(map[*Regexp]int) 264 for _, re := range p.stack { 265 p.checkHeight(re) 266 } 267 } 268 if p.calcHeight(re, true) > maxHeight { 269 panic(ErrNestingDepth) 270 } 271 } 272 273 func (p *parser) calcHeight(re *Regexp, force bool) int { 274 if !force { 275 if h, ok := p.height[re]; ok { 276 return h 277 } 278 } 279 h := 1 280 for _, sub := range re.Sub { 281 hsub := p.calcHeight(sub, false) 282 if h < 1+hsub { 283 h = 1 + hsub 284 } 285 } 286 p.height[re] = h 287 return h 288 } 289 290 // Parse stack manipulation. 291 292 // push pushes the regexp re onto the parse stack and returns the regexp. 293 func (p *parser) push(re *Regexp) *Regexp { 294 p.numRunes += len(re.Rune) 295 if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] { 296 // Single rune. 297 if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) { 298 return nil 299 } 300 re.Op = OpLiteral 301 re.Rune = re.Rune[:1] 302 re.Flags = p.flags &^ FoldCase 303 } else if re.Op == OpCharClass && len(re.Rune) == 4 && 304 re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] && 305 unicode.SimpleFold(re.Rune[0]) == re.Rune[2] && 306 unicode.SimpleFold(re.Rune[2]) == re.Rune[0] || 307 re.Op == OpCharClass && len(re.Rune) == 2 && 308 re.Rune[0]+1 == re.Rune[1] && 309 unicode.SimpleFold(re.Rune[0]) == re.Rune[1] && 310 unicode.SimpleFold(re.Rune[1]) == re.Rune[0] { 311 // Case-insensitive rune like [Aa] or [Δδ]. 312 if p.maybeConcat(re.Rune[0], p.flags|FoldCase) { 313 return nil 314 } 315 316 // Rewrite as (case-insensitive) literal. 317 re.Op = OpLiteral 318 re.Rune = re.Rune[:1] 319 re.Flags = p.flags | FoldCase 320 } else { 321 // Incremental concatenation. 
322 p.maybeConcat(-1, 0) 323 } 324 325 p.stack = append(p.stack, re) 326 p.checkLimits(re) 327 return re 328 } 329 330 // maybeConcat implements incremental concatenation 331 // of literal runes into string nodes. The parser calls this 332 // before each push, so only the top fragment of the stack 333 // might need processing. Since this is called before a push, 334 // the topmost literal is no longer subject to operators like * 335 // (Otherwise ab* would turn into (ab)*.) 336 // If r >= 0 and there's a node left over, maybeConcat uses it 337 // to push r with the given flags. 338 // maybeConcat reports whether r was pushed. 339 func (p *parser) maybeConcat(r rune, flags Flags) bool { 340 n := len(p.stack) 341 if n < 2 { 342 return false 343 } 344 345 re1 := p.stack[n-1] 346 re2 := p.stack[n-2] 347 if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase { 348 return false 349 } 350 351 // Push re1 into re2. 352 re2.Rune = append(re2.Rune, re1.Rune...) 353 354 // Reuse re1 if possible. 355 if r >= 0 { 356 re1.Rune = re1.Rune0[:1] 357 re1.Rune[0] = r 358 re1.Flags = flags 359 return true 360 } 361 362 p.stack = p.stack[:n-1] 363 p.reuse(re1) 364 return false // did not push r 365 } 366 367 // literal pushes a literal regexp for the rune r on the stack. 368 func (p *parser) literal(r rune) { 369 re := p.newRegexp(OpLiteral) 370 re.Flags = p.flags 371 if p.flags&FoldCase != 0 { 372 r = minFoldRune(r) 373 } 374 re.Rune0[0] = r 375 re.Rune = re.Rune0[:1] 376 p.push(re) 377 } 378 379 // minFoldRune returns the minimum rune fold-equivalent to r. 380 func minFoldRune(r rune) rune { 381 if r < minFold || r > maxFold { 382 return r 383 } 384 min := r 385 r0 := r 386 for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) { 387 if min > r { 388 min = r 389 } 390 } 391 return min 392 } 393 394 // op pushes a regexp with the given op onto the stack 395 // and returns that regexp. 
396 func (p *parser) op(op Op) *Regexp { 397 re := p.newRegexp(op) 398 re.Flags = p.flags 399 return p.push(re) 400 } 401 402 // repeat replaces the top stack element with itself repeated according to op, min, max. 403 // before is the regexp suffix starting at the repetition operator. 404 // after is the regexp suffix following after the repetition operator. 405 // repeat returns an updated 'after' and an error, if any. 406 func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, error) { 407 flags := p.flags 408 if p.flags&PerlX != 0 { 409 if len(after) > 0 && after[0] == '?' { 410 after = after[1:] 411 flags ^= NonGreedy 412 } 413 if lastRepeat != "" { 414 // In Perl it is not allowed to stack repetition operators: 415 // a** is a syntax error, not a doubled star, and a++ means 416 // something else entirely, which we don't support! 417 return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(after)]} 418 } 419 } 420 n := len(p.stack) 421 if n == 0 { 422 return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]} 423 } 424 sub := p.stack[n-1] 425 if sub.Op >= opPseudo { 426 return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]} 427 } 428 429 re := p.newRegexp(op) 430 re.Min = min 431 re.Max = max 432 re.Flags = flags 433 re.Sub = re.Sub0[:1] 434 re.Sub[0] = sub 435 p.stack[n-1] = re 436 p.checkLimits(re) 437 438 if op == OpRepeat && (min >= 2 || max >= 2) && !repeatIsValid(re, 1000) { 439 return "", &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]} 440 } 441 442 return after, nil 443 } 444 445 // repeatIsValid reports whether the repetition re is valid. 446 // Valid means that the combination of the top-level repetition 447 // and any inner repetitions does not exceed n copies of the 448 // innermost thing. 449 // This function rewalks the regexp tree and is called for every repetition, 450 // so we have to worry about inducing quadratic behavior in the parser. 
451 // We avoid this by only calling repeatIsValid when min or max >= 2. 452 // In that case the depth of any >= 2 nesting can only get to 9 without 453 // triggering a parse error, so each subtree can only be rewalked 9 times. 454 func repeatIsValid(re *Regexp, n int) bool { 455 if re.Op == OpRepeat { 456 m := re.Max 457 if m == 0 { 458 return true 459 } 460 if m < 0 { 461 m = re.Min 462 } 463 if m > n { 464 return false 465 } 466 if m > 0 { 467 n /= m 468 } 469 } 470 for _, sub := range re.Sub { 471 if !repeatIsValid(sub, n) { 472 return false 473 } 474 } 475 return true 476 } 477 478 // concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation. 479 func (p *parser) concat() *Regexp { 480 p.maybeConcat(-1, 0) 481 482 // Scan down to find pseudo-operator | or (. 483 i := len(p.stack) 484 for i > 0 && p.stack[i-1].Op < opPseudo { 485 i-- 486 } 487 subs := p.stack[i:] 488 p.stack = p.stack[:i] 489 490 // Empty concatenation is special case. 491 if len(subs) == 0 { 492 return p.push(p.newRegexp(OpEmptyMatch)) 493 } 494 495 return p.push(p.collapse(subs, OpConcat)) 496 } 497 498 // alternate replaces the top of the stack (above the topmost '(') with its alternation. 499 func (p *parser) alternate() *Regexp { 500 // Scan down to find pseudo-operator (. 501 // There are no | above (. 502 i := len(p.stack) 503 for i > 0 && p.stack[i-1].Op < opPseudo { 504 i-- 505 } 506 subs := p.stack[i:] 507 p.stack = p.stack[:i] 508 509 // Make sure top class is clean. 510 // All the others already are (see swapVerticalBar). 511 if len(subs) > 0 { 512 cleanAlt(subs[len(subs)-1]) 513 } 514 515 // Empty alternate is special case 516 // (shouldn't happen but easy to handle). 517 if len(subs) == 0 { 518 return p.push(p.newRegexp(OpNoMatch)) 519 } 520 521 return p.push(p.collapse(subs, OpAlternate)) 522 } 523 524 // cleanAlt cleans re for eventual inclusion in an alternation. 
525 func cleanAlt(re *Regexp) { 526 switch re.Op { 527 case OpCharClass: 528 re.Rune = cleanClass(&re.Rune) 529 if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune { 530 re.Rune = nil 531 re.Op = OpAnyChar 532 return 533 } 534 if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune { 535 re.Rune = nil 536 re.Op = OpAnyCharNotNL 537 return 538 } 539 if cap(re.Rune)-len(re.Rune) > 100 { 540 // re.Rune will not grow any more. 541 // Make a copy or inline to reclaim storage. 542 re.Rune = append(re.Rune0[:0], re.Rune...) 543 } 544 } 545 } 546 547 // collapse returns the result of applying op to sub. 548 // If sub contains op nodes, they all get hoisted up 549 // so that there is never a concat of a concat or an 550 // alternate of an alternate. 551 func (p *parser) collapse(subs []*Regexp, op Op) *Regexp { 552 if len(subs) == 1 { 553 return subs[0] 554 } 555 re := p.newRegexp(op) 556 re.Sub = re.Sub0[:0] 557 for _, sub := range subs { 558 if sub.Op == op { 559 re.Sub = append(re.Sub, sub.Sub...) 560 p.reuse(sub) 561 } else { 562 re.Sub = append(re.Sub, sub) 563 } 564 } 565 if op == OpAlternate { 566 re.Sub = p.factor(re.Sub) 567 if len(re.Sub) == 1 { 568 old := re 569 re = re.Sub[0] 570 p.reuse(old) 571 } 572 } 573 return re 574 } 575 576 // factor factors common prefixes from the alternation list sub. 577 // It returns a replacement list that reuses the same storage and 578 // frees (passes to p.reuse) any removed *Regexps. 579 // 580 // For example, 581 // 582 // ABC|ABD|AEF|BCX|BCY 583 // 584 // simplifies by literal prefix extraction to 585 // 586 // A(B(C|D)|EF)|BC(X|Y) 587 // 588 // which simplifies by character class introduction to 589 // 590 // A(B[CD]|EF)|BC[XY] 591 func (p *parser) factor(sub []*Regexp) []*Regexp { 592 if len(sub) < 2 { 593 return sub 594 } 595 596 // Round 1: Factor out common literal prefixes. 
597 var str []rune 598 var strflags Flags 599 start := 0 600 out := sub[:0] 601 for i := 0; i <= len(sub); i++ { 602 // Invariant: the Regexps that were in sub[0:start] have been 603 // used or marked for reuse, and the slice space has been reused 604 // for out (len(out) <= start). 605 // 606 // Invariant: sub[start:i] consists of regexps that all begin 607 // with str as modified by strflags. 608 var istr []rune 609 var iflags Flags 610 if i < len(sub) { 611 istr, iflags = p.leadingString(sub[i]) 612 if iflags == strflags { 613 same := 0 614 for same < len(str) && same < len(istr) && str[same] == istr[same] { 615 same++ 616 } 617 if same > 0 { 618 // Matches at least one rune in current range. 619 // Keep going around. 620 str = str[:same] 621 continue 622 } 623 } 624 } 625 626 // Found end of a run with common leading literal string: 627 // sub[start:i] all begin with str[0:len(str)], but sub[i] 628 // does not even begin with str[0]. 629 // 630 // Factor out common string and append factored expression to out. 631 if i == start { 632 // Nothing to do - run of length 0. 633 } else if i == start+1 { 634 // Just one: don't bother factoring. 635 out = append(out, sub[start]) 636 } else { 637 // Construct factored form: prefix(suffix1|suffix2|...) 638 prefix := p.newRegexp(OpLiteral) 639 prefix.Flags = strflags 640 prefix.Rune = append(prefix.Rune[:0], str...) 641 642 for j := start; j < i; j++ { 643 sub[j] = p.removeLeadingString(sub[j], len(str)) 644 p.checkLimits(sub[j]) 645 } 646 suffix := p.collapse(sub[start:i], OpAlternate) // recurse 647 648 re := p.newRegexp(OpConcat) 649 re.Sub = append(re.Sub[:0], prefix, suffix) 650 out = append(out, re) 651 } 652 653 // Prepare for next iteration. 654 start = i 655 str = istr 656 strflags = iflags 657 } 658 sub = out 659 660 // Round 2: Factor out common simple prefixes, 661 // just the first piece of each concatenation. 662 // This will be good enough a lot of the time. 663 // 664 // Complex subexpressions (e.g. 
involving quantifiers) 665 // are not safe to factor because that collapses their 666 // distinct paths through the automaton, which affects 667 // correctness in some cases. 668 start = 0 669 out = sub[:0] 670 var first *Regexp 671 for i := 0; i <= len(sub); i++ { 672 // Invariant: the Regexps that were in sub[0:start] have been 673 // used or marked for reuse, and the slice space has been reused 674 // for out (len(out) <= start). 675 // 676 // Invariant: sub[start:i] consists of regexps that all begin with ifirst. 677 var ifirst *Regexp 678 if i < len(sub) { 679 ifirst = p.leadingRegexp(sub[i]) 680 if first != nil && first.Equal(ifirst) && 681 // first must be a character class OR a fixed repeat of a character class. 682 (isCharClass(first) || (first.Op == OpRepeat && first.Min == first.Max && isCharClass(first.Sub[0]))) { 683 continue 684 } 685 } 686 687 // Found end of a run with common leading regexp: 688 // sub[start:i] all begin with first but sub[i] does not. 689 // 690 // Factor out common regexp and append factored expression to out. 691 if i == start { 692 // Nothing to do - run of length 0. 693 } else if i == start+1 { 694 // Just one: don't bother factoring. 695 out = append(out, sub[start]) 696 } else { 697 // Construct factored form: prefix(suffix1|suffix2|...) 698 prefix := first 699 for j := start; j < i; j++ { 700 reuse := j != start // prefix came from sub[start] 701 sub[j] = p.removeLeadingRegexp(sub[j], reuse) 702 p.checkLimits(sub[j]) 703 } 704 suffix := p.collapse(sub[start:i], OpAlternate) // recurse 705 706 re := p.newRegexp(OpConcat) 707 re.Sub = append(re.Sub[:0], prefix, suffix) 708 out = append(out, re) 709 } 710 711 // Prepare for next iteration. 712 start = i 713 first = ifirst 714 } 715 sub = out 716 717 // Round 3: Collapse runs of single literals into character classes. 
718 start = 0 719 out = sub[:0] 720 for i := 0; i <= len(sub); i++ { 721 // Invariant: the Regexps that were in sub[0:start] have been 722 // used or marked for reuse, and the slice space has been reused 723 // for out (len(out) <= start). 724 // 725 // Invariant: sub[start:i] consists of regexps that are either 726 // literal runes or character classes. 727 if i < len(sub) && isCharClass(sub[i]) { 728 continue 729 } 730 731 // sub[i] is not a char or char class; 732 // emit char class for sub[start:i]... 733 if i == start { 734 // Nothing to do - run of length 0. 735 } else if i == start+1 { 736 out = append(out, sub[start]) 737 } else { 738 // Make new char class. 739 // Start with most complex regexp in sub[start]. 740 max := start 741 for j := start + 1; j < i; j++ { 742 if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) { 743 max = j 744 } 745 } 746 sub[start], sub[max] = sub[max], sub[start] 747 748 for j := start + 1; j < i; j++ { 749 mergeCharClass(sub[start], sub[j]) 750 p.reuse(sub[j]) 751 } 752 cleanAlt(sub[start]) 753 out = append(out, sub[start]) 754 } 755 756 // ... and then emit sub[i]. 757 if i < len(sub) { 758 out = append(out, sub[i]) 759 } 760 start = i + 1 761 } 762 sub = out 763 764 // Round 4: Collapse runs of empty matches into a single empty match. 765 start = 0 766 out = sub[:0] 767 for i := range sub { 768 if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch { 769 continue 770 } 771 out = append(out, sub[i]) 772 } 773 sub = out 774 775 return sub 776 } 777 778 // leadingString returns the leading literal string that re begins with. 779 // The string refers to storage in re or its children. 
780 func (p *parser) leadingString(re *Regexp) ([]rune, Flags) { 781 if re.Op == OpConcat && len(re.Sub) > 0 { 782 re = re.Sub[0] 783 } 784 if re.Op != OpLiteral { 785 return nil, 0 786 } 787 return re.Rune, re.Flags & FoldCase 788 } 789 790 // removeLeadingString removes the first n leading runes 791 // from the beginning of re. It returns the replacement for re. 792 func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp { 793 if re.Op == OpConcat && len(re.Sub) > 0 { 794 // Removing a leading string in a concatenation 795 // might simplify the concatenation. 796 sub := re.Sub[0] 797 sub = p.removeLeadingString(sub, n) 798 re.Sub[0] = sub 799 if sub.Op == OpEmptyMatch { 800 p.reuse(sub) 801 switch len(re.Sub) { 802 case 0, 1: 803 // Impossible but handle. 804 re.Op = OpEmptyMatch 805 re.Sub = nil 806 case 2: 807 old := re 808 re = re.Sub[1] 809 p.reuse(old) 810 default: 811 copy(re.Sub, re.Sub[1:]) 812 re.Sub = re.Sub[:len(re.Sub)-1] 813 } 814 } 815 return re 816 } 817 818 if re.Op == OpLiteral { 819 re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])] 820 if len(re.Rune) == 0 { 821 re.Op = OpEmptyMatch 822 } 823 } 824 return re 825 } 826 827 // leadingRegexp returns the leading regexp that re begins with. 828 // The regexp refers to storage in re or its children. 829 func (p *parser) leadingRegexp(re *Regexp) *Regexp { 830 if re.Op == OpEmptyMatch { 831 return nil 832 } 833 if re.Op == OpConcat && len(re.Sub) > 0 { 834 sub := re.Sub[0] 835 if sub.Op == OpEmptyMatch { 836 return nil 837 } 838 return sub 839 } 840 return re 841 } 842 843 // removeLeadingRegexp removes the leading regexp in re. 844 // It returns the replacement for re. 845 // If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse. 
846 func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp { 847 if re.Op == OpConcat && len(re.Sub) > 0 { 848 if reuse { 849 p.reuse(re.Sub[0]) 850 } 851 re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])] 852 switch len(re.Sub) { 853 case 0: 854 re.Op = OpEmptyMatch 855 re.Sub = nil 856 case 1: 857 old := re 858 re = re.Sub[0] 859 p.reuse(old) 860 } 861 return re 862 } 863 if reuse { 864 p.reuse(re) 865 } 866 return p.newRegexp(OpEmptyMatch) 867 } 868 869 func literalRegexp(s string, flags Flags) *Regexp { 870 re := &Regexp{Op: OpLiteral} 871 re.Flags = flags 872 re.Rune = re.Rune0[:0] // use local storage for small strings 873 for _, c := range s { 874 if len(re.Rune) >= cap(re.Rune) { 875 // string is too long to fit in Rune0. let Go handle it 876 re.Rune = []rune(s) 877 break 878 } 879 re.Rune = append(re.Rune, c) 880 } 881 return re 882 } 883 884 // Parsing. 885 886 // Parse parses a regular expression string s, controlled by the specified 887 // Flags, and returns a regular expression parse tree. The syntax is 888 // described in the top-level comment. 889 func Parse(s string, flags Flags) (*Regexp, error) { 890 return parse(s, flags) 891 } 892 893 func parse(s string, flags Flags) (_ *Regexp, err error) { 894 defer func() { 895 switch r := recover(); r { 896 default: 897 panic(r) 898 case nil: 899 // ok 900 case ErrInternalError: // too big 901 err = &Error{Code: ErrInternalError, Expr: s} 902 case ErrNestingDepth: 903 err = &Error{Code: ErrNestingDepth, Expr: s} 904 } 905 }() 906 907 if flags&Literal != 0 { 908 // Trivial parser for literal string. 909 if err := checkUTF8(s); err != nil { 910 return nil, err 911 } 912 return literalRegexp(s, flags), nil 913 } 914 915 // Otherwise, must do real work. 
916 var ( 917 p parser 918 c rune 919 op Op 920 lastRepeat string 921 ) 922 p.flags = flags 923 p.wholeRegexp = s 924 t := s 925 for t != "" { 926 repeat := "" 927 BigSwitch: 928 switch t[0] { 929 default: 930 if c, t, err = nextRune(t); err != nil { 931 return nil, err 932 } 933 p.literal(c) 934 935 case '(': 936 if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' { 937 // Flag changes and non-capturing groups. 938 if t, err = p.parsePerlFlags(t); err != nil { 939 return nil, err 940 } 941 break 942 } 943 p.numCap++ 944 p.op(opLeftParen).Cap = p.numCap 945 t = t[1:] 946 case '|': 947 if err = p.parseVerticalBar(); err != nil { 948 return nil, err 949 } 950 t = t[1:] 951 case ')': 952 if err = p.parseRightParen(); err != nil { 953 return nil, err 954 } 955 t = t[1:] 956 case '^': 957 if p.flags&OneLine != 0 { 958 p.op(OpBeginText) 959 } else { 960 p.op(OpBeginLine) 961 } 962 t = t[1:] 963 case '$': 964 if p.flags&OneLine != 0 { 965 p.op(OpEndText).Flags |= WasDollar 966 } else { 967 p.op(OpEndLine) 968 } 969 t = t[1:] 970 case '.': 971 if p.flags&DotNL != 0 { 972 p.op(OpAnyChar) 973 } else { 974 p.op(OpAnyCharNotNL) 975 } 976 t = t[1:] 977 case '[': 978 if t, err = p.parseClass(t); err != nil { 979 return nil, err 980 } 981 case '*', '+', '?': 982 before := t 983 switch t[0] { 984 case '*': 985 op = OpStar 986 case '+': 987 op = OpPlus 988 case '?': 989 op = OpQuest 990 } 991 after := t[1:] 992 if after, err = p.repeat(op, 0, 0, before, after, lastRepeat); err != nil { 993 return nil, err 994 } 995 repeat = before 996 t = after 997 case '{': 998 op = OpRepeat 999 before := t 1000 min, max, after, ok := p.parseRepeat(t) 1001 if !ok { 1002 // If the repeat cannot be parsed, { is a literal. 1003 p.literal('{') 1004 t = t[1:] 1005 break 1006 } 1007 if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max { 1008 // Numbers were too big, or max is present and min > max. 
1009 return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]} 1010 } 1011 if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil { 1012 return nil, err 1013 } 1014 repeat = before 1015 t = after 1016 case '\\': 1017 if p.flags&PerlX != 0 && len(t) >= 2 { 1018 switch t[1] { 1019 case 'A': 1020 p.op(OpBeginText) 1021 t = t[2:] 1022 break BigSwitch 1023 case 'b': 1024 p.op(OpWordBoundary) 1025 t = t[2:] 1026 break BigSwitch 1027 case 'B': 1028 p.op(OpNoWordBoundary) 1029 t = t[2:] 1030 break BigSwitch 1031 case 'C': 1032 // any byte; not supported 1033 return nil, &Error{ErrInvalidEscape, t[:2]} 1034 case 'Q': 1035 // \Q ... \E: the ... is always literals 1036 var lit string 1037 lit, t, _ = strings.Cut(t[2:], `\E`) 1038 for lit != "" { 1039 c, rest, err := nextRune(lit) 1040 if err != nil { 1041 return nil, err 1042 } 1043 p.literal(c) 1044 lit = rest 1045 } 1046 break BigSwitch 1047 case 'z': 1048 p.op(OpEndText) 1049 t = t[2:] 1050 break BigSwitch 1051 } 1052 } 1053 1054 re := p.newRegexp(OpCharClass) 1055 re.Flags = p.flags 1056 1057 // Look for Unicode character group like \p{Han} 1058 if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') { 1059 r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0]) 1060 if err != nil { 1061 return nil, err 1062 } 1063 if r != nil { 1064 re.Rune = r 1065 t = rest 1066 p.push(re) 1067 break BigSwitch 1068 } 1069 } 1070 1071 // Perl character class escape. 1072 if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil { 1073 re.Rune = r 1074 t = rest 1075 p.push(re) 1076 break BigSwitch 1077 } 1078 p.reuse(re) 1079 1080 // Ordinary single-character escape. 
1081 if c, t, err = p.parseEscape(t); err != nil { 1082 return nil, err 1083 } 1084 p.literal(c) 1085 } 1086 lastRepeat = repeat 1087 } 1088 1089 p.concat() 1090 if p.swapVerticalBar() { 1091 // pop vertical bar 1092 p.stack = p.stack[:len(p.stack)-1] 1093 } 1094 p.alternate() 1095 1096 n := len(p.stack) 1097 if n != 1 { 1098 return nil, &Error{ErrMissingParen, s} 1099 } 1100 return p.stack[0], nil 1101 } 1102 1103 // parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}. 1104 // If s is not of that form, it returns ok == false. 1105 // If s has the right form but the values are too big, it returns min == -1, ok == true. 1106 func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) { 1107 if s == "" || s[0] != '{' { 1108 return 1109 } 1110 s = s[1:] 1111 var ok1 bool 1112 if min, s, ok1 = p.parseInt(s); !ok1 { 1113 return 1114 } 1115 if s == "" { 1116 return 1117 } 1118 if s[0] != ',' { 1119 max = min 1120 } else { 1121 s = s[1:] 1122 if s == "" { 1123 return 1124 } 1125 if s[0] == '}' { 1126 max = -1 1127 } else if max, s, ok1 = p.parseInt(s); !ok1 { 1128 return 1129 } else if max < 0 { 1130 // parseInt found too big a number 1131 min = -1 1132 } 1133 } 1134 if s == "" || s[0] != '}' { 1135 return 1136 } 1137 rest = s[1:] 1138 ok = true 1139 return 1140 } 1141 1142 // parsePerlFlags parses a Perl flag setting or non-capturing group or both, 1143 // like (?i) or (?: or (?i:. It removes the prefix from s and updates the parse state. 1144 // The caller must have ensured that s begins with "(?". 1145 func (p *parser) parsePerlFlags(s string) (rest string, err error) { 1146 t := s 1147 1148 // Check for named captures, first introduced in Python's regexp library. 
1149 // As usual, there are three slightly different syntaxes: 1150 // 1151 // (?P<name>expr) the original, introduced by Python 1152 // (?<name>expr) the .NET alteration, adopted by Perl 5.10 1153 // (?'name'expr) another .NET alteration, adopted by Perl 5.10 1154 // 1155 // Perl 5.10 gave in and implemented the Python version too, 1156 // but they claim that the last two are the preferred forms. 1157 // PCRE and languages based on it (specifically, PHP and Ruby) 1158 // support all three as well. EcmaScript 4 uses only the Python form. 1159 // 1160 // In both the open source world (via Code Search) and the 1161 // Google source tree, (?P<expr>name) is the dominant form, 1162 // so that's the one we implement. One is enough. 1163 if len(t) > 4 && t[2] == 'P' && t[3] == '<' { 1164 // Pull out name. 1165 end := strings.IndexRune(t, '>') 1166 if end < 0 { 1167 if err = checkUTF8(t); err != nil { 1168 return "", err 1169 } 1170 return "", &Error{ErrInvalidNamedCapture, s} 1171 } 1172 1173 capture := t[:end+1] // "(?P<name>" 1174 name := t[4:end] // "name" 1175 if err = checkUTF8(name); err != nil { 1176 return "", err 1177 } 1178 if !isValidCaptureName(name) { 1179 return "", &Error{ErrInvalidNamedCapture, capture} 1180 } 1181 1182 // Like ordinary capture, but named. 1183 p.numCap++ 1184 re := p.op(opLeftParen) 1185 re.Cap = p.numCap 1186 re.Name = name 1187 return t[end+1:], nil 1188 } 1189 1190 // Non-capturing group. Might also twiddle Perl flags. 1191 var c rune 1192 t = t[2:] // skip (? 1193 flags := p.flags 1194 sign := +1 1195 sawFlag := false 1196 Loop: 1197 for t != "" { 1198 if c, t, err = nextRune(t); err != nil { 1199 return "", err 1200 } 1201 switch c { 1202 default: 1203 break Loop 1204 1205 // Flags. 
1206 case 'i': 1207 flags |= FoldCase 1208 sawFlag = true 1209 case 'm': 1210 flags &^= OneLine 1211 sawFlag = true 1212 case 's': 1213 flags |= DotNL 1214 sawFlag = true 1215 case 'U': 1216 flags |= NonGreedy 1217 sawFlag = true 1218 1219 // Switch to negation. 1220 case '-': 1221 if sign < 0 { 1222 break Loop 1223 } 1224 sign = -1 1225 // Invert flags so that | above turn into &^ and vice versa. 1226 // We'll invert flags again before using it below. 1227 flags = ^flags 1228 sawFlag = false 1229 1230 // End of flags, starting group or not. 1231 case ':', ')': 1232 if sign < 0 { 1233 if !sawFlag { 1234 break Loop 1235 } 1236 flags = ^flags 1237 } 1238 if c == ':' { 1239 // Open new group 1240 p.op(opLeftParen) 1241 } 1242 p.flags = flags 1243 return t, nil 1244 } 1245 } 1246 1247 return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]} 1248 } 1249 1250 // isValidCaptureName reports whether name 1251 // is a valid capture name: [A-Za-z0-9_]+. 1252 // PCRE limits names to 32 bytes. 1253 // Python rejects names starting with digits. 1254 // We don't enforce either of those. 1255 func isValidCaptureName(name string) bool { 1256 if name == "" { 1257 return false 1258 } 1259 for _, c := range name { 1260 if c != '_' && !isalnum(c) { 1261 return false 1262 } 1263 } 1264 return true 1265 } 1266 1267 // parseInt parses a decimal integer. 1268 func (p *parser) parseInt(s string) (n int, rest string, ok bool) { 1269 if s == "" || s[0] < '0' || '9' < s[0] { 1270 return 1271 } 1272 // Disallow leading zeros. 1273 if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' { 1274 return 1275 } 1276 t := s 1277 for s != "" && '0' <= s[0] && s[0] <= '9' { 1278 s = s[1:] 1279 } 1280 rest = s 1281 ok = true 1282 // Have digits, compute value. 1283 t = t[:len(t)-len(s)] 1284 for i := 0; i < len(t); i++ { 1285 // Avoid overflow. 
1286 if n >= 1e8 { 1287 n = -1 1288 break 1289 } 1290 n = n*10 + int(t[i]) - '0' 1291 } 1292 return 1293 } 1294 1295 // can this be represented as a character class? 1296 // single-rune literal string, char class, ., and .|\n. 1297 func isCharClass(re *Regexp) bool { 1298 return re.Op == OpLiteral && len(re.Rune) == 1 || 1299 re.Op == OpCharClass || 1300 re.Op == OpAnyCharNotNL || 1301 re.Op == OpAnyChar 1302 } 1303 1304 // does re match r? 1305 func matchRune(re *Regexp, r rune) bool { 1306 switch re.Op { 1307 case OpLiteral: 1308 return len(re.Rune) == 1 && re.Rune[0] == r 1309 case OpCharClass: 1310 for i := 0; i < len(re.Rune); i += 2 { 1311 if re.Rune[i] <= r && r <= re.Rune[i+1] { 1312 return true 1313 } 1314 } 1315 return false 1316 case OpAnyCharNotNL: 1317 return r != '\n' 1318 case OpAnyChar: 1319 return true 1320 } 1321 return false 1322 } 1323 1324 // parseVerticalBar handles a | in the input. 1325 func (p *parser) parseVerticalBar() error { 1326 p.concat() 1327 1328 // The concatenation we just parsed is on top of the stack. 1329 // If it sits above an opVerticalBar, swap it below 1330 // (things below an opVerticalBar become an alternation). 1331 // Otherwise, push a new vertical bar. 1332 if !p.swapVerticalBar() { 1333 p.op(opVerticalBar) 1334 } 1335 1336 return nil 1337 } 1338 1339 // mergeCharClass makes dst = dst|src. 1340 // The caller must ensure that dst.Op >= src.Op, 1341 // to reduce the amount of copying. 1342 func mergeCharClass(dst, src *Regexp) { 1343 switch dst.Op { 1344 case OpAnyChar: 1345 // src doesn't add anything. 
1346 case OpAnyCharNotNL: 1347 // src might add \n 1348 if matchRune(src, '\n') { 1349 dst.Op = OpAnyChar 1350 } 1351 case OpCharClass: 1352 // src is simpler, so either literal or char class 1353 if src.Op == OpLiteral { 1354 dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags) 1355 } else { 1356 dst.Rune = appendClass(dst.Rune, src.Rune) 1357 } 1358 case OpLiteral: 1359 // both literal 1360 if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags { 1361 break 1362 } 1363 dst.Op = OpCharClass 1364 dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags) 1365 dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags) 1366 } 1367 } 1368 1369 // If the top of the stack is an element followed by an opVerticalBar 1370 // swapVerticalBar swaps the two and returns true. 1371 // Otherwise it returns false. 1372 func (p *parser) swapVerticalBar() bool { 1373 // If above and below vertical bar are literal or char class, 1374 // can merge into a single char class. 1375 n := len(p.stack) 1376 if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) { 1377 re1 := p.stack[n-1] 1378 re3 := p.stack[n-3] 1379 // Make re3 the more complex of the two. 1380 if re1.Op > re3.Op { 1381 re1, re3 = re3, re1 1382 p.stack[n-3] = re3 1383 } 1384 mergeCharClass(re3, re1) 1385 p.reuse(re1) 1386 p.stack = p.stack[:n-1] 1387 return true 1388 } 1389 1390 if n >= 2 { 1391 re1 := p.stack[n-1] 1392 re2 := p.stack[n-2] 1393 if re2.Op == opVerticalBar { 1394 if n >= 3 { 1395 // Now out of reach. 1396 // Clean opportunistically. 1397 cleanAlt(p.stack[n-3]) 1398 } 1399 p.stack[n-2] = re1 1400 p.stack[n-1] = re2 1401 return true 1402 } 1403 } 1404 return false 1405 } 1406 1407 // parseRightParen handles a ) in the input. 
1408 func (p *parser) parseRightParen() error { 1409 p.concat() 1410 if p.swapVerticalBar() { 1411 // pop vertical bar 1412 p.stack = p.stack[:len(p.stack)-1] 1413 } 1414 p.alternate() 1415 1416 n := len(p.stack) 1417 if n < 2 { 1418 return &Error{ErrUnexpectedParen, p.wholeRegexp} 1419 } 1420 re1 := p.stack[n-1] 1421 re2 := p.stack[n-2] 1422 p.stack = p.stack[:n-2] 1423 if re2.Op != opLeftParen { 1424 return &Error{ErrUnexpectedParen, p.wholeRegexp} 1425 } 1426 // Restore flags at time of paren. 1427 p.flags = re2.Flags 1428 if re2.Cap == 0 { 1429 // Just for grouping. 1430 p.push(re1) 1431 } else { 1432 re2.Op = OpCapture 1433 re2.Sub = re2.Sub0[:1] 1434 re2.Sub[0] = re1 1435 p.push(re2) 1436 } 1437 return nil 1438 } 1439 1440 // parseEscape parses an escape sequence at the beginning of s 1441 // and returns the rune. 1442 func (p *parser) parseEscape(s string) (r rune, rest string, err error) { 1443 t := s[1:] 1444 if t == "" { 1445 return 0, "", &Error{ErrTrailingBackslash, ""} 1446 } 1447 c, t, err := nextRune(t) 1448 if err != nil { 1449 return 0, "", err 1450 } 1451 1452 Switch: 1453 switch c { 1454 default: 1455 if c < utf8.RuneSelf && !isalnum(c) { 1456 // Escaped non-word characters are always themselves. 1457 // PCRE is not quite so rigorous: it accepts things like 1458 // \q, but we don't. We once rejected \_, but too many 1459 // programs and people insist on using it, so allow \_. 1460 return c, t, nil 1461 } 1462 1463 // Octal escapes. 1464 case '1', '2', '3', '4', '5', '6', '7': 1465 // Single non-zero digit is a backreference; not supported 1466 if t == "" || t[0] < '0' || t[0] > '7' { 1467 break 1468 } 1469 fallthrough 1470 case '0': 1471 // Consume up to three octal digits; already have one. 1472 r = c - '0' 1473 for i := 1; i < 3; i++ { 1474 if t == "" || t[0] < '0' || t[0] > '7' { 1475 break 1476 } 1477 r = r*8 + rune(t[0]) - '0' 1478 t = t[1:] 1479 } 1480 return r, t, nil 1481 1482 // Hexadecimal escapes. 
1483 case 'x': 1484 if t == "" { 1485 break 1486 } 1487 if c, t, err = nextRune(t); err != nil { 1488 return 0, "", err 1489 } 1490 if c == '{' { 1491 // Any number of digits in braces. 1492 // Perl accepts any text at all; it ignores all text 1493 // after the first non-hex digit. We require only hex digits, 1494 // and at least one. 1495 nhex := 0 1496 r = 0 1497 for { 1498 if t == "" { 1499 break Switch 1500 } 1501 if c, t, err = nextRune(t); err != nil { 1502 return 0, "", err 1503 } 1504 if c == '}' { 1505 break 1506 } 1507 v := unhex(c) 1508 if v < 0 { 1509 break Switch 1510 } 1511 r = r*16 + v 1512 if r > unicode.MaxRune { 1513 break Switch 1514 } 1515 nhex++ 1516 } 1517 if nhex == 0 { 1518 break Switch 1519 } 1520 return r, t, nil 1521 } 1522 1523 // Easy case: two hex digits. 1524 x := unhex(c) 1525 if c, t, err = nextRune(t); err != nil { 1526 return 0, "", err 1527 } 1528 y := unhex(c) 1529 if x < 0 || y < 0 { 1530 break 1531 } 1532 return x*16 + y, t, nil 1533 1534 // C escapes. There is no case 'b', to avoid misparsing 1535 // the Perl word-boundary \b as the C backspace \b 1536 // when in POSIX mode. In Perl, /\b/ means word-boundary 1537 // but /[\b]/ means backspace. We don't support that. 1538 // If you want a backspace, embed a literal backspace 1539 // character or use \x08. 1540 case 'a': 1541 return '\a', t, err 1542 case 'f': 1543 return '\f', t, err 1544 case 'n': 1545 return '\n', t, err 1546 case 'r': 1547 return '\r', t, err 1548 case 't': 1549 return '\t', t, err 1550 case 'v': 1551 return '\v', t, err 1552 } 1553 return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]} 1554 } 1555 1556 // parseClassChar parses a character class character at the beginning of s 1557 // and returns it. 
1558 func (p *parser) parseClassChar(s, wholeClass string) (r rune, rest string, err error) { 1559 if s == "" { 1560 return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass} 1561 } 1562 1563 // Allow regular escape sequences even though 1564 // many need not be escaped in this context. 1565 if s[0] == '\\' { 1566 return p.parseEscape(s) 1567 } 1568 1569 return nextRune(s) 1570 } 1571 1572 type charGroup struct { 1573 sign int 1574 class []rune 1575 } 1576 1577 // parsePerlClassEscape parses a leading Perl character class escape like \d 1578 // from the beginning of s. If one is present, it appends the characters to r 1579 // and returns the new slice r and the remainder of the string. 1580 func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string) { 1581 if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' { 1582 return 1583 } 1584 g := perlGroup[s[0:2]] 1585 if g.sign == 0 { 1586 return 1587 } 1588 return p.appendGroup(r, g), s[2:] 1589 } 1590 1591 // parseNamedClass parses a leading POSIX named character class like [:alnum:] 1592 // from the beginning of s. If one is present, it appends the characters to r 1593 // and returns the new slice r and the remainder of the string. 
1594 func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error) { 1595 if len(s) < 2 || s[0] != '[' || s[1] != ':' { 1596 return 1597 } 1598 1599 i := strings.Index(s[2:], ":]") 1600 if i < 0 { 1601 return 1602 } 1603 i += 2 1604 name, s := s[0:i+2], s[i+2:] 1605 g := posixGroup[name] 1606 if g.sign == 0 { 1607 return nil, "", &Error{ErrInvalidCharRange, name} 1608 } 1609 return p.appendGroup(r, g), s, nil 1610 } 1611 1612 func (p *parser) appendGroup(r []rune, g charGroup) []rune { 1613 if p.flags&FoldCase == 0 { 1614 if g.sign < 0 { 1615 r = appendNegatedClass(r, g.class) 1616 } else { 1617 r = appendClass(r, g.class) 1618 } 1619 } else { 1620 tmp := p.tmpClass[:0] 1621 tmp = appendFoldedClass(tmp, g.class) 1622 p.tmpClass = tmp 1623 tmp = cleanClass(&p.tmpClass) 1624 if g.sign < 0 { 1625 r = appendNegatedClass(r, tmp) 1626 } else { 1627 r = appendClass(r, tmp) 1628 } 1629 } 1630 return r 1631 } 1632 1633 var anyTable = &unicode.RangeTable{ 1634 R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}}, 1635 R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}}, 1636 } 1637 1638 // unicodeTable returns the unicode.RangeTable identified by name 1639 // and the table of additional fold-equivalent code points. 1640 func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) { 1641 // Special case: "Any" means any. 1642 if name == "Any" { 1643 return anyTable, anyTable 1644 } 1645 if t := unicode.Categories[name]; t != nil { 1646 return t, unicode.FoldCategory[name] 1647 } 1648 if t := unicode.Scripts[name]; t != nil { 1649 return t, unicode.FoldScript[name] 1650 } 1651 return nil, nil 1652 } 1653 1654 // parseUnicodeClass parses a leading Unicode character class like \p{Han} 1655 // from the beginning of s. If one is present, it appends the characters to r 1656 // and returns the new slice r and the remainder of the string. 
1657 func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error) { 1658 if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' { 1659 return 1660 } 1661 1662 // Committed to parse or return error. 1663 sign := +1 1664 if s[1] == 'P' { 1665 sign = -1 1666 } 1667 t := s[2:] 1668 c, t, err := nextRune(t) 1669 if err != nil { 1670 return 1671 } 1672 var seq, name string 1673 if c != '{' { 1674 // Single-letter name. 1675 seq = s[:len(s)-len(t)] 1676 name = seq[2:] 1677 } else { 1678 // Name is in braces. 1679 end := strings.IndexRune(s, '}') 1680 if end < 0 { 1681 if err = checkUTF8(s); err != nil { 1682 return 1683 } 1684 return nil, "", &Error{ErrInvalidCharRange, s} 1685 } 1686 seq, t = s[:end+1], s[end+1:] 1687 name = s[3:end] 1688 if err = checkUTF8(name); err != nil { 1689 return 1690 } 1691 } 1692 1693 // Group can have leading negation too. \p{^Han} == \P{Han}, \P{^Han} == \p{Han}. 1694 if name != "" && name[0] == '^' { 1695 sign = -sign 1696 name = name[1:] 1697 } 1698 1699 tab, fold := unicodeTable(name) 1700 if tab == nil { 1701 return nil, "", &Error{ErrInvalidCharRange, seq} 1702 } 1703 1704 if p.flags&FoldCase == 0 || fold == nil { 1705 if sign > 0 { 1706 r = appendTable(r, tab) 1707 } else { 1708 r = appendNegatedTable(r, tab) 1709 } 1710 } else { 1711 // Merge and clean tab and fold in a temporary buffer. 1712 // This is necessary for the negative case and just tidy 1713 // for the positive case. 1714 tmp := p.tmpClass[:0] 1715 tmp = appendTable(tmp, tab) 1716 tmp = appendTable(tmp, fold) 1717 p.tmpClass = tmp 1718 tmp = cleanClass(&p.tmpClass) 1719 if sign > 0 { 1720 r = appendClass(r, tmp) 1721 } else { 1722 r = appendNegatedClass(r, tmp) 1723 } 1724 } 1725 return r, t, nil 1726 } 1727 1728 // parseClass parses a character class at the beginning of s 1729 // and pushes it onto the parse stack. 
1730 func (p *parser) parseClass(s string) (rest string, err error) { 1731 t := s[1:] // chop [ 1732 re := p.newRegexp(OpCharClass) 1733 re.Flags = p.flags 1734 re.Rune = re.Rune0[:0] 1735 1736 sign := +1 1737 if t != "" && t[0] == '^' { 1738 sign = -1 1739 t = t[1:] 1740 1741 // If character class does not match \n, add it here, 1742 // so that negation later will do the right thing. 1743 if p.flags&ClassNL == 0 { 1744 re.Rune = append(re.Rune, '\n', '\n') 1745 } 1746 } 1747 1748 class := re.Rune 1749 first := true // ] and - are okay as first char in class 1750 for t == "" || t[0] != ']' || first { 1751 // POSIX: - is only okay unescaped as first or last in class. 1752 // Perl: - is okay anywhere. 1753 if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') { 1754 _, size := utf8.DecodeRuneInString(t[1:]) 1755 return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]} 1756 } 1757 first = false 1758 1759 // Look for POSIX [:alnum:] etc. 1760 if len(t) > 2 && t[0] == '[' && t[1] == ':' { 1761 nclass, nt, err := p.parseNamedClass(t, class) 1762 if err != nil { 1763 return "", err 1764 } 1765 if nclass != nil { 1766 class, t = nclass, nt 1767 continue 1768 } 1769 } 1770 1771 // Look for Unicode character group like \p{Han}. 1772 nclass, nt, err := p.parseUnicodeClass(t, class) 1773 if err != nil { 1774 return "", err 1775 } 1776 if nclass != nil { 1777 class, t = nclass, nt 1778 continue 1779 } 1780 1781 // Look for Perl character class symbols (extension). 1782 if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil { 1783 class, t = nclass, nt 1784 continue 1785 } 1786 1787 // Single character or simple range. 1788 rng := t 1789 var lo, hi rune 1790 if lo, t, err = p.parseClassChar(t, s); err != nil { 1791 return "", err 1792 } 1793 hi = lo 1794 // [a-] means (a|-) so check for final ]. 
1795 if len(t) >= 2 && t[0] == '-' && t[1] != ']' { 1796 t = t[1:] 1797 if hi, t, err = p.parseClassChar(t, s); err != nil { 1798 return "", err 1799 } 1800 if hi < lo { 1801 rng = rng[:len(rng)-len(t)] 1802 return "", &Error{Code: ErrInvalidCharRange, Expr: rng} 1803 } 1804 } 1805 if p.flags&FoldCase == 0 { 1806 class = appendRange(class, lo, hi) 1807 } else { 1808 class = appendFoldedRange(class, lo, hi) 1809 } 1810 } 1811 t = t[1:] // chop ] 1812 1813 // Use &re.Rune instead of &class to avoid allocation. 1814 re.Rune = class 1815 class = cleanClass(&re.Rune) 1816 if sign < 0 { 1817 class = negateClass(class) 1818 } 1819 re.Rune = class 1820 p.push(re) 1821 return t, nil 1822 } 1823 1824 // cleanClass sorts the ranges (pairs of elements of r), 1825 // merges them, and eliminates duplicates. 1826 func cleanClass(rp *[]rune) []rune { 1827 1828 // Sort by lo increasing, hi decreasing to break ties. 1829 sort.Sort(ranges{rp}) 1830 1831 r := *rp 1832 if len(r) < 2 { 1833 return r 1834 } 1835 1836 // Merge abutting, overlapping. 1837 w := 2 // write index 1838 for i := 2; i < len(r); i += 2 { 1839 lo, hi := r[i], r[i+1] 1840 if lo <= r[w-1]+1 { 1841 // merge with previous range 1842 if hi > r[w-1] { 1843 r[w-1] = hi 1844 } 1845 continue 1846 } 1847 // new disjoint range 1848 r[w] = lo 1849 r[w+1] = hi 1850 w += 2 1851 } 1852 1853 return r[:w] 1854 } 1855 1856 // appendLiteral returns the result of appending the literal x to the class r. 1857 func appendLiteral(r []rune, x rune, flags Flags) []rune { 1858 if flags&FoldCase != 0 { 1859 return appendFoldedRange(r, x, x) 1860 } 1861 return appendRange(r, x, x) 1862 } 1863 1864 // appendRange returns the result of appending the range lo-hi to the class r. 1865 func appendRange(r []rune, lo, hi rune) []rune { 1866 // Expand last range or next to last range if it overlaps or abuts. 
1867 // Checking two ranges helps when appending case-folded 1868 // alphabets, so that one range can be expanding A-Z and the 1869 // other expanding a-z. 1870 n := len(r) 1871 for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4 1872 if n >= i { 1873 rlo, rhi := r[n-i], r[n-i+1] 1874 if lo <= rhi+1 && rlo <= hi+1 { 1875 if lo < rlo { 1876 r[n-i] = lo 1877 } 1878 if hi > rhi { 1879 r[n-i+1] = hi 1880 } 1881 return r 1882 } 1883 } 1884 } 1885 1886 return append(r, lo, hi) 1887 } 1888 1889 const ( 1890 // minimum and maximum runes involved in folding. 1891 // checked during test. 1892 minFold = 0x0041 1893 maxFold = 0x1e943 1894 ) 1895 1896 // appendFoldedRange returns the result of appending the range lo-hi 1897 // and its case folding-equivalent runes to the class r. 1898 func appendFoldedRange(r []rune, lo, hi rune) []rune { 1899 // Optimizations. 1900 if lo <= minFold && hi >= maxFold { 1901 // Range is full: folding can't add more. 1902 return appendRange(r, lo, hi) 1903 } 1904 if hi < minFold || lo > maxFold { 1905 // Range is outside folding possibilities. 1906 return appendRange(r, lo, hi) 1907 } 1908 if lo < minFold { 1909 // [lo, minFold-1] needs no folding. 1910 r = appendRange(r, lo, minFold-1) 1911 lo = minFold 1912 } 1913 if hi > maxFold { 1914 // [maxFold+1, hi] needs no folding. 1915 r = appendRange(r, maxFold+1, hi) 1916 hi = maxFold 1917 } 1918 1919 // Brute force. Depend on appendRange to coalesce ranges on the fly. 1920 for c := lo; c <= hi; c++ { 1921 r = appendRange(r, c, c) 1922 f := unicode.SimpleFold(c) 1923 for f != c { 1924 r = appendRange(r, f, f) 1925 f = unicode.SimpleFold(f) 1926 } 1927 } 1928 return r 1929 } 1930 1931 // appendClass returns the result of appending the class x to the class r. 1932 // It assume x is clean. 
1933 func appendClass(r []rune, x []rune) []rune { 1934 for i := 0; i < len(x); i += 2 { 1935 r = appendRange(r, x[i], x[i+1]) 1936 } 1937 return r 1938 } 1939 1940 // appendFoldedClass returns the result of appending the case folding of the class x to the class r. 1941 func appendFoldedClass(r []rune, x []rune) []rune { 1942 for i := 0; i < len(x); i += 2 { 1943 r = appendFoldedRange(r, x[i], x[i+1]) 1944 } 1945 return r 1946 } 1947 1948 // appendNegatedClass returns the result of appending the negation of the class x to the class r. 1949 // It assumes x is clean. 1950 func appendNegatedClass(r []rune, x []rune) []rune { 1951 nextLo := '\u0000' 1952 for i := 0; i < len(x); i += 2 { 1953 lo, hi := x[i], x[i+1] 1954 if nextLo <= lo-1 { 1955 r = appendRange(r, nextLo, lo-1) 1956 } 1957 nextLo = hi + 1 1958 } 1959 if nextLo <= unicode.MaxRune { 1960 r = appendRange(r, nextLo, unicode.MaxRune) 1961 } 1962 return r 1963 } 1964 1965 // appendTable returns the result of appending x to the class r. 1966 func appendTable(r []rune, x *unicode.RangeTable) []rune { 1967 for _, xr := range x.R16 { 1968 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) 1969 if stride == 1 { 1970 r = appendRange(r, lo, hi) 1971 continue 1972 } 1973 for c := lo; c <= hi; c += stride { 1974 r = appendRange(r, c, c) 1975 } 1976 } 1977 for _, xr := range x.R32 { 1978 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) 1979 if stride == 1 { 1980 r = appendRange(r, lo, hi) 1981 continue 1982 } 1983 for c := lo; c <= hi; c += stride { 1984 r = appendRange(r, c, c) 1985 } 1986 } 1987 return r 1988 } 1989 1990 // appendNegatedTable returns the result of appending the negation of x to the class r. 
1991 func appendNegatedTable(r []rune, x *unicode.RangeTable) []rune { 1992 nextLo := '\u0000' // lo end of next class to add 1993 for _, xr := range x.R16 { 1994 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) 1995 if stride == 1 { 1996 if nextLo <= lo-1 { 1997 r = appendRange(r, nextLo, lo-1) 1998 } 1999 nextLo = hi + 1 2000 continue 2001 } 2002 for c := lo; c <= hi; c += stride { 2003 if nextLo <= c-1 { 2004 r = appendRange(r, nextLo, c-1) 2005 } 2006 nextLo = c + 1 2007 } 2008 } 2009 for _, xr := range x.R32 { 2010 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) 2011 if stride == 1 { 2012 if nextLo <= lo-1 { 2013 r = appendRange(r, nextLo, lo-1) 2014 } 2015 nextLo = hi + 1 2016 continue 2017 } 2018 for c := lo; c <= hi; c += stride { 2019 if nextLo <= c-1 { 2020 r = appendRange(r, nextLo, c-1) 2021 } 2022 nextLo = c + 1 2023 } 2024 } 2025 if nextLo <= unicode.MaxRune { 2026 r = appendRange(r, nextLo, unicode.MaxRune) 2027 } 2028 return r 2029 } 2030 2031 // negateClass overwrites r and returns r's negation. 2032 // It assumes the class r is already clean. 2033 func negateClass(r []rune) []rune { 2034 nextLo := '\u0000' // lo end of next class to add 2035 w := 0 // write index 2036 for i := 0; i < len(r); i += 2 { 2037 lo, hi := r[i], r[i+1] 2038 if nextLo <= lo-1 { 2039 r[w] = nextLo 2040 r[w+1] = lo - 1 2041 w += 2 2042 } 2043 nextLo = hi + 1 2044 } 2045 r = r[:w] 2046 if nextLo <= unicode.MaxRune { 2047 // It's possible for the negation to have one more 2048 // range - this one - than the original class, so use append. 2049 r = append(r, nextLo, unicode.MaxRune) 2050 } 2051 return r 2052 } 2053 2054 // ranges implements sort.Interface on a []rune. 2055 // The choice of receiver type definition is strange 2056 // but avoids an allocation since we already have 2057 // a *[]rune. 
2058 type ranges struct { 2059 p *[]rune 2060 } 2061 2062 func (ra ranges) Less(i, j int) bool { 2063 p := *ra.p 2064 i *= 2 2065 j *= 2 2066 return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1] 2067 } 2068 2069 func (ra ranges) Len() int { 2070 return len(*ra.p) / 2 2071 } 2072 2073 func (ra ranges) Swap(i, j int) { 2074 p := *ra.p 2075 i *= 2 2076 j *= 2 2077 p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1] 2078 } 2079 2080 func checkUTF8(s string) error { 2081 for s != "" { 2082 rune, size := utf8.DecodeRuneInString(s) 2083 if rune == utf8.RuneError && size == 1 { 2084 return &Error{Code: ErrInvalidUTF8, Expr: s} 2085 } 2086 s = s[size:] 2087 } 2088 return nil 2089 } 2090 2091 func nextRune(s string) (c rune, t string, err error) { 2092 c, size := utf8.DecodeRuneInString(s) 2093 if c == utf8.RuneError && size == 1 { 2094 return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s} 2095 } 2096 return c, s[size:], nil 2097 } 2098 2099 func isalnum(c rune) bool { 2100 return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' 2101 } 2102 2103 func unhex(c rune) rune { 2104 if '0' <= c && c <= '9' { 2105 return c - '0' 2106 } 2107 if 'a' <= c && c <= 'f' { 2108 return c - 'a' + 10 2109 } 2110 if 'A' <= c && c <= 'F' { 2111 return c - 'A' + 10 2112 } 2113 return -1 2114 }