// github.com/twelsh-aw/go/src@v0.0.0-20230516233729-a56fe86a7c81/regexp/syntax/parse.go (about)

// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package syntax

import (
	"sort"
	"strings"
	"unicode"
	"unicode/utf8"
)

// An Error describes a failure to parse a regular expression
// and gives the offending expression.
type Error struct {
	Code ErrorCode // which kind of failure occurred
	Expr string    // the offending expression text
}

// Error implements the error interface. The message has the form
//
//	error parsing regexp: <code>: `<expr>`
func (e *Error) Error() string {
	return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`"
}

// An ErrorCode describes a failure to parse a regular expression.
// Its underlying string is the human-readable message for the failure.
type ErrorCode string

const (
	// Unexpected error
	ErrInternalError ErrorCode = "regexp/syntax: internal error"

	// Parse errors
	ErrInvalidCharClass      ErrorCode = "invalid character class"
	ErrInvalidCharRange      ErrorCode = "invalid character class range"
	ErrInvalidEscape         ErrorCode = "invalid escape sequence"
	ErrInvalidNamedCapture   ErrorCode = "invalid named capture"
	ErrInvalidPerlOp         ErrorCode = "invalid or unsupported Perl syntax"
	ErrInvalidRepeatOp       ErrorCode = "invalid nested repetition operator"
	ErrInvalidRepeatSize     ErrorCode = "invalid repeat count"
	ErrInvalidUTF8           ErrorCode = "invalid UTF-8"
	ErrMissingBracket        ErrorCode = "missing closing ]"
	ErrMissingParen          ErrorCode = "missing closing )"
	ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator"
	ErrTrailingBackslash     ErrorCode = "trailing backslash at end of expression"
	ErrUnexpectedParen       ErrorCode = "unexpected )"
	ErrNestingDepth          ErrorCode = "expression nests too deeply"
	ErrLarge                 ErrorCode = "expression too large"
)

// String returns the message text of the error code.
func (e ErrorCode) String() string {
	return string(e)
}

// Flags control the behavior of the parser and record information about regexp context.
type Flags uint16

// Each named flag below occupies its own bit; the bit position follows
// declaration order (1 << iota), so the order must not be changed.
const (
	FoldCase      Flags = 1 << iota // case-insensitive match
	Literal                         // treat pattern as literal string
	ClassNL                         // allow character classes like [^a-z] and [[:space:]] to match newline
	DotNL                           // allow . to match newline
	OneLine                         // treat ^ and $ as only matching at beginning and end of text
	NonGreedy                       // make repetition operators default to non-greedy
	PerlX                           // allow Perl extensions
	UnicodeGroups                   // allow \p{Han}, \P{Han} for Unicode group and negation
	WasDollar                       // regexp OpEndText was $, not \z
	Simple                          // regexp contains no counted repetition

	// MatchNL lets both character classes and . match newline.
	MatchNL = ClassNL | DotNL

	Perl        = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible
	POSIX Flags = 0                                         // POSIX syntax
)

// Pseudo-ops for parsing stack.
// They are numbered from opPseudo upward, so "Op >= opPseudo" identifies
// a stack marker rather than a real expression node.
const (
	opLeftParen = opPseudo + iota
	opVerticalBar
)

// maxHeight is the maximum height of a regexp parse tree.
// It is somewhat arbitrarily chosen, but the idea is to be large enough
// that no one will actually hit in real use but at the same time small enough
// that recursion on the Regexp tree will not hit the 1GB Go stack limit.
// The maximum amount of stack for a single recursive frame is probably
// closer to 1kB, so this could potentially be raised, but it seems unlikely
// that people have regexps nested even this deeply.
// We ran a test on Google's C++ code base and turned up only
// a single use case with depth > 100; it had depth 128.
// Using depth 1000 should be plenty of margin.
// As an optimization, we don't even bother calculating heights
// until we've allocated at least maxHeight Regexp structures.
const maxHeight = 1000

// maxSize is the maximum size of a compiled regexp in Insts.
// It too is somewhat arbitrarily chosen, but the idea is to be large enough
// to allow significant regexps while at the same time small enough that
// the compiled form will not take up too much memory.
// 128 MB is enough for 3.3 million Inst structures, which roughly
// corresponds to a 3.3 MB regexp.
const (
	maxSize  = 128 << 20 / instSize
	instSize = 5 * 8 // byte, 2 uint32, slice is 5 64-bit words
)

// maxRunes is the maximum number of runes allowed in a regexp tree
// counting the runes in all the nodes.
// Ignoring character classes p.numRunes is always less than the length of the regexp.
// Character classes can make it much larger: each \pL adds 1292 runes.
// 128 MB is enough for 32M runes, which is over 26k \pL instances.
// Note that repetitions do not make copies of the rune slices,
// so \pL{1000} is only one rune slice, not 1000.
// We could keep a cache of character classes we've seen,
// so that all the \pL we see use the same rune list,
// but that doesn't remove the problem entirely:
// consider something like [\pL01234][\pL01235][\pL01236]...[\pL^&*()].
// And because the Rune slice is exposed directly in the Regexp,
// there is not an opportunity to change the representation to allow
// partial sharing between different character classes.
// So the limit is the best we can do.
121 const ( 122 maxRunes = 128 << 20 / runeSize 123 runeSize = 4 // rune is int32 124 ) 125 126 type parser struct { 127 flags Flags // parse mode flags 128 stack []*Regexp // stack of parsed expressions 129 free *Regexp 130 numCap int // number of capturing groups seen 131 wholeRegexp string 132 tmpClass []rune // temporary char class work space 133 numRegexp int // number of regexps allocated 134 numRunes int // number of runes in char classes 135 repeats int64 // product of all repetitions seen 136 height map[*Regexp]int // regexp height, for height limit check 137 size map[*Regexp]int64 // regexp compiled size, for size limit check 138 } 139 140 func (p *parser) newRegexp(op Op) *Regexp { 141 re := p.free 142 if re != nil { 143 p.free = re.Sub0[0] 144 *re = Regexp{} 145 } else { 146 re = new(Regexp) 147 p.numRegexp++ 148 } 149 re.Op = op 150 return re 151 } 152 153 func (p *parser) reuse(re *Regexp) { 154 if p.height != nil { 155 delete(p.height, re) 156 } 157 re.Sub0[0] = p.free 158 p.free = re 159 } 160 161 func (p *parser) checkLimits(re *Regexp) { 162 if p.numRunes > maxRunes { 163 panic(ErrLarge) 164 } 165 p.checkSize(re) 166 p.checkHeight(re) 167 } 168 169 func (p *parser) checkSize(re *Regexp) { 170 if p.size == nil { 171 // We haven't started tracking size yet. 172 // Do a relatively cheap check to see if we need to start. 173 // Maintain the product of all the repeats we've seen 174 // and don't track if the total number of regexp nodes 175 // we've seen times the repeat product is in budget. 176 if p.repeats == 0 { 177 p.repeats = 1 178 } 179 if re.Op == OpRepeat { 180 n := re.Max 181 if n == -1 { 182 n = re.Min 183 } 184 if n <= 0 { 185 n = 1 186 } 187 if int64(n) > maxSize/p.repeats { 188 p.repeats = maxSize 189 } else { 190 p.repeats *= int64(n) 191 } 192 } 193 if int64(p.numRegexp) < maxSize/p.repeats { 194 return 195 } 196 197 // We need to start tracking size. 
198 // Make the map and belatedly populate it 199 // with info about everything we've constructed so far. 200 p.size = make(map[*Regexp]int64) 201 for _, re := range p.stack { 202 p.checkSize(re) 203 } 204 } 205 206 if p.calcSize(re, true) > maxSize { 207 panic(ErrLarge) 208 } 209 } 210 211 func (p *parser) calcSize(re *Regexp, force bool) int64 { 212 if !force { 213 if size, ok := p.size[re]; ok { 214 return size 215 } 216 } 217 218 var size int64 219 switch re.Op { 220 case OpLiteral: 221 size = int64(len(re.Rune)) 222 case OpCapture, OpStar: 223 // star can be 1+ or 2+; assume 2 pessimistically 224 size = 2 + p.calcSize(re.Sub[0], false) 225 case OpPlus, OpQuest: 226 size = 1 + p.calcSize(re.Sub[0], false) 227 case OpConcat: 228 for _, sub := range re.Sub { 229 size += p.calcSize(sub, false) 230 } 231 case OpAlternate: 232 for _, sub := range re.Sub { 233 size += p.calcSize(sub, false) 234 } 235 if len(re.Sub) > 1 { 236 size += int64(len(re.Sub)) - 1 237 } 238 case OpRepeat: 239 sub := p.calcSize(re.Sub[0], false) 240 if re.Max == -1 { 241 if re.Min == 0 { 242 size = 2 + sub // x* 243 } else { 244 size = 1 + int64(re.Min)*sub // xxx+ 245 } 246 break 247 } 248 // x{2,5} = xx(x(x(x)?)?)? 
249 size = int64(re.Max)*sub + int64(re.Max-re.Min) 250 } 251 252 if size < 1 { 253 size = 1 254 } 255 p.size[re] = size 256 return size 257 } 258 259 func (p *parser) checkHeight(re *Regexp) { 260 if p.numRegexp < maxHeight { 261 return 262 } 263 if p.height == nil { 264 p.height = make(map[*Regexp]int) 265 for _, re := range p.stack { 266 p.checkHeight(re) 267 } 268 } 269 if p.calcHeight(re, true) > maxHeight { 270 panic(ErrNestingDepth) 271 } 272 } 273 274 func (p *parser) calcHeight(re *Regexp, force bool) int { 275 if !force { 276 if h, ok := p.height[re]; ok { 277 return h 278 } 279 } 280 h := 1 281 for _, sub := range re.Sub { 282 hsub := p.calcHeight(sub, false) 283 if h < 1+hsub { 284 h = 1 + hsub 285 } 286 } 287 p.height[re] = h 288 return h 289 } 290 291 // Parse stack manipulation. 292 293 // push pushes the regexp re onto the parse stack and returns the regexp. 294 func (p *parser) push(re *Regexp) *Regexp { 295 p.numRunes += len(re.Rune) 296 if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] { 297 // Single rune. 298 if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) { 299 return nil 300 } 301 re.Op = OpLiteral 302 re.Rune = re.Rune[:1] 303 re.Flags = p.flags &^ FoldCase 304 } else if re.Op == OpCharClass && len(re.Rune) == 4 && 305 re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] && 306 unicode.SimpleFold(re.Rune[0]) == re.Rune[2] && 307 unicode.SimpleFold(re.Rune[2]) == re.Rune[0] || 308 re.Op == OpCharClass && len(re.Rune) == 2 && 309 re.Rune[0]+1 == re.Rune[1] && 310 unicode.SimpleFold(re.Rune[0]) == re.Rune[1] && 311 unicode.SimpleFold(re.Rune[1]) == re.Rune[0] { 312 // Case-insensitive rune like [Aa] or [Δδ]. 313 if p.maybeConcat(re.Rune[0], p.flags|FoldCase) { 314 return nil 315 } 316 317 // Rewrite as (case-insensitive) literal. 318 re.Op = OpLiteral 319 re.Rune = re.Rune[:1] 320 re.Flags = p.flags | FoldCase 321 } else { 322 // Incremental concatenation. 
323 p.maybeConcat(-1, 0) 324 } 325 326 p.stack = append(p.stack, re) 327 p.checkLimits(re) 328 return re 329 } 330 331 // maybeConcat implements incremental concatenation 332 // of literal runes into string nodes. The parser calls this 333 // before each push, so only the top fragment of the stack 334 // might need processing. Since this is called before a push, 335 // the topmost literal is no longer subject to operators like * 336 // (Otherwise ab* would turn into (ab)*.) 337 // If r >= 0 and there's a node left over, maybeConcat uses it 338 // to push r with the given flags. 339 // maybeConcat reports whether r was pushed. 340 func (p *parser) maybeConcat(r rune, flags Flags) bool { 341 n := len(p.stack) 342 if n < 2 { 343 return false 344 } 345 346 re1 := p.stack[n-1] 347 re2 := p.stack[n-2] 348 if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase { 349 return false 350 } 351 352 // Push re1 into re2. 353 re2.Rune = append(re2.Rune, re1.Rune...) 354 355 // Reuse re1 if possible. 356 if r >= 0 { 357 re1.Rune = re1.Rune0[:1] 358 re1.Rune[0] = r 359 re1.Flags = flags 360 return true 361 } 362 363 p.stack = p.stack[:n-1] 364 p.reuse(re1) 365 return false // did not push r 366 } 367 368 // literal pushes a literal regexp for the rune r on the stack. 369 func (p *parser) literal(r rune) { 370 re := p.newRegexp(OpLiteral) 371 re.Flags = p.flags 372 if p.flags&FoldCase != 0 { 373 r = minFoldRune(r) 374 } 375 re.Rune0[0] = r 376 re.Rune = re.Rune0[:1] 377 p.push(re) 378 } 379 380 // minFoldRune returns the minimum rune fold-equivalent to r. 381 func minFoldRune(r rune) rune { 382 if r < minFold || r > maxFold { 383 return r 384 } 385 min := r 386 r0 := r 387 for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) { 388 if min > r { 389 min = r 390 } 391 } 392 return min 393 } 394 395 // op pushes a regexp with the given op onto the stack 396 // and returns that regexp. 
397 func (p *parser) op(op Op) *Regexp { 398 re := p.newRegexp(op) 399 re.Flags = p.flags 400 return p.push(re) 401 } 402 403 // repeat replaces the top stack element with itself repeated according to op, min, max. 404 // before is the regexp suffix starting at the repetition operator. 405 // after is the regexp suffix following after the repetition operator. 406 // repeat returns an updated 'after' and an error, if any. 407 func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, error) { 408 flags := p.flags 409 if p.flags&PerlX != 0 { 410 if len(after) > 0 && after[0] == '?' { 411 after = after[1:] 412 flags ^= NonGreedy 413 } 414 if lastRepeat != "" { 415 // In Perl it is not allowed to stack repetition operators: 416 // a** is a syntax error, not a doubled star, and a++ means 417 // something else entirely, which we don't support! 418 return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(after)]} 419 } 420 } 421 n := len(p.stack) 422 if n == 0 { 423 return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]} 424 } 425 sub := p.stack[n-1] 426 if sub.Op >= opPseudo { 427 return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]} 428 } 429 430 re := p.newRegexp(op) 431 re.Min = min 432 re.Max = max 433 re.Flags = flags 434 re.Sub = re.Sub0[:1] 435 re.Sub[0] = sub 436 p.stack[n-1] = re 437 p.checkLimits(re) 438 439 if op == OpRepeat && (min >= 2 || max >= 2) && !repeatIsValid(re, 1000) { 440 return "", &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]} 441 } 442 443 return after, nil 444 } 445 446 // repeatIsValid reports whether the repetition re is valid. 447 // Valid means that the combination of the top-level repetition 448 // and any inner repetitions does not exceed n copies of the 449 // innermost thing. 450 // This function rewalks the regexp tree and is called for every repetition, 451 // so we have to worry about inducing quadratic behavior in the parser. 
452 // We avoid this by only calling repeatIsValid when min or max >= 2. 453 // In that case the depth of any >= 2 nesting can only get to 9 without 454 // triggering a parse error, so each subtree can only be rewalked 9 times. 455 func repeatIsValid(re *Regexp, n int) bool { 456 if re.Op == OpRepeat { 457 m := re.Max 458 if m == 0 { 459 return true 460 } 461 if m < 0 { 462 m = re.Min 463 } 464 if m > n { 465 return false 466 } 467 if m > 0 { 468 n /= m 469 } 470 } 471 for _, sub := range re.Sub { 472 if !repeatIsValid(sub, n) { 473 return false 474 } 475 } 476 return true 477 } 478 479 // concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation. 480 func (p *parser) concat() *Regexp { 481 p.maybeConcat(-1, 0) 482 483 // Scan down to find pseudo-operator | or (. 484 i := len(p.stack) 485 for i > 0 && p.stack[i-1].Op < opPseudo { 486 i-- 487 } 488 subs := p.stack[i:] 489 p.stack = p.stack[:i] 490 491 // Empty concatenation is special case. 492 if len(subs) == 0 { 493 return p.push(p.newRegexp(OpEmptyMatch)) 494 } 495 496 return p.push(p.collapse(subs, OpConcat)) 497 } 498 499 // alternate replaces the top of the stack (above the topmost '(') with its alternation. 500 func (p *parser) alternate() *Regexp { 501 // Scan down to find pseudo-operator (. 502 // There are no | above (. 503 i := len(p.stack) 504 for i > 0 && p.stack[i-1].Op < opPseudo { 505 i-- 506 } 507 subs := p.stack[i:] 508 p.stack = p.stack[:i] 509 510 // Make sure top class is clean. 511 // All the others already are (see swapVerticalBar). 512 if len(subs) > 0 { 513 cleanAlt(subs[len(subs)-1]) 514 } 515 516 // Empty alternate is special case 517 // (shouldn't happen but easy to handle). 518 if len(subs) == 0 { 519 return p.push(p.newRegexp(OpNoMatch)) 520 } 521 522 return p.push(p.collapse(subs, OpAlternate)) 523 } 524 525 // cleanAlt cleans re for eventual inclusion in an alternation. 
526 func cleanAlt(re *Regexp) { 527 switch re.Op { 528 case OpCharClass: 529 re.Rune = cleanClass(&re.Rune) 530 if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune { 531 re.Rune = nil 532 re.Op = OpAnyChar 533 return 534 } 535 if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune { 536 re.Rune = nil 537 re.Op = OpAnyCharNotNL 538 return 539 } 540 if cap(re.Rune)-len(re.Rune) > 100 { 541 // re.Rune will not grow any more. 542 // Make a copy or inline to reclaim storage. 543 re.Rune = append(re.Rune0[:0], re.Rune...) 544 } 545 } 546 } 547 548 // collapse returns the result of applying op to sub. 549 // If sub contains op nodes, they all get hoisted up 550 // so that there is never a concat of a concat or an 551 // alternate of an alternate. 552 func (p *parser) collapse(subs []*Regexp, op Op) *Regexp { 553 if len(subs) == 1 { 554 return subs[0] 555 } 556 re := p.newRegexp(op) 557 re.Sub = re.Sub0[:0] 558 for _, sub := range subs { 559 if sub.Op == op { 560 re.Sub = append(re.Sub, sub.Sub...) 561 p.reuse(sub) 562 } else { 563 re.Sub = append(re.Sub, sub) 564 } 565 } 566 if op == OpAlternate { 567 re.Sub = p.factor(re.Sub) 568 if len(re.Sub) == 1 { 569 old := re 570 re = re.Sub[0] 571 p.reuse(old) 572 } 573 } 574 return re 575 } 576 577 // factor factors common prefixes from the alternation list sub. 578 // It returns a replacement list that reuses the same storage and 579 // frees (passes to p.reuse) any removed *Regexps. 580 // 581 // For example, 582 // 583 // ABC|ABD|AEF|BCX|BCY 584 // 585 // simplifies by literal prefix extraction to 586 // 587 // A(B(C|D)|EF)|BC(X|Y) 588 // 589 // which simplifies by character class introduction to 590 // 591 // A(B[CD]|EF)|BC[XY] 592 func (p *parser) factor(sub []*Regexp) []*Regexp { 593 if len(sub) < 2 { 594 return sub 595 } 596 597 // Round 1: Factor out common literal prefixes. 
598 var str []rune 599 var strflags Flags 600 start := 0 601 out := sub[:0] 602 for i := 0; i <= len(sub); i++ { 603 // Invariant: the Regexps that were in sub[0:start] have been 604 // used or marked for reuse, and the slice space has been reused 605 // for out (len(out) <= start). 606 // 607 // Invariant: sub[start:i] consists of regexps that all begin 608 // with str as modified by strflags. 609 var istr []rune 610 var iflags Flags 611 if i < len(sub) { 612 istr, iflags = p.leadingString(sub[i]) 613 if iflags == strflags { 614 same := 0 615 for same < len(str) && same < len(istr) && str[same] == istr[same] { 616 same++ 617 } 618 if same > 0 { 619 // Matches at least one rune in current range. 620 // Keep going around. 621 str = str[:same] 622 continue 623 } 624 } 625 } 626 627 // Found end of a run with common leading literal string: 628 // sub[start:i] all begin with str[0:len(str)], but sub[i] 629 // does not even begin with str[0]. 630 // 631 // Factor out common string and append factored expression to out. 632 if i == start { 633 // Nothing to do - run of length 0. 634 } else if i == start+1 { 635 // Just one: don't bother factoring. 636 out = append(out, sub[start]) 637 } else { 638 // Construct factored form: prefix(suffix1|suffix2|...) 639 prefix := p.newRegexp(OpLiteral) 640 prefix.Flags = strflags 641 prefix.Rune = append(prefix.Rune[:0], str...) 642 643 for j := start; j < i; j++ { 644 sub[j] = p.removeLeadingString(sub[j], len(str)) 645 p.checkLimits(sub[j]) 646 } 647 suffix := p.collapse(sub[start:i], OpAlternate) // recurse 648 649 re := p.newRegexp(OpConcat) 650 re.Sub = append(re.Sub[:0], prefix, suffix) 651 out = append(out, re) 652 } 653 654 // Prepare for next iteration. 655 start = i 656 str = istr 657 strflags = iflags 658 } 659 sub = out 660 661 // Round 2: Factor out common simple prefixes, 662 // just the first piece of each concatenation. 663 // This will be good enough a lot of the time. 664 // 665 // Complex subexpressions (e.g. 
involving quantifiers) 666 // are not safe to factor because that collapses their 667 // distinct paths through the automaton, which affects 668 // correctness in some cases. 669 start = 0 670 out = sub[:0] 671 var first *Regexp 672 for i := 0; i <= len(sub); i++ { 673 // Invariant: the Regexps that were in sub[0:start] have been 674 // used or marked for reuse, and the slice space has been reused 675 // for out (len(out) <= start). 676 // 677 // Invariant: sub[start:i] consists of regexps that all begin with ifirst. 678 var ifirst *Regexp 679 if i < len(sub) { 680 ifirst = p.leadingRegexp(sub[i]) 681 if first != nil && first.Equal(ifirst) && 682 // first must be a character class OR a fixed repeat of a character class. 683 (isCharClass(first) || (first.Op == OpRepeat && first.Min == first.Max && isCharClass(first.Sub[0]))) { 684 continue 685 } 686 } 687 688 // Found end of a run with common leading regexp: 689 // sub[start:i] all begin with first but sub[i] does not. 690 // 691 // Factor out common regexp and append factored expression to out. 692 if i == start { 693 // Nothing to do - run of length 0. 694 } else if i == start+1 { 695 // Just one: don't bother factoring. 696 out = append(out, sub[start]) 697 } else { 698 // Construct factored form: prefix(suffix1|suffix2|...) 699 prefix := first 700 for j := start; j < i; j++ { 701 reuse := j != start // prefix came from sub[start] 702 sub[j] = p.removeLeadingRegexp(sub[j], reuse) 703 p.checkLimits(sub[j]) 704 } 705 suffix := p.collapse(sub[start:i], OpAlternate) // recurse 706 707 re := p.newRegexp(OpConcat) 708 re.Sub = append(re.Sub[:0], prefix, suffix) 709 out = append(out, re) 710 } 711 712 // Prepare for next iteration. 713 start = i 714 first = ifirst 715 } 716 sub = out 717 718 // Round 3: Collapse runs of single literals into character classes. 
719 start = 0 720 out = sub[:0] 721 for i := 0; i <= len(sub); i++ { 722 // Invariant: the Regexps that were in sub[0:start] have been 723 // used or marked for reuse, and the slice space has been reused 724 // for out (len(out) <= start). 725 // 726 // Invariant: sub[start:i] consists of regexps that are either 727 // literal runes or character classes. 728 if i < len(sub) && isCharClass(sub[i]) { 729 continue 730 } 731 732 // sub[i] is not a char or char class; 733 // emit char class for sub[start:i]... 734 if i == start { 735 // Nothing to do - run of length 0. 736 } else if i == start+1 { 737 out = append(out, sub[start]) 738 } else { 739 // Make new char class. 740 // Start with most complex regexp in sub[start]. 741 max := start 742 for j := start + 1; j < i; j++ { 743 if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) { 744 max = j 745 } 746 } 747 sub[start], sub[max] = sub[max], sub[start] 748 749 for j := start + 1; j < i; j++ { 750 mergeCharClass(sub[start], sub[j]) 751 p.reuse(sub[j]) 752 } 753 cleanAlt(sub[start]) 754 out = append(out, sub[start]) 755 } 756 757 // ... and then emit sub[i]. 758 if i < len(sub) { 759 out = append(out, sub[i]) 760 } 761 start = i + 1 762 } 763 sub = out 764 765 // Round 4: Collapse runs of empty matches into a single empty match. 766 start = 0 767 out = sub[:0] 768 for i := range sub { 769 if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch { 770 continue 771 } 772 out = append(out, sub[i]) 773 } 774 sub = out 775 776 return sub 777 } 778 779 // leadingString returns the leading literal string that re begins with. 780 // The string refers to storage in re or its children. 
781 func (p *parser) leadingString(re *Regexp) ([]rune, Flags) { 782 if re.Op == OpConcat && len(re.Sub) > 0 { 783 re = re.Sub[0] 784 } 785 if re.Op != OpLiteral { 786 return nil, 0 787 } 788 return re.Rune, re.Flags & FoldCase 789 } 790 791 // removeLeadingString removes the first n leading runes 792 // from the beginning of re. It returns the replacement for re. 793 func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp { 794 if re.Op == OpConcat && len(re.Sub) > 0 { 795 // Removing a leading string in a concatenation 796 // might simplify the concatenation. 797 sub := re.Sub[0] 798 sub = p.removeLeadingString(sub, n) 799 re.Sub[0] = sub 800 if sub.Op == OpEmptyMatch { 801 p.reuse(sub) 802 switch len(re.Sub) { 803 case 0, 1: 804 // Impossible but handle. 805 re.Op = OpEmptyMatch 806 re.Sub = nil 807 case 2: 808 old := re 809 re = re.Sub[1] 810 p.reuse(old) 811 default: 812 copy(re.Sub, re.Sub[1:]) 813 re.Sub = re.Sub[:len(re.Sub)-1] 814 } 815 } 816 return re 817 } 818 819 if re.Op == OpLiteral { 820 re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])] 821 if len(re.Rune) == 0 { 822 re.Op = OpEmptyMatch 823 } 824 } 825 return re 826 } 827 828 // leadingRegexp returns the leading regexp that re begins with. 829 // The regexp refers to storage in re or its children. 830 func (p *parser) leadingRegexp(re *Regexp) *Regexp { 831 if re.Op == OpEmptyMatch { 832 return nil 833 } 834 if re.Op == OpConcat && len(re.Sub) > 0 { 835 sub := re.Sub[0] 836 if sub.Op == OpEmptyMatch { 837 return nil 838 } 839 return sub 840 } 841 return re 842 } 843 844 // removeLeadingRegexp removes the leading regexp in re. 845 // It returns the replacement for re. 846 // If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse. 
func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp {
	if re.Op == OpConcat && len(re.Sub) > 0 {
		if reuse {
			p.reuse(re.Sub[0])
		}
		// Shift the remaining elements down over the removed one.
		re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])]
		switch len(re.Sub) {
		case 0:
			re.Op = OpEmptyMatch
			re.Sub = nil
		case 1:
			// A one-element concatenation is replaced by its element.
			old := re
			re = re.Sub[0]
			p.reuse(old)
		}
		return re
	}
	// re was nothing but the leading regexp; what is left is an empty match.
	if reuse {
		p.reuse(re)
	}
	return p.newRegexp(OpEmptyMatch)
}

// literalRegexp returns an OpLiteral node holding the runes of s,
// with the given flags.
func literalRegexp(s string, flags Flags) *Regexp {
	re := &Regexp{Op: OpLiteral}
	re.Flags = flags
	re.Rune = re.Rune0[:0] // use local storage for small strings
	for _, c := range s {
		if len(re.Rune) >= cap(re.Rune) {
			// string is too long to fit in Rune0. let Go handle it
			re.Rune = []rune(s)
			break
		}
		re.Rune = append(re.Rune, c)
	}
	return re
}

// Parsing.

// Parse parses a regular expression string s, controlled by the specified
// Flags, and returns a regular expression parse tree. The syntax is
// described in the top-level comment.
func Parse(s string, flags Flags) (*Regexp, error) {
	return parse(s, flags)
}

// parse does the work of Parse. The limit checks report failure by
// panicking with ErrLarge or ErrNestingDepth; the deferred function turns
// those two panics into an *Error for the whole expression and re-panics
// on anything else.
func parse(s string, flags Flags) (_ *Regexp, err error) {
	defer func() {
		switch r := recover(); r {
		default:
			panic(r)
		case nil:
			// ok
		case ErrLarge: // too big
			err = &Error{Code: ErrLarge, Expr: s}
		case ErrNestingDepth:
			err = &Error{Code: ErrNestingDepth, Expr: s}
		}
	}()

	if flags&Literal != 0 {
		// Trivial parser for literal string.
		if err := checkUTF8(s); err != nil {
			return nil, err
		}
		return literalRegexp(s, flags), nil
	}

	// Otherwise, must do real work.
	var (
		p          parser
		c          rune
		op         Op
		lastRepeat string
	)
	p.flags = flags
	p.wholeRegexp = s
	t := s // t is the unparsed remainder of s
	for t != "" {
		// repeat is set to the text starting at a repetition operator when
		// this iteration parses one; it becomes lastRepeat for the next
		// iteration so that stacked operators can be detected.
		repeat := ""
	BigSwitch:
		switch t[0] {
		default:
			if c, t, err = nextRune(t); err != nil {
				return nil, err
			}
			p.literal(c)

		case '(':
			if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' {
				// Flag changes and non-capturing groups.
				if t, err = p.parsePerlFlags(t); err != nil {
					return nil, err
				}
				break
			}
			p.numCap++
			p.op(opLeftParen).Cap = p.numCap
			t = t[1:]
		case '|':
			if err = p.parseVerticalBar(); err != nil {
				return nil, err
			}
			t = t[1:]
		case ')':
			if err = p.parseRightParen(); err != nil {
				return nil, err
			}
			t = t[1:]
		case '^':
			if p.flags&OneLine != 0 {
				p.op(OpBeginText)
			} else {
				p.op(OpBeginLine)
			}
			t = t[1:]
		case '$':
			if p.flags&OneLine != 0 {
				p.op(OpEndText).Flags |= WasDollar
			} else {
				p.op(OpEndLine)
			}
			t = t[1:]
		case '.':
			if p.flags&DotNL != 0 {
				p.op(OpAnyChar)
			} else {
				p.op(OpAnyCharNotNL)
			}
			t = t[1:]
		case '[':
			if t, err = p.parseClass(t); err != nil {
				return nil, err
			}
		case '*', '+', '?':
			before := t
			switch t[0] {
			case '*':
				op = OpStar
			case '+':
				op = OpPlus
			case '?':
				op = OpQuest
			}
			after := t[1:]
			if after, err = p.repeat(op, 0, 0, before, after, lastRepeat); err != nil {
				return nil, err
			}
			repeat = before
			t = after
		case '{':
			op = OpRepeat
			before := t
			min, max, after, ok := p.parseRepeat(t)
			if !ok {
				// If the repeat cannot be parsed, { is a literal.
				p.literal('{')
				t = t[1:]
				break
			}
			if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max {
				// Numbers were too big, or max is present and min > max.
				return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
			}
			if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {
				return nil, err
			}
			repeat = before
			t = after
		case '\\':
			if p.flags&PerlX != 0 && len(t) >= 2 {
				switch t[1] {
				case 'A':
					p.op(OpBeginText)
					t = t[2:]
					break BigSwitch
				case 'b':
					p.op(OpWordBoundary)
					t = t[2:]
					break BigSwitch
				case 'B':
					p.op(OpNoWordBoundary)
					t = t[2:]
					break BigSwitch
				case 'C':
					// any byte; not supported
					return nil, &Error{ErrInvalidEscape, t[:2]}
				case 'Q':
					// \Q ... \E: the ... is always literals
					// If there is no \E, Cut returns the whole rest of
					// the pattern as lit and leaves t empty.
					var lit string
					lit, t, _ = strings.Cut(t[2:], `\E`)
					for lit != "" {
						c, rest, err := nextRune(lit)
						if err != nil {
							return nil, err
						}
						p.literal(c)
						lit = rest
					}
					break BigSwitch
				case 'z':
					p.op(OpEndText)
					t = t[2:]
					break BigSwitch
				}
			}

			// Allocate a class node up front; it is handed back with
			// p.reuse below if the escape turns out not to be a class.
			re := p.newRegexp(OpCharClass)
			re.Flags = p.flags

			// Look for Unicode character group like \p{Han}
			if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') {
				r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0])
				if err != nil {
					return nil, err
				}
				if r != nil {
					re.Rune = r
					t = rest
					p.push(re)
					break BigSwitch
				}
			}

			// Perl character class escape.
			if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil {
				re.Rune = r
				t = rest
				p.push(re)
				break BigSwitch
			}
			p.reuse(re)

			// Ordinary single-character escape.
			if c, t, err = p.parseEscape(t); err != nil {
				return nil, err
			}
			p.literal(c)
		}
		lastRepeat = repeat
	}

	// End of input: fold whatever is left on the stack into one expression.
	p.concat()
	if p.swapVerticalBar() {
		// pop vertical bar
		p.stack = p.stack[:len(p.stack)-1]
	}
	p.alternate()

	// Anything besides the single result still on the stack is reported
	// as a missing closing parenthesis.
	n := len(p.stack)
	if n != 1 {
		return nil, &Error{ErrMissingParen, s}
	}
	return p.stack[0], nil
}

// parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}.
// If s is not of that form, it returns ok == false.
// If s has the right form but the values are too big, it returns min == -1, ok == true.
func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) {
	if s == "" || s[0] != '{' {
		return
	}
	s = s[1:]
	var ok1 bool
	if min, s, ok1 = p.parseInt(s); !ok1 {
		return
	}
	if s == "" {
		return
	}
	if s[0] != ',' {
		// {min}
		max = min
	} else {
		s = s[1:]
		if s == "" {
			return
		}
		if s[0] == '}' {
			// {min,}
			max = -1
		} else if max, s, ok1 = p.parseInt(s); !ok1 {
			return
		} else if max < 0 {
			// parseInt found too big a number
			min = -1
		}
	}
	if s == "" || s[0] != '}' {
		return
	}
	rest = s[1:]
	ok = true
	return
}

// parsePerlFlags parses a Perl flag setting or non-capturing group or both,
// like (?i) or (?: or (?i:. It removes the prefix from s and updates the parse state.
// The caller must have ensured that s begins with "(?".
func (p *parser) parsePerlFlags(s string) (rest string, err error) {
	t := s

	// Check for named captures, first introduced in Python's regexp library.
	// As usual, there are three slightly different syntaxes:
	//
	//	(?P<name>expr)   the original, introduced by Python
	//	(?<name>expr)    the .NET alteration, adopted by Perl 5.10
	//	(?'name'expr)    another .NET alteration, adopted by Perl 5.10
	//
	// Perl 5.10 gave in and implemented the Python version too,
	// but they claim that the last two are the preferred forms.
	// PCRE and languages based on it (specifically, PHP and Ruby)
	// support all three as well. EcmaScript 4 uses only the Python form.
	//
	// In both the open source world (via Code Search) and the
	// Google source tree, (?P<name>expr) is the dominant form,
	// so that's the one we implement. One is enough.
	if len(t) > 4 && t[2] == 'P' && t[3] == '<' {
		// Pull out name.
		end := strings.IndexRune(t, '>')
		if end < 0 {
			if err = checkUTF8(t); err != nil {
				return "", err
			}
			return "", &Error{ErrInvalidNamedCapture, s}
		}

		capture := t[:end+1] // "(?P<name>"
		name := t[4:end]     // "name"
		if err = checkUTF8(name); err != nil {
			return "", err
		}
		if !isValidCaptureName(name) {
			return "", &Error{ErrInvalidNamedCapture, capture}
		}

		// Like ordinary capture, but named.
		p.numCap++
		re := p.op(opLeftParen)
		re.Cap = p.numCap
		re.Name = name
		return t[end+1:], nil
	}

	// Non-capturing group. Might also twiddle Perl flags.
	var c rune
	t = t[2:] // skip (?
	flags := p.flags
	sign := +1       // becomes -1 once a '-' has been seen
	sawFlag := false // whether a flag letter followed the most recent '(?' or '-'
Loop:
	for t != "" {
		if c, t, err = nextRune(t); err != nil {
			return "", err
		}
		switch c {
		default:
			break Loop

		// Flags.
		case 'i':
			flags |= FoldCase
			sawFlag = true
		case 'm':
			flags &^= OneLine
			sawFlag = true
		case 's':
			flags |= DotNL
			sawFlag = true
		case 'U':
			flags |= NonGreedy
			sawFlag = true

		// Switch to negation.
		case '-':
			if sign < 0 {
				// A second '-' is an error.
				break Loop
			}
			sign = -1
			// Invert flags so that | above turn into &^ and vice versa.
			// We'll invert flags again before using it below.
			flags = ^flags
			sawFlag = false

		// End of flags, starting group or not.
		case ':', ')':
			if sign < 0 {
				if !sawFlag {
					// '-' must be followed by at least one flag.
					break Loop
				}
				flags = ^flags
			}
			if c == ':' {
				// Open new group
				p.op(opLeftParen)
			}
			p.flags = flags
			return t, nil
		}
	}

	// Every break out of the loop, and running out of input, ends up here.
	return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]}
}

// isValidCaptureName reports whether name
// is a valid capture name: [A-Za-z0-9_]+.
// PCRE limits names to 32 bytes.
// Python rejects names starting with digits.
// We don't enforce either of those.
func isValidCaptureName(name string) bool {
	if name == "" {
		return false
	}
	for _, c := range name {
		if c != '_' && !isalnum(c) {
			return false
		}
	}
	return true
}

// parseInt parses a decimal integer.
func (p *parser) parseInt(s string) (n int, rest string, ok bool) {
	if s == "" || s[0] < '0' || '9' < s[0] {
		return
	}
	// Disallow leading zeros.
	if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' {
		return
	}
	t := s
	for s != "" && '0' <= s[0] && s[0] <= '9' {
		s = s[1:]
	}
	rest = s
	ok = true
	// Have digits, compute value.
	t = t[:len(t)-len(s)]
	for i := 0; i < len(t); i++ {
		// Avoid overflow.
1287 if n >= 1e8 { 1288 n = -1 1289 break 1290 } 1291 n = n*10 + int(t[i]) - '0' 1292 } 1293 return 1294 } 1295 1296 // can this be represented as a character class? 1297 // single-rune literal string, char class, ., and .|\n. 1298 func isCharClass(re *Regexp) bool { 1299 return re.Op == OpLiteral && len(re.Rune) == 1 || 1300 re.Op == OpCharClass || 1301 re.Op == OpAnyCharNotNL || 1302 re.Op == OpAnyChar 1303 } 1304 1305 // does re match r? 1306 func matchRune(re *Regexp, r rune) bool { 1307 switch re.Op { 1308 case OpLiteral: 1309 return len(re.Rune) == 1 && re.Rune[0] == r 1310 case OpCharClass: 1311 for i := 0; i < len(re.Rune); i += 2 { 1312 if re.Rune[i] <= r && r <= re.Rune[i+1] { 1313 return true 1314 } 1315 } 1316 return false 1317 case OpAnyCharNotNL: 1318 return r != '\n' 1319 case OpAnyChar: 1320 return true 1321 } 1322 return false 1323 } 1324 1325 // parseVerticalBar handles a | in the input. 1326 func (p *parser) parseVerticalBar() error { 1327 p.concat() 1328 1329 // The concatenation we just parsed is on top of the stack. 1330 // If it sits above an opVerticalBar, swap it below 1331 // (things below an opVerticalBar become an alternation). 1332 // Otherwise, push a new vertical bar. 1333 if !p.swapVerticalBar() { 1334 p.op(opVerticalBar) 1335 } 1336 1337 return nil 1338 } 1339 1340 // mergeCharClass makes dst = dst|src. 1341 // The caller must ensure that dst.Op >= src.Op, 1342 // to reduce the amount of copying. 1343 func mergeCharClass(dst, src *Regexp) { 1344 switch dst.Op { 1345 case OpAnyChar: 1346 // src doesn't add anything. 
1347 case OpAnyCharNotNL: 1348 // src might add \n 1349 if matchRune(src, '\n') { 1350 dst.Op = OpAnyChar 1351 } 1352 case OpCharClass: 1353 // src is simpler, so either literal or char class 1354 if src.Op == OpLiteral { 1355 dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags) 1356 } else { 1357 dst.Rune = appendClass(dst.Rune, src.Rune) 1358 } 1359 case OpLiteral: 1360 // both literal 1361 if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags { 1362 break 1363 } 1364 dst.Op = OpCharClass 1365 dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags) 1366 dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags) 1367 } 1368 } 1369 1370 // If the top of the stack is an element followed by an opVerticalBar 1371 // swapVerticalBar swaps the two and returns true. 1372 // Otherwise it returns false. 1373 func (p *parser) swapVerticalBar() bool { 1374 // If above and below vertical bar are literal or char class, 1375 // can merge into a single char class. 1376 n := len(p.stack) 1377 if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) { 1378 re1 := p.stack[n-1] 1379 re3 := p.stack[n-3] 1380 // Make re3 the more complex of the two. 1381 if re1.Op > re3.Op { 1382 re1, re3 = re3, re1 1383 p.stack[n-3] = re3 1384 } 1385 mergeCharClass(re3, re1) 1386 p.reuse(re1) 1387 p.stack = p.stack[:n-1] 1388 return true 1389 } 1390 1391 if n >= 2 { 1392 re1 := p.stack[n-1] 1393 re2 := p.stack[n-2] 1394 if re2.Op == opVerticalBar { 1395 if n >= 3 { 1396 // Now out of reach. 1397 // Clean opportunistically. 1398 cleanAlt(p.stack[n-3]) 1399 } 1400 p.stack[n-2] = re1 1401 p.stack[n-1] = re2 1402 return true 1403 } 1404 } 1405 return false 1406 } 1407 1408 // parseRightParen handles a ) in the input. 
1409 func (p *parser) parseRightParen() error { 1410 p.concat() 1411 if p.swapVerticalBar() { 1412 // pop vertical bar 1413 p.stack = p.stack[:len(p.stack)-1] 1414 } 1415 p.alternate() 1416 1417 n := len(p.stack) 1418 if n < 2 { 1419 return &Error{ErrUnexpectedParen, p.wholeRegexp} 1420 } 1421 re1 := p.stack[n-1] 1422 re2 := p.stack[n-2] 1423 p.stack = p.stack[:n-2] 1424 if re2.Op != opLeftParen { 1425 return &Error{ErrUnexpectedParen, p.wholeRegexp} 1426 } 1427 // Restore flags at time of paren. 1428 p.flags = re2.Flags 1429 if re2.Cap == 0 { 1430 // Just for grouping. 1431 p.push(re1) 1432 } else { 1433 re2.Op = OpCapture 1434 re2.Sub = re2.Sub0[:1] 1435 re2.Sub[0] = re1 1436 p.push(re2) 1437 } 1438 return nil 1439 } 1440 1441 // parseEscape parses an escape sequence at the beginning of s 1442 // and returns the rune. 1443 func (p *parser) parseEscape(s string) (r rune, rest string, err error) { 1444 t := s[1:] 1445 if t == "" { 1446 return 0, "", &Error{ErrTrailingBackslash, ""} 1447 } 1448 c, t, err := nextRune(t) 1449 if err != nil { 1450 return 0, "", err 1451 } 1452 1453 Switch: 1454 switch c { 1455 default: 1456 if c < utf8.RuneSelf && !isalnum(c) { 1457 // Escaped non-word characters are always themselves. 1458 // PCRE is not quite so rigorous: it accepts things like 1459 // \q, but we don't. We once rejected \_, but too many 1460 // programs and people insist on using it, so allow \_. 1461 return c, t, nil 1462 } 1463 1464 // Octal escapes. 1465 case '1', '2', '3', '4', '5', '6', '7': 1466 // Single non-zero digit is a backreference; not supported 1467 if t == "" || t[0] < '0' || t[0] > '7' { 1468 break 1469 } 1470 fallthrough 1471 case '0': 1472 // Consume up to three octal digits; already have one. 1473 r = c - '0' 1474 for i := 1; i < 3; i++ { 1475 if t == "" || t[0] < '0' || t[0] > '7' { 1476 break 1477 } 1478 r = r*8 + rune(t[0]) - '0' 1479 t = t[1:] 1480 } 1481 return r, t, nil 1482 1483 // Hexadecimal escapes. 
1484 case 'x': 1485 if t == "" { 1486 break 1487 } 1488 if c, t, err = nextRune(t); err != nil { 1489 return 0, "", err 1490 } 1491 if c == '{' { 1492 // Any number of digits in braces. 1493 // Perl accepts any text at all; it ignores all text 1494 // after the first non-hex digit. We require only hex digits, 1495 // and at least one. 1496 nhex := 0 1497 r = 0 1498 for { 1499 if t == "" { 1500 break Switch 1501 } 1502 if c, t, err = nextRune(t); err != nil { 1503 return 0, "", err 1504 } 1505 if c == '}' { 1506 break 1507 } 1508 v := unhex(c) 1509 if v < 0 { 1510 break Switch 1511 } 1512 r = r*16 + v 1513 if r > unicode.MaxRune { 1514 break Switch 1515 } 1516 nhex++ 1517 } 1518 if nhex == 0 { 1519 break Switch 1520 } 1521 return r, t, nil 1522 } 1523 1524 // Easy case: two hex digits. 1525 x := unhex(c) 1526 if c, t, err = nextRune(t); err != nil { 1527 return 0, "", err 1528 } 1529 y := unhex(c) 1530 if x < 0 || y < 0 { 1531 break 1532 } 1533 return x*16 + y, t, nil 1534 1535 // C escapes. There is no case 'b', to avoid misparsing 1536 // the Perl word-boundary \b as the C backspace \b 1537 // when in POSIX mode. In Perl, /\b/ means word-boundary 1538 // but /[\b]/ means backspace. We don't support that. 1539 // If you want a backspace, embed a literal backspace 1540 // character or use \x08. 1541 case 'a': 1542 return '\a', t, err 1543 case 'f': 1544 return '\f', t, err 1545 case 'n': 1546 return '\n', t, err 1547 case 'r': 1548 return '\r', t, err 1549 case 't': 1550 return '\t', t, err 1551 case 'v': 1552 return '\v', t, err 1553 } 1554 return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]} 1555 } 1556 1557 // parseClassChar parses a character class character at the beginning of s 1558 // and returns it. 
1559 func (p *parser) parseClassChar(s, wholeClass string) (r rune, rest string, err error) { 1560 if s == "" { 1561 return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass} 1562 } 1563 1564 // Allow regular escape sequences even though 1565 // many need not be escaped in this context. 1566 if s[0] == '\\' { 1567 return p.parseEscape(s) 1568 } 1569 1570 return nextRune(s) 1571 } 1572 1573 type charGroup struct { 1574 sign int 1575 class []rune 1576 } 1577 1578 // parsePerlClassEscape parses a leading Perl character class escape like \d 1579 // from the beginning of s. If one is present, it appends the characters to r 1580 // and returns the new slice r and the remainder of the string. 1581 func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string) { 1582 if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' { 1583 return 1584 } 1585 g := perlGroup[s[0:2]] 1586 if g.sign == 0 { 1587 return 1588 } 1589 return p.appendGroup(r, g), s[2:] 1590 } 1591 1592 // parseNamedClass parses a leading POSIX named character class like [:alnum:] 1593 // from the beginning of s. If one is present, it appends the characters to r 1594 // and returns the new slice r and the remainder of the string. 
1595 func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error) { 1596 if len(s) < 2 || s[0] != '[' || s[1] != ':' { 1597 return 1598 } 1599 1600 i := strings.Index(s[2:], ":]") 1601 if i < 0 { 1602 return 1603 } 1604 i += 2 1605 name, s := s[0:i+2], s[i+2:] 1606 g := posixGroup[name] 1607 if g.sign == 0 { 1608 return nil, "", &Error{ErrInvalidCharRange, name} 1609 } 1610 return p.appendGroup(r, g), s, nil 1611 } 1612 1613 func (p *parser) appendGroup(r []rune, g charGroup) []rune { 1614 if p.flags&FoldCase == 0 { 1615 if g.sign < 0 { 1616 r = appendNegatedClass(r, g.class) 1617 } else { 1618 r = appendClass(r, g.class) 1619 } 1620 } else { 1621 tmp := p.tmpClass[:0] 1622 tmp = appendFoldedClass(tmp, g.class) 1623 p.tmpClass = tmp 1624 tmp = cleanClass(&p.tmpClass) 1625 if g.sign < 0 { 1626 r = appendNegatedClass(r, tmp) 1627 } else { 1628 r = appendClass(r, tmp) 1629 } 1630 } 1631 return r 1632 } 1633 1634 var anyTable = &unicode.RangeTable{ 1635 R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}}, 1636 R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}}, 1637 } 1638 1639 // unicodeTable returns the unicode.RangeTable identified by name 1640 // and the table of additional fold-equivalent code points. 1641 func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) { 1642 // Special case: "Any" means any. 1643 if name == "Any" { 1644 return anyTable, anyTable 1645 } 1646 if t := unicode.Categories[name]; t != nil { 1647 return t, unicode.FoldCategory[name] 1648 } 1649 if t := unicode.Scripts[name]; t != nil { 1650 return t, unicode.FoldScript[name] 1651 } 1652 return nil, nil 1653 } 1654 1655 // parseUnicodeClass parses a leading Unicode character class like \p{Han} 1656 // from the beginning of s. If one is present, it appends the characters to r 1657 // and returns the new slice r and the remainder of the string. 
1658 func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error) { 1659 if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' { 1660 return 1661 } 1662 1663 // Committed to parse or return error. 1664 sign := +1 1665 if s[1] == 'P' { 1666 sign = -1 1667 } 1668 t := s[2:] 1669 c, t, err := nextRune(t) 1670 if err != nil { 1671 return 1672 } 1673 var seq, name string 1674 if c != '{' { 1675 // Single-letter name. 1676 seq = s[:len(s)-len(t)] 1677 name = seq[2:] 1678 } else { 1679 // Name is in braces. 1680 end := strings.IndexRune(s, '}') 1681 if end < 0 { 1682 if err = checkUTF8(s); err != nil { 1683 return 1684 } 1685 return nil, "", &Error{ErrInvalidCharRange, s} 1686 } 1687 seq, t = s[:end+1], s[end+1:] 1688 name = s[3:end] 1689 if err = checkUTF8(name); err != nil { 1690 return 1691 } 1692 } 1693 1694 // Group can have leading negation too. \p{^Han} == \P{Han}, \P{^Han} == \p{Han}. 1695 if name != "" && name[0] == '^' { 1696 sign = -sign 1697 name = name[1:] 1698 } 1699 1700 tab, fold := unicodeTable(name) 1701 if tab == nil { 1702 return nil, "", &Error{ErrInvalidCharRange, seq} 1703 } 1704 1705 if p.flags&FoldCase == 0 || fold == nil { 1706 if sign > 0 { 1707 r = appendTable(r, tab) 1708 } else { 1709 r = appendNegatedTable(r, tab) 1710 } 1711 } else { 1712 // Merge and clean tab and fold in a temporary buffer. 1713 // This is necessary for the negative case and just tidy 1714 // for the positive case. 1715 tmp := p.tmpClass[:0] 1716 tmp = appendTable(tmp, tab) 1717 tmp = appendTable(tmp, fold) 1718 p.tmpClass = tmp 1719 tmp = cleanClass(&p.tmpClass) 1720 if sign > 0 { 1721 r = appendClass(r, tmp) 1722 } else { 1723 r = appendNegatedClass(r, tmp) 1724 } 1725 } 1726 return r, t, nil 1727 } 1728 1729 // parseClass parses a character class at the beginning of s 1730 // and pushes it onto the parse stack. 
1731 func (p *parser) parseClass(s string) (rest string, err error) { 1732 t := s[1:] // chop [ 1733 re := p.newRegexp(OpCharClass) 1734 re.Flags = p.flags 1735 re.Rune = re.Rune0[:0] 1736 1737 sign := +1 1738 if t != "" && t[0] == '^' { 1739 sign = -1 1740 t = t[1:] 1741 1742 // If character class does not match \n, add it here, 1743 // so that negation later will do the right thing. 1744 if p.flags&ClassNL == 0 { 1745 re.Rune = append(re.Rune, '\n', '\n') 1746 } 1747 } 1748 1749 class := re.Rune 1750 first := true // ] and - are okay as first char in class 1751 for t == "" || t[0] != ']' || first { 1752 // POSIX: - is only okay unescaped as first or last in class. 1753 // Perl: - is okay anywhere. 1754 if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') { 1755 _, size := utf8.DecodeRuneInString(t[1:]) 1756 return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]} 1757 } 1758 first = false 1759 1760 // Look for POSIX [:alnum:] etc. 1761 if len(t) > 2 && t[0] == '[' && t[1] == ':' { 1762 nclass, nt, err := p.parseNamedClass(t, class) 1763 if err != nil { 1764 return "", err 1765 } 1766 if nclass != nil { 1767 class, t = nclass, nt 1768 continue 1769 } 1770 } 1771 1772 // Look for Unicode character group like \p{Han}. 1773 nclass, nt, err := p.parseUnicodeClass(t, class) 1774 if err != nil { 1775 return "", err 1776 } 1777 if nclass != nil { 1778 class, t = nclass, nt 1779 continue 1780 } 1781 1782 // Look for Perl character class symbols (extension). 1783 if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil { 1784 class, t = nclass, nt 1785 continue 1786 } 1787 1788 // Single character or simple range. 1789 rng := t 1790 var lo, hi rune 1791 if lo, t, err = p.parseClassChar(t, s); err != nil { 1792 return "", err 1793 } 1794 hi = lo 1795 // [a-] means (a|-) so check for final ]. 
1796 if len(t) >= 2 && t[0] == '-' && t[1] != ']' { 1797 t = t[1:] 1798 if hi, t, err = p.parseClassChar(t, s); err != nil { 1799 return "", err 1800 } 1801 if hi < lo { 1802 rng = rng[:len(rng)-len(t)] 1803 return "", &Error{Code: ErrInvalidCharRange, Expr: rng} 1804 } 1805 } 1806 if p.flags&FoldCase == 0 { 1807 class = appendRange(class, lo, hi) 1808 } else { 1809 class = appendFoldedRange(class, lo, hi) 1810 } 1811 } 1812 t = t[1:] // chop ] 1813 1814 // Use &re.Rune instead of &class to avoid allocation. 1815 re.Rune = class 1816 class = cleanClass(&re.Rune) 1817 if sign < 0 { 1818 class = negateClass(class) 1819 } 1820 re.Rune = class 1821 p.push(re) 1822 return t, nil 1823 } 1824 1825 // cleanClass sorts the ranges (pairs of elements of r), 1826 // merges them, and eliminates duplicates. 1827 func cleanClass(rp *[]rune) []rune { 1828 1829 // Sort by lo increasing, hi decreasing to break ties. 1830 sort.Sort(ranges{rp}) 1831 1832 r := *rp 1833 if len(r) < 2 { 1834 return r 1835 } 1836 1837 // Merge abutting, overlapping. 1838 w := 2 // write index 1839 for i := 2; i < len(r); i += 2 { 1840 lo, hi := r[i], r[i+1] 1841 if lo <= r[w-1]+1 { 1842 // merge with previous range 1843 if hi > r[w-1] { 1844 r[w-1] = hi 1845 } 1846 continue 1847 } 1848 // new disjoint range 1849 r[w] = lo 1850 r[w+1] = hi 1851 w += 2 1852 } 1853 1854 return r[:w] 1855 } 1856 1857 // appendLiteral returns the result of appending the literal x to the class r. 1858 func appendLiteral(r []rune, x rune, flags Flags) []rune { 1859 if flags&FoldCase != 0 { 1860 return appendFoldedRange(r, x, x) 1861 } 1862 return appendRange(r, x, x) 1863 } 1864 1865 // appendRange returns the result of appending the range lo-hi to the class r. 1866 func appendRange(r []rune, lo, hi rune) []rune { 1867 // Expand last range or next to last range if it overlaps or abuts. 
1868 // Checking two ranges helps when appending case-folded 1869 // alphabets, so that one range can be expanding A-Z and the 1870 // other expanding a-z. 1871 n := len(r) 1872 for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4 1873 if n >= i { 1874 rlo, rhi := r[n-i], r[n-i+1] 1875 if lo <= rhi+1 && rlo <= hi+1 { 1876 if lo < rlo { 1877 r[n-i] = lo 1878 } 1879 if hi > rhi { 1880 r[n-i+1] = hi 1881 } 1882 return r 1883 } 1884 } 1885 } 1886 1887 return append(r, lo, hi) 1888 } 1889 1890 const ( 1891 // minimum and maximum runes involved in folding. 1892 // checked during test. 1893 minFold = 0x0041 1894 maxFold = 0x1e943 1895 ) 1896 1897 // appendFoldedRange returns the result of appending the range lo-hi 1898 // and its case folding-equivalent runes to the class r. 1899 func appendFoldedRange(r []rune, lo, hi rune) []rune { 1900 // Optimizations. 1901 if lo <= minFold && hi >= maxFold { 1902 // Range is full: folding can't add more. 1903 return appendRange(r, lo, hi) 1904 } 1905 if hi < minFold || lo > maxFold { 1906 // Range is outside folding possibilities. 1907 return appendRange(r, lo, hi) 1908 } 1909 if lo < minFold { 1910 // [lo, minFold-1] needs no folding. 1911 r = appendRange(r, lo, minFold-1) 1912 lo = minFold 1913 } 1914 if hi > maxFold { 1915 // [maxFold+1, hi] needs no folding. 1916 r = appendRange(r, maxFold+1, hi) 1917 hi = maxFold 1918 } 1919 1920 // Brute force. Depend on appendRange to coalesce ranges on the fly. 1921 for c := lo; c <= hi; c++ { 1922 r = appendRange(r, c, c) 1923 f := unicode.SimpleFold(c) 1924 for f != c { 1925 r = appendRange(r, f, f) 1926 f = unicode.SimpleFold(f) 1927 } 1928 } 1929 return r 1930 } 1931 1932 // appendClass returns the result of appending the class x to the class r. 1933 // It assume x is clean. 
1934 func appendClass(r []rune, x []rune) []rune { 1935 for i := 0; i < len(x); i += 2 { 1936 r = appendRange(r, x[i], x[i+1]) 1937 } 1938 return r 1939 } 1940 1941 // appendFoldedClass returns the result of appending the case folding of the class x to the class r. 1942 func appendFoldedClass(r []rune, x []rune) []rune { 1943 for i := 0; i < len(x); i += 2 { 1944 r = appendFoldedRange(r, x[i], x[i+1]) 1945 } 1946 return r 1947 } 1948 1949 // appendNegatedClass returns the result of appending the negation of the class x to the class r. 1950 // It assumes x is clean. 1951 func appendNegatedClass(r []rune, x []rune) []rune { 1952 nextLo := '\u0000' 1953 for i := 0; i < len(x); i += 2 { 1954 lo, hi := x[i], x[i+1] 1955 if nextLo <= lo-1 { 1956 r = appendRange(r, nextLo, lo-1) 1957 } 1958 nextLo = hi + 1 1959 } 1960 if nextLo <= unicode.MaxRune { 1961 r = appendRange(r, nextLo, unicode.MaxRune) 1962 } 1963 return r 1964 } 1965 1966 // appendTable returns the result of appending x to the class r. 1967 func appendTable(r []rune, x *unicode.RangeTable) []rune { 1968 for _, xr := range x.R16 { 1969 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) 1970 if stride == 1 { 1971 r = appendRange(r, lo, hi) 1972 continue 1973 } 1974 for c := lo; c <= hi; c += stride { 1975 r = appendRange(r, c, c) 1976 } 1977 } 1978 for _, xr := range x.R32 { 1979 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) 1980 if stride == 1 { 1981 r = appendRange(r, lo, hi) 1982 continue 1983 } 1984 for c := lo; c <= hi; c += stride { 1985 r = appendRange(r, c, c) 1986 } 1987 } 1988 return r 1989 } 1990 1991 // appendNegatedTable returns the result of appending the negation of x to the class r. 
1992 func appendNegatedTable(r []rune, x *unicode.RangeTable) []rune { 1993 nextLo := '\u0000' // lo end of next class to add 1994 for _, xr := range x.R16 { 1995 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) 1996 if stride == 1 { 1997 if nextLo <= lo-1 { 1998 r = appendRange(r, nextLo, lo-1) 1999 } 2000 nextLo = hi + 1 2001 continue 2002 } 2003 for c := lo; c <= hi; c += stride { 2004 if nextLo <= c-1 { 2005 r = appendRange(r, nextLo, c-1) 2006 } 2007 nextLo = c + 1 2008 } 2009 } 2010 for _, xr := range x.R32 { 2011 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) 2012 if stride == 1 { 2013 if nextLo <= lo-1 { 2014 r = appendRange(r, nextLo, lo-1) 2015 } 2016 nextLo = hi + 1 2017 continue 2018 } 2019 for c := lo; c <= hi; c += stride { 2020 if nextLo <= c-1 { 2021 r = appendRange(r, nextLo, c-1) 2022 } 2023 nextLo = c + 1 2024 } 2025 } 2026 if nextLo <= unicode.MaxRune { 2027 r = appendRange(r, nextLo, unicode.MaxRune) 2028 } 2029 return r 2030 } 2031 2032 // negateClass overwrites r and returns r's negation. 2033 // It assumes the class r is already clean. 2034 func negateClass(r []rune) []rune { 2035 nextLo := '\u0000' // lo end of next class to add 2036 w := 0 // write index 2037 for i := 0; i < len(r); i += 2 { 2038 lo, hi := r[i], r[i+1] 2039 if nextLo <= lo-1 { 2040 r[w] = nextLo 2041 r[w+1] = lo - 1 2042 w += 2 2043 } 2044 nextLo = hi + 1 2045 } 2046 r = r[:w] 2047 if nextLo <= unicode.MaxRune { 2048 // It's possible for the negation to have one more 2049 // range - this one - than the original class, so use append. 2050 r = append(r, nextLo, unicode.MaxRune) 2051 } 2052 return r 2053 } 2054 2055 // ranges implements sort.Interface on a []rune. 2056 // The choice of receiver type definition is strange 2057 // but avoids an allocation since we already have 2058 // a *[]rune. 
2059 type ranges struct { 2060 p *[]rune 2061 } 2062 2063 func (ra ranges) Less(i, j int) bool { 2064 p := *ra.p 2065 i *= 2 2066 j *= 2 2067 return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1] 2068 } 2069 2070 func (ra ranges) Len() int { 2071 return len(*ra.p) / 2 2072 } 2073 2074 func (ra ranges) Swap(i, j int) { 2075 p := *ra.p 2076 i *= 2 2077 j *= 2 2078 p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1] 2079 } 2080 2081 func checkUTF8(s string) error { 2082 for s != "" { 2083 rune, size := utf8.DecodeRuneInString(s) 2084 if rune == utf8.RuneError && size == 1 { 2085 return &Error{Code: ErrInvalidUTF8, Expr: s} 2086 } 2087 s = s[size:] 2088 } 2089 return nil 2090 } 2091 2092 func nextRune(s string) (c rune, t string, err error) { 2093 c, size := utf8.DecodeRuneInString(s) 2094 if c == utf8.RuneError && size == 1 { 2095 return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s} 2096 } 2097 return c, s[size:], nil 2098 } 2099 2100 func isalnum(c rune) bool { 2101 return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' 2102 } 2103 2104 func unhex(c rune) rune { 2105 if '0' <= c && c <= '9' { 2106 return c - '0' 2107 } 2108 if 'a' <= c && c <= 'f' { 2109 return c - 'a' + 10 2110 } 2111 if 'A' <= c && c <= 'F' { 2112 return c - 'A' + 10 2113 } 2114 return -1 2115 }