// Origin: github.com/comwrg/go/src@v0.0.0-20220319063731-c238d0440370/regexp/syntax/parse.go

// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package syntax

import (
	"sort"
	"strings"
	"unicode"
	"unicode/utf8"
)

// An Error describes a failure to parse a regular expression
// and gives the offending expression.
type Error struct {
	Code ErrorCode // which kind of failure occurred
	Expr string    // the offending expression text
}

// Error implements the error interface. The message has the form
//	error parsing regexp: <code text>: `<expression>`
func (e *Error) Error() string {
	return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`"
}

// An ErrorCode describes a failure to parse a regular expression.
// Its underlying string is the human-readable description.
type ErrorCode string

const (
	// Unexpected error
	ErrInternalError ErrorCode = "regexp/syntax: internal error"

	// Parse errors
	ErrInvalidCharClass      ErrorCode = "invalid character class"
	ErrInvalidCharRange      ErrorCode = "invalid character class range"
	ErrInvalidEscape         ErrorCode = "invalid escape sequence"
	ErrInvalidNamedCapture   ErrorCode = "invalid named capture"
	ErrInvalidPerlOp         ErrorCode = "invalid or unsupported Perl syntax"
	ErrInvalidRepeatOp       ErrorCode = "invalid nested repetition operator"
	ErrInvalidRepeatSize     ErrorCode = "invalid repeat count"
	ErrInvalidUTF8           ErrorCode = "invalid UTF-8"
	ErrMissingBracket        ErrorCode = "missing closing ]"
	ErrMissingParen          ErrorCode = "missing closing )"
	ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator"
	ErrTrailingBackslash     ErrorCode = "trailing backslash at end of expression"
	ErrUnexpectedParen       ErrorCode = "unexpected )"
)

// String returns the description text of the error code.
func (e ErrorCode) String() string {
	return string(e)
}

// Flags control the behavior of the parser and record information about regexp context.
type Flags uint16

// Each flag below is a distinct bit; the order of the iota-generated
// constants determines their values and must not be changed.
const (
	FoldCase      Flags = 1 << iota // case-insensitive match
	Literal                         // treat pattern as literal string
	ClassNL                         // allow character classes like [^a-z] and [[:space:]] to match newline
	DotNL                           // allow . to match newline
	OneLine                         // treat ^ and $ as only matching at beginning and end of text
	NonGreedy                       // make repetition operators default to non-greedy
	PerlX                           // allow Perl extensions
	UnicodeGroups                   // allow \p{Han}, \P{Han} for Unicode group and negation
	WasDollar                       // regexp OpEndText was $, not \z
	Simple                          // regexp contains no counted repetition

	// MatchNL lets both character classes and . match newline.
	MatchNL = ClassNL | DotNL

	Perl        = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible
	POSIX Flags = 0                                         // POSIX syntax
)

// Pseudo-ops for parsing stack.
// They are numbered from opPseudo upward, so a stack entry is a
// pseudo-op exactly when its Op is >= opPseudo.
const (
	opLeftParen = opPseudo + iota
	opVerticalBar
)

// maxHeight is the maximum height of a regexp parse tree.
// It is somewhat arbitrarily chosen, but the idea is to be large enough
// that no one will actually hit in real use but at the same time small enough
// that recursion on the Regexp tree will not hit the 1GB Go stack limit.
// The maximum amount of stack for a single recursive frame is probably
// closer to 1kB, so this could potentially be raised, but it seems unlikely
// that people have regexps nested even this deeply.
// We ran a test on Google's C++ code base and turned up only
// a single use case with depth > 100; it had depth 128.
// Using depth 1000 should be plenty of margin.
// As an optimization, we don't even bother calculating heights
// until we've allocated at least maxHeight Regexp structures.
const maxHeight = 1000

// parser holds the state of a single parse.
type parser struct {
	flags       Flags           // parse mode flags
	stack       []*Regexp       // stack of parsed expressions
	free        *Regexp         // head of the free list of recycled nodes, linked through Sub0[0]
	numCap      int             // number of capturing groups seen
	wholeRegexp string          // the complete pattern being parsed
	tmpClass    []rune          // temporary char class work space
	numRegexp   int             // number of regexps allocated
	height      map[*Regexp]int // regexp height for height limit check
}

// newRegexp returns a Regexp node whose Op is op and whose other
// fields are zero. It takes a node from the free list when one is
// available; otherwise it allocates a new node and counts it in
// p.numRegexp.
func (p *parser) newRegexp(op Op) *Regexp {
	re := p.free
	if re != nil {
		// Unlink from the free list before clearing the node,
		// since the link lives in re.Sub0[0].
		p.free = re.Sub0[0]
		*re = Regexp{}
	} else {
		re = new(Regexp)
		p.numRegexp++
	}
	re.Op = op
	return re
}

// reuse puts re on the free list for a later newRegexp call and
// drops any height recorded for it. The caller must not use re
// afterwards: re.Sub0[0] is overwritten with the free-list link.
func (p *parser) reuse(re *Regexp) {
	if p.height != nil {
		delete(p.height, re)
	}
	re.Sub0[0] = p.free
	p.free = re
}

// checkHeight panics with ErrInternalError if the tree rooted at re
// is taller than maxHeight. It does nothing until at least maxHeight
// nodes have been allocated. The first time it does real work it
// creates the height map and checks every regexp already on the stack.
func (p *parser) checkHeight(re *Regexp) {
	if p.numRegexp < maxHeight {
		return
	}
	if p.height == nil {
		p.height = make(map[*Regexp]int)
		for _, re := range p.stack {
			p.checkHeight(re)
		}
	}
	if p.calcHeight(re, true) > maxHeight {
		panic(ErrInternalError)
	}
}

// calcHeight returns the height of the tree rooted at re and records
// it in p.height. When force is false a previously recorded height is
// returned as is; when force is true the height of re itself is
// recomputed (its children may still be answered from the map).
func (p *parser) calcHeight(re *Regexp, force bool) int {
	if !force {
		if h, ok := p.height[re]; ok {
			return h
		}
	}
	h := 1
	for _, sub := range re.Sub {
		hsub := p.calcHeight(sub, false)
		if h < 1+hsub {
			h = 1 + hsub
		}
	}
	p.height[re] = h
	return h
}

// Parse stack manipulation.

// push pushes the regexp re onto the parse stack and returns the regexp.
// Character classes that denote a single rune, or a single rune and its
// case fold, are rewritten as literals first. push returns nil when
// maybeConcat stored that rune in an existing stack node, in which case
// re itself is not pushed.
func (p *parser) push(re *Regexp) *Regexp {
	if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] {
		// Single rune.
		if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) {
			return nil
		}
		re.Op = OpLiteral
		re.Rune = re.Rune[:1]
		re.Flags = p.flags &^ FoldCase
	} else if re.Op == OpCharClass && len(re.Rune) == 4 &&
		re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] &&
		unicode.SimpleFold(re.Rune[0]) == re.Rune[2] &&
		unicode.SimpleFold(re.Rune[2]) == re.Rune[0] ||
		re.Op == OpCharClass && len(re.Rune) == 2 &&
			re.Rune[0]+1 == re.Rune[1] &&
			unicode.SimpleFold(re.Rune[0]) == re.Rune[1] &&
			unicode.SimpleFold(re.Rune[1]) == re.Rune[0] {
		// Case-insensitive rune like [Aa] or [Δδ].
		if p.maybeConcat(re.Rune[0], p.flags|FoldCase) {
			return nil
		}

		// Rewrite as (case-insensitive) literal.
		re.Op = OpLiteral
		re.Rune = re.Rune[:1]
		re.Flags = p.flags | FoldCase
	} else {
		// Incremental concatenation.
		p.maybeConcat(-1, 0)
	}

	p.stack = append(p.stack, re)
	p.checkHeight(re)
	return re
}

// maybeConcat implements incremental concatenation
// of literal runes into string nodes. The parser calls this
// before each push, so only the top fragment of the stack
// might need processing. Since this is called before a push,
// the topmost literal is no longer subject to operators like *.
// (Otherwise ab* would turn into (ab)*.)
// If r >= 0 and there's a node left over, maybeConcat uses it
// to push r with the given flags.
// maybeConcat reports whether r was pushed.
func (p *parser) maybeConcat(r rune, flags Flags) bool {
	n := len(p.stack)
	if n < 2 {
		return false
	}

	re1 := p.stack[n-1]
	re2 := p.stack[n-2]
	// Only two adjacent literals with the same case-folding mode can be merged.
	if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase {
		return false
	}

	// Push re1 into re2.
	re2.Rune = append(re2.Rune, re1.Rune...)

	// Reuse re1 if possible.
	if r >= 0 {
		re1.Rune = re1.Rune0[:1]
		re1.Rune[0] = r
		re1.Flags = flags
		return true
	}

	p.stack = p.stack[:n-1]
	p.reuse(re1)
	return false // did not push r
}

// literal pushes a literal regexp for the rune r on the stack.
// When case folding is on, r is first replaced by minFoldRune(r).
func (p *parser) literal(r rune) {
	re := p.newRegexp(OpLiteral)
	re.Flags = p.flags
	if p.flags&FoldCase != 0 {
		r = minFoldRune(r)
	}
	re.Rune0[0] = r
	re.Rune = re.Rune0[:1]
	p.push(re)
}

// minFoldRune returns the minimum rune fold-equivalent to r.
func minFoldRune(r rune) rune {
	if r < minFold || r > maxFold {
		return r
	}
	min := r
	r0 := r
	// Walk the SimpleFold orbit until it returns to the starting rune.
	for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) {
		if min > r {
			min = r
		}
	}
	return min
}

// op pushes a regexp with the given op onto the stack
// and returns that regexp.
func (p *parser) op(op Op) *Regexp {
	re := p.newRegexp(op)
	re.Flags = p.flags
	return p.push(re)
}

// repeat replaces the top stack element with itself repeated according to op, min, max.
// before is the regexp suffix starting at the repetition operator.
// after is the regexp suffix following after the repetition operator.
// lastRepeat is non-empty when the previous token was also a repetition
// operator; with PerlX set that is reported as ErrInvalidRepeatOp.
// repeat returns an updated 'after' and an error, if any.
func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, error) {
	flags := p.flags
	if p.flags&PerlX != 0 {
		// A trailing ? toggles greediness for this operator only.
		if len(after) > 0 && after[0] == '?' {
			after = after[1:]
			flags ^= NonGreedy
		}
		if lastRepeat != "" {
			// In Perl it is not allowed to stack repetition operators:
			//   a** is a syntax error, not a doubled star, and a++ means
			//   something else entirely, which we don't support!
			return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(after)]}
		}
	}
	n := len(p.stack)
	if n == 0 {
		return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
	}
	sub := p.stack[n-1]
	// A pseudo-op ( or | on top of the stack means there is nothing to repeat.
	if sub.Op >= opPseudo {
		return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
	}

	re := p.newRegexp(op)
	re.Min = min
	re.Max = max
	re.Flags = flags
	re.Sub = re.Sub0[:1]
	re.Sub[0] = sub
	p.stack[n-1] = re
	p.checkHeight(re)

	if op == OpRepeat && (min >= 2 || max >= 2) && !repeatIsValid(re, 1000) {
		return "", &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
	}

	return after, nil
}

// repeatIsValid reports whether the repetition re is valid.
// Valid means that the combination of the top-level repetition
// and any inner repetitions does not exceed n copies of the
// innermost thing.
// This function rewalks the regexp tree and is called for every repetition,
// so we have to worry about inducing quadratic behavior in the parser.
// We avoid this by only calling repeatIsValid when min or max >= 2.
// In that case the depth of any >= 2 nesting can only get to 9 without
// triggering a parse error, so each subtree can only be rewalked 9 times.
func repeatIsValid(re *Regexp, n int) bool {
	if re.Op == OpRepeat {
		m := re.Max
		if m == 0 {
			return true
		}
		// A negative Max means the count is unbounded above; use Min instead.
		if m < 0 {
			m = re.Min
		}
		if m > n {
			return false
		}
		if m > 0 {
			// Shrink the budget left for repetitions nested inside this one.
			n /= m
		}
	}
	for _, sub := range re.Sub {
		if !repeatIsValid(sub, n) {
			return false
		}
	}
	return true
}

// concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation.
func (p *parser) concat() *Regexp {
	p.maybeConcat(-1, 0)

	// Scan down to find pseudo-operator | or (.
	i := len(p.stack)
	for i > 0 && p.stack[i-1].Op < opPseudo {
		i--
	}
	subs := p.stack[i:]
	p.stack = p.stack[:i]

	// Empty concatenation is special case.
	if len(subs) == 0 {
		return p.push(p.newRegexp(OpEmptyMatch))
	}

	return p.push(p.collapse(subs, OpConcat))
}

// alternate replaces the top of the stack (above the topmost '(') with its alternation.
func (p *parser) alternate() *Regexp {
	// Scan down to find pseudo-operator (.
	// There are no | above (.
	i := len(p.stack)
	for i > 0 && p.stack[i-1].Op < opPseudo {
		i--
	}
	subs := p.stack[i:]
	p.stack = p.stack[:i]

	// Make sure top class is clean.
	// All the others already are (see swapVerticalBar).
	if len(subs) > 0 {
		cleanAlt(subs[len(subs)-1])
	}

	// Empty alternate is special case
	// (shouldn't happen but easy to handle).
	if len(subs) == 0 {
		return p.push(p.newRegexp(OpNoMatch))
	}

	return p.push(p.collapse(subs, OpAlternate))
}

// cleanAlt cleans re for eventual inclusion in an alternation.
// Only character classes are touched: the class is passed through
// cleanClass, then rewritten as OpAnyChar or OpAnyCharNotNL when its
// ranges cover everything or everything but newline.
func cleanAlt(re *Regexp) {
	switch re.Op {
	case OpCharClass:
		re.Rune = cleanClass(&re.Rune)
		if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune {
			re.Rune = nil
			re.Op = OpAnyChar
			return
		}
		if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune {
			re.Rune = nil
			re.Op = OpAnyCharNotNL
			return
		}
		if cap(re.Rune)-len(re.Rune) > 100 {
			// re.Rune will not grow any more.
			// Make a copy or inline to reclaim storage.
			re.Rune = append(re.Rune0[:0], re.Rune...)
		}
	}
}

// collapse returns the result of applying op to sub.
// If sub contains op nodes, they all get hoisted up
// so that there is never a concat of a concat or an
// alternate of an alternate.
417 func (p *parser) collapse(subs []*Regexp, op Op) *Regexp { 418 if len(subs) == 1 { 419 return subs[0] 420 } 421 re := p.newRegexp(op) 422 re.Sub = re.Sub0[:0] 423 for _, sub := range subs { 424 if sub.Op == op { 425 re.Sub = append(re.Sub, sub.Sub...) 426 p.reuse(sub) 427 } else { 428 re.Sub = append(re.Sub, sub) 429 } 430 } 431 if op == OpAlternate { 432 re.Sub = p.factor(re.Sub) 433 if len(re.Sub) == 1 { 434 old := re 435 re = re.Sub[0] 436 p.reuse(old) 437 } 438 } 439 return re 440 } 441 442 // factor factors common prefixes from the alternation list sub. 443 // It returns a replacement list that reuses the same storage and 444 // frees (passes to p.reuse) any removed *Regexps. 445 // 446 // For example, 447 // ABC|ABD|AEF|BCX|BCY 448 // simplifies by literal prefix extraction to 449 // A(B(C|D)|EF)|BC(X|Y) 450 // which simplifies by character class introduction to 451 // A(B[CD]|EF)|BC[XY] 452 // 453 func (p *parser) factor(sub []*Regexp) []*Regexp { 454 if len(sub) < 2 { 455 return sub 456 } 457 458 // Round 1: Factor out common literal prefixes. 459 var str []rune 460 var strflags Flags 461 start := 0 462 out := sub[:0] 463 for i := 0; i <= len(sub); i++ { 464 // Invariant: the Regexps that were in sub[0:start] have been 465 // used or marked for reuse, and the slice space has been reused 466 // for out (len(out) <= start). 467 // 468 // Invariant: sub[start:i] consists of regexps that all begin 469 // with str as modified by strflags. 470 var istr []rune 471 var iflags Flags 472 if i < len(sub) { 473 istr, iflags = p.leadingString(sub[i]) 474 if iflags == strflags { 475 same := 0 476 for same < len(str) && same < len(istr) && str[same] == istr[same] { 477 same++ 478 } 479 if same > 0 { 480 // Matches at least one rune in current range. 481 // Keep going around. 
482 str = str[:same] 483 continue 484 } 485 } 486 } 487 488 // Found end of a run with common leading literal string: 489 // sub[start:i] all begin with str[0:len(str)], but sub[i] 490 // does not even begin with str[0]. 491 // 492 // Factor out common string and append factored expression to out. 493 if i == start { 494 // Nothing to do - run of length 0. 495 } else if i == start+1 { 496 // Just one: don't bother factoring. 497 out = append(out, sub[start]) 498 } else { 499 // Construct factored form: prefix(suffix1|suffix2|...) 500 prefix := p.newRegexp(OpLiteral) 501 prefix.Flags = strflags 502 prefix.Rune = append(prefix.Rune[:0], str...) 503 504 for j := start; j < i; j++ { 505 sub[j] = p.removeLeadingString(sub[j], len(str)) 506 } 507 suffix := p.collapse(sub[start:i], OpAlternate) // recurse 508 509 re := p.newRegexp(OpConcat) 510 re.Sub = append(re.Sub[:0], prefix, suffix) 511 out = append(out, re) 512 } 513 514 // Prepare for next iteration. 515 start = i 516 str = istr 517 strflags = iflags 518 } 519 sub = out 520 521 // Round 2: Factor out common simple prefixes, 522 // just the first piece of each concatenation. 523 // This will be good enough a lot of the time. 524 // 525 // Complex subexpressions (e.g. involving quantifiers) 526 // are not safe to factor because that collapses their 527 // distinct paths through the automaton, which affects 528 // correctness in some cases. 529 start = 0 530 out = sub[:0] 531 var first *Regexp 532 for i := 0; i <= len(sub); i++ { 533 // Invariant: the Regexps that were in sub[0:start] have been 534 // used or marked for reuse, and the slice space has been reused 535 // for out (len(out) <= start). 536 // 537 // Invariant: sub[start:i] consists of regexps that all begin with ifirst. 538 var ifirst *Regexp 539 if i < len(sub) { 540 ifirst = p.leadingRegexp(sub[i]) 541 if first != nil && first.Equal(ifirst) && 542 // first must be a character class OR a fixed repeat of a character class. 
543 (isCharClass(first) || (first.Op == OpRepeat && first.Min == first.Max && isCharClass(first.Sub[0]))) { 544 continue 545 } 546 } 547 548 // Found end of a run with common leading regexp: 549 // sub[start:i] all begin with first but sub[i] does not. 550 // 551 // Factor out common regexp and append factored expression to out. 552 if i == start { 553 // Nothing to do - run of length 0. 554 } else if i == start+1 { 555 // Just one: don't bother factoring. 556 out = append(out, sub[start]) 557 } else { 558 // Construct factored form: prefix(suffix1|suffix2|...) 559 prefix := first 560 for j := start; j < i; j++ { 561 reuse := j != start // prefix came from sub[start] 562 sub[j] = p.removeLeadingRegexp(sub[j], reuse) 563 } 564 suffix := p.collapse(sub[start:i], OpAlternate) // recurse 565 566 re := p.newRegexp(OpConcat) 567 re.Sub = append(re.Sub[:0], prefix, suffix) 568 out = append(out, re) 569 } 570 571 // Prepare for next iteration. 572 start = i 573 first = ifirst 574 } 575 sub = out 576 577 // Round 3: Collapse runs of single literals into character classes. 578 start = 0 579 out = sub[:0] 580 for i := 0; i <= len(sub); i++ { 581 // Invariant: the Regexps that were in sub[0:start] have been 582 // used or marked for reuse, and the slice space has been reused 583 // for out (len(out) <= start). 584 // 585 // Invariant: sub[start:i] consists of regexps that are either 586 // literal runes or character classes. 587 if i < len(sub) && isCharClass(sub[i]) { 588 continue 589 } 590 591 // sub[i] is not a char or char class; 592 // emit char class for sub[start:i]... 593 if i == start { 594 // Nothing to do - run of length 0. 595 } else if i == start+1 { 596 out = append(out, sub[start]) 597 } else { 598 // Make new char class. 599 // Start with most complex regexp in sub[start]. 
600 max := start 601 for j := start + 1; j < i; j++ { 602 if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) { 603 max = j 604 } 605 } 606 sub[start], sub[max] = sub[max], sub[start] 607 608 for j := start + 1; j < i; j++ { 609 mergeCharClass(sub[start], sub[j]) 610 p.reuse(sub[j]) 611 } 612 cleanAlt(sub[start]) 613 out = append(out, sub[start]) 614 } 615 616 // ... and then emit sub[i]. 617 if i < len(sub) { 618 out = append(out, sub[i]) 619 } 620 start = i + 1 621 } 622 sub = out 623 624 // Round 4: Collapse runs of empty matches into a single empty match. 625 start = 0 626 out = sub[:0] 627 for i := range sub { 628 if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch { 629 continue 630 } 631 out = append(out, sub[i]) 632 } 633 sub = out 634 635 return sub 636 } 637 638 // leadingString returns the leading literal string that re begins with. 639 // The string refers to storage in re or its children. 640 func (p *parser) leadingString(re *Regexp) ([]rune, Flags) { 641 if re.Op == OpConcat && len(re.Sub) > 0 { 642 re = re.Sub[0] 643 } 644 if re.Op != OpLiteral { 645 return nil, 0 646 } 647 return re.Rune, re.Flags & FoldCase 648 } 649 650 // removeLeadingString removes the first n leading runes 651 // from the beginning of re. It returns the replacement for re. 652 func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp { 653 if re.Op == OpConcat && len(re.Sub) > 0 { 654 // Removing a leading string in a concatenation 655 // might simplify the concatenation. 656 sub := re.Sub[0] 657 sub = p.removeLeadingString(sub, n) 658 re.Sub[0] = sub 659 if sub.Op == OpEmptyMatch { 660 p.reuse(sub) 661 switch len(re.Sub) { 662 case 0, 1: 663 // Impossible but handle. 
664 re.Op = OpEmptyMatch 665 re.Sub = nil 666 case 2: 667 old := re 668 re = re.Sub[1] 669 p.reuse(old) 670 default: 671 copy(re.Sub, re.Sub[1:]) 672 re.Sub = re.Sub[:len(re.Sub)-1] 673 } 674 } 675 return re 676 } 677 678 if re.Op == OpLiteral { 679 re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])] 680 if len(re.Rune) == 0 { 681 re.Op = OpEmptyMatch 682 } 683 } 684 return re 685 } 686 687 // leadingRegexp returns the leading regexp that re begins with. 688 // The regexp refers to storage in re or its children. 689 func (p *parser) leadingRegexp(re *Regexp) *Regexp { 690 if re.Op == OpEmptyMatch { 691 return nil 692 } 693 if re.Op == OpConcat && len(re.Sub) > 0 { 694 sub := re.Sub[0] 695 if sub.Op == OpEmptyMatch { 696 return nil 697 } 698 return sub 699 } 700 return re 701 } 702 703 // removeLeadingRegexp removes the leading regexp in re. 704 // It returns the replacement for re. 705 // If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse. 706 func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp { 707 if re.Op == OpConcat && len(re.Sub) > 0 { 708 if reuse { 709 p.reuse(re.Sub[0]) 710 } 711 re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])] 712 switch len(re.Sub) { 713 case 0: 714 re.Op = OpEmptyMatch 715 re.Sub = nil 716 case 1: 717 old := re 718 re = re.Sub[0] 719 p.reuse(old) 720 } 721 return re 722 } 723 if reuse { 724 p.reuse(re) 725 } 726 return p.newRegexp(OpEmptyMatch) 727 } 728 729 func literalRegexp(s string, flags Flags) *Regexp { 730 re := &Regexp{Op: OpLiteral} 731 re.Flags = flags 732 re.Rune = re.Rune0[:0] // use local storage for small strings 733 for _, c := range s { 734 if len(re.Rune) >= cap(re.Rune) { 735 // string is too long to fit in Rune0. let Go handle it 736 re.Rune = []rune(s) 737 break 738 } 739 re.Rune = append(re.Rune, c) 740 } 741 return re 742 } 743 744 // Parsing. 

// Parse parses a regular expression string s, controlled by the specified
// Flags, and returns a regular expression parse tree. The syntax is
// described in the top-level comment.
func Parse(s string, flags Flags) (*Regexp, error) {
	return parse(s, flags)
}

// parse does the work of Parse. A panic with the value ErrInternalError
// raised while parsing (see checkHeight) is turned into an *Error return;
// any other panic is re-raised.
func parse(s string, flags Flags) (_ *Regexp, err error) {
	defer func() {
		switch r := recover(); r {
		default:
			panic(r)
		case nil:
			// ok
		case ErrInternalError:
			err = &Error{Code: ErrInternalError, Expr: s}
		}
	}()

	if flags&Literal != 0 {
		// Trivial parser for literal string.
		if err := checkUTF8(s); err != nil {
			return nil, err
		}
		return literalRegexp(s, flags), nil
	}

	// Otherwise, must do real work.
	var (
		p          parser
		c          rune
		op         Op
		lastRepeat string
	)
	p.flags = flags
	p.wholeRegexp = s
	t := s
	for t != "" {
		// repeat stays empty unless this token is a repetition operator;
		// it becomes lastRepeat for the next token.
		repeat := ""
	BigSwitch:
		switch t[0] {
		default:
			if c, t, err = nextRune(t); err != nil {
				return nil, err
			}
			p.literal(c)

		case '(':
			if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' {
				// Flag changes and non-capturing groups.
				if t, err = p.parsePerlFlags(t); err != nil {
					return nil, err
				}
				break
			}
			p.numCap++
			p.op(opLeftParen).Cap = p.numCap
			t = t[1:]
		case '|':
			if err = p.parseVerticalBar(); err != nil {
				return nil, err
			}
			t = t[1:]
		case ')':
			if err = p.parseRightParen(); err != nil {
				return nil, err
			}
			t = t[1:]
		case '^':
			if p.flags&OneLine != 0 {
				p.op(OpBeginText)
			} else {
				p.op(OpBeginLine)
			}
			t = t[1:]
		case '$':
			if p.flags&OneLine != 0 {
				p.op(OpEndText).Flags |= WasDollar
			} else {
				p.op(OpEndLine)
			}
			t = t[1:]
		case '.':
			if p.flags&DotNL != 0 {
				p.op(OpAnyChar)
			} else {
				p.op(OpAnyCharNotNL)
			}
			t = t[1:]
		case '[':
			if t, err = p.parseClass(t); err != nil {
				return nil, err
			}
		case '*', '+', '?':
			before := t
			switch t[0] {
			case '*':
				op = OpStar
			case '+':
				op = OpPlus
			case '?':
				op = OpQuest
			}
			after := t[1:]
			if after, err = p.repeat(op, 0, 0, before, after, lastRepeat); err != nil {
				return nil, err
			}
			repeat = before
			t = after
		case '{':
			op = OpRepeat
			before := t
			min, max, after, ok := p.parseRepeat(t)
			if !ok {
				// If the repeat cannot be parsed, { is a literal.
				p.literal('{')
				t = t[1:]
				break
			}
			if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max {
				// Numbers were too big, or max is present and min > max.
				return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
			}
			if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {
				return nil, err
			}
			repeat = before
			t = after
		case '\\':
			if p.flags&PerlX != 0 && len(t) >= 2 {
				switch t[1] {
				case 'A':
					p.op(OpBeginText)
					t = t[2:]
					break BigSwitch
				case 'b':
					p.op(OpWordBoundary)
					t = t[2:]
					break BigSwitch
				case 'B':
					p.op(OpNoWordBoundary)
					t = t[2:]
					break BigSwitch
				case 'C':
					// any byte; not supported
					return nil, &Error{ErrInvalidEscape, t[:2]}
				case 'Q':
					// \Q ... \E: the ... is always literals
					var lit string
					if i := strings.Index(t, `\E`); i < 0 {
						// No closing \E: the rest of the pattern is literal.
						lit = t[2:]
						t = ""
					} else {
						lit = t[2:i]
						t = t[i+2:]
					}
					for lit != "" {
						c, rest, err := nextRune(lit)
						if err != nil {
							return nil, err
						}
						p.literal(c)
						lit = rest
					}
					break BigSwitch
				case 'z':
					p.op(OpEndText)
					t = t[2:]
					break BigSwitch
				}
			}

			// Allocate a class node up front; it is handed back with
			// p.reuse below if the escape is not a class.
			re := p.newRegexp(OpCharClass)
			re.Flags = p.flags

			// Look for Unicode character group like \p{Han}
			if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') {
				r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0])
				if err != nil {
					return nil, err
				}
				if r != nil {
					re.Rune = r
					t = rest
					p.push(re)
					break BigSwitch
				}
			}

			// Perl character class escape.
			if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil {
				re.Rune = r
				t = rest
				p.push(re)
				break BigSwitch
			}
			p.reuse(re)

			// Ordinary single-character escape.
			if c, t, err = p.parseEscape(t); err != nil {
				return nil, err
			}
			p.literal(c)
		}
		lastRepeat = repeat
	}

	p.concat()
	if p.swapVerticalBar() {
		// pop vertical bar
		p.stack = p.stack[:len(p.stack)-1]
	}
	p.alternate()

	// Anything other than exactly one entry means an unclosed group remains.
	n := len(p.stack)
	if n != 1 {
		return nil, &Error{ErrMissingParen, s}
	}
	return p.stack[0], nil
}

// parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}.
// If s is not of that form, it returns ok == false.
// If s has the right form but the values are too big, it returns min == -1, ok == true.
func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) {
	if s == "" || s[0] != '{' {
		return
	}
	s = s[1:]
	var ok1 bool
	if min, s, ok1 = p.parseInt(s); !ok1 {
		return
	}
	if s == "" {
		return
	}
	if s[0] != ',' {
		max = min
	} else {
		s = s[1:]
		if s == "" {
			return
		}
		if s[0] == '}' {
			max = -1
		} else if max, s, ok1 = p.parseInt(s); !ok1 {
			return
		} else if max < 0 {
			// parseInt found too big a number
			min = -1
		}
	}
	if s == "" || s[0] != '}' {
		return
	}
	rest = s[1:]
	ok = true
	return
}

// parsePerlFlags parses a Perl flag setting or non-capturing group or both,
// like (?i) or (?: or (?i:. It removes the prefix from s and updates the parse state.
// The caller must have ensured that s begins with "(?".
func (p *parser) parsePerlFlags(s string) (rest string, err error) {
	t := s

	// Check for named captures, first introduced in Python's regexp library.
	// As usual, there are three slightly different syntaxes:
	//
	//   (?P<name>expr)   the original, introduced by Python
	//   (?<name>expr)    the .NET alteration, adopted by Perl 5.10
	//   (?'name'expr)    another .NET alteration, adopted by Perl 5.10
	//
	// Perl 5.10 gave in and implemented the Python version too,
	// but they claim that the last two are the preferred forms.
	// PCRE and languages based on it (specifically, PHP and Ruby)
	// support all three as well. EcmaScript 4 uses only the Python form.
	//
	// In both the open source world (via Code Search) and the
	// Google source tree, (?P<name>expr) is the dominant form,
	// so that's the one we implement. One is enough.
	if len(t) > 4 && t[2] == 'P' && t[3] == '<' {
		// Pull out name.
		end := strings.IndexRune(t, '>')
		if end < 0 {
			// Report invalid UTF-8 in preference to the missing '>'.
			if err = checkUTF8(t); err != nil {
				return "", err
			}
			return "", &Error{ErrInvalidNamedCapture, s}
		}

		capture := t[:end+1] // "(?P<name>"
		name := t[4:end]     // "name"
		if err = checkUTF8(name); err != nil {
			return "", err
		}
		if !isValidCaptureName(name) {
			return "", &Error{ErrInvalidNamedCapture, capture}
		}

		// Like ordinary capture, but named.
		p.numCap++
		re := p.op(opLeftParen)
		re.Cap = p.numCap
		re.Name = name
		return t[end+1:], nil
	}

	// Non-capturing group. Might also twiddle Perl flags.
	var c rune
	t = t[2:] // skip (?
	flags := p.flags
	sign := +1
	sawFlag := false
Loop:
	for t != "" {
		if c, t, err = nextRune(t); err != nil {
			return "", err
		}
		switch c {
		default:
			break Loop

		// Flags.
		case 'i':
			flags |= FoldCase
			sawFlag = true
		case 'm':
			flags &^= OneLine
			sawFlag = true
		case 's':
			flags |= DotNL
			sawFlag = true
		case 'U':
			flags |= NonGreedy
			sawFlag = true

		// Switch to negation.
		case '-':
			if sign < 0 {
				break Loop
			}
			sign = -1
			// Invert flags so that | above turn into &^ and vice versa.
			// We'll invert flags again before using it below.
			flags = ^flags
			sawFlag = false

		// End of flags, starting group or not.
		case ':', ')':
			if sign < 0 {
				// A '-' must be followed by at least one flag.
				if !sawFlag {
					break Loop
				}
				flags = ^flags
			}
			if c == ':' {
				// Open new group
				p.op(opLeftParen)
			}
			p.flags = flags
			return t, nil
		}
	}

	return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]}
}

// isValidCaptureName reports whether name
// is a valid capture name: [A-Za-z0-9_]+.
// PCRE limits names to 32 bytes.
// Python rejects names starting with digits.
// We don't enforce either of those.
func isValidCaptureName(name string) bool {
	if name == "" {
		return false
	}
	for _, c := range name {
		if c != '_' && !isalnum(c) {
			return false
		}
	}
	return true
}

// parseInt parses a decimal integer.
// It returns ok == false if s does not start with a digit or has a
// leading zero followed by another digit. If the value is too large,
// it returns n == -1 with ok == true; rest still skips all the digits.
func (p *parser) parseInt(s string) (n int, rest string, ok bool) {
	if s == "" || s[0] < '0' || '9' < s[0] {
		return
	}
	// Disallow leading zeros.
	if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' {
		return
	}
	t := s
	for s != "" && '0' <= s[0] && s[0] <= '9' {
		s = s[1:]
	}
	rest = s
	ok = true
	// Have digits, compute value.
	t = t[:len(t)-len(s)]
	for i := 0; i < len(t); i++ {
		// Avoid overflow.
		if n >= 1e8 {
			n = -1
			break
		}
		n = n*10 + int(t[i]) - '0'
	}
	return
}

// can this be represented as a character class?
// single-rune literal string, char class, ., and .|\n.
func isCharClass(re *Regexp) bool {
	return re.Op == OpLiteral && len(re.Rune) == 1 ||
		re.Op == OpCharClass ||
		re.Op == OpAnyCharNotNL ||
		re.Op == OpAnyChar
}

// does re match r?
func matchRune(re *Regexp, r rune) bool {
	switch re.Op {
	case OpLiteral:
		return len(re.Rune) == 1 && re.Rune[0] == r
	case OpCharClass:
		// re.Rune holds inclusive lo, hi pairs.
		for i := 0; i < len(re.Rune); i += 2 {
			if re.Rune[i] <= r && r <= re.Rune[i+1] {
				return true
			}
		}
		return false
	case OpAnyCharNotNL:
		return r != '\n'
	case OpAnyChar:
		return true
	}
	return false
}

// parseVerticalBar handles a | in the input.
// The returned error is always nil.
func (p *parser) parseVerticalBar() error {
	p.concat()

	// The concatenation we just parsed is on top of the stack.
	// If it sits above an opVerticalBar, swap it below
	// (things below an opVerticalBar become an alternation).
	// Otherwise, push a new vertical bar.
	if !p.swapVerticalBar() {
		p.op(opVerticalBar)
	}

	return nil
}

// mergeCharClass makes dst = dst|src.
// The caller must ensure that dst.Op >= src.Op,
// to reduce the amount of copying.
func mergeCharClass(dst, src *Regexp) {
	switch dst.Op {
	case OpAnyChar:
		// src doesn't add anything.
	case OpAnyCharNotNL:
		// src might add \n
		if matchRune(src, '\n') {
			dst.Op = OpAnyChar
		}
	case OpCharClass:
		// src is simpler, so either literal or char class
		if src.Op == OpLiteral {
			dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
		} else {
			dst.Rune = appendClass(dst.Rune, src.Rune)
		}
	case OpLiteral:
		// both literal
		if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags {
			break
		}
		dst.Op = OpCharClass
		dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags)
		dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
	}
}

// If the top of the stack is an element followed by an opVerticalBar
// swapVerticalBar swaps the two and returns true.
// Otherwise it returns false.
func (p *parser) swapVerticalBar() bool {
	// If above and below vertical bar are literal or char class,
	// can merge into a single char class.
	n := len(p.stack)
	if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) {
		re1 := p.stack[n-1]
		re3 := p.stack[n-3]
		// Make re3 the more complex of the two.
		if re1.Op > re3.Op {
			re1, re3 = re3, re1
			p.stack[n-3] = re3
		}
		mergeCharClass(re3, re1)
		p.reuse(re1)
		// Drop the merged element; the vertical bar is now on top.
		p.stack = p.stack[:n-1]
		return true
	}

	if n >= 2 {
		re1 := p.stack[n-1]
		re2 := p.stack[n-2]
		if re2.Op == opVerticalBar {
			if n >= 3 {
				// Now out of reach.
				// Clean opportunistically.
				cleanAlt(p.stack[n-3])
			}
			p.stack[n-2] = re1
			p.stack[n-1] = re2
			return true
		}
	}
	return false
}

// parseRightParen handles a ) in the input.
1272 func (p *parser) parseRightParen() error { 1273 p.concat() 1274 if p.swapVerticalBar() { 1275 // pop vertical bar 1276 p.stack = p.stack[:len(p.stack)-1] 1277 } 1278 p.alternate() 1279 1280 n := len(p.stack) 1281 if n < 2 { 1282 return &Error{ErrUnexpectedParen, p.wholeRegexp} 1283 } 1284 re1 := p.stack[n-1] 1285 re2 := p.stack[n-2] 1286 p.stack = p.stack[:n-2] 1287 if re2.Op != opLeftParen { 1288 return &Error{ErrUnexpectedParen, p.wholeRegexp} 1289 } 1290 // Restore flags at time of paren. 1291 p.flags = re2.Flags 1292 if re2.Cap == 0 { 1293 // Just for grouping. 1294 p.push(re1) 1295 } else { 1296 re2.Op = OpCapture 1297 re2.Sub = re2.Sub0[:1] 1298 re2.Sub[0] = re1 1299 p.push(re2) 1300 } 1301 return nil 1302 } 1303 1304 // parseEscape parses an escape sequence at the beginning of s 1305 // and returns the rune. 1306 func (p *parser) parseEscape(s string) (r rune, rest string, err error) { 1307 t := s[1:] 1308 if t == "" { 1309 return 0, "", &Error{ErrTrailingBackslash, ""} 1310 } 1311 c, t, err := nextRune(t) 1312 if err != nil { 1313 return 0, "", err 1314 } 1315 1316 Switch: 1317 switch c { 1318 default: 1319 if c < utf8.RuneSelf && !isalnum(c) { 1320 // Escaped non-word characters are always themselves. 1321 // PCRE is not quite so rigorous: it accepts things like 1322 // \q, but we don't. We once rejected \_, but too many 1323 // programs and people insist on using it, so allow \_. 1324 return c, t, nil 1325 } 1326 1327 // Octal escapes. 1328 case '1', '2', '3', '4', '5', '6', '7': 1329 // Single non-zero digit is a backreference; not supported 1330 if t == "" || t[0] < '0' || t[0] > '7' { 1331 break 1332 } 1333 fallthrough 1334 case '0': 1335 // Consume up to three octal digits; already have one. 1336 r = c - '0' 1337 for i := 1; i < 3; i++ { 1338 if t == "" || t[0] < '0' || t[0] > '7' { 1339 break 1340 } 1341 r = r*8 + rune(t[0]) - '0' 1342 t = t[1:] 1343 } 1344 return r, t, nil 1345 1346 // Hexadecimal escapes. 
1347 case 'x': 1348 if t == "" { 1349 break 1350 } 1351 if c, t, err = nextRune(t); err != nil { 1352 return 0, "", err 1353 } 1354 if c == '{' { 1355 // Any number of digits in braces. 1356 // Perl accepts any text at all; it ignores all text 1357 // after the first non-hex digit. We require only hex digits, 1358 // and at least one. 1359 nhex := 0 1360 r = 0 1361 for { 1362 if t == "" { 1363 break Switch 1364 } 1365 if c, t, err = nextRune(t); err != nil { 1366 return 0, "", err 1367 } 1368 if c == '}' { 1369 break 1370 } 1371 v := unhex(c) 1372 if v < 0 { 1373 break Switch 1374 } 1375 r = r*16 + v 1376 if r > unicode.MaxRune { 1377 break Switch 1378 } 1379 nhex++ 1380 } 1381 if nhex == 0 { 1382 break Switch 1383 } 1384 return r, t, nil 1385 } 1386 1387 // Easy case: two hex digits. 1388 x := unhex(c) 1389 if c, t, err = nextRune(t); err != nil { 1390 return 0, "", err 1391 } 1392 y := unhex(c) 1393 if x < 0 || y < 0 { 1394 break 1395 } 1396 return x*16 + y, t, nil 1397 1398 // C escapes. There is no case 'b', to avoid misparsing 1399 // the Perl word-boundary \b as the C backspace \b 1400 // when in POSIX mode. In Perl, /\b/ means word-boundary 1401 // but /[\b]/ means backspace. We don't support that. 1402 // If you want a backspace, embed a literal backspace 1403 // character or use \x08. 1404 case 'a': 1405 return '\a', t, err 1406 case 'f': 1407 return '\f', t, err 1408 case 'n': 1409 return '\n', t, err 1410 case 'r': 1411 return '\r', t, err 1412 case 't': 1413 return '\t', t, err 1414 case 'v': 1415 return '\v', t, err 1416 } 1417 return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]} 1418 } 1419 1420 // parseClassChar parses a character class character at the beginning of s 1421 // and returns it. 
1422 func (p *parser) parseClassChar(s, wholeClass string) (r rune, rest string, err error) { 1423 if s == "" { 1424 return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass} 1425 } 1426 1427 // Allow regular escape sequences even though 1428 // many need not be escaped in this context. 1429 if s[0] == '\\' { 1430 return p.parseEscape(s) 1431 } 1432 1433 return nextRune(s) 1434 } 1435 1436 type charGroup struct { 1437 sign int 1438 class []rune 1439 } 1440 1441 // parsePerlClassEscape parses a leading Perl character class escape like \d 1442 // from the beginning of s. If one is present, it appends the characters to r 1443 // and returns the new slice r and the remainder of the string. 1444 func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string) { 1445 if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' { 1446 return 1447 } 1448 g := perlGroup[s[0:2]] 1449 if g.sign == 0 { 1450 return 1451 } 1452 return p.appendGroup(r, g), s[2:] 1453 } 1454 1455 // parseNamedClass parses a leading POSIX named character class like [:alnum:] 1456 // from the beginning of s. If one is present, it appends the characters to r 1457 // and returns the new slice r and the remainder of the string. 
1458 func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error) { 1459 if len(s) < 2 || s[0] != '[' || s[1] != ':' { 1460 return 1461 } 1462 1463 i := strings.Index(s[2:], ":]") 1464 if i < 0 { 1465 return 1466 } 1467 i += 2 1468 name, s := s[0:i+2], s[i+2:] 1469 g := posixGroup[name] 1470 if g.sign == 0 { 1471 return nil, "", &Error{ErrInvalidCharRange, name} 1472 } 1473 return p.appendGroup(r, g), s, nil 1474 } 1475 1476 func (p *parser) appendGroup(r []rune, g charGroup) []rune { 1477 if p.flags&FoldCase == 0 { 1478 if g.sign < 0 { 1479 r = appendNegatedClass(r, g.class) 1480 } else { 1481 r = appendClass(r, g.class) 1482 } 1483 } else { 1484 tmp := p.tmpClass[:0] 1485 tmp = appendFoldedClass(tmp, g.class) 1486 p.tmpClass = tmp 1487 tmp = cleanClass(&p.tmpClass) 1488 if g.sign < 0 { 1489 r = appendNegatedClass(r, tmp) 1490 } else { 1491 r = appendClass(r, tmp) 1492 } 1493 } 1494 return r 1495 } 1496 1497 var anyTable = &unicode.RangeTable{ 1498 R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}}, 1499 R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}}, 1500 } 1501 1502 // unicodeTable returns the unicode.RangeTable identified by name 1503 // and the table of additional fold-equivalent code points. 1504 func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) { 1505 // Special case: "Any" means any. 1506 if name == "Any" { 1507 return anyTable, anyTable 1508 } 1509 if t := unicode.Categories[name]; t != nil { 1510 return t, unicode.FoldCategory[name] 1511 } 1512 if t := unicode.Scripts[name]; t != nil { 1513 return t, unicode.FoldScript[name] 1514 } 1515 return nil, nil 1516 } 1517 1518 // parseUnicodeClass parses a leading Unicode character class like \p{Han} 1519 // from the beginning of s. If one is present, it appends the characters to r 1520 // and returns the new slice r and the remainder of the string. 
1521 func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error) { 1522 if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' { 1523 return 1524 } 1525 1526 // Committed to parse or return error. 1527 sign := +1 1528 if s[1] == 'P' { 1529 sign = -1 1530 } 1531 t := s[2:] 1532 c, t, err := nextRune(t) 1533 if err != nil { 1534 return 1535 } 1536 var seq, name string 1537 if c != '{' { 1538 // Single-letter name. 1539 seq = s[:len(s)-len(t)] 1540 name = seq[2:] 1541 } else { 1542 // Name is in braces. 1543 end := strings.IndexRune(s, '}') 1544 if end < 0 { 1545 if err = checkUTF8(s); err != nil { 1546 return 1547 } 1548 return nil, "", &Error{ErrInvalidCharRange, s} 1549 } 1550 seq, t = s[:end+1], s[end+1:] 1551 name = s[3:end] 1552 if err = checkUTF8(name); err != nil { 1553 return 1554 } 1555 } 1556 1557 // Group can have leading negation too. \p{^Han} == \P{Han}, \P{^Han} == \p{Han}. 1558 if name != "" && name[0] == '^' { 1559 sign = -sign 1560 name = name[1:] 1561 } 1562 1563 tab, fold := unicodeTable(name) 1564 if tab == nil { 1565 return nil, "", &Error{ErrInvalidCharRange, seq} 1566 } 1567 1568 if p.flags&FoldCase == 0 || fold == nil { 1569 if sign > 0 { 1570 r = appendTable(r, tab) 1571 } else { 1572 r = appendNegatedTable(r, tab) 1573 } 1574 } else { 1575 // Merge and clean tab and fold in a temporary buffer. 1576 // This is necessary for the negative case and just tidy 1577 // for the positive case. 1578 tmp := p.tmpClass[:0] 1579 tmp = appendTable(tmp, tab) 1580 tmp = appendTable(tmp, fold) 1581 p.tmpClass = tmp 1582 tmp = cleanClass(&p.tmpClass) 1583 if sign > 0 { 1584 r = appendClass(r, tmp) 1585 } else { 1586 r = appendNegatedClass(r, tmp) 1587 } 1588 } 1589 return r, t, nil 1590 } 1591 1592 // parseClass parses a character class at the beginning of s 1593 // and pushes it onto the parse stack. 
1594 func (p *parser) parseClass(s string) (rest string, err error) { 1595 t := s[1:] // chop [ 1596 re := p.newRegexp(OpCharClass) 1597 re.Flags = p.flags 1598 re.Rune = re.Rune0[:0] 1599 1600 sign := +1 1601 if t != "" && t[0] == '^' { 1602 sign = -1 1603 t = t[1:] 1604 1605 // If character class does not match \n, add it here, 1606 // so that negation later will do the right thing. 1607 if p.flags&ClassNL == 0 { 1608 re.Rune = append(re.Rune, '\n', '\n') 1609 } 1610 } 1611 1612 class := re.Rune 1613 first := true // ] and - are okay as first char in class 1614 for t == "" || t[0] != ']' || first { 1615 // POSIX: - is only okay unescaped as first or last in class. 1616 // Perl: - is okay anywhere. 1617 if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') { 1618 _, size := utf8.DecodeRuneInString(t[1:]) 1619 return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]} 1620 } 1621 first = false 1622 1623 // Look for POSIX [:alnum:] etc. 1624 if len(t) > 2 && t[0] == '[' && t[1] == ':' { 1625 nclass, nt, err := p.parseNamedClass(t, class) 1626 if err != nil { 1627 return "", err 1628 } 1629 if nclass != nil { 1630 class, t = nclass, nt 1631 continue 1632 } 1633 } 1634 1635 // Look for Unicode character group like \p{Han}. 1636 nclass, nt, err := p.parseUnicodeClass(t, class) 1637 if err != nil { 1638 return "", err 1639 } 1640 if nclass != nil { 1641 class, t = nclass, nt 1642 continue 1643 } 1644 1645 // Look for Perl character class symbols (extension). 1646 if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil { 1647 class, t = nclass, nt 1648 continue 1649 } 1650 1651 // Single character or simple range. 1652 rng := t 1653 var lo, hi rune 1654 if lo, t, err = p.parseClassChar(t, s); err != nil { 1655 return "", err 1656 } 1657 hi = lo 1658 // [a-] means (a|-) so check for final ]. 
1659 if len(t) >= 2 && t[0] == '-' && t[1] != ']' { 1660 t = t[1:] 1661 if hi, t, err = p.parseClassChar(t, s); err != nil { 1662 return "", err 1663 } 1664 if hi < lo { 1665 rng = rng[:len(rng)-len(t)] 1666 return "", &Error{Code: ErrInvalidCharRange, Expr: rng} 1667 } 1668 } 1669 if p.flags&FoldCase == 0 { 1670 class = appendRange(class, lo, hi) 1671 } else { 1672 class = appendFoldedRange(class, lo, hi) 1673 } 1674 } 1675 t = t[1:] // chop ] 1676 1677 // Use &re.Rune instead of &class to avoid allocation. 1678 re.Rune = class 1679 class = cleanClass(&re.Rune) 1680 if sign < 0 { 1681 class = negateClass(class) 1682 } 1683 re.Rune = class 1684 p.push(re) 1685 return t, nil 1686 } 1687 1688 // cleanClass sorts the ranges (pairs of elements of r), 1689 // merges them, and eliminates duplicates. 1690 func cleanClass(rp *[]rune) []rune { 1691 1692 // Sort by lo increasing, hi decreasing to break ties. 1693 sort.Sort(ranges{rp}) 1694 1695 r := *rp 1696 if len(r) < 2 { 1697 return r 1698 } 1699 1700 // Merge abutting, overlapping. 1701 w := 2 // write index 1702 for i := 2; i < len(r); i += 2 { 1703 lo, hi := r[i], r[i+1] 1704 if lo <= r[w-1]+1 { 1705 // merge with previous range 1706 if hi > r[w-1] { 1707 r[w-1] = hi 1708 } 1709 continue 1710 } 1711 // new disjoint range 1712 r[w] = lo 1713 r[w+1] = hi 1714 w += 2 1715 } 1716 1717 return r[:w] 1718 } 1719 1720 // appendLiteral returns the result of appending the literal x to the class r. 1721 func appendLiteral(r []rune, x rune, flags Flags) []rune { 1722 if flags&FoldCase != 0 { 1723 return appendFoldedRange(r, x, x) 1724 } 1725 return appendRange(r, x, x) 1726 } 1727 1728 // appendRange returns the result of appending the range lo-hi to the class r. 1729 func appendRange(r []rune, lo, hi rune) []rune { 1730 // Expand last range or next to last range if it overlaps or abuts. 
1731 // Checking two ranges helps when appending case-folded 1732 // alphabets, so that one range can be expanding A-Z and the 1733 // other expanding a-z. 1734 n := len(r) 1735 for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4 1736 if n >= i { 1737 rlo, rhi := r[n-i], r[n-i+1] 1738 if lo <= rhi+1 && rlo <= hi+1 { 1739 if lo < rlo { 1740 r[n-i] = lo 1741 } 1742 if hi > rhi { 1743 r[n-i+1] = hi 1744 } 1745 return r 1746 } 1747 } 1748 } 1749 1750 return append(r, lo, hi) 1751 } 1752 1753 const ( 1754 // minimum and maximum runes involved in folding. 1755 // checked during test. 1756 minFold = 0x0041 1757 maxFold = 0x1e943 1758 ) 1759 1760 // appendFoldedRange returns the result of appending the range lo-hi 1761 // and its case folding-equivalent runes to the class r. 1762 func appendFoldedRange(r []rune, lo, hi rune) []rune { 1763 // Optimizations. 1764 if lo <= minFold && hi >= maxFold { 1765 // Range is full: folding can't add more. 1766 return appendRange(r, lo, hi) 1767 } 1768 if hi < minFold || lo > maxFold { 1769 // Range is outside folding possibilities. 1770 return appendRange(r, lo, hi) 1771 } 1772 if lo < minFold { 1773 // [lo, minFold-1] needs no folding. 1774 r = appendRange(r, lo, minFold-1) 1775 lo = minFold 1776 } 1777 if hi > maxFold { 1778 // [maxFold+1, hi] needs no folding. 1779 r = appendRange(r, maxFold+1, hi) 1780 hi = maxFold 1781 } 1782 1783 // Brute force. Depend on appendRange to coalesce ranges on the fly. 1784 for c := lo; c <= hi; c++ { 1785 r = appendRange(r, c, c) 1786 f := unicode.SimpleFold(c) 1787 for f != c { 1788 r = appendRange(r, f, f) 1789 f = unicode.SimpleFold(f) 1790 } 1791 } 1792 return r 1793 } 1794 1795 // appendClass returns the result of appending the class x to the class r. 1796 // It assume x is clean. 
1797 func appendClass(r []rune, x []rune) []rune { 1798 for i := 0; i < len(x); i += 2 { 1799 r = appendRange(r, x[i], x[i+1]) 1800 } 1801 return r 1802 } 1803 1804 // appendFolded returns the result of appending the case folding of the class x to the class r. 1805 func appendFoldedClass(r []rune, x []rune) []rune { 1806 for i := 0; i < len(x); i += 2 { 1807 r = appendFoldedRange(r, x[i], x[i+1]) 1808 } 1809 return r 1810 } 1811 1812 // appendNegatedClass returns the result of appending the negation of the class x to the class r. 1813 // It assumes x is clean. 1814 func appendNegatedClass(r []rune, x []rune) []rune { 1815 nextLo := '\u0000' 1816 for i := 0; i < len(x); i += 2 { 1817 lo, hi := x[i], x[i+1] 1818 if nextLo <= lo-1 { 1819 r = appendRange(r, nextLo, lo-1) 1820 } 1821 nextLo = hi + 1 1822 } 1823 if nextLo <= unicode.MaxRune { 1824 r = appendRange(r, nextLo, unicode.MaxRune) 1825 } 1826 return r 1827 } 1828 1829 // appendTable returns the result of appending x to the class r. 1830 func appendTable(r []rune, x *unicode.RangeTable) []rune { 1831 for _, xr := range x.R16 { 1832 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) 1833 if stride == 1 { 1834 r = appendRange(r, lo, hi) 1835 continue 1836 } 1837 for c := lo; c <= hi; c += stride { 1838 r = appendRange(r, c, c) 1839 } 1840 } 1841 for _, xr := range x.R32 { 1842 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) 1843 if stride == 1 { 1844 r = appendRange(r, lo, hi) 1845 continue 1846 } 1847 for c := lo; c <= hi; c += stride { 1848 r = appendRange(r, c, c) 1849 } 1850 } 1851 return r 1852 } 1853 1854 // appendNegatedTable returns the result of appending the negation of x to the class r. 
1855 func appendNegatedTable(r []rune, x *unicode.RangeTable) []rune { 1856 nextLo := '\u0000' // lo end of next class to add 1857 for _, xr := range x.R16 { 1858 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) 1859 if stride == 1 { 1860 if nextLo <= lo-1 { 1861 r = appendRange(r, nextLo, lo-1) 1862 } 1863 nextLo = hi + 1 1864 continue 1865 } 1866 for c := lo; c <= hi; c += stride { 1867 if nextLo <= c-1 { 1868 r = appendRange(r, nextLo, c-1) 1869 } 1870 nextLo = c + 1 1871 } 1872 } 1873 for _, xr := range x.R32 { 1874 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) 1875 if stride == 1 { 1876 if nextLo <= lo-1 { 1877 r = appendRange(r, nextLo, lo-1) 1878 } 1879 nextLo = hi + 1 1880 continue 1881 } 1882 for c := lo; c <= hi; c += stride { 1883 if nextLo <= c-1 { 1884 r = appendRange(r, nextLo, c-1) 1885 } 1886 nextLo = c + 1 1887 } 1888 } 1889 if nextLo <= unicode.MaxRune { 1890 r = appendRange(r, nextLo, unicode.MaxRune) 1891 } 1892 return r 1893 } 1894 1895 // negateClass overwrites r and returns r's negation. 1896 // It assumes the class r is already clean. 1897 func negateClass(r []rune) []rune { 1898 nextLo := '\u0000' // lo end of next class to add 1899 w := 0 // write index 1900 for i := 0; i < len(r); i += 2 { 1901 lo, hi := r[i], r[i+1] 1902 if nextLo <= lo-1 { 1903 r[w] = nextLo 1904 r[w+1] = lo - 1 1905 w += 2 1906 } 1907 nextLo = hi + 1 1908 } 1909 r = r[:w] 1910 if nextLo <= unicode.MaxRune { 1911 // It's possible for the negation to have one more 1912 // range - this one - than the original class, so use append. 1913 r = append(r, nextLo, unicode.MaxRune) 1914 } 1915 return r 1916 } 1917 1918 // ranges implements sort.Interface on a []rune. 1919 // The choice of receiver type definition is strange 1920 // but avoids an allocation since we already have 1921 // a *[]rune. 
1922 type ranges struct { 1923 p *[]rune 1924 } 1925 1926 func (ra ranges) Less(i, j int) bool { 1927 p := *ra.p 1928 i *= 2 1929 j *= 2 1930 return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1] 1931 } 1932 1933 func (ra ranges) Len() int { 1934 return len(*ra.p) / 2 1935 } 1936 1937 func (ra ranges) Swap(i, j int) { 1938 p := *ra.p 1939 i *= 2 1940 j *= 2 1941 p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1] 1942 } 1943 1944 func checkUTF8(s string) error { 1945 for s != "" { 1946 rune, size := utf8.DecodeRuneInString(s) 1947 if rune == utf8.RuneError && size == 1 { 1948 return &Error{Code: ErrInvalidUTF8, Expr: s} 1949 } 1950 s = s[size:] 1951 } 1952 return nil 1953 } 1954 1955 func nextRune(s string) (c rune, t string, err error) { 1956 c, size := utf8.DecodeRuneInString(s) 1957 if c == utf8.RuneError && size == 1 { 1958 return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s} 1959 } 1960 return c, s[size:], nil 1961 } 1962 1963 func isalnum(c rune) bool { 1964 return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' 1965 } 1966 1967 func unhex(c rune) rune { 1968 if '0' <= c && c <= '9' { 1969 return c - '0' 1970 } 1971 if 'a' <= c && c <= 'f' { 1972 return c - 'a' + 10 1973 } 1974 if 'A' <= c && c <= 'F' { 1975 return c - 'A' + 10 1976 } 1977 return -1 1978 }