github.com/geraldss/go/src@v0.0.0-20210511222824-ac7d0ebfc235/regexp/syntax/parse.go (about) 1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package syntax 6 7 import ( 8 "sort" 9 "strings" 10 "unicode" 11 "unicode/utf8" 12 ) 13 14 // An Error describes a failure to parse a regular expression 15 // and gives the offending expression. 16 type Error struct { 17 Code ErrorCode 18 Expr string 19 } 20 21 func (e *Error) Error() string { 22 return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`" 23 } 24 25 // An ErrorCode describes a failure to parse a regular expression. 26 type ErrorCode string 27 28 const ( 29 // Unexpected error 30 ErrInternalError ErrorCode = "regexp/syntax: internal error" 31 32 // Parse errors 33 ErrInvalidCharClass ErrorCode = "invalid character class" 34 ErrInvalidCharRange ErrorCode = "invalid character class range" 35 ErrInvalidEscape ErrorCode = "invalid escape sequence" 36 ErrInvalidNamedCapture ErrorCode = "invalid named capture" 37 ErrInvalidPerlOp ErrorCode = "invalid or unsupported Perl syntax" 38 ErrInvalidRepeatOp ErrorCode = "invalid nested repetition operator" 39 ErrInvalidRepeatSize ErrorCode = "invalid repeat count" 40 ErrInvalidUTF8 ErrorCode = "invalid UTF-8" 41 ErrMissingBracket ErrorCode = "missing closing ]" 42 ErrMissingParen ErrorCode = "missing closing )" 43 ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator" 44 ErrTrailingBackslash ErrorCode = "trailing backslash at end of expression" 45 ErrUnexpectedParen ErrorCode = "unexpected )" 46 ) 47 48 func (e ErrorCode) String() string { 49 return string(e) 50 } 51 52 // Flags control the behavior of the parser and record information about regexp context. 53 type Flags uint16 54 55 const ( 56 FoldCase Flags = 1 << iota // case-insensitive match 57 Literal // treat pattern as literal string 58 ClassNL // allow character classes like [^a-z] and [[:space:]] to match newline 59 DotNL // allow . to match newline 60 OneLine // treat ^ and $ as only matching at beginning and end of text 61 NonGreedy // make repetition operators default to non-greedy 62 PerlX // allow Perl extensions 63 UnicodeGroups // allow \p{Han}, \P{Han} for Unicode group and negation 64 WasDollar // regexp OpEndText was $, not \z 65 Simple // regexp contains no counted repetition 66 67 MatchNL = ClassNL | DotNL 68 69 Perl = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible 70 POSIX Flags = 0 // POSIX syntax 71 ) 72 73 // Pseudo-ops for parsing stack. 74 const ( 75 opLeftParen = opPseudo + iota 76 opVerticalBar 77 ) 78 79 type parser struct { 80 flags Flags // parse mode flags 81 stack []*Regexp // stack of parsed expressions 82 free *Regexp 83 numCap int // number of capturing groups seen 84 wholeRegexp string 85 tmpClass []rune // temporary char class work space 86 } 87 88 func (p *parser) newRegexp(op Op) *Regexp { 89 re := p.free 90 if re != nil { 91 p.free = re.Sub0[0] 92 *re = Regexp{} 93 } else { 94 re = new(Regexp) 95 } 96 re.Op = op 97 return re 98 } 99 100 func (p *parser) reuse(re *Regexp) { 101 re.Sub0[0] = p.free 102 p.free = re 103 } 104 105 // Parse stack manipulation. 106 107 // push pushes the regexp re onto the parse stack and returns the regexp. 108 func (p *parser) push(re *Regexp) *Regexp { 109 if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] { 110 // Single rune. 111 if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) { 112 return nil 113 } 114 re.Op = OpLiteral 115 re.Rune = re.Rune[:1] 116 re.Flags = p.flags &^ FoldCase 117 } else if re.Op == OpCharClass && len(re.Rune) == 4 && 118 re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] && 119 unicode.SimpleFold(re.Rune[0]) == re.Rune[2] && 120 unicode.SimpleFold(re.Rune[2]) == re.Rune[0] || 121 re.Op == OpCharClass && len(re.Rune) == 2 && 122 re.Rune[0]+1 == re.Rune[1] && 123 unicode.SimpleFold(re.Rune[0]) == re.Rune[1] && 124 unicode.SimpleFold(re.Rune[1]) == re.Rune[0] { 125 // Case-insensitive rune like [Aa] or [Δδ]. 126 if p.maybeConcat(re.Rune[0], p.flags|FoldCase) { 127 return nil 128 } 129 130 // Rewrite as (case-insensitive) literal. 131 re.Op = OpLiteral 132 re.Rune = re.Rune[:1] 133 re.Flags = p.flags | FoldCase 134 } else { 135 // Incremental concatenation. 136 p.maybeConcat(-1, 0) 137 } 138 139 p.stack = append(p.stack, re) 140 return re 141 } 142 143 // maybeConcat implements incremental concatenation 144 // of literal runes into string nodes. The parser calls this 145 // before each push, so only the top fragment of the stack 146 // might need processing. Since this is called before a push, 147 // the topmost literal is no longer subject to operators like * 148 // (Otherwise ab* would turn into (ab)*.) 149 // If r >= 0 and there's a node left over, maybeConcat uses it 150 // to push r with the given flags. 151 // maybeConcat reports whether r was pushed. 152 func (p *parser) maybeConcat(r rune, flags Flags) bool { 153 n := len(p.stack) 154 if n < 2 { 155 return false 156 } 157 158 re1 := p.stack[n-1] 159 re2 := p.stack[n-2] 160 if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase { 161 return false 162 } 163 164 // Push re1 into re2. 165 re2.Rune = append(re2.Rune, re1.Rune...) 166 167 // Reuse re1 if possible. 168 if r >= 0 { 169 re1.Rune = re1.Rune0[:1] 170 re1.Rune[0] = r 171 re1.Flags = flags 172 return true 173 } 174 175 p.stack = p.stack[:n-1] 176 p.reuse(re1) 177 return false // did not push r 178 } 179 180 // literal pushes a literal regexp for the rune r on the stack. 181 func (p *parser) literal(r rune) { 182 re := p.newRegexp(OpLiteral) 183 re.Flags = p.flags 184 if p.flags&FoldCase != 0 { 185 r = minFoldRune(r) 186 } 187 re.Rune0[0] = r 188 re.Rune = re.Rune0[:1] 189 p.push(re) 190 } 191 192 // minFoldRune returns the minimum rune fold-equivalent to r. 193 func minFoldRune(r rune) rune { 194 if r < minFold || r > maxFold { 195 return r 196 } 197 min := r 198 r0 := r 199 for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) { 200 if min > r { 201 min = r 202 } 203 } 204 return min 205 } 206 207 // op pushes a regexp with the given op onto the stack 208 // and returns that regexp. 209 func (p *parser) op(op Op) *Regexp { 210 re := p.newRegexp(op) 211 re.Flags = p.flags 212 return p.push(re) 213 } 214 215 // repeat replaces the top stack element with itself repeated according to op, min, max. 216 // before is the regexp suffix starting at the repetition operator. 217 // after is the regexp suffix following after the repetition operator. 218 // repeat returns an updated 'after' and an error, if any. 219 func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, error) { 220 flags := p.flags 221 if p.flags&PerlX != 0 { 222 if len(after) > 0 && after[0] == '?' { 223 after = after[1:] 224 flags ^= NonGreedy 225 } 226 if lastRepeat != "" { 227 // In Perl it is not allowed to stack repetition operators: 228 // a** is a syntax error, not a doubled star, and a++ means 229 // something else entirely, which we don't support! 230 return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(after)]} 231 } 232 } 233 n := len(p.stack) 234 if n == 0 { 235 return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]} 236 } 237 sub := p.stack[n-1] 238 if sub.Op >= opPseudo { 239 return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]} 240 } 241 242 re := p.newRegexp(op) 243 re.Min = min 244 re.Max = max 245 re.Flags = flags 246 re.Sub = re.Sub0[:1] 247 re.Sub[0] = sub 248 p.stack[n-1] = re 249 250 if op == OpRepeat && (min >= 2 || max >= 2) && !repeatIsValid(re, 1000) { 251 return "", &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]} 252 } 253 254 return after, nil 255 } 256 257 // repeatIsValid reports whether the repetition re is valid. 258 // Valid means that the combination of the top-level repetition 259 // and any inner repetitions does not exceed n copies of the 260 // innermost thing. 261 // This function rewalks the regexp tree and is called for every repetition, 262 // so we have to worry about inducing quadratic behavior in the parser. 263 // We avoid this by only calling repeatIsValid when min or max >= 2. 264 // In that case the depth of any >= 2 nesting can only get to 9 without 265 // triggering a parse error, so each subtree can only be rewalked 9 times. 266 func repeatIsValid(re *Regexp, n int) bool { 267 if re.Op == OpRepeat { 268 m := re.Max 269 if m == 0 { 270 return true 271 } 272 if m < 0 { 273 m = re.Min 274 } 275 if m > n { 276 return false 277 } 278 if m > 0 { 279 n /= m 280 } 281 } 282 for _, sub := range re.Sub { 283 if !repeatIsValid(sub, n) { 284 return false 285 } 286 } 287 return true 288 } 289 290 // concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation. 291 func (p *parser) concat() *Regexp { 292 p.maybeConcat(-1, 0) 293 294 // Scan down to find pseudo-operator | or (. 295 i := len(p.stack) 296 for i > 0 && p.stack[i-1].Op < opPseudo { 297 i-- 298 } 299 subs := p.stack[i:] 300 p.stack = p.stack[:i] 301 302 // Empty concatenation is special case. 303 if len(subs) == 0 { 304 return p.push(p.newRegexp(OpEmptyMatch)) 305 } 306 307 return p.push(p.collapse(subs, OpConcat)) 308 } 309 310 // alternate replaces the top of the stack (above the topmost '(') with its alternation. 311 func (p *parser) alternate() *Regexp { 312 // Scan down to find pseudo-operator (. 313 // There are no | above (. 314 i := len(p.stack) 315 for i > 0 && p.stack[i-1].Op < opPseudo { 316 i-- 317 } 318 subs := p.stack[i:] 319 p.stack = p.stack[:i] 320 321 // Make sure top class is clean. 322 // All the others already are (see swapVerticalBar). 323 if len(subs) > 0 { 324 cleanAlt(subs[len(subs)-1]) 325 } 326 327 // Empty alternate is special case 328 // (shouldn't happen but easy to handle). 329 if len(subs) == 0 { 330 return p.push(p.newRegexp(OpNoMatch)) 331 } 332 333 return p.push(p.collapse(subs, OpAlternate)) 334 } 335 336 // cleanAlt cleans re for eventual inclusion in an alternation. 337 func cleanAlt(re *Regexp) { 338 switch re.Op { 339 case OpCharClass: 340 re.Rune = cleanClass(&re.Rune) 341 if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune { 342 re.Rune = nil 343 re.Op = OpAnyChar 344 return 345 } 346 if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune { 347 re.Rune = nil 348 re.Op = OpAnyCharNotNL 349 return 350 } 351 if cap(re.Rune)-len(re.Rune) > 100 { 352 // re.Rune will not grow any more. 353 // Make a copy or inline to reclaim storage. 354 re.Rune = append(re.Rune0[:0], re.Rune...) 355 } 356 } 357 } 358 359 // collapse returns the result of applying op to sub. 360 // If sub contains op nodes, they all get hoisted up 361 // so that there is never a concat of a concat or an 362 // alternate of an alternate. 363 func (p *parser) collapse(subs []*Regexp, op Op) *Regexp { 364 if len(subs) == 1 { 365 return subs[0] 366 } 367 re := p.newRegexp(op) 368 re.Sub = re.Sub0[:0] 369 for _, sub := range subs { 370 if sub.Op == op { 371 re.Sub = append(re.Sub, sub.Sub...) 372 p.reuse(sub) 373 } else { 374 re.Sub = append(re.Sub, sub) 375 } 376 } 377 if op == OpAlternate { 378 re.Sub = p.factor(re.Sub) 379 if len(re.Sub) == 1 { 380 old := re 381 re = re.Sub[0] 382 p.reuse(old) 383 } 384 } 385 return re 386 } 387 388 // factor factors common prefixes from the alternation list sub. 389 // It returns a replacement list that reuses the same storage and 390 // frees (passes to p.reuse) any removed *Regexps. 391 // 392 // For example, 393 // ABC|ABD|AEF|BCX|BCY 394 // simplifies by literal prefix extraction to 395 // A(B(C|D)|EF)|BC(X|Y) 396 // which simplifies by character class introduction to 397 // A(B[CD]|EF)|BC[XY] 398 // 399 func (p *parser) factor(sub []*Regexp) []*Regexp { 400 if len(sub) < 2 { 401 return sub 402 } 403 404 // Round 1: Factor out common literal prefixes. 405 var str []rune 406 var strflags Flags 407 start := 0 408 out := sub[:0] 409 for i := 0; i <= len(sub); i++ { 410 // Invariant: the Regexps that were in sub[0:start] have been 411 // used or marked for reuse, and the slice space has been reused 412 // for out (len(out) <= start). 413 // 414 // Invariant: sub[start:i] consists of regexps that all begin 415 // with str as modified by strflags. 416 var istr []rune 417 var iflags Flags 418 if i < len(sub) { 419 istr, iflags = p.leadingString(sub[i]) 420 if iflags == strflags { 421 same := 0 422 for same < len(str) && same < len(istr) && str[same] == istr[same] { 423 same++ 424 } 425 if same > 0 { 426 // Matches at least one rune in current range. 427 // Keep going around. 428 str = str[:same] 429 continue 430 } 431 } 432 } 433 434 // Found end of a run with common leading literal string: 435 // sub[start:i] all begin with str[0:len(str)], but sub[i] 436 // does not even begin with str[0]. 437 // 438 // Factor out common string and append factored expression to out. 439 if i == start { 440 // Nothing to do - run of length 0. 441 } else if i == start+1 { 442 // Just one: don't bother factoring. 443 out = append(out, sub[start]) 444 } else { 445 // Construct factored form: prefix(suffix1|suffix2|...) 446 prefix := p.newRegexp(OpLiteral) 447 prefix.Flags = strflags 448 prefix.Rune = append(prefix.Rune[:0], str...) 449 450 for j := start; j < i; j++ { 451 sub[j] = p.removeLeadingString(sub[j], len(str)) 452 } 453 suffix := p.collapse(sub[start:i], OpAlternate) // recurse 454 455 re := p.newRegexp(OpConcat) 456 re.Sub = append(re.Sub[:0], prefix, suffix) 457 out = append(out, re) 458 } 459 460 // Prepare for next iteration. 461 start = i 462 str = istr 463 strflags = iflags 464 } 465 sub = out 466 467 // Round 2: Factor out common simple prefixes, 468 // just the first piece of each concatenation. 469 // This will be good enough a lot of the time. 470 // 471 // Complex subexpressions (e.g. involving quantifiers) 472 // are not safe to factor because that collapses their 473 // distinct paths through the automaton, which affects 474 // correctness in some cases. 475 start = 0 476 out = sub[:0] 477 var first *Regexp 478 for i := 0; i <= len(sub); i++ { 479 // Invariant: the Regexps that were in sub[0:start] have been 480 // used or marked for reuse, and the slice space has been reused 481 // for out (len(out) <= start). 482 // 483 // Invariant: sub[start:i] consists of regexps that all begin with ifirst. 484 var ifirst *Regexp 485 if i < len(sub) { 486 ifirst = p.leadingRegexp(sub[i]) 487 if first != nil && first.Equal(ifirst) && 488 // first must be a character class OR a fixed repeat of a character class. 489 (isCharClass(first) || (first.Op == OpRepeat && first.Min == first.Max && isCharClass(first.Sub[0]))) { 490 continue 491 } 492 } 493 494 // Found end of a run with common leading regexp: 495 // sub[start:i] all begin with first but sub[i] does not. 496 // 497 // Factor out common regexp and append factored expression to out. 498 if i == start { 499 // Nothing to do - run of length 0. 500 } else if i == start+1 { 501 // Just one: don't bother factoring. 502 out = append(out, sub[start]) 503 } else { 504 // Construct factored form: prefix(suffix1|suffix2|...) 505 prefix := first 506 for j := start; j < i; j++ { 507 reuse := j != start // prefix came from sub[start] 508 sub[j] = p.removeLeadingRegexp(sub[j], reuse) 509 } 510 suffix := p.collapse(sub[start:i], OpAlternate) // recurse 511 512 re := p.newRegexp(OpConcat) 513 re.Sub = append(re.Sub[:0], prefix, suffix) 514 out = append(out, re) 515 } 516 517 // Prepare for next iteration. 518 start = i 519 first = ifirst 520 } 521 sub = out 522 523 // Round 3: Collapse runs of single literals into character classes. 524 start = 0 525 out = sub[:0] 526 for i := 0; i <= len(sub); i++ { 527 // Invariant: the Regexps that were in sub[0:start] have been 528 // used or marked for reuse, and the slice space has been reused 529 // for out (len(out) <= start). 530 // 531 // Invariant: sub[start:i] consists of regexps that are either 532 // literal runes or character classes. 533 if i < len(sub) && isCharClass(sub[i]) { 534 continue 535 } 536 537 // sub[i] is not a char or char class; 538 // emit char class for sub[start:i]... 539 if i == start { 540 // Nothing to do - run of length 0. 541 } else if i == start+1 { 542 out = append(out, sub[start]) 543 } else { 544 // Make new char class. 545 // Start with most complex regexp in sub[start]. 546 max := start 547 for j := start + 1; j < i; j++ { 548 if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) { 549 max = j 550 } 551 } 552 sub[start], sub[max] = sub[max], sub[start] 553 554 for j := start + 1; j < i; j++ { 555 mergeCharClass(sub[start], sub[j]) 556 p.reuse(sub[j]) 557 } 558 cleanAlt(sub[start]) 559 out = append(out, sub[start]) 560 } 561 562 // ... and then emit sub[i]. 563 if i < len(sub) { 564 out = append(out, sub[i]) 565 } 566 start = i + 1 567 } 568 sub = out 569 570 // Round 4: Collapse runs of empty matches into a single empty match. 571 start = 0 572 out = sub[:0] 573 for i := range sub { 574 if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch { 575 continue 576 } 577 out = append(out, sub[i]) 578 } 579 sub = out 580 581 return sub 582 } 583 584 // leadingString returns the leading literal string that re begins with. 585 // The string refers to storage in re or its children. 586 func (p *parser) leadingString(re *Regexp) ([]rune, Flags) { 587 if re.Op == OpConcat && len(re.Sub) > 0 { 588 re = re.Sub[0] 589 } 590 if re.Op != OpLiteral { 591 return nil, 0 592 } 593 return re.Rune, re.Flags & FoldCase 594 } 595 596 // removeLeadingString removes the first n leading runes 597 // from the beginning of re. It returns the replacement for re. 598 func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp { 599 if re.Op == OpConcat && len(re.Sub) > 0 { 600 // Removing a leading string in a concatenation 601 // might simplify the concatenation. 602 sub := re.Sub[0] 603 sub = p.removeLeadingString(sub, n) 604 re.Sub[0] = sub 605 if sub.Op == OpEmptyMatch { 606 p.reuse(sub) 607 switch len(re.Sub) { 608 case 0, 1: 609 // Impossible but handle. 610 re.Op = OpEmptyMatch 611 re.Sub = nil 612 case 2: 613 old := re 614 re = re.Sub[1] 615 p.reuse(old) 616 default: 617 copy(re.Sub, re.Sub[1:]) 618 re.Sub = re.Sub[:len(re.Sub)-1] 619 } 620 } 621 return re 622 } 623 624 if re.Op == OpLiteral { 625 re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])] 626 if len(re.Rune) == 0 { 627 re.Op = OpEmptyMatch 628 } 629 } 630 return re 631 } 632 633 // leadingRegexp returns the leading regexp that re begins with. 634 // The regexp refers to storage in re or its children. 635 func (p *parser) leadingRegexp(re *Regexp) *Regexp { 636 if re.Op == OpEmptyMatch { 637 return nil 638 } 639 if re.Op == OpConcat && len(re.Sub) > 0 { 640 sub := re.Sub[0] 641 if sub.Op == OpEmptyMatch { 642 return nil 643 } 644 return sub 645 } 646 return re 647 } 648 649 // removeLeadingRegexp removes the leading regexp in re. 650 // It returns the replacement for re. 651 // If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse. 652 func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp { 653 if re.Op == OpConcat && len(re.Sub) > 0 { 654 if reuse { 655 p.reuse(re.Sub[0]) 656 } 657 re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])] 658 switch len(re.Sub) { 659 case 0: 660 re.Op = OpEmptyMatch 661 re.Sub = nil 662 case 1: 663 old := re 664 re = re.Sub[0] 665 p.reuse(old) 666 } 667 return re 668 } 669 if reuse { 670 p.reuse(re) 671 } 672 return p.newRegexp(OpEmptyMatch) 673 } 674 675 func literalRegexp(s string, flags Flags) *Regexp { 676 re := &Regexp{Op: OpLiteral} 677 re.Flags = flags 678 re.Rune = re.Rune0[:0] // use local storage for small strings 679 for _, c := range s { 680 if len(re.Rune) >= cap(re.Rune) { 681 // string is too long to fit in Rune0. let Go handle it 682 re.Rune = []rune(s) 683 break 684 } 685 re.Rune = append(re.Rune, c) 686 } 687 return re 688 } 689 690 // Parsing. 691 692 // Parse parses a regular expression string s, controlled by the specified 693 // Flags, and returns a regular expression parse tree. The syntax is 694 // described in the top-level comment. 695 func Parse(s string, flags Flags) (*Regexp, error) { 696 if flags&Literal != 0 { 697 // Trivial parser for literal string. 698 if err := checkUTF8(s); err != nil { 699 return nil, err 700 } 701 return literalRegexp(s, flags), nil 702 } 703 704 // Otherwise, must do real work. 705 var ( 706 p parser 707 err error 708 c rune 709 op Op 710 lastRepeat string 711 ) 712 p.flags = flags 713 p.wholeRegexp = s 714 t := s 715 for t != "" { 716 repeat := "" 717 BigSwitch: 718 switch t[0] { 719 default: 720 if c, t, err = nextRune(t); err != nil { 721 return nil, err 722 } 723 p.literal(c) 724 725 case '(': 726 if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' { 727 // Flag changes and non-capturing groups. 728 if t, err = p.parsePerlFlags(t); err != nil { 729 return nil, err 730 } 731 break 732 } 733 p.numCap++ 734 p.op(opLeftParen).Cap = p.numCap 735 t = t[1:] 736 case '|': 737 if err = p.parseVerticalBar(); err != nil { 738 return nil, err 739 } 740 t = t[1:] 741 case ')': 742 if err = p.parseRightParen(); err != nil { 743 return nil, err 744 } 745 t = t[1:] 746 case '^': 747 if p.flags&OneLine != 0 { 748 p.op(OpBeginText) 749 } else { 750 p.op(OpBeginLine) 751 } 752 t = t[1:] 753 case '$': 754 if p.flags&OneLine != 0 { 755 p.op(OpEndText).Flags |= WasDollar 756 } else { 757 p.op(OpEndLine) 758 } 759 t = t[1:] 760 case '.': 761 if p.flags&DotNL != 0 { 762 p.op(OpAnyChar) 763 } else { 764 p.op(OpAnyCharNotNL) 765 } 766 t = t[1:] 767 case '[': 768 if t, err = p.parseClass(t); err != nil { 769 return nil, err 770 } 771 case '*', '+', '?': 772 before := t 773 switch t[0] { 774 case '*': 775 op = OpStar 776 case '+': 777 op = OpPlus 778 case '?': 779 op = OpQuest 780 } 781 after := t[1:] 782 if after, err = p.repeat(op, 0, 0, before, after, lastRepeat); err != nil { 783 return nil, err 784 } 785 repeat = before 786 t = after 787 case '{': 788 op = OpRepeat 789 before := t 790 min, max, after, ok := p.parseRepeat(t) 791 if !ok { 792 // If the repeat cannot be parsed, { is a literal. 793 p.literal('{') 794 t = t[1:] 795 break 796 } 797 if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max { 798 // Numbers were too big, or max is present and min > max. 799 return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]} 800 } 801 if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil { 802 return nil, err 803 } 804 repeat = before 805 t = after 806 case '\\': 807 if p.flags&PerlX != 0 && len(t) >= 2 { 808 switch t[1] { 809 case 'A': 810 p.op(OpBeginText) 811 t = t[2:] 812 break BigSwitch 813 case 'b': 814 p.op(OpWordBoundary) 815 t = t[2:] 816 break BigSwitch 817 case 'B': 818 p.op(OpNoWordBoundary) 819 t = t[2:] 820 break BigSwitch 821 case 'C': 822 // any byte; not supported 823 return nil, &Error{ErrInvalidEscape, t[:2]} 824 case 'Q': 825 // \Q ... \E: the ... is always literals 826 var lit string 827 if i := strings.Index(t, `\E`); i < 0 { 828 lit = t[2:] 829 t = "" 830 } else { 831 lit = t[2:i] 832 t = t[i+2:] 833 } 834 for lit != "" { 835 c, rest, err := nextRune(lit) 836 if err != nil { 837 return nil, err 838 } 839 p.literal(c) 840 lit = rest 841 } 842 break BigSwitch 843 case 'z': 844 p.op(OpEndText) 845 t = t[2:] 846 break BigSwitch 847 } 848 } 849 850 re := p.newRegexp(OpCharClass) 851 re.Flags = p.flags 852 853 // Look for Unicode character group like \p{Han} 854 if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') { 855 r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0]) 856 if err != nil { 857 return nil, err 858 } 859 if r != nil { 860 re.Rune = r 861 t = rest 862 p.push(re) 863 break BigSwitch 864 } 865 } 866 867 // Perl character class escape. 868 if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil { 869 re.Rune = r 870 t = rest 871 p.push(re) 872 break BigSwitch 873 } 874 p.reuse(re) 875 876 // Ordinary single-character escape. 877 if c, t, err = p.parseEscape(t); err != nil { 878 return nil, err 879 } 880 p.literal(c) 881 } 882 lastRepeat = repeat 883 } 884 885 p.concat() 886 if p.swapVerticalBar() { 887 // pop vertical bar 888 p.stack = p.stack[:len(p.stack)-1] 889 } 890 p.alternate() 891 892 n := len(p.stack) 893 if n != 1 { 894 return nil, &Error{ErrMissingParen, s} 895 } 896 return p.stack[0], nil 897 } 898 899 // parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}. 900 // If s is not of that form, it returns ok == false. 901 // If s has the right form but the values are too big, it returns min == -1, ok == true. 902 func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) { 903 if s == "" || s[0] != '{' { 904 return 905 } 906 s = s[1:] 907 var ok1 bool 908 if min, s, ok1 = p.parseInt(s); !ok1 { 909 return 910 } 911 if s == "" { 912 return 913 } 914 if s[0] != ',' { 915 max = min 916 } else { 917 s = s[1:] 918 if s == "" { 919 return 920 } 921 if s[0] == '}' { 922 max = -1 923 } else if max, s, ok1 = p.parseInt(s); !ok1 { 924 return 925 } else if max < 0 { 926 // parseInt found too big a number 927 min = -1 928 } 929 } 930 if s == "" || s[0] != '}' { 931 return 932 } 933 rest = s[1:] 934 ok = true 935 return 936 } 937 938 // parsePerlFlags parses a Perl flag setting or non-capturing group or both, 939 // like (?i) or (?: or (?i:. It removes the prefix from s and updates the parse state. 940 // The caller must have ensured that s begins with "(?". 941 func (p *parser) parsePerlFlags(s string) (rest string, err error) { 942 t := s 943 944 // Check for named captures, first introduced in Python's regexp library. 945 // As usual, there are three slightly different syntaxes: 946 // 947 // (?P<name>expr) the original, introduced by Python 948 // (?<name>expr) the .NET alteration, adopted by Perl 5.10 949 // (?'name'expr) another .NET alteration, adopted by Perl 5.10 950 // 951 // Perl 5.10 gave in and implemented the Python version too, 952 // but they claim that the last two are the preferred forms. 953 // PCRE and languages based on it (specifically, PHP and Ruby) 954 // support all three as well. EcmaScript 4 uses only the Python form. 955 // 956 // In both the open source world (via Code Search) and the 957 // Google source tree, (?P<expr>name) is the dominant form, 958 // so that's the one we implement. One is enough. 959 if len(t) > 4 && t[2] == 'P' && t[3] == '<' { 960 // Pull out name. 961 end := strings.IndexRune(t, '>') 962 if end < 0 { 963 if err = checkUTF8(t); err != nil { 964 return "", err 965 } 966 return "", &Error{ErrInvalidNamedCapture, s} 967 } 968 969 capture := t[:end+1] // "(?P<name>" 970 name := t[4:end] // "name" 971 if err = checkUTF8(name); err != nil { 972 return "", err 973 } 974 if !isValidCaptureName(name) { 975 return "", &Error{ErrInvalidNamedCapture, capture} 976 } 977 978 // Like ordinary capture, but named. 979 p.numCap++ 980 re := p.op(opLeftParen) 981 re.Cap = p.numCap 982 re.Name = name 983 return t[end+1:], nil 984 } 985 986 // Non-capturing group. Might also twiddle Perl flags. 987 var c rune 988 t = t[2:] // skip (? 989 flags := p.flags 990 sign := +1 991 sawFlag := false 992 Loop: 993 for t != "" { 994 if c, t, err = nextRune(t); err != nil { 995 return "", err 996 } 997 switch c { 998 default: 999 break Loop 1000 1001 // Flags. 1002 case 'i': 1003 flags |= FoldCase 1004 sawFlag = true 1005 case 'm': 1006 flags &^= OneLine 1007 sawFlag = true 1008 case 's': 1009 flags |= DotNL 1010 sawFlag = true 1011 case 'U': 1012 flags |= NonGreedy 1013 sawFlag = true 1014 1015 // Switch to negation. 1016 case '-': 1017 if sign < 0 { 1018 break Loop 1019 } 1020 sign = -1 1021 // Invert flags so that | above turn into &^ and vice versa. 1022 // We'll invert flags again before using it below. 1023 flags = ^flags 1024 sawFlag = false 1025 1026 // End of flags, starting group or not. 1027 case ':', ')': 1028 if sign < 0 { 1029 if !sawFlag { 1030 break Loop 1031 } 1032 flags = ^flags 1033 } 1034 if c == ':' { 1035 // Open new group 1036 p.op(opLeftParen) 1037 } 1038 p.flags = flags 1039 return t, nil 1040 } 1041 } 1042 1043 return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]} 1044 } 1045 1046 // isValidCaptureName reports whether name 1047 // is a valid capture name: [A-Za-z0-9_]+. 1048 // PCRE limits names to 32 bytes. 1049 // Python rejects names starting with digits. 1050 // We don't enforce either of those. 1051 func isValidCaptureName(name string) bool { 1052 if name == "" { 1053 return false 1054 } 1055 for _, c := range name { 1056 if c != '_' && !isalnum(c) { 1057 return false 1058 } 1059 } 1060 return true 1061 } 1062 1063 // parseInt parses a decimal integer. 1064 func (p *parser) parseInt(s string) (n int, rest string, ok bool) { 1065 if s == "" || s[0] < '0' || '9' < s[0] { 1066 return 1067 } 1068 // Disallow leading zeros. 1069 if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' { 1070 return 1071 } 1072 t := s 1073 for s != "" && '0' <= s[0] && s[0] <= '9' { 1074 s = s[1:] 1075 } 1076 rest = s 1077 ok = true 1078 // Have digits, compute value. 1079 t = t[:len(t)-len(s)] 1080 for i := 0; i < len(t); i++ { 1081 // Avoid overflow. 1082 if n >= 1e8 { 1083 n = -1 1084 break 1085 } 1086 n = n*10 + int(t[i]) - '0' 1087 } 1088 return 1089 } 1090 1091 // can this be represented as a character class? 1092 // single-rune literal string, char class, ., and .|\n. 1093 func isCharClass(re *Regexp) bool { 1094 return re.Op == OpLiteral && len(re.Rune) == 1 || 1095 re.Op == OpCharClass || 1096 re.Op == OpAnyCharNotNL || 1097 re.Op == OpAnyChar 1098 } 1099 1100 // does re match r? 1101 func matchRune(re *Regexp, r rune) bool { 1102 switch re.Op { 1103 case OpLiteral: 1104 return len(re.Rune) == 1 && re.Rune[0] == r 1105 case OpCharClass: 1106 for i := 0; i < len(re.Rune); i += 2 { 1107 if re.Rune[i] <= r && r <= re.Rune[i+1] { 1108 return true 1109 } 1110 } 1111 return false 1112 case OpAnyCharNotNL: 1113 return r != '\n' 1114 case OpAnyChar: 1115 return true 1116 } 1117 return false 1118 } 1119 1120 // parseVerticalBar handles a | in the input. 1121 func (p *parser) parseVerticalBar() error { 1122 p.concat() 1123 1124 // The concatenation we just parsed is on top of the stack. 1125 // If it sits above an opVerticalBar, swap it below 1126 // (things below an opVerticalBar become an alternation). 1127 // Otherwise, push a new vertical bar. 1128 if !p.swapVerticalBar() { 1129 p.op(opVerticalBar) 1130 } 1131 1132 return nil 1133 } 1134 1135 // mergeCharClass makes dst = dst|src. 1136 // The caller must ensure that dst.Op >= src.Op, 1137 // to reduce the amount of copying. 1138 func mergeCharClass(dst, src *Regexp) { 1139 switch dst.Op { 1140 case OpAnyChar: 1141 // src doesn't add anything. 1142 case OpAnyCharNotNL: 1143 // src might add \n 1144 if matchRune(src, '\n') { 1145 dst.Op = OpAnyChar 1146 } 1147 case OpCharClass: 1148 // src is simpler, so either literal or char class 1149 if src.Op == OpLiteral { 1150 dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags) 1151 } else { 1152 dst.Rune = appendClass(dst.Rune, src.Rune) 1153 } 1154 case OpLiteral: 1155 // both literal 1156 if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags { 1157 break 1158 } 1159 dst.Op = OpCharClass 1160 dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags) 1161 dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags) 1162 } 1163 } 1164 1165 // If the top of the stack is an element followed by an opVerticalBar 1166 // swapVerticalBar swaps the two and returns true. 1167 // Otherwise it returns false. 1168 func (p *parser) swapVerticalBar() bool { 1169 // If above and below vertical bar are literal or char class, 1170 // can merge into a single char class. 1171 n := len(p.stack) 1172 if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) { 1173 re1 := p.stack[n-1] 1174 re3 := p.stack[n-3] 1175 // Make re3 the more complex of the two. 1176 if re1.Op > re3.Op { 1177 re1, re3 = re3, re1 1178 p.stack[n-3] = re3 1179 } 1180 mergeCharClass(re3, re1) 1181 p.reuse(re1) 1182 p.stack = p.stack[:n-1] 1183 return true 1184 } 1185 1186 if n >= 2 { 1187 re1 := p.stack[n-1] 1188 re2 := p.stack[n-2] 1189 if re2.Op == opVerticalBar { 1190 if n >= 3 { 1191 // Now out of reach. 1192 // Clean opportunistically. 1193 cleanAlt(p.stack[n-3]) 1194 } 1195 p.stack[n-2] = re1 1196 p.stack[n-1] = re2 1197 return true 1198 } 1199 } 1200 return false 1201 } 1202 1203 // parseRightParen handles a ) in the input. 1204 func (p *parser) parseRightParen() error { 1205 p.concat() 1206 if p.swapVerticalBar() { 1207 // pop vertical bar 1208 p.stack = p.stack[:len(p.stack)-1] 1209 } 1210 p.alternate() 1211 1212 n := len(p.stack) 1213 if n < 2 { 1214 return &Error{ErrUnexpectedParen, p.wholeRegexp} 1215 } 1216 re1 := p.stack[n-1] 1217 re2 := p.stack[n-2] 1218 p.stack = p.stack[:n-2] 1219 if re2.Op != opLeftParen { 1220 return &Error{ErrUnexpectedParen, p.wholeRegexp} 1221 } 1222 // Restore flags at time of paren. 1223 p.flags = re2.Flags 1224 if re2.Cap == 0 { 1225 // Just for grouping. 1226 p.push(re1) 1227 } else { 1228 re2.Op = OpCapture 1229 re2.Sub = re2.Sub0[:1] 1230 re2.Sub[0] = re1 1231 p.push(re2) 1232 } 1233 return nil 1234 } 1235 1236 // parseEscape parses an escape sequence at the beginning of s 1237 // and returns the rune. 1238 func (p *parser) parseEscape(s string) (r rune, rest string, err error) { 1239 t := s[1:] 1240 if t == "" { 1241 return 0, "", &Error{ErrTrailingBackslash, ""} 1242 } 1243 c, t, err := nextRune(t) 1244 if err != nil { 1245 return 0, "", err 1246 } 1247 1248 Switch: 1249 switch c { 1250 default: 1251 if c < utf8.RuneSelf && !isalnum(c) { 1252 // Escaped non-word characters are always themselves. 1253 // PCRE is not quite so rigorous: it accepts things like 1254 // \q, but we don't. We once rejected \_, but too many 1255 // programs and people insist on using it, so allow \_. 1256 return c, t, nil 1257 } 1258 1259 // Octal escapes. 1260 case '1', '2', '3', '4', '5', '6', '7': 1261 // Single non-zero digit is a backreference; not supported 1262 if t == "" || t[0] < '0' || t[0] > '7' { 1263 break 1264 } 1265 fallthrough 1266 case '0': 1267 // Consume up to three octal digits; already have one. 1268 r = c - '0' 1269 for i := 1; i < 3; i++ { 1270 if t == "" || t[0] < '0' || t[0] > '7' { 1271 break 1272 } 1273 r = r*8 + rune(t[0]) - '0' 1274 t = t[1:] 1275 } 1276 return r, t, nil 1277 1278 // Hexadecimal escapes. 1279 case 'x': 1280 if t == "" { 1281 break 1282 } 1283 if c, t, err = nextRune(t); err != nil { 1284 return 0, "", err 1285 } 1286 if c == '{' { 1287 // Any number of digits in braces. 1288 // Perl accepts any text at all; it ignores all text 1289 // after the first non-hex digit. We require only hex digits, 1290 // and at least one. 1291 nhex := 0 1292 r = 0 1293 for { 1294 if t == "" { 1295 break Switch 1296 } 1297 if c, t, err = nextRune(t); err != nil { 1298 return 0, "", err 1299 } 1300 if c == '}' { 1301 break 1302 } 1303 v := unhex(c) 1304 if v < 0 { 1305 break Switch 1306 } 1307 r = r*16 + v 1308 if r > unicode.MaxRune { 1309 break Switch 1310 } 1311 nhex++ 1312 } 1313 if nhex == 0 { 1314 break Switch 1315 } 1316 return r, t, nil 1317 } 1318 1319 // Easy case: two hex digits. 1320 x := unhex(c) 1321 if c, t, err = nextRune(t); err != nil { 1322 return 0, "", err 1323 } 1324 y := unhex(c) 1325 if x < 0 || y < 0 { 1326 break 1327 } 1328 return x*16 + y, t, nil 1329 1330 // C escapes. There is no case 'b', to avoid misparsing 1331 // the Perl word-boundary \b as the C backspace \b 1332 // when in POSIX mode. In Perl, /\b/ means word-boundary 1333 // but /[\b]/ means backspace. We don't support that. 1334 // If you want a backspace, embed a literal backspace 1335 // character or use \x08. 1336 case 'a': 1337 return '\a', t, err 1338 case 'f': 1339 return '\f', t, err 1340 case 'n': 1341 return '\n', t, err 1342 case 'r': 1343 return '\r', t, err 1344 case 't': 1345 return '\t', t, err 1346 case 'v': 1347 return '\v', t, err 1348 } 1349 return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]} 1350 } 1351 1352 // parseClassChar parses a character class character at the beginning of s 1353 // and returns it. 1354 func (p *parser) parseClassChar(s, wholeClass string) (r rune, rest string, err error) { 1355 if s == "" { 1356 return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass} 1357 } 1358 1359 // Allow regular escape sequences even though 1360 // many need not be escaped in this context. 1361 if s[0] == '\\' { 1362 return p.parseEscape(s) 1363 } 1364 1365 return nextRune(s) 1366 } 1367 1368 type charGroup struct { 1369 sign int 1370 class []rune 1371 } 1372 1373 // parsePerlClassEscape parses a leading Perl character class escape like \d 1374 // from the beginning of s. If one is present, it appends the characters to r 1375 // and returns the new slice r and the remainder of the string. 1376 func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string) { 1377 if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' { 1378 return 1379 } 1380 g := perlGroup[s[0:2]] 1381 if g.sign == 0 { 1382 return 1383 } 1384 return p.appendGroup(r, g), s[2:] 1385 } 1386 1387 // parseNamedClass parses a leading POSIX named character class like [:alnum:] 1388 // from the beginning of s. If one is present, it appends the characters to r 1389 // and returns the new slice r and the remainder of the string. 1390 func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error) { 1391 if len(s) < 2 || s[0] != '[' || s[1] != ':' { 1392 return 1393 } 1394 1395 i := strings.Index(s[2:], ":]") 1396 if i < 0 { 1397 return 1398 } 1399 i += 2 1400 name, s := s[0:i+2], s[i+2:] 1401 g := posixGroup[name] 1402 if g.sign == 0 { 1403 return nil, "", &Error{ErrInvalidCharRange, name} 1404 } 1405 return p.appendGroup(r, g), s, nil 1406 } 1407 1408 func (p *parser) appendGroup(r []rune, g charGroup) []rune { 1409 if p.flags&FoldCase == 0 { 1410 if g.sign < 0 { 1411 r = appendNegatedClass(r, g.class) 1412 } else { 1413 r = appendClass(r, g.class) 1414 } 1415 } else { 1416 tmp := p.tmpClass[:0] 1417 tmp = appendFoldedClass(tmp, g.class) 1418 p.tmpClass = tmp 1419 tmp = cleanClass(&p.tmpClass) 1420 if g.sign < 0 { 1421 r = appendNegatedClass(r, tmp) 1422 } else { 1423 r = appendClass(r, tmp) 1424 } 1425 } 1426 return r 1427 } 1428 1429 var anyTable = &unicode.RangeTable{ 1430 R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}}, 1431 R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}}, 1432 } 1433 1434 // unicodeTable returns the unicode.RangeTable identified by name 1435 // and the table of additional fold-equivalent code points. 1436 func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) { 1437 // Special case: "Any" means any. 1438 if name == "Any" { 1439 return anyTable, anyTable 1440 } 1441 if t := unicode.Categories[name]; t != nil { 1442 return t, unicode.FoldCategory[name] 1443 } 1444 if t := unicode.Scripts[name]; t != nil { 1445 return t, unicode.FoldScript[name] 1446 } 1447 return nil, nil 1448 } 1449 1450 // parseUnicodeClass parses a leading Unicode character class like \p{Han} 1451 // from the beginning of s. If one is present, it appends the characters to r 1452 // and returns the new slice r and the remainder of the string. 1453 func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error) { 1454 if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' { 1455 return 1456 } 1457 1458 // Committed to parse or return error. 1459 sign := +1 1460 if s[1] == 'P' { 1461 sign = -1 1462 } 1463 t := s[2:] 1464 c, t, err := nextRune(t) 1465 if err != nil { 1466 return 1467 } 1468 var seq, name string 1469 if c != '{' { 1470 // Single-letter name. 1471 seq = s[:len(s)-len(t)] 1472 name = seq[2:] 1473 } else { 1474 // Name is in braces. 1475 end := strings.IndexRune(s, '}') 1476 if end < 0 { 1477 if err = checkUTF8(s); err != nil { 1478 return 1479 } 1480 return nil, "", &Error{ErrInvalidCharRange, s} 1481 } 1482 seq, t = s[:end+1], s[end+1:] 1483 name = s[3:end] 1484 if err = checkUTF8(name); err != nil { 1485 return 1486 } 1487 } 1488 1489 // Group can have leading negation too. \p{^Han} == \P{Han}, \P{^Han} == \p{Han}. 1490 if name != "" && name[0] == '^' { 1491 sign = -sign 1492 name = name[1:] 1493 } 1494 1495 tab, fold := unicodeTable(name) 1496 if tab == nil { 1497 return nil, "", &Error{ErrInvalidCharRange, seq} 1498 } 1499 1500 if p.flags&FoldCase == 0 || fold == nil { 1501 if sign > 0 { 1502 r = appendTable(r, tab) 1503 } else { 1504 r = appendNegatedTable(r, tab) 1505 } 1506 } else { 1507 // Merge and clean tab and fold in a temporary buffer. 1508 // This is necessary for the negative case and just tidy 1509 // for the positive case. 1510 tmp := p.tmpClass[:0] 1511 tmp = appendTable(tmp, tab) 1512 tmp = appendTable(tmp, fold) 1513 p.tmpClass = tmp 1514 tmp = cleanClass(&p.tmpClass) 1515 if sign > 0 { 1516 r = appendClass(r, tmp) 1517 } else { 1518 r = appendNegatedClass(r, tmp) 1519 } 1520 } 1521 return r, t, nil 1522 } 1523 1524 // parseClass parses a character class at the beginning of s 1525 // and pushes it onto the parse stack. 1526 func (p *parser) parseClass(s string) (rest string, err error) { 1527 t := s[1:] // chop [ 1528 re := p.newRegexp(OpCharClass) 1529 re.Flags = p.flags 1530 re.Rune = re.Rune0[:0] 1531 1532 sign := +1 1533 if t != "" && t[0] == '^' { 1534 sign = -1 1535 t = t[1:] 1536 1537 // If character class does not match \n, add it here, 1538 // so that negation later will do the right thing. 1539 if p.flags&ClassNL == 0 { 1540 re.Rune = append(re.Rune, '\n', '\n') 1541 } 1542 } 1543 1544 class := re.Rune 1545 first := true // ] and - are okay as first char in class 1546 for t == "" || t[0] != ']' || first { 1547 // POSIX: - is only okay unescaped as first or last in class. 1548 // Perl: - is okay anywhere. 1549 if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') { 1550 _, size := utf8.DecodeRuneInString(t[1:]) 1551 return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]} 1552 } 1553 first = false 1554 1555 // Look for POSIX [:alnum:] etc. 1556 if len(t) > 2 && t[0] == '[' && t[1] == ':' { 1557 nclass, nt, err := p.parseNamedClass(t, class) 1558 if err != nil { 1559 return "", err 1560 } 1561 if nclass != nil { 1562 class, t = nclass, nt 1563 continue 1564 } 1565 } 1566 1567 // Look for Unicode character group like \p{Han}. 1568 nclass, nt, err := p.parseUnicodeClass(t, class) 1569 if err != nil { 1570 return "", err 1571 } 1572 if nclass != nil { 1573 class, t = nclass, nt 1574 continue 1575 } 1576 1577 // Look for Perl character class symbols (extension). 1578 if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil { 1579 class, t = nclass, nt 1580 continue 1581 } 1582 1583 // Single character or simple range. 1584 rng := t 1585 var lo, hi rune 1586 if lo, t, err = p.parseClassChar(t, s); err != nil { 1587 return "", err 1588 } 1589 hi = lo 1590 // [a-] means (a|-) so check for final ]. 1591 if len(t) >= 2 && t[0] == '-' && t[1] != ']' { 1592 t = t[1:] 1593 if hi, t, err = p.parseClassChar(t, s); err != nil { 1594 return "", err 1595 } 1596 if hi < lo { 1597 rng = rng[:len(rng)-len(t)] 1598 return "", &Error{Code: ErrInvalidCharRange, Expr: rng} 1599 } 1600 } 1601 if p.flags&FoldCase == 0 { 1602 class = appendRange(class, lo, hi) 1603 } else { 1604 class = appendFoldedRange(class, lo, hi) 1605 } 1606 } 1607 t = t[1:] // chop ] 1608 1609 // Use &re.Rune instead of &class to avoid allocation. 1610 re.Rune = class 1611 class = cleanClass(&re.Rune) 1612 if sign < 0 { 1613 class = negateClass(class) 1614 } 1615 re.Rune = class 1616 p.push(re) 1617 return t, nil 1618 } 1619 1620 // cleanClass sorts the ranges (pairs of elements of r), 1621 // merges them, and eliminates duplicates. 1622 func cleanClass(rp *[]rune) []rune { 1623 1624 // Sort by lo increasing, hi decreasing to break ties. 1625 sort.Sort(ranges{rp}) 1626 1627 r := *rp 1628 if len(r) < 2 { 1629 return r 1630 } 1631 1632 // Merge abutting, overlapping. 1633 w := 2 // write index 1634 for i := 2; i < len(r); i += 2 { 1635 lo, hi := r[i], r[i+1] 1636 if lo <= r[w-1]+1 { 1637 // merge with previous range 1638 if hi > r[w-1] { 1639 r[w-1] = hi 1640 } 1641 continue 1642 } 1643 // new disjoint range 1644 r[w] = lo 1645 r[w+1] = hi 1646 w += 2 1647 } 1648 1649 return r[:w] 1650 } 1651 1652 // appendLiteral returns the result of appending the literal x to the class r. 1653 func appendLiteral(r []rune, x rune, flags Flags) []rune { 1654 if flags&FoldCase != 0 { 1655 return appendFoldedRange(r, x, x) 1656 } 1657 return appendRange(r, x, x) 1658 } 1659 1660 // appendRange returns the result of appending the range lo-hi to the class r. 1661 func appendRange(r []rune, lo, hi rune) []rune { 1662 // Expand last range or next to last range if it overlaps or abuts. 1663 // Checking two ranges helps when appending case-folded 1664 // alphabets, so that one range can be expanding A-Z and the 1665 // other expanding a-z. 1666 n := len(r) 1667 for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4 1668 if n >= i { 1669 rlo, rhi := r[n-i], r[n-i+1] 1670 if lo <= rhi+1 && rlo <= hi+1 { 1671 if lo < rlo { 1672 r[n-i] = lo 1673 } 1674 if hi > rhi { 1675 r[n-i+1] = hi 1676 } 1677 return r 1678 } 1679 } 1680 } 1681 1682 return append(r, lo, hi) 1683 } 1684 1685 const ( 1686 // minimum and maximum runes involved in folding. 1687 // checked during test. 1688 minFold = 0x0041 1689 maxFold = 0x1e943 1690 ) 1691 1692 // appendFoldedRange returns the result of appending the range lo-hi 1693 // and its case folding-equivalent runes to the class r. 1694 func appendFoldedRange(r []rune, lo, hi rune) []rune { 1695 // Optimizations. 1696 if lo <= minFold && hi >= maxFold { 1697 // Range is full: folding can't add more. 1698 return appendRange(r, lo, hi) 1699 } 1700 if hi < minFold || lo > maxFold { 1701 // Range is outside folding possibilities. 1702 return appendRange(r, lo, hi) 1703 } 1704 if lo < minFold { 1705 // [lo, minFold-1] needs no folding. 1706 r = appendRange(r, lo, minFold-1) 1707 lo = minFold 1708 } 1709 if hi > maxFold { 1710 // [maxFold+1, hi] needs no folding. 1711 r = appendRange(r, maxFold+1, hi) 1712 hi = maxFold 1713 } 1714 1715 // Brute force. Depend on appendRange to coalesce ranges on the fly. 1716 for c := lo; c <= hi; c++ { 1717 r = appendRange(r, c, c) 1718 f := unicode.SimpleFold(c) 1719 for f != c { 1720 r = appendRange(r, f, f) 1721 f = unicode.SimpleFold(f) 1722 } 1723 } 1724 return r 1725 } 1726 1727 // appendClass returns the result of appending the class x to the class r. 1728 // It assume x is clean. 1729 func appendClass(r []rune, x []rune) []rune { 1730 for i := 0; i < len(x); i += 2 { 1731 r = appendRange(r, x[i], x[i+1]) 1732 } 1733 return r 1734 } 1735 1736 // appendFolded returns the result of appending the case folding of the class x to the class r. 1737 func appendFoldedClass(r []rune, x []rune) []rune { 1738 for i := 0; i < len(x); i += 2 { 1739 r = appendFoldedRange(r, x[i], x[i+1]) 1740 } 1741 return r 1742 } 1743 1744 // appendNegatedClass returns the result of appending the negation of the class x to the class r. 1745 // It assumes x is clean. 1746 func appendNegatedClass(r []rune, x []rune) []rune { 1747 nextLo := '\u0000' 1748 for i := 0; i < len(x); i += 2 { 1749 lo, hi := x[i], x[i+1] 1750 if nextLo <= lo-1 { 1751 r = appendRange(r, nextLo, lo-1) 1752 } 1753 nextLo = hi + 1 1754 } 1755 if nextLo <= unicode.MaxRune { 1756 r = appendRange(r, nextLo, unicode.MaxRune) 1757 } 1758 return r 1759 } 1760 1761 // appendTable returns the result of appending x to the class r. 1762 func appendTable(r []rune, x *unicode.RangeTable) []rune { 1763 for _, xr := range x.R16 { 1764 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) 1765 if stride == 1 { 1766 r = appendRange(r, lo, hi) 1767 continue 1768 } 1769 for c := lo; c <= hi; c += stride { 1770 r = appendRange(r, c, c) 1771 } 1772 } 1773 for _, xr := range x.R32 { 1774 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) 1775 if stride == 1 { 1776 r = appendRange(r, lo, hi) 1777 continue 1778 } 1779 for c := lo; c <= hi; c += stride { 1780 r = appendRange(r, c, c) 1781 } 1782 } 1783 return r 1784 } 1785 1786 // appendNegatedTable returns the result of appending the negation of x to the class r. 1787 func appendNegatedTable(r []rune, x *unicode.RangeTable) []rune { 1788 nextLo := '\u0000' // lo end of next class to add 1789 for _, xr := range x.R16 { 1790 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) 1791 if stride == 1 { 1792 if nextLo <= lo-1 { 1793 r = appendRange(r, nextLo, lo-1) 1794 } 1795 nextLo = hi + 1 1796 continue 1797 } 1798 for c := lo; c <= hi; c += stride { 1799 if nextLo <= c-1 { 1800 r = appendRange(r, nextLo, c-1) 1801 } 1802 nextLo = c + 1 1803 } 1804 } 1805 for _, xr := range x.R32 { 1806 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) 1807 if stride == 1 { 1808 if nextLo <= lo-1 { 1809 r = appendRange(r, nextLo, lo-1) 1810 } 1811 nextLo = hi + 1 1812 continue 1813 } 1814 for c := lo; c <= hi; c += stride { 1815 if nextLo <= c-1 { 1816 r = appendRange(r, nextLo, c-1) 1817 } 1818 nextLo = c + 1 1819 } 1820 } 1821 if nextLo <= unicode.MaxRune { 1822 r = appendRange(r, nextLo, unicode.MaxRune) 1823 } 1824 return r 1825 } 1826 1827 // negateClass overwrites r and returns r's negation. 1828 // It assumes the class r is already clean. 1829 func negateClass(r []rune) []rune { 1830 nextLo := '\u0000' // lo end of next class to add 1831 w := 0 // write index 1832 for i := 0; i < len(r); i += 2 { 1833 lo, hi := r[i], r[i+1] 1834 if nextLo <= lo-1 { 1835 r[w] = nextLo 1836 r[w+1] = lo - 1 1837 w += 2 1838 } 1839 nextLo = hi + 1 1840 } 1841 r = r[:w] 1842 if nextLo <= unicode.MaxRune { 1843 // It's possible for the negation to have one more 1844 // range - this one - than the original class, so use append. 1845 r = append(r, nextLo, unicode.MaxRune) 1846 } 1847 return r 1848 } 1849 1850 // ranges implements sort.Interface on a []rune. 1851 // The choice of receiver type definition is strange 1852 // but avoids an allocation since we already have 1853 // a *[]rune. 1854 type ranges struct { 1855 p *[]rune 1856 } 1857 1858 func (ra ranges) Less(i, j int) bool { 1859 p := *ra.p 1860 i *= 2 1861 j *= 2 1862 return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1] 1863 } 1864 1865 func (ra ranges) Len() int { 1866 return len(*ra.p) / 2 1867 } 1868 1869 func (ra ranges) Swap(i, j int) { 1870 p := *ra.p 1871 i *= 2 1872 j *= 2 1873 p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1] 1874 } 1875 1876 func checkUTF8(s string) error { 1877 for s != "" { 1878 rune, size := utf8.DecodeRuneInString(s) 1879 if rune == utf8.RuneError && size == 1 { 1880 return &Error{Code: ErrInvalidUTF8, Expr: s} 1881 } 1882 s = s[size:] 1883 } 1884 return nil 1885 } 1886 1887 func nextRune(s string) (c rune, t string, err error) { 1888 c, size := utf8.DecodeRuneInString(s) 1889 if c == utf8.RuneError && size == 1 { 1890 return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s} 1891 } 1892 return c, s[size:], nil 1893 } 1894 1895 func isalnum(c rune) bool { 1896 return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' 1897 } 1898 1899 func unhex(c rune) rune { 1900 if '0' <= c && c <= '9' { 1901 return c - '0' 1902 } 1903 if 'a' <= c && c <= 'f' { 1904 return c - 'a' + 10 1905 } 1906 if 'A' <= c && c <= 'F' { 1907 return c - 'A' + 10 1908 } 1909 return -1 1910 }