github.com/s1s1ty/go@v0.0.0-20180207192209-104445e3140f/src/regexp/syntax/parse.go (about) 1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package syntax 6 7 import ( 8 "sort" 9 "strings" 10 "unicode" 11 "unicode/utf8" 12 ) 13 14 // An Error describes a failure to parse a regular expression 15 // and gives the offending expression. 16 type Error struct { 17 Code ErrorCode 18 Expr string 19 } 20 21 func (e *Error) Error() string { 22 return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`" 23 } 24 25 // An ErrorCode describes a failure to parse a regular expression. 26 type ErrorCode string 27 28 const ( 29 // Unexpected error 30 ErrInternalError ErrorCode = "regexp/syntax: internal error" 31 32 // Parse errors 33 ErrInvalidCharClass ErrorCode = "invalid character class" 34 ErrInvalidCharRange ErrorCode = "invalid character class range" 35 ErrInvalidEscape ErrorCode = "invalid escape sequence" 36 ErrInvalidNamedCapture ErrorCode = "invalid named capture" 37 ErrInvalidPerlOp ErrorCode = "invalid or unsupported Perl syntax" 38 ErrInvalidRepeatOp ErrorCode = "invalid nested repetition operator" 39 ErrInvalidRepeatSize ErrorCode = "invalid repeat count" 40 ErrInvalidUTF8 ErrorCode = "invalid UTF-8" 41 ErrMissingBracket ErrorCode = "missing closing ]" 42 ErrMissingParen ErrorCode = "missing closing )" 43 ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator" 44 ErrTrailingBackslash ErrorCode = "trailing backslash at end of expression" 45 ErrUnexpectedParen ErrorCode = "unexpected )" 46 ) 47 48 func (e ErrorCode) String() string { 49 return string(e) 50 } 51 52 // Flags control the behavior of the parser and record information about regexp context. 53 type Flags uint16 54 55 const ( 56 FoldCase Flags = 1 << iota // case-insensitive match 57 Literal // treat pattern as literal string 58 ClassNL // allow character classes like [^a-z] and [[:space:]] to match newline 59 DotNL // allow . to match newline 60 OneLine // treat ^ and $ as only matching at beginning and end of text 61 NonGreedy // make repetition operators default to non-greedy 62 PerlX // allow Perl extensions 63 UnicodeGroups // allow \p{Han}, \P{Han} for Unicode group and negation 64 WasDollar // regexp OpEndText was $, not \z 65 Simple // regexp contains no counted repetition 66 67 MatchNL = ClassNL | DotNL 68 69 Perl = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible 70 POSIX Flags = 0 // POSIX syntax 71 ) 72 73 // Pseudo-ops for parsing stack. 74 const ( 75 opLeftParen = opPseudo + iota 76 opVerticalBar 77 ) 78 79 type parser struct { 80 flags Flags // parse mode flags 81 stack []*Regexp // stack of parsed expressions 82 free *Regexp 83 numCap int // number of capturing groups seen 84 wholeRegexp string 85 tmpClass []rune // temporary char class work space 86 } 87 88 func (p *parser) newRegexp(op Op) *Regexp { 89 re := p.free 90 if re != nil { 91 p.free = re.Sub0[0] 92 *re = Regexp{} 93 } else { 94 re = new(Regexp) 95 } 96 re.Op = op 97 return re 98 } 99 100 func (p *parser) reuse(re *Regexp) { 101 re.Sub0[0] = p.free 102 p.free = re 103 } 104 105 // Parse stack manipulation. 106 107 // push pushes the regexp re onto the parse stack and returns the regexp. 108 func (p *parser) push(re *Regexp) *Regexp { 109 if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] { 110 // Single rune. 111 if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) { 112 return nil 113 } 114 re.Op = OpLiteral 115 re.Rune = re.Rune[:1] 116 re.Flags = p.flags &^ FoldCase 117 } else if re.Op == OpCharClass && len(re.Rune) == 4 && 118 re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] && 119 unicode.SimpleFold(re.Rune[0]) == re.Rune[2] && 120 unicode.SimpleFold(re.Rune[2]) == re.Rune[0] || 121 re.Op == OpCharClass && len(re.Rune) == 2 && 122 re.Rune[0]+1 == re.Rune[1] && 123 unicode.SimpleFold(re.Rune[0]) == re.Rune[1] && 124 unicode.SimpleFold(re.Rune[1]) == re.Rune[0] { 125 // Case-insensitive rune like [Aa] or [Δδ]. 126 if p.maybeConcat(re.Rune[0], p.flags|FoldCase) { 127 return nil 128 } 129 130 // Rewrite as (case-insensitive) literal. 131 re.Op = OpLiteral 132 re.Rune = re.Rune[:1] 133 re.Flags = p.flags | FoldCase 134 } else { 135 // Incremental concatenation. 136 p.maybeConcat(-1, 0) 137 } 138 139 p.stack = append(p.stack, re) 140 return re 141 } 142 143 // maybeConcat implements incremental concatenation 144 // of literal runes into string nodes. The parser calls this 145 // before each push, so only the top fragment of the stack 146 // might need processing. Since this is called before a push, 147 // the topmost literal is no longer subject to operators like * 148 // (Otherwise ab* would turn into (ab)*.) 149 // If r >= 0 and there's a node left over, maybeConcat uses it 150 // to push r with the given flags. 151 // maybeConcat reports whether r was pushed. 152 func (p *parser) maybeConcat(r rune, flags Flags) bool { 153 n := len(p.stack) 154 if n < 2 { 155 return false 156 } 157 158 re1 := p.stack[n-1] 159 re2 := p.stack[n-2] 160 if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase { 161 return false 162 } 163 164 // Push re1 into re2. 165 re2.Rune = append(re2.Rune, re1.Rune...) 166 167 // Reuse re1 if possible. 168 if r >= 0 { 169 re1.Rune = re1.Rune0[:1] 170 re1.Rune[0] = r 171 re1.Flags = flags 172 return true 173 } 174 175 p.stack = p.stack[:n-1] 176 p.reuse(re1) 177 return false // did not push r 178 } 179 180 // newLiteral returns a new OpLiteral Regexp with the given flags 181 func (p *parser) newLiteral(r rune, flags Flags) *Regexp { 182 re := p.newRegexp(OpLiteral) 183 re.Flags = flags 184 if flags&FoldCase != 0 { 185 r = minFoldRune(r) 186 } 187 re.Rune0[0] = r 188 re.Rune = re.Rune0[:1] 189 return re 190 } 191 192 // minFoldRune returns the minimum rune fold-equivalent to r. 193 func minFoldRune(r rune) rune { 194 if r < minFold || r > maxFold { 195 return r 196 } 197 min := r 198 r0 := r 199 for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) { 200 if min > r { 201 min = r 202 } 203 } 204 return min 205 } 206 207 // literal pushes a literal regexp for the rune r on the stack 208 // and returns that regexp. 209 func (p *parser) literal(r rune) { 210 p.push(p.newLiteral(r, p.flags)) 211 } 212 213 // op pushes a regexp with the given op onto the stack 214 // and returns that regexp. 215 func (p *parser) op(op Op) *Regexp { 216 re := p.newRegexp(op) 217 re.Flags = p.flags 218 return p.push(re) 219 } 220 221 // repeat replaces the top stack element with itself repeated according to op, min, max. 222 // before is the regexp suffix starting at the repetition operator. 223 // after is the regexp suffix following after the repetition operator. 224 // repeat returns an updated 'after' and an error, if any. 225 func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, error) { 226 flags := p.flags 227 if p.flags&PerlX != 0 { 228 if len(after) > 0 && after[0] == '?' { 229 after = after[1:] 230 flags ^= NonGreedy 231 } 232 if lastRepeat != "" { 233 // In Perl it is not allowed to stack repetition operators: 234 // a** is a syntax error, not a doubled star, and a++ means 235 // something else entirely, which we don't support! 236 return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(after)]} 237 } 238 } 239 n := len(p.stack) 240 if n == 0 { 241 return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]} 242 } 243 sub := p.stack[n-1] 244 if sub.Op >= opPseudo { 245 return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]} 246 } 247 248 re := p.newRegexp(op) 249 re.Min = min 250 re.Max = max 251 re.Flags = flags 252 re.Sub = re.Sub0[:1] 253 re.Sub[0] = sub 254 p.stack[n-1] = re 255 256 if op == OpRepeat && (min >= 2 || max >= 2) && !repeatIsValid(re, 1000) { 257 return "", &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]} 258 } 259 260 return after, nil 261 } 262 263 // repeatIsValid reports whether the repetition re is valid. 264 // Valid means that the combination of the top-level repetition 265 // and any inner repetitions does not exceed n copies of the 266 // innermost thing. 267 // This function rewalks the regexp tree and is called for every repetition, 268 // so we have to worry about inducing quadratic behavior in the parser. 269 // We avoid this by only calling repeatIsValid when min or max >= 2. 270 // In that case the depth of any >= 2 nesting can only get to 9 without 271 // triggering a parse error, so each subtree can only be rewalked 9 times. 272 func repeatIsValid(re *Regexp, n int) bool { 273 if re.Op == OpRepeat { 274 m := re.Max 275 if m == 0 { 276 return true 277 } 278 if m < 0 { 279 m = re.Min 280 } 281 if m > n { 282 return false 283 } 284 if m > 0 { 285 n /= m 286 } 287 } 288 for _, sub := range re.Sub { 289 if !repeatIsValid(sub, n) { 290 return false 291 } 292 } 293 return true 294 } 295 296 // concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation. 297 func (p *parser) concat() *Regexp { 298 p.maybeConcat(-1, 0) 299 300 // Scan down to find pseudo-operator | or (. 301 i := len(p.stack) 302 for i > 0 && p.stack[i-1].Op < opPseudo { 303 i-- 304 } 305 subs := p.stack[i:] 306 p.stack = p.stack[:i] 307 308 // Empty concatenation is special case. 309 if len(subs) == 0 { 310 return p.push(p.newRegexp(OpEmptyMatch)) 311 } 312 313 return p.push(p.collapse(subs, OpConcat)) 314 } 315 316 // alternate replaces the top of the stack (above the topmost '(') with its alternation. 317 func (p *parser) alternate() *Regexp { 318 // Scan down to find pseudo-operator (. 319 // There are no | above (. 320 i := len(p.stack) 321 for i > 0 && p.stack[i-1].Op < opPseudo { 322 i-- 323 } 324 subs := p.stack[i:] 325 p.stack = p.stack[:i] 326 327 // Make sure top class is clean. 328 // All the others already are (see swapVerticalBar). 329 if len(subs) > 0 { 330 cleanAlt(subs[len(subs)-1]) 331 } 332 333 // Empty alternate is special case 334 // (shouldn't happen but easy to handle). 335 if len(subs) == 0 { 336 return p.push(p.newRegexp(OpNoMatch)) 337 } 338 339 return p.push(p.collapse(subs, OpAlternate)) 340 } 341 342 // cleanAlt cleans re for eventual inclusion in an alternation. 343 func cleanAlt(re *Regexp) { 344 switch re.Op { 345 case OpCharClass: 346 re.Rune = cleanClass(&re.Rune) 347 if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune { 348 re.Rune = nil 349 re.Op = OpAnyChar 350 return 351 } 352 if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune { 353 re.Rune = nil 354 re.Op = OpAnyCharNotNL 355 return 356 } 357 if cap(re.Rune)-len(re.Rune) > 100 { 358 // re.Rune will not grow any more. 359 // Make a copy or inline to reclaim storage. 360 re.Rune = append(re.Rune0[:0], re.Rune...) 361 } 362 } 363 } 364 365 // collapse returns the result of applying op to sub. 366 // If sub contains op nodes, they all get hoisted up 367 // so that there is never a concat of a concat or an 368 // alternate of an alternate. 369 func (p *parser) collapse(subs []*Regexp, op Op) *Regexp { 370 if len(subs) == 1 { 371 return subs[0] 372 } 373 re := p.newRegexp(op) 374 re.Sub = re.Sub0[:0] 375 for _, sub := range subs { 376 if sub.Op == op { 377 re.Sub = append(re.Sub, sub.Sub...) 378 p.reuse(sub) 379 } else { 380 re.Sub = append(re.Sub, sub) 381 } 382 } 383 if op == OpAlternate { 384 re.Sub = p.factor(re.Sub) 385 if len(re.Sub) == 1 { 386 old := re 387 re = re.Sub[0] 388 p.reuse(old) 389 } 390 } 391 return re 392 } 393 394 // factor factors common prefixes from the alternation list sub. 395 // It returns a replacement list that reuses the same storage and 396 // frees (passes to p.reuse) any removed *Regexps. 397 // 398 // For example, 399 // ABC|ABD|AEF|BCX|BCY 400 // simplifies by literal prefix extraction to 401 // A(B(C|D)|EF)|BC(X|Y) 402 // which simplifies by character class introduction to 403 // A(B[CD]|EF)|BC[XY] 404 // 405 func (p *parser) factor(sub []*Regexp) []*Regexp { 406 if len(sub) < 2 { 407 return sub 408 } 409 410 // Round 1: Factor out common literal prefixes. 411 var str []rune 412 var strflags Flags 413 start := 0 414 out := sub[:0] 415 for i := 0; i <= len(sub); i++ { 416 // Invariant: the Regexps that were in sub[0:start] have been 417 // used or marked for reuse, and the slice space has been reused 418 // for out (len(out) <= start). 419 // 420 // Invariant: sub[start:i] consists of regexps that all begin 421 // with str as modified by strflags. 422 var istr []rune 423 var iflags Flags 424 if i < len(sub) { 425 istr, iflags = p.leadingString(sub[i]) 426 if iflags == strflags { 427 same := 0 428 for same < len(str) && same < len(istr) && str[same] == istr[same] { 429 same++ 430 } 431 if same > 0 { 432 // Matches at least one rune in current range. 433 // Keep going around. 434 str = str[:same] 435 continue 436 } 437 } 438 } 439 440 // Found end of a run with common leading literal string: 441 // sub[start:i] all begin with str[0:len(str)], but sub[i] 442 // does not even begin with str[0]. 443 // 444 // Factor out common string and append factored expression to out. 445 if i == start { 446 // Nothing to do - run of length 0. 447 } else if i == start+1 { 448 // Just one: don't bother factoring. 449 out = append(out, sub[start]) 450 } else { 451 // Construct factored form: prefix(suffix1|suffix2|...) 452 prefix := p.newRegexp(OpLiteral) 453 prefix.Flags = strflags 454 prefix.Rune = append(prefix.Rune[:0], str...) 455 456 for j := start; j < i; j++ { 457 sub[j] = p.removeLeadingString(sub[j], len(str)) 458 } 459 suffix := p.collapse(sub[start:i], OpAlternate) // recurse 460 461 re := p.newRegexp(OpConcat) 462 re.Sub = append(re.Sub[:0], prefix, suffix) 463 out = append(out, re) 464 } 465 466 // Prepare for next iteration. 467 start = i 468 str = istr 469 strflags = iflags 470 } 471 sub = out 472 473 // Round 2: Factor out common simple prefixes, 474 // just the first piece of each concatenation. 475 // This will be good enough a lot of the time. 476 // 477 // Complex subexpressions (e.g. involving quantifiers) 478 // are not safe to factor because that collapses their 479 // distinct paths through the automaton, which affects 480 // correctness in some cases. 481 start = 0 482 out = sub[:0] 483 var first *Regexp 484 for i := 0; i <= len(sub); i++ { 485 // Invariant: the Regexps that were in sub[0:start] have been 486 // used or marked for reuse, and the slice space has been reused 487 // for out (len(out) <= start). 488 // 489 // Invariant: sub[start:i] consists of regexps that all begin with ifirst. 490 var ifirst *Regexp 491 if i < len(sub) { 492 ifirst = p.leadingRegexp(sub[i]) 493 if first != nil && first.Equal(ifirst) && 494 // first must be a character class OR a fixed repeat of a character class. 495 (isCharClass(first) || (first.Op == OpRepeat && first.Min == first.Max && isCharClass(first.Sub[0]))) { 496 continue 497 } 498 } 499 500 // Found end of a run with common leading regexp: 501 // sub[start:i] all begin with first but sub[i] does not. 502 // 503 // Factor out common regexp and append factored expression to out. 504 if i == start { 505 // Nothing to do - run of length 0. 506 } else if i == start+1 { 507 // Just one: don't bother factoring. 508 out = append(out, sub[start]) 509 } else { 510 // Construct factored form: prefix(suffix1|suffix2|...) 511 prefix := first 512 for j := start; j < i; j++ { 513 reuse := j != start // prefix came from sub[start] 514 sub[j] = p.removeLeadingRegexp(sub[j], reuse) 515 } 516 suffix := p.collapse(sub[start:i], OpAlternate) // recurse 517 518 re := p.newRegexp(OpConcat) 519 re.Sub = append(re.Sub[:0], prefix, suffix) 520 out = append(out, re) 521 } 522 523 // Prepare for next iteration. 524 start = i 525 first = ifirst 526 } 527 sub = out 528 529 // Round 3: Collapse runs of single literals into character classes. 530 start = 0 531 out = sub[:0] 532 for i := 0; i <= len(sub); i++ { 533 // Invariant: the Regexps that were in sub[0:start] have been 534 // used or marked for reuse, and the slice space has been reused 535 // for out (len(out) <= start). 536 // 537 // Invariant: sub[start:i] consists of regexps that are either 538 // literal runes or character classes. 539 if i < len(sub) && isCharClass(sub[i]) { 540 continue 541 } 542 543 // sub[i] is not a char or char class; 544 // emit char class for sub[start:i]... 545 if i == start { 546 // Nothing to do - run of length 0. 547 } else if i == start+1 { 548 out = append(out, sub[start]) 549 } else { 550 // Make new char class. 551 // Start with most complex regexp in sub[start]. 552 max := start 553 for j := start + 1; j < i; j++ { 554 if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) { 555 max = j 556 } 557 } 558 sub[start], sub[max] = sub[max], sub[start] 559 560 for j := start + 1; j < i; j++ { 561 mergeCharClass(sub[start], sub[j]) 562 p.reuse(sub[j]) 563 } 564 cleanAlt(sub[start]) 565 out = append(out, sub[start]) 566 } 567 568 // ... and then emit sub[i]. 569 if i < len(sub) { 570 out = append(out, sub[i]) 571 } 572 start = i + 1 573 } 574 sub = out 575 576 // Round 4: Collapse runs of empty matches into a single empty match. 577 start = 0 578 out = sub[:0] 579 for i := range sub { 580 if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch { 581 continue 582 } 583 out = append(out, sub[i]) 584 } 585 sub = out 586 587 return sub 588 } 589 590 // leadingString returns the leading literal string that re begins with. 591 // The string refers to storage in re or its children. 592 func (p *parser) leadingString(re *Regexp) ([]rune, Flags) { 593 if re.Op == OpConcat && len(re.Sub) > 0 { 594 re = re.Sub[0] 595 } 596 if re.Op != OpLiteral { 597 return nil, 0 598 } 599 return re.Rune, re.Flags & FoldCase 600 } 601 602 // removeLeadingString removes the first n leading runes 603 // from the beginning of re. It returns the replacement for re. 604 func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp { 605 if re.Op == OpConcat && len(re.Sub) > 0 { 606 // Removing a leading string in a concatenation 607 // might simplify the concatenation. 608 sub := re.Sub[0] 609 sub = p.removeLeadingString(sub, n) 610 re.Sub[0] = sub 611 if sub.Op == OpEmptyMatch { 612 p.reuse(sub) 613 switch len(re.Sub) { 614 case 0, 1: 615 // Impossible but handle. 616 re.Op = OpEmptyMatch 617 re.Sub = nil 618 case 2: 619 old := re 620 re = re.Sub[1] 621 p.reuse(old) 622 default: 623 copy(re.Sub, re.Sub[1:]) 624 re.Sub = re.Sub[:len(re.Sub)-1] 625 } 626 } 627 return re 628 } 629 630 if re.Op == OpLiteral { 631 re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])] 632 if len(re.Rune) == 0 { 633 re.Op = OpEmptyMatch 634 } 635 } 636 return re 637 } 638 639 // leadingRegexp returns the leading regexp that re begins with. 640 // The regexp refers to storage in re or its children. 641 func (p *parser) leadingRegexp(re *Regexp) *Regexp { 642 if re.Op == OpEmptyMatch { 643 return nil 644 } 645 if re.Op == OpConcat && len(re.Sub) > 0 { 646 sub := re.Sub[0] 647 if sub.Op == OpEmptyMatch { 648 return nil 649 } 650 return sub 651 } 652 return re 653 } 654 655 // removeLeadingRegexp removes the leading regexp in re. 656 // It returns the replacement for re. 657 // If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse. 658 func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp { 659 if re.Op == OpConcat && len(re.Sub) > 0 { 660 if reuse { 661 p.reuse(re.Sub[0]) 662 } 663 re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])] 664 switch len(re.Sub) { 665 case 0: 666 re.Op = OpEmptyMatch 667 re.Sub = nil 668 case 1: 669 old := re 670 re = re.Sub[0] 671 p.reuse(old) 672 } 673 return re 674 } 675 if reuse { 676 p.reuse(re) 677 } 678 return p.newRegexp(OpEmptyMatch) 679 } 680 681 func literalRegexp(s string, flags Flags) *Regexp { 682 re := &Regexp{Op: OpLiteral} 683 re.Flags = flags 684 re.Rune = re.Rune0[:0] // use local storage for small strings 685 for _, c := range s { 686 if len(re.Rune) >= cap(re.Rune) { 687 // string is too long to fit in Rune0. let Go handle it 688 re.Rune = []rune(s) 689 break 690 } 691 re.Rune = append(re.Rune, c) 692 } 693 return re 694 } 695 696 // Parsing. 697 698 // Parse parses a regular expression string s, controlled by the specified 699 // Flags, and returns a regular expression parse tree. The syntax is 700 // described in the top-level comment. 701 func Parse(s string, flags Flags) (*Regexp, error) { 702 if flags&Literal != 0 { 703 // Trivial parser for literal string. 704 if err := checkUTF8(s); err != nil { 705 return nil, err 706 } 707 return literalRegexp(s, flags), nil 708 } 709 710 // Otherwise, must do real work. 711 var ( 712 p parser 713 err error 714 c rune 715 op Op 716 lastRepeat string 717 ) 718 p.flags = flags 719 p.wholeRegexp = s 720 t := s 721 for t != "" { 722 repeat := "" 723 BigSwitch: 724 switch t[0] { 725 default: 726 if c, t, err = nextRune(t); err != nil { 727 return nil, err 728 } 729 p.literal(c) 730 731 case '(': 732 if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' { 733 // Flag changes and non-capturing groups. 734 if t, err = p.parsePerlFlags(t); err != nil { 735 return nil, err 736 } 737 break 738 } 739 p.numCap++ 740 p.op(opLeftParen).Cap = p.numCap 741 t = t[1:] 742 case '|': 743 if err = p.parseVerticalBar(); err != nil { 744 return nil, err 745 } 746 t = t[1:] 747 case ')': 748 if err = p.parseRightParen(); err != nil { 749 return nil, err 750 } 751 t = t[1:] 752 case '^': 753 if p.flags&OneLine != 0 { 754 p.op(OpBeginText) 755 } else { 756 p.op(OpBeginLine) 757 } 758 t = t[1:] 759 case '$': 760 if p.flags&OneLine != 0 { 761 p.op(OpEndText).Flags |= WasDollar 762 } else { 763 p.op(OpEndLine) 764 } 765 t = t[1:] 766 case '.': 767 if p.flags&DotNL != 0 { 768 p.op(OpAnyChar) 769 } else { 770 p.op(OpAnyCharNotNL) 771 } 772 t = t[1:] 773 case '[': 774 if t, err = p.parseClass(t); err != nil { 775 return nil, err 776 } 777 case '*', '+', '?': 778 before := t 779 switch t[0] { 780 case '*': 781 op = OpStar 782 case '+': 783 op = OpPlus 784 case '?': 785 op = OpQuest 786 } 787 after := t[1:] 788 if after, err = p.repeat(op, 0, 0, before, after, lastRepeat); err != nil { 789 return nil, err 790 } 791 repeat = before 792 t = after 793 case '{': 794 op = OpRepeat 795 before := t 796 min, max, after, ok := p.parseRepeat(t) 797 if !ok { 798 // If the repeat cannot be parsed, { is a literal. 799 p.literal('{') 800 t = t[1:] 801 break 802 } 803 if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max { 804 // Numbers were too big, or max is present and min > max. 805 return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]} 806 } 807 if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil { 808 return nil, err 809 } 810 repeat = before 811 t = after 812 case '\\': 813 if p.flags&PerlX != 0 && len(t) >= 2 { 814 switch t[1] { 815 case 'A': 816 p.op(OpBeginText) 817 t = t[2:] 818 break BigSwitch 819 case 'b': 820 p.op(OpWordBoundary) 821 t = t[2:] 822 break BigSwitch 823 case 'B': 824 p.op(OpNoWordBoundary) 825 t = t[2:] 826 break BigSwitch 827 case 'C': 828 // any byte; not supported 829 return nil, &Error{ErrInvalidEscape, t[:2]} 830 case 'Q': 831 // \Q ... \E: the ... is always literals 832 var lit string 833 if i := strings.Index(t, `\E`); i < 0 { 834 lit = t[2:] 835 t = "" 836 } else { 837 lit = t[2:i] 838 t = t[i+2:] 839 } 840 for lit != "" { 841 c, rest, err := nextRune(lit) 842 if err != nil { 843 return nil, err 844 } 845 p.literal(c) 846 lit = rest 847 } 848 break BigSwitch 849 case 'z': 850 p.op(OpEndText) 851 t = t[2:] 852 break BigSwitch 853 } 854 } 855 856 re := p.newRegexp(OpCharClass) 857 re.Flags = p.flags 858 859 // Look for Unicode character group like \p{Han} 860 if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') { 861 r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0]) 862 if err != nil { 863 return nil, err 864 } 865 if r != nil { 866 re.Rune = r 867 t = rest 868 p.push(re) 869 break BigSwitch 870 } 871 } 872 873 // Perl character class escape. 874 if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil { 875 re.Rune = r 876 t = rest 877 p.push(re) 878 break BigSwitch 879 } 880 p.reuse(re) 881 882 // Ordinary single-character escape. 883 if c, t, err = p.parseEscape(t); err != nil { 884 return nil, err 885 } 886 p.literal(c) 887 } 888 lastRepeat = repeat 889 } 890 891 p.concat() 892 if p.swapVerticalBar() { 893 // pop vertical bar 894 p.stack = p.stack[:len(p.stack)-1] 895 } 896 p.alternate() 897 898 n := len(p.stack) 899 if n != 1 { 900 return nil, &Error{ErrMissingParen, s} 901 } 902 return p.stack[0], nil 903 } 904 905 // parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}. 906 // If s is not of that form, it returns ok == false. 907 // If s has the right form but the values are too big, it returns min == -1, ok == true. 908 func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) { 909 if s == "" || s[0] != '{' { 910 return 911 } 912 s = s[1:] 913 var ok1 bool 914 if min, s, ok1 = p.parseInt(s); !ok1 { 915 return 916 } 917 if s == "" { 918 return 919 } 920 if s[0] != ',' { 921 max = min 922 } else { 923 s = s[1:] 924 if s == "" { 925 return 926 } 927 if s[0] == '}' { 928 max = -1 929 } else if max, s, ok1 = p.parseInt(s); !ok1 { 930 return 931 } else if max < 0 { 932 // parseInt found too big a number 933 min = -1 934 } 935 } 936 if s == "" || s[0] != '}' { 937 return 938 } 939 rest = s[1:] 940 ok = true 941 return 942 } 943 944 // parsePerlFlags parses a Perl flag setting or non-capturing group or both, 945 // like (?i) or (?: or (?i:. It removes the prefix from s and updates the parse state. 946 // The caller must have ensured that s begins with "(?". 947 func (p *parser) parsePerlFlags(s string) (rest string, err error) { 948 t := s 949 950 // Check for named captures, first introduced in Python's regexp library. 951 // As usual, there are three slightly different syntaxes: 952 // 953 // (?P<name>expr) the original, introduced by Python 954 // (?<name>expr) the .NET alteration, adopted by Perl 5.10 955 // (?'name'expr) another .NET alteration, adopted by Perl 5.10 956 // 957 // Perl 5.10 gave in and implemented the Python version too, 958 // but they claim that the last two are the preferred forms. 959 // PCRE and languages based on it (specifically, PHP and Ruby) 960 // support all three as well. EcmaScript 4 uses only the Python form. 961 // 962 // In both the open source world (via Code Search) and the 963 // Google source tree, (?P<expr>name) is the dominant form, 964 // so that's the one we implement. One is enough. 965 if len(t) > 4 && t[2] == 'P' && t[3] == '<' { 966 // Pull out name. 967 end := strings.IndexRune(t, '>') 968 if end < 0 { 969 if err = checkUTF8(t); err != nil { 970 return "", err 971 } 972 return "", &Error{ErrInvalidNamedCapture, s} 973 } 974 975 capture := t[:end+1] // "(?P<name>" 976 name := t[4:end] // "name" 977 if err = checkUTF8(name); err != nil { 978 return "", err 979 } 980 if !isValidCaptureName(name) { 981 return "", &Error{ErrInvalidNamedCapture, capture} 982 } 983 984 // Like ordinary capture, but named. 985 p.numCap++ 986 re := p.op(opLeftParen) 987 re.Cap = p.numCap 988 re.Name = name 989 return t[end+1:], nil 990 } 991 992 // Non-capturing group. Might also twiddle Perl flags. 993 var c rune 994 t = t[2:] // skip (? 995 flags := p.flags 996 sign := +1 997 sawFlag := false 998 Loop: 999 for t != "" { 1000 if c, t, err = nextRune(t); err != nil { 1001 return "", err 1002 } 1003 switch c { 1004 default: 1005 break Loop 1006 1007 // Flags. 1008 case 'i': 1009 flags |= FoldCase 1010 sawFlag = true 1011 case 'm': 1012 flags &^= OneLine 1013 sawFlag = true 1014 case 's': 1015 flags |= DotNL 1016 sawFlag = true 1017 case 'U': 1018 flags |= NonGreedy 1019 sawFlag = true 1020 1021 // Switch to negation. 1022 case '-': 1023 if sign < 0 { 1024 break Loop 1025 } 1026 sign = -1 1027 // Invert flags so that | above turn into &^ and vice versa. 1028 // We'll invert flags again before using it below. 1029 flags = ^flags 1030 sawFlag = false 1031 1032 // End of flags, starting group or not. 1033 case ':', ')': 1034 if sign < 0 { 1035 if !sawFlag { 1036 break Loop 1037 } 1038 flags = ^flags 1039 } 1040 if c == ':' { 1041 // Open new group 1042 p.op(opLeftParen) 1043 } 1044 p.flags = flags 1045 return t, nil 1046 } 1047 } 1048 1049 return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]} 1050 } 1051 1052 // isValidCaptureName reports whether name 1053 // is a valid capture name: [A-Za-z0-9_]+. 1054 // PCRE limits names to 32 bytes. 1055 // Python rejects names starting with digits. 1056 // We don't enforce either of those. 1057 func isValidCaptureName(name string) bool { 1058 if name == "" { 1059 return false 1060 } 1061 for _, c := range name { 1062 if c != '_' && !isalnum(c) { 1063 return false 1064 } 1065 } 1066 return true 1067 } 1068 1069 // parseInt parses a decimal integer. 1070 func (p *parser) parseInt(s string) (n int, rest string, ok bool) { 1071 if s == "" || s[0] < '0' || '9' < s[0] { 1072 return 1073 } 1074 // Disallow leading zeros. 1075 if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' { 1076 return 1077 } 1078 t := s 1079 for s != "" && '0' <= s[0] && s[0] <= '9' { 1080 s = s[1:] 1081 } 1082 rest = s 1083 ok = true 1084 // Have digits, compute value. 1085 t = t[:len(t)-len(s)] 1086 for i := 0; i < len(t); i++ { 1087 // Avoid overflow. 1088 if n >= 1e8 { 1089 n = -1 1090 break 1091 } 1092 n = n*10 + int(t[i]) - '0' 1093 } 1094 return 1095 } 1096 1097 // can this be represented as a character class? 1098 // single-rune literal string, char class, ., and .|\n. 1099 func isCharClass(re *Regexp) bool { 1100 return re.Op == OpLiteral && len(re.Rune) == 1 || 1101 re.Op == OpCharClass || 1102 re.Op == OpAnyCharNotNL || 1103 re.Op == OpAnyChar 1104 } 1105 1106 // does re match r? 1107 func matchRune(re *Regexp, r rune) bool { 1108 switch re.Op { 1109 case OpLiteral: 1110 return len(re.Rune) == 1 && re.Rune[0] == r 1111 case OpCharClass: 1112 for i := 0; i < len(re.Rune); i += 2 { 1113 if re.Rune[i] <= r && r <= re.Rune[i+1] { 1114 return true 1115 } 1116 } 1117 return false 1118 case OpAnyCharNotNL: 1119 return r != '\n' 1120 case OpAnyChar: 1121 return true 1122 } 1123 return false 1124 } 1125 1126 // parseVerticalBar handles a | in the input. 1127 func (p *parser) parseVerticalBar() error { 1128 p.concat() 1129 1130 // The concatenation we just parsed is on top of the stack. 1131 // If it sits above an opVerticalBar, swap it below 1132 // (things below an opVerticalBar become an alternation). 1133 // Otherwise, push a new vertical bar. 1134 if !p.swapVerticalBar() { 1135 p.op(opVerticalBar) 1136 } 1137 1138 return nil 1139 } 1140 1141 // mergeCharClass makes dst = dst|src. 1142 // The caller must ensure that dst.Op >= src.Op, 1143 // to reduce the amount of copying. 1144 func mergeCharClass(dst, src *Regexp) { 1145 switch dst.Op { 1146 case OpAnyChar: 1147 // src doesn't add anything. 1148 case OpAnyCharNotNL: 1149 // src might add \n 1150 if matchRune(src, '\n') { 1151 dst.Op = OpAnyChar 1152 } 1153 case OpCharClass: 1154 // src is simpler, so either literal or char class 1155 if src.Op == OpLiteral { 1156 dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags) 1157 } else { 1158 dst.Rune = appendClass(dst.Rune, src.Rune) 1159 } 1160 case OpLiteral: 1161 // both literal 1162 if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags { 1163 break 1164 } 1165 dst.Op = OpCharClass 1166 dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags) 1167 dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags) 1168 } 1169 } 1170 1171 // If the top of the stack is an element followed by an opVerticalBar 1172 // swapVerticalBar swaps the two and returns true. 1173 // Otherwise it returns false. 1174 func (p *parser) swapVerticalBar() bool { 1175 // If above and below vertical bar are literal or char class, 1176 // can merge into a single char class. 1177 n := len(p.stack) 1178 if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) { 1179 re1 := p.stack[n-1] 1180 re3 := p.stack[n-3] 1181 // Make re3 the more complex of the two. 1182 if re1.Op > re3.Op { 1183 re1, re3 = re3, re1 1184 p.stack[n-3] = re3 1185 } 1186 mergeCharClass(re3, re1) 1187 p.reuse(re1) 1188 p.stack = p.stack[:n-1] 1189 return true 1190 } 1191 1192 if n >= 2 { 1193 re1 := p.stack[n-1] 1194 re2 := p.stack[n-2] 1195 if re2.Op == opVerticalBar { 1196 if n >= 3 { 1197 // Now out of reach. 1198 // Clean opportunistically. 1199 cleanAlt(p.stack[n-3]) 1200 } 1201 p.stack[n-2] = re1 1202 p.stack[n-1] = re2 1203 return true 1204 } 1205 } 1206 return false 1207 } 1208 1209 // parseRightParen handles a ) in the input. 1210 func (p *parser) parseRightParen() error { 1211 p.concat() 1212 if p.swapVerticalBar() { 1213 // pop vertical bar 1214 p.stack = p.stack[:len(p.stack)-1] 1215 } 1216 p.alternate() 1217 1218 n := len(p.stack) 1219 if n < 2 { 1220 return &Error{ErrUnexpectedParen, p.wholeRegexp} 1221 } 1222 re1 := p.stack[n-1] 1223 re2 := p.stack[n-2] 1224 p.stack = p.stack[:n-2] 1225 if re2.Op != opLeftParen { 1226 return &Error{ErrUnexpectedParen, p.wholeRegexp} 1227 } 1228 // Restore flags at time of paren. 1229 p.flags = re2.Flags 1230 if re2.Cap == 0 { 1231 // Just for grouping. 1232 p.push(re1) 1233 } else { 1234 re2.Op = OpCapture 1235 re2.Sub = re2.Sub0[:1] 1236 re2.Sub[0] = re1 1237 p.push(re2) 1238 } 1239 return nil 1240 } 1241 1242 // parseEscape parses an escape sequence at the beginning of s 1243 // and returns the rune. 1244 func (p *parser) parseEscape(s string) (r rune, rest string, err error) { 1245 t := s[1:] 1246 if t == "" { 1247 return 0, "", &Error{ErrTrailingBackslash, ""} 1248 } 1249 c, t, err := nextRune(t) 1250 if err != nil { 1251 return 0, "", err 1252 } 1253 1254 Switch: 1255 switch c { 1256 default: 1257 if c < utf8.RuneSelf && !isalnum(c) { 1258 // Escaped non-word characters are always themselves. 1259 // PCRE is not quite so rigorous: it accepts things like 1260 // \q, but we don't. We once rejected \_, but too many 1261 // programs and people insist on using it, so allow \_. 1262 return c, t, nil 1263 } 1264 1265 // Octal escapes. 1266 case '1', '2', '3', '4', '5', '6', '7': 1267 // Single non-zero digit is a backreference; not supported 1268 if t == "" || t[0] < '0' || t[0] > '7' { 1269 break 1270 } 1271 fallthrough 1272 case '0': 1273 // Consume up to three octal digits; already have one. 1274 r = c - '0' 1275 for i := 1; i < 3; i++ { 1276 if t == "" || t[0] < '0' || t[0] > '7' { 1277 break 1278 } 1279 r = r*8 + rune(t[0]) - '0' 1280 t = t[1:] 1281 } 1282 return r, t, nil 1283 1284 // Hexadecimal escapes. 1285 case 'x': 1286 if t == "" { 1287 break 1288 } 1289 if c, t, err = nextRune(t); err != nil { 1290 return 0, "", err 1291 } 1292 if c == '{' { 1293 // Any number of digits in braces. 1294 // Perl accepts any text at all; it ignores all text 1295 // after the first non-hex digit. We require only hex digits, 1296 // and at least one. 1297 nhex := 0 1298 r = 0 1299 for { 1300 if t == "" { 1301 break Switch 1302 } 1303 if c, t, err = nextRune(t); err != nil { 1304 return 0, "", err 1305 } 1306 if c == '}' { 1307 break 1308 } 1309 v := unhex(c) 1310 if v < 0 { 1311 break Switch 1312 } 1313 r = r*16 + v 1314 if r > unicode.MaxRune { 1315 break Switch 1316 } 1317 nhex++ 1318 } 1319 if nhex == 0 { 1320 break Switch 1321 } 1322 return r, t, nil 1323 } 1324 1325 // Easy case: two hex digits. 1326 x := unhex(c) 1327 if c, t, err = nextRune(t); err != nil { 1328 return 0, "", err 1329 } 1330 y := unhex(c) 1331 if x < 0 || y < 0 { 1332 break 1333 } 1334 return x*16 + y, t, nil 1335 1336 // C escapes. There is no case 'b', to avoid misparsing 1337 // the Perl word-boundary \b as the C backspace \b 1338 // when in POSIX mode. In Perl, /\b/ means word-boundary 1339 // but /[\b]/ means backspace. We don't support that. 1340 // If you want a backspace, embed a literal backspace 1341 // character or use \x08. 1342 case 'a': 1343 return '\a', t, err 1344 case 'f': 1345 return '\f', t, err 1346 case 'n': 1347 return '\n', t, err 1348 case 'r': 1349 return '\r', t, err 1350 case 't': 1351 return '\t', t, err 1352 case 'v': 1353 return '\v', t, err 1354 } 1355 return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]} 1356 } 1357 1358 // parseClassChar parses a character class character at the beginning of s 1359 // and returns it. 1360 func (p *parser) parseClassChar(s, wholeClass string) (r rune, rest string, err error) { 1361 if s == "" { 1362 return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass} 1363 } 1364 1365 // Allow regular escape sequences even though 1366 // many need not be escaped in this context. 1367 if s[0] == '\\' { 1368 return p.parseEscape(s) 1369 } 1370 1371 return nextRune(s) 1372 } 1373 1374 type charGroup struct { 1375 sign int 1376 class []rune 1377 } 1378 1379 // parsePerlClassEscape parses a leading Perl character class escape like \d 1380 // from the beginning of s. If one is present, it appends the characters to r 1381 // and returns the new slice r and the remainder of the string. 1382 func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string) { 1383 if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' { 1384 return 1385 } 1386 g := perlGroup[s[0:2]] 1387 if g.sign == 0 { 1388 return 1389 } 1390 return p.appendGroup(r, g), s[2:] 1391 } 1392 1393 // parseNamedClass parses a leading POSIX named character class like [:alnum:] 1394 // from the beginning of s. If one is present, it appends the characters to r 1395 // and returns the new slice r and the remainder of the string. 1396 func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error) { 1397 if len(s) < 2 || s[0] != '[' || s[1] != ':' { 1398 return 1399 } 1400 1401 i := strings.Index(s[2:], ":]") 1402 if i < 0 { 1403 return 1404 } 1405 i += 2 1406 name, s := s[0:i+2], s[i+2:] 1407 g := posixGroup[name] 1408 if g.sign == 0 { 1409 return nil, "", &Error{ErrInvalidCharRange, name} 1410 } 1411 return p.appendGroup(r, g), s, nil 1412 } 1413 1414 func (p *parser) appendGroup(r []rune, g charGroup) []rune { 1415 if p.flags&FoldCase == 0 { 1416 if g.sign < 0 { 1417 r = appendNegatedClass(r, g.class) 1418 } else { 1419 r = appendClass(r, g.class) 1420 } 1421 } else { 1422 tmp := p.tmpClass[:0] 1423 tmp = appendFoldedClass(tmp, g.class) 1424 p.tmpClass = tmp 1425 tmp = cleanClass(&p.tmpClass) 1426 if g.sign < 0 { 1427 r = appendNegatedClass(r, tmp) 1428 } else { 1429 r = appendClass(r, tmp) 1430 } 1431 } 1432 return r 1433 } 1434 1435 var anyTable = &unicode.RangeTable{ 1436 R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}}, 1437 R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}}, 1438 } 1439 1440 // unicodeTable returns the unicode.RangeTable identified by name 1441 // and the table of additional fold-equivalent code points. 1442 func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) { 1443 // Special case: "Any" means any. 1444 if name == "Any" { 1445 return anyTable, anyTable 1446 } 1447 if t := unicode.Categories[name]; t != nil { 1448 return t, unicode.FoldCategory[name] 1449 } 1450 if t := unicode.Scripts[name]; t != nil { 1451 return t, unicode.FoldScript[name] 1452 } 1453 return nil, nil 1454 } 1455 1456 // parseUnicodeClass parses a leading Unicode character class like \p{Han} 1457 // from the beginning of s. If one is present, it appends the characters to r 1458 // and returns the new slice r and the remainder of the string. 1459 func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error) { 1460 if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' { 1461 return 1462 } 1463 1464 // Committed to parse or return error. 1465 sign := +1 1466 if s[1] == 'P' { 1467 sign = -1 1468 } 1469 t := s[2:] 1470 c, t, err := nextRune(t) 1471 if err != nil { 1472 return 1473 } 1474 var seq, name string 1475 if c != '{' { 1476 // Single-letter name. 1477 seq = s[:len(s)-len(t)] 1478 name = seq[2:] 1479 } else { 1480 // Name is in braces. 1481 end := strings.IndexRune(s, '}') 1482 if end < 0 { 1483 if err = checkUTF8(s); err != nil { 1484 return 1485 } 1486 return nil, "", &Error{ErrInvalidCharRange, s} 1487 } 1488 seq, t = s[:end+1], s[end+1:] 1489 name = s[3:end] 1490 if err = checkUTF8(name); err != nil { 1491 return 1492 } 1493 } 1494 1495 // Group can have leading negation too. \p{^Han} == \P{Han}, \P{^Han} == \p{Han}. 1496 if name != "" && name[0] == '^' { 1497 sign = -sign 1498 name = name[1:] 1499 } 1500 1501 tab, fold := unicodeTable(name) 1502 if tab == nil { 1503 return nil, "", &Error{ErrInvalidCharRange, seq} 1504 } 1505 1506 if p.flags&FoldCase == 0 || fold == nil { 1507 if sign > 0 { 1508 r = appendTable(r, tab) 1509 } else { 1510 r = appendNegatedTable(r, tab) 1511 } 1512 } else { 1513 // Merge and clean tab and fold in a temporary buffer. 1514 // This is necessary for the negative case and just tidy 1515 // for the positive case. 1516 tmp := p.tmpClass[:0] 1517 tmp = appendTable(tmp, tab) 1518 tmp = appendTable(tmp, fold) 1519 p.tmpClass = tmp 1520 tmp = cleanClass(&p.tmpClass) 1521 if sign > 0 { 1522 r = appendClass(r, tmp) 1523 } else { 1524 r = appendNegatedClass(r, tmp) 1525 } 1526 } 1527 return r, t, nil 1528 } 1529 1530 // parseClass parses a character class at the beginning of s 1531 // and pushes it onto the parse stack. 1532 func (p *parser) parseClass(s string) (rest string, err error) { 1533 t := s[1:] // chop [ 1534 re := p.newRegexp(OpCharClass) 1535 re.Flags = p.flags 1536 re.Rune = re.Rune0[:0] 1537 1538 sign := +1 1539 if t != "" && t[0] == '^' { 1540 sign = -1 1541 t = t[1:] 1542 1543 // If character class does not match \n, add it here, 1544 // so that negation later will do the right thing. 1545 if p.flags&ClassNL == 0 { 1546 re.Rune = append(re.Rune, '\n', '\n') 1547 } 1548 } 1549 1550 class := re.Rune 1551 first := true // ] and - are okay as first char in class 1552 for t == "" || t[0] != ']' || first { 1553 // POSIX: - is only okay unescaped as first or last in class. 1554 // Perl: - is okay anywhere. 1555 if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') { 1556 _, size := utf8.DecodeRuneInString(t[1:]) 1557 return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]} 1558 } 1559 first = false 1560 1561 // Look for POSIX [:alnum:] etc. 1562 if len(t) > 2 && t[0] == '[' && t[1] == ':' { 1563 nclass, nt, err := p.parseNamedClass(t, class) 1564 if err != nil { 1565 return "", err 1566 } 1567 if nclass != nil { 1568 class, t = nclass, nt 1569 continue 1570 } 1571 } 1572 1573 // Look for Unicode character group like \p{Han}. 1574 nclass, nt, err := p.parseUnicodeClass(t, class) 1575 if err != nil { 1576 return "", err 1577 } 1578 if nclass != nil { 1579 class, t = nclass, nt 1580 continue 1581 } 1582 1583 // Look for Perl character class symbols (extension). 1584 if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil { 1585 class, t = nclass, nt 1586 continue 1587 } 1588 1589 // Single character or simple range. 1590 rng := t 1591 var lo, hi rune 1592 if lo, t, err = p.parseClassChar(t, s); err != nil { 1593 return "", err 1594 } 1595 hi = lo 1596 // [a-] means (a|-) so check for final ]. 1597 if len(t) >= 2 && t[0] == '-' && t[1] != ']' { 1598 t = t[1:] 1599 if hi, t, err = p.parseClassChar(t, s); err != nil { 1600 return "", err 1601 } 1602 if hi < lo { 1603 rng = rng[:len(rng)-len(t)] 1604 return "", &Error{Code: ErrInvalidCharRange, Expr: rng} 1605 } 1606 } 1607 if p.flags&FoldCase == 0 { 1608 class = appendRange(class, lo, hi) 1609 } else { 1610 class = appendFoldedRange(class, lo, hi) 1611 } 1612 } 1613 t = t[1:] // chop ] 1614 1615 // Use &re.Rune instead of &class to avoid allocation. 1616 re.Rune = class 1617 class = cleanClass(&re.Rune) 1618 if sign < 0 { 1619 class = negateClass(class) 1620 } 1621 re.Rune = class 1622 p.push(re) 1623 return t, nil 1624 } 1625 1626 // cleanClass sorts the ranges (pairs of elements of r), 1627 // merges them, and eliminates duplicates. 1628 func cleanClass(rp *[]rune) []rune { 1629 1630 // Sort by lo increasing, hi decreasing to break ties. 1631 sort.Sort(ranges{rp}) 1632 1633 r := *rp 1634 if len(r) < 2 { 1635 return r 1636 } 1637 1638 // Merge abutting, overlapping. 1639 w := 2 // write index 1640 for i := 2; i < len(r); i += 2 { 1641 lo, hi := r[i], r[i+1] 1642 if lo <= r[w-1]+1 { 1643 // merge with previous range 1644 if hi > r[w-1] { 1645 r[w-1] = hi 1646 } 1647 continue 1648 } 1649 // new disjoint range 1650 r[w] = lo 1651 r[w+1] = hi 1652 w += 2 1653 } 1654 1655 return r[:w] 1656 } 1657 1658 // appendLiteral returns the result of appending the literal x to the class r. 1659 func appendLiteral(r []rune, x rune, flags Flags) []rune { 1660 if flags&FoldCase != 0 { 1661 return appendFoldedRange(r, x, x) 1662 } 1663 return appendRange(r, x, x) 1664 } 1665 1666 // appendRange returns the result of appending the range lo-hi to the class r. 1667 func appendRange(r []rune, lo, hi rune) []rune { 1668 // Expand last range or next to last range if it overlaps or abuts. 1669 // Checking two ranges helps when appending case-folded 1670 // alphabets, so that one range can be expanding A-Z and the 1671 // other expanding a-z. 1672 n := len(r) 1673 for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4 1674 if n >= i { 1675 rlo, rhi := r[n-i], r[n-i+1] 1676 if lo <= rhi+1 && rlo <= hi+1 { 1677 if lo < rlo { 1678 r[n-i] = lo 1679 } 1680 if hi > rhi { 1681 r[n-i+1] = hi 1682 } 1683 return r 1684 } 1685 } 1686 } 1687 1688 return append(r, lo, hi) 1689 } 1690 1691 const ( 1692 // minimum and maximum runes involved in folding. 1693 // checked during test. 1694 minFold = 0x0041 1695 maxFold = 0x1e943 1696 ) 1697 1698 // appendFoldedRange returns the result of appending the range lo-hi 1699 // and its case folding-equivalent runes to the class r. 1700 func appendFoldedRange(r []rune, lo, hi rune) []rune { 1701 // Optimizations. 1702 if lo <= minFold && hi >= maxFold { 1703 // Range is full: folding can't add more. 1704 return appendRange(r, lo, hi) 1705 } 1706 if hi < minFold || lo > maxFold { 1707 // Range is outside folding possibilities. 1708 return appendRange(r, lo, hi) 1709 } 1710 if lo < minFold { 1711 // [lo, minFold-1] needs no folding. 1712 r = appendRange(r, lo, minFold-1) 1713 lo = minFold 1714 } 1715 if hi > maxFold { 1716 // [maxFold+1, hi] needs no folding. 1717 r = appendRange(r, maxFold+1, hi) 1718 hi = maxFold 1719 } 1720 1721 // Brute force. Depend on appendRange to coalesce ranges on the fly. 1722 for c := lo; c <= hi; c++ { 1723 r = appendRange(r, c, c) 1724 f := unicode.SimpleFold(c) 1725 for f != c { 1726 r = appendRange(r, f, f) 1727 f = unicode.SimpleFold(f) 1728 } 1729 } 1730 return r 1731 } 1732 1733 // appendClass returns the result of appending the class x to the class r. 1734 // It assume x is clean. 1735 func appendClass(r []rune, x []rune) []rune { 1736 for i := 0; i < len(x); i += 2 { 1737 r = appendRange(r, x[i], x[i+1]) 1738 } 1739 return r 1740 } 1741 1742 // appendFolded returns the result of appending the case folding of the class x to the class r. 1743 func appendFoldedClass(r []rune, x []rune) []rune { 1744 for i := 0; i < len(x); i += 2 { 1745 r = appendFoldedRange(r, x[i], x[i+1]) 1746 } 1747 return r 1748 } 1749 1750 // appendNegatedClass returns the result of appending the negation of the class x to the class r. 1751 // It assumes x is clean. 1752 func appendNegatedClass(r []rune, x []rune) []rune { 1753 nextLo := '\u0000' 1754 for i := 0; i < len(x); i += 2 { 1755 lo, hi := x[i], x[i+1] 1756 if nextLo <= lo-1 { 1757 r = appendRange(r, nextLo, lo-1) 1758 } 1759 nextLo = hi + 1 1760 } 1761 if nextLo <= unicode.MaxRune { 1762 r = appendRange(r, nextLo, unicode.MaxRune) 1763 } 1764 return r 1765 } 1766 1767 // appendTable returns the result of appending x to the class r. 1768 func appendTable(r []rune, x *unicode.RangeTable) []rune { 1769 for _, xr := range x.R16 { 1770 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) 1771 if stride == 1 { 1772 r = appendRange(r, lo, hi) 1773 continue 1774 } 1775 for c := lo; c <= hi; c += stride { 1776 r = appendRange(r, c, c) 1777 } 1778 } 1779 for _, xr := range x.R32 { 1780 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) 1781 if stride == 1 { 1782 r = appendRange(r, lo, hi) 1783 continue 1784 } 1785 for c := lo; c <= hi; c += stride { 1786 r = appendRange(r, c, c) 1787 } 1788 } 1789 return r 1790 } 1791 1792 // appendNegatedTable returns the result of appending the negation of x to the class r. 1793 func appendNegatedTable(r []rune, x *unicode.RangeTable) []rune { 1794 nextLo := '\u0000' // lo end of next class to add 1795 for _, xr := range x.R16 { 1796 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) 1797 if stride == 1 { 1798 if nextLo <= lo-1 { 1799 r = appendRange(r, nextLo, lo-1) 1800 } 1801 nextLo = hi + 1 1802 continue 1803 } 1804 for c := lo; c <= hi; c += stride { 1805 if nextLo <= c-1 { 1806 r = appendRange(r, nextLo, c-1) 1807 } 1808 nextLo = c + 1 1809 } 1810 } 1811 for _, xr := range x.R32 { 1812 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) 1813 if stride == 1 { 1814 if nextLo <= lo-1 { 1815 r = appendRange(r, nextLo, lo-1) 1816 } 1817 nextLo = hi + 1 1818 continue 1819 } 1820 for c := lo; c <= hi; c += stride { 1821 if nextLo <= c-1 { 1822 r = appendRange(r, nextLo, c-1) 1823 } 1824 nextLo = c + 1 1825 } 1826 } 1827 if nextLo <= unicode.MaxRune { 1828 r = appendRange(r, nextLo, unicode.MaxRune) 1829 } 1830 return r 1831 } 1832 1833 // negateClass overwrites r and returns r's negation. 1834 // It assumes the class r is already clean. 1835 func negateClass(r []rune) []rune { 1836 nextLo := '\u0000' // lo end of next class to add 1837 w := 0 // write index 1838 for i := 0; i < len(r); i += 2 { 1839 lo, hi := r[i], r[i+1] 1840 if nextLo <= lo-1 { 1841 r[w] = nextLo 1842 r[w+1] = lo - 1 1843 w += 2 1844 } 1845 nextLo = hi + 1 1846 } 1847 r = r[:w] 1848 if nextLo <= unicode.MaxRune { 1849 // It's possible for the negation to have one more 1850 // range - this one - than the original class, so use append. 1851 r = append(r, nextLo, unicode.MaxRune) 1852 } 1853 return r 1854 } 1855 1856 // ranges implements sort.Interface on a []rune. 1857 // The choice of receiver type definition is strange 1858 // but avoids an allocation since we already have 1859 // a *[]rune. 1860 type ranges struct { 1861 p *[]rune 1862 } 1863 1864 func (ra ranges) Less(i, j int) bool { 1865 p := *ra.p 1866 i *= 2 1867 j *= 2 1868 return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1] 1869 } 1870 1871 func (ra ranges) Len() int { 1872 return len(*ra.p) / 2 1873 } 1874 1875 func (ra ranges) Swap(i, j int) { 1876 p := *ra.p 1877 i *= 2 1878 j *= 2 1879 p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1] 1880 } 1881 1882 func checkUTF8(s string) error { 1883 for s != "" { 1884 rune, size := utf8.DecodeRuneInString(s) 1885 if rune == utf8.RuneError && size == 1 { 1886 return &Error{Code: ErrInvalidUTF8, Expr: s} 1887 } 1888 s = s[size:] 1889 } 1890 return nil 1891 } 1892 1893 func nextRune(s string) (c rune, t string, err error) { 1894 c, size := utf8.DecodeRuneInString(s) 1895 if c == utf8.RuneError && size == 1 { 1896 return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s} 1897 } 1898 return c, s[size:], nil 1899 } 1900 1901 func isalnum(c rune) bool { 1902 return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' 1903 } 1904 1905 func unhex(c rune) rune { 1906 if '0' <= c && c <= '9' { 1907 return c - '0' 1908 } 1909 if 'a' <= c && c <= 'f' { 1910 return c - 'a' + 10 1911 } 1912 if 'A' <= c && c <= 'F' { 1913 return c - 'A' + 10 1914 } 1915 return -1 1916 }