github.com/spotify/syslog-redirector-golang@v0.0.0-20140320174030-4859f03d829a/src/pkg/regexp/syntax/parse.go (about) 1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package syntax 6 7 import ( 8 "sort" 9 "strings" 10 "unicode" 11 "unicode/utf8" 12 ) 13 14 // An Error describes a failure to parse a regular expression 15 // and gives the offending expression. 16 type Error struct { 17 Code ErrorCode 18 Expr string 19 } 20 21 func (e *Error) Error() string { 22 return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`" 23 } 24 25 // An ErrorCode describes a failure to parse a regular expression. 26 type ErrorCode string 27 28 const ( 29 // Unexpected error 30 ErrInternalError ErrorCode = "regexp/syntax: internal error" 31 32 // Parse errors 33 ErrInvalidCharClass ErrorCode = "invalid character class" 34 ErrInvalidCharRange ErrorCode = "invalid character class range" 35 ErrInvalidEscape ErrorCode = "invalid escape sequence" 36 ErrInvalidNamedCapture ErrorCode = "invalid named capture" 37 ErrInvalidPerlOp ErrorCode = "invalid or unsupported Perl syntax" 38 ErrInvalidRepeatOp ErrorCode = "invalid nested repetition operator" 39 ErrInvalidRepeatSize ErrorCode = "invalid repeat count" 40 ErrInvalidUTF8 ErrorCode = "invalid UTF-8" 41 ErrMissingBracket ErrorCode = "missing closing ]" 42 ErrMissingParen ErrorCode = "missing closing )" 43 ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator" 44 ErrTrailingBackslash ErrorCode = "trailing backslash at end of expression" 45 ErrUnexpectedParen ErrorCode = "unexpected )" 46 ) 47 48 func (e ErrorCode) String() string { 49 return string(e) 50 } 51 52 // Flags control the behavior of the parser and record information about regexp context. 
53 type Flags uint16 54 55 const ( 56 FoldCase Flags = 1 << iota // case-insensitive match 57 Literal // treat pattern as literal string 58 ClassNL // allow character classes like [^a-z] and [[:space:]] to match newline 59 DotNL // allow . to match newline 60 OneLine // treat ^ and $ as only matching at beginning and end of text 61 NonGreedy // make repetition operators default to non-greedy 62 PerlX // allow Perl extensions 63 UnicodeGroups // allow \p{Han}, \P{Han} for Unicode group and negation 64 WasDollar // regexp OpEndText was $, not \z 65 Simple // regexp contains no counted repetition 66 67 MatchNL = ClassNL | DotNL 68 69 Perl = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible 70 POSIX Flags = 0 // POSIX syntax 71 ) 72 73 // Pseudo-ops for parsing stack. 74 const ( 75 opLeftParen = opPseudo + iota 76 opVerticalBar 77 ) 78 79 type parser struct { 80 flags Flags // parse mode flags 81 stack []*Regexp // stack of parsed expressions 82 free *Regexp 83 numCap int // number of capturing groups seen 84 wholeRegexp string 85 tmpClass []rune // temporary char class work space 86 } 87 88 func (p *parser) newRegexp(op Op) *Regexp { 89 re := p.free 90 if re != nil { 91 p.free = re.Sub0[0] 92 *re = Regexp{} 93 } else { 94 re = new(Regexp) 95 } 96 re.Op = op 97 return re 98 } 99 100 func (p *parser) reuse(re *Regexp) { 101 re.Sub0[0] = p.free 102 p.free = re 103 } 104 105 // Parse stack manipulation. 106 107 // push pushes the regexp re onto the parse stack and returns the regexp. 108 func (p *parser) push(re *Regexp) *Regexp { 109 if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] { 110 // Single rune. 
111 if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) { 112 return nil 113 } 114 re.Op = OpLiteral 115 re.Rune = re.Rune[:1] 116 re.Flags = p.flags &^ FoldCase 117 } else if re.Op == OpCharClass && len(re.Rune) == 4 && 118 re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] && 119 unicode.SimpleFold(re.Rune[0]) == re.Rune[2] && 120 unicode.SimpleFold(re.Rune[2]) == re.Rune[0] || 121 re.Op == OpCharClass && len(re.Rune) == 2 && 122 re.Rune[0]+1 == re.Rune[1] && 123 unicode.SimpleFold(re.Rune[0]) == re.Rune[1] && 124 unicode.SimpleFold(re.Rune[1]) == re.Rune[0] { 125 // Case-insensitive rune like [Aa] or [Δδ]. 126 if p.maybeConcat(re.Rune[0], p.flags|FoldCase) { 127 return nil 128 } 129 130 // Rewrite as (case-insensitive) literal. 131 re.Op = OpLiteral 132 re.Rune = re.Rune[:1] 133 re.Flags = p.flags | FoldCase 134 } else { 135 // Incremental concatenation. 136 p.maybeConcat(-1, 0) 137 } 138 139 p.stack = append(p.stack, re) 140 return re 141 } 142 143 // maybeConcat implements incremental concatenation 144 // of literal runes into string nodes. The parser calls this 145 // before each push, so only the top fragment of the stack 146 // might need processing. Since this is called before a push, 147 // the topmost literal is no longer subject to operators like * 148 // (Otherwise ab* would turn into (ab)*.) 149 // If r >= 0 and there's a node left over, maybeConcat uses it 150 // to push r with the given flags. 151 // maybeConcat reports whether r was pushed. 152 func (p *parser) maybeConcat(r rune, flags Flags) bool { 153 n := len(p.stack) 154 if n < 2 { 155 return false 156 } 157 158 re1 := p.stack[n-1] 159 re2 := p.stack[n-2] 160 if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase { 161 return false 162 } 163 164 // Push re1 into re2. 165 re2.Rune = append(re2.Rune, re1.Rune...) 166 167 // Reuse re1 if possible. 
168 if r >= 0 { 169 re1.Rune = re1.Rune0[:1] 170 re1.Rune[0] = r 171 re1.Flags = flags 172 return true 173 } 174 175 p.stack = p.stack[:n-1] 176 p.reuse(re1) 177 return false // did not push r 178 } 179 180 // newLiteral returns a new OpLiteral Regexp with the given flags 181 func (p *parser) newLiteral(r rune, flags Flags) *Regexp { 182 re := p.newRegexp(OpLiteral) 183 re.Flags = flags 184 if flags&FoldCase != 0 { 185 r = minFoldRune(r) 186 } 187 re.Rune0[0] = r 188 re.Rune = re.Rune0[:1] 189 return re 190 } 191 192 // minFoldRune returns the minimum rune fold-equivalent to r. 193 func minFoldRune(r rune) rune { 194 if r < minFold || r > maxFold { 195 return r 196 } 197 min := r 198 r0 := r 199 for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) { 200 if min > r { 201 min = r 202 } 203 } 204 return min 205 } 206 207 // literal pushes a literal regexp for the rune r on the stack 208 // and returns that regexp. 209 func (p *parser) literal(r rune) { 210 p.push(p.newLiteral(r, p.flags)) 211 } 212 213 // op pushes a regexp with the given op onto the stack 214 // and returns that regexp. 215 func (p *parser) op(op Op) *Regexp { 216 re := p.newRegexp(op) 217 re.Flags = p.flags 218 return p.push(re) 219 } 220 221 // repeat replaces the top stack element with itself repeated according to op, min, max. 222 // before is the regexp suffix starting at the repetition operator. 223 // after is the regexp suffix following after the repetition operator. 224 // repeat returns an updated 'after' and an error, if any. 225 func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, error) { 226 flags := p.flags 227 if p.flags&PerlX != 0 { 228 if len(after) > 0 && after[0] == '?' { 229 after = after[1:] 230 flags ^= NonGreedy 231 } 232 if lastRepeat != "" { 233 // In Perl it is not allowed to stack repetition operators: 234 // a** is a syntax error, not a doubled star, and a++ means 235 // something else entirely, which we don't support! 
236 return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(after)]} 237 } 238 } 239 n := len(p.stack) 240 if n == 0 { 241 return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]} 242 } 243 sub := p.stack[n-1] 244 if sub.Op >= opPseudo { 245 return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]} 246 } 247 re := p.newRegexp(op) 248 re.Min = min 249 re.Max = max 250 re.Flags = flags 251 re.Sub = re.Sub0[:1] 252 re.Sub[0] = sub 253 p.stack[n-1] = re 254 return after, nil 255 } 256 257 // concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation. 258 func (p *parser) concat() *Regexp { 259 p.maybeConcat(-1, 0) 260 261 // Scan down to find pseudo-operator | or (. 262 i := len(p.stack) 263 for i > 0 && p.stack[i-1].Op < opPseudo { 264 i-- 265 } 266 subs := p.stack[i:] 267 p.stack = p.stack[:i] 268 269 // Empty concatenation is special case. 270 if len(subs) == 0 { 271 return p.push(p.newRegexp(OpEmptyMatch)) 272 } 273 274 return p.push(p.collapse(subs, OpConcat)) 275 } 276 277 // alternate replaces the top of the stack (above the topmost '(') with its alternation. 278 func (p *parser) alternate() *Regexp { 279 // Scan down to find pseudo-operator (. 280 // There are no | above (. 281 i := len(p.stack) 282 for i > 0 && p.stack[i-1].Op < opPseudo { 283 i-- 284 } 285 subs := p.stack[i:] 286 p.stack = p.stack[:i] 287 288 // Make sure top class is clean. 289 // All the others already are (see swapVerticalBar). 290 if len(subs) > 0 { 291 cleanAlt(subs[len(subs)-1]) 292 } 293 294 // Empty alternate is special case 295 // (shouldn't happen but easy to handle). 296 if len(subs) == 0 { 297 return p.push(p.newRegexp(OpNoMatch)) 298 } 299 300 return p.push(p.collapse(subs, OpAlternate)) 301 } 302 303 // cleanAlt cleans re for eventual inclusion in an alternation. 
304 func cleanAlt(re *Regexp) { 305 switch re.Op { 306 case OpCharClass: 307 re.Rune = cleanClass(&re.Rune) 308 if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune { 309 re.Rune = nil 310 re.Op = OpAnyChar 311 return 312 } 313 if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune { 314 re.Rune = nil 315 re.Op = OpAnyCharNotNL 316 return 317 } 318 if cap(re.Rune)-len(re.Rune) > 100 { 319 // re.Rune will not grow any more. 320 // Make a copy or inline to reclaim storage. 321 re.Rune = append(re.Rune0[:0], re.Rune...) 322 } 323 } 324 } 325 326 // collapse returns the result of applying op to sub. 327 // If sub contains op nodes, they all get hoisted up 328 // so that there is never a concat of a concat or an 329 // alternate of an alternate. 330 func (p *parser) collapse(subs []*Regexp, op Op) *Regexp { 331 if len(subs) == 1 { 332 return subs[0] 333 } 334 re := p.newRegexp(op) 335 re.Sub = re.Sub0[:0] 336 for _, sub := range subs { 337 if sub.Op == op { 338 re.Sub = append(re.Sub, sub.Sub...) 339 p.reuse(sub) 340 } else { 341 re.Sub = append(re.Sub, sub) 342 } 343 } 344 if op == OpAlternate { 345 re.Sub = p.factor(re.Sub, re.Flags) 346 if len(re.Sub) == 1 { 347 old := re 348 re = re.Sub[0] 349 p.reuse(old) 350 } 351 } 352 return re 353 } 354 355 // factor factors common prefixes from the alternation list sub. 356 // It returns a replacement list that reuses the same storage and 357 // frees (passes to p.reuse) any removed *Regexps. 358 // 359 // For example, 360 // ABC|ABD|AEF|BCX|BCY 361 // simplifies by literal prefix extraction to 362 // A(B(C|D)|EF)|BC(X|Y) 363 // which simplifies by character class introduction to 364 // A(B[CD]|EF)|BC[XY] 365 // 366 func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp { 367 if len(sub) < 2 { 368 return sub 369 } 370 371 // Round 1: Factor out common literal prefixes. 
372 var str []rune 373 var strflags Flags 374 start := 0 375 out := sub[:0] 376 for i := 0; i <= len(sub); i++ { 377 // Invariant: the Regexps that were in sub[0:start] have been 378 // used or marked for reuse, and the slice space has been reused 379 // for out (len(out) <= start). 380 // 381 // Invariant: sub[start:i] consists of regexps that all begin 382 // with str as modified by strflags. 383 var istr []rune 384 var iflags Flags 385 if i < len(sub) { 386 istr, iflags = p.leadingString(sub[i]) 387 if iflags == strflags { 388 same := 0 389 for same < len(str) && same < len(istr) && str[same] == istr[same] { 390 same++ 391 } 392 if same > 0 { 393 // Matches at least one rune in current range. 394 // Keep going around. 395 str = str[:same] 396 continue 397 } 398 } 399 } 400 401 // Found end of a run with common leading literal string: 402 // sub[start:i] all begin with str[0:len(str)], but sub[i] 403 // does not even begin with str[0]. 404 // 405 // Factor out common string and append factored expression to out. 406 if i == start { 407 // Nothing to do - run of length 0. 408 } else if i == start+1 { 409 // Just one: don't bother factoring. 410 out = append(out, sub[start]) 411 } else { 412 // Construct factored form: prefix(suffix1|suffix2|...) 413 prefix := p.newRegexp(OpLiteral) 414 prefix.Flags = strflags 415 prefix.Rune = append(prefix.Rune[:0], str...) 416 417 for j := start; j < i; j++ { 418 sub[j] = p.removeLeadingString(sub[j], len(str)) 419 } 420 suffix := p.collapse(sub[start:i], OpAlternate) // recurse 421 422 re := p.newRegexp(OpConcat) 423 re.Sub = append(re.Sub[:0], prefix, suffix) 424 out = append(out, re) 425 } 426 427 // Prepare for next iteration. 428 start = i 429 str = istr 430 strflags = iflags 431 } 432 sub = out 433 434 // Round 2: Factor out common complex prefixes, 435 // just the first piece of each concatenation, 436 // whatever it is. This is good enough a lot of the time. 
437 start = 0 438 out = sub[:0] 439 var first *Regexp 440 for i := 0; i <= len(sub); i++ { 441 // Invariant: the Regexps that were in sub[0:start] have been 442 // used or marked for reuse, and the slice space has been reused 443 // for out (len(out) <= start). 444 // 445 // Invariant: sub[start:i] consists of regexps that all begin with ifirst. 446 var ifirst *Regexp 447 if i < len(sub) { 448 ifirst = p.leadingRegexp(sub[i]) 449 if first != nil && first.Equal(ifirst) { 450 continue 451 } 452 } 453 454 // Found end of a run with common leading regexp: 455 // sub[start:i] all begin with first but sub[i] does not. 456 // 457 // Factor out common regexp and append factored expression to out. 458 if i == start { 459 // Nothing to do - run of length 0. 460 } else if i == start+1 { 461 // Just one: don't bother factoring. 462 out = append(out, sub[start]) 463 } else { 464 // Construct factored form: prefix(suffix1|suffix2|...) 465 prefix := first 466 for j := start; j < i; j++ { 467 reuse := j != start // prefix came from sub[start] 468 sub[j] = p.removeLeadingRegexp(sub[j], reuse) 469 } 470 suffix := p.collapse(sub[start:i], OpAlternate) // recurse 471 472 re := p.newRegexp(OpConcat) 473 re.Sub = append(re.Sub[:0], prefix, suffix) 474 out = append(out, re) 475 } 476 477 // Prepare for next iteration. 478 start = i 479 first = ifirst 480 } 481 sub = out 482 483 // Round 3: Collapse runs of single literals into character classes. 484 start = 0 485 out = sub[:0] 486 for i := 0; i <= len(sub); i++ { 487 // Invariant: the Regexps that were in sub[0:start] have been 488 // used or marked for reuse, and the slice space has been reused 489 // for out (len(out) <= start). 490 // 491 // Invariant: sub[start:i] consists of regexps that are either 492 // literal runes or character classes. 493 if i < len(sub) && isCharClass(sub[i]) { 494 continue 495 } 496 497 // sub[i] is not a char or char class; 498 // emit char class for sub[start:i]... 
499 if i == start { 500 // Nothing to do - run of length 0. 501 } else if i == start+1 { 502 out = append(out, sub[start]) 503 } else { 504 // Make new char class. 505 // Start with most complex regexp in sub[start]. 506 max := start 507 for j := start + 1; j < i; j++ { 508 if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) { 509 max = j 510 } 511 } 512 sub[start], sub[max] = sub[max], sub[start] 513 514 for j := start + 1; j < i; j++ { 515 mergeCharClass(sub[start], sub[j]) 516 p.reuse(sub[j]) 517 } 518 cleanAlt(sub[start]) 519 out = append(out, sub[start]) 520 } 521 522 // ... and then emit sub[i]. 523 if i < len(sub) { 524 out = append(out, sub[i]) 525 } 526 start = i + 1 527 } 528 sub = out 529 530 // Round 4: Collapse runs of empty matches into a single empty match. 531 start = 0 532 out = sub[:0] 533 for i := range sub { 534 if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch { 535 continue 536 } 537 out = append(out, sub[i]) 538 } 539 sub = out 540 541 return sub 542 } 543 544 // leadingString returns the leading literal string that re begins with. 545 // The string refers to storage in re or its children. 546 func (p *parser) leadingString(re *Regexp) ([]rune, Flags) { 547 if re.Op == OpConcat && len(re.Sub) > 0 { 548 re = re.Sub[0] 549 } 550 if re.Op != OpLiteral { 551 return nil, 0 552 } 553 return re.Rune, re.Flags & FoldCase 554 } 555 556 // removeLeadingString removes the first n leading runes 557 // from the beginning of re. It returns the replacement for re. 558 func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp { 559 if re.Op == OpConcat && len(re.Sub) > 0 { 560 // Removing a leading string in a concatenation 561 // might simplify the concatenation. 562 sub := re.Sub[0] 563 sub = p.removeLeadingString(sub, n) 564 re.Sub[0] = sub 565 if sub.Op == OpEmptyMatch { 566 p.reuse(sub) 567 switch len(re.Sub) { 568 case 0, 1: 569 // Impossible but handle. 
570 re.Op = OpEmptyMatch 571 re.Sub = nil 572 case 2: 573 old := re 574 re = re.Sub[1] 575 p.reuse(old) 576 default: 577 copy(re.Sub, re.Sub[1:]) 578 re.Sub = re.Sub[:len(re.Sub)-1] 579 } 580 } 581 return re 582 } 583 584 if re.Op == OpLiteral { 585 re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])] 586 if len(re.Rune) == 0 { 587 re.Op = OpEmptyMatch 588 } 589 } 590 return re 591 } 592 593 // leadingRegexp returns the leading regexp that re begins with. 594 // The regexp refers to storage in re or its children. 595 func (p *parser) leadingRegexp(re *Regexp) *Regexp { 596 if re.Op == OpEmptyMatch { 597 return nil 598 } 599 if re.Op == OpConcat && len(re.Sub) > 0 { 600 sub := re.Sub[0] 601 if sub.Op == OpEmptyMatch { 602 return nil 603 } 604 return sub 605 } 606 return re 607 } 608 609 // removeLeadingRegexp removes the leading regexp in re. 610 // It returns the replacement for re. 611 // If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse. 612 func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp { 613 if re.Op == OpConcat && len(re.Sub) > 0 { 614 if reuse { 615 p.reuse(re.Sub[0]) 616 } 617 re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])] 618 switch len(re.Sub) { 619 case 0: 620 re.Op = OpEmptyMatch 621 re.Sub = nil 622 case 1: 623 old := re 624 re = re.Sub[0] 625 p.reuse(old) 626 } 627 return re 628 } 629 if reuse { 630 p.reuse(re) 631 } 632 return p.newRegexp(OpEmptyMatch) 633 } 634 635 func literalRegexp(s string, flags Flags) *Regexp { 636 re := &Regexp{Op: OpLiteral} 637 re.Flags = flags 638 re.Rune = re.Rune0[:0] // use local storage for small strings 639 for _, c := range s { 640 if len(re.Rune) >= cap(re.Rune) { 641 // string is too long to fit in Rune0. let Go handle it 642 re.Rune = []rune(s) 643 break 644 } 645 re.Rune = append(re.Rune, c) 646 } 647 return re 648 } 649 650 // Parsing. 
651 652 // Parse parses a regular expression string s, controlled by the specified 653 // Flags, and returns a regular expression parse tree. The syntax is 654 // described in the top-level comment. 655 func Parse(s string, flags Flags) (*Regexp, error) { 656 if flags&Literal != 0 { 657 // Trivial parser for literal string. 658 if err := checkUTF8(s); err != nil { 659 return nil, err 660 } 661 return literalRegexp(s, flags), nil 662 } 663 664 // Otherwise, must do real work. 665 var ( 666 p parser 667 err error 668 c rune 669 op Op 670 lastRepeat string 671 min, max int 672 ) 673 p.flags = flags 674 p.wholeRegexp = s 675 t := s 676 for t != "" { 677 repeat := "" 678 BigSwitch: 679 switch t[0] { 680 default: 681 if c, t, err = nextRune(t); err != nil { 682 return nil, err 683 } 684 p.literal(c) 685 686 case '(': 687 if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' { 688 // Flag changes and non-capturing groups. 689 if t, err = p.parsePerlFlags(t); err != nil { 690 return nil, err 691 } 692 break 693 } 694 p.numCap++ 695 p.op(opLeftParen).Cap = p.numCap 696 t = t[1:] 697 case '|': 698 if err = p.parseVerticalBar(); err != nil { 699 return nil, err 700 } 701 t = t[1:] 702 case ')': 703 if err = p.parseRightParen(); err != nil { 704 return nil, err 705 } 706 t = t[1:] 707 case '^': 708 if p.flags&OneLine != 0 { 709 p.op(OpBeginText) 710 } else { 711 p.op(OpBeginLine) 712 } 713 t = t[1:] 714 case '$': 715 if p.flags&OneLine != 0 { 716 p.op(OpEndText).Flags |= WasDollar 717 } else { 718 p.op(OpEndLine) 719 } 720 t = t[1:] 721 case '.': 722 if p.flags&DotNL != 0 { 723 p.op(OpAnyChar) 724 } else { 725 p.op(OpAnyCharNotNL) 726 } 727 t = t[1:] 728 case '[': 729 if t, err = p.parseClass(t); err != nil { 730 return nil, err 731 } 732 case '*', '+', '?': 733 before := t 734 switch t[0] { 735 case '*': 736 op = OpStar 737 case '+': 738 op = OpPlus 739 case '?': 740 op = OpQuest 741 } 742 after := t[1:] 743 if after, err = p.repeat(op, min, max, before, after, lastRepeat); err 
!= nil { 744 return nil, err 745 } 746 repeat = before 747 t = after 748 case '{': 749 op = OpRepeat 750 before := t 751 min, max, after, ok := p.parseRepeat(t) 752 if !ok { 753 // If the repeat cannot be parsed, { is a literal. 754 p.literal('{') 755 t = t[1:] 756 break 757 } 758 if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max { 759 // Numbers were too big, or max is present and min > max. 760 return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]} 761 } 762 if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil { 763 return nil, err 764 } 765 repeat = before 766 t = after 767 case '\\': 768 if p.flags&PerlX != 0 && len(t) >= 2 { 769 switch t[1] { 770 case 'A': 771 p.op(OpBeginText) 772 t = t[2:] 773 break BigSwitch 774 case 'b': 775 p.op(OpWordBoundary) 776 t = t[2:] 777 break BigSwitch 778 case 'B': 779 p.op(OpNoWordBoundary) 780 t = t[2:] 781 break BigSwitch 782 case 'C': 783 // any byte; not supported 784 return nil, &Error{ErrInvalidEscape, t[:2]} 785 case 'Q': 786 // \Q ... \E: the ... is always literals 787 var lit string 788 if i := strings.Index(t, `\E`); i < 0 { 789 lit = t[2:] 790 t = "" 791 } else { 792 lit = t[2:i] 793 t = t[i+2:] 794 } 795 p.push(literalRegexp(lit, p.flags)) 796 break BigSwitch 797 case 'z': 798 p.op(OpEndText) 799 t = t[2:] 800 break BigSwitch 801 } 802 } 803 804 re := p.newRegexp(OpCharClass) 805 re.Flags = p.flags 806 807 // Look for Unicode character group like \p{Han} 808 if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') { 809 r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0]) 810 if err != nil { 811 return nil, err 812 } 813 if r != nil { 814 re.Rune = r 815 t = rest 816 p.push(re) 817 break BigSwitch 818 } 819 } 820 821 // Perl character class escape. 822 if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil { 823 re.Rune = r 824 t = rest 825 p.push(re) 826 break BigSwitch 827 } 828 p.reuse(re) 829 830 // Ordinary single-character escape. 
831 if c, t, err = p.parseEscape(t); err != nil { 832 return nil, err 833 } 834 p.literal(c) 835 } 836 lastRepeat = repeat 837 } 838 839 p.concat() 840 if p.swapVerticalBar() { 841 // pop vertical bar 842 p.stack = p.stack[:len(p.stack)-1] 843 } 844 p.alternate() 845 846 n := len(p.stack) 847 if n != 1 { 848 return nil, &Error{ErrMissingParen, s} 849 } 850 return p.stack[0], nil 851 } 852 853 // parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}. 854 // If s is not of that form, it returns ok == false. 855 // If s has the right form but the values are too big, it returns min == -1, ok == true. 856 func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) { 857 if s == "" || s[0] != '{' { 858 return 859 } 860 s = s[1:] 861 var ok1 bool 862 if min, s, ok1 = p.parseInt(s); !ok1 { 863 return 864 } 865 if s == "" { 866 return 867 } 868 if s[0] != ',' { 869 max = min 870 } else { 871 s = s[1:] 872 if s == "" { 873 return 874 } 875 if s[0] == '}' { 876 max = -1 877 } else if max, s, ok1 = p.parseInt(s); !ok1 { 878 return 879 } else if max < 0 { 880 // parseInt found too big a number 881 min = -1 882 } 883 } 884 if s == "" || s[0] != '}' { 885 return 886 } 887 rest = s[1:] 888 ok = true 889 return 890 } 891 892 // parsePerlFlags parses a Perl flag setting or non-capturing group or both, 893 // like (?i) or (?: or (?i:. It removes the prefix from s and updates the parse state. 894 // The caller must have ensured that s begins with "(?". 895 func (p *parser) parsePerlFlags(s string) (rest string, err error) { 896 t := s 897 898 // Check for named captures, first introduced in Python's regexp library. 
899 // As usual, there are three slightly different syntaxes: 900 // 901 // (?P<name>expr) the original, introduced by Python 902 // (?<name>expr) the .NET alteration, adopted by Perl 5.10 903 // (?'name'expr) another .NET alteration, adopted by Perl 5.10 904 // 905 // Perl 5.10 gave in and implemented the Python version too, 906 // but they claim that the last two are the preferred forms. 907 // PCRE and languages based on it (specifically, PHP and Ruby) 908 // support all three as well. EcmaScript 4 uses only the Python form. 909 // 910 // In both the open source world (via Code Search) and the 911 // Google source tree, (?P<expr>name) is the dominant form, 912 // so that's the one we implement. One is enough. 913 if len(t) > 4 && t[2] == 'P' && t[3] == '<' { 914 // Pull out name. 915 end := strings.IndexRune(t, '>') 916 if end < 0 { 917 if err = checkUTF8(t); err != nil { 918 return "", err 919 } 920 return "", &Error{ErrInvalidNamedCapture, s} 921 } 922 923 capture := t[:end+1] // "(?P<name>" 924 name := t[4:end] // "name" 925 if err = checkUTF8(name); err != nil { 926 return "", err 927 } 928 if !isValidCaptureName(name) { 929 return "", &Error{ErrInvalidNamedCapture, capture} 930 } 931 932 // Like ordinary capture, but named. 933 p.numCap++ 934 re := p.op(opLeftParen) 935 re.Cap = p.numCap 936 re.Name = name 937 return t[end+1:], nil 938 } 939 940 // Non-capturing group. Might also twiddle Perl flags. 941 var c rune 942 t = t[2:] // skip (? 943 flags := p.flags 944 sign := +1 945 sawFlag := false 946 Loop: 947 for t != "" { 948 if c, t, err = nextRune(t); err != nil { 949 return "", err 950 } 951 switch c { 952 default: 953 break Loop 954 955 // Flags. 956 case 'i': 957 flags |= FoldCase 958 sawFlag = true 959 case 'm': 960 flags &^= OneLine 961 sawFlag = true 962 case 's': 963 flags |= DotNL 964 sawFlag = true 965 case 'U': 966 flags |= NonGreedy 967 sawFlag = true 968 969 // Switch to negation. 
970 case '-': 971 if sign < 0 { 972 break Loop 973 } 974 sign = -1 975 // Invert flags so that | above turn into &^ and vice versa. 976 // We'll invert flags again before using it below. 977 flags = ^flags 978 sawFlag = false 979 980 // End of flags, starting group or not. 981 case ':', ')': 982 if sign < 0 { 983 if !sawFlag { 984 break Loop 985 } 986 flags = ^flags 987 } 988 if c == ':' { 989 // Open new group 990 p.op(opLeftParen) 991 } 992 p.flags = flags 993 return t, nil 994 } 995 } 996 997 return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]} 998 } 999 1000 // isValidCaptureName reports whether name 1001 // is a valid capture name: [A-Za-z0-9_]+. 1002 // PCRE limits names to 32 bytes. 1003 // Python rejects names starting with digits. 1004 // We don't enforce either of those. 1005 func isValidCaptureName(name string) bool { 1006 if name == "" { 1007 return false 1008 } 1009 for _, c := range name { 1010 if c != '_' && !isalnum(c) { 1011 return false 1012 } 1013 } 1014 return true 1015 } 1016 1017 // parseInt parses a decimal integer. 1018 func (p *parser) parseInt(s string) (n int, rest string, ok bool) { 1019 if s == "" || s[0] < '0' || '9' < s[0] { 1020 return 1021 } 1022 // Disallow leading zeros. 1023 if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' { 1024 return 1025 } 1026 t := s 1027 for s != "" && '0' <= s[0] && s[0] <= '9' { 1028 s = s[1:] 1029 } 1030 rest = s 1031 ok = true 1032 // Have digits, compute value. 1033 t = t[:len(t)-len(s)] 1034 for i := 0; i < len(t); i++ { 1035 // Avoid overflow. 1036 if n >= 1e8 { 1037 n = -1 1038 break 1039 } 1040 n = n*10 + int(t[i]) - '0' 1041 } 1042 return 1043 } 1044 1045 // can this be represented as a character class? 1046 // single-rune literal string, char class, ., and .|\n. 1047 func isCharClass(re *Regexp) bool { 1048 return re.Op == OpLiteral && len(re.Rune) == 1 || 1049 re.Op == OpCharClass || 1050 re.Op == OpAnyCharNotNL || 1051 re.Op == OpAnyChar 1052 } 1053 1054 // does re match r? 
1055 func matchRune(re *Regexp, r rune) bool { 1056 switch re.Op { 1057 case OpLiteral: 1058 return len(re.Rune) == 1 && re.Rune[0] == r 1059 case OpCharClass: 1060 for i := 0; i < len(re.Rune); i += 2 { 1061 if re.Rune[i] <= r && r <= re.Rune[i+1] { 1062 return true 1063 } 1064 } 1065 return false 1066 case OpAnyCharNotNL: 1067 return r != '\n' 1068 case OpAnyChar: 1069 return true 1070 } 1071 return false 1072 } 1073 1074 // parseVerticalBar handles a | in the input. 1075 func (p *parser) parseVerticalBar() error { 1076 p.concat() 1077 1078 // The concatenation we just parsed is on top of the stack. 1079 // If it sits above an opVerticalBar, swap it below 1080 // (things below an opVerticalBar become an alternation). 1081 // Otherwise, push a new vertical bar. 1082 if !p.swapVerticalBar() { 1083 p.op(opVerticalBar) 1084 } 1085 1086 return nil 1087 } 1088 1089 // mergeCharClass makes dst = dst|src. 1090 // The caller must ensure that dst.Op >= src.Op, 1091 // to reduce the amount of copying. 1092 func mergeCharClass(dst, src *Regexp) { 1093 switch dst.Op { 1094 case OpAnyChar: 1095 // src doesn't add anything. 1096 case OpAnyCharNotNL: 1097 // src might add \n 1098 if matchRune(src, '\n') { 1099 dst.Op = OpAnyChar 1100 } 1101 case OpCharClass: 1102 // src is simpler, so either literal or char class 1103 if src.Op == OpLiteral { 1104 dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags) 1105 } else { 1106 dst.Rune = appendClass(dst.Rune, src.Rune) 1107 } 1108 case OpLiteral: 1109 // both literal 1110 if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags { 1111 break 1112 } 1113 dst.Op = OpCharClass 1114 dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags) 1115 dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags) 1116 } 1117 } 1118 1119 // If the top of the stack is an element followed by an opVerticalBar 1120 // swapVerticalBar swaps the two and returns true. 1121 // Otherwise it returns false. 
1122 func (p *parser) swapVerticalBar() bool { 1123 // If above and below vertical bar are literal or char class, 1124 // can merge into a single char class. 1125 n := len(p.stack) 1126 if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) { 1127 re1 := p.stack[n-1] 1128 re3 := p.stack[n-3] 1129 // Make re3 the more complex of the two. 1130 if re1.Op > re3.Op { 1131 re1, re3 = re3, re1 1132 p.stack[n-3] = re3 1133 } 1134 mergeCharClass(re3, re1) 1135 p.reuse(re1) 1136 p.stack = p.stack[:n-1] 1137 return true 1138 } 1139 1140 if n >= 2 { 1141 re1 := p.stack[n-1] 1142 re2 := p.stack[n-2] 1143 if re2.Op == opVerticalBar { 1144 if n >= 3 { 1145 // Now out of reach. 1146 // Clean opportunistically. 1147 cleanAlt(p.stack[n-3]) 1148 } 1149 p.stack[n-2] = re1 1150 p.stack[n-1] = re2 1151 return true 1152 } 1153 } 1154 return false 1155 } 1156 1157 // parseRightParen handles a ) in the input. 1158 func (p *parser) parseRightParen() error { 1159 p.concat() 1160 if p.swapVerticalBar() { 1161 // pop vertical bar 1162 p.stack = p.stack[:len(p.stack)-1] 1163 } 1164 p.alternate() 1165 1166 n := len(p.stack) 1167 if n < 2 { 1168 return &Error{ErrUnexpectedParen, p.wholeRegexp} 1169 } 1170 re1 := p.stack[n-1] 1171 re2 := p.stack[n-2] 1172 p.stack = p.stack[:n-2] 1173 if re2.Op != opLeftParen { 1174 return &Error{ErrUnexpectedParen, p.wholeRegexp} 1175 } 1176 // Restore flags at time of paren. 1177 p.flags = re2.Flags 1178 if re2.Cap == 0 { 1179 // Just for grouping. 1180 p.push(re1) 1181 } else { 1182 re2.Op = OpCapture 1183 re2.Sub = re2.Sub0[:1] 1184 re2.Sub[0] = re1 1185 p.push(re2) 1186 } 1187 return nil 1188 } 1189 1190 // parseEscape parses an escape sequence at the beginning of s 1191 // and returns the rune. 
func (p *parser) parseEscape(s string) (r rune, rest string, err error) {
	// Skip the first byte of s; parseClassChar only calls in when that
	// byte is a backslash.
	t := s[1:]
	if t == "" {
		return 0, "", &Error{ErrTrailingBackslash, ""}
	}
	c, t, err := nextRune(t)
	if err != nil {
		return 0, "", err
	}

	// Each case below either returns the decoded rune together with the
	// unconsumed text, or leaves the switch (plain break, break Switch, or
	// running off the end of a case) and so reaches the ErrInvalidEscape
	// return at the bottom of the function.
Switch:
	switch c {
	default:
		if c < utf8.RuneSelf && !isalnum(c) {
			// Escaped non-word characters are always themselves.
			// PCRE is not quite so rigorous: it accepts things like
			// \q, but we don't. We once rejected \_, but too many
			// programs and people insist on using it, so allow \_.
			return c, t, nil
		}

	// Octal escapes.
	case '1', '2', '3', '4', '5', '6', '7':
		// Single non-zero digit is a backreference; not supported
		if t == "" || t[0] < '0' || t[0] > '7' {
			break
		}
		// At least two octal digits: share the decoding loop with \0.
		fallthrough
	case '0':
		// Consume up to three octal digits; already have one.
		r = c - '0'
		for i := 1; i < 3; i++ {
			if t == "" || t[0] < '0' || t[0] > '7' {
				// Ends the digit loop only, not the switch.
				break
			}
			r = r*8 + rune(t[0]) - '0'
			t = t[1:]
		}
		return r, t, nil

	// Hexadecimal escapes.
	case 'x':
		if t == "" {
			break
		}
		if c, t, err = nextRune(t); err != nil {
			return 0, "", err
		}
		if c == '{' {
			// Any number of digits in braces.
			// Perl accepts any text at all; it ignores all text
			// after the first non-hex digit. We require only hex digits,
			// and at least one.
			nhex := 0
			r = 0
			for {
				if t == "" {
					// Ran out of input before the closing brace.
					break Switch
				}
				if c, t, err = nextRune(t); err != nil {
					return 0, "", err
				}
				if c == '}' {
					break
				}
				v := unhex(c)
				if v < 0 {
					break Switch
				}
				r = r*16 + v
				// Checked after every digit, so r is rejected as soon as
				// it passes unicode.MaxRune.
				if r > unicode.MaxRune {
					break Switch
				}
				nhex++
			}
			if nhex == 0 {
				break Switch
			}
			return r, t, nil
		}

		// Easy case: two hex digits.
		// c already holds the first digit; read the second one.
		x := unhex(c)
		if c, t, err = nextRune(t); err != nil {
			return 0, "", err
		}
		y := unhex(c)
		if x < 0 || y < 0 {
			break
		}
		return x*16 + y, t, nil

	// C escapes. There is no case 'b', to avoid misparsing
	// the Perl word-boundary \b as the C backspace \b
	// when in POSIX mode. In Perl, /\b/ means word-boundary
	// but /[\b]/ means backspace. We don't support that.
	// If you want a backspace, embed a literal backspace
	// character or use \x08.
	// err is nil in all of these returns: a non-nil err from the
	// nextRune call above has already been returned.
	case 'a':
		return '\a', t, err
	case 'f':
		return '\f', t, err
	case 'n':
		return '\n', t, err
	case 'r':
		return '\r', t, err
	case 't':
		return '\t', t, err
	case 'v':
		return '\v', t, err
	}
	// Report the part of s consumed so far as the offending escape.
	return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]}
}

// parseClassChar parses a character class character at the beginning of s
// and returns it along with the remainder of s.
// wholeClass is used only as the expression text of the ErrMissingBracket
// error returned when s is empty.
func (p *parser) parseClassChar(s, wholeClass string) (r rune, rest string, err error) {
	if s == "" {
		return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass}
	}

	// Allow regular escape sequences even though
	// many need not be escaped in this context.
	if s[0] == '\\' {
		return p.parseEscape(s)
	}

	return nextRune(s)
}

// charGroup is a character class together with a sign.
// appendGroup appends the negation of class when sign < 0 and class itself
// otherwise; parsePerlClassEscape and parseNamedClass treat a group whose
// sign is 0 as "no such group".
type charGroup struct {
	sign  int
	class []rune
}

// parsePerlClassEscape parses a leading Perl character class escape like \d
// from the beginning of s. If one is present, it appends the characters to r
// and returns the new slice r and the remainder of the string.
func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string) {
	// Only attempted with the PerlX flag set and when s holds a backslash
	// plus at least one more byte. The bare returns yield (nil, ""), which
	// parseClass reads as "no Perl class here".
	if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' {
		return
	}
	// perlGroup is declared elsewhere in the package; presumably it is keyed
	// by the two-byte escape text. A zero sign is treated as "not found".
	g := perlGroup[s[0:2]]
	if g.sign == 0 {
		return
	}
	return p.appendGroup(r, g), s[2:]
}

// parseNamedClass parses a leading POSIX named character class like [:alnum:]
// from the beginning of s. If one is present, it appends the characters to r
// and returns the new slice r and the remainder of the string.
// If s does not start with "[:" or has no closing ":]", all results are zero.
// A bracketed name that posixGroup does not resolve (sign == 0) yields an
// ErrInvalidCharRange error carrying the name, delimiters included.
func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error) {
	if len(s) < 2 || s[0] != '[' || s[1] != ':' {
		return
	}

	i := strings.Index(s[2:], ":]")
	if i < 0 {
		return
	}
	// The search skipped the first two bytes; make i an index into s.
	i += 2
	// name keeps its "[:" and ":]" delimiters; s becomes the text after them.
	name, s := s[0:i+2], s[i+2:]
	g := posixGroup[name]
	if g.sign == 0 {
		return nil, "", &Error{ErrInvalidCharRange, name}
	}
	return p.appendGroup(r, g), s, nil
}

// appendGroup appends the class g.class, or its negation when g.sign < 0,
// to r and returns the result. When the FoldCase flag is set, g.class is
// first case-folded into the scratch buffer p.tmpClass and cleaned, and the
// cleaned class is what gets appended or negated.
func (p *parser) appendGroup(r []rune, g charGroup) []rune {
	if p.flags&FoldCase == 0 {
		if g.sign < 0 {
			r = appendNegatedClass(r, g.class)
		} else {
			r = appendClass(r, g.class)
		}
	} else {
		// Reuse the parser's scratch slice; store it back before cleaning
		// so that cleanClass sorts the slice it is handed a pointer to.
		tmp := p.tmpClass[:0]
		tmp = appendFoldedClass(tmp, g.class)
		p.tmpClass = tmp
		tmp = cleanClass(&p.tmpClass)
		if g.sign < 0 {
			r = appendNegatedClass(r, tmp)
		} else {
			r = appendClass(r, tmp)
		}
	}
	return r
}

// anyTable covers every code point from 0 through unicode.MaxRune:
// R16 holds 0 through 1<<16 - 1 and R32 holds the rest.
var anyTable = &unicode.RangeTable{
	R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}},
	R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}},
}

// unicodeTable returns the unicode.RangeTable identified by name
// and the table of additional fold-equivalent code points.
// The name is looked up in unicode.Categories first and unicode.Scripts
// second. Both results are nil when the name is not found; the second result
// is whatever unicode.FoldCategory or unicode.FoldScript holds for the name,
// and parseUnicodeClass checks it for nil.
func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {
	// Special case: "Any" means any.
	if name == "Any" {
		return anyTable, anyTable
	}
	if t := unicode.Categories[name]; t != nil {
		return t, unicode.FoldCategory[name]
	}
	if t := unicode.Scripts[name]; t != nil {
		return t, unicode.FoldScript[name]
	}
	return nil, nil
}

// parseUnicodeClass parses a leading Unicode character class like \p{Han}
// from the beginning of s. If one is present, it appends the characters to r
// and returns the new slice r and the remainder of the string.
// Without the UnicodeGroups flag, or when s does not start with \p or \P,
// all results are zero. An unknown name or a missing closing brace yields
// an ErrInvalidCharRange error.
func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error) {
	// && binds tighter than ||: the last clause rejects any second byte
	// other than 'p' or 'P'.
	if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' {
		return
	}

	// Committed to parse or return error.
	sign := +1
	if s[1] == 'P' {
		sign = -1
	}
	t := s[2:]
	c, t, err := nextRune(t)
	if err != nil {
		return
	}
	// seq is the text of the whole escape (used in error messages);
	// name is the group name inside it.
	var seq, name string
	if c != '{' {
		// Single-letter name.
		seq = s[:len(s)-len(t)]
		name = seq[2:]
	} else {
		// Name is in braces.
		end := strings.IndexRune(s, '}')
		if end < 0 {
			// Invalid UTF-8 takes precedence over the missing brace.
			if err = checkUTF8(s); err != nil {
				return
			}
			return nil, "", &Error{ErrInvalidCharRange, s}
		}
		seq, t = s[:end+1], s[end+1:]
		// Skip the three bytes of `\p{` (or `\P{`).
		name = s[3:end]
		if err = checkUTF8(name); err != nil {
			return
		}
	}

	// Group can have leading negation too. \p{^Han} == \P{Han}, \P{^Han} == \p{Han}.
	if name != "" && name[0] == '^' {
		sign = -sign
		name = name[1:]
	}

	tab, fold := unicodeTable(name)
	if tab == nil {
		return nil, "", &Error{ErrInvalidCharRange, seq}
	}

	if p.flags&FoldCase == 0 || fold == nil {
		if sign > 0 {
			r = appendTable(r, tab)
		} else {
			r = appendNegatedTable(r, tab)
		}
	} else {
		// Merge and clean tab and fold in a temporary buffer.
		// This is necessary for the negative case and just tidy
		// for the positive case.
		tmp := p.tmpClass[:0]
		tmp = appendTable(tmp, tab)
		tmp = appendTable(tmp, fold)
		p.tmpClass = tmp
		tmp = cleanClass(&p.tmpClass)
		if sign > 0 {
			r = appendClass(r, tmp)
		} else {
			r = appendNegatedClass(r, tmp)
		}
	}
	return r, t, nil
}

// parseClass parses a character class at the beginning of s
// and pushes it onto the parse stack.
// The first byte of s is skipped as the opening '['. On success parseClass
// returns the text following the closing ']'.
func (p *parser) parseClass(s string) (rest string, err error) {
	t := s[1:] // chop [
	re := p.newRegexp(OpCharClass)
	re.Flags = p.flags
	re.Rune = re.Rune0[:0]

	sign := +1
	if t != "" && t[0] == '^' {
		sign = -1
		t = t[1:]

		// If character class does not match \n, add it here,
		// so that negation later will do the right thing.
		if p.flags&ClassNL == 0 {
			re.Rune = append(re.Rune, '\n', '\n')
		}
	}

	class := re.Rune
	first := true // ] and - are okay as first char in class
	// The loop also runs when t is empty: parseClassChar below then
	// reports the missing closing bracket.
	for t == "" || t[0] != ']' || first {
		// POSIX: - is only okay unescaped as first or last in class.
		// Perl: - is okay anywhere.
		if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') {
			// The error text is the '-' plus the rune that follows it.
			_, size := utf8.DecodeRuneInString(t[1:])
			return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]}
		}
		first = false

		// Look for POSIX [:alnum:] etc.
		if len(t) > 2 && t[0] == '[' && t[1] == ':' {
			nclass, nt, err := p.parseNamedClass(t, class)
			if err != nil {
				return "", err
			}
			// A nil result means no named class was found;
			// fall through to the other forms.
			if nclass != nil {
				class, t = nclass, nt
				continue
			}
		}

		// Look for Unicode character group like \p{Han}.
		nclass, nt, err := p.parseUnicodeClass(t, class)
		if err != nil {
			return "", err
		}
		if nclass != nil {
			class, t = nclass, nt
			continue
		}

		// Look for Perl character class symbols (extension).
		if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil {
			class, t = nclass, nt
			continue
		}

		// Single character or simple range.
		// rng remembers where this item starts, for the error message.
		rng := t
		var lo, hi rune
		if lo, t, err = p.parseClassChar(t, s); err != nil {
			return "", err
		}
		hi = lo
		// [a-] means (a|-) so check for final ].
		if len(t) >= 2 && t[0] == '-' && t[1] != ']' {
			t = t[1:]
			if hi, t, err = p.parseClassChar(t, s); err != nil {
				return "", err
			}
			if hi < lo {
				// Trim rng to exactly the text of the bad range.
				rng = rng[:len(rng)-len(t)]
				return "", &Error{Code: ErrInvalidCharRange, Expr: rng}
			}
		}
		if p.flags&FoldCase == 0 {
			class = appendRange(class, lo, hi)
		} else {
			class = appendFoldedRange(class, lo, hi)
		}
	}
	t = t[1:] // chop ]

	// Use &re.Rune instead of &class to avoid allocation.
	re.Rune = class
	class = cleanClass(&re.Rune)
	if sign < 0 {
		class = negateClass(class)
	}
	re.Rune = class
	p.push(re)
	return t, nil
}

// cleanClass sorts the ranges (pairs of elements of r),
// merges them, and eliminates duplicates.
// The work is done in place in the slice *rp points to. The cleaned class is
// returned as a prefix of that slice; *rp itself is not re-sliced, so callers
// use the return value.
func cleanClass(rp *[]rune) []rune {

	// Sort by lo increasing, hi decreasing to break ties.
	sort.Sort(ranges{rp})

	r := *rp
	if len(r) < 2 {
		return r
	}

	// Merge abutting, overlapping.
	// The first pair is always kept, so both indexes start at 2.
	w := 2 // write index
	for i := 2; i < len(r); i += 2 {
		lo, hi := r[i], r[i+1]
		// r[w-1] is the hi of the last range written so far.
		if lo <= r[w-1]+1 {
			// merge with previous range
			if hi > r[w-1] {
				r[w-1] = hi
			}
			continue
		}
		// new disjoint range
		r[w] = lo
		r[w+1] = hi
		w += 2
	}

	return r[:w]
}

// appendLiteral returns the result of appending the literal x to the class r.
// When flags has FoldCase set, the case-folded equivalents of x are
// appended too.
func appendLiteral(r []rune, x rune, flags Flags) []rune {
	if flags&FoldCase != 0 {
		return appendFoldedRange(r, x, x)
	}
	return appendRange(r, x, x)
}

// appendRange returns the result of appending the range lo-hi to the class r.
// If the range overlaps or abuts one of the last two ranges of r, that range
// is widened in place and r is returned with its length unchanged.
func appendRange(r []rune, lo, hi rune) []rune {
	// Expand last range or next to last range if it overlaps or abuts.
	// Checking two ranges helps when appending case-folded
	// alphabets, so that one range can be expanding A-Z and the
	// other expanding a-z.
	n := len(r)
	for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4
		if n >= i {
			rlo, rhi := r[n-i], r[n-i+1]
			if lo <= rhi+1 && rlo <= hi+1 {
				if lo < rlo {
					r[n-i] = lo
				}
				if hi > rhi {
					r[n-i+1] = hi
				}
				return r
			}
		}
	}

	return append(r, lo, hi)
}

const (
	// minimum and maximum runes involved in folding.
	// checked during test.
	minFold = 0x0041
	maxFold = 0x1044f
)

// appendFoldedRange returns the result of appending the range lo-hi
// and its case folding-equivalent runes to the class r.
func appendFoldedRange(r []rune, lo, hi rune) []rune {
	// Optimizations.
	if lo <= minFold && hi >= maxFold {
		// Range is full: folding can't add more.
		return appendRange(r, lo, hi)
	}
	if hi < minFold || lo > maxFold {
		// Range is outside folding possibilities.
		return appendRange(r, lo, hi)
	}
	if lo < minFold {
		// [lo, minFold-1] needs no folding.
		r = appendRange(r, lo, minFold-1)
		lo = minFold
	}
	if hi > maxFold {
		// [maxFold+1, hi] needs no folding.
		r = appendRange(r, maxFold+1, hi)
		hi = maxFold
	}

	// Brute force. Depend on appendRange to coalesce ranges on the fly.
	for c := lo; c <= hi; c++ {
		r = appendRange(r, c, c)
		// Follow the unicode.SimpleFold chain from c until it returns
		// to c, adding each rune on the way.
		f := unicode.SimpleFold(c)
		for f != c {
			r = appendRange(r, f, f)
			f = unicode.SimpleFold(f)
		}
	}
	return r
}

// appendClass returns the result of appending the class x to the class r.
// It assumes x is clean.
func appendClass(r []rune, x []rune) []rune {
	for i := 0; i < len(x); i += 2 {
		r = appendRange(r, x[i], x[i+1])
	}
	return r
}

// appendFoldedClass returns the result of appending the case folding of the class x to the class r.
func appendFoldedClass(r []rune, x []rune) []rune {
	for i := 0; i < len(x); i += 2 {
		r = appendFoldedRange(r, x[i], x[i+1])
	}
	return r
}

// appendNegatedClass returns the result of appending the negation of the class x to the class r.
// It assumes x is clean.
func appendNegatedClass(r []rune, x []rune) []rune {
	// nextLo is the low end of the next gap to emit.
	nextLo := '\u0000'
	for i := 0; i < len(x); i += 2 {
		lo, hi := x[i], x[i+1]
		// Emit the gap below this range, if it is not empty.
		if nextLo <= lo-1 {
			r = appendRange(r, nextLo, lo-1)
		}
		nextLo = hi + 1
	}
	// Emit the gap above the last range, up to unicode.MaxRune.
	if nextLo <= unicode.MaxRune {
		r = appendRange(r, nextLo, unicode.MaxRune)
	}
	return r
}

// appendTable returns the result of appending x to the class r.
1716 func appendTable(r []rune, x *unicode.RangeTable) []rune { 1717 for _, xr := range x.R16 { 1718 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) 1719 if stride == 1 { 1720 r = appendRange(r, lo, hi) 1721 continue 1722 } 1723 for c := lo; c <= hi; c += stride { 1724 r = appendRange(r, c, c) 1725 } 1726 } 1727 for _, xr := range x.R32 { 1728 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) 1729 if stride == 1 { 1730 r = appendRange(r, lo, hi) 1731 continue 1732 } 1733 for c := lo; c <= hi; c += stride { 1734 r = appendRange(r, c, c) 1735 } 1736 } 1737 return r 1738 } 1739 1740 // appendNegatedTable returns the result of appending the negation of x to the class r. 1741 func appendNegatedTable(r []rune, x *unicode.RangeTable) []rune { 1742 nextLo := '\u0000' // lo end of next class to add 1743 for _, xr := range x.R16 { 1744 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) 1745 if stride == 1 { 1746 if nextLo <= lo-1 { 1747 r = appendRange(r, nextLo, lo-1) 1748 } 1749 nextLo = hi + 1 1750 continue 1751 } 1752 for c := lo; c <= hi; c += stride { 1753 if nextLo <= c-1 { 1754 r = appendRange(r, nextLo, c-1) 1755 } 1756 nextLo = c + 1 1757 } 1758 } 1759 for _, xr := range x.R32 { 1760 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) 1761 if stride == 1 { 1762 if nextLo <= lo-1 { 1763 r = appendRange(r, nextLo, lo-1) 1764 } 1765 nextLo = hi + 1 1766 continue 1767 } 1768 for c := lo; c <= hi; c += stride { 1769 if nextLo <= c-1 { 1770 r = appendRange(r, nextLo, c-1) 1771 } 1772 nextLo = c + 1 1773 } 1774 } 1775 if nextLo <= unicode.MaxRune { 1776 r = appendRange(r, nextLo, unicode.MaxRune) 1777 } 1778 return r 1779 } 1780 1781 // negateClass overwrites r and returns r's negation. 1782 // It assumes the class r is already clean. 
1783 func negateClass(r []rune) []rune { 1784 nextLo := '\u0000' // lo end of next class to add 1785 w := 0 // write index 1786 for i := 0; i < len(r); i += 2 { 1787 lo, hi := r[i], r[i+1] 1788 if nextLo <= lo-1 { 1789 r[w] = nextLo 1790 r[w+1] = lo - 1 1791 w += 2 1792 } 1793 nextLo = hi + 1 1794 } 1795 r = r[:w] 1796 if nextLo <= unicode.MaxRune { 1797 // It's possible for the negation to have one more 1798 // range - this one - than the original class, so use append. 1799 r = append(r, nextLo, unicode.MaxRune) 1800 } 1801 return r 1802 } 1803 1804 // ranges implements sort.Interface on a []rune. 1805 // The choice of receiver type definition is strange 1806 // but avoids an allocation since we already have 1807 // a *[]rune. 1808 type ranges struct { 1809 p *[]rune 1810 } 1811 1812 func (ra ranges) Less(i, j int) bool { 1813 p := *ra.p 1814 i *= 2 1815 j *= 2 1816 return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1] 1817 } 1818 1819 func (ra ranges) Len() int { 1820 return len(*ra.p) / 2 1821 } 1822 1823 func (ra ranges) Swap(i, j int) { 1824 p := *ra.p 1825 i *= 2 1826 j *= 2 1827 p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1] 1828 } 1829 1830 func checkUTF8(s string) error { 1831 for s != "" { 1832 rune, size := utf8.DecodeRuneInString(s) 1833 if rune == utf8.RuneError && size == 1 { 1834 return &Error{Code: ErrInvalidUTF8, Expr: s} 1835 } 1836 s = s[size:] 1837 } 1838 return nil 1839 } 1840 1841 func nextRune(s string) (c rune, t string, err error) { 1842 c, size := utf8.DecodeRuneInString(s) 1843 if c == utf8.RuneError && size == 1 { 1844 return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s} 1845 } 1846 return c, s[size:], nil 1847 } 1848 1849 func isalnum(c rune) bool { 1850 return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' 1851 } 1852 1853 func unhex(c rune) rune { 1854 if '0' <= c && c <= '9' { 1855 return c - '0' 1856 } 1857 if 'a' <= c && c <= 'f' { 1858 return c - 'a' + 10 1859 } 1860 if 'A' <= c && c <= 'F' { 1861 
return c - 'A' + 10 1862 } 1863 return -1 1864 }