github.com/prattmic/llgo-embedded@v0.0.0-20150820070356-41cfecea0e1e/third_party/gofrontend/libgo/go/old/regexp/regexp.go (about) 1 // Copyright 2010 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package regexp implements a simple regular expression library. 6 // 7 // The syntax of the regular expressions accepted is: 8 // 9 // regexp: 10 // concatenation { '|' concatenation } 11 // concatenation: 12 // { closure } 13 // closure: 14 // term [ '*' | '+' | '?' ] 15 // term: 16 // '^' 17 // '$' 18 // '.' 19 // character 20 // '[' [ '^' ] { character-range } ']' 21 // '(' regexp ')' 22 // character-range: 23 // character [ '-' character ] 24 // 25 // All characters are UTF-8-encoded code points. Backslashes escape special 26 // characters, including inside character classes. The standard Go character 27 // escapes are also recognized: \a \b \f \n \r \t \v. 28 // 29 // There are 16 methods of Regexp that match a regular expression and identify 30 // the matched text. Their names are matched by this regular expression: 31 // 32 // Find(All)?(String)?(Submatch)?(Index)? 33 // 34 // If 'All' is present, the routine matches successive non-overlapping 35 // matches of the entire expression. Empty matches abutting a preceding 36 // match are ignored. The return value is a slice containing the successive 37 // return values of the corresponding non-'All' routine. These routines take 38 // an extra integer argument, n; if n >= 0, the function returns at most n 39 // matches/submatches. 40 // 41 // If 'String' is present, the argument is a string; otherwise it is a slice 42 // of bytes; return values are adjusted as appropriate. 43 // 44 // If 'Submatch' is present, the return value is a slice identifying the 45 // successive submatches of the expression. Submatches are matches of 46 // parenthesized subexpressions within the regular expression, numbered from 47 // left to right in order of opening parenthesis. Submatch 0 is the match of 48 // the entire expression, submatch 1 the match of the first parenthesized 49 // subexpression, and so on. 50 // 51 // If 'Index' is present, matches and submatches are identified by byte index 52 // pairs within the input string: result[2*n:2*n+1] identifies the indexes of 53 // the nth submatch. The pair for n==0 identifies the match of the entire 54 // expression. If 'Index' is not present, the match is identified by the 55 // text of the match/submatch. If an index is negative, it means that 56 // subexpression did not match any string in the input. 57 // 58 // There is also a subset of the methods that can be applied to text read 59 // from a RuneReader: 60 // 61 // MatchReader, FindReaderIndex, FindReaderSubmatchIndex 62 // 63 // This set may grow. Note that regular expression matches may need to 64 // examine text beyond the text returned by a match, so the methods that 65 // match text from a RuneReader may read arbitrarily far into the input 66 // before returning. 67 // 68 // (There are a few other methods that do not match this pattern.) 69 // 70 package regexp 71 72 import ( 73 "bytes" 74 "io" 75 "strings" 76 "unicode/utf8" 77 ) 78 79 var debug = false 80 81 // Error is the local type for a parsing error. 82 type Error string 83 84 func (e Error) Error() string { 85 return string(e) 86 } 87 88 // Error codes returned by failures to parse an expression. 89 var ( 90 ErrInternal = Error("regexp: internal error") 91 ErrUnmatchedLpar = Error("regexp: unmatched '('") 92 ErrUnmatchedRpar = Error("regexp: unmatched ')'") 93 ErrUnmatchedLbkt = Error("regexp: unmatched '['") 94 ErrUnmatchedRbkt = Error("regexp: unmatched ']'") 95 ErrBadRange = Error("regexp: bad range in character class") 96 ErrExtraneousBackslash = Error("regexp: extraneous backslash") 97 ErrBadClosure = Error("regexp: repeated closure (**, ++, etc.)") 98 ErrBareClosure = Error("regexp: closure applies to nothing") 99 ErrBadBackslash = Error("regexp: illegal backslash escape") 100 ) 101 102 const ( 103 iStart = iota // beginning of program 104 iEnd // end of program: success 105 iBOT // '^' beginning of text 106 iEOT // '$' end of text 107 iChar // 'a' regular character 108 iCharClass // [a-z] character class 109 iAny // '.' any character including newline 110 iNotNL // [^\n] special case: any character but newline 111 iBra // '(' parenthesized expression: 2*braNum for left, 2*braNum+1 for right 112 iAlt // '|' alternation 113 iNop // do nothing; makes it easy to link without patching 114 ) 115 116 // An instruction executed by the NFA 117 type instr struct { 118 kind int // the type of this instruction: iChar, iAny, etc. 119 index int // used only in debugging; could be eliminated 120 next *instr // the instruction to execute after this one 121 // Special fields valid only for some items. 122 char rune // iChar 123 braNum int // iBra, iEbra 124 cclass *charClass // iCharClass 125 left *instr // iAlt, other branch 126 } 127 128 func (i *instr) print() { 129 switch i.kind { 130 case iStart: 131 print("start") 132 case iEnd: 133 print("end") 134 case iBOT: 135 print("bot") 136 case iEOT: 137 print("eot") 138 case iChar: 139 print("char ", string(i.char)) 140 case iCharClass: 141 i.cclass.print() 142 case iAny: 143 print("any") 144 case iNotNL: 145 print("notnl") 146 case iBra: 147 if i.braNum&1 == 0 { 148 print("bra", i.braNum/2) 149 } else { 150 print("ebra", i.braNum/2) 151 } 152 case iAlt: 153 print("alt(", i.left.index, ")") 154 case iNop: 155 print("nop") 156 } 157 } 158 159 // Regexp is the representation of a compiled regular expression. 160 // The public interface is entirely through methods. 161 // A Regexp is safe for concurrent use by multiple goroutines. 162 type Regexp struct { 163 expr string // the original expression 164 prefix string // initial plain text string 165 prefixBytes []byte // initial plain text bytes 166 inst []*instr 167 start *instr // first instruction of machine 168 prefixStart *instr // where to start if there is a prefix 169 nbra int // number of brackets in expression, for subexpressions 170 } 171 172 type charClass struct { 173 negate bool // is character class negated? ([^a-z]) 174 // slice of int, stored pairwise: [a-z] is (a,z); x is (x,x): 175 ranges []rune 176 cmin, cmax rune 177 } 178 179 func (cclass *charClass) print() { 180 print("charclass") 181 if cclass.negate { 182 print(" (negated)") 183 } 184 for i := 0; i < len(cclass.ranges); i += 2 { 185 l := cclass.ranges[i] 186 r := cclass.ranges[i+1] 187 if l == r { 188 print(" [", string(l), "]") 189 } else { 190 print(" [", string(l), "-", string(r), "]") 191 } 192 } 193 } 194 195 func (cclass *charClass) addRange(a, b rune) { 196 // range is a through b inclusive 197 cclass.ranges = append(cclass.ranges, a, b) 198 if a < cclass.cmin { 199 cclass.cmin = a 200 } 201 if b > cclass.cmax { 202 cclass.cmax = b 203 } 204 } 205 206 func (cclass *charClass) matches(c rune) bool { 207 if c < cclass.cmin || c > cclass.cmax { 208 return cclass.negate 209 } 210 ranges := cclass.ranges 211 for i := 0; i < len(ranges); i = i + 2 { 212 if ranges[i] <= c && c <= ranges[i+1] { 213 return !cclass.negate 214 } 215 } 216 return cclass.negate 217 } 218 219 func newCharClass() *instr { 220 i := &instr{kind: iCharClass} 221 i.cclass = new(charClass) 222 i.cclass.ranges = make([]rune, 0, 4) 223 i.cclass.cmin = 0x10FFFF + 1 // MaxRune + 1 224 i.cclass.cmax = -1 225 return i 226 } 227 228 func (re *Regexp) add(i *instr) *instr { 229 i.index = len(re.inst) 230 re.inst = append(re.inst, i) 231 return i 232 } 233 234 type parser struct { 235 re *Regexp 236 nlpar int // number of unclosed lpars 237 pos int 238 ch rune 239 } 240 241 func (p *parser) error(err Error) { 242 panic(err) 243 } 244 245 const endOfText = -1 246 247 func (p *parser) c() rune { return p.ch } 248 249 func (p *parser) nextc() rune { 250 if p.pos >= len(p.re.expr) { 251 p.ch = endOfText 252 } else { 253 c, w := utf8.DecodeRuneInString(p.re.expr[p.pos:]) 254 p.ch = c 255 p.pos += w 256 } 257 return p.ch 258 } 259 260 func newParser(re *Regexp) *parser { 261 p := new(parser) 262 p.re = re 263 p.nextc() // load p.ch 264 return p 265 } 266 267 func special(c rune) bool { 268 for _, r := range `\.+*?()|[]^$` { 269 if c == r { 270 return true 271 } 272 } 273 return false 274 } 275 276 func ispunct(c rune) bool { 277 for _, r := range "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" { 278 if c == r { 279 return true 280 } 281 } 282 return false 283 } 284 285 var escapes = []byte("abfnrtv") 286 var escaped = []byte("\a\b\f\n\r\t\v") 287 288 func escape(c rune) int { 289 for i, b := range escapes { 290 if rune(b) == c { 291 return i 292 } 293 } 294 return -1 295 } 296 297 func (p *parser) checkBackslash() rune { 298 c := p.c() 299 if c == '\\' { 300 c = p.nextc() 301 switch { 302 case c == endOfText: 303 p.error(ErrExtraneousBackslash) 304 case ispunct(c): 305 // c is as delivered 306 case escape(c) >= 0: 307 c = rune(escaped[escape(c)]) 308 default: 309 p.error(ErrBadBackslash) 310 } 311 } 312 return c 313 } 314 315 func (p *parser) charClass() *instr { 316 i := newCharClass() 317 cc := i.cclass 318 if p.c() == '^' { 319 cc.negate = true 320 p.nextc() 321 } 322 left := rune(-1) 323 for { 324 switch c := p.c(); c { 325 case ']', endOfText: 326 if left >= 0 { 327 p.error(ErrBadRange) 328 } 329 // Is it [^\n]? 330 if cc.negate && len(cc.ranges) == 2 && 331 cc.ranges[0] == '\n' && cc.ranges[1] == '\n' { 332 nl := &instr{kind: iNotNL} 333 p.re.add(nl) 334 return nl 335 } 336 // Special common case: "[a]" -> "a" 337 if !cc.negate && len(cc.ranges) == 2 && cc.ranges[0] == cc.ranges[1] { 338 c := &instr{kind: iChar, char: cc.ranges[0]} 339 p.re.add(c) 340 return c 341 } 342 p.re.add(i) 343 return i 344 case '-': // do this before backslash processing 345 p.error(ErrBadRange) 346 default: 347 c = p.checkBackslash() 348 p.nextc() 349 switch { 350 case left < 0: // first of pair 351 if p.c() == '-' { // range 352 p.nextc() 353 left = c 354 } else { // single char 355 cc.addRange(c, c) 356 } 357 case left <= c: // second of pair 358 cc.addRange(left, c) 359 left = -1 360 default: 361 p.error(ErrBadRange) 362 } 363 } 364 } 365 panic("unreachable") 366 } 367 368 func (p *parser) term() (start, end *instr) { 369 switch c := p.c(); c { 370 case '|', endOfText: 371 return nil, nil 372 case '*', '+', '?': 373 p.error(ErrBareClosure) 374 case ')': 375 if p.nlpar == 0 { 376 p.error(ErrUnmatchedRpar) 377 } 378 return nil, nil 379 case ']': 380 p.error(ErrUnmatchedRbkt) 381 case '^': 382 p.nextc() 383 start = p.re.add(&instr{kind: iBOT}) 384 return start, start 385 case '$': 386 p.nextc() 387 start = p.re.add(&instr{kind: iEOT}) 388 return start, start 389 case '.': 390 p.nextc() 391 start = p.re.add(&instr{kind: iAny}) 392 return start, start 393 case '[': 394 p.nextc() 395 start = p.charClass() 396 if p.c() != ']' { 397 p.error(ErrUnmatchedLbkt) 398 } 399 p.nextc() 400 return start, start 401 case '(': 402 p.nextc() 403 p.nlpar++ 404 p.re.nbra++ // increment first so first subexpr is \1 405 nbra := p.re.nbra 406 start, end = p.regexp() 407 if p.c() != ')' { 408 p.error(ErrUnmatchedLpar) 409 } 410 p.nlpar-- 411 p.nextc() 412 bra := &instr{kind: iBra, braNum: 2 * nbra} 413 p.re.add(bra) 414 ebra := &instr{kind: iBra, braNum: 2*nbra + 1} 415 p.re.add(ebra) 416 if start == nil { 417 if end == nil { 418 p.error(ErrInternal) 419 return 420 } 421 start = ebra 422 } else { 423 end.next = ebra 424 } 425 bra.next = start 426 return bra, ebra 427 default: 428 c = p.checkBackslash() 429 p.nextc() 430 start = &instr{kind: iChar, char: c} 431 p.re.add(start) 432 return start, start 433 } 434 panic("unreachable") 435 } 436 437 func (p *parser) closure() (start, end *instr) { 438 start, end = p.term() 439 if start == nil { 440 return 441 } 442 switch p.c() { 443 case '*': 444 // (start,end)*: 445 alt := &instr{kind: iAlt} 446 p.re.add(alt) 447 end.next = alt // after end, do alt 448 alt.left = start // alternate brach: return to start 449 start = alt // alt becomes new (start, end) 450 end = alt 451 case '+': 452 // (start,end)+: 453 alt := &instr{kind: iAlt} 454 p.re.add(alt) 455 end.next = alt // after end, do alt 456 alt.left = start // alternate brach: return to start 457 end = alt // start is unchanged; end is alt 458 case '?': 459 // (start,end)?: 460 alt := &instr{kind: iAlt} 461 p.re.add(alt) 462 nop := &instr{kind: iNop} 463 p.re.add(nop) 464 alt.left = start // alternate branch is start 465 alt.next = nop // follow on to nop 466 end.next = nop // after end, go to nop 467 start = alt // start is now alt 468 end = nop // end is nop pointed to by both branches 469 default: 470 return 471 } 472 switch p.nextc() { 473 case '*', '+', '?': 474 p.error(ErrBadClosure) 475 } 476 return 477 } 478 479 func (p *parser) concatenation() (start, end *instr) { 480 for { 481 nstart, nend := p.closure() 482 switch { 483 case nstart == nil: // end of this concatenation 484 if start == nil { // this is the empty string 485 nop := p.re.add(&instr{kind: iNop}) 486 return nop, nop 487 } 488 return 489 case start == nil: // this is first element of concatenation 490 start, end = nstart, nend 491 default: 492 end.next = nstart 493 end = nend 494 } 495 } 496 panic("unreachable") 497 } 498 499 func (p *parser) regexp() (start, end *instr) { 500 start, end = p.concatenation() 501 for { 502 switch p.c() { 503 default: 504 return 505 case '|': 506 p.nextc() 507 nstart, nend := p.concatenation() 508 alt := &instr{kind: iAlt} 509 p.re.add(alt) 510 alt.left = start 511 alt.next = nstart 512 nop := &instr{kind: iNop} 513 p.re.add(nop) 514 end.next = nop 515 nend.next = nop 516 start, end = alt, nop 517 } 518 } 519 panic("unreachable") 520 } 521 522 func unNop(i *instr) *instr { 523 for i.kind == iNop { 524 i = i.next 525 } 526 return i 527 } 528 529 func (re *Regexp) eliminateNops() { 530 for _, inst := range re.inst { 531 if inst.kind == iEnd { 532 continue 533 } 534 inst.next = unNop(inst.next) 535 if inst.kind == iAlt { 536 inst.left = unNop(inst.left) 537 } 538 } 539 } 540 541 func (re *Regexp) dump() { 542 print("prefix <", re.prefix, ">\n") 543 for _, inst := range re.inst { 544 print(inst.index, ": ") 545 inst.print() 546 if inst.kind != iEnd { 547 print(" -> ", inst.next.index) 548 } 549 print("\n") 550 } 551 } 552 553 func (re *Regexp) doParse() { 554 p := newParser(re) 555 start := &instr{kind: iStart} 556 re.add(start) 557 s, e := p.regexp() 558 start.next = s 559 re.start = start 560 e.next = re.add(&instr{kind: iEnd}) 561 562 if debug { 563 re.dump() 564 println() 565 } 566 567 re.eliminateNops() 568 if debug { 569 re.dump() 570 println() 571 } 572 re.setPrefix() 573 if debug { 574 re.dump() 575 println() 576 } 577 } 578 579 // Extract regular text from the beginning of the pattern, 580 // possibly after a leading iBOT. 581 // That text can be used by doExecute to speed up matching. 582 func (re *Regexp) setPrefix() { 583 var b []byte 584 var utf = make([]byte, utf8.UTFMax) 585 var inst *instr 586 // First instruction is start; skip that. Also skip any initial iBOT. 587 inst = re.inst[0].next 588 for inst.kind == iBOT { 589 inst = inst.next 590 } 591 Loop: 592 for ; inst.kind != iEnd; inst = inst.next { 593 // stop if this is not a char 594 if inst.kind != iChar { 595 break 596 } 597 // stop if this char can be followed by a match for an empty string, 598 // which includes closures, ^, and $. 599 switch inst.next.kind { 600 case iBOT, iEOT, iAlt: 601 break Loop 602 } 603 n := utf8.EncodeRune(utf, inst.char) 604 b = append(b, utf[0:n]...) 605 } 606 // point prefixStart instruction to first non-CHAR after prefix 607 re.prefixStart = inst 608 re.prefixBytes = b 609 re.prefix = string(b) 610 } 611 612 // String returns the source text used to compile the regular expression. 613 func (re *Regexp) String() string { 614 return re.expr 615 } 616 617 // Compile parses a regular expression and returns, if successful, a Regexp 618 // object that can be used to match against text. 619 func Compile(str string) (regexp *Regexp, error error) { 620 regexp = new(Regexp) 621 // doParse will panic if there is a parse error. 622 defer func() { 623 if e := recover(); e != nil { 624 regexp = nil 625 error = e.(Error) // Will re-panic if error was not an Error, e.g. nil-pointer exception 626 } 627 }() 628 regexp.expr = str 629 regexp.inst = make([]*instr, 0, 10) 630 regexp.doParse() 631 return 632 } 633 634 // MustCompile is like Compile but panics if the expression cannot be parsed. 635 // It simplifies safe initialization of global variables holding compiled regular 636 // expressions. 637 func MustCompile(str string) *Regexp { 638 regexp, error := Compile(str) 639 if error != nil { 640 panic(`regexp: compiling "` + str + `": ` + error.Error()) 641 } 642 return regexp 643 } 644 645 // NumSubexp returns the number of parenthesized subexpressions in this Regexp. 646 func (re *Regexp) NumSubexp() int { return re.nbra } 647 648 // The match arena allows us to reduce the garbage generated by tossing 649 // match vectors away as we execute. Matches are ref counted and returned 650 // to a free list when no longer active. Increases a simple benchmark by 22X. 651 type matchArena struct { 652 head *matchVec 653 len int // length of match vector 654 pos int 655 atBOT bool // whether we're at beginning of text 656 atEOT bool // whether we're at end of text 657 } 658 659 type matchVec struct { 660 m []int // pairs of bracketing submatches. 0th is start,end 661 ref int 662 next *matchVec 663 } 664 665 func (a *matchArena) new() *matchVec { 666 if a.head == nil { 667 const N = 10 668 block := make([]matchVec, N) 669 for i := 0; i < N; i++ { 670 b := &block[i] 671 b.next = a.head 672 a.head = b 673 } 674 } 675 m := a.head 676 a.head = m.next 677 m.ref = 0 678 if m.m == nil { 679 m.m = make([]int, a.len) 680 } 681 return m 682 } 683 684 func (a *matchArena) free(m *matchVec) { 685 m.ref-- 686 if m.ref == 0 { 687 m.next = a.head 688 a.head = m 689 } 690 } 691 692 func (a *matchArena) copy(m *matchVec) *matchVec { 693 m1 := a.new() 694 copy(m1.m, m.m) 695 return m1 696 } 697 698 func (a *matchArena) noMatch() *matchVec { 699 m := a.new() 700 for i := range m.m { 701 m.m[i] = -1 // no match seen; catches cases like "a(b)?c" on "ac" 702 } 703 m.ref = 1 704 return m 705 } 706 707 type state struct { 708 inst *instr // next instruction to execute 709 prefixed bool // this match began with a fixed prefix 710 match *matchVec 711 } 712 713 // Append new state to to-do list. Leftmost-longest wins so avoid 714 // adding a state that's already active. The matchVec will be inc-ref'ed 715 // if it is assigned to a state. 716 func (a *matchArena) addState(s []state, inst *instr, prefixed bool, match *matchVec) []state { 717 switch inst.kind { 718 case iBOT: 719 if a.atBOT { 720 s = a.addState(s, inst.next, prefixed, match) 721 } 722 return s 723 case iEOT: 724 if a.atEOT { 725 s = a.addState(s, inst.next, prefixed, match) 726 } 727 return s 728 case iBra: 729 match.m[inst.braNum] = a.pos 730 s = a.addState(s, inst.next, prefixed, match) 731 return s 732 } 733 l := len(s) 734 // States are inserted in order so it's sufficient to see if we have the same 735 // instruction; no need to see if existing match is earlier (it is). 736 for i := 0; i < l; i++ { 737 if s[i].inst == inst { 738 return s 739 } 740 } 741 s = append(s, state{inst, prefixed, match}) 742 match.ref++ 743 if inst.kind == iAlt { 744 s = a.addState(s, inst.left, prefixed, a.copy(match)) 745 // give other branch a copy of this match vector 746 s = a.addState(s, inst.next, prefixed, a.copy(match)) 747 } 748 return s 749 } 750 751 // input abstracts different representations of the input text. It provides 752 // one-character lookahead. 753 type input interface { 754 step(pos int) (r rune, width int) // advance one rune 755 canCheckPrefix() bool // can we look ahead without losing info? 756 hasPrefix(re *Regexp) bool 757 index(re *Regexp, pos int) int 758 } 759 760 // inputString scans a string. 761 type inputString struct { 762 str string 763 } 764 765 func newInputString(str string) *inputString { 766 return &inputString{str: str} 767 } 768 769 func (i *inputString) step(pos int) (rune, int) { 770 if pos < len(i.str) { 771 return utf8.DecodeRuneInString(i.str[pos:len(i.str)]) 772 } 773 return endOfText, 0 774 } 775 776 func (i *inputString) canCheckPrefix() bool { 777 return true 778 } 779 780 func (i *inputString) hasPrefix(re *Regexp) bool { 781 return strings.HasPrefix(i.str, re.prefix) 782 } 783 784 func (i *inputString) index(re *Regexp, pos int) int { 785 return strings.Index(i.str[pos:], re.prefix) 786 } 787 788 // inputBytes scans a byte slice. 789 type inputBytes struct { 790 str []byte 791 } 792 793 func newInputBytes(str []byte) *inputBytes { 794 return &inputBytes{str: str} 795 } 796 797 func (i *inputBytes) step(pos int) (rune, int) { 798 if pos < len(i.str) { 799 return utf8.DecodeRune(i.str[pos:len(i.str)]) 800 } 801 return endOfText, 0 802 } 803 804 func (i *inputBytes) canCheckPrefix() bool { 805 return true 806 } 807 808 func (i *inputBytes) hasPrefix(re *Regexp) bool { 809 return bytes.HasPrefix(i.str, re.prefixBytes) 810 } 811 812 func (i *inputBytes) index(re *Regexp, pos int) int { 813 return bytes.Index(i.str[pos:], re.prefixBytes) 814 } 815 816 // inputReader scans a RuneReader. 817 type inputReader struct { 818 r io.RuneReader 819 atEOT bool 820 pos int 821 } 822 823 func newInputReader(r io.RuneReader) *inputReader { 824 return &inputReader{r: r} 825 } 826 827 func (i *inputReader) step(pos int) (rune, int) { 828 if !i.atEOT && pos != i.pos { 829 return endOfText, 0 830 831 } 832 r, w, err := i.r.ReadRune() 833 if err != nil { 834 i.atEOT = true 835 return endOfText, 0 836 } 837 i.pos += w 838 return r, w 839 } 840 841 func (i *inputReader) canCheckPrefix() bool { 842 return false 843 } 844 845 func (i *inputReader) hasPrefix(re *Regexp) bool { 846 return false 847 } 848 849 func (i *inputReader) index(re *Regexp, pos int) int { 850 return -1 851 } 852 853 // Search match starting from pos bytes into the input. 854 func (re *Regexp) doExecute(i input, pos int) []int { 855 var s [2][]state 856 s[0] = make([]state, 0, 10) 857 s[1] = make([]state, 0, 10) 858 in, out := 0, 1 859 var final state 860 found := false 861 anchored := re.inst[0].next.kind == iBOT 862 if anchored && pos > 0 { 863 return nil 864 } 865 // fast check for initial plain substring 866 if i.canCheckPrefix() && re.prefix != "" { 867 advance := 0 868 if anchored { 869 if !i.hasPrefix(re) { 870 return nil 871 } 872 } else { 873 advance = i.index(re, pos) 874 if advance == -1 { 875 return nil 876 } 877 } 878 pos += advance 879 } 880 // We look one character ahead so we can match $, which checks whether 881 // we are at EOT. 882 nextChar, nextWidth := i.step(pos) 883 arena := &matchArena{ 884 len: 2 * (re.nbra + 1), 885 pos: pos, 886 atBOT: pos == 0, 887 atEOT: nextChar == endOfText, 888 } 889 for c, startPos := rune(0), pos; c != endOfText; { 890 if !found && (pos == startPos || !anchored) { 891 // prime the pump if we haven't seen a match yet 892 match := arena.noMatch() 893 match.m[0] = pos 894 s[out] = arena.addState(s[out], re.start.next, false, match) 895 arena.free(match) // if addState saved it, ref was incremented 896 } else if len(s[out]) == 0 { 897 // machine has completed 898 break 899 } 900 in, out = out, in // old out state is new in state 901 // clear out old state 902 old := s[out] 903 for _, state := range old { 904 arena.free(state.match) 905 } 906 s[out] = old[0:0] // truncate state vector 907 c = nextChar 908 thisPos := pos 909 pos += nextWidth 910 nextChar, nextWidth = i.step(pos) 911 arena.atEOT = nextChar == endOfText 912 arena.atBOT = false 913 arena.pos = pos 914 for _, st := range s[in] { 915 switch st.inst.kind { 916 case iBOT: 917 case iEOT: 918 case iChar: 919 if c == st.inst.char { 920 s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match) 921 } 922 case iCharClass: 923 if st.inst.cclass.matches(c) { 924 s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match) 925 } 926 case iAny: 927 if c != endOfText { 928 s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match) 929 } 930 case iNotNL: 931 if c != endOfText && c != '\n' { 932 s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match) 933 } 934 case iBra: 935 case iAlt: 936 case iEnd: 937 // choose leftmost longest 938 if !found || // first 939 st.match.m[0] < final.match.m[0] || // leftmost 940 (st.match.m[0] == final.match.m[0] && thisPos > final.match.m[1]) { // longest 941 if final.match != nil { 942 arena.free(final.match) 943 } 944 final = st 945 final.match.ref++ 946 final.match.m[1] = thisPos 947 } 948 found = true 949 default: 950 st.inst.print() 951 panic("unknown instruction in execute") 952 } 953 } 954 } 955 if final.match == nil { 956 return nil 957 } 958 // if match found, back up start of match by width of prefix. 959 if final.prefixed && len(final.match.m) > 0 { 960 final.match.m[0] -= len(re.prefix) 961 } 962 return final.match.m 963 } 964 965 // LiteralPrefix returns a literal string that must begin any match 966 // of the regular expression re. It returns the boolean true if the 967 // literal string comprises the entire regular expression. 968 func (re *Regexp) LiteralPrefix() (prefix string, complete bool) { 969 c := make([]rune, len(re.inst)-2) // minus start and end. 970 // First instruction is start; skip that. 971 i := 0 972 for inst := re.inst[0].next; inst.kind != iEnd; inst = inst.next { 973 // stop if this is not a char 974 if inst.kind != iChar { 975 return string(c[:i]), false 976 } 977 c[i] = inst.char 978 i++ 979 } 980 return string(c[:i]), true 981 } 982 983 // MatchReader returns whether the Regexp matches the text read by the 984 // RuneReader. The return value is a boolean: true for match, false for no 985 // match. 986 func (re *Regexp) MatchReader(r io.RuneReader) bool { 987 return len(re.doExecute(newInputReader(r), 0)) > 0 988 } 989 990 // MatchString returns whether the Regexp matches the string s. 991 // The return value is a boolean: true for match, false for no match. 992 func (re *Regexp) MatchString(s string) bool { return len(re.doExecute(newInputString(s), 0)) > 0 } 993 994 // Match returns whether the Regexp matches the byte slice b. 995 // The return value is a boolean: true for match, false for no match. 996 func (re *Regexp) Match(b []byte) bool { return len(re.doExecute(newInputBytes(b), 0)) > 0 } 997 998 // MatchReader checks whether a textual regular expression matches the text 999 // read by the RuneReader. More complicated queries need to use Compile and 1000 // the full Regexp interface. 1001 func MatchReader(pattern string, r io.RuneReader) (matched bool, error error) { 1002 re, err := Compile(pattern) 1003 if err != nil { 1004 return false, err 1005 } 1006 return re.MatchReader(r), nil 1007 } 1008 1009 // MatchString checks whether a textual regular expression 1010 // matches a string. More complicated queries need 1011 // to use Compile and the full Regexp interface. 1012 func MatchString(pattern string, s string) (matched bool, error error) { 1013 re, err := Compile(pattern) 1014 if err != nil { 1015 return false, err 1016 } 1017 return re.MatchString(s), nil 1018 } 1019 1020 // Match checks whether a textual regular expression 1021 // matches a byte slice. More complicated queries need 1022 // to use Compile and the full Regexp interface. 1023 func Match(pattern string, b []byte) (matched bool, error error) { 1024 re, err := Compile(pattern) 1025 if err != nil { 1026 return false, err 1027 } 1028 return re.Match(b), nil 1029 } 1030 1031 // ReplaceAllString returns a copy of src in which all matches for the Regexp 1032 // have been replaced by repl. No support is provided for expressions 1033 // (e.g. \1 or $1) in the replacement string. 1034 func (re *Regexp) ReplaceAllString(src, repl string) string { 1035 return re.ReplaceAllStringFunc(src, func(string) string { return repl }) 1036 } 1037 1038 // ReplaceAllStringFunc returns a copy of src in which all matches for the 1039 // Regexp have been replaced by the return value of of function repl (whose 1040 // first argument is the matched string). No support is provided for 1041 // expressions (e.g. \1 or $1) in the replacement string. 1042 func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string { 1043 lastMatchEnd := 0 // end position of the most recent match 1044 searchPos := 0 // position where we next look for a match 1045 buf := new(bytes.Buffer) 1046 for searchPos <= len(src) { 1047 a := re.doExecute(newInputString(src), searchPos) 1048 if len(a) == 0 { 1049 break // no more matches 1050 } 1051 1052 // Copy the unmatched characters before this match. 1053 io.WriteString(buf, src[lastMatchEnd:a[0]]) 1054 1055 // Now insert a copy of the replacement string, but not for a 1056 // match of the empty string immediately after another match. 1057 // (Otherwise, we get double replacement for patterns that 1058 // match both empty and nonempty strings.) 1059 if a[1] > lastMatchEnd || a[0] == 0 { 1060 io.WriteString(buf, repl(src[a[0]:a[1]])) 1061 } 1062 lastMatchEnd = a[1] 1063 1064 // Advance past this match; always advance at least one character. 1065 _, width := utf8.DecodeRuneInString(src[searchPos:]) 1066 if searchPos+width > a[1] { 1067 searchPos += width 1068 } else if searchPos+1 > a[1] { 1069 // This clause is only needed at the end of the input 1070 // string. In that case, DecodeRuneInString returns width=0. 1071 searchPos++ 1072 } else { 1073 searchPos = a[1] 1074 } 1075 } 1076 1077 // Copy the unmatched characters after the last match. 1078 io.WriteString(buf, src[lastMatchEnd:]) 1079 1080 return buf.String() 1081 } 1082 1083 // ReplaceAll returns a copy of src in which all matches for the Regexp 1084 // have been replaced by repl. No support is provided for expressions 1085 // (e.g. \1 or $1) in the replacement text. 1086 func (re *Regexp) ReplaceAll(src, repl []byte) []byte { 1087 return re.ReplaceAllFunc(src, func([]byte) []byte { return repl }) 1088 } 1089 1090 // ReplaceAllFunc returns a copy of src in which all matches for the 1091 // Regexp have been replaced by the return value of of function repl (whose 1092 // first argument is the matched []byte). No support is provided for 1093 // expressions (e.g. \1 or $1) in the replacement string. 1094 func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { 1095 lastMatchEnd := 0 // end position of the most recent match 1096 searchPos := 0 // position where we next look for a match 1097 buf := new(bytes.Buffer) 1098 for searchPos <= len(src) { 1099 a := re.doExecute(newInputBytes(src), searchPos) 1100 if len(a) == 0 { 1101 break // no more matches 1102 } 1103 1104 // Copy the unmatched characters before this match. 1105 buf.Write(src[lastMatchEnd:a[0]]) 1106 1107 // Now insert a copy of the replacement string, but not for a 1108 // match of the empty string immediately after another match. 1109 // (Otherwise, we get double replacement for patterns that 1110 // match both empty and nonempty strings.) 1111 if a[1] > lastMatchEnd || a[0] == 0 { 1112 buf.Write(repl(src[a[0]:a[1]])) 1113 } 1114 lastMatchEnd = a[1] 1115 1116 // Advance past this match; always advance at least one character. 1117 _, width := utf8.DecodeRune(src[searchPos:]) 1118 if searchPos+width > a[1] { 1119 searchPos += width 1120 } else if searchPos+1 > a[1] { 1121 // This clause is only needed at the end of the input 1122 // string. In that case, DecodeRuneInString returns width=0. 1123 searchPos++ 1124 } else { 1125 searchPos = a[1] 1126 } 1127 } 1128 1129 // Copy the unmatched characters after the last match. 1130 buf.Write(src[lastMatchEnd:]) 1131 1132 return buf.Bytes() 1133 } 1134 1135 // QuoteMeta returns a string that quotes all regular expression metacharacters 1136 // inside the argument text; the returned string is a regular expression matching 1137 // the literal text. For example, QuoteMeta(`[foo]`) returns `\[foo\]`. 1138 func QuoteMeta(s string) string { 1139 b := make([]byte, 2*len(s)) 1140 1141 // A byte loop is correct because all metacharacters are ASCII. 1142 j := 0 1143 for i := 0; i < len(s); i++ { 1144 if special(rune(s[i])) { 1145 b[j] = '\\' 1146 j++ 1147 } 1148 b[j] = s[i] 1149 j++ 1150 } 1151 return string(b[0:j]) 1152 } 1153 1154 // Find matches in slice b if b is non-nil, otherwise find matches in string s. 1155 func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) { 1156 var end int 1157 if b == nil { 1158 end = len(s) 1159 } else { 1160 end = len(b) 1161 } 1162 1163 for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; { 1164 var in input 1165 if b == nil { 1166 in = newInputString(s) 1167 } else { 1168 in = newInputBytes(b) 1169 } 1170 matches := re.doExecute(in, pos) 1171 if len(matches) == 0 { 1172 break 1173 } 1174 1175 accept := true 1176 if matches[1] == pos { 1177 // We've found an empty match. 1178 if matches[0] == prevMatchEnd { 1179 // We don't allow an empty match right 1180 // after a previous match, so ignore it. 1181 accept = false 1182 } 1183 var width int 1184 // TODO: use step() 1185 if b == nil { 1186 _, width = utf8.DecodeRuneInString(s[pos:end]) 1187 } else { 1188 _, width = utf8.DecodeRune(b[pos:end]) 1189 } 1190 if width > 0 { 1191 pos += width 1192 } else { 1193 pos = end + 1 1194 } 1195 } else { 1196 pos = matches[1] 1197 } 1198 prevMatchEnd = matches[1] 1199 1200 if accept { 1201 deliver(matches) 1202 i++ 1203 } 1204 } 1205 } 1206 1207 // Find returns a slice holding the text of the leftmost match in b of the regular expression. 1208 // A return value of nil indicates no match. 1209 func (re *Regexp) Find(b []byte) []byte { 1210 a := re.doExecute(newInputBytes(b), 0) 1211 if a == nil { 1212 return nil 1213 } 1214 return b[a[0]:a[1]] 1215 } 1216 1217 // FindIndex returns a two-element slice of integers defining the location of 1218 // the leftmost match in b of the regular expression. The match itself is at 1219 // b[loc[0]:loc[1]]. 1220 // A return value of nil indicates no match. 1221 func (re *Regexp) FindIndex(b []byte) (loc []int) { 1222 a := re.doExecute(newInputBytes(b), 0) 1223 if a == nil { 1224 return nil 1225 } 1226 return a[0:2] 1227 } 1228 1229 // FindString returns a string holding the text of the leftmost match in s of the regular 1230 // expression. If there is no match, the return value is an empty string, 1231 // but it will also be empty if the regular expression successfully matches 1232 // an empty string. Use FindStringIndex or FindStringSubmatch if it is 1233 // necessary to distinguish these cases. 1234 func (re *Regexp) FindString(s string) string { 1235 a := re.doExecute(newInputString(s), 0) 1236 if a == nil { 1237 return "" 1238 } 1239 return s[a[0]:a[1]] 1240 } 1241 1242 // FindStringIndex returns a two-element slice of integers defining the 1243 // location of the leftmost match in s of the regular expression. The match 1244 // itself is at s[loc[0]:loc[1]]. 1245 // A return value of nil indicates no match. 1246 func (re *Regexp) FindStringIndex(s string) []int { 1247 a := re.doExecute(newInputString(s), 0) 1248 if a == nil { 1249 return nil 1250 } 1251 return a[0:2] 1252 } 1253 1254 // FindReaderIndex returns a two-element slice of integers defining the 1255 // location of the leftmost match of the regular expression in text read from 1256 // the RuneReader. The match itself is at s[loc[0]:loc[1]]. A return 1257 // value of nil indicates no match. 1258 func (re *Regexp) FindReaderIndex(r io.RuneReader) []int { 1259 a := re.doExecute(newInputReader(r), 0) 1260 if a == nil { 1261 return nil 1262 } 1263 return a[0:2] 1264 } 1265 1266 // FindSubmatch returns a slice of slices holding the text of the leftmost 1267 // match of the regular expression in b and the matches, if any, of its 1268 // subexpressions, as defined by the 'Submatch' descriptions in the package 1269 // comment. 1270 // A return value of nil indicates no match. 1271 func (re *Regexp) FindSubmatch(b []byte) [][]byte { 1272 a := re.doExecute(newInputBytes(b), 0) 1273 if a == nil { 1274 return nil 1275 } 1276 ret := make([][]byte, len(a)/2) 1277 for i := range ret { 1278 if a[2*i] >= 0 { 1279 ret[i] = b[a[2*i]:a[2*i+1]] 1280 } 1281 } 1282 return ret 1283 } 1284 1285 // FindSubmatchIndex returns a slice holding the index pairs identifying the 1286 // leftmost match of the regular expression in b and the matches, if any, of 1287 // its subexpressions, as defined by the 'Submatch' and 'Index' descriptions 1288 // in the package comment. 1289 // A return value of nil indicates no match. 1290 func (re *Regexp) FindSubmatchIndex(b []byte) []int { 1291 return re.doExecute(newInputBytes(b), 0) 1292 } 1293 1294 // FindStringSubmatch returns a slice of strings holding the text of the 1295 // leftmost match of the regular expression in s and the matches, if any, of 1296 // its subexpressions, as defined by the 'Submatch' description in the 1297 // package comment. 1298 // A return value of nil indicates no match. 1299 func (re *Regexp) FindStringSubmatch(s string) []string { 1300 a := re.doExecute(newInputString(s), 0) 1301 if a == nil { 1302 return nil 1303 } 1304 ret := make([]string, len(a)/2) 1305 for i := range ret { 1306 if a[2*i] >= 0 { 1307 ret[i] = s[a[2*i]:a[2*i+1]] 1308 } 1309 } 1310 return ret 1311 } 1312 1313 // FindStringSubmatchIndex returns a slice holding the index pairs 1314 // identifying the leftmost match of the regular expression in s and the 1315 // matches, if any, of its subexpressions, as defined by the 'Submatch' and 1316 // 'Index' descriptions in the package comment. 1317 // A return value of nil indicates no match. 1318 func (re *Regexp) FindStringSubmatchIndex(s string) []int { 1319 return re.doExecute(newInputString(s), 0) 1320 } 1321 1322 // FindReaderSubmatchIndex returns a slice holding the index pairs 1323 // identifying the leftmost match of the regular expression of text read by 1324 // the RuneReader, and the matches, if any, of its subexpressions, as defined 1325 // by the 'Submatch' and 'Index' descriptions in the package comment. A 1326 // return value of nil indicates no match. 1327 func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int { 1328 return re.doExecute(newInputReader(r), 0) 1329 } 1330 1331 const startSize = 10 // The size at which to start a slice in the 'All' routines. 1332 1333 // FindAll is the 'All' version of Find; it returns a slice of all successive 1334 // matches of the expression, as defined by the 'All' description in the 1335 // package comment. 1336 // A return value of nil indicates no match. 1337 func (re *Regexp) FindAll(b []byte, n int) [][]byte { 1338 if n < 0 { 1339 n = len(b) + 1 1340 } 1341 result := make([][]byte, 0, startSize) 1342 re.allMatches("", b, n, func(match []int) { 1343 result = append(result, b[match[0]:match[1]]) 1344 }) 1345 if len(result) == 0 { 1346 return nil 1347 } 1348 return result 1349 } 1350 1351 // FindAllIndex is the 'All' version of FindIndex; it returns a slice of all 1352 // successive matches of the expression, as defined by the 'All' description 1353 // in the package comment. 1354 // A return value of nil indicates no match. 1355 func (re *Regexp) FindAllIndex(b []byte, n int) [][]int { 1356 if n < 0 { 1357 n = len(b) + 1 1358 } 1359 result := make([][]int, 0, startSize) 1360 re.allMatches("", b, n, func(match []int) { 1361 result = append(result, match[0:2]) 1362 }) 1363 if len(result) == 0 { 1364 return nil 1365 } 1366 return result 1367 } 1368 1369 // FindAllString is the 'All' version of FindString; it returns a slice of all 1370 // successive matches of the expression, as defined by the 'All' description 1371 // in the package comment. 1372 // A return value of nil indicates no match. 1373 func (re *Regexp) FindAllString(s string, n int) []string { 1374 if n < 0 { 1375 n = len(s) + 1 1376 } 1377 result := make([]string, 0, startSize) 1378 re.allMatches(s, nil, n, func(match []int) { 1379 result = append(result, s[match[0]:match[1]]) 1380 }) 1381 if len(result) == 0 { 1382 return nil 1383 } 1384 return result 1385 } 1386 1387 // FindAllStringIndex is the 'All' version of FindStringIndex; it returns a 1388 // slice of all successive matches of the expression, as defined by the 'All' 1389 // description in the package comment. 1390 // A return value of nil indicates no match. 1391 func (re *Regexp) FindAllStringIndex(s string, n int) [][]int { 1392 if n < 0 { 1393 n = len(s) + 1 1394 } 1395 result := make([][]int, 0, startSize) 1396 re.allMatches(s, nil, n, func(match []int) { 1397 result = append(result, match[0:2]) 1398 }) 1399 if len(result) == 0 { 1400 return nil 1401 } 1402 return result 1403 } 1404 1405 // FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice 1406 // of all successive matches of the expression, as defined by the 'All' 1407 // description in the package comment. 1408 // A return value of nil indicates no match. 1409 func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte { 1410 if n < 0 { 1411 n = len(b) + 1 1412 } 1413 result := make([][][]byte, 0, startSize) 1414 re.allMatches("", b, n, func(match []int) { 1415 slice := make([][]byte, len(match)/2) 1416 for j := range slice { 1417 if match[2*j] >= 0 { 1418 slice[j] = b[match[2*j]:match[2*j+1]] 1419 } 1420 } 1421 result = append(result, slice) 1422 }) 1423 if len(result) == 0 { 1424 return nil 1425 } 1426 return result 1427 } 1428 1429 // FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns 1430 // a slice of all successive matches of the expression, as defined by the 1431 // 'All' description in the package comment. 1432 // A return value of nil indicates no match. 1433 func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int { 1434 if n < 0 { 1435 n = len(b) + 1 1436 } 1437 result := make([][]int, 0, startSize) 1438 re.allMatches("", b, n, func(match []int) { 1439 result = append(result, match) 1440 }) 1441 if len(result) == 0 { 1442 return nil 1443 } 1444 return result 1445 } 1446 1447 // FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it 1448 // returns a slice of all successive matches of the expression, as defined by 1449 // the 'All' description in the package comment. 1450 // A return value of nil indicates no match. 1451 func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string { 1452 if n < 0 { 1453 n = len(s) + 1 1454 } 1455 result := make([][]string, 0, startSize) 1456 re.allMatches(s, nil, n, func(match []int) { 1457 slice := make([]string, len(match)/2) 1458 for j := range slice { 1459 if match[2*j] >= 0 { 1460 slice[j] = s[match[2*j]:match[2*j+1]] 1461 } 1462 } 1463 result = append(result, slice) 1464 }) 1465 if len(result) == 0 { 1466 return nil 1467 } 1468 return result 1469 } 1470 1471 // FindAllStringSubmatchIndex is the 'All' version of 1472 // FindStringSubmatchIndex; it returns a slice of all successive matches of 1473 // the expression, as defined by the 'All' description in the package 1474 // comment. 1475 // A return value of nil indicates no match. 1476 func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int { 1477 if n < 0 { 1478 n = len(s) + 1 1479 } 1480 result := make([][]int, 0, startSize) 1481 re.allMatches(s, nil, n, func(match []int) { 1482 result = append(result, match) 1483 }) 1484 if len(result) == 0 { 1485 return nil 1486 } 1487 return result 1488 }