github.com/corona10/go@v0.0.0-20180224231303-7a218942be57/src/regexp/regexp.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package regexp implements regular expression search. 6 // 7 // The syntax of the regular expressions accepted is the same 8 // general syntax used by Perl, Python, and other languages. 9 // More precisely, it is the syntax accepted by RE2 and described at 10 // https://golang.org/s/re2syntax, except for \C. 11 // For an overview of the syntax, run 12 // go doc regexp/syntax 13 // 14 // The regexp implementation provided by this package is 15 // guaranteed to run in time linear in the size of the input. 16 // (This is a property not guaranteed by most open source 17 // implementations of regular expressions.) For more information 18 // about this property, see 19 // http://swtch.com/~rsc/regexp/regexp1.html 20 // or any book about automata theory. 21 // 22 // All characters are UTF-8-encoded code points. 23 // 24 // There are 16 methods of Regexp that match a regular expression and identify 25 // the matched text. Their names are matched by this regular expression: 26 // 27 // Find(All)?(String)?(Submatch)?(Index)? 28 // 29 // If 'All' is present, the routine matches successive non-overlapping 30 // matches of the entire expression. Empty matches abutting a preceding 31 // match are ignored. The return value is a slice containing the successive 32 // return values of the corresponding non-'All' routine. These routines take 33 // an extra integer argument, n; if n >= 0, the function returns at most n 34 // matches/submatches. 35 // 36 // If 'String' is present, the argument is a string; otherwise it is a slice 37 // of bytes; return values are adjusted as appropriate. 38 // 39 // If 'Submatch' is present, the return value is a slice identifying the 40 // successive submatches of the expression. 
Submatches are matches of 41 // parenthesized subexpressions (also known as capturing groups) within the 42 // regular expression, numbered from left to right in order of opening 43 // parenthesis. Submatch 0 is the match of the entire expression, submatch 1 44 // the match of the first parenthesized subexpression, and so on. 45 // 46 // If 'Index' is present, matches and submatches are identified by byte index 47 // pairs within the input string: result[2*n:2*n+2] identifies the indexes of 48 // the nth submatch. The pair for n==0 identifies the match of the entire 49 // expression. If 'Index' is not present, the match is identified by the 50 // text of the match/submatch. If an index is negative, it means that 51 // subexpression did not match any string in the input. 52 // 53 // There is also a subset of the methods that can be applied to text read 54 // from a RuneReader: 55 // 56 // MatchReader, FindReaderIndex, FindReaderSubmatchIndex 57 // 58 // This set may grow. Note that regular expression matches may need to 59 // examine text beyond the text returned by a match, so the methods that 60 // match text from a RuneReader may read arbitrarily far into the input 61 // before returning. 62 // 63 // (There are a few other methods that do not match this pattern.) 64 // 65 package regexp 66 67 import ( 68 "bytes" 69 "io" 70 "regexp/syntax" 71 "strconv" 72 "strings" 73 "sync" 74 "unicode" 75 "unicode/utf8" 76 ) 77 78 // Regexp is the representation of a compiled regular expression. 79 // A Regexp is safe for concurrent use by multiple goroutines, 80 // except for configuration methods, such as Longest.
81 type Regexp struct { 82 // read-only after Compile 83 regexpRO 84 85 // cache of machines for running regexp 86 mu sync.Mutex 87 machine []*machine 88 } 89 90 type regexpRO struct { 91 expr string // as passed to Compile 92 prog *syntax.Prog // compiled program 93 onepass *onePassProg // onepass program or nil 94 prefix string // required prefix in unanchored matches 95 prefixBytes []byte // prefix, as a []byte 96 prefixComplete bool // prefix is the entire regexp 97 prefixRune rune // first rune in prefix 98 prefixEnd uint32 // pc for last rune in prefix 99 cond syntax.EmptyOp // empty-width conditions required at start of match 100 numSubexp int 101 subexpNames []string 102 longest bool 103 } 104 105 // String returns the source text used to compile the regular expression. 106 func (re *Regexp) String() string { 107 return re.expr 108 } 109 110 // Copy returns a new Regexp object copied from re. 111 // 112 // When using a Regexp in multiple goroutines, giving each goroutine 113 // its own copy helps to avoid lock contention. 114 func (re *Regexp) Copy() *Regexp { 115 // It is not safe to copy Regexp by value 116 // since it contains a sync.Mutex. 117 return &Regexp{ 118 regexpRO: re.regexpRO, 119 } 120 } 121 122 // Compile parses a regular expression and returns, if successful, 123 // a Regexp object that can be used to match against text. 124 // 125 // When matching against text, the regexp returns a match that 126 // begins as early as possible in the input (leftmost), and among those 127 // it chooses the one that a backtracking search would have found first. 128 // This so-called leftmost-first matching is the same semantics 129 // that Perl, Python, and other implementations use, although this 130 // package implements it without the expense of backtracking. 131 // For POSIX leftmost-longest matching, see CompilePOSIX. 
132 func Compile(expr string) (*Regexp, error) { 133 return compile(expr, syntax.Perl, false) 134 } 135 136 // CompilePOSIX is like Compile but restricts the regular expression 137 // to POSIX ERE (egrep) syntax and changes the match semantics to 138 // leftmost-longest. 139 // 140 // That is, when matching against text, the regexp returns a match that 141 // begins as early as possible in the input (leftmost), and among those 142 // it chooses a match that is as long as possible. 143 // This so-called leftmost-longest matching is the same semantics 144 // that early regular expression implementations used and that POSIX 145 // specifies. 146 // 147 // However, there can be multiple leftmost-longest matches, with different 148 // submatch choices, and here this package diverges from POSIX. 149 // Among the possible leftmost-longest matches, this package chooses 150 // the one that a backtracking search would have found first, while POSIX 151 // specifies that the match be chosen to maximize the length of the first 152 // subexpression, then the second, and so on from left to right. 153 // The POSIX rule is computationally prohibitive and not even well-defined. 154 // See http://swtch.com/~rsc/regexp/regexp2.html#posix for details. 155 func CompilePOSIX(expr string) (*Regexp, error) { 156 return compile(expr, syntax.POSIX, true) 157 } 158 159 // Longest makes future searches prefer the leftmost-longest match. 160 // That is, when matching against text, the regexp returns a match that 161 // begins as early as possible in the input (leftmost), and among those 162 // it chooses a match that is as long as possible. 163 // This method modifies the Regexp and may not be called concurrently 164 // with any other methods. 
165 func (re *Regexp) Longest() { 166 re.longest = true 167 } 168 169 func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) { 170 re, err := syntax.Parse(expr, mode) 171 if err != nil { 172 return nil, err 173 } 174 maxCap := re.MaxCap() 175 capNames := re.CapNames() 176 177 re = re.Simplify() 178 prog, err := syntax.Compile(re) 179 if err != nil { 180 return nil, err 181 } 182 regexp := &Regexp{ 183 regexpRO: regexpRO{ 184 expr: expr, 185 prog: prog, 186 onepass: compileOnePass(prog), 187 numSubexp: maxCap, 188 subexpNames: capNames, 189 cond: prog.StartCond(), 190 longest: longest, 191 }, 192 } 193 if regexp.onepass == notOnePass { 194 regexp.prefix, regexp.prefixComplete = prog.Prefix() 195 } else { 196 regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog) 197 } 198 if regexp.prefix != "" { 199 // TODO(rsc): Remove this allocation by adding 200 // IndexString to package bytes. 201 regexp.prefixBytes = []byte(regexp.prefix) 202 regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix) 203 } 204 return regexp, nil 205 } 206 207 // get returns a machine to use for matching re. 208 // It uses the re's machine cache if possible, to avoid 209 // unnecessary allocation. 210 func (re *Regexp) get() *machine { 211 re.mu.Lock() 212 if n := len(re.machine); n > 0 { 213 z := re.machine[n-1] 214 re.machine = re.machine[:n-1] 215 re.mu.Unlock() 216 return z 217 } 218 re.mu.Unlock() 219 z := progMachine(re.prog, re.onepass) 220 z.re = re 221 return z 222 } 223 224 // put returns a machine to the re's machine cache. 225 // There is no attempt to limit the size of the cache, so it will 226 // grow to the maximum number of simultaneous matches 227 // run using re. (The cache empties when re gets garbage collected.) 228 func (re *Regexp) put(z *machine) { 229 // Remove references to input data that we no longer need. 
230 z.inputBytes.str = nil 231 z.inputString.str = "" 232 z.inputReader.r = nil 233 234 re.mu.Lock() 235 re.machine = append(re.machine, z) 236 re.mu.Unlock() 237 } 238 239 // MustCompile is like Compile but panics if the expression cannot be parsed. 240 // It simplifies safe initialization of global variables holding compiled regular 241 // expressions. 242 func MustCompile(str string) *Regexp { 243 regexp, err := Compile(str) 244 if err != nil { 245 panic(`regexp: Compile(` + quote(str) + `): ` + err.Error()) 246 } 247 return regexp 248 } 249 250 // MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed. 251 // It simplifies safe initialization of global variables holding compiled regular 252 // expressions. 253 func MustCompilePOSIX(str string) *Regexp { 254 regexp, err := CompilePOSIX(str) 255 if err != nil { 256 panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + err.Error()) 257 } 258 return regexp 259 } 260 261 func quote(s string) string { 262 if strconv.CanBackquote(s) { 263 return "`" + s + "`" 264 } 265 return strconv.Quote(s) 266 } 267 268 // NumSubexp returns the number of parenthesized subexpressions in this Regexp. 269 func (re *Regexp) NumSubexp() int { 270 return re.numSubexp 271 } 272 273 // SubexpNames returns the names of the parenthesized subexpressions 274 // in this Regexp. The name for the first sub-expression is names[1], 275 // so that if m is a match slice, the name for m[i] is SubexpNames()[i]. 276 // Since the Regexp as a whole cannot be named, names[0] is always 277 // the empty string. The slice should not be modified. 278 func (re *Regexp) SubexpNames() []string { 279 return re.subexpNames 280 } 281 282 const endOfText rune = -1 283 284 // input abstracts different representations of the input text. It provides 285 // one-character lookahead. 286 type input interface { 287 step(pos int) (r rune, width int) // advance one rune 288 canCheckPrefix() bool // can we look ahead without losing info? 
289 hasPrefix(re *Regexp) bool 290 index(re *Regexp, pos int) int 291 context(pos int) syntax.EmptyOp 292 } 293 294 // inputString scans a string. 295 type inputString struct { 296 str string 297 } 298 299 func (i *inputString) step(pos int) (rune, int) { 300 if pos < len(i.str) { 301 c := i.str[pos] 302 if c < utf8.RuneSelf { 303 return rune(c), 1 304 } 305 return utf8.DecodeRuneInString(i.str[pos:]) 306 } 307 return endOfText, 0 308 } 309 310 func (i *inputString) canCheckPrefix() bool { 311 return true 312 } 313 314 func (i *inputString) hasPrefix(re *Regexp) bool { 315 return strings.HasPrefix(i.str, re.prefix) 316 } 317 318 func (i *inputString) index(re *Regexp, pos int) int { 319 return strings.Index(i.str[pos:], re.prefix) 320 } 321 322 func (i *inputString) context(pos int) syntax.EmptyOp { 323 r1, r2 := endOfText, endOfText 324 // 0 < pos && pos <= len(i.str) 325 if uint(pos-1) < uint(len(i.str)) { 326 r1 = rune(i.str[pos-1]) 327 if r1 >= utf8.RuneSelf { 328 r1, _ = utf8.DecodeLastRuneInString(i.str[:pos]) 329 } 330 } 331 // 0 <= pos && pos < len(i.str) 332 if uint(pos) < uint(len(i.str)) { 333 r2 = rune(i.str[pos]) 334 if r2 >= utf8.RuneSelf { 335 r2, _ = utf8.DecodeRuneInString(i.str[pos:]) 336 } 337 } 338 return syntax.EmptyOpContext(r1, r2) 339 } 340 341 // inputBytes scans a byte slice. 
342 type inputBytes struct { 343 str []byte 344 } 345 346 func (i *inputBytes) step(pos int) (rune, int) { 347 if pos < len(i.str) { 348 c := i.str[pos] 349 if c < utf8.RuneSelf { 350 return rune(c), 1 351 } 352 return utf8.DecodeRune(i.str[pos:]) 353 } 354 return endOfText, 0 355 } 356 357 func (i *inputBytes) canCheckPrefix() bool { 358 return true 359 } 360 361 func (i *inputBytes) hasPrefix(re *Regexp) bool { 362 return bytes.HasPrefix(i.str, re.prefixBytes) 363 } 364 365 func (i *inputBytes) index(re *Regexp, pos int) int { 366 return bytes.Index(i.str[pos:], re.prefixBytes) 367 } 368 369 func (i *inputBytes) context(pos int) syntax.EmptyOp { 370 r1, r2 := endOfText, endOfText 371 // 0 < pos && pos <= len(i.str) 372 if uint(pos-1) < uint(len(i.str)) { 373 r1 = rune(i.str[pos-1]) 374 if r1 >= utf8.RuneSelf { 375 r1, _ = utf8.DecodeLastRune(i.str[:pos]) 376 } 377 } 378 // 0 <= pos && pos < len(i.str) 379 if uint(pos) < uint(len(i.str)) { 380 r2 = rune(i.str[pos]) 381 if r2 >= utf8.RuneSelf { 382 r2, _ = utf8.DecodeRune(i.str[pos:]) 383 } 384 } 385 return syntax.EmptyOpContext(r1, r2) 386 } 387 388 // inputReader scans a RuneReader. 389 type inputReader struct { 390 r io.RuneReader 391 atEOT bool 392 pos int 393 } 394 395 func (i *inputReader) step(pos int) (rune, int) { 396 if !i.atEOT && pos != i.pos { 397 return endOfText, 0 398 399 } 400 r, w, err := i.r.ReadRune() 401 if err != nil { 402 i.atEOT = true 403 return endOfText, 0 404 } 405 i.pos += w 406 return r, w 407 } 408 409 func (i *inputReader) canCheckPrefix() bool { 410 return false 411 } 412 413 func (i *inputReader) hasPrefix(re *Regexp) bool { 414 return false 415 } 416 417 func (i *inputReader) index(re *Regexp, pos int) int { 418 return -1 419 } 420 421 func (i *inputReader) context(pos int) syntax.EmptyOp { 422 return 0 423 } 424 425 // LiteralPrefix returns a literal string that must begin any match 426 // of the regular expression re. 
It returns the boolean true if the 427 // literal string comprises the entire regular expression. 428 func (re *Regexp) LiteralPrefix() (prefix string, complete bool) { 429 return re.prefix, re.prefixComplete 430 } 431 432 // MatchReader reports whether the Regexp matches the text read by the 433 // RuneReader. 434 func (re *Regexp) MatchReader(r io.RuneReader) bool { 435 return re.doMatch(r, nil, "") 436 } 437 438 // MatchString reports whether the Regexp matches the string s. 439 func (re *Regexp) MatchString(s string) bool { 440 return re.doMatch(nil, nil, s) 441 } 442 443 // Match reports whether the Regexp matches the byte slice b. 444 func (re *Regexp) Match(b []byte) bool { 445 return re.doMatch(nil, b, "") 446 } 447 448 // MatchReader checks whether a textual regular expression matches the text 449 // read by the RuneReader. More complicated queries need to use Compile and 450 // the full Regexp interface. 451 func MatchReader(pattern string, r io.RuneReader) (matched bool, err error) { 452 re, err := Compile(pattern) 453 if err != nil { 454 return false, err 455 } 456 return re.MatchReader(r), nil 457 } 458 459 // MatchString checks whether a textual regular expression 460 // matches a string. More complicated queries need 461 // to use Compile and the full Regexp interface. 462 func MatchString(pattern string, s string) (matched bool, err error) { 463 re, err := Compile(pattern) 464 if err != nil { 465 return false, err 466 } 467 return re.MatchString(s), nil 468 } 469 470 // Match checks whether a textual regular expression 471 // matches a byte slice. More complicated queries need 472 // to use Compile and the full Regexp interface. 473 func Match(pattern string, b []byte) (matched bool, err error) { 474 re, err := Compile(pattern) 475 if err != nil { 476 return false, err 477 } 478 return re.Match(b), nil 479 } 480 481 // ReplaceAllString returns a copy of src, replacing matches of the Regexp 482 // with the replacement string repl. 
Inside repl, $ signs are interpreted as 483 // in Expand, so for instance $1 represents the text of the first submatch. 484 func (re *Regexp) ReplaceAllString(src, repl string) string { 485 n := 2 486 if strings.Contains(repl, "$") { 487 n = 2 * (re.numSubexp + 1) 488 } 489 b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte { 490 return re.expand(dst, repl, nil, src, match) 491 }) 492 return string(b) 493 } 494 495 // ReplaceAllLiteralString returns a copy of src, replacing matches of the Regexp 496 // with the replacement string repl. The replacement repl is substituted directly, 497 // without using Expand. 498 func (re *Regexp) ReplaceAllLiteralString(src, repl string) string { 499 return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { 500 return append(dst, repl...) 501 })) 502 } 503 504 // ReplaceAllStringFunc returns a copy of src in which all matches of the 505 // Regexp have been replaced by the return value of function repl applied 506 // to the matched substring. The replacement returned by repl is substituted 507 // directly, without using Expand. 508 func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string { 509 b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { 510 return append(dst, repl(src[match[0]:match[1]])...) 
511 }) 512 return string(b) 513 } 514 515 func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte { 516 lastMatchEnd := 0 // end position of the most recent match 517 searchPos := 0 // position where we next look for a match 518 var buf []byte 519 var endPos int 520 if bsrc != nil { 521 endPos = len(bsrc) 522 } else { 523 endPos = len(src) 524 } 525 if nmatch > re.prog.NumCap { 526 nmatch = re.prog.NumCap 527 } 528 529 var dstCap [2]int 530 for searchPos <= endPos { 531 a := re.doExecute(nil, bsrc, src, searchPos, nmatch, dstCap[:0]) 532 if len(a) == 0 { 533 break // no more matches 534 } 535 536 // Copy the unmatched characters before this match. 537 if bsrc != nil { 538 buf = append(buf, bsrc[lastMatchEnd:a[0]]...) 539 } else { 540 buf = append(buf, src[lastMatchEnd:a[0]]...) 541 } 542 543 // Now insert a copy of the replacement string, but not for a 544 // match of the empty string immediately after another match. 545 // (Otherwise, we get double replacement for patterns that 546 // match both empty and nonempty strings.) 547 if a[1] > lastMatchEnd || a[0] == 0 { 548 buf = repl(buf, a) 549 } 550 lastMatchEnd = a[1] 551 552 // Advance past this match; always advance at least one character. 553 var width int 554 if bsrc != nil { 555 _, width = utf8.DecodeRune(bsrc[searchPos:]) 556 } else { 557 _, width = utf8.DecodeRuneInString(src[searchPos:]) 558 } 559 if searchPos+width > a[1] { 560 searchPos += width 561 } else if searchPos+1 > a[1] { 562 // This clause is only needed at the end of the input 563 // string. In that case, DecodeRuneInString returns width=0. 564 searchPos++ 565 } else { 566 searchPos = a[1] 567 } 568 } 569 570 // Copy the unmatched characters after the last match. 571 if bsrc != nil { 572 buf = append(buf, bsrc[lastMatchEnd:]...) 573 } else { 574 buf = append(buf, src[lastMatchEnd:]...) 
575 } 576 577 return buf 578 } 579 580 // ReplaceAll returns a copy of src, replacing matches of the Regexp 581 // with the replacement text repl. Inside repl, $ signs are interpreted as 582 // in Expand, so for instance $1 represents the text of the first submatch. 583 func (re *Regexp) ReplaceAll(src, repl []byte) []byte { 584 n := 2 585 if bytes.IndexByte(repl, '$') >= 0 { 586 n = 2 * (re.numSubexp + 1) 587 } 588 srepl := "" 589 b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte { 590 if len(srepl) != len(repl) { 591 srepl = string(repl) 592 } 593 return re.expand(dst, srepl, src, "", match) 594 }) 595 return b 596 } 597 598 // ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp 599 // with the replacement bytes repl. The replacement repl is substituted directly, 600 // without using Expand. 601 func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte { 602 return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { 603 return append(dst, repl...) 604 }) 605 } 606 607 // ReplaceAllFunc returns a copy of src in which all matches of the 608 // Regexp have been replaced by the return value of function repl applied 609 // to the matched byte slice. The replacement returned by repl is substituted 610 // directly, without using Expand. 611 func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { 612 return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { 613 return append(dst, repl(src[match[0]:match[1]])...) 614 }) 615 } 616 617 // Bitmap used by func special to check whether a character needs to be escaped. 618 var specialBytes [16]byte 619 620 // special reports whether byte b needs to be escaped by QuoteMeta. 
621 func special(b byte) bool { 622 return b < utf8.RuneSelf && specialBytes[b%16]&(1<<(b/16)) != 0 623 } 624 625 func init() { 626 for _, b := range []byte(`\.+*?()|[]{}^$`) { 627 specialBytes[b%16] |= 1 << (b / 16) 628 } 629 } 630 631 // QuoteMeta returns a string that quotes all regular expression metacharacters 632 // inside the argument text; the returned string is a regular expression matching 633 // the literal text. For example, QuoteMeta(`[foo]`) returns `\[foo\]`. 634 func QuoteMeta(s string) string { 635 // A byte loop is correct because all metacharacters are ASCII. 636 var i int 637 for i = 0; i < len(s); i++ { 638 if special(s[i]) { 639 break 640 } 641 } 642 // No meta characters found, so return original string. 643 if i >= len(s) { 644 return s 645 } 646 647 b := make([]byte, 2*len(s)-i) 648 copy(b, s[:i]) 649 j := i 650 for ; i < len(s); i++ { 651 if special(s[i]) { 652 b[j] = '\\' 653 j++ 654 } 655 b[j] = s[i] 656 j++ 657 } 658 return string(b[:j]) 659 } 660 661 // The number of capture values in the program may correspond 662 // to fewer capturing expressions than are in the regexp. 663 // For example, "(a){0}" turns into an empty program, so the 664 // maximum capture in the program is 0 but we need to return 665 // an expression for \1. Pad appends -1s to the slice a as needed. 666 func (re *Regexp) pad(a []int) []int { 667 if a == nil { 668 // No match. 669 return nil 670 } 671 n := (1 + re.numSubexp) * 2 672 for len(a) < n { 673 a = append(a, -1) 674 } 675 return a 676 } 677 678 // Find matches in slice b if b is non-nil, otherwise find matches in string s. 
679 func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) { 680 var end int 681 if b == nil { 682 end = len(s) 683 } else { 684 end = len(b) 685 } 686 687 for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; { 688 matches := re.doExecute(nil, b, s, pos, re.prog.NumCap, nil) 689 if len(matches) == 0 { 690 break 691 } 692 693 accept := true 694 if matches[1] == pos { 695 // We've found an empty match. 696 if matches[0] == prevMatchEnd { 697 // We don't allow an empty match right 698 // after a previous match, so ignore it. 699 accept = false 700 } 701 var width int 702 // TODO: use step() 703 if b == nil { 704 _, width = utf8.DecodeRuneInString(s[pos:end]) 705 } else { 706 _, width = utf8.DecodeRune(b[pos:end]) 707 } 708 if width > 0 { 709 pos += width 710 } else { 711 pos = end + 1 712 } 713 } else { 714 pos = matches[1] 715 } 716 prevMatchEnd = matches[1] 717 718 if accept { 719 deliver(re.pad(matches)) 720 i++ 721 } 722 } 723 } 724 725 // Find returns a slice holding the text of the leftmost match in b of the regular expression. 726 // A return value of nil indicates no match. 727 func (re *Regexp) Find(b []byte) []byte { 728 var dstCap [2]int 729 a := re.doExecute(nil, b, "", 0, 2, dstCap[:0]) 730 if a == nil { 731 return nil 732 } 733 return b[a[0]:a[1]] 734 } 735 736 // FindIndex returns a two-element slice of integers defining the location of 737 // the leftmost match in b of the regular expression. The match itself is at 738 // b[loc[0]:loc[1]]. 739 // A return value of nil indicates no match. 740 func (re *Regexp) FindIndex(b []byte) (loc []int) { 741 a := re.doExecute(nil, b, "", 0, 2, nil) 742 if a == nil { 743 return nil 744 } 745 return a[0:2] 746 } 747 748 // FindString returns a string holding the text of the leftmost match in s of the regular 749 // expression. If there is no match, the return value is an empty string, 750 // but it will also be empty if the regular expression successfully matches 751 // an empty string. 
Use FindStringIndex or FindStringSubmatch if it is 752 // necessary to distinguish these cases. 753 func (re *Regexp) FindString(s string) string { 754 var dstCap [2]int 755 a := re.doExecute(nil, nil, s, 0, 2, dstCap[:0]) 756 if a == nil { 757 return "" 758 } 759 return s[a[0]:a[1]] 760 } 761 762 // FindStringIndex returns a two-element slice of integers defining the 763 // location of the leftmost match in s of the regular expression. The match 764 // itself is at s[loc[0]:loc[1]]. 765 // A return value of nil indicates no match. 766 func (re *Regexp) FindStringIndex(s string) (loc []int) { 767 a := re.doExecute(nil, nil, s, 0, 2, nil) 768 if a == nil { 769 return nil 770 } 771 return a[0:2] 772 } 773 774 // FindReaderIndex returns a two-element slice of integers defining the 775 // location of the leftmost match of the regular expression in text read from 776 // the RuneReader. The match text was found in the input stream at 777 // byte offset loc[0] through loc[1]-1. 778 // A return value of nil indicates no match. 779 func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) { 780 a := re.doExecute(r, nil, "", 0, 2, nil) 781 if a == nil { 782 return nil 783 } 784 return a[0:2] 785 } 786 787 // FindSubmatch returns a slice of slices holding the text of the leftmost 788 // match of the regular expression in b and the matches, if any, of its 789 // subexpressions, as defined by the 'Submatch' descriptions in the package 790 // comment. 791 // A return value of nil indicates no match. 
792 func (re *Regexp) FindSubmatch(b []byte) [][]byte { 793 var dstCap [4]int 794 a := re.doExecute(nil, b, "", 0, re.prog.NumCap, dstCap[:0]) 795 if a == nil { 796 return nil 797 } 798 ret := make([][]byte, 1+re.numSubexp) 799 for i := range ret { 800 if 2*i < len(a) && a[2*i] >= 0 { 801 ret[i] = b[a[2*i]:a[2*i+1]] 802 } 803 } 804 return ret 805 } 806 807 // Expand appends template to dst and returns the result; during the 808 // append, Expand replaces variables in the template with corresponding 809 // matches drawn from src. The match slice should have been returned by 810 // FindSubmatchIndex. 811 // 812 // In the template, a variable is denoted by a substring of the form 813 // $name or ${name}, where name is a non-empty sequence of letters, 814 // digits, and underscores. A purely numeric name like $1 refers to 815 // the submatch with the corresponding index; other names refer to 816 // capturing parentheses named with the (?P<name>...) syntax. A 817 // reference to an out of range or unmatched index or a name that is not 818 // present in the regular expression is replaced with an empty slice. 819 // 820 // In the $name form, name is taken to be as long as possible: $1x is 821 // equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0. 822 // 823 // To insert a literal $ in the output, use $$ in the template. 824 func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte { 825 return re.expand(dst, string(template), src, "", match) 826 } 827 828 // ExpandString is like Expand but the template and source are strings. 829 // It appends to and returns a byte slice in order to give the calling 830 // code control over allocation. 
831 func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte { 832 return re.expand(dst, template, nil, src, match) 833 } 834 835 func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte { 836 for len(template) > 0 { 837 i := strings.Index(template, "$") 838 if i < 0 { 839 break 840 } 841 dst = append(dst, template[:i]...) 842 template = template[i:] 843 if len(template) > 1 && template[1] == '$' { 844 // Treat $$ as $. 845 dst = append(dst, '$') 846 template = template[2:] 847 continue 848 } 849 name, num, rest, ok := extract(template) 850 if !ok { 851 // Malformed; treat $ as raw text. 852 dst = append(dst, '$') 853 template = template[1:] 854 continue 855 } 856 template = rest 857 if num >= 0 { 858 if 2*num+1 < len(match) && match[2*num] >= 0 { 859 if bsrc != nil { 860 dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...) 861 } else { 862 dst = append(dst, src[match[2*num]:match[2*num+1]]...) 863 } 864 } 865 } else { 866 for i, namei := range re.subexpNames { 867 if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 { 868 if bsrc != nil { 869 dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...) 870 } else { 871 dst = append(dst, src[match[2*i]:match[2*i+1]]...) 872 } 873 break 874 } 875 } 876 } 877 } 878 dst = append(dst, template...) 879 return dst 880 } 881 882 // extract returns the name from a leading "$name" or "${name}" in str. 883 // If it is a number, extract returns num set to that number; otherwise num = -1. 
884 func extract(str string) (name string, num int, rest string, ok bool) { 885 if len(str) < 2 || str[0] != '$' { 886 return 887 } 888 brace := false 889 if str[1] == '{' { 890 brace = true 891 str = str[2:] 892 } else { 893 str = str[1:] 894 } 895 i := 0 896 for i < len(str) { 897 rune, size := utf8.DecodeRuneInString(str[i:]) 898 if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' { 899 break 900 } 901 i += size 902 } 903 if i == 0 { 904 // empty name is not okay 905 return 906 } 907 name = str[:i] 908 if brace { 909 if i >= len(str) || str[i] != '}' { 910 // missing closing brace 911 return 912 } 913 i++ 914 } 915 916 // Parse number. 917 num = 0 918 for i := 0; i < len(name); i++ { 919 if name[i] < '0' || '9' < name[i] || num >= 1e8 { 920 num = -1 921 break 922 } 923 num = num*10 + int(name[i]) - '0' 924 } 925 // Disallow leading zeros. 926 if name[0] == '0' && len(name) > 1 { 927 num = -1 928 } 929 930 rest = str[i:] 931 ok = true 932 return 933 } 934 935 // FindSubmatchIndex returns a slice holding the index pairs identifying the 936 // leftmost match of the regular expression in b and the matches, if any, of 937 // its subexpressions, as defined by the 'Submatch' and 'Index' descriptions 938 // in the package comment. 939 // A return value of nil indicates no match. 940 func (re *Regexp) FindSubmatchIndex(b []byte) []int { 941 return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap, nil)) 942 } 943 944 // FindStringSubmatch returns a slice of strings holding the text of the 945 // leftmost match of the regular expression in s and the matches, if any, of 946 // its subexpressions, as defined by the 'Submatch' description in the 947 // package comment. 948 // A return value of nil indicates no match. 
949 func (re *Regexp) FindStringSubmatch(s string) []string { 950 var dstCap [4]int 951 a := re.doExecute(nil, nil, s, 0, re.prog.NumCap, dstCap[:0]) 952 if a == nil { 953 return nil 954 } 955 ret := make([]string, 1+re.numSubexp) 956 for i := range ret { 957 if 2*i < len(a) && a[2*i] >= 0 { 958 ret[i] = s[a[2*i]:a[2*i+1]] 959 } 960 } 961 return ret 962 } 963 964 // FindStringSubmatchIndex returns a slice holding the index pairs 965 // identifying the leftmost match of the regular expression in s and the 966 // matches, if any, of its subexpressions, as defined by the 'Submatch' and 967 // 'Index' descriptions in the package comment. 968 // A return value of nil indicates no match. 969 func (re *Regexp) FindStringSubmatchIndex(s string) []int { 970 return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap, nil)) 971 } 972 973 // FindReaderSubmatchIndex returns a slice holding the index pairs 974 // identifying the leftmost match of the regular expression of text read by 975 // the RuneReader, and the matches, if any, of its subexpressions, as defined 976 // by the 'Submatch' and 'Index' descriptions in the package comment. A 977 // return value of nil indicates no match. 978 func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int { 979 return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap, nil)) 980 } 981 982 const startSize = 10 // The size at which to start a slice in the 'All' routines. 983 984 // FindAll is the 'All' version of Find; it returns a slice of all successive 985 // matches of the expression, as defined by the 'All' description in the 986 // package comment. 987 // A return value of nil indicates no match. 
988 func (re *Regexp) FindAll(b []byte, n int) [][]byte { 989 if n < 0 { 990 n = len(b) + 1 991 } 992 var result [][]byte 993 re.allMatches("", b, n, func(match []int) { 994 if result == nil { 995 result = make([][]byte, 0, startSize) 996 } 997 result = append(result, b[match[0]:match[1]]) 998 }) 999 return result 1000 } 1001 1002 // FindAllIndex is the 'All' version of FindIndex; it returns a slice of all 1003 // successive matches of the expression, as defined by the 'All' description 1004 // in the package comment. 1005 // A return value of nil indicates no match. 1006 func (re *Regexp) FindAllIndex(b []byte, n int) [][]int { 1007 if n < 0 { 1008 n = len(b) + 1 1009 } 1010 var result [][]int 1011 re.allMatches("", b, n, func(match []int) { 1012 if result == nil { 1013 result = make([][]int, 0, startSize) 1014 } 1015 result = append(result, match[0:2]) 1016 }) 1017 return result 1018 } 1019 1020 // FindAllString is the 'All' version of FindString; it returns a slice of all 1021 // successive matches of the expression, as defined by the 'All' description 1022 // in the package comment. 1023 // A return value of nil indicates no match. 1024 func (re *Regexp) FindAllString(s string, n int) []string { 1025 if n < 0 { 1026 n = len(s) + 1 1027 } 1028 var result []string 1029 re.allMatches(s, nil, n, func(match []int) { 1030 if result == nil { 1031 result = make([]string, 0, startSize) 1032 } 1033 result = append(result, s[match[0]:match[1]]) 1034 }) 1035 return result 1036 } 1037 1038 // FindAllStringIndex is the 'All' version of FindStringIndex; it returns a 1039 // slice of all successive matches of the expression, as defined by the 'All' 1040 // description in the package comment. 1041 // A return value of nil indicates no match. 
1042 func (re *Regexp) FindAllStringIndex(s string, n int) [][]int { 1043 if n < 0 { 1044 n = len(s) + 1 1045 } 1046 var result [][]int 1047 re.allMatches(s, nil, n, func(match []int) { 1048 if result == nil { 1049 result = make([][]int, 0, startSize) 1050 } 1051 result = append(result, match[0:2]) 1052 }) 1053 return result 1054 } 1055 1056 // FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice 1057 // of all successive matches of the expression, as defined by the 'All' 1058 // description in the package comment. 1059 // A return value of nil indicates no match. 1060 func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte { 1061 if n < 0 { 1062 n = len(b) + 1 1063 } 1064 var result [][][]byte 1065 re.allMatches("", b, n, func(match []int) { 1066 if result == nil { 1067 result = make([][][]byte, 0, startSize) 1068 } 1069 slice := make([][]byte, len(match)/2) 1070 for j := range slice { 1071 if match[2*j] >= 0 { 1072 slice[j] = b[match[2*j]:match[2*j+1]] 1073 } 1074 } 1075 result = append(result, slice) 1076 }) 1077 return result 1078 } 1079 1080 // FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns 1081 // a slice of all successive matches of the expression, as defined by the 1082 // 'All' description in the package comment. 1083 // A return value of nil indicates no match. 1084 func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int { 1085 if n < 0 { 1086 n = len(b) + 1 1087 } 1088 var result [][]int 1089 re.allMatches("", b, n, func(match []int) { 1090 if result == nil { 1091 result = make([][]int, 0, startSize) 1092 } 1093 result = append(result, match) 1094 }) 1095 return result 1096 } 1097 1098 // FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it 1099 // returns a slice of all successive matches of the expression, as defined by 1100 // the 'All' description in the package comment. 1101 // A return value of nil indicates no match. 
1102 func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string { 1103 if n < 0 { 1104 n = len(s) + 1 1105 } 1106 var result [][]string 1107 re.allMatches(s, nil, n, func(match []int) { 1108 if result == nil { 1109 result = make([][]string, 0, startSize) 1110 } 1111 slice := make([]string, len(match)/2) 1112 for j := range slice { 1113 if match[2*j] >= 0 { 1114 slice[j] = s[match[2*j]:match[2*j+1]] 1115 } 1116 } 1117 result = append(result, slice) 1118 }) 1119 return result 1120 } 1121 1122 // FindAllStringSubmatchIndex is the 'All' version of 1123 // FindStringSubmatchIndex; it returns a slice of all successive matches of 1124 // the expression, as defined by the 'All' description in the package 1125 // comment. 1126 // A return value of nil indicates no match. 1127 func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int { 1128 if n < 0 { 1129 n = len(s) + 1 1130 } 1131 var result [][]int 1132 re.allMatches(s, nil, n, func(match []int) { 1133 if result == nil { 1134 result = make([][]int, 0, startSize) 1135 } 1136 result = append(result, match) 1137 }) 1138 return result 1139 } 1140 1141 // Split slices s into substrings separated by the expression and returns a slice of 1142 // the substrings between those expression matches. 1143 // 1144 // The slice returned by this method consists of all the substrings of s 1145 // not contained in the slice returned by FindAllString. When called on an expression 1146 // that contains no metacharacters, it is equivalent to strings.SplitN. 1147 // 1148 // Example: 1149 // s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5) 1150 // // s: ["", "b", "b", "c", "cadaaae"] 1151 // 1152 // The count determines the number of substrings to return: 1153 // n > 0: at most n substrings; the last substring will be the unsplit remainder. 
1154 // n == 0: the result is nil (zero substrings) 1155 // n < 0: all substrings 1156 func (re *Regexp) Split(s string, n int) []string { 1157 1158 if n == 0 { 1159 return nil 1160 } 1161 1162 if len(re.expr) > 0 && len(s) == 0 { 1163 return []string{""} 1164 } 1165 1166 matches := re.FindAllStringIndex(s, n) 1167 strings := make([]string, 0, len(matches)) 1168 1169 beg := 0 1170 end := 0 1171 for _, match := range matches { 1172 if n > 0 && len(strings) >= n-1 { 1173 break 1174 } 1175 1176 end = match[0] 1177 if match[1] != 0 { 1178 strings = append(strings, s[beg:end]) 1179 } 1180 beg = match[1] 1181 } 1182 1183 if end != len(s) { 1184 strings = append(strings, s[beg:]) 1185 } 1186 1187 return strings 1188 }