github.com/rakyll/go@v0.0.0-20170216000551-64c02460d703/src/regexp/regexp.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package regexp implements regular expression search. 6 // 7 // The syntax of the regular expressions accepted is the same 8 // general syntax used by Perl, Python, and other languages. 9 // More precisely, it is the syntax accepted by RE2 and described at 10 // https://golang.org/s/re2syntax, except for \C. 11 // For an overview of the syntax, run 12 // go doc regexp/syntax 13 // 14 // The regexp implementation provided by this package is 15 // guaranteed to run in time linear in the size of the input. 16 // (This is a property not guaranteed by most open source 17 // implementations of regular expressions.) For more information 18 // about this property, see 19 // http://swtch.com/~rsc/regexp/regexp1.html 20 // or any book about automata theory. 21 // 22 // All characters are UTF-8-encoded code points. 23 // 24 // There are 16 methods of Regexp that match a regular expression and identify 25 // the matched text. Their names are matched by this regular expression: 26 // 27 // Find(All)?(String)?(Submatch)?(Index)? 28 // 29 // If 'All' is present, the routine matches successive non-overlapping 30 // matches of the entire expression. Empty matches abutting a preceding 31 // match are ignored. The return value is a slice containing the successive 32 // return values of the corresponding non-'All' routine. These routines take 33 // an extra integer argument, n; if n >= 0, the function returns at most n 34 // matches/submatches. 35 // 36 // If 'String' is present, the argument is a string; otherwise it is a slice 37 // of bytes; return values are adjusted as appropriate. 38 // 39 // If 'Submatch' is present, the return value is a slice identifying the 40 // successive submatches of the expression. 
Submatches are matches of 41 // parenthesized subexpressions (also known as capturing groups) within the 42 // regular expression, numbered from left to right in order of opening 43 // parenthesis. Submatch 0 is the match of the entire expression, submatch 1 44 // the match of the first parenthesized subexpression, and so on. 45 // 46 // If 'Index' is present, matches and submatches are identified by byte index 47 // pairs within the input string: result[2*n:2*n+2] identifies the indexes of 48 // the nth submatch. The pair for n==0 identifies the match of the entire 49 // expression. If 'Index' is not present, the match is identified by the 50 // text of the match/submatch. If an index is negative, it means that 51 // subexpression did not match any string in the input. 52 // 53 // There is also a subset of the methods that can be applied to text read 54 // from a RuneReader: 55 // 56 // MatchReader, FindReaderIndex, FindReaderSubmatchIndex 57 // 58 // This set may grow. Note that regular expression matches may need to 59 // examine text beyond the text returned by a match, so the methods that 60 // match text from a RuneReader may read arbitrarily far into the input 61 // before returning. 62 // 63 // (There are a few other methods that do not match this pattern.) 64 // 65 package regexp 66 67 import ( 68 "bytes" 69 "io" 70 "regexp/syntax" 71 "strconv" 72 "strings" 73 "sync" 74 "unicode" 75 "unicode/utf8" 76 ) 77 78 // Regexp is the representation of a compiled regular expression. 79 // A Regexp is safe for concurrent use by multiple goroutines. 
80 type Regexp struct { 81 // read-only after Compile 82 regexpRO 83 84 // cache of machines for running regexp 85 mu sync.Mutex 86 machine []*machine 87 } 88 89 type regexpRO struct { 90 expr string // as passed to Compile 91 prog *syntax.Prog // compiled program 92 onepass *onePassProg // onepass program or nil 93 prefix string // required prefix in unanchored matches 94 prefixBytes []byte // prefix, as a []byte 95 prefixComplete bool // prefix is the entire regexp 96 prefixRune rune // first rune in prefix 97 prefixEnd uint32 // pc for last rune in prefix 98 cond syntax.EmptyOp // empty-width conditions required at start of match 99 numSubexp int 100 subexpNames []string 101 longest bool 102 } 103 104 // String returns the source text used to compile the regular expression. 105 func (re *Regexp) String() string { 106 return re.expr 107 } 108 109 // Copy returns a new Regexp object copied from re. 110 // 111 // When using a Regexp in multiple goroutines, giving each goroutine 112 // its own copy helps to avoid lock contention. 113 func (re *Regexp) Copy() *Regexp { 114 // It is not safe to copy Regexp by value 115 // since it contains a sync.Mutex. 116 return &Regexp{ 117 regexpRO: re.regexpRO, 118 } 119 } 120 121 // Compile parses a regular expression and returns, if successful, 122 // a Regexp object that can be used to match against text. 123 // 124 // When matching against text, the regexp returns a match that 125 // begins as early as possible in the input (leftmost), and among those 126 // it chooses the one that a backtracking search would have found first. 127 // This so-called leftmost-first matching is the same semantics 128 // that Perl, Python, and other implementations use, although this 129 // package implements it without the expense of backtracking. 130 // For POSIX leftmost-longest matching, see CompilePOSIX. 
131 func Compile(expr string) (*Regexp, error) { 132 return compile(expr, syntax.Perl, false) 133 } 134 135 // CompilePOSIX is like Compile but restricts the regular expression 136 // to POSIX ERE (egrep) syntax and changes the match semantics to 137 // leftmost-longest. 138 // 139 // That is, when matching against text, the regexp returns a match that 140 // begins as early as possible in the input (leftmost), and among those 141 // it chooses a match that is as long as possible. 142 // This so-called leftmost-longest matching is the same semantics 143 // that early regular expression implementations used and that POSIX 144 // specifies. 145 // 146 // However, there can be multiple leftmost-longest matches, with different 147 // submatch choices, and here this package diverges from POSIX. 148 // Among the possible leftmost-longest matches, this package chooses 149 // the one that a backtracking search would have found first, while POSIX 150 // specifies that the match be chosen to maximize the length of the first 151 // subexpression, then the second, and so on from left to right. 152 // The POSIX rule is computationally prohibitive and not even well-defined. 153 // See http://swtch.com/~rsc/regexp/regexp2.html#posix for details. 154 func CompilePOSIX(expr string) (*Regexp, error) { 155 return compile(expr, syntax.POSIX, true) 156 } 157 158 // Longest makes future searches prefer the leftmost-longest match. 159 // That is, when matching against text, the regexp returns a match that 160 // begins as early as possible in the input (leftmost), and among those 161 // it chooses a match that is as long as possible. 
162 func (re *Regexp) Longest() { 163 re.longest = true 164 } 165 166 func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) { 167 re, err := syntax.Parse(expr, mode) 168 if err != nil { 169 return nil, err 170 } 171 maxCap := re.MaxCap() 172 capNames := re.CapNames() 173 174 re = re.Simplify() 175 prog, err := syntax.Compile(re) 176 if err != nil { 177 return nil, err 178 } 179 regexp := &Regexp{ 180 regexpRO: regexpRO{ 181 expr: expr, 182 prog: prog, 183 onepass: compileOnePass(prog), 184 numSubexp: maxCap, 185 subexpNames: capNames, 186 cond: prog.StartCond(), 187 longest: longest, 188 }, 189 } 190 if regexp.onepass == notOnePass { 191 regexp.prefix, regexp.prefixComplete = prog.Prefix() 192 } else { 193 regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog) 194 } 195 if regexp.prefix != "" { 196 // TODO(rsc): Remove this allocation by adding 197 // IndexString to package bytes. 198 regexp.prefixBytes = []byte(regexp.prefix) 199 regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix) 200 } 201 return regexp, nil 202 } 203 204 // get returns a machine to use for matching re. 205 // It uses the re's machine cache if possible, to avoid 206 // unnecessary allocation. 207 func (re *Regexp) get() *machine { 208 re.mu.Lock() 209 if n := len(re.machine); n > 0 { 210 z := re.machine[n-1] 211 re.machine = re.machine[:n-1] 212 re.mu.Unlock() 213 return z 214 } 215 re.mu.Unlock() 216 z := progMachine(re.prog, re.onepass) 217 z.re = re 218 return z 219 } 220 221 // put returns a machine to the re's machine cache. 222 // There is no attempt to limit the size of the cache, so it will 223 // grow to the maximum number of simultaneous matches 224 // run using re. (The cache empties when re gets garbage collected.) 225 func (re *Regexp) put(z *machine) { 226 re.mu.Lock() 227 re.machine = append(re.machine, z) 228 re.mu.Unlock() 229 } 230 231 // MustCompile is like Compile but panics if the expression cannot be parsed. 
232 // It simplifies safe initialization of global variables holding compiled regular 233 // expressions. 234 func MustCompile(str string) *Regexp { 235 regexp, error := Compile(str) 236 if error != nil { 237 panic(`regexp: Compile(` + quote(str) + `): ` + error.Error()) 238 } 239 return regexp 240 } 241 242 // MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed. 243 // It simplifies safe initialization of global variables holding compiled regular 244 // expressions. 245 func MustCompilePOSIX(str string) *Regexp { 246 regexp, error := CompilePOSIX(str) 247 if error != nil { 248 panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + error.Error()) 249 } 250 return regexp 251 } 252 253 func quote(s string) string { 254 if strconv.CanBackquote(s) { 255 return "`" + s + "`" 256 } 257 return strconv.Quote(s) 258 } 259 260 // NumSubexp returns the number of parenthesized subexpressions in this Regexp. 261 func (re *Regexp) NumSubexp() int { 262 return re.numSubexp 263 } 264 265 // SubexpNames returns the names of the parenthesized subexpressions 266 // in this Regexp. The name for the first sub-expression is names[1], 267 // so that if m is a match slice, the name for m[i] is SubexpNames()[i]. 268 // Since the Regexp as a whole cannot be named, names[0] is always 269 // the empty string. The slice should not be modified. 270 func (re *Regexp) SubexpNames() []string { 271 return re.subexpNames 272 } 273 274 const endOfText rune = -1 275 276 // input abstracts different representations of the input text. It provides 277 // one-character lookahead. 278 type input interface { 279 step(pos int) (r rune, width int) // advance one rune 280 canCheckPrefix() bool // can we look ahead without losing info? 281 hasPrefix(re *Regexp) bool 282 index(re *Regexp, pos int) int 283 context(pos int) syntax.EmptyOp 284 } 285 286 // inputString scans a string. 
287 type inputString struct { 288 str string 289 } 290 291 func (i *inputString) step(pos int) (rune, int) { 292 if pos < len(i.str) { 293 c := i.str[pos] 294 if c < utf8.RuneSelf { 295 return rune(c), 1 296 } 297 return utf8.DecodeRuneInString(i.str[pos:]) 298 } 299 return endOfText, 0 300 } 301 302 func (i *inputString) canCheckPrefix() bool { 303 return true 304 } 305 306 func (i *inputString) hasPrefix(re *Regexp) bool { 307 return strings.HasPrefix(i.str, re.prefix) 308 } 309 310 func (i *inputString) index(re *Regexp, pos int) int { 311 return strings.Index(i.str[pos:], re.prefix) 312 } 313 314 func (i *inputString) context(pos int) syntax.EmptyOp { 315 r1, r2 := endOfText, endOfText 316 if pos > 0 && pos <= len(i.str) { 317 r1, _ = utf8.DecodeLastRuneInString(i.str[:pos]) 318 } 319 if pos < len(i.str) { 320 r2, _ = utf8.DecodeRuneInString(i.str[pos:]) 321 } 322 return syntax.EmptyOpContext(r1, r2) 323 } 324 325 // inputBytes scans a byte slice. 326 type inputBytes struct { 327 str []byte 328 } 329 330 func (i *inputBytes) step(pos int) (rune, int) { 331 if pos < len(i.str) { 332 c := i.str[pos] 333 if c < utf8.RuneSelf { 334 return rune(c), 1 335 } 336 return utf8.DecodeRune(i.str[pos:]) 337 } 338 return endOfText, 0 339 } 340 341 func (i *inputBytes) canCheckPrefix() bool { 342 return true 343 } 344 345 func (i *inputBytes) hasPrefix(re *Regexp) bool { 346 return bytes.HasPrefix(i.str, re.prefixBytes) 347 } 348 349 func (i *inputBytes) index(re *Regexp, pos int) int { 350 return bytes.Index(i.str[pos:], re.prefixBytes) 351 } 352 353 func (i *inputBytes) context(pos int) syntax.EmptyOp { 354 r1, r2 := endOfText, endOfText 355 if pos > 0 && pos <= len(i.str) { 356 r1, _ = utf8.DecodeLastRune(i.str[:pos]) 357 } 358 if pos < len(i.str) { 359 r2, _ = utf8.DecodeRune(i.str[pos:]) 360 } 361 return syntax.EmptyOpContext(r1, r2) 362 } 363 364 // inputReader scans a RuneReader. 
365 type inputReader struct { 366 r io.RuneReader 367 atEOT bool 368 pos int 369 } 370 371 func (i *inputReader) step(pos int) (rune, int) { 372 if !i.atEOT && pos != i.pos { 373 return endOfText, 0 374 375 } 376 r, w, err := i.r.ReadRune() 377 if err != nil { 378 i.atEOT = true 379 return endOfText, 0 380 } 381 i.pos += w 382 return r, w 383 } 384 385 func (i *inputReader) canCheckPrefix() bool { 386 return false 387 } 388 389 func (i *inputReader) hasPrefix(re *Regexp) bool { 390 return false 391 } 392 393 func (i *inputReader) index(re *Regexp, pos int) int { 394 return -1 395 } 396 397 func (i *inputReader) context(pos int) syntax.EmptyOp { 398 return 0 399 } 400 401 // LiteralPrefix returns a literal string that must begin any match 402 // of the regular expression re. It returns the boolean true if the 403 // literal string comprises the entire regular expression. 404 func (re *Regexp) LiteralPrefix() (prefix string, complete bool) { 405 return re.prefix, re.prefixComplete 406 } 407 408 // MatchReader reports whether the Regexp matches the text read by the 409 // RuneReader. 410 func (re *Regexp) MatchReader(r io.RuneReader) bool { 411 return re.doMatch(r, nil, "") 412 } 413 414 // MatchString reports whether the Regexp matches the string s. 415 func (re *Regexp) MatchString(s string) bool { 416 return re.doMatch(nil, nil, s) 417 } 418 419 // Match reports whether the Regexp matches the byte slice b. 420 func (re *Regexp) Match(b []byte) bool { 421 return re.doMatch(nil, b, "") 422 } 423 424 // MatchReader checks whether a textual regular expression matches the text 425 // read by the RuneReader. More complicated queries need to use Compile and 426 // the full Regexp interface. 
427 func MatchReader(pattern string, r io.RuneReader) (matched bool, err error) { 428 re, err := Compile(pattern) 429 if err != nil { 430 return false, err 431 } 432 return re.MatchReader(r), nil 433 } 434 435 // MatchString checks whether a textual regular expression 436 // matches a string. More complicated queries need 437 // to use Compile and the full Regexp interface. 438 func MatchString(pattern string, s string) (matched bool, err error) { 439 re, err := Compile(pattern) 440 if err != nil { 441 return false, err 442 } 443 return re.MatchString(s), nil 444 } 445 446 // Match checks whether a textual regular expression 447 // matches a byte slice. More complicated queries need 448 // to use Compile and the full Regexp interface. 449 func Match(pattern string, b []byte) (matched bool, err error) { 450 re, err := Compile(pattern) 451 if err != nil { 452 return false, err 453 } 454 return re.Match(b), nil 455 } 456 457 // ReplaceAllString returns a copy of src, replacing matches of the Regexp 458 // with the replacement string repl. Inside repl, $ signs are interpreted as 459 // in Expand, so for instance $1 represents the text of the first submatch. 460 func (re *Regexp) ReplaceAllString(src, repl string) string { 461 n := 2 462 if strings.Contains(repl, "$") { 463 n = 2 * (re.numSubexp + 1) 464 } 465 b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte { 466 return re.expand(dst, repl, nil, src, match) 467 }) 468 return string(b) 469 } 470 471 // ReplaceAllLiteralString returns a copy of src, replacing matches of the Regexp 472 // with the replacement string repl. The replacement repl is substituted directly, 473 // without using Expand. 474 func (re *Regexp) ReplaceAllLiteralString(src, repl string) string { 475 return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { 476 return append(dst, repl...) 
477 })) 478 } 479 480 // ReplaceAllStringFunc returns a copy of src in which all matches of the 481 // Regexp have been replaced by the return value of function repl applied 482 // to the matched substring. The replacement returned by repl is substituted 483 // directly, without using Expand. 484 func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string { 485 b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { 486 return append(dst, repl(src[match[0]:match[1]])...) 487 }) 488 return string(b) 489 } 490 491 func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte { 492 lastMatchEnd := 0 // end position of the most recent match 493 searchPos := 0 // position where we next look for a match 494 var buf []byte 495 var endPos int 496 if bsrc != nil { 497 endPos = len(bsrc) 498 } else { 499 endPos = len(src) 500 } 501 if nmatch > re.prog.NumCap { 502 nmatch = re.prog.NumCap 503 } 504 505 var dstCap [2]int 506 for searchPos <= endPos { 507 a := re.doExecute(nil, bsrc, src, searchPos, nmatch, dstCap[:0]) 508 if len(a) == 0 { 509 break // no more matches 510 } 511 512 // Copy the unmatched characters before this match. 513 if bsrc != nil { 514 buf = append(buf, bsrc[lastMatchEnd:a[0]]...) 515 } else { 516 buf = append(buf, src[lastMatchEnd:a[0]]...) 517 } 518 519 // Now insert a copy of the replacement string, but not for a 520 // match of the empty string immediately after another match. 521 // (Otherwise, we get double replacement for patterns that 522 // match both empty and nonempty strings.) 523 if a[1] > lastMatchEnd || a[0] == 0 { 524 buf = repl(buf, a) 525 } 526 lastMatchEnd = a[1] 527 528 // Advance past this match; always advance at least one character. 
529 var width int 530 if bsrc != nil { 531 _, width = utf8.DecodeRune(bsrc[searchPos:]) 532 } else { 533 _, width = utf8.DecodeRuneInString(src[searchPos:]) 534 } 535 if searchPos+width > a[1] { 536 searchPos += width 537 } else if searchPos+1 > a[1] { 538 // This clause is only needed at the end of the input 539 // string. In that case, DecodeRuneInString returns width=0. 540 searchPos++ 541 } else { 542 searchPos = a[1] 543 } 544 } 545 546 // Copy the unmatched characters after the last match. 547 if bsrc != nil { 548 buf = append(buf, bsrc[lastMatchEnd:]...) 549 } else { 550 buf = append(buf, src[lastMatchEnd:]...) 551 } 552 553 return buf 554 } 555 556 // ReplaceAll returns a copy of src, replacing matches of the Regexp 557 // with the replacement text repl. Inside repl, $ signs are interpreted as 558 // in Expand, so for instance $1 represents the text of the first submatch. 559 func (re *Regexp) ReplaceAll(src, repl []byte) []byte { 560 n := 2 561 if bytes.IndexByte(repl, '$') >= 0 { 562 n = 2 * (re.numSubexp + 1) 563 } 564 srepl := "" 565 b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte { 566 if len(srepl) != len(repl) { 567 srepl = string(repl) 568 } 569 return re.expand(dst, srepl, src, "", match) 570 }) 571 return b 572 } 573 574 // ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp 575 // with the replacement bytes repl. The replacement repl is substituted directly, 576 // without using Expand. 577 func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte { 578 return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { 579 return append(dst, repl...) 580 }) 581 } 582 583 // ReplaceAllFunc returns a copy of src in which all matches of the 584 // Regexp have been replaced by the return value of function repl applied 585 // to the matched byte slice. The replacement returned by repl is substituted 586 // directly, without using Expand. 
587 func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { 588 return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { 589 return append(dst, repl(src[match[0]:match[1]])...) 590 }) 591 } 592 593 var specialBytes = []byte(`\.+*?()|[]{}^$`) 594 595 func special(b byte) bool { 596 return bytes.IndexByte(specialBytes, b) >= 0 597 } 598 599 // QuoteMeta returns a string that quotes all regular expression metacharacters 600 // inside the argument text; the returned string is a regular expression matching 601 // the literal text. For example, QuoteMeta(`[foo]`) returns `\[foo\]`. 602 func QuoteMeta(s string) string { 603 // A byte loop is correct because all metacharacters are ASCII. 604 var i int 605 for i = 0; i < len(s); i++ { 606 if special(s[i]) { 607 break 608 } 609 } 610 // No meta characters found, so return original string. 611 if i >= len(s) { 612 return s 613 } 614 615 b := make([]byte, 2*len(s)-i) 616 copy(b, s[:i]) 617 j := i 618 for ; i < len(s); i++ { 619 if special(s[i]) { 620 b[j] = '\\' 621 j++ 622 } 623 b[j] = s[i] 624 j++ 625 } 626 return string(b[:j]) 627 } 628 629 // The number of capture values in the program may correspond 630 // to fewer capturing expressions than are in the regexp. 631 // For example, "(a){0}" turns into an empty program, so the 632 // maximum capture in the program is 0 but we need to return 633 // an expression for \1. Pad appends -1s to the slice a as needed. 634 func (re *Regexp) pad(a []int) []int { 635 if a == nil { 636 // No match. 637 return nil 638 } 639 n := (1 + re.numSubexp) * 2 640 for len(a) < n { 641 a = append(a, -1) 642 } 643 return a 644 } 645 646 // Find matches in slice b if b is non-nil, otherwise find matches in string s. 
647 func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) { 648 var end int 649 if b == nil { 650 end = len(s) 651 } else { 652 end = len(b) 653 } 654 655 for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; { 656 matches := re.doExecute(nil, b, s, pos, re.prog.NumCap, nil) 657 if len(matches) == 0 { 658 break 659 } 660 661 accept := true 662 if matches[1] == pos { 663 // We've found an empty match. 664 if matches[0] == prevMatchEnd { 665 // We don't allow an empty match right 666 // after a previous match, so ignore it. 667 accept = false 668 } 669 var width int 670 // TODO: use step() 671 if b == nil { 672 _, width = utf8.DecodeRuneInString(s[pos:end]) 673 } else { 674 _, width = utf8.DecodeRune(b[pos:end]) 675 } 676 if width > 0 { 677 pos += width 678 } else { 679 pos = end + 1 680 } 681 } else { 682 pos = matches[1] 683 } 684 prevMatchEnd = matches[1] 685 686 if accept { 687 deliver(re.pad(matches)) 688 i++ 689 } 690 } 691 } 692 693 // Find returns a slice holding the text of the leftmost match in b of the regular expression. 694 // A return value of nil indicates no match. 695 func (re *Regexp) Find(b []byte) []byte { 696 var dstCap [2]int 697 a := re.doExecute(nil, b, "", 0, 2, dstCap[:0]) 698 if a == nil { 699 return nil 700 } 701 return b[a[0]:a[1]] 702 } 703 704 // FindIndex returns a two-element slice of integers defining the location of 705 // the leftmost match in b of the regular expression. The match itself is at 706 // b[loc[0]:loc[1]]. 707 // A return value of nil indicates no match. 708 func (re *Regexp) FindIndex(b []byte) (loc []int) { 709 a := re.doExecute(nil, b, "", 0, 2, nil) 710 if a == nil { 711 return nil 712 } 713 return a[0:2] 714 } 715 716 // FindString returns a string holding the text of the leftmost match in s of the regular 717 // expression. If there is no match, the return value is an empty string, 718 // but it will also be empty if the regular expression successfully matches 719 // an empty string. 
Use FindStringIndex or FindStringSubmatch if it is 720 // necessary to distinguish these cases. 721 func (re *Regexp) FindString(s string) string { 722 var dstCap [2]int 723 a := re.doExecute(nil, nil, s, 0, 2, dstCap[:0]) 724 if a == nil { 725 return "" 726 } 727 return s[a[0]:a[1]] 728 } 729 730 // FindStringIndex returns a two-element slice of integers defining the 731 // location of the leftmost match in s of the regular expression. The match 732 // itself is at s[loc[0]:loc[1]]. 733 // A return value of nil indicates no match. 734 func (re *Regexp) FindStringIndex(s string) (loc []int) { 735 a := re.doExecute(nil, nil, s, 0, 2, nil) 736 if a == nil { 737 return nil 738 } 739 return a[0:2] 740 } 741 742 // FindReaderIndex returns a two-element slice of integers defining the 743 // location of the leftmost match of the regular expression in text read from 744 // the RuneReader. The match text was found in the input stream at 745 // byte offset loc[0] through loc[1]-1. 746 // A return value of nil indicates no match. 747 func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) { 748 a := re.doExecute(r, nil, "", 0, 2, nil) 749 if a == nil { 750 return nil 751 } 752 return a[0:2] 753 } 754 755 // FindSubmatch returns a slice of slices holding the text of the leftmost 756 // match of the regular expression in b and the matches, if any, of its 757 // subexpressions, as defined by the 'Submatch' descriptions in the package 758 // comment. 759 // A return value of nil indicates no match. 
760 func (re *Regexp) FindSubmatch(b []byte) [][]byte { 761 var dstCap [4]int 762 a := re.doExecute(nil, b, "", 0, re.prog.NumCap, dstCap[:0]) 763 if a == nil { 764 return nil 765 } 766 ret := make([][]byte, 1+re.numSubexp) 767 for i := range ret { 768 if 2*i < len(a) && a[2*i] >= 0 { 769 ret[i] = b[a[2*i]:a[2*i+1]] 770 } 771 } 772 return ret 773 } 774 775 // Expand appends template to dst and returns the result; during the 776 // append, Expand replaces variables in the template with corresponding 777 // matches drawn from src. The match slice should have been returned by 778 // FindSubmatchIndex. 779 // 780 // In the template, a variable is denoted by a substring of the form 781 // $name or ${name}, where name is a non-empty sequence of letters, 782 // digits, and underscores. A purely numeric name like $1 refers to 783 // the submatch with the corresponding index; other names refer to 784 // capturing parentheses named with the (?P<name>...) syntax. A 785 // reference to an out of range or unmatched index or a name that is not 786 // present in the regular expression is replaced with an empty slice. 787 // 788 // In the $name form, name is taken to be as long as possible: $1x is 789 // equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0. 790 // 791 // To insert a literal $ in the output, use $$ in the template. 792 func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte { 793 return re.expand(dst, string(template), src, "", match) 794 } 795 796 // ExpandString is like Expand but the template and source are strings. 797 // It appends to and returns a byte slice in order to give the calling 798 // code control over allocation. 
799 func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte { 800 return re.expand(dst, template, nil, src, match) 801 } 802 803 func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte { 804 for len(template) > 0 { 805 i := strings.Index(template, "$") 806 if i < 0 { 807 break 808 } 809 dst = append(dst, template[:i]...) 810 template = template[i:] 811 if len(template) > 1 && template[1] == '$' { 812 // Treat $$ as $. 813 dst = append(dst, '$') 814 template = template[2:] 815 continue 816 } 817 name, num, rest, ok := extract(template) 818 if !ok { 819 // Malformed; treat $ as raw text. 820 dst = append(dst, '$') 821 template = template[1:] 822 continue 823 } 824 template = rest 825 if num >= 0 { 826 if 2*num+1 < len(match) && match[2*num] >= 0 { 827 if bsrc != nil { 828 dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...) 829 } else { 830 dst = append(dst, src[match[2*num]:match[2*num+1]]...) 831 } 832 } 833 } else { 834 for i, namei := range re.subexpNames { 835 if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 { 836 if bsrc != nil { 837 dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...) 838 } else { 839 dst = append(dst, src[match[2*i]:match[2*i+1]]...) 840 } 841 break 842 } 843 } 844 } 845 } 846 dst = append(dst, template...) 847 return dst 848 } 849 850 // extract returns the name from a leading "$name" or "${name}" in str. 851 // If it is a number, extract returns num set to that number; otherwise num = -1. 
852 func extract(str string) (name string, num int, rest string, ok bool) { 853 if len(str) < 2 || str[0] != '$' { 854 return 855 } 856 brace := false 857 if str[1] == '{' { 858 brace = true 859 str = str[2:] 860 } else { 861 str = str[1:] 862 } 863 i := 0 864 for i < len(str) { 865 rune, size := utf8.DecodeRuneInString(str[i:]) 866 if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' { 867 break 868 } 869 i += size 870 } 871 if i == 0 { 872 // empty name is not okay 873 return 874 } 875 name = str[:i] 876 if brace { 877 if i >= len(str) || str[i] != '}' { 878 // missing closing brace 879 return 880 } 881 i++ 882 } 883 884 // Parse number. 885 num = 0 886 for i := 0; i < len(name); i++ { 887 if name[i] < '0' || '9' < name[i] || num >= 1e8 { 888 num = -1 889 break 890 } 891 num = num*10 + int(name[i]) - '0' 892 } 893 // Disallow leading zeros. 894 if name[0] == '0' && len(name) > 1 { 895 num = -1 896 } 897 898 rest = str[i:] 899 ok = true 900 return 901 } 902 903 // FindSubmatchIndex returns a slice holding the index pairs identifying the 904 // leftmost match of the regular expression in b and the matches, if any, of 905 // its subexpressions, as defined by the 'Submatch' and 'Index' descriptions 906 // in the package comment. 907 // A return value of nil indicates no match. 908 func (re *Regexp) FindSubmatchIndex(b []byte) []int { 909 return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap, nil)) 910 } 911 912 // FindStringSubmatch returns a slice of strings holding the text of the 913 // leftmost match of the regular expression in s and the matches, if any, of 914 // its subexpressions, as defined by the 'Submatch' description in the 915 // package comment. 916 // A return value of nil indicates no match. 
917 func (re *Regexp) FindStringSubmatch(s string) []string { 918 var dstCap [4]int 919 a := re.doExecute(nil, nil, s, 0, re.prog.NumCap, dstCap[:0]) 920 if a == nil { 921 return nil 922 } 923 ret := make([]string, 1+re.numSubexp) 924 for i := range ret { 925 if 2*i < len(a) && a[2*i] >= 0 { 926 ret[i] = s[a[2*i]:a[2*i+1]] 927 } 928 } 929 return ret 930 } 931 932 // FindStringSubmatchIndex returns a slice holding the index pairs 933 // identifying the leftmost match of the regular expression in s and the 934 // matches, if any, of its subexpressions, as defined by the 'Submatch' and 935 // 'Index' descriptions in the package comment. 936 // A return value of nil indicates no match. 937 func (re *Regexp) FindStringSubmatchIndex(s string) []int { 938 return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap, nil)) 939 } 940 941 // FindReaderSubmatchIndex returns a slice holding the index pairs 942 // identifying the leftmost match of the regular expression of text read by 943 // the RuneReader, and the matches, if any, of its subexpressions, as defined 944 // by the 'Submatch' and 'Index' descriptions in the package comment. A 945 // return value of nil indicates no match. 946 func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int { 947 return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap, nil)) 948 } 949 950 const startSize = 10 // The size at which to start a slice in the 'All' routines. 951 952 // FindAll is the 'All' version of Find; it returns a slice of all successive 953 // matches of the expression, as defined by the 'All' description in the 954 // package comment. 955 // A return value of nil indicates no match. 
956 func (re *Regexp) FindAll(b []byte, n int) [][]byte { 957 if n < 0 { 958 n = len(b) + 1 959 } 960 result := make([][]byte, 0, startSize) 961 re.allMatches("", b, n, func(match []int) { 962 result = append(result, b[match[0]:match[1]]) 963 }) 964 if len(result) == 0 { 965 return nil 966 } 967 return result 968 } 969 970 // FindAllIndex is the 'All' version of FindIndex; it returns a slice of all 971 // successive matches of the expression, as defined by the 'All' description 972 // in the package comment. 973 // A return value of nil indicates no match. 974 func (re *Regexp) FindAllIndex(b []byte, n int) [][]int { 975 if n < 0 { 976 n = len(b) + 1 977 } 978 result := make([][]int, 0, startSize) 979 re.allMatches("", b, n, func(match []int) { 980 result = append(result, match[0:2]) 981 }) 982 if len(result) == 0 { 983 return nil 984 } 985 return result 986 } 987 988 // FindAllString is the 'All' version of FindString; it returns a slice of all 989 // successive matches of the expression, as defined by the 'All' description 990 // in the package comment. 991 // A return value of nil indicates no match. 992 func (re *Regexp) FindAllString(s string, n int) []string { 993 if n < 0 { 994 n = len(s) + 1 995 } 996 result := make([]string, 0, startSize) 997 re.allMatches(s, nil, n, func(match []int) { 998 result = append(result, s[match[0]:match[1]]) 999 }) 1000 if len(result) == 0 { 1001 return nil 1002 } 1003 return result 1004 } 1005 1006 // FindAllStringIndex is the 'All' version of FindStringIndex; it returns a 1007 // slice of all successive matches of the expression, as defined by the 'All' 1008 // description in the package comment. 1009 // A return value of nil indicates no match. 
1010 func (re *Regexp) FindAllStringIndex(s string, n int) [][]int { 1011 if n < 0 { 1012 n = len(s) + 1 1013 } 1014 result := make([][]int, 0, startSize) 1015 re.allMatches(s, nil, n, func(match []int) { 1016 result = append(result, match[0:2]) 1017 }) 1018 if len(result) == 0 { 1019 return nil 1020 } 1021 return result 1022 } 1023 1024 // FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice 1025 // of all successive matches of the expression, as defined by the 'All' 1026 // description in the package comment. 1027 // A return value of nil indicates no match. 1028 func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte { 1029 if n < 0 { 1030 n = len(b) + 1 1031 } 1032 result := make([][][]byte, 0, startSize) 1033 re.allMatches("", b, n, func(match []int) { 1034 slice := make([][]byte, len(match)/2) 1035 for j := range slice { 1036 if match[2*j] >= 0 { 1037 slice[j] = b[match[2*j]:match[2*j+1]] 1038 } 1039 } 1040 result = append(result, slice) 1041 }) 1042 if len(result) == 0 { 1043 return nil 1044 } 1045 return result 1046 } 1047 1048 // FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns 1049 // a slice of all successive matches of the expression, as defined by the 1050 // 'All' description in the package comment. 1051 // A return value of nil indicates no match. 1052 func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int { 1053 if n < 0 { 1054 n = len(b) + 1 1055 } 1056 result := make([][]int, 0, startSize) 1057 re.allMatches("", b, n, func(match []int) { 1058 result = append(result, match) 1059 }) 1060 if len(result) == 0 { 1061 return nil 1062 } 1063 return result 1064 } 1065 1066 // FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it 1067 // returns a slice of all successive matches of the expression, as defined by 1068 // the 'All' description in the package comment. 1069 // A return value of nil indicates no match. 
1070 func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string { 1071 if n < 0 { 1072 n = len(s) + 1 1073 } 1074 result := make([][]string, 0, startSize) 1075 re.allMatches(s, nil, n, func(match []int) { 1076 slice := make([]string, len(match)/2) 1077 for j := range slice { 1078 if match[2*j] >= 0 { 1079 slice[j] = s[match[2*j]:match[2*j+1]] 1080 } 1081 } 1082 result = append(result, slice) 1083 }) 1084 if len(result) == 0 { 1085 return nil 1086 } 1087 return result 1088 } 1089 1090 // FindAllStringSubmatchIndex is the 'All' version of 1091 // FindStringSubmatchIndex; it returns a slice of all successive matches of 1092 // the expression, as defined by the 'All' description in the package 1093 // comment. 1094 // A return value of nil indicates no match. 1095 func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int { 1096 if n < 0 { 1097 n = len(s) + 1 1098 } 1099 result := make([][]int, 0, startSize) 1100 re.allMatches(s, nil, n, func(match []int) { 1101 result = append(result, match) 1102 }) 1103 if len(result) == 0 { 1104 return nil 1105 } 1106 return result 1107 } 1108 1109 // Split slices s into substrings separated by the expression and returns a slice of 1110 // the substrings between those expression matches. 1111 // 1112 // The slice returned by this method consists of all the substrings of s 1113 // not contained in the slice returned by FindAllString. When called on an expression 1114 // that contains no metacharacters, it is equivalent to strings.SplitN. 1115 // 1116 // Example: 1117 // s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5) 1118 // // s: ["", "b", "b", "c", "cadaaae"] 1119 // 1120 // The count determines the number of substrings to return: 1121 // n > 0: at most n substrings; the last substring will be the unsplit remainder. 
1122 // n == 0: the result is nil (zero substrings) 1123 // n < 0: all substrings 1124 func (re *Regexp) Split(s string, n int) []string { 1125 1126 if n == 0 { 1127 return nil 1128 } 1129 1130 if len(re.expr) > 0 && len(s) == 0 { 1131 return []string{""} 1132 } 1133 1134 matches := re.FindAllStringIndex(s, n) 1135 strings := make([]string, 0, len(matches)) 1136 1137 beg := 0 1138 end := 0 1139 for _, match := range matches { 1140 if n > 0 && len(strings) >= n-1 { 1141 break 1142 } 1143 1144 end = match[0] 1145 if match[1] != 0 { 1146 strings = append(strings, s[beg:end]) 1147 } 1148 beg = match[1] 1149 } 1150 1151 if end != len(s) { 1152 strings = append(strings, s[beg:]) 1153 } 1154 1155 return strings 1156 }