github.com/hbdrawn/golang@v0.0.0-20141214014649-6b835209aba2/src/regexp/regexp.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package regexp implements regular expression search. 6 // 7 // The syntax of the regular expressions accepted is the same 8 // general syntax used by Perl, Python, and other languages. 9 // More precisely, it is the syntax accepted by RE2 and described at 10 // http://code.google.com/p/re2/wiki/Syntax, except for \C. 11 // For an overview of the syntax, run 12 // godoc regexp/syntax 13 // 14 // The regexp implementation provided by this package is 15 // guaranteed to run in time linear in the size of the input. 16 // (This is a property not guaranteed by most open source 17 // implementations of regular expressions.) For more information 18 // about this property, see 19 // http://swtch.com/~rsc/regexp/regexp1.html 20 // or any book about automata theory. 21 // 22 // All characters are UTF-8-encoded code points. 23 // 24 // There are 16 methods of Regexp that match a regular expression and identify 25 // the matched text. Their names are matched by this regular expression: 26 // 27 // Find(All)?(String)?(Submatch)?(Index)? 28 // 29 // If 'All' is present, the routine matches successive non-overlapping 30 // matches of the entire expression. Empty matches abutting a preceding 31 // match are ignored. The return value is a slice containing the successive 32 // return values of the corresponding non-'All' routine. These routines take 33 // an extra integer argument, n; if n >= 0, the function returns at most n 34 // matches/submatches. 35 // 36 // If 'String' is present, the argument is a string; otherwise it is a slice 37 // of bytes; return values are adjusted as appropriate. 38 // 39 // If 'Submatch' is present, the return value is a slice identifying the 40 // successive submatches of the expression. Submatches are matches of 41 // parenthesized subexpressions (also known as capturing groups) within the 42 // regular expression, numbered from left to right in order of opening 43 // parenthesis. Submatch 0 is the match of the entire expression, submatch 1 44 // the match of the first parenthesized subexpression, and so on. 45 // 46 // If 'Index' is present, matches and submatches are identified by byte index 47 // pairs within the input string: result[2*n:2*n+1] identifies the indexes of 48 // the nth submatch. The pair for n==0 identifies the match of the entire 49 // expression. If 'Index' is not present, the match is identified by the 50 // text of the match/submatch. If an index is negative, it means that 51 // subexpression did not match any string in the input. 52 // 53 // There is also a subset of the methods that can be applied to text read 54 // from a RuneReader: 55 // 56 // MatchReader, FindReaderIndex, FindReaderSubmatchIndex 57 // 58 // This set may grow. Note that regular expression matches may need to 59 // examine text beyond the text returned by a match, so the methods that 60 // match text from a RuneReader may read arbitrarily far into the input 61 // before returning. 62 // 63 // (There are a few other methods that do not match this pattern.) 64 // 65 package regexp 66 67 import ( 68 "bytes" 69 "io" 70 "regexp/syntax" 71 "strconv" 72 "strings" 73 "sync" 74 "unicode" 75 "unicode/utf8" 76 ) 77 78 var debug = false 79 80 // Regexp is the representation of a compiled regular expression. 81 // A Regexp is safe for concurrent use by multiple goroutines. 82 type Regexp struct { 83 // read-only after Compile 84 expr string // as passed to Compile 85 prog *syntax.Prog // compiled program 86 onepass *onePassProg // onpass program or nil 87 prefix string // required prefix in unanchored matches 88 prefixBytes []byte // prefix, as a []byte 89 prefixComplete bool // prefix is the entire regexp 90 prefixRune rune // first rune in prefix 91 prefixEnd uint32 // pc for last rune in prefix 92 cond syntax.EmptyOp // empty-width conditions required at start of match 93 numSubexp int 94 subexpNames []string 95 longest bool 96 97 // cache of machines for running regexp 98 mu sync.Mutex 99 machine []*machine 100 } 101 102 // String returns the source text used to compile the regular expression. 103 func (re *Regexp) String() string { 104 return re.expr 105 } 106 107 // Compile parses a regular expression and returns, if successful, 108 // a Regexp object that can be used to match against text. 109 // 110 // When matching against text, the regexp returns a match that 111 // begins as early as possible in the input (leftmost), and among those 112 // it chooses the one that a backtracking search would have found first. 113 // This so-called leftmost-first matching is the same semantics 114 // that Perl, Python, and other implementations use, although this 115 // package implements it without the expense of backtracking. 116 // For POSIX leftmost-longest matching, see CompilePOSIX. 117 func Compile(expr string) (*Regexp, error) { 118 return compile(expr, syntax.Perl, false) 119 } 120 121 // CompilePOSIX is like Compile but restricts the regular expression 122 // to POSIX ERE (egrep) syntax and changes the match semantics to 123 // leftmost-longest. 124 // 125 // That is, when matching against text, the regexp returns a match that 126 // begins as early as possible in the input (leftmost), and among those 127 // it chooses a match that is as long as possible. 128 // This so-called leftmost-longest matching is the same semantics 129 // that early regular expression implementations used and that POSIX 130 // specifies. 131 // 132 // However, there can be multiple leftmost-longest matches, with different 133 // submatch choices, and here this package diverges from POSIX. 134 // Among the possible leftmost-longest matches, this package chooses 135 // the one that a backtracking search would have found first, while POSIX 136 // specifies that the match be chosen to maximize the length of the first 137 // subexpression, then the second, and so on from left to right. 138 // The POSIX rule is computationally prohibitive and not even well-defined. 139 // See http://swtch.com/~rsc/regexp/regexp2.html#posix for details. 140 func CompilePOSIX(expr string) (*Regexp, error) { 141 return compile(expr, syntax.POSIX, true) 142 } 143 144 // Longest makes future searches prefer the leftmost-longest match. 145 // That is, when matching against text, the regexp returns a match that 146 // begins as early as possible in the input (leftmost), and among those 147 // it chooses a match that is as long as possible. 148 func (re *Regexp) Longest() { 149 re.longest = true 150 } 151 152 func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) { 153 re, err := syntax.Parse(expr, mode) 154 if err != nil { 155 return nil, err 156 } 157 maxCap := re.MaxCap() 158 capNames := re.CapNames() 159 160 re = re.Simplify() 161 prog, err := syntax.Compile(re) 162 if err != nil { 163 return nil, err 164 } 165 regexp := &Regexp{ 166 expr: expr, 167 prog: prog, 168 onepass: compileOnePass(prog), 169 numSubexp: maxCap, 170 subexpNames: capNames, 171 cond: prog.StartCond(), 172 longest: longest, 173 } 174 if regexp.onepass == notOnePass { 175 regexp.prefix, regexp.prefixComplete = prog.Prefix() 176 } else { 177 regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog) 178 } 179 if regexp.prefix != "" { 180 // TODO(rsc): Remove this allocation by adding 181 // IndexString to package bytes. 182 regexp.prefixBytes = []byte(regexp.prefix) 183 regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix) 184 } 185 return regexp, nil 186 } 187 188 // get returns a machine to use for matching re. 189 // It uses the re's machine cache if possible, to avoid 190 // unnecessary allocation. 191 func (re *Regexp) get() *machine { 192 re.mu.Lock() 193 if n := len(re.machine); n > 0 { 194 z := re.machine[n-1] 195 re.machine = re.machine[:n-1] 196 re.mu.Unlock() 197 return z 198 } 199 re.mu.Unlock() 200 z := progMachine(re.prog, re.onepass) 201 z.re = re 202 return z 203 } 204 205 // put returns a machine to the re's machine cache. 206 // There is no attempt to limit the size of the cache, so it will 207 // grow to the maximum number of simultaneous matches 208 // run using re. (The cache empties when re gets garbage collected.) 209 func (re *Regexp) put(z *machine) { 210 re.mu.Lock() 211 re.machine = append(re.machine, z) 212 re.mu.Unlock() 213 } 214 215 // MustCompile is like Compile but panics if the expression cannot be parsed. 216 // It simplifies safe initialization of global variables holding compiled regular 217 // expressions. 218 func MustCompile(str string) *Regexp { 219 regexp, error := Compile(str) 220 if error != nil { 221 panic(`regexp: Compile(` + quote(str) + `): ` + error.Error()) 222 } 223 return regexp 224 } 225 226 // MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed. 227 // It simplifies safe initialization of global variables holding compiled regular 228 // expressions. 229 func MustCompilePOSIX(str string) *Regexp { 230 regexp, error := CompilePOSIX(str) 231 if error != nil { 232 panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + error.Error()) 233 } 234 return regexp 235 } 236 237 func quote(s string) string { 238 if strconv.CanBackquote(s) { 239 return "`" + s + "`" 240 } 241 return strconv.Quote(s) 242 } 243 244 // NumSubexp returns the number of parenthesized subexpressions in this Regexp. 245 func (re *Regexp) NumSubexp() int { 246 return re.numSubexp 247 } 248 249 // SubexpNames returns the names of the parenthesized subexpressions 250 // in this Regexp. The name for the first sub-expression is names[1], 251 // so that if m is a match slice, the name for m[i] is SubexpNames()[i]. 252 // Since the Regexp as a whole cannot be named, names[0] is always 253 // the empty string. The slice should not be modified. 254 func (re *Regexp) SubexpNames() []string { 255 return re.subexpNames 256 } 257 258 const endOfText rune = -1 259 260 // input abstracts different representations of the input text. It provides 261 // one-character lookahead. 262 type input interface { 263 step(pos int) (r rune, width int) // advance one rune 264 canCheckPrefix() bool // can we look ahead without losing info? 265 hasPrefix(re *Regexp) bool 266 index(re *Regexp, pos int) int 267 context(pos int) syntax.EmptyOp 268 } 269 270 // inputString scans a string. 271 type inputString struct { 272 str string 273 } 274 275 func (i *inputString) step(pos int) (rune, int) { 276 if pos < len(i.str) { 277 c := i.str[pos] 278 if c < utf8.RuneSelf { 279 return rune(c), 1 280 } 281 return utf8.DecodeRuneInString(i.str[pos:]) 282 } 283 return endOfText, 0 284 } 285 286 func (i *inputString) canCheckPrefix() bool { 287 return true 288 } 289 290 func (i *inputString) hasPrefix(re *Regexp) bool { 291 return strings.HasPrefix(i.str, re.prefix) 292 } 293 294 func (i *inputString) index(re *Regexp, pos int) int { 295 return strings.Index(i.str[pos:], re.prefix) 296 } 297 298 func (i *inputString) context(pos int) syntax.EmptyOp { 299 r1, r2 := endOfText, endOfText 300 if pos > 0 && pos <= len(i.str) { 301 r1, _ = utf8.DecodeLastRuneInString(i.str[:pos]) 302 } 303 if pos < len(i.str) { 304 r2, _ = utf8.DecodeRuneInString(i.str[pos:]) 305 } 306 return syntax.EmptyOpContext(r1, r2) 307 } 308 309 // inputBytes scans a byte slice. 310 type inputBytes struct { 311 str []byte 312 } 313 314 func (i *inputBytes) step(pos int) (rune, int) { 315 if pos < len(i.str) { 316 c := i.str[pos] 317 if c < utf8.RuneSelf { 318 return rune(c), 1 319 } 320 return utf8.DecodeRune(i.str[pos:]) 321 } 322 return endOfText, 0 323 } 324 325 func (i *inputBytes) canCheckPrefix() bool { 326 return true 327 } 328 329 func (i *inputBytes) hasPrefix(re *Regexp) bool { 330 return bytes.HasPrefix(i.str, re.prefixBytes) 331 } 332 333 func (i *inputBytes) index(re *Regexp, pos int) int { 334 return bytes.Index(i.str[pos:], re.prefixBytes) 335 } 336 337 func (i *inputBytes) context(pos int) syntax.EmptyOp { 338 r1, r2 := endOfText, endOfText 339 if pos > 0 && pos <= len(i.str) { 340 r1, _ = utf8.DecodeLastRune(i.str[:pos]) 341 } 342 if pos < len(i.str) { 343 r2, _ = utf8.DecodeRune(i.str[pos:]) 344 } 345 return syntax.EmptyOpContext(r1, r2) 346 } 347 348 // inputReader scans a RuneReader. 349 type inputReader struct { 350 r io.RuneReader 351 atEOT bool 352 pos int 353 } 354 355 func (i *inputReader) step(pos int) (rune, int) { 356 if !i.atEOT && pos != i.pos { 357 return endOfText, 0 358 359 } 360 r, w, err := i.r.ReadRune() 361 if err != nil { 362 i.atEOT = true 363 return endOfText, 0 364 } 365 i.pos += w 366 return r, w 367 } 368 369 func (i *inputReader) canCheckPrefix() bool { 370 return false 371 } 372 373 func (i *inputReader) hasPrefix(re *Regexp) bool { 374 return false 375 } 376 377 func (i *inputReader) index(re *Regexp, pos int) int { 378 return -1 379 } 380 381 func (i *inputReader) context(pos int) syntax.EmptyOp { 382 return 0 383 } 384 385 // LiteralPrefix returns a literal string that must begin any match 386 // of the regular expression re. It returns the boolean true if the 387 // literal string comprises the entire regular expression. 388 func (re *Regexp) LiteralPrefix() (prefix string, complete bool) { 389 return re.prefix, re.prefixComplete 390 } 391 392 // MatchReader reports whether the Regexp matches the text read by the 393 // RuneReader. 394 func (re *Regexp) MatchReader(r io.RuneReader) bool { 395 return re.doExecute(r, nil, "", 0, 0) != nil 396 } 397 398 // MatchString reports whether the Regexp matches the string s. 399 func (re *Regexp) MatchString(s string) bool { 400 return re.doExecute(nil, nil, s, 0, 0) != nil 401 } 402 403 // Match reports whether the Regexp matches the byte slice b. 404 func (re *Regexp) Match(b []byte) bool { 405 return re.doExecute(nil, b, "", 0, 0) != nil 406 } 407 408 // MatchReader checks whether a textual regular expression matches the text 409 // read by the RuneReader. More complicated queries need to use Compile and 410 // the full Regexp interface. 411 func MatchReader(pattern string, r io.RuneReader) (matched bool, err error) { 412 re, err := Compile(pattern) 413 if err != nil { 414 return false, err 415 } 416 return re.MatchReader(r), nil 417 } 418 419 // MatchString checks whether a textual regular expression 420 // matches a string. More complicated queries need 421 // to use Compile and the full Regexp interface. 422 func MatchString(pattern string, s string) (matched bool, err error) { 423 re, err := Compile(pattern) 424 if err != nil { 425 return false, err 426 } 427 return re.MatchString(s), nil 428 } 429 430 // Match checks whether a textual regular expression 431 // matches a byte slice. More complicated queries need 432 // to use Compile and the full Regexp interface. 433 func Match(pattern string, b []byte) (matched bool, err error) { 434 re, err := Compile(pattern) 435 if err != nil { 436 return false, err 437 } 438 return re.Match(b), nil 439 } 440 441 // ReplaceAllString returns a copy of src, replacing matches of the Regexp 442 // with the replacement string repl. Inside repl, $ signs are interpreted as 443 // in Expand, so for instance $1 represents the text of the first submatch. 444 func (re *Regexp) ReplaceAllString(src, repl string) string { 445 n := 2 446 if strings.Index(repl, "$") >= 0 { 447 n = 2 * (re.numSubexp + 1) 448 } 449 b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte { 450 return re.expand(dst, repl, nil, src, match) 451 }) 452 return string(b) 453 } 454 455 // ReplaceAllLiteralString returns a copy of src, replacing matches of the Regexp 456 // with the replacement string repl. The replacement repl is substituted directly, 457 // without using Expand. 458 func (re *Regexp) ReplaceAllLiteralString(src, repl string) string { 459 return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { 460 return append(dst, repl...) 461 })) 462 } 463 464 // ReplaceAllStringFunc returns a copy of src in which all matches of the 465 // Regexp have been replaced by the return value of function repl applied 466 // to the matched substring. The replacement returned by repl is substituted 467 // directly, without using Expand. 468 func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string { 469 b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { 470 return append(dst, repl(src[match[0]:match[1]])...) 471 }) 472 return string(b) 473 } 474 475 func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte { 476 lastMatchEnd := 0 // end position of the most recent match 477 searchPos := 0 // position where we next look for a match 478 var buf []byte 479 var endPos int 480 if bsrc != nil { 481 endPos = len(bsrc) 482 } else { 483 endPos = len(src) 484 } 485 for searchPos <= endPos { 486 a := re.doExecute(nil, bsrc, src, searchPos, nmatch) 487 if len(a) == 0 { 488 break // no more matches 489 } 490 491 // Copy the unmatched characters before this match. 492 if bsrc != nil { 493 buf = append(buf, bsrc[lastMatchEnd:a[0]]...) 494 } else { 495 buf = append(buf, src[lastMatchEnd:a[0]]...) 496 } 497 498 // Now insert a copy of the replacement string, but not for a 499 // match of the empty string immediately after another match. 500 // (Otherwise, we get double replacement for patterns that 501 // match both empty and nonempty strings.) 502 if a[1] > lastMatchEnd || a[0] == 0 { 503 buf = repl(buf, a) 504 } 505 lastMatchEnd = a[1] 506 507 // Advance past this match; always advance at least one character. 508 var width int 509 if bsrc != nil { 510 _, width = utf8.DecodeRune(bsrc[searchPos:]) 511 } else { 512 _, width = utf8.DecodeRuneInString(src[searchPos:]) 513 } 514 if searchPos+width > a[1] { 515 searchPos += width 516 } else if searchPos+1 > a[1] { 517 // This clause is only needed at the end of the input 518 // string. In that case, DecodeRuneInString returns width=0. 519 searchPos++ 520 } else { 521 searchPos = a[1] 522 } 523 } 524 525 // Copy the unmatched characters after the last match. 526 if bsrc != nil { 527 buf = append(buf, bsrc[lastMatchEnd:]...) 528 } else { 529 buf = append(buf, src[lastMatchEnd:]...) 530 } 531 532 return buf 533 } 534 535 // ReplaceAll returns a copy of src, replacing matches of the Regexp 536 // with the replacement text repl. Inside repl, $ signs are interpreted as 537 // in Expand, so for instance $1 represents the text of the first submatch. 538 func (re *Regexp) ReplaceAll(src, repl []byte) []byte { 539 n := 2 540 if bytes.IndexByte(repl, '$') >= 0 { 541 n = 2 * (re.numSubexp + 1) 542 } 543 srepl := "" 544 b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte { 545 if len(srepl) != len(repl) { 546 srepl = string(repl) 547 } 548 return re.expand(dst, srepl, src, "", match) 549 }) 550 return b 551 } 552 553 // ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp 554 // with the replacement bytes repl. The replacement repl is substituted directly, 555 // without using Expand. 556 func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte { 557 return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { 558 return append(dst, repl...) 559 }) 560 } 561 562 // ReplaceAllFunc returns a copy of src in which all matches of the 563 // Regexp have been replaced by the return value of function repl applied 564 // to the matched byte slice. The replacement returned by repl is substituted 565 // directly, without using Expand. 566 func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { 567 return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { 568 return append(dst, repl(src[match[0]:match[1]])...) 569 }) 570 } 571 572 var specialBytes = []byte(`\.+*?()|[]{}^$`) 573 574 func special(b byte) bool { 575 return bytes.IndexByte(specialBytes, b) >= 0 576 } 577 578 // QuoteMeta returns a string that quotes all regular expression metacharacters 579 // inside the argument text; the returned string is a regular expression matching 580 // the literal text. For example, QuoteMeta(`[foo]`) returns `\[foo\]`. 581 func QuoteMeta(s string) string { 582 b := make([]byte, 2*len(s)) 583 584 // A byte loop is correct because all metacharacters are ASCII. 585 j := 0 586 for i := 0; i < len(s); i++ { 587 if special(s[i]) { 588 b[j] = '\\' 589 j++ 590 } 591 b[j] = s[i] 592 j++ 593 } 594 return string(b[0:j]) 595 } 596 597 // The number of capture values in the program may correspond 598 // to fewer capturing expressions than are in the regexp. 599 // For example, "(a){0}" turns into an empty program, so the 600 // maximum capture in the program is 0 but we need to return 601 // an expression for \1. Pad appends -1s to the slice a as needed. 602 func (re *Regexp) pad(a []int) []int { 603 if a == nil { 604 // No match. 605 return nil 606 } 607 n := (1 + re.numSubexp) * 2 608 for len(a) < n { 609 a = append(a, -1) 610 } 611 return a 612 } 613 614 // Find matches in slice b if b is non-nil, otherwise find matches in string s. 615 func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) { 616 var end int 617 if b == nil { 618 end = len(s) 619 } else { 620 end = len(b) 621 } 622 623 for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; { 624 matches := re.doExecute(nil, b, s, pos, re.prog.NumCap) 625 if len(matches) == 0 { 626 break 627 } 628 629 accept := true 630 if matches[1] == pos { 631 // We've found an empty match. 632 if matches[0] == prevMatchEnd { 633 // We don't allow an empty match right 634 // after a previous match, so ignore it. 635 accept = false 636 } 637 var width int 638 // TODO: use step() 639 if b == nil { 640 _, width = utf8.DecodeRuneInString(s[pos:end]) 641 } else { 642 _, width = utf8.DecodeRune(b[pos:end]) 643 } 644 if width > 0 { 645 pos += width 646 } else { 647 pos = end + 1 648 } 649 } else { 650 pos = matches[1] 651 } 652 prevMatchEnd = matches[1] 653 654 if accept { 655 deliver(re.pad(matches)) 656 i++ 657 } 658 } 659 } 660 661 // Find returns a slice holding the text of the leftmost match in b of the regular expression. 662 // A return value of nil indicates no match. 663 func (re *Regexp) Find(b []byte) []byte { 664 a := re.doExecute(nil, b, "", 0, 2) 665 if a == nil { 666 return nil 667 } 668 return b[a[0]:a[1]] 669 } 670 671 // FindIndex returns a two-element slice of integers defining the location of 672 // the leftmost match in b of the regular expression. The match itself is at 673 // b[loc[0]:loc[1]]. 674 // A return value of nil indicates no match. 675 func (re *Regexp) FindIndex(b []byte) (loc []int) { 676 a := re.doExecute(nil, b, "", 0, 2) 677 if a == nil { 678 return nil 679 } 680 return a[0:2] 681 } 682 683 // FindString returns a string holding the text of the leftmost match in s of the regular 684 // expression. If there is no match, the return value is an empty string, 685 // but it will also be empty if the regular expression successfully matches 686 // an empty string. Use FindStringIndex or FindStringSubmatch if it is 687 // necessary to distinguish these cases. 688 func (re *Regexp) FindString(s string) string { 689 a := re.doExecute(nil, nil, s, 0, 2) 690 if a == nil { 691 return "" 692 } 693 return s[a[0]:a[1]] 694 } 695 696 // FindStringIndex returns a two-element slice of integers defining the 697 // location of the leftmost match in s of the regular expression. The match 698 // itself is at s[loc[0]:loc[1]]. 699 // A return value of nil indicates no match. 700 func (re *Regexp) FindStringIndex(s string) (loc []int) { 701 a := re.doExecute(nil, nil, s, 0, 2) 702 if a == nil { 703 return nil 704 } 705 return a[0:2] 706 } 707 708 // FindReaderIndex returns a two-element slice of integers defining the 709 // location of the leftmost match of the regular expression in text read from 710 // the RuneReader. The match text was found in the input stream at 711 // byte offset loc[0] through loc[1]-1. 712 // A return value of nil indicates no match. 713 func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) { 714 a := re.doExecute(r, nil, "", 0, 2) 715 if a == nil { 716 return nil 717 } 718 return a[0:2] 719 } 720 721 // FindSubmatch returns a slice of slices holding the text of the leftmost 722 // match of the regular expression in b and the matches, if any, of its 723 // subexpressions, as defined by the 'Submatch' descriptions in the package 724 // comment. 725 // A return value of nil indicates no match. 726 func (re *Regexp) FindSubmatch(b []byte) [][]byte { 727 a := re.doExecute(nil, b, "", 0, re.prog.NumCap) 728 if a == nil { 729 return nil 730 } 731 ret := make([][]byte, 1+re.numSubexp) 732 for i := range ret { 733 if 2*i < len(a) && a[2*i] >= 0 { 734 ret[i] = b[a[2*i]:a[2*i+1]] 735 } 736 } 737 return ret 738 } 739 740 // Expand appends template to dst and returns the result; during the 741 // append, Expand replaces variables in the template with corresponding 742 // matches drawn from src. The match slice should have been returned by 743 // FindSubmatchIndex. 744 // 745 // In the template, a variable is denoted by a substring of the form 746 // $name or ${name}, where name is a non-empty sequence of letters, 747 // digits, and underscores. A purely numeric name like $1 refers to 748 // the submatch with the corresponding index; other names refer to 749 // capturing parentheses named with the (?P<name>...) syntax. A 750 // reference to an out of range or unmatched index or a name that is not 751 // present in the regular expression is replaced with an empty slice. 752 // 753 // In the $name form, name is taken to be as long as possible: $1x is 754 // equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0. 755 // 756 // To insert a literal $ in the output, use $$ in the template. 757 func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte { 758 return re.expand(dst, string(template), src, "", match) 759 } 760 761 // ExpandString is like Expand but the template and source are strings. 762 // It appends to and returns a byte slice in order to give the calling 763 // code control over allocation. 764 func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte { 765 return re.expand(dst, template, nil, src, match) 766 } 767 768 func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte { 769 for len(template) > 0 { 770 i := strings.Index(template, "$") 771 if i < 0 { 772 break 773 } 774 dst = append(dst, template[:i]...) 775 template = template[i:] 776 if len(template) > 1 && template[1] == '$' { 777 // Treat $$ as $. 778 dst = append(dst, '$') 779 template = template[2:] 780 continue 781 } 782 name, num, rest, ok := extract(template) 783 if !ok { 784 // Malformed; treat $ as raw text. 785 dst = append(dst, '$') 786 template = template[1:] 787 continue 788 } 789 template = rest 790 if num >= 0 { 791 if 2*num+1 < len(match) && match[2*num] >= 0 { 792 if bsrc != nil { 793 dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...) 794 } else { 795 dst = append(dst, src[match[2*num]:match[2*num+1]]...) 796 } 797 } 798 } else { 799 for i, namei := range re.subexpNames { 800 if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 { 801 if bsrc != nil { 802 dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...) 803 } else { 804 dst = append(dst, src[match[2*i]:match[2*i+1]]...) 805 } 806 break 807 } 808 } 809 } 810 } 811 dst = append(dst, template...) 812 return dst 813 } 814 815 // extract returns the name from a leading "$name" or "${name}" in str. 816 // If it is a number, extract returns num set to that number; otherwise num = -1. 817 func extract(str string) (name string, num int, rest string, ok bool) { 818 if len(str) < 2 || str[0] != '$' { 819 return 820 } 821 brace := false 822 if str[1] == '{' { 823 brace = true 824 str = str[2:] 825 } else { 826 str = str[1:] 827 } 828 i := 0 829 for i < len(str) { 830 rune, size := utf8.DecodeRuneInString(str[i:]) 831 if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' { 832 break 833 } 834 i += size 835 } 836 if i == 0 { 837 // empty name is not okay 838 return 839 } 840 name = str[:i] 841 if brace { 842 if i >= len(str) || str[i] != '}' { 843 // missing closing brace 844 return 845 } 846 i++ 847 } 848 849 // Parse number. 850 num = 0 851 for i := 0; i < len(name); i++ { 852 if name[i] < '0' || '9' < name[i] || num >= 1e8 { 853 num = -1 854 break 855 } 856 num = num*10 + int(name[i]) - '0' 857 } 858 // Disallow leading zeros. 859 if name[0] == '0' && len(name) > 1 { 860 num = -1 861 } 862 863 rest = str[i:] 864 ok = true 865 return 866 } 867 868 // FindSubmatchIndex returns a slice holding the index pairs identifying the 869 // leftmost match of the regular expression in b and the matches, if any, of 870 // its subexpressions, as defined by the 'Submatch' and 'Index' descriptions 871 // in the package comment. 872 // A return value of nil indicates no match. 873 func (re *Regexp) FindSubmatchIndex(b []byte) []int { 874 return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap)) 875 } 876 877 // FindStringSubmatch returns a slice of strings holding the text of the 878 // leftmost match of the regular expression in s and the matches, if any, of 879 // its subexpressions, as defined by the 'Submatch' description in the 880 // package comment. 881 // A return value of nil indicates no match. 882 func (re *Regexp) FindStringSubmatch(s string) []string { 883 a := re.doExecute(nil, nil, s, 0, re.prog.NumCap) 884 if a == nil { 885 return nil 886 } 887 ret := make([]string, 1+re.numSubexp) 888 for i := range ret { 889 if 2*i < len(a) && a[2*i] >= 0 { 890 ret[i] = s[a[2*i]:a[2*i+1]] 891 } 892 } 893 return ret 894 } 895 896 // FindStringSubmatchIndex returns a slice holding the index pairs 897 // identifying the leftmost match of the regular expression in s and the 898 // matches, if any, of its subexpressions, as defined by the 'Submatch' and 899 // 'Index' descriptions in the package comment. 900 // A return value of nil indicates no match. 901 func (re *Regexp) FindStringSubmatchIndex(s string) []int { 902 return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap)) 903 } 904 905 // FindReaderSubmatchIndex returns a slice holding the index pairs 906 // identifying the leftmost match of the regular expression of text read by 907 // the RuneReader, and the matches, if any, of its subexpressions, as defined 908 // by the 'Submatch' and 'Index' descriptions in the package comment. A 909 // return value of nil indicates no match. 910 func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int { 911 return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap)) 912 } 913 914 const startSize = 10 // The size at which to start a slice in the 'All' routines. 915 916 // FindAll is the 'All' version of Find; it returns a slice of all successive 917 // matches of the expression, as defined by the 'All' description in the 918 // package comment. 919 // A return value of nil indicates no match. 920 func (re *Regexp) FindAll(b []byte, n int) [][]byte { 921 if n < 0 { 922 n = len(b) + 1 923 } 924 result := make([][]byte, 0, startSize) 925 re.allMatches("", b, n, func(match []int) { 926 result = append(result, b[match[0]:match[1]]) 927 }) 928 if len(result) == 0 { 929 return nil 930 } 931 return result 932 } 933 934 // FindAllIndex is the 'All' version of FindIndex; it returns a slice of all 935 // successive matches of the expression, as defined by the 'All' description 936 // in the package comment. 937 // A return value of nil indicates no match. 938 func (re *Regexp) FindAllIndex(b []byte, n int) [][]int { 939 if n < 0 { 940 n = len(b) + 1 941 } 942 result := make([][]int, 0, startSize) 943 re.allMatches("", b, n, func(match []int) { 944 result = append(result, match[0:2]) 945 }) 946 if len(result) == 0 { 947 return nil 948 } 949 return result 950 } 951 952 // FindAllString is the 'All' version of FindString; it returns a slice of all 953 // successive matches of the expression, as defined by the 'All' description 954 // in the package comment. 955 // A return value of nil indicates no match. 956 func (re *Regexp) FindAllString(s string, n int) []string { 957 if n < 0 { 958 n = len(s) + 1 959 } 960 result := make([]string, 0, startSize) 961 re.allMatches(s, nil, n, func(match []int) { 962 result = append(result, s[match[0]:match[1]]) 963 }) 964 if len(result) == 0 { 965 return nil 966 } 967 return result 968 } 969 970 // FindAllStringIndex is the 'All' version of FindStringIndex; it returns a 971 // slice of all successive matches of the expression, as defined by the 'All' 972 // description in the package comment. 973 // A return value of nil indicates no match. 974 func (re *Regexp) FindAllStringIndex(s string, n int) [][]int { 975 if n < 0 { 976 n = len(s) + 1 977 } 978 result := make([][]int, 0, startSize) 979 re.allMatches(s, nil, n, func(match []int) { 980 result = append(result, match[0:2]) 981 }) 982 if len(result) == 0 { 983 return nil 984 } 985 return result 986 } 987 988 // FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice 989 // of all successive matches of the expression, as defined by the 'All' 990 // description in the package comment. 991 // A return value of nil indicates no match. 992 func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte { 993 if n < 0 { 994 n = len(b) + 1 995 } 996 result := make([][][]byte, 0, startSize) 997 re.allMatches("", b, n, func(match []int) { 998 slice := make([][]byte, len(match)/2) 999 for j := range slice { 1000 if match[2*j] >= 0 { 1001 slice[j] = b[match[2*j]:match[2*j+1]] 1002 } 1003 } 1004 result = append(result, slice) 1005 }) 1006 if len(result) == 0 { 1007 return nil 1008 } 1009 return result 1010 } 1011 1012 // FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns 1013 // a slice of all successive matches of the expression, as defined by the 1014 // 'All' description in the package comment. 1015 // A return value of nil indicates no match. 1016 func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int { 1017 if n < 0 { 1018 n = len(b) + 1 1019 } 1020 result := make([][]int, 0, startSize) 1021 re.allMatches("", b, n, func(match []int) { 1022 result = append(result, match) 1023 }) 1024 if len(result) == 0 { 1025 return nil 1026 } 1027 return result 1028 } 1029 1030 // FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it 1031 // returns a slice of all successive matches of the expression, as defined by 1032 // the 'All' description in the package comment. 1033 // A return value of nil indicates no match. 1034 func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string { 1035 if n < 0 { 1036 n = len(s) + 1 1037 } 1038 result := make([][]string, 0, startSize) 1039 re.allMatches(s, nil, n, func(match []int) { 1040 slice := make([]string, len(match)/2) 1041 for j := range slice { 1042 if match[2*j] >= 0 { 1043 slice[j] = s[match[2*j]:match[2*j+1]] 1044 } 1045 } 1046 result = append(result, slice) 1047 }) 1048 if len(result) == 0 { 1049 return nil 1050 } 1051 return result 1052 } 1053 1054 // FindAllStringSubmatchIndex is the 'All' version of 1055 // FindStringSubmatchIndex; it returns a slice of all successive matches of 1056 // the expression, as defined by the 'All' description in the package 1057 // comment. 1058 // A return value of nil indicates no match. 1059 func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int { 1060 if n < 0 { 1061 n = len(s) + 1 1062 } 1063 result := make([][]int, 0, startSize) 1064 re.allMatches(s, nil, n, func(match []int) { 1065 result = append(result, match) 1066 }) 1067 if len(result) == 0 { 1068 return nil 1069 } 1070 return result 1071 } 1072 1073 // Split slices s into substrings separated by the expression and returns a slice of 1074 // the substrings between those expression matches. 1075 // 1076 // The slice returned by this method consists of all the substrings of s 1077 // not contained in the slice returned by FindAllString. When called on an expression 1078 // that contains no metacharacters, it is equivalent to strings.SplitN. 1079 // 1080 // Example: 1081 // s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5) 1082 // // s: ["", "b", "b", "c", "cadaaae"] 1083 // 1084 // The count determines the number of substrings to return: 1085 // n > 0: at most n substrings; the last substring will be the unsplit remainder. 1086 // n == 0: the result is nil (zero substrings) 1087 // n < 0: all substrings 1088 func (re *Regexp) Split(s string, n int) []string { 1089 1090 if n == 0 { 1091 return nil 1092 } 1093 1094 if len(re.expr) > 0 && len(s) == 0 { 1095 return []string{""} 1096 } 1097 1098 matches := re.FindAllStringIndex(s, n) 1099 strings := make([]string, 0, len(matches)) 1100 1101 beg := 0 1102 end := 0 1103 for _, match := range matches { 1104 if n > 0 && len(strings) >= n-1 { 1105 break 1106 } 1107 1108 end = match[0] 1109 if match[1] != 0 { 1110 strings = append(strings, s[beg:end]) 1111 } 1112 beg = match[1] 1113 } 1114 1115 if end != len(s) { 1116 strings = append(strings, s[beg:]) 1117 } 1118 1119 return strings 1120 }