github.com/fjballest/golang@v0.0.0-20151209143359-e4c5fe594ca8/src/regexp/regexp.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package regexp implements regular expression search. 6 // 7 // The syntax of the regular expressions accepted is the same 8 // general syntax used by Perl, Python, and other languages. 9 // More precisely, it is the syntax accepted by RE2 and described at 10 // https://golang.org/s/re2syntax, except for \C. 11 // For an overview of the syntax, run 12 // go doc regexp/syntax 13 // 14 // The regexp implementation provided by this package is 15 // guaranteed to run in time linear in the size of the input. 16 // (This is a property not guaranteed by most open source 17 // implementations of regular expressions.) For more information 18 // about this property, see 19 // http://swtch.com/~rsc/regexp/regexp1.html 20 // or any book about automata theory. 21 // 22 // All characters are UTF-8-encoded code points. 23 // 24 // There are 16 methods of Regexp that match a regular expression and identify 25 // the matched text. Their names are matched by this regular expression: 26 // 27 // Find(All)?(String)?(Submatch)?(Index)? 28 // 29 // If 'All' is present, the routine matches successive non-overlapping 30 // matches of the entire expression. Empty matches abutting a preceding 31 // match are ignored. The return value is a slice containing the successive 32 // return values of the corresponding non-'All' routine. These routines take 33 // an extra integer argument, n; if n >= 0, the function returns at most n 34 // matches/submatches. 35 // 36 // If 'String' is present, the argument is a string; otherwise it is a slice 37 // of bytes; return values are adjusted as appropriate. 38 // 39 // If 'Submatch' is present, the return value is a slice identifying the 40 // successive submatches of the expression. Submatches are matches of 41 // parenthesized subexpressions (also known as capturing groups) within the 42 // regular expression, numbered from left to right in order of opening 43 // parenthesis. Submatch 0 is the match of the entire expression, submatch 1 44 // the match of the first parenthesized subexpression, and so on. 45 // 46 // If 'Index' is present, matches and submatches are identified by byte index 47 // pairs within the input string: result[2*n:2*n+1] identifies the indexes of 48 // the nth submatch. The pair for n==0 identifies the match of the entire 49 // expression. If 'Index' is not present, the match is identified by the 50 // text of the match/submatch. If an index is negative, it means that 51 // subexpression did not match any string in the input. 52 // 53 // There is also a subset of the methods that can be applied to text read 54 // from a RuneReader: 55 // 56 // MatchReader, FindReaderIndex, FindReaderSubmatchIndex 57 // 58 // This set may grow. Note that regular expression matches may need to 59 // examine text beyond the text returned by a match, so the methods that 60 // match text from a RuneReader may read arbitrarily far into the input 61 // before returning. 62 // 63 // (There are a few other methods that do not match this pattern.) 64 // 65 package regexp 66 67 import ( 68 "bytes" 69 "io" 70 "regexp/syntax" 71 "strconv" 72 "strings" 73 "sync" 74 "unicode" 75 "unicode/utf8" 76 ) 77 78 var debug = false 79 80 // Regexp is the representation of a compiled regular expression. 81 // A Regexp is safe for concurrent use by multiple goroutines. 82 type Regexp struct { 83 // read-only after Compile 84 expr string // as passed to Compile 85 prog *syntax.Prog // compiled program 86 onepass *onePassProg // onepass program or nil 87 prefix string // required prefix in unanchored matches 88 prefixBytes []byte // prefix, as a []byte 89 prefixComplete bool // prefix is the entire regexp 90 prefixRune rune // first rune in prefix 91 prefixEnd uint32 // pc for last rune in prefix 92 cond syntax.EmptyOp // empty-width conditions required at start of match 93 numSubexp int 94 subexpNames []string 95 longest bool 96 97 // cache of machines for running regexp 98 mu sync.Mutex 99 machine []*machine 100 } 101 102 // String returns the source text used to compile the regular expression. 103 func (re *Regexp) String() string { 104 return re.expr 105 } 106 107 // Copy returns a new Regexp object copied from re. 108 // 109 // When using a Regexp in multiple goroutines, giving each goroutine 110 // its own copy helps to avoid lock contention. 111 func (re *Regexp) Copy() *Regexp { 112 r := *re 113 r.mu = sync.Mutex{} 114 r.machine = nil 115 return &r 116 } 117 118 // Compile parses a regular expression and returns, if successful, 119 // a Regexp object that can be used to match against text. 120 // 121 // When matching against text, the regexp returns a match that 122 // begins as early as possible in the input (leftmost), and among those 123 // it chooses the one that a backtracking search would have found first. 124 // This so-called leftmost-first matching is the same semantics 125 // that Perl, Python, and other implementations use, although this 126 // package implements it without the expense of backtracking. 127 // For POSIX leftmost-longest matching, see CompilePOSIX. 128 func Compile(expr string) (*Regexp, error) { 129 return compile(expr, syntax.Perl, false) 130 } 131 132 // CompilePOSIX is like Compile but restricts the regular expression 133 // to POSIX ERE (egrep) syntax and changes the match semantics to 134 // leftmost-longest. 135 // 136 // That is, when matching against text, the regexp returns a match that 137 // begins as early as possible in the input (leftmost), and among those 138 // it chooses a match that is as long as possible. 139 // This so-called leftmost-longest matching is the same semantics 140 // that early regular expression implementations used and that POSIX 141 // specifies. 142 // 143 // However, there can be multiple leftmost-longest matches, with different 144 // submatch choices, and here this package diverges from POSIX. 145 // Among the possible leftmost-longest matches, this package chooses 146 // the one that a backtracking search would have found first, while POSIX 147 // specifies that the match be chosen to maximize the length of the first 148 // subexpression, then the second, and so on from left to right. 149 // The POSIX rule is computationally prohibitive and not even well-defined. 150 // See http://swtch.com/~rsc/regexp/regexp2.html#posix for details. 151 func CompilePOSIX(expr string) (*Regexp, error) { 152 return compile(expr, syntax.POSIX, true) 153 } 154 155 // Longest makes future searches prefer the leftmost-longest match. 156 // That is, when matching against text, the regexp returns a match that 157 // begins as early as possible in the input (leftmost), and among those 158 // it chooses a match that is as long as possible. 159 func (re *Regexp) Longest() { 160 re.longest = true 161 } 162 163 func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) { 164 re, err := syntax.Parse(expr, mode) 165 if err != nil { 166 return nil, err 167 } 168 maxCap := re.MaxCap() 169 capNames := re.CapNames() 170 171 re = re.Simplify() 172 prog, err := syntax.Compile(re) 173 if err != nil { 174 return nil, err 175 } 176 regexp := &Regexp{ 177 expr: expr, 178 prog: prog, 179 onepass: compileOnePass(prog), 180 numSubexp: maxCap, 181 subexpNames: capNames, 182 cond: prog.StartCond(), 183 longest: longest, 184 } 185 if regexp.onepass == notOnePass { 186 regexp.prefix, regexp.prefixComplete = prog.Prefix() 187 } else { 188 regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog) 189 } 190 if regexp.prefix != "" { 191 // TODO(rsc): Remove this allocation by adding 192 // IndexString to package bytes. 193 regexp.prefixBytes = []byte(regexp.prefix) 194 regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix) 195 } 196 return regexp, nil 197 } 198 199 // get returns a machine to use for matching re. 200 // It uses the re's machine cache if possible, to avoid 201 // unnecessary allocation. 202 func (re *Regexp) get() *machine { 203 re.mu.Lock() 204 if n := len(re.machine); n > 0 { 205 z := re.machine[n-1] 206 re.machine = re.machine[:n-1] 207 re.mu.Unlock() 208 return z 209 } 210 re.mu.Unlock() 211 z := progMachine(re.prog, re.onepass) 212 z.re = re 213 return z 214 } 215 216 // put returns a machine to the re's machine cache. 217 // There is no attempt to limit the size of the cache, so it will 218 // grow to the maximum number of simultaneous matches 219 // run using re. (The cache empties when re gets garbage collected.) 220 func (re *Regexp) put(z *machine) { 221 re.mu.Lock() 222 re.machine = append(re.machine, z) 223 re.mu.Unlock() 224 } 225 226 // MustCompile is like Compile but panics if the expression cannot be parsed. 227 // It simplifies safe initialization of global variables holding compiled regular 228 // expressions. 229 func MustCompile(str string) *Regexp { 230 regexp, error := Compile(str) 231 if error != nil { 232 panic(`regexp: Compile(` + quote(str) + `): ` + error.Error()) 233 } 234 return regexp 235 } 236 237 // MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed. 238 // It simplifies safe initialization of global variables holding compiled regular 239 // expressions. 240 func MustCompilePOSIX(str string) *Regexp { 241 regexp, error := CompilePOSIX(str) 242 if error != nil { 243 panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + error.Error()) 244 } 245 return regexp 246 } 247 248 func quote(s string) string { 249 if strconv.CanBackquote(s) { 250 return "`" + s + "`" 251 } 252 return strconv.Quote(s) 253 } 254 255 // NumSubexp returns the number of parenthesized subexpressions in this Regexp. 256 func (re *Regexp) NumSubexp() int { 257 return re.numSubexp 258 } 259 260 // SubexpNames returns the names of the parenthesized subexpressions 261 // in this Regexp. The name for the first sub-expression is names[1], 262 // so that if m is a match slice, the name for m[i] is SubexpNames()[i]. 263 // Since the Regexp as a whole cannot be named, names[0] is always 264 // the empty string. The slice should not be modified. 265 func (re *Regexp) SubexpNames() []string { 266 return re.subexpNames 267 } 268 269 const endOfText rune = -1 270 271 // input abstracts different representations of the input text. It provides 272 // one-character lookahead. 273 type input interface { 274 step(pos int) (r rune, width int) // advance one rune 275 canCheckPrefix() bool // can we look ahead without losing info? 276 hasPrefix(re *Regexp) bool 277 index(re *Regexp, pos int) int 278 context(pos int) syntax.EmptyOp 279 } 280 281 // inputString scans a string. 282 type inputString struct { 283 str string 284 } 285 286 func (i *inputString) step(pos int) (rune, int) { 287 if pos < len(i.str) { 288 c := i.str[pos] 289 if c < utf8.RuneSelf { 290 return rune(c), 1 291 } 292 return utf8.DecodeRuneInString(i.str[pos:]) 293 } 294 return endOfText, 0 295 } 296 297 func (i *inputString) canCheckPrefix() bool { 298 return true 299 } 300 301 func (i *inputString) hasPrefix(re *Regexp) bool { 302 return strings.HasPrefix(i.str, re.prefix) 303 } 304 305 func (i *inputString) index(re *Regexp, pos int) int { 306 return strings.Index(i.str[pos:], re.prefix) 307 } 308 309 func (i *inputString) context(pos int) syntax.EmptyOp { 310 r1, r2 := endOfText, endOfText 311 if pos > 0 && pos <= len(i.str) { 312 r1, _ = utf8.DecodeLastRuneInString(i.str[:pos]) 313 } 314 if pos < len(i.str) { 315 r2, _ = utf8.DecodeRuneInString(i.str[pos:]) 316 } 317 return syntax.EmptyOpContext(r1, r2) 318 } 319 320 // inputBytes scans a byte slice. 321 type inputBytes struct { 322 str []byte 323 } 324 325 func (i *inputBytes) step(pos int) (rune, int) { 326 if pos < len(i.str) { 327 c := i.str[pos] 328 if c < utf8.RuneSelf { 329 return rune(c), 1 330 } 331 return utf8.DecodeRune(i.str[pos:]) 332 } 333 return endOfText, 0 334 } 335 336 func (i *inputBytes) canCheckPrefix() bool { 337 return true 338 } 339 340 func (i *inputBytes) hasPrefix(re *Regexp) bool { 341 return bytes.HasPrefix(i.str, re.prefixBytes) 342 } 343 344 func (i *inputBytes) index(re *Regexp, pos int) int { 345 return bytes.Index(i.str[pos:], re.prefixBytes) 346 } 347 348 func (i *inputBytes) context(pos int) syntax.EmptyOp { 349 r1, r2 := endOfText, endOfText 350 if pos > 0 && pos <= len(i.str) { 351 r1, _ = utf8.DecodeLastRune(i.str[:pos]) 352 } 353 if pos < len(i.str) { 354 r2, _ = utf8.DecodeRune(i.str[pos:]) 355 } 356 return syntax.EmptyOpContext(r1, r2) 357 } 358 359 // inputReader scans a RuneReader. 360 type inputReader struct { 361 r io.RuneReader 362 atEOT bool 363 pos int 364 } 365 366 func (i *inputReader) step(pos int) (rune, int) { 367 if !i.atEOT && pos != i.pos { 368 return endOfText, 0 369 370 } 371 r, w, err := i.r.ReadRune() 372 if err != nil { 373 i.atEOT = true 374 return endOfText, 0 375 } 376 i.pos += w 377 return r, w 378 } 379 380 func (i *inputReader) canCheckPrefix() bool { 381 return false 382 } 383 384 func (i *inputReader) hasPrefix(re *Regexp) bool { 385 return false 386 } 387 388 func (i *inputReader) index(re *Regexp, pos int) int { 389 return -1 390 } 391 392 func (i *inputReader) context(pos int) syntax.EmptyOp { 393 return 0 394 } 395 396 // LiteralPrefix returns a literal string that must begin any match 397 // of the regular expression re. It returns the boolean true if the 398 // literal string comprises the entire regular expression. 399 func (re *Regexp) LiteralPrefix() (prefix string, complete bool) { 400 return re.prefix, re.prefixComplete 401 } 402 403 // MatchReader reports whether the Regexp matches the text read by the 404 // RuneReader. 405 func (re *Regexp) MatchReader(r io.RuneReader) bool { 406 return re.doExecute(r, nil, "", 0, 0) != nil 407 } 408 409 // MatchString reports whether the Regexp matches the string s. 410 func (re *Regexp) MatchString(s string) bool { 411 return re.doExecute(nil, nil, s, 0, 0) != nil 412 } 413 414 // Match reports whether the Regexp matches the byte slice b. 415 func (re *Regexp) Match(b []byte) bool { 416 return re.doExecute(nil, b, "", 0, 0) != nil 417 } 418 419 // MatchReader checks whether a textual regular expression matches the text 420 // read by the RuneReader. More complicated queries need to use Compile and 421 // the full Regexp interface. 422 func MatchReader(pattern string, r io.RuneReader) (matched bool, err error) { 423 re, err := Compile(pattern) 424 if err != nil { 425 return false, err 426 } 427 return re.MatchReader(r), nil 428 } 429 430 // MatchString checks whether a textual regular expression 431 // matches a string. More complicated queries need 432 // to use Compile and the full Regexp interface. 433 func MatchString(pattern string, s string) (matched bool, err error) { 434 re, err := Compile(pattern) 435 if err != nil { 436 return false, err 437 } 438 return re.MatchString(s), nil 439 } 440 441 // Match checks whether a textual regular expression 442 // matches a byte slice. More complicated queries need 443 // to use Compile and the full Regexp interface. 444 func Match(pattern string, b []byte) (matched bool, err error) { 445 re, err := Compile(pattern) 446 if err != nil { 447 return false, err 448 } 449 return re.Match(b), nil 450 } 451 452 // ReplaceAllString returns a copy of src, replacing matches of the Regexp 453 // with the replacement string repl. Inside repl, $ signs are interpreted as 454 // in Expand, so for instance $1 represents the text of the first submatch. 455 func (re *Regexp) ReplaceAllString(src, repl string) string { 456 n := 2 457 if strings.Index(repl, "$") >= 0 { 458 n = 2 * (re.numSubexp + 1) 459 } 460 b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte { 461 return re.expand(dst, repl, nil, src, match) 462 }) 463 return string(b) 464 } 465 466 // ReplaceAllLiteralString returns a copy of src, replacing matches of the Regexp 467 // with the replacement string repl. The replacement repl is substituted directly, 468 // without using Expand. 469 func (re *Regexp) ReplaceAllLiteralString(src, repl string) string { 470 return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { 471 return append(dst, repl...) 472 })) 473 } 474 475 // ReplaceAllStringFunc returns a copy of src in which all matches of the 476 // Regexp have been replaced by the return value of function repl applied 477 // to the matched substring. The replacement returned by repl is substituted 478 // directly, without using Expand. 479 func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string { 480 b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { 481 return append(dst, repl(src[match[0]:match[1]])...) 482 }) 483 return string(b) 484 } 485 486 func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte { 487 lastMatchEnd := 0 // end position of the most recent match 488 searchPos := 0 // position where we next look for a match 489 var buf []byte 490 var endPos int 491 if bsrc != nil { 492 endPos = len(bsrc) 493 } else { 494 endPos = len(src) 495 } 496 if nmatch > re.prog.NumCap { 497 nmatch = re.prog.NumCap 498 } 499 500 for searchPos <= endPos { 501 a := re.doExecute(nil, bsrc, src, searchPos, nmatch) 502 if len(a) == 0 { 503 break // no more matches 504 } 505 506 // Copy the unmatched characters before this match. 507 if bsrc != nil { 508 buf = append(buf, bsrc[lastMatchEnd:a[0]]...) 509 } else { 510 buf = append(buf, src[lastMatchEnd:a[0]]...) 511 } 512 513 // Now insert a copy of the replacement string, but not for a 514 // match of the empty string immediately after another match. 515 // (Otherwise, we get double replacement for patterns that 516 // match both empty and nonempty strings.) 517 if a[1] > lastMatchEnd || a[0] == 0 { 518 buf = repl(buf, a) 519 } 520 lastMatchEnd = a[1] 521 522 // Advance past this match; always advance at least one character. 523 var width int 524 if bsrc != nil { 525 _, width = utf8.DecodeRune(bsrc[searchPos:]) 526 } else { 527 _, width = utf8.DecodeRuneInString(src[searchPos:]) 528 } 529 if searchPos+width > a[1] { 530 searchPos += width 531 } else if searchPos+1 > a[1] { 532 // This clause is only needed at the end of the input 533 // string. In that case, DecodeRuneInString returns width=0. 534 searchPos++ 535 } else { 536 searchPos = a[1] 537 } 538 } 539 540 // Copy the unmatched characters after the last match. 541 if bsrc != nil { 542 buf = append(buf, bsrc[lastMatchEnd:]...) 543 } else { 544 buf = append(buf, src[lastMatchEnd:]...) 545 } 546 547 return buf 548 } 549 550 // ReplaceAll returns a copy of src, replacing matches of the Regexp 551 // with the replacement text repl. Inside repl, $ signs are interpreted as 552 // in Expand, so for instance $1 represents the text of the first submatch. 553 func (re *Regexp) ReplaceAll(src, repl []byte) []byte { 554 n := 2 555 if bytes.IndexByte(repl, '$') >= 0 { 556 n = 2 * (re.numSubexp + 1) 557 } 558 srepl := "" 559 b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte { 560 if len(srepl) != len(repl) { 561 srepl = string(repl) 562 } 563 return re.expand(dst, srepl, src, "", match) 564 }) 565 return b 566 } 567 568 // ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp 569 // with the replacement bytes repl. The replacement repl is substituted directly, 570 // without using Expand. 571 func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte { 572 return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { 573 return append(dst, repl...) 574 }) 575 } 576 577 // ReplaceAllFunc returns a copy of src in which all matches of the 578 // Regexp have been replaced by the return value of function repl applied 579 // to the matched byte slice. The replacement returned by repl is substituted 580 // directly, without using Expand. 581 func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { 582 return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { 583 return append(dst, repl(src[match[0]:match[1]])...) 584 }) 585 } 586 587 var specialBytes = []byte(`\.+*?()|[]{}^$`) 588 589 func special(b byte) bool { 590 return bytes.IndexByte(specialBytes, b) >= 0 591 } 592 593 // QuoteMeta returns a string that quotes all regular expression metacharacters 594 // inside the argument text; the returned string is a regular expression matching 595 // the literal text. For example, QuoteMeta(`[foo]`) returns `\[foo\]`. 596 func QuoteMeta(s string) string { 597 b := make([]byte, 2*len(s)) 598 599 // A byte loop is correct because all metacharacters are ASCII. 600 j := 0 601 for i := 0; i < len(s); i++ { 602 if special(s[i]) { 603 b[j] = '\\' 604 j++ 605 } 606 b[j] = s[i] 607 j++ 608 } 609 return string(b[0:j]) 610 } 611 612 // The number of capture values in the program may correspond 613 // to fewer capturing expressions than are in the regexp. 614 // For example, "(a){0}" turns into an empty program, so the 615 // maximum capture in the program is 0 but we need to return 616 // an expression for \1. Pad appends -1s to the slice a as needed. 617 func (re *Regexp) pad(a []int) []int { 618 if a == nil { 619 // No match. 620 return nil 621 } 622 n := (1 + re.numSubexp) * 2 623 for len(a) < n { 624 a = append(a, -1) 625 } 626 return a 627 } 628 629 // Find matches in slice b if b is non-nil, otherwise find matches in string s. 630 func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) { 631 var end int 632 if b == nil { 633 end = len(s) 634 } else { 635 end = len(b) 636 } 637 638 for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; { 639 matches := re.doExecute(nil, b, s, pos, re.prog.NumCap) 640 if len(matches) == 0 { 641 break 642 } 643 644 accept := true 645 if matches[1] == pos { 646 // We've found an empty match. 647 if matches[0] == prevMatchEnd { 648 // We don't allow an empty match right 649 // after a previous match, so ignore it. 650 accept = false 651 } 652 var width int 653 // TODO: use step() 654 if b == nil { 655 _, width = utf8.DecodeRuneInString(s[pos:end]) 656 } else { 657 _, width = utf8.DecodeRune(b[pos:end]) 658 } 659 if width > 0 { 660 pos += width 661 } else { 662 pos = end + 1 663 } 664 } else { 665 pos = matches[1] 666 } 667 prevMatchEnd = matches[1] 668 669 if accept { 670 deliver(re.pad(matches)) 671 i++ 672 } 673 } 674 } 675 676 // Find returns a slice holding the text of the leftmost match in b of the regular expression. 677 // A return value of nil indicates no match. 678 func (re *Regexp) Find(b []byte) []byte { 679 a := re.doExecute(nil, b, "", 0, 2) 680 if a == nil { 681 return nil 682 } 683 return b[a[0]:a[1]] 684 } 685 686 // FindIndex returns a two-element slice of integers defining the location of 687 // the leftmost match in b of the regular expression. The match itself is at 688 // b[loc[0]:loc[1]]. 689 // A return value of nil indicates no match. 690 func (re *Regexp) FindIndex(b []byte) (loc []int) { 691 a := re.doExecute(nil, b, "", 0, 2) 692 if a == nil { 693 return nil 694 } 695 return a[0:2] 696 } 697 698 // FindString returns a string holding the text of the leftmost match in s of the regular 699 // expression. If there is no match, the return value is an empty string, 700 // but it will also be empty if the regular expression successfully matches 701 // an empty string. Use FindStringIndex or FindStringSubmatch if it is 702 // necessary to distinguish these cases. 703 func (re *Regexp) FindString(s string) string { 704 a := re.doExecute(nil, nil, s, 0, 2) 705 if a == nil { 706 return "" 707 } 708 return s[a[0]:a[1]] 709 } 710 711 // FindStringIndex returns a two-element slice of integers defining the 712 // location of the leftmost match in s of the regular expression. The match 713 // itself is at s[loc[0]:loc[1]]. 714 // A return value of nil indicates no match. 715 func (re *Regexp) FindStringIndex(s string) (loc []int) { 716 a := re.doExecute(nil, nil, s, 0, 2) 717 if a == nil { 718 return nil 719 } 720 return a[0:2] 721 } 722 723 // FindReaderIndex returns a two-element slice of integers defining the 724 // location of the leftmost match of the regular expression in text read from 725 // the RuneReader. The match text was found in the input stream at 726 // byte offset loc[0] through loc[1]-1. 727 // A return value of nil indicates no match. 728 func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) { 729 a := re.doExecute(r, nil, "", 0, 2) 730 if a == nil { 731 return nil 732 } 733 return a[0:2] 734 } 735 736 // FindSubmatch returns a slice of slices holding the text of the leftmost 737 // match of the regular expression in b and the matches, if any, of its 738 // subexpressions, as defined by the 'Submatch' descriptions in the package 739 // comment. 740 // A return value of nil indicates no match. 741 func (re *Regexp) FindSubmatch(b []byte) [][]byte { 742 a := re.doExecute(nil, b, "", 0, re.prog.NumCap) 743 if a == nil { 744 return nil 745 } 746 ret := make([][]byte, 1+re.numSubexp) 747 for i := range ret { 748 if 2*i < len(a) && a[2*i] >= 0 { 749 ret[i] = b[a[2*i]:a[2*i+1]] 750 } 751 } 752 return ret 753 } 754 755 // Expand appends template to dst and returns the result; during the 756 // append, Expand replaces variables in the template with corresponding 757 // matches drawn from src. The match slice should have been returned by 758 // FindSubmatchIndex. 759 // 760 // In the template, a variable is denoted by a substring of the form 761 // $name or ${name}, where name is a non-empty sequence of letters, 762 // digits, and underscores. A purely numeric name like $1 refers to 763 // the submatch with the corresponding index; other names refer to 764 // capturing parentheses named with the (?P<name>...) syntax. A 765 // reference to an out of range or unmatched index or a name that is not 766 // present in the regular expression is replaced with an empty slice. 767 // 768 // In the $name form, name is taken to be as long as possible: $1x is 769 // equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0. 770 // 771 // To insert a literal $ in the output, use $$ in the template. 772 func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte { 773 return re.expand(dst, string(template), src, "", match) 774 } 775 776 // ExpandString is like Expand but the template and source are strings. 777 // It appends to and returns a byte slice in order to give the calling 778 // code control over allocation. 779 func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte { 780 return re.expand(dst, template, nil, src, match) 781 } 782 783 func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte { 784 for len(template) > 0 { 785 i := strings.Index(template, "$") 786 if i < 0 { 787 break 788 } 789 dst = append(dst, template[:i]...) 790 template = template[i:] 791 if len(template) > 1 && template[1] == '$' { 792 // Treat $$ as $. 793 dst = append(dst, '$') 794 template = template[2:] 795 continue 796 } 797 name, num, rest, ok := extract(template) 798 if !ok { 799 // Malformed; treat $ as raw text. 800 dst = append(dst, '$') 801 template = template[1:] 802 continue 803 } 804 template = rest 805 if num >= 0 { 806 if 2*num+1 < len(match) && match[2*num] >= 0 { 807 if bsrc != nil { 808 dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...) 809 } else { 810 dst = append(dst, src[match[2*num]:match[2*num+1]]...) 811 } 812 } 813 } else { 814 for i, namei := range re.subexpNames { 815 if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 { 816 if bsrc != nil { 817 dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...) 818 } else { 819 dst = append(dst, src[match[2*i]:match[2*i+1]]...) 820 } 821 break 822 } 823 } 824 } 825 } 826 dst = append(dst, template...) 827 return dst 828 } 829 830 // extract returns the name from a leading "$name" or "${name}" in str. 831 // If it is a number, extract returns num set to that number; otherwise num = -1. 832 func extract(str string) (name string, num int, rest string, ok bool) { 833 if len(str) < 2 || str[0] != '$' { 834 return 835 } 836 brace := false 837 if str[1] == '{' { 838 brace = true 839 str = str[2:] 840 } else { 841 str = str[1:] 842 } 843 i := 0 844 for i < len(str) { 845 rune, size := utf8.DecodeRuneInString(str[i:]) 846 if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' { 847 break 848 } 849 i += size 850 } 851 if i == 0 { 852 // empty name is not okay 853 return 854 } 855 name = str[:i] 856 if brace { 857 if i >= len(str) || str[i] != '}' { 858 // missing closing brace 859 return 860 } 861 i++ 862 } 863 864 // Parse number. 865 num = 0 866 for i := 0; i < len(name); i++ { 867 if name[i] < '0' || '9' < name[i] || num >= 1e8 { 868 num = -1 869 break 870 } 871 num = num*10 + int(name[i]) - '0' 872 } 873 // Disallow leading zeros. 874 if name[0] == '0' && len(name) > 1 { 875 num = -1 876 } 877 878 rest = str[i:] 879 ok = true 880 return 881 } 882 883 // FindSubmatchIndex returns a slice holding the index pairs identifying the 884 // leftmost match of the regular expression in b and the matches, if any, of 885 // its subexpressions, as defined by the 'Submatch' and 'Index' descriptions 886 // in the package comment. 887 // A return value of nil indicates no match. 888 func (re *Regexp) FindSubmatchIndex(b []byte) []int { 889 return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap)) 890 } 891 892 // FindStringSubmatch returns a slice of strings holding the text of the 893 // leftmost match of the regular expression in s and the matches, if any, of 894 // its subexpressions, as defined by the 'Submatch' description in the 895 // package comment. 896 // A return value of nil indicates no match. 897 func (re *Regexp) FindStringSubmatch(s string) []string { 898 a := re.doExecute(nil, nil, s, 0, re.prog.NumCap) 899 if a == nil { 900 return nil 901 } 902 ret := make([]string, 1+re.numSubexp) 903 for i := range ret { 904 if 2*i < len(a) && a[2*i] >= 0 { 905 ret[i] = s[a[2*i]:a[2*i+1]] 906 } 907 } 908 return ret 909 } 910 911 // FindStringSubmatchIndex returns a slice holding the index pairs 912 // identifying the leftmost match of the regular expression in s and the 913 // matches, if any, of its subexpressions, as defined by the 'Submatch' and 914 // 'Index' descriptions in the package comment. 915 // A return value of nil indicates no match. 916 func (re *Regexp) FindStringSubmatchIndex(s string) []int { 917 return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap)) 918 } 919 920 // FindReaderSubmatchIndex returns a slice holding the index pairs 921 // identifying the leftmost match of the regular expression of text read by 922 // the RuneReader, and the matches, if any, of its subexpressions, as defined 923 // by the 'Submatch' and 'Index' descriptions in the package comment. A 924 // return value of nil indicates no match. 925 func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int { 926 return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap)) 927 } 928 929 const startSize = 10 // The size at which to start a slice in the 'All' routines. 930 931 // FindAll is the 'All' version of Find; it returns a slice of all successive 932 // matches of the expression, as defined by the 'All' description in the 933 // package comment. 934 // A return value of nil indicates no match. 935 func (re *Regexp) FindAll(b []byte, n int) [][]byte { 936 if n < 0 { 937 n = len(b) + 1 938 } 939 result := make([][]byte, 0, startSize) 940 re.allMatches("", b, n, func(match []int) { 941 result = append(result, b[match[0]:match[1]]) 942 }) 943 if len(result) == 0 { 944 return nil 945 } 946 return result 947 } 948 949 // FindAllIndex is the 'All' version of FindIndex; it returns a slice of all 950 // successive matches of the expression, as defined by the 'All' description 951 // in the package comment. 952 // A return value of nil indicates no match. 953 func (re *Regexp) FindAllIndex(b []byte, n int) [][]int { 954 if n < 0 { 955 n = len(b) + 1 956 } 957 result := make([][]int, 0, startSize) 958 re.allMatches("", b, n, func(match []int) { 959 result = append(result, match[0:2]) 960 }) 961 if len(result) == 0 { 962 return nil 963 } 964 return result 965 } 966 967 // FindAllString is the 'All' version of FindString; it returns a slice of all 968 // successive matches of the expression, as defined by the 'All' description 969 // in the package comment. 970 // A return value of nil indicates no match. 971 func (re *Regexp) FindAllString(s string, n int) []string { 972 if n < 0 { 973 n = len(s) + 1 974 } 975 result := make([]string, 0, startSize) 976 re.allMatches(s, nil, n, func(match []int) { 977 result = append(result, s[match[0]:match[1]]) 978 }) 979 if len(result) == 0 { 980 return nil 981 } 982 return result 983 } 984 985 // FindAllStringIndex is the 'All' version of FindStringIndex; it returns a 986 // slice of all successive matches of the expression, as defined by the 'All' 987 // description in the package comment. 988 // A return value of nil indicates no match. 989 func (re *Regexp) FindAllStringIndex(s string, n int) [][]int { 990 if n < 0 { 991 n = len(s) + 1 992 } 993 result := make([][]int, 0, startSize) 994 re.allMatches(s, nil, n, func(match []int) { 995 result = append(result, match[0:2]) 996 }) 997 if len(result) == 0 { 998 return nil 999 } 1000 return result 1001 } 1002 1003 // FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice 1004 // of all successive matches of the expression, as defined by the 'All' 1005 // description in the package comment. 1006 // A return value of nil indicates no match. 1007 func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte { 1008 if n < 0 { 1009 n = len(b) + 1 1010 } 1011 result := make([][][]byte, 0, startSize) 1012 re.allMatches("", b, n, func(match []int) { 1013 slice := make([][]byte, len(match)/2) 1014 for j := range slice { 1015 if match[2*j] >= 0 { 1016 slice[j] = b[match[2*j]:match[2*j+1]] 1017 } 1018 } 1019 result = append(result, slice) 1020 }) 1021 if len(result) == 0 { 1022 return nil 1023 } 1024 return result 1025 } 1026 1027 // FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns 1028 // a slice of all successive matches of the expression, as defined by the 1029 // 'All' description in the package comment. 1030 // A return value of nil indicates no match. 1031 func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int { 1032 if n < 0 { 1033 n = len(b) + 1 1034 } 1035 result := make([][]int, 0, startSize) 1036 re.allMatches("", b, n, func(match []int) { 1037 result = append(result, match) 1038 }) 1039 if len(result) == 0 { 1040 return nil 1041 } 1042 return result 1043 } 1044 1045 // FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it 1046 // returns a slice of all successive matches of the expression, as defined by 1047 // the 'All' description in the package comment. 1048 // A return value of nil indicates no match. 1049 func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string { 1050 if n < 0 { 1051 n = len(s) + 1 1052 } 1053 result := make([][]string, 0, startSize) 1054 re.allMatches(s, nil, n, func(match []int) { 1055 slice := make([]string, len(match)/2) 1056 for j := range slice { 1057 if match[2*j] >= 0 { 1058 slice[j] = s[match[2*j]:match[2*j+1]] 1059 } 1060 } 1061 result = append(result, slice) 1062 }) 1063 if len(result) == 0 { 1064 return nil 1065 } 1066 return result 1067 } 1068 1069 // FindAllStringSubmatchIndex is the 'All' version of 1070 // FindStringSubmatchIndex; it returns a slice of all successive matches of 1071 // the expression, as defined by the 'All' description in the package 1072 // comment. 1073 // A return value of nil indicates no match. 1074 func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int { 1075 if n < 0 { 1076 n = len(s) + 1 1077 } 1078 result := make([][]int, 0, startSize) 1079 re.allMatches(s, nil, n, func(match []int) { 1080 result = append(result, match) 1081 }) 1082 if len(result) == 0 { 1083 return nil 1084 } 1085 return result 1086 } 1087 1088 // Split slices s into substrings separated by the expression and returns a slice of 1089 // the substrings between those expression matches. 1090 // 1091 // The slice returned by this method consists of all the substrings of s 1092 // not contained in the slice returned by FindAllString. When called on an expression 1093 // that contains no metacharacters, it is equivalent to strings.SplitN. 1094 // 1095 // Example: 1096 // s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5) 1097 // // s: ["", "b", "b", "c", "cadaaae"] 1098 // 1099 // The count determines the number of substrings to return: 1100 // n > 0: at most n substrings; the last substring will be the unsplit remainder. 1101 // n == 0: the result is nil (zero substrings) 1102 // n < 0: all substrings 1103 func (re *Regexp) Split(s string, n int) []string { 1104 1105 if n == 0 { 1106 return nil 1107 } 1108 1109 if len(re.expr) > 0 && len(s) == 0 { 1110 return []string{""} 1111 } 1112 1113 matches := re.FindAllStringIndex(s, n) 1114 strings := make([]string, 0, len(matches)) 1115 1116 beg := 0 1117 end := 0 1118 for _, match := range matches { 1119 if n > 0 && len(strings) >= n-1 { 1120 break 1121 } 1122 1123 end = match[0] 1124 if match[1] != 0 { 1125 strings = append(strings, s[beg:end]) 1126 } 1127 beg = match[1] 1128 } 1129 1130 if end != len(s) { 1131 strings = append(strings, s[beg:]) 1132 } 1133 1134 return strings 1135 }