github.com/code-reading/golang@v0.0.0-20220303082512-ba5bc0e589a3/go/src/regexp/regexp.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package regexp implements regular expression search. 6 // 7 // The syntax of the regular expressions accepted is the same 8 // general syntax used by Perl, Python, and other languages. 9 // More precisely, it is the syntax accepted by RE2 and described at 10 // https://golang.org/s/re2syntax, except for \C. 11 // For an overview of the syntax, run 12 // go doc regexp/syntax 13 // 14 // The regexp implementation provided by this package is 15 // guaranteed to run in time linear in the size of the input. 16 // (This is a property not guaranteed by most open source 17 // implementations of regular expressions.) For more information 18 // about this property, see 19 // https://swtch.com/~rsc/regexp/regexp1.html 20 // or any book about automata theory. 21 // 22 // All characters are UTF-8-encoded code points. 23 // 24 // There are 16 methods of Regexp that match a regular expression and identify 25 // the matched text. Their names are matched by this regular expression: 26 // 27 // Find(All)?(String)?(Submatch)?(Index)? 28 // 29 // If 'All' is present, the routine matches successive non-overlapping 30 // matches of the entire expression. Empty matches abutting a preceding 31 // match are ignored. The return value is a slice containing the successive 32 // return values of the corresponding non-'All' routine. These routines take 33 // an extra integer argument, n. If n >= 0, the function returns at most n 34 // matches/submatches; otherwise, it returns all of them. 35 // 36 // If 'String' is present, the argument is a string; otherwise it is a slice 37 // of bytes; return values are adjusted as appropriate. 
//
// If 'Submatch' is present, the return value is a slice identifying the
// successive submatches of the expression. Submatches are matches of
// parenthesized subexpressions (also known as capturing groups) within the
// regular expression, numbered from left to right in order of opening
// parenthesis. Submatch 0 is the match of the entire expression, submatch 1
// the match of the first parenthesized subexpression, and so on.
//
// If 'Index' is present, matches and submatches are identified by byte index
// pairs within the input string: result[2*n:2*n+2] identifies the indexes of
// the nth submatch. The pair for n==0 identifies the match of the entire
// expression. If 'Index' is not present, the match is identified by the text
// of the match/submatch. If an index is negative or text is nil, it means that
// subexpression did not match any string in the input. For 'String' versions
// an empty string means either no match or an empty match.
//
// There is also a subset of the methods that can be applied to text read
// from a RuneReader:
//
//	MatchReader, FindReaderIndex, FindReaderSubmatchIndex
//
// This set may grow. Note that regular expression matches may need to
// examine text beyond the text returned by a match, so the methods that
// match text from a RuneReader may read arbitrarily far into the input
// before returning.
//
// (There are a few other methods that do not match this pattern.)
//
package regexp

import (
	"bytes"
	"io"
	"regexp/syntax"
	"strconv"
	"strings"
	"sync"
	"unicode"
	"unicode/utf8"
)

// Regexp is the representation of a compiled regular expression.
// A Regexp is safe for concurrent use by multiple goroutines,
// except for configuration methods, such as Longest.
type Regexp struct {
	expr           string         // as passed to Compile
	prog           *syntax.Prog   // compiled program
	onepass        *onePassProg   // onepass program or nil
	numSubexp      int            // number of capture groups, taken from the parsed regexp's MaxCap
	maxBitStateLen int            // set from maxBitStateLen(prog) when there is no onepass program
	subexpNames    []string       // capture names, taken from the parsed regexp's CapNames
	prefix         string         // required prefix in unanchored matches
	prefixBytes    []byte         // prefix, as a []byte
	prefixRune     rune           // first rune in prefix
	prefixEnd      uint32         // pc for last rune in prefix
	mpool          int            // pool for machines (index into matchPool)
	matchcap       int            // size of recorded match lengths
	prefixComplete bool           // prefix is the entire regexp
	cond           syntax.EmptyOp // empty-width conditions required at start of match
	minInputLen    int            // minimum length of the input in bytes

	// This field can be modified by the Longest method,
	// but it is otherwise read-only.
	longest bool // whether regexp prefers leftmost-longest match
}

// String returns the source text used to compile the regular expression.
func (re *Regexp) String() string {
	return re.expr
}

// Copy returns a new Regexp object copied from re.
// Calling Longest on one copy does not affect another.
//
// Deprecated: In earlier releases, when using a Regexp in multiple goroutines,
// giving each goroutine its own copy helped to avoid lock contention.
// As of Go 1.12, using Copy is no longer necessary to avoid lock contention.
// Copy may still be appropriate if the reason for its use is to make
// two copies with different Longest settings.
func (re *Regexp) Copy() *Regexp {
	// A plain struct copy: the new value gets its own longest flag while
	// pointer and slice fields still refer to the same underlying data.
	re2 := *re
	return &re2
}

// Compile parses a regular expression and returns, if successful,
// a Regexp object that can be used to match against text.
//
// When matching against text, the regexp returns a match that
// begins as early as possible in the input (leftmost), and among those
// it chooses the one that a backtracking search would have found first.
// This so-called leftmost-first matching is the same semantics
// that Perl, Python, and other implementations use, although this
// package implements it without the expense of backtracking.
// For POSIX leftmost-longest matching, see CompilePOSIX.
func Compile(expr string) (*Regexp, error) {
	// Perl syntax flags; longest=false keeps leftmost-first semantics.
	return compile(expr, syntax.Perl, false)
}

// CompilePOSIX is like Compile but restricts the regular expression
// to POSIX ERE (egrep) syntax and changes the match semantics to
// leftmost-longest.
//
// That is, when matching against text, the regexp returns a match that
// begins as early as possible in the input (leftmost), and among those
// it chooses a match that is as long as possible.
// This so-called leftmost-longest matching is the same semantics
// that early regular expression implementations used and that POSIX
// specifies.
//
// However, there can be multiple leftmost-longest matches, with different
// submatch choices, and here this package diverges from POSIX.
// Among the possible leftmost-longest matches, this package chooses
// the one that a backtracking search would have found first, while POSIX
// specifies that the match be chosen to maximize the length of the first
// subexpression, then the second, and so on from left to right.
// The POSIX rule is computationally prohibitive and not even well-defined.
// See https://swtch.com/~rsc/regexp/regexp2.html#posix for details.
func CompilePOSIX(expr string) (*Regexp, error) {
	// POSIX syntax flags; longest=true selects leftmost-longest semantics.
	return compile(expr, syntax.POSIX, true)
}

// Longest makes future searches prefer the leftmost-longest match.
// That is, when matching against text, the regexp returns a match that
// begins as early as possible in the input (leftmost), and among those
// it chooses a match that is as long as possible.
163 // This method modifies the Regexp and may not be called concurrently 164 // with any other methods. 165 func (re *Regexp) Longest() { 166 re.longest = true 167 } 168 169 func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) { 170 re, err := syntax.Parse(expr, mode) 171 if err != nil { 172 return nil, err 173 } 174 maxCap := re.MaxCap() 175 capNames := re.CapNames() 176 177 re = re.Simplify() 178 prog, err := syntax.Compile(re) 179 if err != nil { 180 return nil, err 181 } 182 matchcap := prog.NumCap 183 if matchcap < 2 { 184 matchcap = 2 185 } 186 regexp := &Regexp{ 187 expr: expr, 188 prog: prog, 189 onepass: compileOnePass(prog), 190 numSubexp: maxCap, 191 subexpNames: capNames, 192 cond: prog.StartCond(), 193 longest: longest, 194 matchcap: matchcap, 195 minInputLen: minInputLen(re), 196 } 197 if regexp.onepass == nil { 198 regexp.prefix, regexp.prefixComplete = prog.Prefix() 199 regexp.maxBitStateLen = maxBitStateLen(prog) 200 } else { 201 regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog) 202 } 203 if regexp.prefix != "" { 204 // TODO(rsc): Remove this allocation by adding 205 // IndexString to package bytes. 206 regexp.prefixBytes = []byte(regexp.prefix) 207 regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix) 208 } 209 210 n := len(prog.Inst) 211 i := 0 212 for matchSize[i] != 0 && matchSize[i] < n { 213 i++ 214 } 215 regexp.mpool = i 216 217 return regexp, nil 218 } 219 220 // Pools of *machine for use during (*Regexp).doExecute, 221 // split up by the size of the execution queues. 222 // matchPool[i] machines have queue size matchSize[i]. 223 // On a 64-bit system each queue entry is 16 bytes, 224 // so matchPool[0] has 16*2*128 = 4kB queues, etc. 225 // The final matchPool is a catch-all for very large queues. 226 var ( 227 matchSize = [...]int{128, 512, 2048, 16384, 0} 228 matchPool [len(matchSize)]sync.Pool 229 ) 230 231 // get returns a machine to use for matching re. 
232 // It uses the re's machine cache if possible, to avoid 233 // unnecessary allocation. 234 func (re *Regexp) get() *machine { 235 m, ok := matchPool[re.mpool].Get().(*machine) 236 if !ok { 237 m = new(machine) 238 } 239 m.re = re 240 m.p = re.prog 241 if cap(m.matchcap) < re.matchcap { 242 m.matchcap = make([]int, re.matchcap) 243 for _, t := range m.pool { 244 t.cap = make([]int, re.matchcap) 245 } 246 } 247 248 // Allocate queues if needed. 249 // Or reallocate, for "large" match pool. 250 n := matchSize[re.mpool] 251 if n == 0 { // large pool 252 n = len(re.prog.Inst) 253 } 254 if len(m.q0.sparse) < n { 255 m.q0 = queue{make([]uint32, n), make([]entry, 0, n)} 256 m.q1 = queue{make([]uint32, n), make([]entry, 0, n)} 257 } 258 return m 259 } 260 261 // put returns a machine to the correct machine pool. 262 func (re *Regexp) put(m *machine) { 263 m.re = nil 264 m.p = nil 265 m.inputs.clear() 266 matchPool[re.mpool].Put(m) 267 } 268 269 // minInputLen walks the regexp to find the minimum length of any matchable input 270 func minInputLen(re *syntax.Regexp) int { 271 switch re.Op { 272 default: 273 return 0 274 case syntax.OpAnyChar, syntax.OpAnyCharNotNL, syntax.OpCharClass: 275 return 1 276 case syntax.OpLiteral: 277 l := 0 278 for _, r := range re.Rune { 279 l += utf8.RuneLen(r) 280 } 281 return l 282 case syntax.OpCapture, syntax.OpPlus: 283 return minInputLen(re.Sub[0]) 284 case syntax.OpRepeat: 285 return re.Min * minInputLen(re.Sub[0]) 286 case syntax.OpConcat: 287 l := 0 288 for _, sub := range re.Sub { 289 l += minInputLen(sub) 290 } 291 return l 292 case syntax.OpAlternate: 293 l := minInputLen(re.Sub[0]) 294 var lnext int 295 for _, sub := range re.Sub[1:] { 296 lnext = minInputLen(sub) 297 if lnext < l { 298 l = lnext 299 } 300 } 301 return l 302 } 303 } 304 305 // MustCompile is like Compile but panics if the expression cannot be parsed. 306 // It simplifies safe initialization of global variables holding compiled regular 307 // expressions. 
308 func MustCompile(str string) *Regexp { 309 regexp, err := Compile(str) 310 if err != nil { 311 panic(`regexp: Compile(` + quote(str) + `): ` + err.Error()) 312 } 313 return regexp 314 } 315 316 // MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed. 317 // It simplifies safe initialization of global variables holding compiled regular 318 // expressions. 319 func MustCompilePOSIX(str string) *Regexp { 320 regexp, err := CompilePOSIX(str) 321 if err != nil { 322 panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + err.Error()) 323 } 324 return regexp 325 } 326 327 func quote(s string) string { 328 if strconv.CanBackquote(s) { 329 return "`" + s + "`" 330 } 331 return strconv.Quote(s) 332 } 333 334 // NumSubexp returns the number of parenthesized subexpressions in this Regexp. 335 func (re *Regexp) NumSubexp() int { 336 return re.numSubexp 337 } 338 339 // SubexpNames returns the names of the parenthesized subexpressions 340 // in this Regexp. The name for the first sub-expression is names[1], 341 // so that if m is a match slice, the name for m[i] is SubexpNames()[i]. 342 // Since the Regexp as a whole cannot be named, names[0] is always 343 // the empty string. The slice should not be modified. 344 func (re *Regexp) SubexpNames() []string { 345 return re.subexpNames 346 } 347 348 // SubexpIndex returns the index of the first subexpression with the given name, 349 // or -1 if there is no subexpression with that name. 350 // 351 // Note that multiple subexpressions can be written using the same name, as in 352 // (?P<bob>a+)(?P<bob>b+), which declares two subexpressions named "bob". 353 // In this case, SubexpIndex returns the index of the leftmost such subexpression 354 // in the regular expression. 
355 func (re *Regexp) SubexpIndex(name string) int { 356 if name != "" { 357 for i, s := range re.subexpNames { 358 if name == s { 359 return i 360 } 361 } 362 } 363 return -1 364 } 365 366 const endOfText rune = -1 367 368 // input abstracts different representations of the input text. It provides 369 // one-character lookahead. 370 type input interface { 371 step(pos int) (r rune, width int) // advance one rune 372 canCheckPrefix() bool // can we look ahead without losing info? 373 hasPrefix(re *Regexp) bool 374 index(re *Regexp, pos int) int 375 context(pos int) lazyFlag 376 } 377 378 // inputString scans a string. 379 type inputString struct { 380 str string 381 } 382 383 func (i *inputString) step(pos int) (rune, int) { 384 if pos < len(i.str) { 385 c := i.str[pos] 386 if c < utf8.RuneSelf { 387 return rune(c), 1 388 } 389 return utf8.DecodeRuneInString(i.str[pos:]) 390 } 391 return endOfText, 0 392 } 393 394 func (i *inputString) canCheckPrefix() bool { 395 return true 396 } 397 398 func (i *inputString) hasPrefix(re *Regexp) bool { 399 return strings.HasPrefix(i.str, re.prefix) 400 } 401 402 func (i *inputString) index(re *Regexp, pos int) int { 403 return strings.Index(i.str[pos:], re.prefix) 404 } 405 406 func (i *inputString) context(pos int) lazyFlag { 407 r1, r2 := endOfText, endOfText 408 // 0 < pos && pos <= len(i.str) 409 if uint(pos-1) < uint(len(i.str)) { 410 r1 = rune(i.str[pos-1]) 411 if r1 >= utf8.RuneSelf { 412 r1, _ = utf8.DecodeLastRuneInString(i.str[:pos]) 413 } 414 } 415 // 0 <= pos && pos < len(i.str) 416 if uint(pos) < uint(len(i.str)) { 417 r2 = rune(i.str[pos]) 418 if r2 >= utf8.RuneSelf { 419 r2, _ = utf8.DecodeRuneInString(i.str[pos:]) 420 } 421 } 422 return newLazyFlag(r1, r2) 423 } 424 425 // inputBytes scans a byte slice. 
426 type inputBytes struct { 427 str []byte 428 } 429 430 func (i *inputBytes) step(pos int) (rune, int) { 431 if pos < len(i.str) { 432 c := i.str[pos] 433 if c < utf8.RuneSelf { 434 return rune(c), 1 435 } 436 return utf8.DecodeRune(i.str[pos:]) 437 } 438 return endOfText, 0 439 } 440 441 func (i *inputBytes) canCheckPrefix() bool { 442 return true 443 } 444 445 func (i *inputBytes) hasPrefix(re *Regexp) bool { 446 return bytes.HasPrefix(i.str, re.prefixBytes) 447 } 448 449 func (i *inputBytes) index(re *Regexp, pos int) int { 450 return bytes.Index(i.str[pos:], re.prefixBytes) 451 } 452 453 func (i *inputBytes) context(pos int) lazyFlag { 454 r1, r2 := endOfText, endOfText 455 // 0 < pos && pos <= len(i.str) 456 if uint(pos-1) < uint(len(i.str)) { 457 r1 = rune(i.str[pos-1]) 458 if r1 >= utf8.RuneSelf { 459 r1, _ = utf8.DecodeLastRune(i.str[:pos]) 460 } 461 } 462 // 0 <= pos && pos < len(i.str) 463 if uint(pos) < uint(len(i.str)) { 464 r2 = rune(i.str[pos]) 465 if r2 >= utf8.RuneSelf { 466 r2, _ = utf8.DecodeRune(i.str[pos:]) 467 } 468 } 469 return newLazyFlag(r1, r2) 470 } 471 472 // inputReader scans a RuneReader. 473 type inputReader struct { 474 r io.RuneReader 475 atEOT bool 476 pos int 477 } 478 479 func (i *inputReader) step(pos int) (rune, int) { 480 if !i.atEOT && pos != i.pos { 481 return endOfText, 0 482 483 } 484 r, w, err := i.r.ReadRune() 485 if err != nil { 486 i.atEOT = true 487 return endOfText, 0 488 } 489 i.pos += w 490 return r, w 491 } 492 493 func (i *inputReader) canCheckPrefix() bool { 494 return false 495 } 496 497 func (i *inputReader) hasPrefix(re *Regexp) bool { 498 return false 499 } 500 501 func (i *inputReader) index(re *Regexp, pos int) int { 502 return -1 503 } 504 505 func (i *inputReader) context(pos int) lazyFlag { 506 return 0 // not used 507 } 508 509 // LiteralPrefix returns a literal string that must begin any match 510 // of the regular expression re. 
// It returns the boolean true if the
// literal string comprises the entire regular expression.
func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
	return re.prefix, re.prefixComplete
}

// MatchReader reports whether the text returned by the RuneReader
// contains any match of the regular expression re.
func (re *Regexp) MatchReader(r io.RuneReader) bool {
	// Only the reader argument of doMatch is supplied.
	return re.doMatch(r, nil, "")
}

// MatchString reports whether the string s
// contains any match of the regular expression re.
func (re *Regexp) MatchString(s string) bool {
	// Only the string argument of doMatch is supplied.
	return re.doMatch(nil, nil, s)
}

// Match reports whether the byte slice b
// contains any match of the regular expression re.
func (re *Regexp) Match(b []byte) bool {
	// Only the byte-slice argument of doMatch is supplied.
	return re.doMatch(nil, b, "")
}

// MatchReader reports whether the text returned by the RuneReader
// contains any match of the regular expression pattern.
// More complicated queries need to use Compile and the full Regexp interface.
//
// The pattern is compiled on every call; a compilation failure is returned
// as err with matched set to false.
func MatchReader(pattern string, r io.RuneReader) (matched bool, err error) {
	re, err := Compile(pattern)
	if err != nil {
		return false, err
	}
	return re.MatchReader(r), nil
}

// MatchString reports whether the string s
// contains any match of the regular expression pattern.
// More complicated queries need to use Compile and the full Regexp interface.
//
// The pattern is compiled on every call; a compilation failure is returned
// as err with matched set to false.
func MatchString(pattern string, s string) (matched bool, err error) {
	re, err := Compile(pattern)
	if err != nil {
		return false, err
	}
	return re.MatchString(s), nil
}

// Match reports whether the byte slice b
// contains any match of the regular expression pattern.
// More complicated queries need to use Compile and the full Regexp interface.
func Match(pattern string, b []byte) (matched bool, err error) {
	// The pattern is compiled on every call; a compilation failure is
	// returned as err with matched set to false.
	re, err := Compile(pattern)
	if err != nil {
		return false, err
	}
	return re.Match(b), nil
}

// ReplaceAllString returns a copy of src, replacing matches of the Regexp
// with the replacement string repl. Inside repl, $ signs are interpreted as
// in Expand, so for instance $1 represents the text of the first submatch.
func (re *Regexp) ReplaceAllString(src, repl string) string {
	// Submatch positions are only requested when repl contains a '$';
	// otherwise the overall match pair is enough.
	n := 2
	if strings.Contains(repl, "$") {
		n = 2 * (re.numSubexp + 1)
	}
	b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte {
		return re.expand(dst, repl, nil, src, match)
	})
	return string(b)
}

// ReplaceAllLiteralString returns a copy of src, replacing matches of the Regexp
// with the replacement string repl. The replacement repl is substituted directly,
// without using Expand.
func (re *Regexp) ReplaceAllLiteralString(src, repl string) string {
	return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
		return append(dst, repl...)
	}))
}

// ReplaceAllStringFunc returns a copy of src in which all matches of the
// Regexp have been replaced by the return value of function repl applied
// to the matched substring. The replacement returned by repl is substituted
// directly, without using Expand.
func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
	b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
		return append(dst, repl(src[match[0]:match[1]])...)
	})
	return string(b)
}

// replaceAll is the shared loop behind the ReplaceAll* methods. The input is
// bsrc if it is non-nil, otherwise src. nmatch is the number of match
// indexes requested from doExecute, clamped to re.prog.NumCap. For every
// match that is to be replaced, repl is called with the output built so far
// and the match indexes, and must return the extended output. The result is
// the input with unmatched text copied through and matches replaced.
func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte {
	lastMatchEnd := 0 // end position of the most recent match
	searchPos := 0    // position where we next look for a match
	var buf []byte
	var endPos int
	if bsrc != nil {
		endPos = len(bsrc)
	} else {
		endPos = len(src)
	}
	if nmatch > re.prog.NumCap {
		nmatch = re.prog.NumCap
	}

	var dstCap [2]int
	// searchPos == endPos is still searched, so an empty match at the very
	// end of the input is found.
	for searchPos <= endPos {
		a := re.doExecute(nil, bsrc, src, searchPos, nmatch, dstCap[:0])
		if len(a) == 0 {
			break // no more matches
		}

		// Copy the unmatched characters before this match.
		if bsrc != nil {
			buf = append(buf, bsrc[lastMatchEnd:a[0]]...)
		} else {
			buf = append(buf, src[lastMatchEnd:a[0]]...)
		}

		// Now insert a copy of the replacement string, but not for a
		// match of the empty string immediately after another match.
		// (Otherwise, we get double replacement for patterns that
		// match both empty and nonempty strings.)
		if a[1] > lastMatchEnd || a[0] == 0 {
			buf = repl(buf, a)
		}
		lastMatchEnd = a[1]

		// Advance past this match; always advance at least one character.
		var width int
		if bsrc != nil {
			_, width = utf8.DecodeRune(bsrc[searchPos:])
		} else {
			_, width = utf8.DecodeRuneInString(src[searchPos:])
		}
		if searchPos+width > a[1] {
			searchPos += width
		} else if searchPos+1 > a[1] {
			// This clause is only needed at the end of the input
			// string. In that case, DecodeRuneInString returns width=0.
			searchPos++
		} else {
			searchPos = a[1]
		}
	}

	// Copy the unmatched characters after the last match.
	if bsrc != nil {
		buf = append(buf, bsrc[lastMatchEnd:]...)
	} else {
		buf = append(buf, src[lastMatchEnd:]...)
	}

	return buf
}

// ReplaceAll returns a copy of src, replacing matches of the Regexp
// with the replacement text repl. Inside repl, $ signs are interpreted as
// in Expand, so for instance $1 represents the text of the first submatch.
func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
	// Submatch positions are only requested when repl contains a '$'.
	n := 2
	if bytes.IndexByte(repl, '$') >= 0 {
		n = 2 * (re.numSubexp + 1)
	}
	srepl := ""
	b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte {
		// Convert repl to a string lazily, inside the callback, so the
		// conversion is skipped when there is no match. The length test
		// is true only until srepl has been filled in.
		if len(srepl) != len(repl) {
			srepl = string(repl)
		}
		return re.expand(dst, srepl, src, "", match)
	})
	return b
}

// ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp
// with the replacement bytes repl. The replacement repl is substituted directly,
// without using Expand.
func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte {
	return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
		return append(dst, repl...)
	})
}

// ReplaceAllFunc returns a copy of src in which all matches of the
// Regexp have been replaced by the return value of function repl applied
// to the matched byte slice. The replacement returned by repl is substituted
// directly, without using Expand.
func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
	return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
		return append(dst, repl(src[match[0]:match[1]])...)
	})
}

// Bitmap used by func special to check whether a character needs to be escaped.
var specialBytes [16]byte

// special reports whether byte b needs to be escaped by QuoteMeta.
707 func special(b byte) bool { 708 return b < utf8.RuneSelf && specialBytes[b%16]&(1<<(b/16)) != 0 709 } 710 711 func init() { 712 for _, b := range []byte(`\.+*?()|[]{}^$`) { 713 specialBytes[b%16] |= 1 << (b / 16) 714 } 715 } 716 717 // QuoteMeta returns a string that escapes all regular expression metacharacters 718 // inside the argument text; the returned string is a regular expression matching 719 // the literal text. 720 func QuoteMeta(s string) string { 721 // A byte loop is correct because all metacharacters are ASCII. 722 var i int 723 for i = 0; i < len(s); i++ { 724 if special(s[i]) { 725 break 726 } 727 } 728 // No meta characters found, so return original string. 729 if i >= len(s) { 730 return s 731 } 732 733 b := make([]byte, 2*len(s)-i) 734 copy(b, s[:i]) 735 j := i 736 for ; i < len(s); i++ { 737 if special(s[i]) { 738 b[j] = '\\' 739 j++ 740 } 741 b[j] = s[i] 742 j++ 743 } 744 return string(b[:j]) 745 } 746 747 // The number of capture values in the program may correspond 748 // to fewer capturing expressions than are in the regexp. 749 // For example, "(a){0}" turns into an empty program, so the 750 // maximum capture in the program is 0 but we need to return 751 // an expression for \1. Pad appends -1s to the slice a as needed. 752 func (re *Regexp) pad(a []int) []int { 753 if a == nil { 754 // No match. 755 return nil 756 } 757 n := (1 + re.numSubexp) * 2 758 for len(a) < n { 759 a = append(a, -1) 760 } 761 return a 762 } 763 764 // allMatches calls deliver at most n times 765 // with the location of successive matches in the input text. 766 // The input text is b if non-nil, otherwise s. 
func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
	var end int
	if b == nil {
		end = len(s)
	} else {
		end = len(b)
	}

	// pos is where the next search starts, i counts delivered matches and
	// prevMatchEnd is the end of the previous match (-1 before the first).
	// pos == end is still searched so an empty match at the end is found.
	for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; {
		matches := re.doExecute(nil, b, s, pos, re.prog.NumCap, nil)
		if len(matches) == 0 {
			break
		}

		accept := true
		if matches[1] == pos {
			// We've found an empty match.
			if matches[0] == prevMatchEnd {
				// We don't allow an empty match right
				// after a previous match, so ignore it.
				accept = false
			}
			// Step over one rune so the search makes progress.
			var width int
			// TODO: use step()
			if b == nil {
				_, width = utf8.DecodeRuneInString(s[pos:end])
			} else {
				_, width = utf8.DecodeRune(b[pos:end])
			}
			if width > 0 {
				pos += width
			} else {
				// Nothing left to decode: move past end to stop the loop.
				pos = end + 1
			}
		} else {
			pos = matches[1]
		}
		prevMatchEnd = matches[1]

		if accept {
			deliver(re.pad(matches))
			i++
		}
	}
}

// Find returns a slice holding the text of the leftmost match in b of the regular expression.
// A return value of nil indicates no match.
func (re *Regexp) Find(b []byte) []byte {
	var dstCap [2]int
	a := re.doExecute(nil, b, "", 0, 2, dstCap[:0])
	if a == nil {
		return nil
	}
	// The three-index slice caps the result at the match end, so an append
	// by the caller cannot write into the rest of b.
	return b[a[0]:a[1]:a[1]]
}

// FindIndex returns a two-element slice of integers defining the location of
// the leftmost match in b of the regular expression. The match itself is at
// b[loc[0]:loc[1]].
// A return value of nil indicates no match.
func (re *Regexp) FindIndex(b []byte) (loc []int) {
	a := re.doExecute(nil, b, "", 0, 2, nil)
	if a == nil {
		return nil
	}
	return a[0:2]
}

// FindString returns a string holding the text of the leftmost match in s of the regular
// expression.
// If there is no match, the return value is an empty string,
// but it will also be empty if the regular expression successfully matches
// an empty string. Use FindStringIndex or FindStringSubmatch if it is
// necessary to distinguish these cases.
func (re *Regexp) FindString(s string) string {
	// NOTE(review): dstCap is handed to doExecute as an empty slice over a
	// local array, presumably so the two indexes need no allocation;
	// confirm against doExecute.
	var dstCap [2]int
	a := re.doExecute(nil, nil, s, 0, 2, dstCap[:0])
	if a == nil {
		return ""
	}
	return s[a[0]:a[1]]
}

// FindStringIndex returns a two-element slice of integers defining the
// location of the leftmost match in s of the regular expression. The match
// itself is at s[loc[0]:loc[1]].
// A return value of nil indicates no match.
func (re *Regexp) FindStringIndex(s string) (loc []int) {
	a := re.doExecute(nil, nil, s, 0, 2, nil)
	if a == nil {
		return nil
	}
	return a[0:2]
}

// FindReaderIndex returns a two-element slice of integers defining the
// location of the leftmost match of the regular expression in text read from
// the RuneReader. The match text was found in the input stream at
// byte offset loc[0] through loc[1]-1.
// A return value of nil indicates no match.
func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) {
	a := re.doExecute(r, nil, "", 0, 2, nil)
	if a == nil {
		return nil
	}
	return a[0:2]
}

// FindSubmatch returns a slice of slices holding the text of the leftmost
// match of the regular expression in b and the matches, if any, of its
// subexpressions, as defined by the 'Submatch' descriptions in the package
// comment.
// A return value of nil indicates no match.
880 func (re *Regexp) FindSubmatch(b []byte) [][]byte { 881 var dstCap [4]int 882 a := re.doExecute(nil, b, "", 0, re.prog.NumCap, dstCap[:0]) 883 if a == nil { 884 return nil 885 } 886 ret := make([][]byte, 1+re.numSubexp) 887 for i := range ret { 888 if 2*i < len(a) && a[2*i] >= 0 { 889 ret[i] = b[a[2*i]:a[2*i+1]:a[2*i+1]] 890 } 891 } 892 return ret 893 } 894 895 // Expand appends template to dst and returns the result; during the 896 // append, Expand replaces variables in the template with corresponding 897 // matches drawn from src. The match slice should have been returned by 898 // FindSubmatchIndex. 899 // 900 // In the template, a variable is denoted by a substring of the form 901 // $name or ${name}, where name is a non-empty sequence of letters, 902 // digits, and underscores. A purely numeric name like $1 refers to 903 // the submatch with the corresponding index; other names refer to 904 // capturing parentheses named with the (?P<name>...) syntax. A 905 // reference to an out of range or unmatched index or a name that is not 906 // present in the regular expression is replaced with an empty slice. 907 // 908 // In the $name form, name is taken to be as long as possible: $1x is 909 // equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0. 910 // 911 // To insert a literal $ in the output, use $$ in the template. 912 func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte { 913 return re.expand(dst, string(template), src, "", match) 914 } 915 916 // ExpandString is like Expand but the template and source are strings. 917 // It appends to and returns a byte slice in order to give the calling 918 // code control over allocation. 
919 func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte { 920 return re.expand(dst, template, nil, src, match) 921 } 922 923 func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte { 924 for len(template) > 0 { 925 i := strings.Index(template, "$") 926 if i < 0 { 927 break 928 } 929 dst = append(dst, template[:i]...) 930 template = template[i:] 931 if len(template) > 1 && template[1] == '$' { 932 // Treat $$ as $. 933 dst = append(dst, '$') 934 template = template[2:] 935 continue 936 } 937 name, num, rest, ok := extract(template) 938 if !ok { 939 // Malformed; treat $ as raw text. 940 dst = append(dst, '$') 941 template = template[1:] 942 continue 943 } 944 template = rest 945 if num >= 0 { 946 if 2*num+1 < len(match) && match[2*num] >= 0 { 947 if bsrc != nil { 948 dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...) 949 } else { 950 dst = append(dst, src[match[2*num]:match[2*num+1]]...) 951 } 952 } 953 } else { 954 for i, namei := range re.subexpNames { 955 if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 { 956 if bsrc != nil { 957 dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...) 958 } else { 959 dst = append(dst, src[match[2*i]:match[2*i+1]]...) 960 } 961 break 962 } 963 } 964 } 965 } 966 dst = append(dst, template...) 967 return dst 968 } 969 970 // extract returns the name from a leading "$name" or "${name}" in str. 971 // If it is a number, extract returns num set to that number; otherwise num = -1. 
972 func extract(str string) (name string, num int, rest string, ok bool) { 973 if len(str) < 2 || str[0] != '$' { 974 return 975 } 976 brace := false 977 if str[1] == '{' { 978 brace = true 979 str = str[2:] 980 } else { 981 str = str[1:] 982 } 983 i := 0 984 for i < len(str) { 985 rune, size := utf8.DecodeRuneInString(str[i:]) 986 if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' { 987 break 988 } 989 i += size 990 } 991 if i == 0 { 992 // empty name is not okay 993 return 994 } 995 name = str[:i] 996 if brace { 997 if i >= len(str) || str[i] != '}' { 998 // missing closing brace 999 return 1000 } 1001 i++ 1002 } 1003 1004 // Parse number. 1005 num = 0 1006 for i := 0; i < len(name); i++ { 1007 if name[i] < '0' || '9' < name[i] || num >= 1e8 { 1008 num = -1 1009 break 1010 } 1011 num = num*10 + int(name[i]) - '0' 1012 } 1013 // Disallow leading zeros. 1014 if name[0] == '0' && len(name) > 1 { 1015 num = -1 1016 } 1017 1018 rest = str[i:] 1019 ok = true 1020 return 1021 } 1022 1023 // FindSubmatchIndex returns a slice holding the index pairs identifying the 1024 // leftmost match of the regular expression in b and the matches, if any, of 1025 // its subexpressions, as defined by the 'Submatch' and 'Index' descriptions 1026 // in the package comment. 1027 // A return value of nil indicates no match. 1028 func (re *Regexp) FindSubmatchIndex(b []byte) []int { 1029 return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap, nil)) 1030 } 1031 1032 // FindStringSubmatch returns a slice of strings holding the text of the 1033 // leftmost match of the regular expression in s and the matches, if any, of 1034 // its subexpressions, as defined by the 'Submatch' description in the 1035 // package comment. 1036 // A return value of nil indicates no match. 
1037 func (re *Regexp) FindStringSubmatch(s string) []string { 1038 var dstCap [4]int 1039 a := re.doExecute(nil, nil, s, 0, re.prog.NumCap, dstCap[:0]) 1040 if a == nil { 1041 return nil 1042 } 1043 ret := make([]string, 1+re.numSubexp) 1044 for i := range ret { 1045 if 2*i < len(a) && a[2*i] >= 0 { 1046 ret[i] = s[a[2*i]:a[2*i+1]] 1047 } 1048 } 1049 return ret 1050 } 1051 1052 // FindStringSubmatchIndex returns a slice holding the index pairs 1053 // identifying the leftmost match of the regular expression in s and the 1054 // matches, if any, of its subexpressions, as defined by the 'Submatch' and 1055 // 'Index' descriptions in the package comment. 1056 // A return value of nil indicates no match. 1057 func (re *Regexp) FindStringSubmatchIndex(s string) []int { 1058 return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap, nil)) 1059 } 1060 1061 // FindReaderSubmatchIndex returns a slice holding the index pairs 1062 // identifying the leftmost match of the regular expression of text read by 1063 // the RuneReader, and the matches, if any, of its subexpressions, as defined 1064 // by the 'Submatch' and 'Index' descriptions in the package comment. A 1065 // return value of nil indicates no match. 1066 func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int { 1067 return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap, nil)) 1068 } 1069 1070 const startSize = 10 // The size at which to start a slice in the 'All' routines. 1071 1072 // FindAll is the 'All' version of Find; it returns a slice of all successive 1073 // matches of the expression, as defined by the 'All' description in the 1074 // package comment. 1075 // A return value of nil indicates no match. 
1076 func (re *Regexp) FindAll(b []byte, n int) [][]byte { 1077 if n < 0 { 1078 n = len(b) + 1 1079 } 1080 var result [][]byte 1081 re.allMatches("", b, n, func(match []int) { 1082 if result == nil { 1083 result = make([][]byte, 0, startSize) 1084 } 1085 result = append(result, b[match[0]:match[1]:match[1]]) 1086 }) 1087 return result 1088 } 1089 1090 // FindAllIndex is the 'All' version of FindIndex; it returns a slice of all 1091 // successive matches of the expression, as defined by the 'All' description 1092 // in the package comment. 1093 // A return value of nil indicates no match. 1094 func (re *Regexp) FindAllIndex(b []byte, n int) [][]int { 1095 if n < 0 { 1096 n = len(b) + 1 1097 } 1098 var result [][]int 1099 re.allMatches("", b, n, func(match []int) { 1100 if result == nil { 1101 result = make([][]int, 0, startSize) 1102 } 1103 result = append(result, match[0:2]) 1104 }) 1105 return result 1106 } 1107 1108 // FindAllString is the 'All' version of FindString; it returns a slice of all 1109 // successive matches of the expression, as defined by the 'All' description 1110 // in the package comment. 1111 // A return value of nil indicates no match. 1112 func (re *Regexp) FindAllString(s string, n int) []string { 1113 if n < 0 { 1114 n = len(s) + 1 1115 } 1116 var result []string 1117 re.allMatches(s, nil, n, func(match []int) { 1118 if result == nil { 1119 result = make([]string, 0, startSize) 1120 } 1121 result = append(result, s[match[0]:match[1]]) 1122 }) 1123 return result 1124 } 1125 1126 // FindAllStringIndex is the 'All' version of FindStringIndex; it returns a 1127 // slice of all successive matches of the expression, as defined by the 'All' 1128 // description in the package comment. 1129 // A return value of nil indicates no match. 
1130 func (re *Regexp) FindAllStringIndex(s string, n int) [][]int { 1131 if n < 0 { 1132 n = len(s) + 1 1133 } 1134 var result [][]int 1135 re.allMatches(s, nil, n, func(match []int) { 1136 if result == nil { 1137 result = make([][]int, 0, startSize) 1138 } 1139 result = append(result, match[0:2]) 1140 }) 1141 return result 1142 } 1143 1144 // FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice 1145 // of all successive matches of the expression, as defined by the 'All' 1146 // description in the package comment. 1147 // A return value of nil indicates no match. 1148 func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte { 1149 if n < 0 { 1150 n = len(b) + 1 1151 } 1152 var result [][][]byte 1153 re.allMatches("", b, n, func(match []int) { 1154 if result == nil { 1155 result = make([][][]byte, 0, startSize) 1156 } 1157 slice := make([][]byte, len(match)/2) 1158 for j := range slice { 1159 if match[2*j] >= 0 { 1160 slice[j] = b[match[2*j]:match[2*j+1]:match[2*j+1]] 1161 } 1162 } 1163 result = append(result, slice) 1164 }) 1165 return result 1166 } 1167 1168 // FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns 1169 // a slice of all successive matches of the expression, as defined by the 1170 // 'All' description in the package comment. 1171 // A return value of nil indicates no match. 1172 func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int { 1173 if n < 0 { 1174 n = len(b) + 1 1175 } 1176 var result [][]int 1177 re.allMatches("", b, n, func(match []int) { 1178 if result == nil { 1179 result = make([][]int, 0, startSize) 1180 } 1181 result = append(result, match) 1182 }) 1183 return result 1184 } 1185 1186 // FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it 1187 // returns a slice of all successive matches of the expression, as defined by 1188 // the 'All' description in the package comment. 1189 // A return value of nil indicates no match. 
1190 func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string { 1191 if n < 0 { 1192 n = len(s) + 1 1193 } 1194 var result [][]string 1195 re.allMatches(s, nil, n, func(match []int) { 1196 if result == nil { 1197 result = make([][]string, 0, startSize) 1198 } 1199 slice := make([]string, len(match)/2) 1200 for j := range slice { 1201 if match[2*j] >= 0 { 1202 slice[j] = s[match[2*j]:match[2*j+1]] 1203 } 1204 } 1205 result = append(result, slice) 1206 }) 1207 return result 1208 } 1209 1210 // FindAllStringSubmatchIndex is the 'All' version of 1211 // FindStringSubmatchIndex; it returns a slice of all successive matches of 1212 // the expression, as defined by the 'All' description in the package 1213 // comment. 1214 // A return value of nil indicates no match. 1215 func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int { 1216 if n < 0 { 1217 n = len(s) + 1 1218 } 1219 var result [][]int 1220 re.allMatches(s, nil, n, func(match []int) { 1221 if result == nil { 1222 result = make([][]int, 0, startSize) 1223 } 1224 result = append(result, match) 1225 }) 1226 return result 1227 } 1228 1229 // Split slices s into substrings separated by the expression and returns a slice of 1230 // the substrings between those expression matches. 1231 // 1232 // The slice returned by this method consists of all the substrings of s 1233 // not contained in the slice returned by FindAllString. When called on an expression 1234 // that contains no metacharacters, it is equivalent to strings.SplitN. 1235 // 1236 // Example: 1237 // s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5) 1238 // // s: ["", "b", "b", "c", "cadaaae"] 1239 // 1240 // The count determines the number of substrings to return: 1241 // n > 0: at most n substrings; the last substring will be the unsplit remainder. 
1242 // n == 0: the result is nil (zero substrings) 1243 // n < 0: all substrings 1244 func (re *Regexp) Split(s string, n int) []string { 1245 1246 if n == 0 { 1247 return nil 1248 } 1249 1250 if len(re.expr) > 0 && len(s) == 0 { 1251 return []string{""} 1252 } 1253 1254 matches := re.FindAllStringIndex(s, n) 1255 strings := make([]string, 0, len(matches)) 1256 1257 beg := 0 1258 end := 0 1259 for _, match := range matches { 1260 if n > 0 && len(strings) >= n-1 { 1261 break 1262 } 1263 1264 end = match[0] 1265 if match[1] != 0 { 1266 strings = append(strings, s[beg:end]) 1267 } 1268 beg = match[1] 1269 } 1270 1271 if end != len(s) { 1272 strings = append(strings, s[beg:]) 1273 } 1274 1275 return strings 1276 }