github.com/AESNooper/go/src@v0.0.0-20220218095104-b56a4ab1bbbb/regexp/regexp.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package regexp implements regular expression search. 6 // 7 // The syntax of the regular expressions accepted is the same 8 // general syntax used by Perl, Python, and other languages. 9 // More precisely, it is the syntax accepted by RE2 and described at 10 // https://golang.org/s/re2syntax, except for \C. 11 // For an overview of the syntax, run 12 // go doc regexp/syntax 13 // 14 // The regexp implementation provided by this package is 15 // guaranteed to run in time linear in the size of the input. 16 // (This is a property not guaranteed by most open source 17 // implementations of regular expressions.) For more information 18 // about this property, see 19 // https://swtch.com/~rsc/regexp/regexp1.html 20 // or any book about automata theory. 21 // 22 // All characters are UTF-8-encoded code points. 23 // Following utf8.DecodeRune, each byte of an invalid UTF-8 sequence 24 // is treated as if it encoded utf8.RuneError (U+FFFD). 25 // 26 // There are 16 methods of Regexp that match a regular expression and identify 27 // the matched text. Their names are matched by this regular expression: 28 // 29 // Find(All)?(String)?(Submatch)?(Index)? 30 // 31 // If 'All' is present, the routine matches successive non-overlapping 32 // matches of the entire expression. Empty matches abutting a preceding 33 // match are ignored. The return value is a slice containing the successive 34 // return values of the corresponding non-'All' routine. These routines take 35 // an extra integer argument, n. If n >= 0, the function returns at most n 36 // matches/submatches; otherwise, it returns all of them. 
37 // 38 // If 'String' is present, the argument is a string; otherwise it is a slice 39 // of bytes; return values are adjusted as appropriate. 40 // 41 // If 'Submatch' is present, the return value is a slice identifying the 42 // successive submatches of the expression. Submatches are matches of 43 // parenthesized subexpressions (also known as capturing groups) within the 44 // regular expression, numbered from left to right in order of opening 45 // parenthesis. Submatch 0 is the match of the entire expression, submatch 1 46 // the match of the first parenthesized subexpression, and so on. 47 // 48 // If 'Index' is present, matches and submatches are identified by byte index 49 // pairs within the input string: result[2*n:2*n+2] identifies the indexes of 50 // the nth submatch. The pair for n==0 identifies the match of the entire 51 // expression. If 'Index' is not present, the match is identified by the text 52 // of the match/submatch. If an index is negative or text is nil, it means that 53 // subexpression did not match any string in the input. For 'String' versions 54 // an empty string means either no match or an empty match. 55 // 56 // There is also a subset of the methods that can be applied to text read 57 // from a RuneReader: 58 // 59 // MatchReader, FindReaderIndex, FindReaderSubmatchIndex 60 // 61 // This set may grow. Note that regular expression matches may need to 62 // examine text beyond the text returned by a match, so the methods that 63 // match text from a RuneReader may read arbitrarily far into the input 64 // before returning. 65 // 66 // (There are a few other methods that do not match this pattern.) 67 // 68 package regexp 69 70 import ( 71 "bytes" 72 "io" 73 "regexp/syntax" 74 "strconv" 75 "strings" 76 "sync" 77 "unicode" 78 "unicode/utf8" 79 ) 80 81 // Regexp is the representation of a compiled regular expression. 
82 // A Regexp is safe for concurrent use by multiple goroutines, 83 // except for configuration methods, such as Longest. 84 type Regexp struct { 85 expr string // as passed to Compile 86 prog *syntax.Prog // compiled program 87 onepass *onePassProg // onepass program or nil 88 numSubexp int 89 maxBitStateLen int 90 subexpNames []string 91 prefix string // required prefix in unanchored matches 92 prefixBytes []byte // prefix, as a []byte 93 prefixRune rune // first rune in prefix 94 prefixEnd uint32 // pc for last rune in prefix 95 mpool int // pool for machines 96 matchcap int // size of recorded match lengths 97 prefixComplete bool // prefix is the entire regexp 98 cond syntax.EmptyOp // empty-width conditions required at start of match 99 minInputLen int // minimum length of the input in bytes 100 101 // This field can be modified by the Longest method, 102 // but it is otherwise read-only. 103 longest bool // whether regexp prefers leftmost-longest match 104 } 105 106 // String returns the source text used to compile the regular expression. 107 func (re *Regexp) String() string { 108 return re.expr 109 } 110 111 // Copy returns a new Regexp object copied from re. 112 // Calling Longest on one copy does not affect another. 113 // 114 // Deprecated: In earlier releases, when using a Regexp in multiple goroutines, 115 // giving each goroutine its own copy helped to avoid lock contention. 116 // As of Go 1.12, using Copy is no longer necessary to avoid lock contention. 117 // Copy may still be appropriate if the reason for its use is to make 118 // two copies with different Longest settings. 119 func (re *Regexp) Copy() *Regexp { 120 re2 := *re 121 return &re2 122 } 123 124 // Compile parses a regular expression and returns, if successful, 125 // a Regexp object that can be used to match against text. 
126 // 127 // When matching against text, the regexp returns a match that 128 // begins as early as possible in the input (leftmost), and among those 129 // it chooses the one that a backtracking search would have found first. 130 // This so-called leftmost-first matching is the same semantics 131 // that Perl, Python, and other implementations use, although this 132 // package implements it without the expense of backtracking. 133 // For POSIX leftmost-longest matching, see CompilePOSIX. 134 func Compile(expr string) (*Regexp, error) { 135 return compile(expr, syntax.Perl, false) 136 } 137 138 // CompilePOSIX is like Compile but restricts the regular expression 139 // to POSIX ERE (egrep) syntax and changes the match semantics to 140 // leftmost-longest. 141 // 142 // That is, when matching against text, the regexp returns a match that 143 // begins as early as possible in the input (leftmost), and among those 144 // it chooses a match that is as long as possible. 145 // This so-called leftmost-longest matching is the same semantics 146 // that early regular expression implementations used and that POSIX 147 // specifies. 148 // 149 // However, there can be multiple leftmost-longest matches, with different 150 // submatch choices, and here this package diverges from POSIX. 151 // Among the possible leftmost-longest matches, this package chooses 152 // the one that a backtracking search would have found first, while POSIX 153 // specifies that the match be chosen to maximize the length of the first 154 // subexpression, then the second, and so on from left to right. 155 // The POSIX rule is computationally prohibitive and not even well-defined. 156 // See https://swtch.com/~rsc/regexp/regexp2.html#posix for details. 157 func CompilePOSIX(expr string) (*Regexp, error) { 158 return compile(expr, syntax.POSIX, true) 159 } 160 161 // Longest makes future searches prefer the leftmost-longest match. 
162 // That is, when matching against text, the regexp returns a match that 163 // begins as early as possible in the input (leftmost), and among those 164 // it chooses a match that is as long as possible. 165 // This method modifies the Regexp and may not be called concurrently 166 // with any other methods. 167 func (re *Regexp) Longest() { 168 re.longest = true 169 } 170 171 func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) { 172 re, err := syntax.Parse(expr, mode) 173 if err != nil { 174 return nil, err 175 } 176 maxCap := re.MaxCap() 177 capNames := re.CapNames() 178 179 re = re.Simplify() 180 prog, err := syntax.Compile(re) 181 if err != nil { 182 return nil, err 183 } 184 matchcap := prog.NumCap 185 if matchcap < 2 { 186 matchcap = 2 187 } 188 regexp := &Regexp{ 189 expr: expr, 190 prog: prog, 191 onepass: compileOnePass(prog), 192 numSubexp: maxCap, 193 subexpNames: capNames, 194 cond: prog.StartCond(), 195 longest: longest, 196 matchcap: matchcap, 197 minInputLen: minInputLen(re), 198 } 199 if regexp.onepass == nil { 200 regexp.prefix, regexp.prefixComplete = prog.Prefix() 201 regexp.maxBitStateLen = maxBitStateLen(prog) 202 } else { 203 regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog) 204 } 205 if regexp.prefix != "" { 206 // TODO(rsc): Remove this allocation by adding 207 // IndexString to package bytes. 208 regexp.prefixBytes = []byte(regexp.prefix) 209 regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix) 210 } 211 212 n := len(prog.Inst) 213 i := 0 214 for matchSize[i] != 0 && matchSize[i] < n { 215 i++ 216 } 217 regexp.mpool = i 218 219 return regexp, nil 220 } 221 222 // Pools of *machine for use during (*Regexp).doExecute, 223 // split up by the size of the execution queues. 224 // matchPool[i] machines have queue size matchSize[i]. 225 // On a 64-bit system each queue entry is 16 bytes, 226 // so matchPool[0] has 16*2*128 = 4kB queues, etc. 
227 // The final matchPool is a catch-all for very large queues. 228 var ( 229 matchSize = [...]int{128, 512, 2048, 16384, 0} 230 matchPool [len(matchSize)]sync.Pool 231 ) 232 233 // get returns a machine to use for matching re. 234 // It uses the re's machine cache if possible, to avoid 235 // unnecessary allocation. 236 func (re *Regexp) get() *machine { 237 m, ok := matchPool[re.mpool].Get().(*machine) 238 if !ok { 239 m = new(machine) 240 } 241 m.re = re 242 m.p = re.prog 243 if cap(m.matchcap) < re.matchcap { 244 m.matchcap = make([]int, re.matchcap) 245 for _, t := range m.pool { 246 t.cap = make([]int, re.matchcap) 247 } 248 } 249 250 // Allocate queues if needed. 251 // Or reallocate, for "large" match pool. 252 n := matchSize[re.mpool] 253 if n == 0 { // large pool 254 n = len(re.prog.Inst) 255 } 256 if len(m.q0.sparse) < n { 257 m.q0 = queue{make([]uint32, n), make([]entry, 0, n)} 258 m.q1 = queue{make([]uint32, n), make([]entry, 0, n)} 259 } 260 return m 261 } 262 263 // put returns a machine to the correct machine pool. 
264 func (re *Regexp) put(m *machine) { 265 m.re = nil 266 m.p = nil 267 m.inputs.clear() 268 matchPool[re.mpool].Put(m) 269 } 270 271 // minInputLen walks the regexp to find the minimum length of any matchable input 272 func minInputLen(re *syntax.Regexp) int { 273 switch re.Op { 274 default: 275 return 0 276 case syntax.OpAnyChar, syntax.OpAnyCharNotNL, syntax.OpCharClass: 277 return 1 278 case syntax.OpLiteral: 279 l := 0 280 for _, r := range re.Rune { 281 if r == utf8.RuneError { 282 l++ 283 } else { 284 l += utf8.RuneLen(r) 285 } 286 } 287 return l 288 case syntax.OpCapture, syntax.OpPlus: 289 return minInputLen(re.Sub[0]) 290 case syntax.OpRepeat: 291 return re.Min * minInputLen(re.Sub[0]) 292 case syntax.OpConcat: 293 l := 0 294 for _, sub := range re.Sub { 295 l += minInputLen(sub) 296 } 297 return l 298 case syntax.OpAlternate: 299 l := minInputLen(re.Sub[0]) 300 var lnext int 301 for _, sub := range re.Sub[1:] { 302 lnext = minInputLen(sub) 303 if lnext < l { 304 l = lnext 305 } 306 } 307 return l 308 } 309 } 310 311 // MustCompile is like Compile but panics if the expression cannot be parsed. 312 // It simplifies safe initialization of global variables holding compiled regular 313 // expressions. 314 func MustCompile(str string) *Regexp { 315 regexp, err := Compile(str) 316 if err != nil { 317 panic(`regexp: Compile(` + quote(str) + `): ` + err.Error()) 318 } 319 return regexp 320 } 321 322 // MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed. 323 // It simplifies safe initialization of global variables holding compiled regular 324 // expressions. 
325 func MustCompilePOSIX(str string) *Regexp { 326 regexp, err := CompilePOSIX(str) 327 if err != nil { 328 panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + err.Error()) 329 } 330 return regexp 331 } 332 333 func quote(s string) string { 334 if strconv.CanBackquote(s) { 335 return "`" + s + "`" 336 } 337 return strconv.Quote(s) 338 } 339 340 // NumSubexp returns the number of parenthesized subexpressions in this Regexp. 341 func (re *Regexp) NumSubexp() int { 342 return re.numSubexp 343 } 344 345 // SubexpNames returns the names of the parenthesized subexpressions 346 // in this Regexp. The name for the first sub-expression is names[1], 347 // so that if m is a match slice, the name for m[i] is SubexpNames()[i]. 348 // Since the Regexp as a whole cannot be named, names[0] is always 349 // the empty string. The slice should not be modified. 350 func (re *Regexp) SubexpNames() []string { 351 return re.subexpNames 352 } 353 354 // SubexpIndex returns the index of the first subexpression with the given name, 355 // or -1 if there is no subexpression with that name. 356 // 357 // Note that multiple subexpressions can be written using the same name, as in 358 // (?P<bob>a+)(?P<bob>b+), which declares two subexpressions named "bob". 359 // In this case, SubexpIndex returns the index of the leftmost such subexpression 360 // in the regular expression. 361 func (re *Regexp) SubexpIndex(name string) int { 362 if name != "" { 363 for i, s := range re.subexpNames { 364 if name == s { 365 return i 366 } 367 } 368 } 369 return -1 370 } 371 372 const endOfText rune = -1 373 374 // input abstracts different representations of the input text. It provides 375 // one-character lookahead. 376 type input interface { 377 step(pos int) (r rune, width int) // advance one rune 378 canCheckPrefix() bool // can we look ahead without losing info? 379 hasPrefix(re *Regexp) bool 380 index(re *Regexp, pos int) int 381 context(pos int) lazyFlag 382 } 383 384 // inputString scans a string. 
385 type inputString struct { 386 str string 387 } 388 389 func (i *inputString) step(pos int) (rune, int) { 390 if pos < len(i.str) { 391 c := i.str[pos] 392 if c < utf8.RuneSelf { 393 return rune(c), 1 394 } 395 return utf8.DecodeRuneInString(i.str[pos:]) 396 } 397 return endOfText, 0 398 } 399 400 func (i *inputString) canCheckPrefix() bool { 401 return true 402 } 403 404 func (i *inputString) hasPrefix(re *Regexp) bool { 405 return strings.HasPrefix(i.str, re.prefix) 406 } 407 408 func (i *inputString) index(re *Regexp, pos int) int { 409 return strings.Index(i.str[pos:], re.prefix) 410 } 411 412 func (i *inputString) context(pos int) lazyFlag { 413 r1, r2 := endOfText, endOfText 414 // 0 < pos && pos <= len(i.str) 415 if uint(pos-1) < uint(len(i.str)) { 416 r1 = rune(i.str[pos-1]) 417 if r1 >= utf8.RuneSelf { 418 r1, _ = utf8.DecodeLastRuneInString(i.str[:pos]) 419 } 420 } 421 // 0 <= pos && pos < len(i.str) 422 if uint(pos) < uint(len(i.str)) { 423 r2 = rune(i.str[pos]) 424 if r2 >= utf8.RuneSelf { 425 r2, _ = utf8.DecodeRuneInString(i.str[pos:]) 426 } 427 } 428 return newLazyFlag(r1, r2) 429 } 430 431 // inputBytes scans a byte slice. 
432 type inputBytes struct { 433 str []byte 434 } 435 436 func (i *inputBytes) step(pos int) (rune, int) { 437 if pos < len(i.str) { 438 c := i.str[pos] 439 if c < utf8.RuneSelf { 440 return rune(c), 1 441 } 442 return utf8.DecodeRune(i.str[pos:]) 443 } 444 return endOfText, 0 445 } 446 447 func (i *inputBytes) canCheckPrefix() bool { 448 return true 449 } 450 451 func (i *inputBytes) hasPrefix(re *Regexp) bool { 452 return bytes.HasPrefix(i.str, re.prefixBytes) 453 } 454 455 func (i *inputBytes) index(re *Regexp, pos int) int { 456 return bytes.Index(i.str[pos:], re.prefixBytes) 457 } 458 459 func (i *inputBytes) context(pos int) lazyFlag { 460 r1, r2 := endOfText, endOfText 461 // 0 < pos && pos <= len(i.str) 462 if uint(pos-1) < uint(len(i.str)) { 463 r1 = rune(i.str[pos-1]) 464 if r1 >= utf8.RuneSelf { 465 r1, _ = utf8.DecodeLastRune(i.str[:pos]) 466 } 467 } 468 // 0 <= pos && pos < len(i.str) 469 if uint(pos) < uint(len(i.str)) { 470 r2 = rune(i.str[pos]) 471 if r2 >= utf8.RuneSelf { 472 r2, _ = utf8.DecodeRune(i.str[pos:]) 473 } 474 } 475 return newLazyFlag(r1, r2) 476 } 477 478 // inputReader scans a RuneReader. 479 type inputReader struct { 480 r io.RuneReader 481 atEOT bool 482 pos int 483 } 484 485 func (i *inputReader) step(pos int) (rune, int) { 486 if !i.atEOT && pos != i.pos { 487 return endOfText, 0 488 489 } 490 r, w, err := i.r.ReadRune() 491 if err != nil { 492 i.atEOT = true 493 return endOfText, 0 494 } 495 i.pos += w 496 return r, w 497 } 498 499 func (i *inputReader) canCheckPrefix() bool { 500 return false 501 } 502 503 func (i *inputReader) hasPrefix(re *Regexp) bool { 504 return false 505 } 506 507 func (i *inputReader) index(re *Regexp, pos int) int { 508 return -1 509 } 510 511 func (i *inputReader) context(pos int) lazyFlag { 512 return 0 // not used 513 } 514 515 // LiteralPrefix returns a literal string that must begin any match 516 // of the regular expression re. 
It returns the boolean true if the 517 // literal string comprises the entire regular expression. 518 func (re *Regexp) LiteralPrefix() (prefix string, complete bool) { 519 return re.prefix, re.prefixComplete 520 } 521 522 // MatchReader reports whether the text returned by the RuneReader 523 // contains any match of the regular expression re. 524 func (re *Regexp) MatchReader(r io.RuneReader) bool { 525 return re.doMatch(r, nil, "") 526 } 527 528 // MatchString reports whether the string s 529 // contains any match of the regular expression re. 530 func (re *Regexp) MatchString(s string) bool { 531 return re.doMatch(nil, nil, s) 532 } 533 534 // Match reports whether the byte slice b 535 // contains any match of the regular expression re. 536 func (re *Regexp) Match(b []byte) bool { 537 return re.doMatch(nil, b, "") 538 } 539 540 // MatchReader reports whether the text returned by the RuneReader 541 // contains any match of the regular expression pattern. 542 // More complicated queries need to use Compile and the full Regexp interface. 543 func MatchReader(pattern string, r io.RuneReader) (matched bool, err error) { 544 re, err := Compile(pattern) 545 if err != nil { 546 return false, err 547 } 548 return re.MatchReader(r), nil 549 } 550 551 // MatchString reports whether the string s 552 // contains any match of the regular expression pattern. 553 // More complicated queries need to use Compile and the full Regexp interface. 554 func MatchString(pattern string, s string) (matched bool, err error) { 555 re, err := Compile(pattern) 556 if err != nil { 557 return false, err 558 } 559 return re.MatchString(s), nil 560 } 561 562 // Match reports whether the byte slice b 563 // contains any match of the regular expression pattern. 564 // More complicated queries need to use Compile and the full Regexp interface. 
565 func Match(pattern string, b []byte) (matched bool, err error) { 566 re, err := Compile(pattern) 567 if err != nil { 568 return false, err 569 } 570 return re.Match(b), nil 571 } 572 573 // ReplaceAllString returns a copy of src, replacing matches of the Regexp 574 // with the replacement string repl. Inside repl, $ signs are interpreted as 575 // in Expand, so for instance $1 represents the text of the first submatch. 576 func (re *Regexp) ReplaceAllString(src, repl string) string { 577 n := 2 578 if strings.Contains(repl, "$") { 579 n = 2 * (re.numSubexp + 1) 580 } 581 b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte { 582 return re.expand(dst, repl, nil, src, match) 583 }) 584 return string(b) 585 } 586 587 // ReplaceAllLiteralString returns a copy of src, replacing matches of the Regexp 588 // with the replacement string repl. The replacement repl is substituted directly, 589 // without using Expand. 590 func (re *Regexp) ReplaceAllLiteralString(src, repl string) string { 591 return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { 592 return append(dst, repl...) 593 })) 594 } 595 596 // ReplaceAllStringFunc returns a copy of src in which all matches of the 597 // Regexp have been replaced by the return value of function repl applied 598 // to the matched substring. The replacement returned by repl is substituted 599 // directly, without using Expand. 600 func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string { 601 b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { 602 return append(dst, repl(src[match[0]:match[1]])...) 
603 }) 604 return string(b) 605 } 606 607 func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte { 608 lastMatchEnd := 0 // end position of the most recent match 609 searchPos := 0 // position where we next look for a match 610 var buf []byte 611 var endPos int 612 if bsrc != nil { 613 endPos = len(bsrc) 614 } else { 615 endPos = len(src) 616 } 617 if nmatch > re.prog.NumCap { 618 nmatch = re.prog.NumCap 619 } 620 621 var dstCap [2]int 622 for searchPos <= endPos { 623 a := re.doExecute(nil, bsrc, src, searchPos, nmatch, dstCap[:0]) 624 if len(a) == 0 { 625 break // no more matches 626 } 627 628 // Copy the unmatched characters before this match. 629 if bsrc != nil { 630 buf = append(buf, bsrc[lastMatchEnd:a[0]]...) 631 } else { 632 buf = append(buf, src[lastMatchEnd:a[0]]...) 633 } 634 635 // Now insert a copy of the replacement string, but not for a 636 // match of the empty string immediately after another match. 637 // (Otherwise, we get double replacement for patterns that 638 // match both empty and nonempty strings.) 639 if a[1] > lastMatchEnd || a[0] == 0 { 640 buf = repl(buf, a) 641 } 642 lastMatchEnd = a[1] 643 644 // Advance past this match; always advance at least one character. 645 var width int 646 if bsrc != nil { 647 _, width = utf8.DecodeRune(bsrc[searchPos:]) 648 } else { 649 _, width = utf8.DecodeRuneInString(src[searchPos:]) 650 } 651 if searchPos+width > a[1] { 652 searchPos += width 653 } else if searchPos+1 > a[1] { 654 // This clause is only needed at the end of the input 655 // string. In that case, DecodeRuneInString returns width=0. 656 searchPos++ 657 } else { 658 searchPos = a[1] 659 } 660 } 661 662 // Copy the unmatched characters after the last match. 663 if bsrc != nil { 664 buf = append(buf, bsrc[lastMatchEnd:]...) 665 } else { 666 buf = append(buf, src[lastMatchEnd:]...) 
667 } 668 669 return buf 670 } 671 672 // ReplaceAll returns a copy of src, replacing matches of the Regexp 673 // with the replacement text repl. Inside repl, $ signs are interpreted as 674 // in Expand, so for instance $1 represents the text of the first submatch. 675 func (re *Regexp) ReplaceAll(src, repl []byte) []byte { 676 n := 2 677 if bytes.IndexByte(repl, '$') >= 0 { 678 n = 2 * (re.numSubexp + 1) 679 } 680 srepl := "" 681 b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte { 682 if len(srepl) != len(repl) { 683 srepl = string(repl) 684 } 685 return re.expand(dst, srepl, src, "", match) 686 }) 687 return b 688 } 689 690 // ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp 691 // with the replacement bytes repl. The replacement repl is substituted directly, 692 // without using Expand. 693 func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte { 694 return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { 695 return append(dst, repl...) 696 }) 697 } 698 699 // ReplaceAllFunc returns a copy of src in which all matches of the 700 // Regexp have been replaced by the return value of function repl applied 701 // to the matched byte slice. The replacement returned by repl is substituted 702 // directly, without using Expand. 703 func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { 704 return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { 705 return append(dst, repl(src[match[0]:match[1]])...) 706 }) 707 } 708 709 // Bitmap used by func special to check whether a character needs to be escaped. 710 var specialBytes [16]byte 711 712 // special reports whether byte b needs to be escaped by QuoteMeta. 
713 func special(b byte) bool { 714 return b < utf8.RuneSelf && specialBytes[b%16]&(1<<(b/16)) != 0 715 } 716 717 func init() { 718 for _, b := range []byte(`\.+*?()|[]{}^$`) { 719 specialBytes[b%16] |= 1 << (b / 16) 720 } 721 } 722 723 // QuoteMeta returns a string that escapes all regular expression metacharacters 724 // inside the argument text; the returned string is a regular expression matching 725 // the literal text. 726 func QuoteMeta(s string) string { 727 // A byte loop is correct because all metacharacters are ASCII. 728 var i int 729 for i = 0; i < len(s); i++ { 730 if special(s[i]) { 731 break 732 } 733 } 734 // No meta characters found, so return original string. 735 if i >= len(s) { 736 return s 737 } 738 739 b := make([]byte, 2*len(s)-i) 740 copy(b, s[:i]) 741 j := i 742 for ; i < len(s); i++ { 743 if special(s[i]) { 744 b[j] = '\\' 745 j++ 746 } 747 b[j] = s[i] 748 j++ 749 } 750 return string(b[:j]) 751 } 752 753 // The number of capture values in the program may correspond 754 // to fewer capturing expressions than are in the regexp. 755 // For example, "(a){0}" turns into an empty program, so the 756 // maximum capture in the program is 0 but we need to return 757 // an expression for \1. Pad appends -1s to the slice a as needed. 758 func (re *Regexp) pad(a []int) []int { 759 if a == nil { 760 // No match. 761 return nil 762 } 763 n := (1 + re.numSubexp) * 2 764 for len(a) < n { 765 a = append(a, -1) 766 } 767 return a 768 } 769 770 // allMatches calls deliver at most n times 771 // with the location of successive matches in the input text. 772 // The input text is b if non-nil, otherwise s. 
773 func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) { 774 var end int 775 if b == nil { 776 end = len(s) 777 } else { 778 end = len(b) 779 } 780 781 for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; { 782 matches := re.doExecute(nil, b, s, pos, re.prog.NumCap, nil) 783 if len(matches) == 0 { 784 break 785 } 786 787 accept := true 788 if matches[1] == pos { 789 // We've found an empty match. 790 if matches[0] == prevMatchEnd { 791 // We don't allow an empty match right 792 // after a previous match, so ignore it. 793 accept = false 794 } 795 var width int 796 // TODO: use step() 797 if b == nil { 798 _, width = utf8.DecodeRuneInString(s[pos:end]) 799 } else { 800 _, width = utf8.DecodeRune(b[pos:end]) 801 } 802 if width > 0 { 803 pos += width 804 } else { 805 pos = end + 1 806 } 807 } else { 808 pos = matches[1] 809 } 810 prevMatchEnd = matches[1] 811 812 if accept { 813 deliver(re.pad(matches)) 814 i++ 815 } 816 } 817 } 818 819 // Find returns a slice holding the text of the leftmost match in b of the regular expression. 820 // A return value of nil indicates no match. 821 func (re *Regexp) Find(b []byte) []byte { 822 var dstCap [2]int 823 a := re.doExecute(nil, b, "", 0, 2, dstCap[:0]) 824 if a == nil { 825 return nil 826 } 827 return b[a[0]:a[1]:a[1]] 828 } 829 830 // FindIndex returns a two-element slice of integers defining the location of 831 // the leftmost match in b of the regular expression. The match itself is at 832 // b[loc[0]:loc[1]]. 833 // A return value of nil indicates no match. 834 func (re *Regexp) FindIndex(b []byte) (loc []int) { 835 a := re.doExecute(nil, b, "", 0, 2, nil) 836 if a == nil { 837 return nil 838 } 839 return a[0:2] 840 } 841 842 // FindString returns a string holding the text of the leftmost match in s of the regular 843 // expression. 
If there is no match, the return value is an empty string, 844 // but it will also be empty if the regular expression successfully matches 845 // an empty string. Use FindStringIndex or FindStringSubmatch if it is 846 // necessary to distinguish these cases. 847 func (re *Regexp) FindString(s string) string { 848 var dstCap [2]int 849 a := re.doExecute(nil, nil, s, 0, 2, dstCap[:0]) 850 if a == nil { 851 return "" 852 } 853 return s[a[0]:a[1]] 854 } 855 856 // FindStringIndex returns a two-element slice of integers defining the 857 // location of the leftmost match in s of the regular expression. The match 858 // itself is at s[loc[0]:loc[1]]. 859 // A return value of nil indicates no match. 860 func (re *Regexp) FindStringIndex(s string) (loc []int) { 861 a := re.doExecute(nil, nil, s, 0, 2, nil) 862 if a == nil { 863 return nil 864 } 865 return a[0:2] 866 } 867 868 // FindReaderIndex returns a two-element slice of integers defining the 869 // location of the leftmost match of the regular expression in text read from 870 // the RuneReader. The match text was found in the input stream at 871 // byte offset loc[0] through loc[1]-1. 872 // A return value of nil indicates no match. 873 func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) { 874 a := re.doExecute(r, nil, "", 0, 2, nil) 875 if a == nil { 876 return nil 877 } 878 return a[0:2] 879 } 880 881 // FindSubmatch returns a slice of slices holding the text of the leftmost 882 // match of the regular expression in b and the matches, if any, of its 883 // subexpressions, as defined by the 'Submatch' descriptions in the package 884 // comment. 885 // A return value of nil indicates no match. 
886 func (re *Regexp) FindSubmatch(b []byte) [][]byte { 887 var dstCap [4]int 888 a := re.doExecute(nil, b, "", 0, re.prog.NumCap, dstCap[:0]) 889 if a == nil { 890 return nil 891 } 892 ret := make([][]byte, 1+re.numSubexp) 893 for i := range ret { 894 if 2*i < len(a) && a[2*i] >= 0 { 895 ret[i] = b[a[2*i]:a[2*i+1]:a[2*i+1]] 896 } 897 } 898 return ret 899 } 900 901 // Expand appends template to dst and returns the result; during the 902 // append, Expand replaces variables in the template with corresponding 903 // matches drawn from src. The match slice should have been returned by 904 // FindSubmatchIndex. 905 // 906 // In the template, a variable is denoted by a substring of the form 907 // $name or ${name}, where name is a non-empty sequence of letters, 908 // digits, and underscores. A purely numeric name like $1 refers to 909 // the submatch with the corresponding index; other names refer to 910 // capturing parentheses named with the (?P<name>...) syntax. A 911 // reference to an out of range or unmatched index or a name that is not 912 // present in the regular expression is replaced with an empty slice. 913 // 914 // In the $name form, name is taken to be as long as possible: $1x is 915 // equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0. 916 // 917 // To insert a literal $ in the output, use $$ in the template. 918 func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte { 919 return re.expand(dst, string(template), src, "", match) 920 } 921 922 // ExpandString is like Expand but the template and source are strings. 923 // It appends to and returns a byte slice in order to give the calling 924 // code control over allocation. 
925 func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte { 926 return re.expand(dst, template, nil, src, match) 927 } 928 929 func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte { 930 for len(template) > 0 { 931 before, after, ok := strings.Cut(template, "$") 932 if !ok { 933 break 934 } 935 dst = append(dst, before...) 936 template = after 937 if template != "" && template[0] == '$' { 938 // Treat $$ as $. 939 dst = append(dst, '$') 940 template = template[1:] 941 continue 942 } 943 name, num, rest, ok := extract(template) 944 if !ok { 945 // Malformed; treat $ as raw text. 946 dst = append(dst, '$') 947 continue 948 } 949 template = rest 950 if num >= 0 { 951 if 2*num+1 < len(match) && match[2*num] >= 0 { 952 if bsrc != nil { 953 dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...) 954 } else { 955 dst = append(dst, src[match[2*num]:match[2*num+1]]...) 956 } 957 } 958 } else { 959 for i, namei := range re.subexpNames { 960 if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 { 961 if bsrc != nil { 962 dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...) 963 } else { 964 dst = append(dst, src[match[2*i]:match[2*i+1]]...) 965 } 966 break 967 } 968 } 969 } 970 } 971 dst = append(dst, template...) 972 return dst 973 } 974 975 // extract returns the name from a leading "name" or "{name}" in str. 976 // (The $ has already been removed by the caller.) 977 // If it is a number, extract returns num set to that number; otherwise num = -1. 
978 func extract(str string) (name string, num int, rest string, ok bool) { 979 if str == "" { 980 return 981 } 982 brace := false 983 if str[0] == '{' { 984 brace = true 985 str = str[1:] 986 } 987 i := 0 988 for i < len(str) { 989 rune, size := utf8.DecodeRuneInString(str[i:]) 990 if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' { 991 break 992 } 993 i += size 994 } 995 if i == 0 { 996 // empty name is not okay 997 return 998 } 999 name = str[:i] 1000 if brace { 1001 if i >= len(str) || str[i] != '}' { 1002 // missing closing brace 1003 return 1004 } 1005 i++ 1006 } 1007 1008 // Parse number. 1009 num = 0 1010 for i := 0; i < len(name); i++ { 1011 if name[i] < '0' || '9' < name[i] || num >= 1e8 { 1012 num = -1 1013 break 1014 } 1015 num = num*10 + int(name[i]) - '0' 1016 } 1017 // Disallow leading zeros. 1018 if name[0] == '0' && len(name) > 1 { 1019 num = -1 1020 } 1021 1022 rest = str[i:] 1023 ok = true 1024 return 1025 } 1026 1027 // FindSubmatchIndex returns a slice holding the index pairs identifying the 1028 // leftmost match of the regular expression in b and the matches, if any, of 1029 // its subexpressions, as defined by the 'Submatch' and 'Index' descriptions 1030 // in the package comment. 1031 // A return value of nil indicates no match. 1032 func (re *Regexp) FindSubmatchIndex(b []byte) []int { 1033 return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap, nil)) 1034 } 1035 1036 // FindStringSubmatch returns a slice of strings holding the text of the 1037 // leftmost match of the regular expression in s and the matches, if any, of 1038 // its subexpressions, as defined by the 'Submatch' description in the 1039 // package comment. 1040 // A return value of nil indicates no match. 
1041 func (re *Regexp) FindStringSubmatch(s string) []string { 1042 var dstCap [4]int 1043 a := re.doExecute(nil, nil, s, 0, re.prog.NumCap, dstCap[:0]) 1044 if a == nil { 1045 return nil 1046 } 1047 ret := make([]string, 1+re.numSubexp) 1048 for i := range ret { 1049 if 2*i < len(a) && a[2*i] >= 0 { 1050 ret[i] = s[a[2*i]:a[2*i+1]] 1051 } 1052 } 1053 return ret 1054 } 1055 1056 // FindStringSubmatchIndex returns a slice holding the index pairs 1057 // identifying the leftmost match of the regular expression in s and the 1058 // matches, if any, of its subexpressions, as defined by the 'Submatch' and 1059 // 'Index' descriptions in the package comment. 1060 // A return value of nil indicates no match. 1061 func (re *Regexp) FindStringSubmatchIndex(s string) []int { 1062 return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap, nil)) 1063 } 1064 1065 // FindReaderSubmatchIndex returns a slice holding the index pairs 1066 // identifying the leftmost match of the regular expression of text read by 1067 // the RuneReader, and the matches, if any, of its subexpressions, as defined 1068 // by the 'Submatch' and 'Index' descriptions in the package comment. A 1069 // return value of nil indicates no match. 1070 func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int { 1071 return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap, nil)) 1072 } 1073 1074 const startSize = 10 // The size at which to start a slice in the 'All' routines. 1075 1076 // FindAll is the 'All' version of Find; it returns a slice of all successive 1077 // matches of the expression, as defined by the 'All' description in the 1078 // package comment. 1079 // A return value of nil indicates no match. 
1080 func (re *Regexp) FindAll(b []byte, n int) [][]byte { 1081 if n < 0 { 1082 n = len(b) + 1 1083 } 1084 var result [][]byte 1085 re.allMatches("", b, n, func(match []int) { 1086 if result == nil { 1087 result = make([][]byte, 0, startSize) 1088 } 1089 result = append(result, b[match[0]:match[1]:match[1]]) 1090 }) 1091 return result 1092 } 1093 1094 // FindAllIndex is the 'All' version of FindIndex; it returns a slice of all 1095 // successive matches of the expression, as defined by the 'All' description 1096 // in the package comment. 1097 // A return value of nil indicates no match. 1098 func (re *Regexp) FindAllIndex(b []byte, n int) [][]int { 1099 if n < 0 { 1100 n = len(b) + 1 1101 } 1102 var result [][]int 1103 re.allMatches("", b, n, func(match []int) { 1104 if result == nil { 1105 result = make([][]int, 0, startSize) 1106 } 1107 result = append(result, match[0:2]) 1108 }) 1109 return result 1110 } 1111 1112 // FindAllString is the 'All' version of FindString; it returns a slice of all 1113 // successive matches of the expression, as defined by the 'All' description 1114 // in the package comment. 1115 // A return value of nil indicates no match. 1116 func (re *Regexp) FindAllString(s string, n int) []string { 1117 if n < 0 { 1118 n = len(s) + 1 1119 } 1120 var result []string 1121 re.allMatches(s, nil, n, func(match []int) { 1122 if result == nil { 1123 result = make([]string, 0, startSize) 1124 } 1125 result = append(result, s[match[0]:match[1]]) 1126 }) 1127 return result 1128 } 1129 1130 // FindAllStringIndex is the 'All' version of FindStringIndex; it returns a 1131 // slice of all successive matches of the expression, as defined by the 'All' 1132 // description in the package comment. 1133 // A return value of nil indicates no match. 
1134 func (re *Regexp) FindAllStringIndex(s string, n int) [][]int { 1135 if n < 0 { 1136 n = len(s) + 1 1137 } 1138 var result [][]int 1139 re.allMatches(s, nil, n, func(match []int) { 1140 if result == nil { 1141 result = make([][]int, 0, startSize) 1142 } 1143 result = append(result, match[0:2]) 1144 }) 1145 return result 1146 } 1147 1148 // FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice 1149 // of all successive matches of the expression, as defined by the 'All' 1150 // description in the package comment. 1151 // A return value of nil indicates no match. 1152 func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte { 1153 if n < 0 { 1154 n = len(b) + 1 1155 } 1156 var result [][][]byte 1157 re.allMatches("", b, n, func(match []int) { 1158 if result == nil { 1159 result = make([][][]byte, 0, startSize) 1160 } 1161 slice := make([][]byte, len(match)/2) 1162 for j := range slice { 1163 if match[2*j] >= 0 { 1164 slice[j] = b[match[2*j]:match[2*j+1]:match[2*j+1]] 1165 } 1166 } 1167 result = append(result, slice) 1168 }) 1169 return result 1170 } 1171 1172 // FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns 1173 // a slice of all successive matches of the expression, as defined by the 1174 // 'All' description in the package comment. 1175 // A return value of nil indicates no match. 1176 func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int { 1177 if n < 0 { 1178 n = len(b) + 1 1179 } 1180 var result [][]int 1181 re.allMatches("", b, n, func(match []int) { 1182 if result == nil { 1183 result = make([][]int, 0, startSize) 1184 } 1185 result = append(result, match) 1186 }) 1187 return result 1188 } 1189 1190 // FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it 1191 // returns a slice of all successive matches of the expression, as defined by 1192 // the 'All' description in the package comment. 1193 // A return value of nil indicates no match. 
1194 func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string { 1195 if n < 0 { 1196 n = len(s) + 1 1197 } 1198 var result [][]string 1199 re.allMatches(s, nil, n, func(match []int) { 1200 if result == nil { 1201 result = make([][]string, 0, startSize) 1202 } 1203 slice := make([]string, len(match)/2) 1204 for j := range slice { 1205 if match[2*j] >= 0 { 1206 slice[j] = s[match[2*j]:match[2*j+1]] 1207 } 1208 } 1209 result = append(result, slice) 1210 }) 1211 return result 1212 } 1213 1214 // FindAllStringSubmatchIndex is the 'All' version of 1215 // FindStringSubmatchIndex; it returns a slice of all successive matches of 1216 // the expression, as defined by the 'All' description in the package 1217 // comment. 1218 // A return value of nil indicates no match. 1219 func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int { 1220 if n < 0 { 1221 n = len(s) + 1 1222 } 1223 var result [][]int 1224 re.allMatches(s, nil, n, func(match []int) { 1225 if result == nil { 1226 result = make([][]int, 0, startSize) 1227 } 1228 result = append(result, match) 1229 }) 1230 return result 1231 } 1232 1233 // Split slices s into substrings separated by the expression and returns a slice of 1234 // the substrings between those expression matches. 1235 // 1236 // The slice returned by this method consists of all the substrings of s 1237 // not contained in the slice returned by FindAllString. When called on an expression 1238 // that contains no metacharacters, it is equivalent to strings.SplitN. 1239 // 1240 // Example: 1241 // s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5) 1242 // // s: ["", "b", "b", "c", "cadaaae"] 1243 // 1244 // The count determines the number of substrings to return: 1245 // n > 0: at most n substrings; the last substring will be the unsplit remainder. 
1246 // n == 0: the result is nil (zero substrings) 1247 // n < 0: all substrings 1248 func (re *Regexp) Split(s string, n int) []string { 1249 1250 if n == 0 { 1251 return nil 1252 } 1253 1254 if len(re.expr) > 0 && len(s) == 0 { 1255 return []string{""} 1256 } 1257 1258 matches := re.FindAllStringIndex(s, n) 1259 strings := make([]string, 0, len(matches)) 1260 1261 beg := 0 1262 end := 0 1263 for _, match := range matches { 1264 if n > 0 && len(strings) >= n-1 { 1265 break 1266 } 1267 1268 end = match[0] 1269 if match[1] != 0 { 1270 strings = append(strings, s[beg:end]) 1271 } 1272 beg = match[1] 1273 } 1274 1275 if end != len(s) { 1276 strings = append(strings, s[beg:]) 1277 } 1278 1279 return strings 1280 }