github.com/mtsmfm/go/src@v0.0.0-20221020090648-44bdcb9f8fde/regexp/regexp.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package regexp implements regular expression search. 6 // 7 // The syntax of the regular expressions accepted is the same 8 // general syntax used by Perl, Python, and other languages. 9 // More precisely, it is the syntax accepted by RE2 and described at 10 // https://golang.org/s/re2syntax, except for \C. 11 // For an overview of the syntax, run 12 // 13 // go doc regexp/syntax 14 // 15 // The regexp implementation provided by this package is 16 // guaranteed to run in time linear in the size of the input. 17 // (This is a property not guaranteed by most open source 18 // implementations of regular expressions.) For more information 19 // about this property, see 20 // 21 // https://swtch.com/~rsc/regexp/regexp1.html 22 // 23 // or any book about automata theory. 24 // 25 // All characters are UTF-8-encoded code points. 26 // Following utf8.DecodeRune, each byte of an invalid UTF-8 sequence 27 // is treated as if it encoded utf8.RuneError (U+FFFD). 28 // 29 // There are 16 methods of Regexp that match a regular expression and identify 30 // the matched text. Their names are matched by this regular expression: 31 // 32 // Find(All)?(String)?(Submatch)?(Index)? 33 // 34 // If 'All' is present, the routine matches successive non-overlapping 35 // matches of the entire expression. Empty matches abutting a preceding 36 // match are ignored. The return value is a slice containing the successive 37 // return values of the corresponding non-'All' routine. These routines take 38 // an extra integer argument, n. If n >= 0, the function returns at most n 39 // matches/submatches; otherwise, it returns all of them. 40 // 41 // If 'String' is present, the argument is a string; otherwise it is a slice 42 // of bytes; return values are adjusted as appropriate. 43 // 44 // If 'Submatch' is present, the return value is a slice identifying the 45 // successive submatches of the expression. Submatches are matches of 46 // parenthesized subexpressions (also known as capturing groups) within the 47 // regular expression, numbered from left to right in order of opening 48 // parenthesis. Submatch 0 is the match of the entire expression, submatch 1 is 49 // the match of the first parenthesized subexpression, and so on. 50 // 51 // If 'Index' is present, matches and submatches are identified by byte index 52 // pairs within the input string: result[2*n:2*n+2] identifies the indexes of 53 // the nth submatch. The pair for n==0 identifies the match of the entire 54 // expression. If 'Index' is not present, the match is identified by the text 55 // of the match/submatch. If an index is negative or text is nil, it means that 56 // subexpression did not match any string in the input. For 'String' versions 57 // an empty string means either no match or an empty match. 58 // 59 // There is also a subset of the methods that can be applied to text read 60 // from a RuneReader: 61 // 62 // MatchReader, FindReaderIndex, FindReaderSubmatchIndex 63 // 64 // This set may grow. Note that regular expression matches may need to 65 // examine text beyond the text returned by a match, so the methods that 66 // match text from a RuneReader may read arbitrarily far into the input 67 // before returning. 68 // 69 // (There are a few other methods that do not match this pattern.) 70 package regexp 71 72 import ( 73 "bytes" 74 "io" 75 "regexp/syntax" 76 "strconv" 77 "strings" 78 "sync" 79 "unicode" 80 "unicode/utf8" 81 ) 82 83 // Regexp is the representation of a compiled regular expression. 84 // A Regexp is safe for concurrent use by multiple goroutines, 85 // except for configuration methods, such as Longest. 86 type Regexp struct { 87 expr string // as passed to Compile 88 prog *syntax.Prog // compiled program 89 onepass *onePassProg // onepass program or nil 90 numSubexp int 91 maxBitStateLen int 92 subexpNames []string 93 prefix string // required prefix in unanchored matches 94 prefixBytes []byte // prefix, as a []byte 95 prefixRune rune // first rune in prefix 96 prefixEnd uint32 // pc for last rune in prefix 97 mpool int // pool for machines 98 matchcap int // size of recorded match lengths 99 prefixComplete bool // prefix is the entire regexp 100 cond syntax.EmptyOp // empty-width conditions required at start of match 101 minInputLen int // minimum length of the input in bytes 102 103 // This field can be modified by the Longest method, 104 // but it is otherwise read-only. 105 longest bool // whether regexp prefers leftmost-longest match 106 } 107 108 // String returns the source text used to compile the regular expression. 109 func (re *Regexp) String() string { 110 return re.expr 111 } 112 113 // Copy returns a new Regexp object copied from re. 114 // Calling Longest on one copy does not affect another. 115 // 116 // Deprecated: In earlier releases, when using a Regexp in multiple goroutines, 117 // giving each goroutine its own copy helped to avoid lock contention. 118 // As of Go 1.12, using Copy is no longer necessary to avoid lock contention. 119 // Copy may still be appropriate if the reason for its use is to make 120 // two copies with different Longest settings. 121 func (re *Regexp) Copy() *Regexp { 122 re2 := *re 123 return &re2 124 } 125 126 // Compile parses a regular expression and returns, if successful, 127 // a Regexp object that can be used to match against text. 128 // 129 // When matching against text, the regexp returns a match that 130 // begins as early as possible in the input (leftmost), and among those 131 // it chooses the one that a backtracking search would have found first. 132 // This so-called leftmost-first matching is the same semantics 133 // that Perl, Python, and other implementations use, although this 134 // package implements it without the expense of backtracking. 135 // For POSIX leftmost-longest matching, see CompilePOSIX. 136 func Compile(expr string) (*Regexp, error) { 137 return compile(expr, syntax.Perl, false) 138 } 139 140 // CompilePOSIX is like Compile but restricts the regular expression 141 // to POSIX ERE (egrep) syntax and changes the match semantics to 142 // leftmost-longest. 143 // 144 // That is, when matching against text, the regexp returns a match that 145 // begins as early as possible in the input (leftmost), and among those 146 // it chooses a match that is as long as possible. 147 // This so-called leftmost-longest matching is the same semantics 148 // that early regular expression implementations used and that POSIX 149 // specifies. 150 // 151 // However, there can be multiple leftmost-longest matches, with different 152 // submatch choices, and here this package diverges from POSIX. 153 // Among the possible leftmost-longest matches, this package chooses 154 // the one that a backtracking search would have found first, while POSIX 155 // specifies that the match be chosen to maximize the length of the first 156 // subexpression, then the second, and so on from left to right. 157 // The POSIX rule is computationally prohibitive and not even well-defined. 158 // See https://swtch.com/~rsc/regexp/regexp2.html#posix for details. 159 func CompilePOSIX(expr string) (*Regexp, error) { 160 return compile(expr, syntax.POSIX, true) 161 } 162 163 // Longest makes future searches prefer the leftmost-longest match. 164 // That is, when matching against text, the regexp returns a match that 165 // begins as early as possible in the input (leftmost), and among those 166 // it chooses a match that is as long as possible. 167 // This method modifies the Regexp and may not be called concurrently 168 // with any other methods. 169 func (re *Regexp) Longest() { 170 re.longest = true 171 } 172 173 func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) { 174 re, err := syntax.Parse(expr, mode) 175 if err != nil { 176 return nil, err 177 } 178 maxCap := re.MaxCap() 179 capNames := re.CapNames() 180 181 re = re.Simplify() 182 prog, err := syntax.Compile(re) 183 if err != nil { 184 return nil, err 185 } 186 matchcap := prog.NumCap 187 if matchcap < 2 { 188 matchcap = 2 189 } 190 regexp := &Regexp{ 191 expr: expr, 192 prog: prog, 193 onepass: compileOnePass(prog), 194 numSubexp: maxCap, 195 subexpNames: capNames, 196 cond: prog.StartCond(), 197 longest: longest, 198 matchcap: matchcap, 199 minInputLen: minInputLen(re), 200 } 201 if regexp.onepass == nil { 202 regexp.prefix, regexp.prefixComplete = prog.Prefix() 203 regexp.maxBitStateLen = maxBitStateLen(prog) 204 } else { 205 regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog) 206 } 207 if regexp.prefix != "" { 208 // TODO(rsc): Remove this allocation by adding 209 // IndexString to package bytes. 210 regexp.prefixBytes = []byte(regexp.prefix) 211 regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix) 212 } 213 214 n := len(prog.Inst) 215 i := 0 216 for matchSize[i] != 0 && matchSize[i] < n { 217 i++ 218 } 219 regexp.mpool = i 220 221 return regexp, nil 222 } 223 224 // Pools of *machine for use during (*Regexp).doExecute, 225 // split up by the size of the execution queues. 226 // matchPool[i] machines have queue size matchSize[i]. 227 // On a 64-bit system each queue entry is 16 bytes, 228 // so matchPool[0] has 16*2*128 = 4kB queues, etc. 229 // The final matchPool is a catch-all for very large queues. 230 var ( 231 matchSize = [...]int{128, 512, 2048, 16384, 0} 232 matchPool [len(matchSize)]sync.Pool 233 ) 234 235 // get returns a machine to use for matching re. 236 // It uses the re's machine cache if possible, to avoid 237 // unnecessary allocation. 238 func (re *Regexp) get() *machine { 239 m, ok := matchPool[re.mpool].Get().(*machine) 240 if !ok { 241 m = new(machine) 242 } 243 m.re = re 244 m.p = re.prog 245 if cap(m.matchcap) < re.matchcap { 246 m.matchcap = make([]int, re.matchcap) 247 for _, t := range m.pool { 248 t.cap = make([]int, re.matchcap) 249 } 250 } 251 252 // Allocate queues if needed. 253 // Or reallocate, for "large" match pool. 254 n := matchSize[re.mpool] 255 if n == 0 { // large pool 256 n = len(re.prog.Inst) 257 } 258 if len(m.q0.sparse) < n { 259 m.q0 = queue{make([]uint32, n), make([]entry, 0, n)} 260 m.q1 = queue{make([]uint32, n), make([]entry, 0, n)} 261 } 262 return m 263 } 264 265 // put returns a machine to the correct machine pool. 266 func (re *Regexp) put(m *machine) { 267 m.re = nil 268 m.p = nil 269 m.inputs.clear() 270 matchPool[re.mpool].Put(m) 271 } 272 273 // minInputLen walks the regexp to find the minimum length of any matchable input 274 func minInputLen(re *syntax.Regexp) int { 275 switch re.Op { 276 default: 277 return 0 278 case syntax.OpAnyChar, syntax.OpAnyCharNotNL, syntax.OpCharClass: 279 return 1 280 case syntax.OpLiteral: 281 l := 0 282 for _, r := range re.Rune { 283 if r == utf8.RuneError { 284 l++ 285 } else { 286 l += utf8.RuneLen(r) 287 } 288 } 289 return l 290 case syntax.OpCapture, syntax.OpPlus: 291 return minInputLen(re.Sub[0]) 292 case syntax.OpRepeat: 293 return re.Min * minInputLen(re.Sub[0]) 294 case syntax.OpConcat: 295 l := 0 296 for _, sub := range re.Sub { 297 l += minInputLen(sub) 298 } 299 return l 300 case syntax.OpAlternate: 301 l := minInputLen(re.Sub[0]) 302 var lnext int 303 for _, sub := range re.Sub[1:] { 304 lnext = minInputLen(sub) 305 if lnext < l { 306 l = lnext 307 } 308 } 309 return l 310 } 311 } 312 313 // MustCompile is like Compile but panics if the expression cannot be parsed. 314 // It simplifies safe initialization of global variables holding compiled regular 315 // expressions. 316 func MustCompile(str string) *Regexp { 317 regexp, err := Compile(str) 318 if err != nil { 319 panic(`regexp: Compile(` + quote(str) + `): ` + err.Error()) 320 } 321 return regexp 322 } 323 324 // MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed. 325 // It simplifies safe initialization of global variables holding compiled regular 326 // expressions. 327 func MustCompilePOSIX(str string) *Regexp { 328 regexp, err := CompilePOSIX(str) 329 if err != nil { 330 panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + err.Error()) 331 } 332 return regexp 333 } 334 335 func quote(s string) string { 336 if strconv.CanBackquote(s) { 337 return "`" + s + "`" 338 } 339 return strconv.Quote(s) 340 } 341 342 // NumSubexp returns the number of parenthesized subexpressions in this Regexp. 343 func (re *Regexp) NumSubexp() int { 344 return re.numSubexp 345 } 346 347 // SubexpNames returns the names of the parenthesized subexpressions 348 // in this Regexp. The name for the first sub-expression is names[1], 349 // so that if m is a match slice, the name for m[i] is SubexpNames()[i]. 350 // Since the Regexp as a whole cannot be named, names[0] is always 351 // the empty string. The slice should not be modified. 352 func (re *Regexp) SubexpNames() []string { 353 return re.subexpNames 354 } 355 356 // SubexpIndex returns the index of the first subexpression with the given name, 357 // or -1 if there is no subexpression with that name. 358 // 359 // Note that multiple subexpressions can be written using the same name, as in 360 // (?P<bob>a+)(?P<bob>b+), which declares two subexpressions named "bob". 361 // In this case, SubexpIndex returns the index of the leftmost such subexpression 362 // in the regular expression. 363 func (re *Regexp) SubexpIndex(name string) int { 364 if name != "" { 365 for i, s := range re.subexpNames { 366 if name == s { 367 return i 368 } 369 } 370 } 371 return -1 372 } 373 374 const endOfText rune = -1 375 376 // input abstracts different representations of the input text. It provides 377 // one-character lookahead. 378 type input interface { 379 step(pos int) (r rune, width int) // advance one rune 380 canCheckPrefix() bool // can we look ahead without losing info? 381 hasPrefix(re *Regexp) bool 382 index(re *Regexp, pos int) int 383 context(pos int) lazyFlag 384 } 385 386 // inputString scans a string. 387 type inputString struct { 388 str string 389 } 390 391 func (i *inputString) step(pos int) (rune, int) { 392 if pos < len(i.str) { 393 c := i.str[pos] 394 if c < utf8.RuneSelf { 395 return rune(c), 1 396 } 397 return utf8.DecodeRuneInString(i.str[pos:]) 398 } 399 return endOfText, 0 400 } 401 402 func (i *inputString) canCheckPrefix() bool { 403 return true 404 } 405 406 func (i *inputString) hasPrefix(re *Regexp) bool { 407 return strings.HasPrefix(i.str, re.prefix) 408 } 409 410 func (i *inputString) index(re *Regexp, pos int) int { 411 return strings.Index(i.str[pos:], re.prefix) 412 } 413 414 func (i *inputString) context(pos int) lazyFlag { 415 r1, r2 := endOfText, endOfText 416 // 0 < pos && pos <= len(i.str) 417 if uint(pos-1) < uint(len(i.str)) { 418 r1 = rune(i.str[pos-1]) 419 if r1 >= utf8.RuneSelf { 420 r1, _ = utf8.DecodeLastRuneInString(i.str[:pos]) 421 } 422 } 423 // 0 <= pos && pos < len(i.str) 424 if uint(pos) < uint(len(i.str)) { 425 r2 = rune(i.str[pos]) 426 if r2 >= utf8.RuneSelf { 427 r2, _ = utf8.DecodeRuneInString(i.str[pos:]) 428 } 429 } 430 return newLazyFlag(r1, r2) 431 } 432 433 // inputBytes scans a byte slice. 434 type inputBytes struct { 435 str []byte 436 } 437 438 func (i *inputBytes) step(pos int) (rune, int) { 439 if pos < len(i.str) { 440 c := i.str[pos] 441 if c < utf8.RuneSelf { 442 return rune(c), 1 443 } 444 return utf8.DecodeRune(i.str[pos:]) 445 } 446 return endOfText, 0 447 } 448 449 func (i *inputBytes) canCheckPrefix() bool { 450 return true 451 } 452 453 func (i *inputBytes) hasPrefix(re *Regexp) bool { 454 return bytes.HasPrefix(i.str, re.prefixBytes) 455 } 456 457 func (i *inputBytes) index(re *Regexp, pos int) int { 458 return bytes.Index(i.str[pos:], re.prefixBytes) 459 } 460 461 func (i *inputBytes) context(pos int) lazyFlag { 462 r1, r2 := endOfText, endOfText 463 // 0 < pos && pos <= len(i.str) 464 if uint(pos-1) < uint(len(i.str)) { 465 r1 = rune(i.str[pos-1]) 466 if r1 >= utf8.RuneSelf { 467 r1, _ = utf8.DecodeLastRune(i.str[:pos]) 468 } 469 } 470 // 0 <= pos && pos < len(i.str) 471 if uint(pos) < uint(len(i.str)) { 472 r2 = rune(i.str[pos]) 473 if r2 >= utf8.RuneSelf { 474 r2, _ = utf8.DecodeRune(i.str[pos:]) 475 } 476 } 477 return newLazyFlag(r1, r2) 478 } 479 480 // inputReader scans a RuneReader. 481 type inputReader struct { 482 r io.RuneReader 483 atEOT bool 484 pos int 485 } 486 487 func (i *inputReader) step(pos int) (rune, int) { 488 if !i.atEOT && pos != i.pos { 489 return endOfText, 0 490 491 } 492 r, w, err := i.r.ReadRune() 493 if err != nil { 494 i.atEOT = true 495 return endOfText, 0 496 } 497 i.pos += w 498 return r, w 499 } 500 501 func (i *inputReader) canCheckPrefix() bool { 502 return false 503 } 504 505 func (i *inputReader) hasPrefix(re *Regexp) bool { 506 return false 507 } 508 509 func (i *inputReader) index(re *Regexp, pos int) int { 510 return -1 511 } 512 513 func (i *inputReader) context(pos int) lazyFlag { 514 return 0 // not used 515 } 516 517 // LiteralPrefix returns a literal string that must begin any match 518 // of the regular expression re. It returns the boolean true if the 519 // literal string comprises the entire regular expression. 520 func (re *Regexp) LiteralPrefix() (prefix string, complete bool) { 521 return re.prefix, re.prefixComplete 522 } 523 524 // MatchReader reports whether the text returned by the RuneReader 525 // contains any match of the regular expression re. 526 func (re *Regexp) MatchReader(r io.RuneReader) bool { 527 return re.doMatch(r, nil, "") 528 } 529 530 // MatchString reports whether the string s 531 // contains any match of the regular expression re. 532 func (re *Regexp) MatchString(s string) bool { 533 return re.doMatch(nil, nil, s) 534 } 535 536 // Match reports whether the byte slice b 537 // contains any match of the regular expression re. 538 func (re *Regexp) Match(b []byte) bool { 539 return re.doMatch(nil, b, "") 540 } 541 542 // MatchReader reports whether the text returned by the RuneReader 543 // contains any match of the regular expression pattern. 544 // More complicated queries need to use Compile and the full Regexp interface. 545 func MatchReader(pattern string, r io.RuneReader) (matched bool, err error) { 546 re, err := Compile(pattern) 547 if err != nil { 548 return false, err 549 } 550 return re.MatchReader(r), nil 551 } 552 553 // MatchString reports whether the string s 554 // contains any match of the regular expression pattern. 555 // More complicated queries need to use Compile and the full Regexp interface. 556 func MatchString(pattern string, s string) (matched bool, err error) { 557 re, err := Compile(pattern) 558 if err != nil { 559 return false, err 560 } 561 return re.MatchString(s), nil 562 } 563 564 // Match reports whether the byte slice b 565 // contains any match of the regular expression pattern. 566 // More complicated queries need to use Compile and the full Regexp interface. 567 func Match(pattern string, b []byte) (matched bool, err error) { 568 re, err := Compile(pattern) 569 if err != nil { 570 return false, err 571 } 572 return re.Match(b), nil 573 } 574 575 // ReplaceAllString returns a copy of src, replacing matches of the Regexp 576 // with the replacement string repl. Inside repl, $ signs are interpreted as 577 // in Expand, so for instance $1 represents the text of the first submatch. 578 func (re *Regexp) ReplaceAllString(src, repl string) string { 579 n := 2 580 if strings.Contains(repl, "$") { 581 n = 2 * (re.numSubexp + 1) 582 } 583 b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte { 584 return re.expand(dst, repl, nil, src, match) 585 }) 586 return string(b) 587 } 588 589 // ReplaceAllLiteralString returns a copy of src, replacing matches of the Regexp 590 // with the replacement string repl. The replacement repl is substituted directly, 591 // without using Expand. 592 func (re *Regexp) ReplaceAllLiteralString(src, repl string) string { 593 return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { 594 return append(dst, repl...) 595 })) 596 } 597 598 // ReplaceAllStringFunc returns a copy of src in which all matches of the 599 // Regexp have been replaced by the return value of function repl applied 600 // to the matched substring. The replacement returned by repl is substituted 601 // directly, without using Expand. 602 func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string { 603 b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { 604 return append(dst, repl(src[match[0]:match[1]])...) 605 }) 606 return string(b) 607 } 608 609 func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte { 610 lastMatchEnd := 0 // end position of the most recent match 611 searchPos := 0 // position where we next look for a match 612 var buf []byte 613 var endPos int 614 if bsrc != nil { 615 endPos = len(bsrc) 616 } else { 617 endPos = len(src) 618 } 619 if nmatch > re.prog.NumCap { 620 nmatch = re.prog.NumCap 621 } 622 623 var dstCap [2]int 624 for searchPos <= endPos { 625 a := re.doExecute(nil, bsrc, src, searchPos, nmatch, dstCap[:0]) 626 if len(a) == 0 { 627 break // no more matches 628 } 629 630 // Copy the unmatched characters before this match. 631 if bsrc != nil { 632 buf = append(buf, bsrc[lastMatchEnd:a[0]]...) 633 } else { 634 buf = append(buf, src[lastMatchEnd:a[0]]...) 635 } 636 637 // Now insert a copy of the replacement string, but not for a 638 // match of the empty string immediately after another match. 639 // (Otherwise, we get double replacement for patterns that 640 // match both empty and nonempty strings.) 641 if a[1] > lastMatchEnd || a[0] == 0 { 642 buf = repl(buf, a) 643 } 644 lastMatchEnd = a[1] 645 646 // Advance past this match; always advance at least one character. 647 var width int 648 if bsrc != nil { 649 _, width = utf8.DecodeRune(bsrc[searchPos:]) 650 } else { 651 _, width = utf8.DecodeRuneInString(src[searchPos:]) 652 } 653 if searchPos+width > a[1] { 654 searchPos += width 655 } else if searchPos+1 > a[1] { 656 // This clause is only needed at the end of the input 657 // string. In that case, DecodeRuneInString returns width=0. 658 searchPos++ 659 } else { 660 searchPos = a[1] 661 } 662 } 663 664 // Copy the unmatched characters after the last match. 665 if bsrc != nil { 666 buf = append(buf, bsrc[lastMatchEnd:]...) 667 } else { 668 buf = append(buf, src[lastMatchEnd:]...) 669 } 670 671 return buf 672 } 673 674 // ReplaceAll returns a copy of src, replacing matches of the Regexp 675 // with the replacement text repl. Inside repl, $ signs are interpreted as 676 // in Expand, so for instance $1 represents the text of the first submatch. 677 func (re *Regexp) ReplaceAll(src, repl []byte) []byte { 678 n := 2 679 if bytes.IndexByte(repl, '$') >= 0 { 680 n = 2 * (re.numSubexp + 1) 681 } 682 srepl := "" 683 b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte { 684 if len(srepl) != len(repl) { 685 srepl = string(repl) 686 } 687 return re.expand(dst, srepl, src, "", match) 688 }) 689 return b 690 } 691 692 // ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp 693 // with the replacement bytes repl. The replacement repl is substituted directly, 694 // without using Expand. 695 func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte { 696 return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { 697 return append(dst, repl...) 698 }) 699 } 700 701 // ReplaceAllFunc returns a copy of src in which all matches of the 702 // Regexp have been replaced by the return value of function repl applied 703 // to the matched byte slice. The replacement returned by repl is substituted 704 // directly, without using Expand. 705 func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { 706 return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { 707 return append(dst, repl(src[match[0]:match[1]])...) 708 }) 709 } 710 711 // Bitmap used by func special to check whether a character needs to be escaped. 712 var specialBytes [16]byte 713 714 // special reports whether byte b needs to be escaped by QuoteMeta. 715 func special(b byte) bool { 716 return b < utf8.RuneSelf && specialBytes[b%16]&(1<<(b/16)) != 0 717 } 718 719 func init() { 720 for _, b := range []byte(`\.+*?()|[]{}^$`) { 721 specialBytes[b%16] |= 1 << (b / 16) 722 } 723 } 724 725 // QuoteMeta returns a string that escapes all regular expression metacharacters 726 // inside the argument text; the returned string is a regular expression matching 727 // the literal text. 728 func QuoteMeta(s string) string { 729 // A byte loop is correct because all metacharacters are ASCII. 730 var i int 731 for i = 0; i < len(s); i++ { 732 if special(s[i]) { 733 break 734 } 735 } 736 // No meta characters found, so return original string. 737 if i >= len(s) { 738 return s 739 } 740 741 b := make([]byte, 2*len(s)-i) 742 copy(b, s[:i]) 743 j := i 744 for ; i < len(s); i++ { 745 if special(s[i]) { 746 b[j] = '\\' 747 j++ 748 } 749 b[j] = s[i] 750 j++ 751 } 752 return string(b[:j]) 753 } 754 755 // The number of capture values in the program may correspond 756 // to fewer capturing expressions than are in the regexp. 757 // For example, "(a){0}" turns into an empty program, so the 758 // maximum capture in the program is 0 but we need to return 759 // an expression for \1. Pad appends -1s to the slice a as needed. 760 func (re *Regexp) pad(a []int) []int { 761 if a == nil { 762 // No match. 763 return nil 764 } 765 n := (1 + re.numSubexp) * 2 766 for len(a) < n { 767 a = append(a, -1) 768 } 769 return a 770 } 771 772 // allMatches calls deliver at most n times 773 // with the location of successive matches in the input text. 774 // The input text is b if non-nil, otherwise s. 775 func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) { 776 var end int 777 if b == nil { 778 end = len(s) 779 } else { 780 end = len(b) 781 } 782 783 for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; { 784 matches := re.doExecute(nil, b, s, pos, re.prog.NumCap, nil) 785 if len(matches) == 0 { 786 break 787 } 788 789 accept := true 790 if matches[1] == pos { 791 // We've found an empty match. 792 if matches[0] == prevMatchEnd { 793 // We don't allow an empty match right 794 // after a previous match, so ignore it. 795 accept = false 796 } 797 var width int 798 if b == nil { 799 is := inputString{str: s} 800 _, width = is.step(pos) 801 } else { 802 ib := inputBytes{str: b} 803 _, width = ib.step(pos) 804 } 805 if width > 0 { 806 pos += width 807 } else { 808 pos = end + 1 809 } 810 } else { 811 pos = matches[1] 812 } 813 prevMatchEnd = matches[1] 814 815 if accept { 816 deliver(re.pad(matches)) 817 i++ 818 } 819 } 820 } 821 822 // Find returns a slice holding the text of the leftmost match in b of the regular expression. 823 // A return value of nil indicates no match. 824 func (re *Regexp) Find(b []byte) []byte { 825 var dstCap [2]int 826 a := re.doExecute(nil, b, "", 0, 2, dstCap[:0]) 827 if a == nil { 828 return nil 829 } 830 return b[a[0]:a[1]:a[1]] 831 } 832 833 // FindIndex returns a two-element slice of integers defining the location of 834 // the leftmost match in b of the regular expression. The match itself is at 835 // b[loc[0]:loc[1]]. 836 // A return value of nil indicates no match. 837 func (re *Regexp) FindIndex(b []byte) (loc []int) { 838 a := re.doExecute(nil, b, "", 0, 2, nil) 839 if a == nil { 840 return nil 841 } 842 return a[0:2] 843 } 844 845 // FindString returns a string holding the text of the leftmost match in s of the regular 846 // expression. If there is no match, the return value is an empty string, 847 // but it will also be empty if the regular expression successfully matches 848 // an empty string. Use FindStringIndex or FindStringSubmatch if it is 849 // necessary to distinguish these cases. 850 func (re *Regexp) FindString(s string) string { 851 var dstCap [2]int 852 a := re.doExecute(nil, nil, s, 0, 2, dstCap[:0]) 853 if a == nil { 854 return "" 855 } 856 return s[a[0]:a[1]] 857 } 858 859 // FindStringIndex returns a two-element slice of integers defining the 860 // location of the leftmost match in s of the regular expression. The match 861 // itself is at s[loc[0]:loc[1]]. 862 // A return value of nil indicates no match. 863 func (re *Regexp) FindStringIndex(s string) (loc []int) { 864 a := re.doExecute(nil, nil, s, 0, 2, nil) 865 if a == nil { 866 return nil 867 } 868 return a[0:2] 869 } 870 871 // FindReaderIndex returns a two-element slice of integers defining the 872 // location of the leftmost match of the regular expression in text read from 873 // the RuneReader. The match text was found in the input stream at 874 // byte offset loc[0] through loc[1]-1. 875 // A return value of nil indicates no match. 876 func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) { 877 a := re.doExecute(r, nil, "", 0, 2, nil) 878 if a == nil { 879 return nil 880 } 881 return a[0:2] 882 } 883 884 // FindSubmatch returns a slice of slices holding the text of the leftmost 885 // match of the regular expression in b and the matches, if any, of its 886 // subexpressions, as defined by the 'Submatch' descriptions in the package 887 // comment. 888 // A return value of nil indicates no match. 889 func (re *Regexp) FindSubmatch(b []byte) [][]byte { 890 var dstCap [4]int 891 a := re.doExecute(nil, b, "", 0, re.prog.NumCap, dstCap[:0]) 892 if a == nil { 893 return nil 894 } 895 ret := make([][]byte, 1+re.numSubexp) 896 for i := range ret { 897 if 2*i < len(a) && a[2*i] >= 0 { 898 ret[i] = b[a[2*i]:a[2*i+1]:a[2*i+1]] 899 } 900 } 901 return ret 902 } 903 904 // Expand appends template to dst and returns the result; during the 905 // append, Expand replaces variables in the template with corresponding 906 // matches drawn from src. The match slice should have been returned by 907 // FindSubmatchIndex. 908 // 909 // In the template, a variable is denoted by a substring of the form 910 // $name or ${name}, where name is a non-empty sequence of letters, 911 // digits, and underscores. A purely numeric name like $1 refers to 912 // the submatch with the corresponding index; other names refer to 913 // capturing parentheses named with the (?P<name>...) syntax. A 914 // reference to an out of range or unmatched index or a name that is not 915 // present in the regular expression is replaced with an empty slice. 916 // 917 // In the $name form, name is taken to be as long as possible: $1x is 918 // equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0. 919 // 920 // To insert a literal $ in the output, use $$ in the template. 921 func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte { 922 return re.expand(dst, string(template), src, "", match) 923 } 924 925 // ExpandString is like Expand but the template and source are strings. 926 // It appends to and returns a byte slice in order to give the calling 927 // code control over allocation. 928 func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte { 929 return re.expand(dst, template, nil, src, match) 930 } 931 932 func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte { 933 for len(template) > 0 { 934 before, after, ok := strings.Cut(template, "$") 935 if !ok { 936 break 937 } 938 dst = append(dst, before...) 939 template = after 940 if template != "" && template[0] == '$' { 941 // Treat $$ as $. 942 dst = append(dst, '$') 943 template = template[1:] 944 continue 945 } 946 name, num, rest, ok := extract(template) 947 if !ok { 948 // Malformed; treat $ as raw text. 949 dst = append(dst, '$') 950 continue 951 } 952 template = rest 953 if num >= 0 { 954 if 2*num+1 < len(match) && match[2*num] >= 0 { 955 if bsrc != nil { 956 dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...) 957 } else { 958 dst = append(dst, src[match[2*num]:match[2*num+1]]...) 959 } 960 } 961 } else { 962 for i, namei := range re.subexpNames { 963 if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 { 964 if bsrc != nil { 965 dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...) 966 } else { 967 dst = append(dst, src[match[2*i]:match[2*i+1]]...) 968 } 969 break 970 } 971 } 972 } 973 } 974 dst = append(dst, template...) 975 return dst 976 } 977 978 // extract returns the name from a leading "name" or "{name}" in str. 979 // (The $ has already been removed by the caller.) 980 // If it is a number, extract returns num set to that number; otherwise num = -1. 981 func extract(str string) (name string, num int, rest string, ok bool) { 982 if str == "" { 983 return 984 } 985 brace := false 986 if str[0] == '{' { 987 brace = true 988 str = str[1:] 989 } 990 i := 0 991 for i < len(str) { 992 rune, size := utf8.DecodeRuneInString(str[i:]) 993 if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' { 994 break 995 } 996 i += size 997 } 998 if i == 0 { 999 // empty name is not okay 1000 return 1001 } 1002 name = str[:i] 1003 if brace { 1004 if i >= len(str) || str[i] != '}' { 1005 // missing closing brace 1006 return 1007 } 1008 i++ 1009 } 1010 1011 // Parse number. 1012 num = 0 1013 for i := 0; i < len(name); i++ { 1014 if name[i] < '0' || '9' < name[i] || num >= 1e8 { 1015 num = -1 1016 break 1017 } 1018 num = num*10 + int(name[i]) - '0' 1019 } 1020 // Disallow leading zeros. 1021 if name[0] == '0' && len(name) > 1 { 1022 num = -1 1023 } 1024 1025 rest = str[i:] 1026 ok = true 1027 return 1028 } 1029 1030 // FindSubmatchIndex returns a slice holding the index pairs identifying the 1031 // leftmost match of the regular expression in b and the matches, if any, of 1032 // its subexpressions, as defined by the 'Submatch' and 'Index' descriptions 1033 // in the package comment. 1034 // A return value of nil indicates no match. 1035 func (re *Regexp) FindSubmatchIndex(b []byte) []int { 1036 return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap, nil)) 1037 } 1038 1039 // FindStringSubmatch returns a slice of strings holding the text of the 1040 // leftmost match of the regular expression in s and the matches, if any, of 1041 // its subexpressions, as defined by the 'Submatch' description in the 1042 // package comment. 1043 // A return value of nil indicates no match. 1044 func (re *Regexp) FindStringSubmatch(s string) []string { 1045 var dstCap [4]int 1046 a := re.doExecute(nil, nil, s, 0, re.prog.NumCap, dstCap[:0]) 1047 if a == nil { 1048 return nil 1049 } 1050 ret := make([]string, 1+re.numSubexp) 1051 for i := range ret { 1052 if 2*i < len(a) && a[2*i] >= 0 { 1053 ret[i] = s[a[2*i]:a[2*i+1]] 1054 } 1055 } 1056 return ret 1057 } 1058 1059 // FindStringSubmatchIndex returns a slice holding the index pairs 1060 // identifying the leftmost match of the regular expression in s and the 1061 // matches, if any, of its subexpressions, as defined by the 'Submatch' and 1062 // 'Index' descriptions in the package comment. 1063 // A return value of nil indicates no match. 1064 func (re *Regexp) FindStringSubmatchIndex(s string) []int { 1065 return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap, nil)) 1066 } 1067 1068 // FindReaderSubmatchIndex returns a slice holding the index pairs 1069 // identifying the leftmost match of the regular expression of text read by 1070 // the RuneReader, and the matches, if any, of its subexpressions, as defined 1071 // by the 'Submatch' and 'Index' descriptions in the package comment. A 1072 // return value of nil indicates no match. 1073 func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int { 1074 return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap, nil)) 1075 } 1076 1077 const startSize = 10 // The size at which to start a slice in the 'All' routines. 1078 1079 // FindAll is the 'All' version of Find; it returns a slice of all successive 1080 // matches of the expression, as defined by the 'All' description in the 1081 // package comment. 1082 // A return value of nil indicates no match. 1083 func (re *Regexp) FindAll(b []byte, n int) [][]byte { 1084 if n < 0 { 1085 n = len(b) + 1 1086 } 1087 var result [][]byte 1088 re.allMatches("", b, n, func(match []int) { 1089 if result == nil { 1090 result = make([][]byte, 0, startSize) 1091 } 1092 result = append(result, b[match[0]:match[1]:match[1]]) 1093 }) 1094 return result 1095 } 1096 1097 // FindAllIndex is the 'All' version of FindIndex; it returns a slice of all 1098 // successive matches of the expression, as defined by the 'All' description 1099 // in the package comment. 1100 // A return value of nil indicates no match. 1101 func (re *Regexp) FindAllIndex(b []byte, n int) [][]int { 1102 if n < 0 { 1103 n = len(b) + 1 1104 } 1105 var result [][]int 1106 re.allMatches("", b, n, func(match []int) { 1107 if result == nil { 1108 result = make([][]int, 0, startSize) 1109 } 1110 result = append(result, match[0:2]) 1111 }) 1112 return result 1113 } 1114 1115 // FindAllString is the 'All' version of FindString; it returns a slice of all 1116 // successive matches of the expression, as defined by the 'All' description 1117 // in the package comment. 1118 // A return value of nil indicates no match. 1119 func (re *Regexp) FindAllString(s string, n int) []string { 1120 if n < 0 { 1121 n = len(s) + 1 1122 } 1123 var result []string 1124 re.allMatches(s, nil, n, func(match []int) { 1125 if result == nil { 1126 result = make([]string, 0, startSize) 1127 } 1128 result = append(result, s[match[0]:match[1]]) 1129 }) 1130 return result 1131 } 1132 1133 // FindAllStringIndex is the 'All' version of FindStringIndex; it returns a 1134 // slice of all successive matches of the expression, as defined by the 'All' 1135 // description in the package comment. 1136 // A return value of nil indicates no match. 1137 func (re *Regexp) FindAllStringIndex(s string, n int) [][]int { 1138 if n < 0 { 1139 n = len(s) + 1 1140 } 1141 var result [][]int 1142 re.allMatches(s, nil, n, func(match []int) { 1143 if result == nil { 1144 result = make([][]int, 0, startSize) 1145 } 1146 result = append(result, match[0:2]) 1147 }) 1148 return result 1149 } 1150 1151 // FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice 1152 // of all successive matches of the expression, as defined by the 'All' 1153 // description in the package comment. 1154 // A return value of nil indicates no match. 1155 func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte { 1156 if n < 0 { 1157 n = len(b) + 1 1158 } 1159 var result [][][]byte 1160 re.allMatches("", b, n, func(match []int) { 1161 if result == nil { 1162 result = make([][][]byte, 0, startSize) 1163 } 1164 slice := make([][]byte, len(match)/2) 1165 for j := range slice { 1166 if match[2*j] >= 0 { 1167 slice[j] = b[match[2*j]:match[2*j+1]:match[2*j+1]] 1168 } 1169 } 1170 result = append(result, slice) 1171 }) 1172 return result 1173 } 1174 1175 // FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns 1176 // a slice of all successive matches of the expression, as defined by the 1177 // 'All' description in the package comment. 1178 // A return value of nil indicates no match. 1179 func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int { 1180 if n < 0 { 1181 n = len(b) + 1 1182 } 1183 var result [][]int 1184 re.allMatches("", b, n, func(match []int) { 1185 if result == nil { 1186 result = make([][]int, 0, startSize) 1187 } 1188 result = append(result, match) 1189 }) 1190 return result 1191 } 1192 1193 // FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it 1194 // returns a slice of all successive matches of the expression, as defined by 1195 // the 'All' description in the package comment. 1196 // A return value of nil indicates no match. 1197 func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string { 1198 if n < 0 { 1199 n = len(s) + 1 1200 } 1201 var result [][]string 1202 re.allMatches(s, nil, n, func(match []int) { 1203 if result == nil { 1204 result = make([][]string, 0, startSize) 1205 } 1206 slice := make([]string, len(match)/2) 1207 for j := range slice { 1208 if match[2*j] >= 0 { 1209 slice[j] = s[match[2*j]:match[2*j+1]] 1210 } 1211 } 1212 result = append(result, slice) 1213 }) 1214 return result 1215 } 1216 1217 // FindAllStringSubmatchIndex is the 'All' version of 1218 // FindStringSubmatchIndex; it returns a slice of all successive matches of 1219 // the expression, as defined by the 'All' description in the package 1220 // comment. 1221 // A return value of nil indicates no match. 1222 func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int { 1223 if n < 0 { 1224 n = len(s) + 1 1225 } 1226 var result [][]int 1227 re.allMatches(s, nil, n, func(match []int) { 1228 if result == nil { 1229 result = make([][]int, 0, startSize) 1230 } 1231 result = append(result, match) 1232 }) 1233 return result 1234 } 1235 1236 // Split slices s into substrings separated by the expression and returns a slice of 1237 // the substrings between those expression matches. 1238 // 1239 // The slice returned by this method consists of all the substrings of s 1240 // not contained in the slice returned by FindAllString. When called on an expression 1241 // that contains no metacharacters, it is equivalent to strings.SplitN. 1242 // 1243 // Example: 1244 // 1245 // s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5) 1246 // // s: ["", "b", "b", "c", "cadaaae"] 1247 // 1248 // The count determines the number of substrings to return: 1249 // 1250 // n > 0: at most n substrings; the last substring will be the unsplit remainder. 1251 // n == 0: the result is nil (zero substrings) 1252 // n < 0: all substrings 1253 func (re *Regexp) Split(s string, n int) []string { 1254 1255 if n == 0 { 1256 return nil 1257 } 1258 1259 if len(re.expr) > 0 && len(s) == 0 { 1260 return []string{""} 1261 } 1262 1263 matches := re.FindAllStringIndex(s, n) 1264 strings := make([]string, 0, len(matches)) 1265 1266 beg := 0 1267 end := 0 1268 for _, match := range matches { 1269 if n > 0 && len(strings) >= n-1 { 1270 break 1271 } 1272 1273 end = match[0] 1274 if match[1] != 0 { 1275 strings = append(strings, s[beg:end]) 1276 } 1277 beg = match[1] 1278 } 1279 1280 if end != len(s) { 1281 strings = append(strings, s[beg:]) 1282 } 1283 1284 return strings 1285 }