github.com/ice-blockchain/go/src@v0.0.0-20240403114104-1564d284e521/regexp/regexp.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package regexp implements regular expression search. 6 // 7 // The syntax of the regular expressions accepted is the same 8 // general syntax used by Perl, Python, and other languages. 9 // More precisely, it is the syntax accepted by RE2 and described at 10 // https://golang.org/s/re2syntax, except for \C. 11 // For an overview of the syntax, see the [regexp/syntax] package. 12 // 13 // The regexp implementation provided by this package is 14 // guaranteed to run in time linear in the size of the input. 15 // (This is a property not guaranteed by most open source 16 // implementations of regular expressions.) For more information 17 // about this property, see 18 // 19 // https://swtch.com/~rsc/regexp/regexp1.html 20 // 21 // or any book about automata theory. 22 // 23 // All characters are UTF-8-encoded code points. 24 // Following [utf8.DecodeRune], each byte of an invalid UTF-8 sequence 25 // is treated as if it encoded utf8.RuneError (U+FFFD). 26 // 27 // There are 16 methods of [Regexp] that match a regular expression and identify 28 // the matched text. Their names are matched by this regular expression: 29 // 30 // Find(All)?(String)?(Submatch)?(Index)? 31 // 32 // If 'All' is present, the routine matches successive non-overlapping 33 // matches of the entire expression. Empty matches abutting a preceding 34 // match are ignored. The return value is a slice containing the successive 35 // return values of the corresponding non-'All' routine. These routines take 36 // an extra integer argument, n. If n >= 0, the function returns at most n 37 // matches/submatches; otherwise, it returns all of them. 
38 // 39 // If 'String' is present, the argument is a string; otherwise it is a slice 40 // of bytes; return values are adjusted as appropriate. 41 // 42 // If 'Submatch' is present, the return value is a slice identifying the 43 // successive submatches of the expression. Submatches are matches of 44 // parenthesized subexpressions (also known as capturing groups) within the 45 // regular expression, numbered from left to right in order of opening 46 // parenthesis. Submatch 0 is the match of the entire expression, submatch 1 is 47 // the match of the first parenthesized subexpression, and so on. 48 // 49 // If 'Index' is present, matches and submatches are identified by byte index 50 // pairs within the input string: result[2*n:2*n+2] identifies the indexes of 51 // the nth submatch. The pair for n==0 identifies the match of the entire 52 // expression. If 'Index' is not present, the match is identified by the text 53 // of the match/submatch. If an index is negative or text is nil, it means that 54 // subexpression did not match any string in the input. For 'String' versions 55 // an empty string means either no match or an empty match. 56 // 57 // There is also a subset of the methods that can be applied to text read 58 // from a RuneReader: 59 // 60 // MatchReader, FindReaderIndex, FindReaderSubmatchIndex 61 // 62 // This set may grow. Note that regular expression matches may need to 63 // examine text beyond the text returned by a match, so the methods that 64 // match text from a RuneReader may read arbitrarily far into the input 65 // before returning. 66 // 67 // (There are a few other methods that do not match this pattern.) 68 package regexp 69 70 import ( 71 "bytes" 72 "io" 73 "regexp/syntax" 74 "strconv" 75 "strings" 76 "sync" 77 "unicode" 78 "unicode/utf8" 79 ) 80 81 // Regexp is the representation of a compiled regular expression. 
// A Regexp is safe for concurrent use by multiple goroutines,
// except for configuration methods, such as [Regexp.Longest].
type Regexp struct {
	expr           string         // as passed to Compile
	prog           *syntax.Prog   // compiled program
	onepass        *onePassProg   // onepass program or nil
	numSubexp      int            // number of capturing groups, taken from the parsed expression's MaxCap
	maxBitStateLen int            // set from maxBitStateLen(prog) only when there is no onepass program
	subexpNames    []string       // capture names from the parsed expression's CapNames; returned by SubexpNames
	prefix         string         // required prefix in unanchored matches
	prefixBytes    []byte         // prefix, as a []byte
	prefixRune     rune           // first rune in prefix
	prefixEnd      uint32         // pc for last rune in prefix
	mpool          int            // index into matchSize/matchPool selecting the pool for machines
	matchcap       int            // size of recorded match lengths: prog.NumCap, but never less than 2
	prefixComplete bool           // prefix is the entire regexp
	cond           syntax.EmptyOp // empty-width conditions required at start of match
	minInputLen    int            // minimum length of the input in bytes

	// This field can be modified by the Longest method,
	// but it is otherwise read-only.
	longest bool // whether regexp prefers leftmost-longest match
}

// String returns the source text used to compile the regular expression.
func (re *Regexp) String() string {
	return re.expr
}

// Copy returns a new [Regexp] object copied from re.
// Calling [Regexp.Longest] on one copy does not affect another.
//
// Deprecated: In earlier releases, when using a [Regexp] in multiple goroutines,
// giving each goroutine its own copy helped to avoid lock contention.
// As of Go 1.12, using Copy is no longer necessary to avoid lock contention.
// Copy may still be appropriate if the reason for its use is to make
// two copies with different [Regexp.Longest] settings.
func (re *Regexp) Copy() *Regexp {
	// A shallow struct copy: the two values get independent scalar fields
	// (including longest) while pointer and slice fields still refer to
	// the same underlying data.
	re2 := *re
	return &re2
}

// Compile parses a regular expression and returns, if successful,
// a [Regexp] object that can be used to match against text.
126 // 127 // When matching against text, the regexp returns a match that 128 // begins as early as possible in the input (leftmost), and among those 129 // it chooses the one that a backtracking search would have found first. 130 // This so-called leftmost-first matching is the same semantics 131 // that Perl, Python, and other implementations use, although this 132 // package implements it without the expense of backtracking. 133 // For POSIX leftmost-longest matching, see [CompilePOSIX]. 134 func Compile(expr string) (*Regexp, error) { 135 return compile(expr, syntax.Perl, false) 136 } 137 138 // CompilePOSIX is like [Compile] but restricts the regular expression 139 // to POSIX ERE (egrep) syntax and changes the match semantics to 140 // leftmost-longest. 141 // 142 // That is, when matching against text, the regexp returns a match that 143 // begins as early as possible in the input (leftmost), and among those 144 // it chooses a match that is as long as possible. 145 // This so-called leftmost-longest matching is the same semantics 146 // that early regular expression implementations used and that POSIX 147 // specifies. 148 // 149 // However, there can be multiple leftmost-longest matches, with different 150 // submatch choices, and here this package diverges from POSIX. 151 // Among the possible leftmost-longest matches, this package chooses 152 // the one that a backtracking search would have found first, while POSIX 153 // specifies that the match be chosen to maximize the length of the first 154 // subexpression, then the second, and so on from left to right. 155 // The POSIX rule is computationally prohibitive and not even well-defined. 156 // See https://swtch.com/~rsc/regexp/regexp2.html#posix for details. 157 func CompilePOSIX(expr string) (*Regexp, error) { 158 return compile(expr, syntax.POSIX, true) 159 } 160 161 // Longest makes future searches prefer the leftmost-longest match. 
162 // That is, when matching against text, the regexp returns a match that 163 // begins as early as possible in the input (leftmost), and among those 164 // it chooses a match that is as long as possible. 165 // This method modifies the [Regexp] and may not be called concurrently 166 // with any other methods. 167 func (re *Regexp) Longest() { 168 re.longest = true 169 } 170 171 func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) { 172 re, err := syntax.Parse(expr, mode) 173 if err != nil { 174 return nil, err 175 } 176 maxCap := re.MaxCap() 177 capNames := re.CapNames() 178 179 re = re.Simplify() 180 prog, err := syntax.Compile(re) 181 if err != nil { 182 return nil, err 183 } 184 matchcap := prog.NumCap 185 if matchcap < 2 { 186 matchcap = 2 187 } 188 regexp := &Regexp{ 189 expr: expr, 190 prog: prog, 191 onepass: compileOnePass(prog), 192 numSubexp: maxCap, 193 subexpNames: capNames, 194 cond: prog.StartCond(), 195 longest: longest, 196 matchcap: matchcap, 197 minInputLen: minInputLen(re), 198 } 199 if regexp.onepass == nil { 200 regexp.prefix, regexp.prefixComplete = prog.Prefix() 201 regexp.maxBitStateLen = maxBitStateLen(prog) 202 } else { 203 regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog) 204 } 205 if regexp.prefix != "" { 206 // TODO(rsc): Remove this allocation by adding 207 // IndexString to package bytes. 208 regexp.prefixBytes = []byte(regexp.prefix) 209 regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix) 210 } 211 212 n := len(prog.Inst) 213 i := 0 214 for matchSize[i] != 0 && matchSize[i] < n { 215 i++ 216 } 217 regexp.mpool = i 218 219 return regexp, nil 220 } 221 222 // Pools of *machine for use during (*Regexp).doExecute, 223 // split up by the size of the execution queues. 224 // matchPool[i] machines have queue size matchSize[i]. 225 // On a 64-bit system each queue entry is 16 bytes, 226 // so matchPool[0] has 16*2*128 = 4kB queues, etc. 
227 // The final matchPool is a catch-all for very large queues. 228 var ( 229 matchSize = [...]int{128, 512, 2048, 16384, 0} 230 matchPool [len(matchSize)]sync.Pool 231 ) 232 233 // get returns a machine to use for matching re. 234 // It uses the re's machine cache if possible, to avoid 235 // unnecessary allocation. 236 func (re *Regexp) get() *machine { 237 m, ok := matchPool[re.mpool].Get().(*machine) 238 if !ok { 239 m = new(machine) 240 } 241 m.re = re 242 m.p = re.prog 243 if cap(m.matchcap) < re.matchcap { 244 m.matchcap = make([]int, re.matchcap) 245 for _, t := range m.pool { 246 t.cap = make([]int, re.matchcap) 247 } 248 } 249 250 // Allocate queues if needed. 251 // Or reallocate, for "large" match pool. 252 n := matchSize[re.mpool] 253 if n == 0 { // large pool 254 n = len(re.prog.Inst) 255 } 256 if len(m.q0.sparse) < n { 257 m.q0 = queue{make([]uint32, n), make([]entry, 0, n)} 258 m.q1 = queue{make([]uint32, n), make([]entry, 0, n)} 259 } 260 return m 261 } 262 263 // put returns a machine to the correct machine pool. 264 func (re *Regexp) put(m *machine) { 265 m.re = nil 266 m.p = nil 267 m.inputs.clear() 268 matchPool[re.mpool].Put(m) 269 } 270 271 // minInputLen walks the regexp to find the minimum length of any matchable input. 
272 func minInputLen(re *syntax.Regexp) int { 273 switch re.Op { 274 default: 275 return 0 276 case syntax.OpAnyChar, syntax.OpAnyCharNotNL, syntax.OpCharClass: 277 return 1 278 case syntax.OpLiteral: 279 l := 0 280 for _, r := range re.Rune { 281 if r == utf8.RuneError { 282 l++ 283 } else { 284 l += utf8.RuneLen(r) 285 } 286 } 287 return l 288 case syntax.OpCapture, syntax.OpPlus: 289 return minInputLen(re.Sub[0]) 290 case syntax.OpRepeat: 291 return re.Min * minInputLen(re.Sub[0]) 292 case syntax.OpConcat: 293 l := 0 294 for _, sub := range re.Sub { 295 l += minInputLen(sub) 296 } 297 return l 298 case syntax.OpAlternate: 299 l := minInputLen(re.Sub[0]) 300 var lnext int 301 for _, sub := range re.Sub[1:] { 302 lnext = minInputLen(sub) 303 if lnext < l { 304 l = lnext 305 } 306 } 307 return l 308 } 309 } 310 311 // MustCompile is like [Compile] but panics if the expression cannot be parsed. 312 // It simplifies safe initialization of global variables holding compiled regular 313 // expressions. 314 func MustCompile(str string) *Regexp { 315 regexp, err := Compile(str) 316 if err != nil { 317 panic(`regexp: Compile(` + quote(str) + `): ` + err.Error()) 318 } 319 return regexp 320 } 321 322 // MustCompilePOSIX is like [CompilePOSIX] but panics if the expression cannot be parsed. 323 // It simplifies safe initialization of global variables holding compiled regular 324 // expressions. 325 func MustCompilePOSIX(str string) *Regexp { 326 regexp, err := CompilePOSIX(str) 327 if err != nil { 328 panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + err.Error()) 329 } 330 return regexp 331 } 332 333 func quote(s string) string { 334 if strconv.CanBackquote(s) { 335 return "`" + s + "`" 336 } 337 return strconv.Quote(s) 338 } 339 340 // NumSubexp returns the number of parenthesized subexpressions in this [Regexp]. 
341 func (re *Regexp) NumSubexp() int { 342 return re.numSubexp 343 } 344 345 // SubexpNames returns the names of the parenthesized subexpressions 346 // in this [Regexp]. The name for the first sub-expression is names[1], 347 // so that if m is a match slice, the name for m[i] is SubexpNames()[i]. 348 // Since the Regexp as a whole cannot be named, names[0] is always 349 // the empty string. The slice should not be modified. 350 func (re *Regexp) SubexpNames() []string { 351 return re.subexpNames 352 } 353 354 // SubexpIndex returns the index of the first subexpression with the given name, 355 // or -1 if there is no subexpression with that name. 356 // 357 // Note that multiple subexpressions can be written using the same name, as in 358 // (?P<bob>a+)(?P<bob>b+), which declares two subexpressions named "bob". 359 // In this case, SubexpIndex returns the index of the leftmost such subexpression 360 // in the regular expression. 361 func (re *Regexp) SubexpIndex(name string) int { 362 if name != "" { 363 for i, s := range re.subexpNames { 364 if name == s { 365 return i 366 } 367 } 368 } 369 return -1 370 } 371 372 const endOfText rune = -1 373 374 // input abstracts different representations of the input text. It provides 375 // one-character lookahead. 376 type input interface { 377 step(pos int) (r rune, width int) // advance one rune 378 canCheckPrefix() bool // can we look ahead without losing info? 379 hasPrefix(re *Regexp) bool 380 index(re *Regexp, pos int) int 381 context(pos int) lazyFlag 382 } 383 384 // inputString scans a string. 
385 type inputString struct { 386 str string 387 } 388 389 func (i *inputString) step(pos int) (rune, int) { 390 if pos < len(i.str) { 391 c := i.str[pos] 392 if c < utf8.RuneSelf { 393 return rune(c), 1 394 } 395 return utf8.DecodeRuneInString(i.str[pos:]) 396 } 397 return endOfText, 0 398 } 399 400 func (i *inputString) canCheckPrefix() bool { 401 return true 402 } 403 404 func (i *inputString) hasPrefix(re *Regexp) bool { 405 return strings.HasPrefix(i.str, re.prefix) 406 } 407 408 func (i *inputString) index(re *Regexp, pos int) int { 409 return strings.Index(i.str[pos:], re.prefix) 410 } 411 412 func (i *inputString) context(pos int) lazyFlag { 413 r1, r2 := endOfText, endOfText 414 // 0 < pos && pos <= len(i.str) 415 if uint(pos-1) < uint(len(i.str)) { 416 r1 = rune(i.str[pos-1]) 417 if r1 >= utf8.RuneSelf { 418 r1, _ = utf8.DecodeLastRuneInString(i.str[:pos]) 419 } 420 } 421 // 0 <= pos && pos < len(i.str) 422 if uint(pos) < uint(len(i.str)) { 423 r2 = rune(i.str[pos]) 424 if r2 >= utf8.RuneSelf { 425 r2, _ = utf8.DecodeRuneInString(i.str[pos:]) 426 } 427 } 428 return newLazyFlag(r1, r2) 429 } 430 431 // inputBytes scans a byte slice. 
432 type inputBytes struct { 433 str []byte 434 } 435 436 func (i *inputBytes) step(pos int) (rune, int) { 437 if pos < len(i.str) { 438 c := i.str[pos] 439 if c < utf8.RuneSelf { 440 return rune(c), 1 441 } 442 return utf8.DecodeRune(i.str[pos:]) 443 } 444 return endOfText, 0 445 } 446 447 func (i *inputBytes) canCheckPrefix() bool { 448 return true 449 } 450 451 func (i *inputBytes) hasPrefix(re *Regexp) bool { 452 return bytes.HasPrefix(i.str, re.prefixBytes) 453 } 454 455 func (i *inputBytes) index(re *Regexp, pos int) int { 456 return bytes.Index(i.str[pos:], re.prefixBytes) 457 } 458 459 func (i *inputBytes) context(pos int) lazyFlag { 460 r1, r2 := endOfText, endOfText 461 // 0 < pos && pos <= len(i.str) 462 if uint(pos-1) < uint(len(i.str)) { 463 r1 = rune(i.str[pos-1]) 464 if r1 >= utf8.RuneSelf { 465 r1, _ = utf8.DecodeLastRune(i.str[:pos]) 466 } 467 } 468 // 0 <= pos && pos < len(i.str) 469 if uint(pos) < uint(len(i.str)) { 470 r2 = rune(i.str[pos]) 471 if r2 >= utf8.RuneSelf { 472 r2, _ = utf8.DecodeRune(i.str[pos:]) 473 } 474 } 475 return newLazyFlag(r1, r2) 476 } 477 478 // inputReader scans a RuneReader. 479 type inputReader struct { 480 r io.RuneReader 481 atEOT bool 482 pos int 483 } 484 485 func (i *inputReader) step(pos int) (rune, int) { 486 if !i.atEOT && pos != i.pos { 487 return endOfText, 0 488 489 } 490 r, w, err := i.r.ReadRune() 491 if err != nil { 492 i.atEOT = true 493 return endOfText, 0 494 } 495 i.pos += w 496 return r, w 497 } 498 499 func (i *inputReader) canCheckPrefix() bool { 500 return false 501 } 502 503 func (i *inputReader) hasPrefix(re *Regexp) bool { 504 return false 505 } 506 507 func (i *inputReader) index(re *Regexp, pos int) int { 508 return -1 509 } 510 511 func (i *inputReader) context(pos int) lazyFlag { 512 return 0 // not used 513 } 514 515 // LiteralPrefix returns a literal string that must begin any match 516 // of the regular expression re. 
It returns the boolean true if the 517 // literal string comprises the entire regular expression. 518 func (re *Regexp) LiteralPrefix() (prefix string, complete bool) { 519 return re.prefix, re.prefixComplete 520 } 521 522 // MatchReader reports whether the text returned by the [io.RuneReader] 523 // contains any match of the regular expression re. 524 func (re *Regexp) MatchReader(r io.RuneReader) bool { 525 return re.doMatch(r, nil, "") 526 } 527 528 // MatchString reports whether the string s 529 // contains any match of the regular expression re. 530 func (re *Regexp) MatchString(s string) bool { 531 return re.doMatch(nil, nil, s) 532 } 533 534 // Match reports whether the byte slice b 535 // contains any match of the regular expression re. 536 func (re *Regexp) Match(b []byte) bool { 537 return re.doMatch(nil, b, "") 538 } 539 540 // MatchReader reports whether the text returned by the RuneReader 541 // contains any match of the regular expression pattern. 542 // More complicated queries need to use [Compile] and the full [Regexp] interface. 543 func MatchReader(pattern string, r io.RuneReader) (matched bool, err error) { 544 re, err := Compile(pattern) 545 if err != nil { 546 return false, err 547 } 548 return re.MatchReader(r), nil 549 } 550 551 // MatchString reports whether the string s 552 // contains any match of the regular expression pattern. 553 // More complicated queries need to use [Compile] and the full [Regexp] interface. 554 func MatchString(pattern string, s string) (matched bool, err error) { 555 re, err := Compile(pattern) 556 if err != nil { 557 return false, err 558 } 559 return re.MatchString(s), nil 560 } 561 562 // Match reports whether the byte slice b 563 // contains any match of the regular expression pattern. 564 // More complicated queries need to use [Compile] and the full [Regexp] interface. 
func Match(pattern string, b []byte) (matched bool, err error) {
	re, err := Compile(pattern)
	if err != nil {
		return false, err
	}
	return re.Match(b), nil
}

// ReplaceAllString returns a copy of src, replacing matches of the [Regexp]
// with the replacement string repl.
// Inside repl, $ signs are interpreted as in [Regexp.Expand].
func (re *Regexp) ReplaceAllString(src, repl string) string {
	// Only the overall match pair is requested unless repl contains a '$',
	// in which case the pairs for every subexpression are requested too.
	n := 2
	if strings.Contains(repl, "$") {
		n = 2 * (re.numSubexp + 1)
	}
	b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte {
		return re.expand(dst, repl, nil, src, match)
	})
	return string(b)
}

// ReplaceAllLiteralString returns a copy of src, replacing matches of the [Regexp]
// with the replacement string repl. The replacement repl is substituted directly,
// without using [Regexp.Expand].
func (re *Regexp) ReplaceAllLiteralString(src, repl string) string {
	return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
		return append(dst, repl...)
	}))
}

// ReplaceAllStringFunc returns a copy of src in which all matches of the
// [Regexp] have been replaced by the return value of function repl applied
// to the matched substring. The replacement returned by repl is substituted
// directly, without using [Regexp.Expand].
func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
	b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
		return append(dst, repl(src[match[0]:match[1]])...)
	})
	return string(b)
}

// replaceAll is the shared implementation behind the ReplaceAll* methods.
// The input is bsrc when it is non-nil and src otherwise. nmatch is the
// number of match indices to request per match and is capped at
// re.prog.NumCap. For each accepted match, repl is called with the output
// built so far and the match indices, and must return the extended output.
// The result is built in a new buffer; text between matches is copied
// through unchanged.
func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte {
	lastMatchEnd := 0 // end position of the most recent match
	searchPos := 0    // position where we next look for a match
	var buf []byte
	var endPos int
	if bsrc != nil {
		endPos = len(bsrc)
	} else {
		endPos = len(src)
	}
	if nmatch > re.prog.NumCap {
		nmatch = re.prog.NumCap
	}

	// dstCap is passed to doExecute as an empty slice with capacity for one
	// index pair.
	var dstCap [2]int
	// searchPos == endPos is still searched, so an empty match at the very
	// end of the input can be found.
	for searchPos <= endPos {
		a := re.doExecute(nil, bsrc, src, searchPos, nmatch, dstCap[:0])
		if len(a) == 0 {
			break // no more matches
		}

		// Copy the unmatched characters before this match.
		if bsrc != nil {
			buf = append(buf, bsrc[lastMatchEnd:a[0]]...)
		} else {
			buf = append(buf, src[lastMatchEnd:a[0]]...)
		}

		// Now insert a copy of the replacement string, but not for a
		// match of the empty string immediately after another match.
		// (Otherwise, we get double replacement for patterns that
		// match both empty and nonempty strings.)
		if a[1] > lastMatchEnd || a[0] == 0 {
			buf = repl(buf, a)
		}
		lastMatchEnd = a[1]

		// Advance past this match; always advance at least one character.
		var width int
		if bsrc != nil {
			_, width = utf8.DecodeRune(bsrc[searchPos:])
		} else {
			_, width = utf8.DecodeRuneInString(src[searchPos:])
		}
		if searchPos+width > a[1] {
			searchPos += width
		} else if searchPos+1 > a[1] {
			// This clause is only needed at the end of the input
			// string. In that case, DecodeRuneInString returns width=0.
			searchPos++
		} else {
			searchPos = a[1]
		}
	}

	// Copy the unmatched characters after the last match.
	if bsrc != nil {
		buf = append(buf, bsrc[lastMatchEnd:]...)
	} else {
		buf = append(buf, src[lastMatchEnd:]...)
	}

	return buf
}

// ReplaceAll returns a copy of src, replacing matches of the [Regexp]
// with the replacement text repl.
// Inside repl, $ signs are interpreted as in [Regexp.Expand].
func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
	// Only the overall match pair is requested unless repl contains a '$',
	// in which case the pairs for every subexpression are requested too.
	n := 2
	if bytes.IndexByte(repl, '$') >= 0 {
		n = 2 * (re.numSubexp + 1)
	}
	srepl := ""
	b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte {
		// repl is converted to a string inside the callback, so the
		// conversion only happens once a replacement is actually made.
		// The length comparison skips it after the first call (and
		// always for an empty repl).
		if len(srepl) != len(repl) {
			srepl = string(repl)
		}
		return re.expand(dst, srepl, src, "", match)
	})
	return b
}

// ReplaceAllLiteral returns a copy of src, replacing matches of the [Regexp]
// with the replacement bytes repl. The replacement repl is substituted directly,
// without using [Regexp.Expand].
func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte {
	return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
		return append(dst, repl...)
	})
}

// ReplaceAllFunc returns a copy of src in which all matches of the
// [Regexp] have been replaced by the return value of function repl applied
// to the matched byte slice. The replacement returned by repl is substituted
// directly, without using [Regexp.Expand].
func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
	return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
		return append(dst, repl(src[match[0]:match[1]])...)
	})
}

// Bitmap used by func special to check whether a character needs to be escaped.
var specialBytes [16]byte

// special reports whether byte b needs to be escaped by QuoteMeta.
713 func special(b byte) bool { 714 return b < utf8.RuneSelf && specialBytes[b%16]&(1<<(b/16)) != 0 715 } 716 717 func init() { 718 for _, b := range []byte(`\.+*?()|[]{}^$`) { 719 specialBytes[b%16] |= 1 << (b / 16) 720 } 721 } 722 723 // QuoteMeta returns a string that escapes all regular expression metacharacters 724 // inside the argument text; the returned string is a regular expression matching 725 // the literal text. 726 func QuoteMeta(s string) string { 727 // A byte loop is correct because all metacharacters are ASCII. 728 var i int 729 for i = 0; i < len(s); i++ { 730 if special(s[i]) { 731 break 732 } 733 } 734 // No meta characters found, so return original string. 735 if i >= len(s) { 736 return s 737 } 738 739 b := make([]byte, 2*len(s)-i) 740 copy(b, s[:i]) 741 j := i 742 for ; i < len(s); i++ { 743 if special(s[i]) { 744 b[j] = '\\' 745 j++ 746 } 747 b[j] = s[i] 748 j++ 749 } 750 return string(b[:j]) 751 } 752 753 // The number of capture values in the program may correspond 754 // to fewer capturing expressions than are in the regexp. 755 // For example, "(a){0}" turns into an empty program, so the 756 // maximum capture in the program is 0 but we need to return 757 // an expression for \1. Pad appends -1s to the slice a as needed. 758 func (re *Regexp) pad(a []int) []int { 759 if a == nil { 760 // No match. 761 return nil 762 } 763 n := (1 + re.numSubexp) * 2 764 for len(a) < n { 765 a = append(a, -1) 766 } 767 return a 768 } 769 770 // allMatches calls deliver at most n times 771 // with the location of successive matches in the input text. 772 // The input text is b if non-nil, otherwise s. 
773 func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) { 774 var end int 775 if b == nil { 776 end = len(s) 777 } else { 778 end = len(b) 779 } 780 781 for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; { 782 matches := re.doExecute(nil, b, s, pos, re.prog.NumCap, nil) 783 if len(matches) == 0 { 784 break 785 } 786 787 accept := true 788 if matches[1] == pos { 789 // We've found an empty match. 790 if matches[0] == prevMatchEnd { 791 // We don't allow an empty match right 792 // after a previous match, so ignore it. 793 accept = false 794 } 795 var width int 796 if b == nil { 797 is := inputString{str: s} 798 _, width = is.step(pos) 799 } else { 800 ib := inputBytes{str: b} 801 _, width = ib.step(pos) 802 } 803 if width > 0 { 804 pos += width 805 } else { 806 pos = end + 1 807 } 808 } else { 809 pos = matches[1] 810 } 811 prevMatchEnd = matches[1] 812 813 if accept { 814 deliver(re.pad(matches)) 815 i++ 816 } 817 } 818 } 819 820 // Find returns a slice holding the text of the leftmost match in b of the regular expression. 821 // A return value of nil indicates no match. 822 func (re *Regexp) Find(b []byte) []byte { 823 var dstCap [2]int 824 a := re.doExecute(nil, b, "", 0, 2, dstCap[:0]) 825 if a == nil { 826 return nil 827 } 828 return b[a[0]:a[1]:a[1]] 829 } 830 831 // FindIndex returns a two-element slice of integers defining the location of 832 // the leftmost match in b of the regular expression. The match itself is at 833 // b[loc[0]:loc[1]]. 834 // A return value of nil indicates no match. 835 func (re *Regexp) FindIndex(b []byte) (loc []int) { 836 a := re.doExecute(nil, b, "", 0, 2, nil) 837 if a == nil { 838 return nil 839 } 840 return a[0:2] 841 } 842 843 // FindString returns a string holding the text of the leftmost match in s of the regular 844 // expression. If there is no match, the return value is an empty string, 845 // but it will also be empty if the regular expression successfully matches 846 // an empty string. 
Use [Regexp.FindStringIndex] or [Regexp.FindStringSubmatch] if it is 847 // necessary to distinguish these cases. 848 func (re *Regexp) FindString(s string) string { 849 var dstCap [2]int 850 a := re.doExecute(nil, nil, s, 0, 2, dstCap[:0]) 851 if a == nil { 852 return "" 853 } 854 return s[a[0]:a[1]] 855 } 856 857 // FindStringIndex returns a two-element slice of integers defining the 858 // location of the leftmost match in s of the regular expression. The match 859 // itself is at s[loc[0]:loc[1]]. 860 // A return value of nil indicates no match. 861 func (re *Regexp) FindStringIndex(s string) (loc []int) { 862 a := re.doExecute(nil, nil, s, 0, 2, nil) 863 if a == nil { 864 return nil 865 } 866 return a[0:2] 867 } 868 869 // FindReaderIndex returns a two-element slice of integers defining the 870 // location of the leftmost match of the regular expression in text read from 871 // the [io.RuneReader]. The match text was found in the input stream at 872 // byte offset loc[0] through loc[1]-1. 873 // A return value of nil indicates no match. 874 func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) { 875 a := re.doExecute(r, nil, "", 0, 2, nil) 876 if a == nil { 877 return nil 878 } 879 return a[0:2] 880 } 881 882 // FindSubmatch returns a slice of slices holding the text of the leftmost 883 // match of the regular expression in b and the matches, if any, of its 884 // subexpressions, as defined by the 'Submatch' descriptions in the package 885 // comment. 886 // A return value of nil indicates no match. 
887 func (re *Regexp) FindSubmatch(b []byte) [][]byte { 888 var dstCap [4]int 889 a := re.doExecute(nil, b, "", 0, re.prog.NumCap, dstCap[:0]) 890 if a == nil { 891 return nil 892 } 893 ret := make([][]byte, 1+re.numSubexp) 894 for i := range ret { 895 if 2*i < len(a) && a[2*i] >= 0 { 896 ret[i] = b[a[2*i]:a[2*i+1]:a[2*i+1]] 897 } 898 } 899 return ret 900 } 901 902 // Expand appends template to dst and returns the result; during the 903 // append, Expand replaces variables in the template with corresponding 904 // matches drawn from src. The match slice should have been returned by 905 // [Regexp.FindSubmatchIndex]. 906 // 907 // In the template, a variable is denoted by a substring of the form 908 // $name or ${name}, where name is a non-empty sequence of letters, 909 // digits, and underscores. A purely numeric name like $1 refers to 910 // the submatch with the corresponding index; other names refer to 911 // capturing parentheses named with the (?P<name>...) syntax. A 912 // reference to an out of range or unmatched index or a name that is not 913 // present in the regular expression is replaced with an empty slice. 914 // 915 // In the $name form, name is taken to be as long as possible: $1x is 916 // equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0. 917 // 918 // To insert a literal $ in the output, use $$ in the template. 919 func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte { 920 return re.expand(dst, string(template), src, "", match) 921 } 922 923 // ExpandString is like [Regexp.Expand] but the template and source are strings. 924 // It appends to and returns a byte slice in order to give the calling 925 // code control over allocation. 
926 func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte { 927 return re.expand(dst, template, nil, src, match) 928 } 929 930 func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte { 931 for len(template) > 0 { 932 before, after, ok := strings.Cut(template, "$") 933 if !ok { 934 break 935 } 936 dst = append(dst, before...) 937 template = after 938 if template != "" && template[0] == '$' { 939 // Treat $$ as $. 940 dst = append(dst, '$') 941 template = template[1:] 942 continue 943 } 944 name, num, rest, ok := extract(template) 945 if !ok { 946 // Malformed; treat $ as raw text. 947 dst = append(dst, '$') 948 continue 949 } 950 template = rest 951 if num >= 0 { 952 if 2*num+1 < len(match) && match[2*num] >= 0 { 953 if bsrc != nil { 954 dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...) 955 } else { 956 dst = append(dst, src[match[2*num]:match[2*num+1]]...) 957 } 958 } 959 } else { 960 for i, namei := range re.subexpNames { 961 if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 { 962 if bsrc != nil { 963 dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...) 964 } else { 965 dst = append(dst, src[match[2*i]:match[2*i+1]]...) 966 } 967 break 968 } 969 } 970 } 971 } 972 dst = append(dst, template...) 973 return dst 974 } 975 976 // extract returns the name from a leading "name" or "{name}" in str. 977 // (The $ has already been removed by the caller.) 978 // If it is a number, extract returns num set to that number; otherwise num = -1. 
979 func extract(str string) (name string, num int, rest string, ok bool) { 980 if str == "" { 981 return 982 } 983 brace := false 984 if str[0] == '{' { 985 brace = true 986 str = str[1:] 987 } 988 i := 0 989 for i < len(str) { 990 rune, size := utf8.DecodeRuneInString(str[i:]) 991 if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' { 992 break 993 } 994 i += size 995 } 996 if i == 0 { 997 // empty name is not okay 998 return 999 } 1000 name = str[:i] 1001 if brace { 1002 if i >= len(str) || str[i] != '}' { 1003 // missing closing brace 1004 return 1005 } 1006 i++ 1007 } 1008 1009 // Parse number. 1010 num = 0 1011 for i := 0; i < len(name); i++ { 1012 if name[i] < '0' || '9' < name[i] || num >= 1e8 { 1013 num = -1 1014 break 1015 } 1016 num = num*10 + int(name[i]) - '0' 1017 } 1018 // Disallow leading zeros. 1019 if name[0] == '0' && len(name) > 1 { 1020 num = -1 1021 } 1022 1023 rest = str[i:] 1024 ok = true 1025 return 1026 } 1027 1028 // FindSubmatchIndex returns a slice holding the index pairs identifying the 1029 // leftmost match of the regular expression in b and the matches, if any, of 1030 // its subexpressions, as defined by the 'Submatch' and 'Index' descriptions 1031 // in the package comment. 1032 // A return value of nil indicates no match. 1033 func (re *Regexp) FindSubmatchIndex(b []byte) []int { 1034 return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap, nil)) 1035 } 1036 1037 // FindStringSubmatch returns a slice of strings holding the text of the 1038 // leftmost match of the regular expression in s and the matches, if any, of 1039 // its subexpressions, as defined by the 'Submatch' description in the 1040 // package comment. 1041 // A return value of nil indicates no match. 
1042 func (re *Regexp) FindStringSubmatch(s string) []string { 1043 var dstCap [4]int 1044 a := re.doExecute(nil, nil, s, 0, re.prog.NumCap, dstCap[:0]) 1045 if a == nil { 1046 return nil 1047 } 1048 ret := make([]string, 1+re.numSubexp) 1049 for i := range ret { 1050 if 2*i < len(a) && a[2*i] >= 0 { 1051 ret[i] = s[a[2*i]:a[2*i+1]] 1052 } 1053 } 1054 return ret 1055 } 1056 1057 // FindStringSubmatchIndex returns a slice holding the index pairs 1058 // identifying the leftmost match of the regular expression in s and the 1059 // matches, if any, of its subexpressions, as defined by the 'Submatch' and 1060 // 'Index' descriptions in the package comment. 1061 // A return value of nil indicates no match. 1062 func (re *Regexp) FindStringSubmatchIndex(s string) []int { 1063 return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap, nil)) 1064 } 1065 1066 // FindReaderSubmatchIndex returns a slice holding the index pairs 1067 // identifying the leftmost match of the regular expression of text read by 1068 // the [io.RuneReader], and the matches, if any, of its subexpressions, as defined 1069 // by the 'Submatch' and 'Index' descriptions in the package comment. A 1070 // return value of nil indicates no match. 1071 func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int { 1072 return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap, nil)) 1073 } 1074 1075 const startSize = 10 // The size at which to start a slice in the 'All' routines. 1076 1077 // FindAll is the 'All' version of [Regexp.Find]; it returns a slice of all successive 1078 // matches of the expression, as defined by the 'All' description in the 1079 // package comment. 1080 // A return value of nil indicates no match. 
1081 func (re *Regexp) FindAll(b []byte, n int) [][]byte { 1082 if n < 0 { 1083 n = len(b) + 1 1084 } 1085 var result [][]byte 1086 re.allMatches("", b, n, func(match []int) { 1087 if result == nil { 1088 result = make([][]byte, 0, startSize) 1089 } 1090 result = append(result, b[match[0]:match[1]:match[1]]) 1091 }) 1092 return result 1093 } 1094 1095 // FindAllIndex is the 'All' version of [Regexp.FindIndex]; it returns a slice of all 1096 // successive matches of the expression, as defined by the 'All' description 1097 // in the package comment. 1098 // A return value of nil indicates no match. 1099 func (re *Regexp) FindAllIndex(b []byte, n int) [][]int { 1100 if n < 0 { 1101 n = len(b) + 1 1102 } 1103 var result [][]int 1104 re.allMatches("", b, n, func(match []int) { 1105 if result == nil { 1106 result = make([][]int, 0, startSize) 1107 } 1108 result = append(result, match[0:2]) 1109 }) 1110 return result 1111 } 1112 1113 // FindAllString is the 'All' version of [Regexp.FindString]; it returns a slice of all 1114 // successive matches of the expression, as defined by the 'All' description 1115 // in the package comment. 1116 // A return value of nil indicates no match. 1117 func (re *Regexp) FindAllString(s string, n int) []string { 1118 if n < 0 { 1119 n = len(s) + 1 1120 } 1121 var result []string 1122 re.allMatches(s, nil, n, func(match []int) { 1123 if result == nil { 1124 result = make([]string, 0, startSize) 1125 } 1126 result = append(result, s[match[0]:match[1]]) 1127 }) 1128 return result 1129 } 1130 1131 // FindAllStringIndex is the 'All' version of [Regexp.FindStringIndex]; it returns a 1132 // slice of all successive matches of the expression, as defined by the 'All' 1133 // description in the package comment. 1134 // A return value of nil indicates no match. 
1135 func (re *Regexp) FindAllStringIndex(s string, n int) [][]int { 1136 if n < 0 { 1137 n = len(s) + 1 1138 } 1139 var result [][]int 1140 re.allMatches(s, nil, n, func(match []int) { 1141 if result == nil { 1142 result = make([][]int, 0, startSize) 1143 } 1144 result = append(result, match[0:2]) 1145 }) 1146 return result 1147 } 1148 1149 // FindAllSubmatch is the 'All' version of [Regexp.FindSubmatch]; it returns a slice 1150 // of all successive matches of the expression, as defined by the 'All' 1151 // description in the package comment. 1152 // A return value of nil indicates no match. 1153 func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte { 1154 if n < 0 { 1155 n = len(b) + 1 1156 } 1157 var result [][][]byte 1158 re.allMatches("", b, n, func(match []int) { 1159 if result == nil { 1160 result = make([][][]byte, 0, startSize) 1161 } 1162 slice := make([][]byte, len(match)/2) 1163 for j := range slice { 1164 if match[2*j] >= 0 { 1165 slice[j] = b[match[2*j]:match[2*j+1]:match[2*j+1]] 1166 } 1167 } 1168 result = append(result, slice) 1169 }) 1170 return result 1171 } 1172 1173 // FindAllSubmatchIndex is the 'All' version of [Regexp.FindSubmatchIndex]; it returns 1174 // a slice of all successive matches of the expression, as defined by the 1175 // 'All' description in the package comment. 1176 // A return value of nil indicates no match. 1177 func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int { 1178 if n < 0 { 1179 n = len(b) + 1 1180 } 1181 var result [][]int 1182 re.allMatches("", b, n, func(match []int) { 1183 if result == nil { 1184 result = make([][]int, 0, startSize) 1185 } 1186 result = append(result, match) 1187 }) 1188 return result 1189 } 1190 1191 // FindAllStringSubmatch is the 'All' version of [Regexp.FindStringSubmatch]; it 1192 // returns a slice of all successive matches of the expression, as defined by 1193 // the 'All' description in the package comment. 1194 // A return value of nil indicates no match. 
1195 func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string { 1196 if n < 0 { 1197 n = len(s) + 1 1198 } 1199 var result [][]string 1200 re.allMatches(s, nil, n, func(match []int) { 1201 if result == nil { 1202 result = make([][]string, 0, startSize) 1203 } 1204 slice := make([]string, len(match)/2) 1205 for j := range slice { 1206 if match[2*j] >= 0 { 1207 slice[j] = s[match[2*j]:match[2*j+1]] 1208 } 1209 } 1210 result = append(result, slice) 1211 }) 1212 return result 1213 } 1214 1215 // FindAllStringSubmatchIndex is the 'All' version of 1216 // [Regexp.FindStringSubmatchIndex]; it returns a slice of all successive matches of 1217 // the expression, as defined by the 'All' description in the package 1218 // comment. 1219 // A return value of nil indicates no match. 1220 func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int { 1221 if n < 0 { 1222 n = len(s) + 1 1223 } 1224 var result [][]int 1225 re.allMatches(s, nil, n, func(match []int) { 1226 if result == nil { 1227 result = make([][]int, 0, startSize) 1228 } 1229 result = append(result, match) 1230 }) 1231 return result 1232 } 1233 1234 // Split slices s into substrings separated by the expression and returns a slice of 1235 // the substrings between those expression matches. 1236 // 1237 // The slice returned by this method consists of all the substrings of s 1238 // not contained in the slice returned by [Regexp.FindAllString]. When called on an expression 1239 // that contains no metacharacters, it is equivalent to [strings.SplitN]. 1240 // 1241 // Example: 1242 // 1243 // s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5) 1244 // // s: ["", "b", "b", "c", "cadaaae"] 1245 // 1246 // The count determines the number of substrings to return: 1247 // 1248 // n > 0: at most n substrings; the last substring will be the unsplit remainder. 
1249 // n == 0: the result is nil (zero substrings) 1250 // n < 0: all substrings 1251 func (re *Regexp) Split(s string, n int) []string { 1252 1253 if n == 0 { 1254 return nil 1255 } 1256 1257 if len(re.expr) > 0 && len(s) == 0 { 1258 return []string{""} 1259 } 1260 1261 matches := re.FindAllStringIndex(s, n) 1262 strings := make([]string, 0, len(matches)) 1263 1264 beg := 0 1265 end := 0 1266 for _, match := range matches { 1267 if n > 0 && len(strings) >= n-1 { 1268 break 1269 } 1270 1271 end = match[0] 1272 if match[1] != 0 { 1273 strings = append(strings, s[beg:end]) 1274 } 1275 beg = match[1] 1276 } 1277 1278 if end != len(s) { 1279 strings = append(strings, s[beg:]) 1280 } 1281 1282 return strings 1283 } 1284 1285 // MarshalText implements [encoding.TextMarshaler]. The output 1286 // matches that of calling the [Regexp.String] method. 1287 // 1288 // Note that the output is lossy in some cases: This method does not indicate 1289 // POSIX regular expressions (i.e. those compiled by calling [CompilePOSIX]), or 1290 // those for which the [Regexp.Longest] method has been called. 1291 func (re *Regexp) MarshalText() ([]byte, error) { 1292 return []byte(re.String()), nil 1293 } 1294 1295 // UnmarshalText implements [encoding.TextUnmarshaler] by calling 1296 // [Compile] on the encoded value. 1297 func (re *Regexp) UnmarshalText(text []byte) error { 1298 newRE, err := Compile(string(text)) 1299 if err != nil { 1300 return err 1301 } 1302 *re = *newRE 1303 return nil 1304 }