// Source: github.com/gnolang/gno@v0.0.0-20240520182011-228e9d0192ce/gnovm/stdlibs/regexp/regexp.gno

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package regexp implements regular expression search.
//
// The syntax of the regular expressions accepted is the same
// general syntax used by Perl, Python, and other languages.
// More precisely, it is the syntax accepted by RE2 and described at
// https://golang.org/s/re2syntax, except for \C.
// For an overview of the syntax, run
//
//	go doc regexp/syntax
//
// The regexp implementation provided by this package is
// guaranteed to run in time linear in the size of the input.
// (This is a property not guaranteed by most open source
// implementations of regular expressions.) For more information
// about this property, see
//
//	https://swtch.com/~rsc/regexp/regexp1.html
//
// or any book about automata theory.
//
// All characters are UTF-8-encoded code points.
//
// There are 16 methods of Regexp that match a regular expression and identify
// the matched text. Their names are matched by this regular expression:
//
//	Find(All)?(String)?(Submatch)?(Index)?
//
// If 'All' is present, the routine matches successive non-overlapping
// matches of the entire expression. Empty matches abutting a preceding
// match are ignored. The return value is a slice containing the successive
// return values of the corresponding non-'All' routine. These routines take
// an extra integer argument, n. If n >= 0, the function returns at most n
// matches/submatches; otherwise, it returns all of them.
//
// If 'String' is present, the argument is a string; otherwise it is a slice
// of bytes; return values are adjusted as appropriate.
//
// If 'Submatch' is present, the return value is a slice identifying the
// successive submatches of the expression. Submatches are matches of
// parenthesized subexpressions (also known as capturing groups) within the
// regular expression, numbered from left to right in order of opening
// parenthesis. Submatch 0 is the match of the entire expression, submatch 1
// the match of the first parenthesized subexpression, and so on.
//
// If 'Index' is present, matches and submatches are identified by byte index
// pairs within the input string: result[2*n:2*n+2] identifies the indexes of
// the nth submatch. The pair for n==0 identifies the match of the entire
// expression. If 'Index' is not present, the match is identified by the text
// of the match/submatch. If an index is negative or text is nil, it means that
// subexpression did not match any string in the input. For 'String' versions
// an empty string means either no match or an empty match.
//
// There is also a subset of the methods that can be applied to text read
// from a RuneReader:
//
//	MatchReader, FindReaderIndex, FindReaderSubmatchIndex
//
// This set may grow. Note that regular expression matches may need to
// examine text beyond the text returned by a match, so the methods that
// match text from a RuneReader may read arbitrarily far into the input
// before returning.
//
// (There are a few other methods that do not match this pattern.)
package regexp

import (
	"bytes"
	"io"
	"regexp/syntax"
	"strconv"
	"strings"
	"unicode"
	"unicode/utf8"
)

// Regexp is the representation of a compiled regular expression.
// A Regexp is safe for concurrent use by multiple goroutines,
// except for configuration methods, such as Longest.
83 type Regexp struct { 84 expr string // as passed to Compile 85 prog *syntax.Prog // compiled program 86 onepass *onePassProg // onepass program or nil 87 numSubexp int 88 maxBitStateLen int 89 subexpNames []string 90 prefix string // required prefix in unanchored matches 91 prefixBytes []byte // prefix, as a []byte 92 prefixRune rune // first rune in prefix 93 prefixEnd uint32 // pc for last rune in prefix 94 mpool int // pool for machines 95 matchcap int // size of recorded match lengths 96 prefixComplete bool // prefix is the entire regexp 97 cond syntax.EmptyOp // empty-width conditions required at start of match 98 minInputLen int // minimum length of the input in bytes 99 100 // This field can be modified by the Longest method, 101 // but it is otherwise read-only. 102 longest bool // whether regexp prefers leftmost-longest match 103 } 104 105 // String returns the source text used to compile the regular expression. 106 func (re *Regexp) String() string { 107 return re.expr 108 } 109 110 // Copy returns a new Regexp object copied from re. 111 // Calling Longest on one copy does not affect another. 112 // 113 // Deprecated: In earlier releases, when using a Regexp in multiple goroutines, 114 // giving each goroutine its own copy helped to avoid lock contention. 115 // As of Go 1.12, using Copy is no longer necessary to avoid lock contention. 116 // Copy may still be appropriate if the reason for its use is to make 117 // two copies with different Longest settings. 118 func (re *Regexp) Copy() *Regexp { 119 re2 := *re 120 return &re2 121 } 122 123 // Compile parses a regular expression and returns, if successful, 124 // a Regexp object that can be used to match against text. 125 // 126 // When matching against text, the regexp returns a match that 127 // begins as early as possible in the input (leftmost), and among those 128 // it chooses the one that a backtracking search would have found first. 
129 // This so-called leftmost-first matching is the same semantics 130 // that Perl, Python, and other implementations use, although this 131 // package implements it without the expense of backtracking. 132 // For POSIX leftmost-longest matching, see CompilePOSIX. 133 func Compile(expr string) (*Regexp, error) { 134 return compile(expr, syntax.Perl, false) 135 } 136 137 // CompilePOSIX is like Compile but restricts the regular expression 138 // to POSIX ERE (egrep) syntax and changes the match semantics to 139 // leftmost-longest. 140 // 141 // That is, when matching against text, the regexp returns a match that 142 // begins as early as possible in the input (leftmost), and among those 143 // it chooses a match that is as long as possible. 144 // This so-called leftmost-longest matching is the same semantics 145 // that early regular expression implementations used and that POSIX 146 // specifies. 147 // 148 // However, there can be multiple leftmost-longest matches, with different 149 // submatch choices, and here this package diverges from POSIX. 150 // Among the possible leftmost-longest matches, this package chooses 151 // the one that a backtracking search would have found first, while POSIX 152 // specifies that the match be chosen to maximize the length of the first 153 // subexpression, then the second, and so on from left to right. 154 // The POSIX rule is computationally prohibitive and not even well-defined. 155 // See https://swtch.com/~rsc/regexp/regexp2.html#posix for details. 156 func CompilePOSIX(expr string) (*Regexp, error) { 157 return compile(expr, syntax.POSIX, true) 158 } 159 160 // Longest makes future searches prefer the leftmost-longest match. 161 // That is, when matching against text, the regexp returns a match that 162 // begins as early as possible in the input (leftmost), and among those 163 // it chooses a match that is as long as possible. 
164 // This method modifies the Regexp and may not be called concurrently 165 // with any other methods. 166 func (re *Regexp) Longest() { 167 re.longest = true 168 } 169 170 func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) { 171 re, err := syntax.Parse(expr, mode) 172 if err != nil { 173 return nil, err 174 } 175 maxCap := re.MaxCap() 176 capNames := re.CapNames() 177 178 re = re.Simplify() 179 prog, err := syntax.Compile(re) 180 if err != nil { 181 return nil, err 182 } 183 matchcap := prog.NumCap 184 if matchcap < 2 { 185 matchcap = 2 186 } 187 regexp := &Regexp{ 188 expr: expr, 189 prog: prog, 190 onepass: compileOnePass(prog), 191 numSubexp: maxCap, 192 subexpNames: capNames, 193 cond: prog.StartCond(), 194 longest: longest, 195 matchcap: matchcap, 196 minInputLen: minInputLen(re), 197 } 198 if regexp.onepass == nil { 199 regexp.prefix, regexp.prefixComplete = prog.Prefix() 200 regexp.maxBitStateLen = maxBitStateLen(prog) 201 } else { 202 regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog) 203 } 204 if regexp.prefix != "" { 205 // TODO(rsc): Remove this allocation by adding 206 // IndexString to package bytes. 207 regexp.prefixBytes = []byte(regexp.prefix) 208 regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix) 209 } 210 211 n := len(prog.Inst) 212 i := 0 213 for matchSize[i] != 0 && matchSize[i] < n { 214 i++ 215 } 216 regexp.mpool = i 217 218 return regexp, nil 219 } 220 221 // Pools of *machine for use during (*Regexp).doExecute, 222 // split up by the size of the execution queues. 223 // matchPool[i] machines have queue size matchSize[i]. 224 // On a 64-bit system each queue entry is 16 bytes, 225 // so matchPool[0] has 16*2*128 = 4kB queues, etc. 226 // The final matchPool is a catch-all for very large queues. 
227 var ( 228 matchSize = [...]int{128, 512, 2048, 16384, 0} 229 // XXX sync not yet supported 230 // matchPool [len(matchSize)]sync.Pool 231 ) 232 233 // get returns a machine to use for matching re. 234 // It uses the re's machine cache if possible, to avoid 235 // unnecessary allocation. 236 func (re *Regexp) get() *machine { 237 // m, ok := matchPool[re.mpool].Get().(*machine) 238 // if !ok { 239 m := new(machine) 240 //} 241 m.re = re 242 m.p = re.prog 243 if cap(m.matchcap) < re.matchcap { 244 m.matchcap = make([]int, re.matchcap) 245 for _, t := range m.pool { 246 t.cap = make([]int, re.matchcap) 247 } 248 } 249 250 // Allocate queues if needed. 251 // Or reallocate, for "large" match pool. 252 n := matchSize[re.mpool] 253 if n == 0 { // large pool 254 n = len(re.prog.Inst) 255 } 256 if len(m.q0.sparse) < n { 257 m.q0 = queue{make([]uint32, n), make([]entry, 0, n)} 258 m.q1 = queue{make([]uint32, n), make([]entry, 0, n)} 259 } 260 return m 261 } 262 263 // put returns a machine to the correct machine pool. 
264 func (re *Regexp) put(m *machine) { 265 m.re = nil 266 m.p = nil 267 m.inputs.clear() 268 // matchPool[re.mpool].Put(m) 269 } 270 271 // minInputLen walks the regexp to find the minimum length of any matchable input 272 func minInputLen(re *syntax.Regexp) int { 273 switch re.Op { 274 default: 275 return 0 276 case syntax.OpAnyChar, syntax.OpAnyCharNotNL, syntax.OpCharClass: 277 return 1 278 case syntax.OpLiteral: 279 l := 0 280 for _, r := range re.Rune { 281 l += utf8.RuneLen(r) 282 } 283 return l 284 case syntax.OpCapture, syntax.OpPlus: 285 return minInputLen(re.Sub[0]) 286 case syntax.OpRepeat: 287 return re.Min * minInputLen(re.Sub[0]) 288 case syntax.OpConcat: 289 l := 0 290 for _, sub := range re.Sub { 291 l += minInputLen(sub) 292 } 293 return l 294 case syntax.OpAlternate: 295 l := minInputLen(re.Sub[0]) 296 var lnext int 297 for _, sub := range re.Sub[1:] { 298 lnext = minInputLen(sub) 299 if lnext < l { 300 l = lnext 301 } 302 } 303 return l 304 } 305 } 306 307 // MustCompile is like Compile but panics if the expression cannot be parsed. 308 // It simplifies safe initialization of global variables holding compiled regular 309 // expressions. 310 func MustCompile(str string) *Regexp { 311 regexp, err := Compile(str) 312 if err != nil { 313 panic(`regexp: Compile(` + quote(str) + `): ` + err.Error()) 314 } 315 return regexp 316 } 317 318 // MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed. 319 // It simplifies safe initialization of global variables holding compiled regular 320 // expressions. 321 func MustCompilePOSIX(str string) *Regexp { 322 regexp, err := CompilePOSIX(str) 323 if err != nil { 324 panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + err.Error()) 325 } 326 return regexp 327 } 328 329 func quote(s string) string { 330 if strconv.CanBackquote(s) { 331 return "`" + s + "`" 332 } 333 return strconv.Quote(s) 334 } 335 336 // NumSubexp returns the number of parenthesized subexpressions in this Regexp. 
337 func (re *Regexp) NumSubexp() int { 338 return re.numSubexp 339 } 340 341 // SubexpNames returns the names of the parenthesized subexpressions 342 // in this Regexp. The name for the first sub-expression is names[1], 343 // so that if m is a match slice, the name for m[i] is SubexpNames()[i]. 344 // Since the Regexp as a whole cannot be named, names[0] is always 345 // the empty string. The slice should not be modified. 346 func (re *Regexp) SubexpNames() []string { 347 return re.subexpNames 348 } 349 350 // SubexpIndex returns the index of the first subexpression with the given name, 351 // or -1 if there is no subexpression with that name. 352 // 353 // Note that multiple subexpressions can be written using the same name, as in 354 // (?P<bob>a+)(?P<bob>b+), which declares two subexpressions named "bob". 355 // In this case, SubexpIndex returns the index of the leftmost such subexpression 356 // in the regular expression. 357 func (re *Regexp) SubexpIndex(name string) int { 358 if name != "" { 359 for i, s := range re.subexpNames { 360 if name == s { 361 return i 362 } 363 } 364 } 365 return -1 366 } 367 368 const endOfText rune = -1 369 370 // input abstracts different representations of the input text. It provides 371 // one-character lookahead. 372 type input interface { 373 step(pos int) (r rune, width int) // advance one rune 374 canCheckPrefix() bool // can we look ahead without losing info? 375 hasPrefix(re *Regexp) bool 376 index(re *Regexp, pos int) int 377 context(pos int) lazyFlag 378 } 379 380 // inputString scans a string. 
381 type inputString struct { 382 str string 383 } 384 385 func (i *inputString) step(pos int) (rune, int) { 386 if pos < len(i.str) { 387 c := i.str[pos] 388 if c < utf8.RuneSelf { 389 return rune(c), 1 390 } 391 return utf8.DecodeRuneInString(i.str[pos:]) 392 } 393 return endOfText, 0 394 } 395 396 func (i *inputString) canCheckPrefix() bool { 397 return true 398 } 399 400 func (i *inputString) hasPrefix(re *Regexp) bool { 401 return strings.HasPrefix(i.str, re.prefix) 402 } 403 404 func (i *inputString) index(re *Regexp, pos int) int { 405 return strings.Index(i.str[pos:], re.prefix) 406 } 407 408 func (i *inputString) context(pos int) lazyFlag { 409 r1, r2 := endOfText, endOfText 410 // 0 < pos && pos <= len(i.str) 411 if uint(pos-1) < uint(len(i.str)) { 412 r1 = rune(i.str[pos-1]) 413 if r1 >= utf8.RuneSelf { 414 r1, _ = utf8.DecodeLastRuneInString(i.str[:pos]) 415 } 416 } 417 // 0 <= pos && pos < len(i.str) 418 if uint(pos) < uint(len(i.str)) { 419 r2 = rune(i.str[pos]) 420 if r2 >= utf8.RuneSelf { 421 r2, _ = utf8.DecodeRuneInString(i.str[pos:]) 422 } 423 } 424 return newLazyFlag(r1, r2) 425 } 426 427 // inputBytes scans a byte slice. 
428 type inputBytes struct { 429 str []byte 430 } 431 432 func (i *inputBytes) step(pos int) (rune, int) { 433 if pos < len(i.str) { 434 c := i.str[pos] 435 if c < utf8.RuneSelf { 436 return rune(c), 1 437 } 438 return utf8.DecodeRune(i.str[pos:]) 439 } 440 return endOfText, 0 441 } 442 443 func (i *inputBytes) canCheckPrefix() bool { 444 return true 445 } 446 447 func (i *inputBytes) hasPrefix(re *Regexp) bool { 448 return bytes.HasPrefix(i.str, re.prefixBytes) 449 } 450 451 func (i *inputBytes) index(re *Regexp, pos int) int { 452 return bytes.Index(i.str[pos:], re.prefixBytes) 453 } 454 455 func (i *inputBytes) context(pos int) lazyFlag { 456 r1, r2 := endOfText, endOfText 457 // 0 < pos && pos <= len(i.str) 458 if uint(pos-1) < uint(len(i.str)) { 459 r1 = rune(i.str[pos-1]) 460 if r1 >= utf8.RuneSelf { 461 r1, _ = utf8.DecodeLastRune(i.str[:pos]) 462 } 463 } 464 // 0 <= pos && pos < len(i.str) 465 if uint(pos) < uint(len(i.str)) { 466 r2 = rune(i.str[pos]) 467 if r2 >= utf8.RuneSelf { 468 r2, _ = utf8.DecodeRune(i.str[pos:]) 469 } 470 } 471 return newLazyFlag(r1, r2) 472 } 473 474 // inputReader scans a RuneReader. 475 type inputReader struct { 476 r io.RuneReader 477 atEOT bool 478 pos int 479 } 480 481 func (i *inputReader) step(pos int) (rune, int) { 482 if !i.atEOT && pos != i.pos { 483 return endOfText, 0 484 } 485 r, w, err := i.r.ReadRune() 486 if err != nil { 487 i.atEOT = true 488 return endOfText, 0 489 } 490 i.pos += w 491 return r, w 492 } 493 494 func (i *inputReader) canCheckPrefix() bool { 495 return false 496 } 497 498 func (i *inputReader) hasPrefix(re *Regexp) bool { 499 return false 500 } 501 502 func (i *inputReader) index(re *Regexp, pos int) int { 503 return -1 504 } 505 506 func (i *inputReader) context(pos int) lazyFlag { 507 return 0 // not used 508 } 509 510 // LiteralPrefix returns a literal string that must begin any match 511 // of the regular expression re. 
It returns the boolean true if the 512 // literal string comprises the entire regular expression. 513 func (re *Regexp) LiteralPrefix() (prefix string, complete bool) { 514 return re.prefix, re.prefixComplete 515 } 516 517 // MatchReader reports whether the text returned by the RuneReader 518 // contains any match of the regular expression re. 519 func (re *Regexp) MatchReader(r io.RuneReader) bool { 520 return re.doMatch(r, nil, "") 521 } 522 523 // MatchString reports whether the string s 524 // contains any match of the regular expression re. 525 func (re *Regexp) MatchString(s string) bool { 526 return re.doMatch(nil, nil, s) 527 } 528 529 // Match reports whether the byte slice b 530 // contains any match of the regular expression re. 531 func (re *Regexp) Match(b []byte) bool { 532 return re.doMatch(nil, b, "") 533 } 534 535 // MatchReader reports whether the text returned by the RuneReader 536 // contains any match of the regular expression pattern. 537 // More complicated queries need to use Compile and the full Regexp interface. 538 func MatchReader(pattern string, r io.RuneReader) (matched bool, err error) { 539 re, err := Compile(pattern) 540 if err != nil { 541 return false, err 542 } 543 return re.MatchReader(r), nil 544 } 545 546 // MatchString reports whether the string s 547 // contains any match of the regular expression pattern. 548 // More complicated queries need to use Compile and the full Regexp interface. 549 func MatchString(pattern string, s string) (matched bool, err error) { 550 re, err := Compile(pattern) 551 if err != nil { 552 return false, err 553 } 554 return re.MatchString(s), nil 555 } 556 557 // Match reports whether the byte slice b 558 // contains any match of the regular expression pattern. 559 // More complicated queries need to use Compile and the full Regexp interface. 
560 func Match(pattern string, b []byte) (matched bool, err error) { 561 re, err := Compile(pattern) 562 if err != nil { 563 return false, err 564 } 565 return re.Match(b), nil 566 } 567 568 // ReplaceAllString returns a copy of src, replacing matches of the Regexp 569 // with the replacement string repl. Inside repl, $ signs are interpreted as 570 // in Expand, so for instance $1 represents the text of the first submatch. 571 func (re *Regexp) ReplaceAllString(src, repl string) string { 572 n := 2 573 if strings.Contains(repl, "$") { 574 n = 2 * (re.numSubexp + 1) 575 } 576 b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte { 577 return re.expand(dst, repl, nil, src, match) 578 }) 579 return string(b) 580 } 581 582 // ReplaceAllLiteralString returns a copy of src, replacing matches of the Regexp 583 // with the replacement string repl. The replacement repl is substituted directly, 584 // without using Expand. 585 func (re *Regexp) ReplaceAllLiteralString(src, repl string) string { 586 return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { 587 return append(dst, repl...) 588 })) 589 } 590 591 // ReplaceAllStringFunc returns a copy of src in which all matches of the 592 // Regexp have been replaced by the return value of function repl applied 593 // to the matched substring. The replacement returned by repl is substituted 594 // directly, without using Expand. 595 func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string { 596 b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { 597 return append(dst, repl(src[match[0]:match[1]])...) 
598 }) 599 return string(b) 600 } 601 602 func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte { 603 lastMatchEnd := 0 // end position of the most recent match 604 searchPos := 0 // position where we next look for a match 605 var buf []byte 606 var endPos int 607 if bsrc != nil { 608 endPos = len(bsrc) 609 } else { 610 endPos = len(src) 611 } 612 if nmatch > re.prog.NumCap { 613 nmatch = re.prog.NumCap 614 } 615 616 var dstCap [2]int 617 for searchPos <= endPos { 618 a := re.doExecute(nil, bsrc, src, searchPos, nmatch, dstCap[:0]) 619 if len(a) == 0 { 620 break // no more matches 621 } 622 623 // Copy the unmatched characters before this match. 624 if bsrc != nil { 625 buf = append(buf, bsrc[lastMatchEnd:a[0]]...) 626 } else { 627 buf = append(buf, src[lastMatchEnd:a[0]]...) 628 } 629 630 // Now insert a copy of the replacement string, but not for a 631 // match of the empty string immediately after another match. 632 // (Otherwise, we get double replacement for patterns that 633 // match both empty and nonempty strings.) 634 if a[1] > lastMatchEnd || a[0] == 0 { 635 buf = repl(buf, a) 636 } 637 lastMatchEnd = a[1] 638 639 // Advance past this match; always advance at least one character. 640 var width int 641 if bsrc != nil { 642 _, width = utf8.DecodeRune(bsrc[searchPos:]) 643 } else { 644 _, width = utf8.DecodeRuneInString(src[searchPos:]) 645 } 646 if searchPos+width > a[1] { 647 searchPos += width 648 } else if searchPos+1 > a[1] { 649 // This clause is only needed at the end of the input 650 // string. In that case, DecodeRuneInString returns width=0. 651 searchPos++ 652 } else { 653 searchPos = a[1] 654 } 655 } 656 657 // Copy the unmatched characters after the last match. 658 if bsrc != nil { 659 buf = append(buf, bsrc[lastMatchEnd:]...) 660 } else { 661 buf = append(buf, src[lastMatchEnd:]...) 
662 } 663 664 return buf 665 } 666 667 // ReplaceAll returns a copy of src, replacing matches of the Regexp 668 // with the replacement text repl. Inside repl, $ signs are interpreted as 669 // in Expand, so for instance $1 represents the text of the first submatch. 670 func (re *Regexp) ReplaceAll(src, repl []byte) []byte { 671 n := 2 672 if bytes.IndexByte(repl, '$') >= 0 { 673 n = 2 * (re.numSubexp + 1) 674 } 675 srepl := "" 676 b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte { 677 if len(srepl) != len(repl) { 678 srepl = string(repl) 679 } 680 return re.expand(dst, srepl, src, "", match) 681 }) 682 return b 683 } 684 685 // ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp 686 // with the replacement bytes repl. The replacement repl is substituted directly, 687 // without using Expand. 688 func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte { 689 return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { 690 return append(dst, repl...) 691 }) 692 } 693 694 // ReplaceAllFunc returns a copy of src in which all matches of the 695 // Regexp have been replaced by the return value of function repl applied 696 // to the matched byte slice. The replacement returned by repl is substituted 697 // directly, without using Expand. 698 func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { 699 return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { 700 return append(dst, repl(src[match[0]:match[1]])...) 701 }) 702 } 703 704 // Bitmap used by func special to check whether a character needs to be escaped. 705 var specialBytes [16]byte 706 707 // special reports whether byte b needs to be escaped by QuoteMeta. 
708 func special(b byte) bool { 709 return b < utf8.RuneSelf && specialBytes[b%16]&(1<<(b/16)) != 0 710 } 711 712 func init() { 713 for _, b := range []byte(`\.+*?()|[]{}^$`) { 714 specialBytes[b%16] |= 1 << (b / 16) 715 } 716 } 717 718 // QuoteMeta returns a string that escapes all regular expression metacharacters 719 // inside the argument text; the returned string is a regular expression matching 720 // the literal text. 721 func QuoteMeta(s string) string { 722 // A byte loop is correct because all metacharacters are ASCII. 723 var i int 724 for i = 0; i < len(s); i++ { 725 if special(s[i]) { 726 break 727 } 728 } 729 // No meta characters found, so return original string. 730 if i >= len(s) { 731 return s 732 } 733 734 b := make([]byte, 2*len(s)-i) 735 copy(b, s[:i]) 736 j := i 737 for ; i < len(s); i++ { 738 if special(s[i]) { 739 b[j] = '\\' 740 j++ 741 } 742 b[j] = s[i] 743 j++ 744 } 745 return string(b[:j]) 746 } 747 748 // The number of capture values in the program may correspond 749 // to fewer capturing expressions than are in the regexp. 750 // For example, "(a){0}" turns into an empty program, so the 751 // maximum capture in the program is 0 but we need to return 752 // an expression for \1. Pad appends -1s to the slice a as needed. 753 func (re *Regexp) pad(a []int) []int { 754 if a == nil { 755 // No match. 756 return nil 757 } 758 n := (1 + re.numSubexp) * 2 759 for len(a) < n { 760 a = append(a, -1) 761 } 762 return a 763 } 764 765 // allMatches calls deliver at most n times 766 // with the location of successive matches in the input text. 767 // The input text is b if non-nil, otherwise s. 
768 func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) { 769 var end int 770 if b == nil { 771 end = len(s) 772 } else { 773 end = len(b) 774 } 775 776 for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; { 777 matches := re.doExecute(nil, b, s, pos, re.prog.NumCap, nil) 778 if len(matches) == 0 { 779 break 780 } 781 782 accept := true 783 if matches[1] == pos { 784 // We've found an empty match. 785 if matches[0] == prevMatchEnd { 786 // We don't allow an empty match right 787 // after a previous match, so ignore it. 788 accept = false 789 } 790 var width int 791 // TODO: use step() 792 if b == nil { 793 _, width = utf8.DecodeRuneInString(s[pos:end]) 794 } else { 795 _, width = utf8.DecodeRune(b[pos:end]) 796 } 797 if width > 0 { 798 pos += width 799 } else { 800 pos = end + 1 801 } 802 } else { 803 pos = matches[1] 804 } 805 prevMatchEnd = matches[1] 806 807 if accept { 808 deliver(re.pad(matches)) 809 i++ 810 } 811 } 812 } 813 814 // Find returns a slice holding the text of the leftmost match in b of the regular expression. 815 // A return value of nil indicates no match. 816 func (re *Regexp) Find(b []byte) []byte { 817 var dstCap [2]int 818 a := re.doExecute(nil, b, "", 0, 2, dstCap[:0]) 819 if a == nil { 820 return nil 821 } 822 return b[a[0]:a[1]:a[1]] 823 } 824 825 // FindIndex returns a two-element slice of integers defining the location of 826 // the leftmost match in b of the regular expression. The match itself is at 827 // b[loc[0]:loc[1]]. 828 // A return value of nil indicates no match. 829 func (re *Regexp) FindIndex(b []byte) (loc []int) { 830 a := re.doExecute(nil, b, "", 0, 2, nil) 831 if a == nil { 832 return nil 833 } 834 return a[0:2] 835 } 836 837 // FindString returns a string holding the text of the leftmost match in s of the regular 838 // expression. 
If there is no match, the return value is an empty string, 839 // but it will also be empty if the regular expression successfully matches 840 // an empty string. Use FindStringIndex or FindStringSubmatch if it is 841 // necessary to distinguish these cases. 842 func (re *Regexp) FindString(s string) string { 843 var dstCap [2]int 844 a := re.doExecute(nil, nil, s, 0, 2, dstCap[:0]) 845 if a == nil { 846 return "" 847 } 848 return s[a[0]:a[1]] 849 } 850 851 // FindStringIndex returns a two-element slice of integers defining the 852 // location of the leftmost match in s of the regular expression. The match 853 // itself is at s[loc[0]:loc[1]]. 854 // A return value of nil indicates no match. 855 func (re *Regexp) FindStringIndex(s string) (loc []int) { 856 a := re.doExecute(nil, nil, s, 0, 2, nil) 857 if a == nil { 858 return nil 859 } 860 return a[0:2] 861 } 862 863 // FindReaderIndex returns a two-element slice of integers defining the 864 // location of the leftmost match of the regular expression in text read from 865 // the RuneReader. The match text was found in the input stream at 866 // byte offset loc[0] through loc[1]-1. 867 // A return value of nil indicates no match. 868 func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) { 869 a := re.doExecute(r, nil, "", 0, 2, nil) 870 if a == nil { 871 return nil 872 } 873 return a[0:2] 874 } 875 876 // FindSubmatch returns a slice of slices holding the text of the leftmost 877 // match of the regular expression in b and the matches, if any, of its 878 // subexpressions, as defined by the 'Submatch' descriptions in the package 879 // comment. 880 // A return value of nil indicates no match. 
881 func (re *Regexp) FindSubmatch(b []byte) [][]byte { 882 var dstCap [4]int 883 a := re.doExecute(nil, b, "", 0, re.prog.NumCap, dstCap[:0]) 884 if a == nil { 885 return nil 886 } 887 ret := make([][]byte, 1+re.numSubexp) 888 for i := range ret { 889 if 2*i < len(a) && a[2*i] >= 0 { 890 ret[i] = b[a[2*i]:a[2*i+1]:a[2*i+1]] 891 } 892 } 893 return ret 894 } 895 896 // Expand appends template to dst and returns the result; during the 897 // append, Expand replaces variables in the template with corresponding 898 // matches drawn from src. The match slice should have been returned by 899 // FindSubmatchIndex. 900 // 901 // In the template, a variable is denoted by a substring of the form 902 // $name or ${name}, where name is a non-empty sequence of letters, 903 // digits, and underscores. A purely numeric name like $1 refers to 904 // the submatch with the corresponding index; other names refer to 905 // capturing parentheses named with the (?P<name>...) syntax. A 906 // reference to an out of range or unmatched index or a name that is not 907 // present in the regular expression is replaced with an empty slice. 908 // 909 // In the $name form, name is taken to be as long as possible: $1x is 910 // equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0. 911 // 912 // To insert a literal $ in the output, use $$ in the template. 913 func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte { 914 return re.expand(dst, string(template), src, "", match) 915 } 916 917 // ExpandString is like Expand but the template and source are strings. 918 // It appends to and returns a byte slice in order to give the calling 919 // code control over allocation. 
920 func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte { 921 return re.expand(dst, template, nil, src, match) 922 } 923 924 func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte { 925 for len(template) > 0 { 926 i := strings.Index(template, "$") 927 if i < 0 { 928 break 929 } 930 dst = append(dst, template[:i]...) 931 template = template[i:] 932 if len(template) > 1 && template[1] == '$' { 933 // Treat $$ as $. 934 dst = append(dst, '$') 935 template = template[2:] 936 continue 937 } 938 name, num, rest, ok := extract(template) 939 if !ok { 940 // Malformed; treat $ as raw text. 941 dst = append(dst, '$') 942 template = template[1:] 943 continue 944 } 945 template = rest 946 if num >= 0 { 947 if 2*num+1 < len(match) && match[2*num] >= 0 { 948 if bsrc != nil { 949 dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...) 950 } else { 951 dst = append(dst, src[match[2*num]:match[2*num+1]]...) 952 } 953 } 954 } else { 955 for i, namei := range re.subexpNames { 956 if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 { 957 if bsrc != nil { 958 dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...) 959 } else { 960 dst = append(dst, src[match[2*i]:match[2*i+1]]...) 961 } 962 break 963 } 964 } 965 } 966 } 967 dst = append(dst, template...) 968 return dst 969 } 970 971 // extract returns the name from a leading "$name" or "${name}" in str. 972 // If it is a number, extract returns num set to that number; otherwise num = -1. 
973 func extract(str string) (name string, num int, rest string, ok bool) { 974 if len(str) < 2 || str[0] != '$' { 975 return 976 } 977 brace := false 978 if str[1] == '{' { 979 brace = true 980 str = str[2:] 981 } else { 982 str = str[1:] 983 } 984 i := 0 985 for i < len(str) { 986 r, size := utf8.DecodeRuneInString(str[i:]) 987 if !unicode.IsLetter(r) && !unicode.IsDigit(r) && r != '_' { 988 break 989 } 990 i += size 991 } 992 if i == 0 { 993 // empty name is not okay 994 return 995 } 996 name = str[:i] 997 if brace { 998 if i >= len(str) || str[i] != '}' { 999 // missing closing brace 1000 return 1001 } 1002 i++ 1003 } 1004 1005 // Parse number. 1006 num = 0 1007 for i := 0; i < len(name); i++ { 1008 if name[i] < '0' || '9' < name[i] || num >= 1e8 { 1009 num = -1 1010 break 1011 } 1012 num = num*10 + int(name[i]) - '0' 1013 } 1014 // Disallow leading zeros. 1015 if name[0] == '0' && len(name) > 1 { 1016 num = -1 1017 } 1018 1019 rest = str[i:] 1020 ok = true 1021 return 1022 } 1023 1024 // FindSubmatchIndex returns a slice holding the index pairs identifying the 1025 // leftmost match of the regular expression in b and the matches, if any, of 1026 // its subexpressions, as defined by the 'Submatch' and 'Index' descriptions 1027 // in the package comment. 1028 // A return value of nil indicates no match. 1029 func (re *Regexp) FindSubmatchIndex(b []byte) []int { 1030 return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap, nil)) 1031 } 1032 1033 // FindStringSubmatch returns a slice of strings holding the text of the 1034 // leftmost match of the regular expression in s and the matches, if any, of 1035 // its subexpressions, as defined by the 'Submatch' description in the 1036 // package comment. 1037 // A return value of nil indicates no match. 
1038 func (re *Regexp) FindStringSubmatch(s string) []string { 1039 var dstCap [4]int 1040 a := re.doExecute(nil, nil, s, 0, re.prog.NumCap, dstCap[:0]) 1041 if a == nil { 1042 return nil 1043 } 1044 ret := make([]string, 1+re.numSubexp) 1045 for i := range ret { 1046 if 2*i < len(a) && a[2*i] >= 0 { 1047 ret[i] = s[a[2*i]:a[2*i+1]] 1048 } 1049 } 1050 return ret 1051 } 1052 1053 // FindStringSubmatchIndex returns a slice holding the index pairs 1054 // identifying the leftmost match of the regular expression in s and the 1055 // matches, if any, of its subexpressions, as defined by the 'Submatch' and 1056 // 'Index' descriptions in the package comment. 1057 // A return value of nil indicates no match. 1058 func (re *Regexp) FindStringSubmatchIndex(s string) []int { 1059 return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap, nil)) 1060 } 1061 1062 // FindReaderSubmatchIndex returns a slice holding the index pairs 1063 // identifying the leftmost match of the regular expression of text read by 1064 // the RuneReader, and the matches, if any, of its subexpressions, as defined 1065 // by the 'Submatch' and 'Index' descriptions in the package comment. A 1066 // return value of nil indicates no match. 1067 func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int { 1068 return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap, nil)) 1069 } 1070 1071 const startSize = 10 // The size at which to start a slice in the 'All' routines. 1072 1073 // FindAll is the 'All' version of Find; it returns a slice of all successive 1074 // matches of the expression, as defined by the 'All' description in the 1075 // package comment. 1076 // A return value of nil indicates no match. 
1077 func (re *Regexp) FindAll(b []byte, n int) [][]byte { 1078 if n < 0 { 1079 n = len(b) + 1 1080 } 1081 var result [][]byte 1082 re.allMatches("", b, n, func(match []int) { 1083 if result == nil { 1084 result = make([][]byte, 0, startSize) 1085 } 1086 result = append(result, b[match[0]:match[1]:match[1]]) 1087 }) 1088 return result 1089 } 1090 1091 // FindAllIndex is the 'All' version of FindIndex; it returns a slice of all 1092 // successive matches of the expression, as defined by the 'All' description 1093 // in the package comment. 1094 // A return value of nil indicates no match. 1095 func (re *Regexp) FindAllIndex(b []byte, n int) [][]int { 1096 if n < 0 { 1097 n = len(b) + 1 1098 } 1099 var result [][]int 1100 re.allMatches("", b, n, func(match []int) { 1101 if result == nil { 1102 result = make([][]int, 0, startSize) 1103 } 1104 result = append(result, match[0:2]) 1105 }) 1106 return result 1107 } 1108 1109 // FindAllString is the 'All' version of FindString; it returns a slice of all 1110 // successive matches of the expression, as defined by the 'All' description 1111 // in the package comment. 1112 // A return value of nil indicates no match. 1113 func (re *Regexp) FindAllString(s string, n int) []string { 1114 if n < 0 { 1115 n = len(s) + 1 1116 } 1117 var result []string 1118 re.allMatches(s, nil, n, func(match []int) { 1119 if result == nil { 1120 result = make([]string, 0, startSize) 1121 } 1122 result = append(result, s[match[0]:match[1]]) 1123 }) 1124 return result 1125 } 1126 1127 // FindAllStringIndex is the 'All' version of FindStringIndex; it returns a 1128 // slice of all successive matches of the expression, as defined by the 'All' 1129 // description in the package comment. 1130 // A return value of nil indicates no match. 
1131 func (re *Regexp) FindAllStringIndex(s string, n int) [][]int { 1132 if n < 0 { 1133 n = len(s) + 1 1134 } 1135 var result [][]int 1136 re.allMatches(s, nil, n, func(match []int) { 1137 if result == nil { 1138 result = make([][]int, 0, startSize) 1139 } 1140 result = append(result, match[0:2]) 1141 }) 1142 return result 1143 } 1144 1145 // FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice 1146 // of all successive matches of the expression, as defined by the 'All' 1147 // description in the package comment. 1148 // A return value of nil indicates no match. 1149 func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte { 1150 if n < 0 { 1151 n = len(b) + 1 1152 } 1153 var result [][][]byte 1154 re.allMatches("", b, n, func(match []int) { 1155 if result == nil { 1156 result = make([][][]byte, 0, startSize) 1157 } 1158 slice := make([][]byte, len(match)/2) 1159 for j := range slice { 1160 if match[2*j] >= 0 { 1161 slice[j] = b[match[2*j]:match[2*j+1]:match[2*j+1]] 1162 } 1163 } 1164 result = append(result, slice) 1165 }) 1166 return result 1167 } 1168 1169 // FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns 1170 // a slice of all successive matches of the expression, as defined by the 1171 // 'All' description in the package comment. 1172 // A return value of nil indicates no match. 1173 func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int { 1174 if n < 0 { 1175 n = len(b) + 1 1176 } 1177 var result [][]int 1178 re.allMatches("", b, n, func(match []int) { 1179 if result == nil { 1180 result = make([][]int, 0, startSize) 1181 } 1182 result = append(result, match) 1183 }) 1184 return result 1185 } 1186 1187 // FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it 1188 // returns a slice of all successive matches of the expression, as defined by 1189 // the 'All' description in the package comment. 1190 // A return value of nil indicates no match. 
1191 func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string { 1192 if n < 0 { 1193 n = len(s) + 1 1194 } 1195 var result [][]string 1196 re.allMatches(s, nil, n, func(match []int) { 1197 if result == nil { 1198 result = make([][]string, 0, startSize) 1199 } 1200 slice := make([]string, len(match)/2) 1201 for j := range slice { 1202 if match[2*j] >= 0 { 1203 slice[j] = s[match[2*j]:match[2*j+1]] 1204 } 1205 } 1206 result = append(result, slice) 1207 }) 1208 return result 1209 } 1210 1211 // FindAllStringSubmatchIndex is the 'All' version of 1212 // FindStringSubmatchIndex; it returns a slice of all successive matches of 1213 // the expression, as defined by the 'All' description in the package 1214 // comment. 1215 // A return value of nil indicates no match. 1216 func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int { 1217 if n < 0 { 1218 n = len(s) + 1 1219 } 1220 var result [][]int 1221 re.allMatches(s, nil, n, func(match []int) { 1222 if result == nil { 1223 result = make([][]int, 0, startSize) 1224 } 1225 result = append(result, match) 1226 }) 1227 return result 1228 } 1229 1230 // Split slices s into substrings separated by the expression and returns a slice of 1231 // the substrings between those expression matches. 1232 // 1233 // The slice returned by this method consists of all the substrings of s 1234 // not contained in the slice returned by FindAllString. When called on an expression 1235 // that contains no metacharacters, it is equivalent to strings.SplitN. 1236 // 1237 // Example: 1238 // 1239 // s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5) 1240 // // s: ["", "b", "b", "c", "cadaaae"] 1241 // 1242 // The count determines the number of substrings to return: 1243 // 1244 // n > 0: at most n substrings; the last substring will be the unsplit remainder. 
1245 // n == 0: the result is nil (zero substrings) 1246 // n < 0: all substrings 1247 func (re *Regexp) Split(s string, n int) []string { 1248 if n == 0 { 1249 return nil 1250 } 1251 1252 if len(re.expr) > 0 && len(s) == 0 { 1253 return []string{""} 1254 } 1255 1256 matches := re.FindAllStringIndex(s, n) 1257 strings := make([]string, 0, len(matches)) 1258 1259 beg := 0 1260 end := 0 1261 for _, match := range matches { 1262 if n > 0 && len(strings) >= n-1 { 1263 break 1264 } 1265 1266 end = match[0] 1267 if match[1] != 0 { 1268 strings = append(strings, s[beg:end]) 1269 } 1270 beg = match[1] 1271 } 1272 1273 if end != len(s) { 1274 strings = append(strings, s[beg:]) 1275 } 1276 1277 return strings 1278 }