go.arsenm.dev/pcre@v0.0.0-20220530205550-74594f6c8b0e/pcre.go (about) 1 // Package pcre is a library that provides pcre2 regular expressions 2 // in pure Go, allowing for features such as cross-compiling. 3 // 4 // The lib directory contains source code automatically translated from 5 // pcre2's C source code for each supported architecture and/or OS. 6 // This package wraps the automatically-translated source to provide a 7 // safe interface as close to Go's regexp library as possible. 8 package pcre 9 10 import ( 11 "os" 12 "runtime" 13 "strconv" 14 "sync" 15 "unsafe" 16 17 "go.arsenm.dev/pcre/lib" 18 19 "modernc.org/libc" 20 ) 21 22 // Version returns the version of pcre2 embedded in this library. 23 func Version() string { return lib.DPACKAGE_VERSION } 24 25 // Regexp represents a pcre2 regular expression 26 type Regexp struct { 27 mtx *sync.Mutex 28 expr string 29 re uintptr 30 tls *libc.TLS 31 } 32 33 // Compile runs CompileOpts with no options. 34 // 35 // Close() should be called on the returned expression 36 // once it is no longer needed. 37 func Compile(pattern string) (*Regexp, error) { 38 return CompileOpts(pattern, 0) 39 } 40 41 // CompileOpts compiles the provided pattern using the given options. 42 // 43 // Close() should be called on the returned expression 44 // once it is no longer needed. 45 func CompileOpts(pattern string, options CompileOption) (*Regexp, error) { 46 tls := libc.NewTLS() 47 48 // Get C string of pattern 49 cPattern, err := libc.CString(pattern) 50 if err != nil { 51 return nil, err 52 } 53 // Free the string when done 54 defer libc.Xfree(tls, cPattern) 55 56 // Allocate new error 57 cErr := allocError(tls) 58 // Free error when done 59 defer libc.Xfree(tls, cErr) 60 61 // Get error offsets 62 errPtr := addErrCodeOffset(cErr) 63 errOffsetPtr := addErrOffsetOffset(cErr) 64 65 // Convert pattern length to size_t type 66 cPatLen := lib.Tsize_t(len(pattern)) 67 68 // Compile expression 69 r := lib.Xpcre2_compile_8(tls, cPattern, cPatLen, uint32(options), errPtr, errOffsetPtr, 0) 70 if r == 0 { 71 return nil, ptrToError(tls, cErr) 72 } 73 74 // Create regexp instance 75 regex := Regexp{ 76 expr: pattern, 77 mtx: &sync.Mutex{}, 78 re: r, 79 tls: tls, 80 } 81 82 // Make sure resources are freed if GC collects the 83 // regular expression. 84 runtime.SetFinalizer(®ex, func(r *Regexp) error { 85 return r.Close() 86 }) 87 88 return ®ex, nil 89 } 90 91 // MustCompile compiles the given pattern and panics 92 // if there was an error 93 // 94 // Close() should be called on the returned expression 95 // once it is no longer needed. 96 func MustCompile(pattern string) *Regexp { 97 rgx, err := Compile(pattern) 98 if err != nil { 99 panic(err) 100 } 101 return rgx 102 } 103 104 // MustCompileOpts compiles the given pattern with the given 105 // options and panics if there was an error. 106 // 107 // Close() should be called on the returned expression 108 // once it is no longer needed. 109 func MustCompileOpts(pattern string, options CompileOption) *Regexp { 110 rgx, err := CompileOpts(pattern, options) 111 if err != nil { 112 panic(err) 113 } 114 return rgx 115 } 116 117 // Find returns the leftmost match of the regular expression. 118 // A return value of nil indicates no match. 119 func (r *Regexp) Find(b []byte) []byte { 120 matches, err := r.match(b, 0, false) 121 if err != nil { 122 panic(err) 123 } 124 if len(matches) == 0 { 125 return nil 126 } 127 match := matches[0] 128 return b[match[0]:match[1]] 129 } 130 131 // FindIndex returns a two-element slice of integers 132 // representing the location of the leftmost match of the 133 // regular expression. 134 func (r *Regexp) FindIndex(b []byte) []int { 135 matches, err := r.match(b, 0, false) 136 if err != nil { 137 panic(err) 138 } 139 if len(matches) == 0 { 140 return nil 141 } 142 match := matches[0] 143 144 return []int{int(match[0]), int(match[1])} 145 } 146 147 // FindAll returns all matches of the regular expression. 148 // A return value of nil indicates no match. 149 func (r *Regexp) FindAll(b []byte, n int) [][]byte { 150 matches, err := r.match(b, 0, true) 151 if err != nil { 152 panic(err) 153 } 154 if len(matches) == 0 || n == 0 { 155 return nil 156 } 157 if n > 0 && len(matches) > n { 158 matches = matches[:n] 159 } 160 161 out := make([][]byte, len(matches)) 162 for index, match := range matches { 163 out[index] = b[match[0]:match[1]] 164 } 165 166 return out 167 } 168 169 // FindAll returns indices of all matches of the 170 // regular expression. A return value of nil indicates 171 // no match. 172 func (r *Regexp) FindAllIndex(b []byte, n int) [][]int { 173 matches, err := r.match(b, 0, true) 174 if err != nil { 175 panic(err) 176 } 177 if len(matches) == 0 || n == 0 { 178 return nil 179 } 180 if n > 0 && len(matches) > n { 181 matches = matches[:n] 182 } 183 184 out := make([][]int, len(matches)) 185 for index, match := range matches { 186 out[index] = []int{int(match[0]), int(match[1])} 187 } 188 return out 189 } 190 191 // FindSubmatch returns a slice containing the match as the 192 // first element, and the submatches as the subsequent elements. 193 func (r *Regexp) FindSubmatch(b []byte) [][]byte { 194 matches, err := r.match(b, 0, false) 195 if err != nil { 196 panic(err) 197 } 198 if len(matches) == 0 { 199 return nil 200 } 201 match := matches[0] 202 203 out := make([][]byte, 0, len(match)/2) 204 for i := 0; i < len(match); i += 2 { 205 out = append(out, b[match[i]:match[i+1]]) 206 } 207 return out 208 } 209 210 // FindSubmatchIndex returns a slice of index pairs representing 211 // the match and submatches, if any. 212 func (r *Regexp) FindSubmatchIndex(b []byte) []int { 213 matches, err := r.match(b, 0, false) 214 if err != nil { 215 panic(err) 216 } 217 if len(matches) == 0 { 218 return nil 219 } 220 match := matches[0] 221 222 out := make([]int, len(match)) 223 for index, offset := range match { 224 out[index] = int(offset) 225 } 226 227 return out 228 } 229 230 // FindAllSubmatch returns a slice of all matches and submatches 231 // of the regular expression. It will return no more than n matches. 232 // If n < 0, it will return all matches. 233 func (r *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte { 234 matches, err := r.match(b, 0, true) 235 if err != nil { 236 panic(err) 237 } 238 if len(matches) == 0 || n == 0 { 239 return nil 240 } 241 if n > 0 && len(matches) > n { 242 matches = matches[:n] 243 } 244 245 out := make([][][]byte, len(matches)) 246 for index, match := range matches { 247 outMatch := make([][]byte, 0, len(match)/2) 248 249 for i := 0; i < len(match); i += 2 { 250 outMatch = append(outMatch, b[match[i]:match[i+1]]) 251 } 252 253 out[index] = outMatch 254 } 255 256 return out 257 } 258 259 // FindAllSubmatch returns a slice of all indeces representing the 260 // locations of matches and submatches, if any, of the regular expression. 261 // It will return no more than n matches. If n < 0, it will return all matches. 262 func (r *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int { 263 matches, err := r.match(b, 0, true) 264 if err != nil { 265 panic(err) 266 } 267 if len(matches) == 0 || n == 0 { 268 return nil 269 } 270 if n > 0 && len(matches) > n { 271 matches = matches[:n] 272 } 273 274 out := make([][]int, len(matches)) 275 for index, match := range matches { 276 offsets := make([]int, len(match)) 277 278 for index, offset := range match { 279 offsets[index] = int(offset) 280 } 281 282 out[index] = offsets 283 } 284 285 return out 286 } 287 288 // FindString is the String version of Find 289 func (r *Regexp) FindString(s string) string { 290 return string(r.Find([]byte(s))) 291 } 292 293 // FindStringIndex is the String version of FindIndex 294 func (r *Regexp) FindStringIndex(s string) []int { 295 return r.FindIndex([]byte(s)) 296 } 297 298 // FinAllString is the String version of FindAll 299 func (r *Regexp) FindAllString(s string, n int) []string { 300 matches := r.FindAll([]byte(s), n) 301 302 out := make([]string, len(matches)) 303 for index, match := range matches { 304 out[index] = string(match) 305 } 306 return out 307 } 308 309 // FindAllStringIndex is the String version of FindIndex 310 func (r *Regexp) FindAllStringIndex(s string, n int) [][]int { 311 return r.FindAllIndex([]byte(s), n) 312 } 313 314 // FindStringSubmatch is the string version of FindSubmatch 315 func (r *Regexp) FindStringSubmatch(s string) []string { 316 matches := r.FindSubmatch([]byte(s)) 317 318 out := make([]string, len(matches)) 319 for index, match := range matches { 320 out[index] = string(match) 321 } 322 return out 323 } 324 325 // FindStringSubmatchIndex is the String version of FindSubmatchIndex 326 func (r *Regexp) FindStringSubmatchIndex(s string) []int { 327 return r.FindSubmatchIndex([]byte(s)) 328 } 329 330 // FindAllStringSubmatch is the String version of FindAllSubmatch 331 func (r *Regexp) FindAllStringSubmatch(s string, n int) [][]string { 332 matches := r.FindAllSubmatch([]byte(s), n) 333 334 out := make([][]string, len(matches)) 335 for index, match := range matches { 336 outMatch := make([]string, len(match)) 337 338 for index, byteMatch := range match { 339 outMatch[index] = string(byteMatch) 340 } 341 342 out[index] = outMatch 343 } 344 345 return out 346 } 347 348 // FindAllStringSubmatchIndex is the String version of FindAllSubmatchIndex 349 func (r *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int { 350 return r.FindAllSubmatchIndex([]byte(s), n) 351 } 352 353 // Match reports whether b contains a match of the regular expression 354 func (r *Regexp) Match(b []byte) bool { 355 return r.Find(b) != nil 356 } 357 358 // MatchString is the String version of Match 359 func (r *Regexp) MatchString(s string) bool { 360 return r.Find([]byte(s)) != nil 361 } 362 363 // NumSubexp returns the number of parenthesized subexpressions 364 // in the regular expression. 365 func (r *Regexp) NumSubexp() int { 366 return int(r.patternInfo(lib.DPCRE2_INFO_CAPTURECOUNT)) 367 } 368 369 // ReplaceAll returns a copy of src, replacing matches of the 370 // regular expression with the replacement text repl. 371 // Inside repl, $ signs are interpreted as in Expand, 372 // so for instance $1 represents the text of the first 373 // submatch and $name would represent the text of the 374 // subexpression called "name". 375 func (r *Regexp) ReplaceAll(src, repl []byte) []byte { 376 matches, err := r.match(src, 0, true) 377 if err != nil { 378 panic(err) 379 } 380 if len(matches) == 0 { 381 return src 382 } 383 384 out := make([]byte, len(src)) 385 copy(out, src) 386 387 var diff int64 388 for _, match := range matches { 389 replStr := os.Expand(string(repl), func(s string) string { 390 i, err := strconv.Atoi(s) 391 if err != nil { 392 i = r.SubexpIndex(s) 393 if i == -1 { 394 return "" 395 } 396 } 397 398 // If there given match does not exist, return empty string 399 if i == 0 || len(match) < (2*i)+1 { 400 return "" 401 } 402 403 // Return match 404 return string(src[match[2*i]:match[(2*i)+1]]) 405 }) 406 // Replace replacement string with expanded string 407 repl := []byte(replStr) 408 409 // Replace bytes with new replacement string 410 diff, out = replaceBytes(out, repl, match[0], match[1], diff) 411 } 412 413 return out 414 } 415 416 // ReplaceAllFunc returns a copy of src in which all matches of the 417 // regular expression have been replaced by the return value of function 418 // repl applied to the matched byte slice. The replacement returned by 419 // repl is substituted directly, without using Expand. 420 func (r *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { 421 matches, err := r.match(src, 0, true) 422 if err != nil { 423 panic(err) 424 } 425 if len(matches) == 0 { 426 return src 427 } 428 429 out := make([]byte, len(src)) 430 copy(out, src) 431 432 var diff int64 433 for _, match := range matches { 434 replBytes := repl(src[match[0]:match[1]]) 435 diff, out = replaceBytes(out, replBytes, match[0], match[1], diff) 436 } 437 438 return out 439 } 440 441 // ReplaceAllLiteral returns a copy of src, replacing matches of 442 // the regular expression with the replacement bytes repl. 443 // The replacement is substituted directly, without using Expand. 444 func (r *Regexp) ReplaceAllLiteral(src, repl []byte) []byte { 445 matches, err := r.match(src, 0, true) 446 if err != nil { 447 panic(err) 448 } 449 if len(matches) == 0 { 450 return src 451 } 452 453 out := make([]byte, len(src)) 454 copy(out, src) 455 456 var diff int64 457 for _, match := range matches { 458 diff, out = replaceBytes(out, repl, match[0], match[1], diff) 459 } 460 461 return out 462 } 463 464 // ReplaceAllString is the String version of ReplaceAll 465 func (r *Regexp) ReplaceAllString(src, repl string) string { 466 return string(r.ReplaceAll([]byte(src), []byte(repl))) 467 } 468 469 // ReplaceAllStringFunc is the String version of ReplaceAllFunc 470 func (r *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string { 471 return string(r.ReplaceAllFunc([]byte(src), func(b []byte) []byte { 472 return []byte(repl(string(b))) 473 })) 474 } 475 476 // ReplaceAllLiteralString is the String version of ReplaceAllLiteral 477 func (r *Regexp) ReplaceAllLiteralString(src, repl string) string { 478 return string(r.ReplaceAllLiteral([]byte(src), []byte(repl))) 479 } 480 481 // Split slices s into substrings separated by the 482 // expression and returns a slice of the substrings 483 // between those expression matches. 484 // 485 // Example: 486 // s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5) 487 // // s: ["", "b", "b", "c", "cadaaae"] 488 // The count determines the number of substrings to return: 489 // n > 0: at most n substrings; the last substring will be the unsplit remainder. 490 // n == 0: the result is nil (zero substrings) 491 // n < 0: all substrings 492 func (r *Regexp) Split(s string, n int) []string { 493 if n == 0 { 494 return nil 495 } 496 497 if len(r.expr) > 0 && len(s) == 0 { 498 return []string{""} 499 } 500 501 matches := r.FindAllStringIndex(s, n) 502 strings := make([]string, 0, len(matches)) 503 504 beg := 0 505 end := 0 506 for _, match := range matches { 507 if n > 0 && len(strings) >= n-1 { 508 break 509 } 510 511 end = match[0] 512 if match[1] != 0 { 513 strings = append(strings, s[beg:end]) 514 } 515 beg = match[1] 516 } 517 518 if end != len(s) { 519 strings = append(strings, s[beg:]) 520 } 521 522 return strings 523 } 524 525 // String returns the text of the regular expression 526 // used for compilation. 527 func (r *Regexp) String() string { 528 return r.expr 529 } 530 531 // SubexpIndex returns the index of the subexpression 532 // with the given name, or -1 if there is no subexpression 533 // with that name. 534 func (r *Regexp) SubexpIndex(name string) int { 535 r.mtx.Lock() 536 defer r.mtx.Unlock() 537 538 // Get C string of name 539 cName, err := libc.CString(name) 540 if err != nil { 541 panic(err) 542 } 543 544 // Get substring index from name 545 ret := lib.Xpcre2_substring_number_from_name_8(r.tls, r.re, cName) 546 547 // If no substring error returned, return -1. 548 // If a different error is returned, panic. 549 if ret == lib.DPCRE2_ERROR_NOSUBSTRING { 550 return -1 551 } else if ret < 0 { 552 panic(codeToError(r.tls, ret)) 553 } 554 555 // Return the index of the subexpression 556 return int(ret) 557 } 558 559 // replaceBytes replaces the bytes at a given location, and returns a new 560 // offset, based on how much bigger or smaller the slice got after replacement 561 func replaceBytes(src, repl []byte, sOff, eOff lib.Tsize_t, diff int64) (int64, []byte) { 562 var out []byte 563 out = append( 564 src[:int64(sOff)+diff], 565 append( 566 repl, 567 src[int64(eOff)+diff:]..., 568 )..., 569 ) 570 571 return diff + int64(len(out)-len(src)), out 572 } 573 574 // match calls the underlying pcre match functions. It re-runs the functions 575 // until no matches are found if multi is set to true. 576 func (r *Regexp) match(b []byte, options uint32, multi bool) ([][]lib.Tsize_t, error) { 577 if len(b) == 0 { 578 return nil, nil 579 } 580 581 r.mtx.Lock() 582 defer r.mtx.Unlock() 583 584 // Create a C pointer to the subject 585 sp := unsafe.Pointer(&b[0]) 586 cSubject := uintptr(sp) 587 // Convert the size of the subject to a C size_t type 588 cSubjectLen := lib.Tsize_t(len(b)) 589 590 // Create match data using the pattern to figure out the buffer size 591 md := lib.Xpcre2_match_data_create_from_pattern_8(r.tls, r.re, 0) 592 if md == 0 { 593 panic("error creating match data") 594 } 595 // Free the match data at the end of the function 596 defer lib.Xpcre2_match_data_free_8(r.tls, md) 597 598 var offset lib.Tsize_t 599 var out [][]lib.Tsize_t 600 // While the offset is less than the length of the subject 601 for offset < cSubjectLen { 602 // Execute expression on subject 603 ret := lib.Xpcre2_match_8(r.tls, r.re, cSubject, cSubjectLen, offset, options, md, 0) 604 if ret < 0 { 605 // If no match found, break 606 if ret == lib.DPCRE2_ERROR_NOMATCH { 607 break 608 } 609 610 return nil, codeToError(r.tls, ret) 611 } else { 612 // Get amount of pairs in output vector 613 pairAmt := lib.Xpcre2_get_ovector_count_8(r.tls, md) 614 // Get pointer to output vector 615 ovec := lib.Xpcre2_get_ovector_pointer_8(r.tls, md) 616 // Create a Go slice using the output vector as the underlying array 617 slice := unsafe.Slice((*lib.Tsize_t)(unsafe.Pointer(ovec)), pairAmt*2) 618 619 // Create a new slice and copy the elements from the slice 620 // This is required because the match data will be freed in 621 // a defer, and that would cause a panic every time the slice 622 // is used later. 623 matches := make([]lib.Tsize_t, len(slice)) 624 copy(matches, slice) 625 626 // If the two indices are the same (empty string), and the match is not 627 // immediately after another match, add it to the output and increment the 628 // offset. Otherwise, increment the offset and ignore the match. 629 if slice[0] == slice[1] && len(out) > 0 && slice[0] != out[len(out)-1][1] { 630 out = append(out, matches) 631 offset = slice[1] + 1 632 continue 633 } else if slice[0] == slice[1] { 634 offset = slice[1] + 1 635 continue 636 } 637 638 // Add the match to the output 639 out = append(out, matches) 640 // Set the next offset to the end index of the match 641 offset = matches[1] 642 } 643 644 // If multiple matches disabled, break 645 if !multi { 646 break 647 } 648 } 649 return out, nil 650 } 651 652 // patternInfo calls the underlying pcre pattern info function 653 // and returns information about the compiled regular expression 654 func (r *Regexp) patternInfo(what uint32) (out uint32) { 655 // Create a C pointer to the output integer 656 cOut := uintptr(unsafe.Pointer(&out)) 657 // Get information about the compiled pattern 658 lib.Xpcre2_pattern_info_8(r.tls, r.re, what, cOut) 659 return 660 } 661 662 // Close frees resources used by the regular expression. 663 func (r *Regexp) Close() error { 664 if r == nil { 665 return nil 666 } 667 668 // Close thread-local storage 669 defer r.tls.Close() 670 671 // Free the compiled code 672 lib.Xpcre2_code_free_8(r.tls, r.re) 673 // Set regular expression to null 674 r.re = 0 675 676 return nil 677 }