go.arsenm.dev/pcre@v0.0.0-20220530205550-74594f6c8b0e/pcre.go (about)

     1  // Package pcre is a library that provides pcre2 regular expressions
     2  // in pure Go, allowing for features such as cross-compiling.
     3  //
     4  // The lib directory contains source code automatically translated from
     5  // pcre2's C source code for each supported architecture and/or OS.
     6  // This package wraps the automatically-translated source to provide a
     7  // safe interface as close to Go's regexp library as possible.
     8  package pcre
     9  
    10  import (
    11  	"os"
    12  	"runtime"
    13  	"strconv"
    14  	"sync"
    15  	"unsafe"
    16  
    17  	"go.arsenm.dev/pcre/lib"
    18  
    19  	"modernc.org/libc"
    20  )
    21  
    22  // Version returns the version of pcre2 embedded in this library.
    23  func Version() string { return lib.DPACKAGE_VERSION }
    24  
// Regexp represents a compiled pcre2 regular expression.
//
// Values are created by Compile, CompileOpts, MustCompile and
// MustCompileOpts, and should be released with Close.
type Regexp struct {
	// mtx is held by match and SubexpIndex while they call into lib.
	mtx  *sync.Mutex
	// expr is the pattern text given at compile time; String returns it.
	expr string
	// re is the handle returned by pcre2_compile. Close frees it and
	// resets it to 0.
	re   uintptr
	// tls is the libc thread-local storage passed to every lib call made
	// on behalf of this expression. Close closes it.
	tls  *libc.TLS
}
    32  
    33  // Compile runs CompileOpts with no options.
    34  //
    35  // Close() should be called on the returned expression
    36  // once it is no longer needed.
    37  func Compile(pattern string) (*Regexp, error) {
    38  	return CompileOpts(pattern, 0)
    39  }
    40  
    41  // CompileOpts compiles the provided pattern using the given options.
    42  //
    43  // Close() should be called on the returned expression
    44  // once it is no longer needed.
    45  func CompileOpts(pattern string, options CompileOption) (*Regexp, error) {
    46  	tls := libc.NewTLS()
    47  
    48  	// Get C string of pattern
    49  	cPattern, err := libc.CString(pattern)
    50  	if err != nil {
    51  		return nil, err
    52  	}
    53  	// Free the string when done
    54  	defer libc.Xfree(tls, cPattern)
    55  
    56  	// Allocate new error
    57  	cErr := allocError(tls)
    58  	// Free error when done
    59  	defer libc.Xfree(tls, cErr)
    60  
    61  	// Get error offsets
    62  	errPtr := addErrCodeOffset(cErr)
    63  	errOffsetPtr := addErrOffsetOffset(cErr)
    64  
    65  	// Convert pattern length to size_t type
    66  	cPatLen := lib.Tsize_t(len(pattern))
    67  
    68  	// Compile expression
    69  	r := lib.Xpcre2_compile_8(tls, cPattern, cPatLen, uint32(options), errPtr, errOffsetPtr, 0)
    70  	if r == 0 {
    71  		return nil, ptrToError(tls, cErr)
    72  	}
    73  
    74  	// Create regexp instance
    75  	regex := Regexp{
    76  		expr: pattern,
    77  		mtx:  &sync.Mutex{},
    78  		re:   r,
    79  		tls:  tls,
    80  	}
    81  
    82  	// Make sure resources are freed if GC collects the
    83  	// regular expression.
    84  	runtime.SetFinalizer(&regex, func(r *Regexp) error {
    85  		return r.Close()
    86  	})
    87  
    88  	return &regex, nil
    89  }
    90  
    91  // MustCompile compiles the given pattern and panics
    92  // if there was an error
    93  //
    94  // Close() should be called on the returned expression
    95  // once it is no longer needed.
    96  func MustCompile(pattern string) *Regexp {
    97  	rgx, err := Compile(pattern)
    98  	if err != nil {
    99  		panic(err)
   100  	}
   101  	return rgx
   102  }
   103  
   104  // MustCompileOpts compiles the given pattern with the given
   105  // options and panics if there was an error.
   106  //
   107  // Close() should be called on the returned expression
   108  // once it is no longer needed.
   109  func MustCompileOpts(pattern string, options CompileOption) *Regexp {
   110  	rgx, err := CompileOpts(pattern, options)
   111  	if err != nil {
   112  		panic(err)
   113  	}
   114  	return rgx
   115  }
   116  
   117  // Find returns the leftmost match of the regular expression.
   118  // A return value of nil indicates no match.
   119  func (r *Regexp) Find(b []byte) []byte {
   120  	matches, err := r.match(b, 0, false)
   121  	if err != nil {
   122  		panic(err)
   123  	}
   124  	if len(matches) == 0 {
   125  		return nil
   126  	}
   127  	match := matches[0]
   128  	return b[match[0]:match[1]]
   129  }
   130  
   131  // FindIndex returns a two-element slice of integers
   132  // representing the location of the leftmost match of the
   133  // regular expression.
   134  func (r *Regexp) FindIndex(b []byte) []int {
   135  	matches, err := r.match(b, 0, false)
   136  	if err != nil {
   137  		panic(err)
   138  	}
   139  	if len(matches) == 0 {
   140  		return nil
   141  	}
   142  	match := matches[0]
   143  
   144  	return []int{int(match[0]), int(match[1])}
   145  }
   146  
   147  // FindAll returns all matches of the regular expression.
   148  // A return value of nil indicates no match.
   149  func (r *Regexp) FindAll(b []byte, n int) [][]byte {
   150  	matches, err := r.match(b, 0, true)
   151  	if err != nil {
   152  		panic(err)
   153  	}
   154  	if len(matches) == 0 || n == 0 {
   155  		return nil
   156  	}
   157  	if n > 0 && len(matches) > n {
   158  		matches = matches[:n]
   159  	}
   160  
   161  	out := make([][]byte, len(matches))
   162  	for index, match := range matches {
   163  		out[index] = b[match[0]:match[1]]
   164  	}
   165  
   166  	return out
   167  }
   168  
   169  // FindAll returns indices of all matches of the
   170  // regular expression. A return value of nil indicates
   171  // no match.
   172  func (r *Regexp) FindAllIndex(b []byte, n int) [][]int {
   173  	matches, err := r.match(b, 0, true)
   174  	if err != nil {
   175  		panic(err)
   176  	}
   177  	if len(matches) == 0 || n == 0 {
   178  		return nil
   179  	}
   180  	if n > 0 && len(matches) > n {
   181  		matches = matches[:n]
   182  	}
   183  
   184  	out := make([][]int, len(matches))
   185  	for index, match := range matches {
   186  		out[index] = []int{int(match[0]), int(match[1])}
   187  	}
   188  	return out
   189  }
   190  
   191  // FindSubmatch returns a slice containing the match as the
   192  // first element, and the submatches as the subsequent elements.
   193  func (r *Regexp) FindSubmatch(b []byte) [][]byte {
   194  	matches, err := r.match(b, 0, false)
   195  	if err != nil {
   196  		panic(err)
   197  	}
   198  	if len(matches) == 0 {
   199  		return nil
   200  	}
   201  	match := matches[0]
   202  
   203  	out := make([][]byte, 0, len(match)/2)
   204  	for i := 0; i < len(match); i += 2 {
   205  		out = append(out, b[match[i]:match[i+1]])
   206  	}
   207  	return out
   208  }
   209  
   210  // FindSubmatchIndex returns a slice of index pairs representing
   211  // the match and submatches, if any.
   212  func (r *Regexp) FindSubmatchIndex(b []byte) []int {
   213  	matches, err := r.match(b, 0, false)
   214  	if err != nil {
   215  		panic(err)
   216  	}
   217  	if len(matches) == 0 {
   218  		return nil
   219  	}
   220  	match := matches[0]
   221  
   222  	out := make([]int, len(match))
   223  	for index, offset := range match {
   224  		out[index] = int(offset)
   225  	}
   226  
   227  	return out
   228  }
   229  
   230  // FindAllSubmatch returns a slice of all matches and submatches
   231  // of the regular expression. It will return no more than n matches.
   232  // If n < 0, it will return all matches.
   233  func (r *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
   234  	matches, err := r.match(b, 0, true)
   235  	if err != nil {
   236  		panic(err)
   237  	}
   238  	if len(matches) == 0 || n == 0 {
   239  		return nil
   240  	}
   241  	if n > 0 && len(matches) > n {
   242  		matches = matches[:n]
   243  	}
   244  
   245  	out := make([][][]byte, len(matches))
   246  	for index, match := range matches {
   247  		outMatch := make([][]byte, 0, len(match)/2)
   248  
   249  		for i := 0; i < len(match); i += 2 {
   250  			outMatch = append(outMatch, b[match[i]:match[i+1]])
   251  		}
   252  
   253  		out[index] = outMatch
   254  	}
   255  
   256  	return out
   257  }
   258  
   259  // FindAllSubmatch returns a slice of all indeces representing the
   260  // locations of matches and submatches, if any, of the regular expression.
   261  // It will return no more than n matches. If n < 0, it will return all matches.
   262  func (r *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
   263  	matches, err := r.match(b, 0, true)
   264  	if err != nil {
   265  		panic(err)
   266  	}
   267  	if len(matches) == 0 || n == 0 {
   268  		return nil
   269  	}
   270  	if n > 0 && len(matches) > n {
   271  		matches = matches[:n]
   272  	}
   273  
   274  	out := make([][]int, len(matches))
   275  	for index, match := range matches {
   276  		offsets := make([]int, len(match))
   277  
   278  		for index, offset := range match {
   279  			offsets[index] = int(offset)
   280  		}
   281  
   282  		out[index] = offsets
   283  	}
   284  
   285  	return out
   286  }
   287  
   288  // FindString is the String version of Find
   289  func (r *Regexp) FindString(s string) string {
   290  	return string(r.Find([]byte(s)))
   291  }
   292  
   293  // FindStringIndex is the String version of FindIndex
   294  func (r *Regexp) FindStringIndex(s string) []int {
   295  	return r.FindIndex([]byte(s))
   296  }
   297  
   298  // FinAllString is the String version of FindAll
   299  func (r *Regexp) FindAllString(s string, n int) []string {
   300  	matches := r.FindAll([]byte(s), n)
   301  	
   302  	out := make([]string, len(matches))
   303  	for index, match := range matches {
   304  		out[index] = string(match)
   305  	}
   306  	return out
   307  }
   308  
   309  // FindAllStringIndex is the String version of FindIndex
   310  func (r *Regexp) FindAllStringIndex(s string, n int) [][]int {
   311  	return r.FindAllIndex([]byte(s), n)
   312  }
   313  
   314  // FindStringSubmatch is the string version of FindSubmatch
   315  func (r *Regexp) FindStringSubmatch(s string) []string {
   316  	matches := r.FindSubmatch([]byte(s))
   317  
   318  	out := make([]string, len(matches))
   319  	for index, match := range matches {
   320  		out[index] = string(match)
   321  	}
   322  	return out
   323  }
   324  
   325  // FindStringSubmatchIndex is the String version of FindSubmatchIndex
   326  func (r *Regexp) FindStringSubmatchIndex(s string) []int {
   327  	return r.FindSubmatchIndex([]byte(s))
   328  }
   329  
   330  // FindAllStringSubmatch is the String version of FindAllSubmatch
   331  func (r *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
   332  	matches := r.FindAllSubmatch([]byte(s), n)
   333  
   334  	out := make([][]string, len(matches))
   335  	for index, match := range matches {
   336  		outMatch := make([]string, len(match))
   337  
   338  		for index, byteMatch := range match {
   339  			outMatch[index] = string(byteMatch)
   340  		}
   341  
   342  		out[index] = outMatch
   343  	}
   344  
   345  	return out
   346  }
   347  
   348  // FindAllStringSubmatchIndex is the String version of FindAllSubmatchIndex
   349  func (r *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
   350  	return r.FindAllSubmatchIndex([]byte(s), n)
   351  }
   352  
   353  // Match reports whether b contains a match of the regular expression
   354  func (r *Regexp) Match(b []byte) bool {
   355  	return r.Find(b) != nil
   356  }
   357  
   358  // MatchString is the String version of Match
   359  func (r *Regexp) MatchString(s string) bool {
   360  	return r.Find([]byte(s)) != nil
   361  }
   362  
   363  // NumSubexp returns the number of parenthesized subexpressions
   364  // in the regular expression.
   365  func (r *Regexp) NumSubexp() int {
   366  	return int(r.patternInfo(lib.DPCRE2_INFO_CAPTURECOUNT))
   367  }
   368  
   369  // ReplaceAll returns a copy of src, replacing matches of the
   370  // regular expression with the replacement text repl.
   371  // Inside repl, $ signs are interpreted as in Expand,
   372  // so for instance $1 represents the text of the first
   373  // submatch and $name would represent the text of the
   374  // subexpression called "name".
   375  func (r *Regexp) ReplaceAll(src, repl []byte) []byte {
   376  	matches, err := r.match(src, 0, true)
   377  	if err != nil {
   378  		panic(err)
   379  	}
   380  	if len(matches) == 0 {
   381  		return src
   382  	}
   383  
   384  	out := make([]byte, len(src))
   385  	copy(out, src)
   386  
   387  	var diff int64
   388  	for _, match := range matches {
   389  		replStr := os.Expand(string(repl), func(s string) string {
   390  			i, err := strconv.Atoi(s)
   391  			if err != nil {
   392  				i = r.SubexpIndex(s)
   393  				if i == -1 {
   394  					return ""
   395  				}
   396  			}
   397  
   398  			// If there given match does not exist, return empty string
   399  			if i == 0 || len(match) < (2*i)+1 {
   400  				return ""
   401  			}
   402  
   403  			// Return match
   404  			return string(src[match[2*i]:match[(2*i)+1]])
   405  		})
   406  		// Replace replacement string with expanded string
   407  		repl := []byte(replStr)
   408  
   409  		// Replace bytes with new replacement string
   410  		diff, out = replaceBytes(out, repl, match[0], match[1], diff)
   411  	}
   412  
   413  	return out
   414  }
   415  
   416  // ReplaceAllFunc returns a copy of src in which all matches of the
   417  // regular expression have been replaced by the return value of function
   418  // repl applied to the matched byte slice. The replacement returned by
   419  // repl is substituted directly, without using Expand.
   420  func (r *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
   421  	matches, err := r.match(src, 0, true)
   422  	if err != nil {
   423  		panic(err)
   424  	}
   425  	if len(matches) == 0 {
   426  		return src
   427  	}
   428  
   429  	out := make([]byte, len(src))
   430  	copy(out, src)
   431  
   432  	var diff int64
   433  	for _, match := range matches {
   434  		replBytes := repl(src[match[0]:match[1]])
   435  		diff, out = replaceBytes(out, replBytes, match[0], match[1], diff)
   436  	}
   437  
   438  	return out
   439  }
   440  
   441  // ReplaceAllLiteral returns a copy of src, replacing matches of
   442  // the regular expression with the replacement bytes repl.
   443  // The replacement is substituted directly, without using Expand.
   444  func (r *Regexp) ReplaceAllLiteral(src, repl []byte) []byte {
   445  	matches, err := r.match(src, 0, true)
   446  	if err != nil {
   447  		panic(err)
   448  	}
   449  	if len(matches) == 0 {
   450  		return src
   451  	}
   452  
   453  	out := make([]byte, len(src))
   454  	copy(out, src)
   455  
   456  	var diff int64
   457  	for _, match := range matches {
   458  		diff, out = replaceBytes(out, repl, match[0], match[1], diff)
   459  	}
   460  
   461  	return out
   462  }
   463  
   464  // ReplaceAllString is the String version of ReplaceAll
   465  func (r *Regexp) ReplaceAllString(src, repl string) string {
   466  	return string(r.ReplaceAll([]byte(src), []byte(repl)))
   467  }
   468  
   469  // ReplaceAllStringFunc is the String version of ReplaceAllFunc
   470  func (r *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
   471  	return string(r.ReplaceAllFunc([]byte(src), func(b []byte) []byte {
   472  		return []byte(repl(string(b)))
   473  	}))
   474  }
   475  
   476  // ReplaceAllLiteralString is the String version of ReplaceAllLiteral
   477  func (r *Regexp) ReplaceAllLiteralString(src, repl string) string {
   478  	return string(r.ReplaceAllLiteral([]byte(src), []byte(repl)))
   479  }
   480  
   481  // Split slices s into substrings separated by the
   482  // expression and returns a slice of the substrings
   483  // between those expression matches.
   484  //
   485  // Example:
   486  //	s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5)
   487  //	// s: ["", "b", "b", "c", "cadaaae"]
   488  // The count determines the number of substrings to return:
   489  //	n > 0: at most n substrings; the last substring will be the unsplit remainder.
   490  //	n == 0: the result is nil (zero substrings)
   491  //	n < 0: all substrings
   492  func (r *Regexp) Split(s string, n int) []string {
   493  	if n == 0 {
   494  		return nil
   495  	}
   496  
   497  	if len(r.expr) > 0 && len(s) == 0 {
   498  		return []string{""}
   499  	}
   500  
   501  	matches := r.FindAllStringIndex(s, n)
   502  	strings := make([]string, 0, len(matches))
   503  
   504  	beg := 0
   505  	end := 0
   506  	for _, match := range matches {
   507  		if n > 0 && len(strings) >= n-1 {
   508  			break
   509  		}
   510  
   511  		end = match[0]
   512  		if match[1] != 0 {
   513  			strings = append(strings, s[beg:end])
   514  		}
   515  		beg = match[1]
   516  	}
   517  
   518  	if end != len(s) {
   519  		strings = append(strings, s[beg:])
   520  	}
   521  
   522  	return strings
   523  }
   524  
   525  // String returns the text of the regular expression
   526  // used for compilation.
   527  func (r *Regexp) String() string {
   528  	return r.expr
   529  }
   530  
   531  // SubexpIndex returns the index of the subexpression
   532  // with the given name, or -1 if there is no subexpression
   533  // with that name.
   534  func (r *Regexp) SubexpIndex(name string) int {
   535  	r.mtx.Lock()
   536  	defer r.mtx.Unlock()
   537  
   538  	// Get C string of name
   539  	cName, err := libc.CString(name)
   540  	if err != nil {
   541  		panic(err)
   542  	}
   543  
   544  	// Get substring index from name
   545  	ret := lib.Xpcre2_substring_number_from_name_8(r.tls, r.re, cName)
   546  
   547  	// If no substring error returned, return -1.
   548  	// If a different error is returned, panic.
   549  	if ret == lib.DPCRE2_ERROR_NOSUBSTRING {
   550  		return -1
   551  	} else if ret < 0 {
   552  		panic(codeToError(r.tls, ret))
   553  	}
   554  
   555  	// Return the index of the subexpression
   556  	return int(ret)
   557  }
   558  
   559  // replaceBytes replaces the bytes at a given location, and returns a new
   560  // offset, based on how much bigger or smaller the slice got after replacement
   561  func replaceBytes(src, repl []byte, sOff, eOff lib.Tsize_t, diff int64) (int64, []byte) {
   562  	var out []byte
   563  	out = append(
   564  		src[:int64(sOff)+diff],
   565  		append(
   566  			repl,
   567  			src[int64(eOff)+diff:]...,
   568  		)...,
   569  	)
   570  
   571  	return diff + int64(len(out)-len(src)), out
   572  }
   573  
   574  // match calls the underlying pcre match functions. It re-runs the functions
   575  // until no matches are found if multi is set to true.
   576  func (r *Regexp) match(b []byte, options uint32, multi bool) ([][]lib.Tsize_t, error) {
   577  	if len(b) == 0 {
   578  		return nil, nil
   579  	}
   580  	
   581  	r.mtx.Lock()
   582  	defer r.mtx.Unlock()
   583  
   584  	// Create a C pointer to the subject
   585  	sp := unsafe.Pointer(&b[0])
   586  	cSubject := uintptr(sp)
   587  	// Convert the size of the subject to a C size_t type
   588  	cSubjectLen := lib.Tsize_t(len(b))
   589  
   590  	// Create match data using the pattern to figure out the buffer size
   591  	md := lib.Xpcre2_match_data_create_from_pattern_8(r.tls, r.re, 0)
   592  	if md == 0 {
   593  		panic("error creating match data")
   594  	}
   595  	// Free the match data at the end of the function
   596  	defer lib.Xpcre2_match_data_free_8(r.tls, md)
   597  
   598  	var offset lib.Tsize_t
   599  	var out [][]lib.Tsize_t
   600  	// While the offset is less than the length of the subject
   601  	for offset < cSubjectLen {
   602  		// Execute expression on subject
   603  		ret := lib.Xpcre2_match_8(r.tls, r.re, cSubject, cSubjectLen, offset, options, md, 0)
   604  		if ret < 0 {
   605  			// If no match found, break
   606  			if ret == lib.DPCRE2_ERROR_NOMATCH {
   607  				break
   608  			}
   609  
   610  			return nil, codeToError(r.tls, ret)
   611  		} else {
   612  			// Get amount of pairs in output vector
   613  			pairAmt := lib.Xpcre2_get_ovector_count_8(r.tls, md)
   614  			// Get pointer to output vector
   615  			ovec := lib.Xpcre2_get_ovector_pointer_8(r.tls, md)
   616  			// Create a Go slice using the output vector as the underlying array
   617  			slice := unsafe.Slice((*lib.Tsize_t)(unsafe.Pointer(ovec)), pairAmt*2)
   618  
   619  			// Create a new slice and copy the elements from the slice
   620  			// This is required because the match data will be freed in
   621  			// a defer, and that would cause a panic every time the slice
   622  			// is used later.
   623  			matches := make([]lib.Tsize_t, len(slice))
   624  			copy(matches, slice)
   625  
   626  			// If the two indices are the same (empty string), and the match is not
   627  			// immediately after another match, add it to the output and increment the
   628  			// offset. Otherwise, increment the offset and ignore the match.
   629  			if slice[0] == slice[1] && len(out) > 0 && slice[0] != out[len(out)-1][1] {
   630  				out = append(out, matches)
   631  				offset = slice[1] + 1
   632  				continue
   633  			} else if slice[0] == slice[1] {
   634  				offset = slice[1] + 1
   635  				continue
   636  			}
   637  
   638  			// Add the match to the output
   639  			out = append(out, matches)
   640  			// Set the next offset to the end index of the match
   641  			offset = matches[1]
   642  		}
   643  
   644  		// If multiple matches disabled, break
   645  		if !multi {
   646  			break
   647  		}
   648  	}
   649  	return out, nil
   650  }
   651  
   652  // patternInfo calls the underlying pcre pattern info function
   653  // and returns information about the compiled regular expression
   654  func (r *Regexp) patternInfo(what uint32) (out uint32) {
   655  	// Create a C pointer to the output integer
   656  	cOut := uintptr(unsafe.Pointer(&out))
   657  	// Get information about the compiled pattern
   658  	lib.Xpcre2_pattern_info_8(r.tls, r.re, what, cOut)
   659  	return
   660  }
   661  
   662  // Close frees resources used by the regular expression.
   663  func (r *Regexp) Close() error {
   664  	if r == nil {
   665  		return nil
   666  	}
   667  
   668  	// Close thread-local storage
   669  	defer r.tls.Close()
   670  
   671  	// Free the compiled code
   672  	lib.Xpcre2_code_free_8(r.tls, r.re)
   673  	// Set regular expression to null
   674  	r.re = 0
   675  
   676  	return nil
   677  }