github.com/nuvolaris/goja@v0.0.0-20230825100449-967811910c6d/regexp.go (about)

     1  package goja
     2  
     3  import (
     4  	"fmt"
     5  	"io"
     6  	"regexp"
     7  	"sort"
     8  	"strings"
     9  	"unicode/utf16"
    10  
    11  	"github.com/dlclark/regexp2"
    12  	"github.com/nuvolaris/goja/unistring"
    13  )
    14  
// regexp2MatchCache remembers the decoded form of the last string that was
// successfully matched by a regexp2Wrapper, so that a follow-up search on the
// same string does not have to decode it again.
type regexp2MatchCache struct {
	// target is the string the cached data was derived from; a cache entry is
	// only reused when target.SameAs() reports the same string.
	target String
	// runes is the rune slice that was handed to regexp2 for target.
	runes []rune
	// posMap is nil for entries created by the UTF-16 search path and non-nil
	// for entries created by the full-Unicode search path; the two paths use
	// this to tell their entries apart.
	posMap []int
}
    20  
// regexp2Wrapper couples a compiled regexp2 pattern with a single-entry match
// cache.
//
// Not goroutine-safe. Use regexp2Wrapper.clone()
type regexp2Wrapper struct {
	// rx is the compiled regexp2 pattern.
	rx *regexp2.Regexp
	// cache holds the decoded input of the last cached match, or nil.
	cache *regexp2MatchCache
}
    26  
// regexpWrapper is a distinct named type over the standard library's
// regexp.Regexp; its methods convert back to *regexp.Regexp to do the matching.
type regexpWrapper regexp.Regexp
    28  
    29  type positionMapItem struct {
    30  	src, dst int
    31  }
    32  type positionMap []positionMapItem
    33  
    34  func (m positionMap) get(src int) int {
    35  	if src <= 0 {
    36  		return src
    37  	}
    38  	res := sort.Search(len(m), func(n int) bool { return m[n].src >= src })
    39  	if res >= len(m) || m[res].src != src {
    40  		panic("index not found")
    41  	}
    42  	return m[res].dst
    43  }
    44  
    45  type arrayRuneReader struct {
    46  	runes []rune
    47  	pos   int
    48  }
    49  
    50  func (rd *arrayRuneReader) ReadRune() (r rune, size int, err error) {
    51  	if rd.pos < len(rd.runes) {
    52  		r = rd.runes[rd.pos]
    53  		size = 1
    54  		rd.pos++
    55  	} else {
    56  		err = io.EOF
    57  	}
    58  	return
    59  }
    60  
// regexpPattern holds a regular expression's source and flags together with
// up to two compiled forms of it.
//
// Not goroutine-safe. Use regexpPattern.clone()
type regexpPattern struct {
	// src is the pattern source that is passed to regexp2 when the regexp2
	// form has to be compiled on demand.
	src string

	// Flags of the regular expression.
	global, ignoreCase, multiline, sticky, unicode bool

	// regexpWrapper is the form backed by Go's regexp package. When it is nil
	// all matching goes through regexp2Wrapper.
	regexpWrapper *regexpWrapper
	// regexp2Wrapper is the form backed by regexp2. It may be nil while
	// regexpWrapper is set, in which case createRegexp2() builds it lazily.
	regexp2Wrapper *regexp2Wrapper
}
    70  
    71  func compileRegexp2(src string, multiline, ignoreCase bool) (*regexp2Wrapper, error) {
    72  	var opts regexp2.RegexOptions = regexp2.ECMAScript
    73  	if multiline {
    74  		opts |= regexp2.Multiline
    75  	}
    76  	if ignoreCase {
    77  		opts |= regexp2.IgnoreCase
    78  	}
    79  	regexp2Pattern, err1 := regexp2.Compile(src, opts)
    80  	if err1 != nil {
    81  		return nil, fmt.Errorf("Invalid regular expression (regexp2): %s (%v)", src, err1)
    82  	}
    83  
    84  	return &regexp2Wrapper{rx: regexp2Pattern}, nil
    85  }
    86  
    87  func (p *regexpPattern) createRegexp2() {
    88  	if p.regexp2Wrapper != nil {
    89  		return
    90  	}
    91  	rx, err := compileRegexp2(p.src, p.multiline, p.ignoreCase)
    92  	if err != nil {
    93  		// At this point the regexp should have been successfully converted to re2, if it fails now, it's a bug.
    94  		panic(err)
    95  	}
    96  	p.regexp2Wrapper = rx
    97  }
    98  
    99  func buildUTF8PosMap(s unicodeString) (positionMap, string) {
   100  	pm := make(positionMap, 0, s.Length())
   101  	rd := s.Reader()
   102  	sPos, utf8Pos := 0, 0
   103  	var sb strings.Builder
   104  	for {
   105  		r, size, err := rd.ReadRune()
   106  		if err == io.EOF {
   107  			break
   108  		}
   109  		if err != nil {
   110  			// the string contains invalid UTF-16, bailing out
   111  			return nil, ""
   112  		}
   113  		utf8Size, _ := sb.WriteRune(r)
   114  		sPos += size
   115  		utf8Pos += utf8Size
   116  		pm = append(pm, positionMapItem{src: utf8Pos, dst: sPos})
   117  	}
   118  	return pm, sb.String()
   119  }
   120  
   121  func (p *regexpPattern) findSubmatchIndex(s String, start int) []int {
   122  	if p.regexpWrapper == nil {
   123  		return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode, p.global || p.sticky)
   124  	}
   125  	if start != 0 {
   126  		// Unfortunately Go's regexp library does not allow starting from an arbitrary position.
   127  		// If we just drop the first _start_ characters of the string the assertions (^, $, \b and \B) will not
   128  		// work correctly.
   129  		p.createRegexp2()
   130  		return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode, p.global || p.sticky)
   131  	}
   132  	return p.regexpWrapper.findSubmatchIndex(s, p.unicode)
   133  }
   134  
   135  func (p *regexpPattern) findAllSubmatchIndex(s String, start int, limit int, sticky bool) [][]int {
   136  	if p.regexpWrapper == nil {
   137  		return p.regexp2Wrapper.findAllSubmatchIndex(s, start, limit, sticky, p.unicode)
   138  	}
   139  	if start == 0 {
   140  		a, u := devirtualizeString(s)
   141  		if u == nil {
   142  			return p.regexpWrapper.findAllSubmatchIndex(string(a), limit, sticky)
   143  		}
   144  		if limit == 1 {
   145  			result := p.regexpWrapper.findSubmatchIndexUnicode(u, p.unicode)
   146  			if result == nil {
   147  				return nil
   148  			}
   149  			return [][]int{result}
   150  		}
   151  		// Unfortunately Go's regexp library lacks FindAllReaderSubmatchIndex(), so we have to use a UTF-8 string as an
   152  		// input.
   153  		if p.unicode {
   154  			// Try to convert s to UTF-8. If it does not contain any invalid UTF-16 we can do the matching in UTF-8.
   155  			pm, str := buildUTF8PosMap(u)
   156  			if pm != nil {
   157  				res := p.regexpWrapper.findAllSubmatchIndex(str, limit, sticky)
   158  				for _, result := range res {
   159  					for i, idx := range result {
   160  						result[i] = pm.get(idx)
   161  					}
   162  				}
   163  				return res
   164  			}
   165  		}
   166  	}
   167  
   168  	p.createRegexp2()
   169  	return p.regexp2Wrapper.findAllSubmatchIndex(s, start, limit, sticky, p.unicode)
   170  }
   171  
   172  // clone creates a copy of the regexpPattern which can be used concurrently.
   173  func (p *regexpPattern) clone() *regexpPattern {
   174  	ret := &regexpPattern{
   175  		src:        p.src,
   176  		global:     p.global,
   177  		ignoreCase: p.ignoreCase,
   178  		multiline:  p.multiline,
   179  		sticky:     p.sticky,
   180  		unicode:    p.unicode,
   181  	}
   182  	if p.regexpWrapper != nil {
   183  		ret.regexpWrapper = p.regexpWrapper.clone()
   184  	}
   185  	if p.regexp2Wrapper != nil {
   186  		ret.regexp2Wrapper = p.regexp2Wrapper.clone()
   187  	}
   188  	return ret
   189  }
   190  
// regexpObject is the object implementation behind a RegExp value.
type regexpObject struct {
	baseObject
	// pattern is the compiled pattern used by exec/test.
	pattern *regexpPattern
	// source is carried over unchanged by clone().
	// NOTE(review): presumably the original pattern text as a JS string — confirm.
	source String

	// standard is set by init() and cleared when the prototype is replaced,
	// an own string-keyed property is defined or deleted, "exec" is assigned,
	// or one of the match/matchAll/search/split/replace symbols is defined or
	// assigned on the object.
	standard bool
}
   198  
   199  func (r *regexp2Wrapper) findSubmatchIndex(s String, start int, fullUnicode, doCache bool) (result []int) {
   200  	if fullUnicode {
   201  		return r.findSubmatchIndexUnicode(s, start, doCache)
   202  	}
   203  	return r.findSubmatchIndexUTF16(s, start, doCache)
   204  }
   205  
   206  func (r *regexp2Wrapper) findUTF16Cached(s String, start int, doCache bool) (match *regexp2.Match, runes []rune, err error) {
   207  	wrapped := r.rx
   208  	cache := r.cache
   209  	if cache != nil && cache.posMap == nil && cache.target.SameAs(s) {
   210  		runes = cache.runes
   211  	} else {
   212  		runes = s.utf16Runes()
   213  		cache = nil
   214  	}
   215  	match, err = wrapped.FindRunesMatchStartingAt(runes, start)
   216  	if doCache && match != nil && err == nil {
   217  		if cache == nil {
   218  			if r.cache == nil {
   219  				r.cache = new(regexp2MatchCache)
   220  			}
   221  			*r.cache = regexp2MatchCache{
   222  				target: s,
   223  				runes:  runes,
   224  			}
   225  		}
   226  	} else {
   227  		r.cache = nil
   228  	}
   229  	return
   230  }
   231  
   232  func (r *regexp2Wrapper) findSubmatchIndexUTF16(s String, start int, doCache bool) (result []int) {
   233  	match, _, err := r.findUTF16Cached(s, start, doCache)
   234  	if err != nil {
   235  		return
   236  	}
   237  
   238  	if match == nil {
   239  		return
   240  	}
   241  	groups := match.Groups()
   242  
   243  	result = make([]int, 0, len(groups)<<1)
   244  	for _, group := range groups {
   245  		if len(group.Captures) > 0 {
   246  			result = append(result, group.Index, group.Index+group.Length)
   247  		} else {
   248  			result = append(result, -1, 0)
   249  		}
   250  	}
   251  	return
   252  }
   253  
// findUnicodeCached runs the pattern over s decoded into runes by
// lenientUtf16Decoder, beginning at the rune that corresponds to offset start
// in s. It returns the match and the posMap that translates rune indices back
// into offsets in s.
//
// A cache entry is reused when it was made by this path (posMap is non-nil)
// for the same string. After the search the cache is kept or (re)written only
// if doCache is set and a match was found without error; in every other case
// it is dropped.
func (r *regexp2Wrapper) findUnicodeCached(s String, start int, doCache bool) (match *regexp2.Match, posMap []int, err error) {
	var (
		runes       []rune
		mappedStart int  // index into runes corresponding to start
		splitPair   bool // start falls inside a rune that occupies more than one position in s
		savedRune   rune // original rune at mappedStart while it is temporarily replaced
	)
	wrapped := r.rx
	cache := r.cache
	if cache != nil && cache.posMap != nil && cache.target.SameAs(s) {
		// Same string as last time: reuse the decoded runes and only look up start.
		runes, posMap = cache.runes, cache.posMap
		mappedStart, splitPair = posMapReverseLookup(posMap, start)
	} else {
		posMap, runes, mappedStart, splitPair = buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader()}, s.Length(), start)
		// Remember that the existing entry (if any) does not describe s.
		cache = nil
	}
	if splitPair {
		// temporarily set the rune at mappedStart to the second code point of the pair
		_, second := utf16.EncodeRune(runes[mappedStart])
		savedRune, runes[mappedStart] = runes[mappedStart], second
	}
	match, err = wrapped.FindRunesMatchStartingAt(runes, mappedStart)
	if doCache && match != nil && err == nil {
		if splitPair {
			// The runes are about to be kept in the cache, so undo the temporary replacement.
			runes[mappedStart] = savedRune
		}
		if cache == nil {
			if r.cache == nil {
				r.cache = new(regexp2MatchCache)
			}
			*r.cache = regexp2MatchCache{
				target: s,
				runes:  runes,
				posMap: posMap,
			}
		}
	} else {
		// The (possibly modified) runes are not kept, so no restore is needed here.
		r.cache = nil
	}

	return
}
   296  
   297  func (r *regexp2Wrapper) findSubmatchIndexUnicode(s String, start int, doCache bool) (result []int) {
   298  	match, posMap, err := r.findUnicodeCached(s, start, doCache)
   299  	if match == nil || err != nil {
   300  		return
   301  	}
   302  
   303  	groups := match.Groups()
   304  
   305  	result = make([]int, 0, len(groups)<<1)
   306  	for _, group := range groups {
   307  		if len(group.Captures) > 0 {
   308  			result = append(result, posMap[group.Index], posMap[group.Index+group.Length])
   309  		} else {
   310  			result = append(result, -1, 0)
   311  		}
   312  	}
   313  	return
   314  }
   315  
   316  func (r *regexp2Wrapper) findAllSubmatchIndexUTF16(s String, start, limit int, sticky bool) [][]int {
   317  	wrapped := r.rx
   318  	match, runes, err := r.findUTF16Cached(s, start, false)
   319  	if match == nil || err != nil {
   320  		return nil
   321  	}
   322  	if limit < 0 {
   323  		limit = len(runes) + 1
   324  	}
   325  	results := make([][]int, 0, limit)
   326  	for match != nil {
   327  		groups := match.Groups()
   328  
   329  		result := make([]int, 0, len(groups)<<1)
   330  
   331  		for _, group := range groups {
   332  			if len(group.Captures) > 0 {
   333  				startPos := group.Index
   334  				endPos := group.Index + group.Length
   335  				result = append(result, startPos, endPos)
   336  			} else {
   337  				result = append(result, -1, 0)
   338  			}
   339  		}
   340  
   341  		if sticky && len(result) > 1 {
   342  			if result[0] != start {
   343  				break
   344  			}
   345  			start = result[1]
   346  		}
   347  
   348  		results = append(results, result)
   349  		limit--
   350  		if limit <= 0 {
   351  			break
   352  		}
   353  		match, err = wrapped.FindNextMatch(match)
   354  		if err != nil {
   355  			return nil
   356  		}
   357  	}
   358  	return results
   359  }
   360  
   361  func buildPosMap(rd io.RuneReader, l, start int) (posMap []int, runes []rune, mappedStart int, splitPair bool) {
   362  	posMap = make([]int, 0, l+1)
   363  	curPos := 0
   364  	runes = make([]rune, 0, l)
   365  	startFound := false
   366  	for {
   367  		if !startFound {
   368  			if curPos == start {
   369  				mappedStart = len(runes)
   370  				startFound = true
   371  			}
   372  			if curPos > start {
   373  				// start position splits a surrogate pair
   374  				mappedStart = len(runes) - 1
   375  				splitPair = true
   376  				startFound = true
   377  			}
   378  		}
   379  		rn, size, err := rd.ReadRune()
   380  		if err != nil {
   381  			break
   382  		}
   383  		runes = append(runes, rn)
   384  		posMap = append(posMap, curPos)
   385  		curPos += size
   386  	}
   387  	posMap = append(posMap, curPos)
   388  	return
   389  }
   390  
   391  func posMapReverseLookup(posMap []int, pos int) (int, bool) {
   392  	mapped := sort.SearchInts(posMap, pos)
   393  	if mapped < len(posMap) && posMap[mapped] != pos {
   394  		return mapped - 1, true
   395  	}
   396  	return mapped, false
   397  }
   398  
   399  func (r *regexp2Wrapper) findAllSubmatchIndexUnicode(s unicodeString, start, limit int, sticky bool) [][]int {
   400  	wrapped := r.rx
   401  	if limit < 0 {
   402  		limit = len(s) + 1
   403  	}
   404  	results := make([][]int, 0, limit)
   405  	match, posMap, err := r.findUnicodeCached(s, start, false)
   406  	if err != nil {
   407  		return nil
   408  	}
   409  	for match != nil {
   410  		groups := match.Groups()
   411  
   412  		result := make([]int, 0, len(groups)<<1)
   413  
   414  		for _, group := range groups {
   415  			if len(group.Captures) > 0 {
   416  				start := posMap[group.Index]
   417  				end := posMap[group.Index+group.Length]
   418  				result = append(result, start, end)
   419  			} else {
   420  				result = append(result, -1, 0)
   421  			}
   422  		}
   423  
   424  		if sticky && len(result) > 1 {
   425  			if result[0] != start {
   426  				break
   427  			}
   428  			start = result[1]
   429  		}
   430  
   431  		results = append(results, result)
   432  		match, err = wrapped.FindNextMatch(match)
   433  		if err != nil {
   434  			return nil
   435  		}
   436  	}
   437  	return results
   438  }
   439  
   440  func (r *regexp2Wrapper) findAllSubmatchIndex(s String, start, limit int, sticky, fullUnicode bool) [][]int {
   441  	a, u := devirtualizeString(s)
   442  	if u != nil {
   443  		if fullUnicode {
   444  			return r.findAllSubmatchIndexUnicode(u, start, limit, sticky)
   445  		}
   446  		return r.findAllSubmatchIndexUTF16(u, start, limit, sticky)
   447  	}
   448  	return r.findAllSubmatchIndexUTF16(a, start, limit, sticky)
   449  }
   450  
   451  func (r *regexp2Wrapper) clone() *regexp2Wrapper {
   452  	return &regexp2Wrapper{
   453  		rx: r.rx,
   454  	}
   455  }
   456  
   457  func (r *regexpWrapper) findAllSubmatchIndex(s string, limit int, sticky bool) (results [][]int) {
   458  	wrapped := (*regexp.Regexp)(r)
   459  	results = wrapped.FindAllStringSubmatchIndex(s, limit)
   460  	pos := 0
   461  	if sticky {
   462  		for i, result := range results {
   463  			if len(result) > 1 {
   464  				if result[0] != pos {
   465  					return results[:i]
   466  				}
   467  				pos = result[1]
   468  			}
   469  		}
   470  	}
   471  	return
   472  }
   473  
   474  func (r *regexpWrapper) findSubmatchIndex(s String, fullUnicode bool) []int {
   475  	a, u := devirtualizeString(s)
   476  	if u != nil {
   477  		return r.findSubmatchIndexUnicode(u, fullUnicode)
   478  	}
   479  	return r.findSubmatchIndexASCII(string(a))
   480  }
   481  
   482  func (r *regexpWrapper) findSubmatchIndexASCII(s string) []int {
   483  	wrapped := (*regexp.Regexp)(r)
   484  	return wrapped.FindStringSubmatchIndex(s)
   485  }
   486  
   487  func (r *regexpWrapper) findSubmatchIndexUnicode(s unicodeString, fullUnicode bool) (result []int) {
   488  	wrapped := (*regexp.Regexp)(r)
   489  	if fullUnicode {
   490  		posMap, runes, _, _ := buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader()}, s.Length(), 0)
   491  		res := wrapped.FindReaderSubmatchIndex(&arrayRuneReader{runes: runes})
   492  		for i, item := range res {
   493  			if item >= 0 {
   494  				res[i] = posMap[item]
   495  			}
   496  		}
   497  		return res
   498  	}
   499  	return wrapped.FindReaderSubmatchIndex(s.utf16RuneReader())
   500  }
   501  
   502  func (r *regexpWrapper) clone() *regexpWrapper {
   503  	return r
   504  }
   505  
   506  func (r *regexpObject) execResultToArray(target String, result []int) Value {
   507  	captureCount := len(result) >> 1
   508  	valueArray := make([]Value, captureCount)
   509  	matchIndex := result[0]
   510  	valueArray[0] = target.Substring(result[0], result[1])
   511  	lowerBound := 0
   512  	for index := 1; index < captureCount; index++ {
   513  		offset := index << 1
   514  		if result[offset] >= 0 && result[offset+1] >= lowerBound {
   515  			valueArray[index] = target.Substring(result[offset], result[offset+1])
   516  			lowerBound = result[offset]
   517  		} else {
   518  			valueArray[index] = _undefined
   519  		}
   520  	}
   521  	match := r.val.runtime.newArrayValues(valueArray)
   522  	match.self.setOwnStr("input", target, false)
   523  	match.self.setOwnStr("index", intToValue(int64(matchIndex)), false)
   524  	return match
   525  }
   526  
   527  func (r *regexpObject) getLastIndex() int64 {
   528  	lastIndex := toLength(r.getStr("lastIndex", nil))
   529  	if !r.pattern.global && !r.pattern.sticky {
   530  		return 0
   531  	}
   532  	return lastIndex
   533  }
   534  
   535  func (r *regexpObject) updateLastIndex(index int64, firstResult, lastResult []int) bool {
   536  	if r.pattern.sticky {
   537  		if firstResult == nil || int64(firstResult[0]) != index {
   538  			r.setOwnStr("lastIndex", intToValue(0), true)
   539  			return false
   540  		}
   541  	} else {
   542  		if firstResult == nil {
   543  			if r.pattern.global {
   544  				r.setOwnStr("lastIndex", intToValue(0), true)
   545  			}
   546  			return false
   547  		}
   548  	}
   549  
   550  	if r.pattern.global || r.pattern.sticky {
   551  		r.setOwnStr("lastIndex", intToValue(int64(lastResult[1])), true)
   552  	}
   553  	return true
   554  }
   555  
   556  func (r *regexpObject) execRegexp(target String) (match bool, result []int) {
   557  	index := r.getLastIndex()
   558  	if index >= 0 && index <= int64(target.Length()) {
   559  		result = r.pattern.findSubmatchIndex(target, int(index))
   560  	}
   561  	match = r.updateLastIndex(index, result, result)
   562  	return
   563  }
   564  
   565  func (r *regexpObject) exec(target String) Value {
   566  	match, result := r.execRegexp(target)
   567  	if match {
   568  		return r.execResultToArray(target, result)
   569  	}
   570  	return _null
   571  }
   572  
   573  func (r *regexpObject) test(target String) bool {
   574  	match, _ := r.execRegexp(target)
   575  	return match
   576  }
   577  
   578  func (r *regexpObject) clone() *regexpObject {
   579  	r1 := r.val.runtime.newRegexpObject(r.prototype)
   580  	r1.source = r.source
   581  	r1.pattern = r.pattern
   582  
   583  	return r1
   584  }
   585  
// init initialises the embedded base object, marks the object as standard
// and installs the "lastIndex" property with an initial value of 0.
func (r *regexpObject) init() {
	r.baseObject.init()
	r.standard = true
	// NOTE(review): the three flags are presumably writable=true,
	// enumerable=false, configurable=false — confirm against _putProp.
	r._putProp("lastIndex", intToValue(0), true, false, false)
}
   591  
   592  func (r *regexpObject) setProto(proto *Object, throw bool) bool {
   593  	res := r.baseObject.setProto(proto, throw)
   594  	if res {
   595  		r.standard = false
   596  	}
   597  	return res
   598  }
   599  
   600  func (r *regexpObject) defineOwnPropertyStr(name unistring.String, desc PropertyDescriptor, throw bool) bool {
   601  	res := r.baseObject.defineOwnPropertyStr(name, desc, throw)
   602  	if res {
   603  		r.standard = false
   604  	}
   605  	return res
   606  }
   607  
   608  func (r *regexpObject) defineOwnPropertySym(name *Symbol, desc PropertyDescriptor, throw bool) bool {
   609  	res := r.baseObject.defineOwnPropertySym(name, desc, throw)
   610  	if res && r.standard {
   611  		switch name {
   612  		case SymMatch, SymMatchAll, SymSearch, SymSplit, SymReplace:
   613  			r.standard = false
   614  		}
   615  	}
   616  	return res
   617  }
   618  
   619  func (r *regexpObject) deleteStr(name unistring.String, throw bool) bool {
   620  	res := r.baseObject.deleteStr(name, throw)
   621  	if res {
   622  		r.standard = false
   623  	}
   624  	return res
   625  }
   626  
   627  func (r *regexpObject) setOwnStr(name unistring.String, value Value, throw bool) bool {
   628  	res := r.baseObject.setOwnStr(name, value, throw)
   629  	if res && r.standard && name == "exec" {
   630  		r.standard = false
   631  	}
   632  	return res
   633  }
   634  
   635  func (r *regexpObject) setOwnSym(name *Symbol, value Value, throw bool) bool {
   636  	res := r.baseObject.setOwnSym(name, value, throw)
   637  	if res && r.standard {
   638  		switch name {
   639  		case SymMatch, SymMatchAll, SymSearch, SymSplit, SymReplace:
   640  			r.standard = false
   641  		}
   642  	}
   643  	return res
   644  }