github.com/nuvolaris/goja@v0.0.0-20230825100449-967811910c6d/parser/regexp.go (about)

     1  package parser
     2  
     3  import (
     4  	"fmt"
     5  	"strconv"
     6  	"strings"
     7  	"unicode/utf8"
     8  )
     9  
const (
	// WhitespaceChars lists the characters that the parser substitutes for
	// the JavaScript \s escape when translating a pattern (wrapped in a
	// character class, or negated for \S, outside of a class).
	WhitespaceChars = " \f\n\r\t\v\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000\ufeff"
	// Re2Dot is the character class written in place of an unescaped '.'
	// found outside a character class: any character except \r, \n,
	// U+2028 and U+2029.
	Re2Dot          = "[^\r\n\u2028\u2029]"
)
    14  
// regexpParseError is the payload shared by the error types reported while
// transforming a pattern; it is embedded in RegexpErrorIncompatible and
// RegexpSyntaxError.
type regexpParseError struct {
	// offset is the parser's read offset at the time the error was raised.
	offset int
	// err is the formatted message returned by Error.
	err    string
}
    19  
// RegexpErrorIncompatible is the error recorded for non-fatal problems: the
// pattern uses a construct (such as a lookahead or a backreference) that the
// parser does not translate to re2 syntax.
type RegexpErrorIncompatible struct {
	regexpParseError
}
// RegexpSyntaxError is the error recorded for fatal problems, i.e. when the
// parser rejects the pattern itself (unmatched ')', unterminated group or
// character class, invalid group, invalid UTF-8).
type RegexpSyntaxError struct {
	regexpParseError
}
    26  
    27  func (s regexpParseError) Error() string {
    28  	return s.err
    29  }
    30  
// _RegExp_parser holds the state of a single JavaScript-to-re2 pattern
// translation.
type _RegExp_parser struct {
	str    string // The JavaScript pattern being translated
	length int    // len(str), in bytes

	chr       rune // The current character (-1 at end of input)
	chrOffset int  // The byte offset of the current character
	offset    int  // The byte offset just past the current character (a character may be several bytes wide)

	err error // The first error reported; later errors are ignored

	// goRegexp accumulates the translated pattern once the output starts to
	// differ from the input.
	goRegexp   strings.Builder
	// passOffset is the length of the input prefix that has been accepted
	// unchanged so far; it is set to -1 once that prefix has been copied
	// into goRegexp and output continues there.
	passOffset int
}
    44  
    45  // TransformRegExp transforms a JavaScript pattern into  a Go "regexp" pattern.
    46  //
    47  // re2 (Go) cannot do backtracking, so the presence of a lookahead (?=) (?!) or
    48  // backreference (\1, \2, ...) will cause an error.
    49  //
    50  // re2 (Go) has a different definition for \s: [\t\n\f\r ].
    51  // The JavaScript definition, on the other hand, also includes \v, Unicode "Separator, Space", etc.
    52  //
    53  // If the pattern is valid, but incompatible (contains a lookahead or backreference),
    54  // then this function returns an empty string an error of type RegexpErrorIncompatible.
    55  //
    56  // If the pattern is invalid (not valid even in JavaScript), then this function
    57  // returns an empty string and a generic error.
    58  func TransformRegExp(pattern string) (transformed string, err error) {
    59  
    60  	if pattern == "" {
    61  		return "", nil
    62  	}
    63  
    64  	parser := _RegExp_parser{
    65  		str:    pattern,
    66  		length: len(pattern),
    67  	}
    68  	err = parser.parse()
    69  	if err != nil {
    70  		return "", err
    71  	}
    72  
    73  	return parser.ResultString(), nil
    74  }
    75  
    76  func (self *_RegExp_parser) ResultString() string {
    77  	if self.passOffset != -1 {
    78  		return self.str[:self.passOffset]
    79  	}
    80  	return self.goRegexp.String()
    81  }
    82  
    83  func (self *_RegExp_parser) parse() (err error) {
    84  	self.read() // Pull in the first character
    85  	self.scan()
    86  	return self.err
    87  }
    88  
    89  func (self *_RegExp_parser) read() {
    90  	if self.offset < self.length {
    91  		self.chrOffset = self.offset
    92  		chr, width := rune(self.str[self.offset]), 1
    93  		if chr >= utf8.RuneSelf { // !ASCII
    94  			chr, width = utf8.DecodeRuneInString(self.str[self.offset:])
    95  			if chr == utf8.RuneError && width == 1 {
    96  				self.error(true, "Invalid UTF-8 character")
    97  				return
    98  			}
    99  		}
   100  		self.offset += width
   101  		self.chr = chr
   102  	} else {
   103  		self.chrOffset = self.length
   104  		self.chr = -1 // EOF
   105  	}
   106  }
   107  
   108  func (self *_RegExp_parser) stopPassing() {
   109  	self.goRegexp.Grow(3 * len(self.str) / 2)
   110  	self.goRegexp.WriteString(self.str[:self.passOffset])
   111  	self.passOffset = -1
   112  }
   113  
   114  func (self *_RegExp_parser) write(p []byte) {
   115  	if self.passOffset != -1 {
   116  		self.stopPassing()
   117  	}
   118  	self.goRegexp.Write(p)
   119  }
   120  
   121  func (self *_RegExp_parser) writeByte(b byte) {
   122  	if self.passOffset != -1 {
   123  		self.stopPassing()
   124  	}
   125  	self.goRegexp.WriteByte(b)
   126  }
   127  
   128  func (self *_RegExp_parser) writeString(s string) {
   129  	if self.passOffset != -1 {
   130  		self.stopPassing()
   131  	}
   132  	self.goRegexp.WriteString(s)
   133  }
   134  
   135  func (self *_RegExp_parser) scan() {
   136  	for self.chr != -1 {
   137  		switch self.chr {
   138  		case '\\':
   139  			self.read()
   140  			self.scanEscape(false)
   141  		case '(':
   142  			self.pass()
   143  			self.scanGroup()
   144  		case '[':
   145  			self.scanBracket()
   146  		case ')':
   147  			self.error(true, "Unmatched ')'")
   148  			return
   149  		case '.':
   150  			self.writeString(Re2Dot)
   151  			self.read()
   152  		default:
   153  			self.pass()
   154  		}
   155  	}
   156  }
   157  
   158  // (...)
   159  func (self *_RegExp_parser) scanGroup() {
   160  	str := self.str[self.chrOffset:]
   161  	if len(str) > 1 { // A possibility of (?= or (?!
   162  		if str[0] == '?' {
   163  			ch := str[1]
   164  			switch {
   165  			case ch == '=' || ch == '!':
   166  				self.error(false, "re2: Invalid (%s) <lookahead>", self.str[self.chrOffset:self.chrOffset+2])
   167  				return
   168  			case ch == '<':
   169  				self.error(false, "re2: Invalid (%s) <lookbehind>", self.str[self.chrOffset:self.chrOffset+2])
   170  				return
   171  			case ch != ':':
   172  				self.error(true, "Invalid group")
   173  				return
   174  			}
   175  		}
   176  	}
   177  	for self.chr != -1 && self.chr != ')' {
   178  		switch self.chr {
   179  		case '\\':
   180  			self.read()
   181  			self.scanEscape(false)
   182  		case '(':
   183  			self.pass()
   184  			self.scanGroup()
   185  		case '[':
   186  			self.scanBracket()
   187  		case '.':
   188  			self.writeString(Re2Dot)
   189  			self.read()
   190  		default:
   191  			self.pass()
   192  			continue
   193  		}
   194  	}
   195  	if self.chr != ')' {
   196  		self.error(true, "Unterminated group")
   197  		return
   198  	}
   199  	self.pass()
   200  }
   201  
   202  // [...]
   203  func (self *_RegExp_parser) scanBracket() {
   204  	str := self.str[self.chrOffset:]
   205  	if strings.HasPrefix(str, "[]") {
   206  		// [] -- Empty character class
   207  		self.writeString("[^\u0000-\U0001FFFF]")
   208  		self.offset += 1
   209  		self.read()
   210  		return
   211  	}
   212  
   213  	if strings.HasPrefix(str, "[^]") {
   214  		self.writeString("[\u0000-\U0001FFFF]")
   215  		self.offset += 2
   216  		self.read()
   217  		return
   218  	}
   219  
   220  	self.pass()
   221  	for self.chr != -1 {
   222  		if self.chr == ']' {
   223  			break
   224  		} else if self.chr == '\\' {
   225  			self.read()
   226  			self.scanEscape(true)
   227  			continue
   228  		}
   229  		self.pass()
   230  	}
   231  	if self.chr != ']' {
   232  		self.error(true, "Unterminated character class")
   233  		return
   234  	}
   235  	self.pass()
   236  }
   237  
   238  // \...
   239  func (self *_RegExp_parser) scanEscape(inClass bool) {
   240  	offset := self.chrOffset
   241  
   242  	var length, base uint32
   243  	switch self.chr {
   244  
   245  	case '0', '1', '2', '3', '4', '5', '6', '7':
   246  		var value int64
   247  		size := 0
   248  		for {
   249  			digit := int64(digitValue(self.chr))
   250  			if digit >= 8 {
   251  				// Not a valid digit
   252  				break
   253  			}
   254  			value = value*8 + digit
   255  			self.read()
   256  			size += 1
   257  		}
   258  		if size == 1 { // The number of characters read
   259  			if value != 0 {
   260  				// An invalid backreference
   261  				self.error(false, "re2: Invalid \\%d <backreference>", value)
   262  				return
   263  			}
   264  			self.passString(offset-1, self.chrOffset)
   265  			return
   266  		}
   267  		tmp := []byte{'\\', 'x', '0', 0}
   268  		if value >= 16 {
   269  			tmp = tmp[0:2]
   270  		} else {
   271  			tmp = tmp[0:3]
   272  		}
   273  		tmp = strconv.AppendInt(tmp, value, 16)
   274  		self.write(tmp)
   275  		return
   276  
   277  	case '8', '9':
   278  		self.read()
   279  		self.error(false, "re2: Invalid \\%s <backreference>", self.str[offset:self.chrOffset])
   280  		return
   281  
   282  	case 'x':
   283  		self.read()
   284  		length, base = 2, 16
   285  
   286  	case 'u':
   287  		self.read()
   288  		if self.chr == '{' {
   289  			self.read()
   290  			length, base = 0, 16
   291  		} else {
   292  			length, base = 4, 16
   293  		}
   294  
   295  	case 'b':
   296  		if inClass {
   297  			self.write([]byte{'\\', 'x', '0', '8'})
   298  			self.read()
   299  			return
   300  		}
   301  		fallthrough
   302  
   303  	case 'B':
   304  		fallthrough
   305  
   306  	case 'd', 'D', 'w', 'W':
   307  		// This is slightly broken, because ECMAScript
   308  		// includes \v in \s, \S, while re2 does not
   309  		fallthrough
   310  
   311  	case '\\':
   312  		fallthrough
   313  
   314  	case 'f', 'n', 'r', 't', 'v':
   315  		self.passString(offset-1, self.offset)
   316  		self.read()
   317  		return
   318  
   319  	case 'c':
   320  		self.read()
   321  		var value int64
   322  		if 'a' <= self.chr && self.chr <= 'z' {
   323  			value = int64(self.chr - 'a' + 1)
   324  		} else if 'A' <= self.chr && self.chr <= 'Z' {
   325  			value = int64(self.chr - 'A' + 1)
   326  		} else {
   327  			self.writeByte('c')
   328  			return
   329  		}
   330  		tmp := []byte{'\\', 'x', '0', 0}
   331  		if value >= 16 {
   332  			tmp = tmp[0:2]
   333  		} else {
   334  			tmp = tmp[0:3]
   335  		}
   336  		tmp = strconv.AppendInt(tmp, value, 16)
   337  		self.write(tmp)
   338  		self.read()
   339  		return
   340  	case 's':
   341  		if inClass {
   342  			self.writeString(WhitespaceChars)
   343  		} else {
   344  			self.writeString("[" + WhitespaceChars + "]")
   345  		}
   346  		self.read()
   347  		return
   348  	case 'S':
   349  		if inClass {
   350  			self.error(false, "S in class")
   351  			return
   352  		} else {
   353  			self.writeString("[^" + WhitespaceChars + "]")
   354  		}
   355  		self.read()
   356  		return
   357  	default:
   358  		// $ is an identifier character, so we have to have
   359  		// a special case for it here
   360  		if self.chr == '$' || self.chr < utf8.RuneSelf && !isIdentifierPart(self.chr) {
   361  			// A non-identifier character needs escaping
   362  			self.passString(offset-1, self.offset)
   363  			self.read()
   364  			return
   365  		}
   366  		// Unescape the character for re2
   367  		self.pass()
   368  		return
   369  	}
   370  
   371  	// Otherwise, we're a \u.... or \x...
   372  	valueOffset := self.chrOffset
   373  
   374  	if length > 0 {
   375  		for length := length; length > 0; length-- {
   376  			digit := uint32(digitValue(self.chr))
   377  			if digit >= base {
   378  				// Not a valid digit
   379  				goto skip
   380  			}
   381  			self.read()
   382  		}
   383  	} else {
   384  		for self.chr != '}' && self.chr != -1 {
   385  			digit := uint32(digitValue(self.chr))
   386  			if digit >= base {
   387  				// Not a valid digit
   388  				goto skip
   389  			}
   390  			self.read()
   391  		}
   392  	}
   393  
   394  	if length == 4 || length == 0 {
   395  		self.write([]byte{
   396  			'\\',
   397  			'x',
   398  			'{',
   399  		})
   400  		self.passString(valueOffset, self.chrOffset)
   401  		if length != 0 {
   402  			self.writeByte('}')
   403  		}
   404  	} else if length == 2 {
   405  		self.passString(offset-1, valueOffset+2)
   406  	} else {
   407  		// Should never, ever get here...
   408  		self.error(true, "re2: Illegal branch in scanEscape")
   409  		return
   410  	}
   411  
   412  	return
   413  
   414  skip:
   415  	self.passString(offset, self.chrOffset)
   416  }
   417  
   418  func (self *_RegExp_parser) pass() {
   419  	if self.passOffset == self.chrOffset {
   420  		self.passOffset = self.offset
   421  	} else {
   422  		if self.passOffset != -1 {
   423  			self.stopPassing()
   424  		}
   425  		if self.chr != -1 {
   426  			self.goRegexp.WriteRune(self.chr)
   427  		}
   428  	}
   429  	self.read()
   430  }
   431  
   432  func (self *_RegExp_parser) passString(start, end int) {
   433  	if self.passOffset == start {
   434  		self.passOffset = end
   435  		return
   436  	}
   437  	if self.passOffset != -1 {
   438  		self.stopPassing()
   439  	}
   440  	self.goRegexp.WriteString(self.str[start:end])
   441  }
   442  
   443  func (self *_RegExp_parser) error(fatal bool, msg string, msgValues ...interface{}) {
   444  	if self.err != nil {
   445  		return
   446  	}
   447  	e := regexpParseError{
   448  		offset: self.offset,
   449  		err:    fmt.Sprintf(msg, msgValues...),
   450  	}
   451  	if fatal {
   452  		self.err = RegexpSyntaxError{e}
   453  	} else {
   454  		self.err = RegexpErrorIncompatible{e}
   455  	}
   456  	self.offset = self.length
   457  	self.chr = -1
   458  }