github.com/nuvolaris/goja@v0.0.0-20230825100449-967811910c6d/parser/regexp.go (about) 1 package parser 2 3 import ( 4 "fmt" 5 "strconv" 6 "strings" 7 "unicode/utf8" 8 ) 9 10 const ( 11 WhitespaceChars = " \f\n\r\t\v\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000\ufeff" 12 Re2Dot = "[^\r\n\u2028\u2029]" 13 ) 14 15 type regexpParseError struct { 16 offset int 17 err string 18 } 19 20 type RegexpErrorIncompatible struct { 21 regexpParseError 22 } 23 type RegexpSyntaxError struct { 24 regexpParseError 25 } 26 27 func (s regexpParseError) Error() string { 28 return s.err 29 } 30 31 type _RegExp_parser struct { 32 str string 33 length int 34 35 chr rune // The current character 36 chrOffset int // The offset of current character 37 offset int // The offset after current character (may be greater than 1) 38 39 err error 40 41 goRegexp strings.Builder 42 passOffset int 43 } 44 45 // TransformRegExp transforms a JavaScript pattern into a Go "regexp" pattern. 46 // 47 // re2 (Go) cannot do backtracking, so the presence of a lookahead (?=) (?!) or 48 // backreference (\1, \2, ...) will cause an error. 49 // 50 // re2 (Go) has a different definition for \s: [\t\n\f\r ]. 51 // The JavaScript definition, on the other hand, also includes \v, Unicode "Separator, Space", etc. 52 // 53 // If the pattern is valid, but incompatible (contains a lookahead or backreference), 54 // then this function returns an empty string an error of type RegexpErrorIncompatible. 55 // 56 // If the pattern is invalid (not valid even in JavaScript), then this function 57 // returns an empty string and a generic error. 58 func TransformRegExp(pattern string) (transformed string, err error) { 59 60 if pattern == "" { 61 return "", nil 62 } 63 64 parser := _RegExp_parser{ 65 str: pattern, 66 length: len(pattern), 67 } 68 err = parser.parse() 69 if err != nil { 70 return "", err 71 } 72 73 return parser.ResultString(), nil 74 } 75 76 func (self *_RegExp_parser) ResultString() string { 77 if self.passOffset != -1 { 78 return self.str[:self.passOffset] 79 } 80 return self.goRegexp.String() 81 } 82 83 func (self *_RegExp_parser) parse() (err error) { 84 self.read() // Pull in the first character 85 self.scan() 86 return self.err 87 } 88 89 func (self *_RegExp_parser) read() { 90 if self.offset < self.length { 91 self.chrOffset = self.offset 92 chr, width := rune(self.str[self.offset]), 1 93 if chr >= utf8.RuneSelf { // !ASCII 94 chr, width = utf8.DecodeRuneInString(self.str[self.offset:]) 95 if chr == utf8.RuneError && width == 1 { 96 self.error(true, "Invalid UTF-8 character") 97 return 98 } 99 } 100 self.offset += width 101 self.chr = chr 102 } else { 103 self.chrOffset = self.length 104 self.chr = -1 // EOF 105 } 106 } 107 108 func (self *_RegExp_parser) stopPassing() { 109 self.goRegexp.Grow(3 * len(self.str) / 2) 110 self.goRegexp.WriteString(self.str[:self.passOffset]) 111 self.passOffset = -1 112 } 113 114 func (self *_RegExp_parser) write(p []byte) { 115 if self.passOffset != -1 { 116 self.stopPassing() 117 } 118 self.goRegexp.Write(p) 119 } 120 121 func (self *_RegExp_parser) writeByte(b byte) { 122 if self.passOffset != -1 { 123 self.stopPassing() 124 } 125 self.goRegexp.WriteByte(b) 126 } 127 128 func (self *_RegExp_parser) writeString(s string) { 129 if self.passOffset != -1 { 130 self.stopPassing() 131 } 132 self.goRegexp.WriteString(s) 133 } 134 135 func (self *_RegExp_parser) scan() { 136 for self.chr != -1 { 137 switch self.chr { 138 case '\\': 139 self.read() 140 self.scanEscape(false) 141 case '(': 142 self.pass() 143 self.scanGroup() 144 case '[': 145 self.scanBracket() 146 case ')': 147 self.error(true, "Unmatched ')'") 148 return 149 case '.': 150 self.writeString(Re2Dot) 151 self.read() 152 default: 153 self.pass() 154 } 155 } 156 } 157 158 // (...) 159 func (self *_RegExp_parser) scanGroup() { 160 str := self.str[self.chrOffset:] 161 if len(str) > 1 { // A possibility of (?= or (?! 162 if str[0] == '?' { 163 ch := str[1] 164 switch { 165 case ch == '=' || ch == '!': 166 self.error(false, "re2: Invalid (%s) <lookahead>", self.str[self.chrOffset:self.chrOffset+2]) 167 return 168 case ch == '<': 169 self.error(false, "re2: Invalid (%s) <lookbehind>", self.str[self.chrOffset:self.chrOffset+2]) 170 return 171 case ch != ':': 172 self.error(true, "Invalid group") 173 return 174 } 175 } 176 } 177 for self.chr != -1 && self.chr != ')' { 178 switch self.chr { 179 case '\\': 180 self.read() 181 self.scanEscape(false) 182 case '(': 183 self.pass() 184 self.scanGroup() 185 case '[': 186 self.scanBracket() 187 case '.': 188 self.writeString(Re2Dot) 189 self.read() 190 default: 191 self.pass() 192 continue 193 } 194 } 195 if self.chr != ')' { 196 self.error(true, "Unterminated group") 197 return 198 } 199 self.pass() 200 } 201 202 // [...] 203 func (self *_RegExp_parser) scanBracket() { 204 str := self.str[self.chrOffset:] 205 if strings.HasPrefix(str, "[]") { 206 // [] -- Empty character class 207 self.writeString("[^\u0000-\U0001FFFF]") 208 self.offset += 1 209 self.read() 210 return 211 } 212 213 if strings.HasPrefix(str, "[^]") { 214 self.writeString("[\u0000-\U0001FFFF]") 215 self.offset += 2 216 self.read() 217 return 218 } 219 220 self.pass() 221 for self.chr != -1 { 222 if self.chr == ']' { 223 break 224 } else if self.chr == '\\' { 225 self.read() 226 self.scanEscape(true) 227 continue 228 } 229 self.pass() 230 } 231 if self.chr != ']' { 232 self.error(true, "Unterminated character class") 233 return 234 } 235 self.pass() 236 } 237 238 // \... 239 func (self *_RegExp_parser) scanEscape(inClass bool) { 240 offset := self.chrOffset 241 242 var length, base uint32 243 switch self.chr { 244 245 case '0', '1', '2', '3', '4', '5', '6', '7': 246 var value int64 247 size := 0 248 for { 249 digit := int64(digitValue(self.chr)) 250 if digit >= 8 { 251 // Not a valid digit 252 break 253 } 254 value = value*8 + digit 255 self.read() 256 size += 1 257 } 258 if size == 1 { // The number of characters read 259 if value != 0 { 260 // An invalid backreference 261 self.error(false, "re2: Invalid \\%d <backreference>", value) 262 return 263 } 264 self.passString(offset-1, self.chrOffset) 265 return 266 } 267 tmp := []byte{'\\', 'x', '0', 0} 268 if value >= 16 { 269 tmp = tmp[0:2] 270 } else { 271 tmp = tmp[0:3] 272 } 273 tmp = strconv.AppendInt(tmp, value, 16) 274 self.write(tmp) 275 return 276 277 case '8', '9': 278 self.read() 279 self.error(false, "re2: Invalid \\%s <backreference>", self.str[offset:self.chrOffset]) 280 return 281 282 case 'x': 283 self.read() 284 length, base = 2, 16 285 286 case 'u': 287 self.read() 288 if self.chr == '{' { 289 self.read() 290 length, base = 0, 16 291 } else { 292 length, base = 4, 16 293 } 294 295 case 'b': 296 if inClass { 297 self.write([]byte{'\\', 'x', '0', '8'}) 298 self.read() 299 return 300 } 301 fallthrough 302 303 case 'B': 304 fallthrough 305 306 case 'd', 'D', 'w', 'W': 307 // This is slightly broken, because ECMAScript 308 // includes \v in \s, \S, while re2 does not 309 fallthrough 310 311 case '\\': 312 fallthrough 313 314 case 'f', 'n', 'r', 't', 'v': 315 self.passString(offset-1, self.offset) 316 self.read() 317 return 318 319 case 'c': 320 self.read() 321 var value int64 322 if 'a' <= self.chr && self.chr <= 'z' { 323 value = int64(self.chr - 'a' + 1) 324 } else if 'A' <= self.chr && self.chr <= 'Z' { 325 value = int64(self.chr - 'A' + 1) 326 } else { 327 self.writeByte('c') 328 return 329 } 330 tmp := []byte{'\\', 'x', '0', 0} 331 if value >= 16 { 332 tmp = tmp[0:2] 333 } else { 334 tmp = tmp[0:3] 335 } 336 tmp = strconv.AppendInt(tmp, value, 16) 337 self.write(tmp) 338 self.read() 339 return 340 case 's': 341 if inClass { 342 self.writeString(WhitespaceChars) 343 } else { 344 self.writeString("[" + WhitespaceChars + "]") 345 } 346 self.read() 347 return 348 case 'S': 349 if inClass { 350 self.error(false, "S in class") 351 return 352 } else { 353 self.writeString("[^" + WhitespaceChars + "]") 354 } 355 self.read() 356 return 357 default: 358 // $ is an identifier character, so we have to have 359 // a special case for it here 360 if self.chr == '$' || self.chr < utf8.RuneSelf && !isIdentifierPart(self.chr) { 361 // A non-identifier character needs escaping 362 self.passString(offset-1, self.offset) 363 self.read() 364 return 365 } 366 // Unescape the character for re2 367 self.pass() 368 return 369 } 370 371 // Otherwise, we're a \u.... or \x... 372 valueOffset := self.chrOffset 373 374 if length > 0 { 375 for length := length; length > 0; length-- { 376 digit := uint32(digitValue(self.chr)) 377 if digit >= base { 378 // Not a valid digit 379 goto skip 380 } 381 self.read() 382 } 383 } else { 384 for self.chr != '}' && self.chr != -1 { 385 digit := uint32(digitValue(self.chr)) 386 if digit >= base { 387 // Not a valid digit 388 goto skip 389 } 390 self.read() 391 } 392 } 393 394 if length == 4 || length == 0 { 395 self.write([]byte{ 396 '\\', 397 'x', 398 '{', 399 }) 400 self.passString(valueOffset, self.chrOffset) 401 if length != 0 { 402 self.writeByte('}') 403 } 404 } else if length == 2 { 405 self.passString(offset-1, valueOffset+2) 406 } else { 407 // Should never, ever get here... 408 self.error(true, "re2: Illegal branch in scanEscape") 409 return 410 } 411 412 return 413 414 skip: 415 self.passString(offset, self.chrOffset) 416 } 417 418 func (self *_RegExp_parser) pass() { 419 if self.passOffset == self.chrOffset { 420 self.passOffset = self.offset 421 } else { 422 if self.passOffset != -1 { 423 self.stopPassing() 424 } 425 if self.chr != -1 { 426 self.goRegexp.WriteRune(self.chr) 427 } 428 } 429 self.read() 430 } 431 432 func (self *_RegExp_parser) passString(start, end int) { 433 if self.passOffset == start { 434 self.passOffset = end 435 return 436 } 437 if self.passOffset != -1 { 438 self.stopPassing() 439 } 440 self.goRegexp.WriteString(self.str[start:end]) 441 } 442 443 func (self *_RegExp_parser) error(fatal bool, msg string, msgValues ...interface{}) { 444 if self.err != nil { 445 return 446 } 447 e := regexpParseError{ 448 offset: self.offset, 449 err: fmt.Sprintf(msg, msgValues...), 450 } 451 if fatal { 452 self.err = RegexpSyntaxError{e} 453 } else { 454 self.err = RegexpErrorIncompatible{e} 455 } 456 self.offset = self.length 457 self.chr = -1 458 }