github.com/dop251/goja@v0.0.0-20240220182346-e401ed450204/regexp.go (about) 1 package goja 2 3 import ( 4 "fmt" 5 "github.com/dlclark/regexp2" 6 "github.com/dop251/goja/unistring" 7 "io" 8 "regexp" 9 "sort" 10 "strings" 11 "unicode/utf16" 12 ) 13 14 type regexp2MatchCache struct { 15 target String 16 runes []rune 17 posMap []int 18 } 19 20 // Not goroutine-safe. Use regexp2Wrapper.clone() 21 type regexp2Wrapper struct { 22 rx *regexp2.Regexp 23 cache *regexp2MatchCache 24 } 25 26 type regexpWrapper regexp.Regexp 27 28 type positionMapItem struct { 29 src, dst int 30 } 31 type positionMap []positionMapItem 32 33 func (m positionMap) get(src int) int { 34 if src <= 0 { 35 return src 36 } 37 res := sort.Search(len(m), func(n int) bool { return m[n].src >= src }) 38 if res >= len(m) || m[res].src != src { 39 panic("index not found") 40 } 41 return m[res].dst 42 } 43 44 type arrayRuneReader struct { 45 runes []rune 46 pos int 47 } 48 49 func (rd *arrayRuneReader) ReadRune() (r rune, size int, err error) { 50 if rd.pos < len(rd.runes) { 51 r = rd.runes[rd.pos] 52 size = 1 53 rd.pos++ 54 } else { 55 err = io.EOF 56 } 57 return 58 } 59 60 // Not goroutine-safe. Use regexpPattern.clone() 61 type regexpPattern struct { 62 src string 63 64 global, ignoreCase, multiline, sticky, unicode bool 65 66 regexpWrapper *regexpWrapper 67 regexp2Wrapper *regexp2Wrapper 68 } 69 70 func compileRegexp2(src string, multiline, ignoreCase bool) (*regexp2Wrapper, error) { 71 var opts regexp2.RegexOptions = regexp2.ECMAScript 72 if multiline { 73 opts |= regexp2.Multiline 74 } 75 if ignoreCase { 76 opts |= regexp2.IgnoreCase 77 } 78 regexp2Pattern, err1 := regexp2.Compile(src, opts) 79 if err1 != nil { 80 return nil, fmt.Errorf("Invalid regular expression (regexp2): %s (%v)", src, err1) 81 } 82 83 return ®exp2Wrapper{rx: regexp2Pattern}, nil 84 } 85 86 func (p *regexpPattern) createRegexp2() { 87 if p.regexp2Wrapper != nil { 88 return 89 } 90 rx, err := compileRegexp2(p.src, p.multiline, p.ignoreCase) 91 if err != nil { 92 // At this point the regexp should have been successfully converted to re2, if it fails now, it's a bug. 93 panic(err) 94 } 95 p.regexp2Wrapper = rx 96 } 97 98 func buildUTF8PosMap(s unicodeString) (positionMap, string) { 99 pm := make(positionMap, 0, s.Length()) 100 rd := s.Reader() 101 sPos, utf8Pos := 0, 0 102 var sb strings.Builder 103 for { 104 r, size, err := rd.ReadRune() 105 if err == io.EOF { 106 break 107 } 108 if err != nil { 109 // the string contains invalid UTF-16, bailing out 110 return nil, "" 111 } 112 utf8Size, _ := sb.WriteRune(r) 113 sPos += size 114 utf8Pos += utf8Size 115 pm = append(pm, positionMapItem{src: utf8Pos, dst: sPos}) 116 } 117 return pm, sb.String() 118 } 119 120 func (p *regexpPattern) findSubmatchIndex(s String, start int) []int { 121 if p.regexpWrapper == nil { 122 return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode, p.global || p.sticky) 123 } 124 if start != 0 { 125 // Unfortunately Go's regexp library does not allow starting from an arbitrary position. 126 // If we just drop the first _start_ characters of the string the assertions (^, $, \b and \B) will not 127 // work correctly. 128 p.createRegexp2() 129 return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode, p.global || p.sticky) 130 } 131 return p.regexpWrapper.findSubmatchIndex(s, p.unicode) 132 } 133 134 func (p *regexpPattern) findAllSubmatchIndex(s String, start int, limit int, sticky bool) [][]int { 135 if p.regexpWrapper == nil { 136 return p.regexp2Wrapper.findAllSubmatchIndex(s, start, limit, sticky, p.unicode) 137 } 138 if start == 0 { 139 a, u := devirtualizeString(s) 140 if u == nil { 141 return p.regexpWrapper.findAllSubmatchIndex(string(a), limit, sticky) 142 } 143 if limit == 1 { 144 result := p.regexpWrapper.findSubmatchIndexUnicode(u, p.unicode) 145 if result == nil { 146 return nil 147 } 148 return [][]int{result} 149 } 150 // Unfortunately Go's regexp library lacks FindAllReaderSubmatchIndex(), so we have to use a UTF-8 string as an 151 // input. 152 if p.unicode { 153 // Try to convert s to UTF-8. If it does not contain any invalid UTF-16 we can do the matching in UTF-8. 154 pm, str := buildUTF8PosMap(u) 155 if pm != nil { 156 res := p.regexpWrapper.findAllSubmatchIndex(str, limit, sticky) 157 for _, result := range res { 158 for i, idx := range result { 159 result[i] = pm.get(idx) 160 } 161 } 162 return res 163 } 164 } 165 } 166 167 p.createRegexp2() 168 return p.regexp2Wrapper.findAllSubmatchIndex(s, start, limit, sticky, p.unicode) 169 } 170 171 // clone creates a copy of the regexpPattern which can be used concurrently. 172 func (p *regexpPattern) clone() *regexpPattern { 173 ret := ®expPattern{ 174 src: p.src, 175 global: p.global, 176 ignoreCase: p.ignoreCase, 177 multiline: p.multiline, 178 sticky: p.sticky, 179 unicode: p.unicode, 180 } 181 if p.regexpWrapper != nil { 182 ret.regexpWrapper = p.regexpWrapper.clone() 183 } 184 if p.regexp2Wrapper != nil { 185 ret.regexp2Wrapper = p.regexp2Wrapper.clone() 186 } 187 return ret 188 } 189 190 type regexpObject struct { 191 baseObject 192 pattern *regexpPattern 193 source String 194 195 standard bool 196 } 197 198 func (r *regexp2Wrapper) findSubmatchIndex(s String, start int, fullUnicode, doCache bool) (result []int) { 199 if fullUnicode { 200 return r.findSubmatchIndexUnicode(s, start, doCache) 201 } 202 return r.findSubmatchIndexUTF16(s, start, doCache) 203 } 204 205 func (r *regexp2Wrapper) findUTF16Cached(s String, start int, doCache bool) (match *regexp2.Match, runes []rune, err error) { 206 wrapped := r.rx 207 cache := r.cache 208 if cache != nil && cache.posMap == nil && cache.target.SameAs(s) { 209 runes = cache.runes 210 } else { 211 runes = s.utf16Runes() 212 cache = nil 213 } 214 match, err = wrapped.FindRunesMatchStartingAt(runes, start) 215 if doCache && match != nil && err == nil { 216 if cache == nil { 217 if r.cache == nil { 218 r.cache = new(regexp2MatchCache) 219 } 220 *r.cache = regexp2MatchCache{ 221 target: s, 222 runes: runes, 223 } 224 } 225 } else { 226 r.cache = nil 227 } 228 return 229 } 230 231 func (r *regexp2Wrapper) findSubmatchIndexUTF16(s String, start int, doCache bool) (result []int) { 232 match, _, err := r.findUTF16Cached(s, start, doCache) 233 if err != nil { 234 return 235 } 236 237 if match == nil { 238 return 239 } 240 groups := match.Groups() 241 242 result = make([]int, 0, len(groups)<<1) 243 for _, group := range groups { 244 if len(group.Captures) > 0 { 245 result = append(result, group.Index, group.Index+group.Length) 246 } else { 247 result = append(result, -1, 0) 248 } 249 } 250 return 251 } 252 253 func (r *regexp2Wrapper) findUnicodeCached(s String, start int, doCache bool) (match *regexp2.Match, posMap []int, err error) { 254 var ( 255 runes []rune 256 mappedStart int 257 splitPair bool 258 savedRune rune 259 ) 260 wrapped := r.rx 261 cache := r.cache 262 if cache != nil && cache.posMap != nil && cache.target.SameAs(s) { 263 runes, posMap = cache.runes, cache.posMap 264 mappedStart, splitPair = posMapReverseLookup(posMap, start) 265 } else { 266 posMap, runes, mappedStart, splitPair = buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader()}, s.Length(), start) 267 cache = nil 268 } 269 if splitPair { 270 // temporarily set the rune at mappedStart to the second code point of the pair 271 _, second := utf16.EncodeRune(runes[mappedStart]) 272 savedRune, runes[mappedStart] = runes[mappedStart], second 273 } 274 match, err = wrapped.FindRunesMatchStartingAt(runes, mappedStart) 275 if doCache && match != nil && err == nil { 276 if splitPair { 277 runes[mappedStart] = savedRune 278 } 279 if cache == nil { 280 if r.cache == nil { 281 r.cache = new(regexp2MatchCache) 282 } 283 *r.cache = regexp2MatchCache{ 284 target: s, 285 runes: runes, 286 posMap: posMap, 287 } 288 } 289 } else { 290 r.cache = nil 291 } 292 293 return 294 } 295 296 func (r *regexp2Wrapper) findSubmatchIndexUnicode(s String, start int, doCache bool) (result []int) { 297 match, posMap, err := r.findUnicodeCached(s, start, doCache) 298 if match == nil || err != nil { 299 return 300 } 301 302 groups := match.Groups() 303 304 result = make([]int, 0, len(groups)<<1) 305 for _, group := range groups { 306 if len(group.Captures) > 0 { 307 result = append(result, posMap[group.Index], posMap[group.Index+group.Length]) 308 } else { 309 result = append(result, -1, 0) 310 } 311 } 312 return 313 } 314 315 func (r *regexp2Wrapper) findAllSubmatchIndexUTF16(s String, start, limit int, sticky bool) [][]int { 316 wrapped := r.rx 317 match, runes, err := r.findUTF16Cached(s, start, false) 318 if match == nil || err != nil { 319 return nil 320 } 321 if limit < 0 { 322 limit = len(runes) + 1 323 } 324 results := make([][]int, 0, limit) 325 for match != nil { 326 groups := match.Groups() 327 328 result := make([]int, 0, len(groups)<<1) 329 330 for _, group := range groups { 331 if len(group.Captures) > 0 { 332 startPos := group.Index 333 endPos := group.Index + group.Length 334 result = append(result, startPos, endPos) 335 } else { 336 result = append(result, -1, 0) 337 } 338 } 339 340 if sticky && len(result) > 1 { 341 if result[0] != start { 342 break 343 } 344 start = result[1] 345 } 346 347 results = append(results, result) 348 limit-- 349 if limit <= 0 { 350 break 351 } 352 match, err = wrapped.FindNextMatch(match) 353 if err != nil { 354 return nil 355 } 356 } 357 return results 358 } 359 360 func buildPosMap(rd io.RuneReader, l, start int) (posMap []int, runes []rune, mappedStart int, splitPair bool) { 361 posMap = make([]int, 0, l+1) 362 curPos := 0 363 runes = make([]rune, 0, l) 364 startFound := false 365 for { 366 if !startFound { 367 if curPos == start { 368 mappedStart = len(runes) 369 startFound = true 370 } 371 if curPos > start { 372 // start position splits a surrogate pair 373 mappedStart = len(runes) - 1 374 splitPair = true 375 startFound = true 376 } 377 } 378 rn, size, err := rd.ReadRune() 379 if err != nil { 380 break 381 } 382 runes = append(runes, rn) 383 posMap = append(posMap, curPos) 384 curPos += size 385 } 386 posMap = append(posMap, curPos) 387 return 388 } 389 390 func posMapReverseLookup(posMap []int, pos int) (int, bool) { 391 mapped := sort.SearchInts(posMap, pos) 392 if mapped < len(posMap) && posMap[mapped] != pos { 393 return mapped - 1, true 394 } 395 return mapped, false 396 } 397 398 func (r *regexp2Wrapper) findAllSubmatchIndexUnicode(s unicodeString, start, limit int, sticky bool) [][]int { 399 wrapped := r.rx 400 if limit < 0 { 401 limit = len(s) + 1 402 } 403 results := make([][]int, 0, limit) 404 match, posMap, err := r.findUnicodeCached(s, start, false) 405 if err != nil { 406 return nil 407 } 408 for match != nil { 409 groups := match.Groups() 410 411 result := make([]int, 0, len(groups)<<1) 412 413 for _, group := range groups { 414 if len(group.Captures) > 0 { 415 start := posMap[group.Index] 416 end := posMap[group.Index+group.Length] 417 result = append(result, start, end) 418 } else { 419 result = append(result, -1, 0) 420 } 421 } 422 423 if sticky && len(result) > 1 { 424 if result[0] != start { 425 break 426 } 427 start = result[1] 428 } 429 430 results = append(results, result) 431 match, err = wrapped.FindNextMatch(match) 432 if err != nil { 433 return nil 434 } 435 } 436 return results 437 } 438 439 func (r *regexp2Wrapper) findAllSubmatchIndex(s String, start, limit int, sticky, fullUnicode bool) [][]int { 440 a, u := devirtualizeString(s) 441 if u != nil { 442 if fullUnicode { 443 return r.findAllSubmatchIndexUnicode(u, start, limit, sticky) 444 } 445 return r.findAllSubmatchIndexUTF16(u, start, limit, sticky) 446 } 447 return r.findAllSubmatchIndexUTF16(a, start, limit, sticky) 448 } 449 450 func (r *regexp2Wrapper) clone() *regexp2Wrapper { 451 return ®exp2Wrapper{ 452 rx: r.rx, 453 } 454 } 455 456 func (r *regexpWrapper) findAllSubmatchIndex(s string, limit int, sticky bool) (results [][]int) { 457 wrapped := (*regexp.Regexp)(r) 458 results = wrapped.FindAllStringSubmatchIndex(s, limit) 459 pos := 0 460 if sticky { 461 for i, result := range results { 462 if len(result) > 1 { 463 if result[0] != pos { 464 return results[:i] 465 } 466 pos = result[1] 467 } 468 } 469 } 470 return 471 } 472 473 func (r *regexpWrapper) findSubmatchIndex(s String, fullUnicode bool) []int { 474 a, u := devirtualizeString(s) 475 if u != nil { 476 return r.findSubmatchIndexUnicode(u, fullUnicode) 477 } 478 return r.findSubmatchIndexASCII(string(a)) 479 } 480 481 func (r *regexpWrapper) findSubmatchIndexASCII(s string) []int { 482 wrapped := (*regexp.Regexp)(r) 483 return wrapped.FindStringSubmatchIndex(s) 484 } 485 486 func (r *regexpWrapper) findSubmatchIndexUnicode(s unicodeString, fullUnicode bool) (result []int) { 487 wrapped := (*regexp.Regexp)(r) 488 if fullUnicode { 489 posMap, runes, _, _ := buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader()}, s.Length(), 0) 490 res := wrapped.FindReaderSubmatchIndex(&arrayRuneReader{runes: runes}) 491 for i, item := range res { 492 if item >= 0 { 493 res[i] = posMap[item] 494 } 495 } 496 return res 497 } 498 return wrapped.FindReaderSubmatchIndex(s.utf16RuneReader()) 499 } 500 501 func (r *regexpWrapper) clone() *regexpWrapper { 502 return r 503 } 504 505 func (r *regexpObject) execResultToArray(target String, result []int) Value { 506 captureCount := len(result) >> 1 507 valueArray := make([]Value, captureCount) 508 matchIndex := result[0] 509 valueArray[0] = target.Substring(result[0], result[1]) 510 lowerBound := 0 511 for index := 1; index < captureCount; index++ { 512 offset := index << 1 513 if result[offset] >= 0 && result[offset+1] >= lowerBound { 514 valueArray[index] = target.Substring(result[offset], result[offset+1]) 515 lowerBound = result[offset] 516 } else { 517 valueArray[index] = _undefined 518 } 519 } 520 match := r.val.runtime.newArrayValues(valueArray) 521 match.self.setOwnStr("input", target, false) 522 match.self.setOwnStr("index", intToValue(int64(matchIndex)), false) 523 return match 524 } 525 526 func (r *regexpObject) getLastIndex() int64 { 527 lastIndex := toLength(r.getStr("lastIndex", nil)) 528 if !r.pattern.global && !r.pattern.sticky { 529 return 0 530 } 531 return lastIndex 532 } 533 534 func (r *regexpObject) updateLastIndex(index int64, firstResult, lastResult []int) bool { 535 if r.pattern.sticky { 536 if firstResult == nil || int64(firstResult[0]) != index { 537 r.setOwnStr("lastIndex", intToValue(0), true) 538 return false 539 } 540 } else { 541 if firstResult == nil { 542 if r.pattern.global { 543 r.setOwnStr("lastIndex", intToValue(0), true) 544 } 545 return false 546 } 547 } 548 549 if r.pattern.global || r.pattern.sticky { 550 r.setOwnStr("lastIndex", intToValue(int64(lastResult[1])), true) 551 } 552 return true 553 } 554 555 func (r *regexpObject) execRegexp(target String) (match bool, result []int) { 556 index := r.getLastIndex() 557 if index >= 0 && index <= int64(target.Length()) { 558 result = r.pattern.findSubmatchIndex(target, int(index)) 559 } 560 match = r.updateLastIndex(index, result, result) 561 return 562 } 563 564 func (r *regexpObject) exec(target String) Value { 565 match, result := r.execRegexp(target) 566 if match { 567 return r.execResultToArray(target, result) 568 } 569 return _null 570 } 571 572 func (r *regexpObject) test(target String) bool { 573 match, _ := r.execRegexp(target) 574 return match 575 } 576 577 func (r *regexpObject) clone() *regexpObject { 578 r1 := r.val.runtime.newRegexpObject(r.prototype) 579 r1.source = r.source 580 r1.pattern = r.pattern 581 582 return r1 583 } 584 585 func (r *regexpObject) init() { 586 r.baseObject.init() 587 r.standard = true 588 r._putProp("lastIndex", intToValue(0), true, false, false) 589 } 590 591 func (r *regexpObject) setProto(proto *Object, throw bool) bool { 592 res := r.baseObject.setProto(proto, throw) 593 if res { 594 r.standard = false 595 } 596 return res 597 } 598 599 func (r *regexpObject) defineOwnPropertyStr(name unistring.String, desc PropertyDescriptor, throw bool) bool { 600 res := r.baseObject.defineOwnPropertyStr(name, desc, throw) 601 if res { 602 r.standard = false 603 } 604 return res 605 } 606 607 func (r *regexpObject) defineOwnPropertySym(name *Symbol, desc PropertyDescriptor, throw bool) bool { 608 res := r.baseObject.defineOwnPropertySym(name, desc, throw) 609 if res && r.standard { 610 switch name { 611 case SymMatch, SymMatchAll, SymSearch, SymSplit, SymReplace: 612 r.standard = false 613 } 614 } 615 return res 616 } 617 618 func (r *regexpObject) deleteStr(name unistring.String, throw bool) bool { 619 res := r.baseObject.deleteStr(name, throw) 620 if res { 621 r.standard = false 622 } 623 return res 624 } 625 626 func (r *regexpObject) setOwnStr(name unistring.String, value Value, throw bool) bool { 627 res := r.baseObject.setOwnStr(name, value, throw) 628 if res && r.standard && name == "exec" { 629 r.standard = false 630 } 631 return res 632 } 633 634 func (r *regexpObject) setOwnSym(name *Symbol, value Value, throw bool) bool { 635 res := r.baseObject.setOwnSym(name, value, throw) 636 if res && r.standard { 637 switch name { 638 case SymMatch, SymMatchAll, SymSearch, SymSplit, SymReplace: 639 r.standard = false 640 } 641 } 642 return res 643 }