github.com/nuvolaris/goja@v0.0.0-20230825100449-967811910c6d/regexp.go (about) 1 package goja 2 3 import ( 4 "fmt" 5 "io" 6 "regexp" 7 "sort" 8 "strings" 9 "unicode/utf16" 10 11 "github.com/dlclark/regexp2" 12 "github.com/nuvolaris/goja/unistring" 13 ) 14 15 type regexp2MatchCache struct { 16 target String 17 runes []rune 18 posMap []int 19 } 20 21 // Not goroutine-safe. Use regexp2Wrapper.clone() 22 type regexp2Wrapper struct { 23 rx *regexp2.Regexp 24 cache *regexp2MatchCache 25 } 26 27 type regexpWrapper regexp.Regexp 28 29 type positionMapItem struct { 30 src, dst int 31 } 32 type positionMap []positionMapItem 33 34 func (m positionMap) get(src int) int { 35 if src <= 0 { 36 return src 37 } 38 res := sort.Search(len(m), func(n int) bool { return m[n].src >= src }) 39 if res >= len(m) || m[res].src != src { 40 panic("index not found") 41 } 42 return m[res].dst 43 } 44 45 type arrayRuneReader struct { 46 runes []rune 47 pos int 48 } 49 50 func (rd *arrayRuneReader) ReadRune() (r rune, size int, err error) { 51 if rd.pos < len(rd.runes) { 52 r = rd.runes[rd.pos] 53 size = 1 54 rd.pos++ 55 } else { 56 err = io.EOF 57 } 58 return 59 } 60 61 // Not goroutine-safe. Use regexpPattern.clone() 62 type regexpPattern struct { 63 src string 64 65 global, ignoreCase, multiline, sticky, unicode bool 66 67 regexpWrapper *regexpWrapper 68 regexp2Wrapper *regexp2Wrapper 69 } 70 71 func compileRegexp2(src string, multiline, ignoreCase bool) (*regexp2Wrapper, error) { 72 var opts regexp2.RegexOptions = regexp2.ECMAScript 73 if multiline { 74 opts |= regexp2.Multiline 75 } 76 if ignoreCase { 77 opts |= regexp2.IgnoreCase 78 } 79 regexp2Pattern, err1 := regexp2.Compile(src, opts) 80 if err1 != nil { 81 return nil, fmt.Errorf("Invalid regular expression (regexp2): %s (%v)", src, err1) 82 } 83 84 return ®exp2Wrapper{rx: regexp2Pattern}, nil 85 } 86 87 func (p *regexpPattern) createRegexp2() { 88 if p.regexp2Wrapper != nil { 89 return 90 } 91 rx, err := compileRegexp2(p.src, p.multiline, p.ignoreCase) 92 if err != nil { 93 // At this point the regexp should have been successfully converted to re2, if it fails now, it's a bug. 94 panic(err) 95 } 96 p.regexp2Wrapper = rx 97 } 98 99 func buildUTF8PosMap(s unicodeString) (positionMap, string) { 100 pm := make(positionMap, 0, s.Length()) 101 rd := s.Reader() 102 sPos, utf8Pos := 0, 0 103 var sb strings.Builder 104 for { 105 r, size, err := rd.ReadRune() 106 if err == io.EOF { 107 break 108 } 109 if err != nil { 110 // the string contains invalid UTF-16, bailing out 111 return nil, "" 112 } 113 utf8Size, _ := sb.WriteRune(r) 114 sPos += size 115 utf8Pos += utf8Size 116 pm = append(pm, positionMapItem{src: utf8Pos, dst: sPos}) 117 } 118 return pm, sb.String() 119 } 120 121 func (p *regexpPattern) findSubmatchIndex(s String, start int) []int { 122 if p.regexpWrapper == nil { 123 return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode, p.global || p.sticky) 124 } 125 if start != 0 { 126 // Unfortunately Go's regexp library does not allow starting from an arbitrary position. 127 // If we just drop the first _start_ characters of the string the assertions (^, $, \b and \B) will not 128 // work correctly. 129 p.createRegexp2() 130 return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode, p.global || p.sticky) 131 } 132 return p.regexpWrapper.findSubmatchIndex(s, p.unicode) 133 } 134 135 func (p *regexpPattern) findAllSubmatchIndex(s String, start int, limit int, sticky bool) [][]int { 136 if p.regexpWrapper == nil { 137 return p.regexp2Wrapper.findAllSubmatchIndex(s, start, limit, sticky, p.unicode) 138 } 139 if start == 0 { 140 a, u := devirtualizeString(s) 141 if u == nil { 142 return p.regexpWrapper.findAllSubmatchIndex(string(a), limit, sticky) 143 } 144 if limit == 1 { 145 result := p.regexpWrapper.findSubmatchIndexUnicode(u, p.unicode) 146 if result == nil { 147 return nil 148 } 149 return [][]int{result} 150 } 151 // Unfortunately Go's regexp library lacks FindAllReaderSubmatchIndex(), so we have to use a UTF-8 string as an 152 // input. 153 if p.unicode { 154 // Try to convert s to UTF-8. If it does not contain any invalid UTF-16 we can do the matching in UTF-8. 155 pm, str := buildUTF8PosMap(u) 156 if pm != nil { 157 res := p.regexpWrapper.findAllSubmatchIndex(str, limit, sticky) 158 for _, result := range res { 159 for i, idx := range result { 160 result[i] = pm.get(idx) 161 } 162 } 163 return res 164 } 165 } 166 } 167 168 p.createRegexp2() 169 return p.regexp2Wrapper.findAllSubmatchIndex(s, start, limit, sticky, p.unicode) 170 } 171 172 // clone creates a copy of the regexpPattern which can be used concurrently. 173 func (p *regexpPattern) clone() *regexpPattern { 174 ret := ®expPattern{ 175 src: p.src, 176 global: p.global, 177 ignoreCase: p.ignoreCase, 178 multiline: p.multiline, 179 sticky: p.sticky, 180 unicode: p.unicode, 181 } 182 if p.regexpWrapper != nil { 183 ret.regexpWrapper = p.regexpWrapper.clone() 184 } 185 if p.regexp2Wrapper != nil { 186 ret.regexp2Wrapper = p.regexp2Wrapper.clone() 187 } 188 return ret 189 } 190 191 type regexpObject struct { 192 baseObject 193 pattern *regexpPattern 194 source String 195 196 standard bool 197 } 198 199 func (r *regexp2Wrapper) findSubmatchIndex(s String, start int, fullUnicode, doCache bool) (result []int) { 200 if fullUnicode { 201 return r.findSubmatchIndexUnicode(s, start, doCache) 202 } 203 return r.findSubmatchIndexUTF16(s, start, doCache) 204 } 205 206 func (r *regexp2Wrapper) findUTF16Cached(s String, start int, doCache bool) (match *regexp2.Match, runes []rune, err error) { 207 wrapped := r.rx 208 cache := r.cache 209 if cache != nil && cache.posMap == nil && cache.target.SameAs(s) { 210 runes = cache.runes 211 } else { 212 runes = s.utf16Runes() 213 cache = nil 214 } 215 match, err = wrapped.FindRunesMatchStartingAt(runes, start) 216 if doCache && match != nil && err == nil { 217 if cache == nil { 218 if r.cache == nil { 219 r.cache = new(regexp2MatchCache) 220 } 221 *r.cache = regexp2MatchCache{ 222 target: s, 223 runes: runes, 224 } 225 } 226 } else { 227 r.cache = nil 228 } 229 return 230 } 231 232 func (r *regexp2Wrapper) findSubmatchIndexUTF16(s String, start int, doCache bool) (result []int) { 233 match, _, err := r.findUTF16Cached(s, start, doCache) 234 if err != nil { 235 return 236 } 237 238 if match == nil { 239 return 240 } 241 groups := match.Groups() 242 243 result = make([]int, 0, len(groups)<<1) 244 for _, group := range groups { 245 if len(group.Captures) > 0 { 246 result = append(result, group.Index, group.Index+group.Length) 247 } else { 248 result = append(result, -1, 0) 249 } 250 } 251 return 252 } 253 254 func (r *regexp2Wrapper) findUnicodeCached(s String, start int, doCache bool) (match *regexp2.Match, posMap []int, err error) { 255 var ( 256 runes []rune 257 mappedStart int 258 splitPair bool 259 savedRune rune 260 ) 261 wrapped := r.rx 262 cache := r.cache 263 if cache != nil && cache.posMap != nil && cache.target.SameAs(s) { 264 runes, posMap = cache.runes, cache.posMap 265 mappedStart, splitPair = posMapReverseLookup(posMap, start) 266 } else { 267 posMap, runes, mappedStart, splitPair = buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader()}, s.Length(), start) 268 cache = nil 269 } 270 if splitPair { 271 // temporarily set the rune at mappedStart to the second code point of the pair 272 _, second := utf16.EncodeRune(runes[mappedStart]) 273 savedRune, runes[mappedStart] = runes[mappedStart], second 274 } 275 match, err = wrapped.FindRunesMatchStartingAt(runes, mappedStart) 276 if doCache && match != nil && err == nil { 277 if splitPair { 278 runes[mappedStart] = savedRune 279 } 280 if cache == nil { 281 if r.cache == nil { 282 r.cache = new(regexp2MatchCache) 283 } 284 *r.cache = regexp2MatchCache{ 285 target: s, 286 runes: runes, 287 posMap: posMap, 288 } 289 } 290 } else { 291 r.cache = nil 292 } 293 294 return 295 } 296 297 func (r *regexp2Wrapper) findSubmatchIndexUnicode(s String, start int, doCache bool) (result []int) { 298 match, posMap, err := r.findUnicodeCached(s, start, doCache) 299 if match == nil || err != nil { 300 return 301 } 302 303 groups := match.Groups() 304 305 result = make([]int, 0, len(groups)<<1) 306 for _, group := range groups { 307 if len(group.Captures) > 0 { 308 result = append(result, posMap[group.Index], posMap[group.Index+group.Length]) 309 } else { 310 result = append(result, -1, 0) 311 } 312 } 313 return 314 } 315 316 func (r *regexp2Wrapper) findAllSubmatchIndexUTF16(s String, start, limit int, sticky bool) [][]int { 317 wrapped := r.rx 318 match, runes, err := r.findUTF16Cached(s, start, false) 319 if match == nil || err != nil { 320 return nil 321 } 322 if limit < 0 { 323 limit = len(runes) + 1 324 } 325 results := make([][]int, 0, limit) 326 for match != nil { 327 groups := match.Groups() 328 329 result := make([]int, 0, len(groups)<<1) 330 331 for _, group := range groups { 332 if len(group.Captures) > 0 { 333 startPos := group.Index 334 endPos := group.Index + group.Length 335 result = append(result, startPos, endPos) 336 } else { 337 result = append(result, -1, 0) 338 } 339 } 340 341 if sticky && len(result) > 1 { 342 if result[0] != start { 343 break 344 } 345 start = result[1] 346 } 347 348 results = append(results, result) 349 limit-- 350 if limit <= 0 { 351 break 352 } 353 match, err = wrapped.FindNextMatch(match) 354 if err != nil { 355 return nil 356 } 357 } 358 return results 359 } 360 361 func buildPosMap(rd io.RuneReader, l, start int) (posMap []int, runes []rune, mappedStart int, splitPair bool) { 362 posMap = make([]int, 0, l+1) 363 curPos := 0 364 runes = make([]rune, 0, l) 365 startFound := false 366 for { 367 if !startFound { 368 if curPos == start { 369 mappedStart = len(runes) 370 startFound = true 371 } 372 if curPos > start { 373 // start position splits a surrogate pair 374 mappedStart = len(runes) - 1 375 splitPair = true 376 startFound = true 377 } 378 } 379 rn, size, err := rd.ReadRune() 380 if err != nil { 381 break 382 } 383 runes = append(runes, rn) 384 posMap = append(posMap, curPos) 385 curPos += size 386 } 387 posMap = append(posMap, curPos) 388 return 389 } 390 391 func posMapReverseLookup(posMap []int, pos int) (int, bool) { 392 mapped := sort.SearchInts(posMap, pos) 393 if mapped < len(posMap) && posMap[mapped] != pos { 394 return mapped - 1, true 395 } 396 return mapped, false 397 } 398 399 func (r *regexp2Wrapper) findAllSubmatchIndexUnicode(s unicodeString, start, limit int, sticky bool) [][]int { 400 wrapped := r.rx 401 if limit < 0 { 402 limit = len(s) + 1 403 } 404 results := make([][]int, 0, limit) 405 match, posMap, err := r.findUnicodeCached(s, start, false) 406 if err != nil { 407 return nil 408 } 409 for match != nil { 410 groups := match.Groups() 411 412 result := make([]int, 0, len(groups)<<1) 413 414 for _, group := range groups { 415 if len(group.Captures) > 0 { 416 start := posMap[group.Index] 417 end := posMap[group.Index+group.Length] 418 result = append(result, start, end) 419 } else { 420 result = append(result, -1, 0) 421 } 422 } 423 424 if sticky && len(result) > 1 { 425 if result[0] != start { 426 break 427 } 428 start = result[1] 429 } 430 431 results = append(results, result) 432 match, err = wrapped.FindNextMatch(match) 433 if err != nil { 434 return nil 435 } 436 } 437 return results 438 } 439 440 func (r *regexp2Wrapper) findAllSubmatchIndex(s String, start, limit int, sticky, fullUnicode bool) [][]int { 441 a, u := devirtualizeString(s) 442 if u != nil { 443 if fullUnicode { 444 return r.findAllSubmatchIndexUnicode(u, start, limit, sticky) 445 } 446 return r.findAllSubmatchIndexUTF16(u, start, limit, sticky) 447 } 448 return r.findAllSubmatchIndexUTF16(a, start, limit, sticky) 449 } 450 451 func (r *regexp2Wrapper) clone() *regexp2Wrapper { 452 return ®exp2Wrapper{ 453 rx: r.rx, 454 } 455 } 456 457 func (r *regexpWrapper) findAllSubmatchIndex(s string, limit int, sticky bool) (results [][]int) { 458 wrapped := (*regexp.Regexp)(r) 459 results = wrapped.FindAllStringSubmatchIndex(s, limit) 460 pos := 0 461 if sticky { 462 for i, result := range results { 463 if len(result) > 1 { 464 if result[0] != pos { 465 return results[:i] 466 } 467 pos = result[1] 468 } 469 } 470 } 471 return 472 } 473 474 func (r *regexpWrapper) findSubmatchIndex(s String, fullUnicode bool) []int { 475 a, u := devirtualizeString(s) 476 if u != nil { 477 return r.findSubmatchIndexUnicode(u, fullUnicode) 478 } 479 return r.findSubmatchIndexASCII(string(a)) 480 } 481 482 func (r *regexpWrapper) findSubmatchIndexASCII(s string) []int { 483 wrapped := (*regexp.Regexp)(r) 484 return wrapped.FindStringSubmatchIndex(s) 485 } 486 487 func (r *regexpWrapper) findSubmatchIndexUnicode(s unicodeString, fullUnicode bool) (result []int) { 488 wrapped := (*regexp.Regexp)(r) 489 if fullUnicode { 490 posMap, runes, _, _ := buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader()}, s.Length(), 0) 491 res := wrapped.FindReaderSubmatchIndex(&arrayRuneReader{runes: runes}) 492 for i, item := range res { 493 if item >= 0 { 494 res[i] = posMap[item] 495 } 496 } 497 return res 498 } 499 return wrapped.FindReaderSubmatchIndex(s.utf16RuneReader()) 500 } 501 502 func (r *regexpWrapper) clone() *regexpWrapper { 503 return r 504 } 505 506 func (r *regexpObject) execResultToArray(target String, result []int) Value { 507 captureCount := len(result) >> 1 508 valueArray := make([]Value, captureCount) 509 matchIndex := result[0] 510 valueArray[0] = target.Substring(result[0], result[1]) 511 lowerBound := 0 512 for index := 1; index < captureCount; index++ { 513 offset := index << 1 514 if result[offset] >= 0 && result[offset+1] >= lowerBound { 515 valueArray[index] = target.Substring(result[offset], result[offset+1]) 516 lowerBound = result[offset] 517 } else { 518 valueArray[index] = _undefined 519 } 520 } 521 match := r.val.runtime.newArrayValues(valueArray) 522 match.self.setOwnStr("input", target, false) 523 match.self.setOwnStr("index", intToValue(int64(matchIndex)), false) 524 return match 525 } 526 527 func (r *regexpObject) getLastIndex() int64 { 528 lastIndex := toLength(r.getStr("lastIndex", nil)) 529 if !r.pattern.global && !r.pattern.sticky { 530 return 0 531 } 532 return lastIndex 533 } 534 535 func (r *regexpObject) updateLastIndex(index int64, firstResult, lastResult []int) bool { 536 if r.pattern.sticky { 537 if firstResult == nil || int64(firstResult[0]) != index { 538 r.setOwnStr("lastIndex", intToValue(0), true) 539 return false 540 } 541 } else { 542 if firstResult == nil { 543 if r.pattern.global { 544 r.setOwnStr("lastIndex", intToValue(0), true) 545 } 546 return false 547 } 548 } 549 550 if r.pattern.global || r.pattern.sticky { 551 r.setOwnStr("lastIndex", intToValue(int64(lastResult[1])), true) 552 } 553 return true 554 } 555 556 func (r *regexpObject) execRegexp(target String) (match bool, result []int) { 557 index := r.getLastIndex() 558 if index >= 0 && index <= int64(target.Length()) { 559 result = r.pattern.findSubmatchIndex(target, int(index)) 560 } 561 match = r.updateLastIndex(index, result, result) 562 return 563 } 564 565 func (r *regexpObject) exec(target String) Value { 566 match, result := r.execRegexp(target) 567 if match { 568 return r.execResultToArray(target, result) 569 } 570 return _null 571 } 572 573 func (r *regexpObject) test(target String) bool { 574 match, _ := r.execRegexp(target) 575 return match 576 } 577 578 func (r *regexpObject) clone() *regexpObject { 579 r1 := r.val.runtime.newRegexpObject(r.prototype) 580 r1.source = r.source 581 r1.pattern = r.pattern 582 583 return r1 584 } 585 586 func (r *regexpObject) init() { 587 r.baseObject.init() 588 r.standard = true 589 r._putProp("lastIndex", intToValue(0), true, false, false) 590 } 591 592 func (r *regexpObject) setProto(proto *Object, throw bool) bool { 593 res := r.baseObject.setProto(proto, throw) 594 if res { 595 r.standard = false 596 } 597 return res 598 } 599 600 func (r *regexpObject) defineOwnPropertyStr(name unistring.String, desc PropertyDescriptor, throw bool) bool { 601 res := r.baseObject.defineOwnPropertyStr(name, desc, throw) 602 if res { 603 r.standard = false 604 } 605 return res 606 } 607 608 func (r *regexpObject) defineOwnPropertySym(name *Symbol, desc PropertyDescriptor, throw bool) bool { 609 res := r.baseObject.defineOwnPropertySym(name, desc, throw) 610 if res && r.standard { 611 switch name { 612 case SymMatch, SymMatchAll, SymSearch, SymSplit, SymReplace: 613 r.standard = false 614 } 615 } 616 return res 617 } 618 619 func (r *regexpObject) deleteStr(name unistring.String, throw bool) bool { 620 res := r.baseObject.deleteStr(name, throw) 621 if res { 622 r.standard = false 623 } 624 return res 625 } 626 627 func (r *regexpObject) setOwnStr(name unistring.String, value Value, throw bool) bool { 628 res := r.baseObject.setOwnStr(name, value, throw) 629 if res && r.standard && name == "exec" { 630 r.standard = false 631 } 632 return res 633 } 634 635 func (r *regexpObject) setOwnSym(name *Symbol, value Value, throw bool) bool { 636 res := r.baseObject.setOwnSym(name, value, throw) 637 if res && r.standard { 638 switch name { 639 case SymMatch, SymMatchAll, SymSearch, SymSplit, SymReplace: 640 r.standard = false 641 } 642 } 643 return res 644 }