github.com/nuvolaris/goja@v0.0.0-20230825100449-967811910c6d/string_unicode.go (about) 1 package goja 2 3 import ( 4 "errors" 5 "hash/maphash" 6 "io" 7 "math" 8 "reflect" 9 "strings" 10 "unicode/utf16" 11 "unicode/utf8" 12 13 "github.com/nuvolaris/goja/parser" 14 "github.com/nuvolaris/goja/unistring" 15 "golang.org/x/text/cases" 16 "golang.org/x/text/language" 17 ) 18 19 type unicodeString []uint16 20 21 type unicodeRuneReader struct { 22 s unicodeString 23 pos int 24 } 25 26 type utf16RuneReader struct { 27 s unicodeString 28 pos int 29 } 30 31 // passes through invalid surrogate pairs 32 type lenientUtf16Decoder struct { 33 utf16Reader utf16Reader 34 prev uint16 35 prevSet bool 36 } 37 38 // StringBuilder serves similar purpose to strings.Builder, except it works with ECMAScript String. 39 // Use it to efficiently build 'native' ECMAScript values that either contain invalid UTF-16 surrogate pairs 40 // (and therefore cannot be represented as UTF-8) or never expected to be exported to Go. See also 41 // StringFromUTF16. 42 type StringBuilder struct { 43 asciiBuilder strings.Builder 44 unicodeBuilder unicodeStringBuilder 45 } 46 47 type unicodeStringBuilder struct { 48 buf []uint16 49 unicode bool 50 } 51 52 var ( 53 InvalidRuneError = errors.New("invalid rune") 54 ) 55 56 func (rr *utf16RuneReader) readChar() (c uint16, err error) { 57 if rr.pos < len(rr.s) { 58 c = rr.s[rr.pos] 59 rr.pos++ 60 return 61 } 62 err = io.EOF 63 return 64 } 65 66 func (rr *utf16RuneReader) ReadRune() (r rune, size int, err error) { 67 if rr.pos < len(rr.s) { 68 r = rune(rr.s[rr.pos]) 69 rr.pos++ 70 size = 1 71 return 72 } 73 err = io.EOF 74 return 75 } 76 77 func (rr *lenientUtf16Decoder) ReadRune() (r rune, size int, err error) { 78 var c uint16 79 if rr.prevSet { 80 c = rr.prev 81 rr.prevSet = false 82 } else { 83 c, err = rr.utf16Reader.readChar() 84 if err != nil { 85 return 86 } 87 } 88 size = 1 89 if isUTF16FirstSurrogate(c) { 90 second, err1 := rr.utf16Reader.readChar() 91 if err1 != nil { 92 if err1 != io.EOF { 93 err = err1 94 } else { 95 r = rune(c) 96 } 97 return 98 } 99 if isUTF16SecondSurrogate(second) { 100 r = utf16.DecodeRune(rune(c), rune(second)) 101 size++ 102 return 103 } else { 104 rr.prev = second 105 rr.prevSet = true 106 } 107 } 108 r = rune(c) 109 return 110 } 111 112 func (rr *unicodeRuneReader) ReadRune() (r rune, size int, err error) { 113 if rr.pos < len(rr.s) { 114 c := rr.s[rr.pos] 115 size++ 116 rr.pos++ 117 if isUTF16FirstSurrogate(c) { 118 if rr.pos < len(rr.s) { 119 second := rr.s[rr.pos] 120 if isUTF16SecondSurrogate(second) { 121 r = utf16.DecodeRune(rune(c), rune(second)) 122 size++ 123 rr.pos++ 124 return 125 } 126 } 127 err = InvalidRuneError 128 } else if isUTF16SecondSurrogate(c) { 129 err = InvalidRuneError 130 } 131 r = rune(c) 132 } else { 133 err = io.EOF 134 } 135 return 136 } 137 138 func (b *unicodeStringBuilder) Grow(n int) { 139 if len(b.buf) == 0 { 140 n++ 141 } 142 if cap(b.buf)-len(b.buf) < n { 143 buf := make([]uint16, len(b.buf), 2*cap(b.buf)+n) 144 copy(buf, b.buf) 145 b.buf = buf 146 } 147 } 148 149 func (b *unicodeStringBuilder) ensureStarted(initialSize int) { 150 b.Grow(initialSize) 151 if len(b.buf) == 0 { 152 b.buf = append(b.buf, unistring.BOM) 153 } 154 } 155 156 // assumes already started 157 func (b *unicodeStringBuilder) writeString(s String) { 158 a, u := devirtualizeString(s) 159 if u != nil { 160 b.buf = append(b.buf, u[1:]...) 161 b.unicode = true 162 } else { 163 for i := 0; i < len(a); i++ { 164 b.buf = append(b.buf, uint16(a[i])) 165 } 166 } 167 } 168 169 func (b *unicodeStringBuilder) String() String { 170 if b.unicode { 171 return unicodeString(b.buf) 172 } 173 if len(b.buf) < 2 { 174 return stringEmpty 175 } 176 buf := make([]byte, 0, len(b.buf)-1) 177 for _, c := range b.buf[1:] { 178 buf = append(buf, byte(c)) 179 } 180 return asciiString(buf) 181 } 182 183 func (b *unicodeStringBuilder) WriteRune(r rune) { 184 b.ensureStarted(2) 185 b.writeRuneFast(r) 186 } 187 188 // assumes already started 189 func (b *unicodeStringBuilder) writeRuneFast(r rune) { 190 if r <= 0xFFFF { 191 b.buf = append(b.buf, uint16(r)) 192 if !b.unicode && r >= utf8.RuneSelf { 193 b.unicode = true 194 } 195 } else { 196 first, second := utf16.EncodeRune(r) 197 b.buf = append(b.buf, uint16(first), uint16(second)) 198 b.unicode = true 199 } 200 } 201 202 func (b *unicodeStringBuilder) writeASCIIString(bytes string) { 203 for _, c := range bytes { 204 b.buf = append(b.buf, uint16(c)) 205 } 206 } 207 208 func (b *unicodeStringBuilder) writeUnicodeString(str unicodeString) { 209 b.buf = append(b.buf, str[1:]...) 210 b.unicode = true 211 } 212 213 func (b *StringBuilder) ascii() bool { 214 return len(b.unicodeBuilder.buf) == 0 215 } 216 217 func (b *StringBuilder) WriteString(s String) { 218 a, u := devirtualizeString(s) 219 if u != nil { 220 b.switchToUnicode(u.Length()) 221 b.unicodeBuilder.writeUnicodeString(u) 222 } else { 223 if b.ascii() { 224 b.asciiBuilder.WriteString(string(a)) 225 } else { 226 b.unicodeBuilder.writeASCIIString(string(a)) 227 } 228 } 229 } 230 231 func (b *StringBuilder) WriteUTF8String(s string) { 232 firstUnicodeIdx := 0 233 if b.ascii() { 234 for i := 0; i < len(s); i++ { 235 if s[i] >= utf8.RuneSelf { 236 b.switchToUnicode(len(s)) 237 b.unicodeBuilder.writeASCIIString(s[:i]) 238 firstUnicodeIdx = i 239 goto unicode 240 } 241 } 242 b.asciiBuilder.WriteString(s) 243 return 244 } 245 unicode: 246 for _, r := range s[firstUnicodeIdx:] { 247 b.unicodeBuilder.writeRuneFast(r) 248 } 249 } 250 251 func (b *StringBuilder) writeASCII(s string) { 252 if b.ascii() { 253 b.asciiBuilder.WriteString(s) 254 } else { 255 b.unicodeBuilder.writeASCIIString(s) 256 } 257 } 258 259 func (b *StringBuilder) WriteRune(r rune) { 260 if r < utf8.RuneSelf { 261 if b.ascii() { 262 b.asciiBuilder.WriteByte(byte(r)) 263 } else { 264 b.unicodeBuilder.writeRuneFast(r) 265 } 266 } else { 267 var extraLen int 268 if r <= 0xFFFF { 269 extraLen = 1 270 } else { 271 extraLen = 2 272 } 273 b.switchToUnicode(extraLen) 274 b.unicodeBuilder.writeRuneFast(r) 275 } 276 } 277 278 func (b *StringBuilder) String() String { 279 if b.ascii() { 280 return asciiString(b.asciiBuilder.String()) 281 } 282 return b.unicodeBuilder.String() 283 } 284 285 func (b *StringBuilder) Grow(n int) { 286 if b.ascii() { 287 b.asciiBuilder.Grow(n) 288 } else { 289 b.unicodeBuilder.Grow(n) 290 } 291 } 292 293 // LikelyUnicode hints to the builder that the resulting string is likely to contain Unicode (non-ASCII) characters. 294 // The argument is an extra capacity (in characters) to reserve on top of the current length (it's like calling 295 // Grow() afterwards). 296 // This method may be called at any point (not just when the buffer is empty), although for efficiency it should 297 // be called as early as possible. 298 func (b *StringBuilder) LikelyUnicode(extraLen int) { 299 b.switchToUnicode(extraLen) 300 } 301 302 func (b *StringBuilder) switchToUnicode(extraLen int) { 303 if b.ascii() { 304 c := b.asciiBuilder.Cap() 305 newCap := b.asciiBuilder.Len() + extraLen 306 if newCap < c { 307 newCap = c 308 } 309 b.unicodeBuilder.ensureStarted(newCap) 310 b.unicodeBuilder.writeASCIIString(b.asciiBuilder.String()) 311 b.asciiBuilder.Reset() 312 } 313 } 314 315 func (b *StringBuilder) WriteSubstring(source String, start int, end int) { 316 a, us := devirtualizeString(source) 317 if us == nil { 318 if b.ascii() { 319 b.asciiBuilder.WriteString(string(a[start:end])) 320 } else { 321 b.unicodeBuilder.writeASCIIString(string(a[start:end])) 322 } 323 return 324 } 325 if b.ascii() { 326 uc := false 327 for i := start; i < end; i++ { 328 if us.CharAt(i) >= utf8.RuneSelf { 329 uc = true 330 break 331 } 332 } 333 if uc { 334 b.switchToUnicode(end - start + 1) 335 } else { 336 b.asciiBuilder.Grow(end - start + 1) 337 for i := start; i < end; i++ { 338 b.asciiBuilder.WriteByte(byte(us.CharAt(i))) 339 } 340 return 341 } 342 } 343 b.unicodeBuilder.buf = append(b.unicodeBuilder.buf, us[start+1:end+1]...) 344 b.unicodeBuilder.unicode = true 345 } 346 347 func (s unicodeString) Reader() io.RuneReader { 348 return &unicodeRuneReader{ 349 s: s[1:], 350 } 351 } 352 353 func (s unicodeString) utf16Reader() utf16Reader { 354 return &utf16RuneReader{ 355 s: s[1:], 356 } 357 } 358 359 func (s unicodeString) utf16RuneReader() io.RuneReader { 360 return &utf16RuneReader{ 361 s: s[1:], 362 } 363 } 364 365 func (s unicodeString) utf16Runes() []rune { 366 runes := make([]rune, len(s)-1) 367 for i, ch := range s[1:] { 368 runes[i] = rune(ch) 369 } 370 return runes 371 } 372 373 func (s unicodeString) ToInteger() int64 { 374 return 0 375 } 376 377 func (s unicodeString) toString() String { 378 return s 379 } 380 381 func (s unicodeString) ToString() Value { 382 return s 383 } 384 385 func (s unicodeString) ToFloat() float64 { 386 return math.NaN() 387 } 388 389 func (s unicodeString) ToBoolean() bool { 390 return len(s) > 0 391 } 392 393 func (s unicodeString) toTrimmedUTF8() string { 394 if len(s) == 0 { 395 return "" 396 } 397 return strings.Trim(s.String(), parser.WhitespaceChars) 398 } 399 400 func (s unicodeString) ToNumber() Value { 401 return asciiString(s.toTrimmedUTF8()).ToNumber() 402 } 403 404 func (s unicodeString) ToObject(r *Runtime) *Object { 405 return r._newString(s, r.global.StringPrototype) 406 } 407 408 func (s unicodeString) equals(other unicodeString) bool { 409 if len(s) != len(other) { 410 return false 411 } 412 for i, r := range s { 413 if r != other[i] { 414 return false 415 } 416 } 417 return true 418 } 419 420 func (s unicodeString) SameAs(other Value) bool { 421 return s.StrictEquals(other) 422 } 423 424 func (s unicodeString) Equals(other Value) bool { 425 if s.StrictEquals(other) { 426 return true 427 } 428 429 if o, ok := other.(*Object); ok { 430 return s.Equals(o.toPrimitive()) 431 } 432 return false 433 } 434 435 func (s unicodeString) StrictEquals(other Value) bool { 436 if otherStr, ok := other.(unicodeString); ok { 437 return s.equals(otherStr) 438 } 439 if otherStr, ok := other.(*importedString); ok { 440 otherStr.ensureScanned() 441 if otherStr.u != nil { 442 return s.equals(otherStr.u) 443 } 444 } 445 446 return false 447 } 448 449 func (s unicodeString) baseObject(r *Runtime) *Object { 450 ss := r.stringSingleton 451 ss.value = s 452 ss.setLength() 453 return ss.val 454 } 455 456 func (s unicodeString) CharAt(idx int) uint16 { 457 return s[idx+1] 458 } 459 460 func (s unicodeString) Length() int { 461 return len(s) - 1 462 } 463 464 func (s unicodeString) Concat(other String) String { 465 a, u := devirtualizeString(other) 466 if u != nil { 467 b := make(unicodeString, len(s)+len(u)-1) 468 copy(b, s) 469 copy(b[len(s):], u[1:]) 470 return b 471 } 472 b := make([]uint16, len(s)+len(a)) 473 copy(b, s) 474 b1 := b[len(s):] 475 for i := 0; i < len(a); i++ { 476 b1[i] = uint16(a[i]) 477 } 478 return unicodeString(b) 479 } 480 481 func (s unicodeString) Substring(start, end int) String { 482 ss := s[start+1 : end+1] 483 for _, c := range ss { 484 if c >= utf8.RuneSelf { 485 b := make(unicodeString, end-start+1) 486 b[0] = unistring.BOM 487 copy(b[1:], ss) 488 return b 489 } 490 } 491 as := make([]byte, end-start) 492 for i, c := range ss { 493 as[i] = byte(c) 494 } 495 return asciiString(as) 496 } 497 498 func (s unicodeString) String() string { 499 return string(utf16.Decode(s[1:])) 500 } 501 502 func (s unicodeString) CompareTo(other String) int { 503 // TODO handle invalid UTF-16 504 return strings.Compare(s.String(), other.String()) 505 } 506 507 func (s unicodeString) index(substr String, start int) int { 508 var ss []uint16 509 a, u := devirtualizeString(substr) 510 if u != nil { 511 ss = u[1:] 512 } else { 513 ss = make([]uint16, len(a)) 514 for i := 0; i < len(a); i++ { 515 ss[i] = uint16(a[i]) 516 } 517 } 518 s1 := s[1:] 519 // TODO: optimise 520 end := len(s1) - len(ss) 521 for start <= end { 522 for i := 0; i < len(ss); i++ { 523 if s1[start+i] != ss[i] { 524 goto nomatch 525 } 526 } 527 528 return start 529 nomatch: 530 start++ 531 } 532 return -1 533 } 534 535 func (s unicodeString) lastIndex(substr String, start int) int { 536 var ss []uint16 537 a, u := devirtualizeString(substr) 538 if u != nil { 539 ss = u[1:] 540 } else { 541 ss = make([]uint16, len(a)) 542 for i := 0; i < len(a); i++ { 543 ss[i] = uint16(a[i]) 544 } 545 } 546 547 s1 := s[1:] 548 if maxStart := len(s1) - len(ss); start > maxStart { 549 start = maxStart 550 } 551 // TODO: optimise 552 for start >= 0 { 553 for i := 0; i < len(ss); i++ { 554 if s1[start+i] != ss[i] { 555 goto nomatch 556 } 557 } 558 559 return start 560 nomatch: 561 start-- 562 } 563 return -1 564 } 565 566 func unicodeStringFromRunes(r []rune) unicodeString { 567 return unistring.NewFromRunes(r).AsUtf16() 568 } 569 570 func toLower(s string) String { 571 caser := cases.Lower(language.Und) 572 r := []rune(caser.String(s)) 573 // Workaround 574 ascii := true 575 for i := 0; i < len(r)-1; i++ { 576 if (i == 0 || r[i-1] != 0x3b1) && r[i] == 0x345 && r[i+1] == 0x3c2 { 577 i++ 578 r[i] = 0x3c3 579 } 580 if r[i] >= utf8.RuneSelf { 581 ascii = false 582 } 583 } 584 if ascii { 585 ascii = r[len(r)-1] < utf8.RuneSelf 586 } 587 if ascii { 588 return asciiString(r) 589 } 590 return unicodeStringFromRunes(r) 591 } 592 593 func (s unicodeString) toLower() String { 594 return toLower(s.String()) 595 } 596 597 func (s unicodeString) toUpper() String { 598 caser := cases.Upper(language.Und) 599 return newStringValue(caser.String(s.String())) 600 } 601 602 func (s unicodeString) Export() interface{} { 603 return s.String() 604 } 605 606 func (s unicodeString) ExportType() reflect.Type { 607 return reflectTypeString 608 } 609 610 func (s unicodeString) hash(hash *maphash.Hash) uint64 { 611 _, _ = hash.WriteString(string(unistring.FromUtf16(s))) 612 h := hash.Sum64() 613 hash.Reset() 614 return h 615 } 616 617 func (s unicodeString) string() unistring.String { 618 return unistring.FromUtf16(s) 619 }