go.ketch.com/lib/goja@v0.0.1/string_unicode.go (about) 1 package goja 2 3 import ( 4 "errors" 5 "hash/maphash" 6 "io" 7 "math" 8 "reflect" 9 "strings" 10 "unicode/utf16" 11 "unicode/utf8" 12 13 "go.ketch.com/lib/goja/parser" 14 "go.ketch.com/lib/goja/unistring" 15 "golang.org/x/text/cases" 16 "golang.org/x/text/language" 17 ) 18 19 type unicodeString []uint16 20 21 type unicodeRuneReader struct { 22 s unicodeString 23 pos int 24 } 25 26 type utf16RuneReader struct { 27 s unicodeString 28 pos int 29 } 30 31 // passes through invalid surrogate pairs 32 type lenientUtf16Decoder struct { 33 utf16Reader io.RuneReader 34 prev rune 35 prevSet bool 36 } 37 38 type valueStringBuilder struct { 39 asciiBuilder strings.Builder 40 unicodeBuilder unicodeStringBuilder 41 } 42 43 type unicodeStringBuilder struct { 44 buf []uint16 45 unicode bool 46 } 47 48 var ( 49 InvalidRuneError = errors.New("invalid rune") 50 ) 51 52 func (rr *utf16RuneReader) ReadRune() (r rune, size int, err error) { 53 if rr.pos < len(rr.s) { 54 r = rune(rr.s[rr.pos]) 55 size++ 56 rr.pos++ 57 return 58 } 59 err = io.EOF 60 return 61 } 62 63 func (rr *lenientUtf16Decoder) ReadRune() (r rune, size int, err error) { 64 if rr.prevSet { 65 r = rr.prev 66 size = 1 67 rr.prevSet = false 68 } else { 69 r, size, err = rr.utf16Reader.ReadRune() 70 if err != nil { 71 return 72 } 73 } 74 if isUTF16FirstSurrogate(r) { 75 second, _, err1 := rr.utf16Reader.ReadRune() 76 if err1 != nil { 77 if err1 != io.EOF { 78 err = err1 79 } 80 return 81 } 82 if isUTF16SecondSurrogate(second) { 83 r = utf16.DecodeRune(r, second) 84 size++ 85 } else { 86 rr.prev = second 87 rr.prevSet = true 88 } 89 } 90 91 return 92 } 93 94 func (rr *unicodeRuneReader) ReadRune() (r rune, size int, err error) { 95 if rr.pos < len(rr.s) { 96 r = rune(rr.s[rr.pos]) 97 size++ 98 rr.pos++ 99 if isUTF16FirstSurrogate(r) { 100 if rr.pos < len(rr.s) { 101 second := rune(rr.s[rr.pos]) 102 if isUTF16SecondSurrogate(second) { 103 r = utf16.DecodeRune(r, second) 104 size++ 105 rr.pos++ 106 } else { 107 err = InvalidRuneError 108 } 109 } else { 110 err = InvalidRuneError 111 } 112 } else if isUTF16SecondSurrogate(r) { 113 err = InvalidRuneError 114 } 115 } else { 116 err = io.EOF 117 } 118 return 119 } 120 121 func (b *unicodeStringBuilder) Grow(n int) { 122 if len(b.buf) == 0 { 123 n++ 124 } 125 if cap(b.buf)-len(b.buf) < n { 126 buf := make([]uint16, len(b.buf), 2*cap(b.buf)+n) 127 copy(buf, b.buf) 128 b.buf = buf 129 } 130 } 131 132 func (b *unicodeStringBuilder) ensureStarted(initialSize int) { 133 b.Grow(initialSize) 134 if len(b.buf) == 0 { 135 b.buf = append(b.buf, unistring.BOM) 136 } 137 } 138 139 func (b *unicodeStringBuilder) WriteString(s valueString) { 140 b.ensureStarted(s.length()) 141 a, u := devirtualizeString(s) 142 if u != nil { 143 b.buf = append(b.buf, u[1:]...) 144 b.unicode = true 145 } else { 146 for i := 0; i < len(a); i++ { 147 b.buf = append(b.buf, uint16(a[i])) 148 } 149 } 150 } 151 152 func (b *unicodeStringBuilder) String() valueString { 153 if b.unicode { 154 return unicodeString(b.buf) 155 } 156 if len(b.buf) == 0 { 157 return stringEmpty 158 } 159 buf := make([]byte, 0, len(b.buf)-1) 160 for _, c := range b.buf[1:] { 161 buf = append(buf, byte(c)) 162 } 163 return asciiString(buf) 164 } 165 166 func (b *unicodeStringBuilder) WriteRune(r rune) { 167 if r <= 0xFFFF { 168 b.ensureStarted(1) 169 b.buf = append(b.buf, uint16(r)) 170 if !b.unicode && r >= utf8.RuneSelf { 171 b.unicode = true 172 } 173 } else { 174 b.ensureStarted(2) 175 first, second := utf16.EncodeRune(r) 176 b.buf = append(b.buf, uint16(first), uint16(second)) 177 b.unicode = true 178 } 179 } 180 181 func (b *unicodeStringBuilder) writeASCIIString(bytes string) { 182 b.ensureStarted(len(bytes)) 183 for _, c := range bytes { 184 b.buf = append(b.buf, uint16(c)) 185 } 186 } 187 188 func (b *unicodeStringBuilder) writeUnicodeString(str unicodeString) { 189 b.ensureStarted(str.length()) 190 b.buf = append(b.buf, str[1:]...) 191 b.unicode = true 192 } 193 194 func (b *valueStringBuilder) ascii() bool { 195 return len(b.unicodeBuilder.buf) == 0 196 } 197 198 func (b *valueStringBuilder) WriteString(s valueString) { 199 a, u := devirtualizeString(s) 200 if u != nil { 201 b.switchToUnicode(u.length()) 202 b.unicodeBuilder.writeUnicodeString(u) 203 } else { 204 if b.ascii() { 205 b.asciiBuilder.WriteString(string(a)) 206 } else { 207 b.unicodeBuilder.writeASCIIString(string(a)) 208 } 209 } 210 } 211 212 func (b *valueStringBuilder) WriteASCII(s string) { 213 if b.ascii() { 214 b.asciiBuilder.WriteString(s) 215 } else { 216 b.unicodeBuilder.writeASCIIString(s) 217 } 218 } 219 220 func (b *valueStringBuilder) WriteRune(r rune) { 221 if r < utf8.RuneSelf { 222 if b.ascii() { 223 b.asciiBuilder.WriteByte(byte(r)) 224 } else { 225 b.unicodeBuilder.WriteRune(r) 226 } 227 } else { 228 var extraLen int 229 if r <= 0xFFFF { 230 extraLen = 1 231 } else { 232 extraLen = 2 233 } 234 b.switchToUnicode(extraLen) 235 b.unicodeBuilder.WriteRune(r) 236 } 237 } 238 239 func (b *valueStringBuilder) String() valueString { 240 if b.ascii() { 241 return asciiString(b.asciiBuilder.String()) 242 } 243 return b.unicodeBuilder.String() 244 } 245 246 func (b *valueStringBuilder) Grow(n int) { 247 if b.ascii() { 248 b.asciiBuilder.Grow(n) 249 } else { 250 b.unicodeBuilder.Grow(n) 251 } 252 } 253 254 func (b *valueStringBuilder) switchToUnicode(extraLen int) { 255 if b.ascii() { 256 b.unicodeBuilder.ensureStarted(b.asciiBuilder.Len() + extraLen) 257 b.unicodeBuilder.writeASCIIString(b.asciiBuilder.String()) 258 b.asciiBuilder.Reset() 259 } 260 } 261 262 func (b *valueStringBuilder) WriteSubstring(source valueString, start int, end int) { 263 a, us := devirtualizeString(source) 264 if us == nil { 265 if b.ascii() { 266 b.asciiBuilder.WriteString(string(a[start:end])) 267 } else { 268 b.unicodeBuilder.writeASCIIString(string(a[start:end])) 269 } 270 return 271 } 272 if b.ascii() { 273 uc := false 274 for i := start; i < end; i++ { 275 if us.charAt(i) >= utf8.RuneSelf { 276 uc = true 277 break 278 } 279 } 280 if uc { 281 b.switchToUnicode(end - start + 1) 282 } else { 283 b.asciiBuilder.Grow(end - start + 1) 284 for i := start; i < end; i++ { 285 b.asciiBuilder.WriteByte(byte(us.charAt(i))) 286 } 287 return 288 } 289 } 290 b.unicodeBuilder.buf = append(b.unicodeBuilder.buf, us[start+1:end+1]...) 291 b.unicodeBuilder.unicode = true 292 } 293 294 func (s unicodeString) reader() io.RuneReader { 295 return &unicodeRuneReader{ 296 s: s[1:], 297 } 298 } 299 300 func (s unicodeString) utf16Reader() io.RuneReader { 301 return &utf16RuneReader{ 302 s: s[1:], 303 } 304 } 305 306 func (s unicodeString) utf16Runes() []rune { 307 runes := make([]rune, len(s)-1) 308 for i, ch := range s[1:] { 309 runes[i] = rune(ch) 310 } 311 return runes 312 } 313 314 func (s unicodeString) ToInteger() int64 { 315 return 0 316 } 317 318 func (s unicodeString) toString() valueString { 319 return s 320 } 321 322 func (s unicodeString) ToString() Value { 323 return s 324 } 325 326 func (s unicodeString) ToFloat() float64 { 327 return math.NaN() 328 } 329 330 func (s unicodeString) ToBoolean() bool { 331 return len(s) > 0 332 } 333 334 func (s unicodeString) toTrimmedUTF8() string { 335 if len(s) == 0 { 336 return "" 337 } 338 return strings.Trim(s.String(), parser.WhitespaceChars) 339 } 340 341 func (s unicodeString) ToNumber() Value { 342 return asciiString(s.toTrimmedUTF8()).ToNumber() 343 } 344 345 func (s unicodeString) ToObject(r *Runtime) *Object { 346 return r._newString(s, r.global.StringPrototype) 347 } 348 349 func (s unicodeString) equals(other unicodeString) bool { 350 if len(s) != len(other) { 351 return false 352 } 353 for i, r := range s { 354 if r != other[i] { 355 return false 356 } 357 } 358 return true 359 } 360 361 func (s unicodeString) SameAs(other Value) bool { 362 return s.StrictEquals(other) 363 } 364 365 func (s unicodeString) Equals(other Value) bool { 366 if s.StrictEquals(other) { 367 return true 368 } 369 370 if o, ok := other.(*Object); ok { 371 return s.Equals(o.toPrimitive()) 372 } 373 return false 374 } 375 376 func (s unicodeString) StrictEquals(other Value) bool { 377 if otherStr, ok := other.(unicodeString); ok { 378 return s.equals(otherStr) 379 } 380 if otherStr, ok := other.(*importedString); ok { 381 otherStr.ensureScanned() 382 if otherStr.u != nil { 383 return s.equals(otherStr.u) 384 } 385 } 386 387 return false 388 } 389 390 func (s unicodeString) baseObject(r *Runtime) *Object { 391 ss := r.stringSingleton 392 ss.value = s 393 ss.setLength() 394 return ss.val 395 } 396 397 func (s unicodeString) charAt(idx int) rune { 398 return rune(s[idx+1]) 399 } 400 401 func (s unicodeString) length() int { 402 return len(s) - 1 403 } 404 405 func (s unicodeString) concat(other valueString) valueString { 406 a, u := devirtualizeString(other) 407 if u != nil { 408 b := make(unicodeString, len(s)+len(u)-1) 409 copy(b, s) 410 copy(b[len(s):], u[1:]) 411 return b 412 } 413 b := make([]uint16, len(s)+len(a)) 414 copy(b, s) 415 b1 := b[len(s):] 416 for i := 0; i < len(a); i++ { 417 b1[i] = uint16(a[i]) 418 } 419 return unicodeString(b) 420 } 421 422 func (s unicodeString) substring(start, end int) valueString { 423 ss := s[start+1 : end+1] 424 for _, c := range ss { 425 if c >= utf8.RuneSelf { 426 b := make(unicodeString, end-start+1) 427 b[0] = unistring.BOM 428 copy(b[1:], ss) 429 return b 430 } 431 } 432 as := make([]byte, end-start) 433 for i, c := range ss { 434 as[i] = byte(c) 435 } 436 return asciiString(as) 437 } 438 439 func (s unicodeString) String() string { 440 return string(utf16.Decode(s[1:])) 441 } 442 443 func (s unicodeString) compareTo(other valueString) int { 444 // TODO handle invalid UTF-16 445 return strings.Compare(s.String(), other.String()) 446 } 447 448 func (s unicodeString) index(substr valueString, start int) int { 449 var ss []uint16 450 a, u := devirtualizeString(substr) 451 if u != nil { 452 ss = u[1:] 453 } else { 454 ss = make([]uint16, len(a)) 455 for i := 0; i < len(a); i++ { 456 ss[i] = uint16(a[i]) 457 } 458 } 459 s1 := s[1:] 460 // TODO: optimise 461 end := len(s1) - len(ss) 462 for start <= end { 463 for i := 0; i < len(ss); i++ { 464 if s1[start+i] != ss[i] { 465 goto nomatch 466 } 467 } 468 469 return start 470 nomatch: 471 start++ 472 } 473 return -1 474 } 475 476 func (s unicodeString) lastIndex(substr valueString, start int) int { 477 var ss []uint16 478 a, u := devirtualizeString(substr) 479 if u != nil { 480 ss = u[1:] 481 } else { 482 ss = make([]uint16, len(a)) 483 for i := 0; i < len(a); i++ { 484 ss[i] = uint16(a[i]) 485 } 486 } 487 488 s1 := s[1:] 489 if maxStart := len(s1) - len(ss); start > maxStart { 490 start = maxStart 491 } 492 // TODO: optimise 493 for start >= 0 { 494 for i := 0; i < len(ss); i++ { 495 if s1[start+i] != ss[i] { 496 goto nomatch 497 } 498 } 499 500 return start 501 nomatch: 502 start-- 503 } 504 return -1 505 } 506 507 func unicodeStringFromRunes(r []rune) unicodeString { 508 return unistring.NewFromRunes(r).AsUtf16() 509 } 510 511 func toLower(s string) valueString { 512 caser := cases.Lower(language.Und) 513 r := []rune(caser.String(s)) 514 // Workaround 515 ascii := true 516 for i := 0; i < len(r)-1; i++ { 517 if (i == 0 || r[i-1] != 0x3b1) && r[i] == 0x345 && r[i+1] == 0x3c2 { 518 i++ 519 r[i] = 0x3c3 520 } 521 if r[i] >= utf8.RuneSelf { 522 ascii = false 523 } 524 } 525 if ascii { 526 ascii = r[len(r)-1] < utf8.RuneSelf 527 } 528 if ascii { 529 return asciiString(r) 530 } 531 return unicodeStringFromRunes(r) 532 } 533 534 func (s unicodeString) toLower() valueString { 535 return toLower(s.String()) 536 } 537 538 func (s unicodeString) toUpper() valueString { 539 caser := cases.Upper(language.Und) 540 return newStringValue(caser.String(s.String())) 541 } 542 543 func (s unicodeString) Export() interface{} { 544 return s.String() 545 } 546 547 func (s unicodeString) ExportType() reflect.Type { 548 return reflectTypeString 549 } 550 551 func (s unicodeString) hash(hash *maphash.Hash) uint64 { 552 _, _ = hash.WriteString(string(unistring.FromUtf16(s))) 553 h := hash.Sum64() 554 hash.Reset() 555 return h 556 } 557 558 func (s unicodeString) string() unistring.String { 559 return unistring.FromUtf16(s) 560 }