github.com/nuvolaris/goja@v0.0.0-20230825100449-967811910c6d/string_unicode.go (about)

     1  package goja
     2  
     3  import (
     4  	"errors"
     5  	"hash/maphash"
     6  	"io"
     7  	"math"
     8  	"reflect"
     9  	"strings"
    10  	"unicode/utf16"
    11  	"unicode/utf8"
    12  
    13  	"github.com/nuvolaris/goja/parser"
    14  	"github.com/nuvolaris/goja/unistring"
    15  	"golang.org/x/text/cases"
    16  	"golang.org/x/text/language"
    17  )
    18  
    19  type unicodeString []uint16
    20  
    21  type unicodeRuneReader struct {
    22  	s   unicodeString
    23  	pos int
    24  }
    25  
    26  type utf16RuneReader struct {
    27  	s   unicodeString
    28  	pos int
    29  }
    30  
    31  // passes through invalid surrogate pairs
    32  type lenientUtf16Decoder struct {
    33  	utf16Reader utf16Reader
    34  	prev        uint16
    35  	prevSet     bool
    36  }
    37  
    38  // StringBuilder serves similar purpose to strings.Builder, except it works with ECMAScript String.
    39  // Use it to efficiently build 'native' ECMAScript values that either contain invalid UTF-16 surrogate pairs
    40  // (and therefore cannot be represented as UTF-8) or never expected to be exported to Go. See also
    41  // StringFromUTF16.
    42  type StringBuilder struct {
    43  	asciiBuilder   strings.Builder
    44  	unicodeBuilder unicodeStringBuilder
    45  }
    46  
    47  type unicodeStringBuilder struct {
    48  	buf     []uint16
    49  	unicode bool
    50  }
    51  
    52  var (
    53  	InvalidRuneError = errors.New("invalid rune")
    54  )
    55  
    56  func (rr *utf16RuneReader) readChar() (c uint16, err error) {
    57  	if rr.pos < len(rr.s) {
    58  		c = rr.s[rr.pos]
    59  		rr.pos++
    60  		return
    61  	}
    62  	err = io.EOF
    63  	return
    64  }
    65  
    66  func (rr *utf16RuneReader) ReadRune() (r rune, size int, err error) {
    67  	if rr.pos < len(rr.s) {
    68  		r = rune(rr.s[rr.pos])
    69  		rr.pos++
    70  		size = 1
    71  		return
    72  	}
    73  	err = io.EOF
    74  	return
    75  }
    76  
    77  func (rr *lenientUtf16Decoder) ReadRune() (r rune, size int, err error) {
    78  	var c uint16
    79  	if rr.prevSet {
    80  		c = rr.prev
    81  		rr.prevSet = false
    82  	} else {
    83  		c, err = rr.utf16Reader.readChar()
    84  		if err != nil {
    85  			return
    86  		}
    87  	}
    88  	size = 1
    89  	if isUTF16FirstSurrogate(c) {
    90  		second, err1 := rr.utf16Reader.readChar()
    91  		if err1 != nil {
    92  			if err1 != io.EOF {
    93  				err = err1
    94  			} else {
    95  				r = rune(c)
    96  			}
    97  			return
    98  		}
    99  		if isUTF16SecondSurrogate(second) {
   100  			r = utf16.DecodeRune(rune(c), rune(second))
   101  			size++
   102  			return
   103  		} else {
   104  			rr.prev = second
   105  			rr.prevSet = true
   106  		}
   107  	}
   108  	r = rune(c)
   109  	return
   110  }
   111  
   112  func (rr *unicodeRuneReader) ReadRune() (r rune, size int, err error) {
   113  	if rr.pos < len(rr.s) {
   114  		c := rr.s[rr.pos]
   115  		size++
   116  		rr.pos++
   117  		if isUTF16FirstSurrogate(c) {
   118  			if rr.pos < len(rr.s) {
   119  				second := rr.s[rr.pos]
   120  				if isUTF16SecondSurrogate(second) {
   121  					r = utf16.DecodeRune(rune(c), rune(second))
   122  					size++
   123  					rr.pos++
   124  					return
   125  				}
   126  			}
   127  			err = InvalidRuneError
   128  		} else if isUTF16SecondSurrogate(c) {
   129  			err = InvalidRuneError
   130  		}
   131  		r = rune(c)
   132  	} else {
   133  		err = io.EOF
   134  	}
   135  	return
   136  }
   137  
   138  func (b *unicodeStringBuilder) Grow(n int) {
   139  	if len(b.buf) == 0 {
   140  		n++
   141  	}
   142  	if cap(b.buf)-len(b.buf) < n {
   143  		buf := make([]uint16, len(b.buf), 2*cap(b.buf)+n)
   144  		copy(buf, b.buf)
   145  		b.buf = buf
   146  	}
   147  }
   148  
   149  func (b *unicodeStringBuilder) ensureStarted(initialSize int) {
   150  	b.Grow(initialSize)
   151  	if len(b.buf) == 0 {
   152  		b.buf = append(b.buf, unistring.BOM)
   153  	}
   154  }
   155  
   156  // assumes already started
   157  func (b *unicodeStringBuilder) writeString(s String) {
   158  	a, u := devirtualizeString(s)
   159  	if u != nil {
   160  		b.buf = append(b.buf, u[1:]...)
   161  		b.unicode = true
   162  	} else {
   163  		for i := 0; i < len(a); i++ {
   164  			b.buf = append(b.buf, uint16(a[i]))
   165  		}
   166  	}
   167  }
   168  
   169  func (b *unicodeStringBuilder) String() String {
   170  	if b.unicode {
   171  		return unicodeString(b.buf)
   172  	}
   173  	if len(b.buf) < 2 {
   174  		return stringEmpty
   175  	}
   176  	buf := make([]byte, 0, len(b.buf)-1)
   177  	for _, c := range b.buf[1:] {
   178  		buf = append(buf, byte(c))
   179  	}
   180  	return asciiString(buf)
   181  }
   182  
   183  func (b *unicodeStringBuilder) WriteRune(r rune) {
   184  	b.ensureStarted(2)
   185  	b.writeRuneFast(r)
   186  }
   187  
   188  // assumes already started
   189  func (b *unicodeStringBuilder) writeRuneFast(r rune) {
   190  	if r <= 0xFFFF {
   191  		b.buf = append(b.buf, uint16(r))
   192  		if !b.unicode && r >= utf8.RuneSelf {
   193  			b.unicode = true
   194  		}
   195  	} else {
   196  		first, second := utf16.EncodeRune(r)
   197  		b.buf = append(b.buf, uint16(first), uint16(second))
   198  		b.unicode = true
   199  	}
   200  }
   201  
   202  func (b *unicodeStringBuilder) writeASCIIString(bytes string) {
   203  	for _, c := range bytes {
   204  		b.buf = append(b.buf, uint16(c))
   205  	}
   206  }
   207  
   208  func (b *unicodeStringBuilder) writeUnicodeString(str unicodeString) {
   209  	b.buf = append(b.buf, str[1:]...)
   210  	b.unicode = true
   211  }
   212  
   213  func (b *StringBuilder) ascii() bool {
   214  	return len(b.unicodeBuilder.buf) == 0
   215  }
   216  
   217  func (b *StringBuilder) WriteString(s String) {
   218  	a, u := devirtualizeString(s)
   219  	if u != nil {
   220  		b.switchToUnicode(u.Length())
   221  		b.unicodeBuilder.writeUnicodeString(u)
   222  	} else {
   223  		if b.ascii() {
   224  			b.asciiBuilder.WriteString(string(a))
   225  		} else {
   226  			b.unicodeBuilder.writeASCIIString(string(a))
   227  		}
   228  	}
   229  }
   230  
   231  func (b *StringBuilder) WriteUTF8String(s string) {
   232  	firstUnicodeIdx := 0
   233  	if b.ascii() {
   234  		for i := 0; i < len(s); i++ {
   235  			if s[i] >= utf8.RuneSelf {
   236  				b.switchToUnicode(len(s))
   237  				b.unicodeBuilder.writeASCIIString(s[:i])
   238  				firstUnicodeIdx = i
   239  				goto unicode
   240  			}
   241  		}
   242  		b.asciiBuilder.WriteString(s)
   243  		return
   244  	}
   245  unicode:
   246  	for _, r := range s[firstUnicodeIdx:] {
   247  		b.unicodeBuilder.writeRuneFast(r)
   248  	}
   249  }
   250  
   251  func (b *StringBuilder) writeASCII(s string) {
   252  	if b.ascii() {
   253  		b.asciiBuilder.WriteString(s)
   254  	} else {
   255  		b.unicodeBuilder.writeASCIIString(s)
   256  	}
   257  }
   258  
   259  func (b *StringBuilder) WriteRune(r rune) {
   260  	if r < utf8.RuneSelf {
   261  		if b.ascii() {
   262  			b.asciiBuilder.WriteByte(byte(r))
   263  		} else {
   264  			b.unicodeBuilder.writeRuneFast(r)
   265  		}
   266  	} else {
   267  		var extraLen int
   268  		if r <= 0xFFFF {
   269  			extraLen = 1
   270  		} else {
   271  			extraLen = 2
   272  		}
   273  		b.switchToUnicode(extraLen)
   274  		b.unicodeBuilder.writeRuneFast(r)
   275  	}
   276  }
   277  
   278  func (b *StringBuilder) String() String {
   279  	if b.ascii() {
   280  		return asciiString(b.asciiBuilder.String())
   281  	}
   282  	return b.unicodeBuilder.String()
   283  }
   284  
   285  func (b *StringBuilder) Grow(n int) {
   286  	if b.ascii() {
   287  		b.asciiBuilder.Grow(n)
   288  	} else {
   289  		b.unicodeBuilder.Grow(n)
   290  	}
   291  }
   292  
   293  // LikelyUnicode hints to the builder that the resulting string is likely to contain Unicode (non-ASCII) characters.
   294  // The argument is an extra capacity (in characters) to reserve on top of the current length (it's like calling
   295  // Grow() afterwards).
   296  // This method may be called at any point (not just when the buffer is empty), although for efficiency it should
   297  // be called as early as possible.
   298  func (b *StringBuilder) LikelyUnicode(extraLen int) {
   299  	b.switchToUnicode(extraLen)
   300  }
   301  
   302  func (b *StringBuilder) switchToUnicode(extraLen int) {
   303  	if b.ascii() {
   304  		c := b.asciiBuilder.Cap()
   305  		newCap := b.asciiBuilder.Len() + extraLen
   306  		if newCap < c {
   307  			newCap = c
   308  		}
   309  		b.unicodeBuilder.ensureStarted(newCap)
   310  		b.unicodeBuilder.writeASCIIString(b.asciiBuilder.String())
   311  		b.asciiBuilder.Reset()
   312  	}
   313  }
   314  
   315  func (b *StringBuilder) WriteSubstring(source String, start int, end int) {
   316  	a, us := devirtualizeString(source)
   317  	if us == nil {
   318  		if b.ascii() {
   319  			b.asciiBuilder.WriteString(string(a[start:end]))
   320  		} else {
   321  			b.unicodeBuilder.writeASCIIString(string(a[start:end]))
   322  		}
   323  		return
   324  	}
   325  	if b.ascii() {
   326  		uc := false
   327  		for i := start; i < end; i++ {
   328  			if us.CharAt(i) >= utf8.RuneSelf {
   329  				uc = true
   330  				break
   331  			}
   332  		}
   333  		if uc {
   334  			b.switchToUnicode(end - start + 1)
   335  		} else {
   336  			b.asciiBuilder.Grow(end - start + 1)
   337  			for i := start; i < end; i++ {
   338  				b.asciiBuilder.WriteByte(byte(us.CharAt(i)))
   339  			}
   340  			return
   341  		}
   342  	}
   343  	b.unicodeBuilder.buf = append(b.unicodeBuilder.buf, us[start+1:end+1]...)
   344  	b.unicodeBuilder.unicode = true
   345  }
   346  
   347  func (s unicodeString) Reader() io.RuneReader {
   348  	return &unicodeRuneReader{
   349  		s: s[1:],
   350  	}
   351  }
   352  
   353  func (s unicodeString) utf16Reader() utf16Reader {
   354  	return &utf16RuneReader{
   355  		s: s[1:],
   356  	}
   357  }
   358  
   359  func (s unicodeString) utf16RuneReader() io.RuneReader {
   360  	return &utf16RuneReader{
   361  		s: s[1:],
   362  	}
   363  }
   364  
   365  func (s unicodeString) utf16Runes() []rune {
   366  	runes := make([]rune, len(s)-1)
   367  	for i, ch := range s[1:] {
   368  		runes[i] = rune(ch)
   369  	}
   370  	return runes
   371  }
   372  
   373  func (s unicodeString) ToInteger() int64 {
   374  	return 0
   375  }
   376  
   377  func (s unicodeString) toString() String {
   378  	return s
   379  }
   380  
   381  func (s unicodeString) ToString() Value {
   382  	return s
   383  }
   384  
   385  func (s unicodeString) ToFloat() float64 {
   386  	return math.NaN()
   387  }
   388  
   389  func (s unicodeString) ToBoolean() bool {
   390  	return len(s) > 0
   391  }
   392  
   393  func (s unicodeString) toTrimmedUTF8() string {
   394  	if len(s) == 0 {
   395  		return ""
   396  	}
   397  	return strings.Trim(s.String(), parser.WhitespaceChars)
   398  }
   399  
   400  func (s unicodeString) ToNumber() Value {
   401  	return asciiString(s.toTrimmedUTF8()).ToNumber()
   402  }
   403  
   404  func (s unicodeString) ToObject(r *Runtime) *Object {
   405  	return r._newString(s, r.global.StringPrototype)
   406  }
   407  
   408  func (s unicodeString) equals(other unicodeString) bool {
   409  	if len(s) != len(other) {
   410  		return false
   411  	}
   412  	for i, r := range s {
   413  		if r != other[i] {
   414  			return false
   415  		}
   416  	}
   417  	return true
   418  }
   419  
   420  func (s unicodeString) SameAs(other Value) bool {
   421  	return s.StrictEquals(other)
   422  }
   423  
   424  func (s unicodeString) Equals(other Value) bool {
   425  	if s.StrictEquals(other) {
   426  		return true
   427  	}
   428  
   429  	if o, ok := other.(*Object); ok {
   430  		return s.Equals(o.toPrimitive())
   431  	}
   432  	return false
   433  }
   434  
   435  func (s unicodeString) StrictEquals(other Value) bool {
   436  	if otherStr, ok := other.(unicodeString); ok {
   437  		return s.equals(otherStr)
   438  	}
   439  	if otherStr, ok := other.(*importedString); ok {
   440  		otherStr.ensureScanned()
   441  		if otherStr.u != nil {
   442  			return s.equals(otherStr.u)
   443  		}
   444  	}
   445  
   446  	return false
   447  }
   448  
   449  func (s unicodeString) baseObject(r *Runtime) *Object {
   450  	ss := r.stringSingleton
   451  	ss.value = s
   452  	ss.setLength()
   453  	return ss.val
   454  }
   455  
   456  func (s unicodeString) CharAt(idx int) uint16 {
   457  	return s[idx+1]
   458  }
   459  
   460  func (s unicodeString) Length() int {
   461  	return len(s) - 1
   462  }
   463  
   464  func (s unicodeString) Concat(other String) String {
   465  	a, u := devirtualizeString(other)
   466  	if u != nil {
   467  		b := make(unicodeString, len(s)+len(u)-1)
   468  		copy(b, s)
   469  		copy(b[len(s):], u[1:])
   470  		return b
   471  	}
   472  	b := make([]uint16, len(s)+len(a))
   473  	copy(b, s)
   474  	b1 := b[len(s):]
   475  	for i := 0; i < len(a); i++ {
   476  		b1[i] = uint16(a[i])
   477  	}
   478  	return unicodeString(b)
   479  }
   480  
   481  func (s unicodeString) Substring(start, end int) String {
   482  	ss := s[start+1 : end+1]
   483  	for _, c := range ss {
   484  		if c >= utf8.RuneSelf {
   485  			b := make(unicodeString, end-start+1)
   486  			b[0] = unistring.BOM
   487  			copy(b[1:], ss)
   488  			return b
   489  		}
   490  	}
   491  	as := make([]byte, end-start)
   492  	for i, c := range ss {
   493  		as[i] = byte(c)
   494  	}
   495  	return asciiString(as)
   496  }
   497  
   498  func (s unicodeString) String() string {
   499  	return string(utf16.Decode(s[1:]))
   500  }
   501  
   502  func (s unicodeString) CompareTo(other String) int {
   503  	// TODO handle invalid UTF-16
   504  	return strings.Compare(s.String(), other.String())
   505  }
   506  
   507  func (s unicodeString) index(substr String, start int) int {
   508  	var ss []uint16
   509  	a, u := devirtualizeString(substr)
   510  	if u != nil {
   511  		ss = u[1:]
   512  	} else {
   513  		ss = make([]uint16, len(a))
   514  		for i := 0; i < len(a); i++ {
   515  			ss[i] = uint16(a[i])
   516  		}
   517  	}
   518  	s1 := s[1:]
   519  	// TODO: optimise
   520  	end := len(s1) - len(ss)
   521  	for start <= end {
   522  		for i := 0; i < len(ss); i++ {
   523  			if s1[start+i] != ss[i] {
   524  				goto nomatch
   525  			}
   526  		}
   527  
   528  		return start
   529  	nomatch:
   530  		start++
   531  	}
   532  	return -1
   533  }
   534  
   535  func (s unicodeString) lastIndex(substr String, start int) int {
   536  	var ss []uint16
   537  	a, u := devirtualizeString(substr)
   538  	if u != nil {
   539  		ss = u[1:]
   540  	} else {
   541  		ss = make([]uint16, len(a))
   542  		for i := 0; i < len(a); i++ {
   543  			ss[i] = uint16(a[i])
   544  		}
   545  	}
   546  
   547  	s1 := s[1:]
   548  	if maxStart := len(s1) - len(ss); start > maxStart {
   549  		start = maxStart
   550  	}
   551  	// TODO: optimise
   552  	for start >= 0 {
   553  		for i := 0; i < len(ss); i++ {
   554  			if s1[start+i] != ss[i] {
   555  				goto nomatch
   556  			}
   557  		}
   558  
   559  		return start
   560  	nomatch:
   561  		start--
   562  	}
   563  	return -1
   564  }
   565  
   566  func unicodeStringFromRunes(r []rune) unicodeString {
   567  	return unistring.NewFromRunes(r).AsUtf16()
   568  }
   569  
   570  func toLower(s string) String {
   571  	caser := cases.Lower(language.Und)
   572  	r := []rune(caser.String(s))
   573  	// Workaround
   574  	ascii := true
   575  	for i := 0; i < len(r)-1; i++ {
   576  		if (i == 0 || r[i-1] != 0x3b1) && r[i] == 0x345 && r[i+1] == 0x3c2 {
   577  			i++
   578  			r[i] = 0x3c3
   579  		}
   580  		if r[i] >= utf8.RuneSelf {
   581  			ascii = false
   582  		}
   583  	}
   584  	if ascii {
   585  		ascii = r[len(r)-1] < utf8.RuneSelf
   586  	}
   587  	if ascii {
   588  		return asciiString(r)
   589  	}
   590  	return unicodeStringFromRunes(r)
   591  }
   592  
   593  func (s unicodeString) toLower() String {
   594  	return toLower(s.String())
   595  }
   596  
   597  func (s unicodeString) toUpper() String {
   598  	caser := cases.Upper(language.Und)
   599  	return newStringValue(caser.String(s.String()))
   600  }
   601  
   602  func (s unicodeString) Export() interface{} {
   603  	return s.String()
   604  }
   605  
   606  func (s unicodeString) ExportType() reflect.Type {
   607  	return reflectTypeString
   608  }
   609  
   610  func (s unicodeString) hash(hash *maphash.Hash) uint64 {
   611  	_, _ = hash.WriteString(string(unistring.FromUtf16(s)))
   612  	h := hash.Sum64()
   613  	hash.Reset()
   614  	return h
   615  }
   616  
   617  func (s unicodeString) string() unistring.String {
   618  	return unistring.FromUtf16(s)
   619  }