go.ketch.com/lib/goja@v0.0.1/string_unicode.go (about)

     1  package goja
     2  
     3  import (
     4  	"errors"
     5  	"hash/maphash"
     6  	"io"
     7  	"math"
     8  	"reflect"
     9  	"strings"
    10  	"unicode/utf16"
    11  	"unicode/utf8"
    12  
    13  	"go.ketch.com/lib/goja/parser"
    14  	"go.ketch.com/lib/goja/unistring"
    15  	"golang.org/x/text/cases"
    16  	"golang.org/x/text/language"
    17  )
    18  
// unicodeString is a string value stored as a slice of UTF-16 code units.
// Index 0 is reserved for a marker element (the builders in this file store
// unistring.BOM there); the string's own code units start at index 1, which
// is why length() is len(s)-1 and charAt(i) reads s[i+1].
type unicodeString []uint16
    20  
// unicodeRuneReader is an io.RuneReader over UTF-16 code units that decodes
// surrogate pairs into single runes and reports InvalidRuneError for
// unpaired surrogates.
type unicodeRuneReader struct {
	s   unicodeString // code units to read (already without the leading marker)
	pos int           // index of the next unread code unit in s
}
    25  
// utf16RuneReader is an io.RuneReader that yields every UTF-16 code unit as
// its own rune, without combining surrogate pairs.
type utf16RuneReader struct {
	s   unicodeString // code units to read (already without the leading marker)
	pos int           // index of the next unread code unit in s
}
    30  
// lenientUtf16Decoder wraps a reader of individual UTF-16 code units and
// combines valid surrogate pairs into single runes. Invalid (unpaired)
// surrogates are passed through unchanged instead of being reported as errors.
type lenientUtf16Decoder struct {
	utf16Reader io.RuneReader // source of individual code units
	prev        rune          // code unit read ahead while probing for a low surrogate
	prevSet     bool          // true when prev holds a unit that has not been returned yet
}
    37  
// valueStringBuilder builds a valueString incrementally. Content goes into
// asciiBuilder for as long as unicodeBuilder is empty; once the builder has
// switched to unicode (see switchToUnicode) everything is written to
// unicodeBuilder instead.
type valueStringBuilder struct {
	asciiBuilder   strings.Builder      // used while only ASCII has been written
	unicodeBuilder unicodeStringBuilder // used after the first non-ASCII write
}
    42  
// unicodeStringBuilder accumulates UTF-16 code units. Once started, buf[0]
// holds unistring.BOM and the content follows from index 1.
type unicodeStringBuilder struct {
	buf     []uint16 // marker element followed by the accumulated code units
	unicode bool     // set once a non-ASCII code unit or a unicode string has been written
}
    47  
var (
	// InvalidRuneError is returned by unicodeRuneReader.ReadRune when it
	// encounters a surrogate code unit that is not part of a valid pair.
	InvalidRuneError = errors.New("invalid rune")
)
    51  
    52  func (rr *utf16RuneReader) ReadRune() (r rune, size int, err error) {
    53  	if rr.pos < len(rr.s) {
    54  		r = rune(rr.s[rr.pos])
    55  		size++
    56  		rr.pos++
    57  		return
    58  	}
    59  	err = io.EOF
    60  	return
    61  }
    62  
    63  func (rr *lenientUtf16Decoder) ReadRune() (r rune, size int, err error) {
    64  	if rr.prevSet {
    65  		r = rr.prev
    66  		size = 1
    67  		rr.prevSet = false
    68  	} else {
    69  		r, size, err = rr.utf16Reader.ReadRune()
    70  		if err != nil {
    71  			return
    72  		}
    73  	}
    74  	if isUTF16FirstSurrogate(r) {
    75  		second, _, err1 := rr.utf16Reader.ReadRune()
    76  		if err1 != nil {
    77  			if err1 != io.EOF {
    78  				err = err1
    79  			}
    80  			return
    81  		}
    82  		if isUTF16SecondSurrogate(second) {
    83  			r = utf16.DecodeRune(r, second)
    84  			size++
    85  		} else {
    86  			rr.prev = second
    87  			rr.prevSet = true
    88  		}
    89  	}
    90  
    91  	return
    92  }
    93  
    94  func (rr *unicodeRuneReader) ReadRune() (r rune, size int, err error) {
    95  	if rr.pos < len(rr.s) {
    96  		r = rune(rr.s[rr.pos])
    97  		size++
    98  		rr.pos++
    99  		if isUTF16FirstSurrogate(r) {
   100  			if rr.pos < len(rr.s) {
   101  				second := rune(rr.s[rr.pos])
   102  				if isUTF16SecondSurrogate(second) {
   103  					r = utf16.DecodeRune(r, second)
   104  					size++
   105  					rr.pos++
   106  				} else {
   107  					err = InvalidRuneError
   108  				}
   109  			} else {
   110  				err = InvalidRuneError
   111  			}
   112  		} else if isUTF16SecondSurrogate(r) {
   113  			err = InvalidRuneError
   114  		}
   115  	} else {
   116  		err = io.EOF
   117  	}
   118  	return
   119  }
   120  
   121  func (b *unicodeStringBuilder) Grow(n int) {
   122  	if len(b.buf) == 0 {
   123  		n++
   124  	}
   125  	if cap(b.buf)-len(b.buf) < n {
   126  		buf := make([]uint16, len(b.buf), 2*cap(b.buf)+n)
   127  		copy(buf, b.buf)
   128  		b.buf = buf
   129  	}
   130  }
   131  
   132  func (b *unicodeStringBuilder) ensureStarted(initialSize int) {
   133  	b.Grow(initialSize)
   134  	if len(b.buf) == 0 {
   135  		b.buf = append(b.buf, unistring.BOM)
   136  	}
   137  }
   138  
   139  func (b *unicodeStringBuilder) WriteString(s valueString) {
   140  	b.ensureStarted(s.length())
   141  	a, u := devirtualizeString(s)
   142  	if u != nil {
   143  		b.buf = append(b.buf, u[1:]...)
   144  		b.unicode = true
   145  	} else {
   146  		for i := 0; i < len(a); i++ {
   147  			b.buf = append(b.buf, uint16(a[i]))
   148  		}
   149  	}
   150  }
   151  
   152  func (b *unicodeStringBuilder) String() valueString {
   153  	if b.unicode {
   154  		return unicodeString(b.buf)
   155  	}
   156  	if len(b.buf) == 0 {
   157  		return stringEmpty
   158  	}
   159  	buf := make([]byte, 0, len(b.buf)-1)
   160  	for _, c := range b.buf[1:] {
   161  		buf = append(buf, byte(c))
   162  	}
   163  	return asciiString(buf)
   164  }
   165  
   166  func (b *unicodeStringBuilder) WriteRune(r rune) {
   167  	if r <= 0xFFFF {
   168  		b.ensureStarted(1)
   169  		b.buf = append(b.buf, uint16(r))
   170  		if !b.unicode && r >= utf8.RuneSelf {
   171  			b.unicode = true
   172  		}
   173  	} else {
   174  		b.ensureStarted(2)
   175  		first, second := utf16.EncodeRune(r)
   176  		b.buf = append(b.buf, uint16(first), uint16(second))
   177  		b.unicode = true
   178  	}
   179  }
   180  
   181  func (b *unicodeStringBuilder) writeASCIIString(bytes string) {
   182  	b.ensureStarted(len(bytes))
   183  	for _, c := range bytes {
   184  		b.buf = append(b.buf, uint16(c))
   185  	}
   186  }
   187  
   188  func (b *unicodeStringBuilder) writeUnicodeString(str unicodeString) {
   189  	b.ensureStarted(str.length())
   190  	b.buf = append(b.buf, str[1:]...)
   191  	b.unicode = true
   192  }
   193  
   194  func (b *valueStringBuilder) ascii() bool {
   195  	return len(b.unicodeBuilder.buf) == 0
   196  }
   197  
   198  func (b *valueStringBuilder) WriteString(s valueString) {
   199  	a, u := devirtualizeString(s)
   200  	if u != nil {
   201  		b.switchToUnicode(u.length())
   202  		b.unicodeBuilder.writeUnicodeString(u)
   203  	} else {
   204  		if b.ascii() {
   205  			b.asciiBuilder.WriteString(string(a))
   206  		} else {
   207  			b.unicodeBuilder.writeASCIIString(string(a))
   208  		}
   209  	}
   210  }
   211  
   212  func (b *valueStringBuilder) WriteASCII(s string) {
   213  	if b.ascii() {
   214  		b.asciiBuilder.WriteString(s)
   215  	} else {
   216  		b.unicodeBuilder.writeASCIIString(s)
   217  	}
   218  }
   219  
   220  func (b *valueStringBuilder) WriteRune(r rune) {
   221  	if r < utf8.RuneSelf {
   222  		if b.ascii() {
   223  			b.asciiBuilder.WriteByte(byte(r))
   224  		} else {
   225  			b.unicodeBuilder.WriteRune(r)
   226  		}
   227  	} else {
   228  		var extraLen int
   229  		if r <= 0xFFFF {
   230  			extraLen = 1
   231  		} else {
   232  			extraLen = 2
   233  		}
   234  		b.switchToUnicode(extraLen)
   235  		b.unicodeBuilder.WriteRune(r)
   236  	}
   237  }
   238  
   239  func (b *valueStringBuilder) String() valueString {
   240  	if b.ascii() {
   241  		return asciiString(b.asciiBuilder.String())
   242  	}
   243  	return b.unicodeBuilder.String()
   244  }
   245  
   246  func (b *valueStringBuilder) Grow(n int) {
   247  	if b.ascii() {
   248  		b.asciiBuilder.Grow(n)
   249  	} else {
   250  		b.unicodeBuilder.Grow(n)
   251  	}
   252  }
   253  
   254  func (b *valueStringBuilder) switchToUnicode(extraLen int) {
   255  	if b.ascii() {
   256  		b.unicodeBuilder.ensureStarted(b.asciiBuilder.Len() + extraLen)
   257  		b.unicodeBuilder.writeASCIIString(b.asciiBuilder.String())
   258  		b.asciiBuilder.Reset()
   259  	}
   260  }
   261  
   262  func (b *valueStringBuilder) WriteSubstring(source valueString, start int, end int) {
   263  	a, us := devirtualizeString(source)
   264  	if us == nil {
   265  		if b.ascii() {
   266  			b.asciiBuilder.WriteString(string(a[start:end]))
   267  		} else {
   268  			b.unicodeBuilder.writeASCIIString(string(a[start:end]))
   269  		}
   270  		return
   271  	}
   272  	if b.ascii() {
   273  		uc := false
   274  		for i := start; i < end; i++ {
   275  			if us.charAt(i) >= utf8.RuneSelf {
   276  				uc = true
   277  				break
   278  			}
   279  		}
   280  		if uc {
   281  			b.switchToUnicode(end - start + 1)
   282  		} else {
   283  			b.asciiBuilder.Grow(end - start + 1)
   284  			for i := start; i < end; i++ {
   285  				b.asciiBuilder.WriteByte(byte(us.charAt(i)))
   286  			}
   287  			return
   288  		}
   289  	}
   290  	b.unicodeBuilder.buf = append(b.unicodeBuilder.buf, us[start+1:end+1]...)
   291  	b.unicodeBuilder.unicode = true
   292  }
   293  
   294  func (s unicodeString) reader() io.RuneReader {
   295  	return &unicodeRuneReader{
   296  		s: s[1:],
   297  	}
   298  }
   299  
   300  func (s unicodeString) utf16Reader() io.RuneReader {
   301  	return &utf16RuneReader{
   302  		s: s[1:],
   303  	}
   304  }
   305  
   306  func (s unicodeString) utf16Runes() []rune {
   307  	runes := make([]rune, len(s)-1)
   308  	for i, ch := range s[1:] {
   309  		runes[i] = rune(ch)
   310  	}
   311  	return runes
   312  }
   313  
// ToInteger always returns 0 for a unicode string.
// NOTE(review): ToNumber parses the trimmed text, while this method never
// does — confirm that the difference is intended.
func (s unicodeString) ToInteger() int64 {
	return 0
}
   317  
// toString returns s itself; it already is a valueString.
func (s unicodeString) toString() valueString {
	return s
}
   321  
// ToString returns s itself as a Value.
func (s unicodeString) ToString() Value {
	return s
}
   325  
// ToFloat always returns NaN for a unicode string.
// NOTE(review): ToNumber parses the trimmed text, while this method never
// does — confirm that the difference is intended.
func (s unicodeString) ToFloat() float64 {
	return math.NaN()
}
   329  
   330  func (s unicodeString) ToBoolean() bool {
   331  	return len(s) > 0
   332  }
   333  
   334  func (s unicodeString) toTrimmedUTF8() string {
   335  	if len(s) == 0 {
   336  		return ""
   337  	}
   338  	return strings.Trim(s.String(), parser.WhitespaceChars)
   339  }
   340  
   341  func (s unicodeString) ToNumber() Value {
   342  	return asciiString(s.toTrimmedUTF8()).ToNumber()
   343  }
   344  
// ToObject wraps s in a new string object created by r._newString, using the
// runtime's global StringPrototype.
func (s unicodeString) ToObject(r *Runtime) *Object {
	return r._newString(s, r.global.StringPrototype)
}
   348  
   349  func (s unicodeString) equals(other unicodeString) bool {
   350  	if len(s) != len(other) {
   351  		return false
   352  	}
   353  	for i, r := range s {
   354  		if r != other[i] {
   355  			return false
   356  		}
   357  	}
   358  	return true
   359  }
   360  
// SameAs reports whether other is the same value as s; for unicode strings
// this is exactly StrictEquals.
func (s unicodeString) SameAs(other Value) bool {
	return s.StrictEquals(other)
}
   364  
   365  func (s unicodeString) Equals(other Value) bool {
   366  	if s.StrictEquals(other) {
   367  		return true
   368  	}
   369  
   370  	if o, ok := other.(*Object); ok {
   371  		return s.Equals(o.toPrimitive())
   372  	}
   373  	return false
   374  }
   375  
   376  func (s unicodeString) StrictEquals(other Value) bool {
   377  	if otherStr, ok := other.(unicodeString); ok {
   378  		return s.equals(otherStr)
   379  	}
   380  	if otherStr, ok := other.(*importedString); ok {
   381  		otherStr.ensureScanned()
   382  		if otherStr.u != nil {
   383  			return s.equals(otherStr.u)
   384  		}
   385  	}
   386  
   387  	return false
   388  }
   389  
// baseObject returns the object used when s is accessed as an object. It
// does not allocate a new wrapper: it stores s in the runtime's shared
// stringSingleton, refreshes its length via setLength, and returns that
// singleton's val. The result is therefore overwritten by the next call.
func (s unicodeString) baseObject(r *Runtime) *Object {
	ss := r.stringSingleton
	ss.value = s
	ss.setLength()
	return ss.val
}
   396  
   397  func (s unicodeString) charAt(idx int) rune {
   398  	return rune(s[idx+1])
   399  }
   400  
// length returns the number of UTF-16 code units in the string, not counting
// the marker element at index 0.
func (s unicodeString) length() int {
	return len(s) - 1
}
   404  
   405  func (s unicodeString) concat(other valueString) valueString {
   406  	a, u := devirtualizeString(other)
   407  	if u != nil {
   408  		b := make(unicodeString, len(s)+len(u)-1)
   409  		copy(b, s)
   410  		copy(b[len(s):], u[1:])
   411  		return b
   412  	}
   413  	b := make([]uint16, len(s)+len(a))
   414  	copy(b, s)
   415  	b1 := b[len(s):]
   416  	for i := 0; i < len(a); i++ {
   417  		b1[i] = uint16(a[i])
   418  	}
   419  	return unicodeString(b)
   420  }
   421  
   422  func (s unicodeString) substring(start, end int) valueString {
   423  	ss := s[start+1 : end+1]
   424  	for _, c := range ss {
   425  		if c >= utf8.RuneSelf {
   426  			b := make(unicodeString, end-start+1)
   427  			b[0] = unistring.BOM
   428  			copy(b[1:], ss)
   429  			return b
   430  		}
   431  	}
   432  	as := make([]byte, end-start)
   433  	for i, c := range ss {
   434  		as[i] = byte(c)
   435  	}
   436  	return asciiString(as)
   437  }
   438  
   439  func (s unicodeString) String() string {
   440  	return string(utf16.Decode(s[1:]))
   441  }
   442  
   443  func (s unicodeString) compareTo(other valueString) int {
   444  	// TODO handle invalid UTF-16
   445  	return strings.Compare(s.String(), other.String())
   446  }
   447  
   448  func (s unicodeString) index(substr valueString, start int) int {
   449  	var ss []uint16
   450  	a, u := devirtualizeString(substr)
   451  	if u != nil {
   452  		ss = u[1:]
   453  	} else {
   454  		ss = make([]uint16, len(a))
   455  		for i := 0; i < len(a); i++ {
   456  			ss[i] = uint16(a[i])
   457  		}
   458  	}
   459  	s1 := s[1:]
   460  	// TODO: optimise
   461  	end := len(s1) - len(ss)
   462  	for start <= end {
   463  		for i := 0; i < len(ss); i++ {
   464  			if s1[start+i] != ss[i] {
   465  				goto nomatch
   466  			}
   467  		}
   468  
   469  		return start
   470  	nomatch:
   471  		start++
   472  	}
   473  	return -1
   474  }
   475  
   476  func (s unicodeString) lastIndex(substr valueString, start int) int {
   477  	var ss []uint16
   478  	a, u := devirtualizeString(substr)
   479  	if u != nil {
   480  		ss = u[1:]
   481  	} else {
   482  		ss = make([]uint16, len(a))
   483  		for i := 0; i < len(a); i++ {
   484  			ss[i] = uint16(a[i])
   485  		}
   486  	}
   487  
   488  	s1 := s[1:]
   489  	if maxStart := len(s1) - len(ss); start > maxStart {
   490  		start = maxStart
   491  	}
   492  	// TODO: optimise
   493  	for start >= 0 {
   494  		for i := 0; i < len(ss); i++ {
   495  			if s1[start+i] != ss[i] {
   496  				goto nomatch
   497  			}
   498  		}
   499  
   500  		return start
   501  	nomatch:
   502  		start--
   503  	}
   504  	return -1
   505  }
   506  
// unicodeStringFromRunes builds a unicodeString from r by way of
// unistring.NewFromRunes(r).AsUtf16().
func unicodeStringFromRunes(r []rune) unicodeString {
	return unistring.NewFromRunes(r).AsUtf16()
}
   510  
   511  func toLower(s string) valueString {
   512  	caser := cases.Lower(language.Und)
   513  	r := []rune(caser.String(s))
   514  	// Workaround
   515  	ascii := true
   516  	for i := 0; i < len(r)-1; i++ {
   517  		if (i == 0 || r[i-1] != 0x3b1) && r[i] == 0x345 && r[i+1] == 0x3c2 {
   518  			i++
   519  			r[i] = 0x3c3
   520  		}
   521  		if r[i] >= utf8.RuneSelf {
   522  			ascii = false
   523  		}
   524  	}
   525  	if ascii {
   526  		ascii = r[len(r)-1] < utf8.RuneSelf
   527  	}
   528  	if ascii {
   529  		return asciiString(r)
   530  	}
   531  	return unicodeStringFromRunes(r)
   532  }
   533  
// toLower returns the lower-cased form of s, computed by the package-level
// toLower on the UTF-8 rendering of s.
func (s unicodeString) toLower() valueString {
	return toLower(s.String())
}
   537  
   538  func (s unicodeString) toUpper() valueString {
   539  	caser := cases.Upper(language.Und)
   540  	return newStringValue(caser.String(s.String()))
   541  }
   542  
// Export returns the string as a Go string (the result of s.String()).
func (s unicodeString) Export() interface{} {
	return s.String()
}
   546  
// ExportType returns the reflect.Type of the value produced by Export
// (reflectTypeString).
func (s unicodeString) ExportType() reflect.Type {
	return reflectTypeString
}
   550  
   551  func (s unicodeString) hash(hash *maphash.Hash) uint64 {
   552  	_, _ = hash.WriteString(string(unistring.FromUtf16(s)))
   553  	h := hash.Sum64()
   554  	hash.Reset()
   555  	return h
   556  }
   557  
// string returns s converted to a unistring.String via unistring.FromUtf16.
func (s unicodeString) string() unistring.String {
	return unistring.FromUtf16(s)
}