github.com/google/grumpy@v0.0.0-20171122020858-3ec87959189c/runtime/unicode.go (about)

     1  // Copyright 2016 Google Inc. All Rights Reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package grumpy
    16  
    17  import (
    18  	"bytes"
    19  	"fmt"
    20  	"reflect"
    21  	"unicode"
    22  	"unicode/utf8"
    23  )
    24  
    25  var (
    26  	// UnicodeType is the object representing the Python 'unicode' type.
    27  	UnicodeType = newBasisType("unicode", reflect.TypeOf(Unicode{}), toUnicodeUnsafe, BaseStringType)
    28  )
    29  
// Unicode represents Python 'unicode' objects. The string value is stored as
// utf-32 data, i.e. one rune per code point.
//
// NOTE(review): toUnicodeUnsafe casts an *Object pointer back to *Unicode,
// which presumably relies on Object being the first field — confirm before
// reordering the fields.
type Unicode struct {
	Object
	// value holds the code points of the string.
	value []rune
}
    36  
    37  // NewUnicode returns a new Unicode holding the given string value. value is
    38  // assumed to be a valid utf-8 string.
    39  func NewUnicode(value string) *Unicode {
    40  	return NewUnicodeFromRunes(bytes.Runes([]byte(value)))
    41  }
    42  
    43  // NewUnicodeFromRunes returns a new Unicode holding the given runes.
    44  func NewUnicodeFromRunes(value []rune) *Unicode {
    45  	return &Unicode{Object{typ: UnicodeType}, value}
    46  }
    47  
    48  func toUnicodeUnsafe(o *Object) *Unicode {
    49  	return (*Unicode)(o.toPointer())
    50  }
    51  
    52  // Encode translates the runes in s into a str with the given encoding.
    53  //
    54  // NOTE: If s contains surrogates (e.g. U+D800), Encode will raise
    55  // UnicodeDecodeError consistent with CPython 3.x but different than 2.x.
    56  func (s *Unicode) Encode(f *Frame, encoding, errors string) (*Str, *BaseException) {
    57  	// TODO: Support custom encodings and error handlers.
    58  	normalized := normalizeEncoding(encoding)
    59  	if normalized != "utf8" {
    60  		return nil, f.RaiseType(LookupErrorType, fmt.Sprintf("unknown encoding: %s", encoding))
    61  	}
    62  	buf := bytes.Buffer{}
    63  	for i, r := range s.Value() {
    64  		switch {
    65  		case utf8.ValidRune(r):
    66  			buf.WriteRune(r)
    67  		case errors == EncodeIgnore:
    68  			// Do nothing
    69  		case errors == EncodeReplace:
    70  			buf.WriteRune(unicode.ReplacementChar)
    71  		case errors == EncodeStrict:
    72  			format := "'%s' codec can't encode character %s in position %d"
    73  			return nil, f.RaiseType(UnicodeEncodeErrorType, fmt.Sprintf(format, encoding, escapeRune(r), i))
    74  		default:
    75  			format := "unknown error handler name '%s'"
    76  			return nil, f.RaiseType(LookupErrorType, fmt.Sprintf(format, errors))
    77  		}
    78  	}
    79  	return NewStr(buf.String()), nil
    80  }
    81  
    82  // ToObject upcasts s to an Object.
    83  func (s *Unicode) ToObject() *Object {
    84  	return &s.Object
    85  }
    86  
    87  // Value returns the underlying string value held by s.
    88  func (s *Unicode) Value() []rune {
    89  	return s.value
    90  }
    91  
    92  func unicodeAdd(f *Frame, v, w *Object) (*Object, *BaseException) {
    93  	unicodeV := toUnicodeUnsafe(v)
    94  	unicodeW, raised := unicodeCoerce(f, w)
    95  	if raised != nil {
    96  		return nil, raised
    97  	}
    98  	lenV := len(unicodeV.Value())
    99  	newLen := lenV + len(unicodeW.Value())
   100  	if newLen < 0 {
   101  		return nil, f.RaiseType(OverflowErrorType, errResultTooLarge)
   102  	}
   103  	value := make([]rune, newLen)
   104  	copy(value, unicodeV.Value())
   105  	copy(value[lenV:], unicodeW.Value())
   106  	return NewUnicodeFromRunes(value).ToObject(), nil
   107  }
   108  
   109  func unicodeContains(f *Frame, o *Object, value *Object) (*Object, *BaseException) {
   110  	lhs := toUnicodeUnsafe(o).Value()
   111  	s, raised := unicodeCoerce(f, value)
   112  	if raised != nil {
   113  		return nil, raised
   114  	}
   115  	rhs := s.Value()
   116  	lhsLen, rhsLen := len(lhs), len(rhs)
   117  	maxOffset := lhsLen - rhsLen
   118  	for offset := 0; offset <= maxOffset; offset++ {
   119  		if runeSliceCmp(lhs[offset:offset+rhsLen], rhs) == 0 {
   120  			return True.ToObject(), nil
   121  		}
   122  	}
   123  	return False.ToObject(), nil
   124  }
   125  
   126  func unicodeEncode(f *Frame, args Args, kwargs KWArgs) (*Object, *BaseException) {
   127  	// TODO: Accept unicode for encoding and errors args.
   128  	expectedTypes := []*Type{UnicodeType, StrType, StrType}
   129  	argc := len(args)
   130  	if argc >= 1 && argc < 3 {
   131  		expectedTypes = expectedTypes[:argc]
   132  	}
   133  	if raised := checkMethodArgs(f, "encode", args, expectedTypes...); raised != nil {
   134  		return nil, raised
   135  	}
   136  	encoding := EncodeDefault
   137  	if argc > 1 {
   138  		encoding = toStrUnsafe(args[1]).Value()
   139  	}
   140  	errors := EncodeStrict
   141  	if argc > 2 {
   142  		errors = toStrUnsafe(args[2]).Value()
   143  	}
   144  	ret, raised := toUnicodeUnsafe(args[0]).Encode(f, encoding, errors)
   145  	if raised != nil {
   146  		return nil, raised
   147  	}
   148  	return ret.ToObject(), nil
   149  }
   150  
   151  func unicodeEq(f *Frame, v, w *Object) (*Object, *BaseException) {
   152  	return unicodeCompareEq(f, toUnicodeUnsafe(v), w, true)
   153  }
   154  
   155  func unicodeGE(f *Frame, v, w *Object) (*Object, *BaseException) {
   156  	return unicodeCompare(f, toUnicodeUnsafe(v), w, False, True, True)
   157  }
   158  
   159  // unicodeGetItem returns a slice of string depending on whether index is an
   160  // integer or a slice. If index is neither of those types then a TypeError is
   161  // returned.
   162  func unicodeGetItem(f *Frame, o, key *Object) (*Object, *BaseException) {
   163  	s := toUnicodeUnsafe(o).Value()
   164  	switch {
   165  	case key.typ.slots.Index != nil:
   166  		index, raised := seqCheckedIndex(f, len(s), toIntUnsafe(key).Value())
   167  		if raised != nil {
   168  			return nil, raised
   169  		}
   170  		return NewUnicodeFromRunes([]rune{s[index]}).ToObject(), nil
   171  	case key.isInstance(SliceType):
   172  		slice := toSliceUnsafe(key)
   173  		start, stop, step, sliceLen, raised := slice.calcSlice(f, len(s))
   174  		if raised != nil {
   175  			return nil, raised
   176  		}
   177  		if step == 1 {
   178  			return NewUnicodeFromRunes(s[start:stop]).ToObject(), nil
   179  		}
   180  		result := make([]rune, 0, sliceLen)
   181  		for j := start; j < stop; j += step {
   182  			result = append(result, s[j])
   183  		}
   184  		return NewUnicodeFromRunes([]rune(result)).ToObject(), nil
   185  	}
   186  	return nil, f.RaiseType(TypeErrorType, fmt.Sprintf("unicode indices must be integers or slice, not %s", key.typ.Name()))
   187  }
   188  
   189  func unicodeGetNewArgs(f *Frame, args Args, _ KWArgs) (*Object, *BaseException) {
   190  	if raised := checkMethodArgs(f, "__getnewargs__", args, UnicodeType); raised != nil {
   191  		return nil, raised
   192  	}
   193  	return NewTuple1(args[0]).ToObject(), nil
   194  }
   195  
   196  func unicodeGT(f *Frame, v, w *Object) (*Object, *BaseException) {
   197  	return unicodeCompare(f, toUnicodeUnsafe(v), w, False, False, True)
   198  }
   199  
   200  func unicodeHash(f *Frame, o *Object) (*Object, *BaseException) {
   201  	s := toUnicodeUnsafe(o).Value()
   202  	l := len(s)
   203  	if l == 0 {
   204  		return NewInt(0).ToObject(), nil
   205  	}
   206  	h := int(s[0]) << 7
   207  	for _, r := range s {
   208  		h = (1000003 * h) ^ int(r)
   209  	}
   210  	h ^= l
   211  	if h == -1 {
   212  		h = -2
   213  	}
   214  	return NewInt(h).ToObject(), nil
   215  }
   216  
   217  func unicodeJoin(f *Frame, args Args, _ KWArgs) (*Object, *BaseException) {
   218  	if raised := checkMethodArgs(f, "join", args, UnicodeType, ObjectType); raised != nil {
   219  		return nil, raised
   220  	}
   221  	var result *Object
   222  	raised := seqApply(f, args[1], func(parts []*Object, _ bool) (raised *BaseException) {
   223  		result, raised = unicodeJoinParts(f, toUnicodeUnsafe(args[0]), parts)
   224  		return raised
   225  	})
   226  	if raised != nil {
   227  		return nil, raised
   228  	}
   229  	return result, nil
   230  }
   231  
   232  func unicodeLE(f *Frame, v, w *Object) (*Object, *BaseException) {
   233  	return unicodeCompare(f, toUnicodeUnsafe(v), w, True, True, False)
   234  }
   235  
   236  func unicodeLen(f *Frame, o *Object) (*Object, *BaseException) {
   237  	return NewInt(len(toUnicodeUnsafe(o).Value())).ToObject(), nil
   238  }
   239  
   240  func unicodeLT(f *Frame, v, w *Object) (*Object, *BaseException) {
   241  	return unicodeCompare(f, toUnicodeUnsafe(v), w, True, False, False)
   242  }
   243  
   244  func unicodeMul(f *Frame, v, w *Object) (*Object, *BaseException) {
   245  	value := toUnicodeUnsafe(v).Value()
   246  	numChars := len(value)
   247  	n, ok, raised := strRepeatCount(f, numChars, w)
   248  	if raised != nil {
   249  		return nil, raised
   250  	}
   251  	if !ok {
   252  		return NotImplemented, nil
   253  	}
   254  	newLen := numChars * n
   255  	newValue := make([]rune, newLen)
   256  	for i := 0; i < newLen; i += numChars {
   257  		copy(newValue[i:], value)
   258  	}
   259  	return NewUnicodeFromRunes(newValue).ToObject(), nil
   260  }
   261  
   262  func unicodeNative(f *Frame, o *Object) (reflect.Value, *BaseException) {
   263  	// Encode to utf-8 when passing data out to Go.
   264  	s, raised := toUnicodeUnsafe(o).Encode(f, EncodeDefault, EncodeStrict)
   265  	if raised != nil {
   266  		return reflect.Value{}, raised
   267  	}
   268  	return reflect.ValueOf(s.Value()), nil
   269  }
   270  
   271  func unicodeNE(f *Frame, v, w *Object) (*Object, *BaseException) {
   272  	return unicodeCompareEq(f, toUnicodeUnsafe(v), w, false)
   273  }
   274  
// unicodeNew implements the New slot for unicode and its subtypes.
//
// With no arguments it returns an empty unicode object. With one argument the
// argument is converted using, in order of preference, its Unicode slot, the
// object itself (exact unicode), a plain copy (unicode subtype), its Str slot
// or Repr, and the outcome is then coerced to unicode. With two or three
// arguments the first must be a str, which is decoded using the given encoding
// and error handler (the handler defaults to "strict").
//
// When t is a subtype of unicode, a plain unicode is built first and its value
// is copied into a freshly allocated object of type t.
func unicodeNew(f *Frame, t *Type, args Args, _ KWArgs) (ret *Object, raised *BaseException) {
	// TODO: Accept keyword arguments: string, encoding, errors.
	if t != UnicodeType {
		// Allocate a plain unicode then copy its value into an object
		// of the unicode subtype.
		s, raised := unicodeNew(f, UnicodeType, args, nil)
		if raised != nil {
			return nil, raised
		}
		result := toUnicodeUnsafe(newObject(t))
		result.value = toUnicodeUnsafe(s).Value()
		return result.ToObject(), nil
	}
	expectedTypes := []*Type{ObjectType, StrType, StrType}
	argc := len(args)
	if argc < 3 {
		// Only validate the arguments that were actually supplied.
		expectedTypes = expectedTypes[:argc]
	}
	if raised := checkMethodArgs(f, "__new__", args, expectedTypes...); raised != nil {
		return nil, raised
	}
	if argc == 0 {
		return NewUnicodeFromRunes(nil).ToObject(), nil
	}
	arg0 := args[0]
	if argc == 1 {
		// Single argument: pick a conversion for arg0. These branches assign
		// to the named results ret and raised, which are checked below.
		if unicode := arg0.typ.slots.Unicode; unicode != nil {
			ret, raised = unicode.Fn(f, arg0)
		} else if arg0.typ == UnicodeType {
			ret = toUnicodeUnsafe(arg0).ToObject()
		} else if arg0.isInstance(UnicodeType) {
			// Return a unicode object (not a subtype).
			ret = NewUnicodeFromRunes(toUnicodeUnsafe(arg0).Value()).ToObject()
		} else if str := arg0.typ.slots.Str; str != nil {
			ret, raised = str.Fn(f, arg0)
		} else {
			var s *Str
			if s, raised = Repr(f, arg0); raised == nil {
				ret = s.ToObject()
			}
		}
		if raised != nil {
			return nil, raised
		}
		// The conversion may have produced a str (or something else);
		// unicodeCoerce decodes a str and rejects non-string results.
		u, raised := unicodeCoerce(f, ret)
		if raised != nil {
			return nil, raised
		}
		return u.ToObject(), nil
	}
	// An encoding was supplied, so arg0 has to be a str to decode.
	if !arg0.isInstance(StrType) {
		format := "coercing to Unicode: need str, %s found"
		return nil, f.RaiseType(TypeErrorType, fmt.Sprintf(format, arg0.typ.Name()))
	}
	encoding := toStrUnsafe(args[1]).Value()
	errors := "strict"
	if argc > 2 {
		errors = toStrUnsafe(args[2]).Value()
	}
	s, raised := toStrUnsafe(arg0).Decode(f, encoding, errors)
	if raised != nil {
		return nil, raised
	}
	return s.ToObject(), nil
}
   340  
   341  func unicodeRepr(_ *Frame, o *Object) (*Object, *BaseException) {
   342  	buf := bytes.Buffer{}
   343  	buf.WriteString("u'")
   344  	for _, r := range toUnicodeUnsafe(o).Value() {
   345  		if escape, ok := escapeMap[r]; ok {
   346  			buf.WriteString(escape)
   347  		} else if r <= unicode.MaxASCII && unicode.IsPrint(r) {
   348  			buf.WriteRune(r)
   349  		} else {
   350  			buf.Write(escapeRune(r))
   351  		}
   352  	}
   353  	buf.WriteRune('\'')
   354  	return NewStr(buf.String()).ToObject(), nil
   355  }
   356  
   357  func unicodeStr(f *Frame, o *Object) (*Object, *BaseException) {
   358  	ret, raised := toUnicodeUnsafe(o).Encode(f, EncodeDefault, EncodeStrict)
   359  	if raised != nil {
   360  		return nil, raised
   361  	}
   362  	return ret.ToObject(), nil
   363  }
   364  
   365  func unicodeStrip(f *Frame, args Args, _ KWArgs) (*Object, *BaseException) {
   366  	expectedTypes := []*Type{UnicodeType, ObjectType}
   367  	argc := len(args)
   368  	if argc == 1 {
   369  		expectedTypes = expectedTypes[:argc]
   370  	}
   371  	if raised := checkMethodArgs(f, "strip", args, expectedTypes...); raised != nil {
   372  		return nil, raised
   373  	}
   374  	s := toUnicodeUnsafe(args[0])
   375  	charsArg := None
   376  	if argc > 1 {
   377  		charsArg = args[1]
   378  	}
   379  	matchFunc := unicode.IsSpace
   380  	if charsArg != None {
   381  		chars, raised := unicodeCoerce(f, charsArg)
   382  		if raised != nil {
   383  			return nil, raised
   384  		}
   385  		matchFunc = func(r rune) bool {
   386  			for _, c := range chars.Value() {
   387  				if r == c {
   388  					return true
   389  				}
   390  			}
   391  			return false
   392  		}
   393  	}
   394  	runes := s.Value()
   395  	numRunes := len(runes)
   396  	lindex := 0
   397  	for ; lindex < numRunes; lindex++ {
   398  		if !matchFunc(runes[lindex]) {
   399  			break
   400  		}
   401  	}
   402  	rindex := numRunes
   403  	for ; rindex > lindex; rindex-- {
   404  		if !matchFunc(runes[rindex-1]) {
   405  			break
   406  		}
   407  	}
   408  	result := make([]rune, rindex-lindex)
   409  	copy(result, runes[lindex:rindex])
   410  	return NewUnicodeFromRunes(result).ToObject(), nil
   411  }
   412  
   413  func initUnicodeType(dict map[string]*Object) {
   414  	dict["__getnewargs__"] = newBuiltinFunction("__getnewargs__", unicodeGetNewArgs).ToObject()
   415  	dict["encode"] = newBuiltinFunction("encode", unicodeEncode).ToObject()
   416  	dict["join"] = newBuiltinFunction("join", unicodeJoin).ToObject()
   417  	dict["strip"] = newBuiltinFunction("strip", unicodeStrip).ToObject()
   418  	UnicodeType.slots.Add = &binaryOpSlot{unicodeAdd}
   419  	UnicodeType.slots.Contains = &binaryOpSlot{unicodeContains}
   420  	UnicodeType.slots.Eq = &binaryOpSlot{unicodeEq}
   421  	UnicodeType.slots.GE = &binaryOpSlot{unicodeGE}
   422  	UnicodeType.slots.GetItem = &binaryOpSlot{unicodeGetItem}
   423  	UnicodeType.slots.GT = &binaryOpSlot{unicodeGT}
   424  	UnicodeType.slots.Hash = &unaryOpSlot{unicodeHash}
   425  	UnicodeType.slots.LE = &binaryOpSlot{unicodeLE}
   426  	UnicodeType.slots.Len = &unaryOpSlot{unicodeLen}
   427  	UnicodeType.slots.LT = &binaryOpSlot{unicodeLT}
   428  	UnicodeType.slots.Mul = &binaryOpSlot{unicodeMul}
   429  	UnicodeType.slots.NE = &binaryOpSlot{unicodeNE}
   430  	UnicodeType.slots.New = &newSlot{unicodeNew}
   431  	UnicodeType.slots.Native = &nativeSlot{unicodeNative}
   432  	UnicodeType.slots.RMul = &binaryOpSlot{unicodeMul}
   433  	UnicodeType.slots.Repr = &unaryOpSlot{unicodeRepr}
   434  	UnicodeType.slots.Str = &unaryOpSlot{unicodeStr}
   435  }
   436  
   437  func unicodeCompare(f *Frame, v *Unicode, w *Object, ltResult, eqResult, gtResult *Int) (*Object, *BaseException) {
   438  	rhs := []rune(nil)
   439  	if w.isInstance(UnicodeType) {
   440  		rhs = toUnicodeUnsafe(w).Value()
   441  	} else if w.isInstance(StrType) {
   442  		ret, raised := toStrUnsafe(w).Decode(f, EncodeDefault, EncodeStrict)
   443  		if raised != nil {
   444  			return nil, raised
   445  		}
   446  		rhs = ret.Value()
   447  	} else {
   448  		return NotImplemented, nil
   449  	}
   450  	switch runeSliceCmp(v.Value(), rhs) {
   451  	case -1:
   452  		return ltResult.ToObject(), nil
   453  	case 0:
   454  		return eqResult.ToObject(), nil
   455  	default:
   456  		return gtResult.ToObject(), nil
   457  	}
   458  }
   459  
   460  func runeSliceCmp(lhs []rune, rhs []rune) int {
   461  	lhsLen, rhsLen := len(lhs), len(rhs)
   462  	minLen := lhsLen
   463  	if rhsLen < lhsLen {
   464  		minLen = rhsLen
   465  	}
   466  	for i := 0; i < minLen; i++ {
   467  		if lhs[i] < rhs[i] {
   468  			return -1
   469  		}
   470  		if lhs[i] > rhs[i] {
   471  			return 1
   472  		}
   473  	}
   474  	if lhsLen < rhsLen {
   475  		return -1
   476  	}
   477  	if lhsLen > rhsLen {
   478  		return 1
   479  	}
   480  	return 0
   481  }
   482  
   483  // unicodeCompareEq returns the result of comparing whether v and w are equal
   484  // (when eq is true) or unequal (when eq is false). It differs from
   485  // unicodeCompare in that it will safely decode w if it has type str and
   486  // therefore will not raise UnicodeDecodeError.
   487  func unicodeCompareEq(f *Frame, v *Unicode, w *Object, eq bool) (*Object, *BaseException) {
   488  	if w.isInstance(UnicodeType) {
   489  		// Do the standard comparison knowing that we won't raise
   490  		// UnicodeDecodeError for w.
   491  		return unicodeCompare(f, v, w, GetBool(!eq), GetBool(eq), GetBool(!eq))
   492  	}
   493  	if !w.isInstance(StrType) {
   494  		return NotImplemented, nil
   495  	}
   496  	lhs := v.Value()
   497  	lhsLen := len(lhs)
   498  	i := 0
   499  	// Decode w as utf-8.
   500  	for _, r := range toStrUnsafe(w).Value() {
   501  		// lhs[i] should never be RuneError so the second part of the
   502  		// condition should catch that case.
   503  		if i >= lhsLen || lhs[i] != r {
   504  			return GetBool(!eq).ToObject(), nil
   505  		}
   506  		i++
   507  	}
   508  	return GetBool((i == lhsLen) == eq).ToObject(), nil
   509  }
   510  
   511  func unicodeCoerce(f *Frame, o *Object) (*Unicode, *BaseException) {
   512  	switch {
   513  	case o.isInstance(StrType):
   514  		return toStrUnsafe(o).Decode(f, EncodeDefault, EncodeStrict)
   515  	case o.isInstance(UnicodeType):
   516  		return toUnicodeUnsafe(o), nil
   517  	default:
   518  		format := "coercing to Unicode: need string, %s found"
   519  		return nil, f.RaiseType(TypeErrorType, fmt.Sprintf(format, o.typ.Name()))
   520  	}
   521  }
   522  
   523  func unicodeJoinParts(f *Frame, s *Unicode, parts []*Object) (*Object, *BaseException) {
   524  	numParts := len(parts)
   525  	if numParts == 0 {
   526  		return NewUnicode("").ToObject(), nil
   527  	}
   528  	sep := s.Value()
   529  	sepLen := len(sep)
   530  	unicodeParts := make([]*Unicode, numParts)
   531  	// Calculate the size of the required buffer.
   532  	numRunes := (numParts - 1) * len(sep)
   533  	for i, part := range parts {
   534  		s, raised := unicodeCoerce(f, part)
   535  		if raised != nil {
   536  			return nil, raised
   537  		}
   538  		unicodeParts[i] = s
   539  		numRunes += len(s.Value())
   540  	}
   541  	// Piece together the result string into buf.
   542  	buf := make([]rune, numRunes)
   543  	offset := 0
   544  	for i, part := range unicodeParts {
   545  		if i > 0 {
   546  			copy(buf[offset:offset+sepLen], sep)
   547  			offset += sepLen
   548  		}
   549  		s := part.Value()
   550  		l := len(s)
   551  		copy(buf[offset:offset+l], s)
   552  		offset += l
   553  	}
   554  	return NewUnicodeFromRunes(buf).ToObject(), nil
   555  }