github.com/grumpyhome/grumpy@v0.3.1-0.20201208125205-7b775405bdf1/grumpy-runtime-src/runtime/unicode.go (about) 1 // Copyright 2016 Google Inc. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package grumpy 16 17 import ( 18 "bytes" 19 "fmt" 20 "reflect" 21 "unicode" 22 "unicode/utf8" 23 ) 24 25 var ( 26 // UnicodeType is the object representing the Python 'unicode' type. 27 UnicodeType = newBasisType("unicode", reflect.TypeOf(Unicode{}), toUnicodeUnsafe, BaseStringType) 28 ) 29 30 // Unicode represents Python 'unicode' objects. The string value is stored as 31 // utf-32 data. 32 type Unicode struct { 33 Object 34 value []rune 35 } 36 37 // NewUnicode returns a new Unicode holding the given string value. value is 38 // assumed to be a valid utf-8 string. 39 func NewUnicode(value string) *Unicode { 40 return NewUnicodeFromRunes(bytes.Runes([]byte(value))) 41 } 42 43 // NewUnicodeFromRunes returns a new Unicode holding the given runes. 44 func NewUnicodeFromRunes(value []rune) *Unicode { 45 return &Unicode{Object{typ: UnicodeType}, value} 46 } 47 48 func toUnicodeUnsafe(o *Object) *Unicode { 49 return (*Unicode)(o.toPointer()) 50 } 51 52 // Encode translates the runes in s into a str with the given encoding. 53 // 54 // NOTE: If s contains surrogates (e.g. U+D800), Encode will raise 55 // UnicodeDecodeError consistent with CPython 3.x but different than 2.x. 56 func (s *Unicode) Encode(f *Frame, encoding, errors string) (*Str, *BaseException) { 57 // TODO: Support custom encodings and error handlers. 58 normalized := normalizeEncoding(encoding) 59 if normalized != "utf8" { 60 return nil, f.RaiseType(LookupErrorType, fmt.Sprintf("unknown encoding: %s", encoding)) 61 } 62 buf := bytes.Buffer{} 63 for i, r := range s.Value() { 64 switch { 65 case utf8.ValidRune(r): 66 buf.WriteRune(r) 67 case errors == EncodeIgnore: 68 // Do nothing 69 case errors == EncodeReplace: 70 buf.WriteRune(unicode.ReplacementChar) 71 case errors == EncodeStrict: 72 format := "'%s' codec can't encode character %s in position %d" 73 return nil, f.RaiseType(UnicodeEncodeErrorType, fmt.Sprintf(format, encoding, escapeRune(r), i)) 74 default: 75 format := "unknown error handler name '%s'" 76 return nil, f.RaiseType(LookupErrorType, fmt.Sprintf(format, errors)) 77 } 78 } 79 return NewStr(buf.String()), nil 80 } 81 82 // ToObject upcasts s to an Object. 83 func (s *Unicode) ToObject() *Object { 84 return &s.Object 85 } 86 87 // Value returns the underlying string value held by s. 88 func (s *Unicode) Value() []rune { 89 return s.value 90 } 91 92 func unicodeAdd(f *Frame, v, w *Object) (*Object, *BaseException) { 93 unicodeV := toUnicodeUnsafe(v) 94 unicodeW, raised := unicodeCoerce(f, w) 95 if raised != nil { 96 return nil, raised 97 } 98 lenV := len(unicodeV.Value()) 99 newLen := lenV + len(unicodeW.Value()) 100 if newLen < 0 { 101 return nil, f.RaiseType(OverflowErrorType, errResultTooLarge) 102 } 103 value := make([]rune, newLen) 104 copy(value, unicodeV.Value()) 105 copy(value[lenV:], unicodeW.Value()) 106 return NewUnicodeFromRunes(value).ToObject(), nil 107 } 108 109 func unicodeContains(f *Frame, o *Object, value *Object) (*Object, *BaseException) { 110 lhs := toUnicodeUnsafe(o).Value() 111 s, raised := unicodeCoerce(f, value) 112 if raised != nil { 113 return nil, raised 114 } 115 rhs := s.Value() 116 lhsLen, rhsLen := len(lhs), len(rhs) 117 maxOffset := lhsLen - rhsLen 118 for offset := 0; offset <= maxOffset; offset++ { 119 if runeSliceCmp(lhs[offset:offset+rhsLen], rhs) == 0 { 120 return True.ToObject(), nil 121 } 122 } 123 return False.ToObject(), nil 124 } 125 126 func unicodeEncode(f *Frame, args Args, kwargs KWArgs) (*Object, *BaseException) { 127 // TODO: Accept unicode for encoding and errors args. 128 expectedTypes := []*Type{UnicodeType, StrType, StrType} 129 argc := len(args) 130 if argc >= 1 && argc < 3 { 131 expectedTypes = expectedTypes[:argc] 132 } 133 if raised := checkMethodArgs(f, "encode", args, expectedTypes...); raised != nil { 134 return nil, raised 135 } 136 encoding := EncodeDefault 137 if argc > 1 { 138 encoding = toStrUnsafe(args[1]).Value() 139 } 140 errors := EncodeStrict 141 if argc > 2 { 142 errors = toStrUnsafe(args[2]).Value() 143 } 144 ret, raised := toUnicodeUnsafe(args[0]).Encode(f, encoding, errors) 145 if raised != nil { 146 return nil, raised 147 } 148 return ret.ToObject(), nil 149 } 150 151 func unicodeEq(f *Frame, v, w *Object) (*Object, *BaseException) { 152 return unicodeCompareEq(f, toUnicodeUnsafe(v), w, true) 153 } 154 155 func unicodeGE(f *Frame, v, w *Object) (*Object, *BaseException) { 156 return unicodeCompare(f, toUnicodeUnsafe(v), w, False, True, True) 157 } 158 159 // unicodeGetItem returns a slice of string depending on whether index is an 160 // integer or a slice. If index is neither of those types then a TypeError is 161 // returned. 162 func unicodeGetItem(f *Frame, o, key *Object) (*Object, *BaseException) { 163 s := toUnicodeUnsafe(o).Value() 164 switch { 165 case key.typ.slots.Index != nil: 166 index, raised := seqCheckedIndex(f, len(s), toIntUnsafe(key).Value()) 167 if raised != nil { 168 return nil, raised 169 } 170 return NewUnicodeFromRunes([]rune{s[index]}).ToObject(), nil 171 case key.isInstance(SliceType): 172 slice := toSliceUnsafe(key) 173 start, stop, step, sliceLen, raised := slice.calcSlice(f, len(s)) 174 if raised != nil { 175 return nil, raised 176 } 177 if step == 1 { 178 return NewUnicodeFromRunes(s[start:stop]).ToObject(), nil 179 } 180 result := make([]rune, 0, sliceLen) 181 for j := start; j < stop; j += step { 182 result = append(result, s[j]) 183 } 184 return NewUnicodeFromRunes([]rune(result)).ToObject(), nil 185 } 186 return nil, f.RaiseType(TypeErrorType, fmt.Sprintf("unicode indices must be integers or slice, not %s", key.typ.Name())) 187 } 188 189 func unicodeGetNewArgs(f *Frame, args Args, _ KWArgs) (*Object, *BaseException) { 190 if raised := checkMethodArgs(f, "__getnewargs__", args, UnicodeType); raised != nil { 191 return nil, raised 192 } 193 return NewTuple1(args[0]).ToObject(), nil 194 } 195 196 func unicodeGT(f *Frame, v, w *Object) (*Object, *BaseException) { 197 return unicodeCompare(f, toUnicodeUnsafe(v), w, False, False, True) 198 } 199 200 func unicodeHash(f *Frame, o *Object) (*Object, *BaseException) { 201 s := toUnicodeUnsafe(o).Value() 202 l := len(s) 203 if l == 0 { 204 return NewInt(0).ToObject(), nil 205 } 206 h := int(s[0]) << 7 207 for _, r := range s { 208 h = (1000003 * h) ^ int(r) 209 } 210 h ^= l 211 if h == -1 { 212 h = -2 213 } 214 return NewInt(h).ToObject(), nil 215 } 216 217 func unicodeJoin(f *Frame, args Args, _ KWArgs) (*Object, *BaseException) { 218 if raised := checkMethodArgs(f, "join", args, UnicodeType, ObjectType); raised != nil { 219 return nil, raised 220 } 221 var result *Object 222 raised := seqApply(f, args[1], func(parts []*Object, _ bool) (raised *BaseException) { 223 result, raised = unicodeJoinParts(f, toUnicodeUnsafe(args[0]), parts) 224 return raised 225 }) 226 if raised != nil { 227 return nil, raised 228 } 229 return result, nil 230 } 231 232 func unicodeLE(f *Frame, v, w *Object) (*Object, *BaseException) { 233 return unicodeCompare(f, toUnicodeUnsafe(v), w, True, True, False) 234 } 235 236 func unicodeLen(f *Frame, o *Object) (*Object, *BaseException) { 237 return NewInt(len(toUnicodeUnsafe(o).Value())).ToObject(), nil 238 } 239 240 func unicodeLT(f *Frame, v, w *Object) (*Object, *BaseException) { 241 return unicodeCompare(f, toUnicodeUnsafe(v), w, True, False, False) 242 } 243 244 func unicodeMul(f *Frame, v, w *Object) (*Object, *BaseException) { 245 value := toUnicodeUnsafe(v).Value() 246 numChars := len(value) 247 n, ok, raised := strRepeatCount(f, numChars, w) 248 if raised != nil { 249 return nil, raised 250 } 251 if !ok { 252 return NotImplemented, nil 253 } 254 newLen := numChars * n 255 newValue := make([]rune, newLen) 256 for i := 0; i < newLen; i += numChars { 257 copy(newValue[i:], value) 258 } 259 return NewUnicodeFromRunes(newValue).ToObject(), nil 260 } 261 262 func unicodeNative(f *Frame, o *Object) (reflect.Value, *BaseException) { 263 // Encode to utf-8 when passing data out to Go. 264 s, raised := toUnicodeUnsafe(o).Encode(f, EncodeDefault, EncodeStrict) 265 if raised != nil { 266 return reflect.Value{}, raised 267 } 268 return reflect.ValueOf(s.Value()), nil 269 } 270 271 func unicodeNE(f *Frame, v, w *Object) (*Object, *BaseException) { 272 return unicodeCompareEq(f, toUnicodeUnsafe(v), w, false) 273 } 274 275 func unicodeNew(f *Frame, t *Type, args Args, _ KWArgs) (ret *Object, raised *BaseException) { 276 // TODO: Accept keyword arguments: string, encoding, errors. 277 if t != UnicodeType { 278 // Allocate a plain unicode then copy it's value into an object 279 // of the unicode subtype. 280 s, raised := unicodeNew(f, UnicodeType, args, nil) 281 if raised != nil { 282 return nil, raised 283 } 284 result := toUnicodeUnsafe(newObject(t)) 285 result.value = toUnicodeUnsafe(s).Value() 286 return result.ToObject(), nil 287 } 288 expectedTypes := []*Type{ObjectType, StrType, StrType} 289 argc := len(args) 290 if argc < 3 { 291 expectedTypes = expectedTypes[:argc] 292 } 293 if raised := checkMethodArgs(f, "__new__", args, expectedTypes...); raised != nil { 294 return nil, raised 295 } 296 if argc == 0 { 297 return NewUnicodeFromRunes(nil).ToObject(), nil 298 } 299 arg0 := args[0] 300 if argc == 1 { 301 if unicode := arg0.typ.slots.Unicode; unicode != nil { 302 ret, raised = unicode.Fn(f, arg0) 303 } else if arg0.typ == UnicodeType { 304 ret = toUnicodeUnsafe(arg0).ToObject() 305 } else if arg0.isInstance(UnicodeType) { 306 // Return a unicode object (not a subtype). 307 ret = NewUnicodeFromRunes(toUnicodeUnsafe(arg0).Value()).ToObject() 308 } else if str := arg0.typ.slots.Str; str != nil { 309 ret, raised = str.Fn(f, arg0) 310 } else { 311 var s *Str 312 if s, raised = Repr(f, arg0); raised == nil { 313 ret = s.ToObject() 314 } 315 } 316 if raised != nil { 317 return nil, raised 318 } 319 u, raised := unicodeCoerce(f, ret) 320 if raised != nil { 321 return nil, raised 322 } 323 return u.ToObject(), nil 324 } 325 if !arg0.isInstance(StrType) { 326 format := "coercing to Unicode: need str, %s found" 327 return nil, f.RaiseType(TypeErrorType, fmt.Sprintf(format, arg0.typ.Name())) 328 } 329 encoding := toStrUnsafe(args[1]).Value() 330 errors := "strict" 331 if argc > 2 { 332 errors = toStrUnsafe(args[2]).Value() 333 } 334 s, raised := toStrUnsafe(arg0).Decode(f, encoding, errors) 335 if raised != nil { 336 return nil, raised 337 } 338 return s.ToObject(), nil 339 } 340 341 func unicodeRepr(_ *Frame, o *Object) (*Object, *BaseException) { 342 buf := bytes.Buffer{} 343 buf.WriteString("u'") 344 for _, r := range toUnicodeUnsafe(o).Value() { 345 if escape, ok := escapeMap[r]; ok { 346 buf.WriteString(escape) 347 } else if r <= unicode.MaxASCII && unicode.IsPrint(r) { 348 buf.WriteRune(r) 349 } else { 350 buf.Write(escapeRune(r)) 351 } 352 } 353 buf.WriteRune('\'') 354 return NewStr(buf.String()).ToObject(), nil 355 } 356 357 func unicodeStr(f *Frame, o *Object) (*Object, *BaseException) { 358 ret, raised := toUnicodeUnsafe(o).Encode(f, EncodeDefault, EncodeStrict) 359 if raised != nil { 360 return nil, raised 361 } 362 return ret.ToObject(), nil 363 } 364 365 func unicodeStrip(f *Frame, args Args, _ KWArgs) (*Object, *BaseException) { 366 expectedTypes := []*Type{UnicodeType, ObjectType} 367 argc := len(args) 368 if argc == 1 { 369 expectedTypes = expectedTypes[:argc] 370 } 371 if raised := checkMethodArgs(f, "strip", args, expectedTypes...); raised != nil { 372 return nil, raised 373 } 374 s := toUnicodeUnsafe(args[0]) 375 charsArg := None 376 if argc > 1 { 377 charsArg = args[1] 378 } 379 matchFunc := unicode.IsSpace 380 if charsArg != None { 381 chars, raised := unicodeCoerce(f, charsArg) 382 if raised != nil { 383 return nil, raised 384 } 385 matchFunc = func(r rune) bool { 386 for _, c := range chars.Value() { 387 if r == c { 388 return true 389 } 390 } 391 return false 392 } 393 } 394 runes := s.Value() 395 numRunes := len(runes) 396 lindex := 0 397 for ; lindex < numRunes; lindex++ { 398 if !matchFunc(runes[lindex]) { 399 break 400 } 401 } 402 rindex := numRunes 403 for ; rindex > lindex; rindex-- { 404 if !matchFunc(runes[rindex-1]) { 405 break 406 } 407 } 408 result := make([]rune, rindex-lindex) 409 copy(result, runes[lindex:rindex]) 410 return NewUnicodeFromRunes(result).ToObject(), nil 411 } 412 413 func initUnicodeType(dict map[string]*Object) { 414 dict["__getnewargs__"] = newBuiltinFunction("__getnewargs__", unicodeGetNewArgs).ToObject() 415 dict["encode"] = newBuiltinFunction("encode", unicodeEncode).ToObject() 416 dict["join"] = newBuiltinFunction("join", unicodeJoin).ToObject() 417 dict["strip"] = newBuiltinFunction("strip", unicodeStrip).ToObject() 418 UnicodeType.slots.Add = &binaryOpSlot{unicodeAdd} 419 UnicodeType.slots.Contains = &binaryOpSlot{unicodeContains} 420 UnicodeType.slots.Eq = &binaryOpSlot{unicodeEq} 421 UnicodeType.slots.GE = &binaryOpSlot{unicodeGE} 422 UnicodeType.slots.GetItem = &binaryOpSlot{unicodeGetItem} 423 UnicodeType.slots.GT = &binaryOpSlot{unicodeGT} 424 UnicodeType.slots.Hash = &unaryOpSlot{unicodeHash} 425 UnicodeType.slots.LE = &binaryOpSlot{unicodeLE} 426 UnicodeType.slots.Len = &unaryOpSlot{unicodeLen} 427 UnicodeType.slots.LT = &binaryOpSlot{unicodeLT} 428 UnicodeType.slots.Mul = &binaryOpSlot{unicodeMul} 429 UnicodeType.slots.NE = &binaryOpSlot{unicodeNE} 430 UnicodeType.slots.New = &newSlot{unicodeNew} 431 UnicodeType.slots.Native = &nativeSlot{unicodeNative} 432 UnicodeType.slots.RMul = &binaryOpSlot{unicodeMul} 433 UnicodeType.slots.Repr = &unaryOpSlot{unicodeRepr} 434 UnicodeType.slots.Str = &unaryOpSlot{unicodeStr} 435 } 436 437 func unicodeCompare(f *Frame, v *Unicode, w *Object, ltResult, eqResult, gtResult *Int) (*Object, *BaseException) { 438 rhs := []rune(nil) 439 if w.isInstance(UnicodeType) { 440 rhs = toUnicodeUnsafe(w).Value() 441 } else if w.isInstance(StrType) { 442 ret, raised := toStrUnsafe(w).Decode(f, EncodeDefault, EncodeStrict) 443 if raised != nil { 444 return nil, raised 445 } 446 rhs = ret.Value() 447 } else { 448 return NotImplemented, nil 449 } 450 switch runeSliceCmp(v.Value(), rhs) { 451 case -1: 452 return ltResult.ToObject(), nil 453 case 0: 454 return eqResult.ToObject(), nil 455 default: 456 return gtResult.ToObject(), nil 457 } 458 } 459 460 func runeSliceCmp(lhs []rune, rhs []rune) int { 461 lhsLen, rhsLen := len(lhs), len(rhs) 462 minLen := lhsLen 463 if rhsLen < lhsLen { 464 minLen = rhsLen 465 } 466 for i := 0; i < minLen; i++ { 467 if lhs[i] < rhs[i] { 468 return -1 469 } 470 if lhs[i] > rhs[i] { 471 return 1 472 } 473 } 474 if lhsLen < rhsLen { 475 return -1 476 } 477 if lhsLen > rhsLen { 478 return 1 479 } 480 return 0 481 } 482 483 // unicodeCompareEq returns the result of comparing whether v and w are equal 484 // (when eq is true) or unequal (when eq is false). It differs from 485 // unicodeCompare in that it will safely decode w if it has type str and 486 // therefore will not raise UnicodeDecodeError. 487 func unicodeCompareEq(f *Frame, v *Unicode, w *Object, eq bool) (*Object, *BaseException) { 488 if w.isInstance(UnicodeType) { 489 // Do the standard comparison knowing that we won't raise 490 // UnicodeDecodeError for w. 491 return unicodeCompare(f, v, w, GetBool(!eq), GetBool(eq), GetBool(!eq)) 492 } 493 if !w.isInstance(StrType) { 494 return NotImplemented, nil 495 } 496 lhs := v.Value() 497 lhsLen := len(lhs) 498 i := 0 499 // Decode w as utf-8. 500 for _, r := range toStrUnsafe(w).Value() { 501 // lhs[i] should never be RuneError so the second part of the 502 // condition should catch that case. 503 if i >= lhsLen || lhs[i] != r { 504 return GetBool(!eq).ToObject(), nil 505 } 506 i++ 507 } 508 return GetBool((i == lhsLen) == eq).ToObject(), nil 509 } 510 511 func unicodeCoerce(f *Frame, o *Object) (*Unicode, *BaseException) { 512 switch { 513 case o.isInstance(StrType): 514 return toStrUnsafe(o).Decode(f, EncodeDefault, EncodeStrict) 515 case o.isInstance(UnicodeType): 516 return toUnicodeUnsafe(o), nil 517 default: 518 format := "coercing to Unicode: need string, %s found" 519 return nil, f.RaiseType(TypeErrorType, fmt.Sprintf(format, o.typ.Name())) 520 } 521 } 522 523 func unicodeJoinParts(f *Frame, s *Unicode, parts []*Object) (*Object, *BaseException) { 524 numParts := len(parts) 525 if numParts == 0 { 526 return NewUnicode("").ToObject(), nil 527 } 528 sep := s.Value() 529 sepLen := len(sep) 530 unicodeParts := make([]*Unicode, numParts) 531 // Calculate the size of the required buffer. 532 numRunes := (numParts - 1) * len(sep) 533 for i, part := range parts { 534 s, raised := unicodeCoerce(f, part) 535 if raised != nil { 536 return nil, raised 537 } 538 unicodeParts[i] = s 539 numRunes += len(s.Value()) 540 } 541 // Piece together the result string into buf. 542 buf := make([]rune, numRunes) 543 offset := 0 544 for i, part := range unicodeParts { 545 if i > 0 { 546 copy(buf[offset:offset+sepLen], sep) 547 offset += sepLen 548 } 549 s := part.Value() 550 l := len(s) 551 copy(buf[offset:offset+l], s) 552 offset += l 553 } 554 return NewUnicodeFromRunes(buf).ToObject(), nil 555 }