github.com/primecitizens/pcz/std@v0.2.1/text/unicode/wtf16/decode.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright 2023 The Prime Citizens 3 // 4 // Copyright 2023 The Go Authors. All rights reserved. 5 // Use of this source code is governed by a BSD-style 6 // license that can be found in the LICENSE file. 7 8 package wtf16 9 10 import ( 11 . "github.com/primecitizens/pcz/std/text/unicode/common" 12 "github.com/primecitizens/pcz/std/text/unicode/utf16" 13 "github.com/primecitizens/pcz/std/text/unicode/utf8" 14 ) 15 16 func WTF8DecodedSize(s ...uint16) (n int, canDecodeInPlace bool) { 17 // use utf16.UTF8DecodedSize is fine because WTF8Decode handles 18 // invalid surrogates as 3-byte utf8 append, which is the same 19 // as utf8.RuneErrorLen 20 return utf16.UTF8DecodedSize(s...) 21 } 22 23 func WTF8DecodeAll(dst []byte, src ...uint16) []byte { 24 for n := 0; len(src) != 0; src = src[n:] { 25 if dst, n = WTF8Decode(dst, src...); n == 0 { 26 // TODO(alloc): grow dst 27 return dst 28 } 29 } 30 31 return dst 32 } 33 34 // WTF8Decode returns the WTF-8 encoding of the potentially ill-formed 35 // UTF-16 src. 36 func WTF8Decode(dst []byte, src ...uint16) ([]byte, int) { 37 var ( 38 r rune 39 sz, n int 40 ) 41 42 Loop: 43 for ; n < len(src); n++ { 44 switch r = rune(src[n]); { 45 case r < surr1, surr3 <= r: 46 // normal rune 47 dst, sz = utf8.EncodeRune(dst, r) 48 case surr1 <= r && r < surr2 && 49 n+1 < len(src) && 50 surr2 <= src[n+1] && src[n+1] < surr3: 51 // valid surrogate sequence 52 n++ 53 dst, sz = utf8.EncodeRune(dst, utf16.DecodeRune(r, rune(src[n]))) 54 default: 55 // WTF-8 fallback. 56 // This only handles the 3-byte case of utf8.AppendRune, 57 // as surrogates always fall in that case. 58 59 if cap(dst)-len(dst) < 3 { 60 break Loop 61 } 62 63 if r > MaxRune { 64 r = RuneError 65 } 66 67 dst = append(dst, t3|byte(r>>12), tx|byte(r>>6)&maskx, tx|byte(r)&maskx) 68 } 69 70 if sz == 0 { 71 break 72 } 73 } 74 75 return dst, n 76 }