github.com/primecitizens/pcz/std@v0.2.1/text/unicode/utf16/decode.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright 2023 The Prime Citizens 3 // 4 // Copyright 2010 The Go Authors. All rights reserved. 5 // Use of this source code is governed by a BSD-style 6 // license that can be found in the LICENSE file. 7 8 package utf16 9 10 import ( 11 "github.com/primecitizens/pcz/std/core/assert" 12 . "github.com/primecitizens/pcz/std/text/unicode/common" 13 "github.com/primecitizens/pcz/std/text/unicode/utf8" 14 ) 15 16 // UTF8DecodedSize returns UTF-8 bytes required to store the decoded 17 // UTF-16 encoding of unicode code points. 18 func UTF8DecodedSize(utf16 ...uint16) (n int, canDecodeInPlace bool) { 19 var ( 20 i int 21 ar rune 22 ) 23 24 // when canDecodeInPlace is true: i*2 >= n 25 for canDecodeInPlace = true; i < len(utf16); { 26 switch ar = rune(utf16[i]); { 27 case ar < surr1, surr3 <= ar: 28 // normal rune 29 i++ 30 n += utf8.RuneLen(ar) 31 if canDecodeInPlace && i*2 /* space avail */ < n /* space required */ { 32 canDecodeInPlace = false 33 } 34 case surr1 <= ar && ar < surr2 && 35 i+1 < len(utf16) && 36 surr2 <= utf16[i+1] && utf16[i+1] < surr3: 37 // valid surrogate sequence 38 // in this case, we have 4-bytes available for utf-8, so it 39 // can always decode to utf8 in-place, no need to update 40 // canDecodeInPlace 41 i++ 42 n += utf8.RuneLen(DecodeRune(ar, rune(utf16[i]))) 43 i++ 44 default: 45 // invalid surrogate sequence 46 i++ 47 n += utf8.RuneErrorLen 48 if canDecodeInPlace && i*2 /* space avail */ < n /* space required */ { 49 canDecodeInPlace = false 50 } 51 } 52 } 53 54 return 55 } 56 57 // DecodeRune returns the UTF-16 decoding of a surrogate pair. 58 // If the pair is not a valid UTF-16 surrogate pair, DecodeRune returns 59 // the Unicode replacement code point U+FFFD. 60 func DecodeRune(r1, r2 rune) rune { 61 if surr1 <= r1 && r1 < surr2 && surr2 <= r2 && r2 < surr3 { 62 return (r1-surr1)<<10 | (r2 - surr2) + surrSelf 63 } 64 return RuneError 65 } 66 67 // UTF8Decode transcodes UTF-16 encoding of unicode code points 68 // to UTF-8 bytes, returns updated dst and count of consumed 69 // uint16s from src. 70 func UTF8Decode(dst []byte, utf16 ...uint16) ([]byte, int) { 71 var ( 72 sz, n int 73 r rune 74 ) 75 76 for ; n < len(utf16); n++ { 77 switch r = rune(utf16[n]); { 78 case r < surr1, surr3 <= r: 79 // normal rune 80 dst, sz = utf8.EncodeRune(dst, r) 81 case surr1 <= r && r < surr2 && 82 n+1 < len(utf16) && 83 surr2 <= utf16[n+1] && utf16[n+1] < surr3: 84 // valid surrogate sequence 85 n++ 86 dst, sz = utf8.EncodeRune(dst, DecodeRune(r, rune(utf16[n]))) 87 default: 88 // invalid surrogate sequence 89 dst, sz = utf8.EncodeRune(dst, RuneError) 90 } 91 92 if sz == 0 { 93 break 94 } 95 } 96 97 return dst, n 98 } 99 100 func UTF8Append(dst []byte, src ...uint16) []byte { 101 for n := 0; len(src) != 0; src = src[n:] { 102 if dst, n = UTF8Decode(dst, src...); n == 0 { 103 assert.TODO("grow") 104 } 105 } 106 107 return dst 108 } 109 110 // RunesDecode decodes UTF-16 code points in src into the dst, returns 111 // updated dst and count of consumed uint16s from src. 112 func RunesDecode(d []rune, src ...uint16) (dst []rune, n int) { 113 var ( 114 r rune 115 off = len(d) 116 ) 117 dst = d[:cap(d)] 118 119 for n = 0; n < len(src) && off < len(dst); n++ { 120 switch r = rune(src[n]); { 121 case r < surr1, surr3 <= r: 122 // normal rune 123 case surr1 <= r && r < surr2 && 124 n+1 < len(src) && 125 surr2 <= src[n+1] && src[n+1] < surr3: 126 // valid surrogate sequence 127 n++ 128 r = DecodeRune(r, rune(src[n])) 129 default: 130 // invalid surrogate sequence 131 r = RuneError 132 } 133 134 dst[off] = r 135 off++ 136 } 137 138 return dst[:off], n 139 } 140 141 // RunesAppend decodes all UTF-16 code points in src into the dst, returns 142 // the decoded runes. 143 func RunesAppend(dst []rune, utf16 ...uint16) []rune { 144 for n := 0; len(utf16) != 0; utf16 = utf16[n:] { 145 if dst, n = RunesDecode(dst, utf16...); n == 0 { 146 assert.TODO("grow") 147 } 148 } 149 150 return dst 151 }