github.com/primecitizens/pcz/std@v0.2.1/text/unicode/utf16/decode.go (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright 2023 The Prime Citizens
     3  //
     4  // Copyright 2010 The Go Authors. All rights reserved.
     5  // Use of this source code is governed by a BSD-style
     6  // license that can be found in the LICENSE file.
     7  
     8  package utf16
     9  
    10  import (
    11  	"github.com/primecitizens/pcz/std/core/assert"
    12  	. "github.com/primecitizens/pcz/std/text/unicode/common"
    13  	"github.com/primecitizens/pcz/std/text/unicode/utf8"
    14  )
    15  
    16  // UTF8DecodedSize returns UTF-8 bytes required to store the decoded
    17  // UTF-16 encoding of unicode code points.
    18  func UTF8DecodedSize(utf16 ...uint16) (n int, canDecodeInPlace bool) {
    19  	var (
    20  		i  int
    21  		ar rune
    22  	)
    23  
    24  	// when canDecodeInPlace is true: i*2 >= n
    25  	for canDecodeInPlace = true; i < len(utf16); {
    26  		switch ar = rune(utf16[i]); {
    27  		case ar < surr1, surr3 <= ar:
    28  			// normal rune
    29  			i++
    30  			n += utf8.RuneLen(ar)
    31  			if canDecodeInPlace && i*2 /* space avail */ < n /* space required */ {
    32  				canDecodeInPlace = false
    33  			}
    34  		case surr1 <= ar && ar < surr2 &&
    35  			i+1 < len(utf16) &&
    36  			surr2 <= utf16[i+1] && utf16[i+1] < surr3:
    37  			// valid surrogate sequence
    38  			// in this case, we have 4-bytes available for utf-8, so it
    39  			// can always decode to utf8 in-place, no need to update
    40  			// canDecodeInPlace
    41  			i++
    42  			n += utf8.RuneLen(DecodeRune(ar, rune(utf16[i])))
    43  			i++
    44  		default:
    45  			// invalid surrogate sequence
    46  			i++
    47  			n += utf8.RuneErrorLen
    48  			if canDecodeInPlace && i*2 /* space avail */ < n /* space required */ {
    49  				canDecodeInPlace = false
    50  			}
    51  		}
    52  	}
    53  
    54  	return
    55  }
    56  
    57  // DecodeRune returns the UTF-16 decoding of a surrogate pair.
    58  // If the pair is not a valid UTF-16 surrogate pair, DecodeRune returns
    59  // the Unicode replacement code point U+FFFD.
    60  func DecodeRune(r1, r2 rune) rune {
    61  	if surr1 <= r1 && r1 < surr2 && surr2 <= r2 && r2 < surr3 {
    62  		return (r1-surr1)<<10 | (r2 - surr2) + surrSelf
    63  	}
    64  	return RuneError
    65  }
    66  
    67  // UTF8Decode transcodes UTF-16 encoding of unicode code points
    68  // to UTF-8 bytes, returns updated dst and count of consumed
    69  // uint16s from src.
    70  func UTF8Decode(dst []byte, utf16 ...uint16) ([]byte, int) {
    71  	var (
    72  		sz, n int
    73  		r     rune
    74  	)
    75  
    76  	for ; n < len(utf16); n++ {
    77  		switch r = rune(utf16[n]); {
    78  		case r < surr1, surr3 <= r:
    79  			// normal rune
    80  			dst, sz = utf8.EncodeRune(dst, r)
    81  		case surr1 <= r && r < surr2 &&
    82  			n+1 < len(utf16) &&
    83  			surr2 <= utf16[n+1] && utf16[n+1] < surr3:
    84  			// valid surrogate sequence
    85  			n++
    86  			dst, sz = utf8.EncodeRune(dst, DecodeRune(r, rune(utf16[n])))
    87  		default:
    88  			// invalid surrogate sequence
    89  			dst, sz = utf8.EncodeRune(dst, RuneError)
    90  		}
    91  
    92  		if sz == 0 {
    93  			break
    94  		}
    95  	}
    96  
    97  	return dst, n
    98  }
    99  
   100  func UTF8Append(dst []byte, src ...uint16) []byte {
   101  	for n := 0; len(src) != 0; src = src[n:] {
   102  		if dst, n = UTF8Decode(dst, src...); n == 0 {
   103  			assert.TODO("grow")
   104  		}
   105  	}
   106  
   107  	return dst
   108  }
   109  
   110  // RunesDecode decodes UTF-16 code points in src into the dst, returns
   111  // updated dst and count of consumed uint16s from src.
   112  func RunesDecode(d []rune, src ...uint16) (dst []rune, n int) {
   113  	var (
   114  		r   rune
   115  		off = len(d)
   116  	)
   117  	dst = d[:cap(d)]
   118  
   119  	for n = 0; n < len(src) && off < len(dst); n++ {
   120  		switch r = rune(src[n]); {
   121  		case r < surr1, surr3 <= r:
   122  			// normal rune
   123  		case surr1 <= r && r < surr2 &&
   124  			n+1 < len(src) &&
   125  			surr2 <= src[n+1] && src[n+1] < surr3:
   126  			// valid surrogate sequence
   127  			n++
   128  			r = DecodeRune(r, rune(src[n]))
   129  		default:
   130  			// invalid surrogate sequence
   131  			r = RuneError
   132  		}
   133  
   134  		dst[off] = r
   135  		off++
   136  	}
   137  
   138  	return dst[:off], n
   139  }
   140  
   141  // RunesAppend decodes all UTF-16 code points in src into the dst, returns
   142  // the decoded runes.
   143  func RunesAppend(dst []rune, utf16 ...uint16) []rune {
   144  	for n := 0; len(utf16) != 0; utf16 = utf16[n:] {
   145  		if dst, n = RunesDecode(dst, utf16...); n == 0 {
   146  			assert.TODO("grow")
   147  		}
   148  	}
   149  
   150  	return dst
   151  }