github.com/btwiuse/jiri@v0.0.0-20191125065820-53353bcfef54/textutil/utf8.go (about)

     1  // Copyright 2015 The Vanadium Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package textutil
     6  
     7  import (
     8  	"bytes"
     9  	"fmt"
    10  	"unicode/utf8"
    11  )
    12  
    13  // UTF8Encoder implements RuneEncoder for the UTF-8 encoding.
    14  type UTF8Encoder struct{}
    15  
    16  var _ RuneEncoder = UTF8Encoder{}
    17  
    18  // Encode encodes r into buf in the UTF-8 encoding.
    19  func (UTF8Encoder) Encode(r rune, buf *bytes.Buffer) { buf.WriteRune(r) }
    20  
// UTF8ChunkDecoder implements RuneChunkDecoder for a stream of UTF-8 data that
// is arbitrarily chunked.
//
// UTF-8 is a byte-wise encoding that may use multiple bytes to encode a single
// rune.  This decoder buffers partial runes that have been split across chunks,
// so that a full rune is returned when the subsequent data chunk is provided.
//
// This is commonly used to implement an io.Writer wrapper over UTF-8 text.  It
// is useful since the data provided to Write calls may be arbitrarily chunked.
//
// The zero UTF8ChunkDecoder is a decoder with an empty buffer.
type UTF8ChunkDecoder struct {
	// The only state we keep is the last partial rune we've encountered.
	//
	// partial holds the buffered bytes; it is sized to utf8.UTFMax, the maximum
	// number of bytes in a single UTF-8 encoded rune.  partialLen is the number
	// of leading bytes of partial that are currently in use; zero means nothing
	// is buffered.
	partial    [utf8.UTFMax]byte
	partialLen int
}

// Compile-time check that *UTF8ChunkDecoder satisfies RuneChunkDecoder.
var _ RuneChunkDecoder = (*UTF8ChunkDecoder)(nil)
    39  
    40  // DecodeRune implements the RuneChunkDecoder interface method.
    41  //
    42  // Invalid encodings are transformed into U+FFFD, one byte at a time.  See
    43  // unicode/utf8.DecodeRune for details.
    44  func (d *UTF8ChunkDecoder) DecodeRune(chunk []byte) (rune, int) {
    45  	if d.partialLen > 0 {
    46  		return d.decodeRunePartial(chunk)
    47  	}
    48  	r, size := utf8.DecodeRune(chunk)
    49  	if r == utf8.RuneError && !utf8.FullRune(chunk) {
    50  		// Initialize the partial rune buffer with chunk.
    51  		d.partialLen = copy(d.partial[:], chunk)
    52  		return d.verifyPartial(d.partialLen, chunk)
    53  	}
    54  	return r, size
    55  }
    56  
    57  // decodeRunePartial implements decodeRune when there is a previously buffered
    58  // partial rune.
    59  func (d *UTF8ChunkDecoder) decodeRunePartial(chunk []byte) (rune, int) {
    60  	// Append as much as we can to the partial rune, and see if it's full.
    61  	oldLen := d.partialLen
    62  	d.partialLen += copy(d.partial[oldLen:], chunk)
    63  	if !utf8.FullRune(d.partial[:d.partialLen]) {
    64  		// We still don't have a full rune - keep waiting.
    65  		return d.verifyPartial(d.partialLen-oldLen, chunk)
    66  	}
    67  	// We finally have a full rune.
    68  	r, size := utf8.DecodeRune(d.partial[:d.partialLen])
    69  	if size < oldLen {
    70  		// This occurs when we have a multi-byte rune that has the right number of
    71  		// bytes, but is an invalid code point.
    72  		//
    73  		// Say oldLen=2, and we just received the third byte of a 3-byte rune which
    74  		// isn't a UTF-8 trailing byte.  In this case utf8.DecodeRune returns U+FFFD
    75  		// and size=1, to indicate we should skip the first byte.
    76  		//
    77  		// We shift the unread portion of the old partial buffer forward, and update
    78  		// the partial len so that it's strictly decreasing.  The strictly
    79  		// decreasing property isn't necessary for correctness, but helps avoid
    80  		// repeatedly copying into the partial buffer unecessarily.
    81  		copy(d.partial[:], d.partial[size:oldLen])
    82  		d.partialLen = oldLen - size
    83  		return r, 0
    84  	}
    85  	// We've used all of the partial buffer.
    86  	d.partialLen = 0
    87  	return r, size - oldLen
    88  }
    89  
    90  // verifyPartial is called when we don't have a full rune, and ncopy bytes have
    91  // been copied from data into the decoder partial rune buffer.  We expect that
    92  // all data has been buffered and we return EOF and the total size of the data.
    93  func (d *UTF8ChunkDecoder) verifyPartial(ncopy int, data []byte) (rune, int) {
    94  	if ncopy < len(data) {
    95  		// Something's very wrong if we managed to fill d.partial without copying
    96  		// all the data; any sequence of utf8.UTFMax bytes must be a full rune.
    97  		panic(fmt.Errorf("UTF8ChunkDecoder: partial rune %v with leftover data %v", d.partial[:d.partialLen], data[ncopy:]))
    98  	}
    99  	return EOF, len(data)
   100  }
   101  
   102  // FlushRune implements the RuneChunkDecoder interface method.
   103  //
   104  // Since the only data that is buffered is the final partial rune, the return
   105  // value will only ever be U+FFFD or EOF.  No valid runes are ever returned by
   106  // this method, but multiple U+FFFD may be returned before EOF.
   107  func (d *UTF8ChunkDecoder) FlushRune() rune {
   108  	if d.partialLen == 0 {
   109  		return EOF
   110  	}
   111  	r, size := utf8.DecodeRune(d.partial[:d.partialLen])
   112  	copy(d.partial[:], d.partial[size:])
   113  	d.partialLen -= size
   114  	return r
   115  }