github.com/btwiuse/jiri@v0.0.0-20191125065820-53353bcfef54/textutil/utf8.go (about) 1 // Copyright 2015 The Vanadium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package textutil 6 7 import ( 8 "bytes" 9 "fmt" 10 "unicode/utf8" 11 ) 12 13 // UTF8Encoder implements RuneEncoder for the UTF-8 encoding. 14 type UTF8Encoder struct{} 15 16 var _ RuneEncoder = UTF8Encoder{} 17 18 // Encode encodes r into buf in the UTF-8 encoding. 19 func (UTF8Encoder) Encode(r rune, buf *bytes.Buffer) { buf.WriteRune(r) } 20 21 // UTF8ChunkDecoder implements RuneChunkDecoder for a stream of UTF-8 data that 22 // is arbitrarily chunked. 23 // 24 // UTF-8 is a byte-wise encoding that may use multiple bytes to encode a single 25 // rune. This decoder buffers partial runes that have been split across chunks, 26 // so that a full rune is returned when the subsequent data chunk is provided. 27 // 28 // This is commonly used to implement an io.Writer wrapper over UTF-8 text. It 29 // is useful since the data provided to Write calls may be arbitrarily chunked. 30 // 31 // The zero UTF8ChunkDecoder is a decoder with an empty buffer. 32 type UTF8ChunkDecoder struct { 33 // The only state we keep is the last partial rune we've encountered. 34 partial [utf8.UTFMax]byte 35 partialLen int 36 } 37 38 var _ RuneChunkDecoder = (*UTF8ChunkDecoder)(nil) 39 40 // DecodeRune implements the RuneChunkDecoder interface method. 41 // 42 // Invalid encodings are transformed into U+FFFD, one byte at a time. See 43 // unicode/utf8.DecodeRune for details. 44 func (d *UTF8ChunkDecoder) DecodeRune(chunk []byte) (rune, int) { 45 if d.partialLen > 0 { 46 return d.decodeRunePartial(chunk) 47 } 48 r, size := utf8.DecodeRune(chunk) 49 if r == utf8.RuneError && !utf8.FullRune(chunk) { 50 // Initialize the partial rune buffer with chunk. 51 d.partialLen = copy(d.partial[:], chunk) 52 return d.verifyPartial(d.partialLen, chunk) 53 } 54 return r, size 55 } 56 57 // decodeRunePartial implements decodeRune when there is a previously buffered 58 // partial rune. 59 func (d *UTF8ChunkDecoder) decodeRunePartial(chunk []byte) (rune, int) { 60 // Append as much as we can to the partial rune, and see if it's full. 61 oldLen := d.partialLen 62 d.partialLen += copy(d.partial[oldLen:], chunk) 63 if !utf8.FullRune(d.partial[:d.partialLen]) { 64 // We still don't have a full rune - keep waiting. 65 return d.verifyPartial(d.partialLen-oldLen, chunk) 66 } 67 // We finally have a full rune. 68 r, size := utf8.DecodeRune(d.partial[:d.partialLen]) 69 if size < oldLen { 70 // This occurs when we have a multi-byte rune that has the right number of 71 // bytes, but is an invalid code point. 72 // 73 // Say oldLen=2, and we just received the third byte of a 3-byte rune which 74 // isn't a UTF-8 trailing byte. In this case utf8.DecodeRune returns U+FFFD 75 // and size=1, to indicate we should skip the first byte. 76 // 77 // We shift the unread portion of the old partial buffer forward, and update 78 // the partial len so that it's strictly decreasing. The strictly 79 // decreasing property isn't necessary for correctness, but helps avoid 80 // repeatedly copying into the partial buffer unecessarily. 81 copy(d.partial[:], d.partial[size:oldLen]) 82 d.partialLen = oldLen - size 83 return r, 0 84 } 85 // We've used all of the partial buffer. 86 d.partialLen = 0 87 return r, size - oldLen 88 } 89 90 // verifyPartial is called when we don't have a full rune, and ncopy bytes have 91 // been copied from data into the decoder partial rune buffer. We expect that 92 // all data has been buffered and we return EOF and the total size of the data. 93 func (d *UTF8ChunkDecoder) verifyPartial(ncopy int, data []byte) (rune, int) { 94 if ncopy < len(data) { 95 // Something's very wrong if we managed to fill d.partial without copying 96 // all the data; any sequence of utf8.UTFMax bytes must be a full rune. 97 panic(fmt.Errorf("UTF8ChunkDecoder: partial rune %v with leftover data %v", d.partial[:d.partialLen], data[ncopy:])) 98 } 99 return EOF, len(data) 100 } 101 102 // FlushRune implements the RuneChunkDecoder interface method. 103 // 104 // Since the only data that is buffered is the final partial rune, the return 105 // value will only ever be U+FFFD or EOF. No valid runes are ever returned by 106 // this method, but multiple U+FFFD may be returned before EOF. 107 func (d *UTF8ChunkDecoder) FlushRune() rune { 108 if d.partialLen == 0 { 109 return EOF 110 } 111 r, size := utf8.DecodeRune(d.partial[:d.partialLen]) 112 copy(d.partial[:], d.partial[size:]) 113 d.partialLen -= size 114 return r 115 }