// Extracted from github.com/ezoic/ws@v1.0.4-0.20220713205711-5c1d69e074c5/wsutil/utf8.go
// (package wsutil; imports "fmt" and "io").

// ErrInvalidUTF8 is returned by UTF8Reader on an invalid UTF-8 sequence.
var ErrInvalidUTF8 = fmt.Errorf("invalid utf8")

// UTF8Reader implements io.Reader that calculates utf8 validity state after
// every read byte from Source.
//
// Note that in some cases the client must call r.Valid() after all bytes are
// read to ensure that all of them are valid utf8 sequences. That is, some io
// helper functions such as io.ReadAtLeast or io.ReadFull could discard the
// error information returned by the reader when they receive all of the
// requested bytes. For example, the last read sequence is invalid and
// UTF8Reader returns the number of bytes read and an error, but the helper
// function decides to discard the received error because all requested bytes
// were completely read from the source.
//
// Another possible case is when some valid sequence becomes split by the read
// bound. Then UTF8Reader can not make a decision about the validity of the
// last sequence because it is not fully read yet. And if the read stops,
// Valid() will return false, even if Read() by itself did not.
type UTF8Reader struct {
	// Source is the underlying reader whose bytes are validated.
	Source io.Reader

	// accepted is the byte count reported by Accepted().
	accepted int

	// state and codep hold the decoder automaton state between Read calls,
	// which lets a sequence split across two reads be validated.
	state uint32
	codep uint32
}

// NewUTF8Reader creates utf8 reader that reads from r.
func NewUTF8Reader(r io.Reader) *UTF8Reader {
	return &UTF8Reader{
		Source: r,
	}
}

// Reset resets utf8 reader to read from r. All validation state, including
// the count reported by Accepted(), is cleared so that nothing from the
// previous source leaks into the new one.
func (u *UTF8Reader) Reset(r io.Reader) {
	u.Source = r
	u.state = 0
	u.codep = 0
	u.accepted = 0
}

// Read implements io.Reader.
50 func (u *UTF8Reader) Read(p []byte) (n int, err error) { 51 n, err = u.Source.Read(p) 52 53 accepted := 0 54 s, c := u.state, u.codep 55 for i := 0; i < n; i++ { 56 c, s = decode(s, c, p[i]) 57 if s == utf8Reject { 58 u.state = s 59 return accepted, ErrInvalidUTF8 60 } 61 if s == utf8Accept { 62 accepted = i + 1 63 } 64 } 65 u.state, u.codep = s, c 66 u.accepted = accepted 67 68 return 69 } 70 71 // Valid checks current reader state. It returns true if all read bytes are 72 // valid UTF-8 sequences, and false if not. 73 func (u *UTF8Reader) Valid() bool { 74 return u.state == utf8Accept 75 } 76 77 // Accepted returns number of valid bytes in last Read(). 78 func (u *UTF8Reader) Accepted() int { 79 return u.accepted 80 } 81 82 // Below is port of UTF-8 decoder from http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ 83 // 84 // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> 85 // 86 // Permission is hereby granted, free of charge, to any person obtaining a copy 87 // of this software and associated documentation files (the "Software"), to 88 // deal in the Software without restriction, including without limitation the 89 // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 90 // sell copies of the Software, and to permit persons to whom the Software is 91 // furnished to do so, subject to the following conditions: 92 // 93 // The above copyright notice and this permission notice shall be included in 94 // all copies or substantial portions of the Software. 95 // 96 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 97 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 98 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
// IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.

// Decoder states that callers test for: utf8Accept means the bytes seen so far
// end on a complete sequence, utf8Reject means an invalid byte was seen.
const (
	utf8Accept = 0
	utf8Reject = 12
)

// utf8d is the lookup table that drives decode. It consists of two parts laid
// out back to back.
var utf8d = [...]byte{
	// Part one, entries 0..255: the character class of every byte value.
	// Classes keep the transition table small and also serve as the shift
	// that masks the payload bits out of a leading byte.
	//
	// Bytes 0x00..0x7f.
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	// Bytes 0x80..0x9f.
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
	// Bytes 0xa0..0xbf.
	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
	// Bytes 0xc0..0xdf.
	8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	// Bytes 0xe0..0xff.
	10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,

	// Part two, entries 256 and up: the transition table. The next state is
	// found at 256+state+class; states are multiples of 12, so every row
	// below belongs to one state and has one column per class.
	//
	// State 0 (accept).
	0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72,
	// State 12 (reject).
	12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
	// State 24.
	12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12,
	// State 36.
	12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12,
	// State 48.
	12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12,
	// State 60.
	12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12,
	// State 72.
	12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,
	// State 84.
	12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,
	// State 96.
	12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
}

// decode advances the automaton by one byte. Given the current state, the
// code point accumulated so far and the next input byte b, it returns the
// updated code point followed by the new state.
func decode(state, codep uint32, b byte) (uint32, uint32) {
	class := uint32(utf8d[b])
	next := uint32(utf8d[256+state+class])

	if state == utf8Accept {
		// First byte of a sequence: the class doubles as the shift that
		// strips the length-marker bits from the byte.
		return (0xff >> class) & uint32(b), next
	}
	// Inside a sequence: append the low six bits of b.
	return (codep << 6) | (uint32(b) & 0x3f), next
}