github.com/ezoic/ws@v1.0.4-0.20220713205711-5c1d69e074c5/wsutil/utf8.go (about)

     1  package wsutil
     2  
     3  import (
     4  	"fmt"
     5  	"io"
     6  )
     7  
// ErrInvalidUTF8 is returned by UTF8Reader.Read when the bytes it has read
// contain an invalid UTF-8 sequence.
var ErrInvalidUTF8 = fmt.Errorf("invalid utf8")
    10  
// UTF8Reader implements io.Reader that calculates utf8 validity state after
// every byte read from Source.
//
// Note that in some cases the client must call r.Valid() after all bytes are
// read to ensure that all of them are valid utf8 sequences. That is, some io
// helper functions such as io.ReadAtLeast or io.ReadFull could discard the
// error information returned by the reader when they receive all of the
// requested bytes. For example, the last read sequence is invalid and
// UTF8Reader returns the number of bytes read and an error, but the helper
// function decides to discard the received error because all requested bytes
// have been read from the source.
//
// Another possible case is when some valid sequence becomes split by the read
// bound. Then UTF8Reader cannot make a decision about the validity of the
// last sequence because it is not fully read yet. And if the read stops
// there, Valid() will return false, even if Read() by itself did not report
// an error.
type UTF8Reader struct {
	// Source is the underlying reader whose bytes are validated.
	Source io.Reader

	// accepted is the counter stored by Read and reported by Accepted().
	accepted int

	// state is the current state of the decoder automaton and codep is the
	// code point value accumulated so far. Read saves both so that the next
	// call continues where the previous one stopped.
	state uint32
	codep uint32
}
    34  
    35  // NewUTF8Reader creates utf8 reader that reads from r.
    36  func NewUTF8Reader(r io.Reader) *UTF8Reader {
    37  	return &UTF8Reader{
    38  		Source: r,
    39  	}
    40  }
    41  
    42  // Reset resets utf8 reader to read from r.
    43  func (u *UTF8Reader) Reset(r io.Reader) {
    44  	u.Source = r
    45  	u.state = 0
    46  	u.codep = 0
    47  }
    48  
    49  // Read implements io.Reader.
    50  func (u *UTF8Reader) Read(p []byte) (n int, err error) {
    51  	n, err = u.Source.Read(p)
    52  
    53  	accepted := 0
    54  	s, c := u.state, u.codep
    55  	for i := 0; i < n; i++ {
    56  		c, s = decode(s, c, p[i])
    57  		if s == utf8Reject {
    58  			u.state = s
    59  			return accepted, ErrInvalidUTF8
    60  		}
    61  		if s == utf8Accept {
    62  			accepted = i + 1
    63  		}
    64  	}
    65  	u.state, u.codep = s, c
    66  	u.accepted = accepted
    67  
    68  	return
    69  }
    70  
// Valid checks current reader state. It returns true when the decoder is in
// the accept state, that is, when all bytes read so far form complete valid
// UTF-8 sequences. It returns false after an invalid byte or while the last
// sequence is still incomplete.
func (u *UTF8Reader) Valid() bool {
	return u.state == utf8Accept
}
    76  
// Accepted returns the counter recorded by Read: the number of bytes, counted
// from the start of the read buffer, up to the last point at which the
// decoder was at a sequence boundary.
func (u *UTF8Reader) Accepted() int {
	return u.accepted
}
    81  
    82  // Below is port of UTF-8 decoder from http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
    83  //
    84  // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
    85  //
    86  // Permission is hereby granted, free of charge, to any person obtaining a copy
    87  // of this software and associated documentation files (the "Software"), to
    88  // deal in the Software without restriction, including without limitation the
    89  // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    90  // sell copies of the Software, and to permit persons to whom the Software is
    91  // furnished to do so, subject to the following conditions:
    92  //
    93  // The above copyright notice and this permission notice shall be included in
    94  // all copies or substantial portions of the Software.
    95  //
    96  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    97  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    98  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    99  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
   100  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
   101  // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
   102  // IN THE SOFTWARE.
   103  
// States of the decoder automaton that the reader checks for explicitly.
const (
	// utf8Accept is the state in which no sequence is pending. It is also the
	// state a zero-valued reader starts in.
	utf8Accept = 0
	// utf8Reject is the state entered once the input is invalid.
	utf8Reject = 12
)
   108  
// utf8d holds both lookup tables of the decoder automaton: 256 entries that
// classify input bytes, followed by the state transition table. decode
// indexes the first part by byte value and the second by 256+state+class.
var utf8d = [...]byte{
	// The first part of the table maps bytes to character classes, which
	// reduces the size of the transition table and is used to create
	// bitmasks.
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
	8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,

	// The second part is a transition table that maps a combination
	// of a state of the automaton and a character class to a state.
	0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
	12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12,
	12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12,
	12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, 12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,
	12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
}
   129  
   130  func decode(state, codep uint32, b byte) (uint32, uint32) {
   131  	t := uint32(utf8d[b])
   132  
   133  	if state != utf8Accept {
   134  		codep = (uint32(b) & 0x3f) | (codep << 6)
   135  	} else {
   136  		codep = (0xff >> t) & uint32(b)
   137  	}
   138  
   139  	return codep, uint32(utf8d[256+state+t])
   140  }