github.com/vanus-labs/vanus/lib@v0.0.0-20231221070800-1334a7b9605e/bytes/utf8.go (about)

     1  // Copyright 2023 Linkall Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package bytes
    16  
    17  import (
    18  	// standard libraries.
    19  	"io"
    20  	"unicode/utf8"
    21  )
    22  
    23  const (
    24  	maskx = 0b00111111
    25  	mask2 = 0b00011111
    26  	mask3 = 0b00001111
    27  	mask4 = 0b00000111
    28  
    29  	// The default lowest and highest continuation byte.
    30  	locb = 0b10000000
    31  	hicb = 0b10111111
    32  
    33  	// These names of these constants are chosen to give nice alignment in the
    34  	// table below. The first nibble is an index into acceptRanges or F for
    35  	// special one-byte cases. The second nibble is the Rune length or the
    36  	// Status for the special one-byte case.
    37  	xx = 0xF1 // invalid: size 1
    38  	as = 0xF0 // ASCII: size 1
    39  	s1 = 0x02 // accept 0, size 2
    40  	s2 = 0x13 // accept 1, size 3
    41  	s3 = 0x03 // accept 0, size 3
    42  	s4 = 0x23 // accept 2, size 3
    43  	s5 = 0x34 // accept 3, size 4
    44  	s6 = 0x04 // accept 0, size 4
    45  	s7 = 0x44 // accept 4, size 4
    46  )
    47  
// first classifies every possible first byte of a UTF-8 sequence. Each entry
// is one of the xx/as/s1..s7 codes: the high nibble selects an entry of
// acceptRanges (or is F for the one-byte ASCII and invalid cases) and the low
// bits give the length of the sequence in bytes.
var first = [256]uint8{
	// Columns are the low nibble of the byte (0-F); the trailing comment on
	// each row gives the byte range it covers.
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
	// 0x80-0xC1 and 0xF5-0xFF can never start a sequence and are marked xx.
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
	xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
	s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
	s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
	s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
}
    69  
    70  // acceptRange gives the range of valid values for the second byte in a UTF-8
    71  // sequence.
    72  type acceptRange struct {
    73  	lo uint8 // lowest value for second byte.
    74  	hi uint8 // highest value for second byte.
    75  }
    76  
    77  // acceptRanges has size 16 to avoid bounds checks in the code that uses it.
    78  var acceptRanges = [16]acceptRange{
    79  	0: {locb, hicb},
    80  	1: {0xA0, hicb},
    81  	2: {locb, 0x9F},
    82  	3: {0x90, hicb},
    83  	4: {locb, 0x8F},
    84  }
    85  
    86  func ReadRune(r io.ByteReader) (rune, int) {
    87  	b0, err := r.ReadByte()
    88  	if err != nil {
    89  		return utf8.RuneError, 0
    90  	}
    91  	return ReadRuneExt(b0, r)
    92  }
    93  
    94  func ReadRuneExt(b0 byte, r io.ByteReader) (rune, int) {
    95  	x := first[b0]
    96  	if x >= as {
    97  		// The following code simulates an additional check for x == xx and
    98  		// handling the ASCII and invalid cases accordingly. This mask-and-or
    99  		// approach prevents an additional branch.
   100  		mask := rune(x) << 31 >> 31 //nolint:gomnd // Create 0x0000 or 0xFFFF.
   101  		return rune(b0)&^mask | utf8.RuneError&mask, 1
   102  	}
   103  
   104  	sz := int(x & 0b111) //nolint:gomnd // magic is ok
   105  	accept := acceptRanges[x>>4]
   106  
   107  	b1, err := r.ReadByte()
   108  	if err != nil || b1 < accept.lo || accept.hi < b1 {
   109  		return utf8.RuneError, 1
   110  	}
   111  	if sz == 2 { //nolint:gomnd // magic is ok
   112  		return rune(b0&mask2)<<6 | rune(b1&maskx), 2 //nolint:gomnd // magic is ok
   113  	}
   114  
   115  	b2, err := r.ReadByte()
   116  	if err != nil || b2 < locb || hicb < b2 {
   117  		return utf8.RuneError, 1
   118  	}
   119  	if sz == 3 { //nolint:gomnd // magic is ok
   120  		return rune(b0&mask3)<<12 | rune(b1&maskx)<<6 | rune(b2&maskx), 3 //nolint:gomnd // magic is ok
   121  	}
   122  
   123  	b3, err := r.ReadByte()
   124  	if err != nil || b3 < locb || hicb < b3 {
   125  		return utf8.RuneError, 1
   126  	}
   127  	return rune(b0&mask4)<<18 | rune(b1&maskx)<<12 | rune(b2&maskx)<<6 | rune(b3&maskx), 4 //nolint:gomnd // magic is ok
   128  }
   129  
   130  func WriteRune(w io.ByteWriter, ru rune) error {
   131  	var buf [utf8.UTFMax]byte
   132  	n := utf8.EncodeRune(buf[:], ru)
   133  	for i := 0; i < n; i++ {
   134  		if err := w.WriteByte(buf[i]); err != nil {
   135  			return err
   136  		}
   137  	}
   138  	return nil
   139  }