github.com/arnodel/golua@v0.0.0-20230215163904-e0b5347eaaa1/luastrings/utf8.go (about)

     1  package luastrings
     2  
     3  import "unicode/utf8"
     4  
     5  // Most of this code is copied from Go's own "unicode/utf8" package, with some
     6  // modifications to allow "utf-8" encoding of any positive int32 and the reverse
     7  // operation.
     8  
     9  const (
    10  	UTFMax = 6 // Originally 4 in unicode/utf8
    11  
    12  	t1 = 0b00000000
    13  	tx = 0b10000000
    14  	t2 = 0b11000000
    15  	t3 = 0b11100000
    16  	t4 = 0b11110000
    17  
    18  	// Added for Lua
    19  	t5 = 0b11111000
    20  	t6 = 0b11111100
    21  
    22  	maskx = 0b00111111
    23  	mask2 = 0b00011111
    24  	mask3 = 0b00001111
    25  	mask4 = 0b00000111
    26  
    27  	// Added for Lua
    28  	mask5 = 0b00000011
    29  	mask6 = 0b00000001
    30  
    31  	rune1Max = 1<<7 - 1
    32  	rune2Max = 1<<11 - 1
    33  	rune3Max = 1<<16 - 1
    34  	rune4Max = 1<<21 - 1
    35  	rune5Max = 1<<26 - 1
    36  	// Commented because unused as rune6Max is the biggest int32
    37  	// rune6Max = 1<<31 - 1
    38  
    39  	// The default lowest and highest continuation byte.
    40  	locb = 0b10000000
    41  	hicb = 0b10111111
    42  
    43  	// These names of these constants are chosen to give nice alignment in the
    44  	// table below. The first nibble is an index into acceptRanges or F for
    45  	// special one-byte cases. The second nibble is the Rune length or the
    46  	// Status for the special one-byte case.
    47  	//
    48  	// Note: the first nibble is not used because in this implementation
    49  	// acceptRanges is not needed as any byte sequence is valid.  It is however
    50  	// kept to keep the similarity with the code it's copied from as high as
    51  	// possible.
    52  	xx = 0xF1 // invalid: size 1
    53  	as = 0xF0 // ASCII: size 1
    54  	s1 = 0x02 // accept 0, size 2
    55  
    56  	s2 = 0x13 // accept 1, size 3
    57  	s3 = 0x03 // accept 0, size 3
    58  	s4 = 0x23 // accept 2, size 3
    59  
    60  	s5 = 0x34 // accept 3, size 4
    61  	s6 = 0x04 // accept 0, size 4
    62  	s7 = 0x44 // accept 4, size 4
    63  
    64  	// Added for Lua
    65  	s8 = 0x05 // accept 0, size 5
    66  	s9 = 0x06 // accept 0, size 6
    67  )
    68  
// first is information about the first byte in a UTF-8 sequence, indexed by
// the value of that byte: xx for a byte that cannot start a sequence, as for
// an ASCII byte, and otherwise a value whose low nibble is the length in bytes
// of the sequence.  The layout of this table is copied from the utf8 std
// library; here the lead bytes 0xF5-0xF7 are given size 4, 0xF8-0xFB size 5
// and 0xFC-0xFD size 6.
var first = [256]uint8{
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
	xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
	s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
	s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
	s5, s6, s6, s6, s7, s6, s6, s6, s8, s8, s8, s8, s9, s9, xx, xx, // 0xF0-0xFF
}
    91  
    92  // 1111 0xxx: 0xF0-0xF7
    93  // 1111 10xx: 0xF8-0xFB
    94  // 1111 110x: 0xFC-0xFD
    95  
// RuneError is the value returned by the decoding functions of this package
// when the input is empty or invalid.  It is the same constant as
// utf8.RuneError.
const RuneError = utf8.RuneError
    97  
    98  // Encode a unicode point with value i into a sequence of bytes, writing into p.
    99  // p must be big enough (length 6 accomodates all values).  Returns the number
   100  // of bytes written. A non-positive value means an error.
   101  //
   102  // Any non-negative int32 can be encoded, that is why the golang utf8 package
   103  // cannot be used.
   104  func UTF8EncodeInt32(p []byte, i int32) int {
   105  	switch {
   106  	case i < 0:
   107  		return 0
   108  	case i <= rune1Max:
   109  		p[0] = t1 | byte(i)
   110  		return 1
   111  	case i <= rune2Max:
   112  		_ = p[1]
   113  		p[0] = t2 | byte(i>>6)
   114  		p[1] = tx | byte(i)&maskx
   115  		return 2
   116  	case i <= rune3Max:
   117  		_ = p[2]
   118  		p[0] = t3 | byte(i>>12)
   119  		p[1] = tx | byte(i>>6)&maskx
   120  		p[2] = tx | byte(i)&maskx
   121  		return 3
   122  	case i <= rune4Max:
   123  		_ = p[3]
   124  		p[0] = t4 | byte(i>>18)
   125  		p[1] = tx | byte(i>>12)&maskx
   126  		p[2] = tx | byte(i>>6)&maskx
   127  		p[3] = tx | byte(i)&maskx
   128  		return 4
   129  	case i <= rune5Max:
   130  		_ = p[4]
   131  		p[0] = t5 | byte(i>>24)
   132  		p[1] = tx | byte(i>>18)&maskx
   133  		p[2] = tx | byte(i>>12)&maskx
   134  		p[3] = tx | byte(i>>6)&maskx
   135  		p[4] = tx | byte(i)&maskx
   136  		return 5
   137  	default: // i <= rune6Max:
   138  		_ = p[5]
   139  		p[0] = t6 | byte(i>>30)
   140  		p[1] = tx | byte(i>>24)&maskx
   141  		p[2] = tx | byte(i>>18)&maskx
   142  		p[3] = tx | byte(i>>12)&maskx
   143  		p[4] = tx | byte(i>>6)&maskx
   144  		p[5] = tx | byte(i)&maskx
   145  		return 6
   146  	}
   147  }
   148  
   149  // GetDecodeRuneInString return a decode function that is strict or lax about
   150  // the utf8 encoding depending on the value of lax.  For details see the UTF-8
   151  // support section in the Lua 5.4 manual.
   152  func GetDecodeRuneInString(lax bool) func(string) (rune, int) {
   153  	if lax {
   154  		return DecodeRuneInString
   155  	} else {
   156  		return utf8.DecodeRuneInString
   157  	}
   158  }
   159  
   160  // DecodeRuneInString is like DecodeRune but its input is a string. If s is
   161  // empty it returns (RuneError, 0). Otherwise, if the encoding is invalid, it
   162  // returns (RuneError, 1). Both are impossible results for correct, non-empty
   163  // UTF-8.
   164  //
   165  // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
   166  // out of range, or is not the shortest possible UTF-8 encoding for the
   167  // value. No other validation is performed.
   168  func DecodeRuneInString(s string) (r rune, size int) {
   169  	n := len(s)
   170  	if n < 1 {
   171  		return RuneError, 0
   172  	}
   173  	s0 := s[0]
   174  	x := first[s0]
   175  	if x >= as {
   176  		// The following code simulates an additional check for x == xx and
   177  		// handling the ASCII and invalid cases accordingly. This mask-and-or
   178  		// approach prevents an additional branch.
   179  		mask := rune(x) << 31 >> 31 // Create 0x0000 or 0xFFFF.
   180  		return rune(s[0])&^mask | RuneError&mask, 1
   181  	}
   182  	sz := int(x & 7) // Throw away the acceptRange nibble
   183  	if n < sz {
   184  		return RuneError, 1
   185  	}
   186  	s1 := s[1]
   187  	if s1 < locb || hicb < s1 {
   188  		return RuneError, 1
   189  	}
   190  	if sz <= 2 { // <= instead of == to help the compiler eliminate some bounds checks
   191  		return rune(s0&mask2)<<6 | rune(s1&maskx), 2
   192  	}
   193  	s2 := s[2]
   194  	if s2 < locb || hicb < s2 {
   195  		return RuneError, 1
   196  	}
   197  	if sz <= 3 {
   198  		return rune(s0&mask3)<<12 | rune(s1&maskx)<<6 | rune(s2&maskx), 3
   199  	}
   200  	s3 := s[3]
   201  	if s3 < locb || hicb < s3 {
   202  		return RuneError, 1
   203  	}
   204  	if sz <= 4 {
   205  		return rune(s0&mask4)<<18 | rune(s1&maskx)<<12 | rune(s2&maskx)<<6 | rune(s3&maskx), 4
   206  	}
   207  
   208  	// Non-standard encodings supported by Lua
   209  	s4 := s[4]
   210  	if s4 < locb || hicb < s4 {
   211  		return RuneError, 1
   212  	}
   213  	if sz <= 5 {
   214  		return rune(s0&mask5)<<24 | rune(s1&maskx)<<18 | rune(s2&maskx)<<12 | rune(s3&maskx)<<6 | rune(s4&maskx), 5
   215  	}
   216  	s5 := s[5]
   217  	if s5 < locb || hicb < s5 {
   218  		return RuneError, 1
   219  	}
   220  	return rune(s0&mask6)<<30 | rune(s1&maskx)<<24 | rune(s2&maskx)<<18 | rune(s3&maskx)<<12 | rune(s4&maskx)<<6 | rune(s5&maskx), 6
   221  }