// Most of this code is copied from Go's own "unicode/utf8" package, with some
// modifications to allow "utf-8" encoding of any non-negative int32 and the
// reverse operation.

// UTFMax is the maximum number of bytes of an encoded value.
const UTFMax = 6 // Originally 4 in unicode/utf8

// Tag bits stored in the high bits of a byte: tx tags a continuation byte and
// t1..t6 tag the first byte of a sequence of 1 to 6 bytes.
const (
	t1 = 0x00 // 0000 0000
	tx = 0x80 // 1000 0000
	t2 = 0xC0 // 1100 0000
	t3 = 0xE0 // 1110 0000
	t4 = 0xF0 // 1111 0000

	// Added for Lua
	t5 = 0xF8 // 1111 1000
	t6 = 0xFC // 1111 1100
)

// Masks selecting the payload bits of a byte: maskx for a continuation byte
// and mask2..mask6 for the first byte of a sequence of 2 to 6 bytes.
const (
	maskx = 0x3F // 0011 1111
	mask2 = 0x1F // 0001 1111
	mask3 = 0x0F // 0000 1111
	mask4 = 0x07 // 0000 0111

	// Added for Lua
	mask5 = 0x03 // 0000 0011
	mask6 = 0x01 // 0000 0001
)

// runeNMax is the largest value encoded with N bytes.
const (
	rune1Max = 0x7F      // 1<<7 - 1
	rune2Max = 0x7FF     // 1<<11 - 1
	rune3Max = 0xFFFF    // 1<<16 - 1
	rune4Max = 0x1FFFFF  // 1<<21 - 1
	rune5Max = 0x3FFFFFF // 1<<26 - 1
	// Commented because unused as rune6Max is the biggest int32
	// rune6Max = 1<<31 - 1
)

// The default lowest and highest continuation byte.
const (
	locb = 0x80 // 1000 0000
	hicb = 0xBF // 1011 1111
)

// The names of these constants are chosen to give nice alignment in the table
// below. The first nibble is an index into acceptRanges or F for special
// one-byte cases. The second nibble is the Rune length or the Status for the
// special one-byte case.
//
// Note: the first nibble is not used because in this implementation
// acceptRanges is not needed as any byte sequence is valid. It is however
// kept to keep the similarity with the code it's copied from as high as
// possible.
const (
	xx = 0xF1 // invalid: size 1
	as = 0xF0 // ASCII: size 1
	s1 = 0x02 // accept 0, size 2

	s2 = 0x13 // accept 1, size 3
	s3 = 0x03 // accept 0, size 3
	s4 = 0x23 // accept 2, size 3

	s5 = 0x34 // accept 3, size 4
	s6 = 0x04 // accept 0, size 4
	s7 = 0x44 // accept 4, size 4

	// Added for Lua
	s8 = 0x05 // accept 0, size 5
	s9 = 0x06 // accept 0, size 6
)

// first is information about the first byte in a UTF-8 sequence. This table
// is copied from the utf8 std library, with the entries for 0xF5-0xFD changed
// so that those bytes start sequences of 4, 5 or 6 bytes.
var first = [256]uint8{
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
	xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
	s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
	s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
	s5, s6, s6, s6, s7, s6, s6, s6, s8, s8, s8, s8, s9, s9, xx, xx, // 0xF0-0xFF
}

// Bit patterns of the first bytes in the last row of the table:
//
// 1111 0xxx: 0xF0-0xF7
// 1111 10xx: 0xF8-0xFB
// 1111 110x: 0xFC-0xFD

// RuneError is the value returned by the decoding functions when the input
// cannot be decoded. It is the same value as utf8.RuneError.
const RuneError = utf8.RuneError

// UTF8EncodeInt32 encodes the code point i into a sequence of bytes, writing
// into p, and returns the number of bytes written. p must be big enough
// (length 6 accommodates all values). If i is negative nothing is written and
// 0 is returned.
//
// Any non-negative int32 can be encoded, that is why the golang utf8 package
// cannot be used.
104 func UTF8EncodeInt32(p []byte, i int32) int { 105 switch { 106 case i < 0: 107 return 0 108 case i <= rune1Max: 109 p[0] = t1 | byte(i) 110 return 1 111 case i <= rune2Max: 112 _ = p[1] 113 p[0] = t2 | byte(i>>6) 114 p[1] = tx | byte(i)&maskx 115 return 2 116 case i <= rune3Max: 117 _ = p[2] 118 p[0] = t3 | byte(i>>12) 119 p[1] = tx | byte(i>>6)&maskx 120 p[2] = tx | byte(i)&maskx 121 return 3 122 case i <= rune4Max: 123 _ = p[3] 124 p[0] = t4 | byte(i>>18) 125 p[1] = tx | byte(i>>12)&maskx 126 p[2] = tx | byte(i>>6)&maskx 127 p[3] = tx | byte(i)&maskx 128 return 4 129 case i <= rune5Max: 130 _ = p[4] 131 p[0] = t5 | byte(i>>24) 132 p[1] = tx | byte(i>>18)&maskx 133 p[2] = tx | byte(i>>12)&maskx 134 p[3] = tx | byte(i>>6)&maskx 135 p[4] = tx | byte(i)&maskx 136 return 5 137 default: // i <= rune6Max: 138 _ = p[5] 139 p[0] = t6 | byte(i>>30) 140 p[1] = tx | byte(i>>24)&maskx 141 p[2] = tx | byte(i>>18)&maskx 142 p[3] = tx | byte(i>>12)&maskx 143 p[4] = tx | byte(i>>6)&maskx 144 p[5] = tx | byte(i)&maskx 145 return 6 146 } 147 } 148 149 // GetDecodeRuneInString return a decode function that is strict or lax about 150 // the utf8 encoding depending on the value of lax. For details see the UTF-8 151 // support section in the Lua 5.4 manual. 152 func GetDecodeRuneInString(lax bool) func(string) (rune, int) { 153 if lax { 154 return DecodeRuneInString 155 } else { 156 return utf8.DecodeRuneInString 157 } 158 } 159 160 // DecodeRuneInString is like DecodeRune but its input is a string. If s is 161 // empty it returns (RuneError, 0). Otherwise, if the encoding is invalid, it 162 // returns (RuneError, 1). Both are impossible results for correct, non-empty 163 // UTF-8. 164 // 165 // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is 166 // out of range, or is not the shortest possible UTF-8 encoding for the 167 // value. No other validation is performed. 
168 func DecodeRuneInString(s string) (r rune, size int) { 169 n := len(s) 170 if n < 1 { 171 return RuneError, 0 172 } 173 s0 := s[0] 174 x := first[s0] 175 if x >= as { 176 // The following code simulates an additional check for x == xx and 177 // handling the ASCII and invalid cases accordingly. This mask-and-or 178 // approach prevents an additional branch. 179 mask := rune(x) << 31 >> 31 // Create 0x0000 or 0xFFFF. 180 return rune(s[0])&^mask | RuneError&mask, 1 181 } 182 sz := int(x & 7) // Throw away the acceptRange nibble 183 if n < sz { 184 return RuneError, 1 185 } 186 s1 := s[1] 187 if s1 < locb || hicb < s1 { 188 return RuneError, 1 189 } 190 if sz <= 2 { // <= instead of == to help the compiler eliminate some bounds checks 191 return rune(s0&mask2)<<6 | rune(s1&maskx), 2 192 } 193 s2 := s[2] 194 if s2 < locb || hicb < s2 { 195 return RuneError, 1 196 } 197 if sz <= 3 { 198 return rune(s0&mask3)<<12 | rune(s1&maskx)<<6 | rune(s2&maskx), 3 199 } 200 s3 := s[3] 201 if s3 < locb || hicb < s3 { 202 return RuneError, 1 203 } 204 if sz <= 4 { 205 return rune(s0&mask4)<<18 | rune(s1&maskx)<<12 | rune(s2&maskx)<<6 | rune(s3&maskx), 4 206 } 207 208 // Non-standard encodings supported by Lua 209 s4 := s[4] 210 if s4 < locb || hicb < s4 { 211 return RuneError, 1 212 } 213 if sz <= 5 { 214 return rune(s0&mask5)<<24 | rune(s1&maskx)<<18 | rune(s2&maskx)<<12 | rune(s3&maskx)<<6 | rune(s4&maskx), 5 215 } 216 s5 := s[5] 217 if s5 < locb || hicb < s5 { 218 return RuneError, 1 219 } 220 return rune(s0&mask6)<<30 | rune(s1&maskx)<<24 | rune(s2&maskx)<<18 | rune(s3&maskx)<<12 | rune(s4&maskx)<<6 | rune(s5&maskx), 6 221 }