// stringToRunes splits str into runes.
//
// Well-formed UTF-8 sequences become their code point. Whenever decoding
// yields utf8.RuneError, every byte consumed by that decoding step is emitted
// as its own rune in the range 0x00-0xFF instead, so malformed input is kept
// byte-for-byte rather than collapsed into U+FFFD.
//
// NOTE(review): the check does not look at the decoded size, so a correctly
// encoded U+FFFD in str is also expanded into its three bytes (0xEF, 0xBF,
// 0xBD). Confirm whether callers rely on this before tightening it.
func stringToRunes(str string) []rune {
	var runes []rune
	for i := 0; i < len(str); {
		r, size := utf8.DecodeRuneInString(str[i:])
		if r == utf8.RuneError {
			for j := i; j < i+size; j++ {
				runes = append(runes, rune(str[j]))
			}
		} else {
			runes = append(runes, r)
		}
		i += size
	}
	return runes
}

// ucs2decode converts str into a slice of code points, merging any UTF-16
// surrogate pair found among the runes into the supplementary code point it
// represents. An unmatched high surrogate is passed through unchanged.
//
// Taken from https://mths.be/punycode
func ucs2decode(str string) []rune {
	runes := stringToRunes(str)
	output := make([]rune, 0, len(runes))
	for i := 0; i < len(runes); i++ {
		value := runes[i]
		// High surrogate with a following rune: try to pair it up.
		if value >= 0xD800 && value <= 0xDBFF && i+1 < len(runes) {
			if extra := runes[i+1]; extra&0xFC00 == 0xDC00 { // low surrogate
				output = append(output, ((value&0x3FF)<<10)+(extra&0x3FF)+0x10000)
				i++ // the low surrogate has been consumed as well
				continue
			}
			// Unmatched surrogate: only this code unit is appended; the next
			// one is re-examined because it may start a pair of its own.
		}
		output = append(output, value)
	}
	return output
}

// ucs2encode returns the UTF-8 encoding of the given code points. It returns
// nil when array is empty.
//
// The JavaScript original (https://mths.be/punycode) splits code points above
// 0xFFFF into a UTF-16 surrogate pair because JavaScript strings are UTF-16.
// Go strings are UTF-8, and utf8.EncodeRune writes U+FFFD for surrogate
// halves, so supplementary code points are encoded directly instead.
func ucs2encode(array []rune) []byte {
	var (
		output []byte
		buf    [utf8.UTFMax]byte
	)
	for _, value := range array {
		n := utf8.EncodeRune(buf[:], value)
		output = append(output, buf[:n]...)
	}
	return output
}

// appendBytes appends all of bytes to dest and returns the extended slice.
func appendBytes(dest []byte, bytes []byte) []byte {
	return append(dest, bytes...)
}

// checkScalarValue returns an error when codePoint lies in the surrogate
// range 0xD800-0xDFFF, and nil otherwise. The code point is rendered in the
// message as 0x-prefixed lowercase hexadecimal.
func checkScalarValue(codePoint rune) error {
	if codePoint >= 0xD800 && codePoint <= 0xDFFF {
		return fmt.Errorf("lone surrogate U+%#x is not a scalar value", codePoint)
	}
	return nil
}

// stringFromCharCode returns the UTF-8 encoding of codePoint in a freshly
// allocated slice.
func stringFromCharCode(codePoint rune) []byte {
	buf := make([]byte, utf8.UTFMax)
	n := utf8.EncodeRune(buf, codePoint)
	return buf[:n]
}

// createByte builds a UTF-8 continuation byte value (0x80-0xBF) from the six
// bits of codePoint that start at bit position shift, and returns that value
// encoded as a rune.
func createByte(codePoint rune, shift uint32) []byte {
	return stringFromCharCode(((codePoint >> shift) & 0x3F) | 0x80)
}

// encodeCodePoint computes the UTF-8 byte values for codePoint and returns
// them with every byte value written out as a rune in the range
// U+0000-U+00FF. It returns an error when codePoint is a surrogate.
func encodeCodePoint(codePoint rune) ([]byte, error) {
	cp := uint32(codePoint)
	if cp&0xFFFFFF80 == 0 { // 1-byte sequence
		return stringFromCharCode(codePoint), nil
	}

	// Lead byte plus all continuation bytes except the last one.
	var symbol []byte
	switch {
	case cp&0xFFFFF800 == 0: // 2-byte sequence
		symbol = stringFromCharCode(((codePoint >> 6) & 0x1F) | 0xC0)
	case cp&0xFFFF0000 == 0: // 3-byte sequence
		if err := checkScalarValue(codePoint); err != nil {
			return nil, err
		}
		symbol = stringFromCharCode(((codePoint >> 12) & 0x0F) | 0xE0)
		symbol = appendBytes(symbol, createByte(codePoint, 6))
	case cp&0xFFE00000 == 0: // 4-byte sequence
		symbol = stringFromCharCode(((codePoint >> 18) & 0x07) | 0xF0)
		symbol = appendBytes(symbol, createByte(codePoint, 12))
		symbol = appendBytes(symbol, createByte(codePoint, 6))
	}

	// Final continuation byte, shared by all multi-byte forms.
	symbol = appendBytes(symbol, stringFromCharCode((codePoint&0x3F)|0x80))
	return symbol, nil
}

// Utf8encode converts str into a "byte string": the UTF-8 byte values of
// every code point in str, with each byte value written out as a rune in the
// range U+0000-U+00FF. It returns an error if a code point is a surrogate.
//
// Implementation referenced from
// https://github.com/mathiasbynens/utf8.js/blob/2ce09544b62f2a274dbcd249473c0986e3660849/utf8.js
func Utf8encode(str string) (string, error) {
	var bytes []byte
	for _, codePoint := range ucs2decode(str) {
		cps, err := encodeCodePoint(codePoint)
		if err != nil {
			return "", err
		}
		bytes = appendBytes(bytes, cps)
	}
	return string(bytes), nil
}

// readContinuationByte reads the element at *pByteIndex, advances the index,
// and returns the low six bits of that element when its low eight bits form a
// UTF-8 continuation byte (10xxxxxx). It returns an error when the index is
// already at or past byteCount, or when the element is not a continuation
// byte; in the latter case the index has still been advanced.
func readContinuationByte(byteArray []rune, byteCount int, pByteIndex *int) (rune, error) {
	if *pByteIndex >= byteCount {
		return utf8.RuneError, fmt.Errorf("invalid byte index")
	}

	continuationByte := byteArray[*pByteIndex] & 0xFF
	*pByteIndex++

	if continuationByte&0xC0 != 0x80 {
		return utf8.RuneError, fmt.Errorf("invalid continuation byte")
	}
	return continuationByte & 0x3F, nil
}

// decodeSymbol decodes one UTF-8 sequence from byteArray starting at
// *pByteIndex and advances the index past the elements it consumed. Only the
// low eight bits of each element are used.
//
// It returns the decoded code point and true on success, (utf8.RuneError,
// false, nil) when the index is exactly at byteCount (end of input), and an
// error for truncated, overlong, surrogate or otherwise malformed sequences.
func decodeSymbol(byteArray []rune, byteCount int, pByteIndex *int) (rune, bool, error) {
	if *pByteIndex > byteCount {
		return utf8.RuneError, false, fmt.Errorf("invalid byte index")
	}
	if *pByteIndex == byteCount {
		return utf8.RuneError, false, nil
	}

	// Read first byte.
	byte1 := byteArray[*pByteIndex] & 0xFF
	*pByteIndex++

	switch {
	case byte1&0x80 == 0:
		// 1-byte sequence (no continuation bytes).
		return byte1, true, nil

	case byte1&0xE0 == 0xC0:
		// 2-byte sequence.
		byte2, err := readContinuationByte(byteArray, byteCount, pByteIndex)
		if err != nil {
			return utf8.RuneError, false, err
		}
		codePoint := ((byte1 & 0x1F) << 6) | byte2
		if codePoint < 0x80 { // overlong encoding
			return utf8.RuneError, false, fmt.Errorf("invalid continuation byte")
		}
		return codePoint, true, nil

	case byte1&0xF0 == 0xE0:
		// 3-byte sequence (may include unpaired surrogates).
		byte2, err := readContinuationByte(byteArray, byteCount, pByteIndex)
		if err != nil {
			return utf8.RuneError, false, err
		}
		byte3, err := readContinuationByte(byteArray, byteCount, pByteIndex)
		if err != nil {
			return utf8.RuneError, false, err
		}
		codePoint := ((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3
		if codePoint < 0x0800 { // overlong encoding
			return utf8.RuneError, false, fmt.Errorf("invalid continuation byte")
		}
		if err := checkScalarValue(codePoint); err != nil {
			return utf8.RuneError, false, err
		}
		return codePoint, true, nil

	case byte1&0xF8 == 0xF0:
		// 4-byte sequence.
		byte2, err := readContinuationByte(byteArray, byteCount, pByteIndex)
		if err != nil {
			return utf8.RuneError, false, err
		}
		byte3, err := readContinuationByte(byteArray, byteCount, pByteIndex)
		if err != nil {
			return utf8.RuneError, false, err
		}
		byte4, err := readContinuationByte(byteArray, byteCount, pByteIndex)
		if err != nil {
			return utf8.RuneError, false, err
		}
		codePoint := ((byte1 & 0x07) << 0x12) | (byte2 << 0x0C) | (byte3 << 0x06) | byte4
		if codePoint >= 0x010000 && codePoint <= 0x10FFFF {
			return codePoint, true, nil
		}
		// Out-of-range values fall through to the generic error below.
	}

	return utf8.RuneError, false, fmt.Errorf("invalid UTF-8 detected")
}

// Utf8decode is the inverse of Utf8encode: it treats the low eight bits of
// every rune in str as one byte of a UTF-8 stream, decodes that stream into
// code points, and returns those code points encoded as UTF-8. It returns nil
// for empty input and an error for malformed UTF-8.
//
// Implementation referenced from
// https://github.com/mathiasbynens/utf8.js/blob/2ce09544b62f2a274dbcd249473c0986e3660849/utf8.js
func Utf8decode(str string) ([]byte, error) {
	byteArray := ucs2decode(str)
	byteCount := len(byteArray)

	var (
		byteIndex  int
		codePoints []rune
	)
	for {
		codePoint, more, err := decodeSymbol(byteArray, byteCount, &byteIndex)
		if err != nil {
			return nil, err
		}
		if !more {
			break
		}
		codePoints = append(codePoints, codePoint)
	}
	return ucs2encode(codePoints), nil
}