github.com/varialus/godfly@v0.0.0-20130904042352-1934f9f095ab/src/pkg/unicode/utf8/utf8.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package utf8 implements functions and constants to support text encoded in 6 // UTF-8. It includes functions to translate between runes and UTF-8 byte sequences. 7 package utf8 8 9 // The conditions RuneError==unicode.ReplacementChar and 10 // MaxRune==unicode.MaxRune are verified in the tests. 11 // Defining them locally avoids this package depending on package unicode. 12 13 // Numbers fundamental to the encoding. 14 const ( 15 RuneError = '\uFFFD' // the "error" Rune or "Unicode replacement character" 16 RuneSelf = 0x80 // characters below Runeself are represented as themselves in a single byte. 17 MaxRune = '\U0010FFFF' // Maximum valid Unicode code point. 18 UTFMax = 4 // maximum number of bytes of a UTF-8 encoded Unicode character. 19 ) 20 21 // Code points in the surrogate range are not valid for UTF-8. 22 const ( 23 surrogateMin = 0xD800 24 surrogateMax = 0xDFFF 25 ) 26 27 const ( 28 t1 = 0x00 // 0000 0000 29 tx = 0x80 // 1000 0000 30 t2 = 0xC0 // 1100 0000 31 t3 = 0xE0 // 1110 0000 32 t4 = 0xF0 // 1111 0000 33 t5 = 0xF8 // 1111 1000 34 35 maskx = 0x3F // 0011 1111 36 mask2 = 0x1F // 0001 1111 37 mask3 = 0x0F // 0000 1111 38 mask4 = 0x07 // 0000 0111 39 40 rune1Max = 1<<7 - 1 41 rune2Max = 1<<11 - 1 42 rune3Max = 1<<16 - 1 43 ) 44 45 func decodeRuneInternal(p []byte) (r rune, size int, short bool) { 46 n := len(p) 47 if n < 1 { 48 return RuneError, 0, true 49 } 50 c0 := p[0] 51 52 // 1-byte, 7-bit sequence? 53 if c0 < tx { 54 return rune(c0), 1, false 55 } 56 57 // unexpected continuation byte? 58 if c0 < t2 { 59 return RuneError, 1, false 60 } 61 62 // need first continuation byte 63 if n < 2 { 64 return RuneError, 1, true 65 } 66 c1 := p[1] 67 if c1 < tx || t2 <= c1 { 68 return RuneError, 1, false 69 } 70 71 // 2-byte, 11-bit sequence? 72 if c0 < t3 { 73 r = rune(c0&mask2)<<6 | rune(c1&maskx) 74 if r <= rune1Max { 75 return RuneError, 1, false 76 } 77 return r, 2, false 78 } 79 80 // need second continuation byte 81 if n < 3 { 82 return RuneError, 1, true 83 } 84 c2 := p[2] 85 if c2 < tx || t2 <= c2 { 86 return RuneError, 1, false 87 } 88 89 // 3-byte, 16-bit sequence? 90 if c0 < t4 { 91 r = rune(c0&mask3)<<12 | rune(c1&maskx)<<6 | rune(c2&maskx) 92 if r <= rune2Max { 93 return RuneError, 1, false 94 } 95 if surrogateMin <= r && r <= surrogateMax { 96 return RuneError, 1, false 97 } 98 return r, 3, false 99 } 100 101 // need third continuation byte 102 if n < 4 { 103 return RuneError, 1, true 104 } 105 c3 := p[3] 106 if c3 < tx || t2 <= c3 { 107 return RuneError, 1, false 108 } 109 110 // 4-byte, 21-bit sequence? 111 if c0 < t5 { 112 r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx) 113 if r <= rune3Max || MaxRune < r { 114 return RuneError, 1, false 115 } 116 return r, 4, false 117 } 118 119 // error 120 return RuneError, 1, false 121 } 122 123 func decodeRuneInStringInternal(s string) (r rune, size int, short bool) { 124 n := len(s) 125 if n < 1 { 126 return RuneError, 0, true 127 } 128 c0 := s[0] 129 130 // 1-byte, 7-bit sequence? 131 if c0 < tx { 132 return rune(c0), 1, false 133 } 134 135 // unexpected continuation byte? 136 if c0 < t2 { 137 return RuneError, 1, false 138 } 139 140 // need first continuation byte 141 if n < 2 { 142 return RuneError, 1, true 143 } 144 c1 := s[1] 145 if c1 < tx || t2 <= c1 { 146 return RuneError, 1, false 147 } 148 149 // 2-byte, 11-bit sequence? 150 if c0 < t3 { 151 r = rune(c0&mask2)<<6 | rune(c1&maskx) 152 if r <= rune1Max { 153 return RuneError, 1, false 154 } 155 return r, 2, false 156 } 157 158 // need second continuation byte 159 if n < 3 { 160 return RuneError, 1, true 161 } 162 c2 := s[2] 163 if c2 < tx || t2 <= c2 { 164 return RuneError, 1, false 165 } 166 167 // 3-byte, 16-bit sequence? 168 if c0 < t4 { 169 r = rune(c0&mask3)<<12 | rune(c1&maskx)<<6 | rune(c2&maskx) 170 if r <= rune2Max { 171 return RuneError, 1, false 172 } 173 if surrogateMin <= r && r <= surrogateMax { 174 return RuneError, 1, false 175 } 176 return r, 3, false 177 } 178 179 // need third continuation byte 180 if n < 4 { 181 return RuneError, 1, true 182 } 183 c3 := s[3] 184 if c3 < tx || t2 <= c3 { 185 return RuneError, 1, false 186 } 187 188 // 4-byte, 21-bit sequence? 189 if c0 < t5 { 190 r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx) 191 if r <= rune3Max || MaxRune < r { 192 return RuneError, 1, false 193 } 194 return r, 4, false 195 } 196 197 // error 198 return RuneError, 1, false 199 } 200 201 // FullRune reports whether the bytes in p begin with a full UTF-8 encoding of a rune. 202 // An invalid encoding is considered a full Rune since it will convert as a width-1 error rune. 203 func FullRune(p []byte) bool { 204 _, _, short := decodeRuneInternal(p) 205 return !short 206 } 207 208 // FullRuneInString is like FullRune but its input is a string. 209 func FullRuneInString(s string) bool { 210 _, _, short := decodeRuneInStringInternal(s) 211 return !short 212 } 213 214 // DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and its width in bytes. 215 // If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8. 216 // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is 217 // out of range, or is not the shortest possible UTF-8 encoding for the 218 // value. No other validation is performed. 219 func DecodeRune(p []byte) (r rune, size int) { 220 r, size, _ = decodeRuneInternal(p) 221 return 222 } 223 224 // DecodeRuneInString is like DecodeRune but its input is a string. 225 // If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8. 226 // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is 227 // out of range, or is not the shortest possible UTF-8 encoding for the 228 // value. No other validation is performed. 229 func DecodeRuneInString(s string) (r rune, size int) { 230 r, size, _ = decodeRuneInStringInternal(s) 231 return 232 } 233 234 // DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune and its width in bytes. 235 // If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8. 236 // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is 237 // out of range, or is not the shortest possible UTF-8 encoding for the 238 // value. No other validation is performed. 239 func DecodeLastRune(p []byte) (r rune, size int) { 240 end := len(p) 241 if end == 0 { 242 return RuneError, 0 243 } 244 start := end - 1 245 r = rune(p[start]) 246 if r < RuneSelf { 247 return r, 1 248 } 249 // guard against O(n^2) behavior when traversing 250 // backwards through strings with long sequences of 251 // invalid UTF-8. 252 lim := end - UTFMax 253 if lim < 0 { 254 lim = 0 255 } 256 for start--; start >= lim; start-- { 257 if RuneStart(p[start]) { 258 break 259 } 260 } 261 if start < 0 { 262 start = 0 263 } 264 r, size = DecodeRune(p[start:end]) 265 if start+size != end { 266 return RuneError, 1 267 } 268 return r, size 269 } 270 271 // DecodeLastRuneInString is like DecodeLastRune but its input is a string. 272 // If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8. 273 // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is 274 // out of range, or is not the shortest possible UTF-8 encoding for the 275 // value. No other validation is performed. 276 func DecodeLastRuneInString(s string) (r rune, size int) { 277 end := len(s) 278 if end == 0 { 279 return RuneError, 0 280 } 281 start := end - 1 282 r = rune(s[start]) 283 if r < RuneSelf { 284 return r, 1 285 } 286 // guard against O(n^2) behavior when traversing 287 // backwards through strings with long sequences of 288 // invalid UTF-8. 289 lim := end - UTFMax 290 if lim < 0 { 291 lim = 0 292 } 293 for start--; start >= lim; start-- { 294 if RuneStart(s[start]) { 295 break 296 } 297 } 298 if start < 0 { 299 start = 0 300 } 301 r, size = DecodeRuneInString(s[start:end]) 302 if start+size != end { 303 return RuneError, 1 304 } 305 return r, size 306 } 307 308 // RuneLen returns the number of bytes required to encode the rune. 309 // It returns -1 if the rune is not a valid value to encode in UTF-8. 310 func RuneLen(r rune) int { 311 switch { 312 case r < 0: 313 return -1 314 case r <= rune1Max: 315 return 1 316 case r <= rune2Max: 317 return 2 318 case surrogateMin <= r && r <= surrogateMax: 319 return -1 320 case r <= rune3Max: 321 return 3 322 case r <= MaxRune: 323 return 4 324 } 325 return -1 326 } 327 328 // EncodeRune writes into p (which must be large enough) the UTF-8 encoding of the rune. 329 // It returns the number of bytes written. 330 func EncodeRune(p []byte, r rune) int { 331 // Negative values are erroneous. Making it unsigned addresses the problem. 332 if uint32(r) <= rune1Max { 333 p[0] = byte(r) 334 return 1 335 } 336 337 if uint32(r) <= rune2Max { 338 p[0] = t2 | byte(r>>6) 339 p[1] = tx | byte(r)&maskx 340 return 2 341 } 342 343 if uint32(r) > MaxRune { 344 r = RuneError 345 } 346 347 if surrogateMin <= r && r <= surrogateMax { 348 r = RuneError 349 } 350 351 if uint32(r) <= rune3Max { 352 p[0] = t3 | byte(r>>12) 353 p[1] = tx | byte(r>>6)&maskx 354 p[2] = tx | byte(r)&maskx 355 return 3 356 } 357 358 p[0] = t4 | byte(r>>18) 359 p[1] = tx | byte(r>>12)&maskx 360 p[2] = tx | byte(r>>6)&maskx 361 p[3] = tx | byte(r)&maskx 362 return 4 363 } 364 365 // RuneCount returns the number of runes in p. Erroneous and short 366 // encodings are treated as single runes of width 1 byte. 367 func RuneCount(p []byte) int { 368 i := 0 369 var n int 370 for n = 0; i < len(p); n++ { 371 if p[i] < RuneSelf { 372 i++ 373 } else { 374 _, size := DecodeRune(p[i:]) 375 i += size 376 } 377 } 378 return n 379 } 380 381 // RuneCountInString is like RuneCount but its input is a string. 382 func RuneCountInString(s string) (n int) { 383 for _ = range s { 384 n++ 385 } 386 return 387 } 388 389 // RuneStart reports whether the byte could be the first byte of 390 // an encoded rune. Second and subsequent bytes always have the top 391 // two bits set to 10. 392 func RuneStart(b byte) bool { return b&0xC0 != 0x80 } 393 394 // Valid reports whether p consists entirely of valid UTF-8-encoded runes. 395 func Valid(p []byte) bool { 396 i := 0 397 for i < len(p) { 398 if p[i] < RuneSelf { 399 i++ 400 } else { 401 _, size := DecodeRune(p[i:]) 402 if size == 1 { 403 // All valid runes of size 1 (those 404 // below RuneSelf) were handled above. 405 // This must be a RuneError. 406 return false 407 } 408 i += size 409 } 410 } 411 return true 412 } 413 414 // ValidString reports whether s consists entirely of valid UTF-8-encoded runes. 415 func ValidString(s string) bool { 416 for i, r := range s { 417 if r == RuneError { 418 // The RuneError value can be an error 419 // sentinel value (if it's size 1) or the same 420 // value encoded properly. Decode it to see if 421 // it's the 1 byte sentinel value. 422 _, size := DecodeRuneInString(s[i:]) 423 if size == 1 { 424 return false 425 } 426 } 427 } 428 return true 429 } 430 431 // ValidRune reports whether r can be legally encoded as UTF-8. 432 // Code points that are out of range or a surrogate half are illegal. 433 func ValidRune(r rune) bool { 434 switch { 435 case r < 0: 436 return false 437 case surrogateMin <= r && r <= surrogateMax: 438 return false 439 case r > MaxRune: 440 return false 441 } 442 return true 443 }