github.com/ader1990/go@v0.0.0-20140630135419-8c24447fa791/src/pkg/unicode/utf8/utf8.go (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package utf8 implements functions and constants to support text encoded in 6 // UTF-8. It includes functions to translate between runes and UTF-8 byte sequences. 7 package utf8 8 9 // The conditions RuneError==unicode.ReplacementChar and 10 // MaxRune==unicode.MaxRune are verified in the tests. 11 // Defining them locally avoids this package depending on package unicode. 12 13 // Numbers fundamental to the encoding. 14 const ( 15 RuneError = '\uFFFD' // the "error" Rune or "Unicode replacement character" 16 RuneSelf = 0x80 // characters below Runeself are represented as themselves in a single byte. 17 MaxRune = '\U0010FFFF' // Maximum valid Unicode code point. 18 UTFMax = 4 // maximum number of bytes of a UTF-8 encoded Unicode character. 19 ) 20 21 // Code points in the surrogate range are not valid for UTF-8. 22 const ( 23 surrogateMin = 0xD800 24 surrogateMax = 0xDFFF 25 ) 26 27 const ( 28 t1 = 0x00 // 0000 0000 29 tx = 0x80 // 1000 0000 30 t2 = 0xC0 // 1100 0000 31 t3 = 0xE0 // 1110 0000 32 t4 = 0xF0 // 1111 0000 33 t5 = 0xF8 // 1111 1000 34 35 maskx = 0x3F // 0011 1111 36 mask2 = 0x1F // 0001 1111 37 mask3 = 0x0F // 0000 1111 38 mask4 = 0x07 // 0000 0111 39 40 rune1Max = 1<<7 - 1 41 rune2Max = 1<<11 - 1 42 rune3Max = 1<<16 - 1 43 ) 44 45 func decodeRuneInternal(p []byte) (r rune, size int, short bool) { 46 n := len(p) 47 if n < 1 { 48 return RuneError, 0, true 49 } 50 c0 := p[0] 51 52 // 1-byte, 7-bit sequence? 53 if c0 < tx { 54 return rune(c0), 1, false 55 } 56 57 // unexpected continuation byte? 58 if c0 < t2 { 59 return RuneError, 1, false 60 } 61 62 // need first continuation byte 63 if n < 2 { 64 return RuneError, 1, true 65 } 66 c1 := p[1] 67 if c1 < tx || t2 <= c1 { 68 return RuneError, 1, false 69 } 70 71 // 2-byte, 11-bit sequence? 72 if c0 < t3 { 73 r = rune(c0&mask2)<<6 | rune(c1&maskx) 74 if r <= rune1Max { 75 return RuneError, 1, false 76 } 77 return r, 2, false 78 } 79 80 // need second continuation byte 81 if n < 3 { 82 return RuneError, 1, true 83 } 84 c2 := p[2] 85 if c2 < tx || t2 <= c2 { 86 return RuneError, 1, false 87 } 88 89 // 3-byte, 16-bit sequence? 90 if c0 < t4 { 91 r = rune(c0&mask3)<<12 | rune(c1&maskx)<<6 | rune(c2&maskx) 92 if r <= rune2Max { 93 return RuneError, 1, false 94 } 95 if surrogateMin <= r && r <= surrogateMax { 96 return RuneError, 1, false 97 } 98 return r, 3, false 99 } 100 101 // need third continuation byte 102 if n < 4 { 103 return RuneError, 1, true 104 } 105 c3 := p[3] 106 if c3 < tx || t2 <= c3 { 107 return RuneError, 1, false 108 } 109 110 // 4-byte, 21-bit sequence? 111 if c0 < t5 { 112 r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx) 113 if r <= rune3Max || MaxRune < r { 114 return RuneError, 1, false 115 } 116 return r, 4, false 117 } 118 119 // error 120 return RuneError, 1, false 121 } 122 123 func decodeRuneInStringInternal(s string) (r rune, size int, short bool) { 124 n := len(s) 125 if n < 1 { 126 return RuneError, 0, true 127 } 128 c0 := s[0] 129 130 // 1-byte, 7-bit sequence? 131 if c0 < tx { 132 return rune(c0), 1, false 133 } 134 135 // unexpected continuation byte? 136 if c0 < t2 { 137 return RuneError, 1, false 138 } 139 140 // need first continuation byte 141 if n < 2 { 142 return RuneError, 1, true 143 } 144 c1 := s[1] 145 if c1 < tx || t2 <= c1 { 146 return RuneError, 1, false 147 } 148 149 // 2-byte, 11-bit sequence? 150 if c0 < t3 { 151 r = rune(c0&mask2)<<6 | rune(c1&maskx) 152 if r <= rune1Max { 153 return RuneError, 1, false 154 } 155 return r, 2, false 156 } 157 158 // need second continuation byte 159 if n < 3 { 160 return RuneError, 1, true 161 } 162 c2 := s[2] 163 if c2 < tx || t2 <= c2 { 164 return RuneError, 1, false 165 } 166 167 // 3-byte, 16-bit sequence? 168 if c0 < t4 { 169 r = rune(c0&mask3)<<12 | rune(c1&maskx)<<6 | rune(c2&maskx) 170 if r <= rune2Max { 171 return RuneError, 1, false 172 } 173 if surrogateMin <= r && r <= surrogateMax { 174 return RuneError, 1, false 175 } 176 return r, 3, false 177 } 178 179 // need third continuation byte 180 if n < 4 { 181 return RuneError, 1, true 182 } 183 c3 := s[3] 184 if c3 < tx || t2 <= c3 { 185 return RuneError, 1, false 186 } 187 188 // 4-byte, 21-bit sequence? 189 if c0 < t5 { 190 r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx) 191 if r <= rune3Max || MaxRune < r { 192 return RuneError, 1, false 193 } 194 return r, 4, false 195 } 196 197 // error 198 return RuneError, 1, false 199 } 200 201 // FullRune reports whether the bytes in p begin with a full UTF-8 encoding of a rune. 202 // An invalid encoding is considered a full Rune since it will convert as a width-1 error rune. 203 func FullRune(p []byte) bool { 204 _, _, short := decodeRuneInternal(p) 205 return !short 206 } 207 208 // FullRuneInString is like FullRune but its input is a string. 209 func FullRuneInString(s string) bool { 210 _, _, short := decodeRuneInStringInternal(s) 211 return !short 212 } 213 214 // DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and its width in bytes. 215 // If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8. 216 // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is 217 // out of range, or is not the shortest possible UTF-8 encoding for the 218 // value. No other validation is performed. 219 func DecodeRune(p []byte) (r rune, size int) { 220 r, size, _ = decodeRuneInternal(p) 221 return 222 } 223 224 // DecodeRuneInString is like DecodeRune but its input is a string. 225 // If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8. 226 // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is 227 // out of range, or is not the shortest possible UTF-8 encoding for the 228 // value. No other validation is performed. 229 func DecodeRuneInString(s string) (r rune, size int) { 230 r, size, _ = decodeRuneInStringInternal(s) 231 return 232 } 233 234 // DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune and its width in bytes. 235 // If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8. 236 // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is 237 // out of range, or is not the shortest possible UTF-8 encoding for the 238 // value. No other validation is performed. 239 func DecodeLastRune(p []byte) (r rune, size int) { 240 end := len(p) 241 if end == 0 { 242 return RuneError, 0 243 } 244 start := end - 1 245 r = rune(p[start]) 246 if r < RuneSelf { 247 return r, 1 248 } 249 // guard against O(n^2) behavior when traversing 250 // backwards through strings with long sequences of 251 // invalid UTF-8. 252 lim := end - UTFMax 253 if lim < 0 { 254 lim = 0 255 } 256 for start--; start >= lim; start-- { 257 if RuneStart(p[start]) { 258 break 259 } 260 } 261 if start < 0 { 262 start = 0 263 } 264 r, size = DecodeRune(p[start:end]) 265 if start+size != end { 266 return RuneError, 1 267 } 268 return r, size 269 } 270 271 // DecodeLastRuneInString is like DecodeLastRune but its input is a string. 272 // If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8. 273 // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is 274 // out of range, or is not the shortest possible UTF-8 encoding for the 275 // value. No other validation is performed. 276 func DecodeLastRuneInString(s string) (r rune, size int) { 277 end := len(s) 278 if end == 0 { 279 return RuneError, 0 280 } 281 start := end - 1 282 r = rune(s[start]) 283 if r < RuneSelf { 284 return r, 1 285 } 286 // guard against O(n^2) behavior when traversing 287 // backwards through strings with long sequences of 288 // invalid UTF-8. 289 lim := end - UTFMax 290 if lim < 0 { 291 lim = 0 292 } 293 for start--; start >= lim; start-- { 294 if RuneStart(s[start]) { 295 break 296 } 297 } 298 if start < 0 { 299 start = 0 300 } 301 r, size = DecodeRuneInString(s[start:end]) 302 if start+size != end { 303 return RuneError, 1 304 } 305 return r, size 306 } 307 308 // RuneLen returns the number of bytes required to encode the rune. 309 // It returns -1 if the rune is not a valid value to encode in UTF-8. 310 func RuneLen(r rune) int { 311 switch { 312 case r < 0: 313 return -1 314 case r <= rune1Max: 315 return 1 316 case r <= rune2Max: 317 return 2 318 case surrogateMin <= r && r <= surrogateMax: 319 return -1 320 case r <= rune3Max: 321 return 3 322 case r <= MaxRune: 323 return 4 324 } 325 return -1 326 } 327 328 // EncodeRune writes into p (which must be large enough) the UTF-8 encoding of the rune. 329 // It returns the number of bytes written. 330 func EncodeRune(p []byte, r rune) int { 331 // Negative values are erroneous. Making it unsigned addresses the problem. 332 switch i := uint32(r); { 333 case i <= rune1Max: 334 p[0] = byte(r) 335 return 1 336 case i <= rune2Max: 337 p[0] = t2 | byte(r>>6) 338 p[1] = tx | byte(r)&maskx 339 return 2 340 case i > MaxRune, surrogateMin <= i && i <= surrogateMax: 341 r = RuneError 342 fallthrough 343 case i <= rune3Max: 344 p[0] = t3 | byte(r>>12) 345 p[1] = tx | byte(r>>6)&maskx 346 p[2] = tx | byte(r)&maskx 347 return 3 348 default: 349 p[0] = t4 | byte(r>>18) 350 p[1] = tx | byte(r>>12)&maskx 351 p[2] = tx | byte(r>>6)&maskx 352 p[3] = tx | byte(r)&maskx 353 return 4 354 } 355 } 356 357 // RuneCount returns the number of runes in p. Erroneous and short 358 // encodings are treated as single runes of width 1 byte. 359 func RuneCount(p []byte) int { 360 i := 0 361 var n int 362 for n = 0; i < len(p); n++ { 363 if p[i] < RuneSelf { 364 i++ 365 } else { 366 _, size := DecodeRune(p[i:]) 367 i += size 368 } 369 } 370 return n 371 } 372 373 // RuneCountInString is like RuneCount but its input is a string. 374 func RuneCountInString(s string) (n int) { 375 for _ = range s { 376 n++ 377 } 378 return 379 } 380 381 // RuneStart reports whether the byte could be the first byte of 382 // an encoded rune. Second and subsequent bytes always have the top 383 // two bits set to 10. 384 func RuneStart(b byte) bool { return b&0xC0 != 0x80 } 385 386 // Valid reports whether p consists entirely of valid UTF-8-encoded runes. 387 func Valid(p []byte) bool { 388 i := 0 389 for i < len(p) { 390 if p[i] < RuneSelf { 391 i++ 392 } else { 393 _, size := DecodeRune(p[i:]) 394 if size == 1 { 395 // All valid runes of size 1 (those 396 // below RuneSelf) were handled above. 397 // This must be a RuneError. 398 return false 399 } 400 i += size 401 } 402 } 403 return true 404 } 405 406 // ValidString reports whether s consists entirely of valid UTF-8-encoded runes. 407 func ValidString(s string) bool { 408 for i, r := range s { 409 if r == RuneError { 410 // The RuneError value can be an error 411 // sentinel value (if it's size 1) or the same 412 // value encoded properly. Decode it to see if 413 // it's the 1 byte sentinel value. 414 _, size := DecodeRuneInString(s[i:]) 415 if size == 1 { 416 return false 417 } 418 } 419 } 420 return true 421 } 422 423 // ValidRune reports whether r can be legally encoded as UTF-8. 424 // Code points that are out of range or a surrogate half are illegal. 425 func ValidRune(r rune) bool { 426 switch { 427 case r < 0: 428 return false 429 case surrogateMin <= r && r <= surrogateMax: 430 return false 431 case r > MaxRune: 432 return false 433 } 434 return true 435 }