github.com/peggyl/go@v0.0.0-20151008231540-ae315999c2d5/src/unicode/utf8/utf8.go (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package utf8 implements functions and constants to support text encoded in
     6  // UTF-8. It includes functions to translate between runes and UTF-8 byte sequences.
     7  package utf8
     8  
     9  // The conditions RuneError==unicode.ReplacementChar and
    10  // MaxRune==unicode.MaxRune are verified in the tests.
    11  // Defining them locally avoids this package depending on package unicode.
    12  
// Numbers fundamental to the encoding.
const (
	RuneError = '\uFFFD'     // the "error" Rune or "Unicode replacement character"
	RuneSelf  = 0x80         // characters below RuneSelf are represented as themselves in a single byte.
	MaxRune   = '\U0010FFFF' // Maximum valid Unicode code point.
	UTFMax    = 4            // maximum number of bytes of a UTF-8 encoded Unicode character.
)
    20  
// Code points in the surrogate range are not valid for UTF-8.
// Both bounds are inclusive.
const (
	surrogateMin = 0xD800 // first code point of the surrogate range
	surrogateMax = 0xDFFF // last code point of the surrogate range
)
    26  
// Bit patterns and limits used by the encoder and decoder.
const (
	// Tag bits. tx is the high-bit pattern of a continuation byte; t2, t3
	// and t4 are the high-bit patterns of the lead byte of a 2-, 3- and
	// 4-byte sequence. Bytes at or above t5 are not lead bytes of any
	// sequence the decoder accepts.
	t1 = 0x00 // 0000 0000
	tx = 0x80 // 1000 0000
	t2 = 0xC0 // 1100 0000
	t3 = 0xE0 // 1110 0000
	t4 = 0xF0 // 1111 0000
	t5 = 0xF8 // 1111 1000

	// Payload masks. maskx selects the 6 data bits of a continuation
	// byte; mask2, mask3 and mask4 select the data bits of the lead byte
	// of a 2-, 3- and 4-byte sequence.
	maskx = 0x3F // 0011 1111
	mask2 = 0x1F // 0001 1111
	mask3 = 0x0F // 0000 1111
	mask4 = 0x07 // 0000 0111

	// Largest rune values representable in 1, 2 and 3 bytes.
	rune1Max = 1<<7 - 1
	rune2Max = 1<<11 - 1
	rune3Max = 1<<16 - 1
)
    44  
    45  func decodeRuneInternal(p []byte) (r rune, size int, short bool) {
    46  	n := len(p)
    47  	if n < 1 {
    48  		return RuneError, 0, true
    49  	}
    50  	c0 := p[0]
    51  
    52  	// 1-byte, 7-bit sequence?
    53  	if c0 < tx {
    54  		return rune(c0), 1, false
    55  	}
    56  
    57  	// unexpected continuation byte?
    58  	if c0 < t2 {
    59  		return RuneError, 1, false
    60  	}
    61  
    62  	// need first continuation byte
    63  	if n < 2 {
    64  		return RuneError, 1, true
    65  	}
    66  	c1 := p[1]
    67  	if c1 < tx || t2 <= c1 {
    68  		return RuneError, 1, false
    69  	}
    70  
    71  	// 2-byte, 11-bit sequence?
    72  	if c0 < t3 {
    73  		r = rune(c0&mask2)<<6 | rune(c1&maskx)
    74  		if r <= rune1Max {
    75  			return RuneError, 1, false
    76  		}
    77  		return r, 2, false
    78  	}
    79  
    80  	// need second continuation byte
    81  	if n < 3 {
    82  		return RuneError, 1, true
    83  	}
    84  	c2 := p[2]
    85  	if c2 < tx || t2 <= c2 {
    86  		return RuneError, 1, false
    87  	}
    88  
    89  	// 3-byte, 16-bit sequence?
    90  	if c0 < t4 {
    91  		r = rune(c0&mask3)<<12 | rune(c1&maskx)<<6 | rune(c2&maskx)
    92  		if r <= rune2Max {
    93  			return RuneError, 1, false
    94  		}
    95  		if surrogateMin <= r && r <= surrogateMax {
    96  			return RuneError, 1, false
    97  		}
    98  		return r, 3, false
    99  	}
   100  
   101  	// need third continuation byte
   102  	if n < 4 {
   103  		return RuneError, 1, true
   104  	}
   105  	c3 := p[3]
   106  	if c3 < tx || t2 <= c3 {
   107  		return RuneError, 1, false
   108  	}
   109  
   110  	// 4-byte, 21-bit sequence?
   111  	if c0 < t5 {
   112  		r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx)
   113  		if r <= rune3Max || MaxRune < r {
   114  			return RuneError, 1, false
   115  		}
   116  		return r, 4, false
   117  	}
   118  
   119  	// error
   120  	return RuneError, 1, false
   121  }
   122  
   123  func decodeRuneInStringInternal(s string) (r rune, size int, short bool) {
   124  	n := len(s)
   125  	if n < 1 {
   126  		return RuneError, 0, true
   127  	}
   128  	c0 := s[0]
   129  
   130  	// 1-byte, 7-bit sequence?
   131  	if c0 < tx {
   132  		return rune(c0), 1, false
   133  	}
   134  
   135  	// unexpected continuation byte?
   136  	if c0 < t2 {
   137  		return RuneError, 1, false
   138  	}
   139  
   140  	// need first continuation byte
   141  	if n < 2 {
   142  		return RuneError, 1, true
   143  	}
   144  	c1 := s[1]
   145  	if c1 < tx || t2 <= c1 {
   146  		return RuneError, 1, false
   147  	}
   148  
   149  	// 2-byte, 11-bit sequence?
   150  	if c0 < t3 {
   151  		r = rune(c0&mask2)<<6 | rune(c1&maskx)
   152  		if r <= rune1Max {
   153  			return RuneError, 1, false
   154  		}
   155  		return r, 2, false
   156  	}
   157  
   158  	// need second continuation byte
   159  	if n < 3 {
   160  		return RuneError, 1, true
   161  	}
   162  	c2 := s[2]
   163  	if c2 < tx || t2 <= c2 {
   164  		return RuneError, 1, false
   165  	}
   166  
   167  	// 3-byte, 16-bit sequence?
   168  	if c0 < t4 {
   169  		r = rune(c0&mask3)<<12 | rune(c1&maskx)<<6 | rune(c2&maskx)
   170  		if r <= rune2Max {
   171  			return RuneError, 1, false
   172  		}
   173  		if surrogateMin <= r && r <= surrogateMax {
   174  			return RuneError, 1, false
   175  		}
   176  		return r, 3, false
   177  	}
   178  
   179  	// need third continuation byte
   180  	if n < 4 {
   181  		return RuneError, 1, true
   182  	}
   183  	c3 := s[3]
   184  	if c3 < tx || t2 <= c3 {
   185  		return RuneError, 1, false
   186  	}
   187  
   188  	// 4-byte, 21-bit sequence?
   189  	if c0 < t5 {
   190  		r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx)
   191  		if r <= rune3Max || MaxRune < r {
   192  			return RuneError, 1, false
   193  		}
   194  		return r, 4, false
   195  	}
   196  
   197  	// error
   198  	return RuneError, 1, false
   199  }
   200  
   201  // FullRune reports whether the bytes in p begin with a full UTF-8 encoding of a rune.
   202  // An invalid encoding is considered a full Rune since it will convert as a width-1 error rune.
   203  func FullRune(p []byte) bool {
   204  	_, _, short := decodeRuneInternal(p)
   205  	return !short
   206  }
   207  
   208  // FullRuneInString is like FullRune but its input is a string.
   209  func FullRuneInString(s string) bool {
   210  	_, _, short := decodeRuneInStringInternal(s)
   211  	return !short
   212  }
   213  
   214  // DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and
   215  // its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if
   216  // the encoding is invalid, it returns (RuneError, 1). Both are impossible
   217  // results for correct UTF-8.
   218  //
   219  // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
   220  // out of range, or is not the shortest possible UTF-8 encoding for the
   221  // value. No other validation is performed.
   222  func DecodeRune(p []byte) (r rune, size int) {
   223  	r, size, _ = decodeRuneInternal(p)
   224  	return
   225  }
   226  
   227  // DecodeRuneInString is like DecodeRune but its input is a string. If s is
   228  // empty it returns (RuneError, 0). Otherwise, if the encoding is invalid, it
   229  // returns (RuneError, 1). Both are impossible results for correct UTF-8.
   230  //
   231  // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
   232  // out of range, or is not the shortest possible UTF-8 encoding for the
   233  // value. No other validation is performed.
   234  func DecodeRuneInString(s string) (r rune, size int) {
   235  	r, size, _ = decodeRuneInStringInternal(s)
   236  	return
   237  }
   238  
   239  // DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune and
   240  // its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if
   241  // the encoding is invalid, it returns (RuneError, 1). Both are impossible
   242  // results for correct UTF-8.
   243  //
   244  // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
   245  // out of range, or is not the shortest possible UTF-8 encoding for the
   246  // value. No other validation is performed.
   247  func DecodeLastRune(p []byte) (r rune, size int) {
   248  	end := len(p)
   249  	if end == 0 {
   250  		return RuneError, 0
   251  	}
   252  	start := end - 1
   253  	r = rune(p[start])
   254  	if r < RuneSelf {
   255  		return r, 1
   256  	}
   257  	// guard against O(n^2) behavior when traversing
   258  	// backwards through strings with long sequences of
   259  	// invalid UTF-8.
   260  	lim := end - UTFMax
   261  	if lim < 0 {
   262  		lim = 0
   263  	}
   264  	for start--; start >= lim; start-- {
   265  		if RuneStart(p[start]) {
   266  			break
   267  		}
   268  	}
   269  	if start < 0 {
   270  		start = 0
   271  	}
   272  	r, size = DecodeRune(p[start:end])
   273  	if start+size != end {
   274  		return RuneError, 1
   275  	}
   276  	return r, size
   277  }
   278  
   279  // DecodeLastRuneInString is like DecodeLastRune but its input is a string. If
   280  // s is empty it returns (RuneError, 0). Otherwise, if the encoding is invalid,
   281  // it returns (RuneError, 1). Both are impossible results for correct UTF-8.
   282  //
   283  // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
   284  // out of range, or is not the shortest possible UTF-8 encoding for the
   285  // value. No other validation is performed.
   286  func DecodeLastRuneInString(s string) (r rune, size int) {
   287  	end := len(s)
   288  	if end == 0 {
   289  		return RuneError, 0
   290  	}
   291  	start := end - 1
   292  	r = rune(s[start])
   293  	if r < RuneSelf {
   294  		return r, 1
   295  	}
   296  	// guard against O(n^2) behavior when traversing
   297  	// backwards through strings with long sequences of
   298  	// invalid UTF-8.
   299  	lim := end - UTFMax
   300  	if lim < 0 {
   301  		lim = 0
   302  	}
   303  	for start--; start >= lim; start-- {
   304  		if RuneStart(s[start]) {
   305  			break
   306  		}
   307  	}
   308  	if start < 0 {
   309  		start = 0
   310  	}
   311  	r, size = DecodeRuneInString(s[start:end])
   312  	if start+size != end {
   313  		return RuneError, 1
   314  	}
   315  	return r, size
   316  }
   317  
   318  // RuneLen returns the number of bytes required to encode the rune.
   319  // It returns -1 if the rune is not a valid value to encode in UTF-8.
   320  func RuneLen(r rune) int {
   321  	switch {
   322  	case r < 0:
   323  		return -1
   324  	case r <= rune1Max:
   325  		return 1
   326  	case r <= rune2Max:
   327  		return 2
   328  	case surrogateMin <= r && r <= surrogateMax:
   329  		return -1
   330  	case r <= rune3Max:
   331  		return 3
   332  	case r <= MaxRune:
   333  		return 4
   334  	}
   335  	return -1
   336  }
   337  
   338  // EncodeRune writes into p (which must be large enough) the UTF-8 encoding of the rune.
   339  // It returns the number of bytes written.
   340  func EncodeRune(p []byte, r rune) int {
   341  	// Negative values are erroneous.  Making it unsigned addresses the problem.
   342  	switch i := uint32(r); {
   343  	case i <= rune1Max:
   344  		p[0] = byte(r)
   345  		return 1
   346  	case i <= rune2Max:
   347  		p[0] = t2 | byte(r>>6)
   348  		p[1] = tx | byte(r)&maskx
   349  		return 2
   350  	case i > MaxRune, surrogateMin <= i && i <= surrogateMax:
   351  		r = RuneError
   352  		fallthrough
   353  	case i <= rune3Max:
   354  		p[0] = t3 | byte(r>>12)
   355  		p[1] = tx | byte(r>>6)&maskx
   356  		p[2] = tx | byte(r)&maskx
   357  		return 3
   358  	default:
   359  		p[0] = t4 | byte(r>>18)
   360  		p[1] = tx | byte(r>>12)&maskx
   361  		p[2] = tx | byte(r>>6)&maskx
   362  		p[3] = tx | byte(r)&maskx
   363  		return 4
   364  	}
   365  }
   366  
   367  // RuneCount returns the number of runes in p.  Erroneous and short
   368  // encodings are treated as single runes of width 1 byte.
   369  func RuneCount(p []byte) int {
   370  	i := 0
   371  	var n int
   372  	for n = 0; i < len(p); n++ {
   373  		if p[i] < RuneSelf {
   374  			i++
   375  		} else {
   376  			_, size := DecodeRune(p[i:])
   377  			i += size
   378  		}
   379  	}
   380  	return n
   381  }
   382  
   383  // RuneCountInString is like RuneCount but its input is a string.
   384  func RuneCountInString(s string) (n int) {
   385  	for range s {
   386  		n++
   387  	}
   388  	return
   389  }
   390  
   391  // RuneStart reports whether the byte could be the first byte of
   392  // an encoded rune.  Second and subsequent bytes always have the top
   393  // two bits set to 10.
   394  func RuneStart(b byte) bool { return b&0xC0 != 0x80 }
   395  
   396  // Valid reports whether p consists entirely of valid UTF-8-encoded runes.
   397  func Valid(p []byte) bool {
   398  	i := 0
   399  	for i < len(p) {
   400  		if p[i] < RuneSelf {
   401  			i++
   402  		} else {
   403  			_, size := DecodeRune(p[i:])
   404  			if size == 1 {
   405  				// All valid runes of size 1 (those
   406  				// below RuneSelf) were handled above.
   407  				// This must be a RuneError.
   408  				return false
   409  			}
   410  			i += size
   411  		}
   412  	}
   413  	return true
   414  }
   415  
   416  // ValidString reports whether s consists entirely of valid UTF-8-encoded runes.
   417  func ValidString(s string) bool {
   418  	for i, r := range s {
   419  		if r == RuneError {
   420  			// The RuneError value can be an error
   421  			// sentinel value (if it's size 1) or the same
   422  			// value encoded properly. Decode it to see if
   423  			// it's the 1 byte sentinel value.
   424  			_, size := DecodeRuneInString(s[i:])
   425  			if size == 1 {
   426  				return false
   427  			}
   428  		}
   429  	}
   430  	return true
   431  }
   432  
   433  // ValidRune reports whether r can be legally encoded as UTF-8.
   434  // Code points that are out of range or a surrogate half are illegal.
   435  func ValidRune(r rune) bool {
   436  	switch {
   437  	case r < 0:
   438  		return false
   439  	case surrogateMin <= r && r <= surrogateMax:
   440  		return false
   441  	case r > MaxRune:
   442  		return false
   443  	}
   444  	return true
   445  }