github.com/varialus/godfly@v0.0.0-20130904042352-1934f9f095ab/src/pkg/unicode/utf8/utf8.go (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package utf8 implements functions and constants to support text encoded in
     6  // UTF-8. It includes functions to translate between runes and UTF-8 byte sequences.
     7  package utf8
     8  
     9  // The conditions RuneError==unicode.ReplacementChar and
    10  // MaxRune==unicode.MaxRune are verified in the tests.
    11  // Defining them locally avoids this package depending on package unicode.
    12  
    13  // Numbers fundamental to the encoding.
    14  const (
    15  	RuneError = '\uFFFD'     // the "error" rune, also the "Unicode replacement character"; the decoders return it for invalid input.
    16  	RuneSelf  = 0x80         // characters below RuneSelf are represented as themselves in a single byte.
    17  	MaxRune   = '\U0010FFFF' // maximum valid Unicode code point.
    18  	UTFMax    = 4            // maximum number of bytes of a UTF-8 encoded Unicode character.
    19  )
    20  
    21  // Code points in the surrogate range are not valid for UTF-8.
    22  const (
    23  	surrogateMin = 0xD800 // first code point of the surrogate range
    24  	surrogateMax = 0xDFFF // last code point of the surrogate range (inclusive)
    25  )
    26  
    27  const (
    28  	t1 = 0x00 // 0000 0000: tag bits of a 1-byte encoding (not referenced by the code visible in this file)
    29  	tx = 0x80 // 1000 0000: tag bits of a continuation byte
    30  	t2 = 0xC0 // 1100 0000: tag bits of the lead byte of a 2-byte encoding
    31  	t3 = 0xE0 // 1110 0000: tag bits of the lead byte of a 3-byte encoding
    32  	t4 = 0xF0 // 1111 0000: tag bits of the lead byte of a 4-byte encoding
    33  	t5 = 0xF8 // 1111 1000: lead bytes at or above this value are rejected by the decoders
    34  
    35  	maskx = 0x3F // 0011 1111: payload bits of a continuation byte
    36  	mask2 = 0x1F // 0001 1111: payload bits of a 2-byte lead byte
    37  	mask3 = 0x0F // 0000 1111: payload bits of a 3-byte lead byte
    38  	mask4 = 0x07 // 0000 0111: payload bits of a 4-byte lead byte
    39  
    40  	rune1Max = 1<<7 - 1  // largest rune that fits a 1-byte encoding
    41  	rune2Max = 1<<11 - 1 // largest rune that fits a 2-byte encoding
    42  	rune3Max = 1<<16 - 1 // largest rune that fits a 3-byte encoding
    43  )
    44  
    45  func decodeRuneInternal(p []byte) (r rune, size int, short bool) {
    46  	n := len(p)
    47  	if n < 1 {
    48  		return RuneError, 0, true
    49  	}
    50  	c0 := p[0]
    51  
    52  	// 1-byte, 7-bit sequence?
    53  	if c0 < tx {
    54  		return rune(c0), 1, false
    55  	}
    56  
    57  	// unexpected continuation byte?
    58  	if c0 < t2 {
    59  		return RuneError, 1, false
    60  	}
    61  
    62  	// need first continuation byte
    63  	if n < 2 {
    64  		return RuneError, 1, true
    65  	}
    66  	c1 := p[1]
    67  	if c1 < tx || t2 <= c1 {
    68  		return RuneError, 1, false
    69  	}
    70  
    71  	// 2-byte, 11-bit sequence?
    72  	if c0 < t3 {
    73  		r = rune(c0&mask2)<<6 | rune(c1&maskx)
    74  		if r <= rune1Max {
    75  			return RuneError, 1, false
    76  		}
    77  		return r, 2, false
    78  	}
    79  
    80  	// need second continuation byte
    81  	if n < 3 {
    82  		return RuneError, 1, true
    83  	}
    84  	c2 := p[2]
    85  	if c2 < tx || t2 <= c2 {
    86  		return RuneError, 1, false
    87  	}
    88  
    89  	// 3-byte, 16-bit sequence?
    90  	if c0 < t4 {
    91  		r = rune(c0&mask3)<<12 | rune(c1&maskx)<<6 | rune(c2&maskx)
    92  		if r <= rune2Max {
    93  			return RuneError, 1, false
    94  		}
    95  		if surrogateMin <= r && r <= surrogateMax {
    96  			return RuneError, 1, false
    97  		}
    98  		return r, 3, false
    99  	}
   100  
   101  	// need third continuation byte
   102  	if n < 4 {
   103  		return RuneError, 1, true
   104  	}
   105  	c3 := p[3]
   106  	if c3 < tx || t2 <= c3 {
   107  		return RuneError, 1, false
   108  	}
   109  
   110  	// 4-byte, 21-bit sequence?
   111  	if c0 < t5 {
   112  		r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx)
   113  		if r <= rune3Max || MaxRune < r {
   114  			return RuneError, 1, false
   115  		}
   116  		return r, 4, false
   117  	}
   118  
   119  	// error
   120  	return RuneError, 1, false
   121  }
   122  
   123  func decodeRuneInStringInternal(s string) (r rune, size int, short bool) {
   124  	n := len(s)
   125  	if n < 1 {
   126  		return RuneError, 0, true
   127  	}
   128  	c0 := s[0]
   129  
   130  	// 1-byte, 7-bit sequence?
   131  	if c0 < tx {
   132  		return rune(c0), 1, false
   133  	}
   134  
   135  	// unexpected continuation byte?
   136  	if c0 < t2 {
   137  		return RuneError, 1, false
   138  	}
   139  
   140  	// need first continuation byte
   141  	if n < 2 {
   142  		return RuneError, 1, true
   143  	}
   144  	c1 := s[1]
   145  	if c1 < tx || t2 <= c1 {
   146  		return RuneError, 1, false
   147  	}
   148  
   149  	// 2-byte, 11-bit sequence?
   150  	if c0 < t3 {
   151  		r = rune(c0&mask2)<<6 | rune(c1&maskx)
   152  		if r <= rune1Max {
   153  			return RuneError, 1, false
   154  		}
   155  		return r, 2, false
   156  	}
   157  
   158  	// need second continuation byte
   159  	if n < 3 {
   160  		return RuneError, 1, true
   161  	}
   162  	c2 := s[2]
   163  	if c2 < tx || t2 <= c2 {
   164  		return RuneError, 1, false
   165  	}
   166  
   167  	// 3-byte, 16-bit sequence?
   168  	if c0 < t4 {
   169  		r = rune(c0&mask3)<<12 | rune(c1&maskx)<<6 | rune(c2&maskx)
   170  		if r <= rune2Max {
   171  			return RuneError, 1, false
   172  		}
   173  		if surrogateMin <= r && r <= surrogateMax {
   174  			return RuneError, 1, false
   175  		}
   176  		return r, 3, false
   177  	}
   178  
   179  	// need third continuation byte
   180  	if n < 4 {
   181  		return RuneError, 1, true
   182  	}
   183  	c3 := s[3]
   184  	if c3 < tx || t2 <= c3 {
   185  		return RuneError, 1, false
   186  	}
   187  
   188  	// 4-byte, 21-bit sequence?
   189  	if c0 < t5 {
   190  		r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx)
   191  		if r <= rune3Max || MaxRune < r {
   192  			return RuneError, 1, false
   193  		}
   194  		return r, 4, false
   195  	}
   196  
   197  	// error
   198  	return RuneError, 1, false
   199  }
   200  
   201  // FullRune reports whether the bytes in p begin with a full UTF-8 encoding of a rune.
   202  // An invalid encoding is considered a full Rune since it will convert as a width-1 error rune.
   203  func FullRune(p []byte) bool {
   204  	_, _, short := decodeRuneInternal(p)
   205  	return !short
   206  }
   207  
   208  // FullRuneInString is like FullRune but its input is a string.
   209  func FullRuneInString(s string) bool {
   210  	_, _, short := decodeRuneInStringInternal(s)
   211  	return !short
   212  }
   213  
   214  // DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and its width in bytes.
   215  // If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8.
   216  // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
   217  // out of range, or is not the shortest possible UTF-8 encoding for the
   218  // value. No other validation is performed.
   219  func DecodeRune(p []byte) (r rune, size int) {
   220  	r, size, _ = decodeRuneInternal(p)
   221  	return
   222  }
   223  
   224  // DecodeRuneInString is like DecodeRune but its input is a string.
   225  // If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8.
   226  // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
   227  // out of range, or is not the shortest possible UTF-8 encoding for the
   228  // value. No other validation is performed.
   229  func DecodeRuneInString(s string) (r rune, size int) {
   230  	r, size, _ = decodeRuneInStringInternal(s)
   231  	return
   232  }
   233  
   234  // DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune and its width in bytes.
   235  // If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8.
   236  // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
   237  // out of range, or is not the shortest possible UTF-8 encoding for the
   238  // value. No other validation is performed.
   239  func DecodeLastRune(p []byte) (r rune, size int) {
   240  	end := len(p)
   241  	if end == 0 {
   242  		return RuneError, 0
   243  	}
   244  	start := end - 1
   245  	r = rune(p[start])
   246  	if r < RuneSelf {
   247  		return r, 1
   248  	}
   249  	// guard against O(n^2) behavior when traversing
   250  	// backwards through strings with long sequences of
   251  	// invalid UTF-8.
   252  	lim := end - UTFMax
   253  	if lim < 0 {
   254  		lim = 0
   255  	}
   256  	for start--; start >= lim; start-- {
   257  		if RuneStart(p[start]) {
   258  			break
   259  		}
   260  	}
   261  	if start < 0 {
   262  		start = 0
   263  	}
   264  	r, size = DecodeRune(p[start:end])
   265  	if start+size != end {
   266  		return RuneError, 1
   267  	}
   268  	return r, size
   269  }
   270  
   271  // DecodeLastRuneInString is like DecodeLastRune but its input is a string.
   272  // If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8.
   273  // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
   274  // out of range, or is not the shortest possible UTF-8 encoding for the
   275  // value. No other validation is performed.
   276  func DecodeLastRuneInString(s string) (r rune, size int) {
   277  	end := len(s)
   278  	if end == 0 {
   279  		return RuneError, 0
   280  	}
   281  	start := end - 1
   282  	r = rune(s[start])
   283  	if r < RuneSelf {
   284  		return r, 1
   285  	}
   286  	// guard against O(n^2) behavior when traversing
   287  	// backwards through strings with long sequences of
   288  	// invalid UTF-8.
   289  	lim := end - UTFMax
   290  	if lim < 0 {
   291  		lim = 0
   292  	}
   293  	for start--; start >= lim; start-- {
   294  		if RuneStart(s[start]) {
   295  			break
   296  		}
   297  	}
   298  	if start < 0 {
   299  		start = 0
   300  	}
   301  	r, size = DecodeRuneInString(s[start:end])
   302  	if start+size != end {
   303  		return RuneError, 1
   304  	}
   305  	return r, size
   306  }
   307  
   308  // RuneLen returns the number of bytes required to encode the rune.
   309  // It returns -1 if the rune is not a valid value to encode in UTF-8.
   310  func RuneLen(r rune) int {
   311  	switch {
   312  	case r < 0:
   313  		return -1
   314  	case r <= rune1Max:
   315  		return 1
   316  	case r <= rune2Max:
   317  		return 2
   318  	case surrogateMin <= r && r <= surrogateMax:
   319  		return -1
   320  	case r <= rune3Max:
   321  		return 3
   322  	case r <= MaxRune:
   323  		return 4
   324  	}
   325  	return -1
   326  }
   327  
   328  // EncodeRune writes into p (which must be large enough) the UTF-8 encoding of the rune.
   329  // It returns the number of bytes written.
   330  func EncodeRune(p []byte, r rune) int {
   331  	// Negative values are erroneous.  Making it unsigned addresses the problem.
   332  	if uint32(r) <= rune1Max {
   333  		p[0] = byte(r)
   334  		return 1
   335  	}
   336  
   337  	if uint32(r) <= rune2Max {
   338  		p[0] = t2 | byte(r>>6)
   339  		p[1] = tx | byte(r)&maskx
   340  		return 2
   341  	}
   342  
   343  	if uint32(r) > MaxRune {
   344  		r = RuneError
   345  	}
   346  
   347  	if surrogateMin <= r && r <= surrogateMax {
   348  		r = RuneError
   349  	}
   350  
   351  	if uint32(r) <= rune3Max {
   352  		p[0] = t3 | byte(r>>12)
   353  		p[1] = tx | byte(r>>6)&maskx
   354  		p[2] = tx | byte(r)&maskx
   355  		return 3
   356  	}
   357  
   358  	p[0] = t4 | byte(r>>18)
   359  	p[1] = tx | byte(r>>12)&maskx
   360  	p[2] = tx | byte(r>>6)&maskx
   361  	p[3] = tx | byte(r)&maskx
   362  	return 4
   363  }
   364  
   365  // RuneCount returns the number of runes in p.  Erroneous and short
   366  // encodings are treated as single runes of width 1 byte.
   367  func RuneCount(p []byte) int {
   368  	i := 0
   369  	var n int
   370  	for n = 0; i < len(p); n++ {
   371  		if p[i] < RuneSelf {
   372  			i++
   373  		} else {
   374  			_, size := DecodeRune(p[i:])
   375  			i += size
   376  		}
   377  	}
   378  	return n
   379  }
   380  
   381  // RuneCountInString is like RuneCount but its input is a string.
   382  func RuneCountInString(s string) (n int) {
   383  	for _ = range s {
   384  		n++
   385  	}
   386  	return
   387  }
   388  
   389  // RuneStart reports whether the byte could be the first byte of
   390  // an encoded rune.  Second and subsequent bytes always have the top
   391  // two bits set to 10.
   392  func RuneStart(b byte) bool { return b&0xC0 != 0x80 }
   393  
   394  // Valid reports whether p consists entirely of valid UTF-8-encoded runes.
   395  func Valid(p []byte) bool {
   396  	i := 0
   397  	for i < len(p) {
   398  		if p[i] < RuneSelf {
   399  			i++
   400  		} else {
   401  			_, size := DecodeRune(p[i:])
   402  			if size == 1 {
   403  				// All valid runes of size 1 (those
   404  				// below RuneSelf) were handled above.
   405  				// This must be a RuneError.
   406  				return false
   407  			}
   408  			i += size
   409  		}
   410  	}
   411  	return true
   412  }
   413  
   414  // ValidString reports whether s consists entirely of valid UTF-8-encoded runes.
   415  func ValidString(s string) bool {
   416  	for i, r := range s {
   417  		if r == RuneError {
   418  			// The RuneError value can be an error
   419  			// sentinel value (if it's size 1) or the same
   420  			// value encoded properly. Decode it to see if
   421  			// it's the 1 byte sentinel value.
   422  			_, size := DecodeRuneInString(s[i:])
   423  			if size == 1 {
   424  				return false
   425  			}
   426  		}
   427  	}
   428  	return true
   429  }
   430  
   431  // ValidRune reports whether r can be legally encoded as UTF-8.
   432  // Code points that are out of range or a surrogate half are illegal.
   433  func ValidRune(r rune) bool {
   434  	switch {
   435  	case r < 0:
   436  		return false
   437  	case surrogateMin <= r && r <= surrogateMax:
   438  		return false
   439  	case r > MaxRune:
   440  		return false
   441  	}
   442  	return true
   443  }