github.com/primecitizens/pcz/std@v0.2.1/text/unicode/utf8/decode.go (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright 2023 The Prime Citizens
     3  //
     4  // Copyright 2009 The Go Authors. All rights reserved.
     5  // Use of this source code is governed by a BSD-style
     6  // license that can be found in the LICENSE file.
     7  
     8  // Package utf8 implements functions and constants to support text encoded in
     9  // UTF-8. It includes functions to translate between runes and UTF-8 byte sequences.
    10  // See https://en.wikipedia.org/wiki/UTF-8
    11  package utf8
    12  
    13  import (
    14  	. "github.com/primecitizens/pcz/std/text/unicode/common"
    15  )
    16  
    17  // Valid reports whether s consists entirely of valid UTF-8-encoded runes.
    18  func Valid(s string) bool {
    19  	// Fast path. Check for and skip 8 bytes of ASCII characters per iteration.
    20  	for len(s) >= 8 {
    21  		// Combining two 32 bit loads allows the same code to be used
    22  		// for 32 and 64 bit platforms.
    23  		// The compiler can generate a 32bit load for first32 and second32
    24  		// on many platforms. See test/codegen/memcombine.go.
    25  		first32 := uint32(s[0]) | uint32(s[1])<<8 | uint32(s[2])<<16 | uint32(s[3])<<24
    26  		second32 := uint32(s[4]) | uint32(s[5])<<8 | uint32(s[6])<<16 | uint32(s[7])<<24
    27  		if (first32|second32)&0x80808080 != 0 {
    28  			// Found a non ASCII byte (>= RuneSelf).
    29  			break
    30  		}
    31  		s = s[8:]
    32  	}
    33  
    34  	for i, n := 0, len(s); i < n; {
    35  		si := s[i]
    36  		if si < RuneSelf {
    37  			i++
    38  			continue
    39  		}
    40  		x := first[si]
    41  		if x == xx {
    42  			return false // Illegal starter byte.
    43  		}
    44  		size := int(x & 7)
    45  		if i+size > n {
    46  			return false // Short or invalid.
    47  		}
    48  		accept := acceptRanges[x>>4]
    49  		if c := s[i+1]; c < accept.lo || accept.hi < c {
    50  			return false
    51  		} else if size == 2 {
    52  		} else if c := s[i+2]; c < locb || hicb < c {
    53  			return false
    54  		} else if size == 3 {
    55  		} else if c := s[i+3]; c < locb || hicb < c {
    56  			return false
    57  		}
    58  		i += size
    59  	}
    60  	return true
    61  }
    62  
    63  // Count returns the number of runes in s. Erroneous and short
    64  // encodings are treated as single runes of width 1 byte.
    65  func Count(s string) (n int) {
    66  	for i := 0; i < len(s); n++ {
    67  		c := s[i]
    68  		if c < RuneSelf {
    69  			// ASCII fast path
    70  			i++
    71  			continue
    72  		}
    73  		x := first[c]
    74  		if x == xx {
    75  			i++ // invalid.
    76  			continue
    77  		}
    78  		size := int(x & 7)
    79  		if i+size > len(s) {
    80  			i++ // Short or invalid.
    81  			continue
    82  		}
    83  		accept := acceptRanges[x>>4]
    84  		if c := s[i+1]; c < accept.lo || accept.hi < c {
    85  			size = 1
    86  		} else if size == 2 {
    87  		} else if c := s[i+2]; c < locb || hicb < c {
    88  			size = 1
    89  		} else if size == 3 {
    90  		} else if c := s[i+3]; c < locb || hicb < c {
    91  			size = 1
    92  		}
    93  		i += size
    94  	}
    95  	return n
    96  }
    97  
    98  // First unpacks the first UTF-8 encoding in s and returns the rune and
    99  // its width in bytes. If p is empty it returns (RuneError, 0).
   100  //
   101  // Otherwise, if the encoding is invalid, it returns
   102  // (RuneError, 1). Both are impossible results for correct,
   103  // non-empty UTF-8.
   104  //
   105  // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
   106  // out of range, or is not the shortest possible UTF-8 encoding for the
   107  // value. No other validation is performed.
   108  func First(s string) (r rune, size int) {
   109  	if len(s) == 0 {
   110  		return RuneError, 0
   111  	}
   112  	s0 := s[0]
   113  	x := first[s0]
   114  	if x >= as {
   115  		// The following code simulates an additional check for x == xx and
   116  		// handling the ASCII and invalid cases accordingly. This mask-and-or
   117  		// approach prevents an additional branch.
   118  		mask := rune(x) << 31 >> 31 // Create 0x0000 or 0xFFFF.
   119  		return rune(s[0])&^mask | RuneError&mask, 1
   120  	}
   121  	sz := int(x & 7)
   122  	accept := acceptRanges[x>>4]
   123  	if len(s) < sz {
   124  		return RuneError, 1
   125  	}
   126  	s1 := s[1]
   127  	if s1 < accept.lo || accept.hi < s1 {
   128  		return RuneError, 1
   129  	}
   130  	if sz <= 2 { // <= instead of == to help the compiler eliminate some bounds checks
   131  		return rune(s0&mask2)<<6 | rune(s1&maskx), 2
   132  	}
   133  	s2 := s[2]
   134  	if s2 < locb || hicb < s2 {
   135  		return RuneError, 1
   136  	}
   137  	if sz <= 3 {
   138  		return rune(s0&mask3)<<12 | rune(s1&maskx)<<6 | rune(s2&maskx), 3
   139  	}
   140  	s3 := s[3]
   141  	if s3 < locb || hicb < s3 {
   142  		return RuneError, 1
   143  	}
   144  	return rune(s0&mask4)<<18 | rune(s1&maskx)<<12 | rune(s2&maskx)<<6 | rune(s3&maskx), 4
   145  }
   146  
   147  // Last unpacks the last UTF-8 encoding in p and returns the rune and
   148  // its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if
   149  // the encoding is invalid, it returns (RuneError, 1). Both are impossible
   150  // results for correct, non-empty UTF-8.
   151  //
   152  // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
   153  // out of range, or is not the shortest possible UTF-8 encoding for the
   154  // value. No other validation is performed.
   155  func Last(s string) (r rune, size int) {
   156  	end := len(s)
   157  	if end == 0 {
   158  		return RuneError, 0
   159  	}
   160  	start := end - 1
   161  	r = rune(s[start])
   162  	if r < RuneSelf {
   163  		return r, 1
   164  	}
   165  	// guard against O(n^2) behavior when traversing
   166  	// backwards through strings with long sequences of
   167  	// invalid UTF-8.
   168  	lim := end - MaxRuneLen
   169  	if lim < 0 {
   170  		lim = 0
   171  	}
   172  	for start--; start >= lim; start-- {
   173  		if RuneStart(s[start]) {
   174  			break
   175  		}
   176  	}
   177  	if start < 0 {
   178  		start = 0
   179  	}
   180  	r, size = First(s[start:end])
   181  	if start+size != end {
   182  		return RuneError, 1
   183  	}
   184  	return r, size
   185  }
   186  
   187  // FullRune reports whether the bytes in s begin with a full UTF-8 encoding of a rune.
   188  //
   189  // An invalid encoding is considered a full Rune since it will convert as a width-1 error rune.
   190  func FullRune(s string) bool {
   191  	if len(s) == 0 {
   192  		return false
   193  	}
   194  	x := first[s[0]]
   195  	if len(s) >= int(x&7) {
   196  		return true // ASCII, invalid, or valid.
   197  	}
   198  	// Must be short or invalid.
   199  	accept := acceptRanges[x>>4]
   200  	if len(s) > 1 && (s[1] < accept.lo || accept.hi < s[1]) {
   201  		return true
   202  	} else if len(s) > 2 && (s[2] < locb || hicb < s[2]) {
   203  		return true
   204  	}
   205  	return false
   206  }
   207  
   208  // RuneStart reports whether the byte could be the first byte of an encoded,
   209  // possibly invalid rune. Second and subsequent bytes always have the top two
   210  // bits set to 10.
   211  func RuneStart(b byte) bool {
   212  	return b&0xC0 != 0x80
   213  }
   214  
   215  // RuneValid reports whether r can be legally encoded as UTF-8.
   216  // Code points that are out of range or a surrogate half are illegal.
   217  func RuneValid(r rune) bool {
   218  	switch {
   219  	case 0 <= r && r < SurrogateMin,
   220  		SurrogateMax < r && r <= MaxRune:
   221  		return true
   222  	}
   223  	return false
   224  }