golang.org/x/exp@v0.0.0-20240506185415-9bf2ced13842/utf8string/string.go (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package utf8string provides an efficient way to index strings by rune rather than by byte.
     6  package utf8string // import "golang.org/x/exp/utf8string"
     7  
     8  import (
     9  	"errors"
    10  	"unicode/utf8"
    11  )
    12  
    13  // String wraps a regular string with a small structure that provides more
    14  // efficient indexing by code point index, as opposed to byte index.
    15  // Scanning incrementally forwards or backwards is O(1) per index operation
    16  // (although not as fast a range clause going forwards).  Random access is
    17  // O(N) in the length of the string, but the overhead is less than always
    18  // scanning from the beginning.
    19  // If the string is ASCII, random access is O(1).
    20  // Unlike the built-in string type, String has internal mutable state and
    21  // is not thread-safe.
    22  type String struct {
    23  	str      string
    24  	numRunes int
    25  	// If width > 0, the rune at runePos starts at bytePos and has the specified width.
    26  	width    int
    27  	bytePos  int
    28  	runePos  int
    29  	nonASCII int // byte index of the first non-ASCII rune.
    30  }
    31  
    32  // NewString returns a new UTF-8 string with the provided contents.
    33  func NewString(contents string) *String {
    34  	return new(String).Init(contents)
    35  }
    36  
    37  // Init initializes an existing String to hold the provided contents.
    38  // It returns a pointer to the initialized String.
    39  func (s *String) Init(contents string) *String {
    40  	s.str = contents
    41  	s.bytePos = 0
    42  	s.runePos = 0
    43  	for i := 0; i < len(contents); i++ {
    44  		if contents[i] >= utf8.RuneSelf {
    45  			// Not ASCII.
    46  			s.numRunes = utf8.RuneCountInString(contents)
    47  			_, s.width = utf8.DecodeRuneInString(contents)
    48  			s.nonASCII = i
    49  			return s
    50  		}
    51  	}
    52  	// ASCII is simple.  Also, the empty string is ASCII.
    53  	s.numRunes = len(contents)
    54  	s.width = 0
    55  	s.nonASCII = len(contents)
    56  	return s
    57  }
    58  
// String returns the contents of the String, that is, the plain string it
// wraps.  This method also means the String is directly printable by
// fmt.Print.
func (s *String) String() string {
	return s.str
}
    64  
// RuneCount returns the number of runes (Unicode code points) in the String.
// The count is computed once by Init, so this call does not rescan the contents.
func (s *String) RuneCount() int {
	return s.numRunes
}
    69  
// IsASCII returns a boolean indicating whether the String contains only ASCII bytes.
// The empty string counts as ASCII.
func (s *String) IsASCII() bool {
	// Init leaves width at 0 only when it found no non-ASCII byte; otherwise
	// width holds the width of a decoded rune.
	return s.width == 0
}
    74  
    75  // Slice returns the string sliced at rune positions [i:j].
    76  func (s *String) Slice(i, j int) string {
    77  	// ASCII is easy.  Let the compiler catch the indexing error if there is one.
    78  	if j < s.nonASCII {
    79  		return s.str[i:j]
    80  	}
    81  	if i < 0 || j > s.numRunes || i > j {
    82  		panic(sliceOutOfRange)
    83  	}
    84  	if i == j {
    85  		return ""
    86  	}
    87  	// For non-ASCII, after At(i), bytePos is always the position of the indexed character.
    88  	var low, high int
    89  	switch {
    90  	case i < s.nonASCII:
    91  		low = i
    92  	case i == s.numRunes:
    93  		low = len(s.str)
    94  	default:
    95  		s.At(i)
    96  		low = s.bytePos
    97  	}
    98  	switch {
    99  	case j == s.numRunes:
   100  		high = len(s.str)
   101  	default:
   102  		s.At(j)
   103  		high = s.bytePos
   104  	}
   105  	return s.str[low:high]
   106  }
   107  
   108  // At returns the rune with index i in the String.  The sequence of runes is the same
   109  // as iterating over the contents with a "for range" clause.
   110  func (s *String) At(i int) rune {
   111  	// ASCII is easy.  Let the compiler catch the indexing error if there is one.
   112  	if i < s.nonASCII {
   113  		return rune(s.str[i])
   114  	}
   115  
   116  	// Now we do need to know the index is valid.
   117  	if i < 0 || i >= s.numRunes {
   118  		panic(outOfRange)
   119  	}
   120  
   121  	var r rune
   122  
   123  	// Five easy common cases: within 1 spot of bytePos/runePos, or the beginning, or the end.
   124  	// With these cases, all scans from beginning or end work in O(1) time per rune.
   125  	switch {
   126  
   127  	case i == s.runePos-1: // backing up one rune
   128  		r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])
   129  		s.runePos = i
   130  		s.bytePos -= s.width
   131  		return r
   132  	case i == s.runePos+1: // moving ahead one rune
   133  		s.runePos = i
   134  		s.bytePos += s.width
   135  		fallthrough
   136  	case i == s.runePos:
   137  		r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])
   138  		return r
   139  	case i == 0: // start of string
   140  		r, s.width = utf8.DecodeRuneInString(s.str)
   141  		s.runePos = 0
   142  		s.bytePos = 0
   143  		return r
   144  
   145  	case i == s.numRunes-1: // last rune in string
   146  		r, s.width = utf8.DecodeLastRuneInString(s.str)
   147  		s.runePos = i
   148  		s.bytePos = len(s.str) - s.width
   149  		return r
   150  	}
   151  
   152  	// We need to do a linear scan.  There are three places to start from:
   153  	// 1) The beginning
   154  	// 2) bytePos/runePos.
   155  	// 3) The end
   156  	// Choose the closest in rune count, scanning backwards if necessary.
   157  	forward := true
   158  	if i < s.runePos {
   159  		// Between beginning and pos.  Which is closer?
   160  		// Since both i and runePos are guaranteed >= nonASCII, that's the
   161  		// lowest location we need to start from.
   162  		if i < (s.runePos-s.nonASCII)/2 {
   163  			// Scan forward from beginning
   164  			s.bytePos, s.runePos = s.nonASCII, s.nonASCII
   165  		} else {
   166  			// Scan backwards from where we are
   167  			forward = false
   168  		}
   169  	} else {
   170  		// Between pos and end.  Which is closer?
   171  		if i-s.runePos < (s.numRunes-s.runePos)/2 {
   172  			// Scan forward from pos
   173  		} else {
   174  			// Scan backwards from end
   175  			s.bytePos, s.runePos = len(s.str), s.numRunes
   176  			forward = false
   177  		}
   178  	}
   179  	if forward {
   180  		// TODO: Is it much faster to use a range loop for this scan?
   181  		for {
   182  			r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])
   183  			if s.runePos == i {
   184  				break
   185  			}
   186  			s.runePos++
   187  			s.bytePos += s.width
   188  		}
   189  	} else {
   190  		for {
   191  			r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])
   192  			s.runePos--
   193  			s.bytePos -= s.width
   194  			if s.runePos == i {
   195  				break
   196  			}
   197  		}
   198  	}
   199  	return r
   200  }
   201  
   202  var outOfRange = errors.New("utf8string: index out of range")
   203  var sliceOutOfRange = errors.New("utf8string: slice index out of range")