// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package utf8string provides an efficient way to index strings by rune rather than by byte.
package utf8string // import "golang.org/x/exp/utf8string"

import (
	"errors"
	"unicode/utf8"
)

// String wraps a regular string with a small structure that provides more
// efficient indexing by code point index, as opposed to byte index.
// Scanning incrementally forwards or backwards is O(1) per index operation
// (although not as fast as a range clause going forwards). Random access is
// O(N) in the length of the string, but the overhead is less than always
// scanning from the beginning.
// If the string is ASCII, random access is O(1).
// Unlike the built-in string type, String has internal mutable state and
// is not thread-safe.
type String struct {
	str      string
	numRunes int
	// If width > 0, the rune at runePos starts at bytePos and has the specified width.
	width    int
	bytePos  int
	runePos  int
	nonASCII int // byte index of the first non-ASCII rune.
}

// NewString returns a new UTF-8 string with the provided contents.
func NewString(contents string) *String {
	return new(String).Init(contents)
}

// Init initializes an existing String to hold the provided contents.
// It returns a pointer to the initialized String.
//
// Every field is overwritten, so Init may be used to recycle a String that
// previously held different contents.
func (s *String) Init(contents string) *String {
	s.str = contents
	s.bytePos = 0
	s.runePos = 0
	for i := 0; i < len(contents); i++ {
		if contents[i] >= utf8.RuneSelf {
			// Not ASCII. Cache the width of the first rune so that the
			// cursor (runePos 0, bytePos 0) is valid from the start.
			s.numRunes = utf8.RuneCountInString(contents)
			_, s.width = utf8.DecodeRuneInString(contents)
			s.nonASCII = i
			return s
		}
	}
	// ASCII is simple. Also, the empty string is ASCII.
	s.numRunes = len(contents)
	s.width = 0
	s.nonASCII = len(contents)
	return s
}

// String returns the contents of the String. This method also means the
// String is directly printable by fmt.Print.
func (s *String) String() string {
	return s.str
}

// RuneCount returns the number of runes (Unicode code points) in the String.
func (s *String) RuneCount() int {
	return s.numRunes
}

// IsASCII returns a boolean indicating whether the String contains only ASCII bytes.
func (s *String) IsASCII() bool {
	// width is zero only when Init found no byte >= utf8.RuneSelf.
	return s.width == 0
}

// Slice returns the string sliced at rune positions [i:j].
//
// It panics if the indexes are out of range: with a run-time slice bounds
// error when j lies inside the leading ASCII prefix, and with a
// "slice index out of range" error value otherwise.
func (s *String) Slice(i, j int) string {
	// ASCII is easy. Let the compiler catch the indexing error if there is one.
	if j < s.nonASCII {
		return s.str[i:j]
	}
	if i < 0 || j > s.numRunes || i > j {
		panic(sliceOutOfRange)
	}
	if i == j {
		return ""
	}
	// From here on 0 <= i < j <= numRunes and j >= nonASCII.
	// For non-ASCII, after At(i), bytePos is always the position of the indexed character.
	var low, high int
	if i < s.nonASCII {
		// Inside the ASCII prefix rune index and byte index coincide.
		low = i
	} else {
		s.At(i)
		low = s.bytePos
	}
	if j == s.numRunes {
		high = len(s.str)
	} else {
		s.At(j)
		high = s.bytePos
	}
	return s.str[low:high]
}

// At returns the rune with index i in the String. The sequence of runes is the same
// as iterating over the contents with a "for range" clause.
//
// It panics if i is out of range: with a run-time index error when i is
// below the first non-ASCII position, and with an "index out of range"
// error value otherwise.
func (s *String) At(i int) rune {
	// ASCII is easy. Let the compiler catch the indexing error if there is one.
	if i < s.nonASCII {
		return rune(s.str[i])
	}

	// Now we do need to know the index is valid.
	if i < 0 || i >= s.numRunes {
		panic(outOfRange)
	}

	var r rune

	// Five easy common cases: within 1 spot of bytePos/runePos, or the beginning, or the end.
	// With these cases, all scans from beginning or end work in O(1) time per rune.
	switch {

	case i == s.runePos-1: // backing up one rune
		r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])
		s.runePos = i
		s.bytePos -= s.width
		return r
	case i == s.runePos+1: // moving ahead one rune
		s.runePos = i
		s.bytePos += s.width
		fallthrough
	case i == s.runePos:
		r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])
		return r
	case i == 0: // start of string
		r, s.width = utf8.DecodeRuneInString(s.str)
		s.runePos = 0
		s.bytePos = 0
		return r

	case i == s.numRunes-1: // last rune in string
		r, s.width = utf8.DecodeLastRuneInString(s.str)
		s.runePos = i
		s.bytePos = len(s.str) - s.width
		return r
	}

	// We need to do a linear scan. There are three places to start from:
	// 1) The beginning
	// 2) bytePos/runePos.
	// 3) The end
	// Choose the closest in rune count, scanning backwards if necessary.
	forward := true
	if i < s.runePos {
		// Between beginning and pos. Which is closer?
		// Since both i and runePos are guaranteed >= nonASCII, that's the
		// lowest location we need to start from. Distances are therefore
		// measured from nonASCII, where byte and rune index still coincide.
		if i-s.nonASCII < (s.runePos-s.nonASCII)/2 {
			// Scan forward from beginning
			s.bytePos, s.runePos = s.nonASCII, s.nonASCII
		} else {
			// Scan backwards from where we are
			forward = false
		}
	} else if i-s.runePos >= (s.numRunes-s.runePos)/2 {
		// Between pos and end, and the end is at least as close:
		// scan backwards from end.
		s.bytePos, s.runePos = len(s.str), s.numRunes
		forward = false
	}
	// Otherwise scan forward from pos.
	if forward {
		// TODO: Is it much faster to use a range loop for this scan?
		for {
			r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])
			if s.runePos == i {
				break
			}
			s.runePos++
			s.bytePos += s.width
		}
	} else {
		for {
			r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])
			s.runePos--
			s.bytePos -= s.width
			if s.runePos == i {
				break
			}
		}
	}
	return r
}

// Panic values used by At and Slice when an index falls outside the String.
var outOfRange = errors.New("utf8string: index out of range")
var sliceOutOfRange = errors.New("utf8string: slice index out of range")