github.com/aretext/aretext@v1.3.0/locate/word.go (about) 1 package locate 2 3 import ( 4 "unicode" 5 6 "github.com/aretext/aretext/text" 7 "github.com/aretext/aretext/text/segment" 8 ) 9 10 // NextWordStart locates the start of the next word after the cursor. 11 // Word boundaries occur: 12 // 1. at the first non-whitespace after a whitespace 13 // 2. at the start of an empty line 14 // 3. between punctuation and non-punctuation (unless withPunctuation=true) 15 func NextWordStart(textTree *text.Tree, pos uint64, targetCount uint64, withPunctuation, stopAtEndOfLastLine bool) uint64 { 16 if targetCount == 0 { 17 return pos 18 } 19 20 reader := textTree.ReaderAtPosition(pos) 21 gcIter := segment.NewGraphemeClusterIter(reader) 22 gc := segment.Empty() 23 24 // Read the first gc to check if we're on 25 // a newline, whitespace, or punct. 26 err := gcIter.NextSegment(gc) 27 if err != nil { 28 return pos 29 } 30 prevHasNewline := gc.HasNewline() 31 prevWasWhitespace := gc.IsWhitespace() 32 prevWasPunct := isPunct(gc) 33 34 if stopAtEndOfLastLine && targetCount == 1 && prevHasNewline { 35 return pos 36 } 37 38 pos += gc.NumRunes() 39 40 // Read subsequent runes to find the next word boundary. 41 var count uint64 42 for { 43 err = gcIter.NextSegment(gc) 44 if err != nil { 45 break 46 } 47 48 isWhitespace := gc.IsWhitespace() 49 hasNewline := gc.HasNewline() 50 isPunct := isPunct(gc) 51 52 if (prevWasWhitespace && !isWhitespace) || 53 (!withPunctuation && prevWasPunct && !isPunct && !isWhitespace) || 54 (!withPunctuation && !prevWasPunct && isPunct) || 55 (prevHasNewline && hasNewline) { 56 count++ 57 } 58 59 if stopAtEndOfLastLine && count+1 == targetCount && hasNewline { 60 break 61 } 62 63 if count == targetCount { 64 break 65 } 66 67 pos += gc.NumRunes() 68 prevHasNewline = hasNewline 69 prevWasWhitespace = isWhitespace 70 prevWasPunct = isPunct 71 } 72 73 return pos 74 } 75 76 // PrevWordStart locates the start of the word before the cursor. 77 // It is the inverse of NextWordStart. 78 func PrevWordStart(textTree *text.Tree, pos uint64, targetCount uint64, withPunctuation bool) uint64 { 79 if targetCount == 0 { 80 return pos 81 } 82 83 reader := textTree.ReverseReaderAtPosition(pos) 84 gcIter := segment.NewReverseGraphemeClusterIter(reader) 85 gc := segment.Empty() 86 87 // Read the gc before pos to check if we're on 88 // a newline, whitespace, or punct. 89 err := gcIter.NextSegment(gc) 90 if err != nil { 91 return 0 // io.EOF means we're at the start of the document. 92 } 93 prevHasNewline := gc.HasNewline() 94 prevWasWhitespace := gc.IsWhitespace() 95 prevWasPunct := isPunct(gc) 96 pos -= gc.NumRunes() 97 98 // Read backwards until we find a boundary. 99 var count uint64 100 for { 101 err = gcIter.NextSegment(gc) 102 if err != nil { 103 return 0 // io.EOF means we're at the start of the document. 104 } 105 106 isWhitespace := gc.IsWhitespace() 107 hasNewline := gc.HasNewline() 108 isPunct := isPunct(gc) 109 110 if (isWhitespace && !prevWasWhitespace) || 111 (!withPunctuation && isPunct && !prevWasPunct && !prevWasWhitespace) || 112 (!withPunctuation && !isPunct && prevWasPunct) || 113 (hasNewline && prevHasNewline) { 114 count++ 115 } 116 117 if count == targetCount { 118 break 119 } 120 121 pos -= gc.NumRunes() 122 prevHasNewline = hasNewline 123 prevWasWhitespace = isWhitespace 124 prevWasPunct = isPunct 125 } 126 127 return pos 128 } 129 130 // NextWordEnd locates the next word-end boundary after the cursor. 131 // The word break rules are the same as for NextWordStart, except 132 // that empty lines are NOT treated as word boundaries. 133 func NextWordEnd(textTree *text.Tree, pos uint64, targetCount uint64, withPunctuation bool) uint64 { 134 if targetCount == 0 { 135 return pos 136 } 137 138 reader := textTree.ReaderAtPosition(pos) 139 gcIter := segment.NewGraphemeClusterIter(reader) 140 gc := segment.Empty() 141 142 // Discard the first gc. 143 // This ensures that we advance even if we start 144 // at the end of a word. 145 err := gcIter.NextSegment(gc) 146 if err != nil { 147 return pos 148 } 149 prevPos := pos 150 pos += gc.NumRunes() 151 152 // Read the second gc to check if we're on 153 // a newline, whitespace, or punct. 154 err = gcIter.NextSegment(gc) 155 if err != nil { 156 return prevPos 157 } 158 prevWasWhitespace := gc.IsWhitespace() 159 prevWasPunct := isPunct(gc) 160 prevPos = pos 161 pos += gc.NumRunes() 162 163 // Read subsequent runes to find the next word boundary. 164 var count uint64 165 for { 166 err = gcIter.NextSegment(gc) 167 if err != nil { 168 break 169 } 170 171 isWhitespace := gc.IsWhitespace() 172 isPunct := isPunct(gc) 173 174 if (!prevWasWhitespace && isWhitespace) || 175 (!withPunctuation && prevWasPunct != isPunct) { 176 count++ 177 } 178 179 if count == targetCount { 180 break 181 } 182 183 prevPos = pos 184 pos += gc.NumRunes() 185 prevWasWhitespace = isWhitespace 186 prevWasPunct = isPunct 187 } 188 189 // Return the previous position to ensure that we stop on, 190 // not after, the end of word. 191 return prevPos 192 } 193 194 // WordObject returns the start and end positions of the word object under the cursor. 195 // If the cursor is on whitespace, include it as leading whitespace. 196 // Otherwise, include trailing whitespace. 197 // This is equivalent to vim's "aw" ("a word") object. 198 func WordObject(textTree *text.Tree, pos uint64, targetCount uint64) (uint64, uint64) { 199 if targetCount == 0 { 200 return pos, pos 201 } 202 203 // Lookahead one rune to detect whether we're in whitespace or not. 204 reader := textTree.ReaderAtPosition(pos) 205 r, _, err := reader.ReadRune() 206 if err != nil { 207 // This can only occur in an empty document. 208 return pos, pos 209 } 210 211 if unicode.IsSpace(r) { 212 // If we're in whitespace, treat it as leading whitespace 213 // and move to the following word. 214 return wordObjectWithLeadingWhitespace(textTree, pos, targetCount) 215 } else { 216 // Otherwise, move past the end of the word and 217 // any trailing whitespace. 218 return wordObjectWithTrailingWhitespace(textTree, pos, targetCount) 219 } 220 } 221 222 func wordObjectWithLeadingWhitespace(textTree *text.Tree, pos uint64, targetCount uint64) (uint64, uint64) { 223 startPos, endPos := pos, pos 224 225 // Scan backwards to the start of leading whitespace. 226 gc := segment.Empty() 227 reverseReader := textTree.ReverseReaderAtPosition(pos) 228 reverseGcIter := segment.NewReverseGraphemeClusterIter(reverseReader) 229 for { 230 err := reverseGcIter.NextSegment(gc) 231 if err != nil || gc.HasNewline() || !gc.IsWhitespace() { 232 break 233 } 234 startPos -= gc.NumRunes() 235 } 236 237 // Skip the next gc, since we already know it's whitespace. 238 reader := textTree.ReaderAtPosition(pos) 239 gcIter := segment.NewGraphemeClusterIter(reader) 240 err := gcIter.NextSegment(gc) 241 if err != nil { 242 // Should never happen, because the caller validated that there's at least one rune. 243 panic(err) 244 } 245 endPos += gc.NumRunes() 246 247 // Scan forward to the end of the word after leading whitespace. 248 prevWasWhitespace, prevWasPunct := true, false 249 var count uint64 250 for { 251 err := gcIter.NextSegment(gc) 252 if err != nil { 253 break 254 } 255 256 isWhitespace := gc.IsWhitespace() 257 isPunct := isPunct(gc) 258 if (!prevWasWhitespace && isWhitespace) || 259 (!prevWasPunct && !prevWasWhitespace && isPunct) || 260 (prevWasPunct && !isPunct && !isWhitespace) { 261 count++ 262 } 263 264 if count == targetCount { 265 break 266 } 267 268 endPos += gc.NumRunes() 269 prevWasWhitespace = isWhitespace 270 prevWasPunct = isPunct 271 } 272 273 return startPos, endPos 274 } 275 276 func wordObjectWithTrailingWhitespace(textTree *text.Tree, pos uint64, targetCount uint64) (uint64, uint64) { 277 startPos, endPos := pos, pos 278 reader := textTree.ReaderAtPosition(pos) 279 gcIter := segment.NewGraphemeClusterIter(reader) 280 gc := segment.Empty() 281 282 // Lookahead one gc to see if we're in punctuation. 283 err := gcIter.NextSegment(gc) 284 if err != nil { 285 // Should never happen, because the caller validated that there's at least one rune. 286 panic(err) 287 } 288 firstIsPunct := isPunct(gc) 289 firstIsWhitespace := gc.IsWhitespace() 290 endPos += gc.NumRunes() 291 292 // Scan backwards to the previous word boundary. 293 reverseReader := textTree.ReverseReaderAtPosition(pos) 294 reverseGcIter := segment.NewReverseGraphemeClusterIter(reverseReader) 295 for { 296 err = reverseGcIter.NextSegment(gc) 297 if err != nil || 298 gc.IsWhitespace() || 299 gc.HasNewline() || 300 (firstIsPunct != isPunct(gc)) { 301 break 302 } 303 startPos -= gc.NumRunes() 304 } 305 306 // Scan forward to the end of word. 307 prevWasWhitespace := firstIsWhitespace 308 prevWasPunct := firstIsPunct 309 var count uint64 310 for { 311 err = gcIter.NextSegment(gc) 312 if err != nil { 313 break 314 } 315 316 isWhitespace := gc.IsWhitespace() 317 isPunct := isPunct(gc) 318 if (!prevWasWhitespace && isWhitespace) || 319 (!prevWasPunct && !prevWasWhitespace && isPunct) || 320 (prevWasPunct && !isPunct && !isWhitespace) { 321 count++ 322 } 323 324 if count == targetCount { 325 break 326 } 327 328 prevWasWhitespace = isWhitespace 329 prevWasPunct = isPunct 330 endPos += gc.NumRunes() 331 } 332 333 // If we're at the end of the line or the next char isn't whitespace, we're done. 334 if gc.HasNewline() || !gc.IsWhitespace() { 335 return startPos, endPos 336 } 337 338 // Count the whitespace character we already scanned. 339 endPos += gc.NumRunes() 340 341 // Otherwise, keep scanning to end of trailing whitespace. 342 for { 343 err = gcIter.NextSegment(gc) 344 if err != nil || !gc.IsWhitespace() || gc.HasNewline() { 345 break 346 } 347 endPos += gc.NumRunes() 348 } 349 350 return startPos, endPos 351 } 352 353 // InnerWordObject returns the start and end positions of the word object or whitespace regions under the cursor. 354 // This is similar to WordObject, except that whitespace regions are counted as if they were words. 355 // This is equivalent to vim's "iw" ("inner word") object. 356 func InnerWordObject(textTree *text.Tree, pos uint64, targetCount uint64) (uint64, uint64) { 357 if targetCount == 0 { 358 return pos, pos 359 } 360 361 startPos, endPos := pos, pos 362 reader := textTree.ReaderAtPosition(pos) 363 gcIter := segment.NewGraphemeClusterIter(reader) 364 gc := segment.Empty() 365 366 // Lookahead one gc to see if we're in whitespace or punctuation. 367 err := gcIter.NextSegment(gc) 368 if err != nil { 369 // This can occur only in an empty document. 370 return pos, pos 371 } 372 firstNumRunes := gc.NumRunes() 373 firstHasNewline := gc.HasNewline() 374 firstIsWhitespace := gc.IsWhitespace() 375 firstIsPunct := isPunct(gc) 376 377 // Scan backwards for a word boundary. 378 reverseReader := textTree.ReverseReaderAtPosition(pos) 379 reverseGcIter := segment.NewReverseGraphemeClusterIter(reverseReader) 380 for { 381 err = reverseGcIter.NextSegment(gc) 382 if err != nil || 383 (firstIsWhitespace != gc.IsWhitespace()) || 384 (firstIsPunct != isPunct(gc)) || 385 gc.HasNewline() { 386 break 387 } 388 startPos -= gc.NumRunes() 389 } 390 391 // If the next gc is a newline, then stop there. 392 if targetCount == 1 && firstHasNewline { 393 return startPos, endPos 394 } 395 396 endPos += firstNumRunes 397 398 prevHasNewline := firstHasNewline 399 prevWasWhitespace := firstIsWhitespace 400 prevWasPunct := firstIsPunct 401 402 // Otherwise, scan forward to the next boundary. 403 var count uint64 404 for { 405 err = gcIter.NextSegment(gc) 406 if err != nil { 407 break 408 } 409 410 hasNewline := gc.HasNewline() 411 isWhitespace := gc.IsWhitespace() 412 isPunct := isPunct(gc) 413 414 if (!prevWasWhitespace && isWhitespace) || 415 (prevWasWhitespace && !prevHasNewline && !isWhitespace) || 416 (prevWasPunct != isPunct) { 417 count++ 418 } 419 420 if count == targetCount { 421 break 422 } 423 424 endPos += gc.NumRunes() 425 prevHasNewline = hasNewline 426 prevWasWhitespace = isWhitespace 427 prevWasPunct = isPunct 428 } 429 430 return startPos, endPos 431 } 432 433 // isPunct returns whether a grapheme cluster should be treated as punctuation for determining word boundaries. 434 func isPunct(seg *segment.Segment) bool { 435 if seg.NumRunes() != 1 { 436 return false 437 } 438 439 r := seg.Runes()[0] 440 441 // These ranges are the same as the unicode punctuation class for ASCII characters, except that: 442 // * underscores ('_') are NOT treated as punctuation 443 // * the following chars ARE treated as punctuation: '$', '+', '<', '=', '>', '^', '`', '|', '~' 444 return (r >= '!' && r <= '/') || (r >= ':' && r <= '@') || (r >= '[' && r <= '^') || (r == '`' || r >= '{' && r <= '~') 445 }