github.com/aretext/aretext@v1.3.0/syntax/languages/helpers.go (about) 1 package languages 2 3 import ( 4 "io" 5 "sort" 6 "strings" 7 "unicode" 8 "unicode/utf8" 9 10 "github.com/aretext/aretext/syntax/parser" 11 ) 12 13 // initialState sets the initial parser state if it hasn't yet been set. 14 func initialState(initialState parser.State, f parser.Func) parser.Func { 15 return func(iter parser.TrackingRuneIter, state parser.State) parser.Result { 16 if state.Equals(parser.EmptyState{}) { 17 state = initialState 18 } 19 return f(iter, state) 20 } 21 } 22 23 // matchState executes `f` only if the parser state matches `targetState`. 24 func matchState(targetState parser.State, f parser.Func) parser.Func { 25 return func(iter parser.TrackingRuneIter, state parser.State) parser.Result { 26 if !state.Equals(targetState) { 27 return parser.FailedResult 28 } 29 return f(iter, state) 30 } 31 } 32 33 // matchStates executes `f` only if the parse state matches one of `targetStates`. 34 func matchStates(targetStates []parser.State, f parser.Func) parser.Func { 35 return func(iter parser.TrackingRuneIter, state parser.State) parser.Result { 36 for _, ts := range targetStates { 37 if state.Equals(ts) { 38 return f(iter, state) 39 } 40 } 41 return parser.FailedResult 42 } 43 } 44 45 // setState sets the next parser state to `targetState`. 46 func setState(targetState parser.State) parser.MapFn { 47 return func(result parser.Result) parser.Result { 48 return parser.Result{ 49 NumConsumed: result.NumConsumed, 50 ComputedTokens: result.ComputedTokens, 51 NextState: targetState, 52 } 53 } 54 } 55 56 // consumeString consumes the characters in `s`. 57 func consumeString(s string) parser.Func { 58 return func(iter parser.TrackingRuneIter, state parser.State) parser.Result { 59 var numConsumed uint64 60 for _, targetRune := range s { 61 r, err := iter.NextRune() 62 if err != nil || r != targetRune { 63 return parser.FailedResult 64 } 65 numConsumed++ 66 } 67 return parser.Result{ 68 NumConsumed: numConsumed, 69 NextState: state, 70 } 71 } 72 } 73 74 // consumeToString consumes all characters up to and including the string `s`. 75 func consumeToString(s string) parser.Func { 76 f := consumeString(s) 77 return func(iter parser.TrackingRuneIter, state parser.State) parser.Result { 78 var numSkipped uint64 79 for { 80 r := f(iter, state) 81 if r.IsSuccess() { 82 return r.ShiftForward(numSkipped) 83 } 84 85 _, err := iter.NextRune() 86 if err != nil { 87 return parser.FailedResult 88 } 89 numSkipped++ 90 } 91 } 92 } 93 94 // consumeSingleRuneLike consumes a single rune matching a predicate. 95 func consumeSingleRuneLike(predicateFn func(rune) bool) parser.Func { 96 return func(iter parser.TrackingRuneIter, state parser.State) parser.Result { 97 r, err := iter.NextRune() 98 if err == nil && predicateFn(r) { 99 return parser.Result{ 100 NumConsumed: 1, 101 NextState: state, 102 } 103 } 104 return parser.FailedResult 105 } 106 } 107 108 // consumeRunesLike consumes one or more runes matching a predicate. 109 func consumeRunesLike(predicateFn func(rune) bool) parser.Func { 110 return func(iter parser.TrackingRuneIter, state parser.State) parser.Result { 111 var numConsumed uint64 112 for { 113 r, err := iter.NextRune() 114 if err != nil || !predicateFn(r) { 115 return parser.Result{ 116 NumConsumed: numConsumed, 117 NextState: state, 118 } 119 } 120 numConsumed++ 121 } 122 } 123 } 124 125 // consumeToEofOrRuneLike consumes up to and including a rune matching a predicate or EOF. 126 func consumeToEofOrRuneLike(predicate func(r rune) bool) parser.Func { 127 return func(iter parser.TrackingRuneIter, state parser.State) parser.Result { 128 var numConsumed uint64 129 for { 130 r, err := iter.NextRune() 131 if err == io.EOF { 132 break 133 } else if err != nil { 134 return parser.FailedResult 135 } 136 137 numConsumed++ 138 139 if predicate(r) { 140 break 141 } 142 } 143 return parser.Result{ 144 NumConsumed: numConsumed, 145 NextState: state, 146 } 147 } 148 } 149 150 // consumeToNextLineFeed consumes up to and including the next newline character or the last character in the document, whichever comes first. 151 var consumeToNextLineFeed = consumeToEofOrRuneLike(func(r rune) bool { 152 return r == '\n' 153 }) 154 155 func consumeDigitsAndSeparators(allowLeadingSeparator bool, isDigit func(r rune) bool) parser.Func { 156 return func(iter parser.TrackingRuneIter, state parser.State) parser.Result { 157 var numConsumed uint64 158 var lastWasUnderscore bool 159 for { 160 r, err := iter.NextRune() 161 if err != nil { 162 break 163 } 164 165 if r == '_' && !lastWasUnderscore && (allowLeadingSeparator || numConsumed > 0) { 166 lastWasUnderscore = true 167 numConsumed++ 168 continue 169 } 170 171 if isDigit(r) { 172 lastWasUnderscore = false 173 numConsumed++ 174 continue 175 } 176 177 break 178 } 179 180 if lastWasUnderscore { 181 numConsumed-- 182 } 183 184 return parser.Result{ 185 NumConsumed: numConsumed, 186 NextState: state, 187 } 188 } 189 190 } 191 192 // recognizeToken recognizes the consumed characters in the result as a token. 193 func recognizeToken(tokenRole parser.TokenRole) parser.MapFn { 194 return func(result parser.Result) parser.Result { 195 token := parser.ComputedToken{ 196 Length: result.NumConsumed, 197 Role: tokenRole, 198 } 199 return parser.Result{ 200 NumConsumed: result.NumConsumed, 201 ComputedTokens: []parser.ComputedToken{token}, 202 NextState: result.NextState, 203 } 204 } 205 } 206 207 func maxStrLen(ss []string) uint64 { 208 maxLength := uint64(0) 209 for _, s := range ss { 210 length := uint64(utf8.RuneCountInString(s)) 211 if length > maxLength { 212 maxLength = length 213 } 214 } 215 return maxLength 216 } 217 218 // consumeLongestMatchingOption consumes the longest matching option from a set of options. 219 func consumeLongestMatchingOption(options []string) parser.Func { 220 // Sort options descending by length. 221 sort.SliceStable(options, func(i, j int) bool { 222 return len(options[i]) > len(options[j]) 223 }) 224 225 // Allocate buffer for lookahead runes (shared across func invocations). 226 buf := make([]rune, maxStrLen(options)) 227 return func(iter parser.TrackingRuneIter, state parser.State) parser.Result { 228 // Lookahead up to the length of the longest option. 229 var n uint64 230 for i := 0; i < len(buf); i++ { 231 r, err := iter.NextRune() 232 if err != nil { 233 break 234 } 235 buf[i] = r 236 n++ 237 } 238 239 // Look for longest matching option. 240 // We can return the first one that matches b/c options 241 // are sorted descending by length. 242 for _, opt := range options { 243 var i uint64 244 matched := true 245 for _, r := range opt { 246 if r != buf[i] || i >= n { 247 matched = false 248 break 249 } 250 i++ 251 } 252 if matched { 253 return parser.Result{ 254 NumConsumed: i, 255 NextState: state, 256 } 257 } 258 } 259 return parser.FailedResult 260 } 261 } 262 263 // recognizeKeywordOrConsume recognizes a keyword from the list of `keywords`. 264 // If no keywords match, the result is returned unmodified. 265 func recognizeKeywordOrConsume(keywords []string) parser.MapWithInputFn { 266 // Calculate the length of the longest keyword to limit how much 267 // of the input needs to be reprocessed. 268 maxLength := maxStrLen(keywords) 269 return func(result parser.Result, iter parser.TrackingRuneIter, state parser.State) parser.Result { 270 if result.NumConsumed > maxLength { 271 return result 272 } 273 274 s := readInputString(iter, result.NumConsumed) 275 for _, kw := range keywords { 276 if kw == s { 277 token := parser.ComputedToken{ 278 Role: parser.TokenRoleKeyword, 279 Length: result.NumConsumed, 280 } 281 return parser.Result{ 282 NumConsumed: result.NumConsumed, 283 ComputedTokens: []parser.ComputedToken{token}, 284 NextState: state, 285 } 286 } 287 } 288 289 return result 290 } 291 } 292 293 // failIfMatchTerm fails if the consumed string matches any of the excluded terms. 294 // Otherwise, it returns the result unmodified. 295 func failIfMatchTerm(terms []string) parser.MapWithInputFn { 296 maxLength := maxStrLen(terms) 297 return func(result parser.Result, iter parser.TrackingRuneIter, state parser.State) parser.Result { 298 if result.NumConsumed > maxLength { 299 return result 300 } 301 s := readInputString(iter, result.NumConsumed) 302 for _, term := range terms { 303 if term == s { 304 return parser.FailedResult 305 } 306 } 307 return result 308 } 309 } 310 311 // readInputString reads a string from the text up to `n` characters long. 312 func readInputString(iter parser.TrackingRuneIter, n uint64) string { 313 var sb strings.Builder 314 for i := uint64(0); i < n; i++ { 315 r, err := iter.NextRune() 316 if err != nil { 317 break 318 } 319 if _, err := sb.WriteRune(r); err != nil { 320 panic(err) 321 } 322 } 323 return sb.String() 324 } 325 326 // consumeCStyleString consumes a string with characters escaped by a backslash. 327 func consumeCStyleString(quoteRune rune, allowLineBreaks bool) parser.Func { 328 return func(iter parser.TrackingRuneIter, state parser.State) parser.Result { 329 var n uint64 330 r, err := iter.NextRune() 331 if err != nil || r != quoteRune { 332 return parser.FailedResult 333 } 334 n++ 335 336 var inEscapeSeq bool 337 for { 338 r, err = iter.NextRune() 339 if err != nil || (!allowLineBreaks && r == '\n') { 340 return parser.FailedResult 341 } 342 n++ 343 344 if r == quoteRune && !inEscapeSeq { 345 return parser.Result{ 346 NumConsumed: n, 347 ComputedTokens: []parser.ComputedToken{ 348 {Length: n}, 349 }, 350 NextState: state, 351 } 352 } 353 354 if r == '\\' && !inEscapeSeq { 355 inEscapeSeq = true 356 continue 357 } 358 359 if inEscapeSeq { 360 inEscapeSeq = false 361 } 362 } 363 } 364 } 365 366 // parseCStyleString parses a string with characters escaped by a backslash. 367 func parseCStyleString(quoteRune rune, allowLineBreaks bool) parser.Func { 368 return consumeCStyleString(quoteRune, allowLineBreaks). 369 Map(recognizeToken(parser.TokenRoleString)) 370 } 371 372 // consumeCStylePreprocessorDirective parses a preprocessor directive (like "#include") 373 func consumeCStylePreprocessorDirective(directives []string) parser.Func { 374 // Consume leading '#' with optional whitespace after. 375 consumeStartOfDirective := func(iter parser.TrackingRuneIter, state parser.State) parser.Result { 376 var numConsumed uint64 377 var sawHashmark bool 378 for { 379 r, err := iter.NextRune() 380 if err == io.EOF { 381 break 382 } else if err != nil { 383 return parser.FailedResult 384 } 385 386 if r == '#' && !sawHashmark { 387 sawHashmark = true 388 numConsumed++ 389 } else if sawHashmark && (r == ' ' || r == '\t') { 390 numConsumed++ 391 } else { 392 break 393 } 394 } 395 396 if !sawHashmark { 397 return parser.FailedResult 398 } 399 400 return parser.Result{ 401 NumConsumed: numConsumed, 402 NextState: state, 403 } 404 } 405 406 // Consume to the end of line or EOF, unless the line ends with a backslash. 407 consumeToEndOfDirective := func(iter parser.TrackingRuneIter, state parser.State) parser.Result { 408 var numConsumed uint64 409 var lastWasBackslash bool 410 for { 411 r, err := iter.NextRune() 412 if err == io.EOF { 413 break 414 } else if err != nil { 415 return parser.FailedResult 416 } 417 418 numConsumed++ 419 420 if r == '\n' && !lastWasBackslash { 421 break 422 } 423 lastWasBackslash = (r == '\\') 424 } 425 return parser.Result{ 426 NumConsumed: numConsumed, 427 NextState: state, 428 } 429 } 430 431 return parser.Func(consumeStartOfDirective). 432 Then(consumeLongestMatchingOption(directives)). 433 ThenNot(consumeSingleRuneLike(func(r rune) bool { 434 return !unicode.IsSpace(r) // must be followed by space, newline, or EOF 435 })). 436 ThenMaybe(consumeToEndOfDirective). 437 Map(recognizeToken(cTokenRolePreprocessorDirective)) 438 }