github.com/aretext/aretext@v1.3.0/syntax/languages/helpers.go (about)

     1  package languages
     2  
     3  import (
     4  	"io"
     5  	"sort"
     6  	"strings"
     7  	"unicode"
     8  	"unicode/utf8"
     9  
    10  	"github.com/aretext/aretext/syntax/parser"
    11  )
    12  
    13  // initialState sets the initial parser state if it hasn't yet been set.
    14  func initialState(initialState parser.State, f parser.Func) parser.Func {
    15  	return func(iter parser.TrackingRuneIter, state parser.State) parser.Result {
    16  		if state.Equals(parser.EmptyState{}) {
    17  			state = initialState
    18  		}
    19  		return f(iter, state)
    20  	}
    21  }
    22  
    23  // matchState executes `f` only if the parser state matches `targetState`.
    24  func matchState(targetState parser.State, f parser.Func) parser.Func {
    25  	return func(iter parser.TrackingRuneIter, state parser.State) parser.Result {
    26  		if !state.Equals(targetState) {
    27  			return parser.FailedResult
    28  		}
    29  		return f(iter, state)
    30  	}
    31  }
    32  
    33  // matchStates executes `f` only if the parse state matches one of `targetStates`.
    34  func matchStates(targetStates []parser.State, f parser.Func) parser.Func {
    35  	return func(iter parser.TrackingRuneIter, state parser.State) parser.Result {
    36  		for _, ts := range targetStates {
    37  			if state.Equals(ts) {
    38  				return f(iter, state)
    39  			}
    40  		}
    41  		return parser.FailedResult
    42  	}
    43  }
    44  
    45  // setState sets the next parser state to `targetState`.
    46  func setState(targetState parser.State) parser.MapFn {
    47  	return func(result parser.Result) parser.Result {
    48  		return parser.Result{
    49  			NumConsumed:    result.NumConsumed,
    50  			ComputedTokens: result.ComputedTokens,
    51  			NextState:      targetState,
    52  		}
    53  	}
    54  }
    55  
    56  // consumeString consumes the characters in `s`.
    57  func consumeString(s string) parser.Func {
    58  	return func(iter parser.TrackingRuneIter, state parser.State) parser.Result {
    59  		var numConsumed uint64
    60  		for _, targetRune := range s {
    61  			r, err := iter.NextRune()
    62  			if err != nil || r != targetRune {
    63  				return parser.FailedResult
    64  			}
    65  			numConsumed++
    66  		}
    67  		return parser.Result{
    68  			NumConsumed: numConsumed,
    69  			NextState:   state,
    70  		}
    71  	}
    72  }
    73  
    74  // consumeToString consumes all characters up to and including the string `s`.
    75  func consumeToString(s string) parser.Func {
    76  	f := consumeString(s)
    77  	return func(iter parser.TrackingRuneIter, state parser.State) parser.Result {
    78  		var numSkipped uint64
    79  		for {
    80  			r := f(iter, state)
    81  			if r.IsSuccess() {
    82  				return r.ShiftForward(numSkipped)
    83  			}
    84  
    85  			_, err := iter.NextRune()
    86  			if err != nil {
    87  				return parser.FailedResult
    88  			}
    89  			numSkipped++
    90  		}
    91  	}
    92  }
    93  
    94  // consumeSingleRuneLike consumes a single rune matching a predicate.
    95  func consumeSingleRuneLike(predicateFn func(rune) bool) parser.Func {
    96  	return func(iter parser.TrackingRuneIter, state parser.State) parser.Result {
    97  		r, err := iter.NextRune()
    98  		if err == nil && predicateFn(r) {
    99  			return parser.Result{
   100  				NumConsumed: 1,
   101  				NextState:   state,
   102  			}
   103  		}
   104  		return parser.FailedResult
   105  	}
   106  }
   107  
   108  // consumeRunesLike consumes one or more runes matching a predicate.
   109  func consumeRunesLike(predicateFn func(rune) bool) parser.Func {
   110  	return func(iter parser.TrackingRuneIter, state parser.State) parser.Result {
   111  		var numConsumed uint64
   112  		for {
   113  			r, err := iter.NextRune()
   114  			if err != nil || !predicateFn(r) {
   115  				return parser.Result{
   116  					NumConsumed: numConsumed,
   117  					NextState:   state,
   118  				}
   119  			}
   120  			numConsumed++
   121  		}
   122  	}
   123  }
   124  
   125  // consumeToEofOrRuneLike consumes up to and including a rune matching a predicate or EOF.
   126  func consumeToEofOrRuneLike(predicate func(r rune) bool) parser.Func {
   127  	return func(iter parser.TrackingRuneIter, state parser.State) parser.Result {
   128  		var numConsumed uint64
   129  		for {
   130  			r, err := iter.NextRune()
   131  			if err == io.EOF {
   132  				break
   133  			} else if err != nil {
   134  				return parser.FailedResult
   135  			}
   136  
   137  			numConsumed++
   138  
   139  			if predicate(r) {
   140  				break
   141  			}
   142  		}
   143  		return parser.Result{
   144  			NumConsumed: numConsumed,
   145  			NextState:   state,
   146  		}
   147  	}
   148  }
   149  
   150  // consumeToNextLineFeed consumes up to and including the next newline character or the last character in the document, whichever comes first.
   151  var consumeToNextLineFeed = consumeToEofOrRuneLike(func(r rune) bool {
   152  	return r == '\n'
   153  })
   154  
   155  func consumeDigitsAndSeparators(allowLeadingSeparator bool, isDigit func(r rune) bool) parser.Func {
   156  	return func(iter parser.TrackingRuneIter, state parser.State) parser.Result {
   157  		var numConsumed uint64
   158  		var lastWasUnderscore bool
   159  		for {
   160  			r, err := iter.NextRune()
   161  			if err != nil {
   162  				break
   163  			}
   164  
   165  			if r == '_' && !lastWasUnderscore && (allowLeadingSeparator || numConsumed > 0) {
   166  				lastWasUnderscore = true
   167  				numConsumed++
   168  				continue
   169  			}
   170  
   171  			if isDigit(r) {
   172  				lastWasUnderscore = false
   173  				numConsumed++
   174  				continue
   175  			}
   176  
   177  			break
   178  		}
   179  
   180  		if lastWasUnderscore {
   181  			numConsumed--
   182  		}
   183  
   184  		return parser.Result{
   185  			NumConsumed: numConsumed,
   186  			NextState:   state,
   187  		}
   188  	}
   189  
   190  }
   191  
   192  // recognizeToken recognizes the consumed characters in the result as a token.
   193  func recognizeToken(tokenRole parser.TokenRole) parser.MapFn {
   194  	return func(result parser.Result) parser.Result {
   195  		token := parser.ComputedToken{
   196  			Length: result.NumConsumed,
   197  			Role:   tokenRole,
   198  		}
   199  		return parser.Result{
   200  			NumConsumed:    result.NumConsumed,
   201  			ComputedTokens: []parser.ComputedToken{token},
   202  			NextState:      result.NextState,
   203  		}
   204  	}
   205  }
   206  
   207  func maxStrLen(ss []string) uint64 {
   208  	maxLength := uint64(0)
   209  	for _, s := range ss {
   210  		length := uint64(utf8.RuneCountInString(s))
   211  		if length > maxLength {
   212  			maxLength = length
   213  		}
   214  	}
   215  	return maxLength
   216  }
   217  
   218  // consumeLongestMatchingOption consumes the longest matching option from a set of options.
   219  func consumeLongestMatchingOption(options []string) parser.Func {
   220  	// Sort options descending by length.
   221  	sort.SliceStable(options, func(i, j int) bool {
   222  		return len(options[i]) > len(options[j])
   223  	})
   224  
   225  	// Allocate buffer for lookahead runes (shared across func invocations).
   226  	buf := make([]rune, maxStrLen(options))
   227  	return func(iter parser.TrackingRuneIter, state parser.State) parser.Result {
   228  		// Lookahead up to the length of the longest option.
   229  		var n uint64
   230  		for i := 0; i < len(buf); i++ {
   231  			r, err := iter.NextRune()
   232  			if err != nil {
   233  				break
   234  			}
   235  			buf[i] = r
   236  			n++
   237  		}
   238  
   239  		// Look for longest matching option.
   240  		// We can return the first one that matches b/c options
   241  		// are sorted descending by length.
   242  		for _, opt := range options {
   243  			var i uint64
   244  			matched := true
   245  			for _, r := range opt {
   246  				if r != buf[i] || i >= n {
   247  					matched = false
   248  					break
   249  				}
   250  				i++
   251  			}
   252  			if matched {
   253  				return parser.Result{
   254  					NumConsumed: i,
   255  					NextState:   state,
   256  				}
   257  			}
   258  		}
   259  		return parser.FailedResult
   260  	}
   261  }
   262  
   263  // recognizeKeywordOrConsume recognizes a keyword from the list of `keywords`.
   264  // If no keywords match, the result is returned unmodified.
   265  func recognizeKeywordOrConsume(keywords []string) parser.MapWithInputFn {
   266  	// Calculate the length of the longest keyword to limit how much
   267  	// of the input needs to be reprocessed.
   268  	maxLength := maxStrLen(keywords)
   269  	return func(result parser.Result, iter parser.TrackingRuneIter, state parser.State) parser.Result {
   270  		if result.NumConsumed > maxLength {
   271  			return result
   272  		}
   273  
   274  		s := readInputString(iter, result.NumConsumed)
   275  		for _, kw := range keywords {
   276  			if kw == s {
   277  				token := parser.ComputedToken{
   278  					Role:   parser.TokenRoleKeyword,
   279  					Length: result.NumConsumed,
   280  				}
   281  				return parser.Result{
   282  					NumConsumed:    result.NumConsumed,
   283  					ComputedTokens: []parser.ComputedToken{token},
   284  					NextState:      state,
   285  				}
   286  			}
   287  		}
   288  
   289  		return result
   290  	}
   291  }
   292  
   293  // failIfMatchTerm fails if the consumed string matches any of the excluded terms.
   294  // Otherwise, it returns the result unmodified.
   295  func failIfMatchTerm(terms []string) parser.MapWithInputFn {
   296  	maxLength := maxStrLen(terms)
   297  	return func(result parser.Result, iter parser.TrackingRuneIter, state parser.State) parser.Result {
   298  		if result.NumConsumed > maxLength {
   299  			return result
   300  		}
   301  		s := readInputString(iter, result.NumConsumed)
   302  		for _, term := range terms {
   303  			if term == s {
   304  				return parser.FailedResult
   305  			}
   306  		}
   307  		return result
   308  	}
   309  }
   310  
   311  // readInputString reads a string from the text up to `n` characters long.
   312  func readInputString(iter parser.TrackingRuneIter, n uint64) string {
   313  	var sb strings.Builder
   314  	for i := uint64(0); i < n; i++ {
   315  		r, err := iter.NextRune()
   316  		if err != nil {
   317  			break
   318  		}
   319  		if _, err := sb.WriteRune(r); err != nil {
   320  			panic(err)
   321  		}
   322  	}
   323  	return sb.String()
   324  }
   325  
   326  // consumeCStyleString consumes a string with characters escaped by a backslash.
   327  func consumeCStyleString(quoteRune rune, allowLineBreaks bool) parser.Func {
   328  	return func(iter parser.TrackingRuneIter, state parser.State) parser.Result {
   329  		var n uint64
   330  		r, err := iter.NextRune()
   331  		if err != nil || r != quoteRune {
   332  			return parser.FailedResult
   333  		}
   334  		n++
   335  
   336  		var inEscapeSeq bool
   337  		for {
   338  			r, err = iter.NextRune()
   339  			if err != nil || (!allowLineBreaks && r == '\n') {
   340  				return parser.FailedResult
   341  			}
   342  			n++
   343  
   344  			if r == quoteRune && !inEscapeSeq {
   345  				return parser.Result{
   346  					NumConsumed: n,
   347  					ComputedTokens: []parser.ComputedToken{
   348  						{Length: n},
   349  					},
   350  					NextState: state,
   351  				}
   352  			}
   353  
   354  			if r == '\\' && !inEscapeSeq {
   355  				inEscapeSeq = true
   356  				continue
   357  			}
   358  
   359  			if inEscapeSeq {
   360  				inEscapeSeq = false
   361  			}
   362  		}
   363  	}
   364  }
   365  
   366  // parseCStyleString parses a string with characters escaped by a backslash.
   367  func parseCStyleString(quoteRune rune, allowLineBreaks bool) parser.Func {
   368  	return consumeCStyleString(quoteRune, allowLineBreaks).
   369  		Map(recognizeToken(parser.TokenRoleString))
   370  }
   371  
   372  // consumeCStylePreprocessorDirective parses a preprocessor directive (like "#include")
   373  func consumeCStylePreprocessorDirective(directives []string) parser.Func {
   374  	// Consume leading '#' with optional whitespace after.
   375  	consumeStartOfDirective := func(iter parser.TrackingRuneIter, state parser.State) parser.Result {
   376  		var numConsumed uint64
   377  		var sawHashmark bool
   378  		for {
   379  			r, err := iter.NextRune()
   380  			if err == io.EOF {
   381  				break
   382  			} else if err != nil {
   383  				return parser.FailedResult
   384  			}
   385  
   386  			if r == '#' && !sawHashmark {
   387  				sawHashmark = true
   388  				numConsumed++
   389  			} else if sawHashmark && (r == ' ' || r == '\t') {
   390  				numConsumed++
   391  			} else {
   392  				break
   393  			}
   394  		}
   395  
   396  		if !sawHashmark {
   397  			return parser.FailedResult
   398  		}
   399  
   400  		return parser.Result{
   401  			NumConsumed: numConsumed,
   402  			NextState:   state,
   403  		}
   404  	}
   405  
   406  	// Consume to the end of line or EOF, unless the line ends with a backslash.
   407  	consumeToEndOfDirective := func(iter parser.TrackingRuneIter, state parser.State) parser.Result {
   408  		var numConsumed uint64
   409  		var lastWasBackslash bool
   410  		for {
   411  			r, err := iter.NextRune()
   412  			if err == io.EOF {
   413  				break
   414  			} else if err != nil {
   415  				return parser.FailedResult
   416  			}
   417  
   418  			numConsumed++
   419  
   420  			if r == '\n' && !lastWasBackslash {
   421  				break
   422  			}
   423  			lastWasBackslash = (r == '\\')
   424  		}
   425  		return parser.Result{
   426  			NumConsumed: numConsumed,
   427  			NextState:   state,
   428  		}
   429  	}
   430  
   431  	return parser.Func(consumeStartOfDirective).
   432  		Then(consumeLongestMatchingOption(directives)).
   433  		ThenNot(consumeSingleRuneLike(func(r rune) bool {
   434  			return !unicode.IsSpace(r) // must be followed by space, newline, or EOF
   435  		})).
   436  		ThenMaybe(consumeToEndOfDirective).
   437  		Map(recognizeToken(cTokenRolePreprocessorDirective))
   438  }