github.com/aretext/aretext@v1.3.0/syntax/languages/criticmarkup.go (about)

     1  package languages
     2  
     3  import (
     4  	"sort"
     5  
     6  	"github.com/aretext/aretext/syntax/parser"
     7  )
     8  
const (
	// CriticMarkup comments ({>>...<<}) reuse the standard comment role
	// so they are styled like ordinary code comments.
	criticMarkupCommentRole = parser.TokenRoleComment

	// Use higher-numbered custom roles to avoid conflict
	// with custom roles used for markdown.
	criticMarkupAddRole       = parser.TokenRoleCustom9
	criticMarkupDelRole       = parser.TokenRoleCustom10
	criticMarkupSubRole       = parser.TokenRoleCustom11
	criticMarkupHighlightRole = parser.TokenRoleCustom12
)
    19  
// CriticMarkupParseFunc returns a parse func for CriticMarkup.
// https://github.com/CriticMarkup/CriticMarkup-toolkit/blob/master/README.md
func CriticMarkupParseFunc() parser.Func {
	/*
		This is a bit of a hack.

		We first run the markdown parser, then run the CriticMarkup parser on whatever
		the markdown parser consumed (if we see the start of a CriticMarkup tag, we
		may continue past where the markdown parser stopped).

		We then delete/truncate/split markdown tokens to make space for
		the CriticMarkup tokens.

		This works, but notice that the text within a CriticMarkup tag is still
		processed by the Markdown parser! So, for example, an asterisk "*" inside
		a CriticMarkup tag can terminate a Markdown emphasis tag.
		Fortunately, CriticMarkup explicitly forbids nesting Markdown tags,
		so if the user is doing this, it's a mistake and we can interpret
		it however we want.
	*/

	parseMarkdown := MarkdownParseFunc()
	parseCriticMarkup := criticMarkupParseFunc()
	return func(iter parser.TrackingRuneIter, state parser.State) parser.Result {
		// Run the markdown parser first; its tokens are the baseline that
		// the CriticMarkup tokens will be merged into below.
		result := parseMarkdown(iter, state)

		// Lookahead as far as the markdown parser consumed.
		lookaheadLimit := result.NumConsumed

		// If the markdown parser failed, lookahead to the one rune
		// that would be consumed by error recovery.
		// This shouldn't ever happen because the markdown parser always
		// tries to consume something, but it's safer to check.
		if lookaheadLimit == 0 {
			lookaheadLimit = 1
		}

		// Attempt to parse this part of the document as CriticMarkup.
		// n tracks how many runes we have advanced from the start of
		// the markdown parser's region.
		var criticMarkupTokens []parser.ComputedToken
		var n uint64
		for n < lookaheadLimit {
			cmResult := parseCriticMarkup(iter, state)
			if cmResult.IsSuccess() {
				// Token offsets from the sub-parse are relative to the
				// current position, so shift them by n before collecting.
				for _, tok := range cmResult.ComputedTokens {
					tok.Offset += n
					criticMarkupTokens = append(criticMarkupTokens, tok)
				}
				iter.Skip(cmResult.NumConsumed)
				n += cmResult.NumConsumed
			} else {
				// No CriticMarkup tag starts here; advance one rune and retry.
				iter.Skip(1)
				n++
			}
		}

		// CriticMarkup tokens may overlap the markdown tokens.
		// Delete/truncate/split the markdown tokens as necessary to make space.
		result.ComputedTokens = criticMarkupConsolidateTokens(result.ComputedTokens, criticMarkupTokens)

		// There may be CriticMarkup tokens that started within this computation
		// but extend past the end of the computation. If so, update NumConsumed.
		if len(result.ComputedTokens) > 0 {
			lastToken := result.ComputedTokens[len(result.ComputedTokens)-1]
			lastTokenEnd := lastToken.Offset + lastToken.Length
			if lastTokenEnd > result.NumConsumed {
				result.NumConsumed = lastTokenEnd
			}
		}

		return result
	}
}
    92  
    93  func criticMarkupParseFunc() parser.Func {
    94  	parseAdd := consumeString("{++").
    95  		Then(consumeToString("++}")).
    96  		Map(recognizeToken(criticMarkupAddRole))
    97  
    98  	// Examples in the CriticMarkup README use U+2010 hyphens, so allow those as well.
    99  	parseDel := (consumeString("{--").Then(consumeToString("--}"))).
   100  		Or(consumeString("{\u2010\u2010").Then(consumeToString("\u2010\u2010}"))).
   101  		Map(recognizeToken(criticMarkupDelRole))
   102  
   103  	parseSub := consumeString("{~~").
   104  		Then(consumeToString("~~}")).
   105  		Map(recognizeToken(criticMarkupSubRole))
   106  
   107  	parseComment := consumeString("{>>").
   108  		Then(consumeToString("<<}")).
   109  		Map(recognizeToken(criticMarkupCommentRole))
   110  
   111  	parseHighlight := consumeString("{==").
   112  		Then(consumeToString("==}")).
   113  		Map(recognizeToken(criticMarkupHighlightRole))
   114  
   115  	return parseAdd.
   116  		Or(parseDel).
   117  		Or(parseSub).
   118  		Or(parseComment).
   119  		Or(parseHighlight)
   120  }
   121  
// criticMarkupConsolidateTokens merges cmTokens (CriticMarkup) into mdTokens
// (markdown), deleting, truncating, or splitting markdown tokens as needed so
// the combined result is sorted ascending and non-overlapping. CriticMarkup
// tokens take precedence wherever the two overlap. Both inputs are assumed
// to be sorted ascending and internally non-overlapping.
func criticMarkupConsolidateTokens(mdTokens, cmTokens []parser.ComputedToken) []parser.ComputedToken {
	// Fast path if we have only Markdown or only CriticMarkup.
	if len(cmTokens) == 0 {
		return mdTokens
	} else if len(mdTokens) == 0 {
		return cmTokens
	}

	// Assume that mdTokens and cmTokens are each sorted ascending and non-overlapping.
	tokens := make([]parser.ComputedToken, 0, len(mdTokens)+len(cmTokens))
	tokens = append(tokens, mdTokens...)

	for _, cmTok := range cmTokens {
		// Each iteration of this loop eliminates an overlap by deleting, truncating, or splitting
		// one token. Once there are no overlaps, it inserts cmTok and exits the loop.
		for {
			// i is cmTok's insertion point: the first index whose token starts
			// at or after cmTok.Offset. Any token before i starts strictly
			// before cmTok, so at most tokens[i-1] can overlap from the left.
			i := sort.Search(len(tokens), func(i int) bool {
				return tokens[i].Offset >= cmTok.Offset
			})

			if i > 0 {
				tokBefore := tokens[i-1]
				if tokBefore.Offset+tokBefore.Length > cmTok.Offset+cmTok.Length {
					// tokBefore contains cmTok, so split tokBefore to make space.
					// Open a slot at index i, shrink tokBefore to end where cmTok
					// starts, and put the remainder (after cmTok) in the new slot
					// with the same role.
					tokens = append(tokens, parser.ComputedToken{})
					copy(tokens[i+1:], tokens[i:])
					tokens[i-1].Length = cmTok.Offset - tokBefore.Offset
					tokens[i] = parser.ComputedToken{
						Offset: cmTok.Offset + cmTok.Length,
						Length: (tokBefore.Offset + tokBefore.Length) - (cmTok.Offset + cmTok.Length),
						Role:   tokBefore.Role,
					}
					continue
				} else if tokBefore.Offset+tokBefore.Length > cmTok.Offset {
					// Truncate end of prev token
					tokens[i-1].Length = cmTok.Offset - tokBefore.Offset
					continue
				}
			}

			if i < len(tokens) {
				tokAfter := tokens[i]
				if cmTok.Offset+cmTok.Length >= tokAfter.Offset+tokAfter.Length {
					// cmTok contains the following token, so delete it to make space.
					copy(tokens[i:], tokens[i+1:])
					tokens = tokens[0 : len(tokens)-1]
					continue
				} else if cmTok.Offset+cmTok.Length > tokAfter.Offset {
					// Truncate start of next token.
					tokens[i].Offset = cmTok.Offset + cmTok.Length
					tokens[i].Length -= (cmTok.Offset + cmTok.Length) - tokAfter.Offset
					continue
				}
			}

			// No overlap, so insert the token and exit the loop.
			tokens = append(tokens, parser.ComputedToken{})
			copy(tokens[i+1:], tokens[i:])
			tokens[i] = cmTok
			break
		}
	}

	return tokens
}