code.gitea.io/gitea@v1.21.7/services/gitdiff/highlightdiff.go (about)

     1  // Copyright 2022 The Gitea Authors. All rights reserved.
     2  // SPDX-License-Identifier: MIT
     3  
     4  package gitdiff
     5  
     6  import (
     7  	"strings"
     8  
     9  	"code.gitea.io/gitea/modules/highlight"
    10  
    11  	"github.com/sergi/go-diff/diffmatchpatch"
    12  )
    13  
    14  // token is a html tag or entity, eg: "<span ...>", "</span>", "&lt;"
    15  func extractHTMLToken(s string) (before, token, after string, valid bool) {
    16  	for pos1 := 0; pos1 < len(s); pos1++ {
    17  		if s[pos1] == '<' {
    18  			pos2 := strings.IndexByte(s[pos1:], '>')
    19  			if pos2 == -1 {
    20  				return "", "", s, false
    21  			}
    22  			return s[:pos1], s[pos1 : pos1+pos2+1], s[pos1+pos2+1:], true
    23  		} else if s[pos1] == '&' {
    24  			pos2 := strings.IndexByte(s[pos1:], ';')
    25  			if pos2 == -1 {
    26  				return "", "", s, false
    27  			}
    28  			return s[:pos1], s[pos1 : pos1+pos2+1], s[pos1+pos2+1:], true
    29  		}
    30  	}
    31  	return "", "", s, true
    32  }
    33  
    34  // highlightCodeDiff is used to do diff with highlighted HTML code.
    35  // It totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes.
    36  // The HTML tags and entities will be replaced by Unicode placeholders: "<span>{TEXT}</span>" => "\uE000{TEXT}\uE001"
    37  // These Unicode placeholders are friendly to the diff.
    38  // Then after diff, the placeholders in diff result will be recovered to the HTML tags and entities.
    39  // It's guaranteed that the tags in final diff result are paired correctly.
    40  type highlightCodeDiff struct {
    41  	placeholderBegin    rune
    42  	placeholderMaxCount int
    43  	placeholderIndex    int
    44  	placeholderTokenMap map[rune]string
    45  	tokenPlaceholderMap map[string]rune
    46  
    47  	placeholderOverflowCount int
    48  
    49  	lineWrapperTags []string
    50  }
    51  
    52  func newHighlightCodeDiff() *highlightCodeDiff {
    53  	return &highlightCodeDiff{
    54  		placeholderBegin:    rune(0x100000), // Plane 16: Supplementary Private Use Area B (U+100000..U+10FFFD)
    55  		placeholderMaxCount: 64000,
    56  		placeholderTokenMap: map[rune]string{},
    57  		tokenPlaceholderMap: map[string]rune{},
    58  	}
    59  }
    60  
    61  // nextPlaceholder returns 0 if no more placeholder can be used
    62  // the diff is done line by line, usually there are only a few (no more than 10) placeholders in one line
    63  // so the placeholderMaxCount is impossible to be exhausted in real cases.
    64  func (hcd *highlightCodeDiff) nextPlaceholder() rune {
    65  	for hcd.placeholderIndex < hcd.placeholderMaxCount {
    66  		r := hcd.placeholderBegin + rune(hcd.placeholderIndex)
    67  		hcd.placeholderIndex++
    68  		// only use non-existing (not used by code) rune as placeholders
    69  		if _, ok := hcd.placeholderTokenMap[r]; !ok {
    70  			return r
    71  		}
    72  	}
    73  	return 0 // no more available placeholder
    74  }
    75  
    76  func (hcd *highlightCodeDiff) isInPlaceholderRange(r rune) bool {
    77  	return hcd.placeholderBegin <= r && r < hcd.placeholderBegin+rune(hcd.placeholderMaxCount)
    78  }
    79  
    80  func (hcd *highlightCodeDiff) collectUsedRunes(code string) {
    81  	for _, r := range code {
    82  		if hcd.isInPlaceholderRange(r) {
    83  			// put the existing rune (used by code) in map, then this rune won't be used a placeholder anymore.
    84  			hcd.placeholderTokenMap[r] = ""
    85  		}
    86  	}
    87  }
    88  
    89  func (hcd *highlightCodeDiff) diffWithHighlight(filename, language, codeA, codeB string) []diffmatchpatch.Diff {
    90  	hcd.collectUsedRunes(codeA)
    91  	hcd.collectUsedRunes(codeB)
    92  
    93  	highlightCodeA, _ := highlight.Code(filename, language, codeA)
    94  	highlightCodeB, _ := highlight.Code(filename, language, codeB)
    95  
    96  	convertedCodeA := hcd.convertToPlaceholders(string(highlightCodeA))
    97  	convertedCodeB := hcd.convertToPlaceholders(string(highlightCodeB))
    98  
    99  	diffs := diffMatchPatch.DiffMain(convertedCodeA, convertedCodeB, true)
   100  	diffs = diffMatchPatch.DiffCleanupEfficiency(diffs)
   101  
   102  	for i := range diffs {
   103  		hcd.recoverOneDiff(&diffs[i])
   104  	}
   105  	return diffs
   106  }
   107  
   108  // convertToPlaceholders totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes.
   109  func (hcd *highlightCodeDiff) convertToPlaceholders(htmlCode string) string {
   110  	var tagStack []string
   111  	res := strings.Builder{}
   112  
   113  	firstRunForLineTags := hcd.lineWrapperTags == nil
   114  
   115  	var beforeToken, token string
   116  	var valid bool
   117  
   118  	// the standard chroma highlight HTML is "<span class="line [hl]"><span class="cl"> ... </span></span>"
   119  	for {
   120  		beforeToken, token, htmlCode, valid = extractHTMLToken(htmlCode)
   121  		if !valid || token == "" {
   122  			break
   123  		}
   124  		// write the content before the token into result string, and consume the token in the string
   125  		res.WriteString(beforeToken)
   126  
   127  		// the line wrapper tags should be removed before diff
   128  		if strings.HasPrefix(token, `<span class="line`) || strings.HasPrefix(token, `<span class="cl"`) {
   129  			if firstRunForLineTags {
   130  				// if this is the first run for converting, save the line wrapper tags for later use, they should be added back
   131  				hcd.lineWrapperTags = append(hcd.lineWrapperTags, token)
   132  			}
   133  			htmlCode = strings.TrimSuffix(htmlCode, "</span>")
   134  			continue
   135  		}
   136  
   137  		var tokenInMap string
   138  		if strings.HasSuffix(token, "</") { // for closing tag
   139  			if len(tagStack) == 0 {
   140  				break // invalid diff result, no opening tag but see closing tag
   141  			}
   142  			// make sure the closing tag in map is related to the open tag, to make the diff algorithm can match the opening/closing tags
   143  			// the closing tag will be recorded in the map by key "</span><!-- <span the-opening> -->" for "<span the-opening>"
   144  			tokenInMap = token + "<!-- " + tagStack[len(tagStack)-1] + "-->"
   145  			tagStack = tagStack[:len(tagStack)-1]
   146  		} else if token[0] == '<' { // for opening tag
   147  			tokenInMap = token
   148  			tagStack = append(tagStack, token)
   149  		} else if token[0] == '&' { // for html entity
   150  			tokenInMap = token
   151  		} // else: impossible
   152  
   153  		// remember the placeholder and token in the map
   154  		placeholder, ok := hcd.tokenPlaceholderMap[tokenInMap]
   155  		if !ok {
   156  			placeholder = hcd.nextPlaceholder()
   157  			if placeholder != 0 {
   158  				hcd.tokenPlaceholderMap[tokenInMap] = placeholder
   159  				hcd.placeholderTokenMap[placeholder] = tokenInMap
   160  			}
   161  		}
   162  
   163  		if placeholder != 0 {
   164  			res.WriteRune(placeholder) // use the placeholder to replace the token
   165  		} else {
   166  			// unfortunately, all private use runes has been exhausted, no more placeholder could be used, no more converting
   167  			// usually, the exhausting won't occur in real cases, the magnitude of used placeholders is not larger than that of the CSS classes outputted by chroma.
   168  			hcd.placeholderOverflowCount++
   169  			if strings.HasPrefix(token, "&") {
   170  				// when the token is a html entity, something must be outputted even if there is no placeholder.
   171  				res.WriteRune(0xFFFD)      // replacement character TODO: how to handle this case more gracefully?
   172  				res.WriteString(token[1:]) // still output the entity code part, otherwise there will be no diff result.
   173  			}
   174  		}
   175  	}
   176  
   177  	// write the remaining string
   178  	res.WriteString(htmlCode)
   179  	return res.String()
   180  }
   181  
   182  func (hcd *highlightCodeDiff) recoverOneDiff(diff *diffmatchpatch.Diff) {
   183  	sb := strings.Builder{}
   184  	var tagStack []string
   185  
   186  	for _, r := range diff.Text {
   187  		token, ok := hcd.placeholderTokenMap[r]
   188  		if !ok || token == "" {
   189  			sb.WriteRune(r) // if the rune is not a placeholder, write it as it is
   190  			continue
   191  		}
   192  		var tokenToRecover string
   193  		if strings.HasPrefix(token, "</") { // for closing tag
   194  			// only get the tag itself, ignore the trailing comment (for how the comment is generated, see the code in `convert` function)
   195  			tokenToRecover = token[:strings.IndexByte(token, '>')+1]
   196  			if len(tagStack) == 0 {
   197  				continue // if no opening tag in stack yet, skip the closing tag
   198  			}
   199  			tagStack = tagStack[:len(tagStack)-1]
   200  		} else if token[0] == '<' { // for opening tag
   201  			tokenToRecover = token
   202  			tagStack = append(tagStack, token)
   203  		} else if token[0] == '&' { // for html entity
   204  			tokenToRecover = token
   205  		} // else: impossible
   206  		sb.WriteString(tokenToRecover)
   207  	}
   208  
   209  	if len(tagStack) > 0 {
   210  		// close all opening tags
   211  		for i := len(tagStack) - 1; i >= 0; i-- {
   212  			tagToClose := tagStack[i]
   213  			// get the closing tag "</span>" from "<span class=...>" or "<span>"
   214  			pos := strings.IndexAny(tagToClose, " >")
   215  			if pos != -1 {
   216  				sb.WriteString("</" + tagToClose[1:pos] + ">")
   217  			} // else: impossible. every tag was pushed into the stack by the code above and is valid HTML opening tag
   218  		}
   219  	}
   220  
   221  	diff.Text = sb.String()
   222  }