// extractHTMLToken splits s around the first HTML token it contains.
// A token is either a tag ("<span ...>", "</span>") or an entity ("&lt;").
//
// Results:
//   - token found:        before/token/after are the three parts of s, valid is true
//   - no token in s:      before and token are empty, after is s, valid is true
//   - unterminated token: before and token are empty, after is s, valid is false
//     (a '<' without a following '>', or a '&' without a following ';')
func extractHTMLToken(s string) (before, token, after string, valid bool) {
	start := strings.IndexAny(s, "<&")
	if start < 0 {
		// nothing to extract, the whole input is plain text
		return "", "", s, true
	}

	// a tag ends at '>', an entity ends at ';'
	terminator := byte('>')
	if s[start] == '&' {
		terminator = ';'
	}

	length := strings.IndexByte(s[start:], terminator)
	if length < 0 {
		return "", "", s, false
	}

	end := start + length + 1
	return s[:start], s[start:end], s[end:], true
}

// highlightCodeDiff is used to do diff with highlighted HTML code.
// It totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes.
// The HTML tags and entities will be replaced by Unicode placeholders: "<span>{TEXT}</span>" => "\uE000{TEXT}\uE001"
// These Unicode placeholders are friendly to the diff.
// Then after diff, the placeholders in diff result will be recovered to the HTML tags and entities.
// It's guaranteed that the tags in final diff result are paired correctly.
40 type highlightCodeDiff struct { 41 placeholderBegin rune 42 placeholderMaxCount int 43 placeholderIndex int 44 placeholderTokenMap map[rune]string 45 tokenPlaceholderMap map[string]rune 46 47 placeholderOverflowCount int 48 49 lineWrapperTags []string 50 } 51 52 func newHighlightCodeDiff() *highlightCodeDiff { 53 return &highlightCodeDiff{ 54 placeholderBegin: rune(0x100000), // Plane 16: Supplementary Private Use Area B (U+100000..U+10FFFD) 55 placeholderMaxCount: 64000, 56 placeholderTokenMap: map[rune]string{}, 57 tokenPlaceholderMap: map[string]rune{}, 58 } 59 } 60 61 // nextPlaceholder returns 0 if no more placeholder can be used 62 // the diff is done line by line, usually there are only a few (no more than 10) placeholders in one line 63 // so the placeholderMaxCount is impossible to be exhausted in real cases. 64 func (hcd *highlightCodeDiff) nextPlaceholder() rune { 65 for hcd.placeholderIndex < hcd.placeholderMaxCount { 66 r := hcd.placeholderBegin + rune(hcd.placeholderIndex) 67 hcd.placeholderIndex++ 68 // only use non-existing (not used by code) rune as placeholders 69 if _, ok := hcd.placeholderTokenMap[r]; !ok { 70 return r 71 } 72 } 73 return 0 // no more available placeholder 74 } 75 76 func (hcd *highlightCodeDiff) isInPlaceholderRange(r rune) bool { 77 return hcd.placeholderBegin <= r && r < hcd.placeholderBegin+rune(hcd.placeholderMaxCount) 78 } 79 80 func (hcd *highlightCodeDiff) collectUsedRunes(code string) { 81 for _, r := range code { 82 if hcd.isInPlaceholderRange(r) { 83 // put the existing rune (used by code) in map, then this rune won't be used a placeholder anymore. 
84 hcd.placeholderTokenMap[r] = "" 85 } 86 } 87 } 88 89 func (hcd *highlightCodeDiff) diffWithHighlight(filename, language, codeA, codeB string) []diffmatchpatch.Diff { 90 hcd.collectUsedRunes(codeA) 91 hcd.collectUsedRunes(codeB) 92 93 highlightCodeA, _ := highlight.Code(filename, language, codeA) 94 highlightCodeB, _ := highlight.Code(filename, language, codeB) 95 96 convertedCodeA := hcd.convertToPlaceholders(string(highlightCodeA)) 97 convertedCodeB := hcd.convertToPlaceholders(string(highlightCodeB)) 98 99 diffs := diffMatchPatch.DiffMain(convertedCodeA, convertedCodeB, true) 100 diffs = diffMatchPatch.DiffCleanupEfficiency(diffs) 101 102 for i := range diffs { 103 hcd.recoverOneDiff(&diffs[i]) 104 } 105 return diffs 106 } 107 108 // convertToPlaceholders totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes. 109 func (hcd *highlightCodeDiff) convertToPlaceholders(htmlCode string) string { 110 var tagStack []string 111 res := strings.Builder{} 112 113 firstRunForLineTags := hcd.lineWrapperTags == nil 114 115 var beforeToken, token string 116 var valid bool 117 118 // the standard chroma highlight HTML is "<span class="line [hl]"><span class="cl"> ... 
</span></span>" 119 for { 120 beforeToken, token, htmlCode, valid = extractHTMLToken(htmlCode) 121 if !valid || token == "" { 122 break 123 } 124 // write the content before the token into result string, and consume the token in the string 125 res.WriteString(beforeToken) 126 127 // the line wrapper tags should be removed before diff 128 if strings.HasPrefix(token, `<span class="line`) || strings.HasPrefix(token, `<span class="cl"`) { 129 if firstRunForLineTags { 130 // if this is the first run for converting, save the line wrapper tags for later use, they should be added back 131 hcd.lineWrapperTags = append(hcd.lineWrapperTags, token) 132 } 133 htmlCode = strings.TrimSuffix(htmlCode, "</span>") 134 continue 135 } 136 137 var tokenInMap string 138 if strings.HasSuffix(token, "</") { // for closing tag 139 if len(tagStack) == 0 { 140 break // invalid diff result, no opening tag but see closing tag 141 } 142 // make sure the closing tag in map is related to the open tag, to make the diff algorithm can match the opening/closing tags 143 // the closing tag will be recorded in the map by key "</span><!-- <span the-opening> -->" for "<span the-opening>" 144 tokenInMap = token + "<!-- " + tagStack[len(tagStack)-1] + "-->" 145 tagStack = tagStack[:len(tagStack)-1] 146 } else if token[0] == '<' { // for opening tag 147 tokenInMap = token 148 tagStack = append(tagStack, token) 149 } else if token[0] == '&' { // for html entity 150 tokenInMap = token 151 } // else: impossible 152 153 // remember the placeholder and token in the map 154 placeholder, ok := hcd.tokenPlaceholderMap[tokenInMap] 155 if !ok { 156 placeholder = hcd.nextPlaceholder() 157 if placeholder != 0 { 158 hcd.tokenPlaceholderMap[tokenInMap] = placeholder 159 hcd.placeholderTokenMap[placeholder] = tokenInMap 160 } 161 } 162 163 if placeholder != 0 { 164 res.WriteRune(placeholder) // use the placeholder to replace the token 165 } else { 166 // unfortunately, all private use runes has been exhausted, no more 
placeholder could be used, no more converting 167 // usually, the exhausting won't occur in real cases, the magnitude of used placeholders is not larger than that of the CSS classes outputted by chroma. 168 hcd.placeholderOverflowCount++ 169 if strings.HasPrefix(token, "&") { 170 // when the token is a html entity, something must be outputted even if there is no placeholder. 171 res.WriteRune(0xFFFD) // replacement character TODO: how to handle this case more gracefully? 172 res.WriteString(token[1:]) // still output the entity code part, otherwise there will be no diff result. 173 } 174 } 175 } 176 177 // write the remaining string 178 res.WriteString(htmlCode) 179 return res.String() 180 } 181 182 func (hcd *highlightCodeDiff) recoverOneDiff(diff *diffmatchpatch.Diff) { 183 sb := strings.Builder{} 184 var tagStack []string 185 186 for _, r := range diff.Text { 187 token, ok := hcd.placeholderTokenMap[r] 188 if !ok || token == "" { 189 sb.WriteRune(r) // if the rune is not a placeholder, write it as it is 190 continue 191 } 192 var tokenToRecover string 193 if strings.HasPrefix(token, "</") { // for closing tag 194 // only get the tag itself, ignore the trailing comment (for how the comment is generated, see the code in `convert` function) 195 tokenToRecover = token[:strings.IndexByte(token, '>')+1] 196 if len(tagStack) == 0 { 197 continue // if no opening tag in stack yet, skip the closing tag 198 } 199 tagStack = tagStack[:len(tagStack)-1] 200 } else if token[0] == '<' { // for opening tag 201 tokenToRecover = token 202 tagStack = append(tagStack, token) 203 } else if token[0] == '&' { // for html entity 204 tokenToRecover = token 205 } // else: impossible 206 sb.WriteString(tokenToRecover) 207 } 208 209 if len(tagStack) > 0 { 210 // close all opening tags 211 for i := len(tagStack) - 1; i >= 0; i-- { 212 tagToClose := tagStack[i] 213 // get the closing tag "</span>" from "<span class=...>" or "<span>" 214 pos := strings.IndexAny(tagToClose, " >") 215 if pos != 
-1 { 216 sb.WriteString("</" + tagToClose[1:pos] + ">") 217 } // else: impossible. every tag was pushed into the stack by the code above and is valid HTML opening tag 218 } 219 } 220 221 diff.Text = sb.String() 222 }