code.gitea.io/gitea@v1.19.3/modules/charset/escape_stream.go (about) 1 // Copyright 2022 The Gitea Authors. All rights reserved. 2 // SPDX-License-Identifier: MIT 3 4 package charset 5 6 import ( 7 "fmt" 8 "regexp" 9 "strings" 10 "unicode" 11 "unicode/utf8" 12 13 "code.gitea.io/gitea/modules/translation" 14 15 "golang.org/x/net/html" 16 ) 17 18 // VScode defaultWordRegexp 19 var defaultWordRegexp = regexp.MustCompile(`(-?\d*\.\d\w*)|([^\` + "`" + `\~\!\@\#\$\%\^\&\*\(\)\-\=\+\[\{\]\}\\\|\;\:\'\"\,\.\<\>\/\?\s\x00-\x1f]+)`) 20 21 func NewEscapeStreamer(locale translation.Locale, next HTMLStreamer, allowed ...rune) HTMLStreamer { 22 allowedM := make(map[rune]bool, len(allowed)) 23 for _, v := range allowed { 24 allowedM[v] = true 25 } 26 return &escapeStreamer{ 27 escaped: &EscapeStatus{}, 28 PassthroughHTMLStreamer: *NewPassthroughStreamer(next), 29 locale: locale, 30 ambiguousTables: AmbiguousTablesForLocale(locale), 31 allowed: allowedM, 32 } 33 } 34 35 type escapeStreamer struct { 36 PassthroughHTMLStreamer 37 escaped *EscapeStatus 38 locale translation.Locale 39 ambiguousTables []*AmbiguousTable 40 allowed map[rune]bool 41 } 42 43 func (e *escapeStreamer) EscapeStatus() *EscapeStatus { 44 return e.escaped 45 } 46 47 // Text tells the next streamer there is a text 48 func (e *escapeStreamer) Text(data string) error { 49 sb := &strings.Builder{} 50 pos, until, next := 0, 0, 0 51 if len(data) > len(UTF8BOM) && data[:len(UTF8BOM)] == string(UTF8BOM) { 52 _, _ = sb.WriteString(data[:len(UTF8BOM)]) 53 pos = len(UTF8BOM) 54 } 55 dataBytes := []byte(data) 56 for pos < len(data) { 57 nextIdxs := defaultWordRegexp.FindStringIndex(data[pos:]) 58 if nextIdxs == nil { 59 until = len(data) 60 next = until 61 } else { 62 until, next = nextIdxs[0]+pos, nextIdxs[1]+pos 63 } 64 65 // from pos until until we know that the runes are not \r\t\n or even ' ' 66 runes := make([]rune, 0, next-until) 67 positions := make([]int, 0, next-until+1) 68 69 for pos < until { 70 r, sz := utf8.DecodeRune(dataBytes[pos:]) 71 positions = positions[:0] 72 positions = append(positions, pos, pos+sz) 73 types, confusables, _ := e.runeTypes(r) 74 if err := e.handleRunes(dataBytes, []rune{r}, positions, types, confusables, sb); err != nil { 75 return err 76 } 77 pos += sz 78 } 79 80 for i := pos; i < next; { 81 r, sz := utf8.DecodeRune(dataBytes[i:]) 82 runes = append(runes, r) 83 positions = append(positions, i) 84 i += sz 85 } 86 positions = append(positions, next) 87 types, confusables, runeCounts := e.runeTypes(runes...) 88 if runeCounts.needsEscape() { 89 if err := e.handleRunes(dataBytes, runes, positions, types, confusables, sb); err != nil { 90 return err 91 } 92 } else { 93 _, _ = sb.Write(dataBytes[pos:next]) 94 } 95 pos = next 96 } 97 if sb.Len() > 0 { 98 if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil { 99 return err 100 } 101 } 102 return nil 103 } 104 105 func (e *escapeStreamer) handleRunes(data []byte, runes []rune, positions []int, types []runeType, confusables []rune, sb *strings.Builder) error { 106 for i, r := range runes { 107 switch types[i] { 108 case brokenRuneType: 109 if sb.Len() > 0 { 110 if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil { 111 return err 112 } 113 sb.Reset() 114 } 115 end := positions[i+1] 116 start := positions[i] 117 if err := e.brokenRune(data[start:end]); err != nil { 118 return err 119 } 120 case ambiguousRuneType: 121 if sb.Len() > 0 { 122 if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil { 123 return err 124 } 125 sb.Reset() 126 } 127 if err := e.ambiguousRune(r, confusables[0]); err != nil { 128 return err 129 } 130 confusables = confusables[1:] 131 case invisibleRuneType: 132 if sb.Len() > 0 { 133 if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil { 134 return err 135 } 136 sb.Reset() 137 } 138 if err := e.invisibleRune(r); err != nil { 139 return err 140 } 141 default: 142 _, _ = sb.WriteRune(r) 143 } 144 } 145 return nil 146 } 147 148 func (e *escapeStreamer) brokenRune(bs []byte) error { 149 e.escaped.Escaped = true 150 e.escaped.HasBadRunes = true 151 152 if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{ 153 Key: "class", 154 Val: "broken-code-point", 155 }); err != nil { 156 return err 157 } 158 if err := e.PassthroughHTMLStreamer.Text(fmt.Sprintf("<%X>", bs)); err != nil { 159 return err 160 } 161 162 return e.PassthroughHTMLStreamer.EndTag("span") 163 } 164 165 func (e *escapeStreamer) ambiguousRune(r, c rune) error { 166 e.escaped.Escaped = true 167 e.escaped.HasAmbiguous = true 168 169 if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{ 170 Key: "class", 171 Val: "ambiguous-code-point tooltip", 172 }, html.Attribute{ 173 Key: "data-content", 174 Val: e.locale.Tr("repo.ambiguous_character", r, c), 175 }); err != nil { 176 return err 177 } 178 if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{ 179 Key: "class", 180 Val: "char", 181 }); err != nil { 182 return err 183 } 184 if err := e.PassthroughHTMLStreamer.Text(string(r)); err != nil { 185 return err 186 } 187 if err := e.PassthroughHTMLStreamer.EndTag("span"); err != nil { 188 return err 189 } 190 191 return e.PassthroughHTMLStreamer.EndTag("span") 192 } 193 194 func (e *escapeStreamer) invisibleRune(r rune) error { 195 e.escaped.Escaped = true 196 e.escaped.HasInvisible = true 197 198 if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{ 199 Key: "class", 200 Val: "escaped-code-point", 201 }, html.Attribute{ 202 Key: "data-escaped", 203 Val: fmt.Sprintf("[U+%04X]", r), 204 }); err != nil { 205 return err 206 } 207 if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{ 208 Key: "class", 209 Val: "char", 210 }); err != nil { 211 return err 212 } 213 if err := e.PassthroughHTMLStreamer.Text(string(r)); err != nil { 214 return err 215 } 216 if err := e.PassthroughHTMLStreamer.EndTag("span"); err != nil { 217 return err 218 } 219 220 return e.PassthroughHTMLStreamer.EndTag("span") 221 } 222 223 type runeCountType struct { 224 numBasicRunes int 225 numNonConfusingNonBasicRunes int 226 numAmbiguousRunes int 227 numInvisibleRunes int 228 numBrokenRunes int 229 } 230 231 func (counts runeCountType) needsEscape() bool { 232 if counts.numBrokenRunes > 0 { 233 return true 234 } 235 if counts.numBasicRunes == 0 && 236 counts.numNonConfusingNonBasicRunes > 0 { 237 return false 238 } 239 return counts.numAmbiguousRunes > 0 || counts.numInvisibleRunes > 0 240 } 241 242 type runeType int 243 244 const ( 245 basicASCIIRuneType runeType = iota // <- This is technically deadcode but its self-documenting so it should stay 246 brokenRuneType 247 nonBasicASCIIRuneType 248 ambiguousRuneType 249 invisibleRuneType 250 ) 251 252 func (e *escapeStreamer) runeTypes(runes ...rune) (types []runeType, confusables []rune, runeCounts runeCountType) { 253 types = make([]runeType, len(runes)) 254 for i, r := range runes { 255 var confusable rune 256 switch { 257 case r == utf8.RuneError: 258 types[i] = brokenRuneType 259 runeCounts.numBrokenRunes++ 260 case r == ' ' || r == '\t' || r == '\n': 261 runeCounts.numBasicRunes++ 262 case e.allowed[r]: 263 if r > 0x7e || r < 0x20 { 264 types[i] = nonBasicASCIIRuneType 265 runeCounts.numNonConfusingNonBasicRunes++ 266 } else { 267 runeCounts.numBasicRunes++ 268 } 269 case unicode.Is(InvisibleRanges, r): 270 types[i] = invisibleRuneType 271 runeCounts.numInvisibleRunes++ 272 case unicode.IsControl(r): 273 types[i] = invisibleRuneType 274 runeCounts.numInvisibleRunes++ 275 case isAmbiguous(r, &confusable, e.ambiguousTables...): 276 confusables = append(confusables, confusable) 277 types[i] = ambiguousRuneType 278 runeCounts.numAmbiguousRunes++ 279 case r > 0x7e || r < 0x20: 280 types[i] = nonBasicASCIIRuneType 281 runeCounts.numNonConfusingNonBasicRunes++ 282 default: 283 runeCounts.numBasicRunes++ 284 } 285 } 286 return types, confusables, runeCounts 287 }