code.gitea.io/gitea@v1.22.3/modules/charset/escape_stream.go (about) 1 // Copyright 2022 The Gitea Authors. All rights reserved. 2 // SPDX-License-Identifier: MIT 3 4 package charset 5 6 import ( 7 "fmt" 8 "regexp" 9 "strings" 10 "unicode" 11 "unicode/utf8" 12 13 "code.gitea.io/gitea/modules/translation" 14 15 "golang.org/x/net/html" 16 ) 17 18 // VScode defaultWordRegexp 19 var defaultWordRegexp = regexp.MustCompile(`(-?\d*\.\d\w*)|([^\` + "`" + `\~\!\@\#\$\%\^\&\*\(\)\-\=\+\[\{\]\}\\\|\;\:\'\"\,\.\<\>\/\?\s\x00-\x1f]+)`) 20 21 func NewEscapeStreamer(locale translation.Locale, next HTMLStreamer, allowed ...rune) HTMLStreamer { 22 allowedM := make(map[rune]bool, len(allowed)) 23 for _, v := range allowed { 24 allowedM[v] = true 25 } 26 return &escapeStreamer{ 27 escaped: &EscapeStatus{}, 28 PassthroughHTMLStreamer: *NewPassthroughStreamer(next), 29 locale: locale, 30 ambiguousTables: AmbiguousTablesForLocale(locale), 31 allowed: allowedM, 32 } 33 } 34 35 type escapeStreamer struct { 36 PassthroughHTMLStreamer 37 escaped *EscapeStatus 38 locale translation.Locale 39 ambiguousTables []*AmbiguousTable 40 allowed map[rune]bool 41 } 42 43 func (e *escapeStreamer) EscapeStatus() *EscapeStatus { 44 return e.escaped 45 } 46 47 // Text tells the next streamer there is a text 48 func (e *escapeStreamer) Text(data string) error { 49 sb := &strings.Builder{} 50 var until int 51 var next int 52 pos := 0 53 if len(data) > len(UTF8BOM) && data[:len(UTF8BOM)] == string(UTF8BOM) { 54 _, _ = sb.WriteString(data[:len(UTF8BOM)]) 55 pos = len(UTF8BOM) 56 } 57 dataBytes := []byte(data) 58 for pos < len(data) { 59 nextIdxs := defaultWordRegexp.FindStringIndex(data[pos:]) 60 if nextIdxs == nil { 61 until = len(data) 62 next = until 63 } else { 64 until, next = nextIdxs[0]+pos, nextIdxs[1]+pos 65 } 66 67 // from pos until we know that the runes are not \r\t\n or even ' ' 68 runes := make([]rune, 0, next-until) 69 positions := make([]int, 0, next-until+1) 70 71 for pos < until { 72 r, sz := utf8.DecodeRune(dataBytes[pos:]) 73 positions = positions[:0] 74 positions = append(positions, pos, pos+sz) 75 types, confusables, _ := e.runeTypes(r) 76 if err := e.handleRunes(dataBytes, []rune{r}, positions, types, confusables, sb); err != nil { 77 return err 78 } 79 pos += sz 80 } 81 82 for i := pos; i < next; { 83 r, sz := utf8.DecodeRune(dataBytes[i:]) 84 runes = append(runes, r) 85 positions = append(positions, i) 86 i += sz 87 } 88 positions = append(positions, next) 89 types, confusables, runeCounts := e.runeTypes(runes...) 90 if runeCounts.needsEscape() { 91 if err := e.handleRunes(dataBytes, runes, positions, types, confusables, sb); err != nil { 92 return err 93 } 94 } else { 95 _, _ = sb.Write(dataBytes[pos:next]) 96 } 97 pos = next 98 } 99 if sb.Len() > 0 { 100 if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil { 101 return err 102 } 103 } 104 return nil 105 } 106 107 func (e *escapeStreamer) handleRunes(data []byte, runes []rune, positions []int, types []runeType, confusables []rune, sb *strings.Builder) error { 108 for i, r := range runes { 109 switch types[i] { 110 case brokenRuneType: 111 if sb.Len() > 0 { 112 if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil { 113 return err 114 } 115 sb.Reset() 116 } 117 end := positions[i+1] 118 start := positions[i] 119 if err := e.brokenRune(data[start:end]); err != nil { 120 return err 121 } 122 case ambiguousRuneType: 123 if sb.Len() > 0 { 124 if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil { 125 return err 126 } 127 sb.Reset() 128 } 129 if err := e.ambiguousRune(r, confusables[0]); err != nil { 130 return err 131 } 132 confusables = confusables[1:] 133 case invisibleRuneType: 134 if sb.Len() > 0 { 135 if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil { 136 return err 137 } 138 sb.Reset() 139 } 140 if err := e.invisibleRune(r); err != nil { 141 return err 142 } 143 default: 144 _, _ = sb.WriteRune(r) 145 } 146 } 147 return nil 148 } 149 150 func (e *escapeStreamer) brokenRune(bs []byte) error { 151 e.escaped.Escaped = true 152 e.escaped.HasBadRunes = true 153 154 if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{ 155 Key: "class", 156 Val: "broken-code-point", 157 }); err != nil { 158 return err 159 } 160 if err := e.PassthroughHTMLStreamer.Text(fmt.Sprintf("<%X>", bs)); err != nil { 161 return err 162 } 163 164 return e.PassthroughHTMLStreamer.EndTag("span") 165 } 166 167 func (e *escapeStreamer) ambiguousRune(r, c rune) error { 168 e.escaped.Escaped = true 169 e.escaped.HasAmbiguous = true 170 171 if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{ 172 Key: "class", 173 Val: "ambiguous-code-point", 174 }, html.Attribute{ 175 Key: "data-tooltip-content", 176 Val: e.locale.TrString("repo.ambiguous_character", r, c), 177 }); err != nil { 178 return err 179 } 180 if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{ 181 Key: "class", 182 Val: "char", 183 }); err != nil { 184 return err 185 } 186 if err := e.PassthroughHTMLStreamer.Text(string(r)); err != nil { 187 return err 188 } 189 if err := e.PassthroughHTMLStreamer.EndTag("span"); err != nil { 190 return err 191 } 192 193 return e.PassthroughHTMLStreamer.EndTag("span") 194 } 195 196 func (e *escapeStreamer) invisibleRune(r rune) error { 197 e.escaped.Escaped = true 198 e.escaped.HasInvisible = true 199 200 if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{ 201 Key: "class", 202 Val: "escaped-code-point", 203 }, html.Attribute{ 204 Key: "data-escaped", 205 Val: fmt.Sprintf("[U+%04X]", r), 206 }); err != nil { 207 return err 208 } 209 if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{ 210 Key: "class", 211 Val: "char", 212 }); err != nil { 213 return err 214 } 215 if err := e.PassthroughHTMLStreamer.Text(string(r)); err != nil { 216 return err 217 } 218 if err := e.PassthroughHTMLStreamer.EndTag("span"); err != nil { 219 return err 220 } 221 222 return e.PassthroughHTMLStreamer.EndTag("span") 223 } 224 225 type runeCountType struct { 226 numBasicRunes int 227 numNonConfusingNonBasicRunes int 228 numAmbiguousRunes int 229 numInvisibleRunes int 230 numBrokenRunes int 231 } 232 233 func (counts runeCountType) needsEscape() bool { 234 if counts.numBrokenRunes > 0 { 235 return true 236 } 237 if counts.numBasicRunes == 0 && 238 counts.numNonConfusingNonBasicRunes > 0 { 239 return false 240 } 241 return counts.numAmbiguousRunes > 0 || counts.numInvisibleRunes > 0 242 } 243 244 type runeType int 245 246 const ( 247 basicASCIIRuneType runeType = iota // <- This is technically deadcode but its self-documenting so it should stay 248 brokenRuneType 249 nonBasicASCIIRuneType 250 ambiguousRuneType 251 invisibleRuneType 252 ) 253 254 func (e *escapeStreamer) runeTypes(runes ...rune) (types []runeType, confusables []rune, runeCounts runeCountType) { 255 types = make([]runeType, len(runes)) 256 for i, r := range runes { 257 var confusable rune 258 switch { 259 case r == utf8.RuneError: 260 types[i] = brokenRuneType 261 runeCounts.numBrokenRunes++ 262 case r == ' ' || r == '\t' || r == '\n': 263 runeCounts.numBasicRunes++ 264 case e.allowed[r]: 265 if r > 0x7e || r < 0x20 { 266 types[i] = nonBasicASCIIRuneType 267 runeCounts.numNonConfusingNonBasicRunes++ 268 } else { 269 runeCounts.numBasicRunes++ 270 } 271 case unicode.Is(InvisibleRanges, r): 272 types[i] = invisibleRuneType 273 runeCounts.numInvisibleRunes++ 274 case unicode.IsControl(r): 275 types[i] = invisibleRuneType 276 runeCounts.numInvisibleRunes++ 277 case isAmbiguous(r, &confusable, e.ambiguousTables...): 278 confusables = append(confusables, confusable) 279 types[i] = ambiguousRuneType 280 runeCounts.numAmbiguousRunes++ 281 case r > 0x7e || r < 0x20: 282 types[i] = nonBasicASCIIRuneType 283 runeCounts.numNonConfusingNonBasicRunes++ 284 default: 285 runeCounts.numBasicRunes++ 286 } 287 } 288 return types, confusables, runeCounts 289 }