github.com/gitbundle/modules@v0.0.0-20231025071548-85b91c5c3b01/charset/escape.go (about) 1 // Copyright 2023 The GitBundle Inc. All rights reserved. 2 // Copyright 2017 The Gitea Authors. All rights reserved. 3 // Use of this source code is governed by a MIT-style 4 // license that can be found in the LICENSE file. 5 6 package charset 7 8 import ( 9 "bytes" 10 "fmt" 11 "io" 12 "strings" 13 "unicode" 14 "unicode/utf8" 15 16 "golang.org/x/text/unicode/bidi" 17 ) 18 19 // EscapeStatus represents the findings of the unicode escaper 20 type EscapeStatus struct { 21 Escaped bool 22 HasError bool 23 HasBadRunes bool 24 HasControls bool 25 HasSpaces bool 26 HasMarks bool 27 HasBIDI bool 28 BadBIDI bool 29 HasRTLScript bool 30 HasLTRScript bool 31 } 32 33 // Or combines two EscapeStatus structs into one representing the conjunction of the two 34 func (status EscapeStatus) Or(other EscapeStatus) EscapeStatus { 35 st := status 36 st.Escaped = st.Escaped || other.Escaped 37 st.HasError = st.HasError || other.HasError 38 st.HasBadRunes = st.HasBadRunes || other.HasBadRunes 39 st.HasControls = st.HasControls || other.HasControls 40 st.HasSpaces = st.HasSpaces || other.HasSpaces 41 st.HasMarks = st.HasMarks || other.HasMarks 42 st.HasBIDI = st.HasBIDI || other.HasBIDI 43 st.BadBIDI = st.BadBIDI || other.BadBIDI 44 st.HasRTLScript = st.HasRTLScript || other.HasRTLScript 45 st.HasLTRScript = st.HasLTRScript || other.HasLTRScript 46 return st 47 } 48 49 // EscapeControlString escapes the unicode control sequences in a provided string and returns the findings as an EscapeStatus and the escaped string 50 func EscapeControlString(text string) (EscapeStatus, string) { 51 sb := &strings.Builder{} 52 escaped, _ := EscapeControlReader(strings.NewReader(text), sb) 53 return escaped, sb.String() 54 } 55 56 // EscapeControlBytes escapes the unicode control sequences a provided []byte and returns the findings as an EscapeStatus and the escaped []byte 57 func EscapeControlBytes(text []byte) (EscapeStatus, []byte) { 58 buf := &bytes.Buffer{} 59 escaped, _ := EscapeControlReader(bytes.NewReader(text), buf) 60 return escaped, buf.Bytes() 61 } 62 63 // EscapeControlReader escapes the unicode control sequences a provided Reader writing the escaped output to the output and returns the findings as an EscapeStatus and an error 64 func EscapeControlReader(text io.Reader, output io.Writer) (escaped EscapeStatus, err error) { 65 buf := make([]byte, 4096) 66 readStart := 0 67 runeCount := 0 68 var n int 69 var writePos int 70 71 lineHasBIDI := false 72 lineHasRTLScript := false 73 lineHasLTRScript := false 74 75 readingloop: 76 for err == nil { 77 n, err = text.Read(buf[readStart:]) 78 bs := buf[:n+readStart] 79 n = len(bs) 80 i := 0 81 82 for i < len(bs) { 83 r, size := utf8.DecodeRune(bs[i:]) 84 runeCount++ 85 86 // Now handle the codepoints 87 switch { 88 case r == utf8.RuneError: 89 if writePos < i { 90 if _, err = output.Write(bs[writePos:i]); err != nil { 91 escaped.HasError = true 92 return 93 } 94 writePos = i 95 } 96 // runes can be at most 4 bytes - so... 97 if len(bs)-i <= 3 { 98 // if not request more data 99 copy(buf, bs[i:]) 100 readStart = n - i 101 writePos = 0 102 continue readingloop 103 } 104 // this is a real broken rune 105 escaped.HasBadRunes = true 106 escaped.Escaped = true 107 if err = writeBroken(output, bs[i:i+size]); err != nil { 108 escaped.HasError = true 109 return 110 } 111 writePos += size 112 case r == '\n': 113 if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript { 114 escaped.BadBIDI = true 115 } 116 lineHasBIDI = false 117 lineHasRTLScript = false 118 lineHasLTRScript = false 119 120 case runeCount == 1 && r == 0xFEFF: // UTF BOM 121 // the first BOM is safe 122 case r == '\r' || r == '\t' || r == ' ': 123 // These are acceptable control characters and space characters 124 case unicode.IsSpace(r): 125 escaped.HasSpaces = true 126 escaped.Escaped = true 127 if writePos < i { 128 if _, err = output.Write(bs[writePos:i]); err != nil { 129 escaped.HasError = true 130 return 131 } 132 } 133 if err = writeEscaped(output, r); err != nil { 134 escaped.HasError = true 135 return 136 } 137 writePos = i + size 138 case unicode.Is(unicode.Bidi_Control, r): 139 escaped.Escaped = true 140 escaped.HasBIDI = true 141 if writePos < i { 142 if _, err = output.Write(bs[writePos:i]); err != nil { 143 escaped.HasError = true 144 return 145 } 146 } 147 lineHasBIDI = true 148 if err = writeEscaped(output, r); err != nil { 149 escaped.HasError = true 150 return 151 } 152 writePos = i + size 153 case unicode.Is(unicode.C, r): 154 escaped.Escaped = true 155 escaped.HasControls = true 156 if writePos < i { 157 if _, err = output.Write(bs[writePos:i]); err != nil { 158 escaped.HasError = true 159 return 160 } 161 } 162 if err = writeEscaped(output, r); err != nil { 163 escaped.HasError = true 164 return 165 } 166 writePos = i + size 167 case unicode.Is(unicode.M, r): 168 escaped.Escaped = true 169 escaped.HasMarks = true 170 if writePos < i { 171 if _, err = output.Write(bs[writePos:i]); err != nil { 172 escaped.HasError = true 173 return 174 } 175 } 176 if err = writeEscaped(output, r); err != nil { 177 escaped.HasError = true 178 return 179 } 180 writePos = i + size 181 default: 182 p, _ := bidi.Lookup(bs[i : i+size]) 183 c := p.Class() 184 if c == bidi.R || c == bidi.AL { 185 lineHasRTLScript = true 186 escaped.HasRTLScript = true 187 } else if c == bidi.L { 188 lineHasLTRScript = true 189 escaped.HasLTRScript = true 190 } 191 } 192 i += size 193 } 194 if n > 0 { 195 // we read something... 196 // write everything unwritten 197 if writePos < i { 198 if _, err = output.Write(bs[writePos:i]); err != nil { 199 escaped.HasError = true 200 return 201 } 202 } 203 204 // reset the starting positions for the next read 205 readStart = 0 206 writePos = 0 207 } 208 } 209 if readStart > 0 { 210 // this means that there is an incomplete or broken rune at 0-readStart and we read nothing on the last go round 211 escaped.Escaped = true 212 escaped.HasBadRunes = true 213 if err = writeBroken(output, buf[:readStart]); err != nil { 214 escaped.HasError = true 215 return 216 } 217 } 218 if err == io.EOF { 219 if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript { 220 escaped.BadBIDI = true 221 } 222 err = nil 223 return 224 } 225 escaped.HasError = true 226 return 227 } 228 229 func writeBroken(output io.Writer, bs []byte) (err error) { 230 _, err = fmt.Fprintf(output, `<span class="broken-code-point"><%X></span>`, bs) 231 return 232 } 233 234 func writeEscaped(output io.Writer, r rune) (err error) { 235 _, err = fmt.Fprintf(output, `<span class="escaped-code-point" data-escaped="[U+%04X]"><span class="char">%c</span></span>`, r, r) 236 return 237 }