github.com/gitbundle/modules@v0.0.0-20231025071548-85b91c5c3b01/charset/charset.go (about) 1 // Copyright 2023 The GitBundle Inc. All rights reserved. 2 // Copyright 2017 The Gitea Authors. All rights reserved. 3 // Use of this source code is governed by a MIT-style 4 // license that can be found in the LICENSE file. 5 6 // Copyright 2014 The Gogs Authors. All rights reserved. 7 // Use of this source code is governed by a MIT-style 8 // license that can be found in the LICENSE file. 9 10 package charset 11 12 import ( 13 "bytes" 14 "fmt" 15 "io" 16 "strings" 17 "unicode/utf8" 18 19 "github.com/gitbundle/modules/log" 20 "github.com/gitbundle/modules/setting" 21 "github.com/gitbundle/modules/util" 22 23 "github.com/gogs/chardet" 24 "golang.org/x/net/html/charset" 25 "golang.org/x/text/transform" 26 ) 27 28 // UTF8BOM is the utf-8 byte-order marker 29 var UTF8BOM = []byte{'\xef', '\xbb', '\xbf'} 30 31 // ToUTF8WithFallbackReader detects the encoding of content and coverts to UTF-8 reader if possible 32 func ToUTF8WithFallbackReader(rd io.Reader) io.Reader { 33 buf := make([]byte, 2048) 34 n, err := util.ReadAtMost(rd, buf) 35 if err != nil { 36 return io.MultiReader(bytes.NewReader(RemoveBOMIfPresent(buf[:n])), rd) 37 } 38 39 charsetLabel, err := DetectEncoding(buf[:n]) 40 if err != nil || charsetLabel == "UTF-8" { 41 return io.MultiReader(bytes.NewReader(RemoveBOMIfPresent(buf[:n])), rd) 42 } 43 44 encoding, _ := charset.Lookup(charsetLabel) 45 if encoding == nil { 46 return io.MultiReader(bytes.NewReader(buf[:n]), rd) 47 } 48 49 return transform.NewReader( 50 io.MultiReader( 51 bytes.NewReader(RemoveBOMIfPresent(buf[:n])), 52 rd, 53 ), 54 encoding.NewDecoder(), 55 ) 56 } 57 58 // ToUTF8WithErr converts content to UTF8 encoding 59 func ToUTF8WithErr(content []byte) (string, error) { 60 charsetLabel, err := DetectEncoding(content) 61 if err != nil { 62 return "", err 63 } else if charsetLabel == "UTF-8" { 64 return string(RemoveBOMIfPresent(content)), nil 65 } 66 67 encoding, _ := charset.Lookup(charsetLabel) 68 if encoding == nil { 69 return string(content), fmt.Errorf("Unknown encoding: %s", charsetLabel) 70 } 71 72 // If there is an error, we concatenate the nicely decoded part and the 73 // original left over. This way we won't lose much data. 74 result, n, err := transform.Bytes(encoding.NewDecoder(), content) 75 if err != nil { 76 result = append(result, content[n:]...) 77 } 78 79 result = RemoveBOMIfPresent(result) 80 81 return string(result), err 82 } 83 84 // ToUTF8WithFallback detects the encoding of content and coverts to UTF-8 if possible 85 func ToUTF8WithFallback(content []byte) []byte { 86 bs, _ := io.ReadAll(ToUTF8WithFallbackReader(bytes.NewReader(content))) 87 return bs 88 } 89 90 // ToUTF8 converts content to UTF8 encoding and ignore error 91 func ToUTF8(content string) string { 92 res, _ := ToUTF8WithErr([]byte(content)) 93 return res 94 } 95 96 // ToUTF8DropErrors makes sure the return string is valid utf-8; attempts conversion if possible 97 func ToUTF8DropErrors(content []byte) []byte { 98 charsetLabel, err := DetectEncoding(content) 99 if err != nil || charsetLabel == "UTF-8" { 100 return RemoveBOMIfPresent(content) 101 } 102 103 encoding, _ := charset.Lookup(charsetLabel) 104 if encoding == nil { 105 return content 106 } 107 108 // We ignore any non-decodable parts from the file. 109 // Some parts might be lost 110 var decoded []byte 111 decoder := encoding.NewDecoder() 112 idx := 0 113 for { 114 result, n, err := transform.Bytes(decoder, content[idx:]) 115 decoded = append(decoded, result...) 116 if err == nil { 117 break 118 } 119 decoded = append(decoded, ' ') 120 idx = idx + n + 1 121 if idx >= len(content) { 122 break 123 } 124 } 125 126 return RemoveBOMIfPresent(decoded) 127 } 128 129 // RemoveBOMIfPresent removes a UTF-8 BOM from a []byte 130 func RemoveBOMIfPresent(content []byte) []byte { 131 if len(content) > 2 && bytes.Equal(content[0:3], UTF8BOM) { 132 return content[3:] 133 } 134 return content 135 } 136 137 // DetectEncoding detect the encoding of content 138 func DetectEncoding(content []byte) (string, error) { 139 // First we check if the content represents valid utf8 content excepting a truncated character at the end. 140 141 // Now we could decode all the runes in turn but this is not necessarily the cheapest thing to do 142 // instead we walk backwards from the end to trim off a the incomplete character 143 toValidate := content 144 end := len(toValidate) - 1 145 146 if end < 0 { 147 // no-op 148 } else if toValidate[end]>>5 == 0b110 { 149 // Incomplete 1 byte extension e.g. © <c2><a9> which has been truncated to <c2> 150 toValidate = toValidate[:end] 151 } else if end > 0 && toValidate[end]>>6 == 0b10 && toValidate[end-1]>>4 == 0b1110 { 152 // Incomplete 2 byte extension e.g. ⛔ <e2><9b><94> which has been truncated to <e2><9b> 153 toValidate = toValidate[:end-1] 154 } else if end > 1 && toValidate[end]>>6 == 0b10 && toValidate[end-1]>>6 == 0b10 && toValidate[end-2]>>3 == 0b11110 { 155 // Incomplete 3 byte extension e.g. 💩 <f0><9f><92><a9> which has been truncated to <f0><9f><92> 156 toValidate = toValidate[:end-2] 157 } 158 if utf8.Valid(toValidate) { 159 // NOTE: remove trace log 160 // log.Trace("Detected encoding: utf-8 (fast)") 161 return "UTF-8", nil 162 } 163 164 textDetector := chardet.NewTextDetector() 165 var detectContent []byte 166 if len(content) < 1024 { 167 // Check if original content is valid 168 if _, err := textDetector.DetectBest(content); err != nil { 169 return "", err 170 } 171 times := 1024 / len(content) 172 detectContent = make([]byte, 0, times*len(content)) 173 for i := 0; i < times; i++ { 174 detectContent = append(detectContent, content...) 175 } 176 } else { 177 detectContent = content 178 } 179 180 // Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie break 181 results, err := textDetector.DetectAll(detectContent) 182 if err != nil { 183 if err == chardet.NotDetectedError && len(setting.Repository.AnsiCharset) > 0 { 184 log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset) 185 return setting.Repository.AnsiCharset, nil 186 } 187 return "", err 188 } 189 190 topConfidence := results[0].Confidence 191 topResult := results[0] 192 priority, has := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(topResult.Charset))] 193 for _, result := range results { 194 // As results are sorted in confidence order - if we have a different confidence 195 // we know it's less than the current confidence and can break out of the loop early 196 if result.Confidence != topConfidence { 197 break 198 } 199 200 // Otherwise check if this results is earlier in the DetectedCharsetOrder than our current top guesss 201 resultPriority, resultHas := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(result.Charset))] 202 if resultHas && (!has || resultPriority < priority) { 203 topResult = result 204 priority = resultPriority 205 has = true 206 } 207 } 208 209 // FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument 210 if topResult.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 { 211 log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset) 212 return setting.Repository.AnsiCharset, err 213 } 214 215 log.Debug("Detected encoding: %s", topResult.Charset) 216 return topResult.Charset, err 217 }