code.gitea.io/gitea@v1.19.3/modules/charset/charset.go (about) 1 // Copyright 2014 The Gogs Authors. All rights reserved. 2 // SPDX-License-Identifier: MIT 3 4 package charset 5 6 import ( 7 "bytes" 8 "fmt" 9 "io" 10 "strings" 11 "unicode/utf8" 12 13 "code.gitea.io/gitea/modules/log" 14 "code.gitea.io/gitea/modules/setting" 15 "code.gitea.io/gitea/modules/util" 16 17 "github.com/gogs/chardet" 18 "golang.org/x/net/html/charset" 19 "golang.org/x/text/transform" 20 ) 21 22 // UTF8BOM is the utf-8 byte-order marker 23 var UTF8BOM = []byte{'\xef', '\xbb', '\xbf'} 24 25 // ToUTF8WithFallbackReader detects the encoding of content and converts to UTF-8 reader if possible 26 func ToUTF8WithFallbackReader(rd io.Reader) io.Reader { 27 buf := make([]byte, 2048) 28 n, err := util.ReadAtMost(rd, buf) 29 if err != nil { 30 return io.MultiReader(bytes.NewReader(RemoveBOMIfPresent(buf[:n])), rd) 31 } 32 33 charsetLabel, err := DetectEncoding(buf[:n]) 34 if err != nil || charsetLabel == "UTF-8" { 35 return io.MultiReader(bytes.NewReader(RemoveBOMIfPresent(buf[:n])), rd) 36 } 37 38 encoding, _ := charset.Lookup(charsetLabel) 39 if encoding == nil { 40 return io.MultiReader(bytes.NewReader(buf[:n]), rd) 41 } 42 43 return transform.NewReader( 44 io.MultiReader( 45 bytes.NewReader(RemoveBOMIfPresent(buf[:n])), 46 rd, 47 ), 48 encoding.NewDecoder(), 49 ) 50 } 51 52 // ToUTF8WithErr converts content to UTF8 encoding 53 func ToUTF8WithErr(content []byte) (string, error) { 54 charsetLabel, err := DetectEncoding(content) 55 if err != nil { 56 return "", err 57 } else if charsetLabel == "UTF-8" { 58 return string(RemoveBOMIfPresent(content)), nil 59 } 60 61 encoding, _ := charset.Lookup(charsetLabel) 62 if encoding == nil { 63 return string(content), fmt.Errorf("Unknown encoding: %s", charsetLabel) 64 } 65 66 // If there is an error, we concatenate the nicely decoded part and the 67 // original left over. This way we won't lose much data. 68 result, n, err := transform.Bytes(encoding.NewDecoder(), content) 69 if err != nil { 70 result = append(result, content[n:]...) 71 } 72 73 result = RemoveBOMIfPresent(result) 74 75 return string(result), err 76 } 77 78 // ToUTF8WithFallback detects the encoding of content and converts to UTF-8 if possible 79 func ToUTF8WithFallback(content []byte) []byte { 80 bs, _ := io.ReadAll(ToUTF8WithFallbackReader(bytes.NewReader(content))) 81 return bs 82 } 83 84 // ToUTF8 converts content to UTF8 encoding and ignore error 85 func ToUTF8(content string) string { 86 res, _ := ToUTF8WithErr([]byte(content)) 87 return res 88 } 89 90 // ToUTF8DropErrors makes sure the return string is valid utf-8; attempts conversion if possible 91 func ToUTF8DropErrors(content []byte) []byte { 92 charsetLabel, err := DetectEncoding(content) 93 if err != nil || charsetLabel == "UTF-8" { 94 return RemoveBOMIfPresent(content) 95 } 96 97 encoding, _ := charset.Lookup(charsetLabel) 98 if encoding == nil { 99 return content 100 } 101 102 // We ignore any non-decodable parts from the file. 103 // Some parts might be lost 104 var decoded []byte 105 decoder := encoding.NewDecoder() 106 idx := 0 107 for { 108 result, n, err := transform.Bytes(decoder, content[idx:]) 109 decoded = append(decoded, result...) 110 if err == nil { 111 break 112 } 113 decoded = append(decoded, ' ') 114 idx = idx + n + 1 115 if idx >= len(content) { 116 break 117 } 118 } 119 120 return RemoveBOMIfPresent(decoded) 121 } 122 123 // RemoveBOMIfPresent removes a UTF-8 BOM from a []byte 124 func RemoveBOMIfPresent(content []byte) []byte { 125 if len(content) > 2 && bytes.Equal(content[0:3], UTF8BOM) { 126 return content[3:] 127 } 128 return content 129 } 130 131 // DetectEncoding detect the encoding of content 132 func DetectEncoding(content []byte) (string, error) { 133 // First we check if the content represents valid utf8 content excepting a truncated character at the end. 134 135 // Now we could decode all the runes in turn but this is not necessarily the cheapest thing to do 136 // instead we walk backwards from the end to trim off a the incomplete character 137 toValidate := content 138 end := len(toValidate) - 1 139 140 if end < 0 { 141 // no-op 142 } else if toValidate[end]>>5 == 0b110 { 143 // Incomplete 1 byte extension e.g. © <c2><a9> which has been truncated to <c2> 144 toValidate = toValidate[:end] 145 } else if end > 0 && toValidate[end]>>6 == 0b10 && toValidate[end-1]>>4 == 0b1110 { 146 // Incomplete 2 byte extension e.g. ⛔ <e2><9b><94> which has been truncated to <e2><9b> 147 toValidate = toValidate[:end-1] 148 } else if end > 1 && toValidate[end]>>6 == 0b10 && toValidate[end-1]>>6 == 0b10 && toValidate[end-2]>>3 == 0b11110 { 149 // Incomplete 3 byte extension e.g. 💩 <f0><9f><92><a9> which has been truncated to <f0><9f><92> 150 toValidate = toValidate[:end-2] 151 } 152 if utf8.Valid(toValidate) { 153 log.Debug("Detected encoding: utf-8 (fast)") 154 return "UTF-8", nil 155 } 156 157 textDetector := chardet.NewTextDetector() 158 var detectContent []byte 159 if len(content) < 1024 { 160 // Check if original content is valid 161 if _, err := textDetector.DetectBest(content); err != nil { 162 return "", err 163 } 164 times := 1024 / len(content) 165 detectContent = make([]byte, 0, times*len(content)) 166 for i := 0; i < times; i++ { 167 detectContent = append(detectContent, content...) 168 } 169 } else { 170 detectContent = content 171 } 172 173 // Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie break 174 results, err := textDetector.DetectAll(detectContent) 175 if err != nil { 176 if err == chardet.NotDetectedError && len(setting.Repository.AnsiCharset) > 0 { 177 log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset) 178 return setting.Repository.AnsiCharset, nil 179 } 180 return "", err 181 } 182 183 topConfidence := results[0].Confidence 184 topResult := results[0] 185 priority, has := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(topResult.Charset))] 186 for _, result := range results { 187 // As results are sorted in confidence order - if we have a different confidence 188 // we know it's less than the current confidence and can break out of the loop early 189 if result.Confidence != topConfidence { 190 break 191 } 192 193 // Otherwise check if this results is earlier in the DetectedCharsetOrder than our current top guess 194 resultPriority, resultHas := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(result.Charset))] 195 if resultHas && (!has || resultPriority < priority) { 196 topResult = result 197 priority = resultPriority 198 has = true 199 } 200 } 201 202 // FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument 203 if topResult.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 { 204 log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset) 205 return setting.Repository.AnsiCharset, err 206 } 207 208 log.Debug("Detected encoding: %s", topResult.Charset) 209 return topResult.Charset, err 210 }