github.com/editorconfig-checker/editorconfig-checker@v0.0.0-20231102090242-ddae3e68851e/pkg/encoding/encoding.go (about) 1 // package encoding contains all the encoding functions 2 package encoding 3 4 import ( 5 "fmt" 6 "sort" 7 "strings" 8 "unicode/utf8" 9 10 "github.com/baulk/chardet" 11 "golang.org/x/text/encoding" 12 "golang.org/x/text/encoding/charmap" 13 "golang.org/x/text/encoding/japanese" 14 "golang.org/x/text/encoding/korean" 15 "golang.org/x/text/encoding/simplifiedchinese" 16 "golang.org/x/text/encoding/traditionalchinese" 17 "golang.org/x/text/encoding/unicode" 18 "golang.org/x/text/encoding/unicode/utf32" 19 ) 20 21 const BinaryData = "binary" 22 23 var encodings = map[string]encoding.Encoding{ 24 // In https://github.com/golang/text/blob/HEAD/encoding/htmlindex/map.go#L64 and 25 // https://github.com/golang/text/blob/HEAD/encoding/ianaindex/ianaindex.go#L156 : 26 "utf8": unicode.UTF8, 27 "ibm866": charmap.CodePage866, 28 "iso88592": charmap.ISO8859_2, 29 "iso88593": charmap.ISO8859_3, 30 "iso88594": charmap.ISO8859_4, 31 "iso88595": charmap.ISO8859_5, 32 "iso88596": charmap.ISO8859_6, 33 "iso88597": charmap.ISO8859_7, 34 "iso88598": charmap.ISO8859_8, 35 "iso885910": charmap.ISO8859_10, 36 "iso885913": charmap.ISO8859_13, 37 "iso885914": charmap.ISO8859_14, 38 "iso885915": charmap.ISO8859_15, 39 "iso885916": charmap.ISO8859_16, 40 "koi8r": charmap.KOI8R, 41 "koi8u": charmap.KOI8U, 42 "macintosh": charmap.Macintosh, 43 "windows874": charmap.Windows874, 44 "windows1250": charmap.Windows1250, 45 "windows1251": charmap.Windows1251, 46 "windows1252": charmap.Windows1252, 47 "windows1253": charmap.Windows1253, 48 "windows1254": charmap.Windows1254, 49 "windows1255": charmap.Windows1255, 50 "windows1256": charmap.Windows1256, 51 "windows1257": charmap.Windows1257, 52 "windows1258": charmap.Windows1258, 53 "gbk": simplifiedchinese.GBK, 54 "gb18030": simplifiedchinese.GB18030, 55 "big5": traditionalchinese.Big5, 56 "eucjp": japanese.EUCJP, 57 "iso2022jp": japanese.ISO2022JP, 58 "shiftjis": japanese.ShiftJIS, 59 "euckr": korean.EUCKR, 60 "utf16be": unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM), 61 "utf16le": unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM), 62 // Not in https://github.com/golang/text/blob/HEAD/encoding/htmlindex/map.go#L64 : 63 "iso88591": charmap.ISO8859_1, 64 "ibm037": charmap.CodePage037, 65 "ibm437": charmap.CodePage437, 66 "ibm850": charmap.CodePage850, 67 "ibm852": charmap.CodePage852, 68 "ibm855": charmap.CodePage855, 69 "ibm858": charmap.CodePage858, 70 "ibm860": charmap.CodePage860, 71 "ibm862": charmap.CodePage862, 72 "ibm863": charmap.CodePage863, 73 "ibm865": charmap.CodePage865, 74 "ibm1047": charmap.CodePage1047, 75 "ibm1140": charmap.CodePage1140, 76 "iso88596e": charmap.ISO8859_6E, 77 "iso88596i": charmap.ISO8859_6I, 78 "iso88598e": charmap.ISO8859_8E, 79 "iso88598i": charmap.ISO8859_8I, 80 "iso88599": charmap.ISO8859_9, 81 "hzgb2312": simplifiedchinese.HZGB2312, 82 // Not https://github.com/golang/text/blob/HEAD/encoding/ianaindex/ianaindex.go#L156 : 83 "macintoshcyrillic": charmap.MacintoshCyrillic, 84 // Not in https://github.com/golang/text/blob/HEAD/encoding/htmlindex/map.go#L64 or 85 // https://github.com/golang/text/blob/HEAD/encoding/ianaindex/ianaindex.go#L156 : 86 "utf8bom": unicode.UTF8, 87 "utf32be": utf32.UTF32(utf32.BigEndian, utf32.IgnoreBOM), 88 "utf32le": utf32.UTF32(utf32.LittleEndian, utf32.IgnoreBOM), 89 } 90 91 // In https://github.com/golang/text/blob/HEAD/encoding/ianaindex/ianaindex.go#L156 92 // but not included above: 93 // enc3: asciiEnc, 94 // enc1015: unicode.UTF16(unicode.BigEndian, unicode.UseBOM) 95 96 // DecodeBytes converts a byte array to a string 97 func DecodeBytes(contentBytes []byte) (string, string, error) { 98 contentString := string(contentBytes) 99 100 charset, err := detectText(contentBytes) 101 if err != nil { 102 if IsBinaryFile(contentBytes) { 103 return contentString, BinaryData, nil 104 } 105 return contentString, charset, err 106 } 107 decodedContentString, err := decodeText(contentBytes, charset) 108 if err != nil { 109 if IsBinaryFile(contentBytes) { 110 return contentString, BinaryData, nil 111 } 112 return contentString, charset, err 113 } 114 return decodedContentString, charset, nil 115 } 116 117 func detectText(contentBytes []byte) (string, error) { 118 detector := chardet.NewTextDetector() 119 results, err := detector.DetectAll(contentBytes) 120 if err != nil { 121 return "", err 122 } 123 if len(results) == 0 { 124 return "", fmt.Errorf("Failed to determine charset") 125 } 126 confidence := -1 127 keys := make([]string, 0, len(results)) 128 for _, result := range results { 129 _, ok := getEncoding(result.Charset) 130 if !ok { 131 continue 132 } 133 if result.Confidence < confidence { 134 break 135 } 136 confidence = result.Confidence 137 keys = append(keys, result.Charset) 138 } 139 sort.Strings(keys) 140 return keys[0], nil 141 } 142 143 func decodeText(contentBytes []byte, charset string) (string, error) { 144 enc, ok := getEncoding(charset) 145 if !ok { 146 return "", fmt.Errorf("unrecognized charset %s", charset) 147 } 148 var err error 149 contentBytes, err = enc.NewDecoder().Bytes(contentBytes) 150 if err != nil { 151 return "", err 152 } 153 if !utf8.Valid(contentBytes) { 154 return "", fmt.Errorf("the file is not a valid UTF-8 encoded file") 155 } 156 return string(contentBytes), nil 157 } 158 159 func getEncoding(charset string) (encoding.Encoding, bool) { 160 r := strings.NewReplacer("-", "", "_", "") 161 key := strings.ToLower(r.Replace(charset)) 162 enc, ok := encodings[key] 163 return enc, ok 164 } 165 166 var binaryChars = [256]bool{} 167 168 func init() { 169 // Allow tab (9), lf (10), ff (12), and cr (13) 170 trues := []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 11, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31} 171 for _, i := range trues { 172 binaryChars[i] = true 173 } 174 } 175 176 // IsBinaryFile returns true if the bytes contain \x00-\x08,\x0b,\x0e-\x1f 177 func IsBinaryFile(rawFileContent []byte) bool { 178 for _, b := range rawFileContent { 179 if binaryChars[b] { 180 return true 181 } 182 } 183 return false 184 }