github.com/editorconfig-checker/editorconfig-checker@v0.0.0-20231102090242-ddae3e68851e/pkg/encoding/encoding.go (about)

     1  // Package encoding detects the character encoding of raw file content and decodes it into a string.
     2  package encoding
     3  
     4  import (
     5  	"fmt"
     6  	"sort"
     7  	"strings"
     8  	"unicode/utf8"
     9  
    10  	"github.com/baulk/chardet"
    11  	"golang.org/x/text/encoding"
    12  	"golang.org/x/text/encoding/charmap"
    13  	"golang.org/x/text/encoding/japanese"
    14  	"golang.org/x/text/encoding/korean"
    15  	"golang.org/x/text/encoding/simplifiedchinese"
    16  	"golang.org/x/text/encoding/traditionalchinese"
    17  	"golang.org/x/text/encoding/unicode"
    18  	"golang.org/x/text/encoding/unicode/utf32"
    19  )
    20  
// BinaryData is the charset name DecodeBytes reports when charset detection
// or decoding fails and IsBinaryFile classifies the content as binary.
const BinaryData = "binary"
    22  
// encodings maps a normalized charset name to the encoding used to decode it.
// Keys are in the form produced by getEncoding: lower-case, with every "-"
// and "_" removed (e.g. "ISO-8859-1" is looked up as "iso88591").
var encodings = map[string]encoding.Encoding{
	// In https://github.com/golang/text/blob/HEAD/encoding/htmlindex/map.go#L64 and
	//    https://github.com/golang/text/blob/HEAD/encoding/ianaindex/ianaindex.go#L156 :
	"utf8":        unicode.UTF8,
	"ibm866":      charmap.CodePage866,
	"iso88592":    charmap.ISO8859_2,
	"iso88593":    charmap.ISO8859_3,
	"iso88594":    charmap.ISO8859_4,
	"iso88595":    charmap.ISO8859_5,
	"iso88596":    charmap.ISO8859_6,
	"iso88597":    charmap.ISO8859_7,
	"iso88598":    charmap.ISO8859_8,
	"iso885910":   charmap.ISO8859_10,
	"iso885913":   charmap.ISO8859_13,
	"iso885914":   charmap.ISO8859_14,
	"iso885915":   charmap.ISO8859_15,
	"iso885916":   charmap.ISO8859_16,
	"koi8r":       charmap.KOI8R,
	"koi8u":       charmap.KOI8U,
	"macintosh":   charmap.Macintosh,
	"windows874":  charmap.Windows874,
	"windows1250": charmap.Windows1250,
	"windows1251": charmap.Windows1251,
	"windows1252": charmap.Windows1252,
	"windows1253": charmap.Windows1253,
	"windows1254": charmap.Windows1254,
	"windows1255": charmap.Windows1255,
	"windows1256": charmap.Windows1256,
	"windows1257": charmap.Windows1257,
	"windows1258": charmap.Windows1258,
	"gbk":         simplifiedchinese.GBK,
	"gb18030":     simplifiedchinese.GB18030,
	"big5":        traditionalchinese.Big5,
	"eucjp":       japanese.EUCJP,
	"iso2022jp":   japanese.ISO2022JP,
	"shiftjis":    japanese.ShiftJIS,
	"euckr":       korean.EUCKR,
	"utf16be":     unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM),
	"utf16le":     unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM),
	// Not in https://github.com/golang/text/blob/HEAD/encoding/htmlindex/map.go#L64 :
	"iso88591":  charmap.ISO8859_1,
	"ibm037":    charmap.CodePage037,
	"ibm437":    charmap.CodePage437,
	"ibm850":    charmap.CodePage850,
	"ibm852":    charmap.CodePage852,
	"ibm855":    charmap.CodePage855,
	"ibm858":    charmap.CodePage858,
	"ibm860":    charmap.CodePage860,
	"ibm862":    charmap.CodePage862,
	"ibm863":    charmap.CodePage863,
	"ibm865":    charmap.CodePage865,
	"ibm1047":   charmap.CodePage1047,
	"ibm1140":   charmap.CodePage1140,
	"iso88596e": charmap.ISO8859_6E,
	"iso88596i": charmap.ISO8859_6I,
	"iso88598e": charmap.ISO8859_8E,
	"iso88598i": charmap.ISO8859_8I,
	"iso88599":  charmap.ISO8859_9,
	"hzgb2312":  simplifiedchinese.HZGB2312,
	// Not in https://github.com/golang/text/blob/HEAD/encoding/ianaindex/ianaindex.go#L156 :
	"macintoshcyrillic": charmap.MacintoshCyrillic,
	// Not in https://github.com/golang/text/blob/HEAD/encoding/htmlindex/map.go#L64 or
	//        https://github.com/golang/text/blob/HEAD/encoding/ianaindex/ianaindex.go#L156 :
	// NOTE(review): "utf8bom" resolves to the same encoding as "utf8", so a
	// leading byte order mark is presumably left in the decoded text — confirm
	// that this is intended.
	"utf8bom": unicode.UTF8,
	"utf32be": utf32.UTF32(utf32.BigEndian, utf32.IgnoreBOM),
	"utf32le": utf32.UTF32(utf32.LittleEndian, utf32.IgnoreBOM),
}
    90  
    91  // In https://github.com/golang/text/blob/HEAD/encoding/ianaindex/ianaindex.go#L156
    92  // but not included above:
    93  // 	 enc3:    asciiEnc,
    94  //   enc1015: unicode.UTF16(unicode.BigEndian, unicode.UseBOM)
    95  
    96  // DecodeBytes converts a byte array to a string
    97  func DecodeBytes(contentBytes []byte) (string, string, error) {
    98  	contentString := string(contentBytes)
    99  
   100  	charset, err := detectText(contentBytes)
   101  	if err != nil {
   102  		if IsBinaryFile(contentBytes) {
   103  			return contentString, BinaryData, nil
   104  		}
   105  		return contentString, charset, err
   106  	}
   107  	decodedContentString, err := decodeText(contentBytes, charset)
   108  	if err != nil {
   109  		if IsBinaryFile(contentBytes) {
   110  			return contentString, BinaryData, nil
   111  		}
   112  		return contentString, charset, err
   113  	}
   114  	return decodedContentString, charset, nil
   115  }
   116  
   117  func detectText(contentBytes []byte) (string, error) {
   118  	detector := chardet.NewTextDetector()
   119  	results, err := detector.DetectAll(contentBytes)
   120  	if err != nil {
   121  		return "", err
   122  	}
   123  	if len(results) == 0 {
   124  		return "", fmt.Errorf("Failed to determine charset")
   125  	}
   126  	confidence := -1
   127  	keys := make([]string, 0, len(results))
   128  	for _, result := range results {
   129  		_, ok := getEncoding(result.Charset)
   130  		if !ok {
   131  			continue
   132  		}
   133  		if result.Confidence < confidence {
   134  			break
   135  		}
   136  		confidence = result.Confidence
   137  		keys = append(keys, result.Charset)
   138  	}
   139  	sort.Strings(keys)
   140  	return keys[0], nil
   141  }
   142  
   143  func decodeText(contentBytes []byte, charset string) (string, error) {
   144  	enc, ok := getEncoding(charset)
   145  	if !ok {
   146  		return "", fmt.Errorf("unrecognized charset %s", charset)
   147  	}
   148  	var err error
   149  	contentBytes, err = enc.NewDecoder().Bytes(contentBytes)
   150  	if err != nil {
   151  		return "", err
   152  	}
   153  	if !utf8.Valid(contentBytes) {
   154  		return "", fmt.Errorf("the file is not a valid UTF-8 encoded file")
   155  	}
   156  	return string(contentBytes), nil
   157  }
   158  
   159  func getEncoding(charset string) (encoding.Encoding, bool) {
   160  	r := strings.NewReplacer("-", "", "_", "")
   161  	key := strings.ToLower(r.Replace(charset))
   162  	enc, ok := encodings[key]
   163  	return enc, ok
   164  }
   165  
   166  var binaryChars = [256]bool{}
   167  
   168  func init() {
   169  	// Allow tab (9), lf (10), ff (12), and cr (13)
   170  	trues := []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 11, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}
   171  	for _, i := range trues {
   172  		binaryChars[i] = true
   173  	}
   174  }
   175  
   176  // IsBinaryFile returns true if the bytes contain \x00-\x08,\x0b,\x0e-\x1f
   177  func IsBinaryFile(rawFileContent []byte) bool {
   178  	for _, b := range rawFileContent {
   179  		if binaryChars[b] {
   180  			return true
   181  		}
   182  	}
   183  	return false
   184  }