github.com/Aoi-hosizora/ahlib-more@v1.5.1-0.20230404072844-256112befaf6/xcharset/xcharset.go (about)

     1  package xcharset
     2  
     3  import (
     4  	"github.com/saintfish/chardet"
     5  )
     6  
     7  // DetectResult contains the information for charset detector. See chardet.Result.
     8  type DetectResult struct {
     9  	// Charset represents IANA or MIME name of the detected charset.
    10  	Charset string
    11  
    12  	// Language represents IANA name of the detected language. It may be empty for some charsets.
    13  	Language string
    14  
    15  	// Confidence represents the confidence of the result. Scale from 1 to 100.
    16  	Confidence int
    17  }
    18  
    19  // DetectBestCharset detects bytes and returns the charset result with the highest confidence.
    20  func DetectBestCharset(bs []byte) (*DetectResult, bool) {
    21  	detector := chardet.NewTextDetector()
    22  	result, err := detector.DetectBest(bs)
    23  	if err != nil {
    24  		return nil, false // empty result
    25  	}
    26  
    27  	return detectResultFromChardet(result), true
    28  }
    29  
    30  // DetectAllCharsets detects bytes and returns all charsets in confidence's descending order.
    31  func DetectAllCharsets(bs []byte) ([]*DetectResult, bool) {
    32  	detector := chardet.NewTextDetector()
    33  	results, err := detector.DetectAll(bs)
    34  	if err != nil {
    35  		return nil, false // empty result
    36  	}
    37  
    38  	out := make([]*DetectResult, len(results))
    39  	for idx := range results {
    40  		out[idx] = detectResultFromChardet(&results[idx])
    41  	}
    42  	return out, true
    43  }
    44  
    45  // detectResultFromChardet creates a DetectResult from chardet.Result. Note that there are some bugs in `chardet` package.
    46  func detectResultFromChardet(r *chardet.Result) *DetectResult {
    47  	charset := r.Charset
    48  	language := r.Language
    49  
    50  	switch charset {
    51  	// case "ISO-8859-1":
    52  	// 	switch language {
    53  	// 	case "cs", "hu", "pl", "ro":
    54  	// 		charset = "ISO-8859-2"
    55  	// 	}
    56  	case "GB-18030":
    57  		charset = "GB18030"
    58  	case "ISO-2022-JP":
    59  		language = "ja"
    60  		// case "ISO-2022-KR":
    61  		// 	language = "ko"
    62  		// case "ISO-2022-CN":
    63  		// 	language = "cn"
    64  	}
    65  
    66  	return &DetectResult{Charset: charset, Language: language, Confidence: r.Confidence}
    67  }