github.com/Aoi-hosizora/ahlib-more@v1.5.1-0.20230404072844-256112befaf6/xcharset/xcharset.go (about) 1 package xcharset 2 3 import ( 4 "github.com/saintfish/chardet" 5 ) 6 7 // DetectResult contains the information for charset detector. See chardet.Result. 8 type DetectResult struct { 9 // Charset represents IANA or MIME name of the detected charset. 10 Charset string 11 12 // Language represents IANA name of the detected language. It may be empty for some charsets. 13 Language string 14 15 // Confidence represents the confidence of the result. Scale from 1 to 100. 16 Confidence int 17 } 18 19 // DetectBestCharset detects bytes and returns the charset result with the highest confidence. 20 func DetectBestCharset(bs []byte) (*DetectResult, bool) { 21 detector := chardet.NewTextDetector() 22 result, err := detector.DetectBest(bs) 23 if err != nil { 24 return nil, false // empty result 25 } 26 27 return detectResultFromChardet(result), true 28 } 29 30 // DetectAllCharsets detects bytes and returns all charsets in confidence's descending order. 31 func DetectAllCharsets(bs []byte) ([]*DetectResult, bool) { 32 detector := chardet.NewTextDetector() 33 results, err := detector.DetectAll(bs) 34 if err != nil { 35 return nil, false // empty result 36 } 37 38 out := make([]*DetectResult, len(results)) 39 for idx := range results { 40 out[idx] = detectResultFromChardet(&results[idx]) 41 } 42 return out, true 43 } 44 45 // detectResultFromChardet creates a DetectResult from chardet.Result. Note that there are some bugs in `chardet` package. 46 func detectResultFromChardet(r *chardet.Result) *DetectResult { 47 charset := r.Charset 48 language := r.Language 49 50 switch charset { 51 // case "ISO-8859-1": 52 // switch language { 53 // case "cs", "hu", "pl", "ro": 54 // charset = "ISO-8859-2" 55 // } 56 case "GB-18030": 57 charset = "GB18030" 58 case "ISO-2022-JP": 59 language = "ja" 60 // case "ISO-2022-KR": 61 // language = "ko" 62 // case "ISO-2022-CN": 63 // language = "cn" 64 } 65 66 return &DetectResult{Charset: charset, Language: language, Confidence: r.Confidence} 67 }