github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/chardet/detector.go (about) 1 // Package chardet ports character set detection from ICU. 2 package chardet 3 4 import ( 5 "errors" 6 "sort" 7 ) 8 9 // Result contains all the information that charset detector gives. 10 type Result struct { 11 // IANA name of the detected charset. 12 Charset string 13 // IANA name of the detected language. It may be empty for some charsets. 14 Language string 15 // Confidence of the Result. Scale from 1 to 100. The bigger, the more confident. 16 Confidence int 17 } 18 19 // Detector implements charset detection. 20 type Detector struct { 21 recognizers []recognizer 22 stripTag bool 23 } 24 25 // List of charset recognizers 26 var recognizers = []recognizer{ 27 newRecognizer_utf8(), 28 newRecognizer_utf16be(), 29 newRecognizer_utf16le(), 30 newRecognizer_utf32be(), 31 newRecognizer_utf32le(), 32 newRecognizer_8859_1_en(), 33 newRecognizer_8859_1_da(), 34 newRecognizer_8859_1_de(), 35 newRecognizer_8859_1_es(), 36 newRecognizer_8859_1_fr(), 37 newRecognizer_8859_1_it(), 38 newRecognizer_8859_1_nl(), 39 newRecognizer_8859_1_no(), 40 newRecognizer_8859_1_pt(), 41 newRecognizer_8859_1_sv(), 42 newRecognizer_8859_2_cs(), 43 newRecognizer_8859_2_hu(), 44 newRecognizer_8859_2_pl(), 45 newRecognizer_8859_2_ro(), 46 newRecognizer_8859_5_ru(), 47 newRecognizer_8859_6_ar(), 48 newRecognizer_8859_7_el(), 49 newRecognizer_8859_8_I_he(), 50 newRecognizer_8859_8_he(), 51 newRecognizer_windows_1251(), 52 newRecognizer_windows_1256(), 53 newRecognizer_KOI8_R(), 54 newRecognizer_8859_9_tr(), 55 56 newRecognizer_sjis(), 57 newRecognizer_gb_18030(), 58 newRecognizer_euc_jp(), 59 newRecognizer_euc_kr(), 60 newRecognizer_big5(), 61 62 newRecognizer_2022JP(), 63 newRecognizer_2022KR(), 64 newRecognizer_2022CN(), 65 66 newRecognizer_IBM424_he_rtl(), 67 newRecognizer_IBM424_he_ltr(), 68 newRecognizer_IBM420_ar_rtl(), 69 newRecognizer_IBM420_ar_ltr(), 70 } 71 72 // NewTextDetector creates 
a Detector for plain text. 73 func NewTextDetector() *Detector { 74 return &Detector{recognizers, false} 75 } 76 77 // NewHtmlDetector creates a Detector for Html. 78 func NewHtmlDetector() *Detector { 79 return &Detector{recognizers, true} 80 } 81 82 var ( 83 NotDetectedError = errors.New("Charset not detected.") 84 ) 85 86 // DetectBest returns the Result with highest Confidence. 87 func (d *Detector) DetectBest(b []byte) (r *Result, err error) { 88 var all []Result 89 if all, err = d.DetectAll(b); err == nil { 90 r = &all[0] 91 } 92 return 93 } 94 95 // DetectAll returns all Results which have non-zero Confidence. The Results are sorted by Confidence in descending order. 96 func (d *Detector) DetectAll(b []byte) ([]Result, error) { 97 input := newRecognizerInput(b, d.stripTag) 98 outputChan := make(chan recognizerOutput) 99 for _, r := range d.recognizers { 100 go matchHelper(r, input, outputChan) 101 } 102 outputs := make([]recognizerOutput, 0, len(d.recognizers)) 103 for i := 0; i < len(d.recognizers); i++ { 104 o := <-outputChan 105 if o.Confidence > 0 { 106 outputs = append(outputs, o) 107 } 108 } 109 if len(outputs) == 0 { 110 return nil, NotDetectedError 111 } 112 113 sort.Sort(recognizerOutputs(outputs)) 114 dedupOutputs := make([]Result, 0, len(outputs)) 115 foundCharsets := make(map[string]struct{}, len(outputs)) 116 for _, o := range outputs { 117 if _, found := foundCharsets[o.Charset]; !found { 118 dedupOutputs = append(dedupOutputs, Result(o)) 119 foundCharsets[o.Charset] = struct{}{} 120 } 121 } 122 if len(dedupOutputs) == 0 { 123 return nil, NotDetectedError 124 } 125 return dedupOutputs, nil 126 } 127 128 func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) { 129 outputChan <- r.Match(input) 130 } 131 132 type recognizerOutputs []recognizerOutput 133 134 func (r recognizerOutputs) Len() int { return len(r) } 135 func (r recognizerOutputs) Less(i, j int) bool { return r[i].Confidence > r[j].Confidence } 136 
// Swap exchanges the outputs stored at positions i and j.
func (r recognizerOutputs) Swap(i, j int) {
	held := r[i]
	r[i] = r[j]
	r[j] = held
}