github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/chardet/detector.go (about)

     1  // Package chardet ports character set detection from ICU.
     2  package chardet
     3  
     4  import (
     5  	"errors"
     6  	"sort"
     7  )
     8  
     9  // Result contains all the information that charset detector gives.
    10  type Result struct {
    11  	// IANA name of the detected charset.
    12  	Charset string
    13  	// IANA name of the detected language. It may be empty for some charsets.
    14  	Language string
    15  	// Confidence of the Result. Scale from 1 to 100. The bigger, the more confident.
    16  	Confidence int
    17  }
    18  
    19  // Detector implements charset detection.
    20  type Detector struct {
    21  	recognizers []recognizer
    22  	stripTag    bool
    23  }
    24  
// recognizers is the package-level list of charset recognizers. Both
// NewTextDetector and NewHtmlDetector hand this same slice to the Detector
// they create, so it is shared by every Detector built through them.
var recognizers = []recognizer{
	newRecognizer_utf8(),
	newRecognizer_utf16be(),
	newRecognizer_utf16le(),
	newRecognizer_utf32be(),
	newRecognizer_utf32le(),
	newRecognizer_8859_1_en(),
	newRecognizer_8859_1_da(),
	newRecognizer_8859_1_de(),
	newRecognizer_8859_1_es(),
	newRecognizer_8859_1_fr(),
	newRecognizer_8859_1_it(),
	newRecognizer_8859_1_nl(),
	newRecognizer_8859_1_no(),
	newRecognizer_8859_1_pt(),
	newRecognizer_8859_1_sv(),
	newRecognizer_8859_2_cs(),
	newRecognizer_8859_2_hu(),
	newRecognizer_8859_2_pl(),
	newRecognizer_8859_2_ro(),
	newRecognizer_8859_5_ru(),
	newRecognizer_8859_6_ar(),
	newRecognizer_8859_7_el(),
	newRecognizer_8859_8_I_he(),
	newRecognizer_8859_8_he(),
	newRecognizer_windows_1251(),
	newRecognizer_windows_1256(),
	newRecognizer_KOI8_R(),
	newRecognizer_8859_9_tr(),

	newRecognizer_sjis(),
	newRecognizer_gb_18030(),
	newRecognizer_euc_jp(),
	newRecognizer_euc_kr(),
	newRecognizer_big5(),

	newRecognizer_2022JP(),
	newRecognizer_2022KR(),
	newRecognizer_2022CN(),

	newRecognizer_IBM424_he_rtl(),
	newRecognizer_IBM424_he_ltr(),
	newRecognizer_IBM420_ar_rtl(),
	newRecognizer_IBM420_ar_ltr(),
}
    71  
    72  // NewTextDetector creates a Detector for plain text.
    73  func NewTextDetector() *Detector {
    74  	return &Detector{recognizers, false}
    75  }
    76  
    77  // NewHtmlDetector creates a Detector for Html.
    78  func NewHtmlDetector() *Detector {
    79  	return &Detector{recognizers, true}
    80  }
    81  
    82  var (
    83  	NotDetectedError = errors.New("Charset not detected.")
    84  )
    85  
    86  // DetectBest returns the Result with highest Confidence.
    87  func (d *Detector) DetectBest(b []byte) (r *Result, err error) {
    88  	var all []Result
    89  	if all, err = d.DetectAll(b); err == nil {
    90  		r = &all[0]
    91  	}
    92  	return
    93  }
    94  
    95  // DetectAll returns all Results which have non-zero Confidence. The Results are sorted by Confidence in descending order.
    96  func (d *Detector) DetectAll(b []byte) ([]Result, error) {
    97  	input := newRecognizerInput(b, d.stripTag)
    98  	outputChan := make(chan recognizerOutput)
    99  	for _, r := range d.recognizers {
   100  		go matchHelper(r, input, outputChan)
   101  	}
   102  	outputs := make([]recognizerOutput, 0, len(d.recognizers))
   103  	for i := 0; i < len(d.recognizers); i++ {
   104  		o := <-outputChan
   105  		if o.Confidence > 0 {
   106  			outputs = append(outputs, o)
   107  		}
   108  	}
   109  	if len(outputs) == 0 {
   110  		return nil, NotDetectedError
   111  	}
   112  
   113  	sort.Sort(recognizerOutputs(outputs))
   114  	dedupOutputs := make([]Result, 0, len(outputs))
   115  	foundCharsets := make(map[string]struct{}, len(outputs))
   116  	for _, o := range outputs {
   117  		if _, found := foundCharsets[o.Charset]; !found {
   118  			dedupOutputs = append(dedupOutputs, Result(o))
   119  			foundCharsets[o.Charset] = struct{}{}
   120  		}
   121  	}
   122  	if len(dedupOutputs) == 0 {
   123  		return nil, NotDetectedError
   124  	}
   125  	return dedupOutputs, nil
   126  }
   127  
   128  func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) {
   129  	outputChan <- r.Match(input)
   130  }
   131  
   132  type recognizerOutputs []recognizerOutput
   133  
   134  func (r recognizerOutputs) Len() int           { return len(r) }
   135  func (r recognizerOutputs) Less(i, j int) bool { return r[i].Confidence > r[j].Confidence }
   136  func (r recognizerOutputs) Swap(i, j int)      { r[i], r[j] = r[j], r[i] }