github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/chardet/2022.go (about)

     1  package chardet
     2  
     3  import (
     4  	"bytes"
     5  )
     6  
// recognizer2022 scores input for a single charset by looking for that
// charset's escape sequences in the byte stream.
type recognizer2022 struct {
	// charset is the name reported in the Charset field of the match result.
	charset string
	// escapes holds the byte sequences that are compared against the input
	// immediately following an ESC (0x1B) byte; the ESC byte itself is not
	// part of each entry.
	escapes [][]byte
}
    11  
    12  func (r *recognizer2022) Match(input *recognizerInput) (output recognizerOutput) {
    13  	return recognizerOutput{
    14  		Charset:    r.charset,
    15  		Confidence: r.matchConfidence(input.input),
    16  	}
    17  }
    18  
    19  func (r *recognizer2022) matchConfidence(input []byte) int {
    20  	var hits, misses, shifts int
    21  input:
    22  	for i := 0; i < len(input); i++ {
    23  		c := input[i]
    24  		if c == 0x1B {
    25  			for _, esc := range r.escapes {
    26  				if bytes.HasPrefix(input[i+1:], esc) {
    27  					hits++
    28  					i += len(esc)
    29  					continue input
    30  				}
    31  			}
    32  			misses++
    33  		} else if c == 0x0E || c == 0x0F {
    34  			shifts++
    35  		}
    36  	}
    37  	if hits == 0 {
    38  		return 0
    39  	}
    40  	quality := (100*hits - 100*misses) / (hits + misses)
    41  	if hits+shifts < 5 {
    42  		quality -= (5 - (hits + shifts)) * 10
    43  	}
    44  	if quality < 0 {
    45  		quality = 0
    46  	}
    47  	return quality
    48  }
    49  
    50  var escapeSequences_2022JP = [][]byte{
    51  	{0x24, 0x28, 0x43}, // KS X 1001:1992
    52  	{0x24, 0x28, 0x44}, // JIS X 212-1990
    53  	{0x24, 0x40},       // JIS C 6226-1978
    54  	{0x24, 0x41},       // GB 2312-80
    55  	{0x24, 0x42},       // JIS X 208-1983
    56  	{0x26, 0x40},       // JIS X 208 1990, 1997
    57  	{0x28, 0x42},       // ASCII
    58  	{0x28, 0x48},       // JIS-Roman
    59  	{0x28, 0x49},       // Half-width katakana
    60  	{0x28, 0x4a},       // JIS-Roman
    61  	{0x2e, 0x41},       // ISO 8859-1
    62  	{0x2e, 0x46},       // ISO 8859-7
    63  }
    64  
    65  var escapeSequences_2022KR = [][]byte{
    66  	{0x24, 0x29, 0x43},
    67  }
    68  
    69  var escapeSequences_2022CN = [][]byte{
    70  	{0x24, 0x29, 0x41}, // GB 2312-80
    71  	{0x24, 0x29, 0x47}, // CNS 11643-1992 Plane 1
    72  	{0x24, 0x2A, 0x48}, // CNS 11643-1992 Plane 2
    73  	{0x24, 0x29, 0x45}, // ISO-IR-165
    74  	{0x24, 0x2B, 0x49}, // CNS 11643-1992 Plane 3
    75  	{0x24, 0x2B, 0x4A}, // CNS 11643-1992 Plane 4
    76  	{0x24, 0x2B, 0x4B}, // CNS 11643-1992 Plane 5
    77  	{0x24, 0x2B, 0x4C}, // CNS 11643-1992 Plane 6
    78  	{0x24, 0x2B, 0x4D}, // CNS 11643-1992 Plane 7
    79  	{0x4e},             // SS2
    80  	{0x4f},             // SS3
    81  }
    82  
    83  func newRecognizer_2022JP() *recognizer2022 {
    84  	return &recognizer2022{
    85  		"ISO-2022-JP",
    86  		escapeSequences_2022JP,
    87  	}
    88  }
    89  
    90  func newRecognizer_2022KR() *recognizer2022 {
    91  	return &recognizer2022{
    92  		"ISO-2022-KR",
    93  		escapeSequences_2022KR,
    94  	}
    95  }
    96  
    97  func newRecognizer_2022CN() *recognizer2022 {
    98  	return &recognizer2022{
    99  		"ISO-2022-CN",
   100  		escapeSequences_2022CN,
   101  	}
   102  }