github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/chardet/multi_byte.go (about)

     1  package chardet
     2  
     3  import (
     4  	"errors"
     5  	"math"
     6  )
     7  
     8  type recognizerMultiByte struct {
     9  	charset     string
    10  	language    string
    11  	decoder     charDecoder
    12  	commonChars []uint16
    13  }
    14  
    15  type charDecoder interface {
    16  	DecodeOneChar([]byte) (c uint16, remain []byte, err error)
    17  }
    18  
    19  func (r *recognizerMultiByte) Match(input *recognizerInput) (output recognizerOutput) {
    20  	return recognizerOutput{
    21  		Charset:    r.charset,
    22  		Language:   r.language,
    23  		Confidence: r.matchConfidence(input),
    24  	}
    25  }
    26  
    27  func (r *recognizerMultiByte) matchConfidence(input *recognizerInput) int {
    28  	raw := input.raw
    29  	var c uint16
    30  	var err error
    31  	var totalCharCount, badCharCount, singleByteCharCount, doubleByteCharCount, commonCharCount int
    32  	for c, raw, err = r.decoder.DecodeOneChar(raw); len(raw) > 0; c, raw, err = r.decoder.DecodeOneChar(raw) {
    33  		totalCharCount++
    34  		if err != nil {
    35  			badCharCount++
    36  		} else if c <= 0xFF {
    37  			singleByteCharCount++
    38  		} else {
    39  			doubleByteCharCount++
    40  			if r.commonChars != nil && binarySearch(r.commonChars, c) {
    41  				commonCharCount++
    42  			}
    43  		}
    44  		if badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount {
    45  			return 0
    46  		}
    47  	}
    48  
    49  	if doubleByteCharCount <= 10 && badCharCount == 0 {
    50  		if doubleByteCharCount == 0 && totalCharCount < 10 {
    51  			return 0
    52  		} else {
    53  			return 10
    54  		}
    55  	}
    56  
    57  	if doubleByteCharCount < 20*badCharCount {
    58  		return 0
    59  	}
    60  	if r.commonChars == nil {
    61  		confidence := 30 + doubleByteCharCount - 20*badCharCount
    62  		if confidence > 100 {
    63  			confidence = 100
    64  		}
    65  		return confidence
    66  	}
    67  	maxVal := math.Log(float64(doubleByteCharCount) / 4)
    68  	scaleFactor := 90 / maxVal
    69  	confidence := int(math.Log(float64(commonCharCount)+1)*scaleFactor + 10)
    70  	if confidence > 100 {
    71  		confidence = 100
    72  	}
    73  	if confidence < 0 {
    74  		confidence = 0
    75  	}
    76  	return confidence
    77  }
    78  
    79  func binarySearch(l []uint16, c uint16) bool {
    80  	start := 0
    81  	end := len(l) - 1
    82  	for start <= end {
    83  		mid := (start + end) / 2
    84  		if c == l[mid] {
    85  			return true
    86  		} else if c < l[mid] {
    87  			end = mid - 1
    88  		} else {
    89  			start = mid + 1
    90  		}
    91  	}
    92  	return false
    93  }
    94  
    95  var eobError = errors.New("End of input buffer")
    96  var badCharError = errors.New("Decode a bad char")
    97  
    98  type charDecoder_sjis struct {
    99  }
   100  
   101  func (charDecoder_sjis) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) {
   102  	if len(input) == 0 {
   103  		return 0, nil, eobError
   104  	}
   105  	first := input[0]
   106  	c = uint16(first)
   107  	remain = input[1:]
   108  	if first <= 0x7F || (first > 0xA0 && first <= 0xDF) {
   109  		return
   110  	}
   111  	if len(remain) == 0 {
   112  		return c, remain, badCharError
   113  	}
   114  	second := remain[0]
   115  	remain = remain[1:]
   116  	c = c<<8 | uint16(second)
   117  	if (second >= 0x40 && second <= 0x7F) || (second >= 0x80 && second <= 0xFE) {
   118  	} else {
   119  		err = badCharError
   120  	}
   121  	return
   122  }
   123  
   124  var commonChars_sjis = []uint16{
   125  	0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
   126  	0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
   127  	0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
   128  	0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
   129  	0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
   130  	0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa,
   131  }
   132  
   133  func newRecognizer_sjis() *recognizerMultiByte {
   134  	return &recognizerMultiByte{
   135  		"Shift_JIS",
   136  		"ja",
   137  		charDecoder_sjis{},
   138  		commonChars_sjis,
   139  	}
   140  }
   141  
   142  type charDecoder_euc struct {
   143  }
   144  
   145  func (charDecoder_euc) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) {
   146  	if len(input) == 0 {
   147  		return 0, nil, eobError
   148  	}
   149  	first := input[0]
   150  	remain = input[1:]
   151  	c = uint16(first)
   152  	if first <= 0x8D {
   153  		return uint16(first), remain, nil
   154  	}
   155  	if len(remain) == 0 {
   156  		return 0, nil, eobError
   157  	}
   158  	second := remain[0]
   159  	remain = remain[1:]
   160  	c = c<<8 | uint16(second)
   161  	if first >= 0xA1 && first <= 0xFE {
   162  		if second < 0xA1 {
   163  			err = badCharError
   164  		}
   165  		return
   166  	}
   167  	if first == 0x8E {
   168  		if second < 0xA1 {
   169  			err = badCharError
   170  		}
   171  		return
   172  	}
   173  	if first == 0x8F {
   174  		if len(remain) == 0 {
   175  			return 0, nil, eobError
   176  		}
   177  		third := remain[0]
   178  		remain = remain[1:]
   179  		c = c<<0 | uint16(third)
   180  		if third < 0xa1 {
   181  			err = badCharError
   182  		}
   183  	}
   184  	return
   185  }
   186  
   187  var commonChars_euc_jp = []uint16{
   188  	0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
   189  	0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
   190  	0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
   191  	0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
   192  	0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
   193  	0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
   194  	0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
   195  	0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
   196  	0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
   197  	0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1,
   198  }
   199  
   200  var commonChars_euc_kr = []uint16{
   201  	0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
   202  	0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
   203  	0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
   204  	0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
   205  	0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
   206  	0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
   207  	0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
   208  	0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
   209  	0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
   210  	0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad,
   211  }
   212  
   213  func newRecognizer_euc_jp() *recognizerMultiByte {
   214  	return &recognizerMultiByte{
   215  		"EUC-JP",
   216  		"ja",
   217  		charDecoder_euc{},
   218  		commonChars_euc_jp,
   219  	}
   220  }
   221  
   222  func newRecognizer_euc_kr() *recognizerMultiByte {
   223  	return &recognizerMultiByte{
   224  		"EUC-KR",
   225  		"ko",
   226  		charDecoder_euc{},
   227  		commonChars_euc_kr,
   228  	}
   229  }
   230  
   231  type charDecoder_big5 struct {
   232  }
   233  
   234  func (charDecoder_big5) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) {
   235  	if len(input) == 0 {
   236  		return 0, nil, eobError
   237  	}
   238  	first := input[0]
   239  	remain = input[1:]
   240  	c = uint16(first)
   241  	if first <= 0x7F || first == 0xFF {
   242  		return
   243  	}
   244  	if len(remain) == 0 {
   245  		return c, nil, eobError
   246  	}
   247  	second := remain[0]
   248  	remain = remain[1:]
   249  	c = c<<8 | uint16(second)
   250  	if second < 0x40 || second == 0x7F || second == 0xFF {
   251  		err = badCharError
   252  	}
   253  	return
   254  }
   255  
   256  var commonChars_big5 = []uint16{
   257  	0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
   258  	0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
   259  	0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
   260  	0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
   261  	0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
   262  	0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
   263  	0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
   264  	0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
   265  	0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
   266  	0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f,
   267  }
   268  
   269  func newRecognizer_big5() *recognizerMultiByte {
   270  	return &recognizerMultiByte{
   271  		"Big5",
   272  		"zh",
   273  		charDecoder_big5{},
   274  		commonChars_big5,
   275  	}
   276  }
   277  
   278  type charDecoder_gb_18030 struct {
   279  }
   280  
   281  func (charDecoder_gb_18030) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) {
   282  	if len(input) == 0 {
   283  		return 0, nil, eobError
   284  	}
   285  	first := input[0]
   286  	remain = input[1:]
   287  	c = uint16(first)
   288  	if first <= 0x80 {
   289  		return
   290  	}
   291  	if len(remain) == 0 {
   292  		return 0, nil, eobError
   293  	}
   294  	second := remain[0]
   295  	remain = remain[1:]
   296  	c = c<<8 | uint16(second)
   297  	if first >= 0x81 && first <= 0xFE {
   298  		if (second >= 0x40 && second <= 0x7E) || (second >= 0x80 && second <= 0xFE) {
   299  			return
   300  		}
   301  
   302  		if second >= 0x30 && second <= 0x39 {
   303  			if len(remain) == 0 {
   304  				return 0, nil, eobError
   305  			}
   306  			third := remain[0]
   307  			remain = remain[1:]
   308  			if third >= 0x81 && third <= 0xFE {
   309  				if len(remain) == 0 {
   310  					return 0, nil, eobError
   311  				}
   312  				fourth := remain[0]
   313  				remain = remain[1:]
   314  				if fourth >= 0x30 && fourth <= 0x39 {
   315  					c = c<<16 | uint16(third)<<8 | uint16(fourth)
   316  					return
   317  				}
   318  			}
   319  		}
   320  		err = badCharError
   321  	}
   322  	return
   323  }
   324  
   325  var commonChars_gb_18030 = []uint16{
   326  	0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
   327  	0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
   328  	0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
   329  	0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
   330  	0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
   331  	0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
   332  	0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
   333  	0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
   334  	0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
   335  	0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0,
   336  }
   337  
   338  func newRecognizer_gb_18030() *recognizerMultiByte {
   339  	return &recognizerMultiByte{
   340  		"GB-18030",
   341  		"zh",
   342  		charDecoder_gb_18030{},
   343  		commonChars_gb_18030,
   344  	}
   345  }