github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/chardet/unicode.go (about)

     1  package chardet
     2  
     3  import (
     4  	"bytes"
     5  )
     6  
     7  var (
     8  	utf16beBom = []byte{0xFE, 0xFF}
     9  	utf16leBom = []byte{0xFF, 0xFE}
    10  	utf32beBom = []byte{0x00, 0x00, 0xFE, 0xFF}
    11  	utf32leBom = []byte{0xFF, 0xFE, 0x00, 0x00}
    12  )
    13  
    14  type recognizerUtf16be struct {
    15  }
    16  
    17  func newRecognizer_utf16be() *recognizerUtf16be {
    18  	return &recognizerUtf16be{}
    19  }
    20  
    21  func (*recognizerUtf16be) Match(input *recognizerInput) (output recognizerOutput) {
    22  	output = recognizerOutput{
    23  		Charset: "UTF-16BE",
    24  	}
    25  	if bytes.HasPrefix(input.raw, utf16beBom) {
    26  		output.Confidence = 100
    27  	}
    28  	return
    29  }
    30  
    31  type recognizerUtf16le struct {
    32  }
    33  
    34  func newRecognizer_utf16le() *recognizerUtf16le {
    35  	return &recognizerUtf16le{}
    36  }
    37  
    38  func (*recognizerUtf16le) Match(input *recognizerInput) (output recognizerOutput) {
    39  	output = recognizerOutput{
    40  		Charset: "UTF-16LE",
    41  	}
    42  	if bytes.HasPrefix(input.raw, utf16leBom) && !bytes.HasPrefix(input.raw, utf32leBom) {
    43  		output.Confidence = 100
    44  	}
    45  	return
    46  }
    47  
    48  type recognizerUtf32 struct {
    49  	name       string
    50  	bom        []byte
    51  	decodeChar func(input []byte) uint32
    52  }
    53  
    54  func decodeUtf32be(input []byte) uint32 {
    55  	return uint32(input[0])<<24 | uint32(input[1])<<16 | uint32(input[2])<<8 | uint32(input[3])
    56  }
    57  
    58  func decodeUtf32le(input []byte) uint32 {
    59  	return uint32(input[3])<<24 | uint32(input[2])<<16 | uint32(input[1])<<8 | uint32(input[0])
    60  }
    61  
    62  func newRecognizer_utf32be() *recognizerUtf32 {
    63  	return &recognizerUtf32{
    64  		"UTF-32BE",
    65  		utf32beBom,
    66  		decodeUtf32be,
    67  	}
    68  }
    69  
    70  func newRecognizer_utf32le() *recognizerUtf32 {
    71  	return &recognizerUtf32{
    72  		"UTF-32LE",
    73  		utf32leBom,
    74  		decodeUtf32le,
    75  	}
    76  }
    77  
    78  func (r *recognizerUtf32) Match(input *recognizerInput) (output recognizerOutput) {
    79  	output = recognizerOutput{
    80  		Charset: r.name,
    81  	}
    82  	hasBom := bytes.HasPrefix(input.raw, r.bom)
    83  	var numValid, numInvalid uint32
    84  	for b := input.raw; len(b) >= 4; b = b[4:] {
    85  		if c := r.decodeChar(b); c >= 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF) {
    86  			numInvalid++
    87  		} else {
    88  			numValid++
    89  		}
    90  	}
    91  	if hasBom && numInvalid == 0 {
    92  		output.Confidence = 100
    93  	} else if hasBom && numValid > numInvalid*10 {
    94  		output.Confidence = 80
    95  	} else if numValid > 3 && numInvalid == 0 {
    96  		output.Confidence = 100
    97  	} else if numValid > 0 && numInvalid == 0 {
    98  		output.Confidence = 80
    99  	} else if numValid > numInvalid*10 {
   100  		output.Confidence = 25
   101  	}
   102  	return
   103  }