github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/chardet/unicode.go (about) 1 package chardet 2 3 import ( 4 "bytes" 5 ) 6 7 var ( 8 utf16beBom = []byte{0xFE, 0xFF} 9 utf16leBom = []byte{0xFF, 0xFE} 10 utf32beBom = []byte{0x00, 0x00, 0xFE, 0xFF} 11 utf32leBom = []byte{0xFF, 0xFE, 0x00, 0x00} 12 ) 13 14 type recognizerUtf16be struct { 15 } 16 17 func newRecognizer_utf16be() *recognizerUtf16be { 18 return &recognizerUtf16be{} 19 } 20 21 func (*recognizerUtf16be) Match(input *recognizerInput) (output recognizerOutput) { 22 output = recognizerOutput{ 23 Charset: "UTF-16BE", 24 } 25 if bytes.HasPrefix(input.raw, utf16beBom) { 26 output.Confidence = 100 27 } 28 return 29 } 30 31 type recognizerUtf16le struct { 32 } 33 34 func newRecognizer_utf16le() *recognizerUtf16le { 35 return &recognizerUtf16le{} 36 } 37 38 func (*recognizerUtf16le) Match(input *recognizerInput) (output recognizerOutput) { 39 output = recognizerOutput{ 40 Charset: "UTF-16LE", 41 } 42 if bytes.HasPrefix(input.raw, utf16leBom) && !bytes.HasPrefix(input.raw, utf32leBom) { 43 output.Confidence = 100 44 } 45 return 46 } 47 48 type recognizerUtf32 struct { 49 name string 50 bom []byte 51 decodeChar func(input []byte) uint32 52 } 53 54 func decodeUtf32be(input []byte) uint32 { 55 return uint32(input[0])<<24 | uint32(input[1])<<16 | uint32(input[2])<<8 | uint32(input[3]) 56 } 57 58 func decodeUtf32le(input []byte) uint32 { 59 return uint32(input[3])<<24 | uint32(input[2])<<16 | uint32(input[1])<<8 | uint32(input[0]) 60 } 61 62 func newRecognizer_utf32be() *recognizerUtf32 { 63 return &recognizerUtf32{ 64 "UTF-32BE", 65 utf32beBom, 66 decodeUtf32be, 67 } 68 } 69 70 func newRecognizer_utf32le() *recognizerUtf32 { 71 return &recognizerUtf32{ 72 "UTF-32LE", 73 utf32leBom, 74 decodeUtf32le, 75 } 76 } 77 78 func (r *recognizerUtf32) Match(input *recognizerInput) (output recognizerOutput) { 79 output = recognizerOutput{ 80 Charset: r.name, 81 } 82 hasBom := bytes.HasPrefix(input.raw, r.bom) 83 var numValid, numInvalid uint32 84 for b := input.raw; len(b) >= 4; b = b[4:] { 85 if c := r.decodeChar(b); c >= 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF) { 86 numInvalid++ 87 } else { 88 numValid++ 89 } 90 } 91 if hasBom && numInvalid == 0 { 92 output.Confidence = 100 93 } else if hasBom && numValid > numInvalid*10 { 94 output.Confidence = 80 95 } else if numValid > 3 && numInvalid == 0 { 96 output.Confidence = 100 97 } else if numValid > 0 && numInvalid == 0 { 98 output.Confidence = 80 99 } else if numValid > numInvalid*10 { 100 output.Confidence = 25 101 } 102 return 103 }