github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/chardet/utf8.go (about) 1 package chardet 2 3 import ( 4 "bytes" 5 ) 6 7 var utf8Bom = []byte{0xEF, 0xBB, 0xBF} 8 9 type recognizerUtf8 struct { 10 } 11 12 func newRecognizer_utf8() *recognizerUtf8 { 13 return &recognizerUtf8{} 14 } 15 16 func (*recognizerUtf8) Match(input *recognizerInput) (output recognizerOutput) { 17 output = recognizerOutput{ 18 Charset: "UTF-8", 19 } 20 hasBom := bytes.HasPrefix(input.raw, utf8Bom) 21 inputLen := len(input.raw) 22 var numValid, numInvalid uint32 23 var trailBytes uint8 24 for i := 0; i < inputLen; i++ { 25 c := input.raw[i] 26 if c&0x80 == 0 { 27 continue 28 } 29 if c&0xE0 == 0xC0 { 30 trailBytes = 1 31 } else if c&0xF0 == 0xE0 { 32 trailBytes = 2 33 } else if c&0xF8 == 0xF0 { 34 trailBytes = 3 35 } else { 36 numInvalid++ 37 if numInvalid > 5 { 38 break 39 } 40 trailBytes = 0 41 } 42 43 for i++; i < inputLen; i++ { 44 c = input.raw[i] 45 if c&0xC0 != 0x80 { 46 numInvalid++ 47 break 48 } 49 if trailBytes--; trailBytes == 0 { 50 numValid++ 51 break 52 } 53 } 54 } 55 56 if hasBom && numInvalid == 0 { 57 output.Confidence = 100 58 } else if hasBom && numValid > numInvalid*10 { 59 output.Confidence = 80 60 } else if numValid > 3 && numInvalid == 0 { 61 output.Confidence = 100 62 } else if numValid > 0 && numInvalid == 0 { 63 output.Confidence = 80 64 } else if numValid == 0 && numInvalid == 0 { 65 // Plain ASCII 66 output.Confidence = 10 67 } else if numValid > numInvalid*10 { 68 output.Confidence = 25 69 } 70 return 71 }