github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/chardet/recognizer.go (about) 1 package chardet 2 3 type recognizer interface { 4 Match(*recognizerInput) recognizerOutput 5 } 6 7 type recognizerOutput Result 8 9 type recognizerInput struct { 10 raw []byte 11 input []byte 12 tagStripped bool 13 byteStats []int 14 hasC1Bytes bool 15 } 16 17 func newRecognizerInput(raw []byte, stripTag bool) *recognizerInput { 18 input, stripped := mayStripInput(raw, stripTag) 19 byteStats := computeByteStats(input) 20 return &recognizerInput{ 21 raw: raw, 22 input: input, 23 tagStripped: stripped, 24 byteStats: byteStats, 25 hasC1Bytes: computeHasC1Bytes(byteStats), 26 } 27 } 28 29 func mayStripInput(raw []byte, stripTag bool) (out []byte, stripped bool) { 30 const inputBufferSize = 8192 31 out = make([]byte, 0, inputBufferSize) 32 var badTags, openTags int32 33 var inMarkup bool = false 34 stripped = false 35 if stripTag { 36 stripped = true 37 for _, c := range raw { 38 if c == '<' { 39 if inMarkup { 40 badTags += 1 41 } 42 inMarkup = true 43 openTags += 1 44 } 45 if !inMarkup { 46 out = append(out, c) 47 if len(out) >= inputBufferSize { 48 break 49 } 50 } 51 if c == '>' { 52 inMarkup = false 53 } 54 } 55 } 56 if openTags < 5 || openTags/5 < badTags || (len(out) < 100 && len(raw) > 600) { 57 limit := len(raw) 58 if limit > inputBufferSize { 59 limit = inputBufferSize 60 } 61 out = make([]byte, limit) 62 copy(out, raw[:limit]) 63 stripped = false 64 } 65 return 66 } 67 68 func computeByteStats(input []byte) []int { 69 r := make([]int, 256) 70 for _, c := range input { 71 r[c] += 1 72 } 73 return r 74 } 75 76 func computeHasC1Bytes(byteStats []int) bool { 77 for _, count := range byteStats[0x80 : 0x9F+1] { 78 if count > 0 { 79 return true 80 } 81 } 82 return false 83 }