github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/chardet/recognizer.go (about)

     1  package chardet
     2  
     3  type recognizer interface {
     4  	Match(*recognizerInput) recognizerOutput
     5  }
     6  
     7  type recognizerOutput Result
     8  
     9  type recognizerInput struct {
    10  	raw         []byte
    11  	input       []byte
    12  	tagStripped bool
    13  	byteStats   []int
    14  	hasC1Bytes  bool
    15  }
    16  
    17  func newRecognizerInput(raw []byte, stripTag bool) *recognizerInput {
    18  	input, stripped := mayStripInput(raw, stripTag)
    19  	byteStats := computeByteStats(input)
    20  	return &recognizerInput{
    21  		raw:         raw,
    22  		input:       input,
    23  		tagStripped: stripped,
    24  		byteStats:   byteStats,
    25  		hasC1Bytes:  computeHasC1Bytes(byteStats),
    26  	}
    27  }
    28  
    29  func mayStripInput(raw []byte, stripTag bool) (out []byte, stripped bool) {
    30  	const inputBufferSize = 8192
    31  	out = make([]byte, 0, inputBufferSize)
    32  	var badTags, openTags int32
    33  	var inMarkup bool = false
    34  	stripped = false
    35  	if stripTag {
    36  		stripped = true
    37  		for _, c := range raw {
    38  			if c == '<' {
    39  				if inMarkup {
    40  					badTags += 1
    41  				}
    42  				inMarkup = true
    43  				openTags += 1
    44  			}
    45  			if !inMarkup {
    46  				out = append(out, c)
    47  				if len(out) >= inputBufferSize {
    48  					break
    49  				}
    50  			}
    51  			if c == '>' {
    52  				inMarkup = false
    53  			}
    54  		}
    55  	}
    56  	if openTags < 5 || openTags/5 < badTags || (len(out) < 100 && len(raw) > 600) {
    57  		limit := len(raw)
    58  		if limit > inputBufferSize {
    59  			limit = inputBufferSize
    60  		}
    61  		out = make([]byte, limit)
    62  		copy(out, raw[:limit])
    63  		stripped = false
    64  	}
    65  	return
    66  }
    67  
    68  func computeByteStats(input []byte) []int {
    69  	r := make([]int, 256)
    70  	for _, c := range input {
    71  		r[c] += 1
    72  	}
    73  	return r
    74  }
    75  
    76  func computeHasC1Bytes(byteStats []int) bool {
    77  	for _, count := range byteStats[0x80 : 0x9F+1] {
    78  		if count > 0 {
    79  			return true
    80  		}
    81  	}
    82  	return false
    83  }