github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/chardet/utf8.go (about)

     1  package chardet
     2  
     3  import (
     4  	"bytes"
     5  )
     6  
     7  var utf8Bom = []byte{0xEF, 0xBB, 0xBF}
     8  
     9  type recognizerUtf8 struct {
    10  }
    11  
    12  func newRecognizer_utf8() *recognizerUtf8 {
    13  	return &recognizerUtf8{}
    14  }
    15  
    16  func (*recognizerUtf8) Match(input *recognizerInput) (output recognizerOutput) {
    17  	output = recognizerOutput{
    18  		Charset: "UTF-8",
    19  	}
    20  	hasBom := bytes.HasPrefix(input.raw, utf8Bom)
    21  	inputLen := len(input.raw)
    22  	var numValid, numInvalid uint32
    23  	var trailBytes uint8
    24  	for i := 0; i < inputLen; i++ {
    25  		c := input.raw[i]
    26  		if c&0x80 == 0 {
    27  			continue
    28  		}
    29  		if c&0xE0 == 0xC0 {
    30  			trailBytes = 1
    31  		} else if c&0xF0 == 0xE0 {
    32  			trailBytes = 2
    33  		} else if c&0xF8 == 0xF0 {
    34  			trailBytes = 3
    35  		} else {
    36  			numInvalid++
    37  			if numInvalid > 5 {
    38  				break
    39  			}
    40  			trailBytes = 0
    41  		}
    42  
    43  		for i++; i < inputLen; i++ {
    44  			c = input.raw[i]
    45  			if c&0xC0 != 0x80 {
    46  				numInvalid++
    47  				break
    48  			}
    49  			if trailBytes--; trailBytes == 0 {
    50  				numValid++
    51  				break
    52  			}
    53  		}
    54  	}
    55  
    56  	if hasBom && numInvalid == 0 {
    57  		output.Confidence = 100
    58  	} else if hasBom && numValid > numInvalid*10 {
    59  		output.Confidence = 80
    60  	} else if numValid > 3 && numInvalid == 0 {
    61  		output.Confidence = 100
    62  	} else if numValid > 0 && numInvalid == 0 {
    63  		output.Confidence = 80
    64  	} else if numValid == 0 && numInvalid == 0 {
    65  		// Plain ASCII
    66  		output.Confidence = 10
    67  	} else if numValid > numInvalid*10 {
    68  		output.Confidence = 25
    69  	}
    70  	return
    71  }