github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/chardet/2022.go (about) 1 package chardet 2 3 import ( 4 "bytes" 5 ) 6 7 type recognizer2022 struct { 8 charset string 9 escapes [][]byte 10 } 11 12 func (r *recognizer2022) Match(input *recognizerInput) (output recognizerOutput) { 13 return recognizerOutput{ 14 Charset: r.charset, 15 Confidence: r.matchConfidence(input.input), 16 } 17 } 18 19 func (r *recognizer2022) matchConfidence(input []byte) int { 20 var hits, misses, shifts int 21 input: 22 for i := 0; i < len(input); i++ { 23 c := input[i] 24 if c == 0x1B { 25 for _, esc := range r.escapes { 26 if bytes.HasPrefix(input[i+1:], esc) { 27 hits++ 28 i += len(esc) 29 continue input 30 } 31 } 32 misses++ 33 } else if c == 0x0E || c == 0x0F { 34 shifts++ 35 } 36 } 37 if hits == 0 { 38 return 0 39 } 40 quality := (100*hits - 100*misses) / (hits + misses) 41 if hits+shifts < 5 { 42 quality -= (5 - (hits + shifts)) * 10 43 } 44 if quality < 0 { 45 quality = 0 46 } 47 return quality 48 } 49 50 var escapeSequences_2022JP = [][]byte{ 51 {0x24, 0x28, 0x43}, // KS X 1001:1992 52 {0x24, 0x28, 0x44}, // JIS X 212-1990 53 {0x24, 0x40}, // JIS C 6226-1978 54 {0x24, 0x41}, // GB 2312-80 55 {0x24, 0x42}, // JIS X 208-1983 56 {0x26, 0x40}, // JIS X 208 1990, 1997 57 {0x28, 0x42}, // ASCII 58 {0x28, 0x48}, // JIS-Roman 59 {0x28, 0x49}, // Half-width katakana 60 {0x28, 0x4a}, // JIS-Roman 61 {0x2e, 0x41}, // ISO 8859-1 62 {0x2e, 0x46}, // ISO 8859-7 63 } 64 65 var escapeSequences_2022KR = [][]byte{ 66 {0x24, 0x29, 0x43}, 67 } 68 69 var escapeSequences_2022CN = [][]byte{ 70 {0x24, 0x29, 0x41}, // GB 2312-80 71 {0x24, 0x29, 0x47}, // CNS 11643-1992 Plane 1 72 {0x24, 0x2A, 0x48}, // CNS 11643-1992 Plane 2 73 {0x24, 0x29, 0x45}, // ISO-IR-165 74 {0x24, 0x2B, 0x49}, // CNS 11643-1992 Plane 3 75 {0x24, 0x2B, 0x4A}, // CNS 11643-1992 Plane 4 76 {0x24, 0x2B, 0x4B}, // CNS 11643-1992 Plane 5 77 {0x24, 0x2B, 0x4C}, // CNS 11643-1992 Plane 6 78 {0x24, 0x2B, 0x4D}, // CNS 11643-1992 Plane 7 79 {0x4e}, // SS2 80 {0x4f}, // SS3 81 } 82 83 func newRecognizer_2022JP() *recognizer2022 { 84 return &recognizer2022{ 85 "ISO-2022-JP", 86 escapeSequences_2022JP, 87 } 88 } 89 90 func newRecognizer_2022KR() *recognizer2022 { 91 return &recognizer2022{ 92 "ISO-2022-KR", 93 escapeSequences_2022KR, 94 } 95 } 96 97 func newRecognizer_2022CN() *recognizer2022 { 98 return &recognizer2022{ 99 "ISO-2022-CN", 100 escapeSequences_2022CN, 101 } 102 }