github.com/slspeek/camlistore_namedsearch@v0.0.0-20140519202248-ed6f70f7721a/third_party/code.google.com/p/go-charset/charset/cp932.go (about) 1 package charset 2 3 import ( 4 "fmt" 5 "unicode/utf8" 6 ) 7 8 func init() { 9 registerClass("cp932", fromCP932, nil) 10 } 11 12 // encoding details 13 // (Traditional) Shift-JIS 14 // 15 // 00..1f control characters 16 // 20 space 17 // 21..7f JIS X 0201:1976/1997 roman (see notes) 18 // 80 undefined 19 // 81..9f lead byte of JIS X 0208-1983 or JIS X 0202:1990/1997 20 // a0 undefined 21 // a1..df JIS X 0201:1976/1997 katakana 22 // e0..ea lead byte of JIS X 0208-1983 or JIS X 0202:1990/1997 23 // eb..ff undefined 24 // 25 // CP932 (windows-31J) 26 // 27 // this encoding scheme extends Shift-JIS in the following way 28 // 29 // eb..ec undefined (marked as lead bytes - see notes below) 30 // ed..ee lead byte of NEC-selected IBM extended characters 31 // ef undefined (marked as lead byte - see notes below) 32 // f0..f9 lead byte of User defined GAIJI (see note below) 33 // fa..fc lead byte of IBM extended characters 34 // fd..ff undefined 35 // 36 // 37 // Notes 38 // 39 // JISX 0201:1976/1997 roman 40 // this is the same as ASCII but with 0x5c (ASCII code for '\') 41 // representing the Yen currency symbol '¥' (U+00a5) 42 // This mapping is contentious, some conversion packages implement it 43 // others do not. 44 // The mapping files from The Unicode Consortium show cp932 mapping 45 // plain ascii in the range 00..7f whereas shift-jis maps 0x5c ('\') to the yen 46 // symbol (¥) and 0x7e ('~') to overline (¯) 47 // 48 // CP932 double-byte character codes: 49 // 50 // eb-ec, ef, f0-f9: 51 // Marked as DBCS LEAD BYTEs in the unicode mapping data 52 // obtained from: 53 // https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT 54 // 55 // but there are no defined mappings for codes in this range.
56 // It is not clear whether or not an implementation should 57 // consume one or two bytes before emitting an error char. 58 59 const ( 60 kanaPages = 1 61 kanaPageSize = 63 62 kanaChar0 = 0xa1 63 64 cp932Pages = 45 // 81..84, 87..9f, e0..ea, ed..ee, fa..fc 65 cp932PageSize = 189 // 40..fc (including 7f) 66 cp932Char0 = 0x40 67 ) 68 69 type jisTables struct { 70 page0 [256]rune 71 dbcsoff [256]int 72 cp932 []rune 73 } 74 75 type translateFromCP932 struct { 76 tables *jisTables 77 scratch []byte 78 } 79 80 func (p *translateFromCP932) Translate(data []byte, eof bool) (int, []byte, error) { 81 tables := p.tables 82 p.scratch = p.scratch[:0] 83 n := 0 84 for i := 0; i < len(data); i++ { 85 b := data[i] 86 r := tables.page0[b] 87 if r != -1 { 88 p.scratch = appendRune(p.scratch, r) 89 n++ 90 continue 91 } 92 // DBCS 93 i++ 94 if i >= len(data) { 95 break 96 } 97 pnum := tables.dbcsoff[b] 98 ix := int(data[i]) - cp932Char0 99 if pnum == -1 || ix < 0 || ix >= cp932PageSize { 100 r = utf8.RuneError 101 } else { 102 r = tables.cp932[pnum*cp932PageSize+ix] 103 } 104 p.scratch = appendRune(p.scratch, r) 105 n += 2 106 } 107 return n, p.scratch, nil 108 } 109 110 type cp932Key bool 111 112 func fromCP932(arg string) (Translator, error) { 113 shiftJIS := arg == "shiftjis" 114 tables, err := cache(cp932Key(shiftJIS), func() (interface{}, error) { 115 tables := new(jisTables) 116 kana, err := jisGetMap("jisx0201kana.dat", kanaPageSize, kanaPages) 117 if err != nil { 118 return nil, err 119 } 120 tables.cp932, err = jisGetMap("cp932.dat", cp932PageSize, cp932Pages) 121 if err != nil { 122 return nil, err 123 } 124 125 // jisx0201kana is mapped into 0xA1..0xDF 126 for i := 0; i < kanaPageSize; i++ { 127 tables.page0[i+kanaChar0] = kana[i] 128 } 129 130 // 00..7f same as ascii in cp932 131 for i := rune(0); i < 0x7f; i++ { 132 tables.page0[i] = i 133 } 134 135 if shiftJIS { 136 // shift-jis uses JIS X 0201 for the ASCII range 137 // this is the same as ASCII apart from 138 // 
0x5c ('\') maps to yen symbol (¥) and 0x7e ('~') maps to overline (¯) 139 tables.page0['\\'] = '¥' 140 tables.page0['~'] = '¯' 141 } 142 143 // pre-calculate DBCS page numbers to mapping file page numbers 144 // and mark codes in page0 that are DBCS lead bytes 145 pnum := 0 146 for i := 0x81; i <= 0x84; i++ { 147 tables.page0[i] = -1 148 tables.dbcsoff[i] = pnum 149 pnum++ 150 } 151 for i := 0x87; i <= 0x9f; i++ { 152 tables.page0[i] = -1 153 tables.dbcsoff[i] = pnum 154 pnum++ 155 } 156 for i := 0xe0; i <= 0xea; i++ { 157 tables.page0[i] = -1 158 tables.dbcsoff[i] = pnum 159 pnum++ 160 } 161 if shiftJIS { 162 return tables, nil 163 } 164 // add in cp932 extensions 165 for i := 0xed; i <= 0xee; i++ { 166 tables.page0[i] = -1 167 tables.dbcsoff[i] = pnum 168 pnum++ 169 } 170 for i := 0xfa; i <= 0xfc; i++ { 171 tables.page0[i] = -1 172 tables.dbcsoff[i] = pnum 173 pnum++ 174 } 175 return tables, nil 176 }) 177 178 if err != nil { 179 return nil, err 180 } 181 182 return &translateFromCP932{tables: tables.(*jisTables)}, nil 183 } 184 185 func jisGetMap(name string, pgsize, npages int) ([]rune, error) { 186 data, err := readFile(name) 187 if err != nil { 188 return nil, err 189 } 190 m := []rune(string(data)) 191 if len(m) != pgsize*npages { 192 return nil, fmt.Errorf("%q: incorrect length data", name) 193 } 194 return m, nil 195 }