github.com/slspeek/camlistore_namedsearch@v0.0.0-20140519202248-ed6f70f7721a/third_party/code.google.com/p/go-charset/charset/cp932.go (about)

     1  package charset
     2  
     3  import (
     4  	"fmt"
     5  	"unicode/utf8"
     6  )
     7  
     8  func init() {
     9  	registerClass("cp932", fromCP932, nil)
    10  }
    11  
    12  // encoding details
    13  // (Traditional) Shift-JIS
    14  //
    15  // 00..1f	control characters
    16  // 20		space
    17  // 21..7f	JIS X 0201:1976/1997 roman (see notes)
    18  // 80		undefined
// 81..9f	lead byte of JIS X 0208-1983 or JIS X 0208:1990/1997
    20  // a0		undefined
    21  // a1..df	JIS X 0201:1976/1997 katakana
// e0..ea	lead byte of JIS X 0208-1983 or JIS X 0208:1990/1997
    23  // eb..ff	undefined
    24  //
    25  // CP932 (windows-31J)
    26  //
    27  // this encoding scheme extends Shift-JIS in the following way
    28  //
    29  // eb..ec	undefined (marked as lead bytes - see notes below)
    30  // ed..ee	lead byte of NEC-selected IBM extended characters
    31  // ef		undefined (marked as lead byte - see notes below)
    32  // f0..f9	lead byte of User defined GAIJI (see note below)
    33  // fa..fc	lead byte of IBM extended characters
    34  // fd..ff	undefined
    35  //
    36  //
    37  // Notes
    38  //
    39  // JISX 0201:1976/1997 roman
    40  //	this is the same as ASCII but with 0x5c (ASCII code for '\')
    41  //	representing the Yen currency symbol '¥' (U+00a5)
//	This mapping is contentious: some conversion packages implement it,
//	others do not.
    44  //	The mapping files from The Unicode Consortium show cp932 mapping
    45  //	plain ascii in the range 00..7f whereas shift-jis maps 0x5c ('\') to the yen
    46  //	symbol (¥) and 0x7e ('~') to overline (¯)
    47  //
    48  // CP932 double-byte character codes:
    49  //
    50  // eb-ec, ef, f0-f9:
    51  // 	Marked as DBCS LEAD BYTEs in the unicode mapping data
    52  //	obtained from:
    53  //		https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT
    54  //
    55  // 	but there are no defined mappings for codes in this range.
    56  // 	It is not clear whether or not an implementation should
    57  // 	consume one or two bytes before emitting an error char.
    58  
    59  const (
    60  	kanaPages    = 1
    61  	kanaPageSize = 63
    62  	kanaChar0    = 0xa1
    63  
    64  	cp932Pages    = 45  // 81..84, 87..9f, e0..ea, ed..ee, fa..fc
    65  	cp932PageSize = 189 // 40..fc (including 7f)
    66  	cp932Char0    = 0x40
    67  )
    68  
// jisTables holds the lookup tables used to decode CP932 / Shift-JIS input.
type jisTables struct {
	// page0 is indexed by an input byte. It holds the rune that byte decodes
	// to on its own, or -1 when the byte is the lead byte of a double-byte
	// (DBCS) character.
	page0   [256]rune
	// dbcsoff is indexed by a DBCS lead byte and gives the number of that
	// lead byte's page within cp932.
	dbcsoff [256]int
	// cp932 is the double-byte mapping loaded from "cp932.dat": consecutive
	// pages of cp932PageSize runes, each page indexed by
	// (trail byte - cp932Char0).
	cp932   []rune
}
    74  
// translateFromCP932 decodes CP932 (or Shift-JIS) encoded bytes into UTF-8
// through its Translate method.
type translateFromCP932 struct {
	// tables are the decoding tables built by fromCP932.
	tables  *jisTables
	// scratch is the output buffer. Translate truncates it, refills it and
	// returns it on every call, so a returned slice is only valid until the
	// next call.
	scratch []byte
}
    79  
    80  func (p *translateFromCP932) Translate(data []byte, eof bool) (int, []byte, error) {
    81  	tables := p.tables
    82  	p.scratch = p.scratch[:0]
    83  	n := 0
    84  	for i := 0; i < len(data); i++ {
    85  		b := data[i]
    86  		r := tables.page0[b]
    87  		if r != -1 {
    88  			p.scratch = appendRune(p.scratch, r)
    89  			n++
    90  			continue
    91  		}
    92  		// DBCS
    93  		i++
    94  		if i >= len(data) {
    95  			break
    96  		}
    97  		pnum := tables.dbcsoff[b]
    98  		ix := int(data[i]) - cp932Char0
    99  		if pnum == -1 || ix < 0 || ix >= cp932PageSize {
   100  			r = utf8.RuneError
   101  		} else {
   102  			r = tables.cp932[pnum*cp932PageSize+ix]
   103  		}
   104  		p.scratch = appendRune(p.scratch, r)
   105  		n += 2
   106  	}
   107  	return n, p.scratch, nil
   108  }
   109  
// cp932Key is the type of the key that fromCP932 passes to cache. The value
// is true for the "shiftjis" variant of the tables and false otherwise.
type cp932Key bool
   111  
   112  func fromCP932(arg string) (Translator, error) {
   113  	shiftJIS := arg == "shiftjis"
   114  	tables, err := cache(cp932Key(shiftJIS), func() (interface{}, error) {
   115  		tables := new(jisTables)
   116  		kana, err := jisGetMap("jisx0201kana.dat", kanaPageSize, kanaPages)
   117  		if err != nil {
   118  			return nil, err
   119  		}
   120  		tables.cp932, err = jisGetMap("cp932.dat", cp932PageSize, cp932Pages)
   121  		if err != nil {
   122  			return nil, err
   123  		}
   124  
   125  		// jisx0201kana is mapped into 0xA1..0xDF
   126  		for i := 0; i < kanaPageSize; i++ {
   127  			tables.page0[i+kanaChar0] = kana[i]
   128  		}
   129  
   130  		// 00..7f same as ascii in cp932
   131  		for i := rune(0); i < 0x7f; i++ {
   132  			tables.page0[i] = i
   133  		}
   134  
   135  		if shiftJIS {
   136  			// shift-jis uses JIS X 0201 for the ASCII range
   137  			// this is the same as ASCII apart from
   138  			// 0x5c ('\') maps to yen symbol (¥) and 0x7e ('~') maps to overline (¯)
   139  			tables.page0['\\'] = '¥'
   140  			tables.page0['~'] = '¯'
   141  		}
   142  
   143  		// pre-calculate DBCS page numbers to mapping file page numbers
   144  		// and mark codes in page0 that are DBCS lead bytes
   145  		pnum := 0
   146  		for i := 0x81; i <= 0x84; i++ {
   147  			tables.page0[i] = -1
   148  			tables.dbcsoff[i] = pnum
   149  			pnum++
   150  		}
   151  		for i := 0x87; i <= 0x9f; i++ {
   152  			tables.page0[i] = -1
   153  			tables.dbcsoff[i] = pnum
   154  			pnum++
   155  		}
   156  		for i := 0xe0; i <= 0xea; i++ {
   157  			tables.page0[i] = -1
   158  			tables.dbcsoff[i] = pnum
   159  			pnum++
   160  		}
   161  		if shiftJIS {
   162  			return tables, nil
   163  		}
   164  		// add in cp932 extensions
   165  		for i := 0xed; i <= 0xee; i++ {
   166  			tables.page0[i] = -1
   167  			tables.dbcsoff[i] = pnum
   168  			pnum++
   169  		}
   170  		for i := 0xfa; i <= 0xfc; i++ {
   171  			tables.page0[i] = -1
   172  			tables.dbcsoff[i] = pnum
   173  			pnum++
   174  		}
   175  		return tables, nil
   176  	})
   177  
   178  	if err != nil {
   179  		return nil, err
   180  	}
   181  
   182  	return &translateFromCP932{tables: tables.(*jisTables)}, nil
   183  }
   184  
   185  func jisGetMap(name string, pgsize, npages int) ([]rune, error) {
   186  	data, err := readFile(name)
   187  	if err != nil {
   188  		return nil, err
   189  	}
   190  	m := []rune(string(data))
   191  	if len(m) != pgsize*npages {
   192  		return nil, fmt.Errorf("%q: incorrect length data", name)
   193  	}
   194  	return m, nil
   195  }