github.com/slspeek/camlistore_namedsearch@v0.0.0-20140519202248-ed6f70f7721a/third_party/code.google.com/p/go-charset/charset/charset.go (about)

// Package charset implements translation between character sets.
     2  // It uses Unicode as the intermediate representation.
     3  // Because it can be large, the character set data is separated
     4  // from the charset package. It can be embedded in the Go
     5  // executable by importing the data package:
     6  //
     7  //	import _ "camlistore.org/third_party/code.google.com/p/go-charset/data"
     8  //
// It can also be made available in a data directory (by setting CharsetDir).
    10  package charset
    11  
import (
	"io"
	"sort"
	"strings"
	"unicode/utf8"
)
    17  
    18  // Charset holds information about a given character set.
    19  type Charset struct {
    20  	Name    string   // Canonical name of character set.
    21  	Aliases []string // Known aliases.
    22  	Desc    string   // Description.
    23  	NoFrom  bool     // Not possible to translate from this charset.
    24  	NoTo    bool     // Not possible to translate to this charset.
    25  }
    26  
    27  // Translator represents a character set converter.
    28  // The Translate method translates the given data,
    29  // and returns the number of bytes of data consumed,
    30  // a slice containing the converted data (which may be
    31  // overwritten on the next call to Translate), and any
    32  // conversion error. If eof is true, the data represents
    33  // the final bytes of the input.
    34  type Translator interface {
    35  	Translate(data []byte, eof bool) (n int, cdata []byte, err error)
    36  }
    37  
    38  // A Factory can be used to make character set translators.
    39  type Factory interface {
    40  	// TranslatorFrom creates a translator that will translate from the named character
    41  	// set to UTF-8.
    42  	TranslatorFrom(name string) (Translator, error) // Create a Translator from this character set to.
    43  
    44  	// TranslatorTo creates a translator that will translate from UTF-8 to the named character set.
    45  	TranslatorTo(name string) (Translator, error) // Create a Translator To this character set.
    46  
    47  	// Names returns all the character set names accessibile through the factory.
    48  	Names() []string
    49  
    50  	// Info returns information on the named character set. It returns nil if the
    51  	// factory doesn't recognise the given name.
    52  	Info(name string) *Charset
    53  }
    54  
    55  var factories = []Factory{localFactory{}}
    56  
    57  // Register registers a new Factory which will be consulted when NewReader
    58  // or NewWriter needs a character set translator for a given name.
    59  func Register(factory Factory) {
    60  	factories = append(factories, factory)
    61  }
    62  
    63  // NewReader returns a new Reader that translates from the named
    64  // character set to UTF-8 as it reads r.
    65  func NewReader(charset string, r io.Reader) (io.Reader, error) {
    66  	tr, err := TranslatorFrom(charset)
    67  	if err != nil {
    68  		return nil, err
    69  	}
    70  	return NewTranslatingReader(r, tr), nil
    71  }
    72  
    73  // NewWriter returns a new WriteCloser writing to w.  It converts writes
    74  // of UTF-8 text into writes on w of text in the named character set.
    75  // The Close is necessary to flush any remaining partially translated
    76  // characters to the output.
    77  func NewWriter(charset string, w io.Writer) (io.WriteCloser, error) {
    78  	tr, err := TranslatorTo(charset)
    79  	if err != nil {
    80  		return nil, err
    81  	}
    82  	return NewTranslatingWriter(w, tr), nil
    83  }
    84  
    85  // Info returns information about a character set, or nil
    86  // if the character set is not found.
    87  func Info(name string) *Charset {
    88  	for _, f := range factories {
    89  		if info := f.Info(name); info != nil {
    90  			return info
    91  		}
    92  	}
    93  	return nil
    94  }
    95  
    96  // Names returns the canonical names of all supported character sets, in alphabetical order.
    97  func Names() []string {
    98  	// TODO eliminate duplicates
    99  	var names []string
   100  	for _, f := range factories {
   101  		names = append(names, f.Names()...)
   102  	}
   103  	return names
   104  }
   105  
   106  // TranslatorFrom returns a translator that will translate from
   107  // the named character set to UTF-8.
   108  func TranslatorFrom(charset string) (Translator, error) {
   109  	var err error
   110  	var tr Translator
   111  	for _, f := range factories {
   112  		tr, err = f.TranslatorFrom(charset)
   113  		if err == nil {
   114  			break
   115  		}
   116  	}
   117  	if tr == nil {
   118  		return nil, err
   119  	}
   120  	return tr, nil
   121  }
   122  
   123  // TranslatorTo returns a translator that will translate from UTF-8
   124  // to the named character set.
   125  func TranslatorTo(charset string) (Translator, error) {
   126  	var err error
   127  	var tr Translator
   128  	for _, f := range factories {
   129  		tr, err = f.TranslatorTo(charset)
   130  		if err == nil {
   131  			break
   132  		}
   133  	}
   134  	if tr == nil {
   135  		return nil, err
   136  	}
   137  	return tr, nil
   138  }
   139  
   140  func normalizedChar(c rune) rune {
   141  	switch {
   142  	case c >= 'A' && c <= 'Z':
   143  		c = c - 'A' + 'a'
   144  	case c == '_':
   145  		c = '-'
   146  	}
   147  	return c
   148  }
   149  
   150  // NormalisedName returns s with all Roman capitals
   151  // mapped to lower case, and '_' mapped to '-'
   152  func NormalizedName(s string) string {
   153  	return strings.Map(normalizedChar, s)
   154  }
   155  
   156  type translatingWriter struct {
   157  	w   io.Writer
   158  	tr  Translator
   159  	buf []byte // unconsumed data from writer.
   160  }
   161  
   162  // NewTranslatingWriter returns a new WriteCloser writing to w.
   163  // It passes the written bytes through the given Translator.
   164  func NewTranslatingWriter(w io.Writer, tr Translator) io.WriteCloser {
   165  	return &translatingWriter{w: w, tr: tr}
   166  }
   167  
   168  func (w *translatingWriter) Write(data []byte) (rn int, rerr error) {
   169  	wdata := data
   170  	if len(w.buf) > 0 {
   171  		w.buf = append(w.buf, data...)
   172  		wdata = w.buf
   173  	}
   174  	n, cdata, err := w.tr.Translate(wdata, false)
   175  	if err != nil {
   176  		// TODO
   177  	}
   178  	if n > 0 {
   179  		_, err = w.w.Write(cdata)
   180  		if err != nil {
   181  			return 0, err
   182  		}
   183  	}
   184  	w.buf = w.buf[:0]
   185  	if n < len(wdata) {
   186  		w.buf = append(w.buf, wdata[n:]...)
   187  	}
   188  	return len(data), nil
   189  }
   190  
   191  func (p *translatingWriter) Close() error {
   192  	for {
   193  		n, data, err := p.tr.Translate(p.buf, true)
   194  		p.buf = p.buf[n:]
   195  		if err != nil {
   196  			// TODO
   197  		}
   198  		// If the Translator produces no data
   199  		// at EOF, then assume that it never will.
   200  		if len(data) == 0 {
   201  			break
   202  		}
   203  		n, err = p.w.Write(data)
   204  		if err != nil {
   205  			return err
   206  		}
   207  		if n < len(data) {
   208  			return io.ErrShortWrite
   209  		}
   210  		if len(p.buf) == 0 {
   211  			break
   212  		}
   213  	}
   214  	return nil
   215  }
   216  
   217  type translatingReader struct {
   218  	r     io.Reader
   219  	tr    Translator
   220  	cdata []byte // unconsumed data from converter.
   221  	rdata []byte // unconverted data from reader.
   222  	err   error  // final error from reader.
   223  }
   224  
   225  // NewTranslatingReader returns a new Reader that
   226  // translates data using the given Translator as it reads r.
   227  func NewTranslatingReader(r io.Reader, tr Translator) io.Reader {
   228  	return &translatingReader{r: r, tr: tr}
   229  }
   230  
   231  func (r *translatingReader) Read(buf []byte) (int, error) {
   232  	for {
   233  		if len(r.cdata) > 0 {
   234  			n := copy(buf, r.cdata)
   235  			r.cdata = r.cdata[n:]
   236  			return n, nil
   237  		}
   238  		if r.err == nil {
   239  			r.rdata = ensureCap(r.rdata, len(r.rdata)+len(buf))
   240  			n, err := r.r.Read(r.rdata[len(r.rdata):cap(r.rdata)])
   241  			// Guard against non-compliant Readers.
   242  			if n == 0 && err == nil {
   243  				err = io.EOF
   244  			}
   245  			r.rdata = r.rdata[0 : len(r.rdata)+n]
   246  			r.err = err
   247  		} else if len(r.rdata) == 0 {
   248  			break
   249  		}
   250  		nc, cdata, cvterr := r.tr.Translate(r.rdata, r.err != nil)
   251  		if cvterr != nil {
   252  			// TODO
   253  		}
   254  		r.cdata = cdata
   255  
   256  		// Ensure that we consume all bytes at eof
   257  		// if the converter refuses them.
   258  		if nc == 0 && r.err != nil {
   259  			nc = len(r.rdata)
   260  		}
   261  
   262  		// Copy unconsumed data to the start of the rdata buffer.
   263  		r.rdata = r.rdata[0:copy(r.rdata, r.rdata[nc:])]
   264  	}
   265  	return 0, r.err
   266  }
   267  
   268  // ensureCap returns s with a capacity of at least n bytes.
   269  // If cap(s) < n, then it returns a new copy of s with the
   270  // required capacity.
   271  func ensureCap(s []byte, n int) []byte {
   272  	if n <= cap(s) {
   273  		return s
   274  	}
   275  	// logic adapted from appendslice1 in runtime
   276  	m := cap(s)
   277  	if m == 0 {
   278  		m = n
   279  	} else {
   280  		for {
   281  			if m < 1024 {
   282  				m += m
   283  			} else {
   284  				m += m / 4
   285  			}
   286  			if m >= n {
   287  				break
   288  			}
   289  		}
   290  	}
   291  	t := make([]byte, len(s), m)
   292  	copy(t, s)
   293  	return t
   294  }
   295  
   296  func appendRune(buf []byte, r rune) []byte {
   297  	n := len(buf)
   298  	buf = ensureCap(buf, n+utf8.UTFMax)
   299  	nu := utf8.EncodeRune(buf[n:n+utf8.UTFMax], r)
   300  	return buf[0 : n+nu]
   301  }