github.com/slspeek/camlistore_namedsearch@v0.0.0-20140519202248-ed6f70f7721a/third_party/code.google.com/p/go-charset/charset/charset.go (about) 1 // The charset package implements translation between character sets. 2 // It uses Unicode as the intermediate representation. 3 // Because it can be large, the character set data is separated 4 // from the charset package. It can be embedded in the Go 5 // executable by importing the data package: 6 // 7 // import _ "camlistore.org/third_party/code.google.com/p/go-charset/data" 8 // 9 // It can also made available in a data directory (by settting CharsetDir). 10 package charset 11 12 import ( 13 "io" 14 "strings" 15 "unicode/utf8" 16 ) 17 18 // Charset holds information about a given character set. 19 type Charset struct { 20 Name string // Canonical name of character set. 21 Aliases []string // Known aliases. 22 Desc string // Description. 23 NoFrom bool // Not possible to translate from this charset. 24 NoTo bool // Not possible to translate to this charset. 25 } 26 27 // Translator represents a character set converter. 28 // The Translate method translates the given data, 29 // and returns the number of bytes of data consumed, 30 // a slice containing the converted data (which may be 31 // overwritten on the next call to Translate), and any 32 // conversion error. If eof is true, the data represents 33 // the final bytes of the input. 34 type Translator interface { 35 Translate(data []byte, eof bool) (n int, cdata []byte, err error) 36 } 37 38 // A Factory can be used to make character set translators. 39 type Factory interface { 40 // TranslatorFrom creates a translator that will translate from the named character 41 // set to UTF-8. 42 TranslatorFrom(name string) (Translator, error) // Create a Translator from this character set to. 43 44 // TranslatorTo creates a translator that will translate from UTF-8 to the named character set. 45 TranslatorTo(name string) (Translator, error) // Create a Translator To this character set. 46 47 // Names returns all the character set names accessibile through the factory. 48 Names() []string 49 50 // Info returns information on the named character set. It returns nil if the 51 // factory doesn't recognise the given name. 52 Info(name string) *Charset 53 } 54 55 var factories = []Factory{localFactory{}} 56 57 // Register registers a new Factory which will be consulted when NewReader 58 // or NewWriter needs a character set translator for a given name. 59 func Register(factory Factory) { 60 factories = append(factories, factory) 61 } 62 63 // NewReader returns a new Reader that translates from the named 64 // character set to UTF-8 as it reads r. 65 func NewReader(charset string, r io.Reader) (io.Reader, error) { 66 tr, err := TranslatorFrom(charset) 67 if err != nil { 68 return nil, err 69 } 70 return NewTranslatingReader(r, tr), nil 71 } 72 73 // NewWriter returns a new WriteCloser writing to w. It converts writes 74 // of UTF-8 text into writes on w of text in the named character set. 75 // The Close is necessary to flush any remaining partially translated 76 // characters to the output. 77 func NewWriter(charset string, w io.Writer) (io.WriteCloser, error) { 78 tr, err := TranslatorTo(charset) 79 if err != nil { 80 return nil, err 81 } 82 return NewTranslatingWriter(w, tr), nil 83 } 84 85 // Info returns information about a character set, or nil 86 // if the character set is not found. 87 func Info(name string) *Charset { 88 for _, f := range factories { 89 if info := f.Info(name); info != nil { 90 return info 91 } 92 } 93 return nil 94 } 95 96 // Names returns the canonical names of all supported character sets, in alphabetical order. 97 func Names() []string { 98 // TODO eliminate duplicates 99 var names []string 100 for _, f := range factories { 101 names = append(names, f.Names()...) 102 } 103 return names 104 } 105 106 // TranslatorFrom returns a translator that will translate from 107 // the named character set to UTF-8. 108 func TranslatorFrom(charset string) (Translator, error) { 109 var err error 110 var tr Translator 111 for _, f := range factories { 112 tr, err = f.TranslatorFrom(charset) 113 if err == nil { 114 break 115 } 116 } 117 if tr == nil { 118 return nil, err 119 } 120 return tr, nil 121 } 122 123 // TranslatorTo returns a translator that will translate from UTF-8 124 // to the named character set. 125 func TranslatorTo(charset string) (Translator, error) { 126 var err error 127 var tr Translator 128 for _, f := range factories { 129 tr, err = f.TranslatorTo(charset) 130 if err == nil { 131 break 132 } 133 } 134 if tr == nil { 135 return nil, err 136 } 137 return tr, nil 138 } 139 140 func normalizedChar(c rune) rune { 141 switch { 142 case c >= 'A' && c <= 'Z': 143 c = c - 'A' + 'a' 144 case c == '_': 145 c = '-' 146 } 147 return c 148 } 149 150 // NormalisedName returns s with all Roman capitals 151 // mapped to lower case, and '_' mapped to '-' 152 func NormalizedName(s string) string { 153 return strings.Map(normalizedChar, s) 154 } 155 156 type translatingWriter struct { 157 w io.Writer 158 tr Translator 159 buf []byte // unconsumed data from writer. 160 } 161 162 // NewTranslatingWriter returns a new WriteCloser writing to w. 163 // It passes the written bytes through the given Translator. 164 func NewTranslatingWriter(w io.Writer, tr Translator) io.WriteCloser { 165 return &translatingWriter{w: w, tr: tr} 166 } 167 168 func (w *translatingWriter) Write(data []byte) (rn int, rerr error) { 169 wdata := data 170 if len(w.buf) > 0 { 171 w.buf = append(w.buf, data...) 172 wdata = w.buf 173 } 174 n, cdata, err := w.tr.Translate(wdata, false) 175 if err != nil { 176 // TODO 177 } 178 if n > 0 { 179 _, err = w.w.Write(cdata) 180 if err != nil { 181 return 0, err 182 } 183 } 184 w.buf = w.buf[:0] 185 if n < len(wdata) { 186 w.buf = append(w.buf, wdata[n:]...) 187 } 188 return len(data), nil 189 } 190 191 func (p *translatingWriter) Close() error { 192 for { 193 n, data, err := p.tr.Translate(p.buf, true) 194 p.buf = p.buf[n:] 195 if err != nil { 196 // TODO 197 } 198 // If the Translator produces no data 199 // at EOF, then assume that it never will. 200 if len(data) == 0 { 201 break 202 } 203 n, err = p.w.Write(data) 204 if err != nil { 205 return err 206 } 207 if n < len(data) { 208 return io.ErrShortWrite 209 } 210 if len(p.buf) == 0 { 211 break 212 } 213 } 214 return nil 215 } 216 217 type translatingReader struct { 218 r io.Reader 219 tr Translator 220 cdata []byte // unconsumed data from converter. 221 rdata []byte // unconverted data from reader. 222 err error // final error from reader. 223 } 224 225 // NewTranslatingReader returns a new Reader that 226 // translates data using the given Translator as it reads r. 227 func NewTranslatingReader(r io.Reader, tr Translator) io.Reader { 228 return &translatingReader{r: r, tr: tr} 229 } 230 231 func (r *translatingReader) Read(buf []byte) (int, error) { 232 for { 233 if len(r.cdata) > 0 { 234 n := copy(buf, r.cdata) 235 r.cdata = r.cdata[n:] 236 return n, nil 237 } 238 if r.err == nil { 239 r.rdata = ensureCap(r.rdata, len(r.rdata)+len(buf)) 240 n, err := r.r.Read(r.rdata[len(r.rdata):cap(r.rdata)]) 241 // Guard against non-compliant Readers. 242 if n == 0 && err == nil { 243 err = io.EOF 244 } 245 r.rdata = r.rdata[0 : len(r.rdata)+n] 246 r.err = err 247 } else if len(r.rdata) == 0 { 248 break 249 } 250 nc, cdata, cvterr := r.tr.Translate(r.rdata, r.err != nil) 251 if cvterr != nil { 252 // TODO 253 } 254 r.cdata = cdata 255 256 // Ensure that we consume all bytes at eof 257 // if the converter refuses them. 258 if nc == 0 && r.err != nil { 259 nc = len(r.rdata) 260 } 261 262 // Copy unconsumed data to the start of the rdata buffer. 263 r.rdata = r.rdata[0:copy(r.rdata, r.rdata[nc:])] 264 } 265 return 0, r.err 266 } 267 268 // ensureCap returns s with a capacity of at least n bytes. 269 // If cap(s) < n, then it returns a new copy of s with the 270 // required capacity. 271 func ensureCap(s []byte, n int) []byte { 272 if n <= cap(s) { 273 return s 274 } 275 // logic adapted from appendslice1 in runtime 276 m := cap(s) 277 if m == 0 { 278 m = n 279 } else { 280 for { 281 if m < 1024 { 282 m += m 283 } else { 284 m += m / 4 285 } 286 if m >= n { 287 break 288 } 289 } 290 } 291 t := make([]byte, len(s), m) 292 copy(t, s) 293 return t 294 } 295 296 func appendRune(buf []byte, r rune) []byte { 297 n := len(buf) 298 buf = ensureCap(buf, n+utf8.UTFMax) 299 nu := utf8.EncodeRune(buf[n:n+utf8.UTFMax], r) 300 return buf[0 : n+nu] 301 }