github.com/angenalZZZ/gofunc@v0.0.0-20210507121333-48ff1be3917b/f/encoding.go (about) 1 package f 2 3 import ( 4 "bytes" 5 "errors" 6 "io" 7 "io/ioutil" 8 "os" 9 10 "github.com/saintfish/chardet" 11 ) 12 13 // Encoding is type alias for detected UTF encoding. 14 type Encoding int 15 16 // Constants to identify detected UTF encodings. 17 const ( 18 // Unknown encoding, returned when no BOM was detected 19 UnknownEncoding Encoding = iota 20 21 // UTF8, BOM bytes: EF BB BF 22 UTF8 23 24 // UTF-16, big-endian, BOM bytes: FE FF 25 UTF16BigEndian 26 27 // UTF-16, little-endian, BOM bytes: FF FE 28 UTF16LittleEndian 29 30 // UTF-32, big-endian, BOM bytes: 00 00 FE FF 31 UTF32BigEndian 32 33 // UTF-32, little-endian, BOM bytes: FF FE 00 00 34 UTF32LittleEndian 35 ) 36 37 const maxConsecutiveEmptyReads = 100 38 39 // String returns a user-friendly string representation of the encoding. Satisfies fmt.Stringer interface. 40 func (e Encoding) String() string { 41 switch e { 42 case UTF8: 43 return "UTF8" 44 case UTF16BigEndian: 45 return "UTF16BigEndian" 46 case UTF16LittleEndian: 47 return "UTF16LittleEndian" 48 case UTF32BigEndian: 49 return "UTF32BigEndian" 50 case UTF32LittleEndian: 51 return "UTF32LittleEndian" 52 default: 53 return "UnknownEncoding" 54 } 55 } 56 57 type Charset struct { 58 *chardet.Result 59 } 60 61 // ReadFile reads the file named by filename and returns the contents. 62 // File Reader which automatically detects BOM (Unicode Byte Order Mark) and removes it as necessary. 63 // A successful call returns err == nil, not err == EOF. Because ReadFile 64 // reads the whole file, it does not treat an EOF from Read as an error 65 // to be reported. 66 func ReadFile(filename string) ([]byte, error) { 67 f, err := os.Open(filename) 68 if err != nil { 69 return nil, err 70 } 71 defer func() { _ = f.Close() }() 72 // It's a good but not certain bet that FileInfo will tell us exactly how much to 73 // read, so let's try it but be prepared for the answer to be wrong. 74 var n int64 = bytes.MinRead 75 76 if fi, err := f.Stat(); err == nil { 77 // As initial capacity for readAll, use Size + a little extra in case Size 78 // is zero, and to avoid another allocation after Read has filled the 79 // buffer. The readAll call will read into its allocated internal buffer 80 // cheaply. If the size was wrong, we'll either waste some space off the end 81 // or reallocate as needed, but in the overwhelmingly common case we'll get 82 // it just right. 83 if size := fi.Size() + bytes.MinRead; size > n { 84 n = size 85 } 86 } 87 88 var buf bytes.Buffer 89 // If the buffer overflows, we will get bytes.ErrTooLarge. 90 // Return that as an error. Any other panic remains. 91 defer func() { 92 e := recover() 93 if e == nil { 94 return 95 } 96 if panicErr, ok := e.(error); ok && panicErr == bytes.ErrTooLarge { 97 err = panicErr 98 } else { 99 panic(e) 100 } 101 }() 102 if int64(int(n)) == n { 103 buf.Grow(int(n)) 104 } 105 106 // Automatically detects BOM and removes it as necessary. 107 r, _ := SkipBOM(f) 108 _, err = buf.ReadFrom(r) 109 return buf.Bytes(), err 110 111 //b, err := ioutil.ReadAll(f) 112 //if err != nil { 113 // return nil, err 114 //} 115 //// skip BOM 116 //if len(b) > 3 && b[0] == 239 && b[1] == 187 && b[2] == 191 { 117 // return b[3:], nil 118 //} 119 //return b, nil 120 } 121 122 // ReadFileAndTrimSpace reads the file and trim head-tail space contents. 123 func ReadFileAndTrimSpace(filename string) ([]byte, error) { 124 buf, err := ReadFile(filename) 125 if err != nil { 126 return nil, err 127 } 128 buf = bytes.TrimSpace(buf) 129 return buf, nil 130 } 131 132 // ReadFileEncoding reads the file and returns detected encoding. 133 func ReadFileEncoding(filename string) Encoding { 134 f, err := os.Open(filename) 135 if err != nil { 136 return UnknownEncoding 137 } 138 if enc, _, err := detectUtf(f); err == nil { 139 return enc 140 } 141 return UnknownEncoding 142 } 143 144 // ReadFileCharset reads the file and returns detected charset. 145 func ReadFileCharset(filename string) *Charset { 146 src, err := ioutil.ReadFile(filename) 147 if err != nil { 148 return nil 149 } 150 detector := chardet.NewTextDetector() 151 if result, err := detector.DetectBest(src); err == nil { 152 return &Charset{result} 153 } 154 return nil 155 } 156 157 // SkipBOM creates Reader which automatically detects BOM (Unicode Byte Order Mark) and removes it as necessary. 158 // It also returns the encoding detected by the BOM. 159 // If the detected encoding is not needed, you can call the SkipOnly function. 160 func SkipBOM(rd io.Reader) (*Reader, Encoding) { 161 // Is it already a Reader? 162 b, ok := rd.(*Reader) 163 if ok { 164 return b, UnknownEncoding 165 } 166 167 enc, left, err := detectUtf(rd) 168 return &Reader{ 169 rd: rd, 170 buf: left, 171 err: err, 172 }, enc 173 } 174 175 // Reader implements automatic BOM (Unicode Byte Order Mark) checking and 176 // removing as necessary for an io.Reader object. 177 type Reader struct { 178 rd io.Reader // reader provided by the client 179 buf []byte // buffered data 180 err error // last error 181 } 182 183 // Read is an implementation of io.Reader interface. 184 // The bytes are taken from the underlying Reader, but it checks for BOMs, removing them as necessary. 185 func (r *Reader) Read(p []byte) (n int, err error) { 186 if len(p) == 0 { 187 return 0, nil 188 } 189 190 if r.buf == nil { 191 if r.err != nil { 192 return 0, r.readErr() 193 } 194 195 return r.rd.Read(p) 196 } 197 198 // copy as much as we can 199 n = copy(p, r.buf) 200 r.buf = nilIfEmpty(r.buf[n:]) 201 return n, nil 202 } 203 204 func (r *Reader) readErr() error { 205 err := r.err 206 r.err = nil 207 return err 208 } 209 210 var errNegativeRead = errors.New("utf-bom: reader returned negative count from read") 211 212 func detectUtf(rd io.Reader) (enc Encoding, buf []byte, err error) { 213 buf, err = readBOM(rd) 214 215 if len(buf) >= 4 { 216 if isUTF32BigEndianBOM4(buf) { 217 return UTF32BigEndian, nilIfEmpty(buf[4:]), err 218 } 219 if isUTF32LittleEndianBOM4(buf) { 220 return UTF32LittleEndian, nilIfEmpty(buf[4:]), err 221 } 222 } 223 224 if len(buf) > 2 && isUTF8BOM3(buf) { 225 return UTF8, nilIfEmpty(buf[3:]), err 226 } 227 228 if (err != nil && err != io.EOF) || (len(buf) < 2) { 229 return UnknownEncoding, nilIfEmpty(buf), err 230 } 231 232 if isUTF16BigEndianBOM2(buf) { 233 return UTF16BigEndian, nilIfEmpty(buf[2:]), err 234 } 235 if isUTF16LittleEndianBOM2(buf) { 236 return UTF16LittleEndian, nilIfEmpty(buf[2:]), err 237 } 238 239 return UnknownEncoding, nilIfEmpty(buf), err 240 } 241 242 func readBOM(rd io.Reader) (buf []byte, err error) { 243 const maxBOMSize = 4 244 var bom [maxBOMSize]byte // used to read BOM 245 246 // read as many bytes as possible 247 for nEmpty, n := 0, 0; err == nil && len(buf) < maxBOMSize; buf = bom[:len(buf)+n] { 248 if n, err = rd.Read(bom[len(buf):]); n < 0 { 249 panic(errNegativeRead) 250 } 251 if n > 0 { 252 nEmpty = 0 253 } else { 254 nEmpty++ 255 if nEmpty >= maxConsecutiveEmptyReads { 256 err = io.ErrNoProgress 257 } 258 } 259 } 260 return 261 } 262 263 func isUTF32BigEndianBOM4(buf []byte) bool { 264 return buf[0] == 0x00 && buf[1] == 0x00 && buf[2] == 0xFE && buf[3] == 0xFF 265 } 266 267 func isUTF32LittleEndianBOM4(buf []byte) bool { 268 return buf[0] == 0xFF && buf[1] == 0xFE && buf[2] == 0x00 && buf[3] == 0x00 269 } 270 271 func isUTF8BOM3(buf []byte) bool { 272 return buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF 273 } 274 275 func isUTF16BigEndianBOM2(buf []byte) bool { 276 return buf[0] == 0xFE && buf[1] == 0xFF 277 } 278 279 func isUTF16LittleEndianBOM2(buf []byte) bool { 280 return buf[0] == 0xFF && buf[1] == 0xFE 281 } 282 283 func nilIfEmpty(buf []byte) (res []byte) { 284 if len(buf) > 0 { 285 res = buf 286 } 287 return 288 }