github.com/angenalZZZ/gofunc@v0.0.0-20210507121333-48ff1be3917b/f/encoding.go (about)

     1  package f
     2  
     3  import (
     4  	"bytes"
     5  	"errors"
     6  	"io"
     7  	"io/ioutil"
     8  	"os"
     9  
    10  	"github.com/saintfish/chardet"
    11  )
    12  
// Encoding identifies the Unicode encoding detected from a byte order mark
// (BOM). It is a distinct integer type whose valid values are the constants
// declared in this file.
type Encoding int
    15  
// Constants to identify detected UTF encodings.
const (
	// UnknownEncoding is the zero value; it is returned when no BOM was detected.
	UnknownEncoding Encoding = iota

	// UTF8 is UTF-8, BOM bytes: EF BB BF.
	UTF8

	// UTF16BigEndian is UTF-16, big-endian, BOM bytes: FE FF.
	UTF16BigEndian

	// UTF16LittleEndian is UTF-16, little-endian, BOM bytes: FF FE.
	UTF16LittleEndian

	// UTF32BigEndian is UTF-32, big-endian, BOM bytes: 00 00 FE FF.
	UTF32BigEndian

	// UTF32LittleEndian is UTF-32, little-endian, BOM bytes: FF FE 00 00.
	UTF32LittleEndian
)
    36  
// maxConsecutiveEmptyReads is the number of consecutive zero-byte reads that
// readBOM tolerates before it gives up with io.ErrNoProgress.
const maxConsecutiveEmptyReads = 100
    38  
    39  // String returns a user-friendly string representation of the encoding. Satisfies fmt.Stringer interface.
    40  func (e Encoding) String() string {
    41  	switch e {
    42  	case UTF8:
    43  		return "UTF8"
    44  	case UTF16BigEndian:
    45  		return "UTF16BigEndian"
    46  	case UTF16LittleEndian:
    47  		return "UTF16LittleEndian"
    48  	case UTF32BigEndian:
    49  		return "UTF32BigEndian"
    50  	case UTF32LittleEndian:
    51  		return "UTF32LittleEndian"
    52  	default:
    53  		return "UnknownEncoding"
    54  	}
    55  }
    56  
// Charset holds the outcome of character-set detection as returned by
// ReadFileCharset. It embeds *chardet.Result, so that type's fields and
// methods are promoted onto Charset.
type Charset struct {
	*chardet.Result
}
    60  
    61  // ReadFile reads the file named by filename and returns the contents.
    62  // File Reader which automatically detects BOM (Unicode Byte Order Mark) and removes it as necessary.
    63  // A successful call returns err == nil, not err == EOF. Because ReadFile
    64  // reads the whole file, it does not treat an EOF from Read as an error
    65  // to be reported.
    66  func ReadFile(filename string) ([]byte, error) {
    67  	f, err := os.Open(filename)
    68  	if err != nil {
    69  		return nil, err
    70  	}
    71  	defer func() { _ = f.Close() }()
    72  	// It's a good but not certain bet that FileInfo will tell us exactly how much to
    73  	// read, so let's try it but be prepared for the answer to be wrong.
    74  	var n int64 = bytes.MinRead
    75  
    76  	if fi, err := f.Stat(); err == nil {
    77  		// As initial capacity for readAll, use Size + a little extra in case Size
    78  		// is zero, and to avoid another allocation after Read has filled the
    79  		// buffer. The readAll call will read into its allocated internal buffer
    80  		// cheaply. If the size was wrong, we'll either waste some space off the end
    81  		// or reallocate as needed, but in the overwhelmingly common case we'll get
    82  		// it just right.
    83  		if size := fi.Size() + bytes.MinRead; size > n {
    84  			n = size
    85  		}
    86  	}
    87  
    88  	var buf bytes.Buffer
    89  	// If the buffer overflows, we will get bytes.ErrTooLarge.
    90  	// Return that as an error. Any other panic remains.
    91  	defer func() {
    92  		e := recover()
    93  		if e == nil {
    94  			return
    95  		}
    96  		if panicErr, ok := e.(error); ok && panicErr == bytes.ErrTooLarge {
    97  			err = panicErr
    98  		} else {
    99  			panic(e)
   100  		}
   101  	}()
   102  	if int64(int(n)) == n {
   103  		buf.Grow(int(n))
   104  	}
   105  
   106  	// Automatically detects BOM and removes it as necessary.
   107  	r, _ := SkipBOM(f)
   108  	_, err = buf.ReadFrom(r)
   109  	return buf.Bytes(), err
   110  
   111  	//b, err := ioutil.ReadAll(f)
   112  	//if err != nil {
   113  	//	return nil, err
   114  	//}
   115  	//// skip BOM
   116  	//if len(b) > 3 && b[0] == 239 && b[1] == 187 && b[2] == 191 {
   117  	//	return b[3:], nil
   118  	//}
   119  	//return b, nil
   120  }
   121  
   122  // ReadFileAndTrimSpace reads the file and trim head-tail space contents.
   123  func ReadFileAndTrimSpace(filename string) ([]byte, error) {
   124  	buf, err := ReadFile(filename)
   125  	if err != nil {
   126  		return nil, err
   127  	}
   128  	buf = bytes.TrimSpace(buf)
   129  	return buf, nil
   130  }
   131  
   132  // ReadFileEncoding reads the file and returns detected encoding.
   133  func ReadFileEncoding(filename string) Encoding {
   134  	f, err := os.Open(filename)
   135  	if err != nil {
   136  		return UnknownEncoding
   137  	}
   138  	if enc, _, err := detectUtf(f); err == nil {
   139  		return enc
   140  	}
   141  	return UnknownEncoding
   142  }
   143  
   144  // ReadFileCharset reads the file and returns detected charset.
   145  func ReadFileCharset(filename string) *Charset {
   146  	src, err := ioutil.ReadFile(filename)
   147  	if err != nil {
   148  		return nil
   149  	}
   150  	detector := chardet.NewTextDetector()
   151  	if result, err := detector.DetectBest(src); err == nil {
   152  		return &Charset{result}
   153  	}
   154  	return nil
   155  }
   156  
   157  // SkipBOM creates Reader which automatically detects BOM (Unicode Byte Order Mark) and removes it as necessary.
   158  // It also returns the encoding detected by the BOM.
   159  // If the detected encoding is not needed, you can call the SkipOnly function.
   160  func SkipBOM(rd io.Reader) (*Reader, Encoding) {
   161  	// Is it already a Reader?
   162  	b, ok := rd.(*Reader)
   163  	if ok {
   164  		return b, UnknownEncoding
   165  	}
   166  
   167  	enc, left, err := detectUtf(rd)
   168  	return &Reader{
   169  		rd:  rd,
   170  		buf: left,
   171  		err: err,
   172  	}, enc
   173  }
   174  
// Reader implements automatic BOM (Unicode Byte Order Mark) checking and
// removing as necessary for an io.Reader object.
// Instances are created by SkipBOM.
type Reader struct {
	// rd is the reader provided by the client.
	rd io.Reader
	// buf holds bytes consumed during BOM detection that are not part of the
	// BOM and have not yet been handed out by Read; it is nil once drained.
	buf []byte
	// err is the error recorded during BOM detection; Read reports it once,
	// after buf has been drained.
	err error
}
   182  
   183  // Read is an implementation of io.Reader interface.
   184  // The bytes are taken from the underlying Reader, but it checks for BOMs, removing them as necessary.
   185  func (r *Reader) Read(p []byte) (n int, err error) {
   186  	if len(p) == 0 {
   187  		return 0, nil
   188  	}
   189  
   190  	if r.buf == nil {
   191  		if r.err != nil {
   192  			return 0, r.readErr()
   193  		}
   194  
   195  		return r.rd.Read(p)
   196  	}
   197  
   198  	// copy as much as we can
   199  	n = copy(p, r.buf)
   200  	r.buf = nilIfEmpty(r.buf[n:])
   201  	return n, nil
   202  }
   203  
   204  func (r *Reader) readErr() error {
   205  	err := r.err
   206  	r.err = nil
   207  	return err
   208  }
   209  
// errNegativeRead is the value readBOM panics with when the underlying reader
// reports a negative byte count from Read.
var errNegativeRead = errors.New("utf-bom: reader returned negative count from read")
   211  
   212  func detectUtf(rd io.Reader) (enc Encoding, buf []byte, err error) {
   213  	buf, err = readBOM(rd)
   214  
   215  	if len(buf) >= 4 {
   216  		if isUTF32BigEndianBOM4(buf) {
   217  			return UTF32BigEndian, nilIfEmpty(buf[4:]), err
   218  		}
   219  		if isUTF32LittleEndianBOM4(buf) {
   220  			return UTF32LittleEndian, nilIfEmpty(buf[4:]), err
   221  		}
   222  	}
   223  
   224  	if len(buf) > 2 && isUTF8BOM3(buf) {
   225  		return UTF8, nilIfEmpty(buf[3:]), err
   226  	}
   227  
   228  	if (err != nil && err != io.EOF) || (len(buf) < 2) {
   229  		return UnknownEncoding, nilIfEmpty(buf), err
   230  	}
   231  
   232  	if isUTF16BigEndianBOM2(buf) {
   233  		return UTF16BigEndian, nilIfEmpty(buf[2:]), err
   234  	}
   235  	if isUTF16LittleEndianBOM2(buf) {
   236  		return UTF16LittleEndian, nilIfEmpty(buf[2:]), err
   237  	}
   238  
   239  	return UnknownEncoding, nilIfEmpty(buf), err
   240  }
   241  
   242  func readBOM(rd io.Reader) (buf []byte, err error) {
   243  	const maxBOMSize = 4
   244  	var bom [maxBOMSize]byte // used to read BOM
   245  
   246  	// read as many bytes as possible
   247  	for nEmpty, n := 0, 0; err == nil && len(buf) < maxBOMSize; buf = bom[:len(buf)+n] {
   248  		if n, err = rd.Read(bom[len(buf):]); n < 0 {
   249  			panic(errNegativeRead)
   250  		}
   251  		if n > 0 {
   252  			nEmpty = 0
   253  		} else {
   254  			nEmpty++
   255  			if nEmpty >= maxConsecutiveEmptyReads {
   256  				err = io.ErrNoProgress
   257  			}
   258  		}
   259  	}
   260  	return
   261  }
   262  
   263  func isUTF32BigEndianBOM4(buf []byte) bool {
   264  	return buf[0] == 0x00 && buf[1] == 0x00 && buf[2] == 0xFE && buf[3] == 0xFF
   265  }
   266  
   267  func isUTF32LittleEndianBOM4(buf []byte) bool {
   268  	return buf[0] == 0xFF && buf[1] == 0xFE && buf[2] == 0x00 && buf[3] == 0x00
   269  }
   270  
   271  func isUTF8BOM3(buf []byte) bool {
   272  	return buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF
   273  }
   274  
   275  func isUTF16BigEndianBOM2(buf []byte) bool {
   276  	return buf[0] == 0xFE && buf[1] == 0xFF
   277  }
   278  
   279  func isUTF16LittleEndianBOM2(buf []byte) bool {
   280  	return buf[0] == 0xFF && buf[1] == 0xFE
   281  }
   282  
   283  func nilIfEmpty(buf []byte) (res []byte) {
   284  	if len(buf) > 0 {
   285  		res = buf
   286  	}
   287  	return
   288  }