github.com/grailbio/base@v0.0.11/compress/rw.go (about)

     1  // Package compress provides convenience functions for creating compressors and
     2  // uncompressors based on filenames.
     3  package compress
     4  
     5  import (
     6  	"bytes"
     7  	"compress/bzip2"
     8  	"context"
     9  	"fmt"
    10  	"io"
    11  	"io/ioutil"
    12  
    13  	"github.com/grailbio/base/compress/zstd"
    14  	"github.com/grailbio/base/errors"
    15  	"github.com/grailbio/base/file"
    16  	"github.com/grailbio/base/fileio"
    17  	"github.com/grailbio/base/ioctx"
    18  	"github.com/klauspost/compress/gzip"
    19  	"github.com/yasushi-saito/zlibng"
    20  )
    21  
    22  // errorReader is a ReadCloser implementation that always returns the given
    23  // error.
    24  type errorReader struct{ err error }
    25  
    26  func (r *errorReader) Read(buf []byte) (int, error) { return 0, r.err }
    27  func (r *errorReader) Close() error                 { return r.err }
    28  
    29  // nopWriteCloser adds a noop Closer to io.Writer.
    30  type nopWriteCloser struct{ io.Writer }
    31  
    32  func (w *nopWriteCloser) Close() error { return nil }
    33  
    34  func isBzip2Header(buf []byte) bool {
    35  	// https://www.forensicswiki.org/wiki/Bzip2
    36  	if len(buf) < 10 {
    37  		return false
    38  	}
    39  	if !(buf[0] == 'B' && buf[1] == 'Z' && buf[2] == 'h' && buf[3] >= '1' && buf[3] <= '9') {
    40  		return false
    41  	}
    42  	if buf[4] == 0x31 && buf[5] == 0x41 &&
    43  		buf[6] == 0x59 && buf[7] == 0x26 &&
    44  		buf[8] == 0x53 && buf[9] == 0x59 { // block magic
    45  		return true
    46  	}
    47  	if buf[4] == 0x17 && buf[5] == 0x72 &&
    48  		buf[6] == 0x45 && buf[7] == 0x38 &&
    49  		buf[8] == 0x50 && buf[9] == 0x90 { // eos magic, happens only for an empty bz2 file.
    50  		return true
    51  	}
    52  	return false
    53  }
    54  
    55  func isGzipHeader(buf []byte) bool {
    56  	if len(buf) < 10 {
    57  		return false
    58  	}
    59  	if !(buf[0] == 0x1f && buf[1] == 0x8b) {
    60  		return false
    61  	}
    62  	if !(buf[2] <= 3 || buf[2] == 8) {
    63  		return false
    64  	}
    65  	if (buf[3] & 0xc0) != 0 {
    66  		return false
    67  	}
    68  	if !(buf[9] <= 0xd || buf[9] == 0xff) {
    69  		return false
    70  	}
    71  	return true
    72  }
    73  
    74  // https://tools.ietf.org/html/rfc8478
    75  func isZstdHeader(buf []byte) bool {
    76  	if len(buf) < 4 {
    77  		return false
    78  	}
    79  	if buf[0] != 0x28 || buf[1] != 0xB5 || buf[2] != 0x2F || buf[3] != 0xFD {
    80  		return false
    81  	}
    82  	return true
    83  }
    84  
    85  // NewReader creates an uncompressing reader by reading the first few bytes of
    86  // the input and finding a magic header for either gzip, zstd, bzip2. If the
    87  // magic header is found , it returns an uncompressing ReadCloser and
    88  // true. Else, it returns ioutil.NopCloser(r) and false.
    89  //
    90  // CAUTION: this function will misbehave when the input is a binary string that
    91  // happens to have the same magic gzip, zstd, or bzip2 header.  Thus, you should
    92  // use this function only when the input is expected to be ASCII.
    93  func NewReader(r io.Reader) (io.ReadCloser, bool) {
    94  	buf := bytes.Buffer{}
    95  	_, err := io.CopyN(&buf, r, 128)
    96  	var m io.Reader
    97  	switch err {
    98  	case io.EOF:
    99  		m = &buf
   100  	case nil:
   101  		m = io.MultiReader(&buf, r)
   102  	default:
   103  		m = io.MultiReader(&buf, &errorReader{err})
   104  	}
   105  	if isGzipHeader(buf.Bytes()) {
   106  		z, err := zlibng.NewReader(m)
   107  		if err != nil {
   108  			return &errorReader{err}, false
   109  		}
   110  		return z, true
   111  	}
   112  	if isZstdHeader(buf.Bytes()) {
   113  		zr, err := zstd.NewReader(m)
   114  		if err != nil {
   115  			return &errorReader{err}, false
   116  		}
   117  		return zr, true
   118  	}
   119  	if isBzip2Header(buf.Bytes()) {
   120  		return ioutil.NopCloser(bzip2.NewReader(m)), true
   121  	}
   122  	return ioutil.NopCloser(m), false
   123  }
   124  
   125  // NewReaderPath creates a reader that uncompresses data read from the given
   126  // reader.  The compression format is determined by the pathname extensions. If
   127  // the pathname ends with one of the following extensions, it creates an
   128  // uncompressing ReadCloser and returns true.
   129  //
   130  //  .gz => gzip format
   131  //  .zst => zstd format
   132  //  .bz2 => bz2 format
   133  //
   134  // For other extensions, this function returns an ioutil.NopCloser(r) and false.
   135  //
   136  // The caller must close the ReadCloser after use. For some file formats,
   137  // Close() is the only place that reports file corruption.
   138  func NewReaderPath(r io.Reader, path string) (io.ReadCloser, bool) {
   139  	switch fileio.DetermineType(path) {
   140  	case fileio.Gzip:
   141  		gz, err := zlibng.NewReader(r)
   142  		if err != nil {
   143  			return file.NewError(err), false
   144  		}
   145  		return gz, true
   146  	case fileio.Zstd:
   147  		zr, err := zstd.NewReader(r)
   148  		if err != nil {
   149  			return file.NewError(err), false
   150  		}
   151  		return zr, true
   152  	case fileio.Bzip2:
   153  		return ioutil.NopCloser(bzip2.NewReader(r)), true
   154  	}
   155  	return ioutil.NopCloser(r), false
   156  }
   157  
   158  // Open opens path with file.Open and decompresses with NewReaderPath.
   159  func Open(ctx context.Context, path string) (io.ReadCloser, bool) {
   160  	f, err := file.Open(ctx, path)
   161  	if err != nil {
   162  		return file.NewError(err), false
   163  	}
   164  	r, isCompressed := NewReaderPath(f.Reader(ctx), path)
   165  
   166  	return struct {
   167  		io.Reader
   168  		io.Closer
   169  	}{r, doubleCloser{r, ioctx.ToStdCloser(ctx, f)}}, isCompressed
   170  }
   171  
   172  // NewWriterPath creates a WriteCloser that compresses data.  The compression
   173  // format is determined by the pathname extensions. If the pathname ends with
   174  // one of the following extensions, it creates an compressing WriteCloser and
   175  // returns true.
   176  //
   177  //  .gz => gzip format
   178  //  .zst => zstd format
   179  //
   180  // For other extensions, this function creates a noop WriteCloser and returns
   181  // false.  The caller must close the WriteCloser after use.
   182  func NewWriterPath(w io.Writer, path string) (io.WriteCloser, bool) {
   183  	switch fileio.DetermineType(path) {
   184  	case fileio.Gzip:
   185  		return gzip.NewWriter(w), true
   186  	case fileio.Zstd:
   187  		zw, err := zstd.NewWriter(w)
   188  		if err != nil {
   189  			return file.NewError(err), false
   190  		}
   191  		return zw, true
   192  	case fileio.Bzip2:
   193  		return file.NewError(fmt.Errorf("%s: bzip2 writer not supported", path)), false
   194  	}
   195  	return &nopWriteCloser{w}, false
   196  }
   197  
   198  // Create creates path with file.Create and compresses with NewWriterPath.
   199  func Create(ctx context.Context, path string, opts ...file.Opts) (io.WriteCloser, bool) {
   200  	f, err := file.Create(ctx, path, opts...)
   201  	if err != nil {
   202  		return file.NewError(err), false
   203  	}
   204  	w, isCompressed := NewWriterPath(f.Writer(ctx), path)
   205  	return struct {
   206  		io.Writer
   207  		io.Closer
   208  	}{w, doubleCloser{w, ioctx.ToStdCloser(ctx, f)}}, isCompressed
   209  }
   210  
   211  // doubleCloser implements io.Closer and serves to clean up the boilerplate
   212  // around closing both the files and reader/writer objects created in
   213  // Open and Create.
   214  type doubleCloser struct {
   215  	c, d io.Closer
   216  }
   217  
   218  func (c doubleCloser) Close() (err error) {
   219  	errors.CleanUp(c.c.Close, &err)
   220  	errors.CleanUp(c.d.Close, &err)
   221  	return
   222  }