github.com/grailbio/base@v0.0.11/compress/rw.go (about) 1 // Package compress provides convenience functions for creating compressors and 2 // uncompressors based on filenames. 3 package compress 4 5 import ( 6 "bytes" 7 "compress/bzip2" 8 "context" 9 "fmt" 10 "io" 11 "io/ioutil" 12 13 "github.com/grailbio/base/compress/zstd" 14 "github.com/grailbio/base/errors" 15 "github.com/grailbio/base/file" 16 "github.com/grailbio/base/fileio" 17 "github.com/grailbio/base/ioctx" 18 "github.com/klauspost/compress/gzip" 19 "github.com/yasushi-saito/zlibng" 20 ) 21 22 // errorReader is a ReadCloser implementation that always returns the given 23 // error. 24 type errorReader struct{ err error } 25 26 func (r *errorReader) Read(buf []byte) (int, error) { return 0, r.err } 27 func (r *errorReader) Close() error { return r.err } 28 29 // nopWriteCloser adds a noop Closer to io.Writer. 30 type nopWriteCloser struct{ io.Writer } 31 32 func (w *nopWriteCloser) Close() error { return nil } 33 34 func isBzip2Header(buf []byte) bool { 35 // https://www.forensicswiki.org/wiki/Bzip2 36 if len(buf) < 10 { 37 return false 38 } 39 if !(buf[0] == 'B' && buf[1] == 'Z' && buf[2] == 'h' && buf[3] >= '1' && buf[3] <= '9') { 40 return false 41 } 42 if buf[4] == 0x31 && buf[5] == 0x41 && 43 buf[6] == 0x59 && buf[7] == 0x26 && 44 buf[8] == 0x53 && buf[9] == 0x59 { // block magic 45 return true 46 } 47 if buf[4] == 0x17 && buf[5] == 0x72 && 48 buf[6] == 0x45 && buf[7] == 0x38 && 49 buf[8] == 0x50 && buf[9] == 0x90 { // eos magic, happens only for an empty bz2 file. 50 return true 51 } 52 return false 53 } 54 55 func isGzipHeader(buf []byte) bool { 56 if len(buf) < 10 { 57 return false 58 } 59 if !(buf[0] == 0x1f && buf[1] == 0x8b) { 60 return false 61 } 62 if !(buf[2] <= 3 || buf[2] == 8) { 63 return false 64 } 65 if (buf[3] & 0xc0) != 0 { 66 return false 67 } 68 if !(buf[9] <= 0xd || buf[9] == 0xff) { 69 return false 70 } 71 return true 72 } 73 74 // https://tools.ietf.org/html/rfc8478 75 func isZstdHeader(buf []byte) bool { 76 if len(buf) < 4 { 77 return false 78 } 79 if buf[0] != 0x28 || buf[1] != 0xB5 || buf[2] != 0x2F || buf[3] != 0xFD { 80 return false 81 } 82 return true 83 } 84 85 // NewReader creates an uncompressing reader by reading the first few bytes of 86 // the input and finding a magic header for either gzip, zstd, bzip2. If the 87 // magic header is found , it returns an uncompressing ReadCloser and 88 // true. Else, it returns ioutil.NopCloser(r) and false. 89 // 90 // CAUTION: this function will misbehave when the input is a binary string that 91 // happens to have the same magic gzip, zstd, or bzip2 header. Thus, you should 92 // use this function only when the input is expected to be ASCII. 93 func NewReader(r io.Reader) (io.ReadCloser, bool) { 94 buf := bytes.Buffer{} 95 _, err := io.CopyN(&buf, r, 128) 96 var m io.Reader 97 switch err { 98 case io.EOF: 99 m = &buf 100 case nil: 101 m = io.MultiReader(&buf, r) 102 default: 103 m = io.MultiReader(&buf, &errorReader{err}) 104 } 105 if isGzipHeader(buf.Bytes()) { 106 z, err := zlibng.NewReader(m) 107 if err != nil { 108 return &errorReader{err}, false 109 } 110 return z, true 111 } 112 if isZstdHeader(buf.Bytes()) { 113 zr, err := zstd.NewReader(m) 114 if err != nil { 115 return &errorReader{err}, false 116 } 117 return zr, true 118 } 119 if isBzip2Header(buf.Bytes()) { 120 return ioutil.NopCloser(bzip2.NewReader(m)), true 121 } 122 return ioutil.NopCloser(m), false 123 } 124 125 // NewReaderPath creates a reader that uncompresses data read from the given 126 // reader. The compression format is determined by the pathname extensions. If 127 // the pathname ends with one of the following extensions, it creates an 128 // uncompressing ReadCloser and returns true. 129 // 130 // .gz => gzip format 131 // .zst => zstd format 132 // .bz2 => bz2 format 133 // 134 // For other extensions, this function returns an ioutil.NopCloser(r) and false. 135 // 136 // The caller must close the ReadCloser after use. For some file formats, 137 // Close() is the only place that reports file corruption. 138 func NewReaderPath(r io.Reader, path string) (io.ReadCloser, bool) { 139 switch fileio.DetermineType(path) { 140 case fileio.Gzip: 141 gz, err := zlibng.NewReader(r) 142 if err != nil { 143 return file.NewError(err), false 144 } 145 return gz, true 146 case fileio.Zstd: 147 zr, err := zstd.NewReader(r) 148 if err != nil { 149 return file.NewError(err), false 150 } 151 return zr, true 152 case fileio.Bzip2: 153 return ioutil.NopCloser(bzip2.NewReader(r)), true 154 } 155 return ioutil.NopCloser(r), false 156 } 157 158 // Open opens path with file.Open and decompresses with NewReaderPath. 159 func Open(ctx context.Context, path string) (io.ReadCloser, bool) { 160 f, err := file.Open(ctx, path) 161 if err != nil { 162 return file.NewError(err), false 163 } 164 r, isCompressed := NewReaderPath(f.Reader(ctx), path) 165 166 return struct { 167 io.Reader 168 io.Closer 169 }{r, doubleCloser{r, ioctx.ToStdCloser(ctx, f)}}, isCompressed 170 } 171 172 // NewWriterPath creates a WriteCloser that compresses data. The compression 173 // format is determined by the pathname extensions. If the pathname ends with 174 // one of the following extensions, it creates an compressing WriteCloser and 175 // returns true. 176 // 177 // .gz => gzip format 178 // .zst => zstd format 179 // 180 // For other extensions, this function creates a noop WriteCloser and returns 181 // false. The caller must close the WriteCloser after use. 182 func NewWriterPath(w io.Writer, path string) (io.WriteCloser, bool) { 183 switch fileio.DetermineType(path) { 184 case fileio.Gzip: 185 return gzip.NewWriter(w), true 186 case fileio.Zstd: 187 zw, err := zstd.NewWriter(w) 188 if err != nil { 189 return file.NewError(err), false 190 } 191 return zw, true 192 case fileio.Bzip2: 193 return file.NewError(fmt.Errorf("%s: bzip2 writer not supported", path)), false 194 } 195 return &nopWriteCloser{w}, false 196 } 197 198 // Create creates path with file.Create and compresses with NewWriterPath. 199 func Create(ctx context.Context, path string, opts ...file.Opts) (io.WriteCloser, bool) { 200 f, err := file.Create(ctx, path, opts...) 201 if err != nil { 202 return file.NewError(err), false 203 } 204 w, isCompressed := NewWriterPath(f.Writer(ctx), path) 205 return struct { 206 io.Writer 207 io.Closer 208 }{w, doubleCloser{w, ioctx.ToStdCloser(ctx, f)}}, isCompressed 209 } 210 211 // doubleCloser implements io.Closer and serves to clean up the boilerplate 212 // around closing both the files and reader/writer objects created in 213 // Open and Create. 214 type doubleCloser struct { 215 c, d io.Closer 216 } 217 218 func (c doubleCloser) Close() (err error) { 219 errors.CleanUp(c.c.Close, &err) 220 errors.CleanUp(c.d.Close, &err) 221 return 222 }