github.com/grailbio/base@v0.0.11/compress/libdeflate/libdeflate.go (about)

     1  // Copyright 2018 GRAIL, Inc.  All rights reserved.
     2  // Use of this source code is governed by the Apache-2.0
     3  // license that can be found in the LICENSE file.
     4  
     5  package libdeflate
     6  
     7  import (
     8  	"compress/gzip"
     9  	"encoding/binary"
    10  	"errors"
    11  	"fmt"
    12  	"hash/crc32"
    13  	"io"
    14  )
    15  
    16  // This is a slightly modified version of klauspost/compress/gzip/gzip.go , and
    17  // is designed to be a drop-in replacement for it in bgzf writer code.  It may
    18  // eventually go into its own sub-package, but for now I'll keep it here since
    19  // bgzf is our only use case (zstd is generally a better choice for new custom
    20  // formats).
    21  
    22  // These constants are copied from klauspost/compress/gzip.  The
    23  // BestCompression value is technically a lie for libdeflate--it goes all the
    24  // way up to 12, not 9--so I've defined an extra constant for the real highest
    25  // setting.
    26  const (
    27  	BestSpeed          = gzip.BestSpeed
    28  	BestCompression    = gzip.BestCompression
    29  	BestestCompression = 12
    30  	DefaultCompression = gzip.DefaultCompression
    31  
    32  	// NoCompression and ConstantCompression/HuffmanOnly are not supported by
    33  	// this package.
    34  
    35  	DefaultBufCap = 0x10000
    36  
    37  	gzipID1     = 0x1f
    38  	gzipID2     = 0x8b
    39  	gzipDeflate = 8
    40  )
    41  
    42  // A Writer is an io.WriteCloser.
    43  // Writes to a Writer are compressed and written to w.
    44  type Writer struct {
    45  	gzip.Header
    46  	w          io.Writer
    47  	level      int
    48  	compressor Compressor
    49  	digest     uint32 // CRC-32, IEEE polynomial (section 8)
    50  	size       uint32 // Uncompressed size (section 2.3.1)
    51  	closed     bool
    52  	buf        []byte
    53  
    54  	// Next two fields are specific to libdeflate.
    55  	// NewWriter{Level}() can't accept a bufCap argument without breaking
    56  	// klauspost/compress/gzip compatibility, so we have bufCap default to the
    57  	// 65536 value needed by bgzf, export a function that changes it, and
    58  	// lazy-initialize buf[] so that we don't allocate a size-65536 block only to
    59  	// immediately throw it away if the user wants a different capacity.
    60  	// bufCap must be large enough to fit the entire gzip block, not just the
    61  	// compressed portion.
    62  	bufCap int
    63  	bufPos int
    64  
    65  	// wroteHeader has been removed since it's equivalent to (bufPos != 0).
    66  
    67  	err error
    68  }
    69  
    70  // NewWriter returns a new Writer.
    71  // Writes to the returned writer are compressed and written to w.
    72  //
    73  // It is the caller's responsibility to call Close on the WriteCloser when
    74  // done.
    75  // Writes may be buffered and not flushed until Close.
    76  //
    77  // Callers that wish to set the fields in Writer.Header must do so before the
    78  // first call to Write, Flush, or Close.
    79  func NewWriter(w io.Writer) *Writer {
    80  	z, _ := NewWriterLevel(w, DefaultCompression)
    81  	return z
    82  }
    83  
    84  // NewWriterLevel is like NewWriter but specifies the compression level instead
    85  // of assuming DefaultCompression.
    86  //
    87  // The compression level can be DefaultCompression, or any integer value
    88  // between 1 and BestCompression inclusive.  The error returned will be nil if
    89  // the level is valid.
    90  func NewWriterLevel(w io.Writer, level int) (*Writer, error) {
    91  	if (level < DefaultCompression) || (level == 0) || (level > BestestCompression) {
    92  		return nil, fmt.Errorf("libdeflate: invalid compression level: %d", level)
    93  	}
    94  	z := new(Writer)
    95  
    96  	z.bufCap = DefaultBufCap
    97  
    98  	z.init(w, level)
    99  	return z, nil
   100  }
   101  
   102  func (z *Writer) init(w io.Writer, level int) {
   103  	// compressor is now lazy-(re)initialized later.
   104  
   105  	buf := z.buf
   106  	bufCap := z.bufCap
   107  
   108  	*z = Writer{
   109  		Header: gzip.Header{
   110  			OS: 255, // unknown
   111  		},
   112  		w:      w,
   113  		level:  level,
   114  		buf:    buf,
   115  		bufCap: bufCap,
   116  	}
   117  }
   118  
   119  // Reset discards the Writer z's state and makes it equivalent to the result of
   120  // its original state from NewWriter or NewWriterLevel, but writing to w
   121  // instead.  This permits reusing a Writer rather than allocating a new one.
   122  //
   123  // It is safe to call Reset() without Close().  In this case, *no* bytes from
   124  // the previous block are written.
   125  func (z *Writer) Reset(w io.Writer) {
   126  	z.init(w, z.level)
   127  }
   128  
   129  // SetCap changes the capacity of the final write buffer, which must fit the
   130  // entire gzip block (header and footer bytes included).  (libdeflate requires
   131  // this value to be declared in advance.)
   132  // If this function isn't called, the capacity is set to DefaultBufCap ==
   133  // 0x10000.
   134  func (z *Writer) SetCap(newCap int) error {
   135  	if z.bufPos != 0 {
   136  		return errors.New("libdeflate.SetCap: invalid call (must immediately follow initialization/reset)")
   137  	}
   138  	if newCap < 18 {
   139  		// guarantee enough space for always-present header and footer bytes, so we
   140  		// can be slightly less paranoid with bounds-checks
   141  		return errors.New("libdeflate.SetCap: capacity too low")
   142  	}
   143  	z.bufCap = newCap
   144  	return nil
   145  }
   146  
   147  var le = binary.LittleEndian
   148  
   149  // appendBytes appends a length-prefixed byte slice to z.buf.
   150  func (z *Writer) appendBytes(b []byte) error {
   151  	if len(b) > 0xffff {
   152  		return errors.New("libdeflate.Write: extra data is too large")
   153  	}
   154  	midPos := z.bufPos + 2
   155  	endPos := midPos + len(b)
   156  	if endPos > z.bufCap {
   157  		return errors.New("libdeflate.Write: out of buffer space")
   158  	}
   159  	le.PutUint16(z.buf[z.bufPos:], uint16(len(b)))
   160  	copy(z.buf[midPos:], b)
   161  	z.bufPos = endPos
   162  	return nil
   163  }
   164  
   165  // appendString appends a UTF-8 string s in GZIP's format to z.buf.
   166  // GZIP (RFC 1952) specifies that strings are NUL-terminated ISO 8859-1
   167  // (Latin-1).
   168  func (z *Writer) appendString(s string) (err error) {
   169  	// GZIP stores Latin-1 strings; error if non-Latin-1; convert if non-ASCII.
   170  	needconv := false
   171  	for _, v := range s {
   172  		if v == 0 || v > 0xff {
   173  			return errors.New("libdeflate.Write: non-Latin-1 header string")
   174  		}
   175  		if v > 0x7f {
   176  			needconv = true
   177  		}
   178  	}
   179  	nulPos := z.bufPos
   180  	if needconv {
   181  		b := make([]byte, 0, len(s))
   182  		for _, v := range s {
   183  			b = append(b, byte(v))
   184  		}
   185  		nulPos += len(b)
   186  		if nulPos >= z.bufCap {
   187  			return errors.New("libdeflate.Write: out of buffer space")
   188  		}
   189  		copy(z.buf[z.bufPos:], b)
   190  	} else {
   191  		nulPos += len(s)
   192  		if nulPos >= z.bufCap {
   193  			return errors.New("libdeflate.Write: out of buffer space")
   194  		}
   195  		copy(z.buf[z.bufPos:], s)
   196  	}
   197  	// GZIP strings are NUL-terminated.
   198  	z.buf[nulPos] = 0
   199  	z.bufPos = nulPos + 1
   200  	return nil
   201  }
   202  
   203  // Write writes a compressed form of p to the underlying io.Writer. The
   204  // compressed bytes are not necessarily flushed until the Writer is closed.
   205  func (z *Writer) Write(p []byte) (int, error) {
   206  	if z.err != nil {
   207  		return 0, z.err
   208  	}
   209  	// Enforce the libdeflate constraint.
   210  	if z.bufPos != 0 {
   211  		z.err = errors.New("libdeflate.Write: only one Write operation permitted per block")
   212  		return 0, z.err
   213  	}
   214  	// 'Write' the GZIP header lazily.
   215  	if len(z.buf) < z.bufCap {
   216  		if cap(z.buf) < z.bufCap {
   217  			// Impossible to avoid zero-initialization here in Go.
   218  			z.buf = make([]byte, z.bufCap)
   219  		} else {
   220  			// No need to zero-reinitialize.
   221  			z.buf = z.buf[:z.bufCap]
   222  		}
   223  	} else if len(z.buf) > z.bufCap {
   224  		// Likely to be irrelevant, but may as well maintain this invariant
   225  		z.buf = z.buf[:z.bufCap]
   226  	}
   227  	z.buf[0] = gzipID1
   228  	z.buf[1] = gzipID2
   229  	z.buf[2] = gzipDeflate
   230  	z.buf[3] = 0
   231  	if z.Extra != nil {
   232  		z.buf[3] |= 0x04
   233  	}
   234  	if z.Name != "" {
   235  		z.buf[3] |= 0x08
   236  	}
   237  	if z.Comment != "" {
   238  		z.buf[3] |= 0x10
   239  	}
   240  	le.PutUint32(z.buf[4:8], uint32(z.ModTime.Unix()))
   241  	if z.level >= BestCompression {
   242  		// Reasonable to set this for any level in 9..12.
   243  		z.buf[8] = 2
   244  	} else if z.level == BestSpeed {
   245  		z.buf[8] = 4
   246  	} else {
   247  		z.buf[8] = 0
   248  	}
   249  	z.buf[9] = z.OS
   250  	z.bufPos = 10
   251  	if z.Extra != nil {
   252  		z.err = z.appendBytes(z.Extra)
   253  		if z.err != nil {
   254  			return 0, z.err
   255  		}
   256  	}
   257  	if z.Name != "" {
   258  		z.err = z.appendString(z.Name)
   259  		if z.err != nil {
   260  			return 0, z.err
   261  		}
   262  	}
   263  	if z.Comment != "" {
   264  		z.err = z.appendString(z.Comment)
   265  		if z.err != nil {
   266  			return 0, z.err
   267  		}
   268  	}
   269  	z.err = z.compressor.Init(z.level)
   270  	if z.err != nil {
   271  		return 0, z.err
   272  	}
   273  	z.size += uint32(len(p))
   274  	z.digest = crc32.Update(z.digest, crc32.IEEETable, p)
   275  
   276  	n := z.compressor.Compress(z.buf[z.bufPos:z.bufCap-8], p)
   277  	z.bufPos += n
   278  	if n == 0 {
   279  		z.err = errors.New("libdeflate.Write: out of buffer space")
   280  	}
   281  	return n, z.err
   282  }
   283  
   284  // Flush() has been removed for now.
   285  
   286  // Close closes the Writer, flushing any unwritten data to the underlying
   287  // io.Writer, but does not close the underlying io.Writer.
   288  func (z *Writer) Close() error {
   289  	if z.err != nil {
   290  		return z.err
   291  	}
   292  	if z.closed {
   293  		return nil
   294  	}
   295  	z.closed = true
   296  	if z.bufPos == 0 {
   297  		_, z.err = z.Write(nil)
   298  		if z.err != nil {
   299  			return z.err
   300  		}
   301  	}
   302  	// a bit inefficient to keep calling this, but given the current interface we
   303  	// have no choice.
   304  	z.compressor.Cleanup()
   305  	midPos := z.bufPos + 4
   306  	endPos := midPos + 4
   307  	if endPos > z.bufCap {
   308  		z.err = errors.New("libdeflate.Write: out of buffer space")
   309  	}
   310  	le.PutUint32(z.buf[z.bufPos:], z.digest)
   311  	le.PutUint32(z.buf[midPos:], z.size)
   312  	_, z.err = z.w.Write(z.buf[:endPos])
   313  	return z.err
   314  }