github.com/fraugster/parquet-go@v0.12.0/compress.go (about)

     1  package goparquet
     2  
     3  import (
     4  	"bytes"
     5  	"compress/gzip"
     6  	"errors"
     7  	"fmt"
     8  	"io"
     9  	"io/ioutil"
    10  	"sync"
    11  
    12  	"github.com/fraugster/parquet-go/parquet"
    13  	"github.com/golang/snappy"
    14  )
    15  
    16  var (
    17  	compressors    = make(map[parquet.CompressionCodec]BlockCompressor)
    18  	compressorLock sync.RWMutex
    19  )
    20  
    21  type (
    22  	// BlockCompressor is an interface to describe of a block compressor to be used
    23  	// in compressing the content of parquet files.
    24  	BlockCompressor interface {
    25  		CompressBlock([]byte) ([]byte, error)
    26  		DecompressBlock([]byte) ([]byte, error)
    27  	}
    28  
    29  	plainCompressor  struct{}
    30  	snappyCompressor struct{}
    31  	gzipCompressor   struct{}
    32  )
    33  
    34  func (plainCompressor) CompressBlock(block []byte) ([]byte, error) {
    35  	return block, nil
    36  }
    37  
    38  func (plainCompressor) DecompressBlock(block []byte) ([]byte, error) {
    39  	return block, nil
    40  }
    41  
    42  func (snappyCompressor) CompressBlock(block []byte) ([]byte, error) {
    43  	return snappy.Encode(nil, block), nil
    44  }
    45  
    46  func (snappyCompressor) DecompressBlock(block []byte) ([]byte, error) {
    47  	return snappy.Decode(nil, block)
    48  }
    49  
    50  func (gzipCompressor) CompressBlock(block []byte) ([]byte, error) {
    51  	buf := &bytes.Buffer{}
    52  	w := gzip.NewWriter(buf)
    53  	if _, err := w.Write(block); err != nil {
    54  		return nil, err
    55  	}
    56  	if err := w.Close(); err != nil {
    57  		return nil, err
    58  	}
    59  
    60  	return buf.Bytes(), nil
    61  }
    62  
    63  func (gzipCompressor) DecompressBlock(block []byte) ([]byte, error) {
    64  	buf := bytes.NewReader(block)
    65  	r, err := gzip.NewReader(buf)
    66  	if err != nil {
    67  		return nil, err
    68  	}
    69  
    70  	ret, err := ioutil.ReadAll(r)
    71  	if err != nil {
    72  		return nil, err
    73  	}
    74  
    75  	return ret, r.Close()
    76  }
    77  
    78  func compressBlock(block []byte, method parquet.CompressionCodec) ([]byte, error) {
    79  	compressorLock.RLock()
    80  	defer compressorLock.RUnlock()
    81  
    82  	c, ok := compressors[method]
    83  	if !ok {
    84  		return nil, fmt.Errorf("method %q is not supported", method.String())
    85  	}
    86  
    87  	return c.CompressBlock(block)
    88  }
    89  
    90  func decompressBlock(block []byte, method parquet.CompressionCodec) ([]byte, error) {
    91  	compressorLock.RLock()
    92  	defer compressorLock.RUnlock()
    93  
    94  	c, ok := compressors[method]
    95  	if !ok {
    96  		return nil, fmt.Errorf("method %q is not supported", method.String())
    97  	}
    98  
    99  	return c.DecompressBlock(block)
   100  }
   101  
   102  func newBlockReader(buf []byte, codec parquet.CompressionCodec, compressedSize int32, uncompressedSize int32, alloc *allocTracker) (io.Reader, error) {
   103  	if compressedSize < 0 || uncompressedSize < 0 {
   104  		return nil, errors.New("invalid page data size")
   105  	}
   106  
   107  	if len(buf) != int(compressedSize) {
   108  		return nil, fmt.Errorf("compressed data must be %d byte but its %d byte", compressedSize, len(buf))
   109  	}
   110  
   111  	alloc.test(uint64(uncompressedSize))
   112  	res, err := decompressBlock(buf, codec)
   113  	if err != nil {
   114  		return nil, fmt.Errorf("decompression failed: %w", err)
   115  	}
   116  	alloc.register(res, uint64(len(res)))
   117  
   118  	if len(res) != int(uncompressedSize) {
   119  		return nil, fmt.Errorf("decompressed data must be %d byte but its %d byte", uncompressedSize, len(res))
   120  	}
   121  
   122  	return bytes.NewReader(res), nil
   123  }
   124  
   125  // RegisterBlockCompressor is a function to to register additional block compressors to the package. By default,
   126  // only UNCOMPRESSED, GZIP and SNAPPY are supported as parquet compression algorithms. The parquet file format
   127  // supports more compression algorithms, such as LZO, BROTLI, LZ4 and ZSTD. To limit the amount of external dependencies,
   128  // the number of supported algorithms was reduced to a core set. If you want to use any of the other compression
   129  // algorithms, please provide your own implementation of it in a way that satisfies the BlockCompressor interface,
   130  // and register it using this function from your code.
   131  func RegisterBlockCompressor(method parquet.CompressionCodec, compressor BlockCompressor) {
   132  	compressorLock.Lock()
   133  	defer compressorLock.Unlock()
   134  
   135  	compressors[method] = compressor
   136  }
   137  
   138  // GetRegisteredBlockCompressors returns a map of compression codecs to block compressors that
   139  // are currently registered.
   140  func GetRegisteredBlockCompressors() map[parquet.CompressionCodec]BlockCompressor {
   141  	result := make(map[parquet.CompressionCodec]BlockCompressor)
   142  
   143  	compressorLock.Lock()
   144  	defer compressorLock.Unlock()
   145  
   146  	for k, v := range compressors {
   147  		result[k] = v
   148  	}
   149  
   150  	return result
   151  }
   152  
   153  func init() {
   154  	RegisterBlockCompressor(parquet.CompressionCodec_UNCOMPRESSED, plainCompressor{})
   155  	RegisterBlockCompressor(parquet.CompressionCodec_GZIP, gzipCompressor{})
   156  	RegisterBlockCompressor(parquet.CompressionCodec_SNAPPY, snappyCompressor{})
   157  }