github.com/fraugster/parquet-go@v0.12.0/compress.go (about) 1 package goparquet 2 3 import ( 4 "bytes" 5 "compress/gzip" 6 "errors" 7 "fmt" 8 "io" 9 "io/ioutil" 10 "sync" 11 12 "github.com/fraugster/parquet-go/parquet" 13 "github.com/golang/snappy" 14 ) 15 16 var ( 17 compressors = make(map[parquet.CompressionCodec]BlockCompressor) 18 compressorLock sync.RWMutex 19 ) 20 21 type ( 22 // BlockCompressor is an interface to describe of a block compressor to be used 23 // in compressing the content of parquet files. 24 BlockCompressor interface { 25 CompressBlock([]byte) ([]byte, error) 26 DecompressBlock([]byte) ([]byte, error) 27 } 28 29 plainCompressor struct{} 30 snappyCompressor struct{} 31 gzipCompressor struct{} 32 ) 33 34 func (plainCompressor) CompressBlock(block []byte) ([]byte, error) { 35 return block, nil 36 } 37 38 func (plainCompressor) DecompressBlock(block []byte) ([]byte, error) { 39 return block, nil 40 } 41 42 func (snappyCompressor) CompressBlock(block []byte) ([]byte, error) { 43 return snappy.Encode(nil, block), nil 44 } 45 46 func (snappyCompressor) DecompressBlock(block []byte) ([]byte, error) { 47 return snappy.Decode(nil, block) 48 } 49 50 func (gzipCompressor) CompressBlock(block []byte) ([]byte, error) { 51 buf := &bytes.Buffer{} 52 w := gzip.NewWriter(buf) 53 if _, err := w.Write(block); err != nil { 54 return nil, err 55 } 56 if err := w.Close(); err != nil { 57 return nil, err 58 } 59 60 return buf.Bytes(), nil 61 } 62 63 func (gzipCompressor) DecompressBlock(block []byte) ([]byte, error) { 64 buf := bytes.NewReader(block) 65 r, err := gzip.NewReader(buf) 66 if err != nil { 67 return nil, err 68 } 69 70 ret, err := ioutil.ReadAll(r) 71 if err != nil { 72 return nil, err 73 } 74 75 return ret, r.Close() 76 } 77 78 func compressBlock(block []byte, method parquet.CompressionCodec) ([]byte, error) { 79 compressorLock.RLock() 80 defer compressorLock.RUnlock() 81 82 c, ok := compressors[method] 83 if !ok { 84 return nil, fmt.Errorf("method %q is not supported", method.String()) 85 } 86 87 return c.CompressBlock(block) 88 } 89 90 func decompressBlock(block []byte, method parquet.CompressionCodec) ([]byte, error) { 91 compressorLock.RLock() 92 defer compressorLock.RUnlock() 93 94 c, ok := compressors[method] 95 if !ok { 96 return nil, fmt.Errorf("method %q is not supported", method.String()) 97 } 98 99 return c.DecompressBlock(block) 100 } 101 102 func newBlockReader(buf []byte, codec parquet.CompressionCodec, compressedSize int32, uncompressedSize int32, alloc *allocTracker) (io.Reader, error) { 103 if compressedSize < 0 || uncompressedSize < 0 { 104 return nil, errors.New("invalid page data size") 105 } 106 107 if len(buf) != int(compressedSize) { 108 return nil, fmt.Errorf("compressed data must be %d byte but its %d byte", compressedSize, len(buf)) 109 } 110 111 alloc.test(uint64(uncompressedSize)) 112 res, err := decompressBlock(buf, codec) 113 if err != nil { 114 return nil, fmt.Errorf("decompression failed: %w", err) 115 } 116 alloc.register(res, uint64(len(res))) 117 118 if len(res) != int(uncompressedSize) { 119 return nil, fmt.Errorf("decompressed data must be %d byte but its %d byte", uncompressedSize, len(res)) 120 } 121 122 return bytes.NewReader(res), nil 123 } 124 125 // RegisterBlockCompressor is a function to to register additional block compressors to the package. By default, 126 // only UNCOMPRESSED, GZIP and SNAPPY are supported as parquet compression algorithms. The parquet file format 127 // supports more compression algorithms, such as LZO, BROTLI, LZ4 and ZSTD. To limit the amount of external dependencies, 128 // the number of supported algorithms was reduced to a core set. If you want to use any of the other compression 129 // algorithms, please provide your own implementation of it in a way that satisfies the BlockCompressor interface, 130 // and register it using this function from your code. 131 func RegisterBlockCompressor(method parquet.CompressionCodec, compressor BlockCompressor) { 132 compressorLock.Lock() 133 defer compressorLock.Unlock() 134 135 compressors[method] = compressor 136 } 137 138 // GetRegisteredBlockCompressors returns a map of compression codecs to block compressors that 139 // are currently registered. 140 func GetRegisteredBlockCompressors() map[parquet.CompressionCodec]BlockCompressor { 141 result := make(map[parquet.CompressionCodec]BlockCompressor) 142 143 compressorLock.Lock() 144 defer compressorLock.Unlock() 145 146 for k, v := range compressors { 147 result[k] = v 148 } 149 150 return result 151 } 152 153 func init() { 154 RegisterBlockCompressor(parquet.CompressionCodec_UNCOMPRESSED, plainCompressor{}) 155 RegisterBlockCompressor(parquet.CompressionCodec_GZIP, gzipCompressor{}) 156 RegisterBlockCompressor(parquet.CompressionCodec_SNAPPY, snappyCompressor{}) 157 }