github.com/apache/arrow/go/v7@v7.0.1/parquet/compress/compress.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  // Package compress contains the interfaces and implementations for handling compression/decompression
    18  // of parquet data at the column levels.
    19  package compress
    20  
    21  import (
    22  	"compress/flate"
    23  	"io"
    24  	"io/ioutil"
    25  
    26  	"github.com/apache/arrow/go/v7/parquet/internal/gen-go/parquet"
    27  	"golang.org/x/xerrors"
    28  )
    29  
    30  // Compression is an alias to the thrift compression codec enum type for easy use
    31  type Compression parquet.CompressionCodec
    32  
    33  func (c Compression) String() string {
    34  	return parquet.CompressionCodec(c).String()
    35  }
    36  
    37  // DefaultCompressionLevel will use flate.DefaultCompression since many of the compression libraries
    38  // use that to denote "use the default".
    39  const DefaultCompressionLevel = flate.DefaultCompression
    40  
    41  // Codecs is a useful struct to provide namespaced enum values to use for specifying the compression type to use
    42  // which make for easy internal swapping between them and the thrift enum since they are initialized to the same
    43  // constant values.
    44  var Codecs = struct {
    45  	Uncompressed Compression
    46  	Snappy       Compression
    47  	Gzip         Compression
    48  	// LZO is unsupported in this library since LZO license is incompatible with Apache License
    49  	Lzo    Compression
    50  	Brotli Compression
    51  	// LZ4 unsupported in this library due to problematic issues between the Hadoop LZ4 spec vs regular lz4
    52  	// see: http://mail-archives.apache.org/mod_mbox/arrow-dev/202007.mbox/%3CCAAri41v24xuA8MGHLDvgSnE+7AAgOhiEukemW_oPNHMvfMmrWw@mail.gmail.com%3E
    53  	Lz4  Compression
    54  	Zstd Compression
    55  }{
    56  	Uncompressed: Compression(parquet.CompressionCodec_UNCOMPRESSED),
    57  	Snappy:       Compression(parquet.CompressionCodec_SNAPPY),
    58  	Gzip:         Compression(parquet.CompressionCodec_GZIP),
    59  	Lzo:          Compression(parquet.CompressionCodec_LZO),
    60  	Brotli:       Compression(parquet.CompressionCodec_BROTLI),
    61  	Lz4:          Compression(parquet.CompressionCodec_LZ4),
    62  	Zstd:         Compression(parquet.CompressionCodec_ZSTD),
    63  }
    64  
    65  // Codec is an interface which is implemented for each compression type in order to make the interactions easy to
    66  // implement. Most consumers won't be calling GetCodec directly.
    67  type Codec interface {
    68  	// NewReader provides a reader that wraps a stream with compressed data to stream the uncompressed data
    69  	NewReader(io.Reader) io.ReadCloser
    70  	// NewWriter provides a wrapper around a write stream to compress data before writing it.
    71  	NewWriter(io.Writer) io.WriteCloser
    72  	// NewWriterLevel is like NewWriter but allows specifying the compression level
    73  	NewWriterLevel(io.Writer, int) (io.WriteCloser, error)
    74  	// Encode encodes a block of data given by src and returns the compressed block. dst should be either nil
    75  	// or sized large enough to fit the compressed block (use CompressBound to allocate). dst and src should not
    76  	// overlap since some of the compression types don't allow it.
    77  	//
    78  	// The returned slice will be one of the following:
    79  	//	1. If dst was nil or dst was too small to fit the compressed data, it will be a newly allocated slice
    80  	//	2. If dst was large enough to fit the compressed data (depending on the compression algorithm it might
    81  	//		 be required to be at least CompressBound length) then it might be a slice of dst.
    82  	Encode(dst, src []byte) []byte
    83  	// EncodeLevel is like Encode, but specifies a particular encoding level instead of the default.
    84  	EncodeLevel(dst, src []byte, level int) []byte
    85  	// CompressBound returns the boundary of maximum size of compressed data under the chosen codec.
    86  	CompressBound(int64) int64
    87  	// Decode is for decoding a single block rather than a stream, like with Encode, dst must be either nil or
    88  	// sized large enough to accommodate the uncompressed data and should not overlap with src.
    89  	//
    90  	// the returned slice *might* be a slice of dst.
    91  	Decode(dst, src []byte) []byte
    92  }
    93  
    94  var codecs = map[Compression]Codec{}
    95  
    96  type nocodec struct{}
    97  
    98  func (nocodec) NewReader(r io.Reader) io.ReadCloser {
    99  	ret, ok := r.(io.ReadCloser)
   100  	if !ok {
   101  		return ioutil.NopCloser(r)
   102  	}
   103  	return ret
   104  }
   105  
   106  func (nocodec) Decode(dst, src []byte) []byte {
   107  	if dst != nil {
   108  		copy(dst, src)
   109  	}
   110  	return dst
   111  }
   112  
   113  type writerNopCloser struct {
   114  	io.Writer
   115  }
   116  
   117  func (writerNopCloser) Close() error {
   118  	return nil
   119  }
   120  
   121  func (nocodec) Encode(dst, src []byte) []byte {
   122  	copy(dst, src)
   123  	return dst
   124  }
   125  
   126  func (nocodec) EncodeLevel(dst, src []byte, _ int) []byte {
   127  	copy(dst, src)
   128  	return dst
   129  }
   130  
   131  func (nocodec) NewWriter(w io.Writer) io.WriteCloser {
   132  	ret, ok := w.(io.WriteCloser)
   133  	if !ok {
   134  		return writerNopCloser{w}
   135  	}
   136  	return ret
   137  }
   138  
   139  func (n nocodec) NewWriterLevel(w io.Writer, _ int) (io.WriteCloser, error) {
   140  	return n.NewWriter(w), nil
   141  }
   142  
   143  func (nocodec) CompressBound(len int64) int64 { return len }
   144  
   145  func init() {
   146  	codecs[Codecs.Uncompressed] = nocodec{}
   147  }
   148  
   149  // GetCodec returns a Codec interface for the requested Compression type
   150  func GetCodec(typ Compression) (Codec, error) {
   151  	ret, ok := codecs[typ]
   152  	if !ok {
   153  		return nil, xerrors.Errorf("compression for %s unimplemented", typ.String())
   154  	}
   155  	return ret, nil
   156  }