github.com/apache/arrow/go/v14@v14.0.1/parquet/compress/compress.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  // Package compress contains the interfaces and implementations for handling compression/decompression
    18  // of parquet data at the column levels.
    19  package compress
    20  
    21  import (
    22  	"compress/flate"
    23  	"fmt"
    24  	"io"
    25  
    26  	"github.com/apache/arrow/go/v14/parquet/internal/gen-go/parquet"
    27  )
    28  
    29  // Compression is an alias to the thrift compression codec enum type for easy use
    30  type Compression parquet.CompressionCodec
    31  
    32  func (c Compression) String() string {
    33  	return parquet.CompressionCodec(c).String()
    34  }
    35  
    36  // DefaultCompressionLevel is the level to pass when a codec should pick its own default. It is set to
    37  // flate.DefaultCompression since many of the compression libraries use that value to denote "use the default".
    38  const DefaultCompressionLevel = flate.DefaultCompression
    39  
    40  // Codecs provides namespaced enum values for specifying which compression type to use. Each field is
    41  // initialized from the thrift enum constant of the same name, which makes swapping between Compression and
    42  // the thrift enum straightforward since the values are identical.
    43  var Codecs = struct {
    44  	Uncompressed Compression
    45  	Snappy       Compression
    46  	Gzip         Compression
    47  	// LZO is unsupported in this library since the LZO license is incompatible with the Apache License.
    48  	Lzo    Compression
    49  	Brotli Compression
    50  	// LZ4 is unsupported in this library due to incompatibilities between the Hadoop LZ4 spec and regular lz4,
    51  	// see: http://mail-archives.apache.org/mod_mbox/arrow-dev/202007.mbox/%3CCAAri41v24xuA8MGHLDvgSnE+7AAgOhiEukemW_oPNHMvfMmrWw@mail.gmail.com%3E
    52  	Lz4  Compression
    53  	Zstd Compression
    54  }{
    55  	Uncompressed: Compression(parquet.CompressionCodec_UNCOMPRESSED),
    56  	Snappy:       Compression(parquet.CompressionCodec_SNAPPY),
    57  	Gzip:         Compression(parquet.CompressionCodec_GZIP),
    58  	Lzo:          Compression(parquet.CompressionCodec_LZO),
    59  	Brotli:       Compression(parquet.CompressionCodec_BROTLI),
    60  	Lz4:          Compression(parquet.CompressionCodec_LZ4),
    61  	Zstd:         Compression(parquet.CompressionCodec_ZSTD),
    62  }
    63  
    64  // Codec is the interface implemented for each compression type so that callers can handle all of them uniformly.
    65  // Implementations are obtained through GetCodec, though most consumers won't be calling GetCodec directly.
    66  type Codec interface {
    67  	// NewReader wraps a stream of compressed data and returns a reader that yields the uncompressed data.
    68  	NewReader(io.Reader) io.ReadCloser
    69  	// NewWriter wraps a write stream so that data is compressed before it is written to that stream.
    70  	NewWriter(io.Writer) io.WriteCloser
    71  	// NewWriterLevel is like NewWriter but allows specifying the compression level.
    72  	NewWriterLevel(io.Writer, int) (io.WriteCloser, error)
    73  	// Encode compresses the block of data given by src and returns the compressed block. dst should be either nil
    74  	// or sized large enough to fit the compressed block (use CompressBound to allocate). dst and src should not
    75  	// overlap since some of the compression types don't allow it.
    76  	//
    77  	// The returned slice will be one of the following:
    78  	//	1. If dst was nil or dst was too small to fit the compressed data, it will be a newly allocated slice.
    79  	//	2. If dst was large enough to fit the compressed data (depending on the compression algorithm it might
    80  	//		 be required to be at least CompressBound length) then it might be a slice of dst.
    81  	Encode(dst, src []byte) []byte
    82  	// EncodeLevel is like Encode, but uses the given compression level instead of the default.
    83  	EncodeLevel(dst, src []byte, level int) []byte
    84  	// CompressBound returns the maximum size the compressed data can take under this codec for the given input size.
    85  	CompressBound(int64) int64
    86  	// Decode decompresses a single block rather than a stream. As with Encode, dst must be either nil or
    87  	// sized large enough to accommodate the uncompressed data and should not overlap with src.
    88  	//
    89  	// The returned slice *might* be a slice of dst.
    90  	Decode(dst, src []byte) []byte
    91  }
    92  
    93  var codecs = map[Compression]Codec{}
    94  
    95  type nocodec struct{}
    96  
    97  func (nocodec) NewReader(r io.Reader) io.ReadCloser {
    98  	ret, ok := r.(io.ReadCloser)
    99  	if !ok {
   100  		return io.NopCloser(r)
   101  	}
   102  	return ret
   103  }
   104  
   105  func (nocodec) Decode(dst, src []byte) []byte {
   106  	if dst != nil {
   107  		copy(dst, src)
   108  	}
   109  	return dst
   110  }
   111  
   112  type writerNopCloser struct {
   113  	io.Writer
   114  }
   115  
   116  func (writerNopCloser) Close() error {
   117  	return nil
   118  }
   119  
   120  func (nocodec) Encode(dst, src []byte) []byte {
   121  	copy(dst, src)
   122  	return dst
   123  }
   124  
   125  func (nocodec) EncodeLevel(dst, src []byte, _ int) []byte {
   126  	copy(dst, src)
   127  	return dst
   128  }
   129  
   130  func (nocodec) NewWriter(w io.Writer) io.WriteCloser {
   131  	ret, ok := w.(io.WriteCloser)
   132  	if !ok {
   133  		return writerNopCloser{w}
   134  	}
   135  	return ret
   136  }
   137  
   138  func (n nocodec) NewWriterLevel(w io.Writer, _ int) (io.WriteCloser, error) {
   139  	return n.NewWriter(w), nil
   140  }
   141  
   142  func (nocodec) CompressBound(len int64) int64 { return len }
   143  
   144  func init() {
   145  	codecs[Codecs.Uncompressed] = nocodec{}
   146  }
   147  
   148  // GetCodec returns a Codec interface for the requested Compression type
   149  func GetCodec(typ Compression) (Codec, error) {
   150  	ret, ok := codecs[typ]
   151  	if !ok {
   152  		return nil, fmt.Errorf("compression for %s unimplemented", typ.String())
   153  	}
   154  	return ret, nil
   155  }