github.com/apache/arrow/go/v16@v16.1.0/parquet/compress/compress.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  // Package compress contains the interfaces and implementations for handling compression/decompression
    18  // of parquet data at the column levels.
    19  package compress
    20  
    21  import (
    22  	"compress/flate"
    23  	"fmt"
    24  	"io"
    25  
    26  	"github.com/apache/arrow/go/v16/parquet/internal/gen-go/parquet"
    27  )
    28  
    29  // Compression is an alias to the thrift compression codec enum type for easy use
    30  type Compression parquet.CompressionCodec
    31  
    32  func (c Compression) String() string {
    33  	return parquet.CompressionCodec(c).String()
    34  }
    35  
// DefaultCompressionLevel will use flate.DefaultCompression since many of the compression libraries
// use that to denote "use the default". Individual codec implementations translate
// this sentinel to their own notion of a default level.
const DefaultCompressionLevel = flate.DefaultCompression
    39  
// Codecs is a useful struct to provide namespaced enum values to use for specifying the compression type to use
// which make for easy internal swapping between them and the thrift enum since they are initialized to the same
// constant values.
var Codecs = struct {
	Uncompressed Compression
	Snappy       Compression
	Gzip         Compression
	// LZO is unsupported in this library since LZO license is incompatible with Apache License
	Lzo    Compression
	Brotli Compression
	// LZ4 unsupported in this library due to problematic issues between the Hadoop LZ4 spec vs regular lz4
	// see: http://mail-archives.apache.org/mod_mbox/arrow-dev/202007.mbox/%3CCAAri41v24xuA8MGHLDvgSnE+7AAgOhiEukemW_oPNHMvfMmrWw@mail.gmail.com%3E
	Lz4  Compression
	Zstd Compression
}{
	// Each value mirrors the corresponding thrift enum constant so the two
	// can be converted back and forth with a plain type conversion.
	Uncompressed: Compression(parquet.CompressionCodec_UNCOMPRESSED),
	Snappy:       Compression(parquet.CompressionCodec_SNAPPY),
	Gzip:         Compression(parquet.CompressionCodec_GZIP),
	Lzo:          Compression(parquet.CompressionCodec_LZO),
	Brotli:       Compression(parquet.CompressionCodec_BROTLI),
	Lz4:          Compression(parquet.CompressionCodec_LZ4),
	Zstd:         Compression(parquet.CompressionCodec_ZSTD),
}
    63  
// Codec is an interface which is implemented for each compression type in order to make the interactions easy to
// implement. Most consumers won't be calling GetCodec directly; the parquet column readers/writers
// use it internally.
type Codec interface {
	// NewReader provides a reader that wraps a stream with compressed data to stream the uncompressed data
	NewReader(io.Reader) io.ReadCloser
	// NewWriter provides a wrapper around a write stream to compress data before writing it.
	// The default compression level for the codec is used.
	NewWriter(io.Writer) io.WriteCloser
	// NewWriterLevel is like NewWriter but allows specifying the compression level
	NewWriterLevel(io.Writer, int) (io.WriteCloser, error)
	// Encode encodes a block of data given by src and returns the compressed block. dst should be either nil
	// or sized large enough to fit the compressed block (use CompressBound to allocate). dst and src should not
	// overlap since some of the compression types don't allow it.
	//
	// The returned slice will be one of the following:
	//	1. If dst was nil or dst was too small to fit the compressed data, it will be a newly allocated slice
	//	2. If dst was large enough to fit the compressed data (depending on the compression algorithm it might
	//		 be required to be at least CompressBound length) then it might be a slice of dst.
	Encode(dst, src []byte) []byte
	// EncodeLevel is like Encode, but specifies a particular encoding level instead of the default.
	EncodeLevel(dst, src []byte, level int) []byte
	// CompressBound returns the boundary of maximum size of compressed data under the chosen codec.
	CompressBound(int64) int64
	// Decode is for decoding a single block rather than a stream, like with Encode, dst must be either nil or
	// sized large enough to accommodate the uncompressed data and should not overlap with src.
	//
	// the returned slice *might* be a slice of dst.
	Decode(dst, src []byte) []byte
}
    92  
    93  var codecs = map[Compression]Codec{}
    94  
// RegisterCodec adds or overrides a codec implementation for a given compression algorithm.
// It is not safe for concurrent use; the intended use case is within the init() section
// of a package. For example,
//
//	// inside a custom codec package, say czstd
//
//	func init() {
//	    RegisterCodec(compress.Codecs.Zstd, czstdCodec{})
//	}
//
//	type czstdCodec struct{} // implementing Codec interface using CGO based ZSTD wrapper
//
// And user of the custom codec can import the above package like below,
//
//	package main
//
//	import _ "package/path/to/czstd"
func RegisterCodec(compression Compression, codec Codec) {
	codecs[compression] = codec
}
   114  
   115  type nocodec struct{}
   116  
   117  func (nocodec) NewReader(r io.Reader) io.ReadCloser {
   118  	ret, ok := r.(io.ReadCloser)
   119  	if !ok {
   120  		return io.NopCloser(r)
   121  	}
   122  	return ret
   123  }
   124  
   125  func (nocodec) Decode(dst, src []byte) []byte {
   126  	if dst != nil {
   127  		copy(dst, src)
   128  	}
   129  	return dst
   130  }
   131  
   132  type writerNopCloser struct {
   133  	io.Writer
   134  }
   135  
   136  func (writerNopCloser) Close() error {
   137  	return nil
   138  }
   139  
   140  func (nocodec) Encode(dst, src []byte) []byte {
   141  	copy(dst, src)
   142  	return dst
   143  }
   144  
   145  func (nocodec) EncodeLevel(dst, src []byte, _ int) []byte {
   146  	copy(dst, src)
   147  	return dst
   148  }
   149  
   150  func (nocodec) NewWriter(w io.Writer) io.WriteCloser {
   151  	ret, ok := w.(io.WriteCloser)
   152  	if !ok {
   153  		return writerNopCloser{w}
   154  	}
   155  	return ret
   156  }
   157  
   158  func (n nocodec) NewWriterLevel(w io.Writer, _ int) (io.WriteCloser, error) {
   159  	return n.NewWriter(w), nil
   160  }
   161  
   162  func (nocodec) CompressBound(len int64) int64 { return len }
   163  
// Uncompressed data must always work out of the box, so the no-op codec is
// registered unconditionally at package load time.
func init() {
	codecs[Codecs.Uncompressed] = nocodec{}
}
   167  
   168  // GetCodec returns a Codec interface for the requested Compression type
   169  func GetCodec(typ Compression) (Codec, error) {
   170  	ret, ok := codecs[typ]
   171  	if !ok {
   172  		return nil, fmt.Errorf("compression for %s unimplemented", typ.String())
   173  	}
   174  	return ret, nil
   175  }