github.com/apache/arrow/go/v14@v14.0.1/parquet/compress/compress.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package compress contains the interfaces and implementations for handling
// compression/decompression of parquet data at the column level.
package compress

import (
	"compress/flate"
	"fmt"
	"io"

	"github.com/apache/arrow/go/v14/parquet/internal/gen-go/parquet"
)

// Compression is an alias to the thrift compression codec enum type for easy use
type Compression parquet.CompressionCodec

func (c Compression) String() string {
	return parquet.CompressionCodec(c).String()
}

// DefaultCompressionLevel will use flate.DefaultCompression since many of the
// compression libraries use that to denote "use the default".
const DefaultCompressionLevel = flate.DefaultCompression

// Codecs provides namespaced enum values for specifying the compression type
// to use. They are initialized to the same constant values as the thrift enum,
// which makes swapping between the two internally straightforward.
var Codecs = struct {
	Uncompressed Compression
	Snappy       Compression
	Gzip         Compression
	// LZO is unsupported in this library since the LZO license is incompatible with the Apache License
	Lzo    Compression
	Brotli Compression
	// LZ4 is unsupported in this library due to problematic issues between the Hadoop LZ4 spec and regular lz4,
	// see: http://mail-archives.apache.org/mod_mbox/arrow-dev/202007.mbox/%3CCAAri41v24xuA8MGHLDvgSnE+7AAgOhiEukemW_oPNHMvfMmrWw@mail.gmail.com%3E
	Lz4  Compression
	Zstd Compression
}{
	Uncompressed: Compression(parquet.CompressionCodec_UNCOMPRESSED),
	Snappy:       Compression(parquet.CompressionCodec_SNAPPY),
	Gzip:         Compression(parquet.CompressionCodec_GZIP),
	Lzo:          Compression(parquet.CompressionCodec_LZO),
	Brotli:       Compression(parquet.CompressionCodec_BROTLI),
	Lz4:          Compression(parquet.CompressionCodec_LZ4),
	Zstd:         Compression(parquet.CompressionCodec_ZSTD),
}

// Codec is the interface implemented for each compression type so that
// interactions with them stay uniform. Most consumers won't call GetCodec directly.
type Codec interface {
	// NewReader provides a reader that wraps a stream of compressed data in order to stream the uncompressed data
	NewReader(io.Reader) io.ReadCloser
	// NewWriter provides a wrapper around a write stream to compress data before writing it.
	NewWriter(io.Writer) io.WriteCloser
	// NewWriterLevel is like NewWriter but allows specifying the compression level
	NewWriterLevel(io.Writer, int) (io.WriteCloser, error)
	// Encode encodes a block of data given by src and returns the compressed block. dst should be either nil
	// or sized large enough to fit the compressed block (use CompressBound to allocate). dst and src should not
	// overlap since some of the compression types don't allow it.
	//
	// The returned slice will be one of the following:
	//  1. If dst was nil or dst was too small to fit the compressed data, it will be a newly allocated slice.
	//  2. If dst was large enough to fit the compressed data (depending on the compression algorithm it might
	//     be required to be at least CompressBound length) then it might be a slice of dst.
	Encode(dst, src []byte) []byte
	// EncodeLevel is like Encode, but specifies a particular encoding level instead of the default.
	EncodeLevel(dst, src []byte, level int) []byte
	// CompressBound returns the maximum possible size of compressed data under the chosen codec
	// for an input of the given length.
	CompressBound(int64) int64
	// Decode decodes a single block rather than a stream. As with Encode, dst must be either nil or
	// sized large enough to accommodate the uncompressed data and should not overlap with src.
	//
	// The returned slice *might* be a slice of dst.
	Decode(dst, src []byte) []byte
}

// codecs is the registry of available Codec implementations, populated by
// init functions such as the one below.
var codecs = map[Compression]Codec{}

// nocodec implements Codec for Uncompressed data: reads, writes, encodes
// and decodes pass the bytes through untouched.
type nocodec struct{}

func (nocodec) NewReader(r io.Reader) io.ReadCloser {
	ret, ok := r.(io.ReadCloser)
	if !ok {
		return io.NopCloser(r)
	}
	return ret
}

func (nocodec) Decode(dst, src []byte) []byte {
	if dst != nil {
		copy(dst, src)
	}
	return dst
}

// writerNopCloser wraps an io.Writer with a no-op Close method.
type writerNopCloser struct {
	io.Writer
}

func (writerNopCloser) Close() error {
	return nil
}

func (nocodec) Encode(dst, src []byte) []byte {
	copy(dst, src)
	return dst
}

func (nocodec) EncodeLevel(dst, src []byte, _ int) []byte {
	copy(dst, src)
	return dst
}

func (nocodec) NewWriter(w io.Writer) io.WriteCloser {
	ret, ok := w.(io.WriteCloser)
	if !ok {
		return writerNopCloser{w}
	}
	return ret
}

func (n nocodec) NewWriterLevel(w io.Writer, _ int) (io.WriteCloser, error) {
	return n.NewWriter(w), nil
}

func (nocodec) CompressBound(len int64) int64 { return len }

func init() {
	codecs[Codecs.Uncompressed] = nocodec{}
}

// GetCodec returns a Codec interface for the requested Compression type
func GetCodec(typ Compression) (Codec, error) {
	ret, ok := codecs[typ]
	if !ok {
		return nil, fmt.Errorf("compression for %s unimplemented", typ.String())
	}
	return ret, nil
}
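// roundTrip below is a minimal usage sketch of the Codec interface above, not
// part of the upstream file. It assumes a concrete codec (Gzip here) has been
// registered by another file in this package via init, the same way nocodec
// registers itself above; the helper name roundTrip is hypothetical.
func roundTrip(src []byte) ([]byte, error) {
	codec, err := GetCodec(Codecs.Gzip)
	if err != nil {
		return nil, err // requested codec is not implemented/registered
	}
	// Per the Encode docs, dst should be nil or at least CompressBound bytes,
	// and the returned slice may or may not be a slice of dst.
	compressed := codec.Encode(make([]byte, codec.CompressBound(int64(len(src)))), src)
	// Per the Decode docs, dst must be nil or large enough for the uncompressed
	// output; here the original length is known.
	return codec.Decode(make([]byte, len(src)), compressed), nil
}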