github.com/apache/arrow/go/v7@v7.0.1/parquet/compress/compress.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 // Package compress contains the interfaces and implementations for handling compression/decompression 18 // of parquet data at the column levels. 19 package compress 20 21 import ( 22 "compress/flate" 23 "io" 24 "io/ioutil" 25 26 "github.com/apache/arrow/go/v7/parquet/internal/gen-go/parquet" 27 "golang.org/x/xerrors" 28 ) 29 30 // Compression is an alias to the thrift compression codec enum type for easy use 31 type Compression parquet.CompressionCodec 32 33 func (c Compression) String() string { 34 return parquet.CompressionCodec(c).String() 35 } 36 37 // DefaultCompressionLevel will use flate.DefaultCompression since many of the compression libraries 38 // use that to denote "use the default". 39 const DefaultCompressionLevel = flate.DefaultCompression 40 41 // Codecs is a useful struct to provide namespaced enum values to use for specifying the compression type to use 42 // which make for easy internal swapping between them and the thrift enum since they are initialized to the same 43 // constant values. 44 var Codecs = struct { 45 Uncompressed Compression 46 Snappy Compression 47 Gzip Compression 48 // LZO is unsupported in this library since LZO license is incompatible with Apache License 49 Lzo Compression 50 Brotli Compression 51 // LZ4 unsupported in this library due to problematic issues between the Hadoop LZ4 spec vs regular lz4 52 // see: http://mail-archives.apache.org/mod_mbox/arrow-dev/202007.mbox/%3CCAAri41v24xuA8MGHLDvgSnE+7AAgOhiEukemW_oPNHMvfMmrWw@mail.gmail.com%3E 53 Lz4 Compression 54 Zstd Compression 55 }{ 56 Uncompressed: Compression(parquet.CompressionCodec_UNCOMPRESSED), 57 Snappy: Compression(parquet.CompressionCodec_SNAPPY), 58 Gzip: Compression(parquet.CompressionCodec_GZIP), 59 Lzo: Compression(parquet.CompressionCodec_LZO), 60 Brotli: Compression(parquet.CompressionCodec_BROTLI), 61 Lz4: Compression(parquet.CompressionCodec_LZ4), 62 Zstd: Compression(parquet.CompressionCodec_ZSTD), 63 } 64 65 // Codec is an interface which is implemented for each compression type in order to make the interactions easy to 66 // implement. Most consumers won't be calling GetCodec directly. 67 type Codec interface { 68 // NewReader provides a reader that wraps a stream with compressed data to stream the uncompressed data 69 NewReader(io.Reader) io.ReadCloser 70 // NewWriter provides a wrapper around a write stream to compress data before writing it. 71 NewWriter(io.Writer) io.WriteCloser 72 // NewWriterLevel is like NewWriter but allows specifying the compression level 73 NewWriterLevel(io.Writer, int) (io.WriteCloser, error) 74 // Encode encodes a block of data given by src and returns the compressed block. dst should be either nil 75 // or sized large enough to fit the compressed block (use CompressBound to allocate). dst and src should not 76 // overlap since some of the compression types don't allow it. 77 // 78 // The returned slice will be one of the following: 79 // 1. If dst was nil or dst was too small to fit the compressed data, it will be a newly allocated slice 80 // 2. If dst was large enough to fit the compressed data (depending on the compression algorithm it might 81 // be required to be at least CompressBound length) then it might be a slice of dst. 82 Encode(dst, src []byte) []byte 83 // EncodeLevel is like Encode, but specifies a particular encoding level instead of the default. 84 EncodeLevel(dst, src []byte, level int) []byte 85 // CompressBound returns the boundary of maximum size of compressed data under the chosen codec. 86 CompressBound(int64) int64 87 // Decode is for decoding a single block rather than a stream, like with Encode, dst must be either nil or 88 // sized large enough to accommodate the uncompressed data and should not overlap with src. 89 // 90 // the returned slice *might* be a slice of dst. 91 Decode(dst, src []byte) []byte 92 } 93 94 var codecs = map[Compression]Codec{} 95 96 type nocodec struct{} 97 98 func (nocodec) NewReader(r io.Reader) io.ReadCloser { 99 ret, ok := r.(io.ReadCloser) 100 if !ok { 101 return ioutil.NopCloser(r) 102 } 103 return ret 104 } 105 106 func (nocodec) Decode(dst, src []byte) []byte { 107 if dst != nil { 108 copy(dst, src) 109 } 110 return dst 111 } 112 113 type writerNopCloser struct { 114 io.Writer 115 } 116 117 func (writerNopCloser) Close() error { 118 return nil 119 } 120 121 func (nocodec) Encode(dst, src []byte) []byte { 122 copy(dst, src) 123 return dst 124 } 125 126 func (nocodec) EncodeLevel(dst, src []byte, _ int) []byte { 127 copy(dst, src) 128 return dst 129 } 130 131 func (nocodec) NewWriter(w io.Writer) io.WriteCloser { 132 ret, ok := w.(io.WriteCloser) 133 if !ok { 134 return writerNopCloser{w} 135 } 136 return ret 137 } 138 139 func (n nocodec) NewWriterLevel(w io.Writer, _ int) (io.WriteCloser, error) { 140 return n.NewWriter(w), nil 141 } 142 143 func (nocodec) CompressBound(len int64) int64 { return len } 144 145 func init() { 146 codecs[Codecs.Uncompressed] = nocodec{} 147 } 148 149 // GetCodec returns a Codec interface for the requested Compression type 150 func GetCodec(typ Compression) (Codec, error) { 151 ret, ok := codecs[typ] 152 if !ok { 153 return nil, xerrors.Errorf("compression for %s unimplemented", typ.String()) 154 } 155 return ret, nil 156 }