github.com/apache/arrow/go/v16@v16.1.0/parquet/compress/compress.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package compress contains the interfaces and implementations for handling compression/decompression
// of parquet data at the column level.
package compress

import (
	"compress/flate"
	"fmt"
	"io"

	"github.com/apache/arrow/go/v16/parquet/internal/gen-go/parquet"
)

// Compression is an alias for the thrift compression codec enum type, for easy use.
type Compression parquet.CompressionCodec

func (c Compression) String() string {
	return parquet.CompressionCodec(c).String()
}

// DefaultCompressionLevel will use flate.DefaultCompression since many of the compression libraries
// use that to denote "use the default".
const DefaultCompressionLevel = flate.DefaultCompression

// Codecs is a struct of namespaced enum values for specifying the compression type to use. Since they
// are initialized to the same constant values as the thrift enum, swapping between the two internally
// is straightforward.
var Codecs = struct {
	Uncompressed Compression
	Snappy       Compression
	Gzip         Compression
	// LZO is unsupported in this library since the LZO license is incompatible with the Apache License
	Lzo    Compression
	Brotli Compression
	// LZ4 is unsupported in this library due to incompatibilities between the Hadoop LZ4 spec and regular lz4,
	// see: http://mail-archives.apache.org/mod_mbox/arrow-dev/202007.mbox/%3CCAAri41v24xuA8MGHLDvgSnE+7AAgOhiEukemW_oPNHMvfMmrWw@mail.gmail.com%3E
	Lz4  Compression
	Zstd Compression
}{
	Uncompressed: Compression(parquet.CompressionCodec_UNCOMPRESSED),
	Snappy:       Compression(parquet.CompressionCodec_SNAPPY),
	Gzip:         Compression(parquet.CompressionCodec_GZIP),
	Lzo:          Compression(parquet.CompressionCodec_LZO),
	Brotli:       Compression(parquet.CompressionCodec_BROTLI),
	Lz4:          Compression(parquet.CompressionCodec_LZ4),
	Zstd:         Compression(parquet.CompressionCodec_ZSTD),
}

// Codec is the interface implemented for each compression type so the rest of the library can interact
// with them uniformly. Most consumers won't be calling GetCodec directly.
type Codec interface {
	// NewReader provides a reader that wraps a stream containing compressed data in order to stream the uncompressed data.
	NewReader(io.Reader) io.ReadCloser
	// NewWriter provides a wrapper around a write stream to compress data before writing it.
	NewWriter(io.Writer) io.WriteCloser
	// NewWriterLevel is like NewWriter but allows specifying the compression level.
	NewWriterLevel(io.Writer, int) (io.WriteCloser, error)
	// Encode encodes a block of data given by src and returns the compressed block. dst should be either nil
	// or sized large enough to fit the compressed block (use CompressBound to allocate). dst and src should not
	// overlap since some of the compression types don't allow it.
	//
	// The returned slice will be one of the following:
	//	1. If dst was nil or dst was too small to fit the compressed data, it will be a newly allocated slice.
	//	2. If dst was large enough to fit the compressed data (depending on the compression algorithm it might
	//	   be required to be at least CompressBound length), then it might be a slice of dst.
	Encode(dst, src []byte) []byte
	// EncodeLevel is like Encode, but specifies a particular encoding level instead of the default.
	EncodeLevel(dst, src []byte, level int) []byte
	// CompressBound returns the maximum possible size of the compressed data for the given uncompressed
	// length under the chosen codec.
	CompressBound(int64) int64
	// Decode decodes a single block rather than a stream. As with Encode, dst must be either nil or sized
	// large enough to accommodate the uncompressed data, and should not overlap with src.
	//
	// The returned slice *might* be a slice of dst.
	Decode(dst, src []byte) []byte
}

// codecs is the registry of codec implementations, keyed by compression type.
var codecs = map[Compression]Codec{}

// RegisterCodec adds or overrides a codec implementation for a given compression algorithm.
// The intended use case is within the init() section of a package. For example:
//
//	// inside a custom codec package, say czstd
//
//	func init() {
//		RegisterCodec(compress.Codecs.Zstd, czstdCodec{})
//	}
//
//	type czstdCodec struct{} // implements the Codec interface using a CGO-based ZSTD wrapper
//
// A user of the custom codec can then import that package like below:
//
//	package main
//
//	import _ "package/path/to/czstd"
func RegisterCodec(compression Compression, codec Codec) {
	codecs[compression] = codec
}

// nocodec is a passthrough implementation of Codec used for uncompressed data.
type nocodec struct{}

func (nocodec) NewReader(r io.Reader) io.ReadCloser {
	ret, ok := r.(io.ReadCloser)
	if !ok {
		return io.NopCloser(r)
	}
	return ret
}

func (nocodec) Decode(dst, src []byte) []byte {
	if dst != nil {
		copy(dst, src)
	}
	return dst
}

// writerNopCloser wraps an io.Writer with a no-op Close method.
type writerNopCloser struct {
	io.Writer
}

func (writerNopCloser) Close() error {
	return nil
}

func (nocodec) Encode(dst, src []byte) []byte {
	copy(dst, src)
	return dst
}

func (nocodec) EncodeLevel(dst, src []byte, _ int) []byte {
	copy(dst, src)
	return dst
}

func (nocodec) NewWriter(w io.Writer) io.WriteCloser {
	ret, ok := w.(io.WriteCloser)
	if !ok {
		return writerNopCloser{w}
	}
	return ret
}

func (n nocodec) NewWriterLevel(w io.Writer, _ int) (io.WriteCloser, error) {
	return n.NewWriter(w), nil
}

func (nocodec) CompressBound(len int64) int64 { return len }

func init() {
	codecs[Codecs.Uncompressed] = nocodec{}
}

// GetCodec returns a Codec interface for the requested Compression type, returning an error if no
// codec has been registered for it.
func GetCodec(typ Compression) (Codec, error) {
	ret, ok := codecs[typ]
	if !ok {
		return nil, fmt.Errorf("compression for %s unimplemented", typ.String())
	}
	return ret, nil
}
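For context, here is a minimal usage sketch (not part of the file above) showing how a consumer could round-trip a block of data through a registered codec. It assumes the bundled snappy codec is registered by init functions elsewhere in this package, and it only relies on the API documented above: GetCodec, CompressBound, Encode, and Decode.

package main

import (
	"bytes"
	"fmt"
	"log"

	"github.com/apache/arrow/go/v16/parquet/compress"
)

func main() {
	// Look up the snappy codec; this fails if no implementation was registered.
	codec, err := compress.GetCodec(compress.Codecs.Snappy)
	if err != nil {
		log.Fatal(err)
	}

	src := bytes.Repeat([]byte("parquet column data "), 64)

	// Size dst via CompressBound so Encode can potentially reuse it instead of
	// allocating a new slice.
	dst := make([]byte, codec.CompressBound(int64(len(src))))
	compressed := codec.Encode(dst, src)

	// Decode needs a destination large enough for the uncompressed data; when
	// reading parquet files, that size comes from the page header.
	out := codec.Decode(make([]byte, len(src)), compressed)
	fmt.Println(bytes.Equal(src, out)) // true
}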