github.com/fraugster/parquet-go@v0.12.0/chunk_writer.go

package goparquet

import (
	"bytes"
	"context"
	"fmt"
	"math"
	"sort"

	"github.com/fraugster/parquet-go/parquet"
)

func getBooleanValuesEncoder(pageEncoding parquet.Encoding) (valuesEncoder, error) {
	switch pageEncoding {
	case parquet.Encoding_PLAIN:
		return &booleanPlainEncoder{}, nil
	case parquet.Encoding_RLE:
		return &booleanRLEEncoder{}, nil
	default:
		return nil, fmt.Errorf("unsupported encoding %s for boolean", pageEncoding)
	}
}

func getByteArrayValuesEncoder(pageEncoding parquet.Encoding, dictValues []interface{}) (valuesEncoder, error) {
	switch pageEncoding {
	case parquet.Encoding_PLAIN:
		return &byteArrayPlainEncoder{}, nil
	case parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY:
		return &byteArrayDeltaLengthEncoder{}, nil
	case parquet.Encoding_DELTA_BYTE_ARRAY:
		return &byteArrayDeltaEncoder{}, nil
	default:
		return nil, fmt.Errorf("unsupported encoding %s for binary", pageEncoding)
	}
}

func getFixedLenByteArrayValuesEncoder(pageEncoding parquet.Encoding, len int, dictValues []interface{}) (valuesEncoder, error) {
	switch pageEncoding {
	case parquet.Encoding_PLAIN:
		return &byteArrayPlainEncoder{length: len}, nil
	case parquet.Encoding_DELTA_BYTE_ARRAY:
		return &byteArrayDeltaEncoder{}, nil
	default:
		return nil, fmt.Errorf("unsupported encoding %s for fixed_len_byte_array(%d)", pageEncoding, len)
	}
}

func getInt32ValuesEncoder(pageEncoding parquet.Encoding, typ *parquet.SchemaElement, dictValues []interface{}) (valuesEncoder, error) {
	switch pageEncoding {
	case parquet.Encoding_PLAIN:
		return &int32PlainEncoder{}, nil
	case parquet.Encoding_DELTA_BINARY_PACKED:
		return &int32DeltaBPEncoder{
			deltaBitPackEncoder32: deltaBitPackEncoder32{
				blockSize:      128,
				miniBlockCount: 4,
			},
		}, nil
	default:
		return nil, fmt.Errorf("unsupported encoding %s for int32", pageEncoding)
	}
}

func getInt64ValuesEncoder(pageEncoding parquet.Encoding, typ *parquet.SchemaElement, dictValues []interface{}) (valuesEncoder, error) {
	switch pageEncoding {
	case parquet.Encoding_PLAIN:
		return &int64PlainEncoder{}, nil
	case parquet.Encoding_DELTA_BINARY_PACKED:
		return &int64DeltaBPEncoder{
			deltaBitPackEncoder64: deltaBitPackEncoder64{
				blockSize:      128,
				miniBlockCount: 4,
			},
		}, nil
	default:
		return nil, fmt.Errorf("unsupported encoding %s for int64", pageEncoding)
	}
}
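// exampleInt64EncoderSelection is an editorial sketch, not part of the
// original v0.12.0 file. The helpers above are normally reached through
// getValuesEncoder below, but they can be called directly; this assumed
// example selects the DELTA_BINARY_PACKED encoder for an INT64 column. The
// blockSize/miniBlockCount pair hard-coded above (128/4) means each of the
// 4 mini-blocks holds 32 values.
func exampleInt64EncoderSelection(typ *parquet.SchemaElement) (valuesEncoder, error) {
	// dictValues is ignored by the integer helpers, so nil is fine here.
	return getInt64ValuesEncoder(parquet.Encoding_DELTA_BINARY_PACKED, typ, nil)
}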
func getValuesEncoder(pageEncoding parquet.Encoding, typ *parquet.SchemaElement, dictValues []interface{}) (valuesEncoder, error) {
	// Normalize the deprecated PLAIN_DICTIONARY alias to RLE_DICTIONARY.
	if pageEncoding == parquet.Encoding_PLAIN_DICTIONARY {
		pageEncoding = parquet.Encoding_RLE_DICTIONARY
	}

	switch *typ.Type {
	case parquet.Type_BOOLEAN:
		return getBooleanValuesEncoder(pageEncoding)

	case parquet.Type_BYTE_ARRAY:
		return getByteArrayValuesEncoder(pageEncoding, dictValues)

	case parquet.Type_FIXED_LEN_BYTE_ARRAY:
		if typ.TypeLength == nil {
			return nil, fmt.Errorf("type %s with nil type len", typ.Type)
		}
		return getFixedLenByteArrayValuesEncoder(pageEncoding, int(*typ.TypeLength), dictValues)

	case parquet.Type_FLOAT:
		switch pageEncoding {
		case parquet.Encoding_PLAIN:
			return &floatPlainEncoder{}, nil
		}

	case parquet.Type_DOUBLE:
		switch pageEncoding {
		case parquet.Encoding_PLAIN:
			return &doublePlainEncoder{}, nil
		}

	case parquet.Type_INT32:
		return getInt32ValuesEncoder(pageEncoding, typ, dictValues)

	case parquet.Type_INT64:
		return getInt64ValuesEncoder(pageEncoding, typ, dictValues)

	case parquet.Type_INT96:
		switch pageEncoding {
		case parquet.Encoding_PLAIN:
			return &int96PlainEncoder{}, nil
		}

	default:
		return nil, fmt.Errorf("unsupported type: %s", typ.Type)
	}

	return nil, fmt.Errorf("unsupported encoding %s for %s type", pageEncoding, typ.Type)
}

func getDictValuesEncoder(typ *parquet.SchemaElement) (valuesEncoder, error) {
	switch *typ.Type {
	case parquet.Type_BYTE_ARRAY:
		return &byteArrayPlainEncoder{}, nil
	case parquet.Type_FIXED_LEN_BYTE_ARRAY:
		if typ.TypeLength == nil {
			return nil, fmt.Errorf("type %s with nil type len", typ)
		}
		return &byteArrayPlainEncoder{length: int(*typ.TypeLength)}, nil
	case parquet.Type_FLOAT:
		return &floatPlainEncoder{}, nil
	case parquet.Type_DOUBLE:
		return &doublePlainEncoder{}, nil
	case parquet.Type_INT32:
		return &int32PlainEncoder{}, nil
	case parquet.Type_INT64:
		return &int64PlainEncoder{}, nil
	case parquet.Type_INT96:
		return &int96PlainEncoder{}, nil
	}

	return nil, fmt.Errorf("type %s is not supported for dict value encoder", typ)
}
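// exampleDictEncoderSelection is an editorial sketch, not part of the
// original v0.12.0 file: dictionary pages always store their values
// PLAIN-encoded, which is why getDictValuesEncoder keys off the physical
// type alone and never looks at a page encoding. The schema element below
// is an assumed minimal BYTE_ARRAY column.
func exampleDictEncoderSelection() (valuesEncoder, error) {
	t := parquet.Type_BYTE_ARRAY
	return getDictValuesEncoder(&parquet.SchemaElement{Type: &t})
}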
func writeChunk(ctx context.Context, w writePos, sch *schema, col *Column, codec parquet.CompressionCodec, pageFn newDataPageFunc, kvMetaData map[string]string) (*parquet.ColumnChunk, error) {
	pos := w.Pos() // Save the position before writing any data.
	chunkOffset := pos
	var (
		dictPageOffset *int64
		// NOTE:
		// The Parquet spec documents these two fields as follows:
		//   - TotalUncompressedSize: total byte size of all uncompressed pages in this column chunk (including the headers)
		//   - TotalCompressedSize: total byte size of all compressed pages in this column chunk (including the headers)
		// The "including the headers" part is the tricky bit: the uncompressed total can be derived from the
		// writer position, but the compressed size reported by the page writer does not include the header.
		totalComp   int64
		totalUnComp int64
	)

	// Flush the final data page before writing the dictionary page (if applicable) and all data pages.
	if err := col.data.flushPage(sch, true); err != nil {
		return nil, err
	}

	dictValues := []interface{}{}
	indices := map[interface{}]int32{}
	useDict := true

	if *col.Type() == parquet.Type_BOOLEAN { // Never use dictionary encoding on booleans.
		useDict = false
	}
	if !col.data.useDictionary() {
		useDict = false
	}

	if useDict {
	outerLoop:
		for _, page := range col.data.dataPages {
			if page.stats.DistinctCount != nil && *page.stats.DistinctCount > math.MaxInt16 {
				useDict = false
				break outerLoop
			}
			for _, v := range page.values {
				k := mapKey(v)
				if idx, ok := indices[k]; !ok {
					idx = int32(len(dictValues))
					indices[k] = idx
					dictValues = append(dictValues, v)
					page.indexList = append(page.indexList, idx)

					if len(dictValues) > math.MaxInt16 {
						useDict = false
						break outerLoop
					}
				} else {
					page.indexList = append(page.indexList, idx)
				}
			}
		}
	}

	if useDict {
		tmp := pos // Make a copy; do not take the address of pos, which is mutated below.
		dictPageOffset = &tmp
		dict := &dictPageWriter{}
		if err := dict.init(sch, col, codec, dictValues); err != nil {
			return nil, err
		}
		compSize, unCompSize, err := dict.write(ctx, w)
		if err != nil {
			return nil, err
		}
		totalComp = w.Pos() - pos
		// Header size plus the rLevel and dLevel size.
		headerSize := totalComp - int64(compSize)
		totalUnComp = int64(unCompSize) + headerSize
		pos = w.Pos() // Advance the position: data pages start after the dictionary page.
	}

	var (
		compSize, unCompSize  int
		numValues, nullValues int64
	)

	for _, page := range col.data.dataPages {
		pw := pageFn(useDict, dictValues, page, sch.enableCRC)

		if err := pw.init(col, codec); err != nil {
			return nil, err
		}

		var buf bytes.Buffer

		compressed, uncompressed, err := pw.write(ctx, &buf)
		if err != nil {
			return nil, err
		}

		compSize += compressed
		unCompSize += uncompressed
		numValues += page.numValues
		nullValues += page.nullValues
		if _, err := w.Write(buf.Bytes()); err != nil {
			return nil, err
		}
	}

	col.data.dataPages = nil

	totalComp += w.Pos() - pos
	// Header size plus the rLevel and dLevel size.
	headerSize := totalComp - int64(compSize)
	totalUnComp += int64(unCompSize) + headerSize

	encodings := make([]parquet.Encoding, 0, 3)
	encodings = append(encodings,
		parquet.Encoding_RLE,
		col.data.encoding(),
	)
	if useDict {
		encodings[1] = parquet.Encoding_PLAIN // With a dictionary, the dictionary page itself is PLAIN-encoded, not the column encoding.
		encodings = append(encodings, parquet.Encoding_RLE_DICTIONARY)
	}

	keyValueMetaData := make([]*parquet.KeyValue, 0, len(kvMetaData))
	for k, v := range kvMetaData {
		value := v
		keyValueMetaData = append(keyValueMetaData, &parquet.KeyValue{Key: k, Value: &value})
	}
	sort.Slice(keyValueMetaData, func(i, j int) bool {
		return keyValueMetaData[i].Key < keyValueMetaData[j].Key
	})

	distinctCount := int64(len(dictValues))

	stats := &parquet.Statistics{
		MinValue:      col.data.getStats().minValue(),
		MaxValue:      col.data.getStats().maxValue(),
		NullCount:     &nullValues,
		DistinctCount: &distinctCount,
	}

	ch := &parquet.ColumnChunk{
		FilePath:   nil, // No support for external column data files.
		FileOffset: chunkOffset,
		MetaData: &parquet.ColumnMetaData{
			Type:                  col.data.parquetType(),
			Encodings:             encodings,
			PathInSchema:          col.path,
			Codec:                 codec,
			NumValues:             numValues + nullValues,
			TotalUncompressedSize: totalUnComp,
			TotalCompressedSize:   totalComp,
			KeyValueMetadata:      keyValueMetaData,
			DataPageOffset:        pos,
			IndexPageOffset:       nil,
			DictionaryPageOffset:  dictPageOffset,
			Statistics:            stats,
			EncodingStats:         nil,
		},
		OffsetIndexOffset: nil,
		OffsetIndexLength: nil,
		ColumnIndexOffset: nil,
		ColumnIndexLength: nil,
	}

	return ch, nil
}
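// chunkTotalsSketch is an editorial illustration, not part of the original
// v0.12.0 file. It restates the size bookkeeping in writeChunk with assumed
// names: the page writer reports payload sizes without page headers, while
// the writer position includes them, so the header size is recovered as the
// difference and added to both totals. For example, compSize=100,
// unCompSize=400 and a position advance of 120 yields a 20-byte header,
// totalComp=120 and totalUnComp=420.
func chunkTotalsSketch(posBefore, posAfter int64, compSize, unCompSize int) (totalComp, totalUnComp int64) {
	totalComp = posAfter - posBefore             // compressed payload plus page headers
	headerSize := totalComp - int64(compSize)    // the header bytes alone
	totalUnComp = int64(unCompSize) + headerSize // uncompressed payload plus the same headers
	return totalComp, totalUnComp
}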
func writeRowGroup(ctx context.Context, w writePos, sch *schema, codec parquet.CompressionCodec, pageFn newDataPageFunc, h *flushRowGroupOptionHandle) ([]*parquet.ColumnChunk, error) {
	dataCols := sch.Columns()
	var res = make([]*parquet.ColumnChunk, 0, len(dataCols))
	for _, ci := range dataCols {
		ch, err := writeChunk(ctx, w, sch, ci, codec, pageFn, h.getMetaData(ci.Path()))
		if err != nil {
			return nil, err
		}

		res = append(res, ch)
	}

	return res, nil
}
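// Editorial note, not part of the original file: writeRowGroup is unexported
// and is reached through the package's public API. A minimal caller, assuming
// the documented v0.12.0 surface (NewFileWriter, WithSchemaDefinition,
// WithCompressionCodec, AddData, Close) and an illustrative one-column
// schema definition sd, looks roughly like this:
//
//	fw := goparquet.NewFileWriter(f,
//		goparquet.WithSchemaDefinition(sd),
//		goparquet.WithCompressionCodec(parquet.CompressionCodec_SNAPPY),
//	)
//	if err := fw.AddData(map[string]interface{}{"id": int64(1)}); err != nil {
//		return err
//	}
//	return fw.Close() // flushes the final row group via writeRowGroup/writeChunk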