github.com/fraugster/parquet-go@v0.12.0/data_store.go

package goparquet

import (
	"errors"
	"fmt"
	"math/bits"

	"github.com/fraugster/parquet-go/parquet"
)

// ColumnStore is the read/write implementation for a column. It buffers a single
// column's data that is to be written to a parquet file, knows how to encode this
// data and will choose an optimal way according to heuristics. It also ensures the
// correct decoding of column data to be read.
type ColumnStore struct {
	typedColumnStore

	repTyp parquet.FieldRepetitionType

	pages   []pageReader
	pageIdx int

	values *dictStore

	dLevels *packedArray
	rLevels *packedArray

	enc     parquet.Encoding
	readPos int

	useDict bool

	skipped bool

	dataPages []*dataPage

	maxPageSize int64

	prevNumRecords int64 // this is just for correctly calculating how many rows are in a data page.

	alloc *allocTracker
}

type dataPage struct {
	values     []interface{}
	indexList  []int32
	rL         *packedArray
	dL         *packedArray
	numValues  int64
	nullValues int64
	numRows    int64
	stats      *parquet.Statistics
}

// useDictionary reports whether a dictionary should be used for this column.
func (cs *ColumnStore) useDictionary() bool {
	return cs.useDict
}

func (cs *ColumnStore) encoding() parquet.Encoding {
	return cs.enc
}

func (cs *ColumnStore) repetitionType() parquet.FieldRepetitionType {
	return cs.repTyp
}

func (cs *ColumnStore) reset(rep parquet.FieldRepetitionType, maxR, maxD uint16) {
	if cs.typedColumnStore == nil {
		panic("generic should be used with typed column store")
	}
	cs.repTyp = rep
	if cs.values == nil {
		cs.values = &dictStore{useDict: cs.useDict, alloc: cs.alloc}
		cs.rLevels = &packedArray{}
		cs.dLevels = &packedArray{}
	}
	cs.values.init()
	cs.rLevels.reset(bits.Len16(maxR))
	cs.dLevels.reset(bits.Len16(maxD))
	cs.readPos = 0
	cs.skipped = false
	cs.prevNumRecords = 0

	cs.typedColumnStore.reset(rep)
}

func (cs *ColumnStore) appendRDLevel(rl, dl uint16) {
	cs.rLevels.appendSingle(int32(rl))
	cs.dLevels.appendSingle(int32(dl))
}

// add appends one row to the column store. If the value is nil, a null entry is
// recorded; if the value is repeated, every element of the array is added. The
// second argument is the definition level. If there is data, levels and values
// are appended; if there is only null (or an empty array), only the levels and a
// null placeholder are recorded.
func (cs *ColumnStore) add(v interface{}, dL uint16, maxRL, rL uint16) error {
	// if the current column is repeated, we should increase the maxRL here
	if cs.repTyp == parquet.FieldRepetitionType_REPEATED {
		maxRL++
	}
	if rL > maxRL {
		rL = maxRL
	}
	// The dL is a little tricky: there are cases where a REQUIRED field is nil
	// (because something above it is nil). It cannot be nil at the first level,
	// but at deeper levels that is fine, and the definition level is then one less.
	if v == nil {
		cs.appendRDLevel(rL, dL)
		cs.values.addValue(nil, 0)
		return nil
	}
	vals, err := cs.getValues(v)
	if err != nil {
		return err
	}
	if len(vals) == 0 {
		// maxRL might have been increased above and will be increased again in the
		// recursive call, but for nil values this does not matter.
		return cs.add(nil, dL, maxRL, rL)
	}

	for i, j := range vals {
		cs.values.addValue(j, cs.sizeOf(j))
		tmp := dL
		if cs.repTyp != parquet.FieldRepetitionType_REQUIRED {
			tmp++
		}

		if i == 0 {
			cs.appendRDLevel(rL, tmp)
		} else {
			cs.appendRDLevel(maxRL, tmp)
		}
	}

	return nil
}
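// Illustrative example, not part of the original source: for a repeated column with
// maxR = 1 and maxD = 1, and assuming the schema calls add with dL = 0 and rL = 0 at
// the start of a record, adding the slice [10, 20] followed by a nil record gives:
//
//	_ = cs.add([]int64{10, 20}, 0, 0, 0)
//	// values 10 and 20 are stored; level pairs: (rL=0, dL=1), (rL=1, dL=1)
//	_ = cs.add(nil, 0, 0, 0)
//	// a null placeholder is stored; level pair: (rL=0, dL=0)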
func (cs *ColumnStore) estimateSize() (total int64) {
	dictSize, noDictSize := cs.values.sizes()
	if cs.useDictionary() {
		total += dictSize
	} else {
		total += noDictSize
	}
	total += int64(len(cs.rLevels.data) + len(cs.dLevels.data))
	return total
}

func (cs *ColumnStore) getMaxPageSize() int64 {
	if cs.maxPageSize == 0 {
		return 1024 * 1024
	}
	return cs.maxPageSize
}

func (cs *ColumnStore) flushPage(sch *schema, force bool) error {
	size := cs.estimateSize()

	if !force && size < cs.getMaxPageSize() {
		return nil
	}

	numRows := sch.numRecords - cs.prevNumRecords
	cs.prevNumRecords = sch.numRecords

	cs.dataPages = append(cs.dataPages, &dataPage{
		values:     cs.values.getValues(),
		rL:         cs.rLevels,
		dL:         cs.dLevels,
		numValues:  int64(cs.values.numValues()),
		nullValues: int64(cs.values.nullValueCount()),
		numRows:    numRows,
		stats: &parquet.Statistics{
			NullCount:     int64Ptr(int64(cs.values.nullValueCount())),
			DistinctCount: int64Ptr(cs.values.distinctValueCount()),
			MaxValue:      cs.getPageStats().maxValue(),
			MinValue:      cs.getPageStats().minValue(),
		},
	})

	cs.resetData()

	return nil
}

func int64Ptr(v int64) *int64 {
	return &v
}
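// Illustrative example, not part of the original source: flushPage only cuts a page
// early once the estimated buffered size reaches getMaxPageSize() (1 MiB when
// maxPageSize is zero); the force flag bypasses the size check:
//
//	cs.maxPageSize = 0           // getMaxPageSize() returns 1024 * 1024
//	_ = cs.flushPage(sch, false) // no-op while estimateSize() is below 1 MiB
//	_ = cs.flushPage(sch, true)  // always appends a new dataPage and resets the buffers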
// getRDLevelAt returns the repetition and definition levels at the given read position.
// The last return value is true when there is no value left. If the position is less
// than zero, the current read position is used.
// NOTE: make sure r always comes before d, in any function.
func (cs *ColumnStore) getRDLevelAt(pos int) (int32, int32, bool) {
	if pos < 0 {
		pos = cs.readPos
	}
	if pos >= cs.rLevels.count || pos >= cs.dLevels.count {
		return 0, 0, true
	}
	dl, err := cs.dLevels.at(pos)
	if err != nil {
		return 0, 0, true
	}
	rl, err := cs.rLevels.at(pos)
	if err != nil {
		return 0, 0, true
	}

	return rl, dl, false
}

func (cs *ColumnStore) getNext() (v interface{}, err error) {
	v, err = cs.values.getNextValue()
	if err != nil {
		return nil, err
	}
	return v, nil
}

func (cs *ColumnStore) resetData() {
	cs.readPos = 0
	cs.values = &dictStore{useDict: cs.useDict, alloc: cs.alloc}
	cs.values.init()

	rLevelBitWidth := cs.rLevels.bw
	dLevelBitWidth := cs.dLevels.bw

	cs.rLevels = &packedArray{}
	cs.dLevels = &packedArray{}
	cs.rLevels.reset(rLevelBitWidth)
	cs.dLevels.reset(dLevelBitWidth)

	cs.getPageStats().reset()
}

func (cs *ColumnStore) readNextPage() error {
	if cs.pageIdx >= len(cs.pages) {
		return fmt.Errorf("out of range: requested page index = %d total number of pages = %d", cs.pageIdx, len(cs.pages))
	}

	data, dl, rl, err := cs.pages[cs.pageIdx].readValues(int(cs.pages[cs.pageIdx].numValues()))
	if err != nil {
		return err
	}

	cs.pageIdx++

	cs.resetData()

	cs.values.readPos = 0

	for _, v := range data {
		cs.values.addValue(v, cs.sizeOf(v))
	}

	cs.rLevels.appendArray(rl)
	cs.dLevels.appendArray(dl)

	return nil
}

func (cs *ColumnStore) get(maxD, maxR int32) (interface{}, int32, error) {
	if cs.skipped {
		return nil, 0, nil
	}

	if cs.readPos >= cs.rLevels.count || cs.readPos >= cs.dLevels.count {
		if err := cs.readNextPage(); err != nil {
			return nil, 0, err
		}
	}
	_, dl, _ := cs.getRDLevelAt(cs.readPos)
	// this is a null value: advance the read position (and with it the rLevel and
	// dLevel), but do not touch the dict-store
	if dl < maxD {
		cs.readPos++
		return nil, dl, nil
	}
	v, err := cs.getNext()
	if err != nil {
		return nil, 0, err
	}

	// if this is not repeated, just return the value; the result is not an array
	if cs.repTyp != parquet.FieldRepetitionType_REPEATED {
		cs.readPos++
		return v, maxD, err
	}

	// The first rLevel in the current object is always less than maxR (this applies
	// only to repeated values). The following data in this object should have maxR as
	// its rLevel. The first rLevel that is less than maxR belongs to the next object
	// and we should not touch it in this call.

	var ret = cs.typedColumnStore.append(nil, v)
	for {
		cs.readPos++
		rl, _, last := cs.getRDLevelAt(cs.readPos)
		if last || rl < maxR {
			// end of this object
			return ret, maxD, nil
		}
		v, err := cs.getNext()
		if err != nil {
			return nil, maxD, err
		}

		ret = cs.typedColumnStore.append(ret, v)
	}
}
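// Illustrative example, not part of the original source: reading back the repeated
// column from the earlier add example with maxD = 1 and maxR = 1, one get call per
// record:
//
//	v, dl, _ := cs.get(1, 1)
//	// v is the assembled slice containing 10 and 20 (built via typedColumnStore.append), dl == 1
//	v, dl, _ = cs.get(1, 1)
//	// the nil record: v == nil, dl == 0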
func newStore(typed typedColumnStore, enc parquet.Encoding, useDict bool, alloc *allocTracker) *ColumnStore {
	return &ColumnStore{
		enc:              enc,
		useDict:          useDict,
		typedColumnStore: typed,
		alloc:            alloc,
	}
}

func newPlainStore(typed typedColumnStore, alloc *allocTracker) *ColumnStore {
	return newStore(typed, parquet.Encoding_PLAIN, true, alloc)
}

// getValuesStore is used internally by the reader.
func getValuesStore(typ *parquet.SchemaElement, alloc *allocTracker) (*ColumnStore, error) {
	params := &ColumnParameters{
		LogicalType:   typ.LogicalType,
		ConvertedType: typ.ConvertedType,
		TypeLength:    typ.TypeLength,
		Scale:         typ.Scale,
		Precision:     typ.Precision,
	}

	switch *typ.Type {
	case parquet.Type_BOOLEAN:
		return newPlainStore(&booleanStore{ColumnParameters: params}, alloc), nil
	case parquet.Type_BYTE_ARRAY:
		return newPlainStore(&byteArrayStore{ColumnParameters: params}, alloc), nil
	case parquet.Type_FIXED_LEN_BYTE_ARRAY:
		if typ.TypeLength == nil {
			return nil, fmt.Errorf("type %s with nil type length", typ.Type)
		}

		return newPlainStore(&byteArrayStore{ColumnParameters: params}, alloc), nil

	case parquet.Type_FLOAT:
		return newPlainStore(&floatStore{ColumnParameters: params, stats: newFloatStats(), pageStats: newFloatStats()}, alloc), nil
	case parquet.Type_DOUBLE:
		return newPlainStore(&doubleStore{ColumnParameters: params, stats: newDoubleStats(), pageStats: newDoubleStats()}, alloc), nil

	case parquet.Type_INT32:
		return newPlainStore(&int32Store{ColumnParameters: params, stats: newInt32Stats(), pageStats: newInt32Stats()}, alloc), nil
	case parquet.Type_INT64:
		return newPlainStore(&int64Store{ColumnParameters: params, stats: newInt64Stats(), pageStats: newInt64Stats()}, alloc), nil
	case parquet.Type_INT96:
		store := &int96Store{}
		store.ColumnParameters = params
		return newPlainStore(store, alloc), nil
	default:
		return nil, fmt.Errorf("unsupported type: %s", typ.Type)
	}
}

// NewBooleanStore creates a new column store to store boolean values.
func NewBooleanStore(enc parquet.Encoding, params *ColumnParameters) (*ColumnStore, error) {
	switch enc {
	case parquet.Encoding_PLAIN, parquet.Encoding_RLE:
	default:
		return nil, fmt.Errorf("encoding %q is not supported on this type", enc)
	}
	return newStore(&booleanStore{ColumnParameters: params}, enc, false, nil), nil // allocTracker is set by recursiveFix
}

// NewInt32Store creates a new column store to store int32 values. If useDict is true,
// then a dictionary is used, otherwise a dictionary will never be used to encode the data.
func NewInt32Store(enc parquet.Encoding, useDict bool, params *ColumnParameters) (*ColumnStore, error) {
	switch enc {
	case parquet.Encoding_PLAIN, parquet.Encoding_DELTA_BINARY_PACKED:
	default:
		return nil, fmt.Errorf("encoding %q is not supported on this type", enc)
	}
	return newStore(&int32Store{ColumnParameters: params, stats: newInt32Stats(), pageStats: newInt32Stats()}, enc, useDict, nil), nil // allocTracker is set by recursiveFix
}

// NewInt64Store creates a new column store to store int64 values. If useDict is true,
// then a dictionary is used, otherwise a dictionary will never be used to encode the data.
func NewInt64Store(enc parquet.Encoding, useDict bool, params *ColumnParameters) (*ColumnStore, error) {
	switch enc {
	case parquet.Encoding_PLAIN, parquet.Encoding_DELTA_BINARY_PACKED:
	default:
		return nil, fmt.Errorf("encoding %q is not supported on this type", enc)
	}
	return newStore(&int64Store{ColumnParameters: params, stats: newInt64Stats(), pageStats: newInt64Stats()}, enc, useDict, nil), nil // allocTracker is set by recursiveFix
}

// NewInt96Store creates a new column store to store int96 values. If useDict is true,
// then a dictionary is used, otherwise a dictionary will never be used to encode the data.
func NewInt96Store(enc parquet.Encoding, useDict bool, params *ColumnParameters) (*ColumnStore, error) {
	switch enc {
	case parquet.Encoding_PLAIN:
	default:
		return nil, fmt.Errorf("encoding %q is not supported on this type", enc)
	}
	store := &int96Store{}
	store.ColumnParameters = params
	return newStore(store, enc, useDict, nil), nil // allocTracker is set by recursiveFix
}
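// Illustrative example, not part of the original source: building an int64 column
// store with dictionary encoding enabled. The zero ColumnParameters value is enough
// for a plain INT64 column; wiring the store into a writer (wrapping it in a data
// column and adding it to the FileWriter's schema) happens outside this file:
//
//	store, err := NewInt64Store(parquet.Encoding_DELTA_BINARY_PACKED, true, &ColumnParameters{})
//	if err != nil {
//		// the chosen encoding is not valid for INT64
//	}
//	_ = store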
// NewFloatStore creates a new column store to store float (float32) values. If useDict is true,
// then a dictionary is used, otherwise a dictionary will never be used to encode the data.
func NewFloatStore(enc parquet.Encoding, useDict bool, params *ColumnParameters) (*ColumnStore, error) {
	switch enc {
	case parquet.Encoding_PLAIN:
	default:
		return nil, fmt.Errorf("encoding %q is not supported on this type", enc)
	}
	return newStore(&floatStore{ColumnParameters: params, stats: newFloatStats(), pageStats: newFloatStats()}, enc, useDict, nil), nil // allocTracker is set by recursiveFix
}

// NewDoubleStore creates a new column store to store double (float64) values. If useDict is true,
// then a dictionary is used, otherwise a dictionary will never be used to encode the data.
func NewDoubleStore(enc parquet.Encoding, useDict bool, params *ColumnParameters) (*ColumnStore, error) {
	switch enc {
	case parquet.Encoding_PLAIN:
	default:
		return nil, fmt.Errorf("encoding %q is not supported on this type", enc)
	}
	return newStore(&doubleStore{ColumnParameters: params, stats: newDoubleStats(), pageStats: newDoubleStats()}, enc, useDict, nil), nil // allocTracker is set by recursiveFix
}

// NewByteArrayStore creates a new column store to store byte arrays. If useDict is true,
// then a dictionary is used, otherwise a dictionary will never be used to encode the data.
func NewByteArrayStore(enc parquet.Encoding, useDict bool, params *ColumnParameters) (*ColumnStore, error) {
	switch enc {
	case parquet.Encoding_PLAIN, parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY, parquet.Encoding_DELTA_BYTE_ARRAY:
	default:
		return nil, fmt.Errorf("encoding %q is not supported on this type", enc)
	}
	return newStore(&byteArrayStore{ColumnParameters: params}, enc, useDict, nil), nil // allocTracker is set by recursiveFix
}

// NewFixedByteArrayStore creates a new column store to store fixed size byte arrays. If useDict is true,
// then a dictionary is used, otherwise a dictionary will never be used to encode the data.
func NewFixedByteArrayStore(enc parquet.Encoding, useDict bool, params *ColumnParameters) (*ColumnStore, error) {
	switch enc {
	case parquet.Encoding_PLAIN, parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY, parquet.Encoding_DELTA_BYTE_ARRAY:
	default:
		return nil, fmt.Errorf("encoding %q is not supported on this type", enc)
	}
	if params.TypeLength == nil {
		return nil, errors.New("no length provided")
	}

	if *params.TypeLength <= 0 {
		return nil, fmt.Errorf("fix length with len %d is not possible", *params.TypeLength)
	}

	return newStore(&byteArrayStore{
		ColumnParameters: params,
	}, enc, useDict, nil), nil // allocTracker is set by recursiveFix
}
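// Illustrative example, not part of the original source: a fixed-length byte array
// store requires a positive TypeLength, e.g. 16 bytes for UUID values:
//
//	l := int32(16)
//	store, err := NewFixedByteArrayStore(parquet.Encoding_PLAIN, true, &ColumnParameters{TypeLength: &l})
//	if err != nil {
//		// nil or non-positive TypeLength, or an unsupported encoding
//	}
//	_ = store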